linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH, RFC] reimplement flush_workqueue()
@ 2006-12-17 22:34 Oleg Nesterov
  2006-12-18  3:09 ` Linus Torvalds
                   ` (2 more replies)
  0 siblings, 3 replies; 78+ messages in thread
From: Oleg Nesterov @ 2006-12-17 22:34 UTC (permalink / raw)
  To: Andrew Morton
  Cc: David Howells, Christoph Hellwig, Ingo Molnar, Linus Torvalds,
	linux-kernel

Remove ->remove_sequence, ->insert_sequence, and ->work_done from
struct cpu_workqueue_struct. To implement flush_workqueue() we can
queue a barrier work on each CPU and wait for its completion.

We don't need to worry about a CPU going down while we are sleeping
on the completion. take_over_work() will move this work to another
CPU, and the handler will wake us up eventually.

NOTE: I removed 'int cpu' parameter, flush_workqueue() locks/unlocks
workqueue_mutex unconditionally. It may be restored, but I think it
doesn't make much sense, we take the mutex for the very short time,
and the code becomes simpler.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>

 workqueue.c |   87 ++++++++++++++++++++----------------------------------------
 1 files changed, 29 insertions(+), 58 deletions(-)

--- mm-6.20-rc1/kernel/workqueue.c~1_flush_q	2006-12-18 00:56:58.000000000 +0300
+++ mm-6.20-rc1/kernel/workqueue.c	2006-12-18 01:13:22.000000000 +0300
@@ -36,23 +36,13 @@
 /*
  * The per-CPU workqueue (if single thread, we always use the first
  * possible cpu).
- *
- * The sequence counters are for flush_scheduled_work().  It wants to wait
- * until all currently-scheduled works are completed, but it doesn't
- * want to be livelocked by new, incoming ones.  So it waits until
- * remove_sequence is >= the insert_sequence which pertained when
- * flush_scheduled_work() was called.
  */
 struct cpu_workqueue_struct {
 
 	spinlock_t lock;
 
-	long remove_sequence;	/* Least-recently added (next to run) */
-	long insert_sequence;	/* Next to add */
-
 	struct list_head worklist;
 	wait_queue_head_t more_work;
-	wait_queue_head_t work_done;
 
 	struct workqueue_struct *wq;
 	struct task_struct *thread;
@@ -138,8 +128,6 @@ static int __run_work(struct cpu_workque
 		f(work);
 
 		spin_lock_irqsave(&cwq->lock, flags);
-		cwq->remove_sequence++;
-		wake_up(&cwq->work_done);
 		ret = 1;
 	}
 	spin_unlock_irqrestore(&cwq->lock, flags);
@@ -187,7 +175,6 @@ static void __queue_work(struct cpu_work
 	spin_lock_irqsave(&cwq->lock, flags);
 	set_wq_data(work, cwq);
 	list_add_tail(&work->entry, &cwq->worklist);
-	cwq->insert_sequence++;
 	wake_up(&cwq->more_work);
 	spin_unlock_irqrestore(&cwq->lock, flags);
 }
@@ -338,8 +325,6 @@ static void run_workqueue(struct cpu_wor
 		}
 
 		spin_lock_irqsave(&cwq->lock, flags);
-		cwq->remove_sequence++;
-		wake_up(&cwq->work_done);
 	}
 	cwq->run_depth--;
 	spin_unlock_irqrestore(&cwq->lock, flags);
@@ -394,45 +379,39 @@ static int worker_thread(void *__cwq)
 	return 0;
 }
 
-/*
- * If cpu == -1 it's a single-threaded workqueue and the caller does not hold
- * workqueue_mutex
- */
-static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq, int cpu)
+struct wq_barrier {
+	struct work_struct	work;
+	struct completion	done;
+};
+
+static void wq_barrier_func(struct work_struct *work)
+{
+	struct wq_barrier *barr = container_of(work, struct wq_barrier, work);
+	complete(&barr->done);
+}
+
+static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
 {
 	if (cwq->thread == current) {
 		/*
 		 * Probably keventd trying to flush its own queue. So simply run
 		 * it by hand rather than deadlocking.
 		 */
-		if (cpu != -1)
-			mutex_unlock(&workqueue_mutex);
+		mutex_unlock(&workqueue_mutex);
 		run_workqueue(cwq);
-		if (cpu != -1)
-			mutex_lock(&workqueue_mutex);
+		mutex_lock(&workqueue_mutex);
 	} else {
-		DEFINE_WAIT(wait);
-		long sequence_needed;
+		struct wq_barrier barr = {
+			.work = __WORK_INITIALIZER(barr.work, wq_barrier_func),
+			.done = COMPLETION_INITIALIZER_ONSTACK(barr.done),
+		};
 
-		spin_lock_irq(&cwq->lock);
-		sequence_needed = cwq->insert_sequence;
+		__set_bit(WORK_STRUCT_PENDING, &barr.work.management);
+		__queue_work(cwq, &barr.work);
 
-		while (sequence_needed - cwq->remove_sequence > 0) {
-			prepare_to_wait(&cwq->work_done, &wait,
-					TASK_UNINTERRUPTIBLE);
-			spin_unlock_irq(&cwq->lock);
-			if (cpu != -1)
-				mutex_unlock(&workqueue_mutex);
-			schedule();
-			if (cpu != -1) {
-				mutex_lock(&workqueue_mutex);
-				if (!cpu_online(cpu))
-					return; /* oops, CPU unplugged */
-			}
-			spin_lock_irq(&cwq->lock);
-		}
-		finish_wait(&cwq->work_done, &wait);
-		spin_unlock_irq(&cwq->lock);
+		mutex_unlock(&workqueue_mutex);
+		wait_for_completion(&barr.done);
+		mutex_lock(&workqueue_mutex);
 	}
 }
 
@@ -443,30 +422,25 @@ static void flush_cpu_workqueue(struct c
  * Forces execution of the workqueue and blocks until its completion.
  * This is typically used in driver shutdown handlers.
  *
- * This function will sample each workqueue's current insert_sequence number and
- * will sleep until the head sequence is greater than or equal to that.  This
- * means that we sleep until all works which were queued on entry have been
- * handled, but we are not livelocked by new incoming ones.
+ * We sleep until all works which were queued on entry have been handled,
+ * but we are not livelocked by new incoming ones.
  *
  * This function used to run the workqueues itself.  Now we just wait for the
  * helper threads to do it.
  */
 void fastcall flush_workqueue(struct workqueue_struct *wq)
 {
-	might_sleep();
-
+	mutex_lock(&workqueue_mutex);
 	if (is_single_threaded(wq)) {
 		/* Always use first cpu's area. */
-		flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu),
-					-1);
+		flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu));
 	} else {
 		int cpu;
 
-		mutex_lock(&workqueue_mutex);
 		for_each_online_cpu(cpu)
-			flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu), cpu);
-		mutex_unlock(&workqueue_mutex);
+			flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
 	}
+	mutex_unlock(&workqueue_mutex);
 }
 EXPORT_SYMBOL_GPL(flush_workqueue);
 
@@ -479,12 +453,9 @@ static struct task_struct *create_workqu
 	spin_lock_init(&cwq->lock);
 	cwq->wq = wq;
 	cwq->thread = NULL;
-	cwq->insert_sequence = 0;
-	cwq->remove_sequence = 0;
 	cwq->freezeable = freezeable;
 	INIT_LIST_HEAD(&cwq->worklist);
 	init_waitqueue_head(&cwq->more_work);
-	init_waitqueue_head(&cwq->work_done);
 
 	if (is_single_threaded(wq))
 		p = kthread_create(worker_thread, cwq, "%s", wq->name);


^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH, RFC] reimplement flush_workqueue()
  2006-12-17 22:34 [PATCH, RFC] reimplement flush_workqueue() Oleg Nesterov
@ 2006-12-18  3:09 ` Linus Torvalds
  2006-12-19  0:27 ` Andrew Morton
  2007-01-04 12:02 ` [PATCH, RFC] reimplement flush_workqueue() Srivatsa Vaddagiri
  2 siblings, 0 replies; 78+ messages in thread
From: Linus Torvalds @ 2006-12-18  3:09 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	linux-kernel



On Mon, 18 Dec 2006, Oleg Nesterov wrote:
>
> Remove ->remove_sequence, ->insert_sequence, and ->work_done from
> struct cpu_workqueue_struct. To implement flush_workqueue() we can
> queue a barrier work on each CPU and wait for its completition.

Looks fine to me. It's after -rc1 so I won't apply it, but it looks like a 
nice cleanup.

		Linus

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH, RFC] reimplement flush_workqueue()
  2006-12-17 22:34 [PATCH, RFC] reimplement flush_workqueue() Oleg Nesterov
  2006-12-18  3:09 ` Linus Torvalds
@ 2006-12-19  0:27 ` Andrew Morton
  2006-12-19  0:43   ` Oleg Nesterov
  2007-01-04 12:02 ` [PATCH, RFC] reimplement flush_workqueue() Srivatsa Vaddagiri
  2 siblings, 1 reply; 78+ messages in thread
From: Andrew Morton @ 2006-12-19  0:27 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: David Howells, Christoph Hellwig, Ingo Molnar, Linus Torvalds,
	linux-kernel

On Mon, 18 Dec 2006 01:34:16 +0300
Oleg Nesterov <oleg@tv-sign.ru> wrote:

> Remove ->remove_sequence, ->insert_sequence, and ->work_done from
> struct cpu_workqueue_struct. To implement flush_workqueue() we can
> queue a barrier work on each CPU and wait for its completition.

Seems sensible.  I seem to recall considering doing it that way when I
initially implemented flush_workqueue(), but I don't recall why I didn't do
this.  hmm.

> We don't need to worry about CPU going down while we are are sleeping
> on the completition. take_over_work() will move this work on another
> CPU, and the handler will wake up us eventually.
> 
> NOTE: I removed 'int cpu' parameter, flush_workqueue() locks/unlocks
> workqueue_mutex unconditionally. It may be restored, but I think it
> doesn't make much sense, we take the mutex for the very short time,
> and the code becomes simpler.
> 

Taking workqueue_mutex() unconditionally in flush_workqueue() means
that we'll deadlock if a single-threaded workqueue callback handler calls
flush_workqueue().

It's an idiotic thing to do, but I think I spotted a site last week which
does this.  scsi?  Not sure..

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH, RFC] reimplement flush_workqueue()
  2006-12-19  0:27 ` Andrew Morton
@ 2006-12-19  0:43   ` Oleg Nesterov
  2006-12-19  1:00     ` Andrew Morton
  2007-01-04 11:32     ` Srivatsa Vaddagiri
  0 siblings, 2 replies; 78+ messages in thread
From: Oleg Nesterov @ 2006-12-19  0:43 UTC (permalink / raw)
  To: Andrew Morton
  Cc: David Howells, Christoph Hellwig, Ingo Molnar, Linus Torvalds,
	linux-kernel

On 12/18, Andrew Morton wrote:
>
> On Mon, 18 Dec 2006 01:34:16 +0300
> Oleg Nesterov <oleg@tv-sign.ru> wrote:
> 
> > NOTE: I removed 'int cpu' parameter, flush_workqueue() locks/unlocks
> > workqueue_mutex unconditionally. It may be restored, but I think it
> > doesn't make much sense, we take the mutex for the very short time,
> > and the code becomes simpler.
> > 
> 
> Taking workqueue_mutex() unconditionally in flush_workqueue() means
> that we'll deadlock if a single-threaded workqueue callback handler calls
> flush_workqueue().

Well. But flush_workqueue() drops workqueue_mutex before going to sleep ?

	flush_workqueue(single_threaded_wq);
		...
		mutex_lock(&workqueue_mutex);
		...
		mutex_unlock(&workqueue_mutex);
		wait_for_completion();
							handler runs,
							calls flush_workqueue(),
							workqueue_mutex is free
							
> It's an idiotic thing to do, but I think I spotted a site last week which
> does this.  scsi?  Not sure..

Ok, it is time to sleep. I'll look tomorrow and re-send if flush_cpu_workqueue()
really needs "bool workqueue_mutex_is_locked" parameter.

Oleg.


^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH, RFC] reimplement flush_workqueue()
  2006-12-19  0:43   ` Oleg Nesterov
@ 2006-12-19  1:00     ` Andrew Morton
  2007-01-04 11:32     ` Srivatsa Vaddagiri
  1 sibling, 0 replies; 78+ messages in thread
From: Andrew Morton @ 2006-12-19  1:00 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: David Howells, Christoph Hellwig, Ingo Molnar, Linus Torvalds,
	linux-kernel

On Tue, 19 Dec 2006 03:43:19 +0300
Oleg Nesterov <oleg@tv-sign.ru> wrote:

> On 12/18, Andrew Morton wrote:
> >
> > On Mon, 18 Dec 2006 01:34:16 +0300
> > Oleg Nesterov <oleg@tv-sign.ru> wrote:
> > 
> > > NOTE: I removed 'int cpu' parameter, flush_workqueue() locks/unlocks
> > > workqueue_mutex unconditionally. It may be restored, but I think it
> > > doesn't make much sense, we take the mutex for the very short time,
> > > and the code becomes simpler.
> > > 
> > 
> > Taking workqueue_mutex() unconditionally in flush_workqueue() means
> > that we'll deadlock if a single-threaded workqueue callback handler calls
> > flush_workqueue().
> 
> Well. But flush_workqueue() drops workqueue_mutex before going to sleep ?
> 
> 	flush_workqueue(single_threaded_wq);
> 		...
> 		mutex_lock(&workqueue_mutex);
> 		...
> 		mutex_unlock(&workqueue_mutex);
> 		wait_for_completition();
> 							handler runs,
> 							calls flush_workqueue(),
> 							workqueue_mutex is free

Oh.  OK.  In that case we can switch to preempt_disable() for the
cpu-hotplug holdoff.  Sometime.

> > It's an idiotic thing to do, but I think I spotted a site last week which
> > does this.  scsi?  Not sure..
> 
> Ok, it is time to sleep. I'll look tomorrov and re-send if flush_cpu_workqueue()
> really needs "bool workqueue_mutex_is_locked" parameter.

Hopefully not.

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH, RFC] reimplement flush_workqueue()
  2006-12-19  0:43   ` Oleg Nesterov
  2006-12-19  1:00     ` Andrew Morton
@ 2007-01-04 11:32     ` Srivatsa Vaddagiri
  2007-01-04 14:29       ` Oleg Nesterov
  1 sibling, 1 reply; 78+ messages in thread
From: Srivatsa Vaddagiri @ 2007-01-04 11:32 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On Tue, Dec 19, 2006 at 03:43:19AM +0300, Oleg Nesterov wrote:
> > Taking workqueue_mutex() unconditionally in flush_workqueue() means
> > that we'll deadlock if a single-threaded workqueue callback handler calls
> > flush_workqueue().
> 
> Well. But flush_workqueue() drops workqueue_mutex before going to sleep ?

... and acquires it again after being woken from sleep. That can lead to the
problem described here:

	http://lkml.org/lkml/2006/12/7/374

In brief:

keventd thread					hotplug thread
--------------					--------------

  run_workqueue()
	|
     work_fn()
	 |
	flush_workqueue()
	     |	
	   flush_cpu_workqueue
		|				cpu_down()
	     mutex_unlock(wq_mutex);		     |
	(above opens window for hotplug)	   mutex_lock(wq_mutex);
    		|				   /* bring down cpu */	
	     wait_for_completion();		     notifier(CPU_DEAD, ..)
		| 				       workqueue_cpu_callback
		| 				        cleanup_workqueue_thread
		|					  kthread_stop()
		|
		|
	     mutex_lock(wq_mutex); <- Can deadlock


The kthread_stop() will wait for keventd() thread to exit, but keventd()
is blocked on mutex_lock(wq_mutex) leading to a deadlock.

> 
> 	flush_workqueue(single_threaded_wq);
> 		...
> 		mutex_lock(&workqueue_mutex);
> 		...
> 		mutex_unlock(&workqueue_mutex);
> 		wait_for_completition();
> 							handler runs,
> 							calls flush_workqueue(),
> 							workqueue_mutex is free

-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH, RFC] reimplement flush_workqueue()
  2006-12-17 22:34 [PATCH, RFC] reimplement flush_workqueue() Oleg Nesterov
  2006-12-18  3:09 ` Linus Torvalds
  2006-12-19  0:27 ` Andrew Morton
@ 2007-01-04 12:02 ` Srivatsa Vaddagiri
  2007-01-04 14:38   ` Oleg Nesterov
  2 siblings, 1 reply; 78+ messages in thread
From: Srivatsa Vaddagiri @ 2007-01-04 12:02 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On Mon, Dec 18, 2006 at 01:34:16AM +0300, Oleg Nesterov wrote:
>  void fastcall flush_workqueue(struct workqueue_struct *wq)
>  {
> -	might_sleep();
> -
> +	mutex_lock(&workqueue_mutex);
>  	if (is_single_threaded(wq)) {
>  		/* Always use first cpu's area. */
> -		flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu),
> -					-1);
> +		flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu));
>  	} else {
>  		int cpu;
> 
> -		mutex_lock(&workqueue_mutex);
>  		for_each_online_cpu(cpu)


Can compiler optimizations lead to cpu_online_map being cached in a register 
while running this loop? AFAICS cpu_online_map is not declared to be
volatile. If it can be cached, then we have the danger of invoking 
flush_cpu_workqueue() on a dead cpu (because flush_cpu_workqueue drops
workqueue_mutex, cpu hp events can change cpu_online_map while we are in
flush_cpu_workqueue).

> -			flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu), cpu);
> -		mutex_unlock(&workqueue_mutex);
> +			flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));


-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH, RFC] reimplement flush_workqueue()
  2007-01-04 11:32     ` Srivatsa Vaddagiri
@ 2007-01-04 14:29       ` Oleg Nesterov
  2007-01-04 15:56         ` Srivatsa Vaddagiri
  2007-01-04 17:18         ` Andrew Morton
  0 siblings, 2 replies; 78+ messages in thread
From: Oleg Nesterov @ 2007-01-04 14:29 UTC (permalink / raw)
  To: Srivatsa Vaddagiri
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On 01/04, Srivatsa Vaddagiri wrote:
>
> On Tue, Dec 19, 2006 at 03:43:19AM +0300, Oleg Nesterov wrote:
> > > Taking workqueue_mutex() unconditionally in flush_workqueue() means
> > > that we'll deadlock if a single-threaded workqueue callback handler calls
> > > flush_workqueue().
> > 
> > Well. But flush_workqueue() drops workqueue_mutex before going to sleep ?
> 
> ... and acquires it again after woken from sleep. That can be a problem, which 
> will lead to the problem described here:
> 
> 	http://lkml.org/lkml/2006/12/7/374
> 
> In brief:
> 
> keventd thread					hotplug thread
> --------------					--------------
> 
>   run_workqueue()
> 	|
>      work_fn()
> 	 |
> 	flush_workqueue()
> 	     |	
> 	   flush_cpu_workqueue
> 		|				cpu_down()
> 	     mutex_unlock(wq_mutex);		     |
> 	(above opens window for hotplug)	   mutex_lock(wq_mutex);
>     		|				   /* bring down cpu */	
> 	     wait_for_completition();		     notifier(CPU_DEAD, ..)
> 		| 				       workqueue_cpu_callback
> 		| 				        cleanup_workqueue_thread
> 		|					  kthread_stop()
> 		|
> 		|
> 	     mutex_lock(wq_mutex); <- Can deadlock
> 
> 
> The kthread_stop() will wait for keventd() thread to exit, but keventd()
> is blocked on mutex_lock(wq_mutex) leading to a deadlock.

Thanks, I need to think about this.

However I am not sure I fully understand the problem.

First, this deadlock was not introduced by recent changes (including "single
threaded flush_workqueue() takes workqueue_mutex too"), yes?

Also, it seems to me we have a much more simple scenario for deadlock.

events/0 runs run_workqueue(), work->func() sleeps or takes a preemption. CPU 0
dies, keventd thread migrates to another CPU. CPU_DEAD calls kthread_stop() under
workqueue_mutex and waits until the kevents thread exits. Now, if this work (or
another work pending on cwq->worklist) takes workqueue_mutex (for example, does
flush_workqueue) we have a deadlock.

No?

Oleg.


^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH, RFC] reimplement flush_workqueue()
  2007-01-04 12:02 ` [PATCH, RFC] reimplement flush_workqueue() Srivatsa Vaddagiri
@ 2007-01-04 14:38   ` Oleg Nesterov
  0 siblings, 0 replies; 78+ messages in thread
From: Oleg Nesterov @ 2007-01-04 14:38 UTC (permalink / raw)
  To: Srivatsa Vaddagiri
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On 01/04, Srivatsa Vaddagiri wrote:
>
> On Mon, Dec 18, 2006 at 01:34:16AM +0300, Oleg Nesterov wrote:
> >  void fastcall flush_workqueue(struct workqueue_struct *wq)
> >  {
> > -	might_sleep();
> > -
> > +	mutex_lock(&workqueue_mutex);
> >  	if (is_single_threaded(wq)) {
> >  		/* Always use first cpu's area. */
> > -		flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu),
> > -					-1);
> > +		flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu));
> >  	} else {
> >  		int cpu;
> > 
> > -		mutex_lock(&workqueue_mutex);
> >  		for_each_online_cpu(cpu)
> 
> 
> Can compiler optimizations lead to cpu_online_map being cached in a register 
> while running this loop? AFAICS cpu_online_map is not declared to be
> volatile.

But it is not const either,

>            If it can be cached,

I believe this would be a compiler bug. Let's take a simpler example,

	while (!condition)
		schedule();

What if the compiler caches the value of the global 'condition'?

                                  then we have the danger of invoking 
> flush_cpu_workqueue() on a dead cpu (because flush_cpu_workqueue drops
> workqueue_mutex, cpu hp events can change cpu_online_map while we are in
> flush_cpu_workqueue).

Oleg.


^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH, RFC] reimplement flush_workqueue()
  2007-01-04 14:29       ` Oleg Nesterov
@ 2007-01-04 15:56         ` Srivatsa Vaddagiri
  2007-01-04 16:31           ` Oleg Nesterov
  2007-01-04 17:18         ` Andrew Morton
  1 sibling, 1 reply; 78+ messages in thread
From: Srivatsa Vaddagiri @ 2007-01-04 15:56 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On Thu, Jan 04, 2007 at 05:29:36PM +0300, Oleg Nesterov wrote:
> Thanks, I need to think about this.
> 
> However I am not sure I fully understand the problem.
> 
> First, this deadlock was not introduced by recent changes (including "single
> threaded flush_workqueue() takes workqueue_mutex too"), yes?

AFAIK this deadlock originated from Andrew's patch here:

	http://lkml.org/lkml/2006/12/7/231

(Yes, your patches didn't introduce this. I was just reiterating here my
earlier point that workqueue code is broken of late wrt cpu hotplug).

> Also, it seems to me we have a much more simple scenario for deadlock.
> 
> events/0 runs run_workqueue(), work->func() sleeps or takes a preemtion. CPU 0
> dies, keventd thread migrates to another CPU. CPU_DEAD calls kthread_stop() under
> workqueue_mutex and waits for until kevents thread exits. Now, if this work (or
> another work pending on cwq->worklist) takes workqueue_mutex (for example, does
> flush_workqueue) we have a deadlock.
> 
> No?

Yes, the above scenario also will cause a deadlock. 

I suppose one could avoid the deadlock by having a 'workqueue_mutex_held'
flag and avoid taking the mutex under some conditions, but IMHO a
neater solution is to provide a cpu-hotplug lock which works under
all these corner cases. One such proposal was made here:

	http://lkml.org/lkml/2006/10/26/65
	
-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH, RFC] reimplement flush_workqueue()
  2007-01-04 15:56         ` Srivatsa Vaddagiri
@ 2007-01-04 16:31           ` Oleg Nesterov
  2007-01-04 16:57             ` Srivatsa Vaddagiri
  0 siblings, 1 reply; 78+ messages in thread
From: Oleg Nesterov @ 2007-01-04 16:31 UTC (permalink / raw)
  To: Srivatsa Vaddagiri
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On 01/04, Srivatsa Vaddagiri wrote:
>
> On Thu, Jan 04, 2007 at 05:29:36PM +0300, Oleg Nesterov wrote:
> > Thanks, I need to think about this.
> > 
> > However I am not sure I fully understand the problem.
> > 
> > First, this deadlock was not introduced by recent changes (including "single
> > threaded flush_workqueue() takes workqueue_mutex too"), yes?
> 
> AFAIK this deadlock originated from Andrew's patch here:
> 
> 	http://lkml.org/lkml/2006/12/7/231

I don't think so. The core problem is not that we are doing unlock/sleep/lock
with this patch. The thing is: work->func() can't take wq_mutex (and thus use
flush_work/workqueue) because it is possible that CPU_DEAD holds this mutex
and waits for us to complete(kthread_stop_info). I believe this bug is old.

> (Yes, your patches didnt introduce this. I was just reiterating here my
> earlier point that workqueue code is broken of late wrt cpu hotplug).
> 
> > Also, it seems to me we have a much more simple scenario for deadlock.
> > 
> > events/0 runs run_workqueue(), work->func() sleeps or takes a preemtion. CPU 0
> > dies, keventd thread migrates to another CPU. CPU_DEAD calls kthread_stop() under
> > workqueue_mutex and waits for until kevents thread exits. Now, if this work (or
> > another work pending on cwq->worklist) takes workqueue_mutex (for example, does
> > flush_workqueue) we have a deadlock.
> > 
> > No?
> 
> Yes, the above scenario also will cause a deadlock. 

Ok, thanks for acknowledgement.

> I supposed one could avoid the deadlock by having a 'workqueue_mutex_held' 
> flag and avoid taking the mutex set under some conditions,

I am thinking about the same right now. Probably we can do something like this:


	int xxx_lock(void)
	{
		for (;;) {
			if (mutex_trylock(wq_mutex))
				return 1;

			// the owner of wq_mutex sleeps, we can proceed
			if (kthread_should_stop())
				return 0;
		}
	}
	void xxx_unlock(int yesno)
	{
		if (yesno)
			mutex_unlock(wq_mutext);
	}

and then do

	locked = xxx_lock();
	...
	xxx_unlock(locked);

in flush_xxx() instead of plain lock/unlock.

Yes, ugly. I'll try to do something else on weekend.

>                                                             but IMHO a
> more neater solution is to provide a cpu-hotplug lock which works under
> all these corner cases. One such proposal was made here:
> 
> 	http://lkml.org/lkml/2006/10/26/65

I'll take a look later, thanks.

Oleg.


^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH, RFC] reimplement flush_workqueue()
  2007-01-04 16:31           ` Oleg Nesterov
@ 2007-01-04 16:57             ` Srivatsa Vaddagiri
  0 siblings, 0 replies; 78+ messages in thread
From: Srivatsa Vaddagiri @ 2007-01-04 16:57 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On Thu, Jan 04, 2007 at 07:31:39PM +0300, Oleg Nesterov wrote:
> > AFAIK this deadlock originated from Andrew's patch here:
> >
> > 	http://lkml.org/lkml/2006/12/7/231
> 
> I don't think so. The core problem is not that we are doing unlock/sleep/lock
> with this patch. The thing is: work->func() can't take wq_mutex (and thus use
> flush_work/workqueue) because it is possible that CPU_DEAD holds this mutex
> and waits for us to complete(kthread_stop_info). I believe this bug is old.

Yes, this bug is quite old looks like. Thanks for correcting me.

-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH, RFC] reimplement flush_workqueue()
  2007-01-04 14:29       ` Oleg Nesterov
  2007-01-04 15:56         ` Srivatsa Vaddagiri
@ 2007-01-04 17:18         ` Andrew Morton
  2007-01-04 18:09           ` Oleg Nesterov
                             ` (2 more replies)
  1 sibling, 3 replies; 78+ messages in thread
From: Andrew Morton @ 2007-01-04 17:18 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Srivatsa Vaddagiri, David Howells, Christoph Hellwig,
	Ingo Molnar, Linus Torvalds, linux-kernel, Gautham shenoy

On Thu, 4 Jan 2007 17:29:36 +0300
Oleg Nesterov <oleg@tv-sign.ru> wrote:

> > In brief:
> > 
> > keventd thread					hotplug thread
> > --------------					--------------
> > 
> >   run_workqueue()
> > 	|
> >      work_fn()
> > 	 |
> > 	flush_workqueue()
> > 	     |	
> > 	   flush_cpu_workqueue
> > 		|				cpu_down()
> > 	     mutex_unlock(wq_mutex);		     |
> > 	(above opens window for hotplug)	   mutex_lock(wq_mutex);
> >     		|				   /* bring down cpu */	
> > 	     wait_for_completition();		     notifier(CPU_DEAD, ..)
> > 		| 				       workqueue_cpu_callback
> > 		| 				        cleanup_workqueue_thread
> > 		|					  kthread_stop()
> > 		|
> > 		|
> > 	     mutex_lock(wq_mutex); <- Can deadlock
> > 
> > 
> > The kthread_stop() will wait for keventd() thread to exit, but keventd()
> > is blocked on mutex_lock(wq_mutex) leading to a deadlock.

This?


--- a/kernel/workqueue.c~flush_workqueue-use-preempt_disable-to-hold-off-cpu-hotplug
+++ a/kernel/workqueue.c
@@ -419,18 +419,22 @@ static void flush_cpu_workqueue(struct c
 		 * Probably keventd trying to flush its own queue. So simply run
 		 * it by hand rather than deadlocking.
 		 */
-		mutex_unlock(&workqueue_mutex);
+		preempt_enable();
+		/*
+		 * We can still touch *cwq here because we are keventd, and
+		 * hot-unplug will be waiting us to exit.
+		 */
 		run_workqueue(cwq);
-		mutex_lock(&workqueue_mutex);
+		preempt_disable();
 	} else {
 		struct wq_barrier barr;
 
 		init_wq_barrier(&barr);
 		__queue_work(cwq, &barr.work);
 
-		mutex_unlock(&workqueue_mutex);
+		preempt_enable();	/* Can no longer touch *cwq */
 		wait_for_completion(&barr.done);
-		mutex_lock(&workqueue_mutex);
+		preempt_disable();
 	}
 }
 
@@ -449,7 +453,7 @@ static void flush_cpu_workqueue(struct c
  */
 void fastcall flush_workqueue(struct workqueue_struct *wq)
 {
-	mutex_lock(&workqueue_mutex);
+	preempt_disable();		/* CPU hotplug */
 	if (is_single_threaded(wq)) {
 		/* Always use first cpu's area. */
 		flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu));
@@ -459,7 +463,7 @@ void fastcall flush_workqueue(struct wor
 		for_each_online_cpu(cpu)
 			flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
 	}
-	mutex_unlock(&workqueue_mutex);
+	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(flush_workqueue);
 
_


^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH, RFC] reimplement flush_workqueue()
  2007-01-04 17:18         ` Andrew Morton
@ 2007-01-04 18:09           ` Oleg Nesterov
  2007-01-04 18:31             ` Andrew Morton
  2007-01-05  8:56           ` Srivatsa Vaddagiri
  2007-01-06 15:10           ` [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update Oleg Nesterov
  2 siblings, 1 reply; 78+ messages in thread
From: Oleg Nesterov @ 2007-01-04 18:09 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Srivatsa Vaddagiri, David Howells, Christoph Hellwig,
	Ingo Molnar, Linus Torvalds, linux-kernel, Gautham shenoy

On 01/04, Andrew Morton wrote:
>
> On Thu, 4 Jan 2007 17:29:36 +0300
> Oleg Nesterov <oleg@tv-sign.ru> wrote:
> 
> > > In brief:
> > > 
> > > keventd thread					hotplug thread
> > > --------------					--------------
> > > 
> > >   run_workqueue()
> > > 	|
> > >      work_fn()
> > > 	 |
> > > 	flush_workqueue()
> > > 	     |	
> > > 	   flush_cpu_workqueue
> > > 		|				cpu_down()
> > > 	     mutex_unlock(wq_mutex);		     |
> > > 	(above opens window for hotplug)	   mutex_lock(wq_mutex);
> > >     		|				   /* bring down cpu */	
> > > 	     wait_for_completition();		     notifier(CPU_DEAD, ..)
> > > 		| 				       workqueue_cpu_callback
> > > 		| 				        cleanup_workqueue_thread
> > > 		|					  kthread_stop()
> > > 		|
> > > 		|
> > > 	     mutex_lock(wq_mutex); <- Can deadlock
> > > 
> > > 
> > > The kthread_stop() will wait for keventd() thread to exit, but keventd()
> > > is blocked on mutex_lock(wq_mutex) leading to a deadlock.
> 
> This?
> 
> 
> --- a/kernel/workqueue.c~flush_workqueue-use-preempt_disable-to-hold-off-cpu-hotplug
> +++ a/kernel/workqueue.c
> @@ -419,18 +419,22 @@ static void flush_cpu_workqueue(struct c
>  		 * Probably keventd trying to flush its own queue. So simply run
>  		 * it by hand rather than deadlocking.
>  		 */
> -		mutex_unlock(&workqueue_mutex);
> +		preempt_enable();

Ah, (looking at _cpu_down()->stop_machine()), so preempt_disable() not only "pins"
the current CPU, it blocks cpu_down(), yes ???

I guess this should work then. I'll try to re-check this code on weekend.

Oleg.


^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH, RFC] reimplement flush_workqueue()
  2007-01-04 18:09           ` Oleg Nesterov
@ 2007-01-04 18:31             ` Andrew Morton
  2007-01-05  9:03               ` Srivatsa Vaddagiri
  0 siblings, 1 reply; 78+ messages in thread
From: Andrew Morton @ 2007-01-04 18:31 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Srivatsa Vaddagiri, David Howells, Christoph Hellwig,
	Ingo Molnar, Linus Torvalds, linux-kernel, Gautham shenoy

On Thu, 4 Jan 2007 21:09:01 +0300
Oleg Nesterov <oleg@tv-sign.ru> wrote:

> > --- a/kernel/workqueue.c~flush_workqueue-use-preempt_disable-to-hold-off-cpu-hotplug
> > +++ a/kernel/workqueue.c
> > @@ -419,18 +419,22 @@ static void flush_cpu_workqueue(struct c
> >  		 * Probably keventd trying to flush its own queue. So simply run
> >  		 * it by hand rather than deadlocking.
> >  		 */
> > -		mutex_unlock(&workqueue_mutex);
> > +		preempt_enable();
> 
> Ah, (looking at _cpu_down()->stop_machine()), so preempt_disable() not only "pins"
> the current CPU, it blocks cpu_down(), yes ???

yep.

But before we do much more of this we should have a wrapper.  Umm

static inline void block_cpu_hotplug(void)
{
	preempt_disable();
}

and use that, so people can see why it's being used.

I spose I'll do that and convert this patch to use it.

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH, RFC] reimplement flush_workqueue()
  2007-01-04 17:18         ` Andrew Morton
  2007-01-04 18:09           ` Oleg Nesterov
@ 2007-01-05  8:56           ` Srivatsa Vaddagiri
  2007-01-05 12:42             ` Oleg Nesterov
  2007-01-06 15:10           ` [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update Oleg Nesterov
  2 siblings, 1 reply; 78+ messages in thread
From: Srivatsa Vaddagiri @ 2007-01-05  8:56 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Oleg Nesterov, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On Thu, Jan 04, 2007 at 09:18:50AM -0800, Andrew Morton wrote:
> This?

This can still lead to the problem spotted by Oleg here:

	http://lkml.org/lkml/2006/12/30/37

and you would need a similar patch he posted there.

>  void fastcall flush_workqueue(struct workqueue_struct *wq)
>  {
> -	mutex_lock(&workqueue_mutex);
> +	preempt_disable();		/* CPU hotplug */
>  	if (is_single_threaded(wq)) {
>  		/* Always use first cpu's area. */
>  		flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu));
> @@ -459,7 +463,7 @@ void fastcall flush_workqueue(struct wor
>  		for_each_online_cpu(cpu)
>  			flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
>  	}
> -	mutex_unlock(&workqueue_mutex);
> +	preempt_enable();
>  }
>  EXPORT_SYMBOL_GPL(flush_workqueue);

-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH, RFC] reimplement flush_workqueue()
  2007-01-04 18:31             ` Andrew Morton
@ 2007-01-05  9:03               ` Srivatsa Vaddagiri
  2007-01-05 14:07                 ` Oleg Nesterov
  0 siblings, 1 reply; 78+ messages in thread
From: Srivatsa Vaddagiri @ 2007-01-05  9:03 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Oleg Nesterov, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On Thu, Jan 04, 2007 at 10:31:07AM -0800, Andrew Morton wrote:
> But before we do much more of this we should have a wrapper.  Umm
> 
> static inline void block_cpu_hotplug(void)
> {
> 	preempt_disable();
> }

Nack.

This will only block cpu down, not cpu_up and hence is a misnomer. I would be 
wary of ignoring cpu_up events totally in writing hotplug safe code.

-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH, RFC] reimplement flush_workqueue()
  2007-01-05  8:56           ` Srivatsa Vaddagiri
@ 2007-01-05 12:42             ` Oleg Nesterov
  2007-01-06 15:11               ` Srivatsa Vaddagiri
  0 siblings, 1 reply; 78+ messages in thread
From: Oleg Nesterov @ 2007-01-05 12:42 UTC (permalink / raw)
  To: Srivatsa Vaddagiri
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On 01/05, Srivatsa Vaddagiri wrote:
>
> On Thu, Jan 04, 2007 at 09:18:50AM -0800, Andrew Morton wrote:
> > This?
> 
> This can still lead to the problem spotted by Oleg here:
> 
> 	http://lkml.org/lkml/2006/12/30/37
> 
> and you would need a similar patch he posted there.

preempt_disable() can't prevent cpu_up, but flush_workqueue() doesn't care
_unless_ cpu_down also happened meantime (and hence a fresh CPU may have
pending work_structs which were moved from a dead CPU).

So you are right, we still need the patch above, but I think we don't have
new problems with preempt_disable().

I might have missed your point though.

Oleg.


^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH, RFC] reimplement flush_workqueue()
  2007-01-05  9:03               ` Srivatsa Vaddagiri
@ 2007-01-05 14:07                 ` Oleg Nesterov
  2007-01-06 15:24                   ` Srivatsa Vaddagiri
  0 siblings, 1 reply; 78+ messages in thread
From: Oleg Nesterov @ 2007-01-05 14:07 UTC (permalink / raw)
  To: Srivatsa Vaddagiri
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On 01/05, Srivatsa Vaddagiri wrote:
>
> On Thu, Jan 04, 2007 at 10:31:07AM -0800, Andrew Morton wrote:
> > But before we do much more of this we should have a wrapper.  Umm
> > 
> > static inline void block_cpu_hotplug(void)
> > {
> > 	preempt_disable();
> > }
> 
> Nack.
> 
> This will only block cpu down, not cpu_up and hence is a misnomer. I would be 
> wary of ignoring cpu_up events totally in writing hotplug safe code.

How about block_cpu_down() ?

These cpu-hotplug races delayed the last workqueue patch I have in my queue.
flush_workqueue() misses an important optimization: we don't need to insert
a barrier and have an extra wake_up + wait_for_completion when cwq has no
pending works. But we need ->current_work (introduced in the next patch) to
implement this correctly.

I'll re-send the patch below later, when we finish with the bug you pointed
out, but it would be nice if you can take a look now.

Oleg.

--- mm-6.20-rc2/kernel/workqueue.c~4_speedup	2006-12-30 18:09:07.000000000 +0300
+++ mm-6.20-rc2/kernel/workqueue.c	2007-01-05 16:32:45.000000000 +0300
@@ -405,12 +405,15 @@ static void wq_barrier_func(struct work_
 	complete(&barr->done);
 }
 
-static inline void init_wq_barrier(struct wq_barrier *barr)
+static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
+					struct wq_barrier *barr, int tail)
 {
 	INIT_WORK(&barr->work, wq_barrier_func);
 	__set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
 
 	init_completion(&barr->done);
+
+	insert_work(cwq, &barr->work, tail);
 }
 
 static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
@@ -425,13 +428,20 @@ static void flush_cpu_workqueue(struct c
 		mutex_lock(&workqueue_mutex);
 	} else {
 		struct wq_barrier barr;
+		int active = 0;
 
-		init_wq_barrier(&barr);
-		__queue_work(cwq, &barr.work);
+		spin_lock_irq(&cwq->lock);
+		if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
+			insert_wq_barrier(cwq, &barr, 1);
+			active = 1;
+		}
+		spin_unlock_irq(&cwq->lock);
 
-		mutex_unlock(&workqueue_mutex);
-		wait_for_completion(&barr.done);
-		mutex_lock(&workqueue_mutex);
+		if (active) {
+			mutex_unlock(&workqueue_mutex);
+			wait_for_completion(&barr.done);
+			mutex_lock(&workqueue_mutex);
+		}
 	}
 }
 
@@ -478,8 +488,7 @@ static void wait_on_work(struct cpu_work
 
 	spin_lock_irq(&cwq->lock);
 	if (unlikely(cwq->current_work == work)) {
-		init_wq_barrier(&barr);
-		insert_work(cwq, &barr.work, 0);
+		insert_wq_barrier(cwq, &barr, 0);
 		running = 1;
 	}
 	spin_unlock_irq(&cwq->lock);


^ permalink raw reply	[flat|nested] 78+ messages in thread

* [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update
  2007-01-04 17:18         ` Andrew Morton
  2007-01-04 18:09           ` Oleg Nesterov
  2007-01-05  8:56           ` Srivatsa Vaddagiri
@ 2007-01-06 15:10           ` Oleg Nesterov
  2007-01-06 15:45             ` Srivatsa Vaddagiri
  2 siblings, 1 reply; 78+ messages in thread
From: Oleg Nesterov @ 2007-01-06 15:10 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Srivatsa Vaddagiri, David Howells, Christoph Hellwig,
	Ingo Molnar, Linus Torvalds, linux-kernel, Gautham shenoy

( Srivatsa, Gautham, could you please verify my thinking ? )

On top of fix-flush_workqueue-vs-cpu_dead-race.patch

hotplug_sequence is incremented under "case CPU_DEAD:". This was ok
before flush_workqueue() was changed to use preempt_disable() instead
of workqueue_mutex. However preempt_disable() can't guarantee that there
is no CPU_DEAD event in progress (it is possible that flush_workqueue()
runs after STOPMACHINE_EXIT), so flush_workqueue() can miss CPU_DEAD
event.

Increment hotplug_sequence earlier, under CPU_DOWN_PREPARE. We can't
miss the event, the task running flush_workqueue() will be re-scheduled
at least once before CPU actually disappears from cpu_online_map.

We may have a false positive but this is very unlikely and only means
we will have one unneeded "goto again".

Note: this patch depends on
handle-cpu_lock_acquire-and-cpu_lock_release-in-workqueue_cpu_callback
but only "textually".

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>

--- mm-6.20-rc3/kernel/workqueue.c~1_down	2007-01-06 16:15:59.000000000 +0300
+++ mm-6.20-rc3/kernel/workqueue.c	2007-01-06 17:52:10.000000000 +0300
@@ -886,12 +886,15 @@ static int __devinit workqueue_cpu_callb
 		}
 		break;
 
+	case CPU_DOWN_PREPARE:
+		hotplug_sequence++;
+		break;
+
 	case CPU_DEAD:
 		list_for_each_entry(wq, &workqueues, list)
 			cleanup_workqueue_thread(wq, hotcpu);
 		list_for_each_entry(wq, &workqueues, list)
 			take_over_work(wq, hotcpu);
-		hotplug_sequence++;
 		break;
 
 	case CPU_LOCK_RELEASE:


^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH, RFC] reimplement flush_workqueue()
  2007-01-05 12:42             ` Oleg Nesterov
@ 2007-01-06 15:11               ` Srivatsa Vaddagiri
  0 siblings, 0 replies; 78+ messages in thread
From: Srivatsa Vaddagiri @ 2007-01-06 15:11 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On Fri, Jan 05, 2007 at 03:42:46PM +0300, Oleg Nesterov wrote:
> preempt_disable() can't prevent cpu_up, but flush_workqueue() doesn't care
> _unless_ cpu_down also happened meantime (and hence a fresh CPU may have
> pending work_structs which were moved from a dead CPU).

Yes, that was what I had in mind.

> So you are right, we still need the patch above, but I think we don't have
> new problems with preempt_disable().

Right, preempt_disable() hasn't added any new problem. Its just
revealing the same problem as earlier, by opening up window for cpu
hotplug events to happen in the middle of flush_workqueue().

Ideally I would have liked a lock_cpu_hotplug() equivalent which blocks
all cpu hotplug events during the entire flush_workqueue(). In its
absence, I guess we just need to deal with all these ugly races ..

In summary, I think we need to go ahead with the preemp_disable() patch
in flush_workqueue() from Andrew and the race fix you posted here:

	http://lkml.org/lkml/2006/12/30/37

-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH, RFC] reimplement flush_workqueue()
  2007-01-05 14:07                 ` Oleg Nesterov
@ 2007-01-06 15:24                   ` Srivatsa Vaddagiri
  0 siblings, 0 replies; 78+ messages in thread
From: Srivatsa Vaddagiri @ 2007-01-06 15:24 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On Fri, Jan 05, 2007 at 05:07:17PM +0300, Oleg Nesterov wrote:
> How about block_cpu_down() ?

Maybe ..not sure 

If we do introduce such a function, we may need to convert several
preempt_disable() that are there already (with intent of blocking
cpu_down) to block_cpu_down() ..

> These cpu-hotplug races delayed the last workqueue patch I have in my queue.
> flush_workqueue() misses an important optimization: we don't need to insert
> a barrier and have an extra wake_up + wait_for_completion when cwq has no
> pending works. But we need ->current_work (introduced in the next patch) to
> implement this correctly.
> 
> I'll re-send the patch below later, when we finish with the bug you pointed
> out, but it would be nice if you can take a look now.

The patch seems fine to me, though I dont see any cpu hotplug
related problem being exposed/solved in this patch.

-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update
  2007-01-06 15:10           ` [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update Oleg Nesterov
@ 2007-01-06 15:45             ` Srivatsa Vaddagiri
  2007-01-06 16:30               ` Oleg Nesterov
  0 siblings, 1 reply; 78+ messages in thread
From: Srivatsa Vaddagiri @ 2007-01-06 15:45 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On Sat, Jan 06, 2007 at 06:10:36PM +0300, Oleg Nesterov wrote:
> Increment hotplug_sequence earlier, under CPU_DOWN_PREPARE. We can't
> miss the event, the task running flush_workqueue() will be re-scheduled
> at least once before CPU actually disappears from cpu_online_map.

Eww ..what happens if flush_workqueue() starts after CPU_DOWN_PREPARE?

CPU_DOWN_PREPARE(8)
	hotplug_sequence++ = 10

					flush_workqueue()
						sequence = 10
						flush cpus 1 ....7

CPU_DEAD(8)
	take_over_work(8->1)

					return not flushing dead cpu8 (=BUG)


??
 
-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update
  2007-01-06 15:45             ` Srivatsa Vaddagiri
@ 2007-01-06 16:30               ` Oleg Nesterov
  2007-01-06 16:38                 ` Srivatsa Vaddagiri
  0 siblings, 1 reply; 78+ messages in thread
From: Oleg Nesterov @ 2007-01-06 16:30 UTC (permalink / raw)
  To: Srivatsa Vaddagiri
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On 01/06, Srivatsa Vaddagiri wrote:
>
> On Sat, Jan 06, 2007 at 06:10:36PM +0300, Oleg Nesterov wrote:
> > Increment hotplug_sequence earlier, under CPU_DOWN_PREPARE. We can't
> > miss the event, the task running flush_workqueue() will be re-scheduled
> > at least once before CPU actually disappears from cpu_online_map.
> 
> Eww ..what happens if flush_workqueue() starts after CPU_DOWN_PREPARE?
                                                 ^^^^^
Stupid me. Thanks.

> CPU_DOWN_PREPARE(8)
> 	hotplug_sequence++ = 10
> 
> 					flush_workqueue()
> 						sequence = 10
> 						flush cpus 1 ....7
> 
> CPU_DEAD(8)
> 	take_over_work(8->1)
> 
> 					return not flushing dead cpu8 (=BUG)

I'll try to do something else tomorrow. Do you see a simple solution?

The current usage of workqueue_mutex (I mean stable kernel) is broken
and deadlockable. We really need to change it.

Oleg.


^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update
  2007-01-06 16:30               ` Oleg Nesterov
@ 2007-01-06 16:38                 ` Srivatsa Vaddagiri
  2007-01-06 17:34                   ` Oleg Nesterov
  2007-01-06 19:11                   ` Andrew Morton
  0 siblings, 2 replies; 78+ messages in thread
From: Srivatsa Vaddagiri @ 2007-01-06 16:38 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On Sat, Jan 06, 2007 at 07:30:35PM +0300, Oleg Nesterov wrote:
> Stupid me. Thanks.
> 
> I'll try to do something else tomorrow. Do you see a simple solution?

Sigh ..I dont see a simple solution, unless we have something like
lock_cpu_hotplug() ..

Andrew,
	This workqueue problem has exposed a classic example of how 
tough/miserable it can be to write hotplug safe code w/o something like
lock_cpu_hotplug() ..Are you still inclined towards banning it? :)

FYI, the lock_cpu_hotplug() rewrite proposed by Gautham at
http://lkml.org/lkml/2006/10/26/65 may still need refinement to avoid
all the kind of deadlocks we have unearthed with workqueue example. I
can review that design with Gautham if there is some interest to
revive lock_cpu_hotplug() ..

> The current usage of workqueue_mutex (I mean stable kernel) is broken
> and deadlockable. We really need to change it.

Yep ..

-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update
  2007-01-06 16:38                 ` Srivatsa Vaddagiri
@ 2007-01-06 17:34                   ` Oleg Nesterov
  2007-01-07 10:43                     ` Srivatsa Vaddagiri
  2007-01-06 19:11                   ` Andrew Morton
  1 sibling, 1 reply; 78+ messages in thread
From: Oleg Nesterov @ 2007-01-06 17:34 UTC (permalink / raw)
  To: Srivatsa Vaddagiri
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On 01/06, Srivatsa Vaddagiri wrote:
>
> On Sat, Jan 06, 2007 at 07:30:35PM +0300, Oleg Nesterov wrote:
> > Stupid me. Thanks.
> > 
> > I'll try to do something else tomorrow. Do you see a simple soulution?
> 
> Sigh ..I dont see a simple solution, unless we have something like
> lock_cpu_hotplug() ..

I suspect this can't help either.

The problem is that flush_workqueue() may be called while cpu hotplug event
in progress and CPU_DEAD waits for kthread_stop(), so we have the same
deadlock if work->func() does flush_workqueue(). This means that Andrew's change
to use preempt_disable() is good and anyway needed.

I am starting to believe we need some more intrusive changes in workqueue.c.

Oleg.


^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update
  2007-01-06 16:38                 ` Srivatsa Vaddagiri
  2007-01-06 17:34                   ` Oleg Nesterov
@ 2007-01-06 19:11                   ` Andrew Morton
  2007-01-06 19:13                     ` Ingo Molnar
  2007-01-07 11:00                     ` Srivatsa Vaddagiri
  1 sibling, 2 replies; 78+ messages in thread
From: Andrew Morton @ 2007-01-06 19:11 UTC (permalink / raw)
  To: vatsa
  Cc: Oleg Nesterov, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On Sat, 6 Jan 2007 22:08:51 +0530
Srivatsa Vaddagiri <vatsa@in.ibm.com> wrote:

> 	This workqueue problem has exposed a classic example of how 
> tough/miserable it can be to write hotplug safe code w/o something like
> lock_cpu_hotplug() ..Are you still inclined towards banning it? :)

I don't ban stuff - I just advocate ;)

I would still prefer that we not try to invent a new magical lock,
but yes, the current approach is looking troublesome.

> FYI, the lock_cpu_hotplug() rewrite proposed by Gautham at
> http://lkml.org/lkml/2006/10/26/65 may still need refinement to avoid
> all the kind of deadlocks we have unearthed with workqueue example. I
> can review that design with Gautham if there is some interest to
> revive lock_cpu_hotplug() ..

Has anyone thought seriously about using the process freezer in the
cpu-down/cpu-up paths?  That way we don't need to lock anything anywhere?

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update
  2007-01-06 19:11                   ` Andrew Morton
@ 2007-01-06 19:13                     ` Ingo Molnar
  2007-01-07 11:00                     ` Srivatsa Vaddagiri
  1 sibling, 0 replies; 78+ messages in thread
From: Ingo Molnar @ 2007-01-06 19:13 UTC (permalink / raw)
  To: Andrew Morton
  Cc: vatsa, Oleg Nesterov, David Howells, Christoph Hellwig,
	Linus Torvalds, linux-kernel, Gautham shenoy


* Andrew Morton <akpm@osdl.org> wrote:

> > FYI, the lock_cpu_hotplug() rewrite proposed by Gautham at 
> > http://lkml.org/lkml/2006/10/26/65 may still need refinement to 
> > avoid all the kind of deadlocks we have unearthed with workqueue 
> > example. I can review that design with Gautham if there is some 
> > interest to revive lock_cpu_hotplug() ..
> 
> Has anyone thought seriously about using the process freezer in the 
> cpu-down/cpu-up paths?  That way we don't need to lock anything 
> anywhere?

yes, yes, yes - lets please do that! The process freezer is already used 
for suspend, for hibernate and recently for kprobes - so its performance 
and robustness is being relied on and verified from multiple angles. I 
can see no reason why it couldnt be made really fast even on large 
boxes, if the need arises. (but even the current one is fast enough for 
any human-driven CPU hotplug stuff)

	Ingo

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update
  2007-01-06 17:34                   ` Oleg Nesterov
@ 2007-01-07 10:43                     ` Srivatsa Vaddagiri
  2007-01-07 12:56                       ` Oleg Nesterov
  0 siblings, 1 reply; 78+ messages in thread
From: Srivatsa Vaddagiri @ 2007-01-07 10:43 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On Sat, Jan 06, 2007 at 08:34:16PM +0300, Oleg Nesterov wrote:
> I suspect this can't help either.
> 
> The problem is that flush_workqueue() may be called while cpu hotplug event
> in progress and CPU_DEAD waits for kthread_stop(), so we have the same
> deadlock if work->func() does flush_workqueue(). This means that Andrew's change
> to use preempt_disable() is good and anyway needed.

Well ..a lock_cpu_hotplug() in run_workqueue() and support for recursive
calls to lock_cpu_hotplug() by the same thread will avoid the problem
you mention. This will need changes to task_struct to track the
recursion depth. Alternately this can be supported w/o changes to
task_struct by 'biasing' readers over writers as I believe Gautham's 
patches [1] do.

1. http://lkml.org/lkml/2006/10/26/65

-- 
Regards,
vatsa




^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update
  2007-01-06 19:11                   ` Andrew Morton
  2007-01-06 19:13                     ` Ingo Molnar
@ 2007-01-07 11:00                     ` Srivatsa Vaddagiri
  2007-01-07 19:59                       ` Andrew Morton
  1 sibling, 1 reply; 78+ messages in thread
From: Srivatsa Vaddagiri @ 2007-01-07 11:00 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Oleg Nesterov, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On Sat, Jan 06, 2007 at 11:11:17AM -0800, Andrew Morton wrote:
> Has anyone thought seriously about using the process freezer in the
> cpu-down/cpu-up paths?  That way we don't need to lock anything anywhere?

How would this provide a stable access to cpu_online_map in functions
that need to block while accessing it (as flush_workqueue requires)?

-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update
  2007-01-07 10:43                     ` Srivatsa Vaddagiri
@ 2007-01-07 12:56                       ` Oleg Nesterov
  2007-01-07 14:22                         ` Oleg Nesterov
  2007-01-07 16:21                         ` Srivatsa Vaddagiri
  0 siblings, 2 replies; 78+ messages in thread
From: Oleg Nesterov @ 2007-01-07 12:56 UTC (permalink / raw)
  To: Srivatsa Vaddagiri
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On 01/07, Srivatsa Vaddagiri wrote:
>
> On Sat, Jan 06, 2007 at 08:34:16PM +0300, Oleg Nesterov wrote:
> > I suspect this can't help either.
> > 
> > The problem is that flush_workqueue() may be called while cpu hotplug event
> > in progress and CPU_DEAD waits for kthread_stop(), so we have the same dead
> > lock if work->func() does flush_workqueue(). This means that Andrew's change
> > to use preempt_disable() is good and anyway needed.
> 
> Well ..a lock_cpu_hotplug() in run_workqueue() and support for recursive
> calls to lock_cpu_hotplug() by the same thread will avoid the problem
> you mention.

Srivatsa, I'm completely new to cpu-hotplug, so please correct me if I'm
wrong (in fact I _hope_ I am wrong) but as I see it, the hotplug/workqueue
interaction is broken by design, it can't be fixed by changing just locking.

Once again. CPU dies, CPU_DEAD calls kthread_stop() and sleeps until
cwq->thread exits. To do so, this thread must at least complete the
currently running work->func().

work->func() calls flush_workqueue(WQ), it does lock_cpu_hotplug() or
_whatever_. Now the question, does it block?

if YES:
	This is what the stable tree does - deadlock.

if NOT:
	This is what we have with Andrew's s/mutex_lock/preempt_disable/
	patch - race or deadlock, we have a choice.

	Suppose that WQ has pending works on that dead CPU. Note that
	at this point this CPU does not present on cpu_online_map.
	This means that (without other changes) we have lost.

		- flush_workqueue(WQ) can't return until CPU_DEAD transfers
		  these works to some another CPU on the cpu_online_map.

		- CPU_DEAD can't do take_over_work() until flush_workqueue()
		  returns.

Andrew, Ingo, this also means that freezer can't solve this particular
problem either (if i am right).

Thoughts?

Oleg.



^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update
  2007-01-07 12:56                       ` Oleg Nesterov
@ 2007-01-07 14:22                         ` Oleg Nesterov
  2007-01-07 14:42                           ` Oleg Nesterov
  2007-01-07 16:43                           ` Srivatsa Vaddagiri
  2007-01-07 16:21                         ` Srivatsa Vaddagiri
  1 sibling, 2 replies; 78+ messages in thread
From: Oleg Nesterov @ 2007-01-07 14:22 UTC (permalink / raw)
  To: Srivatsa Vaddagiri
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On 01/07, Oleg Nesterov wrote:
>
> Thoughts?

How about:

	CPU_DEAD does nothing. After __cpu_disable() cwq->thread runs on
	all CPUs and becomes idle when it flushes cwq->worklist: nobody
	will add work_struct on that list.

	CPU_UP:
		if (!cwq->thread)
			create_workqueue_thread();
		else
			set_cpus_allowed(newcpu);

	flush_workqueue:
		for_each_possible_cpu()		// NOT online!
			if (cwq->thread)
				flush_cpu_workqueue()

Oleg.


^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update
  2007-01-07 14:22                         ` Oleg Nesterov
@ 2007-01-07 14:42                           ` Oleg Nesterov
  2007-01-07 16:43                           ` Srivatsa Vaddagiri
  1 sibling, 0 replies; 78+ messages in thread
From: Oleg Nesterov @ 2007-01-07 14:42 UTC (permalink / raw)
  To: Srivatsa Vaddagiri
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On 01/07, Oleg Nesterov wrote:
> 
> How about:
> 
> 	CPU_DEAD does nothing. After __cpu_disable() cwq->thread runs on
> 	all CPUs and becomes idle when it flushes cwq->worklist: nobody
> 	will add work_struct on that list.

Also, we can add cpu_workqueue_struct->should_exit_after_flush flag, but
we have to be careful to serialize with destroy_workqueue/CPU_UP.

> 	CPU_UP:
> 		if (!cwq->thread)
> 			create_workqueue_thread();
> 		else
> 			set_cpus_allowed(newcpu);
> 
> 	flush_workqueue:
> 		for_each_possible_cpu()		// NOT online!
> 			if (cwq->thread)
> 				flush_cpu_workqueue()


^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update
  2007-01-07 12:56                       ` Oleg Nesterov
  2007-01-07 14:22                         ` Oleg Nesterov
@ 2007-01-07 16:21                         ` Srivatsa Vaddagiri
  2007-01-07 17:09                           ` Oleg Nesterov
  1 sibling, 1 reply; 78+ messages in thread
From: Srivatsa Vaddagiri @ 2007-01-07 16:21 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On Sun, Jan 07, 2007 at 03:56:03PM +0300, Oleg Nesterov wrote:
> Srivatsa, I'm completely new to cpu-hotplug, so please correct me if I'm
> wrong (in fact I _hope_ I am wrong) but as I see it, the hotplug/workqueue
> interaction is broken by design, it can't be fixed by changing just locking.
> 
> Once again. CPU dies, CPU_DEAD calls kthread_stop() and sleeps until
> cwq->thread exits. To do so, this thread must at least complete the
> currently running work->func().

If run_workqueue() takes a lock_cpu_hotplug() successfully, then we shouldnt 
even reach till this point, as it will block writers (cpu_down/up) until it
completes.


	run_workqueue()
	---------------
	
try_again:
	rc = lock_cpu_hotplug_interruptible();
	
	if (rc && kthread_should_stop())
		return;
	
	if (rc != 0)
		goto try_again;
	
	/* cpu_down/up shouldn't happen now until we call unlock_cpu_hotplug */
	while (!list_empty(..))
		work->func();
	
	unlock_cpu_hotplug();


If work->func() calls something (say flush_workqueue()) which requires a
lock_cpu_hotplug() again, there are two ways to support it:

Method 1: Add a field, hotplug_lock_held, in task_struct

	If current->hotplug_lock_held > 1, then lock_cpu_hotplug()
	merely increments it and returns success. Its counterpart, 
	unlock_cpu_hotplug() will decrement the count.

	Easiest to implement. However additional field is required in
	each task_struct, which may not be attractive for some.

Method 2 : Bias readers over writers:

	This method will support recursive calls to lock_cpu_hotplug()
	by the same thread, w/o requiring a field in task_struct. To 
	accomplish this, readers are biased over writers i.e 


		reader1_lock(); <- success

					writer1_lock(); <- blocks on reader1


		reader2_lock(); <- success

A fair lock would have blocked reader2_lock() until 
writer1_lock()/writer1_unlock() is complete, but since we are required to 
support recursion w/o maintaining a task_struct field, we let reader2_lock() 
succeed, even though it could be from a different thread.
	
> Andrew, Ingo, this also means that freezer can't solve this particular
> problem either (if i am right).

freezer wont give stable access to cpu_online_map either, as could typically be
required in functions like flush_workqueue.

-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update
  2007-01-07 14:22                         ` Oleg Nesterov
  2007-01-07 14:42                           ` Oleg Nesterov
@ 2007-01-07 16:43                           ` Srivatsa Vaddagiri
  2007-01-07 17:01                             ` Srivatsa Vaddagiri
  2007-01-07 17:18                             ` Oleg Nesterov
  1 sibling, 2 replies; 78+ messages in thread
From: Srivatsa Vaddagiri @ 2007-01-07 16:43 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On Sun, Jan 07, 2007 at 05:22:46PM +0300, Oleg Nesterov wrote:
> On 01/07, Oleg Nesterov wrote:
> >
> > Thoughts?
> 
> How about:
> 
> 	CPU_DEAD does nothing. After __cpu_disable() cwq->thread runs on
> 	all CPUs and becomes idle when it flushes cwq->worklist: nobody
	^^^

all except dead cpus that is.

> 	will add work_struct on that list.

If CPU_DEAD does nothing, then the dead cpu's workqueue list may be
non-empty. How will it be flushed, given that no thread can run on the
dead cpu?

We could consider CPU_DEAD moving over work at least (and not killing
worker threads also). In that case, cwq->thread can flush its work,
however it now requires serialization among worker threads, since more
than one worker thread can now be servicing the same CPU's workqueue
list (this will beat the very purpose of maintaining per-cpu threads to
avoid synchronization between them).

Finally, I am concerned about the (un)friendliness of this programming
model, where programmers are restricted in not having a stable access to
cpu_online_map at all -and- also requiring them to code in non-obvious
terms. Granted that writing hotplug-safe code is non-trivial, but the
absence of "safe access to online_map" will make it more complicated.

-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update
  2007-01-07 16:43                           ` Srivatsa Vaddagiri
@ 2007-01-07 17:01                             ` Srivatsa Vaddagiri
  2007-01-07 17:33                               ` Oleg Nesterov
  2007-01-07 17:18                             ` Oleg Nesterov
  1 sibling, 1 reply; 78+ messages in thread
From: Srivatsa Vaddagiri @ 2007-01-07 17:01 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On Sun, Jan 07, 2007 at 10:13:44PM +0530, Srivatsa Vaddagiri wrote:
> If CPU_DEAD does nothing, then the dead cpu's workqueue list may be
> non-empty. How will it be flushed, given that no thread can run on the
> dead cpu?
> 
> We could consider CPU_DEAD moving over work atleast (and not killing
> worker threads also). In that case, cwq->thread can flush its work,
> however it now requires serialization among worker threads, since more
> than one worker thread can now be servicing the same CPU's workqueue
> list (this will beat the very purpose of maintaining per-cpu threads to
> avoid synchronization between them).

I guess you could have cwq->thread flush only its cpu's workqueue by
running on another cpu, which will avoid the need to synchronize
between worker threads. I am not 100% sure if that breaks workqueue
model in any way (since we could have two worker threads running on the
same CPU, but servicing different queues). Hopefully it doesnt.

However the concern expressed below remains ..

> Finally, I am concerned about the (un)friendliness of this programming
> model, where programmers are restricted in not having a stable access to
> cpu_online_map at all -and- also requiring them to code in non-obvious
> terms. Granted that writing hotplug-safe code is non-trivial, but the
> absence of "safe access to online_map" will make it more complicated.

-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update
  2007-01-07 16:21                         ` Srivatsa Vaddagiri
@ 2007-01-07 17:09                           ` Oleg Nesterov
  0 siblings, 0 replies; 78+ messages in thread
From: Oleg Nesterov @ 2007-01-07 17:09 UTC (permalink / raw)
  To: Srivatsa Vaddagiri
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On 01/07, Srivatsa Vaddagiri wrote:
>
> On Sun, Jan 07, 2007 at 03:56:03PM +0300, Oleg Nesterov wrote:
> > Srivatsa, I'm completely new to cpu-hotplug, so please correct me if I'm
> > wrong (in fact I _hope_ I am wrong) but as I see it, the hotplug/workqueue
> > interaction is broken by design, it can't be fixed by changing just locking.
> > 
> > Once again. CPU dies, CPU_DEAD calls kthread_stop() and sleeps until
> > cwq->thread exits. To do so, this thread must at least complete the
> > currently running work->func().
> 
> If run_workqueue() takes a lock_cpu_hotplug() successfully, then we shouldnt 
> even reach till this point, as it will block writers (cpu_down/up) until it
> completes.
> 
> 
> 	run_workqueue()
> 	---------------
> 	
> try_again:
> 	rc = lock_cpu_hotplug_interruptible();
> 	
> 	if (rc && kthread_should_stop())
> 		return;
> 	
> 	if (rc != 0)
> 		goto try_again;
> 	
> 	/* cpu_down/up shouldnt happen now untill we call unlock_cpu_hotplug */
> 	while (!list_empty(..))
> 		work->func();

This means that every work->func() which may sleep delays cpu_down/up unpredictably,
not good. What about work->func which sleeps then re-queues itself? I guess we can
solve this, but this is what I said "other changes".

Also, lock_cpu_hotplug() should be per-cpu, otherwise we have livelock.

Not that I am against lock_cpu_hotplug (I can't judge), but its usage in run_workqueue
looks like complication to me. I may be wrong. But the main problem we don't have it :)

Oleg.


^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update
  2007-01-07 16:43                           ` Srivatsa Vaddagiri
  2007-01-07 17:01                             ` Srivatsa Vaddagiri
@ 2007-01-07 17:18                             ` Oleg Nesterov
  1 sibling, 0 replies; 78+ messages in thread
From: Oleg Nesterov @ 2007-01-07 17:18 UTC (permalink / raw)
  To: Srivatsa Vaddagiri
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On 01/07, Srivatsa Vaddagiri wrote:
>
> On Sun, Jan 07, 2007 at 05:22:46PM +0300, Oleg Nesterov wrote:
> > On 01/07, Oleg Nesterov wrote:
> > >
> > > Thoughts?
> > 
> > How about:
> > 
> > 	CPU_DEAD does nothing. After __cpu_disable() cwq->thread runs on
> > 	all CPUs and becomes idle when it flushes cwq->worklist: nobody
> 	^^^
> 
> all except dead cpus that is.

yes, of course.

> 
> > 	will add work_struct on that list.
> 
> If CPU_DEAD does nothing, then the dead cpu's workqueue list may be
> non-empty. How will it be flushed, given that no thread can run on the
> dead cpu?

But cwq->thread is not bound to the dead CPU at this point, it was already
migrated (like all other threads which had that CPU in ->cpus_allowed).

> Finally, I am concerned about the (un)friendliness of this programming
> model, where programmers are restricted in not having a stable access to
> cpu_online_map at all -and- also requiring them to code in non-obvious
> terms. Granted that writing hotplug-safe code is non-trivial, but the
> absence of "safe access to online_map" will make it more complicated.

I guess you misunderstood me, I meant CPU_DEAD does nothing only in
workqueue.c:workqueue_cpu_callback().

Oleg.


^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update
  2007-01-07 17:01                             ` Srivatsa Vaddagiri
@ 2007-01-07 17:33                               ` Oleg Nesterov
  0 siblings, 0 replies; 78+ messages in thread
From: Oleg Nesterov @ 2007-01-07 17:33 UTC (permalink / raw)
  To: Srivatsa Vaddagiri
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On 01/07, Srivatsa Vaddagiri wrote:
>
> On Sun, Jan 07, 2007 at 10:13:44PM +0530, Srivatsa Vaddagiri wrote:
> 
> I guess you could have cwq->thread flush only it's cpu's workqueue by
> running on another cpu,

yes, this is what I meant,

>                           which will avoid the need to synchronize
> between worker threads. I am not 100% sure if that breaks workqueue
> model in any way (since we could have two worker threads running on the
> same CPU, but servicing different queues). Hopefully it doesnt.

We are already doing this on CPU_DEAD->kthread_stop().

> However the concern expressed below remains ..
> 
> > Finally, I am concerned about the (un)friendliness of this programming
> > model, where programmers are restricted in not having a stable access to
> > cpu_online_map at all -and- also requiring them to code in non-obvious
> > terms. Granted that writing hotplug-safe code is non-trivial, but the
> > absence of "safe access to online_map" will make it more complicated.

please see the previous message.

Srivatsa, I don't claim my idea is the best. Actually I still hope somebody
else will suggest something better and simpler :)

Oleg.


^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update
  2007-01-07 11:00                     ` Srivatsa Vaddagiri
@ 2007-01-07 19:59                       ` Andrew Morton
  2007-01-07 21:01                         ` [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist Oleg Nesterov
                                           ` (2 more replies)
  0 siblings, 3 replies; 78+ messages in thread
From: Andrew Morton @ 2007-01-07 19:59 UTC (permalink / raw)
  To: vatsa
  Cc: Oleg Nesterov, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On Sun, 7 Jan 2007 16:30:13 +0530
Srivatsa Vaddagiri <vatsa@in.ibm.com> wrote:

> On Sat, Jan 06, 2007 at 11:11:17AM -0800, Andrew Morton wrote:
> > Has anyone thought seriously about using the process freezer in the
> > cpu-down/cpu-up paths?  That way we don't need to lock anything anywhere?
> 
> How would this provide a stable access to cpu_online_map in functions
> that need to block while accessing it (as flush_workqueue requires)?

If a thread simply blocks, that will not permit a cpu plug/unplug to proceed.

The thread had to explicitly call try_to_freeze().  CPU plug/unplug will
not occur (and cpu_online_map will not change) until every process in the
machine has called try_to_freeze()).

So the problem which you're referring to will only occur if a workqueue
callback function calls try_to_freeze(), which would be mad.



Plus flush_workqueue() is on the way out.  We're slowly edging towards a
working cancel_work() which will only block if the work which you're trying
to cancel is presently running.  With that, pretty much all the
flush_workqueue() calls go away, and all these accidental rarely-occurring
deadlocks go away too.


^ permalink raw reply	[flat|nested] 78+ messages in thread

* [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist
  2007-01-07 19:59                       ` Andrew Morton
@ 2007-01-07 21:01                         ` Oleg Nesterov
  2007-01-08 23:54                           ` Andrew Morton
  2007-01-07 21:51                         ` [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update Oleg Nesterov
  2007-01-08 15:37                         ` Srivatsa Vaddagiri
  2 siblings, 1 reply; 78+ messages in thread
From: Oleg Nesterov @ 2007-01-07 21:01 UTC (permalink / raw)
  To: Andrew Morton
  Cc: vatsa, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

Now when we have ->current_work we can avoid adding a barrier and waiting for
its completion when cwq's queue is empty.

Note: this change is also useful if we change flush_workqueue() to also check
the dead CPUs.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>

--- mm-6.20-rc3/kernel/workqueue.c~1_opt	2007-01-07 23:15:50.000000000 +0300
+++ mm-6.20-rc3/kernel/workqueue.c	2007-01-07 23:26:45.000000000 +0300
@@ -405,12 +405,15 @@ static void wq_barrier_func(struct work_
 	complete(&barr->done);
 }
 
-static inline void init_wq_barrier(struct wq_barrier *barr)
+static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
+					struct wq_barrier *barr, int tail)
 {
 	INIT_WORK(&barr->work, wq_barrier_func);
 	__set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
 
 	init_completion(&barr->done);
+
+	insert_work(cwq, &barr->work, tail);
 }
 
 static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
@@ -429,13 +432,20 @@ static void flush_cpu_workqueue(struct c
 		preempt_disable();
 	} else {
 		struct wq_barrier barr;
+		int active = 0;
 
-		init_wq_barrier(&barr);
-		__queue_work(cwq, &barr.work);
+		spin_lock_irq(&cwq->lock);
+		if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
+			insert_wq_barrier(cwq, &barr, 1);
+			active = 1;
+		}
+		spin_unlock_irq(&cwq->lock);
 
-		preempt_enable();	/* Can no longer touch *cwq */
-		wait_for_completion(&barr.done);
-		preempt_disable();
+		if (active) {
+			preempt_enable();
+			wait_for_completion(&barr.done);
+			preempt_disable();
+		}
 	}
 }
 
@@ -482,8 +492,7 @@ static void wait_on_work(struct cpu_work
 
 	spin_lock_irq(&cwq->lock);
 	if (unlikely(cwq->current_work == work)) {
-		init_wq_barrier(&barr);
-		insert_work(cwq, &barr.work, 0);
+		insert_wq_barrier(cwq, &barr, 0);
 		running = 1;
 	}
 	spin_unlock_irq(&cwq->lock);


^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update
  2007-01-07 19:59                       ` Andrew Morton
  2007-01-07 21:01                         ` [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist Oleg Nesterov
@ 2007-01-07 21:51                         ` Oleg Nesterov
  2007-01-08 15:22                           ` Srivatsa Vaddagiri
  2007-01-08 15:37                         ` Srivatsa Vaddagiri
  2 siblings, 1 reply; 78+ messages in thread
From: Oleg Nesterov @ 2007-01-07 21:51 UTC (permalink / raw)
  To: Andrew Morton
  Cc: vatsa, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On 01/07, Andrew Morton wrote:
>
> Plus flush_workqueue() is on the way out.  We're slowly edging towards a
> working cancel_work() which will only block if the work which you're trying
> to cancel is presently running.  With that, pretty much all the
> flush_workqueue() calls go away, and all these accidental rarely-occurring
> deadlocks go away too.

So. If we can forget about the race we have - fine. Otherwise, how about the
patch below? It is untested and needs a review. I can't suggest any simpler
now.

Change flush_workqueue() to use for_each_possible_cpu(). This means that
flush_cpu_workqueue() may hit CPU which is already dead. However in that
case

	if (!list_empty(&cwq->worklist) || cwq->current_work != NULL)

means that CPU_DEAD in progress, it will do kthread_stop() + take_over_work()
so we can proceed and insert a barrier. We hold cwq->lock, so we are safe.

This patch replaces fix-flush_workqueue-vs-cpu_dead-race.patch which was
broken by switching to preempt_disable (now we don't need locking at all).
Note that migrate_sequence (was hotplug_sequence) is incremented under
cwq->lock. Since flush_workqueue does lock/unlock of cwq->lock on all CPUs,
it must see the new value if take_over_work() happened before we checked
this cwq, and this is the case we should worry about: otherwise we added
a barrier.

Srivatsa?

--- mm-6.20-rc3/kernel/workqueue.c~2_race	2007-01-08 00:07:07.000000000 +0300
+++ mm-6.20-rc3/kernel/workqueue.c	2007-01-08 00:28:55.000000000 +0300
@@ -65,6 +65,7 @@ struct workqueue_struct {
 
 /* All the per-cpu workqueues on the system, for hotplug cpu to add/remove
    threads to each one as cpus come/go. */
+static long migrate_sequence __read_mostly;
 static DEFINE_MUTEX(workqueue_mutex);
 static LIST_HEAD(workqueues);
 
@@ -422,13 +423,7 @@ static void flush_cpu_workqueue(struct c
 		 * Probably keventd trying to flush its own queue. So simply run
 		 * it by hand rather than deadlocking.
 		 */
-		preempt_enable();
-		/*
-		 * We can still touch *cwq here because we are keventd, and
-		 * hot-unplug will be waiting us to exit.
-		 */
 		run_workqueue(cwq);
-		preempt_disable();
 	} else {
 		struct wq_barrier barr;
 		int active = 0;
@@ -441,9 +436,7 @@ static void flush_cpu_workqueue(struct c
 		spin_unlock_irq(&cwq->lock);
 
 		if (active) {
-			preempt_enable();
 			wait_for_completion(&barr.done);
-			preempt_disable();
 		}
 	}
 }
@@ -463,17 +456,21 @@ static void flush_cpu_workqueue(struct c
  */
 void fastcall flush_workqueue(struct workqueue_struct *wq)
 {
-	preempt_disable();		/* CPU hotplug */
 	if (is_single_threaded(wq)) {
 		/* Always use first cpu's area. */
 		flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu));
 	} else {
+		long sequence;
 		int cpu;
+again:
+		sequence = migrate_sequence;
 
-		for_each_online_cpu(cpu)
+		for_each_possible_cpu(cpu)
 			flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
+
+		if (unlikely(sequence != migrate_sequence))
+			goto again;
 	}
-	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(flush_workqueue);
 
@@ -545,18 +542,22 @@ out:
 }
 EXPORT_SYMBOL_GPL(flush_work);
 
-static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
-						   int cpu, int freezeable)
+static void init_cpu_workqueue(struct workqueue_struct *wq,
+			struct cpu_workqueue_struct *cwq, int freezeable)
 {
-	struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
-	struct task_struct *p;
-
 	spin_lock_init(&cwq->lock);
 	cwq->wq = wq;
 	cwq->thread = NULL;
 	cwq->freezeable = freezeable;
 	INIT_LIST_HEAD(&cwq->worklist);
 	init_waitqueue_head(&cwq->more_work);
+}
+
+static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
+						   int cpu)
+{
+	struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
+	struct task_struct *p;
 
 	if (is_single_threaded(wq))
 		p = kthread_create(worker_thread, cwq, "%s", wq->name);
@@ -589,15 +590,20 @@ struct workqueue_struct *__create_workqu
 	mutex_lock(&workqueue_mutex);
 	if (singlethread) {
 		INIT_LIST_HEAD(&wq->list);
-		p = create_workqueue_thread(wq, singlethread_cpu, freezeable);
+		init_cpu_workqueue(wq, per_cpu_ptr(wq->cpu_wq, singlethread_cpu),
+					freezeable);
+		p = create_workqueue_thread(wq, singlethread_cpu);
 		if (!p)
 			destroy = 1;
 		else
 			wake_up_process(p);
 	} else {
 		list_add(&wq->list, &workqueues);
+		for_each_possible_cpu(cpu)
+			init_cpu_workqueue(wq, per_cpu_ptr(wq->cpu_wq, cpu),
+						freezeable);
 		for_each_online_cpu(cpu) {
-			p = create_workqueue_thread(wq, cpu, freezeable);
+			p = create_workqueue_thread(wq, cpu);
 			if (p) {
 				kthread_bind(p, cpu);
 				wake_up_process(p);
@@ -833,6 +839,7 @@ static void take_over_work(struct workqu
 
 	spin_lock_irq(&cwq->lock);
 	list_replace_init(&cwq->worklist, &list);
+	migrate_sequence++;
 
 	while (!list_empty(&list)) {
 		printk("Taking work for %s\n", wq->name);
@@ -859,7 +866,7 @@ static int __devinit workqueue_cpu_callb
 	case CPU_UP_PREPARE:
 		/* Create a new workqueue thread for it. */
 		list_for_each_entry(wq, &workqueues, list) {
-			if (!create_workqueue_thread(wq, hotcpu, 0)) {
+			if (!create_workqueue_thread(wq, hotcpu)) {
 				printk("workqueue for %i failed\n", hotcpu);
 				return NOTIFY_BAD;
 			}


^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update
  2007-01-07 21:51                         ` [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update Oleg Nesterov
@ 2007-01-08 15:22                           ` Srivatsa Vaddagiri
  2007-01-08 15:56                             ` Oleg Nesterov
  0 siblings, 1 reply; 78+ messages in thread
From: Srivatsa Vaddagiri @ 2007-01-08 15:22 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On Mon, Jan 08, 2007 at 12:51:03AM +0300, Oleg Nesterov wrote:
> Change flush_workqueue() to use for_each_possible_cpu(). This means that
> flush_cpu_workqueue() may hit CPU which is already dead. However in that
> case
> 
> 	if (!list_empty(&cwq->worklist) || cwq->current_work != NULL)
> 
> means that CPU_DEAD in progress, it will do kthread_stop() + take_over_work()
> so we can proceed and insert a barrier. We hold cwq->lock, so we are safe.
> 
> This patch replaces fix-flush_workqueue-vs-cpu_dead-race.patch which was
> broken by switching to preempt_disable (now we don't need locking at all).
> Note that migrate_sequence (was hotplug_sequence) is incremented under
> cwq->lock. Since flush_workqueue does lock/unlock of cwq->lock on all CPUs,
> it must see the new value if take_over_work() happened before we checked
> this cwq, and this is the case we should worry about: otherwise we added
> a barrier.
> 
> Srivatsa?

This is head-spinning :)

Spotted atleast these problems:

1. run_workqueue()->work.func()->flush_work()->mutex_lock(workqueue_mutex)
   deadlocks if we are blocked in cleanup_workqueue_thread()->kthread_stop() 
   for the same worker thread to exit.

   Looks possible in practice to me.

2. 
     
CPU_DEAD->cleanup_workqueue_thread->(cwq->thread = NULL)->kthread_stop() ..
				    ^^^^^^^^^^^^^^^^^^^^
						|___ Problematic

Now while we are blocked here, if a work->func() calls
flush_workqueue->flush_cpu_workqueue, we clearly cant identify that event 
thread is trying to flush its own queue (cwq->thread == current test
fails) and hence we will deadlock.

A lock_cpu_hotplug(), or any other ability to block concurrent hotplug 
operations from happening, in run_workqueue would have avoided both the above
races.

Alternatively, for the second race, I guess we can avoid setting 
cwq->thread = NULL in cleanup_workqueue_thread() till the thread has exited, 
but I am not sure if that opens up any other race. The first race seems
harder to fix ..

I wonder if spin (spinroot.com) or some other formal model can make this job of
spotting-races easier for us ..

-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update
  2007-01-07 19:59                       ` Andrew Morton
  2007-01-07 21:01                         ` [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist Oleg Nesterov
  2007-01-07 21:51                         ` [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update Oleg Nesterov
@ 2007-01-08 15:37                         ` Srivatsa Vaddagiri
  2 siblings, 0 replies; 78+ messages in thread
From: Srivatsa Vaddagiri @ 2007-01-08 15:37 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Oleg Nesterov, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On Sun, Jan 07, 2007 at 11:59:57AM -0800, Andrew Morton wrote:
> > How would this provide a stable access to cpu_online_map in functions
> > that need to block while accessing it (as flush_workqueue requires)?
> 
> If a thread simply blocks, that will not permit a cpu plug/unplug to proceed.
> 
> The thread had to explicitly call try_to_freeze().  CPU plug/unplug will
> not occur (and cpu_online_map will not change) until every process in the
> machine has called try_to_freeze()).

Maybe my misunderstanding of the code, but:

Looking at the code, it seems to me that try_to_freeze() will be called very 
likely from the signal-delivery path (get_signal_to_deliver()). Isnt
that correct? If so, consider a thread as below:


 Thread1 :
	for_each_online_cpu() { online_map = 0x1111 at this point }
		do_some_thing();
			kmalloc(); <- blocks


Can't Thread1 be frozen in the above blocking state w/o it voluntarily 
calling try_to_freeze?  If so, online_map can change when it returns
from kmalloc() ..

> So the problem which you're referring to will only occur if a workqueue
> callback function calls try_to_freeze(), which would be mad.
> 
> 
> Plus flush_workqueue() is on the way out.  We're slowly edging towards a
> working cancel_work() which will only block if the work which you're trying
> to cancel is presently running.  With that, pretty much all the
> flush_workqueue() calls go away, and all these accidental rarely-occurring
> deadlocks go away too.

Fundamentally, I think it is important to give the ability to block
concurrent hotplug operations from happening ..

-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update
  2007-01-08 15:22                           ` Srivatsa Vaddagiri
@ 2007-01-08 15:56                             ` Oleg Nesterov
  2007-01-08 16:31                               ` Srivatsa Vaddagiri
  0 siblings, 1 reply; 78+ messages in thread
From: Oleg Nesterov @ 2007-01-08 15:56 UTC (permalink / raw)
  To: Srivatsa Vaddagiri
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On 01/08, Srivatsa Vaddagiri wrote:
>
> On Mon, Jan 08, 2007 at 12:51:03AM +0300, Oleg Nesterov wrote:
> > Change flush_workqueue() to use for_each_possible_cpu(). This means that
> > flush_cpu_workqueue() may hit CPU which is already dead. However in that
> > case
> > 
> > 	if (!list_empty(&cwq->worklist) || cwq->current_work != NULL)
> > 
> > means that CPU_DEAD in progress, it will do kthread_stop() + take_over_work()
> > so we can proceed and insert a barrier. We hold cwq->lock, so we are safe.
> > 
> > This patch replaces fix-flush_workqueue-vs-cpu_dead-race.patch which was
> > broken by switching to preempt_disable (now we don't need locking at all).
> > Note that migrate_sequence (was hotplug_sequence) is incremented under
> > cwq->lock. Since flush_workqueue does lock/unlock of cwq->lock on all CPUs,
> > it must see the new value if take_over_work() happened before we checked
> > this cwq, and this is the case we should worry about: otherwise we added
> > a barrier.
> > 
> > Srivatsa?
> 
> This is head-spinning :)

Thank you for review!

> Spotted atleast these problems:
> 
> 1. run_workqueue()->work.func()->flush_work()->mutex_lock(workqueue_mutex)
>    deadlocks if we are blocked in cleanup_workqueue_thread()->kthread_stop() 
>    for the same worker thread to exit.
> 
>    Looks possible in practice to me.

Yes, this is the same (old) problem as we have/had with flush_workqueue().
We can convert flush_work() to use preempt_disable (this is not straightforward,
but easy), or forbid to call flush_work() from work.func().

This patch doesn't touch this problem.

> 2. 
>      
> CPU_DEAD->cleanup_workqueue_thread->(cwq->thread = NULL)->kthread_stop() ..
> 				    ^^^^^^^^^^^^^^^^^^^^
> 						|___ Problematic

Hmm... This should not be possible? cwq->thread != NULL on CPU_DEAD event.
Even IF it was NULL, we don't call kthread_stop() in that case.

> Now while we are blocked here, if a work->func() calls
> flush_workqueue->flush_cpu_workqueue, we clearly cant identify that event 
> thread is trying to flush its own queue (cwq->thread == current test
> fails) and hence we will deadlock.

Could you clarify? I believe cwq->thread == current test always works, we never
"substitute" cwq->thread.

> A lock_cpu_hotplug(), or any other ability to block concurrent hotplug 
> operations from happening, in run_workqueue would have avoided both the above
> races.

I still don't think this is a good idea. We also need
	is_cpu_down_waits_for_lock_cpu_hotplug()

helper, otherwise we have a deadlock if work->func() sleeps and re-queues itself.

> Alternatively, for the second race, I guess we can avoid setting 
> cwq->thread = NULL in cleanup_workqueue_thread() till the thread has exited,

Yes, http://marc.theaimsgroup.com/?l=linux-kernel&m=116818097927685, I believe
we can do this later. This way workqueue will have almost zero interaction
with cpu-hotplug, and cpu UP/DOWN event won't be delayed by sleeping work.func().
take_over_work() can go away, this also allows us to simplify things.

Thanks!

Oleg.


^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update
  2007-01-08 15:56                             ` Oleg Nesterov
@ 2007-01-08 16:31                               ` Srivatsa Vaddagiri
  2007-01-08 17:06                                 ` Oleg Nesterov
  0 siblings, 1 reply; 78+ messages in thread
From: Srivatsa Vaddagiri @ 2007-01-08 16:31 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On Mon, Jan 08, 2007 at 06:56:38PM +0300, Oleg Nesterov wrote:
> > Spotted atleast these problems:
> >
> > 1. run_workqueue()->work.func()->flush_work()->mutex_lock(workqueue_mutex)
> >    deadlocks if we are blocked in cleanup_workqueue_thread()->kthread_stop()
> >    for the same worker thread to exit.
> >
> >    Looks possible in practice to me.
> 
> Yes, this is the same (old) problem as we have/had with flush_workqueue().
> We can convert flush_work() to use preempt_disable (this is not straightforward,
> but easy), or forbid to call flush_work() from work.func().

I think I noticed other problems of avoiding workqueue_mutex() in
flush_work() ..don't recall the exact problem.

> > 2.
> >
> > CPU_DEAD->cleanup_workqueue_thread->(cwq->thread = NULL)->kthread_stop() ..
> > 				    ^^^^^^^^^^^^^^^^^^^^
> > 						|___ Problematic
> 
> Hmm... This should not be possible? cwq->thread != NULL on CPU_DEAD event.

sure, cwq->thread != NULL at CPU_DEAD event. However
cleanup_workqueue_thread() will set it to NULL and block in
kthread_stop(), waiting for the kthread to finish run_workqueue and
exit. If one of the work functions being run by run_workqueue() calls
flush_workqueue()->flush_cpu_workqueue() now, flush_cpu_workqueue() can fail to 
recognize that "keventd is trying to flush its own queue" which can
cause deadlocks.

> > Now while we are blocked here, if a work->func() calls
> > flush_workqueue->flush_cpu_workqueue, we clearly cant identify that event
> > thread is trying to flush its own queue (cwq->thread == current test
> > fails) and hence we will deadlock.
> 
> Could you clarify? I believe cwq->thread == current test always works, we never
> "substitute" cwq->thread.

The test fails in the window described above.

> > A lock_cpu_hotplug(), or any other ability to block concurrent hotplug
> > operations from happening, in run_workqueue would have avoided both the above
> > races.
> 
> I still don't think this is a good idea. We also need
> 	is_cpu_down_waits_for_lock_cpu_hotplug()
> 
> helper, otherwise we have a deadlock if work->func() sleeps and re-queues itself.

Can you elaborate this a bit?

> > Alternatively, for the second race, I guess we can avoid setting
> > cwq->thread = NULL in cleanup_workqueue_thread() till the thread has exited,
> 
> Yes, http://marc.theaimsgroup.com/?l=linux-kernel&m=116818097927685, I believe
> we can do this later. This way workqueue will have almost zero interaction
> with cpu-hotplug, and cpu UP/DOWN event won't be delayed by sleeping work.func().
> take_over_work() can go away, this also allows us to simplify things.

I agree it minimizes the interactions. Maybe worth attempting. However I
suspect it may not be as simple as it appears :)


-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update
  2007-01-08 16:31                               ` Srivatsa Vaddagiri
@ 2007-01-08 17:06                                 ` Oleg Nesterov
  2007-01-08 18:37                                   ` Pallipadi, Venkatesh
  2007-01-09  4:39                                   ` Srivatsa Vaddagiri
  0 siblings, 2 replies; 78+ messages in thread
From: Oleg Nesterov @ 2007-01-08 17:06 UTC (permalink / raw)
  To: Srivatsa Vaddagiri
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On 01/08, Srivatsa Vaddagiri wrote:
>
> On Mon, Jan 08, 2007 at 06:56:38PM +0300, Oleg Nesterov wrote:
> > > 2.
> > >
> > > CPU_DEAD->cleanup_workqueue_thread->(cwq->thread = NULL)->kthread_stop() ..
> > > 				    ^^^^^^^^^^^^^^^^^^^^
> > > 						|___ Problematic
> > 
> > Hmm... This should not be possible? cwq->thread != NULL on CPU_DEAD event.
> 
> sure, cwq->thread != NULL at CPU_DEAD event. However
> cleanup_workqueue_thread() will set it to NULL and block in
> kthread_stop(), waiting for the kthread to finish run_workqueue and
> exit.

Ah, missed your point, thanks. Yet another old problem which was not introduced
by recent changes. And yet another indication we should avoid kthread_stop()
on CPU_DEAD event :) I believe this is easy to fix, but need to think more.

> > > A lock_cpu_hotplug(), or any other ability to block concurrent hotplug
> > > operations from happening, in run_workqueue would have avoided both the above
> > > races.
> > 
> > I still don't think this is a good idea. We also need
> > 	is_cpu_down_waits_for_lock_cpu_hotplug()
> > 
> > helper, otherwise we have a deadlock if work->func() sleeps and re-queues itself.
> 
> Can you elaborate this a bit?

If work->func() re-queues itself, run_workqueue() never returns because
->worklist is never empty. This means we should somehow check and detect
that cpu-hotplug blocked because we hold lock_cpu_hotplug(). In that case
run_workqueue() should return, and drop the lock. This will complicate
worker_thread/run_workqueue further.

	run_workqueue:

		while (!list_empty(&cwq->worklist)) {
			...
			// We hold lock_cpu_hotplug(), cpu event can't make
			// progress. 
			...
		}

> > Yes, http://marc.theaimsgroup.com/?l=linux-kernel&m=116818097927685, I believe
> > we can do this later. This way workqueue will have almost zero interaction
> > with cpu-hotplug, and cpu UP/DOWN event won't be delayed by sleeping work.func().
> > take_over_work() can go away, this also allows us to simplify things.
> 
> I agree it minimizes the interactions. Maybe worth attempting. However I
> suspect it may not be as simple as it appears :)

Yes, that is why this patch only does the first step: flush_workqueue() checks
the dead CPUs as well, this change is minimal.

Do you see any problems this patch adds?

Oleg.


^ permalink raw reply	[flat|nested] 78+ messages in thread

* RE: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update
  2007-01-08 17:06                                 ` Oleg Nesterov
@ 2007-01-08 18:37                                   ` Pallipadi, Venkatesh
  2007-01-09  1:11                                     ` Srivatsa Vaddagiri
  2007-01-09  4:39                                   ` Srivatsa Vaddagiri
  1 sibling, 1 reply; 78+ messages in thread
From: Pallipadi, Venkatesh @ 2007-01-08 18:37 UTC (permalink / raw)
  To: Oleg Nesterov, Srivatsa Vaddagiri
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

 

>-----Original Message-----
>From: linux-kernel-owner@vger.kernel.org 
>[mailto:linux-kernel-owner@vger.kernel.org] On Behalf Of Oleg Nesterov
>Sent: Monday, January 08, 2007 9:07 AM
>To: Srivatsa Vaddagiri
>Cc: Andrew Morton; David Howells; Christoph Hellwig; Ingo 
>Molnar; Linus Torvalds; linux-kernel@vger.kernel.org; Gautham shenoy
>Subject: Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update
>
>On 01/08, Srivatsa Vaddagiri wrote:
>>
>> On Mon, Jan 08, 2007 at 06:56:38PM +0300, Oleg Nesterov wrote:
>> > > 2.
>> > >
>> > > CPU_DEAD->cleanup_workqueue_thread->(cwq->thread = 
>NULL)->kthread_stop() ..
>> > > 				    ^^^^^^^^^^^^^^^^^^^^
>> > > 						|___ Problematic
>> > 
>> > Hmm... This should not be possible? cwq->thread != NULL on 
>CPU_DEAD event.
>> 
>> sure, cwq->thread != NULL at CPU_DEAD event. However
>> cleanup_workqueue_thread() will set it to NULL and block in
>> kthread_stop(), waiting for the kthread to finish run_workqueue and
>> exit.
>
>Ah, missed your point, thanks. Yet another old problem which 
>was not introduced
>by recent changes. And yet another indication we should avoid 
>kthread_stop()
>on CPU_DEAD event :) I believe this is easy to fix, but need 
>to think more.

The current code in the workqueue-hotplug path is full of races. I stumbled
upon at least a couple of different deadlock situations being discussed
here with ondemand governor using workqueue and trying to flush during
cpu hot remove.

Specifically, a three way deadlock involving kthread_stop() with
workqueue_mutex held and work itself blocked on some other mutex held by
another task trying to flush the workqueue.

One other approach I was thinking about, was to do all the hardwork in
workqueue CPU_DOWN_PREPARE callback rather than in CPU_DEAD.
We can call cleanup_workqueue_thread and take_over_work in DOWN_PREPARE,
With that, I don't think we need to hold the workqueue_mutex across 
these two callbacks and eliminate the deadlocks related to
flush_workqueue.
Do you think this approach would simplify things around here?

Thanks,
Venki 

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist
  2007-01-07 21:01                         ` [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist Oleg Nesterov
@ 2007-01-08 23:54                           ` Andrew Morton
  2007-01-09  5:04                             ` Srivatsa Vaddagiri
  0 siblings, 1 reply; 78+ messages in thread
From: Andrew Morton @ 2007-01-08 23:54 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: vatsa, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy


Just a sad little note that I've utterly lost the plot on the workqueue
changes.

schedule_on_each_cpu-use-preempt_disable.patch
reimplement-flush_workqueue.patch
implement-flush_work.patch
implement-flush_work-sanity.patch
implement-flush_work_keventd.patch
flush_workqueue-use-preempt_disable-to-hold-off-cpu-hotplug.patch
flush_cpu_workqueue-dont-flush-an-empty-worklist.patch
aio-use-flush_work.patch
kblockd-use-flush_work.patch
relayfs-use-flush_keventd_work.patch
tg3-use-flush_keventd_work.patch
e1000-use-flush_keventd_work.patch
libata-use-flush_work.patch
phy-use-flush_work.patch
#bridge-avoid-using-noautorel-workqueues.patch
#
extend-notifier_call_chain-to-count-nr_calls-made.patch
extend-notifier_call_chain-to-count-nr_calls-made-fixes.patch
extend-notifier_call_chain-to-count-nr_calls-made-fixes-2.patch
define-and-use-new-eventscpu_lock_acquire-and-cpu_lock_release.patch
define-and-use-new-eventscpu_lock_acquire-and-cpu_lock_release-fix.patch
eliminate-lock_cpu_hotplug-in-kernel-schedc.patch
eliminate-lock_cpu_hotplug-in-kernel-schedc-fix.patch
handle-cpu_lock_acquire-and-cpu_lock_release-in-workqueue_cpu_callback.patch
fix-flush_workqueue-vs-cpu_dead-race.patch

I don't know which of these are good, bad or indifferent.

Furthermore I don't know which of these need to be tossed overboard if/when
we get around to using the task freezer for CPU hotplug synchronisation. 
Hopefully, a lot of them.  I don't really understand why we're continuing
to struggle with the existing approach before that question is settled.


^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update
  2007-01-08 18:37                                   ` Pallipadi, Venkatesh
@ 2007-01-09  1:11                                     ` Srivatsa Vaddagiri
  0 siblings, 0 replies; 78+ messages in thread
From: Srivatsa Vaddagiri @ 2007-01-09  1:11 UTC (permalink / raw)
  To: Pallipadi, Venkatesh
  Cc: Oleg Nesterov, Andrew Morton, David Howells, Christoph Hellwig,
	Ingo Molnar, Linus Torvalds, linux-kernel, Gautham shenoy

On Mon, Jan 08, 2007 at 10:37:25AM -0800, Pallipadi, Venkatesh wrote:
> One other approach I was thinking about, was to do all the hardwork in
> workqueue CPU_DOWN_PREPARE callback rather than in CPU_DEAD.

Between DOWN_PREPARE and DEAD, more work can get added to the cpu's
workqueue. So DOWN_PREPARE is kind of early to take_over_work ..

> We can call cleanup_workqueue_thread and take_over_work in DOWN_PREPARE,
> With that, I don't think we need to hold the workqueue_mutex across 
> these two callbacks and eliminate the deadlocks related to
> flush_workqueue.
> Do you think this approach would simply things around here?

-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update
  2007-01-08 17:06                                 ` Oleg Nesterov
  2007-01-08 18:37                                   ` Pallipadi, Venkatesh
@ 2007-01-09  4:39                                   ` Srivatsa Vaddagiri
  2007-01-09 14:38                                     ` Oleg Nesterov
  1 sibling, 1 reply; 78+ messages in thread
From: Srivatsa Vaddagiri @ 2007-01-09  4:39 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On Mon, Jan 08, 2007 at 08:06:35PM +0300, Oleg Nesterov wrote:
> Ah, missed you point, thanks. Yet another old problem which was not introduced
> by recent changes. And yet another indication we should avoid kthread_stop()
> on CPU_DEAD event :) I believe this is easy to fix, but need to think more.

I think the problem is not just with CPU_DEAD. Anyone who calls
cleanup_workqueue_thread (say destroy_workqueue?) will see this race. 

Do you see any problems if cleanup_workqueue_thread is changed as:

cleanup_workqueue_thread()
{
	kthread_stop(p);
	spin_lock(cwq->lock);
	cwq->thread = NULL;
	spin_unlock(cwq->lock);
}


> 	run_workqueue:
> 
> 		while (!list_empty(&cwq->worklist)) {
> 			...
> 			// We hold lock_cpu_hotplug(), cpu event can't make
> 			// progress.
> 			...
> 		}

Ok ..yes a cpu_event_waits_for_lock() helper will help here.

> > I agree it minimizes the interactions. Maybe worth attempting. However I
> > suspect it may not be as simple as it appears :)
> 
> Yes, that is why this patch only does the first step: flush_workqueue() checks
> the dead CPUs as well, this change is minimal.
> 
> Do you see any problems this patch adds?

I dont see as of now. I suspect we will know better when we implement
the patch to eliminate CPU_DEAD handling in workqueue.c

-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist
  2007-01-08 23:54                           ` Andrew Morton
@ 2007-01-09  5:04                             ` Srivatsa Vaddagiri
  2007-01-09  5:26                               ` Andrew Morton
  0 siblings, 1 reply; 78+ messages in thread
From: Srivatsa Vaddagiri @ 2007-01-09  5:04 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Oleg Nesterov, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On Mon, Jan 08, 2007 at 03:54:28PM -0800, Andrew Morton wrote:
> Furthermore I don't know which of these need to be tossed overboard if/when
> we get around to using the task freezer for CPU hotplug synchronisation.
> Hopefully, a lot of them.  I don't really understand why we're continuing
> to struggle with the existing approach before that question is settled.

Good point!

Fundamentally, I think we need to answer this question:

"Do we provide *some* mechanism to block concurrent hotplug operations
from happening? By hotplug operations I mean both changes to the bitmap
and execution of all callbacks in CPU_DEAD/ONLINE etc"

If NO, then IMHO we will be forever fixing races

If YES, then what is that mechanism? freeze_processes()? or a magical
lock?

freeze_processes() cant be that mechanism, if my understanding of it is
correct - see http://lkml.org/lkml/2007/1/8/149 and 
http://marc.theaimsgroup.com/?l=linux-kernel&m=116817460726058.

I would be happy to be corrected if the above impression of
freeze_processes() is corrected ..

-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist
  2007-01-09  5:04                             ` Srivatsa Vaddagiri
@ 2007-01-09  5:26                               ` Andrew Morton
  2007-01-09  6:56                                 ` Ingo Molnar
                                                   ` (2 more replies)
  0 siblings, 3 replies; 78+ messages in thread
From: Andrew Morton @ 2007-01-09  5:26 UTC (permalink / raw)
  To: vatsa
  Cc: Oleg Nesterov, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On Tue, 9 Jan 2007 10:34:17 +0530
Srivatsa Vaddagiri <vatsa@in.ibm.com> wrote:

> On Mon, Jan 08, 2007 at 03:54:28PM -0800, Andrew Morton wrote:
> > Furthermore I don't know which of these need to be tossed overboard if/when
> > we get around to using the task freezer for CPU hotplug synchronisation.
> > Hopefully, a lot of them.  I don't really understand why we're continuing
> > to struggle with the existing approach before that question is settled.
> 
> Good point!
> 
> Fundamentally, I think we need to answer this question:
> 
> "Do we provide *some* mechanism to block concurrent hotplug operations
> from happening? By hotplug operations I mean both changes to the bitmap
> and execution of all baclbacks in CPU_DEAD/ONLINE etc"
> 
> If NO, then IMHO we will be forever fixing races
> 
> If YES, then what is that mechanism? freeze_processes()? or a magical
> lock?
> 
> freeze_processes() cant be that mechanism, if my understanding of it is
> correct - see http://lkml.org/lkml/2007/1/8/149

That's not correct.  freeze_processes() will freeze *all* processes.  All
of them are forced to enter refrigerator().  With the mysterious exception
of some I/O-related kernel threads, which might need some thought.

> and 
> http://marc.theaimsgroup.com/?l=linux-kernel&m=116817460726058.

Am not sure how that's related.

> I would be happy to be corrected if the above impression of
> freeze_processes() is corrected ..

It could be that the freezer needs a bit of work for this application. 
Obviously we're not interested in the handling of disk I/O, so we'd really
like to do a simple
try_to_freeze_tasks(FREEZER_USER_SPACE|FREEZER_KERNEL_THREADS), but the
code isn't set up to do that (it should be).  The other non-swsusp callers
probably want this change as well.  But that's all a minor matter.


^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist
  2007-01-09  5:26                               ` Andrew Morton
@ 2007-01-09  6:56                                 ` Ingo Molnar
  2007-01-09  9:33                                 ` Srivatsa Vaddagiri
  2007-01-09 15:07                                 ` Oleg Nesterov
  2 siblings, 0 replies; 78+ messages in thread
From: Ingo Molnar @ 2007-01-09  6:56 UTC (permalink / raw)
  To: Andrew Morton
  Cc: vatsa, Oleg Nesterov, David Howells, Christoph Hellwig,
	Linus Torvalds, linux-kernel, Gautham shenoy


* Andrew Morton <akpm@osdl.org> wrote:

> > I would be happy to be corrected if the above impression of 
> > freeze_processes() is corrected ..
> 
> It could be that the freezer needs a bit of work for this application. 
> Obviously we're not interested in the handling of disk I/O, so we'd 
> really like to do a simple 
> try_to_freeze_tasks(FREEZER_USER_SPACE|FREEZER_KERNEL_THREADS), but 
> the code isn't set up to do that (it should be).  The other non-swsusp 
> callers probably want this change as well.  But that's all a minor 
> matter.

yes. The freezer does the fundamentally right thing: it stops all tasks 
in user-space or waits for them to return back to user-space to stop 
them there, or if it's a pure kernel-space task it waits until that 
kernel-space task voluntarily stop.

Once the system is in such a state, and all processing has been stopped, 
all of the kernel's data structures are completely 'unused', and we can:

- patch the kernel freely (kprobes)

- save+stop the kernel (sw-suspend) 

- remove a CPU (CPU hotplug and suspend)

- (We could also use this mechanism to live-migrate the kernel to 
   another system btw., possibly useful for containers)

- (We could also use this mechanism to create a live snapshot of a 
   running kernel, together with an LVM snapshot of filesystem state, 
   for possible restore point later on.)

It is a very powerful mechanism that has really nice properties - we 
should work on this one shared infrastructure instead of adding zillions 
of per-subsystem CPU hotplug locks.

	Ingo

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist
  2007-01-09  5:26                               ` Andrew Morton
  2007-01-09  6:56                                 ` Ingo Molnar
@ 2007-01-09  9:33                                 ` Srivatsa Vaddagiri
  2007-01-09  9:44                                   ` Ingo Molnar
  2007-01-09  9:51                                   ` Andrew Morton
  2007-01-09 15:07                                 ` Oleg Nesterov
  2 siblings, 2 replies; 78+ messages in thread
From: Srivatsa Vaddagiri @ 2007-01-09  9:33 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Oleg Nesterov, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On Mon, Jan 08, 2007 at 09:26:56PM -0800, Andrew Morton wrote:
> That's not correct.  freeze_processes() will freeze *all* processes.

I am not arguing whether all processes will be frozen. However my question was 
on the freeze point. Let me ask the question with an example:

rtasd thread (arch/powerpc/platforms/pseries/rtasd.c) executes this simple
loop:


static int rtasd(void *unused)
{

	i = first_cpu(cpu_online_map);

	while (1) {

		set_cpus_allowed(current, cpumask_of_cpu(i));	/* can block */

		/* we should now be running on cpu i */

		do_something_on_a_cpu(i);
		
		/* sleep for some time */

		i = next_cpu(cpu, cpu_online_map);
	}

}

This thread makes absolutely -no- calls to try_to_freeze() in its lifetime.

1. Does this mean that the thread can't be frozen? (lets say that the
   thread's PF_NOFREEZE is not set)

   AFAICS it can still be frozen by sending it a signal and have the signal 
   delivery code call try_to_freeze() ..

2. If the thread can be frozen at any arbitrary point of its execution, then I 
   dont see what prevents cpu_online_map from changing under the feet of rtasd 
   thread, 
 
    In other words, we would have failed to provide the ability to *block* 
    hotplug operations from happening concurrently.

>  All of them are forced to enter refrigerator(). 
	           ^^^^^^

*forced*, yes ..that's the point of concern ..


Warm regards,
vatsa

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist
  2007-01-09  9:33                                 ` Srivatsa Vaddagiri
@ 2007-01-09  9:44                                   ` Ingo Molnar
  2007-01-09  9:51                                   ` Andrew Morton
  1 sibling, 0 replies; 78+ messages in thread
From: Ingo Molnar @ 2007-01-09  9:44 UTC (permalink / raw)
  To: Srivatsa Vaddagiri
  Cc: Andrew Morton, Oleg Nesterov, David Howells, Christoph Hellwig,
	Linus Torvalds, linux-kernel, Gautham shenoy


* Srivatsa Vaddagiri <vatsa@in.ibm.com> wrote:

> This thread makes absolutely -no- calls to try_to_freeze() in its 
> lifetime.

that's a plain bug. suspend will fail for example. A kernel thread 
either needs to mark itself PF_NOFREEZE (which means it can be ignored 
and zapped with impunity), or needs to call into try_to_freeze() with a 
sane frequency.

	Ingo

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist
  2007-01-09  9:33                                 ` Srivatsa Vaddagiri
  2007-01-09  9:44                                   ` Ingo Molnar
@ 2007-01-09  9:51                                   ` Andrew Morton
  2007-01-09 10:09                                     ` Srivatsa Vaddagiri
  1 sibling, 1 reply; 78+ messages in thread
From: Andrew Morton @ 2007-01-09  9:51 UTC (permalink / raw)
  To: vatsa
  Cc: Oleg Nesterov, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On Tue, 9 Jan 2007 15:03:02 +0530
Srivatsa Vaddagiri <vatsa@in.ibm.com> wrote:

> On Mon, Jan 08, 2007 at 09:26:56PM -0800, Andrew Morton wrote:
> > That's not correct.  freeze_processes() will freeze *all* processes.
> 
> I am not arguing whether all processes will be frozen. However my question was 
> on the freeze point. Let me ask the question with an example:
> 
> rtasd thread (arch/powerpc/platforms/pseries/rtasd.c) executes this simple
> loop:
> 
> 
> static int rtasd(void *unused)
> {
> 
> 	i = first_cpu(cpu_online_map);
> 
> 	while (1) {
> 
> 		set_cpus_allowed(current, cpumask_of_cpu(i));	/* can block */
> 
> 		/* we should now be running on cpu i */
> 
> 		do_something_on_a_cpu(i);
> 		
> 		/* sleep for some time */
> 
> 		i = next_cpu(cpu, cpu_online_map);
> 	}
> 
> }
> 
> This thread makes absolutely -no- calls to try_to_freeze() in its lifetime.

Looks like a bug to me.  powerpc does appear to try to support the freezer.

> 1. Does this mean that the thread can't be frozen? (lets say that the
>    thread's PF_NOFREEZE is not set)

yup.  I'd expect the freeze_processes() call would fail if this thread is
running.

>    AFAICS it can still be frozen by sending it a signal and have the signal 
>    delivery code call try_to_freeze() ..

kernel threads don't take signals in the same manner as userspace.  A
kernel thread needs to explicitly poll, via

	if (signal_pending(current))
		do_something()

rtasd doesn't do that, and using signals in-kernel is considered lame.

> 2. If the thread can be frozen at any arbitrary point of its execution, then I 
>    dont see what prevents cpu_online_map from changing under the feet of rtasd 
>    thread, 

It cannot.

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist
  2007-01-09  9:51                                   ` Andrew Morton
@ 2007-01-09 10:09                                     ` Srivatsa Vaddagiri
  2007-01-09 10:15                                       ` Andrew Morton
  0 siblings, 1 reply; 78+ messages in thread
From: Srivatsa Vaddagiri @ 2007-01-09 10:09 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Oleg Nesterov, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On Tue, Jan 09, 2007 at 01:51:52AM -0800, Andrew Morton wrote:
> > This thread makes absolutely -no- calls to try_to_freeze() in its lifetime.
> 
> Looks like a bug to me.  powerpc does appear to try to support the freezer.
> 
> > 1. Does this mean that the thread can't be frozen? (lets say that the
> >    thread's PF_NOFREEZE is not set)
> 
> yup.  I'd expect the freeze_processes() call would fail if this thread is
> running.

ok.

> 
> >    AFAICS it can still be frozen by sending it a signal and have the signal
> >    delivery code call try_to_freeze() ..
> 
> kernel threads don't take signals in the same manner as userspace.  A
> kernel thread needs to explicitly poll, via
> 
> 	if (signal_pending(current))
> 		do_something()

Thanks for the education! I feel much better about the use of process
freezer now ..

> > 2. If the thread can be frozen at any arbitrary point of its execution, then I
> >    dont see what prevents cpu_online_map from changing under the feet of rtasd
> >    thread,
> 
> It cannot.

Excellent ..

I just hope the latency of freeze_processes() is tolerable ..

-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist
  2007-01-09 10:09                                     ` Srivatsa Vaddagiri
@ 2007-01-09 10:15                                       ` Andrew Morton
  0 siblings, 0 replies; 78+ messages in thread
From: Andrew Morton @ 2007-01-09 10:15 UTC (permalink / raw)
  To: vatsa
  Cc: Oleg Nesterov, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On Tue, 9 Jan 2007 15:39:26 +0530
Srivatsa Vaddagiri <vatsa@in.ibm.com> wrote:

> I just hope the latency of freeze_processes() is tolerable ..

It'll be roughly proportional to the number of processes, I guess: if we
have 100,000 processes (or threads) doing sleep(1000000) then yeah, it'll
take some time to wake them all up and capture them in refrigerator().

But I suspect a suitable fix for any problems which arise there is to
implement gang-offlining and onlining, rather than the present
one-cpu-at-a-time.  That'd be pretty simple to do: we already have sysfs
interfaces which take a cpumask.

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update
  2007-01-09  4:39                                   ` Srivatsa Vaddagiri
@ 2007-01-09 14:38                                     ` Oleg Nesterov
  0 siblings, 0 replies; 78+ messages in thread
From: Oleg Nesterov @ 2007-01-09 14:38 UTC (permalink / raw)
  To: Srivatsa Vaddagiri
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On 01/09, Srivatsa Vaddagiri wrote:
>
> On Mon, Jan 08, 2007 at 08:06:35PM +0300, Oleg Nesterov wrote:
> > Ah, missed you point, thanks. Yet another old problem which was not introduced
> > by recent changes. And yet another indication we should avoid kthread_stop()
> > on CPU_DEAD event :) I believe this is easy to fix, but need to think more.
> 
> I think the problem is not just with CPU_DEAD. Anyone who calls
> cleanup_workqueue_thread (say destroy_workqueue?) will see this race. 

destroy_workqueue() first does flush_workqueue(), so it should be ok.

Anyway I agree with you, we shouldn't clear cwq->thread until it exits,

> Do you see any problems if cleanup_workqueue_thread is changed as:
> 
> cleanup_workqueue_thread()
> {
> 	kthread_stop(p);
> 	spin_lock(cwq->lock);
> 	cwq->thread = NULL;
> 	spin_unlock(cwq->lock);
> }

I think the same. In fact I suspect we even don't need spin_lock, but didn't
have a time to read the code since our discussion.

Oleg.


^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist
  2007-01-09  5:26                               ` Andrew Morton
  2007-01-09  6:56                                 ` Ingo Molnar
  2007-01-09  9:33                                 ` Srivatsa Vaddagiri
@ 2007-01-09 15:07                                 ` Oleg Nesterov
  2007-01-09 15:59                                   ` Srivatsa Vaddagiri
  2 siblings, 1 reply; 78+ messages in thread
From: Oleg Nesterov @ 2007-01-09 15:07 UTC (permalink / raw)
  To: Andrew Morton
  Cc: vatsa, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On 01/08, Andrew Morton wrote:
>
> That's not correct.  freeze_processes() will freeze *all* processes.  All
> of them are forced to enter refrigerator().  With the mysterious exception
> of some I/O-related kernel threads, which might need some thought.

Yes, and we can remove a dead CPU safely, this part is ok.

> > and 
> > http://marc.theaimsgroup.com/?l=linux-kernel&m=116817460726058.
> 
> Am not sure how that's related.

but at some point we should thaw processes, including cwq->thread which
should die. So we are doing things like take_over_work() and this is the
source of races, because the dead CPU is not on cpu_online_map.

flush_workqueue() doesn't use any locks now. If we use freezer to implement
cpu-hotplug nothing will change, we still have races.

It looks so obvious to me, so I'm starting to suspect I missed something
very simple :(

Oleg.


^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist
  2007-01-09 15:07                                 ` Oleg Nesterov
@ 2007-01-09 15:59                                   ` Srivatsa Vaddagiri
  2007-01-09 16:38                                     ` Oleg Nesterov
  0 siblings, 1 reply; 78+ messages in thread
From: Srivatsa Vaddagiri @ 2007-01-09 15:59 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On Tue, Jan 09, 2007 at 06:07:55PM +0300, Oleg Nesterov wrote:
> but at some point we should thaw processes, including cwq->thread which
> should die.

I am presuming we will thaw processes after all CPU_DEAD handlers have
run.

> So we are doing things like take_over_work() and this is the
> source of races, because the dead CPU is not on cpu_online_map.
>
> flush_workqueue() doesn't use any locks now. If we use freezer to implement
> cpu-hotplug nothing will change, we still have races.

We have races -if- CPU_DEAD handling can run concurrently with an ongoing
flush_workqueue. From my recent understanding of process freezer, this
is not possible. In other words, flush_workqueue() can be its old
implementation as below w/o any races:

	some_thread:

	for_each_online_cpu(i)
		flush_cpu_workqueue(i);

As long as this loop is running, cpu_down/up will not proceed. This means, 
cpu_online_map is stable even if flush_cpu_workqueue blocks ..

Once this loop is complete and all threads have called try_to_freeze,
cpu_down will proceed to change the bit map and run CPU_DEAD handlers
of everyone. I am presuming we will thaw processes only after all
CPU_DEAD/ONLINE handlers have run (dont know if that is a problem).
In that case do you still see races?  Yes, this would require some
changes in worker_thread to check for kthread_should_stop() after
try_to_freeze returns ...


-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist
  2007-01-09 15:59                                   ` Srivatsa Vaddagiri
@ 2007-01-09 16:38                                     ` Oleg Nesterov
  2007-01-09 16:46                                       ` Srivatsa Vaddagiri
  0 siblings, 1 reply; 78+ messages in thread
From: Oleg Nesterov @ 2007-01-09 16:38 UTC (permalink / raw)
  To: Srivatsa Vaddagiri
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On 01/09, Srivatsa Vaddagiri wrote:
>
> On Tue, Jan 09, 2007 at 06:07:55PM +0300, Oleg Nesterov wrote:
> > but at some point we should thaw processes, including cwq->thread which
> > should die.
> 
> I am presuming we will thaw processes after all CPU_DEAD handlers have
> run.
> 
> > So we are doing things like take_over_work() and this is the
> > source of races, because the dead CPU is not on cpu_online_map.
> >
> > flush_workqueue() doesn't use any locks now. If we use freezer to implement
> > cpu-hotplug nothing will change, we still have races.
> 
> We have races -if- CPU_DEAD handling can run concurrently with a ongoing
> flush_workqueue. From my recent understanding of process freezer, this
> is not possible. In other words, flush_workqueue() can be its old
> implementation as below w/o any races:
> 
> 	some_thread:
> 
> 	for_each_online_cpu(i)
> 		flush_cpu_workqueue(i);
> 
> As long as this loop is running, cpu_down/up will not proceed. This means, 
> cpu_online_map is stable even if flush_cpu_workqueue blocks ..
> 
> Once this loop is complete and all threads have called try_to_freeze,
> cpu_down will proceed to change the bit map and run CPU_DEAD handlers
> of everyone. I am presuimg we will thaw processes only after all
> CPU_DEAD/ONLINE handlers have run.

We can't do this. We should thaw cwq->thread (which was bound to the
dead CPU) to complete CPU_DEAD event. So we still need some changes.

Oleg.


^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist
  2007-01-09 16:38                                     ` Oleg Nesterov
@ 2007-01-09 16:46                                       ` Srivatsa Vaddagiri
  2007-01-09 16:56                                         ` Oleg Nesterov
  0 siblings, 1 reply; 78+ messages in thread
From: Srivatsa Vaddagiri @ 2007-01-09 16:46 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On Tue, Jan 09, 2007 at 07:38:15PM +0300, Oleg Nesterov wrote:
> We can't do this. We should thaw cwq->thread (which was bound to the
> dead CPU) to complete CPU_DEAD event. So we still need some changes.

I noticed that, but I presumed kthread_stop() will post a wakeup which
will bring it out of frozen state. Looking at refrigerator(), I realize
that is not possible. 

So CPU_DEAD should do a thaw_process on the kthread before doing a
kthread_stop?

-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist
  2007-01-09 16:46                                       ` Srivatsa Vaddagiri
@ 2007-01-09 16:56                                         ` Oleg Nesterov
  2007-01-14 23:54                                           ` Oleg Nesterov
  0 siblings, 1 reply; 78+ messages in thread
From: Oleg Nesterov @ 2007-01-09 16:56 UTC (permalink / raw)
  To: Srivatsa Vaddagiri
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy

On 01/09, Srivatsa Vaddagiri wrote:
>
> On Tue, Jan 09, 2007 at 07:38:15PM +0300, Oleg Nesterov wrote:
> > We can't do this. We should thaw cwq->thread (which was bound to the
> > dead CPU) to complete CPU_DEAD event. So we still need some changes.
> 
> I noticed that, but I presumed kthread_stop() will post a wakeup which
> will bring it out of frozen state. Looking at refrigerator(), I realize
> that is not possible. 
> 
> So CPU_DEAD should do a thaw_process on the kthread before doing a
> kthread_stop?

Probably we can do this, or that. In any case we need changes, that
was my point.

And the best change I believe is to _remove_ CPU_DEAD handling from
workqueue.c as I suggested before. This kthread_stop() is not a good
idea per se, it calls wake_up_process(), but we should in fact use
wake_up(&cwq->more_work). Yes, work->func() should be ready for the
false wakeups, but still.

But for now I hope the last "draft" patch is enough. I'll continue
on next weekend.

Oleg.


^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist
  2007-01-09 16:56                                         ` Oleg Nesterov
@ 2007-01-14 23:54                                           ` Oleg Nesterov
  2007-01-15  4:33                                             ` Srivatsa Vaddagiri
  0 siblings, 1 reply; 78+ messages in thread
From: Oleg Nesterov @ 2007-01-14 23:54 UTC (permalink / raw)
  To: Srivatsa Vaddagiri
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy, Pallipadi,
	Venkatesh

How about the pseudo-code below?

workqueue_mutex is only used to protect "struct list_head workqueues",
all workqueue operations can run in parallel with cpuhotplug callback path.
take_over_work(), migrate_sequence, CPU_LOCK_ACQUIRE/RELEASE go away.

I'd like to make a couple of cleanups (and fix schedule_on_each_cpu) before
sending the patch, but if somebody doesn't like this intrusive change, he
can nack it right now.

Oleg.

struct cpu_workqueue_struct {
	...
	int should_stop;
	...
};

// also used by flush_work/flush_workqueue
static cpumask_t cpu_populated_map __read_mostly;

/*
 * NOTE: the caller must not touch *cwq if this func returns true
 */
static inline int cwq_should_stop(struct cpu_workqueue_struct *cwq)
{
	int should_stop = cwq->should_stop;

	if (unlikely(should_stop)) {
		spin_lock_irq(&cwq->lock);
		should_stop = cwq->should_stop && list_empty(&cwq->worklist);
		if (should_stop)
			cwq->thread = NULL;
		spin_unlock_irq(&cwq->lock);
	}

	return should_stop;
}

static int worker_thread(void *cwq)
{
	while (!cwq_should_stop(cwq)) {
		...
		run_workqueue();
		...
	}
}

static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
{
	struct task_struct *p;

	spin_lock_irq(&cwq->lock);
	cwq->should_stop = 0;
	p = cwq->thread;
	spin_unlock_irq(&cwq->lock);

	if (!p) {
		struct workqueue_struct *wq = cwq->wq;
		const char *fmt = is_single_threaded(wq) ? "%s" : "%s/%d";

		p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu);
		/*
		 * Nobody can add the work_struct to this cwq,
		 *	if (caller is __create_workqueue)
		 *		nobody should see this wq
		 *	else // caller is CPU_UP_PREPARE
		 *		cpu is not on cpu_online_map
		 * so we can abort safely.
		 */
		if (IS_ERR(p))
			return PTR_ERR(p);

		if (!is_single_threaded(wq))
			kthread_bind(p, cpu);
		/*
		 * Cancels affinity if the caller is CPU_UP_PREPARE.
		 * Needs a cleanup, but OK.
		 */
		wake_up_process(p);
		cwq->thread = p;
	}

	return 0;
}

struct workqueue_struct *__create_workqueue(const char *name,
					    int singlethread, int freezeable)
{
	struct workqueue_struct *wq;
	struct cpu_workqueue_struct *cwq;
	int err = 0, cpu;

	wq = kzalloc(sizeof(*wq), GFP_KERNEL);
	if (!wq)
		return NULL;

	wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct);
	if (!wq->cpu_wq) {
		kfree(wq);
		return NULL;
	}

	wq->name = name;
	wq->freezeable = freezeable;

	if (singlethread) {
		INIT_LIST_HEAD(&wq->list);
		cwq = init_cpu_workqueue(wq, singlethread_cpu);
		err = create_workqueue_thread(cwq, singlethread_cpu);
	} else {
		mutex_lock(&workqueue_mutex);
		list_add(&wq->list, &workqueues);

		for_each_possible_cpu(cpu) {
			cwq = init_cpu_workqueue(wq, cpu);
			if (err || !cpu_isset(cpu, cpu_populated_map))
				continue;
			err = create_workqueue_thread(cwq, cpu);
		}
		mutex_unlock(&workqueue_mutex);
	}

	if (err) {
		destroy_workqueue(wq);
		wq = NULL;
	}
	return wq;
}

static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu)
{
	struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
	struct wq_barrier barr;
	int alive = 0;

	spin_lock_irq(&cwq->lock);
	if (cwq->thread != NULL) {
		insert_wq_barrier(cwq, &barr, 1);
		cwq->should_stop = 1;
		alive = 1;
	}
	spin_unlock_irq(&cwq->lock);

	if (alive) {
		wait_for_completion(&barr.done);

		while (unlikely(cwq->thread != NULL))
			cpu_relax();
		/*
		 * Wait until cwq->thread unlocks cwq->lock,
		 * it won't touch *cwq after that.
		 */
		smp_rmb();
		spin_unlock_wait(&cwq->lock);
	}
}

void destroy_workqueue(struct workqueue_struct *wq)
{
	if (is_single_threaded(wq))
		cleanup_workqueue_thread(wq, singlethread_cpu);
	else {
		int cpu;

		mutex_lock(&workqueue_mutex);
		list_del(&wq->list);
		mutex_unlock(&workqueue_mutex);

		for_each_cpu_mask(cpu, cpu_populated_map)
			cleanup_workqueue_thread(wq, cpu);
	}

	free_percpu(wq->cpu_wq);
	kfree(wq);
}

static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
						unsigned long action,
						void *hcpu)
{
	struct workqueue_struct *wq;
	struct cpu_workqueue_struct *cwq;
	unsigned int cpu = (unsigned long)hcpu;
	int ret = NOTIFY_OK;

	mutex_lock(&workqueue_mutex);
	if (action == CPU_UP_PREPARE)
		cpu_set(cpu, cpu_populated_map);

	list_for_each_entry(wq, &workqueues, list) {
		cwq = per_cpu_ptr(wq->cpu_wq, cpu);

		switch (action) {
		case CPU_UP_PREPARE:
			if (create_workqueue_thread(cwq, cpu))
				ret = NOTIFY_BAD;
			break;

		case CPU_ONLINE:
			set_cpus_allowed(cwq->thread, cpumask_of_cpu(cpu));
			break;

		case CPU_UP_CANCELED:
		case CPU_DEAD:
			cwq->should_stop = 1;
			wake_up(&cwq->more_work);
			break;
		}

		if (ret != NOTIFY_OK) {
			printk(KERN_ERR "workqueue for %i failed\n", cpu);
			break;
		}
	}
	mutex_unlock(&workqueue_mutex);

	return ret;
}

void init_workqueues(void)
{
	...
	cpu_populated_map = cpu_online_map;
	...
}


^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist
  2007-01-14 23:54                                           ` Oleg Nesterov
@ 2007-01-15  4:33                                             ` Srivatsa Vaddagiri
  2007-01-15 12:54                                               ` Oleg Nesterov
  0 siblings, 1 reply; 78+ messages in thread
From: Srivatsa Vaddagiri @ 2007-01-15  4:33 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy, Pallipadi,
	Venkatesh

On Mon, Jan 15, 2007 at 02:54:10AM +0300, Oleg Nesterov wrote:
> How about the pseudo-code below?

Some quick comments:

- singlethread_cpu needs to be hotplug safe (broken currently)

- Any reason why cpu_populated_map is not modified on CPU_DEAD?

- I feel more comfortable if workqueue_cpu_callback were to take
  workqueue_mutex in LOCK_ACQ and release it in LOCK_RELEASE 
  notifications. This will provide stable access to cpu_populated_map
  to functions like __create_workqueue.

Finally, I wonder if these changes will be unnecessary if we move to
process freezer based hotplug locking ...

-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist
  2007-01-15  4:33                                             ` Srivatsa Vaddagiri
@ 2007-01-15 12:54                                               ` Oleg Nesterov
  2007-01-15 13:08                                                 ` Oleg Nesterov
  2007-01-15 16:18                                                 ` Srivatsa Vaddagiri
  0 siblings, 2 replies; 78+ messages in thread
From: Oleg Nesterov @ 2007-01-15 12:54 UTC (permalink / raw)
  To: Srivatsa Vaddagiri
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy, Pallipadi,
	Venkatesh

On 01/15, Srivatsa Vaddagiri wrote:
>
> On Mon, Jan 15, 2007 at 02:54:10AM +0300, Oleg Nesterov wrote:
> > How about the pseudo-code below?
> 
> Some quick comments:
> 
> - singlethread_cpu needs to be hotplug safe (broken currently)

Why? Could you explain?

> - Any reason why cpu_populated_map is not modified on CPU_DEAD?

Because CPU_DEAD/CPU_UP_CANCELED doesn't wait for cwq->thread to exit.
cpu_populated_map never shrinks, it only grows on CPU_UP_PREPARE.

We can change this, but it needs some more code, and I am not sure
we need it. Note that a "false" bit in cpu_populated_map only means
that flush_work/flush_workqueue/destroy_workqueue will do lock/unlock
of cwq->lock, nothing more.

> - I feel more comfortable if workqueue_cpu_callback were to take
>   workqueue_mutex in LOCK_ACQ and release it in LOCK_RELEASE
>   notifications.

The whole purpose of this change to avoid this!

	Gautham R Shenoy wrote:
	>
	> Oh! I was refering to the other set of workqueue deadlock woes. The
	> ones caused when subsystems (like cpufreq) try to create/destroy
	> workqueue from the cpuhotplug callback path.
	>
	> Creation/destruction of workqueue requires us to take workqueue_mutex,
	> which would have already been taken during CPU_LOCK_ACQUIRE.

With this change workqueue_mutex is only taken to protect workqueues
list, why should we hold it for (say) CPU_UP_PREPARE->CPU_ONLINE path?

>                   This will provide stable access to cpu_populated_map
>   to functions like __create_workqueue.

I think this is not needed.

> Finally, I wonder if these changes will be unnecessary if we move to
> process freezer based hotplug locking ...

This change is not strictly necessary but imho makes the code better and
shrinks .text by 379 bytes.

But I believe that freezer will change nothing for workqueue. We still
need take_over_work(), and hacks like migrate_sequence. And no, CPU_DEAD
can't just thaw cwq->thread which was bound to the dead CPU to complete
kthread_stop(), we should thaw all processes.

Oleg.


^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist
  2007-01-15 12:54                                               ` Oleg Nesterov
@ 2007-01-15 13:08                                                 ` Oleg Nesterov
  2007-01-15 16:18                                                 ` Srivatsa Vaddagiri
  1 sibling, 0 replies; 78+ messages in thread
From: Oleg Nesterov @ 2007-01-15 13:08 UTC (permalink / raw)
  To: Srivatsa Vaddagiri
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy, Pallipadi,
	Venkatesh

On 01/15, Oleg Nesterov wrote:
>
> cpu_populated_map never shrinks, it only grows on CPU_UP_PREPARE.

__create_workqueue() should not use it. Needs a fix.

Oleg.


^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist
  2007-01-15 12:54                                               ` Oleg Nesterov
  2007-01-15 13:08                                                 ` Oleg Nesterov
@ 2007-01-15 16:18                                                 ` Srivatsa Vaddagiri
  2007-01-15 16:55                                                   ` Oleg Nesterov
  1 sibling, 1 reply; 78+ messages in thread
From: Srivatsa Vaddagiri @ 2007-01-15 16:18 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy, Pallipadi,
	Venkatesh

On Mon, Jan 15, 2007 at 03:54:01PM +0300, Oleg Nesterov wrote:
> > - singlethread_cpu needs to be hotplug safe (broken currently)
> 
> Why? Could you explain?

What if 'singlethread_cpu' dies?

> > - Any reason why cpu_populated_map is not modified on CPU_DEAD?
> 
> Because CPU_DEAD/CPU_UP_CANCELED doesn't wait for cwq->thread to exit.
> cpu_populated_map never shrinks, it only grows on CPU_UP_PREPARE.
> 
> We can change this, but it needs some more code, and I am not sure
> we need it. Note that a "false" bit in cpu_populated_map only means
> that flush_work/flush_workqueue/destroy_workqueue will do lock/unlock
> of cwq->lock, nothing more.

What abt __create_workqueue/schedule_on_each_cpu?

> > - I feel more comfortable if workqueue_cpu_callback were to take
> >   workqueue_mutex in LOCK_ACQ and release it in LOCK_RELEASE
> >   notifications.
> 
> The whole purpose of this change to avoid this!

I guess it depends on how __create_workqueue/schedule_on_each_cpu is
modified (whether we take/release lock upon LOCK_ACQ/LOCK_RELEASE)

> > Finally, I wonder if these changes will be unnecessary if we move to
> > process freezer based hotplug locking ...
> 
> > This change is not strictly necessary but imho makes the code better and
> shrinks .text by 379 bytes.
> 
> But I believe that freezer will change nothing for workqueue. We still
> need take_over_work(), and hacks like migrate_sequence. And no, CPU_DEAD
> can't just thaw cwq->thread which was bound to the dead CPU to complete
> kthread_stop(), we should thaw all processes.

What abt stopping that thread in CPU_DOWN_PREPARE (before freezing
processes)? I understand that it may add to the latency, but compared to
the overall latency of process freezer, I suspect it may not be much.

-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist
  2007-01-15 16:18                                                 ` Srivatsa Vaddagiri
@ 2007-01-15 16:55                                                   ` Oleg Nesterov
  2007-01-16  5:26                                                     ` Srivatsa Vaddagiri
  0 siblings, 1 reply; 78+ messages in thread
From: Oleg Nesterov @ 2007-01-15 16:55 UTC (permalink / raw)
  To: Srivatsa Vaddagiri
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy, Pallipadi,
	Venkatesh

On 01/15, Srivatsa Vaddagiri wrote:
>
> On Mon, Jan 15, 2007 at 03:54:01PM +0300, Oleg Nesterov wrote:
> > > - singlethread_cpu needs to be hotplug safe (broken currently)
> > 
> > Why? Could you explain?
> 
> What if 'singlethread_cpu' dies?

Still can't understand you. Probably you missed what singlethread_cpu is.

singlethread_cpu is just a "random" bit from cpu_possible_map. Single
threaded workqueue is not bound to any cpu. We only need it to be sure that
percpu_data.ptrs[singlethread_cpu] is populated by __percpu_alloc_mask().

> > > - Any reason why cpu_populated_map is not modified on CPU_DEAD?
> > 
> > Because CPU_DEAD/CPU_UP_CANCELED doesn't wait for cwq->thread to exit.
> > cpu_populated_map never shrinks, it only grows on CPU_UP_PREPARE.
> > 
> > We can change this, but it needs some more code, and I am not sure
> > we need it. Note that a "false" bit in cpu_populated_map only means
> > that flush_work/flush_workqueue/destroy_workqueue will do lock/unlock
> > of cwq->lock, nothing more.
> 
> What abt __create_workqueue/schedule_on_each_cpu?

As I said already __create_workqueue() needs a fix, schedule_on_each_cpu()
is already broken, and should be fixed as well.

> > > - I feel more comfortable if workqueue_cpu_callback were to take
> > >   workqueue_mutex in LOCK_ACQ and release it in LOCK_RELEASE
> > >   notifications.
> > 
> > The whole purpose of this change to avoid this!
> 
> I guess it depends on how __create_workqueue/schedule_on_each_cpu is
> modified (whether we take/release lock upon LOCK_ACQ/LOCK_RELEASE)

Sorry, can't understand this...

> > > Finally, I wonder if these changes will be unnecessary if we move to
> > > process freezer based hotplug locking ...
> > 
> > > This change is not strictly necessary but imho makes the code better and
> > shrinks .text by 379 bytes.
> > 
> > But I believe that freezer will change nothing for workqueue. We still
> > need take_over_work(), and hacks like migrate_sequence. And no, CPU_DEAD
> > can't just thaw cwq->thread which was bound to the dead CPU to complete
> > kthread_stop(), we should thaw all processes.
> 
> What abt stopping that thread in CPU_DOWN_PREPARE (before freezing
> processes)? I understand that it may add to the latency, but compared to
> the overall latency of process freezer, I suspect it may not be much.

Srivatsa, why do you think this would be better?

It add to the complexity! What do you mean by "stopping that thread" ?
Kill it? - this is wrong. Make it TASK_STOPPED? - very untrivial and I
can't see how this helps.

Oleg.


^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist
  2007-01-15 16:55                                                   ` Oleg Nesterov
@ 2007-01-16  5:26                                                     ` Srivatsa Vaddagiri
  2007-01-16 13:27                                                       ` Oleg Nesterov
  0 siblings, 1 reply; 78+ messages in thread
From: Srivatsa Vaddagiri @ 2007-01-16  5:26 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy, Pallipadi,
	Venkatesh

On Mon, Jan 15, 2007 at 07:55:16PM +0300, Oleg Nesterov wrote:
> > What if 'singlethread_cpu' dies?
> 
> Still can't understand you. Probably you missed what singlethread_cpu is.

oops yes ..I had mistakenly thought that create_workqueue_thread() will
bind worker thread to singlethread_cpu for single_threaded workqueue.
So it isn't a problem.
 
> > What abt __create_workqueue/schedule_on_each_cpu?
> 
> As I said already __create_workqueue() needs a fix, schedule_on_each_cpu()
> is already broken, and should be fixed as well.

__create_workqueue() creates worker threads for all online CPUs
currently. Accessing the online_map could be racy unless we 
serialize the access with hotplug event (thr' a mutex like workqueue
mutex held between LOCK_ACQ/LOCK_RELEASE messages or process freezer)
OR take special measures as was done in flush_workqueue. How were
you considering to deal with that raciness?

> > > The whole purpose of this change to avoid this!
> >
> > I guess it depends on how __create_workqueue/schedule_on_each_cpu is
> > modified (whether we take/release lock upon LOCK_ACQ/LOCK_RELEASE)
> 
> Sorry, can't understand this...

I meant to say that depending on how we modify
__create_workqueue/schedule_on_each_cpu to avoid racy-access to
online_map, we can debate whether workqueue mutex needs to be held
between LOCK_ACQ/LOCK_RELEASE messages in the callback.

> > What abt stopping that thread in CPU_DOWN_PREPARE (before freezing
> > processes)? I understand that it may add to the latency, but compared to
> > the overall latency of process freezer, I suspect it may not be much.
> 
> Srivatsa, why do you think this would be better?
> 
> It add to the complexity! What do you mean by "stopping that thread" ?
> Kill it? - this is wrong. 

I meant issuing kthread_stop() in DOWN_PREPARE so that worker
thread exits itself (much before CPU is actually brought down).

Do you see any problems with that?

Even if there are problems with it, how abt something like below:

workqueue_cpu_callback()
{

	CPU_DEAD:
		/* threads are still frozen at this point */
		take_over_work();
		kthread_mark_stop(worker_thread);
		break;

	CPU_CLEAN_THREADS:
		/* all threads resumed by now */
		kthread_stop(worker_thread); /* task_struct ref required? */
		break;

}

kthread_mark_stop() will mark somewhere in task_struct that the thread
should exit when it comes out of refrigerator.

worker_thread()
{
	
        while (!kthread_should_stop()) {
                if (cwq->freezeable)
                        try_to_freeze();

		if (kthread_marked_stop(current))
			break;

		...

	}
}

The advantage I see above is that, when take_over_work() is running, we wont 
race with functions like flush_workqueue() (threads still frozen at that
point) and hence we avoid hacks like migrate_sequence. This will also
let functions like flush_workqueue() easily access cpu_online_map as
below -without- any special locking/hacks (which I consider a great
benefit for programmers).

flush_workqueue()
{
	for_each_online_cpu(i)
		flush_cpu_workqueue(i);
}

Do you see any problems with this later approach?
		
-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist
  2007-01-16  5:26                                                     ` Srivatsa Vaddagiri
@ 2007-01-16 13:27                                                       ` Oleg Nesterov
  2007-01-17  6:17                                                         ` Srivatsa Vaddagiri
  0 siblings, 1 reply; 78+ messages in thread
From: Oleg Nesterov @ 2007-01-16 13:27 UTC (permalink / raw)
  To: Srivatsa Vaddagiri
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy, Pallipadi,
	Venkatesh

On 01/16, Srivatsa Vaddagiri wrote:
>
> On Mon, Jan 15, 2007 at 07:55:16PM +0300, Oleg Nesterov wrote:
> >
> > As I said already __create_workqueue() needs a fix, schedule_on_each_cpu()
> > is already broken, and should be fixed as well.
> 
> __create_workqueue() creates worker threads for all online CPUs
> currently. Accessing the online_map could be racy unless we 
> serialize the access with hotplug event (thr' a mutex like workqueue
> mutex held between LOCK_ACQ/LOCK_RELEASE messages or process freezer)
> OR take special measures as was done in flush_workqueue. How were
> you considering to deal with that raciness?

This is easy to fix in multiple ways. For example (this is not the best
approach), we can make cpu_creation_map. Like cpu_populate_map, but CPU
is cleared on CPU_DEAD/CPU_UP_CANCELED.

I'll continue when I have a time.

> > > What abt stopping that thread in CPU_DOWN_PREPARE (before freezing
> > > processes)? I understand that it may add to the latency, but compared to
> > > the overall latency of process freezer, I suspect it may not be much.
> 
> I meant issuing kthread_stop() in DOWN_PREPARE so that worker
> thread exits itself (much before CPU is actually brought down).

Deadlock if work_struct re-queues itself.

> Even if there are problems with it, how abt something like below:
> 
> workqueue_cpu_callback()
> {
> 
> 	CPU_DEAD:
> 		/* threads are still frozen at this point */
> 		take_over_work();

No, we can't move a currently executing work. This will break flush_workxxx().

> 		kthread_mark_stop(worker_thread);
> 		break;
> 
> 	CPU_CLEAN_THREADS:
> 		/* all threads resumed by now */
> 		kthread_stop(worker_thread); /* task_struct ref required? */

yes, required,

> kthread_mark_stop() will mark somewhere in task_struct that the thread
> should exit when it comes out of refrigerator.
> 
> worker_thread()
> {
> 	
>         while (!kthread_should_stop()) {
>                 if (cwq->freezeable)
>                         try_to_freeze();
> 
> 		if (kthread_marked_stop(current))
> 			break;

Racy. Because of kthread_stop() above we should clear cwq->thread somehow.
But we can't do this: this workqueue may be already destroyed.

> The advantage I see above is that, when take_over_work() is running, we wont 
> race with functions like flush_workqueue() (threads still frozen at that
> point) and hence we avoid hacks like migrate_sequence.

See above.

Please note that the code I posted does something like kthread_mark_stop(), but
it operates on cwq, not on task_struct, this makes a big difference. And it doesn't
need take_over_work() at all. And it doesn't need additional complications. Instead,
it lessens both the source and compiled code.

Note that I am not against using freezer for cpu-hotplug. My point is that this
can't help much for the current workqueue.c, and it will completely transparent
with the change I suggest.

Oleg.


^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist
  2007-01-16 13:27                                                       ` Oleg Nesterov
@ 2007-01-17  6:17                                                         ` Srivatsa Vaddagiri
  2007-01-17 15:47                                                           ` Oleg Nesterov
  0 siblings, 1 reply; 78+ messages in thread
From: Srivatsa Vaddagiri @ 2007-01-17  6:17 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy, Pallipadi,
	Venkatesh

On Tue, Jan 16, 2007 at 04:27:25PM +0300, Oleg Nesterov wrote:
> > I meant issuing kthread_stop() in DOWN_PREPARE so that worker
> > thread exits itself (much before CPU is actually brought down).
> 
> Deadlock if work_struct re-queues itself.

Are you referring to the problem you described here?

	http://lkml.org/lkml/2007/1/8/173

If so, then it can easily be prevented by having run_workqueue() check for 
kthread_should_stop() in its while loop?

> > Even if there are problems with it, how abt something like below:
> >
> > workqueue_cpu_callback()
> > {
> >
> > 	CPU_DEAD:
> > 		/* threads are still frozen at this point */
> > 		take_over_work();
> 
> No, we can't move a currently executing work. This will break flush_workxxx().

What do you mean by "currently" executing work? worker thread executing
some work on the cpu? That is not possible, because all threads are
frozen at this point. There cant be any ongoing flush_workxxx() as well
because of this, which should avoid breaking flush_workxxx() ..

> > 		if (kthread_marked_stop(current))
> > 			break;
> 
> Racy. Because of kthread_stop() above we should clear cwq->thread somehow.
> But we can't do this: this workqueue may be already destroyed.

We will surely take workqueue_mutex in CPU_CLEAN_THREADS (when it
traverses the workqueues list), which should avoid this race?

> Please note that the code I posted does something like kthread_mark_stop(), but
> it operates on cwq, not on task_struct, this makes a big difference.

Ok sure ..putting the flag in cwq makes sense. Others also can follow
similar trick for stopping threads (like ksoftirqd).

> And it doesn't need take_over_work() at all. And it doesn't need additional 
> complications. Instead, it lessens both the source and compiled code.

I guess either way, changes are required.

1st method, what you are suggesting:
	
	- Needs separate bitmap(s), cpu_populated_map and possible another
	  for create_workqueue()?
	- flush_workqueue() traverses thr a separate bitmap
	  cpu_populated_map (separate from the online map) while
	  create_workqueue() traverses the other bitmap

2nd method:

	- Avoids the need for maintenance of separate bitmaps (uses
  	  existing cpu_online_map). All functions can safely use
	  the online_map w/o any races. Personally this is why I like
	  this approach.
	- Needs changes in worker_thread to exit right after it comes
	  out of refrigerator.

I havent made any changes as per 2nd method to see the resulting code
size, so I cant comment on code sizes.

Another point is that once we create code as in 1st method, which
maintains separate bitmaps, that will easily get replicated (over time) 
to other subsystems. Is that a good thing?

-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist
  2007-01-17  6:17                                                         ` Srivatsa Vaddagiri
@ 2007-01-17 15:47                                                           ` Oleg Nesterov
  2007-01-17 16:12                                                             ` Srivatsa Vaddagiri
  2007-01-17 16:25                                                             ` Srivatsa Vaddagiri
  0 siblings, 2 replies; 78+ messages in thread
From: Oleg Nesterov @ 2007-01-17 15:47 UTC (permalink / raw)
  To: Srivatsa Vaddagiri
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy, Pallipadi,
	Venkatesh

On 01/17, Srivatsa Vaddagiri wrote:
>
> On Tue, Jan 16, 2007 at 04:27:25PM +0300, Oleg Nesterov wrote:
> > > I meant issuing kthread_stop() in DOWN_PREPARE so that worker
> > > thread exits itself (much before CPU is actually brought down).
> > 
> > Deadlock if work_struct re-queues itself.
> 
> Are you referring to the problem you described here?
> 
> 	http://lkml.org/lkml/2007/1/8/173
> 
> If so, then it can easily be prevented by having run_workqueue() check for 
> kthread_should_stop() in its while loop?

flush_workqueue() also calls run_workqueue().

> > > workqueue_cpu_callback()
> > > {
> > >
> > > 	CPU_DEAD:
> > > 		/* threads are still frozen at this point */
> > > 		take_over_work();
> > 
> > No, we can't move a currently executing work. This will break flush_workxxx().
> 
> What do you mean by "currently" executing work? worker thread executing
> some work on the cpu? That is not possible, because all threads are
> frozen at this point. There cant be any ongoing flush_workxxx() as well
> because of this, which should avoid breaking flush_workxxx() ..

work->func() sleeps/freezed. We can't move the rest of pending jobs before
it completes. This will break flush_workxxx. And no, this is not because
we use barriers now.

> 1st method, what you are suggesting:
> 	
> 	- Needs separate bitmap(s), cpu_populated_map and possible another
> 	  for create_workqueue()?
> 	- flush_workqueue() traverses thr a separate bitmap
> 	  cpu_populated_map (separate from the online map) while
> 	  create_workqueue() traverses the other bitmap

Yes, we need the additional bitmap. This is optimization, we can just use
cpu_possible_map. create_workqueue() can use cpu_online_map + "int new_cpu".

Yes, this is a complication. But still this is much simpler (IMHO) than
we have now. And imho better.

> 2nd method:
> 
> 	- Avoids the need for maintenance of separate bitmaps (uses
>   	  existing cpu_online_map). All functions can safely use
> 	  the online_map w/o any races. Personally this is why I like
> 	  this approach.
> 	- Needs changes in worker_thread to exit right after it comes
> 	  out of refrigerator.
>
> I havent made any changes as per 2nd method to see the resulting code
> size, so I cant comment on code sizes.

Yes, yes, yes, let's see the code first! :) Then we can compare.
Right now:
	- cpu-hotplug doesn't use freezer yet
	- all ideas about using it to improve workqueue.c were wrong

> Another point is that once we create code as in 1st method, which
> maintains separate bitmaps, that will easily get replicated (over time) 
> to other subsystems. Is that a good thing?

Honestly, I can't understand your point. Why it will get replicated?
Because another subsystem will need cpu_populated_map too? We can remove
"static" and move cpu_populated_map to kernel/cpu.c then.

Btw, I agree it is good to have a sleeping lock to protect cpu_online_map.
But it should be separate from workqueue_mutex, and it is not needed for
create/destroy/flush funcs.

Oleg.


^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist
  2007-01-17 15:47                                                           ` Oleg Nesterov
@ 2007-01-17 16:12                                                             ` Srivatsa Vaddagiri
  2007-01-17 17:01                                                               ` Oleg Nesterov
  2007-01-17 16:25                                                             ` Srivatsa Vaddagiri
  1 sibling, 1 reply; 78+ messages in thread
From: Srivatsa Vaddagiri @ 2007-01-17 16:12 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy, Pallipadi,
	Venkatesh

On Wed, Jan 17, 2007 at 06:47:16PM +0300, Oleg Nesterov wrote:
> > What do you mean by "currently" executing work? worker thread executing
> > some work on the cpu? That is not possible, because all threads are
> > frozen at this point. There cant be any ongoing flush_workxxx() as well
> > because of this, which should avoid breaking flush_workxxx() ..
> 
> work->func() sleeps/freezed. 

Didnt Andrew call that (work->func calling try_to_freeze) madness?

	http://lkml.org/lkml/2007/01/07/166

Does that happen in practice?

-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist
  2007-01-17 15:47                                                           ` Oleg Nesterov
  2007-01-17 16:12                                                             ` Srivatsa Vaddagiri
@ 2007-01-17 16:25                                                             ` Srivatsa Vaddagiri
  1 sibling, 0 replies; 78+ messages in thread
From: Srivatsa Vaddagiri @ 2007-01-17 16:25 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy, Pallipadi,
	Venkatesh

On Wed, Jan 17, 2007 at 06:47:16PM +0300, Oleg Nesterov wrote:
> Btw, I agree it is good to have a sleeping lock to protect cpu_online_map.
> But it should be separate from workqueue_mutex, and it is not needed for
> create/destroy/flush funcs.

Which is what lock_cpu_hotplug() attempted to provide but which
has received a lot of flak in recent days. I guess if someone implements
process-freezer-based locking of cpu_online_map, we will know better how 
suited the latter is for cpu hotplug.

-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist
  2007-01-17 16:12                                                             ` Srivatsa Vaddagiri
@ 2007-01-17 17:01                                                               ` Oleg Nesterov
  0 siblings, 0 replies; 78+ messages in thread
From: Oleg Nesterov @ 2007-01-17 17:01 UTC (permalink / raw)
  To: Srivatsa Vaddagiri
  Cc: Andrew Morton, David Howells, Christoph Hellwig, Ingo Molnar,
	Linus Torvalds, linux-kernel, Gautham shenoy, Pallipadi,
	Venkatesh

On 01/17, Srivatsa Vaddagiri wrote:
>
> On Wed, Jan 17, 2007 at 06:47:16PM +0300, Oleg Nesterov wrote:
> > > What do you mean by "currently" executing work? worker thread executing
> > > some work on the cpu? That is not possible, because all threads are
> > > frozen at this point. There cant be any ongoing flush_workxxx() as well
> > > because of this, which should avoid breaking flush_workxxx() ..
> > 
> > work->func() sleeps/freezes. 
> 
> Didn't Andrew call that (work->func calling try_to_freeze) madness?
> 
> 	http://lkml.org/lkml/2007/01/07/166

This means we should move try_to_freeze() to run_workqueue() or we should
forbid auto-requeued works.

I don't have a time to do anything till weekend, but please see a "final"
version below.

	- Do you see any bugs?

	- Do you agree it is better than we have now?

If/when we use freezer/lock_cpu_hotplug() we probably can simplify things
further.

Note: schedule_on_each_cpu() remains broken.

Oleg.

struct cpu_workqueue_srtuct {
	...
	int should_stop;
	...
};

static cpumask_t cpu_populated_map __read_mostly; //also used in flush_work...
static int embryonic_cpu __read_mostly = -1;

/*
 * NOTE: the caller must not touch *cwq if this func returns true
 */
static inline int cwq_should_stop(struct cpu_workqueue_struct *cwq)
{
	int should_stop = cwq->should_stop;

	if (unlikely(should_stop)) {
		spin_lock_irq(&cwq->lock);
		should_stop = cwq->should_stop && list_empty(&cwq->worklist);
		if (should_stop)
			cwq->thread = NULL;
		spin_unlock_irq(&cwq->lock);
	}

	return should_stop;
}

static int worker_thread(void *cwq)
{
	...
	while (!cwq_should_stop(cwq)) {
		if (cwq->wq->freezeable)
			try_to_freeze();

		prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);
		if (!cwq->should_stop && list_empty(&cwq->worklist))
			schedule();
		finish_wait(&cwq->more_work, &wait);

		run_workqueue(cwq);
	}
	return 0;
}

static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
{
	struct task_struct *p;

	spin_lock_irq(&cwq->lock);
	cwq->should_stop = 0;
	p = cwq->thread;
	spin_unlock_irq(&cwq->lock);

	if (!p) {
		struct workqueue_struct *wq = cwq->wq;
		const char *fmt = is_single_threaded(wq) ? "%s" : "%s/%d";

		p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu);
		/*
		 * Nobody can add the work_struct to this cwq,
		 *	if (caller is __create_workqueue)
		 *		nobody should see this wq
		 *	else // caller is CPU_UP_PREPARE
		 *		cpu is not on cpu_online_map
		 * so we can abort safely.
		 */
		if (IS_ERR(p))
			return PTR_ERR(p);

		if (!is_single_threaded(wq))
			kthread_bind(p, cpu);
		/*
		 * Cancels affinity if the caller is CPU_UP_PREPARE.
		 * Needs a cleanup, but OK.
		 */
		wake_up_process(p);
		cwq->thread = p;
	}

	return 0;
}

struct workqueue_struct *__create_workqueue(const char *name,
					    int singlethread, int freezeable)
{
	struct workqueue_struct *wq;
	struct cpu_workqueue_struct *cwq;
	int err = 0, cpu;

	wq = kzalloc(sizeof(*wq), GFP_KERNEL);
	if (!wq)
		return NULL;

	wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct);
	if (!wq->cpu_wq) {
		kfree(wq);
		return NULL;
	}

	wq->name = name;
	wq->freezeable = freezeable;

	if (singlethread) {
		INIT_LIST_HEAD(&wq->list);
		cwq = init_cpu_workqueue(wq, singlethread_cpu);
		err = create_workqueue_thread(cwq, singlethread_cpu);
	} else {
		mutex_lock(&workqueue_mutex);
		list_add(&wq->list, &workqueues);

		for_each_possible_cpu(cpu) {
			cwq = init_cpu_workqueue(wq, cpu);
			if (err || !(cpu_online(cpu) || cpu == embryonic_cpu))
				continue;
			err = create_workqueue_thread(cwq, cpu);
		}
		mutex_unlock(&workqueue_mutex);
	}

	if (err) {
		destroy_workqueue(wq);
		wq = NULL;
	}
	return wq;
}

static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu)
{
	struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
	struct wq_barrier barr;
	int alive = 0;

	spin_lock_irq(&cwq->lock);
	if (cwq->thread != NULL) {
		insert_wq_barrier(cwq, &barr, 1);
		cwq->should_stop = 1;
		alive = 1;
	}
	spin_unlock_irq(&cwq->lock);

	if (alive) {
		wait_for_completion(&barr.done);

		while (unlikely(cwq->thread != NULL))
			cpu_relax();
		/*
		 * Wait until cwq->thread unlocks cwq->lock,
		 * it won't touch *cwq after that.
		 */
		smp_rmb();
		spin_unlock_wait(&cwq->lock);
	}
}

void destroy_workqueue(struct workqueue_struct *wq)
{
	if (is_single_threaded(wq))
		cleanup_workqueue_thread(wq, singlethread_cpu);
	else {
		int cpu;

		mutex_lock(&workqueue_mutex);
		list_del(&wq->list);
		mutex_unlock(&workqueue_mutex);

		for_each_cpu_mask(cpu, cpu_populated_map)
			cleanup_workqueue_thread(wq, cpu);
	}

	free_percpu(wq->cpu_wq);
	kfree(wq);
}

static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
						unsigned long action,
						void *hcpu)
{
	struct workqueue_struct *wq;
	struct cpu_workqueue_struct *cwq;
	unsigned int cpu = (unsigned long)hcpu;
	int ret = NOTIFY_OK;

	mutex_lock(&workqueue_mutex);
	embryonic_cpu = -1;
	if (action == CPU_UP_PREPARE) {
		cpu_set(cpu, cpu_populated_map);
		embryonic_cpu = cpu;
	}

	list_for_each_entry(wq, &workqueues, list) {
		cwq = per_cpu_ptr(wq->cpu_wq, cpu);

		switch (action) {
		case CPU_UP_PREPARE:
			if (create_workqueue_thread(cwq, cpu))
				ret = NOTIFY_BAD;
			break;

		case CPU_ONLINE:
			set_cpus_allowed(cwq->thread, cpumask_of_cpu(cpu));
			break;

		case CPU_UP_CANCELED:
		case CPU_DEAD:
			cwq->should_stop = 1;
			wake_up(&cwq->more_work);
			break;
		}

		if (ret != NOTIFY_OK) {
			printk(KERN_ERR "workqueue for %i failed\n", cpu);
			break;
		}
	}
	mutex_unlock(&workqueue_mutex);

	return ret;
}

void init_workqueues(void)
{
	...
	cpu_populated_map = cpu_online_map;
	...
}


^ permalink raw reply	[flat|nested] 78+ messages in thread

end of thread, other threads:[~2007-01-17 17:03 UTC | newest]

Thread overview: 78+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-12-17 22:34 [PATCH, RFC] reimplement flush_workqueue() Oleg Nesterov
2006-12-18  3:09 ` Linus Torvalds
2006-12-19  0:27 ` Andrew Morton
2006-12-19  0:43   ` Oleg Nesterov
2006-12-19  1:00     ` Andrew Morton
2007-01-04 11:32     ` Srivatsa Vaddagiri
2007-01-04 14:29       ` Oleg Nesterov
2007-01-04 15:56         ` Srivatsa Vaddagiri
2007-01-04 16:31           ` Oleg Nesterov
2007-01-04 16:57             ` Srivatsa Vaddagiri
2007-01-04 17:18         ` Andrew Morton
2007-01-04 18:09           ` Oleg Nesterov
2007-01-04 18:31             ` Andrew Morton
2007-01-05  9:03               ` Srivatsa Vaddagiri
2007-01-05 14:07                 ` Oleg Nesterov
2007-01-06 15:24                   ` Srivatsa Vaddagiri
2007-01-05  8:56           ` Srivatsa Vaddagiri
2007-01-05 12:42             ` Oleg Nesterov
2007-01-06 15:11               ` Srivatsa Vaddagiri
2007-01-06 15:10           ` [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update Oleg Nesterov
2007-01-06 15:45             ` Srivatsa Vaddagiri
2007-01-06 16:30               ` Oleg Nesterov
2007-01-06 16:38                 ` Srivatsa Vaddagiri
2007-01-06 17:34                   ` Oleg Nesterov
2007-01-07 10:43                     ` Srivatsa Vaddagiri
2007-01-07 12:56                       ` Oleg Nesterov
2007-01-07 14:22                         ` Oleg Nesterov
2007-01-07 14:42                           ` Oleg Nesterov
2007-01-07 16:43                           ` Srivatsa Vaddagiri
2007-01-07 17:01                             ` Srivatsa Vaddagiri
2007-01-07 17:33                               ` Oleg Nesterov
2007-01-07 17:18                             ` Oleg Nesterov
2007-01-07 16:21                         ` Srivatsa Vaddagiri
2007-01-07 17:09                           ` Oleg Nesterov
2007-01-06 19:11                   ` Andrew Morton
2007-01-06 19:13                     ` Ingo Molnar
2007-01-07 11:00                     ` Srivatsa Vaddagiri
2007-01-07 19:59                       ` Andrew Morton
2007-01-07 21:01                         ` [PATCH] flush_cpu_workqueue: don't flush an empty ->worklist Oleg Nesterov
2007-01-08 23:54                           ` Andrew Morton
2007-01-09  5:04                             ` Srivatsa Vaddagiri
2007-01-09  5:26                               ` Andrew Morton
2007-01-09  6:56                                 ` Ingo Molnar
2007-01-09  9:33                                 ` Srivatsa Vaddagiri
2007-01-09  9:44                                   ` Ingo Molnar
2007-01-09  9:51                                   ` Andrew Morton
2007-01-09 10:09                                     ` Srivatsa Vaddagiri
2007-01-09 10:15                                       ` Andrew Morton
2007-01-09 15:07                                 ` Oleg Nesterov
2007-01-09 15:59                                   ` Srivatsa Vaddagiri
2007-01-09 16:38                                     ` Oleg Nesterov
2007-01-09 16:46                                       ` Srivatsa Vaddagiri
2007-01-09 16:56                                         ` Oleg Nesterov
2007-01-14 23:54                                           ` Oleg Nesterov
2007-01-15  4:33                                             ` Srivatsa Vaddagiri
2007-01-15 12:54                                               ` Oleg Nesterov
2007-01-15 13:08                                                 ` Oleg Nesterov
2007-01-15 16:18                                                 ` Srivatsa Vaddagiri
2007-01-15 16:55                                                   ` Oleg Nesterov
2007-01-16  5:26                                                     ` Srivatsa Vaddagiri
2007-01-16 13:27                                                       ` Oleg Nesterov
2007-01-17  6:17                                                         ` Srivatsa Vaddagiri
2007-01-17 15:47                                                           ` Oleg Nesterov
2007-01-17 16:12                                                             ` Srivatsa Vaddagiri
2007-01-17 17:01                                                               ` Oleg Nesterov
2007-01-17 16:25                                                             ` Srivatsa Vaddagiri
2007-01-07 21:51                         ` [PATCH] fix-flush_workqueue-vs-cpu_dead-race-update Oleg Nesterov
2007-01-08 15:22                           ` Srivatsa Vaddagiri
2007-01-08 15:56                             ` Oleg Nesterov
2007-01-08 16:31                               ` Srivatsa Vaddagiri
2007-01-08 17:06                                 ` Oleg Nesterov
2007-01-08 18:37                                   ` Pallipadi, Venkatesh
2007-01-09  1:11                                     ` Srivatsa Vaddagiri
2007-01-09  4:39                                   ` Srivatsa Vaddagiri
2007-01-09 14:38                                     ` Oleg Nesterov
2007-01-08 15:37                         ` Srivatsa Vaddagiri
2007-01-04 12:02 ` [PATCH, RFC] reimplement flush_workqueue() Srivatsa Vaddagiri
2007-01-04 14:38   ` Oleg Nesterov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).