All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC][PATCH 0/3] Move WQ,unplug scheduler hooks
@ 2011-06-21 23:34 Peter Zijlstra
  2011-06-21 23:34 ` [RFC][PATCH 1/3] sched, block: Move unplug Peter Zijlstra
                   ` (2 more replies)
  0 siblings, 3 replies; 16+ messages in thread
From: Peter Zijlstra @ 2011-06-21 23:34 UTC (permalink / raw)
  To: linux-kernel
  Cc: Linus Torvalds, Ingo Molnar, Thomas Gleixner, Peter Zijlstra,
	Tejun Heo, Jens Axboe

Thomas got annoyed at the amount of work done while having IRQs disabled.



^ permalink raw reply	[flat|nested] 16+ messages in thread

* [RFC][PATCH 1/3] sched, block: Move unplug
  2011-06-21 23:34 [RFC][PATCH 0/3] Move WQ,unplug scheduler hooks Peter Zijlstra
@ 2011-06-21 23:34 ` Peter Zijlstra
  2011-06-22  7:01   ` Jens Axboe
  2011-06-21 23:34 ` [RFC][PATCH 2/3] sched, workqueue: Move WQ-sleeper wakeup Peter Zijlstra
  2011-06-21 23:34 ` [RFC][PATCH 3/3] block: Break long IRQ disabled region Peter Zijlstra
  2 siblings, 1 reply; 16+ messages in thread
From: Peter Zijlstra @ 2011-06-21 23:34 UTC (permalink / raw)
  To: linux-kernel
  Cc: Linus Torvalds, Ingo Molnar, Thomas Gleixner, Peter Zijlstra,
	Tejun Heo, Jens Axboe

[-- Attachment #1: tglx1 --]
[-- Type: text/plain, Size: 4847 bytes --]

Thomas found that we're doing a horrendous amount of work in that scheduler
unplug hook while having preempt and IRQs disabled.

Move it to the head of schedule() where both preemption and IRQs are enabled
such that we don't get these silly long IRQ/preempt disable times.

This allows us to remove a lot of special magic in the unplug path,
simplifying that code as a bonus.

Cc: Jens Axboe <jaxboe@fusionio.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 block/blk-core.c       |   28 ++++++++--------------------
 include/linux/blkdev.h |   12 ++----------
 kernel/sched.c         |   26 ++++++++++++++++----------
 3 files changed, 26 insertions(+), 40 deletions(-)

Index: linux-2.6/block/blk-core.c
===================================================================
--- linux-2.6.orig/block/blk-core.c
+++ linux-2.6/block/blk-core.c
@@ -2655,25 +2655,13 @@ static int plug_rq_cmp(void *priv, struc
  * additional stack usage in driver dispatch, in places where the originally
  * plugger did not intend it.
  */
-static void queue_unplugged(struct request_queue *q, unsigned int depth,
-			    bool from_schedule)
+static void queue_unplugged(struct request_queue *q, unsigned int depth)
 	__releases(q->queue_lock)
 {
-	trace_block_unplug(q, depth, !from_schedule);
-
-	/*
-	 * If we are punting this to kblockd, then we can safely drop
-	 * the queue_lock before waking kblockd (which needs to take
-	 * this lock).
-	 */
-	if (from_schedule) {
-		spin_unlock(q->queue_lock);
-		blk_run_queue_async(q);
-	} else {
-		__blk_run_queue(q);
-		spin_unlock(q->queue_lock);
-	}
+	trace_block_unplug(q, depth, true);
 
+	__blk_run_queue(q);
+	spin_unlock(q->queue_lock);
 }
 
 static void flush_plug_callbacks(struct blk_plug *plug)
@@ -2694,7 +2682,7 @@ static void flush_plug_callbacks(struct 
 	}
 }
 
-void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
+void blk_flush_plug_list(struct blk_plug *plug)
 {
 	struct request_queue *q;
 	unsigned long flags;
@@ -2732,7 +2720,7 @@ void blk_flush_plug_list(struct blk_plug
 			 * This drops the queue lock
 			 */
 			if (q)
-				queue_unplugged(q, depth, from_schedule);
+				queue_unplugged(q, depth);
 			q = rq->q;
 			depth = 0;
 			spin_lock(q->queue_lock);
@@ -2752,14 +2740,14 @@ void blk_flush_plug_list(struct blk_plug
 	 * This drops the queue lock
 	 */
 	if (q)
-		queue_unplugged(q, depth, from_schedule);
+		queue_unplugged(q, depth);
 
 	local_irq_restore(flags);
 }
 
 void blk_finish_plug(struct blk_plug *plug)
 {
-	blk_flush_plug_list(plug, false);
+	blk_flush_plug_list(plug);
 
 	if (plug == current->plug)
 		current->plug = NULL;
Index: linux-2.6/include/linux/blkdev.h
===================================================================
--- linux-2.6.orig/include/linux/blkdev.h
+++ linux-2.6/include/linux/blkdev.h
@@ -870,22 +870,14 @@ struct blk_plug_cb {
 
 extern void blk_start_plug(struct blk_plug *);
 extern void blk_finish_plug(struct blk_plug *);
-extern void blk_flush_plug_list(struct blk_plug *, bool);
+extern void blk_flush_plug_list(struct blk_plug *);
 
 static inline void blk_flush_plug(struct task_struct *tsk)
 {
 	struct blk_plug *plug = tsk->plug;
 
 	if (plug)
-		blk_flush_plug_list(plug, false);
-}
-
-static inline void blk_schedule_flush_plug(struct task_struct *tsk)
-{
-	struct blk_plug *plug = tsk->plug;
-
-	if (plug)
-		blk_flush_plug_list(plug, true);
+		blk_flush_plug_list(plug);
 }
 
 static inline bool blk_needs_flush_plug(struct task_struct *tsk)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -4209,6 +4209,20 @@ pick_next_task(struct rq *rq)
 	BUG(); /* the idle class will always have a runnable task */
 }
 
+static inline void sched_submit_work(void)
+{
+	struct task_struct *tsk = current;
+
+	if (tsk->state && !(preempt_count() & PREEMPT_ACTIVE)) {
+		/*
+		 * If we are going to sleep and we have plugged IO
+		 * queued, make sure to submit it to avoid deadlocks.
+		 */
+		if (blk_needs_flush_plug(tsk))
+			blk_flush_plug(tsk);
+	}
+}
+
 /*
  * schedule() is the main scheduler function.
  */
@@ -4219,6 +4233,8 @@ asmlinkage void __sched schedule(void)
 	struct rq *rq;
 	int cpu;
 
+	sched_submit_work();
+
 need_resched:
 	preempt_disable();
 	cpu = smp_processor_id();
@@ -4253,16 +4269,6 @@ asmlinkage void __sched schedule(void)
 				if (to_wakeup)
 					try_to_wake_up_local(to_wakeup);
 			}
-
-			/*
-			 * If we are going to sleep and we have plugged IO
-			 * queued, make sure to submit it to avoid deadlocks.
-			 */
-			if (blk_needs_flush_plug(prev)) {
-				raw_spin_unlock(&rq->lock);
-				blk_schedule_flush_plug(prev);
-				raw_spin_lock(&rq->lock);
-			}
 		}
 		switch_count = &prev->nvcsw;
 	}



^ permalink raw reply	[flat|nested] 16+ messages in thread

* [RFC][PATCH 2/3] sched, workqueue: Move WQ-sleeper wakeup
  2011-06-21 23:34 [RFC][PATCH 0/3] Move WQ,unplug scheduler hooks Peter Zijlstra
  2011-06-21 23:34 ` [RFC][PATCH 1/3] sched, block: Move unplug Peter Zijlstra
@ 2011-06-21 23:34 ` Peter Zijlstra
  2011-06-22  9:24   ` Tejun Heo
  2011-06-21 23:34 ` [RFC][PATCH 3/3] block: Break long IRQ disabled region Peter Zijlstra
  2 siblings, 1 reply; 16+ messages in thread
From: Peter Zijlstra @ 2011-06-21 23:34 UTC (permalink / raw)
  To: linux-kernel
  Cc: Linus Torvalds, Ingo Molnar, Thomas Gleixner, Peter Zijlstra,
	Tejun Heo, Jens Axboe

[-- Attachment #1: tglx2 --]
[-- Type: text/plain, Size: 2864 bytes --]

While Thomas was annoyed with the block bits, he found no reason for the WQ
bits to be as they are, the signal thing can only avoid a wakeup, extra wakeups
for the WQ bits increase concurrency (in as far as signals are relevant to
kthreads at all).

Moving the WQ bits allow us to do away with try_to_wake_up_local, removing
more lines.

Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/sched.c |   60 ++++++++++++---------------------------------------------
 1 file changed, 13 insertions(+), 47 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -2703,40 +2703,6 @@ try_to_wake_up(struct task_struct *p, un
 }
 
 /**
- * try_to_wake_up_local - try to wake up a local task with rq lock held
- * @p: the thread to be awakened
- *
- * Put @p on the run-queue if it's not already there. The caller must
- * ensure that this_rq() is locked, @p is bound to this_rq() and not
- * the current task.
- */
-static void try_to_wake_up_local(struct task_struct *p)
-{
-	struct rq *rq = task_rq(p);
-
-	BUG_ON(rq != this_rq());
-	BUG_ON(p == current);
-	lockdep_assert_held(&rq->lock);
-
-	if (!raw_spin_trylock(&p->pi_lock)) {
-		raw_spin_unlock(&rq->lock);
-		raw_spin_lock(&p->pi_lock);
-		raw_spin_lock(&rq->lock);
-	}
-
-	if (!(p->state & TASK_NORMAL))
-		goto out;
-
-	if (!p->on_rq)
-		ttwu_activate(rq, p, ENQUEUE_WAKEUP);
-
-	ttwu_do_wakeup(rq, p, 0);
-	ttwu_stat(p, smp_processor_id(), 0);
-out:
-	raw_spin_unlock(&p->pi_lock);
-}
-
-/**
  * wake_up_process - Wake up a specific process
  * @p: The process to be woken up.
  *
@@ -4215,6 +4181,19 @@ static inline void sched_submit_work(voi
 
 	if (tsk->state && !(preempt_count() & PREEMPT_ACTIVE)) {
 		/*
+		 * If a worker went to sleep, notify and ask workqueue
+		 * whether it wants to wake up a task to maintain
+		 * concurrency.
+		 */
+		if (tsk->flags & PF_WQ_WORKER) {
+			struct task_struct *to_wakeup;
+
+			to_wakeup = wq_worker_sleeping(tsk, smp_processor_id());
+			if (to_wakeup)
+				wake_up_process(to_wakeup);
+		}
+
+		/*
 		 * If we are going to sleep and we have plugged IO
 		 * queued, make sure to submit it to avoid deadlocks.
 		 */
@@ -4256,19 +4235,6 @@ asmlinkage void __sched schedule(void)
 		} else {
 			deactivate_task(rq, prev, DEQUEUE_SLEEP);
 			prev->on_rq = 0;
-
-			/*
-			 * If a worker went to sleep, notify and ask workqueue
-			 * whether it wants to wake up a task to maintain
-			 * concurrency.
-			 */
-			if (prev->flags & PF_WQ_WORKER) {
-				struct task_struct *to_wakeup;
-
-				to_wakeup = wq_worker_sleeping(prev, cpu);
-				if (to_wakeup)
-					try_to_wake_up_local(to_wakeup);
-			}
 		}
 		switch_count = &prev->nvcsw;
 	}



^ permalink raw reply	[flat|nested] 16+ messages in thread

* [RFC][PATCH 3/3] block: Break long IRQ disabled region
  2011-06-21 23:34 [RFC][PATCH 0/3] Move WQ,unplug scheduler hooks Peter Zijlstra
  2011-06-21 23:34 ` [RFC][PATCH 1/3] sched, block: Move unplug Peter Zijlstra
  2011-06-21 23:34 ` [RFC][PATCH 2/3] sched, workqueue: Move WQ-sleeper wakeup Peter Zijlstra
@ 2011-06-21 23:34 ` Peter Zijlstra
  2011-06-22  7:03   ` Jens Axboe
  2 siblings, 1 reply; 16+ messages in thread
From: Peter Zijlstra @ 2011-06-21 23:34 UTC (permalink / raw)
  To: linux-kernel
  Cc: Linus Torvalds, Ingo Molnar, Thomas Gleixner, Peter Zijlstra,
	Tejun Heo, Jens Axboe

[-- Attachment #1: tglx3 --]
[-- Type: text/plain, Size: 1470 bytes --]

Keeping IRQs disabled for long is not a feature.

Cc: Jens Axboe <jaxboe@fusionio.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 block/blk-core.c |   11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

Index: linux-2.6/block/blk-core.c
===================================================================
--- linux-2.6.orig/block/blk-core.c
+++ linux-2.6/block/blk-core.c
@@ -2661,7 +2661,7 @@ static void queue_unplugged(struct reque
 	trace_block_unplug(q, depth, true);
 
 	__blk_run_queue(q);
-	spin_unlock(q->queue_lock);
+	spin_unlock_irq(q->queue_lock);
 }
 
 static void flush_plug_callbacks(struct blk_plug *plug)
@@ -2706,11 +2706,6 @@ void blk_flush_plug_list(struct blk_plug
 	q = NULL;
 	depth = 0;
 
-	/*
-	 * Save and disable interrupts here, to avoid doing it for every
-	 * queue lock we have to take.
-	 */
-	local_irq_save(flags);
 	while (!list_empty(&list)) {
 		rq = list_entry_rq(list.next);
 		list_del_init(&rq->queuelist);
@@ -2723,7 +2718,7 @@ void blk_flush_plug_list(struct blk_plug
 				queue_unplugged(q, depth);
 			q = rq->q;
 			depth = 0;
-			spin_lock(q->queue_lock);
+			spin_lock_irq(q->queue_lock);
 		}
 		/*
 		 * rq is already accounted, so use raw insert
@@ -2741,8 +2736,6 @@ void blk_flush_plug_list(struct blk_plug
 	 */
 	if (q)
 		queue_unplugged(q, depth);
-
-	local_irq_restore(flags);
 }
 
 void blk_finish_plug(struct blk_plug *plug)



^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC][PATCH 1/3] sched, block: Move unplug
  2011-06-21 23:34 ` [RFC][PATCH 1/3] sched, block: Move unplug Peter Zijlstra
@ 2011-06-22  7:01   ` Jens Axboe
  2011-06-22 13:53     ` Thomas Gleixner
  0 siblings, 1 reply; 16+ messages in thread
From: Jens Axboe @ 2011-06-22  7:01 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, Linus Torvalds, Ingo Molnar, Thomas Gleixner, Tejun Heo

On 2011-06-22 01:34, Peter Zijlstra wrote:
> Thomas found that we're doing a horrendous amount of work in that scheduler
> unplug hook while having preempt and IRQs disabled.
> 
> Move it to the head of schedule() where both preemption and IRQs are enabled
> such that we don't get these silly long IRQ/preempt disable times.
> 
> This allows us to remove a lot of special magic in the unplug path,
> simplifying that code as a bonus.

The major change here is moving the queue running inline, instead of
punting to a thread. The worry is/was that we risk blowing the stack if
something ends up blocking inadvertently further down the call path.
Since it's the unlikely way to unplug, a bit of latency was acceptable
to prevent this problem.

I'm curious why you made that change? It seems orthogonal to the change
you are actually describing in the commit message.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC][PATCH 3/3] block: Break long IRQ disabled region
  2011-06-21 23:34 ` [RFC][PATCH 3/3] block: Break long IRQ disabled region Peter Zijlstra
@ 2011-06-22  7:03   ` Jens Axboe
  0 siblings, 0 replies; 16+ messages in thread
From: Jens Axboe @ 2011-06-22  7:03 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, Linus Torvalds, Ingo Molnar, Thomas Gleixner, Tejun Heo

On 2011-06-22 01:34, Peter Zijlstra wrote:
> Keeping IRQs disabled for long is not a feature.

This change looks fine to me. What kind of irq-off times did you see?

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC][PATCH 2/3] sched, workqueue: Move WQ-sleeper wakeup
  2011-06-21 23:34 ` [RFC][PATCH 2/3] sched, workqueue: Move WQ-sleeper wakeup Peter Zijlstra
@ 2011-06-22  9:24   ` Tejun Heo
  2011-06-22  9:27     ` Peter Zijlstra
  2011-06-22 13:47     ` Thomas Gleixner
  0 siblings, 2 replies; 16+ messages in thread
From: Tejun Heo @ 2011-06-22  9:24 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, Linus Torvalds, Ingo Molnar, Thomas Gleixner, Jens Axboe

Hello, Peter.

On Wed, Jun 22, 2011 at 01:34:46AM +0200, Peter Zijlstra wrote:
> -/**
>   * wake_up_process - Wake up a specific process
>   * @p: The process to be woken up.
>   *
> @@ -4215,6 +4181,19 @@ static inline void sched_submit_work(voi
>  
>  	if (tsk->state && !(preempt_count() & PREEMPT_ACTIVE)) {
>  		/*
> +		 * If a worker went to sleep, notify and ask workqueue
> +		 * whether it wants to wake up a task to maintain
> +		 * concurrency.
> +		 */
> +		if (tsk->flags & PF_WQ_WORKER) {
> +			struct task_struct *to_wakeup;
> +
> +			to_wakeup = wq_worker_sleeping(tsk, smp_processor_id());
> +			if (to_wakeup)
> +				wake_up_process(to_wakeup);
> +		}
> +
> +		/*

Preemption could still be enabled here, right?  What prevents
preemption kicking in after wq_worker_sleeping() and doing it again thus
breaking nr_running tracking.

>  		 * If we are going to sleep and we have plugged IO
>  		 * queued, make sure to submit it to avoid deadlocks.
>  		 */
> @@ -4256,19 +4235,6 @@ asmlinkage void __sched schedule(void)
>  		} else {
>  			deactivate_task(rq, prev, DEQUEUE_SLEEP);
>  			prev->on_rq = 0;
> -
> -			/*
> -			 * If a worker went to sleep, notify and ask workqueue
> -			 * whether it wants to wake up a task to maintain
> -			 * concurrency.
> -			 */
> -			if (prev->flags & PF_WQ_WORKER) {
> -				struct task_struct *to_wakeup;
> -
> -				to_wakeup = wq_worker_sleeping(prev, cpu);
> -				if (to_wakeup)
> -					try_to_wake_up_local(to_wakeup);
> -			}

Similarly, if the 'if {}' part of the above if/else is taken, the
task never goes to sleep and nr_running will again be broken.

-- 
tejun

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC][PATCH 2/3] sched, workqueue: Move WQ-sleeper wakeup
  2011-06-22  9:24   ` Tejun Heo
@ 2011-06-22  9:27     ` Peter Zijlstra
  2011-06-22  9:37       ` Tejun Heo
  2011-06-22 13:47     ` Thomas Gleixner
  1 sibling, 1 reply; 16+ messages in thread
From: Peter Zijlstra @ 2011-06-22  9:27 UTC (permalink / raw)
  To: Tejun Heo
  Cc: linux-kernel, Linus Torvalds, Ingo Molnar, Thomas Gleixner, Jens Axboe

On Wed, 2011-06-22 at 11:24 +0200, Tejun Heo wrote:
> Hello, Peter.
> 
> On Wed, Jun 22, 2011 at 01:34:46AM +0200, Peter Zijlstra wrote:
> > -/**
> >   * wake_up_process - Wake up a specific process
> >   * @p: The process to be woken up.
> >   *
> > @@ -4215,6 +4181,19 @@ static inline void sched_submit_work(voi
> >  
> >  	if (tsk->state && !(preempt_count() & PREEMPT_ACTIVE)) {
> >  		/*
> > +		 * If a worker went to sleep, notify and ask workqueue
> > +		 * whether it wants to wake up a task to maintain
> > +		 * concurrency.
> > +		 */
> > +		if (tsk->flags & PF_WQ_WORKER) {
> > +			struct task_struct *to_wakeup;
> > +
> > +			to_wakeup = wq_worker_sleeping(tsk, smp_processor_id());
> > +			if (to_wakeup)
> > +				wake_up_process(to_wakeup);
> > +		}
> > +
> > +		/*
> 
> Preemption could still be enabled here, right?  What prevents
> preemption kicking in after wq_worker_sleeping() and doing it again thus
> breaking nr_running tracking.

Aren't all PF_WQ_WORKER threads cpu-bound?

> >  		 * If we are going to sleep and we have plugged IO
> >  		 * queued, make sure to submit it to avoid deadlocks.
> >  		 */
> > @@ -4256,19 +4235,6 @@ asmlinkage void __sched schedule(void)
> >  		} else {
> >  			deactivate_task(rq, prev, DEQUEUE_SLEEP);
> >  			prev->on_rq = 0;
> > -
> > -			/*
> > -			 * If a worker went to sleep, notify and ask workqueue
> > -			 * whether it wants to wake up a task to maintain
> > -			 * concurrency.
> > -			 */
> > -			if (prev->flags & PF_WQ_WORKER) {
> > -				struct task_struct *to_wakeup;
> > -
> > -				to_wakeup = wq_worker_sleeping(prev, cpu);
> > -				if (to_wakeup)
> > -					try_to_wake_up_local(to_wakeup);
> > -			}
> 
> Similarly, if the 'if {}' part of the above if/else is taken, the
> task never goes to sleep and nr_running will again be broken.

Bah, indeed, I forgot about that, 2am isn't the best of times to do
these things.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC][PATCH 2/3] sched, workqueue: Move WQ-sleeper wakeup
  2011-06-22  9:27     ` Peter Zijlstra
@ 2011-06-22  9:37       ` Tejun Heo
  0 siblings, 0 replies; 16+ messages in thread
From: Tejun Heo @ 2011-06-22  9:37 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, Linus Torvalds, Ingo Molnar, Thomas Gleixner, Jens Axboe

On Wed, Jun 22, 2011 at 11:27:28AM +0200, Peter Zijlstra wrote:
> On Wed, 2011-06-22 at 11:24 +0200, Tejun Heo wrote:
> > Preemption could still be enabled here, right?  What prevents
> > preemption kicking in after wq_worker_sleeping() and doing it again thus
> > breaking nr_running tracking.
> 
> Aren't all PF_WQ_WORKER threads cpu-bound?

Hmmm... I don't see how that would matter.  Please consider the
following scenario.

* A work item calls schedule() for whatever reason.

* schedule() calls sched_submit_work() which in turn calls
  wq_worker_sleeping() which adjusts nr_running.  All this happens
  with preemption enabled.

* sched_submit_work() returns but before schedule() does
  preempt_disable(), an IRQ is delivered.  By the time IRQ return path
  is executed, TIF_NEED_RESCHED is set which in turn calls schedule()
  again and repeats the above two steps for the same worker.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC][PATCH 2/3] sched, workqueue: Move WQ-sleeper wakeup
  2011-06-22  9:24   ` Tejun Heo
  2011-06-22  9:27     ` Peter Zijlstra
@ 2011-06-22 13:47     ` Thomas Gleixner
  1 sibling, 0 replies; 16+ messages in thread
From: Thomas Gleixner @ 2011-06-22 13:47 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Peter Zijlstra, linux-kernel, Linus Torvalds, Ingo Molnar, Jens Axboe

On Wed, 22 Jun 2011, Tejun Heo wrote:
> On Wed, Jun 22, 2011 at 01:34:46AM +0200, Peter Zijlstra wrote:
> 
> Preemption could still be enabled here, right?  What prevents
> preemption kicking in after wq_worker_sleeping() and doing it again thus
> breaking nr_running tracking.

That wq code should serialize itself and not magically abuse rq->lock
for this.
 
> >  		 * If we are going to sleep and we have plugged IO
> >  		 * queued, make sure to submit it to avoid deadlocks.
> >  		 */
> > @@ -4256,19 +4235,6 @@ asmlinkage void __sched schedule(void)
> >  		} else {
> >  			deactivate_task(rq, prev, DEQUEUE_SLEEP);
> >  			prev->on_rq = 0;
> > -
> > -			/*
> > -			 * If a worker went to sleep, notify and ask workqueue
> > -			 * whether it wants to wake up a task to maintain
> > -			 * concurrency.
> > -			 */
> > -			if (prev->flags & PF_WQ_WORKER) {
> > -				struct task_struct *to_wakeup;
> > -
> > -				to_wakeup = wq_worker_sleeping(prev, cpu);
> > -				if (to_wakeup)
> > -					try_to_wake_up_local(to_wakeup);
> > -			}
> 
> > Similarly, if the 'if {}' part of the above if/else is taken, the
> task never goes to sleep and nr_running will again be broken.

The accounting can be done at schedule entry and exit and not
somewhere magic in the wakeup code. It does really not matter, whether
nr_running is updated on wakeup or when the worker gets on the cpu.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC][PATCH 1/3] sched, block: Move unplug
  2011-06-22  7:01   ` Jens Axboe
@ 2011-06-22 13:53     ` Thomas Gleixner
  2011-06-22 14:01       ` Jens Axboe
  0 siblings, 1 reply; 16+ messages in thread
From: Thomas Gleixner @ 2011-06-22 13:53 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Peter Zijlstra, linux-kernel, Linus Torvalds, Ingo Molnar, Tejun Heo

On Wed, 22 Jun 2011, Jens Axboe wrote:

> On 2011-06-22 01:34, Peter Zijlstra wrote:
> > Thomas found that we're doing a horrendous amount of work in that scheduler
> > unplug hook while having preempt and IRQs disabled.
> > 
> > Move it to the head of schedule() where both preemption and IRQs are enabled
> > such that we don't get these silly long IRQ/preempt disable times.
> > 
> > This allows us to remove a lot of special magic in the unplug path,
> > simplifying that code as a bonus.
> 
> The major change here is moving the queue running inline, instead of
> punting to a thread. The worry is/was that we risk blowing the stack if
> something ends up blocking inadvertently further down the call path.

Is that a real problem or just a "we have no clue what might happen"
countermeasure? The plug list should not be magically refilled once
it's split off so this should not recurse endlessly, right? If it does
then we better fix it at the root cause of the problem and not by
adding some last resort band aid into the scheduler code.

If the stack usage of that whole block code is the real issue, then we
probably need to keep that "delegate to async" workaround [sigh!], but
definitely outside of the scheduler core code.

> Since it's the unlikely way to unplug, a bit of latency was acceptable
> to prevent this problem.

It's not at all acceptable. There is no reason to hook stuff which
runs perfectly fine in preemptible code into the irq disabled region
of the scheduler internals.

> I'm curious why you made that change? It seems orthogonal to the change
> you are actually describing in the commit message.

Right, it should be split into two separate commits, one moving the
stuff out from the irq disabled region and the other removing that
from_schedule hackery. The latter can be dropped.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC][PATCH 1/3] sched, block: Move unplug
  2011-06-22 13:53     ` Thomas Gleixner
@ 2011-06-22 14:01       ` Jens Axboe
  2011-06-22 14:30         ` Thomas Gleixner
  0 siblings, 1 reply; 16+ messages in thread
From: Jens Axboe @ 2011-06-22 14:01 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Peter Zijlstra, linux-kernel, Linus Torvalds, Ingo Molnar, Tejun Heo

On 2011-06-22 15:53, Thomas Gleixner wrote:
> On Wed, 22 Jun 2011, Jens Axboe wrote:
> 
>> On 2011-06-22 01:34, Peter Zijlstra wrote:
>>> Thomas found that we're doing a horrendous amount of work in that scheduler
>>> unplug hook while having preempt and IRQs disabled.
>>>
>>> Move it to the head of schedule() where both preemption and IRQs are enabled
>>> such that we don't get these silly long IRQ/preempt disable times.
>>>
>>> This allows us to remove a lot of special magic in the unplug path,
>>> simplifying that code as a bonus.
>>
>> The major change here is moving the queue running inline, instead of
>> punting to a thread. The worry is/was that we risk blowing the stack if
>> something ends up blocking inadvertently further down the call path.
> 
> Is that a real problem or just a "we have no clue what might happen"
> countermeasure? The plug list should not be magically refilled once
> it's split off so this should not recurse endlessly, right? If it does
> then we better fix it at the root cause of the problem and not by
> adding some last resort band aid into the scheduler code.

It is supposedly a real problem, not just an inkling. It's not about
recursing indefinitely, the plug is fairly bounded. But the IO dispatch
path can be pretty deep, and if you hit that deep inside the reclaim or
file system write path, then you get dangerously close. Dave Chinner
posted some numbers in the 2.6.39-rc1 time frame showing how close we
got.

The scheduler hook has nothing to do with this, we need that regardless.
My objection was the conversion from async to sync run, something that
wasn't even mentioned in the patch description (yet it was the most
interesting part of the change).

> If the stack usage of that whole block code is the real issue, then we
> probably need to keep that "delegate to async" workaround [sigh!], but
> definitely outside of the scheduler core code.

Placement of the call is also orthogonal. The only requirements are
really:

- IFF the process is going to sleep, flush the plug list

Nothing more, nothing less. We can tolerate false positives, but as a
general rule it should only happen when the process goes to sleep.

>> Since it's the unlikely way to unplug, a bit of latency was acceptable
>> to prevent this problem.
> 
> It's not at all acceptable. There is no reason to hook stuff which
> runs perfectly fine in preemptible code into the irq disabled region
> of the scheduler internals.

We are talking past each other again. Flushing on going to sleep is
needed. Placement of that call was pretty much left in the hands of the
scheduler people. I personally don't care where it's put, as long as it
does what is needed.

>> I'm curious why you made that change? It seems orthogonal to the change
>> you are actually describing in the commit message.
> 
> Right, it should be split into two separate commits, one moving the
> stuff out from the irq disabled region and the other removing that
> from_schedule hackery. The latter can be dropped.

Exactly.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC][PATCH 1/3] sched, block: Move unplug
  2011-06-22 14:01       ` Jens Axboe
@ 2011-06-22 14:30         ` Thomas Gleixner
  2011-06-22 14:38           ` Peter Zijlstra
  0 siblings, 1 reply; 16+ messages in thread
From: Thomas Gleixner @ 2011-06-22 14:30 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Peter Zijlstra, linux-kernel, Linus Torvalds, Ingo Molnar, Tejun Heo

On Wed, 22 Jun 2011, Jens Axboe wrote:
> On 2011-06-22 15:53, Thomas Gleixner wrote:
> > On Wed, 22 Jun 2011, Jens Axboe wrote:
> > Is that a real problem or just a "we have no clue what might happen"
> > countermeasure? The plug list should not be magically refilled once
> > it's split off so this should not recurse endlessly, right? If it does
> > then we better fix it at the root cause of the problem and not by
> > adding some last resort band aid into the scheduler code.
> 
> It is supposedly a real problem, not just an inkling. It's not about
> recursing indefinitely, the plug is fairly bounded. But the IO dispatch
> path can be pretty deep, and if you hit that deep inside the reclaim or
> file system write path, then you get dangerously close. Dave Chinner
> posted some numbers in the 2.6.39-rc1 time frame showing how close we
> got.

Fair enough.
 
> We are talking past each other again. Flushing on going to sleep is
> needed. Placement of that call was pretty much left in the hands of the
> scheduler people. I personally don't care where it's put, as long as it
> does what is needed.

Ok. So we move it out and keep the from_scheduler flag so that code
does not go down the IO path from there.
 
Thanks,

	tglx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC][PATCH 1/3] sched, block: Move unplug
  2011-06-22 14:30         ` Thomas Gleixner
@ 2011-06-22 14:38           ` Peter Zijlstra
  2011-06-22 15:08             ` Vivek Goyal
  0 siblings, 1 reply; 16+ messages in thread
From: Peter Zijlstra @ 2011-06-22 14:38 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Jens Axboe, linux-kernel, Linus Torvalds, Ingo Molnar, Tejun Heo

On Wed, 2011-06-22 at 16:30 +0200, Thomas Gleixner wrote:
> > It is supposedly a real problem, not just an inkling. It's not about
> > recursing indefinitely, the plug is fairly bounded. But the IO dispatch
> > path can be pretty deep, and if you hit that deep inside the reclaim or
> > file system write path, then you get dangerously close. Dave Chinner
> > posted some numbers in the 2.6.39-rc1 time frame showing how close we
> > got.
> 
> Fair enough.

> Ok. So we move it out and keep the from_scheduler flag so that code
> does not go down the IO path from there. 

Won't punting the plug to a worker thread wreck all kinds of io
accounting due to the wrong task doing the actual io submission?

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC][PATCH 1/3] sched, block: Move unplug
  2011-06-22 14:38           ` Peter Zijlstra
@ 2011-06-22 15:08             ` Vivek Goyal
  2011-06-22 16:04               ` Jens Axboe
  0 siblings, 1 reply; 16+ messages in thread
From: Vivek Goyal @ 2011-06-22 15:08 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Thomas Gleixner, Jens Axboe, linux-kernel, Linus Torvalds,
	Ingo Molnar, Tejun Heo

On Wed, Jun 22, 2011 at 04:38:01PM +0200, Peter Zijlstra wrote:
> On Wed, 2011-06-22 at 16:30 +0200, Thomas Gleixner wrote:
> > > It is supposedly a real problem, not just an inkling. It's not about
> > > recursing indefinitely, the plug is fairly bounded. But the IO dispatch
> > > path can be pretty deep, and if you hit that deep inside the reclaim or
> > > file system write path, then you get dangerously close. Dave Chinner
> > > posted some numbers in the 2.6.39-rc1 time frame showing how close we
> > > got.
> > 
> > Fair enough.
> 
> > Ok. So we move it out and keep the from_scheduler flag so that code
> > does not go down the IO path from there. 
> 
> Won't punting the plug to a worker thread wreck all kinds of io
> accounting due to the wrong task doing the actual io submission?

I think all the accounting will be done in the IO submission path and
while IO is added to plug. This is just plug flush so should not
have effect on accounting.

Thanks
Vivek

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC][PATCH 1/3] sched, block: Move unplug
  2011-06-22 15:08             ` Vivek Goyal
@ 2011-06-22 16:04               ` Jens Axboe
  0 siblings, 0 replies; 16+ messages in thread
From: Jens Axboe @ 2011-06-22 16:04 UTC (permalink / raw)
  To: Vivek Goyal
  Cc: Peter Zijlstra, Thomas Gleixner, linux-kernel, Linus Torvalds,
	Ingo Molnar, Tejun Heo

On 2011-06-22 17:08, Vivek Goyal wrote:
> On Wed, Jun 22, 2011 at 04:38:01PM +0200, Peter Zijlstra wrote:
>> On Wed, 2011-06-22 at 16:30 +0200, Thomas Gleixner wrote:
>>>> It is supposedly a real problem, not just an inkling. It's not about
>>>> recursing indefinitely, the plug is fairly bounded. But the IO dispatch
>>>> path can be pretty deep, and if you hit that deep inside the reclaim or
>>>> file system write path, then you get dangerously close. Dave Chinner
>>>> posted some numbers in the 2.6.39-rc1 time frame showing how close we
>>>> got.
>>>
>>> Fair enough.
>>
>>> Ok. So we move it out and keep the from_scheduler flag so that code
>>> does not go down the IO path from there. 
>>
>> Won't punting the plug to a worker thread wreck all kinds of io
>> accounting due to the wrong task doing the actual io submission?
> 
> I think all the accounting will be done in the IO submission path and
> while IO is added to plug. This is just plug flush so should not
> have effect on accounting.

Exactly, this is just the insert operation, so no worries there. The
requests are fully "formulated".

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2011-06-22 16:04 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-06-21 23:34 [RFC][PATCH 0/3] Move WQ,unplug scheduler hooks Peter Zijlstra
2011-06-21 23:34 ` [RFC][PATCH 1/3] sched, block: Move unplug Peter Zijlstra
2011-06-22  7:01   ` Jens Axboe
2011-06-22 13:53     ` Thomas Gleixner
2011-06-22 14:01       ` Jens Axboe
2011-06-22 14:30         ` Thomas Gleixner
2011-06-22 14:38           ` Peter Zijlstra
2011-06-22 15:08             ` Vivek Goyal
2011-06-22 16:04               ` Jens Axboe
2011-06-21 23:34 ` [RFC][PATCH 2/3] sched, workqueue: Move WQ-sleeper wakeup Peter Zijlstra
2011-06-22  9:24   ` Tejun Heo
2011-06-22  9:27     ` Peter Zijlstra
2011-06-22  9:37       ` Tejun Heo
2011-06-22 13:47     ` Thomas Gleixner
2011-06-21 23:34 ` [RFC][PATCH 3/3] block: Break long IRQ disabled region Peter Zijlstra
2011-06-22  7:03   ` Jens Axboe

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.