* [patch] sched: fix b5d9d734 blunder in task_new_fair()
@ 2009-11-22 12:07 Mike Galbraith
  2009-11-24 13:51 ` Peter Zijlstra
  2009-11-26 16:26 ` Peter Zijlstra
  0 siblings, 2 replies; 17+ messages in thread
From: Mike Galbraith @ 2009-11-22 12:07 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar; +Cc: LKML


sched: fix b5d9d734 blunder in task_new_fair()

b5d9d734 fixed the problem of a forking task's child gaining vruntime..
IFF the child/parent shared a runqueue.  In the other case, it broke
fairness all to pieces by setting the child's vruntime to whatever task
happened to be current on the child's runqueue at wakeup time.  Fix this
by adding a sched_class::task_new parameter to give the class a chance to
prepare the child for wakeup.

At child wakeup time, lock the parent's rq (as the comment in task_new()
states), update the parent's stats (which must be done with the rq locked),
and call task_new() to prepare the child.  Unlock the parent's rq, select a
runqueue for the child, _then_ set_task_cpu() with the child's vruntime set
properly and both runqueue clocks updated to get the current offset.  Lock
the child's rq and proceed with wakeup.

Also, since setting scheduling policy requires the tasklist lock, move
sched_fork() under the tasklist lock in copy_process();

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>

---
 include/linux/sched.h |    2 +-
 kernel/fork.c         |    6 +++---
 kernel/sched.c        |   43 ++++++++++++++++++++++++++++---------------
 kernel/sched_fair.c   |   17 +++++++++++------
 4 files changed, 43 insertions(+), 25 deletions(-)

Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -1925,20 +1925,23 @@ static void task_tick_fair(struct rq *rq
  * Share the fairness runtime between parent and child, thus the
  * total amount of pressure for CPU stays equal - new tasks
  * get a chance to run but frequent forkers are not allowed to
- * monopolize the CPU. Note: the parent runqueue is locked,
- * the child is not running yet.
+ * monopolize the CPU. Note: the parent runqueue is locked at
+ * prep time, the child is not running yet.  At wakeup time,
+ * the child's runqueue is locked.
  */
-static void task_new_fair(struct rq *rq, struct task_struct *p)
+static void task_new_fair(struct rq *rq, struct task_struct *p, int prep)
 {
 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
 	struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
 	int this_cpu = smp_processor_id();
 
-	sched_info_queued(p);
-
 	update_curr(cfs_rq);
-	if (curr)
+
+	if (prep && curr) {
 		se->vruntime = curr->vruntime;
+		return;
+	}
+
 	place_entity(cfs_rq, se, 1);
 
 	/* 'curr' will be NULL if the child belongs to a different group */
@@ -1953,6 +1956,8 @@ static void task_new_fair(struct rq *rq,
 	}
 
 	enqueue_task_fair(rq, p, 0);
+
+	sched_info_queued(p);
 }
 
 /*
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -1099,7 +1099,7 @@ struct sched_class {
 
 	void (*set_curr_task) (struct rq *rq);
 	void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
-	void (*task_new) (struct rq *rq, struct task_struct *p);
+	void (*task_new) (struct rq *rq, struct task_struct *p, int prep);
 
 	void (*switched_from) (struct rq *this_rq, struct task_struct *task,
 			       int running);
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -2558,7 +2558,6 @@ static void __sched_fork(struct task_str
 void sched_fork(struct task_struct *p, int clone_flags)
 {
 	int cpu = get_cpu();
-	unsigned long flags;
 
 	__sched_fork(p);
 
@@ -2566,7 +2565,7 @@ void sched_fork(struct task_struct *p, i
 	 * Revert to default priority/policy on fork if requested.
 	 */
 	if (unlikely(p->sched_reset_on_fork)) {
-		if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
+		if (task_has_rt_policy(p)) {
 			p->policy = SCHED_NORMAL;
 			p->normal_prio = p->static_prio;
 		}
@@ -2589,16 +2588,10 @@ void sched_fork(struct task_struct *p, i
 	 */
 	p->prio = current->normal_prio;
 
-	if (!rt_prio(p->prio))
+	if (!task_has_rt_policy(p))
 		p->sched_class = &fair_sched_class;
 
-#ifdef CONFIG_SMP
-	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
-#endif
-	local_irq_save(flags);
-	update_rq_clock(cpu_rq(cpu));
-	set_task_cpu(p, cpu);
-	local_irq_restore(flags);
+	__set_task_cpu(p, cpu);
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	if (likely(sched_info_on()))
@@ -2625,21 +2618,40 @@ void sched_fork(struct task_struct *p, i
  */
 void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 {
+	int cpu = get_cpu();
 	unsigned long flags;
-	struct rq *rq;
+	struct task_struct *parent = current;
+	struct rq *rq, *orig_rq;
 
-	rq = task_rq_lock(p, &flags);
+	smp_wmb();
+	rq = orig_rq = task_rq_lock(parent, &flags);
 	BUG_ON(p->state != TASK_RUNNING);
-	update_rq_clock(rq);
+	update_rq_clock(orig_rq);
 
-	if (!p->sched_class->task_new || !current->se.on_rq) {
+	if (p->sched_class->task_new)
+		p->sched_class->task_new(orig_rq, p, 1);
+#ifdef CONFIG_SMP
+	p->state = TASK_WAKING;
+	__task_rq_unlock(orig_rq);
+	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
+	rq = cpu_rq(cpu);
+	if (rq != orig_rq) {
+		update_rq_clock(rq);
+		set_task_cpu(p, cpu);
+	}
+	__task_rq_lock(p);
+	WARN_ON(p->state != TASK_WAKING);
+	p->state = TASK_RUNNING;
+#endif
+
+	if (!p->sched_class->task_new || !parent->se.on_rq) {
 		activate_task(rq, p, 0);
 	} else {
 		/*
 		 * Let the scheduling class do new task startup
 		 * management (if any):
 		 */
-		p->sched_class->task_new(rq, p);
+		p->sched_class->task_new(rq, p, 0);
 		inc_nr_running(rq);
 	}
 	trace_sched_wakeup_new(rq, p, 1);
@@ -2649,6 +2661,7 @@ void wake_up_new_task(struct task_struct
 		p->sched_class->task_wake_up(rq, p);
 #endif
 	task_rq_unlock(rq, &flags);
+	put_cpu();
 }
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
Index: linux-2.6/kernel/fork.c
===================================================================
--- linux-2.6.orig/kernel/fork.c
+++ linux-2.6/kernel/fork.c
@@ -1125,9 +1125,6 @@ static struct task_struct *copy_process(
 
 	p->stack_start = stack_start;
 
-	/* Perform scheduler related setup. Assign this task to a CPU. */
-	sched_fork(p, clone_flags);
-
 	retval = perf_event_init_task(p);
 	if (retval)
 		goto bad_fork_cleanup_policy;
@@ -1229,6 +1226,9 @@ static struct task_struct *copy_process(
 	/* Need tasklist lock for parent etc handling! */
 	write_lock_irq(&tasklist_lock);
 
+	/* Perform scheduler related setup. Assign this task to a CPU. */
+	sched_fork(p, clone_flags);
+
 	/*
 	 * The task hasn't been attached yet, so its cpus_allowed mask will
 	 * not be changed, nor will its assigned CPU.




* Re: [patch] sched: fix b5d9d734 blunder in task_new_fair()
  2009-11-22 12:07 [patch] sched: fix b5d9d734 blunder in task_new_fair() Mike Galbraith
@ 2009-11-24 13:51 ` Peter Zijlstra
  2009-11-24 17:07   ` Mike Galbraith
  2009-11-26 16:26 ` Peter Zijlstra
  1 sibling, 1 reply; 17+ messages in thread
From: Peter Zijlstra @ 2009-11-24 13:51 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Ingo Molnar, LKML

On Sun, 2009-11-22 at 13:07 +0100, Mike Galbraith wrote:
> sched: fix b5d9d734 blunder in task_new_fair()
> 
> b5d9d734 fixed the problem of a forking task's child gaining vruntime..
> IFF the child/parent shared a runqueue.  In the other case, it broke
> fairness all to pieces by setting the child's vruntime to whatever task
> happened to be current on the child's runqueue at wakeup time.  Fix this
> by adding a sched_class::task_new parameter to give the class a chance to
> prepare the child for wakeup.
> 
> At child wakeup time, lock the parent's rq (as the comment in task_new()
> states), update the parent's stats (which must be done with the rq locked),
> and call task_new() to prepare the child.  Unlock the parent's rq, select a
> runqueue for the child, _then_ set_task_cpu() with the child's vruntime set
> properly and both runqueue clocks updated to get the current offset.  Lock
> the child's rq and proceed with wakeup.
> 
> Also, since setting scheduling policy requires the tasklist lock, move
> sched_fork() under the tasklist lock in copy_process();

OK, so hopefully I managed to untangle this...

>  include/linux/sched.h |    2 +-
>  kernel/fork.c         |    6 +++---
>  kernel/sched.c        |   43 ++++++++++++++++++++++++++++---------------
>  kernel/sched_fair.c   |   17 +++++++++++------
>  4 files changed, 43 insertions(+), 25 deletions(-)
> 
> Index: linux-2.6/kernel/sched_fair.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched_fair.c
> +++ linux-2.6/kernel/sched_fair.c
> @@ -1925,20 +1925,23 @@ static void task_tick_fair(struct rq *rq
>   * Share the fairness runtime between parent and child, thus the
>   * total amount of pressure for CPU stays equal - new tasks
>   * get a chance to run but frequent forkers are not allowed to
> - * monopolize the CPU. Note: the parent runqueue is locked,
> - * the child is not running yet.
> + * monopolize the CPU. Note: the parent runqueue is locked at
> + * prep time, the child is not running yet.  At wakeup time,
> + * the child's runqueue is locked.
>   */
> -static void task_new_fair(struct rq *rq, struct task_struct *p)
> +static void task_new_fair(struct rq *rq, struct task_struct *p, int prep)
>  {
>  	struct cfs_rq *cfs_rq = task_cfs_rq(p);
>  	struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
>  	int this_cpu = smp_processor_id();
>  
> -	sched_info_queued(p);
> -
>  	update_curr(cfs_rq);
> -	if (curr)
> +
> +	if (prep && curr) {
>  		se->vruntime = curr->vruntime;
> +		return;
> +	}
> +
>  	place_entity(cfs_rq, se, 1);
>  
>  	/* 'curr' will be NULL if the child belongs to a different group */
> @@ -1953,6 +1956,8 @@ static void task_new_fair(struct rq *rq,
>  	}
>  
>  	enqueue_task_fair(rq, p, 0);
> +
> +	sched_info_queued(p);
>  }

I don't think we need to call this twice, but see below.

> Index: linux-2.6/kernel/sched.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched.c
> +++ linux-2.6/kernel/sched.c
> @@ -2558,7 +2558,6 @@ static void __sched_fork(struct task_str
>  void sched_fork(struct task_struct *p, int clone_flags)
>  {
>  	int cpu = get_cpu();
> -	unsigned long flags;
>  
>  	__sched_fork(p);
>  
> @@ -2566,7 +2565,7 @@ void sched_fork(struct task_struct *p, i
>  	 * Revert to default priority/policy on fork if requested.
>  	 */
>  	if (unlikely(p->sched_reset_on_fork)) {
> -		if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
> +		if (task_has_rt_policy(p)) {
>  			p->policy = SCHED_NORMAL;
>  			p->normal_prio = p->static_prio;
>  		}

While a nice change, it shouldn't have been mixed in I think.

> @@ -2589,16 +2588,10 @@ void sched_fork(struct task_struct *p, i
>  	 */
>  	p->prio = current->normal_prio;
>  
> -	if (!rt_prio(p->prio))
> +	if (!task_has_rt_policy(p))
>  		p->sched_class = &fair_sched_class;

And I suspect this one is actually buggy, see how rt_mutex_setprio()
only changes ->prio and ->sched_class, but leaves ->policy to the
original value?
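
For reference, a rough, abridged paraphrase of the relevant part of
rt_mutex_setprio() (from memory, not a verbatim quote):

void rt_mutex_setprio(struct task_struct *p, int prio)
{
	/* ... dequeue/put_prev_task bookkeeping elided ... */

	if (rt_prio(prio))
		p->sched_class = &rt_sched_class;
	else
		p->sched_class = &fair_sched_class;

	p->prio = prio;

	/* note: p->policy is never touched here */

	/* ... set_curr_task/enqueue bookkeeping elided ... */
}

i.e. a PI-boosted task can carry an rt prio and rt_sched_class while its
->policy is still SCHED_NORMAL, so testing the policy instead of the prio
is not an equivalent check.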

> -#ifdef CONFIG_SMP
> -	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
> -#endif
> -	local_irq_save(flags);
> -	update_rq_clock(cpu_rq(cpu));
> -	set_task_cpu(p, cpu);
> -	local_irq_restore(flags);
> +	__set_task_cpu(p, cpu);
>  
>  #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
>  	if (likely(sched_info_on()))

Remove cpu selection from sched_fork(), seems the sane thing to do.

> @@ -2625,21 +2618,40 @@ void sched_fork(struct task_struct *p, i
>   */
>  void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
>  {
> +	int cpu = get_cpu();
>  	unsigned long flags;
> -	struct rq *rq;
> +	struct task_struct *parent = current;
> +	struct rq *rq, *orig_rq;
>  
> -	rq = task_rq_lock(p, &flags);
> +	smp_wmb();
> +	rq = orig_rq = task_rq_lock(parent, &flags);
>  	BUG_ON(p->state != TASK_RUNNING);
> -	update_rq_clock(rq);
> +	update_rq_clock(orig_rq);
>  
> -	if (!p->sched_class->task_new || !current->se.on_rq) {
> +	if (p->sched_class->task_new)
> +		p->sched_class->task_new(orig_rq, p, 1);
> +#ifdef CONFIG_SMP
> +	p->state = TASK_WAKING;
> +	__task_rq_unlock(orig_rq);
> +	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
> +	rq = cpu_rq(cpu);
> +	if (rq != orig_rq) {
> +		update_rq_clock(rq);
> +		set_task_cpu(p, cpu);
> +	}
> +	__task_rq_lock(p);

[ should've been: rq = __task_rq_lock(p), because as we didn't hold the
rq->lock the task could have actually been migrated again after
set_task_cpu() ]

> +	WARN_ON(p->state != TASK_WAKING);
> +	p->state = TASK_RUNNING;
> +#endif
> +
> +	if (!p->sched_class->task_new || !parent->se.on_rq) {
>  		activate_task(rq, p, 0);
>  	} else {
>  		/*
>  		 * Let the scheduling class do new task startup
>  		 * management (if any):
>  		 */
> -		p->sched_class->task_new(rq, p);
> +		p->sched_class->task_new(rq, p, 0);
>  		inc_nr_running(rq);
>  	}
>  	trace_sched_wakeup_new(rq, p, 1);
> @@ -2649,6 +2661,7 @@ void wake_up_new_task(struct task_struct
>  		p->sched_class->task_wake_up(rq, p);
>  #endif
>  	task_rq_unlock(rq, &flags);
> +	put_cpu();
>  }

OK, so the general idea seems to be to call task_new(.prep=1) to update
and copy vruntime from the parent to the child _before_ we muck about
and move the child over to another cpu.

Then we muck about and move the thing to another cpu.

Then we call it again with .prep=0 to actually enqueue the thing.

So, the whole point of ->task_new() was to be able to poke at ->vruntime
before the regular enqueue, I think we folded the enqueue in in order to
avoid two class calls. But if you're going to do two calls, we might as
well use ->task_new() and ->enqueue_task() aka activate_task().

This leaves the problem that task_new() behaviour depends on knowing the
target cpu, could we solve that by relying on the fact that we're
executing on the original cpu, something like:

void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
{
	unsigned long flags;
	struct rq *rq, *orig_rq;
	int cpu = get_cpu();

	rq = orig_rq = task_rq_lock(p, &flags);
	BUG_ON(p->state != TASK_RUNNING);
	update_rq_clock(rq);

#ifdef CONFIG_SMP
	p->state = TASK_WAKING;
	__task_rq_unlock(rq);
	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
	rq = cpu_rq(cpu);
	if (rq != orig_rq) {
		update_rq_clock(rq);
		set_task_cpu(p, cpu);
	}
	rq = __task_rq_lock(p);
	WARN_ON(p->state != TASK_WAKING);
	p->state = TASK_RUNNING;
#endif

	if (p->sched_class->task_new) {
		/* can detect migration through: task_cpu(p) != smp_processor_id() */
		p->sched_class->task_new(rq, p);
	}

	activate_task(rq, p, 0);

	trace_sched_wakeup_new(rq, p, 1);
	check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
	if (p->sched_class->task_wake_up)
		p->sched_class->task_wake_up(rq, p);
#endif
	task_rq_unlock(rq, &flags);
	put_cpu();
}

> Index: linux-2.6/kernel/fork.c
> ===================================================================
> --- linux-2.6.orig/kernel/fork.c
> +++ linux-2.6/kernel/fork.c
> @@ -1125,9 +1125,6 @@ static struct task_struct *copy_process(
>  
>  	p->stack_start = stack_start;
>  
> -	/* Perform scheduler related setup. Assign this task to a CPU. */
> -	sched_fork(p, clone_flags);
> -
>  	retval = perf_event_init_task(p);
>  	if (retval)
>  		goto bad_fork_cleanup_policy;
> @@ -1229,6 +1226,9 @@ static struct task_struct *copy_process(
>  	/* Need tasklist lock for parent etc handling! */
>  	write_lock_irq(&tasklist_lock);
>  
> +	/* Perform scheduler related setup. Assign this task to a CPU. */
> +	sched_fork(p, clone_flags);
> +

You just invalidated that comment ;-)




* Re: [patch] sched: fix b5d9d734 blunder in task_new_fair()
  2009-11-24 13:51 ` Peter Zijlstra
@ 2009-11-24 17:07   ` Mike Galbraith
  2009-11-24 17:35     ` Peter Zijlstra
  0 siblings, 1 reply; 17+ messages in thread
From: Mike Galbraith @ 2009-11-24 17:07 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Ingo Molnar, LKML

On Tue, 2009-11-24 at 14:51 +0100, Peter Zijlstra wrote:

> > @@ -2566,7 +2565,7 @@ void sched_fork(struct task_struct *p, i
> >  	 * Revert to default priority/policy on fork if requested.
> >  	 */
> >  	if (unlikely(p->sched_reset_on_fork)) {
> > -		if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
> > +		if (task_has_rt_policy(p)) {
> >  			p->policy = SCHED_NORMAL;
> >  			p->normal_prio = p->static_prio;
> >  		}
> 
> While a nice change, it shouldn't have been mixed in I think.

Right, I'll remove it.

> > @@ -2589,16 +2588,10 @@ void sched_fork(struct task_struct *p, i
> >  	 */
> >  	p->prio = current->normal_prio;
> >  
> > -	if (!rt_prio(p->prio))
> > +	if (!task_has_rt_policy(p))
> >  		p->sched_class = &fair_sched_class;
> 
> And I suspect this one is actually buggy, see how rt_mutex_setprio()
> only changes ->prio and ->sched_class, but leaves ->policy to the
> original value?

In PI boosted or reset case, policy will be non-rt, but it's gone.

(hm. ? note to self: have another look at PI, and ask printk what
happens if a boosted rt task with sched_reset_on_fork set comes by)

> > @@ -2625,21 +2618,40 @@ void sched_fork(struct task_struct *p, i
> >   */
> >  void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
> >  {
> > +	int cpu = get_cpu();
> >  	unsigned long flags;
> > -	struct rq *rq;
> > +	struct task_struct *parent = current;
> > +	struct rq *rq, *orig_rq;
> >  
> > -	rq = task_rq_lock(p, &flags);
> > +	smp_wmb();
> > +	rq = orig_rq = task_rq_lock(parent, &flags);
> >  	BUG_ON(p->state != TASK_RUNNING);
> > -	update_rq_clock(rq);
> > +	update_rq_clock(orig_rq);
> >  
> > -	if (!p->sched_class->task_new || !current->se.on_rq) {
> > +	if (p->sched_class->task_new)
> > +		p->sched_class->task_new(orig_rq, p, 1);
> > +#ifdef CONFIG_SMP
> > +	p->state = TASK_WAKING;
> > +	__task_rq_unlock(orig_rq);
> > +	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
> > +	rq = cpu_rq(cpu);
> > +	if (rq != orig_rq) {
> > +		update_rq_clock(rq);
> > +		set_task_cpu(p, cpu);
> > +	}
> > +	__task_rq_lock(p);
> 
> [ should've been: rq = __task_rq_lock(p), because as we didn't hold the
> rq->lock the task could have actually been migrated again after
> set_task_cpu() ]

Oops, yes.

> > +	WARN_ON(p->state != TASK_WAKING);
> > +	p->state = TASK_RUNNING;
> > +#endif
> > +
> > +	if (!p->sched_class->task_new || !parent->se.on_rq) {
> >  		activate_task(rq, p, 0);
> >  	} else {
> >  		/*
> >  		 * Let the scheduling class do new task startup
> >  		 * management (if any):
> >  		 */
> > -		p->sched_class->task_new(rq, p);
> > +		p->sched_class->task_new(rq, p, 0);
> >  		inc_nr_running(rq);
> >  	}
> >  	trace_sched_wakeup_new(rq, p, 1);
> > @@ -2649,6 +2661,7 @@ void wake_up_new_task(struct task_struct
> >  		p->sched_class->task_wake_up(rq, p);
> >  #endif
> >  	task_rq_unlock(rq, &flags);
> > +	put_cpu();
> >  }
> 
> OK, so the general idea seems to be to call task_new(.prep=1) to update
> and copy vruntime from the parent to the child _before_ we muck about
> and move the child over to another cpu.

Right.  We need the parent's vruntime updated so the child cannot end up
left of it no matter where either party lives at wakeup time.  The child
has a copy of the parent's not yet updated vruntime.
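
In code terms the prep call amounts to little more than this (shown here
just for clarity, per the task_new_fair() hunk in the patch above):

update_curr(cfs_rq);		/* book the parent's outstanding runtime */
se->vruntime = curr->vruntime;	/* child starts from the updated value */

so whatever runtime the parent has accrued but not yet booked is folded in
before the child copies the value.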

> Then we muck about and move the thing to another cpu.
> 
> Then we call it again with .prep=0 to actually enqueue the thing.
> 
> So, the whole point of ->task_new() was to be able to poke at ->vruntime
> before the regular enqueue, I think we folded the enqueue in in order to
> avoid two class calls. But if you're going to do two calls, we might as
> well use ->task_new() and ->enqueue_task() aka activate_task().
> 
> This leaves the problem that task_new() behaviour depends on knowing the
> target cpu, could we solve that by relying on the fact that we're
> executing on the original cpu, something like:
> 
> void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
> {
> 	unsigned long flags;
> 	struct rq *rq, *orig_rq;
> 	int cpu = get_cpu();
> 
> 	rq = task_rq_lock(p, &flags);

That locks the child, but it's the parent, and the parent's runqueue, that
we must modify.  We can't do that unlocked.

> 	BUG_ON(p->state != TASK_RUNNING);
> 	update_rq_clock(rq);
> 
> #ifdef CONFIG_SMP
> 	p->state = TASK_WAKING;
> 	__task_rq_unlock(rq);
> 	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
> 	rq = cpu_rq(cpu);
> 	if (rq != orig_rq) {
> 		update_rq_clock(rq);
> 		set_task_cpu(p, cpu);
> 	}
> 	rq = __task_rq_lock(p);
> 	WARN_ON(p->state != TASK_WAKING);
> 	p->state = TASK_RUNNING;
> #endif
> 
> 	if (p->sched_class->task_new) {
> 		/* can detect migration through: task_cpu(p) != smp_processor_id() */

What if the parent was migrated before we got here?

> > @@ -1229,6 +1226,9 @@ static struct task_struct *copy_process(
> >  	/* Need tasklist lock for parent etc handling! */
> >  	write_lock_irq(&tasklist_lock);
> >  
> > +	/* Perform scheduler related setup. Assign this task to a CPU. */
> > +	sched_fork(p, clone_flags);
> > +
> 
> You just invalidated that comment ;-)

It's a dirty job, but _somebody_'s gotta do it ;-)  Will fix.

	-Mike



* Re: [patch] sched: fix b5d9d734 blunder in task_new_fair()
  2009-11-24 17:07   ` Mike Galbraith
@ 2009-11-24 17:35     ` Peter Zijlstra
  2009-11-24 17:54       ` Peter Zijlstra
  0 siblings, 1 reply; 17+ messages in thread
From: Peter Zijlstra @ 2009-11-24 17:35 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Ingo Molnar, LKML

On Tue, 2009-11-24 at 18:07 +0100, Mike Galbraith wrote:
> >       if (p->sched_class->task_new) {
> >               /* can detect migration through: task_cpu(p) != smp_processor_id() */
> 
> What if the parent was migrated before we got here?

Well, the only case it really matters for is the child_runs_first crap,
which is basically broken on SMP anyway, so I don't think we care too
much if we race here.

Unless I missed some detail that is ;-)



* Re: [patch] sched: fix b5d9d734 blunder in task_new_fair()
  2009-11-24 17:35     ` Peter Zijlstra
@ 2009-11-24 17:54       ` Peter Zijlstra
  2009-11-24 18:21         ` Mike Galbraith
  0 siblings, 1 reply; 17+ messages in thread
From: Peter Zijlstra @ 2009-11-24 17:54 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Ingo Molnar, LKML

On Tue, 2009-11-24 at 18:35 +0100, Peter Zijlstra wrote:
> On Tue, 2009-11-24 at 18:07 +0100, Mike Galbraith wrote:
> > >       if (p->sched_class->task_new) {
> > >               /* can detect migration through: task_cpu(p) != smp_processor_id() */
> > 
> > What if the parent was migrated before we got here?
> 
> Well, the only case it really matters for is the child_runs_first crap,
> which is basically broken on SMP anyway, so I don't think we care too
> much if we race here.
> 
> Unless I missed some detail that is ;-)


Also, we're running all this from the parent context, and we have
preemption disabled, we're not going anywhere.



* Re: [patch] sched: fix b5d9d734 blunder in task_new_fair()
  2009-11-24 17:54       ` Peter Zijlstra
@ 2009-11-24 18:21         ` Mike Galbraith
  2009-11-24 18:27           ` Peter Zijlstra
  0 siblings, 1 reply; 17+ messages in thread
From: Mike Galbraith @ 2009-11-24 18:21 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Ingo Molnar, LKML

On Tue, 2009-11-24 at 18:54 +0100, Peter Zijlstra wrote:
> On Tue, 2009-11-24 at 18:35 +0100, Peter Zijlstra wrote:
> > On Tue, 2009-11-24 at 18:07 +0100, Mike Galbraith wrote:
> > > >       if (p->sched_class->task_new) {
> > > >               /* can detect migration through: task_cpu(p) != smp_processor_id() */
> > > 
> > > What if the parent was migrated before we got here?
> > 
> > Well, the only case it really matters for is the child_runs_first crap,
> > which is basically broken on SMP anyway, so I don't think we care too
> > much if we race here.
> > 
> > Unless I missed some detail that is ;-)
> 
> 
> Also, we're running all this from the parent context, and we have
> preemption disabled, we're not going anywhere.

In sched_fork() and wake_up_new_task(), but in between?

	-Mike



* Re: [patch] sched: fix b5d9d734 blunder in task_new_fair()
  2009-11-24 18:21         ` Mike Galbraith
@ 2009-11-24 18:27           ` Peter Zijlstra
  2009-11-24 18:36             ` Mike Galbraith
  2009-11-25  6:57             ` Mike Galbraith
  0 siblings, 2 replies; 17+ messages in thread
From: Peter Zijlstra @ 2009-11-24 18:27 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Ingo Molnar, LKML

On Tue, 2009-11-24 at 19:21 +0100, Mike Galbraith wrote:
> On Tue, 2009-11-24 at 18:54 +0100, Peter Zijlstra wrote:
> > On Tue, 2009-11-24 at 18:35 +0100, Peter Zijlstra wrote:
> > > On Tue, 2009-11-24 at 18:07 +0100, Mike Galbraith wrote:
> > > > >       if (p->sched_class->task_new) {
> > > > >               /* can detect migration through: task_cpu(p) != smp_processor_id() */
> > > > 
> > > > What if the parent was migrated before we got here?
> > > 
> > > Well, the only case it really matters for is the child_runs_first crap,
> > > which is basically broken on SMP anyway, so I don't think we care too
> > > much if we race here.
> > > 
> > > Unless I missed some detail that is ;-)
> > 
> > 
> > Also, we're running all this from the parent context, and we have
> > preemption disabled, we're not going anywhere.
> 
> In sched_fork() and wake_up_new_task(), but in between?

Hmm, right, back to the previous argument then ;-)



* Re: [patch] sched: fix b5d9d734 blunder in task_new_fair()
  2009-11-24 18:27           ` Peter Zijlstra
@ 2009-11-24 18:36             ` Mike Galbraith
  2009-11-25  6:57             ` Mike Galbraith
  1 sibling, 0 replies; 17+ messages in thread
From: Mike Galbraith @ 2009-11-24 18:36 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Ingo Molnar, LKML

On Tue, 2009-11-24 at 19:27 +0100, Peter Zijlstra wrote:
> On Tue, 2009-11-24 at 19:21 +0100, Mike Galbraith wrote:
> > On Tue, 2009-11-24 at 18:54 +0100, Peter Zijlstra wrote:
> > > On Tue, 2009-11-24 at 18:35 +0100, Peter Zijlstra wrote:
> > > > On Tue, 2009-11-24 at 18:07 +0100, Mike Galbraith wrote:
> > > > > >       if (p->sched_class->task_new) {
> > > > > >               /* can detect migration through: task_cpu(p) != smp_processor_id() */
> > > > > 
> > > > > What if the parent was migrated before we got here?
> > > > 
> > > > Well, the only case it really matters for is the child_runs_first crap,
> > > > which is basically broken on SMP anyway, so I don't think we care too
> > > > much if we race here.
> > > > 
> > > > Unless I missed some detail that is ;-)
> > > 
> > > 
> > > Also, we're running all this from the parent context, and we have
> > > preemption disabled, we're not going anywhere.
> > 
> > In sched_fork() and wake_up_new_task(), but in between?
> 
> Hmm, right, back to the previous argument then ;-)

Yeah.

We can be preempted between original task struct copy and getting to
sched_fork(), and after leaving copy_process(), so I don't see any way
around lock parent, update and copy vruntime.  Whether we race in
placing the child wrt parent isn't a big deal, but the child's vruntime
is, as is fiddling with the parent's task struct and runqueue.

	-Mike



* Re: [patch] sched: fix b5d9d734 blunder in task_new_fair()
  2009-11-24 18:27           ` Peter Zijlstra
  2009-11-24 18:36             ` Mike Galbraith
@ 2009-11-25  6:57             ` Mike Galbraith
  2009-11-25  9:51               ` Peter Zijlstra
  1 sibling, 1 reply; 17+ messages in thread
From: Mike Galbraith @ 2009-11-25  6:57 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Ingo Molnar, LKML

On Tue, 2009-11-24 at 19:27 +0100, Peter Zijlstra wrote:
> On Tue, 2009-11-24 at 19:21 +0100, Mike Galbraith wrote:
> > On Tue, 2009-11-24 at 18:54 +0100, Peter Zijlstra wrote:
> > > On Tue, 2009-11-24 at 18:35 +0100, Peter Zijlstra wrote:
> > > > On Tue, 2009-11-24 at 18:07 +0100, Mike Galbraith wrote:
> > > > > >       if (p->sched_class->task_new) {
> > > > > >               /* can detect migration through: task_cpu(p) != smp_processor_id() */
> > > > > 
> > > > > What if the parent was migrated before we got here?
> > > > 
> > > > Well, the only case it really matters for is the child_runs_first crap,
> > > > which is basically broken on SMP anyway, so I don't think we care too
> > > > much if we race here.
> > > > 
> > > > Unless I missed some detail that is ;-)
> > > 
> > > 
> > > Also, we're running all this from the parent context, and we have
> > > preemption disabled, we're not going anywhere.
> > 
> > In sched_fork() and wake_up_new_task(), but in between?
> 
> Hmm, right, back to the previous argument then ;-)

Ok, how about this.  Neither task is going anywhere.  Don't worry about
what all may or may not have happened in between, just copy the parent's
vruntime as it sits NOW (wakeup time), add in any unaccounted parent
vruntime, and redo the child's offset.  The child's runqueue has just
been updated, the only vruntime missing, if any, is that sitting in the
parent.


sched: fix sched_fair.c::task_new_fair vruntime bug.

b5d9d734 fixed the problem of a forking task's child gaining vruntime..
IFF the child/parent shared a runqueue.  In the other case, it broke
fairness all to pieces by setting the child's vruntime to whatever task
happened to be current on the child's runqueue at wakeup time.  Fix this
by selecting a runqueue for the child at wakeup time, copying the parent's
vruntime, adding any unaccounted vruntime, and redoing cross cpu offset.

Also, since setting scheduling policy requires the tasklist lock, move
sched_fork() under the tasklist lock in copy_process();

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>

---
 kernel/fork.c       |    9 ++++++---
 kernel/sched.c      |   35 +++++++++++++++++++++++------------
 kernel/sched_fair.c |   31 +++++++++++++++++++++++++------
 3 files changed, 54 insertions(+), 21 deletions(-)

Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -1925,20 +1925,37 @@ static void task_tick_fair(struct rq *rq
  * Share the fairness runtime between parent and child, thus the
  * total amount of pressure for CPU stays equal - new tasks
  * get a chance to run but frequent forkers are not allowed to
- * monopolize the CPU. Note: the parent runqueue is locked,
- * the child is not running yet.
+ * monopolize the CPU. Note: the child's runqueue is locked, the
+ * child is not yet running, and the parent is not preemptible.
  */
 static void task_new_fair(struct rq *rq, struct task_struct *p)
 {
 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
 	struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
+	struct sched_entity *pse = &current->se;
 	int this_cpu = smp_processor_id();
 
-	sched_info_queued(p);
-
 	update_curr(cfs_rq);
-	if (curr)
-		se->vruntime = curr->vruntime;
+
+	if (is_same_group(se, pse)) {
+		se->vruntime = pse->vruntime;
+
+		/*
+		 * If we're not sharing a runqueue, redo the child's vruntime
+		 * offset after accounting for any yet to be booked vruntime.
+		 */
+		if (this_cpu != task_cpu(p)) {
+			struct cfs_rq *old_cfs_rq = cfs_rq_of(pse);
+			u64 now = cpu_rq(this_cpu)->clock;
+			unsigned long delta_exec = now - pse->exec_start;
+
+			delta_exec = calc_delta_fair(delta_exec, se);
+			se->vruntime += delta_exec;
+			se->vruntime -= old_cfs_rq->min_vruntime -
+						cfs_rq->min_vruntime;
+		}
+	}
+
 	place_entity(cfs_rq, se, 1);
 
 	/* 'curr' will be NULL if the child belongs to a different group */
@@ -1953,6 +1970,8 @@ static void task_new_fair(struct rq *rq,
 	}
 
 	enqueue_task_fair(rq, p, 0);
+
+	sched_info_queued(p);
 }
 
 /*
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -2558,7 +2558,6 @@ static void __sched_fork(struct task_str
 void sched_fork(struct task_struct *p, int clone_flags)
 {
 	int cpu = get_cpu();
-	unsigned long flags;
 
 	__sched_fork(p);
 
@@ -2592,13 +2591,7 @@ void sched_fork(struct task_struct *p, i
 	if (!rt_prio(p->prio))
 		p->sched_class = &fair_sched_class;
 
-#ifdef CONFIG_SMP
-	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
-#endif
-	local_irq_save(flags);
-	update_rq_clock(cpu_rq(cpu));
-	set_task_cpu(p, cpu);
-	local_irq_restore(flags);
+	__set_task_cpu(p, cpu);
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	if (likely(sched_info_on()))
@@ -2625,14 +2618,31 @@ void sched_fork(struct task_struct *p, i
  */
 void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 {
+	int cpu = get_cpu();
 	unsigned long flags;
-	struct rq *rq;
+	struct task_struct *parent = current;
+	struct rq *rq, *orig_rq;
 
-	rq = task_rq_lock(p, &flags);
+	smp_wmb();
+	rq = orig_rq = task_rq_lock(parent, &flags);
 	BUG_ON(p->state != TASK_RUNNING);
-	update_rq_clock(rq);
+	update_rq_clock(orig_rq);
 
-	if (!p->sched_class->task_new || !current->se.on_rq) {
+#ifdef CONFIG_SMP
+	p->state = TASK_WAKING;
+	__task_rq_unlock(rq);
+	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
+	rq = cpu_rq(cpu);
+	if (rq != orig_rq) {
+		update_rq_clock(rq);
+		set_task_cpu(p, cpu);
+	}
+	rq = __task_rq_lock(p);
+	WARN_ON(p->state != TASK_WAKING);
+	p->state = TASK_RUNNING;
+#endif
+
+	if (!p->sched_class->task_new || !parent->se.on_rq) {
 		activate_task(rq, p, 0);
 	} else {
 		/*
@@ -2649,6 +2659,7 @@ void wake_up_new_task(struct task_struct
 		p->sched_class->task_wake_up(rq, p);
 #endif
 	task_rq_unlock(rq, &flags);
+	put_cpu();
 }
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
Index: linux-2.6/kernel/fork.c
===================================================================
--- linux-2.6.orig/kernel/fork.c
+++ linux-2.6/kernel/fork.c
@@ -1125,9 +1125,6 @@ static struct task_struct *copy_process(
 
 	p->stack_start = stack_start;
 
-	/* Perform scheduler related setup. Assign this task to a CPU. */
-	sched_fork(p, clone_flags);
-
 	retval = perf_event_init_task(p);
 	if (retval)
 		goto bad_fork_cleanup_policy;
@@ -1230,6 +1227,12 @@ static struct task_struct *copy_process(
 	write_lock_irq(&tasklist_lock);
 
 	/*
+	 * Perform scheduler related setup.  Final CPU assignment will
+	 * be performed at wakeup time.
+	 */
+	sched_fork(p, clone_flags);
+
+	/*
 	 * The task hasn't been attached yet, so its cpus_allowed mask will
 	 * not be changed, nor will its assigned CPU.
 	 *




* Re: [patch] sched: fix b5d9d734 blunder in task_new_fair()
  2009-11-25  6:57             ` Mike Galbraith
@ 2009-11-25  9:51               ` Peter Zijlstra
  2009-11-25 13:09                 ` Mike Galbraith
  0 siblings, 1 reply; 17+ messages in thread
From: Peter Zijlstra @ 2009-11-25  9:51 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Ingo Molnar, LKML

On Wed, 2009-11-25 at 07:57 +0100, Mike Galbraith wrote:

>         update_curr(cfs_rq);
> +
> +       if (is_same_group(se, pse)) {
> +               se->vruntime = pse->vruntime;
> +
> +               /*
> +                * If we're not sharing a runqueue, redo the child's vruntime
> +                * offset after accounting for any yet to be booked vruntime.
> +                */
> +               if (this_cpu != task_cpu(p)) {
> +                       struct cfs_rq *old_cfs_rq = cfs_rq_of(pse);
> +                       u64 now = cpu_rq(this_cpu)->clock;
> +                       unsigned long delta_exec = now - pse->exec_start;
> +
> +                       delta_exec = calc_delta_fair(delta_exec, se);
> +                       se->vruntime += delta_exec;
> +                       se->vruntime -= old_cfs_rq->min_vruntime -
> +                                               cfs_rq->min_vruntime;
> +               }
> +       }
> +
>         place_entity(cfs_rq, se, 1); 

/me got his head in a twist..

 - is_same_group() assumes both things are on the same cpu and will
   fail (for the group configs) when this is not so.

 - if we're not on the same cpu, update_curr() will have updated current
   on the target cpu, but the parent vruntime will still be stale.

 - when all is said and done, place_entity(.initial=1) will do:
    ->vruntime = max(min_vruntime + debit, ->vruntime)
   which ought to place a new task far enough ahead in any case...
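
For reference, the initial-placement path boils down to roughly this
(abridged paraphrase of place_entity(), not a verbatim quote):

static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
	u64 vruntime = cfs_rq->min_vruntime;

	/* debit new tasks one slice so forkers can't flood the runqueue */
	if (initial && sched_feat(START_DEBIT))
		vruntime += sched_vslice(cfs_rq, se);

	/* (sleeper credit for the !initial case elided) */

	/* ensure we never gain time by being placed backwards */
	vruntime = max_vruntime(se->vruntime, vruntime);

	se->vruntime = vruntime;
}

so even a stale copied vruntime gets pulled up to at least min_vruntime
plus the debit on the target runqueue.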





* Re: [patch] sched: fix b5d9d734 blunder in task_new_fair()
  2009-11-25  9:51               ` Peter Zijlstra
@ 2009-11-25 13:09                 ` Mike Galbraith
  0 siblings, 0 replies; 17+ messages in thread
From: Mike Galbraith @ 2009-11-25 13:09 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Ingo Molnar, LKML

On Wed, 2009-11-25 at 10:51 +0100, Peter Zijlstra wrote:
> On Wed, 2009-11-25 at 07:57 +0100, Mike Galbraith wrote:
> 
> >         update_curr(cfs_rq);
> > +
> > +       if (is_same_group(se, pse)) {
> > +               se->vruntime = pse->vruntime;
> > +
> > +               /*
> > +                * If we're not sharing a runqueue, redo the child's vruntime
> > +                * offset after accounting for any yet to be booked vruntime.
> > +                */
> > +               if (this_cpu != task_cpu(p)) {
> > +                       struct cfs_rq *old_cfs_rq = cfs_rq_of(pse);
> > +                       u64 now = cpu_rq(this_cpu)->clock;
> > +                       unsigned long delta_exec = now - pse->exec_start;
> > +
> > +                       delta_exec = calc_delta_fair(delta_exec, se);
> > +                       se->vruntime += delta_exec;
> > +                       se->vruntime -= old_cfs_rq->min_vruntime -
> > +                                               cfs_rq->min_vruntime;
> > +               }
> > +       }
> > +
> >         place_entity(cfs_rq, se, 1); 
> 
> /me got his head in a twist..

The group stuff does that to me every time.

>  - is_same_group() assumes both things are on the same cpu and will
>    fail (for the group configs) when this is not so.

Gak.  (grr)  This is getting ridiculous :)  There is only one caller of
wake_up_new_task(), and that is the all fired be damned parent waking
its child.  There is one caller of sched_class::task_new() as well, and
that is wake_up_new_task().  If I can't just copy the silly thing and be
done with it, there's something wrong.

(and if parent turns into grim reaper, heartless /me doesn't care)


sched: fix sched_fair.c::task_new_fair vruntime bug.

b5d9d734 fixed the problem of a forking task's child gaining vruntime..
IFF the child/parent shared a runqueue.  In the other case, it broke
fairness all to pieces by setting the child's vruntime to whatever task
happened to be current on the child's runqueue at wakeup time.  Fix this
by selecting a runqueue for the child at wakeup time, copying the parent's
vruntime, adding any unaccounted vruntime, and redoing cross cpu offset.

Also, since setting scheduling policy requires the tasklist lock, move
sched_fork() under the tasklist lock in copy_process();

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>

---
 kernel/fork.c       |    9 ++++++---
 kernel/sched.c      |   35 +++++++++++++++++++++++------------
 kernel/sched_fair.c |   29 +++++++++++++++++++++++------
 3 files changed, 52 insertions(+), 21 deletions(-)

Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -1925,20 +1925,35 @@ static void task_tick_fair(struct rq *rq
  * Share the fairness runtime between parent and child, thus the
  * total amount of pressure for CPU stays equal - new tasks
  * get a chance to run but frequent forkers are not allowed to
- * monopolize the CPU. Note: the parent runqueue is locked,
- * the child is not running yet.
+ * monopolize the CPU. Note: the child's runqueue is locked, the
+ * child is not yet running, and the parent is not preemptible.
  */
 static void task_new_fair(struct rq *rq, struct task_struct *p)
 {
 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
 	struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
+	struct sched_entity *pse = &current->se;
 	int this_cpu = smp_processor_id();
 
-	sched_info_queued(p);
-
 	update_curr(cfs_rq);
-	if (curr)
-		se->vruntime = curr->vruntime;
+
+	se->vruntime = pse->vruntime;
+
+	/*
+	 * If we're not sharing a runqueue, redo the child's vruntime
+	 * offset after accounting for any yet to be booked vruntime.
+	 */
+	if (this_cpu != task_cpu(p)) {
+		struct cfs_rq *old_cfs_rq = cfs_rq_of(pse);
+		u64 now = cpu_rq(this_cpu)->clock;
+		unsigned long delta_exec = now - pse->exec_start;
+
+		delta_exec = calc_delta_fair(delta_exec, se);
+		se->vruntime += delta_exec;
+		se->vruntime -= old_cfs_rq->min_vruntime -
+					cfs_rq->min_vruntime;
+	}
+
 	place_entity(cfs_rq, se, 1);
 
 	/* 'curr' will be NULL if the child belongs to a different group */
@@ -1953,6 +1968,8 @@ static void task_new_fair(struct rq *rq,
 	}
 
 	enqueue_task_fair(rq, p, 0);
+
+	sched_info_queued(p);
 }
 
 /*
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -2558,7 +2558,6 @@ static void __sched_fork(struct task_str
 void sched_fork(struct task_struct *p, int clone_flags)
 {
 	int cpu = get_cpu();
-	unsigned long flags;
 
 	__sched_fork(p);
 
@@ -2592,13 +2591,7 @@ void sched_fork(struct task_struct *p, i
 	if (!rt_prio(p->prio))
 		p->sched_class = &fair_sched_class;
 
-#ifdef CONFIG_SMP
-	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
-#endif
-	local_irq_save(flags);
-	update_rq_clock(cpu_rq(cpu));
-	set_task_cpu(p, cpu);
-	local_irq_restore(flags);
+	__set_task_cpu(p, cpu);
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	if (likely(sched_info_on()))
@@ -2625,14 +2618,31 @@ void sched_fork(struct task_struct *p, i
  */
 void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 {
+	int cpu = get_cpu();
 	unsigned long flags;
-	struct rq *rq;
+	struct task_struct *parent = current;
+	struct rq *rq, *orig_rq;
 
-	rq = task_rq_lock(p, &flags);
+	smp_wmb();
+	rq = orig_rq = task_rq_lock(parent, &flags);
 	BUG_ON(p->state != TASK_RUNNING);
-	update_rq_clock(rq);
+	update_rq_clock(orig_rq);
 
-	if (!p->sched_class->task_new || !current->se.on_rq) {
+#ifdef CONFIG_SMP
+	p->state = TASK_WAKING;
+	__task_rq_unlock(rq);
+	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
+	rq = cpu_rq(cpu);
+	if (rq != orig_rq) {
+		update_rq_clock(rq);
+		set_task_cpu(p, cpu);
+	}
+	rq = __task_rq_lock(p);
+	WARN_ON(p->state != TASK_WAKING);
+	p->state = TASK_RUNNING;
+#endif
+
+	if (!p->sched_class->task_new || !parent->se.on_rq) {
 		activate_task(rq, p, 0);
 	} else {
 		/*
@@ -2649,6 +2659,7 @@ void wake_up_new_task(struct task_struct
 		p->sched_class->task_wake_up(rq, p);
 #endif
 	task_rq_unlock(rq, &flags);
+	put_cpu();
 }
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
Index: linux-2.6/kernel/fork.c
===================================================================
--- linux-2.6.orig/kernel/fork.c
+++ linux-2.6/kernel/fork.c
@@ -1125,9 +1125,6 @@ static struct task_struct *copy_process(
 
 	p->stack_start = stack_start;
 
-	/* Perform scheduler related setup. Assign this task to a CPU. */
-	sched_fork(p, clone_flags);
-
 	retval = perf_event_init_task(p);
 	if (retval)
 		goto bad_fork_cleanup_policy;
@@ -1230,6 +1227,12 @@ static struct task_struct *copy_process(
 	write_lock_irq(&tasklist_lock);
 
 	/*
+	 * Perform scheduler related setup.  Final CPU assignment will
+	 * be performed at wakeup time.
+	 */
+	sched_fork(p, clone_flags);
+
+	/*
 	 * The task hasn't been attached yet, so its cpus_allowed mask will
 	 * not be changed, nor will its assigned CPU.
 	 *




* Re: [patch] sched: fix b5d9d734 blunder in task_new_fair()
  2009-11-22 12:07 [patch] sched: fix b5d9d734 blunder in task_new_fair() Mike Galbraith
  2009-11-24 13:51 ` Peter Zijlstra
@ 2009-11-26 16:26 ` Peter Zijlstra
  2009-11-27  8:45   ` Mike Galbraith
  2009-11-27 11:55   ` Mike Galbraith
  1 sibling, 2 replies; 17+ messages in thread
From: Peter Zijlstra @ 2009-11-26 16:26 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Ingo Molnar, LKML

On Sun, 2009-11-22 at 13:07 +0100, Mike Galbraith wrote:
> @@ -2589,16 +2588,10 @@ void sched_fork(struct task_struct *p, i
>          */
>         p->prio = current->normal_prio;
>  
> -       if (!rt_prio(p->prio))
> +       if (!task_has_rt_policy(p))
>                 p->sched_class = &fair_sched_class;
>  
> -#ifdef CONFIG_SMP
> -       cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
> -#endif
> -       local_irq_save(flags);
> -       update_rq_clock(cpu_rq(cpu));
> -       set_task_cpu(p, cpu);
> -       local_irq_restore(flags);
> +       __set_task_cpu(p, cpu); 

OK, so I figured out why it was in sched_fork() and not in
wake_up_new_task().

It is because in sched_fork() the new task isn't in the tasklist yet and
can therefore not race with any other migration logic.
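
(i.e., roughly paraphrasing the unpatched copy_process() ordering, just to
illustrate the point:

sched_fork(p, clone_flags);		/* p is not on the tasklist yet */
/* ... lots of other copying ... */
write_lock_irq(&tasklist_lock);
/* ... parent/sibling linkage ... */
attach_pid(p, PIDTYPE_PID, pid);	/* only now can anyone else find p */
write_unlock_irq(&tasklist_lock);

so nothing can migrate p out from under sched_fork().)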




* Re: [patch] sched: fix b5d9d734 blunder in task_new_fair()
  2009-11-26 16:26 ` Peter Zijlstra
@ 2009-11-27  8:45   ` Mike Galbraith
  2009-11-27  8:57     ` Mike Galbraith
  2009-11-27 11:55   ` Mike Galbraith
  1 sibling, 1 reply; 17+ messages in thread
From: Mike Galbraith @ 2009-11-27  8:45 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Ingo Molnar, LKML

Off list

On Thu, 2009-11-26 at 17:26 +0100, Peter Zijlstra wrote:
> On Sun, 2009-11-22 at 13:07 +0100, Mike Galbraith wrote:
> > @@ -2589,16 +2588,10 @@ void sched_fork(struct task_struct *p, i
> >          */
> >         p->prio = current->normal_prio;
> >  
> > -       if (!rt_prio(p->prio))
> > +       if (!task_has_rt_policy(p))
> >                 p->sched_class = &fair_sched_class;
> >  
> > -#ifdef CONFIG_SMP
> > -       cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
> > -#endif
> > -       local_irq_save(flags);
> > -       update_rq_clock(cpu_rq(cpu));
> > -       set_task_cpu(p, cpu);
> > -       local_irq_restore(flags);
> > +       __set_task_cpu(p, cpu); 
> 
> OK, so I figured out why it was in sched_fork() and not in
> wake_up_new_task().
> 
> It is because in sched_fork() the new task isn't in the tasklist yet and
> can therefore not race with any other migration logic.

All the raciness I'm fretting over probably just doesn't matter much.
Things aren't exploding.  Maybe min_vruntime is the only thing I should
be worrying about.  That concern is in-flight deltas of SCHED_IDLE
magnitude, ie cross cpu "fuzziness" on a very large scale.

However :-/  (aw sh*t, here i go again.  aaaaOOOOOgah! dive! dive!;)

WRT affinity, sched_class, nice level fretting, that can all change from
userland at any instant that you do not hold the task's runqueue lock
and the tasklist lock is not held by somebody to keep them from getting
a task reference to start the ball rolling.  As soon as you drop the
runqueue lock, userland can acquire, and change whatever it likes under
you, so afaikt, we can call the wrong sched_class method etc etc.

3f029d3 agrees fully wrt sched_class at least:
    In addition, we fix a race condition where we try to access
    current->sched_class without holding the rq->lock.  This is
    technically racy, as the sched-class could change out from
    under us.  Instead, we reference the per-rq post_schedule
    variable with the runqueue unlocked, but with preemption
    disabled to see if we need to reacquire the rq->lock.

The only thing that really changes with the unlocked _rummaging_ is that
we now can't count on nr_running/load on the task's current runqueue,
sched_class etc while you're rummaging, ALL state is fuzzy, instead of
only most.

However, I don't think we can even count on the task remaining untouched
while in TASK_WAKING state, and that might be a bigger deal.

afaikt, userland can migrate the task you're in the process of waking
while you're off rummaging around looking for a place to put it, like
so:  Wakee is on the tasklist, can be accessed by userland.  We wouldn't
be in ttwu either were it not.  We're waking, we set task state to
TASK_WAKING, release the lock, userland acquires, nobody but ttwu has
ever heard of a TASK_WAKING, so it happily changes task's affinity,
migrates the sleeping task to the one and only (pins) correct runqueue,
sets task cpu etc, releases the lock, and goes home.  We finish
rummaging, do NOT check for an intervening affinity change, instead, we
do an unlocked scribble over what userland just wrote, resetting cpu and
vruntime to a now illegal cpu, and activate.  I'm not seeing any
inhibitor for this scenario.
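
Roughly the interleaving I have in mind (a sketch only, using the thread's
own primitives, not verified against every migration path):

/*
 *  waker (ttwu / wake_up_new_task)        userland (e.g. sched_setaffinity)
 *  -------------------------------        ---------------------------------
 *  p->state = TASK_WAKING;
 *  __task_rq_unlock(rq);
 *                                         task_rq_lock(p, ...);
 *                                         change p->cpus_allowed,
 *                                         migrate p, set_task_cpu(p, their_cpu);
 *                                         task_rq_unlock(...);
 *  cpu = select_task_rq(...);             <- unaware of the new affinity
 *  set_task_cpu(p, cpu);                  <- scribbles over userland's placement
 *  rq = __task_rq_lock(p);
 *  activate_task(rq, p, 0);               <- task now on an illegal cpu
 */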

When I moved fork balancing runqueue selection to the much more logical
wakeup and enqueue time, vs copy and fiddle time, I didn't fix anything,
I merely duplicated the races that are now in ttwu.

No matter where we do the selection, we can race with userland if the
darn thing isn't locked all the while.  With .31 ttwu locking, there is
no race, because nobody can get to the task struct.  If target cpu
changes via rq selection, we set cpu, _then_ unlock, at which point
userland or whomever may override _our_ decision, but we never write
after re-acquiring, so intervening changes, if any, stay intact.

With an exec, userland policy/affinity change will deactivate/activate
or do a migration call.  We don't have the thing locked while we're
rummaging, so what keeps sched_class from changing after we evaluated,
so we call the wrong method, and then do our own migration call?

/me is still pretty befuddled, and haven't even crawled over PI.

I flat don't see how we can do this race free, unless we put every task
in some untouchable state while we're rummaging, and teach everything
having to do with migration about that state.

	-Mike



* Re: [patch] sched: fix b5d9d734 blunder in task_new_fair()
  2009-11-27  8:45   ` Mike Galbraith
@ 2009-11-27  8:57     ` Mike Galbraith
  0 siblings, 0 replies; 17+ messages in thread
From: Mike Galbraith @ 2009-11-27  8:57 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Ingo Molnar, LKML

On Fri, 2009-11-27 at 09:45 +0100, Mike Galbraith wrote:
> Off list

Guess not, mouse went click on the way to save menu.  Oh well, was going
to spare thousands of mailboxes another confused /me message, but too
late now.

> On Thu, 2009-11-26 at 17:26 +0100, Peter Zijlstra wrote:
> > On Sun, 2009-11-22 at 13:07 +0100, Mike Galbraith wrote:
> > > @@ -2589,16 +2588,10 @@ void sched_fork(struct task_struct *p, i
> > >          */
> > >         p->prio = current->normal_prio;
> > >  
> > > -       if (!rt_prio(p->prio))
> > > +       if (!task_has_rt_policy(p))
> > >                 p->sched_class = &fair_sched_class;
> > >  
> > > -#ifdef CONFIG_SMP
> > > -       cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
> > > -#endif
> > > -       local_irq_save(flags);
> > > -       update_rq_clock(cpu_rq(cpu));
> > > -       set_task_cpu(p, cpu);
> > > -       local_irq_restore(flags);
> > > +       __set_task_cpu(p, cpu); 
> > 
> > OK, so I figured out why it was in sched_fork() and not in
> > wake_up_new_task().
> > 
> > It is because in sched_fork() the new task isn't in the tasklist yet and
> > can therefore not race with any other migration logic.
> 
> All the raciness I'm fretting over probably just doesn't matter much.
> Things aren't exploding.  Maybe min_vruntime is the only thing I should
> be worrying about.  That concern is in-flight deltas of SCHED_IDLE
> magnitude, ie cross cpu "fuzziness" on a very large scale.
> 
> However :-/  (aw sh*t, here i go again.  aaaaOOOOOgah! dive! dive!;)
> 
> WRT affinity, sched_class, nice level fretting, that can all change from
> userland at any instant that you do not hold the task's runqueue lock
> and the tasklist lock is not held by somebody to keep them from getting
> a task reference to start the ball rolling.  As soon as you drop the
> runqueue lock, userland can acquire, and change whatever it likes under
> you, so afaikt, we can call the wrong sched_class method etc etc.
> 
> 3f029d3 agrees fully wrt sched_class at least:
>     In addition, we fix a race condition where we try to access
>     current->sched_class without holding the rq->lock.  This is
>     technically racy, as the sched-class could change out from
>     under us.  Instead, we reference the per-rq post_schedule
>     variable with the runqueue unlocked, but with preemption
>     disabled to see if we need to reacquire the rq->lock.
> 
> The only thing that really changes with the unlocked _rummaging_ is that
> we now can't count on nr_running/load on the task's current runqueue,
> sched_class etc while you're rummaging, ALL state is fuzzy, instead of
> only most.
> 
> However, I don't think we can even count on the task remaining untouched
> while in TASK_WAKING state, and that might be a bigger deal.
> 
> afaikt, userland can migrate the task you're in the process of waking
> while you're off rummaging around looking for a place to put it, like
> so:  Wakee is on the tasklist, can be accessed by userland.  We wouldn't
> be in ttwu either were it not.  We're waking, we set task state to
> TASK_WAKING, release the lock, userland acquires, nobody but ttwu has
> ever heard of a TASK_WAKING, so it happily changes task's affinity,
> migrates the sleeping task to the one and only (pins) correct runqueue,
> sets task cpu etc, releases the lock, and goes home.  We finish
> rummaging, do NOT check for an intervening affinity change, instead, we
> do an unlocked scribble over what userland just wrote, resetting cpu and
> vruntime to a now illegal cpu, and activate.  I'm not seeing any
> inhibitor for this scenario.
> 
> When I moved fork balancing runqueue selection to the much more logical
> wakeup and enqueue time, vs copy and fiddle time, I didn't fix anything,
> I merely duplicated the races that are now in ttwu.
> 
> No matter where we do the selection, we can race with userland if the
> darn thing isn't locked all the while.  With .31 ttwu locking, there is
> no race, because nobody can get to the task struct.  If target cpu
> changes via rq selection, we set cpu, _then_ unlock, at which point
> userland or whomever may override _our_ decision, but we never write
> after re-acquiring, so intervening changes, if any, stay intact.
> 
> With an exec, userland policy/affinity change will deactivate/activate
> or do a migration call.  We don't have the thing locked while we're
> rummaging, so what keeps sched_class from changing after we evaluated,
> so we call the wrong method, and then do our own migration call?
> 
> /me is still pretty befuddled, and haven't even crawled over PI.
> 
> I flat don't see how we can do this race free, unless we put every task
> in some untouchable state while we're rummaging, and teach everything
> having to do with migration about that state.
> 
> 	-Mike
> 



* Re: [patch] sched: fix b5d9d734 blunder in task_new_fair()
  2009-11-26 16:26 ` Peter Zijlstra
  2009-11-27  8:45   ` Mike Galbraith
@ 2009-11-27 11:55   ` Mike Galbraith
  2009-11-27 12:21     ` Peter Zijlstra
  1 sibling, 1 reply; 17+ messages in thread
From: Mike Galbraith @ 2009-11-27 11:55 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Ingo Molnar, LKML

On Thu, 2009-11-26 at 17:26 +0100, Peter Zijlstra wrote:
> On Sun, 2009-11-22 at 13:07 +0100, Mike Galbraith wrote:
> > @@ -2589,16 +2588,10 @@ void sched_fork(struct task_struct *p, i
> >          */
> >         p->prio = current->normal_prio;
> >  
> > -       if (!rt_prio(p->prio))
> > +       if (!task_has_rt_policy(p))
> >                 p->sched_class = &fair_sched_class;
> >  
> > -#ifdef CONFIG_SMP
> > -       cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
> > -#endif
> > -       local_irq_save(flags);
> > -       update_rq_clock(cpu_rq(cpu));
> > -       set_task_cpu(p, cpu);
> > -       local_irq_restore(flags);
> > +       __set_task_cpu(p, cpu); 
> 
> OK, so I figured out why it was in sched_fork() and not in
> wake_up_new_task().
> 
> It is because in sched_fork() the new task isn't in the tasklist yet and
> can therefore not race with any other migration logic.

With no regard to other issues, real or imagined, I can squash my bug in
three ways.

1. double lock runqueues and update both in the migration case, copy
parent and adjust child.  Maximally correct, though it's use of
double_lock_balance() looks a little odd. (below)

2. only lock the child, copy the parent, and manually add any unbooked
vruntime before adjusting the child.  Not 100% correct, but doesn't add
a double lock, and should also ensure no vruntime gain over the fork.
The child gets a fresh copy, not some dusty old thing that was copied
who knows when, and has been who knows where (below patch 1).

3. ??

sched: fix sched_fair.c::task_new_fair vruntime bug.

b5d9d734 fixed the problem of a forking task's child gaining vruntime..
IFF the child/parent shared a runqueue.  In the other case, it broke
fairness all to pieces by setting the child's vruntime to whatever task
happened to be current on the child's runqueue at wakeup time.  Fix this
by copying the parent's vruntime after adding any unaccounted vruntime,
and adjust for cross cpu offset in the migration case.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>

---
 kernel/sched.c      |    8 +++++++-
 kernel/sched_fair.c |   25 +++++++++++++++++++------
 2 files changed, 26 insertions(+), 7 deletions(-)

Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -1925,20 +1925,31 @@ static void task_tick_fair(struct rq *rq
  * Share the fairness runtime between parent and child, thus the
  * total amount of pressure for CPU stays equal - new tasks
  * get a chance to run but frequent forkers are not allowed to
- * monopolize the CPU. Note: the parent runqueue is locked,
- * the child is not running yet.
+ * monopolize the CPU. Note: both the child's and the parent's runqueues
+ * are locked in the migration case; the child is not yet running.
  */
 static void task_new_fair(struct rq *rq, struct task_struct *p)
 {
 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
 	struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
+	struct sched_entity *pse = &current->se;
+	struct cfs_rq *old_cfs_rq = cfs_rq_of(pse);
 	int this_cpu = smp_processor_id();
 
-	sched_info_queued(p);
-
 	update_curr(cfs_rq);
-	if (curr)
-		se->vruntime = curr->vruntime;
+
+	/*
+	 * If we're not sharing a runqueue, copy the parent's vruntime
+	 * after accounting for any yet to be booked vruntime.
+	 */
+	if (cfs_rq != old_cfs_rq)
+		update_curr(old_cfs_rq);
+
+	se->vruntime = pse->vruntime;
+
+	if (cfs_rq != old_cfs_rq)
+		se->vruntime -= old_cfs_rq->min_vruntime - cfs_rq->min_vruntime;
+
 	place_entity(cfs_rq, se, 1);
 
 	/* 'curr' will be NULL if the child belongs to a different group */
@@ -1953,6 +1964,8 @@ static void task_new_fair(struct rq *rq,
 	}
 
 	enqueue_task_fair(rq, p, 0);
+
+	sched_info_queued(p);
 }
 
 /*
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -2626,11 +2626,15 @@ void sched_fork(struct task_struct *p, i
 void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 {
 	unsigned long flags;
-	struct rq *rq;
+	struct rq *rq, *this_rq = this_rq();
 
 	rq = task_rq_lock(p, &flags);
 	BUG_ON(p->state != TASK_RUNNING);
 	update_rq_clock(rq);
+	if (rq != this_rq) {
+		double_lock_balance(rq, this_rq);
+		update_rq_clock(this_rq);
+	}
 
 	if (!p->sched_class->task_new || !current->se.on_rq) {
 		activate_task(rq, p, 0);
@@ -2648,6 +2652,8 @@ void wake_up_new_task(struct task_struct
 	if (p->sched_class->task_wake_up)
 		p->sched_class->task_wake_up(rq, p);
 #endif
+	if (rq != this_rq)
+		double_unlock_balance(rq, this_rq);
 	task_rq_unlock(rq, &flags);
 }
 

sched: fix sched_fair.c::task_new_fair vruntime bug.

b5d9d734 fixed the problem of a forking task's child gaining vruntime..
IFF the child/parent shared a runqueue.  In the other case, it broke
fairness all to pieces by setting the child's vruntime to whatever task
happened to be current on the child's runqueue at wakeup time.  Fix this
by copying the parent's vruntime, adding any unaccounted vruntime, and
redoing cross cpu offset.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>

---
 kernel/sched.c      |    2 ++
 kernel/sched_fair.c |   29 +++++++++++++++++++++++------
 2 files changed, 25 insertions(+), 6 deletions(-)

Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -1925,20 +1925,35 @@ static void task_tick_fair(struct rq *rq
  * Share the fairness runtime between parent and child, thus the
  * total amount of pressure for CPU stays equal - new tasks
  * get a chance to run but frequent forkers are not allowed to
- * monopolize the CPU. Note: the parent runqueue is locked,
- * the child is not running yet.
+ * monopolize the CPU. Note: the child's runqueue is locked, the
+ * child is not yet running, and the parent is not preemptible.
  */
 static void task_new_fair(struct rq *rq, struct task_struct *p)
 {
 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
 	struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
+	struct sched_entity *pse = &current->se;
 	int this_cpu = smp_processor_id();
 
-	sched_info_queued(p);
-
 	update_curr(cfs_rq);
-	if (curr)
-		se->vruntime = curr->vruntime;
+
+	se->vruntime = pse->vruntime;
+
+	/*
+	 * If we're not sharing a runqueue, redo the child's vruntime
+	 * offset after accounting for any yet to be booked vruntime.
+	 */
+	if (this_cpu != task_cpu(p)) {
+		struct cfs_rq *old_cfs_rq = cfs_rq_of(pse);
+		u64 now = cpu_rq(this_cpu)->clock;
+		unsigned long delta_exec = now - pse->exec_start;
+
+		delta_exec = calc_delta_fair(delta_exec, se);
+		se->vruntime += delta_exec;
+		se->vruntime -= old_cfs_rq->min_vruntime -
+					cfs_rq->min_vruntime;
+	}
+
 	place_entity(cfs_rq, se, 1);
 
 	/* 'curr' will be NULL if the child belongs to a different group */
@@ -1953,6 +1968,8 @@ static void task_new_fair(struct rq *rq,
 	}
 
 	enqueue_task_fair(rq, p, 0);
+
+	sched_info_queued(p);
 }
 
 /*
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -2631,6 +2631,8 @@ void wake_up_new_task(struct task_struct
 	rq = task_rq_lock(p, &flags);
 	BUG_ON(p->state != TASK_RUNNING);
 	update_rq_clock(rq);
+	if (rq != this_rq())
+		update_rq_clock(this_rq());
 
 	if (!p->sched_class->task_new || !current->se.on_rq) {
 		activate_task(rq, p, 0);




^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch] sched: fix b5d9d734 blunder in task_new_fair()
  2009-11-27 11:55   ` Mike Galbraith
@ 2009-11-27 12:21     ` Peter Zijlstra
  2009-11-27 12:38       ` Peter Zijlstra
  0 siblings, 1 reply; 17+ messages in thread
From: Peter Zijlstra @ 2009-11-27 12:21 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Ingo Molnar, LKML

I'm poking at it like so...

the below hasn't yet seen a compiler and I know there to still be a few
races in the ttwu() path.


---
 include/linux/sched.h |    6 +-
 kernel/fork.c         |    2 +-
 kernel/sched.c        |  170 +++++++++++++++++++++++++++++++------------------
 kernel/sched_fair.c   |   26 +++----
 4 files changed, 124 insertions(+), 80 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1c46023..ff5ec5a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1099,7 +1099,7 @@ struct sched_class {
 
 	void (*set_curr_task) (struct rq *rq);
 	void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
-	void (*task_new) (struct rq *rq, struct task_struct *p);
+	void (*fork) (struct task_struct *p);
 
 	void (*switched_from) (struct rq *this_rq, struct task_struct *task,
 			       int running);
@@ -2456,7 +2456,7 @@ static inline unsigned int task_cpu(const struct task_struct *p)
 	return task_thread_info(p)->cpu;
 }
 
-extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
+extern void set_task_cpu_locked(struct task_struct *p, int cpu);
 
 #else
 
@@ -2465,7 +2465,7 @@ static inline unsigned int task_cpu(const struct task_struct *p)
 	return 0;
 }
 
-static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
+static inline void set_task_cpu_locked(struct task_struct *p, int cpu)
 {
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 166b8c4..29c4aec 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1242,7 +1242,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
 	if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
 			!cpu_online(task_cpu(p))))
-		set_task_cpu(p, smp_processor_id());
+		set_task_cpu_locked(p, smp_processor_id());
 
 	/* CLONE_PARENT re-uses the old parent */
 	if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
diff --git a/kernel/sched.c b/kernel/sched.c
index 0cbf2ef..dced5af 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1967,7 +1967,7 @@ inline int task_curr(const struct task_struct *p)
 	return cpu_curr(task_cpu(p)) == p;
 }
 
-static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
+static inline void raw_set_task_cpu(struct task_struct *p, unsigned int cpu)
 {
 	set_task_rq(p, cpu);
 #ifdef CONFIG_SMP
@@ -2008,6 +2008,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 void kthread_bind(struct task_struct *p, unsigned int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
+	struct rq *old_rq = task_rq(p);
 	unsigned long flags;
 
 	/* Must have done schedule() in kthread() before we set_task_cpu */
@@ -2016,13 +2017,14 @@ void kthread_bind(struct task_struct *p, unsigned int cpu)
 		return;
 	}
 
-	spin_lock_irqsave(&rq->lock, flags);
-	update_rq_clock(rq);
+	local_irq_save(flags);
+	double_rq_lock(old_rq, rq);
 	set_task_cpu(p, cpu);
 	p->cpus_allowed = cpumask_of_cpu(cpu);
 	p->rt.nr_cpus_allowed = 1;
 	p->flags |= PF_THREAD_BOUND;
-	spin_unlock_irqrestore(&rq->lock, flags);
+	double_rq_unlock(old_rq, rq);
+	local_irq_restore(flags);
 }
 EXPORT_SYMBOL(kthread_bind);
 
@@ -2056,40 +2058,101 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
 	return delta < (s64)sysctl_sched_migration_cost;
 }
 
+static void set_task_cpu_all(struct task_struct *p, int old_cpu, int new_cpu)
+{
+	trace_sched_migrate_task(p, new_cpu);
+
+	p->se.nr_migrations++;
+#ifdef CONFIG_SCHEDSTATS
+	if (task_hot(p, cpu_rq(old_cpu)->clock, NULL))
+		schedstat_inc(p, se.nr_forced2_migrations);
+#endif
+	perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
+
+	raw_set_task_cpu(p, new_cpu);
+}
 
-void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
+static void set_task_cpu(struct task_struct *p, int new_cpu)
 {
 	int old_cpu = task_cpu(p);
-	struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
-	struct cfs_rq *old_cfsrq = task_cfs_rq(p),
-		      *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
-	u64 clock_offset;
 
-	clock_offset = old_rq->clock - new_rq->clock;
+	if (old_cpu == new_cpu)
+		return;
 
-	trace_sched_migrate_task(p, new_cpu);
+	if (p->sched_class == &fair_sched_class) {
+		struct cfs_rq *old_cfsrq = task_cfs_rq(p),
+			      *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
 
-#ifdef CONFIG_SCHEDSTATS
-	if (p->se.wait_start)
-		p->se.wait_start -= clock_offset;
-	if (p->se.sleep_start)
-		p->se.sleep_start -= clock_offset;
-	if (p->se.block_start)
-		p->se.block_start -= clock_offset;
-#endif
-	if (old_cpu != new_cpu) {
-		p->se.nr_migrations++;
-#ifdef CONFIG_SCHEDSTATS
-		if (task_hot(p, old_rq->clock, NULL))
-			schedstat_inc(p, se.nr_forced2_migrations);
-#endif
-		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
-				     1, 1, NULL, 0);
+		p->se.vruntime -= old_cfsrq->min_vruntime -
+			new_cfsrq->min_vruntime;
+	}
+
+	set_task_cpu_all(p, old_cpu, new_cpu);
+}
+
+static inline
+int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
+{
+	return p->sched_class->select_task_rq(p, sd_flags, wake_flags);
+}
+
+/*
+ * self balance a task without holding both rq locks
+ *
+ * used by fork and wakeup; since in neither case is the task present
+ * on a rq, the load-balancer won't encounter it and move it away
+ * underneath us.
+ *
+ * and since we have IRQs disabled we keep off hot-unplug.
+ */
+static struct rq *
+balance_task(struct task_struct *p, int sd_flags, int wake_flags)
+{
+	struct rq *rq, *old_rq;
+	u64 vdelta;
+	int cpu;
+
+	rq = old_rq = task_rq(p);
+
+	if (p->sched_class == &fair_sched_class)
+		vdelta = task_cfs_rq(p)->min_vruntime;
+
+	__task_rq_unlock(old_rq);
+
+	cpu = select_task_rq(p, sd_flags, wake_flags);
+
+	rq = cpu_rq(cpu);
+	spin_lock(&rq->lock);
+	if (rq == old_rq)
+		return rq;
+
+	update_rq_clock(rq);
+
+	set_task_cpu_all(p, task_cpu(p), cpu);
+
+	if (p->sched_class == &fair_sched_class) {
+		vdelta -= task_cfs_rq(p)->min_vruntime;
+		p->se.vruntime -= vdelta;
 	}
-	p->se.vruntime -= old_cfsrq->min_vruntime -
-					 new_cfsrq->min_vruntime;
 
-	__set_task_cpu(p, new_cpu);
+	return rq;
+}
+
+void set_task_cpu_locked(struct task_struct *p, int new_cpu)
+{
+	struct rq *old_rq = task_rq(p);
+	struct rq *new_rq = cpu_rq(new_cpu);
+	unsigned long flags;
+
+	local_irq_save(flags);
+	double_rq_lock(old_rq, new_rq);
+	/*
+	 * called from fork() on the child task before it gets
+	 * put on the tasklist, no way we could've been migrated
+	 */
+	WARN_ON_ONCE(old_rq != task_rq(p));
+	set_task_cpu(p, new_cpu);
+	double_rq_unlock(old_rq, new_rq);
+	local_irq_restore(flags);
 }
 
 struct migration_req {
@@ -2373,19 +2436,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
 	 */
 	if (task_contributes_to_load(p))
 		rq->nr_uninterruptible--;
+
 	p->state = TASK_WAKING;
+	rq = balance_task(p, SD_BALANCE_WAKE, wake_flags);
 	task_rq_unlock(rq, &flags);
 
-	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
-	if (cpu != orig_cpu) {
-		local_irq_save(flags);
-		rq = cpu_rq(cpu);
-		update_rq_clock(rq);
-		set_task_cpu(p, cpu);
-		local_irq_restore(flags);
-	}
-	rq = task_rq_lock(p, &flags);
-
 	WARN_ON(p->state != TASK_WAKING);
 	cpu = task_cpu(p);
 
@@ -2557,8 +2612,9 @@ static void __sched_fork(struct task_struct *p)
  */
 void sched_fork(struct task_struct *p, int clone_flags)
 {
-	int cpu = get_cpu();
 	unsigned long flags;
+	struct rq *rq;
+	int cpu;
 
 	__sched_fork(p);
 
@@ -2592,13 +2648,15 @@ void sched_fork(struct task_struct *p, int clone_flags)
 	if (!rt_prio(p->prio))
 		p->sched_class = &fair_sched_class;
 
-#ifdef CONFIG_SMP
-	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
-#endif
-	local_irq_save(flags);
-	update_rq_clock(cpu_rq(cpu));
-	set_task_cpu(p, cpu);
-	local_irq_restore(flags);
+	rq = task_rq_lock(current, &flags);
+	update_rq_clock(rq);
+
+	if (p->sched_class->fork)
+		p->sched_class->fork(p);
+
+	rq = balance_task(p, SD_BALANCE_FORK, 0);
+	task_rq_unlock(rq, &flags);
+
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	if (likely(sched_info_on()))
@@ -2631,17 +2689,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 	rq = task_rq_lock(p, &flags);
 	BUG_ON(p->state != TASK_RUNNING);
 	update_rq_clock(rq);
-
-	if (!p->sched_class->task_new || !current->se.on_rq) {
-		activate_task(rq, p, 0);
-	} else {
-		/*
-		 * Let the scheduling class do new task startup
-		 * management (if any):
-		 */
-		p->sched_class->task_new(rq, p);
-		inc_nr_running(rq);
-	}
+	activate_task(rq, p, 0);
 	trace_sched_wakeup_new(rq, p, 1);
 	check_preempt_curr(rq, p, WF_FORK);
 #ifdef CONFIG_SMP
@@ -3156,7 +3204,7 @@ out:
 void sched_exec(void)
 {
 	int new_cpu, this_cpu = get_cpu();
-	new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
+	new_cpu = select_task_rq(current, SD_BALANCE_EXEC, 0);
 	put_cpu();
 	if (new_cpu != this_cpu)
 		sched_migrate_task(current, new_cpu);
@@ -6980,7 +7028,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 
 	idle->prio = idle->normal_prio = MAX_PRIO;
 	cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
-	__set_task_cpu(idle, cpu);
+	raw_set_task_cpu(idle, cpu);
 
 	rq->curr = rq->idle = idle;
 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0ff21af..7c27d2f 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1922,28 +1922,26 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 }
 
 /*
- * Share the fairness runtime between parent and child, thus the
- * total amount of pressure for CPU stays equal - new tasks
- * get a chance to run but frequent forkers are not allowed to
- * monopolize the CPU. Note: the parent runqueue is locked,
- * the child is not running yet.
+ * Called at fork, before the child is on the tasklist and before
+ * self-balance; the current (parent) task's rq is locked.
  */
-static void task_new_fair(struct rq *rq, struct task_struct *p)
+static void fork_fair(struct task_struct *p)
 {
-	struct cfs_rq *cfs_rq = task_cfs_rq(p);
-	struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
+	struct rq *rq = this_rq();
+	struct cfs_rq *cfs_rq = task_cfs_rq(current);
+	struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
 	int this_cpu = smp_processor_id();
 
+	if (unlikely(task_cpu(p) != this_cpu))
+		raw_set_task_cpu(p, this_cpu);
+	
 	sched_info_queued(p);
-
 	update_curr(cfs_rq);
 	if (curr)
 		se->vruntime = curr->vruntime;
 	place_entity(cfs_rq, se, 1);
 
-	/* 'curr' will be NULL if the child belongs to a different group */
-	if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
-			curr && entity_before(curr, se)) {
+	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
 		/*
 		 * Upon rescheduling, sched_class::put_prev_task() will place
 		 * 'current' within the tree based on its new key value.
@@ -1951,8 +1949,6 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
 		swap(curr->vruntime, se->vruntime);
 		resched_task(rq->curr);
 	}
-
-	enqueue_task_fair(rq, p, 0);
 }
 
 /*
@@ -2056,7 +2052,7 @@ static const struct sched_class fair_sched_class = {
 
 	.set_curr_task          = set_curr_task_fair,
 	.task_tick		= task_tick_fair,
-	.task_new		= task_new_fair,
+	.fork			= fork_fair,
 
 	.prio_changed		= prio_changed_fair,
 	.switched_to		= switched_to_fair,



^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [patch] sched: fix b5d9d734 blunder in task_new_fair()
  2009-11-27 12:21     ` Peter Zijlstra
@ 2009-11-27 12:38       ` Peter Zijlstra
  0 siblings, 0 replies; 17+ messages in thread
From: Peter Zijlstra @ 2009-11-27 12:38 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Ingo Molnar, LKML

On Fri, 2009-11-27 at 13:21 +0100, Peter Zijlstra wrote:
> +static struct rq *
> +balance_task(struct task_struct *p, int sd_flags, int wake_flags)
> +{
> +       struct rq *rq, *old_rq;
> +       u64 vdelta;
> +
> +               rq = old_rq = task_rq(p);
> +
> +       if (p->sched_class == &fair_sched_class)
> +               vdelta = task_cfs_rq(p)->min_vruntime;
> +
> +       __task_rq_unlock(old_rq);
> +
> +       cpu = select_task_rq(p, sd_flags, wake_flags);
> +
> +       rq = cpu_rq(cpu);
> +       spin_lock(&rq->lock);
> +       if (rq == old_rq)
> +               return rq;
> +
> +       update_rq_clock(rq);
> +
> +       set_task_cpu_all(p, task_cpu(p), cpu);
> +
> +       if (p->sched_class == &fair_sched_class) {
> +               vdelta -= task_cfs_rq(p)->min_vruntime;
> +               p->se.vruntime -= vdelta;
>         }
>  
> +       return rq;
> +} 

Feh, there's a much easier way to deal with that min_vruntime crap.

Do se->vruntime -= cfs_rq->min_vruntime, on dequeue, and
   se->vruntime += cfs_rq->min_vruntime, on enqueue

That leaves the whole thing invariant to the cfs_rq when it's not
enqueued, and we don't have to fix it up when moving it around.

Also, note that I ripped out the clock_offset thingy, because with the
current sched_clock.c stuff clocks should get synchronized when we do a
remote clock update (or at least appear monotonic).

Getting these two things sorted returns set_task_cpu() to sanity.

/me tosses patch and starts over.
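For illustration, a minimal sketch of that invariant (simplified structures,
not the real sched_entity/cfs_rq; the names and numbers are assumptions made
up for the example): storing se->vruntime relative to the source queue's
min_vruntime on dequeue and re-basing it on the destination's min_vruntime
on enqueue means the entity carries only its lag while off a runqueue, so no
per-migration fixup is needed.

#include <stdio.h>

struct cfs_rq { unsigned long long min_vruntime; };
struct entity { unsigned long long vruntime; };

static void dequeue(struct cfs_rq *cfs_rq, struct entity *se)
{
	se->vruntime -= cfs_rq->min_vruntime;	/* now relative: rq-invariant */
}

static void enqueue(struct cfs_rq *cfs_rq, struct entity *se)
{
	se->vruntime += cfs_rq->min_vruntime;	/* re-based on the new rq */
}

int main(void)
{
	struct cfs_rq src = { .min_vruntime = 1000000 };
	struct cfs_rq dst = { .min_vruntime =   40000 };
	struct entity se  = { .vruntime     = 1002500 };	/* 2500 behind of src's edge */

	dequeue(&src, &se);	/* se.vruntime == 2500, wherever it goes next */
	enqueue(&dst, &se);	/* se.vruntime == 42500, still 2500 relative to dst */

	printf("lag on dst: %llu\n", se.vruntime - dst.min_vruntime);
	return 0;
}

The printed lag is 2500 no matter how far apart the two min_vruntime values
are, which is the "don't have to fix it up when moving it around" property
described above.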


^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2009-11-27 12:38 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-11-22 12:07 [patch] sched: fix b5d9d734 blunder in task_new_fair() Mike Galbraith
2009-11-24 13:51 ` Peter Zijlstra
2009-11-24 17:07   ` Mike Galbraith
2009-11-24 17:35     ` Peter Zijlstra
2009-11-24 17:54       ` Peter Zijlstra
2009-11-24 18:21         ` Mike Galbraith
2009-11-24 18:27           ` Peter Zijlstra
2009-11-24 18:36             ` Mike Galbraith
2009-11-25  6:57             ` Mike Galbraith
2009-11-25  9:51               ` Peter Zijlstra
2009-11-25 13:09                 ` Mike Galbraith
2009-11-26 16:26 ` Peter Zijlstra
2009-11-27  8:45   ` Mike Galbraith
2009-11-27  8:57     ` Mike Galbraith
2009-11-27 11:55   ` Mike Galbraith
2009-11-27 12:21     ` Peter Zijlstra
2009-11-27 12:38       ` Peter Zijlstra
