linux-kernel.vger.kernel.org archive mirror
* [PATCH] sched: fix task and run queue run_delay inconsistencies
@ 2015-09-23  0:37 Meyer, Mike
  2015-09-30 15:44 ` Peter Zijlstra
  0 siblings, 1 reply; 7+ messages in thread
From: Meyer, Mike @ 2015-09-23  0:37 UTC (permalink / raw)
  To: linux-kernel; +Cc: mingo, peterz

During evaluation of some performance data, it was discovered that the
thread and run queue run_delay accounting data was inconsistent with
the other accounting data that was collected.  Further investigation
found that, under certain circumstances, execution time was leaking
into the task and run queue run_delay accounting.
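
For reference, the sched_info helpers traced below behave roughly as
follows.  This is a simplified paraphrase of kernel/sched/stats.h from
this era, with the schedstat/delay-accounting config checks and the
per-rq counterparts omitted; run_delay itself is what ends up in the
second field of /proc/<pid>/schedstat.

    /* Simplified sketch, not the exact kernel source. */

    static inline void sched_info_reset_dequeued(struct task_struct *t)
    {
            t->sched_info.last_queued = 0;
    }

    /*
     * On dequeue: charge the time the task sat runnable-but-not-running
     * (since last_queued, if set) to run_delay, then clear last_queued.
     */
    static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
    {
            unsigned long long delta = 0;

            if (t->sched_info.last_queued)
                    delta = rq->clock - t->sched_info.last_queued;
            sched_info_reset_dequeued(t);
            t->sched_info.run_delay += delta;
    }

    /*
     * On enqueue: start the wait clock, unless it is already ticking
     * (last_queued non-zero).
     */
    static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
    {
            if (!t->sched_info.last_queued)
                    t->sched_info.last_queued = rq->clock;
    }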

Consider the following sequence:

    a. thread is running.
    b. thread moves between cgroups, changes scheduling class or priority.
    c. thread sleeps OR
    d. thread involuntarily gives up cpu.

a. implies:

    thread->sched_info.last_queued = 0

a. and b. results in the following:

    1. dequeue_task(rq, thread)

           sched_info_dequeued(rq, thread)
               delta = 0

               sched_info_reset_dequeued(thread)
                   thread->sched_info.last_queued = 0

               thread->sched_info.run_delay += delta

    2. enqueue_task(rq, thread)

           sched_info_queued(rq, thread)

               /* thread is still on cpu at this point. */
               thread->sched_info.last_queued = task_rq(thread)->clock;

c. results in:

    dequeue_task(rq, thread)

        sched_info_dequeued(rq, thread)

            /* delta is execution time not run_delay. */
            delta = task_rq(thread)->clock - thread->sched_info.last_queued

        sched_info_reset_dequeued(thread)
            thread->sched_info.last_queued = 0

        thread->sched_info.run_delay += delta

    Since thread was running between enqueue_task(rq, thread) and
    dequeue_task(rq, thread), the delta above is really execution
    time and not run_delay.

d. results in:

    __sched_info_switch(thread, next_thread)

        sched_info_depart(rq, thread)

            sched_info_queued(rq, thread)

                /* last_queued not updated due to being non-zero */
                return

    Since thread was running between enqueue_task(rq, thread) and
    __sched_info_switch(thread, next_thread), the execution time
    between enqueue_task(rq, thread) and
    __sched_info_switch(thread, next_thread) now will become
    associated with run_delay due to when last_queued was last updated.
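
For context, the involuntary-switch path referenced in d. looks roughly
like this (again a simplified paraphrase of kernel/sched/stats.h; the
per-rq accounting and config checks are left out):

    static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
    {
            /*
             * A preempted task is still TASK_RUNNING and goes straight
             * back onto the run queue, so its wait clock should restart
             * here.  But sched_info_queued() bails out when last_queued
             * is already non-zero, which is how the stale timestamp set
             * in b. survives into the next run_delay calculation.
             */
            if (t->state == TASK_RUNNING)
                    sched_info_queued(rq, t);
    }

    static inline void
    __sched_info_switch(struct task_struct *prev, struct task_struct *next)
    {
            struct rq *rq = task_rq(prev);

            if (prev != rq->idle)
                    sched_info_depart(rq, prev);
            if (next != rq->idle)
                    sched_info_arrive(rq, next);
    }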

The proposed patch addresses the issue by calling
sched_info_reset_dequeued(thread) following the call to
enqueue_task(rq, thread) for running threads in situations in which
thread->sched_info.last_queued should remain 0.

Signed-off-by: Mike Meyer <mike.meyer@teradata.com>
---
 kernel/sched/core.c | 36 ++++++++++++++++++++++++++++++------
 1 file changed, 30 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2f9c928..88bfe43 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1187,8 +1187,12 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
-	if (queued)
+	if (queued) {
 		enqueue_task(rq, p, 0);
+
+		if (running)
+			sched_info_reset_dequeued(p);
+	}
 }
 
 /*
@@ -3378,9 +3382,13 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
-	if (queued)
+	if (queued) {
 		enqueue_task(rq, p, enqueue_flag);
 
+		if (running)
+			sched_info_reset_dequeued(p);
+	}
+
 	check_class_changed(rq, p, prev_class, oldprio);
 out_unlock:
 	preempt_disable(); /* avoid rq from going away on us */
@@ -3393,7 +3401,7 @@ out_unlock:
 
 void set_user_nice(struct task_struct *p, long nice)
 {
-	int old_prio, delta, queued;
+	int old_prio, delta, queued, running;
 	unsigned long flags;
 	struct rq *rq;
 
@@ -3415,6 +3423,7 @@ void set_user_nice(struct task_struct *p, long nice)
 		goto out_unlock;
 	}
 	queued = task_on_rq_queued(p);
+	running = task_current(rq, p);
 	if (queued)
 		dequeue_task(rq, p, 0);
 
@@ -3426,11 +3435,15 @@ void set_user_nice(struct task_struct *p, long nice)
 
 	if (queued) {
 		enqueue_task(rq, p, 0);
+
+		if (running)
+			sched_info_reset_dequeued(p);
+
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
 		 */
-		if (delta < 0 || (delta > 0 && task_running(rq, p)))
+		if (delta < 0 || (delta > 0 && running))
 			resched_curr(rq);
 	}
 out_unlock:
@@ -3945,6 +3958,9 @@ change:
 		 * increased (user space view).
 		 */
 		enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
+
+		if (running)
+			sched_info_reset_dequeued(p);
 	}
 
 	check_class_changed(rq, p, prev_class, oldprio);
@@ -5093,8 +5109,12 @@ void sched_setnuma(struct task_struct *p, int nid)
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
-	if (queued)
+	if (queued) {
 		enqueue_task(rq, p, 0);
+
+		if (running)
+			sched_info_reset_dequeued(p);
+	}
 	task_rq_unlock(rq, p, &flags);
 }
 #endif /* CONFIG_NUMA_BALANCING */
@@ -7735,9 +7755,13 @@ void sched_move_task(struct task_struct *tsk)
 
 	if (unlikely(running))
 		tsk->sched_class->set_curr_task(rq);
-	if (queued)
+	if (queued) {
 		enqueue_task(rq, tsk, 0);
 
+		if (unlikely(running))
+			sched_info_reset_dequeued(tsk);
+	}
+
 	task_rq_unlock(rq, tsk, &flags);
 }
 #endif /* CONFIG_CGROUP_SCHED */
-- 
2.1.4





* Re: [PATCH] sched: fix task and run queue run_delay inconsistencies
  2015-09-23  0:37 [PATCH] sched: fix task and run queue run_delay inconsistencies Meyer, Mike
@ 2015-09-30 15:44 ` Peter Zijlstra
  2015-09-30 20:28   ` Meyer, Mike
  2015-10-06 16:17   ` [tip:sched/core] sched/core: Fix task and run queue sched_info:: " tip-bot for Peter Zijlstra
  0 siblings, 2 replies; 7+ messages in thread
From: Peter Zijlstra @ 2015-09-30 15:44 UTC (permalink / raw)
  To: Meyer, Mike; +Cc: linux-kernel, mingo

On Wed, Sep 23, 2015 at 12:37:18AM +0000, Meyer, Mike wrote:
> During evaluation of some performance data, it was discovered that the
> thread and run queue run_delay accounting data was inconsistent with
> the other accounting data that was collected.  Further investigation
> found that, under certain circumstances, execution time was leaking
> into the task and run queue run_delay accounting.
> 
> Consider the following sequence:
> 
>     a. thread is running.
>     b. thread moves between cgroups, changes scheduling class or priority.
>     c. thread sleeps OR
>     d. thread involuntarily gives up cpu.
> 
> a. implies:
> 
>     thread->sched_info.last_queued = 0
> 
> a. and b. results in the following:
> 
>     1. dequeue_task(rq, thread)
> 
>            sched_info_dequeued(rq, thread)
>                delta = 0
> 
>                sched_info_reset_dequeued(thread)
>                    thread->sched_info.last_queued = 0
> 
>                thread->sched_info.run_delay += delta
> 
>     2. enqueue_task(rq, thread)
> 
>            sched_info_queued(rq, thread)
> 
>                /* thread is still on cpu at this point. */
>                thread->sched_info.last_queued = task_rq(thread)->clock;
> 
> c. results in:
> 
>     dequeue_task(rq, thread)
> 
>         sched_info_dequeued(rq, thread)
> 
>             /* delta is execution time not run_delay. */
>             delta = task_rq(thread)->clock - thread->sched_info.last_queued
> 
>         sched_info_reset_dequeued(thread)
>             thread->sched_info.last_queued = 0
> 
>         thread->sched_info.run_delay += delta
> 
>     Since thread was running between enqueue_task(rq, thread) and
>     dequeue_task(rq, thread), the delta above is really execution
>     time and not run_delay.
> 
> d. results in:
> 
>     __sched_info_switch(thread, next_thread)
> 
>         sched_info_depart(rq, thread)
> 
>             sched_info_queued(rq, thread)
> 
>                 /* last_queued not updated due to being non-zero */
>                 return
> 
>     Since thread was running between enqueue_task(rq, thread) and
>     __sched_info_switch(thread, next_thread), the execution time
>     between enqueue_task(rq, thread) and
>     __sched_info_switch(thread, next_thread) now will become
>     associated with run_delay due to when last_queued was last updated.
> 
> The proposed patch addresses the issue by calling
> sched_info_reset_dequeued(thread) following the call to
> enqueue_task(rq, thread) for running threads in situations in which
> thread->sched_info.last_queued should remain 0.

Would something like the below, which avoids calling
sched_info_{de,}queued() for these sites, also work?

It even shrinks the code (due to inlining {en,de}queue_task()):

$ size defconfig-build/kernel/sched/core.o defconfig-build/kernel/sched/core.o.orig
   text    data     bss     dec     hex filename
  64019   23378    2344   89741   15e8d defconfig-build/kernel/sched/core.o
  64149   23378    2344   89871   15f0f defconfig-build/kernel/sched/core.o.orig

---
 kernel/sched/core.c  | 42 ++++++++++++++++++++++++------------------
 kernel/sched/sched.h | 14 ++++++++------
 2 files changed, 32 insertions(+), 24 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fe819298c220..c5d579ad70cd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -827,17 +827,19 @@ static void set_load_weight(struct task_struct *p)
 	load->inv_weight = prio_to_wmult[prio];
 }
 
-static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
+static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 {
 	update_rq_clock(rq);
-	sched_info_queued(rq, p);
+	if (!(flags & ENQUEUE_TEMP))
+		sched_info_queued(rq, p);
 	p->sched_class->enqueue_task(rq, p, flags);
 }
 
-static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
+static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 {
 	update_rq_clock(rq);
-	sched_info_dequeued(rq, p);
+	if (!(flags & DEQUEUE_TEMP))
+		sched_info_dequeued(rq, p);
 	p->sched_class->dequeue_task(rq, p, flags);
 }
 
@@ -1178,7 +1180,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 		 * holding rq->lock.
 		 */
 		lockdep_assert_held(&rq->lock);
-		dequeue_task(rq, p, 0);
+		dequeue_task(rq, p, DEQUEUE_TEMP);
 	}
 	if (running)
 		put_prev_task(rq, p);
@@ -1188,7 +1190,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (queued)
-		enqueue_task(rq, p, 0);
+		enqueue_task(rq, p, ENQUEUE_TEMP);
 }
 
 /*
@@ -3300,7 +3302,7 @@ EXPORT_SYMBOL(default_wake_function);
  */
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
-	int oldprio, queued, running, enqueue_flag = 0;
+	int oldprio, queued, running, enqueue_flag = ENQUEUE_TEMP;
 	struct rq *rq;
 	const struct sched_class *prev_class;
 
@@ -3332,7 +3334,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
 	if (queued)
-		dequeue_task(rq, p, 0);
+		dequeue_task(rq, p, DEQUEUE_TEMP);
 	if (running)
 		put_prev_task(rq, p);
 
@@ -3350,7 +3352,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 		if (!dl_prio(p->normal_prio) ||
 		    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
 			p->dl.dl_boosted = 1;
-			enqueue_flag = ENQUEUE_REPLENISH;
+			enqueue_flag |= ENQUEUE_REPLENISH;
 		} else
 			p->dl.dl_boosted = 0;
 		p->sched_class = &dl_sched_class;
@@ -3358,7 +3360,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 		if (dl_prio(oldprio))
 			p->dl.dl_boosted = 0;
 		if (oldprio < prio)
-			enqueue_flag = ENQUEUE_HEAD;
+			enqueue_flag |= ENQUEUE_HEAD;
 		p->sched_class = &rt_sched_class;
 	} else {
 		if (dl_prio(oldprio))
@@ -3410,7 +3412,7 @@ void set_user_nice(struct task_struct *p, long nice)
 	}
 	queued = task_on_rq_queued(p);
 	if (queued)
-		dequeue_task(rq, p, 0);
+		dequeue_task(rq, p, DEQUEUE_TEMP);
 
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
@@ -3419,7 +3421,7 @@ void set_user_nice(struct task_struct *p, long nice)
 	delta = p->prio - old_prio;
 
 	if (queued) {
-		enqueue_task(rq, p, 0);
+		enqueue_task(rq, p, ENQUEUE_TEMP);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
@@ -3921,7 +3923,7 @@ static int __sched_setscheduler(struct task_struct *p,
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
 	if (queued)
-		dequeue_task(rq, p, 0);
+		dequeue_task(rq, p, DEQUEUE_TEMP);
 	if (running)
 		put_prev_task(rq, p);
 
@@ -3931,11 +3933,15 @@ static int __sched_setscheduler(struct task_struct *p,
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (queued) {
+		int enqueue_flags = ENQUEUE_TEMP;
 		/*
 		 * We enqueue to tail when the priority of a task is
 		 * increased (user space view).
 		 */
-		enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
+		if (oldprio <= p->prio)
+			enqueue_flags |= ENQUEUE_HEAD;
+
+		enqueue_task(rq, p, enqueue_flags);
 	}
 
 	check_class_changed(rq, p, prev_class, oldprio);
@@ -5084,7 +5090,7 @@ void sched_setnuma(struct task_struct *p, int nid)
 	running = task_current(rq, p);
 
 	if (queued)
-		dequeue_task(rq, p, 0);
+		dequeue_task(rq, p, DEQUEUE_TEMP);
 	if (running)
 		put_prev_task(rq, p);
 
@@ -5093,7 +5099,7 @@ void sched_setnuma(struct task_struct *p, int nid)
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (queued)
-		enqueue_task(rq, p, 0);
+		enqueue_task(rq, p, ENQUEUE_TEMP);
 	task_rq_unlock(rq, p, &flags);
 }
 #endif /* CONFIG_NUMA_BALANCING */
@@ -7712,7 +7718,7 @@ void sched_move_task(struct task_struct *tsk)
 	queued = task_on_rq_queued(tsk);
 
 	if (queued)
-		dequeue_task(rq, tsk, 0);
+		dequeue_task(rq, tsk, DEQUEUE_TEMP);
 	if (unlikely(running))
 		put_prev_task(rq, tsk);
 
@@ -7736,7 +7742,7 @@ void sched_move_task(struct task_struct *tsk)
 	if (unlikely(running))
 		tsk->sched_class->set_curr_task(rq);
 	if (queued)
-		enqueue_task(rq, tsk, 0);
+		enqueue_task(rq, tsk, ENQUEUE_TEMP);
 
 	task_rq_unlock(rq, tsk, &flags);
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index af6f252e7e34..d97a8d1abc66 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1150,16 +1150,18 @@ static const u32 prio_to_wmult[40] = {
  /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
 };
 
-#define ENQUEUE_WAKEUP		1
-#define ENQUEUE_HEAD		2
+#define ENQUEUE_WAKEUP		0x01
+#define ENQUEUE_HEAD		0x02
 #ifdef CONFIG_SMP
-#define ENQUEUE_WAKING		4	/* sched_class::task_waking was called */
+#define ENQUEUE_WAKING		0x04	/* sched_class::task_waking was called */
 #else
-#define ENQUEUE_WAKING		0
+#define ENQUEUE_WAKING		0x00
 #endif
-#define ENQUEUE_REPLENISH	8
+#define ENQUEUE_REPLENISH	0x08
+#define ENQUEUE_TEMP		0x10
 
-#define DEQUEUE_SLEEP		1
+#define DEQUEUE_SLEEP		0x01
+#define DEQUEUE_TEMP		0x02
 
 #define RETRY_TASK		((void *)-1UL)
 


* RE: [PATCH] sched: fix task and run queue run_delay inconsistencies
  2015-09-30 15:44 ` Peter Zijlstra
@ 2015-09-30 20:28   ` Meyer, Mike
  2015-10-01  6:37     ` Peter Zijlstra
  2015-10-06 16:17   ` [tip:sched/core] sched/core: Fix task and run queue sched_info:: " tip-bot for Peter Zijlstra
  1 sibling, 1 reply; 7+ messages in thread
From: Meyer, Mike @ 2015-09-30 20:28 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: linux-kernel, mingo

> From: Peter Zijlstra [mailto:peterz@infradead.org]
>
> On Wed, Sep 23, 2015 at 12:37:18AM +0000, Meyer, Mike wrote:
> >
> > The proposed patch addresses the issue by calling
> > sched_info_reset_dequeued(thread) following the call to
> > enqueue_task(rq, thread) for running threads in situations in which
> > thread->sched_info.last_queued should remain 0.
> 
> Would something like the below, which avoids calling
> sched_info_{de,}queued() for these sites, also work?
> 
> It even shrinks the code (due to inlining {en,de}queue_task()):
> 
> $ size defconfig-build/kernel/sched/core.o defconfig-
> build/kernel/sched/core.o.orig
>    text    data     bss     dec     hex filename
>   64019   23378    2344   89741   15e8d defconfig-build/kernel/sched/core.o
>   64149   23378    2344   89871   15f0f defconfig-build/kernel/sched/core.o.orig
> 
Yes that will also address the issue.

The reason I approached it the way I did was to avoid adding a code
path to the far more common uses of {en,de}queue_task(), but I doubt
anyone is going to notice a difference with the addition of some
register save/restores and a compare in that path.  Overall the code
does shrink with the alternative, which is good.

My only comment is that I am not sure about the naming of the flag
ENQUEUE_TEMP, which implies (to me) that the enqueue is temporary,
which it clearly isn't.  Maybe something like DEQUEUE_MOVE/ENQUEUE_MOVE
would be a bit more descriptive of the use case.

Other than that I am fine with what you proposed.

Thanks!


 


* Re: [PATCH] sched: fix task and run queue run_delay inconsistencies
  2015-09-30 20:28   ` Meyer, Mike
@ 2015-10-01  6:37     ` Peter Zijlstra
  2015-10-01  9:43       ` Peter Zijlstra
  0 siblings, 1 reply; 7+ messages in thread
From: Peter Zijlstra @ 2015-10-01  6:37 UTC (permalink / raw)
  To: Meyer, Mike; +Cc: linux-kernel, mingo

On Wed, Sep 30, 2015 at 08:28:41PM +0000, Meyer, Mike wrote:
> > From: Peter Zijlstra [mailto:peterz@infradead.org]
> >
> > On Wed, Sep 23, 2015 at 12:37:18AM +0000, Meyer, Mike wrote:
> > >
> > > The proposed patch addresses the issue by calling
> > > sched_info_reset_dequeued(thread) following the call to
> > > enqueue_task(rq, thread) for running threads in situations in which
> > > thread->sched_info.last_queued should remain 0.
> > 
> > Would something like the below, which avoids calling
> > sched_info_{de,}queued() for these sites, also work?
> > 
> > It even shrinks the code (due to inlining {en,de}queue_task()):
> > 
> > $ size defconfig-build/kernel/sched/core.o defconfig-
> > build/kernel/sched/core.o.orig
> >    text    data     bss     dec     hex filename
> >   64019   23378    2344   89741   15e8d defconfig-build/kernel/sched/core.o
> >   64149   23378    2344   89871   15f0f defconfig-build/kernel/sched/core.o.orig
> > 
> Yes that will also address the issue.
> 
> The reason I approached it the way I did was to avoid adding a code
> path to the far more common uses of {en,de}queue_task(), but I doubt
> anyone is going to notice a difference with the addition of some
> register save/restores and a compare in that path.  Overall the code
> does shrink with the alternative, which is good.

In most cases the flags should be compile time constants, and with the
inline we can determine the branch at compile time, avoiding emitting
that branch instruction entirely.
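
A stand-alone toy to illustrate the point (not kernel code, just
reusing the ENQUEUE_* names from the diff above): once enqueue() is
inlined, a literal flag argument lets the compiler decide the
sched_info test at build time.

    #include <stdio.h>

    #define ENQUEUE_WAKEUP  0x01
    #define ENQUEUE_TEMP    0x10

    static inline void enqueue(int flags)
    {
            if (!(flags & ENQUEUE_TEMP))    /* folds when flags is a literal */
                    printf("sched_info_queued()\n");
            printf("class->enqueue_task()\n");
    }

    int main(void)
    {
            enqueue(ENQUEUE_WAKEUP);        /* hot path: test resolved at compile time */
            enqueue(ENQUEUE_TEMP);          /* change pattern: sched_info call dropped */
            return 0;
    }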

But let me double check the asm for a few important sites.

> My only comment is that I am not sure about the naming of the flag
> ENQUEUE_TEMP, which implies (to me) that the enqueue is temporary,
> which it clearly isn't.  Maybe something like DEQUEUE_MOVE/ENQUEUE_MOVE
> would be a bit more descriptive of the use case.

Yes, I ran out of creative juices, let me attempt a better name once
I've woken up a bit.



* Re: [PATCH] sched: fix task and run queue run_delay inconsistencies
  2015-10-01  6:37     ` Peter Zijlstra
@ 2015-10-01  9:43       ` Peter Zijlstra
  2015-10-01 16:06         ` Meyer, Mike
  0 siblings, 1 reply; 7+ messages in thread
From: Peter Zijlstra @ 2015-10-01  9:43 UTC (permalink / raw)
  To: Meyer, Mike; +Cc: linux-kernel, mingo

On Thu, Oct 01, 2015 at 08:37:32AM +0200, Peter Zijlstra wrote:
> On Wed, Sep 30, 2015 at 08:28:41PM +0000, Meyer, Mike wrote:

> > Yes that will also address the issue.
> > 
> > The reason I approached it the way I did was to avoid adding a code
> > path to the far more common uses of {en,de}queue_task(), but I doubt
> > anyone is going to notice a difference with the addition of some
> > register save/restores and a compare in that path.  Overall the code
> > does shrink with the alternative, which is good.
> 
> In most cases the flags should be compile time constants, and with the
> inline we can determine the branch at compile time, avoiding emitting
> that branch instruction entirely.
> 
> But let me double check the asm for a few important sites.

It looks like the sites in the wakeup path do indeed not get any
additional conditionals.

> > My only comment is that I am not sure about the naming of the flag
> > ENQUEUE_TEMP, which implies (to me) that the enqueue is temporary,
> > which it clearly isn't.  Maybe something like DEQUEUE_MOVE/ENQUEUE_MOVE
> > would be a bit more descriptive of the use case.
> 
> Yes, I ran out of creative juices, let me attempt a better name once
> I've woken up a bit.

How about DEQUEUE_SAVE, ENQUEUE_RESTORE ? Ideally I'd wrap the whole
pattern into a helper but C isn't really supportive of pre+post patterns
like this.
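
Just to sketch what I mean (hypothetical, invented names, not something
I'm proposing as-is), about the closest C gets is a begin/end pair that
every call site still has to remember to bracket the change with:

    static void task_change_begin(struct rq *rq, struct task_struct *p,
                                  int *queued, int *running)
    {
            *queued = task_on_rq_queued(p);
            *running = task_current(rq, p);
            if (*queued)
                    dequeue_task(rq, p, DEQUEUE_SAVE);
            if (*running)
                    put_prev_task(rq, p);
    }

    static void task_change_end(struct rq *rq, struct task_struct *p,
                                int queued, int running)
    {
            if (running)
                    p->sched_class->set_curr_task(rq);
            if (queued)
                    enqueue_task(rq, p, ENQUEUE_RESTORE);
    }

The call sites differ just enough (extra enqueue flags, unlikely()
hints, work ordered between the two halves) that open-coding it, as the
patch does, arguably stays clearer.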



* RE: [PATCH] sched: fix task and run queue run_delay inconsistencies
  2015-10-01  9:43       ` Peter Zijlstra
@ 2015-10-01 16:06         ` Meyer, Mike
  0 siblings, 0 replies; 7+ messages in thread
From: Meyer, Mike @ 2015-10-01 16:06 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: linux-kernel, mingo

> On Thu, Oct 01, 2015 at 08:37:32AM +0200, Peter Zijlstra wrote:
>
> > > On Wed, Sep 30, 2015 at 08:28:41PM +0000, Meyer, Mike wrote:
> 
> It looks like the sites in the wakeup path do indeed not get any additional
> conditionals.
> 
> > > My only comment is that I am not sure about the naming of the flag
> > > ENQUEUE_TEMP, which implies (to me) that the enqueue is temporary,
> > > which it clearly isn't.  Maybe something like DEQUEUE_MOVE/ENQUEUE_MOVE
> > > would be a bit more descriptive of the use case.
> >
> > Yes, I ran out of creative juices, let me attempt a better name once
> > I've woken up a bit.
> 
> How about DEQUEUE_SAVE, ENQUEUE_RESTORE ? Ideally I'd wrap the whole
> pattern into a helper but C isn't really supportive of pre+post patterns like
> this.

Sounds fine to me!

Thanks again.


* [tip:sched/core] sched/core: Fix task and run queue sched_info:: run_delay inconsistencies
  2015-09-30 15:44 ` Peter Zijlstra
  2015-09-30 20:28   ` Meyer, Mike
@ 2015-10-06 16:17   ` tip-bot for Peter Zijlstra
  1 sibling, 0 replies; 7+ messages in thread
From: tip-bot for Peter Zijlstra @ 2015-10-06 16:17 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: peterz, Mike.Meyer, linux-kernel, torvalds, efault, tglx, mingo, hpa

Commit-ID:  1de64443d755f83af8ba8b558fded0c61afaef47
Gitweb:     http://git.kernel.org/tip/1de64443d755f83af8ba8b558fded0c61afaef47
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Wed, 30 Sep 2015 17:44:13 +0200
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Tue, 6 Oct 2015 17:08:22 +0200

sched/core: Fix task and run queue sched_info::run_delay inconsistencies

Mike Meyer reported the following bug:

> During evaluation of some performance data, it was discovered that the
> thread and run queue run_delay accounting data was inconsistent with
> the other accounting data that was collected.  Further investigation
> found that, under certain circumstances, execution time was leaking
> into the task and run queue run_delay accounting.
>
> Consider the following sequence:
>
>     a. thread is running.
>     b. thread moves between cgroups, changes scheduling class or priority.
>     c. thread sleeps OR
>     d. thread involuntarily gives up cpu.
>
> a. implies:
>
>     thread->sched_info.last_queued = 0
>
> a. and b. results in the following:
>
>     1. dequeue_task(rq, thread)
>
>            sched_info_dequeued(rq, thread)
>                delta = 0
>
>                sched_info_reset_dequeued(thread)
>                    thread->sched_info.last_queued = 0
>
>                thread->sched_info.run_delay += delta
>
>     2. enqueue_task(rq, thread)
>
>            sched_info_queued(rq, thread)
>
>                /* thread is still on cpu at this point. */
>                thread->sched_info.last_queued = task_rq(thread)->clock;
>
> c. results in:
>
>     dequeue_task(rq, thread)
>
>         sched_info_dequeued(rq, thread)
>
>             /* delta is execution time not run_delay. */
>             delta = task_rq(thread)->clock - thread->sched_info.last_queued
>
>         sched_info_reset_dequeued(thread)
>             thread->sched_info.last_queued = 0
>
>         thread->sched_info.run_delay += delta
>
>     Since thread was running between enqueue_task(rq, thread) and
>     dequeue_task(rq, thread), the delta above is really execution
>     time and not run_delay.
>
> d. results in:
>
>     __sched_info_switch(thread, next_thread)
>
>         sched_info_depart(rq, thread)
>
>             sched_info_queued(rq, thread)
>
>                 /* last_queued not updated due to being non-zero */
>                 return
>
>     Since thread was running between enqueue_task(rq, thread) and
>     __sched_info_switch(thread, next_thread), the execution time
>     between enqueue_task(rq, thread) and
>     __sched_info_switch(thread, next_thread) now will become
>     associated with run_delay due to when last_queued was last updated.
>

This alternative patch solves the problem by not calling
sched_info_{de,}queued() in {de,en}queue_task(). Therefore the
sched_info state is preserved and things work as expected.

By inlining the {de,en}queue_task() functions the new condition
becomes (mostly) a compile-time constant and we'll not emit any new
branch instructions.

It even shrinks the code (due to inlining {en,de}queue_task()):

$ size defconfig-build/kernel/sched/core.o defconfig-build/kernel/sched/core.o.orig
   text    data     bss     dec     hex filename
  64019   23378    2344   89741   15e8d defconfig-build/kernel/sched/core.o
  64149   23378    2344   89871   15f0f defconfig-build/kernel/sched/core.o.orig

Reported-by: Mike Meyer <Mike.Meyer@Teradata.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/20150930154413.GO3604@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c  | 44 +++++++++++++++++++++++++-------------------
 kernel/sched/sched.h | 14 ++++++++------
 2 files changed, 33 insertions(+), 25 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4554cde..fb14a01 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -827,17 +827,19 @@ static void set_load_weight(struct task_struct *p)
 	load->inv_weight = prio_to_wmult[prio];
 }
 
-static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
+static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 {
 	update_rq_clock(rq);
-	sched_info_queued(rq, p);
+	if (!(flags & ENQUEUE_RESTORE))
+		sched_info_queued(rq, p);
 	p->sched_class->enqueue_task(rq, p, flags);
 }
 
-static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
+static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 {
 	update_rq_clock(rq);
-	sched_info_dequeued(rq, p);
+	if (!(flags & DEQUEUE_SAVE))
+		sched_info_dequeued(rq, p);
 	p->sched_class->dequeue_task(rq, p, flags);
 }
 
@@ -1178,7 +1180,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 		 * holding rq->lock.
 		 */
 		lockdep_assert_held(&rq->lock);
-		dequeue_task(rq, p, 0);
+		dequeue_task(rq, p, DEQUEUE_SAVE);
 	}
 	if (running)
 		put_prev_task(rq, p);
@@ -1188,7 +1190,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (queued)
-		enqueue_task(rq, p, 0);
+		enqueue_task(rq, p, ENQUEUE_RESTORE);
 }
 
 /*
@@ -1692,7 +1694,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 #endif /* CONFIG_SCHEDSTATS */
 }
 
-static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
+static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
 {
 	activate_task(rq, p, en_flags);
 	p->on_rq = TASK_ON_RQ_QUEUED;
@@ -3325,7 +3327,7 @@ EXPORT_SYMBOL(default_wake_function);
  */
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
-	int oldprio, queued, running, enqueue_flag = 0;
+	int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
 	struct rq *rq;
 	const struct sched_class *prev_class;
 
@@ -3357,7 +3359,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
 	if (queued)
-		dequeue_task(rq, p, 0);
+		dequeue_task(rq, p, DEQUEUE_SAVE);
 	if (running)
 		put_prev_task(rq, p);
 
@@ -3375,7 +3377,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 		if (!dl_prio(p->normal_prio) ||
 		    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
 			p->dl.dl_boosted = 1;
-			enqueue_flag = ENQUEUE_REPLENISH;
+			enqueue_flag |= ENQUEUE_REPLENISH;
 		} else
 			p->dl.dl_boosted = 0;
 		p->sched_class = &dl_sched_class;
@@ -3383,7 +3385,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 		if (dl_prio(oldprio))
 			p->dl.dl_boosted = 0;
 		if (oldprio < prio)
-			enqueue_flag = ENQUEUE_HEAD;
+			enqueue_flag |= ENQUEUE_HEAD;
 		p->sched_class = &rt_sched_class;
 	} else {
 		if (dl_prio(oldprio))
@@ -3435,7 +3437,7 @@ void set_user_nice(struct task_struct *p, long nice)
 	}
 	queued = task_on_rq_queued(p);
 	if (queued)
-		dequeue_task(rq, p, 0);
+		dequeue_task(rq, p, DEQUEUE_SAVE);
 
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
@@ -3444,7 +3446,7 @@ void set_user_nice(struct task_struct *p, long nice)
 	delta = p->prio - old_prio;
 
 	if (queued) {
-		enqueue_task(rq, p, 0);
+		enqueue_task(rq, p, ENQUEUE_RESTORE);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
@@ -3946,7 +3948,7 @@ change:
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
 	if (queued)
-		dequeue_task(rq, p, 0);
+		dequeue_task(rq, p, DEQUEUE_SAVE);
 	if (running)
 		put_prev_task(rq, p);
 
@@ -3956,11 +3958,15 @@ change:
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (queued) {
+		int enqueue_flags = ENQUEUE_RESTORE;
 		/*
 		 * We enqueue to tail when the priority of a task is
 		 * increased (user space view).
 		 */
-		enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
+		if (oldprio <= p->prio)
+			enqueue_flags |= ENQUEUE_HEAD;
+
+		enqueue_task(rq, p, enqueue_flags);
 	}
 
 	check_class_changed(rq, p, prev_class, oldprio);
@@ -5109,7 +5115,7 @@ void sched_setnuma(struct task_struct *p, int nid)
 	running = task_current(rq, p);
 
 	if (queued)
-		dequeue_task(rq, p, 0);
+		dequeue_task(rq, p, DEQUEUE_SAVE);
 	if (running)
 		put_prev_task(rq, p);
 
@@ -5118,7 +5124,7 @@ void sched_setnuma(struct task_struct *p, int nid)
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (queued)
-		enqueue_task(rq, p, 0);
+		enqueue_task(rq, p, ENQUEUE_RESTORE);
 	task_rq_unlock(rq, p, &flags);
 }
 #endif /* CONFIG_NUMA_BALANCING */
@@ -7737,7 +7743,7 @@ void sched_move_task(struct task_struct *tsk)
 	queued = task_on_rq_queued(tsk);
 
 	if (queued)
-		dequeue_task(rq, tsk, 0);
+		dequeue_task(rq, tsk, DEQUEUE_SAVE);
 	if (unlikely(running))
 		put_prev_task(rq, tsk);
 
@@ -7761,7 +7767,7 @@ void sched_move_task(struct task_struct *tsk)
 	if (unlikely(running))
 		tsk->sched_class->set_curr_task(rq);
 	if (queued)
-		enqueue_task(rq, tsk, 0);
+		enqueue_task(rq, tsk, ENQUEUE_RESTORE);
 
 	task_rq_unlock(rq, tsk, &flags);
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 046242f..e08cc4c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1151,16 +1151,18 @@ static const u32 prio_to_wmult[40] = {
  /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
 };
 
-#define ENQUEUE_WAKEUP		1
-#define ENQUEUE_HEAD		2
+#define ENQUEUE_WAKEUP		0x01
+#define ENQUEUE_HEAD		0x02
 #ifdef CONFIG_SMP
-#define ENQUEUE_WAKING		4	/* sched_class::task_waking was called */
+#define ENQUEUE_WAKING		0x04	/* sched_class::task_waking was called */
 #else
-#define ENQUEUE_WAKING		0
+#define ENQUEUE_WAKING		0x00
 #endif
-#define ENQUEUE_REPLENISH	8
+#define ENQUEUE_REPLENISH	0x08
+#define ENQUEUE_RESTORE	0x10
 
-#define DEQUEUE_SLEEP		1
+#define DEQUEUE_SLEEP		0x01
+#define DEQUEUE_SAVE		0x02
 
 #define RETRY_TASK		((void *)-1UL)
 

