linux-kernel.vger.kernel.org archive mirror
* [PATCH] sched: fix incorrect wait time and wait count statistics
@ 2015-10-25  5:23 Joonwoo Park
  2015-10-25 10:08 ` Peter Zijlstra
                   ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: Joonwoo Park @ 2015-10-25  5:23 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra; +Cc: linux-kernel, linux-arm-msm, Joonwoo Park

At present the scheduler resets a task's wait start timestamp when the
task migrates to another rq.  This misleads the scheduler into reporting
less wait time than actual, by omitting the time spent waiting prior to
the migration, and a higher wait count than actual, by counting the
migration as a wait end event.  Both effects can be seen by trace or
/proc/<pid>/sched with CONFIG_SCHEDSTATS=y.

Carry a migrating task's wait time forward across the migration and do
not count the migration as a wait end event, to fix these statistics
errors.
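
As a reference for how these numbers surface to userspace, here is a
minimal C sketch (not part of this patch; it only assumes that the
CONFIG_SCHEDSTATS fields in /proc/<pid>/sched carry "wait_" in their
names, e.g. se.statistics.wait_sum) that dumps the wait statistics for
a given pid:

	/*
	 * Sketch: print the wait-related schedstat lines from
	 * /proc/<pid>/sched (defaults to the current process).
	 */
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	int main(int argc, char **argv)
	{
		char path[64], line[256];
		FILE *fp;

		snprintf(path, sizeof(path), "/proc/%s/sched",
			 argc > 1 ? argv[1] : "self");
		fp = fopen(path, "r");
		if (!fp) {
			perror(path);
			return EXIT_FAILURE;
		}

		while (fgets(line, sizeof(line), fp))
			if (strstr(line, "wait_"))
				fputs(line, stdout);

		fclose(fp);
		return 0;
	}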

Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
---
 kernel/sched/core.c  |  4 ++--
 kernel/sched/fair.c  | 41 ++++++++++++++++++++++++++++-------------
 kernel/sched/sched.h |  2 ++
 3 files changed, 32 insertions(+), 15 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bcd214e..4f20895 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1069,7 +1069,7 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
 {
 	lockdep_assert_held(&rq->lock);
 
-	dequeue_task(rq, p, 0);
+	dequeue_task(rq, p, DEQUEUE_MIGRATING);
 	p->on_rq = TASK_ON_RQ_MIGRATING;
 	set_task_cpu(p, new_cpu);
 	raw_spin_unlock(&rq->lock);
@@ -1079,7 +1079,7 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
 	raw_spin_lock(&rq->lock);
 	BUG_ON(task_cpu(p) != new_cpu);
 	p->on_rq = TASK_ON_RQ_QUEUED;
-	enqueue_task(rq, p, 0);
+	enqueue_task(rq, p, ENQUEUE_MIGRATING);
 	check_preempt_curr(rq, p, 0);
 
 	return rq;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9a5e60f..53ec4d4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -738,27 +738,41 @@ static void update_curr_fair(struct rq *rq)
 }
 
 static inline void
-update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se,
+			bool migrating)
 {
-	schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
+	schedstat_set(se->statistics.wait_start,
+		migrating &&
+		likely(rq_clock(rq_of(cfs_rq)) > se->statistics.wait_start) ?
+		rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start :
+		rq_clock(rq_of(cfs_rq)));
 }
 
 /*
  * Task is being enqueued - update stats:
  */
-static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se,
+				 bool migrating)
 {
 	/*
 	 * Are we enqueueing a waiting task? (for current tasks
 	 * a dequeue/enqueue event is a NOP)
 	 */
 	if (se != cfs_rq->curr)
-		update_stats_wait_start(cfs_rq, se);
+		update_stats_wait_start(cfs_rq, se, migrating);
 }
 
 static void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se,
+		      bool migrating)
 {
+	if (migrating) {
+		schedstat_set(se->statistics.wait_start,
+			      rq_clock(rq_of(cfs_rq)) -
+			      se->statistics.wait_start);
+		return;
+	}
+
 	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
 			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
 	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
@@ -774,14 +788,15 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 static inline void
-update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se,
+		     bool migrating)
 {
 	/*
 	 * Mark the end of the wait period if dequeueing a
 	 * waiting task:
 	 */
 	if (se != cfs_rq->curr)
-		update_stats_wait_end(cfs_rq, se);
+		update_stats_wait_end(cfs_rq, se, migrating);
 }
 
 /*
@@ -2960,7 +2975,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 		enqueue_sleeper(cfs_rq, se);
 	}
 
-	update_stats_enqueue(cfs_rq, se);
+	update_stats_enqueue(cfs_rq, se, !!(flags & ENQUEUE_MIGRATING));
 	check_spread(cfs_rq, se);
 	if (se != cfs_rq->curr)
 		__enqueue_entity(cfs_rq, se);
@@ -3028,7 +3043,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	update_curr(cfs_rq);
 	dequeue_entity_load_avg(cfs_rq, se);
 
-	update_stats_dequeue(cfs_rq, se);
+	update_stats_dequeue(cfs_rq, se, !!(flags & DEQUEUE_MIGRATING));
 	if (flags & DEQUEUE_SLEEP) {
 #ifdef CONFIG_SCHEDSTATS
 		if (entity_is_task(se)) {
@@ -3114,7 +3129,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		 * a CPU. So account for the time it spent waiting on the
 		 * runqueue.
 		 */
-		update_stats_wait_end(cfs_rq, se);
+		update_stats_wait_end(cfs_rq, se, false);
 		__dequeue_entity(cfs_rq, se);
 		update_load_avg(se, 1);
 	}
@@ -3212,7 +3227,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 
 	check_spread(cfs_rq, prev);
 	if (prev->on_rq) {
-		update_stats_wait_start(cfs_rq, prev);
+		update_stats_wait_start(cfs_rq, prev, false);
 		/* Put 'current' back into the tree. */
 		__enqueue_entity(cfs_rq, prev);
 		/* in !on_rq case, update occurred at dequeue */
@@ -5656,7 +5671,7 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
 {
 	lockdep_assert_held(&env->src_rq->lock);
 
-	deactivate_task(env->src_rq, p, 0);
+	deactivate_task(env->src_rq, p, DEQUEUE_MIGRATING);
 	p->on_rq = TASK_ON_RQ_MIGRATING;
 	set_task_cpu(p, env->dst_cpu);
 }
@@ -5791,7 +5806,7 @@ static void attach_task(struct rq *rq, struct task_struct *p)
 
 	BUG_ON(task_rq(p) != rq);
 	p->on_rq = TASK_ON_RQ_QUEUED;
-	activate_task(rq, p, 0);
+	activate_task(rq, p, ENQUEUE_MIGRATING);
 	check_preempt_curr(rq, p, 0);
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6d2a119..c1ac118 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1165,8 +1165,10 @@ static const u32 prio_to_wmult[40] = {
 #define ENQUEUE_WAKING		0
 #endif
 #define ENQUEUE_REPLENISH	8
+#define ENQUEUE_MIGRATING	16
 
 #define DEQUEUE_SLEEP		1
+#define DEQUEUE_MIGRATING	2
 
 #define RETRY_TASK		((void *)-1UL)
 
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
hosted by The Linux Foundation


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH] sched: fix incorrect wait time and wait count statistics
  2015-10-25  5:23 [PATCH] sched: fix incorrect wait time and wait count statistics Joonwoo Park
@ 2015-10-25 10:08 ` Peter Zijlstra
  2015-10-25 10:26 ` Peter Zijlstra
  2015-10-27 19:17 ` [PATCH v3] sched: fix " Joonwoo Park
  2 siblings, 0 replies; 13+ messages in thread
From: Peter Zijlstra @ 2015-10-25 10:08 UTC (permalink / raw)
  To: Joonwoo Park; +Cc: Ingo Molnar, linux-kernel, linux-arm-msm

On Sat, Oct 24, 2015 at 10:23:14PM -0700, Joonwoo Park wrote:
> At present scheduler resets task's wait start timestamp when the task
> migrates to another rq.  This misleads scheduler itself into reporting
> less wait time than actual by omitting time spent for waiting prior to
> migration and also more wait count than actual by counting migration as
> wait end event which can be seen by trace or /proc/<pid>/sched with
> CONFIG_SCHEDSTATS=y.
> 

Have you seen:

  1de64443d755 ("sched/core: Fix task and run queue sched_info::run_delay inconsistencies")

?

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] sched: fix incorrect wait time and wait count statistics
  2015-10-25  5:23 [PATCH] sched: fix incorrect wait time and wait count statistics Joonwoo Park
  2015-10-25 10:08 ` Peter Zijlstra
@ 2015-10-25 10:26 ` Peter Zijlstra
  2015-10-27  1:44   ` Joonwoo Park
  2015-10-27 19:17 ` [PATCH v3] sched: fix " Joonwoo Park
  2 siblings, 1 reply; 13+ messages in thread
From: Peter Zijlstra @ 2015-10-25 10:26 UTC (permalink / raw)
  To: Joonwoo Park; +Cc: Ingo Molnar, linux-kernel, linux-arm-msm

On Sat, Oct 24, 2015 at 10:23:14PM -0700, Joonwoo Park wrote:
> @@ -1069,7 +1069,7 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
>  {
>  	lockdep_assert_held(&rq->lock);
>  
> -	dequeue_task(rq, p, 0);
> +	dequeue_task(rq, p, DEQUEUE_MIGRATING);
>  	p->on_rq = TASK_ON_RQ_MIGRATING;
>  	set_task_cpu(p, new_cpu);
>  	raw_spin_unlock(&rq->lock);

> @@ -5656,7 +5671,7 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
>  {
>  	lockdep_assert_held(&env->src_rq->lock);
>  
> -	deactivate_task(env->src_rq, p, 0);
> +	deactivate_task(env->src_rq, p, DEQUEUE_MIGRATING);
>  	p->on_rq = TASK_ON_RQ_MIGRATING;
>  	set_task_cpu(p, env->dst_cpu);
>  }

Also note that on both sites we also set TASK_ON_RQ_MIGRATING -- albeit
late. Can't you simply set that earlier (and back to QUEUED later) and
test for task_on_rq_migrating() instead of blowing up the fastpath like
you did?
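
For reference, the marker and the test referred to here already exist;
in kernel/sched/sched.h of this era they look roughly like this
(paraphrased, not verbatim):

	/* task_struct::on_rq states */
	#define TASK_ON_RQ_QUEUED	1
	#define TASK_ON_RQ_MIGRATING	2

	static inline int task_on_rq_queued(struct task_struct *p)
	{
		return p->on_rq == TASK_ON_RQ_QUEUED;
	}

	static inline int task_on_rq_migrating(struct task_struct *p)
	{
		return p->on_rq == TASK_ON_RQ_MIGRATING;
	}

So the suggestion is to set p->on_rq to TASK_ON_RQ_MIGRATING before the
dequeue and restore TASK_ON_RQ_QUEUED after the enqueue, letting the
schedstat helpers check task_on_rq_migrating() instead of taking a new
flag argument.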

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] sched: fix incorrect wait time and wait count statistics
  2015-10-25 10:26 ` Peter Zijlstra
@ 2015-10-27  1:44   ` Joonwoo Park
  2015-10-27 12:57     ` Peter Zijlstra
  0 siblings, 1 reply; 13+ messages in thread
From: Joonwoo Park @ 2015-10-27  1:44 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Ingo Molnar, linux-kernel, linux-arm-msm

[-- Attachment #1: Type: text/plain, Size: 1210 bytes --]

On 10/25/2015 03:26 AM, Peter Zijlstra wrote:
> On Sat, Oct 24, 2015 at 10:23:14PM -0700, Joonwoo Park wrote:
>> @@ -1069,7 +1069,7 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
>>  {
>>  	lockdep_assert_held(&rq->lock);
>>  
>> -	dequeue_task(rq, p, 0);
>> +	dequeue_task(rq, p, DEQUEUE_MIGRATING);
>>  	p->on_rq = TASK_ON_RQ_MIGRATING;
>>  	set_task_cpu(p, new_cpu);
>>  	raw_spin_unlock(&rq->lock);
> 
>> @@ -5656,7 +5671,7 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
>>  {
>>  	lockdep_assert_held(&env->src_rq->lock);
>>  
>> -	deactivate_task(env->src_rq, p, 0);
>> +	deactivate_task(env->src_rq, p, DEQUEUE_MIGRATING);
>>  	p->on_rq = TASK_ON_RQ_MIGRATING;
>>  	set_task_cpu(p, env->dst_cpu);
>>  }
> 
> Also note that on both sites we also set TASK_ON_RQ_MIGRATING -- albeit
> late. Can't you simply set that earlier (and back to QUEUED later) and
> test for task_on_rq_migrating() instead of blowing up the fastpath like
> you did?
> 

Yes, it's doable, and I find it much simpler.
Please find patch v2 attached.  I verified that v2 does the same job as v1 by comparing the sched_stat_wait time against the delta between the sched_switch and sched_wakeup timestamps.

Thanks,
Joonwoo

[-- Attachment #2: 0001-sched-fix-incorrect-wait-time-and-wait-count-statist.patch --]
[-- Type: text/x-patch, Size: 3924 bytes --]

From 98d615d46211a90482a0f9b7204265c54bba8520 Mon Sep 17 00:00:00 2001
From: Joonwoo Park <joonwoop@codeaurora.org>
Date: Mon, 26 Oct 2015 16:37:47 -0700
Subject: [PATCH v2] sched: fix incorrect wait time and wait count statistics

At present the scheduler resets a task's wait start timestamp when the
task migrates to another rq.  This misleads the scheduler into reporting
less wait time than actual, by omitting the time spent waiting prior to
the migration, and a higher wait count than actual, by counting the
migration as a wait end event.  Both effects can be seen by trace or
/proc/<pid>/sched with CONFIG_SCHEDSTATS=y.

Carry a migrating task's wait time forward across the migration and do
not count the migration as a wait end event, to fix these statistics
errors.

In order to determine whether a task is migrating, mark task->on_rq
with TASK_ON_RQ_MIGRATING while it is being dequeued and enqueued due
to migration.

To: Ingo Molnar <mingo@kernel.org>
To: Peter Zijlstra <peterz@infradead.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
---
Changes in v2: 
 * Set p->on_rq = TASK_ON_RQ_MIGRATING while doing migration dequeue/enqueue
   and check whether task's migrating with task_on_rq_migrating().

 kernel/sched/core.c |  4 ++--
 kernel/sched/fair.c | 17 ++++++++++++++---
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bcd214e..d9e4ad5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1069,8 +1069,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
 {
 	lockdep_assert_held(&rq->lock);
 
-	dequeue_task(rq, p, 0);
 	p->on_rq = TASK_ON_RQ_MIGRATING;
+	dequeue_task(rq, p, 0);
 	set_task_cpu(p, new_cpu);
 	raw_spin_unlock(&rq->lock);
 
@@ -1078,8 +1078,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
 
 	raw_spin_lock(&rq->lock);
 	BUG_ON(task_cpu(p) != new_cpu);
-	p->on_rq = TASK_ON_RQ_QUEUED;
 	enqueue_task(rq, p, 0);
+	p->on_rq = TASK_ON_RQ_QUEUED;
 	check_preempt_curr(rq, p, 0);
 
 	return rq;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9a5e60f..7609576 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -740,7 +740,11 @@ static void update_curr_fair(struct rq *rq)
 static inline void
 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
+	schedstat_set(se->statistics.wait_start,
+		task_on_rq_migrating(task_of(se)) &&
+		likely(rq_clock(rq_of(cfs_rq)) > se->statistics.wait_start) ?
+		rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start :
+		rq_clock(rq_of(cfs_rq)));
 }
 
 /*
@@ -759,6 +763,13 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 static void
 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
+	if (task_on_rq_migrating(task_of(se))) {
+		schedstat_set(se->statistics.wait_start,
+			      rq_clock(rq_of(cfs_rq)) -
+			      se->statistics.wait_start);
+		return;
+	}
+
 	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
 			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
 	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
@@ -5656,8 +5667,8 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
 {
 	lockdep_assert_held(&env->src_rq->lock);
 
-	deactivate_task(env->src_rq, p, 0);
 	p->on_rq = TASK_ON_RQ_MIGRATING;
+	deactivate_task(env->src_rq, p, 0);
 	set_task_cpu(p, env->dst_cpu);
 }
 
@@ -5790,8 +5801,8 @@ static void attach_task(struct rq *rq, struct task_struct *p)
 	lockdep_assert_held(&rq->lock);
 
 	BUG_ON(task_rq(p) != rq);
-	p->on_rq = TASK_ON_RQ_QUEUED;
 	activate_task(rq, p, 0);
+	p->on_rq = TASK_ON_RQ_QUEUED;
 	check_preempt_curr(rq, p, 0);
 }
 
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
hosted by The Linux Foundation


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH] sched: fix incorrect wait time and wait count statistics
  2015-10-27  1:44   ` Joonwoo Park
@ 2015-10-27 12:57     ` Peter Zijlstra
  2015-10-28  2:40       ` Joonwoo Park
  0 siblings, 1 reply; 13+ messages in thread
From: Peter Zijlstra @ 2015-10-27 12:57 UTC (permalink / raw)
  To: Joonwoo Park; +Cc: Ingo Molnar, linux-kernel, linux-arm-msm, ohaugan


(Excessive quoting for Olav)

On Mon, Oct 26, 2015 at 06:44:48PM -0700, Joonwoo Park wrote:
> On 10/25/2015 03:26 AM, Peter Zijlstra wrote:

> > Also note that on both sites we also set TASK_ON_RQ_MIGRATING -- albeit
> > late. Can't you simply set that earlier (and back to QUEUED later) and
> > test for task_on_rq_migrating() instead of blowing up the fastpath like
> > you did?
> > 
> 
> Yes it's doable.  I also find it's much simpler.

> From 98d615d46211a90482a0f9b7204265c54bba8520 Mon Sep 17 00:00:00 2001
> From: Joonwoo Park <joonwoop@codeaurora.org>
> Date: Mon, 26 Oct 2015 16:37:47 -0700
> Subject: [PATCH v2] sched: fix incorrect wait time and wait count statistics
> 
> At present scheduler resets task's wait start timestamp when the task
> migrates to another rq.  This misleads scheduler itself into reporting
> less wait time than actual by omitting time spent for waiting prior to
> migration and also more wait count than actual by counting migration as
> wait end event which can be seen by trace or /proc/<pid>/sched with
> CONFIG_SCHEDSTATS=y.
> 
> Carry forward migrating task's wait time prior to migration and
> don't count migration as a wait end event to fix such statistics error.
> 
> In order to determine whether task is migrating mark task->on_rq with
> TASK_ON_RQ_MIGRATING while dequeuing and enqueuing due to migration.
> 
> To: Ingo Molnar <mingo@kernel.org>
> To: Peter Zijlstra <peterz@infradead.org>
> Cc: linux-kernel@vger.kernel.org
> Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
> ---

So now that you rely on TASK_ON_RQ_MIGRATING, I think you missed one
place that can migrate sched_fair tasks and doesn't set it.

Olav recently did a patch adding TASK_ON_RQ_MIGRATING to _every_
migration path, but that is (still) somewhat overkill. With your changes
we need it for sched_fair though.

So I think you need to change __migrate_swap_task(), which is used by
the NUMA scheduling to swap two running tasks.

Also, it might be prudent to extend the CONFIG_SCHED_DEBUG ifdef in
set_task_cpu() to test for this new requirement:

	WARN_ON_ONCE(p->state == TASK_RUNNING &&
	             p->class == &fair_sched_class &&
		     p->on_rq != TASK_ON_RQ_MIGRATING);

>  kernel/sched/core.c |  4 ++--
>  kernel/sched/fair.c | 17 ++++++++++++++---
>  2 files changed, 16 insertions(+), 5 deletions(-)
> 
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index bcd214e..d9e4ad5 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1069,8 +1069,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
>  {
>  	lockdep_assert_held(&rq->lock);
>  
> -	dequeue_task(rq, p, 0);
>  	p->on_rq = TASK_ON_RQ_MIGRATING;
> +	dequeue_task(rq, p, 0);
>  	set_task_cpu(p, new_cpu);
>  	raw_spin_unlock(&rq->lock);
>  
> @@ -1078,8 +1078,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
>  
>  	raw_spin_lock(&rq->lock);
>  	BUG_ON(task_cpu(p) != new_cpu);
> -	p->on_rq = TASK_ON_RQ_QUEUED;
>  	enqueue_task(rq, p, 0);
> +	p->on_rq = TASK_ON_RQ_QUEUED;
>  	check_preempt_curr(rq, p, 0);
>  
>  	return rq;
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 9a5e60f..7609576 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -740,7 +740,11 @@ static void update_curr_fair(struct rq *rq)
>  static inline void
>  update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  {
> -	schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
> +	schedstat_set(se->statistics.wait_start,
> +		task_on_rq_migrating(task_of(se)) &&
> +		likely(rq_clock(rq_of(cfs_rq)) > se->statistics.wait_start) ?
> +		rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start :
> +		rq_clock(rq_of(cfs_rq)));

So I get that you want to avoid emitting code for !SCHEDSTATS; but that
is rather unreadable. Either use the GCC stmt-expr or wrap the lot in
#ifdef.
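
To make the first alternative concrete: a statement expression keeps
everything inside schedstat_set(), so it still compiles away for
!CONFIG_SCHEDSTATS while reading top-to-bottom.  A sketch, illustrative
only and mirroring the v2 code quoted above rather than any posted
version:

	static inline void
	update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
	{
		schedstat_set(se->statistics.wait_start, ({
			u64 now = rq_clock(rq_of(cfs_rq));
			u64 prev = se->statistics.wait_start;

			task_on_rq_migrating(task_of(se)) && likely(now > prev) ?
				now - prev : now;
		}));
	}

The second alternative, wrapping the whole helper in
#ifdef CONFIG_SCHEDSTATS with an empty stub otherwise, is the route the
later v4 patch ends up taking.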

>  }
>  
>  /*
> @@ -759,6 +763,13 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  static void
>  update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  {
> +	if (task_on_rq_migrating(task_of(se))) {

Maybe add a comment that we adjust the wait_start time-base and will
rebase on the new rq_clock in update_stats_wait_start().
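
The time-base adjustment works in two steps: at the migration dequeue,
update_stats_wait_end() overwrites wait_start with the wait already
accumulated on the source rq; at the enqueue on the destination rq,
update_stats_wait_start() subtracts that saved amount from the new rq
clock, so the final wait delta includes the pre-migration portion.  A
standalone sketch of the arithmetic, with made-up clock values:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		/* Hypothetical rq clock readings, in nanoseconds. */
		uint64_t src_enqueue = 1000;	/* wait starts on source rq   */
		uint64_t src_migrate = 1600;	/* dequeued for migration     */
		uint64_t dst_enqueue = 5000;	/* enqueued on destination rq */
		uint64_t dst_picked  = 5250;	/* finally picked to run      */

		/* update_stats_wait_end() at migration: keep elapsed wait. */
		uint64_t wait_start = src_migrate - src_enqueue;	/* 600  */

		/* update_stats_wait_start() on the destination: rebase.    */
		wait_start = dst_enqueue - wait_start;			/* 4400 */

		/* update_stats_wait_end() when the task finally runs.      */
		uint64_t delta = dst_picked - wait_start;		/* 850  */

		printf("wait accounted: %llu ns (600 + 250)\n",
		       (unsigned long long)delta);
		return 0;
	}

With this in place, the 600 ns spent waiting on the source rq is
accounted once the task actually runs on the destination, and
wait_count is not bumped at migration time.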

> +		schedstat_set(se->statistics.wait_start,
> +			      rq_clock(rq_of(cfs_rq)) -
> +			      se->statistics.wait_start);
> +		return;
> +	}
> +
>  	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
>  			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
>  	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
> @@ -5656,8 +5667,8 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
>  {
>  	lockdep_assert_held(&env->src_rq->lock);
>  
> -	deactivate_task(env->src_rq, p, 0);
>  	p->on_rq = TASK_ON_RQ_MIGRATING;
> +	deactivate_task(env->src_rq, p, 0);
>  	set_task_cpu(p, env->dst_cpu);
>  }
>  
> @@ -5790,8 +5801,8 @@ static void attach_task(struct rq *rq, struct task_struct *p)
>  	lockdep_assert_held(&rq->lock);
>  
>  	BUG_ON(task_rq(p) != rq);
> -	p->on_rq = TASK_ON_RQ_QUEUED;
>  	activate_task(rq, p, 0);
> +	p->on_rq = TASK_ON_RQ_QUEUED;
>  	check_preempt_curr(rq, p, 0);
>  }

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH v3] sched: fix incorrect wait time and wait count statistics
  2015-10-25  5:23 [PATCH] sched: fix incorrect wait time and wait count statistics Joonwoo Park
  2015-10-25 10:08 ` Peter Zijlstra
  2015-10-25 10:26 ` Peter Zijlstra
@ 2015-10-27 19:17 ` Joonwoo Park
  2 siblings, 0 replies; 13+ messages in thread
From: Joonwoo Park @ 2015-10-27 19:17 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra
  Cc: linux-kernel, linux-arm-msm, fengguang.wu, Joonwoo Park

At present the scheduler resets a task's wait start timestamp when the
task migrates to another rq.  This misleads the scheduler into reporting
less wait time than actual, by omitting the time spent waiting prior to
the migration, and a higher wait count than actual, by counting the
migration as a wait end event.  Both effects can be seen by trace or
/proc/<pid>/sched with CONFIG_SCHEDSTATS=y.

Carry a migrating task's wait time forward across the migration and do
not count the migration as a wait end event, to fix these statistics
errors.

In order to determine whether a task is migrating, mark task->on_rq
with TASK_ON_RQ_MIGRATING while it is being dequeued and enqueued due
to migration.

To: Ingo Molnar <mingo@kernel.org>
To: Peter Zijlstra <peterz@infradead.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
---
Changes in v2: 
 * Set p->on_rq = TASK_ON_RQ_MIGRATING while doing migration dequeue/enqueue
   and check whether task's migrating with task_on_rq_migrating().
Changes in v3: 
 * Fixed "WARNING: CPU: 0 PID: 3 at kernel/sched/fair.c:260 update_stats_wait_end+0x23/0x30()" caught by Intel kernel test robot.

 kernel/sched/core.c |  4 ++--
 kernel/sched/fair.c | 27 ++++++++++++++++++++++-----
 2 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bcd214e..d9e4ad5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1069,8 +1069,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
 {
 	lockdep_assert_held(&rq->lock);
 
-	dequeue_task(rq, p, 0);
 	p->on_rq = TASK_ON_RQ_MIGRATING;
+	dequeue_task(rq, p, 0);
 	set_task_cpu(p, new_cpu);
 	raw_spin_unlock(&rq->lock);
 
@@ -1078,8 +1078,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
 
 	raw_spin_lock(&rq->lock);
 	BUG_ON(task_cpu(p) != new_cpu);
-	p->on_rq = TASK_ON_RQ_QUEUED;
 	enqueue_task(rq, p, 0);
+	p->on_rq = TASK_ON_RQ_QUEUED;
 	check_preempt_curr(rq, p, 0);
 
 	return rq;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9a5e60f..4c174a1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -740,7 +740,11 @@ static void update_curr_fair(struct rq *rq)
 static inline void
 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
+	schedstat_set(se->statistics.wait_start,
+		entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
+		likely(rq_clock(rq_of(cfs_rq)) > se->statistics.wait_start) ?
+		rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start :
+		rq_clock(rq_of(cfs_rq)));
 }
 
 /*
@@ -756,22 +760,35 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		update_stats_wait_start(cfs_rq, se);
 }
 
+#ifdef CONFIG_SCHEDSTATS
 static void
 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
+	if (entity_is_task(se) && task_on_rq_migrating(task_of(se))) {
+		schedstat_set(se->statistics.wait_start,
+			      rq_clock(rq_of(cfs_rq)) -
+			      se->statistics.wait_start);
+		return;
+	}
+
 	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
 			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
 	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
 	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
 			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
-#ifdef CONFIG_SCHEDSTATS
+
 	if (entity_is_task(se)) {
 		trace_sched_stat_wait(task_of(se),
 			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
 	}
-#endif
 	schedstat_set(se->statistics.wait_start, 0);
 }
+#else
+static inline void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+#endif
 
 static inline void
 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -5656,8 +5673,8 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
 {
 	lockdep_assert_held(&env->src_rq->lock);
 
-	deactivate_task(env->src_rq, p, 0);
 	p->on_rq = TASK_ON_RQ_MIGRATING;
+	deactivate_task(env->src_rq, p, 0);
 	set_task_cpu(p, env->dst_cpu);
 }
 
@@ -5790,8 +5807,8 @@ static void attach_task(struct rq *rq, struct task_struct *p)
 	lockdep_assert_held(&rq->lock);
 
 	BUG_ON(task_rq(p) != rq);
-	p->on_rq = TASK_ON_RQ_QUEUED;
 	activate_task(rq, p, 0);
+	p->on_rq = TASK_ON_RQ_QUEUED;
 	check_preempt_curr(rq, p, 0);
 }
 
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
hosted by The Linux Foundation


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH] sched: fix incorrect wait time and wait count statistics
  2015-10-27 12:57     ` Peter Zijlstra
@ 2015-10-28  2:40       ` Joonwoo Park
  2015-10-28  4:46         ` [PATCH v4] " Joonwoo Park
  0 siblings, 1 reply; 13+ messages in thread
From: Joonwoo Park @ 2015-10-28  2:40 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Ingo Molnar, linux-kernel, linux-arm-msm, ohaugan

On Tue, Oct 27, 2015 at 01:57:28PM +0100, Peter Zijlstra wrote:
> 
> (Excessive quoting for Olav)
> 
> On Mon, Oct 26, 2015 at 06:44:48PM -0700, Joonwoo Park wrote:
> > On 10/25/2015 03:26 AM, Peter Zijlstra wrote:
> 
> > > Also note that on both sites we also set TASK_ON_RQ_MIGRATING -- albeit
> > > late. Can't you simply set that earlier (and back to QUEUED later) and
> > > test for task_on_rq_migrating() instead of blowing up the fastpath like
> > > you did?
> > > 
> > 
> > Yes it's doable.  I also find it's much simpler.
> 
> > From 98d615d46211a90482a0f9b7204265c54bba8520 Mon Sep 17 00:00:00 2001
> > From: Joonwoo Park <joonwoop@codeaurora.org>
> > Date: Mon, 26 Oct 2015 16:37:47 -0700
> > Subject: [PATCH v2] sched: fix incorrect wait time and wait count statistics
> > 
> > At present scheduler resets task's wait start timestamp when the task
> > migrates to another rq.  This misleads scheduler itself into reporting
> > less wait time than actual by omitting time spent for waiting prior to
> > migration and also more wait count than actual by counting migration as
> > wait end event which can be seen by trace or /proc/<pid>/sched with
> > CONFIG_SCHEDSTATS=y.
> > 
> > Carry forward migrating task's wait time prior to migration and
> > don't count migration as a wait end event to fix such statistics error.
> > 
> > In order to determine whether task is migrating mark task->on_rq with
> > TASK_ON_RQ_MIGRATING while dequeuing and enqueuing due to migration.
> > 
> > To: Ingo Molnar <mingo@kernel.org>
> > To: Peter Zijlstra <peterz@infradead.org>
> > Cc: linux-kernel@vger.kernel.org
> > Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
> > ---
> 
> So now that you rely on TASK_ON_RQ_MIGRATING; I think you missed one
> place that can migrate sched_fair tasks and doesn't set it.
> 
> Olav recently did a patch adding TASK_ON_RQ_MIGRATING to _every_
> migration path, but that is (still) somewhat overkill. With your changes
> we need it for sched_fair though.
> 
> So I think you need to change __migrate_swap_task(), which is used by
> the NUMA scheduling to swap two running tasks.

Oh yes... __migrate_swap_task() can migrate a fair class task, so I should mark it as TASK_ON_RQ_MIGRATING there as well.
I will fix this in a subsequent patch.

> 
> Also, it might be prudent to extend the CONFIG_SCHED_DEBUG ifdef in
> set_task_cpu() to test for this new requirement:
> 
> 	WARN_ON_ONCE(p->state == TASK_RUNNING &&
> 	             p->class == &fair_sched_class &&
> 		     p->on_rq != TASK_ON_RQ_MIGRATING);
> 

I was going to argue that if we had Olav's change (with the revised marking
order), someone like me wouldn't need to worry about stale on_rq state and
probably wouldn't have made the mistake I did with __migrate_swap_task().
But I don't think I can argue that anymore, since my patch will set
TASK_ON_RQ_MIGRATING on all migration paths, at least for the fair class
tasks, and moreover the warning you suggested above would enforce the new
requirement.

I will add that warning.

I recall Olav had some other reason for his patch, though.

> >  kernel/sched/core.c |  4 ++--
> >  kernel/sched/fair.c | 17 ++++++++++++++---
> >  2 files changed, 16 insertions(+), 5 deletions(-)
> > 
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index bcd214e..d9e4ad5 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -1069,8 +1069,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
> >  {
> >  	lockdep_assert_held(&rq->lock);
> >  
> > -	dequeue_task(rq, p, 0);
> >  	p->on_rq = TASK_ON_RQ_MIGRATING;
> > +	dequeue_task(rq, p, 0);
> >  	set_task_cpu(p, new_cpu);
> >  	raw_spin_unlock(&rq->lock);
> >  
> > @@ -1078,8 +1078,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
> >  
> >  	raw_spin_lock(&rq->lock);
> >  	BUG_ON(task_cpu(p) != new_cpu);
> > -	p->on_rq = TASK_ON_RQ_QUEUED;
> >  	enqueue_task(rq, p, 0);
> > +	p->on_rq = TASK_ON_RQ_QUEUED;
> >  	check_preempt_curr(rq, p, 0);
> >  
> >  	return rq;
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 9a5e60f..7609576 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -740,7 +740,11 @@ static void update_curr_fair(struct rq *rq)
> >  static inline void
> >  update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
> >  {
> > -	schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
> > +	schedstat_set(se->statistics.wait_start,
> > +		task_on_rq_migrating(task_of(se)) &&
> > +		likely(rq_clock(rq_of(cfs_rq)) > se->statistics.wait_start) ?
> > +		rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start :
> > +		rq_clock(rq_of(cfs_rq)));
> 
> So I get that you want to avoid emitting code for !SCHEDSTATS; but that
> is rather unreadable. Either use the GCC stmt-expr or wrap the lot in
> #ifdef.
> 

Will do.

> >  }
> >  
> >  /*
> > @@ -759,6 +763,13 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
> >  static void
> >  update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
> >  {
> > +	if (task_on_rq_migrating(task_of(se))) {
> 
> Maybe add a comment that we adjust the wait_start time-base and will
> rebase on the new rq_clock in update_stats_wait_start().
> 

Will do.

Thanks,
Joonwoo

> > +		schedstat_set(se->statistics.wait_start,
> > +			      rq_clock(rq_of(cfs_rq)) -
> > +			      se->statistics.wait_start);
> > +		return;
> > +	}
> > +
> >  	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
> >  			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
> >  	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
> > @@ -5656,8 +5667,8 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
> >  {
> >  	lockdep_assert_held(&env->src_rq->lock);
> >  
> > -	deactivate_task(env->src_rq, p, 0);
> >  	p->on_rq = TASK_ON_RQ_MIGRATING;
> > +	deactivate_task(env->src_rq, p, 0);
> >  	set_task_cpu(p, env->dst_cpu);
> >  }
> >  
> > @@ -5790,8 +5801,8 @@ static void attach_task(struct rq *rq, struct task_struct *p)
> >  	lockdep_assert_held(&rq->lock);
> >  
> >  	BUG_ON(task_rq(p) != rq);
> > -	p->on_rq = TASK_ON_RQ_QUEUED;
> >  	activate_task(rq, p, 0);
> > +	p->on_rq = TASK_ON_RQ_QUEUED;
> >  	check_preempt_curr(rq, p, 0);
> >  }

-- 
The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
hosted by The Linux Foundation

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH v4] sched: fix incorrect wait time and wait count statistics
  2015-10-28  2:40       ` Joonwoo Park
@ 2015-10-28  4:46         ` Joonwoo Park
  2015-11-06 13:57           ` Peter Zijlstra
  0 siblings, 1 reply; 13+ messages in thread
From: Joonwoo Park @ 2015-10-28  4:46 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, linux-kernel, linux-arm-msm, ohaugan, Joonwoo Park

At present the scheduler resets a task's wait start timestamp when the
task migrates to another rq.  This misleads the scheduler into reporting
less wait time than actual, by omitting the time spent waiting prior to
the migration, and a higher wait count than actual, by counting the
migration as a wait end event.  Both effects can be seen by trace or
/proc/<pid>/sched with CONFIG_SCHEDSTATS=y.

Carry a migrating task's wait time forward across the migration and do
not count the migration as a wait end event, to fix these statistics
errors.

In order to determine whether a fair class task is migrating, mark
task->on_rq with TASK_ON_RQ_MIGRATING while it is being dequeued and
enqueued due to migration.

To: Ingo Molnar <mingo@kernel.org>
To: Peter Zijlstra <peterz@infradead.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
---
Changes in v2: 
 * Set p->on_rq = TASK_ON_RQ_MIGRATING while doing migration dequeue/enqueue
   and check whether task's migrating with task_on_rq_migrating().
Changes in v3: 
 * Fixed "WARNING: CPU: 0 PID: 3 at kernel/sched/fair.c:260 update_stats_wait_end+0x23/0x30()" caught by Intel kernel test robot.
Changes in v4: 
 * Made __migrate_swap_task() to set p->on_rq = TASK_ON_RQ_MIGRATING.
 * Added WARN_ON_ONCE() inside CONFIG_SCHED_DEBUG.
 * Added comments.
 * Cleanup with ifdefy.

 kernel/sched/core.c | 15 +++++++++++--
 kernel/sched/fair.c | 62 ++++++++++++++++++++++++++++++++++++++---------------
 2 files changed, 58 insertions(+), 19 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bcd214e..1ddbabc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1069,8 +1069,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
 {
 	lockdep_assert_held(&rq->lock);
 
-	dequeue_task(rq, p, 0);
 	p->on_rq = TASK_ON_RQ_MIGRATING;
+	dequeue_task(rq, p, 0);
 	set_task_cpu(p, new_cpu);
 	raw_spin_unlock(&rq->lock);
 
@@ -1078,8 +1078,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
 
 	raw_spin_lock(&rq->lock);
 	BUG_ON(task_cpu(p) != new_cpu);
-	p->on_rq = TASK_ON_RQ_QUEUED;
 	enqueue_task(rq, p, 0);
+	p->on_rq = TASK_ON_RQ_QUEUED;
 	check_preempt_curr(rq, p, 0);
 
 	return rq;
@@ -1272,6 +1272,15 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
 			!p->on_rq);
 
+	/*
+	 * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
+	 * because schedstat_wait_{start,end} rebase migrating task's wait_start
+	 * time relying on p->on_rq.
+	 */
+	WARN_ON_ONCE(p->state == TASK_RUNNING &&
+		     p->sched_class == &fair_sched_class &&
+		     (p->on_rq && !task_on_rq_migrating(p)));
+
 #ifdef CONFIG_LOCKDEP
 	/*
 	 * The caller should hold either p->pi_lock or rq->lock, when changing
@@ -1308,9 +1317,11 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
 		src_rq = task_rq(p);
 		dst_rq = cpu_rq(cpu);
 
+		p->on_rq = TASK_ON_RQ_MIGRATING;
 		deactivate_task(src_rq, p, 0);
 		set_task_cpu(p, cpu);
 		activate_task(dst_rq, p, 0);
+		p->on_rq = TASK_ON_RQ_QUEUED;
 		check_preempt_curr(dst_rq, p, 0);
 	} else {
 		/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9a5e60f..ce7e869 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -737,41 +737,69 @@ static void update_curr_fair(struct rq *rq)
 	update_curr(cfs_rq_of(&rq->curr->se));
 }
 
+#ifdef CONFIG_SCHEDSTATS
 static inline void
 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
-}
+	u64 wait_start = rq_clock(rq_of(cfs_rq));
 
-/*
- * Task is being enqueued - update stats:
- */
-static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-	/*
-	 * Are we enqueueing a waiting task? (for current tasks
-	 * a dequeue/enqueue event is a NOP)
-	 */
-	if (se != cfs_rq->curr)
-		update_stats_wait_start(cfs_rq, se);
+	if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
+	    likely(wait_start > se->statistics.wait_start))
+		wait_start -= se->statistics.wait_start;
+
+	schedstat_set(se->statistics.wait_start, wait_start);
 }
 
 static void
 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
+	if (entity_is_task(se) && task_on_rq_migrating(task_of(se))) {
+		/*
+		 * Preserve migrating task's wait time so wait_start time stamp
+		 * can be adjusted to accumulate wait time prior to migration.
+		 */
+		schedstat_set(se->statistics.wait_start,
+			      rq_clock(rq_of(cfs_rq)) -
+			      se->statistics.wait_start);
+		return;
+	}
+
 	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
 			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
 	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
 	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
 			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
-#ifdef CONFIG_SCHEDSTATS
+
 	if (entity_is_task(se)) {
 		trace_sched_stat_wait(task_of(se),
 			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
 	}
-#endif
 	schedstat_set(se->statistics.wait_start, 0);
 }
+#else
+static inline void
+update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+
+static inline void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+#endif
+
+/*
+ * Task is being enqueued - update stats:
+ */
+static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	/*
+	 * Are we enqueueing a waiting task? (for current tasks
+	 * a dequeue/enqueue event is a NOP)
+	 */
+	if (se != cfs_rq->curr)
+		update_stats_wait_start(cfs_rq, se);
+}
 
 static inline void
 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -5656,8 +5684,8 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
 {
 	lockdep_assert_held(&env->src_rq->lock);
 
-	deactivate_task(env->src_rq, p, 0);
 	p->on_rq = TASK_ON_RQ_MIGRATING;
+	deactivate_task(env->src_rq, p, 0);
 	set_task_cpu(p, env->dst_cpu);
 }
 
@@ -5790,8 +5818,8 @@ static void attach_task(struct rq *rq, struct task_struct *p)
 	lockdep_assert_held(&rq->lock);
 
 	BUG_ON(task_rq(p) != rq);
-	p->on_rq = TASK_ON_RQ_QUEUED;
 	activate_task(rq, p, 0);
+	p->on_rq = TASK_ON_RQ_QUEUED;
 	check_preempt_curr(rq, p, 0);
 }
 
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
hosted by The Linux Foundation


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH v4] sched: fix incorrect wait time and wait count statistics
  2015-10-28  4:46         ` [PATCH v4] " Joonwoo Park
@ 2015-11-06 13:57           ` Peter Zijlstra
  2015-11-07  2:41             ` Joonwoo Park
  0 siblings, 1 reply; 13+ messages in thread
From: Peter Zijlstra @ 2015-11-06 13:57 UTC (permalink / raw)
  To: Joonwoo Park; +Cc: Ingo Molnar, linux-kernel, linux-arm-msm, ohaugan

On Tue, Oct 27, 2015 at 09:46:53PM -0700, Joonwoo Park wrote:
> @@ -1272,6 +1272,15 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
>  	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
>  			!p->on_rq);
>  
> +	/*
> +	 * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
> +	 * because schedstat_wait_{start,end} rebase migrating task's wait_start
> +	 * time relying on p->on_rq.
> +	 */
> +	WARN_ON_ONCE(p->state == TASK_RUNNING &&
> +		     p->sched_class == &fair_sched_class &&
> +		     (p->on_rq && !task_on_rq_migrating(p)));
> +

Why do we have to test p->on_rq? Would not ->state == RUNNING imply
that?

> +++ b/kernel/sched/fair.c
> @@ -737,41 +737,69 @@ static void update_curr_fair(struct rq *rq)
>  	update_curr(cfs_rq_of(&rq->curr->se));
>  }
>  
> +#ifdef CONFIG_SCHEDSTATS
>  static inline void
>  update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  {
> +	u64 wait_start = rq_clock(rq_of(cfs_rq));
>  
> +	if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
> +	    likely(wait_start > se->statistics.wait_start))
> +		wait_start -= se->statistics.wait_start;
> +
> +	schedstat_set(se->statistics.wait_start, wait_start);
>  }
>  
>  static void
>  update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  {

Since this is now all under CONFIG_SCHEDSTATS, would it not make sense
to do something like:

	u64 now = rq_clock(rq_of(cfs_rq));

to avoid the endless calling of that function?

Also, for that very same reason, would it not make sense to drop the
schedstat_set() usage below?  That would greatly enhance readability.

> +	if (entity_is_task(se) && task_on_rq_migrating(task_of(se))) {
> +		/*
> +		 * Preserve migrating task's wait time so wait_start time stamp
> +		 * can be adjusted to accumulate wait time prior to migration.
> +		 */
> +		schedstat_set(se->statistics.wait_start,
> +			      rq_clock(rq_of(cfs_rq)) -
> +			      se->statistics.wait_start);
> +		return;
> +	}
> +
>  	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
>  			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
>  	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
>  	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
>  			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
> +
>  	if (entity_is_task(se)) {
>  		trace_sched_stat_wait(task_of(se),
>  			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
>  	}

Is there no means of collapsing the two 'entity_is_task()' branches?

>  	schedstat_set(se->statistics.wait_start, 0);
>  }
> +#else
> +static inline void
> +update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
> +{
> +}
> +
> +static inline void
> +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
> +{
> +}
> +#endif

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v4] sched: fix incorrect wait time and wait count statistics
  2015-11-06 13:57           ` Peter Zijlstra
@ 2015-11-07  2:41             ` Joonwoo Park
  2015-11-09 10:32               ` Peter Zijlstra
  0 siblings, 1 reply; 13+ messages in thread
From: Joonwoo Park @ 2015-11-07  2:41 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Ingo Molnar, linux-kernel, linux-arm-msm, ohaugan

On Fri, Nov 06, 2015 at 02:57:49PM +0100, Peter Zijlstra wrote:
> On Tue, Oct 27, 2015 at 09:46:53PM -0700, Joonwoo Park wrote:
> > @@ -1272,6 +1272,15 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
> >  	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
> >  			!p->on_rq);
> >  
> > +	/*
> > +	 * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
> > +	 * because schedstat_wait_{start,end} rebase migrating task's wait_start
> > +	 * time relying on p->on_rq.
> > +	 */
> > +	WARN_ON_ONCE(p->state == TASK_RUNNING &&
> > +		     p->sched_class == &fair_sched_class &&
> > +		     (p->on_rq && !task_on_rq_migrating(p)));
> > +
> 
> Why do we have to test p->on_rq? Would not ->state == RUNNING imply
> that?
> 

sched_fork() sets p->state = TASK_RUNNING and then calls set_task_cpu()
before the new task has ever been enqueued, so ->state == TASK_RUNNING
alone doesn't imply p->on_rq there.
Please let me know if you have a better idea.
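
A trimmed, paraphrased view of that path (assumed from the sched_fork()
of this era, not a verbatim quote):

	int sched_fork(unsigned long clone_flags, struct task_struct *p)
	{
		int cpu = get_cpu();

		__sched_fork(clone_flags, p);
		/*
		 * Mark the child runnable so nothing can wake it up and
		 * enqueue it early; note that p->on_rq is still 0 here.
		 */
		p->state = TASK_RUNNING;

		/* ... priority and sched class setup elided ... */

		set_task_cpu(p, cpu);	/* ->state == TASK_RUNNING, !p->on_rq */

		/* ... */
		put_cpu();
		return 0;
	}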

> > +++ b/kernel/sched/fair.c
> > @@ -737,41 +737,69 @@ static void update_curr_fair(struct rq *rq)
> >  	update_curr(cfs_rq_of(&rq->curr->se));
> >  }
> >  
> > +#ifdef CONFIG_SCHEDSTATS
> >  static inline void
> >  update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
> >  {
> > +	u64 wait_start = rq_clock(rq_of(cfs_rq));
> >  
> > +	if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
> > +	    likely(wait_start > se->statistics.wait_start))
> > +		wait_start -= se->statistics.wait_start;
> > +
> > +	schedstat_set(se->statistics.wait_start, wait_start);
> >  }
> >  
> >  static void
> >  update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
> >  {
> 
> Since this is now all under CONFIG_SCHEDSTAT, would it not make sense
> to do something like:
> 
> 	u64 now = rq_clock(rq_of(cfs_rq));
> 
> to avoid the endless calling of that function?
> 
> Also, for that very same reason; would it not make sense to drop the
> schedstat_set() usage below, that would greatly enhance readability.
> 

Agreed.

> > +	if (entity_is_task(se) && task_on_rq_migrating(task_of(se))) {
> > +		/*
> > +		 * Preserve migrating task's wait time so wait_start time stamp
> > +		 * can be adjusted to accumulate wait time prior to migration.
> > +		 */
> > +		schedstat_set(se->statistics.wait_start,
> > +			      rq_clock(rq_of(cfs_rq)) -
> > +			      se->statistics.wait_start);
> > +		return;
> > +	}
> > +
> >  	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
> >  			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
> >  	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
> >  	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
> >  			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
> > +
> >  	if (entity_is_task(se)) {
> >  		trace_sched_stat_wait(task_of(se),
> >  			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
> >  	}
> 
> Is there no means of collapsing the two 'entity_is_task()' branches?
> 

Agreed.  Will spin v5 with these cleanups.

Thanks,
Joonwoo

> >  	schedstat_set(se->statistics.wait_start, 0);
> >  }
> > +#else
> > +static inline void
> > +update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
> > +{
> > +}
> > +
> > +static inline void
> > +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
> > +{
> > +}
> > +#endif

-- 
The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
hosted by The Linux Foundation

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v4] sched: fix incorrect wait time and wait count statistics
  2015-11-07  2:41             ` Joonwoo Park
@ 2015-11-09 10:32               ` Peter Zijlstra
  2015-11-13  3:38                 ` [PATCH v5] " Joonwoo Park
  0 siblings, 1 reply; 13+ messages in thread
From: Peter Zijlstra @ 2015-11-09 10:32 UTC (permalink / raw)
  To: Joonwoo Park; +Cc: Ingo Molnar, linux-kernel, linux-arm-msm, ohaugan

On Fri, Nov 06, 2015 at 06:41:43PM -0800, Joonwoo Park wrote:
> On Fri, Nov 06, 2015 at 02:57:49PM +0100, Peter Zijlstra wrote:
> > On Tue, Oct 27, 2015 at 09:46:53PM -0700, Joonwoo Park wrote:
> > > @@ -1272,6 +1272,15 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
> > >  	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
> > >  			!p->on_rq);
> > >  
> > > +	/*
> > > +	 * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
> > > +	 * because schedstat_wait_{start,end} rebase migrating task's wait_start
> > > +	 * time relying on p->on_rq.
> > > +	 */
> > > +	WARN_ON_ONCE(p->state == TASK_RUNNING &&
> > > +		     p->sched_class == &fair_sched_class &&
> > > +		     (p->on_rq && !task_on_rq_migrating(p)));
> > > +
> > 
> > Why do we have to test p->on_rq? Would not ->state == RUNNING imply
> > that?
> > 
> 
> sched_fork() sets p->state = RUNNING before changing task cpu.
> Please let me know if you got better idea.

Ah, indeed. OK.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v5] sched: fix incorrect wait time and wait count statistics
  2015-11-09 10:32               ` Peter Zijlstra
@ 2015-11-13  3:38                 ` Joonwoo Park
  2015-11-23 16:20                   ` [tip:sched/core] sched/core: Fix " tip-bot for Joonwoo Park
  0 siblings, 1 reply; 13+ messages in thread
From: Joonwoo Park @ 2015-11-13  3:38 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Ingo Molnar, linux-kernel, linux-arm-msm, ohaugan

At present the scheduler resets a task's wait start timestamp when the
task migrates to another rq.  This misleads the scheduler into reporting
less wait time than actual, by omitting the time spent waiting prior to
the migration, and a higher wait count than actual, by counting the
migration as a wait end event.  Both effects can be seen by trace or
/proc/<pid>/sched with CONFIG_SCHEDSTATS=y.

Carry a migrating task's wait time forward across the migration and do
not count the migration as a wait end event, to fix these statistics
errors.

In order to determine whether a task is migrating, mark task->on_rq
with TASK_ON_RQ_MIGRATING while it is being dequeued and enqueued due
to migration.

To: Ingo Molnar <mingo@kernel.org>
To: Peter Zijlstra <peterz@infradead.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
---
Changes in v2: 
 * Set p->on_rq = TASK_ON_RQ_MIGRATING while doing migration dequeue/enqueue
   and check whether task's migrating with task_on_rq_migrating().
Changes in v3: 
 * Fixed "WARNING: CPU: 0 PID: 3 at kernel/sched/fair.c:260 update_stats_wait_end+0x23/0x30()" caught by Intel kernel test robot.
Changes in v4: 
 * Made __migrate_swap_task() to set p->on_rq = TASK_ON_RQ_MIGRATING.
 * Added WARN_ON_ONCE() inside CONFIG_SCHED_DEBUG.
 * Added comments.
 * Cleanup with ifdefy.
Changes in v5:
 * Cleanup update_stats_wait_end().

 kernel/sched/core.c | 15 ++++++++++--
 kernel/sched/fair.c | 67 +++++++++++++++++++++++++++++++++++++----------------
 2 files changed, 60 insertions(+), 22 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bcd214e..1ddbabc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1069,8 +1069,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
 {
 	lockdep_assert_held(&rq->lock);
 
-	dequeue_task(rq, p, 0);
 	p->on_rq = TASK_ON_RQ_MIGRATING;
+	dequeue_task(rq, p, 0);
 	set_task_cpu(p, new_cpu);
 	raw_spin_unlock(&rq->lock);
 
@@ -1078,8 +1078,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
 
 	raw_spin_lock(&rq->lock);
 	BUG_ON(task_cpu(p) != new_cpu);
-	p->on_rq = TASK_ON_RQ_QUEUED;
 	enqueue_task(rq, p, 0);
+	p->on_rq = TASK_ON_RQ_QUEUED;
 	check_preempt_curr(rq, p, 0);
 
 	return rq;
@@ -1272,6 +1272,15 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
 			!p->on_rq);
 
+	/*
+	 * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
+	 * because schedstat_wait_{start,end} rebase migrating task's wait_start
+	 * time relying on p->on_rq.
+	 */
+	WARN_ON_ONCE(p->state == TASK_RUNNING &&
+		     p->sched_class == &fair_sched_class &&
+		     (p->on_rq && !task_on_rq_migrating(p)));
+
 #ifdef CONFIG_LOCKDEP
 	/*
 	 * The caller should hold either p->pi_lock or rq->lock, when changing
@@ -1308,9 +1317,11 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
 		src_rq = task_rq(p);
 		dst_rq = cpu_rq(cpu);
 
+		p->on_rq = TASK_ON_RQ_MIGRATING;
 		deactivate_task(src_rq, p, 0);
 		set_task_cpu(p, cpu);
 		activate_task(dst_rq, p, 0);
+		p->on_rq = TASK_ON_RQ_QUEUED;
 		check_preempt_curr(dst_rq, p, 0);
 	} else {
 		/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9a5e60f..fc54ecb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -737,12 +737,56 @@ static void update_curr_fair(struct rq *rq)
 	update_curr(cfs_rq_of(&rq->curr->se));
 }
 
+#ifdef CONFIG_SCHEDSTATS
 static inline void
 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
+	u64 wait_start = rq_clock(rq_of(cfs_rq));
+
+	if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
+	    likely(wait_start > se->statistics.wait_start))
+		wait_start -= se->statistics.wait_start;
+
+	se->statistics.wait_start = wait_start;
 }
 
+static void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	struct task_struct *p;
+	u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
+
+	if (entity_is_task(se)) {
+		p = task_of(se);
+		if (task_on_rq_migrating(p)) {
+			/*
+			 * Preserve migrating task's wait time so wait_start
+			 * time stamp can be adjusted to accumulate wait time
+			 * prior to migration.
+			 */
+			se->statistics.wait_start = delta;
+			return;
+		}
+		trace_sched_stat_wait(p, delta);
+	}
+
+	se->statistics.wait_max = max(se->statistics.wait_max, delta);
+	se->statistics.wait_count++;
+	se->statistics.wait_sum += delta;
+	se->statistics.wait_start = 0;
+}
+#else
+static inline void
+update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+
+static inline void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+#endif
+
 /*
  * Task is being enqueued - update stats:
  */
@@ -756,23 +800,6 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		update_stats_wait_start(cfs_rq, se);
 }
 
-static void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
-			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
-	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
-	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
-			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
-#ifdef CONFIG_SCHEDSTATS
-	if (entity_is_task(se)) {
-		trace_sched_stat_wait(task_of(se),
-			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
-	}
-#endif
-	schedstat_set(se->statistics.wait_start, 0);
-}
-
 static inline void
 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
@@ -5656,8 +5683,8 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
 {
 	lockdep_assert_held(&env->src_rq->lock);
 
-	deactivate_task(env->src_rq, p, 0);
 	p->on_rq = TASK_ON_RQ_MIGRATING;
+	deactivate_task(env->src_rq, p, 0);
 	set_task_cpu(p, env->dst_cpu);
 }
 
@@ -5790,8 +5817,8 @@ static void attach_task(struct rq *rq, struct task_struct *p)
 	lockdep_assert_held(&rq->lock);
 
 	BUG_ON(task_rq(p) != rq);
-	p->on_rq = TASK_ON_RQ_QUEUED;
 	activate_task(rq, p, 0);
+	p->on_rq = TASK_ON_RQ_QUEUED;
 	check_preempt_curr(rq, p, 0);
 }
 
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
hosted by The Linux Foundation



^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [tip:sched/core] sched/core: Fix incorrect wait time and wait count statistics
  2015-11-13  3:38                 ` [PATCH v5] " Joonwoo Park
@ 2015-11-23 16:20                   ` tip-bot for Joonwoo Park
  0 siblings, 0 replies; 13+ messages in thread
From: tip-bot for Joonwoo Park @ 2015-11-23 16:20 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: joonwoop, hpa, torvalds, tglx, linux-kernel, peterz, mingo, efault

Commit-ID:  3ea94de15ce9f3a217f6d0a7e9e0f48388902bb7
Gitweb:     http://git.kernel.org/tip/3ea94de15ce9f3a217f6d0a7e9e0f48388902bb7
Author:     Joonwoo Park <joonwoop@codeaurora.org>
AuthorDate: Thu, 12 Nov 2015 19:38:54 -0800
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Mon, 23 Nov 2015 09:48:17 +0100

sched/core: Fix incorrect wait time and wait count statistics

At present the scheduler resets a task's wait start timestamp when the
task migrates to another rq.  This misleads the scheduler into reporting
less wait time than actual, by omitting the time spent waiting prior to
the migration, and a higher wait count than actual, by counting the
migration as a wait end event.  Both effects can be seen by trace or
/proc/<pid>/sched with CONFIG_SCHEDSTATS=y.

Carry a migrating task's wait time forward across the migration and do
not count the migration as a wait end event, to fix these statistics
errors.

In order to determine whether a task is migrating, mark task->on_rq
with TASK_ON_RQ_MIGRATING while it is being dequeued and enqueued due
to migration.

Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: ohaugan@codeaurora.org
Link: http://lkml.kernel.org/r/20151113033854.GA4247@codeaurora.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 15 ++++++++++--
 kernel/sched/fair.c | 67 +++++++++++++++++++++++++++++++++++++----------------
 2 files changed, 60 insertions(+), 22 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4d568ac..1b7cb5e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1071,8 +1071,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
 {
 	lockdep_assert_held(&rq->lock);
 
-	dequeue_task(rq, p, 0);
 	p->on_rq = TASK_ON_RQ_MIGRATING;
+	dequeue_task(rq, p, 0);
 	set_task_cpu(p, new_cpu);
 	raw_spin_unlock(&rq->lock);
 
@@ -1080,8 +1080,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
 
 	raw_spin_lock(&rq->lock);
 	BUG_ON(task_cpu(p) != new_cpu);
-	p->on_rq = TASK_ON_RQ_QUEUED;
 	enqueue_task(rq, p, 0);
+	p->on_rq = TASK_ON_RQ_QUEUED;
 	check_preempt_curr(rq, p, 0);
 
 	return rq;
@@ -1274,6 +1274,15 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
 			!p->on_rq);
 
+	/*
+	 * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
+	 * because schedstat_wait_{start,end} rebase migrating task's wait_start
+	 * time relying on p->on_rq.
+	 */
+	WARN_ON_ONCE(p->state == TASK_RUNNING &&
+		     p->sched_class == &fair_sched_class &&
+		     (p->on_rq && !task_on_rq_migrating(p)));
+
 #ifdef CONFIG_LOCKDEP
 	/*
 	 * The caller should hold either p->pi_lock or rq->lock, when changing
@@ -1310,9 +1319,11 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
 		src_rq = task_rq(p);
 		dst_rq = cpu_rq(cpu);
 
+		p->on_rq = TASK_ON_RQ_MIGRATING;
 		deactivate_task(src_rq, p, 0);
 		set_task_cpu(p, cpu);
 		activate_task(dst_rq, p, 0);
+		p->on_rq = TASK_ON_RQ_QUEUED;
 		check_preempt_curr(dst_rq, p, 0);
 	} else {
 		/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 95b944e..f7017ad 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -738,12 +738,56 @@ static void update_curr_fair(struct rq *rq)
 	update_curr(cfs_rq_of(&rq->curr->se));
 }
 
+#ifdef CONFIG_SCHEDSTATS
 static inline void
 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
+	u64 wait_start = rq_clock(rq_of(cfs_rq));
+
+	if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
+	    likely(wait_start > se->statistics.wait_start))
+		wait_start -= se->statistics.wait_start;
+
+	se->statistics.wait_start = wait_start;
 }
 
+static void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	struct task_struct *p;
+	u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
+
+	if (entity_is_task(se)) {
+		p = task_of(se);
+		if (task_on_rq_migrating(p)) {
+			/*
+			 * Preserve migrating task's wait time so wait_start
+			 * time stamp can be adjusted to accumulate wait time
+			 * prior to migration.
+			 */
+			se->statistics.wait_start = delta;
+			return;
+		}
+		trace_sched_stat_wait(p, delta);
+	}
+
+	se->statistics.wait_max = max(se->statistics.wait_max, delta);
+	se->statistics.wait_count++;
+	se->statistics.wait_sum += delta;
+	se->statistics.wait_start = 0;
+}
+#else
+static inline void
+update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+
+static inline void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+#endif
+
 /*
  * Task is being enqueued - update stats:
  */
@@ -757,23 +801,6 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		update_stats_wait_start(cfs_rq, se);
 }
 
-static void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
-			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
-	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
-	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
-			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
-#ifdef CONFIG_SCHEDSTATS
-	if (entity_is_task(se)) {
-		trace_sched_stat_wait(task_of(se),
-			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
-	}
-#endif
-	schedstat_set(se->statistics.wait_start, 0);
-}
-
 static inline void
 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
@@ -5745,8 +5772,8 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
 {
 	lockdep_assert_held(&env->src_rq->lock);
 
-	deactivate_task(env->src_rq, p, 0);
 	p->on_rq = TASK_ON_RQ_MIGRATING;
+	deactivate_task(env->src_rq, p, 0);
 	set_task_cpu(p, env->dst_cpu);
 }
 
@@ -5879,8 +5906,8 @@ static void attach_task(struct rq *rq, struct task_struct *p)
 	lockdep_assert_held(&rq->lock);
 
 	BUG_ON(task_rq(p) != rq);
-	p->on_rq = TASK_ON_RQ_QUEUED;
 	activate_task(rq, p, 0);
+	p->on_rq = TASK_ON_RQ_QUEUED;
 	check_preempt_curr(rq, p, 0);
 }
 

^ permalink raw reply related	[flat|nested] 13+ messages in thread

