* [PATCH 0/2] sched: add trace event for per-entity tracking
@ 2013-07-01  7:10 Lei Wen
  2013-07-01  7:10 ` [PATCH 1/2] sched: add trace events for task and rq usage tracking Lei Wen
                   ` (3 more replies)
  0 siblings, 4 replies; 18+ messages in thread
From: Lei Wen @ 2013-07-01  7:10 UTC (permalink / raw)
  To: Paul Turner, Alex Shi, Peter Zijlstra, Ingo Molnar, mingo,
	leiwen, linux-kernel

Thanks to the per-entity load tracking feature, we can now see the details
of each task.
This series adds trace support for it, so that we can quickly assess system
status over a large time scale; for example, each runqueue's usage ratio can
be derived as:

cfs_rq's usage ratio = cfs_rq->runnable_load_avg/cfs_rq->load.weight
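
For example, the ratio can be computed directly from the two values emitted
by the sched_cfs_rq_runnable_load event; a minimal sketch (the helper below
is hypothetical post-processing code, not part of this series):

	/*
	 * "avg" and "total" are the two fields printed by the tracepoint,
	 * i.e. cfs_rq->runnable_load_avg and cfs_rq->load.weight.
	 */
	static inline unsigned long cfs_rq_usage_pct(unsigned long avg,
						     unsigned long total)
	{
		if (!total)
			return 0;	/* empty cfs_rq: report 0% usage */
		return avg * 100 / total;
	}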

Lei Wen (2):
  sched: add trace events for task and rq usage tracking
  sched: update cfs_rq weight earlier in enqueue_entity

 include/trace/events/sched.h |   73 ++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c          |   31 ++++++++++++++++--
 2 files changed, 101 insertions(+), 3 deletions(-)

-- 
1.7.10.4



* [PATCH 1/2] sched: add trace events for task and rq usage tracking
  2013-07-01  7:10 [PATCH 0/2] sched: add trace event for per-entity tracking Lei Wen
@ 2013-07-01  7:10 ` Lei Wen
  2013-07-01  9:43   ` Kamalesh Babulal
  2013-07-01  7:10 ` [PATCH 2/2] sched: update cfs_rq weight earlier in enqueue_entity Lei Wen
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 18+ messages in thread
From: Lei Wen @ 2013-07-01  7:10 UTC (permalink / raw)
  To: Paul Turner, Alex Shi, Peter Zijlstra, Ingo Molnar, mingo,
	leiwen, linux-kernel

Since we can now track tasks at the entity level, we may want to
investigate a task's running status by recording trace info, so that we
can do some tuning if needed.

Signed-off-by: Lei Wen <leiwen@marvell.com>
---
 include/trace/events/sched.h |   73 ++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c          |   29 +++++++++++++++--
 2 files changed, 100 insertions(+), 2 deletions(-)

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index e5586ca..8f1af65 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -430,6 +430,79 @@ TRACE_EVENT(sched_pi_setprio,
 			__entry->oldprio, __entry->newprio)
 );
 
+TRACE_EVENT(sched_task_weighted_load,
+
+	TP_PROTO(struct task_struct *tsk, unsigned long load, unsigned long weight),
+
+	TP_ARGS(tsk, load, weight),
+
+	TP_STRUCT__entry(
+		__field(pid_t, pid)
+		__field(int, cpu)
+		__field(unsigned long, load)
+		__field(unsigned long, weight)
+	),
+
+	TP_fast_assign(
+		__entry->pid   = tsk->pid;
+		__entry->cpu   = task_thread_info(tsk)->cpu;
+		__entry->load  = load;
+		__entry->weight= weight;
+	),
+
+	TP_printk("cpu=%d pid=%d load=%lu weight=%lu",
+			__entry->cpu, __entry->pid,
+			__entry->load, __entry->weight)
+);
+
+TRACE_EVENT(sched_cfs_rq_runnable_load,
+
+	TP_PROTO(int cpu, unsigned long load, unsigned long total),
+
+	TP_ARGS(cpu, load, total),
+
+	TP_STRUCT__entry(
+		__field(int, cpu)
+		__field(unsigned long, load)
+		__field(unsigned long, total)
+	),
+
+	TP_fast_assign(
+		__entry->cpu   = cpu;
+		__entry->load  = load;
+		__entry->total = total;
+	),
+
+	TP_printk("cpu=%d avg=%lu total=%lu",
+			__entry->cpu,
+			__entry->load,
+			__entry->total)
+);
+
+TRACE_EVENT(sched_cfs_rq_blocked_load,
+
+	TP_PROTO(int cpu, unsigned long load, unsigned long total),
+
+	TP_ARGS(cpu, load, total),
+
+	TP_STRUCT__entry(
+		__field(int, cpu)
+		__field(unsigned long, load)
+		__field(unsigned long, total)
+	),
+
+	TP_fast_assign(
+		__entry->cpu   = cpu;
+		__entry->load  = load;
+		__entry->total = total;
+	),
+
+	TP_printk("cpu=%d avg=%lu total=%lu",
+			__entry->cpu,
+			__entry->load,
+			__entry->total)
+);
+
 #endif /* _TRACE_SCHED_H */
 
 /* This part must be outside protection */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f77f9c5..07bd74c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1346,6 +1346,7 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se)
 		return 0;
 
 	se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
+	trace_sched_task_weighted_load(task_of(se), se->avg.load_avg_contrib, se->load.weight);
 	se->avg.decay_count = 0;
 
 	return decays;
@@ -1445,6 +1446,7 @@ static inline void __update_task_entity_contrib(struct sched_entity *se)
 	contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
 	contrib /= (se->avg.runnable_avg_period + 1);
 	se->avg.load_avg_contrib = scale_load(contrib);
+	trace_sched_task_weighted_load(task_of(se), se->avg.load_avg_contrib, se->load.weight);
 }
 
 /* Compute the current contribution to load_avg by se, return any delta */
@@ -1498,10 +1500,16 @@ static inline void update_entity_load_avg(struct sched_entity *se,
 	if (!update_cfs_rq)
 		return;
 
-	if (se->on_rq)
+	if (se->on_rq) {
 		cfs_rq->runnable_load_avg += contrib_delta;
-	else
+		trace_sched_cfs_rq_runnable_load(cpu_of(rq_of(cfs_rq)),
+				cfs_rq->runnable_load_avg, cfs_rq->load.weight);
+	} else {
 		subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
+		trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
+				cfs_rq->blocked_load_avg,
+				cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);
+	}
 }
 
 /*
@@ -1531,6 +1539,9 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
 	}
 
 	__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
+	trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
+			cfs_rq->blocked_load_avg,
+			cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);
 }
 
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
@@ -1584,10 +1595,15 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
 	/* migrated tasks did not contribute to our blocked load */
 	if (wakeup) {
 		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
+		trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
+				cfs_rq->blocked_load_avg,
+				cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);
 		update_entity_load_avg(se, 0);
 	}
 
 	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
+	trace_sched_cfs_rq_runnable_load(cpu_of(rq_of(cfs_rq)),
+			cfs_rq->runnable_load_avg, cfs_rq->load.weight);
 	/* we force update consideration on load-balancer moves */
 	update_cfs_rq_blocked_load(cfs_rq, !wakeup);
 }
@@ -1608,6 +1624,9 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
 	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
 	if (sleep) {
 		cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
+		trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
+				cfs_rq->blocked_load_avg,
+				cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);
 		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
 	} /* migrations, e.g. sleep=0 leave decay_count == 0 */
 }
@@ -5894,6 +5913,9 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
 		__synchronize_entity_decay(&p->se);
 		subtract_blocked_load_contrib(cfs_rq,
 				p->se.avg.load_avg_contrib);
+		trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
+				cfs_rq->blocked_load_avg,
+				cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);
 	}
 #endif
 }
@@ -5994,6 +6016,9 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
 		 */
 		p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
 		cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
+		trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
+				cfs_rq->blocked_load_avg,
+				cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);
 #endif
 	}
 }
-- 
1.7.10.4



* [PATCH 2/2] sched: update cfs_rq weight earlier in enqueue_entity
  2013-07-01  7:10 [PATCH 0/2] sched: add trace event for per-entity tracking Lei Wen
  2013-07-01  7:10 ` [PATCH 1/2] sched: add trace events for task and rq usage tracking Lei Wen
@ 2013-07-01  7:10 ` Lei Wen
  2013-07-01  8:06 ` [PATCH 0/2] sched: add trace event for per-entity tracking Alex Shi
  2013-07-01 12:33 ` [PATCH V2 " Lei Wen
  3 siblings, 0 replies; 18+ messages in thread
From: Lei Wen @ 2013-07-01  7:10 UTC (permalink / raw)
  To: Paul Turner, Alex Shi, Peter Zijlstra, Ingo Molnar, mingo,
	leiwen, linux-kernel

Since we are going to calculate a cfs_rq's average ratio as
runnable_load_avg/load.weight, not increasing load.weight prior to
enqueue_entity_load_avg() may make a cfs_rq's average ratio appear to be
higher than 100%.

Adjust the sequence so that the ratio is always kept below 100%.

Signed-off-by: Lei Wen <leiwen@marvell.com>
---
 kernel/sched/fair.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 07bd74c..d1eee84 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1788,8 +1788,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
-	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
 	account_entity_enqueue(cfs_rq, se);
+	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
 	update_cfs_shares(cfs_rq);
 
 	if (flags & ENQUEUE_WAKEUP) {
-- 
1.7.10.4



* Re: [PATCH 0/2] sched: add trace event for per-entity tracking
  2013-07-01  7:10 [PATCH 0/2] sched: add trace event for per-entity tracking Lei Wen
  2013-07-01  7:10 ` [PATCH 1/2] sched: add trace events for task and rq usage tracking Lei Wen
  2013-07-01  7:10 ` [PATCH 2/2] sched: update cfs_rq weight earlier in enqueue_entity Lei Wen
@ 2013-07-01  8:06 ` Alex Shi
  2013-07-01  8:49   ` Lei Wen
  2013-07-01 12:33 ` [PATCH V2 " Lei Wen
  3 siblings, 1 reply; 18+ messages in thread
From: Alex Shi @ 2013-07-01  8:06 UTC (permalink / raw)
  To: Lei Wen; +Cc: Paul Turner, Peter Zijlstra, Ingo Molnar, mingo, linux-kernel

On 07/01/2013 03:10 PM, Lei Wen wrote:
> Thanks for the per-entity tracking feature, we could know the details of
> each task by its help.
> This patch add its trace support, so that we could quickly know the system
> status in a large time scale, like now we may get each runqueue's usage ratio by:
> 
> cfs_rq's usage ratio = cfs_rq->runnable_load_avg/cfs_rq->load.weight
> 

The direct usage ratio is rq.avg.runnable_avg_sum / rq.avg.runnable_avg_period.

One patch from the now-obsolete power-scheduling series could serve as a
reference for this:
git@github.com:alexshi/power-scheduling.git power-scheduling

From 081cd4bcbccfaa1930b031e4dfbf9d23b8c0d5ab Mon Sep 17 00:00:00 2001
From: Alex Shi <alex.shi@intel.com>
Date: Fri, 7 Dec 2012 21:37:58 +0800
Subject: [PATCH 02/23] sched: log the cpu utilization at rq

The cpu's utilization measures how busy the cpu is:
        util = cpu_rq(cpu)->avg.runnable_avg_sum * SCHED_POWER_SCALE
                / cpu_rq(cpu)->avg.runnable_avg_period;

Since the util is no more than 1, we scale its value with 1024, same as
SCHED_POWER_SCALE and set the FULL_UTIL as 1024.
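
As a made-up numeric example (the values below are purely illustrative):

	/* e.g. runnable_avg_sum = 23000, runnable_avg_period = 46000 */
	unsigned int util = (23000u << SCHED_POWER_SHIFT) / 46000u;
	/* = (23000 * 1024) / 46000 = 512, i.e. half of FULL_UTIL (1024) */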

Later power-aware scheduling is sensitive to how busy a cpu is, since
power consumption is tightly related to cpu busy time.

BTW, rq->util can be used for any purpose if needed, not only for power
scheduling.

Signed-off-by: Alex Shi <alex.shi@intel.com>
---
 include/linux/sched.h | 2 +-
 kernel/sched/debug.c  | 1 +
 kernel/sched/fair.c   | 5 +++++
 kernel/sched/sched.h  | 4 ++++
 4 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9539597..4e4d9ee 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -794,7 +794,7 @@ enum cpu_idle_type {
 #define SCHED_LOAD_SCALE	(1L << SCHED_LOAD_SHIFT)
 
 /*
- * Increase resolution of cpu_power calculations
+ * Increase resolution of cpu_power and rq->util calculations
  */
 #define SCHED_POWER_SHIFT	10
 #define SCHED_POWER_SCALE	(1L << SCHED_POWER_SHIFT)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 75024a6..f5db759 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -311,6 +311,7 @@ do {									\
 
 	P(ttwu_count);
 	P(ttwu_local);
+	P(util);
 
 #undef P
 #undef P64
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2e49c3f..7124244 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1495,8 +1495,13 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
 
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
 {
+	u32 period;
 	__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
 	__update_tg_runnable_avg(&rq->avg, &rq->cfs);
+
+	period = rq->avg.runnable_avg_period ? rq->avg.runnable_avg_period : 1;
+	rq->util = (u64)(rq->avg.runnable_avg_sum << SCHED_POWER_SHIFT)
+				/ period;
 }
 
 /* Add the load generated by se into cfs_rq's child load-average */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 804ee41..8682110 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -351,6 +351,9 @@ extern struct root_domain def_root_domain;
 
 #endif /* CONFIG_SMP */
 
+/* full cpu utilization */
+#define FULL_UTIL	SCHED_POWER_SCALE
+
 /*
  * This is the main, per-CPU runqueue data structure.
  *
@@ -482,6 +485,7 @@ struct rq {
 #endif
 
 	struct sched_avg avg;
+	unsigned int util;
 };
 
 static inline int cpu_of(struct rq *rq)
-- 
1.7.12

-- 
Thanks
    Alex


* Re: [PATCH 0/2] sched: add trace event for per-entity tracking
  2013-07-01  8:06 ` [PATCH 0/2] sched: add trace event for per-entity tracking Alex Shi
@ 2013-07-01  8:49   ` Lei Wen
  0 siblings, 0 replies; 18+ messages in thread
From: Lei Wen @ 2013-07-01  8:49 UTC (permalink / raw)
  To: Alex Shi
  Cc: Lei Wen, Paul Turner, Peter Zijlstra, Ingo Molnar, mingo, linux-kernel

Alex,

On Mon, Jul 1, 2013 at 4:06 PM, Alex Shi <alex.shi@intel.com> wrote:
> On 07/01/2013 03:10 PM, Lei Wen wrote:
>> Thanks for the per-entity tracking feature, we could know the details of
>> each task by its help.
>> This patch add its trace support, so that we could quickly know the system
>> status in a large time scale, like now we may get each runqueue's usage ratio by:
>>
>> cfs_rq's usage ratio = cfs_rq->runnable_load_avg/cfs_rq->load.weight
>>
>
> the direct usage ratio is rq.avg.runnable_avg_sum / rq.avg.runnable_avg_period.


From the parsed data diagram, that looks nicer than my previous
calculation using load. :)
BTW, do you think there is any value in also doing the calculation below?
cfs_rq->runnable_load_avg/cfs_rq->load.weight

I think that by applying this calculation to the
runnable_load_avg/blocked_load_avg trace results,
we may catch abnormal load distribution when debugging.


>
> one patch from obsolete power-scheduling could be reference for this:
> git@github.com:alexshi/power-scheduling.git power-scheduling
>
> From 081cd4bcbccfaa1930b031e4dfbf9d23b8c0d5ab Mon Sep 17 00:00:00 2001
> From: Alex Shi <alex.shi@intel.com>
> Date: Fri, 7 Dec 2012 21:37:58 +0800
> Subject: [PATCH 02/23] sched: log the cpu utilization at rq
>
> The cpu's utilization is to measure how busy is the cpu.
>         util = cpu_rq(cpu)->avg.runnable_avg_sum * SCHED_POEWR_SCALE
>                 / cpu_rq(cpu)->avg.runnable_avg_period;
>
> Since the util is no more than 1, we scale its value with 1024, same as
> SCHED_POWER_SCALE and set the FULL_UTIL as 1024.
>
> In later power aware scheduling, we are sensitive for how busy of the
> cpu. Since as to power consuming, it is tight related with cpu busy
> time.
>
> BTW, rq->util can be used for any purposes if needed, not only power
> scheduling.
>
> Signed-off-by: Alex Shi <alex.shi@intel.com>


Nice patch. Will it be merged? :)

Thanks,
Lei
> ---
>  include/linux/sched.h | 2 +-
>  kernel/sched/debug.c  | 1 +
>  kernel/sched/fair.c   | 5 +++++
>  kernel/sched/sched.h  | 4 ++++
>  4 files changed, 11 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 9539597..4e4d9ee 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -794,7 +794,7 @@ enum cpu_idle_type {
>  #define SCHED_LOAD_SCALE       (1L << SCHED_LOAD_SHIFT)
>
>  /*
> - * Increase resolution of cpu_power calculations
> + * Increase resolution of cpu_power and rq->util calculations
>   */
>  #define SCHED_POWER_SHIFT      10
>  #define SCHED_POWER_SCALE      (1L << SCHED_POWER_SHIFT)
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 75024a6..f5db759 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -311,6 +311,7 @@ do {                                                                        \
>
>         P(ttwu_count);
>         P(ttwu_local);
> +       P(util);
>
>  #undef P
>  #undef P64
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 2e49c3f..7124244 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1495,8 +1495,13 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
>
>  static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
>  {
> +       u32 period;
>         __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
>         __update_tg_runnable_avg(&rq->avg, &rq->cfs);
> +
> +       period = rq->avg.runnable_avg_period ? rq->avg.runnable_avg_period : 1;
> +       rq->util = (u64)(rq->avg.runnable_avg_sum << SCHED_POWER_SHIFT)
> +                               / period;
>  }
>
>  /* Add the load generated by se into cfs_rq's child load-average */
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 804ee41..8682110 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -351,6 +351,9 @@ extern struct root_domain def_root_domain;
>
>  #endif /* CONFIG_SMP */
>
> +/* full cpu utilization */
> +#define FULL_UTIL      SCHED_POWER_SCALE
> +
>  /*
>   * This is the main, per-CPU runqueue data structure.
>   *
> @@ -482,6 +485,7 @@ struct rq {
>  #endif
>
>         struct sched_avg avg;
> +       unsigned int util;
>  };
>
>  static inline int cpu_of(struct rq *rq)
> --
> 1.7.12
>
> --
> Thanks
>     Alex


* Re: [PATCH 1/2] sched: add trace events for task and rq usage tracking
  2013-07-01  7:10 ` [PATCH 1/2] sched: add trace events for task and rq usage tracking Lei Wen
@ 2013-07-01  9:43   ` Kamalesh Babulal
  2013-07-01 12:18     ` Lei Wen
  0 siblings, 1 reply; 18+ messages in thread
From: Kamalesh Babulal @ 2013-07-01  9:43 UTC (permalink / raw)
  To: Lei Wen
  Cc: Paul Turner, Alex Shi, Peter Zijlstra, Ingo Molnar, mingo,
	linux-kernel, kamalesh

* Lei Wen <leiwen@marvell.com> [2013-07-01 15:10:32]:

> Since we could track task in the entity level now, we may want to
> investigate tasks' running status by recording the trace info, so that
> could make some tuning if needed.
> 
> Signed-off-by: Lei Wen <leiwen@marvell.com>
> ---
>  include/trace/events/sched.h |   73 ++++++++++++++++++++++++++++++++++++++++++
>  kernel/sched/fair.c          |   29 +++++++++++++++--
>  2 files changed, 100 insertions(+), 2 deletions(-)

[...]

> 
> +TRACE_EVENT(sched_cfs_rq_runnable_load,
> +
> +	TP_PROTO(int cpu, unsigned long load, unsigned long total),
> +
> +	TP_ARGS(cpu, load, total),
> +
> +	TP_STRUCT__entry(
> +		__field(int, cpu)
> +		__field(unsigned long, load)
> +		__field(unsigned long, total)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->cpu   = cpu;
> +		__entry->load  = load;
> +		__entry->total = total;
> +	),
> +
> +	TP_printk("cpu=%d avg=%lu total=%lu",
> +			__entry->cpu,
> +			__entry->load,
> +			__entry->total)
> +);
> +
> +TRACE_EVENT(sched_cfs_rq_blocked_load,
> +
> +	TP_PROTO(int cpu, unsigned long load, unsigned long total),
> +
> +	TP_ARGS(cpu, load, total),
> +
> +	TP_STRUCT__entry(
> +		__field(int, cpu)
> +		__field(unsigned long, load)
> +		__field(unsigned long, total)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->cpu   = cpu;
> +		__entry->load  = load;
> +		__entry->total = total;
> +	),
> +
> +	TP_printk("cpu=%d avg=%lu total=%lu",
> +			__entry->cpu,
> +			__entry->load,
> +			__entry->total)
> +);
> +
>  #endif /* _TRACE_SCHED_H */

The above trace points are identical and can be folded together using DECLARE_EVENT_CLASS:

+DECLARE_EVENT_CLASS(sched_cfs_rq_load_contri_template,
+
+	TP_PROTO(int cpu, unsigned long load, unsigned long total),
+
+	TP_ARGS(cpu, load, total),
+
+	TP_STRUCT__entry(
+		__field(int, cpu)
+		__field(unsigned long, load)
+		__field(unsigned long, total)
+	),
+
+	TP_fast_assign(
+		__entry->cpu   = cpu;
+		__entry->load  = load;
+		__entry->total = total;
+	),
+
+	TP_printk("cpu=%d avg=%lu total=%lu",
+			__entry->cpu,
+			__entry->load,
+			__entry->total)
+);
+
+DEFINE_EVENT(sched_cfs_rq_load_contri_template, sched_cfs_rq_runnable_load,
+	TP_PROTO(int cpu, unsigned long load, unsigned long total),
+	TP_ARGS(cpu, load, total));
+
+DEFINE_EVENT(sched_cfs_rq_load_contri_template, sched_cfs_rq_blocked_load,
+	TP_PROTO(int cpu, unsigned long load, unsigned long total),
+	TP_ARGS(cpu, load, total));
+
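
Just to spell it out (an illustration, not an extra change): events created
via DEFINE_EVENT keep the same trace_<name>() call signature as plain
TRACE_EVENTs, so the call sites in fair.c stay untouched, e.g.:

	trace_sched_cfs_rq_runnable_load(cpu_of(rq_of(cfs_rq)),
			cfs_rq->runnable_load_avg, cfs_rq->load.weight);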



* Re: [PATCH 1/2] sched: add trace events for task and rq usage tracking
  2013-07-01  9:43   ` Kamalesh Babulal
@ 2013-07-01 12:18     ` Lei Wen
  0 siblings, 0 replies; 18+ messages in thread
From: Lei Wen @ 2013-07-01 12:18 UTC (permalink / raw)
  To: Kamalesh Babulal
  Cc: Lei Wen, Paul Turner, Alex Shi, Peter Zijlstra, Ingo Molnar,
	mingo, linux-kernel

Hi Kamalesh,

On Mon, Jul 1, 2013 at 5:43 PM, Kamalesh Babulal
<kamalesh@linux.vnet.ibm.com> wrote:
> * Lei Wen <leiwen@marvell.com> [2013-07-01 15:10:32]:
>
>> Since we could track task in the entity level now, we may want to
>> investigate tasks' running status by recording the trace info, so that
>> could make some tuning if needed.
>>
>> Signed-off-by: Lei Wen <leiwen@marvell.com>
>> ---
>>  include/trace/events/sched.h |   73 ++++++++++++++++++++++++++++++++++++++++++
>>  kernel/sched/fair.c          |   29 +++++++++++++++--
>>  2 files changed, 100 insertions(+), 2 deletions(-)
>
> [...]
>
>>
>> +TRACE_EVENT(sched_cfs_rq_runnable_load,
>> +
>> +     TP_PROTO(int cpu, unsigned long load, unsigned long total),
>> +
>> +     TP_ARGS(cpu, load, total),
>> +
>> +     TP_STRUCT__entry(
>> +             __field(int, cpu)
>> +             __field(unsigned long, load)
>> +             __field(unsigned long, total)
>> +     ),
>> +
>> +     TP_fast_assign(
>> +             __entry->cpu   = cpu;
>> +             __entry->load  = load;
>> +             __entry->total = total;
>> +     ),
>> +
>> +     TP_printk("cpu=%d avg=%lu total=%lu",
>> +                     __entry->cpu,
>> +                     __entry->load,
>> +                     __entry->total)
>> +);
>> +
>> +TRACE_EVENT(sched_cfs_rq_blocked_load,
>> +
>> +     TP_PROTO(int cpu, unsigned long load, unsigned long total),
>> +
>> +     TP_ARGS(cpu, load, total),
>> +
>> +     TP_STRUCT__entry(
>> +             __field(int, cpu)
>> +             __field(unsigned long, load)
>> +             __field(unsigned long, total)
>> +     ),
>> +
>> +     TP_fast_assign(
>> +             __entry->cpu   = cpu;
>> +             __entry->load  = load;
>> +             __entry->total = total;
>> +     ),
>> +
>> +     TP_printk("cpu=%d avg=%lu total=%lu",
>> +                     __entry->cpu,
>> +                     __entry->load,
>> +                     __entry->total)
>> +);
>> +
>>  #endif /* _TRACE_SCHED_H */
>
> above trace points are same and be folded using EVENT_CLASS:

Nice abstraction. I will fold your change into my V2 patch.

Thanks,
Lei

>
> +DECLARE_EVENT_CLASS(sched_cfs_rq_load_contri_template,
> +
> +       TP_PROTO(int cpu, unsigned long load, unsigned long total),
> +
> +       TP_ARGS(cpu, load, total),
> +
> +       TP_STRUCT__entry(
> +               __field(int, cpu)
> +               __field(unsigned long, load)
> +               __field(unsigned long, total)
> +       ),
> +
> +       TP_fast_assign(
> +               __entry->cpu   = cpu;
> +               __entry->load  = load;
> +               __entry->total = total;
> +       ),
> +
> +       TP_printk("cpu=%d avg=%lu total=%lu",
> +                       __entry->cpu,
> +                       __entry->load,
> +                       __entry->total)
> +);
> +
> +DEFINE_EVENT(sched_cfs_rq_load_contri_template, sched_cfs_rq_runnable_load,
> +       TP_PROTO(int cpu, unsigned long load, unsigned long total),
> +       TP_ARGS(cpu, load, total));
> +
> +DEFINE_EVENT(sched_cfs_rq_load_contri_template, sched_cfs_rq_blocked_load,
> +       TP_PROTO(int cpu, unsigned long load, unsigned long total),
> +       TP_ARGS(cpu, load, total));
> +
>


* [PATCH V2 0/2] sched: add trace event for per-entity tracking
  2013-07-01  7:10 [PATCH 0/2] sched: add trace event for per-entity tracking Lei Wen
                   ` (2 preceding siblings ...)
  2013-07-01  8:06 ` [PATCH 0/2] sched: add trace event for per-entity tracking Alex Shi
@ 2013-07-01 12:33 ` Lei Wen
  2013-07-01 12:33   ` [V2 1/2] sched: add trace events for task and rq usage tracking Lei Wen
                     ` (2 more replies)
  3 siblings, 3 replies; 18+ messages in thread
From: Lei Wen @ 2013-07-01 12:33 UTC (permalink / raw)
  To: Paul Turner, Alex Shi, Peter Zijlstra, Ingo Molnar, mingo,
	Kamalesh Babulal, Lei Wen, linux-kernel

Thanks to the per-entity load tracking feature, we can now see the details
of each task.
This series adds trace support for it, so that we can quickly assess system
status over a large time scale.

The ratio cfs_rq->runnable_load_avg/cfs_rq->load.weight is useful for
identifying how load is distributed across the whole system.

V2: Abstract sched_cfs_rq_runnable_load and sched_cfs_rq_blocked_load using
    sched_cfs_rq_load_contri_template. Thanks Kamalesh for this contribution!

Lei Wen (2):
  sched: add trace events for task and rq usage tracking
  sched: update cfs_rq weight earlier in enqueue_entity

 include/trace/events/sched.h |   73 ++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c          |   31 ++++++++++++++++--
 2 files changed, 101 insertions(+), 3 deletions(-)

-- 
1.7.10.4



* [V2 1/2] sched: add trace events for task and rq usage tracking
  2013-07-01 12:33 ` [PATCH V2 " Lei Wen
@ 2013-07-01 12:33   ` Lei Wen
  2013-07-01 12:44     ` Peter Zijlstra
  2013-07-01 12:33   ` [V2 2/2] sched: update cfs_rq weight earlier in enqueue_entity Lei Wen
  2013-07-02 12:15   ` [PATCH V3 0/2] sched: add trace event for per-entity tracking Lei Wen
  2 siblings, 1 reply; 18+ messages in thread
From: Lei Wen @ 2013-07-01 12:33 UTC (permalink / raw)
  To: Paul Turner, Alex Shi, Peter Zijlstra, Ingo Molnar, mingo,
	Kamalesh Babulal, Lei Wen, linux-kernel

Since we can now track tasks at the entity level, we may want to
investigate a task's running status by recording trace info, so that we
can do some tuning if needed.

Signed-off-by: Lei Wen <leiwen@marvell.com>
---
 include/trace/events/sched.h |   57 ++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c          |   29 +++++++++++++++++++--
 2 files changed, 84 insertions(+), 2 deletions(-)

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index e5586ca..effe047 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -430,6 +430,63 @@ TRACE_EVENT(sched_pi_setprio,
 			__entry->oldprio, __entry->newprio)
 );
 
+TRACE_EVENT(sched_task_weighted_load,
+
+	TP_PROTO(struct task_struct *tsk, unsigned long load, unsigned long weight),
+
+	TP_ARGS(tsk, load, weight),
+
+	TP_STRUCT__entry(
+		__field(pid_t, pid)
+		__field(int, cpu)
+		__field(unsigned long, load)
+		__field(unsigned long, weight)
+	),
+
+	TP_fast_assign(
+		__entry->pid   = tsk->pid;
+		__entry->cpu   = task_thread_info(tsk)->cpu;
+		__entry->load  = load;
+		__entry->weight= weight;
+	),
+
+	TP_printk("cpu=%d pid=%d load=%lu weight=%lu",
+			__entry->cpu, __entry->pid,
+			__entry->load, __entry->weight)
+);
+
+DECLARE_EVENT_CLASS(sched_cfs_rq_load_contri_template,
+
+	TP_PROTO(int cpu, unsigned long load, unsigned long total),
+
+	TP_ARGS(cpu, load, total),
+
+	TP_STRUCT__entry(
+		__field(int, cpu)
+		__field(unsigned long, load)
+		__field(unsigned long, total)
+	),
+
+	TP_fast_assign(
+		__entry->cpu   = cpu;
+		__entry->load  = load;
+		__entry->total = total;
+	),
+
+	TP_printk("cpu=%d avg=%lu total=%lu",
+		__entry->cpu,
+		__entry->load,
+		__entry->total)
+	);
+
+DEFINE_EVENT(sched_cfs_rq_load_contri_template, sched_cfs_rq_runnable_load,
+	TP_PROTO(int cpu, unsigned long load, unsigned long total),
+	TP_ARGS(cpu, load, total));
+
+DEFINE_EVENT(sched_cfs_rq_load_contri_template, sched_cfs_rq_blocked_load,
+	TP_PROTO(int cpu, unsigned long load, unsigned long total),
+	TP_ARGS(cpu, load, total));
+
 #endif /* _TRACE_SCHED_H */
 
 /* This part must be outside protection */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f77f9c5..07bd74c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1346,6 +1346,7 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se)
 		return 0;
 
 	se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
+	trace_sched_task_weighted_load(task_of(se), se->avg.load_avg_contrib, se->load.weight);
 	se->avg.decay_count = 0;
 
 	return decays;
@@ -1445,6 +1446,7 @@ static inline void __update_task_entity_contrib(struct sched_entity *se)
 	contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
 	contrib /= (se->avg.runnable_avg_period + 1);
 	se->avg.load_avg_contrib = scale_load(contrib);
+	trace_sched_task_weighted_load(task_of(se), se->avg.load_avg_contrib, se->load.weight);
 }
 
 /* Compute the current contribution to load_avg by se, return any delta */
@@ -1498,10 +1500,16 @@ static inline void update_entity_load_avg(struct sched_entity *se,
 	if (!update_cfs_rq)
 		return;
 
-	if (se->on_rq)
+	if (se->on_rq) {
 		cfs_rq->runnable_load_avg += contrib_delta;
-	else
+		trace_sched_cfs_rq_runnable_load(cpu_of(rq_of(cfs_rq)),
+				cfs_rq->runnable_load_avg, cfs_rq->load.weight);
+	} else {
 		subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
+		trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
+				cfs_rq->blocked_load_avg,
+				cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);
+	}
 }
 
 /*
@@ -1531,6 +1539,9 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
 	}
 
 	__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
+	trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
+			cfs_rq->blocked_load_avg,
+			cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);
 }
 
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
@@ -1584,10 +1595,15 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
 	/* migrated tasks did not contribute to our blocked load */
 	if (wakeup) {
 		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
+		trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
+				cfs_rq->blocked_load_avg,
+				cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);
 		update_entity_load_avg(se, 0);
 	}
 
 	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
+	trace_sched_cfs_rq_runnable_load(cpu_of(rq_of(cfs_rq)),
+			cfs_rq->runnable_load_avg, cfs_rq->load.weight);
 	/* we force update consideration on load-balancer moves */
 	update_cfs_rq_blocked_load(cfs_rq, !wakeup);
 }
@@ -1608,6 +1624,9 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
 	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
 	if (sleep) {
 		cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
+		trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
+				cfs_rq->blocked_load_avg,
+				cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);
 		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
 	} /* migrations, e.g. sleep=0 leave decay_count == 0 */
 }
@@ -5894,6 +5913,9 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
 		__synchronize_entity_decay(&p->se);
 		subtract_blocked_load_contrib(cfs_rq,
 				p->se.avg.load_avg_contrib);
+		trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
+				cfs_rq->blocked_load_avg,
+				cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);
 	}
 #endif
 }
@@ -5994,6 +6016,9 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
 		 */
 		p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
 		cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
+		trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
+				cfs_rq->blocked_load_avg,
+				cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);
 #endif
 	}
 }
-- 
1.7.10.4



* [V2 2/2] sched: update cfs_rq weight earlier in enqueue_entity
  2013-07-01 12:33 ` [PATCH V2 " Lei Wen
  2013-07-01 12:33   ` [V2 1/2] sched: add trace events for task and rq usage tracking Lei Wen
@ 2013-07-01 12:33   ` Lei Wen
  2013-07-01 14:07     ` Paul Turner
  2013-07-02 12:15   ` [PATCH V3 0/2] sched: add trace event for per-entity tracking Lei Wen
  2 siblings, 1 reply; 18+ messages in thread
From: Lei Wen @ 2013-07-01 12:33 UTC (permalink / raw)
  To: Paul Turner, Alex Shi, Peter Zijlstra, Ingo Molnar, mingo,
	Kamalesh Babulal, Lei Wen, linux-kernel

Since we are going to calculate a cfs_rq's average ratio as
runnable_load_avg/load.weight, not increasing load.weight prior to
enqueue_entity_load_avg() may make a cfs_rq's average ratio appear to be
higher than 100%.

Adjust the sequence so that the ratio is always kept below 100%.

Signed-off-by: Lei Wen <leiwen@marvell.com>
---
 kernel/sched/fair.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 07bd74c..d1eee84 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1788,8 +1788,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
-	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
 	account_entity_enqueue(cfs_rq, se);
+	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
 	update_cfs_shares(cfs_rq);
 
 	if (flags & ENQUEUE_WAKEUP) {
-- 
1.7.10.4



* Re: [V2 1/2] sched: add trace events for task and rq usage tracking
  2013-07-01 12:33   ` [V2 1/2] sched: add trace events for task and rq usage tracking Lei Wen
@ 2013-07-01 12:44     ` Peter Zijlstra
  2013-07-01 13:25       ` Lei Wen
  0 siblings, 1 reply; 18+ messages in thread
From: Peter Zijlstra @ 2013-07-01 12:44 UTC (permalink / raw)
  To: Lei Wen
  Cc: Paul Turner, Alex Shi, Ingo Molnar, mingo, Kamalesh Babulal,
	linux-kernel

On Mon, Jul 01, 2013 at 08:33:21PM +0800, Lei Wen wrote:
> Since we could track task in the entity level now, we may want to
> investigate tasks' running status by recording the trace info, so that
> could make some tuning if needed.

Why would I want to merge this?


> +	trace_sched_task_weighted_load(task_of(se), se->avg.load_avg_contrib, se->load.weight);
> +	trace_sched_task_weighted_load(task_of(se), se->avg.load_avg_contrib, se->load.weight);

> +		trace_sched_cfs_rq_runnable_load(cpu_of(rq_of(cfs_rq)),
> +				cfs_rq->runnable_load_avg, cfs_rq->load.weight);

> +		trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
> +				cfs_rq->blocked_load_avg,
> +				cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);

> +	trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
> +			cfs_rq->blocked_load_avg,
> +			cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);

> +		trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
> +				cfs_rq->blocked_load_avg,
> +				cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);

> +	trace_sched_cfs_rq_runnable_load(cpu_of(rq_of(cfs_rq)),
> +			cfs_rq->runnable_load_avg, cfs_rq->load.weight);

> +		trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
> +				cfs_rq->blocked_load_avg,
> +				cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);

> +		trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
> +				cfs_rq->blocked_load_avg,
> +				cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);

> +		trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
> +				cfs_rq->blocked_load_avg,
> +				cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);

You're not lazy enough by far, you seem to delight in endless repetition :/

How about you first convince me we actually want to merge this; big hint,
there's a significant lack of tracepoints in the entire balancer.

Secondly; WTH didn't you do:

  trace_sched_task_weighted_load(se);
  trace_sched_cfs_rq_runnable_load(cfs_rq);
  trace_sched_cfs_rq_blocked_load(cfs_rq);

The tracepoints themselves could very well extract whatever they want from
that; no need to actually write it out.


* Re: [V2 1/2] sched: add trace events for task and rq usage tracking
  2013-07-01 12:44     ` Peter Zijlstra
@ 2013-07-01 13:25       ` Lei Wen
  0 siblings, 0 replies; 18+ messages in thread
From: Lei Wen @ 2013-07-01 13:25 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Lei Wen, Paul Turner, Alex Shi, Ingo Molnar, mingo,
	Kamalesh Babulal, linux-kernel

Hi Peter,

On Mon, Jul 1, 2013 at 8:44 PM, Peter Zijlstra <peterz@infradead.org> wrote:
> On Mon, Jul 01, 2013 at 08:33:21PM +0800, Lei Wen wrote:
>> Since we could track task in the entity level now, we may want to
>> investigate tasks' running status by recording the trace info, so that
>> could make some tuning if needed.
>
> Why would I want to merge this?

With trace points like these merged, we could then easily draw a picture
of the load distribution.

>
>
>> +     trace_sched_task_weighted_load(task_of(se), se->avg.load_avg_contrib, se->load.weight);
>> +     trace_sched_task_weighted_load(task_of(se), se->avg.load_avg_contrib, se->load.weight);
>
>> +             trace_sched_cfs_rq_runnable_load(cpu_of(rq_of(cfs_rq)),
>> +                             cfs_rq->runnable_load_avg, cfs_rq->load.weight);
>
>> +             trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
>> +                             cfs_rq->blocked_load_avg,
>> +                             cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);
>
>> +     trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
>> +                     cfs_rq->blocked_load_avg,
>> +                     cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);
>
>> +             trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
>> +                             cfs_rq->blocked_load_avg,
>> +                             cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);
>
>> +     trace_sched_cfs_rq_runnable_load(cpu_of(rq_of(cfs_rq)),
>> +                     cfs_rq->runnable_load_avg, cfs_rq->load.weight);
>
>> +             trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
>> +                             cfs_rq->blocked_load_avg,
>> +                             cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);
>
>> +             trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
>> +                             cfs_rq->blocked_load_avg,
>> +                             cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);
>
>> +             trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
>> +                             cfs_rq->blocked_load_avg,
>> +                             cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);
>
> You're not lazy enough by far, you seem to delight in endless repetition :/

Yep, I had already noticed the duplication...


>
> How about you first convince me we actually want to merge this; big hint,
> there's a significant lack of tracepoints in the entire balancer.

You already said what I wanted to say. :)
With these pre-embedded tracepoints, tracking the system load becomes much
easier. Especially since per-entity load tracking was added only recently,
people may want to use these trace points to get a better understanding of
the new feature.

>
> Secondly; WTH didn't you do:
>
>   trace_sched_task_weighted_load(se);
>   trace_sched_cfs_rq_runnable_load(cfs_rq);
>   trace_sched_cfs_rq_blocked_load(cfs_rq);

Much cleaner than my previous one!

>
> The tracepoints themselves could very well extract whatever they want from
> that; no need to actually write it out.


* Re: [V2 2/2] sched: update cfs_rq weight earlier in enqueue_entity
  2013-07-01 12:33   ` [V2 2/2] sched: update cfs_rq weight earlier in enqueue_entity Lei Wen
@ 2013-07-01 14:07     ` Paul Turner
  2013-07-02  2:52       ` Lei Wen
  0 siblings, 1 reply; 18+ messages in thread
From: Paul Turner @ 2013-07-01 14:07 UTC (permalink / raw)
  To: Lei Wen
  Cc: Alex Shi, Peter Zijlstra, Ingo Molnar, Ingo Molnar,
	Kamalesh Babulal, LKML

Could you please restate the below?

On Mon, Jul 1, 2013 at 5:33 AM, Lei Wen <leiwen@marvell.com> wrote:
> Since we are going to calculate cfs_rq's average ratio by
> runnable_load_avg/load.weight

I don't understand what you mean by this.

>, if not increase the load.weight prior to
> enqueue_entity_load_avg, it may lead to one cfs_rq's avg ratio higher
> than 100%.
>

Or this.

> Adjust the sequence, so that all ratio is kept below 100%.
>
> Signed-off-by: Lei Wen <leiwen@marvell.com>
> ---
>  kernel/sched/fair.c |    2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 07bd74c..d1eee84 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1788,8 +1788,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>          * Update run-time statistics of the 'current'.
>          */
>         update_curr(cfs_rq);
> -       enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
>         account_entity_enqueue(cfs_rq, se);
> +       enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);

account_entity_enqueue is independent of enqueue_entity_load_avg;
their order should not matter.

Further, should we restore the reverted amortization commit (which improves
context switch times), enqueue_entity_load_avg needs to precede
account_entity_enqueue, as it may update se->load.weight.

>         update_cfs_shares(cfs_rq);
>
>         if (flags & ENQUEUE_WAKEUP) {
> --
> 1.7.10.4
>


* Re: [V2 2/2] sched: update cfs_rq weight earlier in enqueue_entity
  2013-07-01 14:07     ` Paul Turner
@ 2013-07-02  2:52       ` Lei Wen
  0 siblings, 0 replies; 18+ messages in thread
From: Lei Wen @ 2013-07-02  2:52 UTC (permalink / raw)
  To: Paul Turner
  Cc: Lei Wen, Alex Shi, Peter Zijlstra, Ingo Molnar, Ingo Molnar,
	Kamalesh Babulal, LKML

Paul,

On Mon, Jul 1, 2013 at 10:07 PM, Paul Turner <pjt@google.com> wrote:
> Could you please restate the below?
>
> On Mon, Jul 1, 2013 at 5:33 AM, Lei Wen <leiwen@marvell.com> wrote:
>> Since we are going to calculate cfs_rq's average ratio by
>> runnable_load_avg/load.weight
>
> I don't understand what you mean by this.

Previously I used the runnable_load_avg/load.weight calculation as the cfs_rq's
average ratio. But as Alex pointed out, runnable_avg_sum/runnable_avg_period
may serve this need better.

>
>>, if not increase the load.weight prior to
>> enqueue_entity_load_avg, it may lead to one cfs_rq's avg ratio higher
>> than 100%.
>>
>
> Or this.

In my mind, a cfs_rq's runnable_load_avg should always be less than its
load.weight. I am not sure whether that assumption holds here, but
runnable_load_avg/load.weight does show the cfs_rq's execution trend in
some respects.

The previous problem was that enqueue_entity_load_avg() is called before
account_entity_enqueue(), which makes runnable_load_avg get updated first
and load.weight only afterwards. So, with the trace info logged inside
enqueue_entity_load_avg(), we may see a calculated
runnable_load_avg/load.weight ratio greater than 1.
That result is not friendly for the data being parsed out afterwards.


>
>> Adjust the sequence, so that all ratio is kept below 100%.
>>
>> Signed-off-by: Lei Wen <leiwen@marvell.com>
>> ---
>>  kernel/sched/fair.c |    2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 07bd74c..d1eee84 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -1788,8 +1788,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>>          * Update run-time statistics of the 'current'.
>>          */
>>         update_curr(cfs_rq);
>> -       enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
>>         account_entity_enqueue(cfs_rq, se);
>> +       enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
>
> account_entity_enqueue is independent of enqueue_entity_load_avg;
> their order should not matter.

Yes, agreed, the order should not matter functionally, but to keep the
trace info consistent we may need a specific order here.

>
> Further, should we restore the reverted amortization commit (improves
> context switch times)


I don't understand this part...
What does "should we restore the reverted amortization commit (improves
context switch times)" mean here...?


> enqueue_entity_load_avg needs to precede
> account_entity_enqueue as it may update se->load.weight.

account_entity_enqueue needs to precede enqueue_entity_load_avg?

Thanks,
Lei


>
>>         update_cfs_shares(cfs_rq);
>>
>>         if (flags & ENQUEUE_WAKEUP) {
>> --
>> 1.7.10.4
>>


* [PATCH V3 0/2] sched: add trace event for per-entity tracking
  2013-07-01 12:33 ` [PATCH V2 " Lei Wen
  2013-07-01 12:33   ` [V2 1/2] sched: add trace events for task and rq usage tracking Lei Wen
  2013-07-01 12:33   ` [V2 2/2] sched: update cfs_rq weight earlier in enqueue_entity Lei Wen
@ 2013-07-02 12:15   ` Lei Wen
  2013-07-02 12:15     ` [V3 1/2] sched: add trace events for task and rq usage tracking Lei Wen
  2013-07-02 12:15     ` [V3 2/2] sched: update cfs_rq weight earlier in enqueue_entity Lei Wen
  2 siblings, 2 replies; 18+ messages in thread
From: Lei Wen @ 2013-07-02 12:15 UTC (permalink / raw)
  To: Paul Turner, Alex Shi, Peter Zijlstra, Ingo Molnar, mingo,
	Kamalesh Babulal, Lei Wen, linux-kernel

Thanks to the per-entity load tracking feature, we can now see the details
of each task.
This series adds trace support for it, so that we can quickly assess system
status over a large time scale.

The ratio cfs_rq->runnable_load_avg/cfs_rq->load.weight is useful for
identifying how load is distributed across the whole system.

With these pre-embedded tracepoints, tracking the system load becomes much
easier. Especially since per-entity load tracking was added only recently,
people may want to use these trace points to get a better understanding of
the new feature.

V3: make the trace events take simple parameters, and extract the details
    only in the header file definition. Thanks to Peter for pointing this out.

V2: Abstract sched_cfs_rq_runnable_load and sched_cfs_rq_blocked_load using
    sched_cfs_rq_load_contri_template. Thanks Kamalesh for this contribution!

Lei Wen (2):
  sched: add trace events for task and rq usage tracking
  sched: update cfs_rq weight earlier in enqueue_entity

 include/trace/events/sched.h |   73 ++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c          |   31 ++++++++++++++++--
 2 files changed, 101 insertions(+), 3 deletions(-)

-- 
1.7.10.4



* [V3 1/2] sched: add trace events for task and rq usage tracking
  2013-07-02 12:15   ` [PATCH V3 0/2] sched: add trace event for per-entity tracking Lei Wen
@ 2013-07-02 12:15     ` Lei Wen
  2013-07-03 12:46       ` Lei Wen
  2013-07-02 12:15     ` [V3 2/2] sched: update cfs_rq weight earlier in enqueue_entity Lei Wen
  1 sibling, 1 reply; 18+ messages in thread
From: Lei Wen @ 2013-07-02 12:15 UTC (permalink / raw)
  To: Paul Turner, Alex Shi, Peter Zijlstra, Ingo Molnar, mingo,
	Kamalesh Babulal, Lei Wen, linux-kernel

Since we can now track tasks at the entity level, we may want to
investigate a task's running status by recording trace info, so that we
can do some tuning if needed.

Signed-off-by: Lei Wen <leiwen@marvell.com>
Cc: Alex Shi <alex.shi@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
---
 include/trace/events/sched.h |   76 ++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c          |   15 +++++++--
 2 files changed, 89 insertions(+), 2 deletions(-)

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index e5586ca..768b398 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -430,6 +430,82 @@ TRACE_EVENT(sched_pi_setprio,
 			__entry->oldprio, __entry->newprio)
 );
 
+#ifdef CONFIG_SMP
+TRACE_EVENT(sched_task_weighted_load,
+
+	TP_PROTO(struct sched_entity *se),
+
+	TP_ARGS(se),
+
+	TP_STRUCT__entry(
+		__field(pid_t, pid)
+		__field(int, cpu)
+		__field(unsigned long, load)
+		__field(unsigned long, weight)
+	),
+
+	TP_fast_assign(
+		__entry->pid   = container_of(se, struct task_struct, se)->pid;
+		__entry->cpu   = se->cfs_rq->rq->cpu;
+		__entry->load  = se->avg.load_avg_contrib;
+		__entry->weight= se->load.weight;
+	),
+
+	TP_printk("cpu=%d pid=%d load=%lu weight=%lu",
+			__entry->cpu, __entry->pid,
+			__entry->load, __entry->weight)
+);
+
+TRACE_EVENT(sched_cfs_rq_runnable_load,
+
+	TP_PROTO(struct cfs_rq *cfs_rq),
+
+	TP_ARGS(cfs_rq),
+
+	TP_STRUCT__entry(
+		__field(int, cpu)
+		__field(unsigned long, load)
+		__field(unsigned long, total)
+	),
+
+	TP_fast_assign(
+		__entry->cpu   = cfs_rq->rq->cpu;
+		__entry->load  = cfs_rq->runnable_load_avg;
+		__entry->total = cfs_rq->load.weight;
+	),
+
+	TP_printk("cpu=%d avg=%lu total=%lu",
+		__entry->cpu,
+		__entry->load,
+		__entry->total)
+);
+
+TRACE_EVENT(sched_cfs_rq_blocked_load,
+
+	TP_PROTO(struct cfs_rq *cfs_rq),
+
+	TP_ARGS(cfs_rq),
+
+	TP_STRUCT__entry(
+		__field(int, cpu)
+		__field(unsigned long, load)
+		__field(unsigned long, total)
+	),
+
+	TP_fast_assign(
+		__entry->cpu   = cfs_rq->rq->cpu;
+		__entry->load  = cfs_rq->blocked_load_avg;
+		__entry->total = cfs_rq->blocked_load_avg
+				+ cfs_rq->runnable_load_avg;
+	),
+
+	TP_printk("cpu=%d avg=%lu total=%lu",
+		__entry->cpu,
+		__entry->load,
+		__entry->total)
+);
+#endif
+
 #endif /* _TRACE_SCHED_H */
 
 /* This part must be outside protection */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f77f9c5..2290469 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1346,6 +1346,7 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se)
 		return 0;
 
 	se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
+	trace_sched_task_weighted_load(se);
 	se->avg.decay_count = 0;
 
 	return decays;
@@ -1445,6 +1446,7 @@ static inline void __update_task_entity_contrib(struct sched_entity *se)
 	contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
 	contrib /= (se->avg.runnable_avg_period + 1);
 	se->avg.load_avg_contrib = scale_load(contrib);
+	trace_sched_task_weighted_load(se);
 }
 
 /* Compute the current contribution to load_avg by se, return any delta */
@@ -1498,10 +1500,13 @@ static inline void update_entity_load_avg(struct sched_entity *se,
 	if (!update_cfs_rq)
 		return;
 
-	if (se->on_rq)
+	if (se->on_rq) {
 		cfs_rq->runnable_load_avg += contrib_delta;
-	else
+		trace_sched_cfs_rq_runnable_load(cfs_rq);
+	} else {
 		subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
+		trace_sched_cfs_rq_blocked_load(cfs_rq);
+	}
 }
 
 /*
@@ -1531,6 +1536,7 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
 	}
 
 	__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
+	trace_sched_cfs_rq_blocked_load(cfs_rq);
 }
 
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
@@ -1584,10 +1590,12 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
 	/* migrated tasks did not contribute to our blocked load */
 	if (wakeup) {
 		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
+		trace_sched_cfs_rq_blocked_load(cfs_rq);
 		update_entity_load_avg(se, 0);
 	}
 
 	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
+	trace_sched_cfs_rq_runnable_load(cfs_rq);
 	/* we force update consideration on load-balancer moves */
 	update_cfs_rq_blocked_load(cfs_rq, !wakeup);
 }
@@ -1608,6 +1616,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
 	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
 	if (sleep) {
 		cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
+		trace_sched_cfs_rq_blocked_load(cfs_rq);
 		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
 	} /* migrations, e.g. sleep=0 leave decay_count == 0 */
 }
@@ -5894,6 +5903,7 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
 		__synchronize_entity_decay(&p->se);
 		subtract_blocked_load_contrib(cfs_rq,
 				p->se.avg.load_avg_contrib);
+		trace_sched_cfs_rq_blocked_load(cfs_rq);
 	}
 #endif
 }
@@ -5994,6 +6004,7 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
 		 */
 		p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
 		cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
+		trace_sched_cfs_rq_blocked_load(cfs_rq);
 #endif
 	}
 }
-- 
1.7.10.4



* [V3 2/2] sched: update cfs_rq weight earlier in enqueue_entity
  2013-07-02 12:15   ` [PATCH V3 0/2] sched: add trace event for per-entity tracking Lei Wen
  2013-07-02 12:15     ` [V3 1/2] sched: add trace events for task and rq usage tracking Lei Wen
@ 2013-07-02 12:15     ` Lei Wen
  1 sibling, 0 replies; 18+ messages in thread
From: Lei Wen @ 2013-07-02 12:15 UTC (permalink / raw)
  To: Paul Turner, Alex Shi, Peter Zijlstra, Ingo Molnar, mingo,
	Kamalesh Babulal, Lei Wen, linux-kernel

We expect runnable_load_avg to be less than load.weight, so that
runnable_load_avg/load.weight can represent the system's load
distribution well.

If load.weight is not increased prior to enqueue_entity_load_avg(),
runnable_load_avg may end up higher than load.weight, which can be
confusing.

Signed-off-by: Lei Wen <leiwen@marvell.com>
Cc: Alex Shi <alex.shi@intel.com>
Cc: Paul Turner <pjt@google.com>
---
 kernel/sched/fair.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2290469..53224d1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1778,8 +1778,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
-	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
 	account_entity_enqueue(cfs_rq, se);
+	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
 	update_cfs_shares(cfs_rq);
 
 	if (flags & ENQUEUE_WAKEUP) {
-- 
1.7.10.4



* Re: [V3 1/2] sched: add trace events for task and rq usage tracking
  2013-07-02 12:15     ` [V3 1/2] sched: add trace events for task and rq usage tracking Lei Wen
@ 2013-07-03 12:46       ` Lei Wen
  0 siblings, 0 replies; 18+ messages in thread
From: Lei Wen @ 2013-07-03 12:46 UTC (permalink / raw)
  To: Lei Wen
  Cc: Paul Turner, Alex Shi, Peter Zijlstra, Ingo Molnar, mingo,
	Kamalesh Babulal, linux-kernel

Hi Peter,

Do you have any further suggestions for this patch? :)

Thanks,
Lei

On Tue, Jul 2, 2013 at 8:15 PM, Lei Wen <leiwen@marvell.com> wrote:
> Since we can now track load at the per-entity level, we may want to
> investigate tasks' running status by recording trace info, so that we
> can do some tuning if needed (see the usage sketch appended after the
> quoted patch below).
>
> Signed-off-by: Lei Wen <leiwen@marvell.com>
> Cc: Alex Shi <alex.shi@intel.com>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
> ---
>  include/trace/events/sched.h |   76 ++++++++++++++++++++++++++++++++++++++++++
>  kernel/sched/fair.c          |   15 +++++++--
>  2 files changed, 89 insertions(+), 2 deletions(-)
>
> diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
> index e5586ca..768b398 100644
> --- a/include/trace/events/sched.h
> +++ b/include/trace/events/sched.h
> @@ -430,6 +430,82 @@ TRACE_EVENT(sched_pi_setprio,
>                         __entry->oldprio, __entry->newprio)
>  );
>
> +#ifdef CONFIG_SMP
> +TRACE_EVENT(sched_task_weighted_load,
> +
> +       TP_PROTO(struct sched_entity *se),
> +
> +       TP_ARGS(se),
> +
> +       TP_STRUCT__entry(
> +               __field(pid_t, pid)
> +               __field(int, cpu)
> +               __field(unsigned long, load)
> +               __field(unsigned long, weight)
> +       ),
> +
> +       TP_fast_assign(
> +               __entry->pid   = container_of(se, struct task_struct, se)->pid;
> +               __entry->cpu   = se->cfs_rq->rq->cpu;
> +               __entry->load  = se->avg.load_avg_contrib;
> +               __entry->weight= se->load.weight;
> +       ),
> +
> +       TP_printk("cpu=%d pid=%d load=%lu weight=%lu",
> +                       __entry->cpu, __entry->pid,
> +                       __entry->load, __entry->weight)
> +);
> +
> +TRACE_EVENT(sched_cfs_rq_runnable_load,
> +
> +       TP_PROTO(struct cfs_rq *cfs_rq),
> +
> +       TP_ARGS(cfs_rq),
> +
> +       TP_STRUCT__entry(
> +               __field(int, cpu)
> +               __field(unsigned long, load)
> +               __field(unsigned long, total)
> +       ),
> +
> +       TP_fast_assign(
> +               __entry->cpu   = cfs_rq->rq->cpu;
> +               __entry->load  = cfs_rq->runnable_load_avg;
> +               __entry->total = cfs_rq->load.weight;
> +       ),
> +
> +       TP_printk("cpu=%d avg=%lu total=%lu",
> +               __entry->cpu,
> +               __entry->load,
> +               __entry->total)
> +);
> +
> +TRACE_EVENT(sched_cfs_rq_blocked_load,
> +
> +       TP_PROTO(struct cfs_rq *cfs_rq),
> +
> +       TP_ARGS(cfs_rq),
> +
> +       TP_STRUCT__entry(
> +               __field(int, cpu)
> +               __field(unsigned long, load)
> +               __field(unsigned long, total)
> +       ),
> +
> +       TP_fast_assign(
> +               __entry->cpu   = cfs_rq->rq->cpu;
> +               __entry->load  = cfs_rq->blocked_load_avg;
> +               __entry->total = cfs_rq->blocked_load_avg
> +                               + cfs_rq->runnable_load_avg;
> +       ),
> +
> +       TP_printk("cpu=%d avg=%lu total=%lu",
> +               __entry->cpu,
> +               __entry->load,
> +               __entry->total)
> +);
> +#endif
> +
>  #endif /* _TRACE_SCHED_H */
>
>  /* This part must be outside protection */
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index f77f9c5..2290469 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1346,6 +1346,7 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se)
>                 return 0;
>
>         se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
> +       trace_sched_task_weighted_load(se);
>         se->avg.decay_count = 0;
>
>         return decays;
> @@ -1445,6 +1446,7 @@ static inline void __update_task_entity_contrib(struct sched_entity *se)
>         contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
>         contrib /= (se->avg.runnable_avg_period + 1);
>         se->avg.load_avg_contrib = scale_load(contrib);
> +       trace_sched_task_weighted_load(se);
>  }
>
>  /* Compute the current contribution to load_avg by se, return any delta */
> @@ -1498,10 +1500,13 @@ static inline void update_entity_load_avg(struct sched_entity *se,
>         if (!update_cfs_rq)
>                 return;
>
> -       if (se->on_rq)
> +       if (se->on_rq) {
>                 cfs_rq->runnable_load_avg += contrib_delta;
> -       else
> +               trace_sched_cfs_rq_runnable_load(cfs_rq);
> +       } else {
>                 subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
> +               trace_sched_cfs_rq_blocked_load(cfs_rq);
> +       }
>  }
>
>  /*
> @@ -1531,6 +1536,7 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
>         }
>
>         __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
> +       trace_sched_cfs_rq_blocked_load(cfs_rq);
>  }
>
>  static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
> @@ -1584,10 +1590,12 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
>         /* migrated tasks did not contribute to our blocked load */
>         if (wakeup) {
>                 subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
> +               trace_sched_cfs_rq_blocked_load(cfs_rq);
>                 update_entity_load_avg(se, 0);
>         }
>
>         cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
> +       trace_sched_cfs_rq_runnable_load(cfs_rq);
>         /* we force update consideration on load-balancer moves */
>         update_cfs_rq_blocked_load(cfs_rq, !wakeup);
>  }
> @@ -1608,6 +1616,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
>         cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
>         if (sleep) {
>                 cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
> +               trace_sched_cfs_rq_blocked_load(cfs_rq);
>                 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
>         } /* migrations, e.g. sleep=0 leave decay_count == 0 */
>  }
> @@ -5894,6 +5903,7 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
>                 __synchronize_entity_decay(&p->se);
>                 subtract_blocked_load_contrib(cfs_rq,
>                                 p->se.avg.load_avg_contrib);
> +               trace_sched_cfs_rq_blocked_load(cfs_rq);
>         }
>  #endif
>  }
> @@ -5994,6 +6004,7 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
>                  */
>                 p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
>                 cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
> +               trace_sched_cfs_rq_blocked_load(cfs_rq);
>  #endif
>         }
>  }
> --
> 1.7.10.4
>
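
For reference, a minimal userspace sketch (an editorial addition, not part
of the patch) that enables the new sched_cfs_rq_runnable_load event and
streams the trace output; it assumes a kernel with this series applied and
debugfs mounted at /sys/kernel/debug:

#include <stdio.h>

#define TRACE_DIR "/sys/kernel/debug/tracing"

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	fclose(f);
	return 0;
}

int main(void)
{
	char line[512];
	FILE *fp;

	/* enable the per-cfs_rq runnable load event added by this series */
	if (write_str(TRACE_DIR "/events/sched/sched_cfs_rq_runnable_load/enable", "1"))
		return 1;

	/* each emitted line ends with "cpu=... avg=... total=..." */
	fp = fopen(TRACE_DIR "/trace_pipe", "r");
	if (!fp)
		return 1;
	while (fgets(line, sizeof(line), fp))
		fputs(line, stdout);
	fclose(fp);
	return 0;
}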

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2013-07-03 12:46 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-07-01  7:10 [PATCH 0/2] sched: add trace event for per-entity tracking Lei Wen
2013-07-01  7:10 ` [PATCH 1/2] sched: add trace events for task and rq usage tracking Lei Wen
2013-07-01  9:43   ` Kamalesh Babulal
2013-07-01 12:18     ` Lei Wen
2013-07-01  7:10 ` [PATCH 2/2] sched: update cfs_rq weight earlier in enqueue_entity Lei Wen
2013-07-01  8:06 ` [PATCH 0/2] sched: add trace event for per-entity tracking Alex Shi
2013-07-01  8:49   ` Lei Wen
2013-07-01 12:33 ` [PATCH V2 " Lei Wen
2013-07-01 12:33   ` [V2 1/2] sched: add trace events for task and rq usage tracking Lei Wen
2013-07-01 12:44     ` Peter Zijlstra
2013-07-01 13:25       ` Lei Wen
2013-07-01 12:33   ` [V2 2/2] sched: update cfs_rq weight earlier in enqueue_entity Lei Wen
2013-07-01 14:07     ` Paul Turner
2013-07-02  2:52       ` Lei Wen
2013-07-02 12:15   ` [PATCH V3 0/2] sched: add trace event for per-entity tracking Lei Wen
2013-07-02 12:15     ` [V3 1/2] sched: add trace events for task and rq usage tracking Lei Wen
2013-07-03 12:46       ` Lei Wen
2013-07-02 12:15     ` [V3 2/2] sched: update cfs_rq weight earlier in enqueue_entity Lei Wen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).