* [PATCH] Sched: Add a tracepoint to track rq->nr_running
@ 2020-06-19 14:11 Phil Auld
  2020-06-19 16:46 ` Steven Rostedt
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Phil Auld @ 2020-06-19 14:11 UTC (permalink / raw)
  To: linux-kernel
  Cc: Qais Yousef, Ingo Molnar, Peter Zijlstra, Vincent Guittot,
	Steven Rostedt, Juri Lelli, Mel Gorman

Add a bare tracepoint trace_sched_update_nr_running_tp which tracks
the ->nr_running count of a CPU's rq. This is used to accurately
trace this data and
provide a visualization of scheduler imbalances in, for example, the
form of a heat map.  The tracepoint is accessed by loading an external
kernel module. An example module (forked from Qais' module and including
the pelt related tracepoints) can be found at:

  https://github.com/auldp/tracepoints-helpers.git

A script to turn the trace-cmd report output into a heatmap plot can be
found at:

  https://github.com/jirvoz/plot-nr-running
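
For reference, a minimal sketch of a probe module that hooks this
tracepoint might look like the following (illustrative only; the module
linked above is the working example, and error handling is trimmed):

	#include <linux/module.h>
	#include <trace/events/sched.h>

	/* Probe matching TP_PROTO(int cpu, int change, unsigned int nr_running) */
	static void probe_update_nr_running(void *data, int cpu, int change,
					    unsigned int nr_running)
	{
		trace_printk("cpu=%d change=%d nr_running=%u\n",
			     cpu, change, nr_running);
	}

	static int __init nr_running_tp_init(void)
	{
		return register_trace_sched_update_nr_running_tp(
				probe_update_nr_running, NULL);
	}

	static void __exit nr_running_tp_exit(void)
	{
		unregister_trace_sched_update_nr_running_tp(
				probe_update_nr_running, NULL);
		tracepoint_synchronize_unregister();
	}

	module_init(nr_running_tp_init);
	module_exit(nr_running_tp_exit);
	MODULE_LICENSE("GPL");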

The tracepoints are added to add_nr_running() and sub_nr_running() which
are in kernel/sched/sched.h. Since sched.h includes trace/events/tlb.h
via mmu_context.h we had to limit when CREATE_TRACE_POINTS is defined.

Signed-off-by: Phil Auld <pauld@redhat.com>
CC: Qais Yousef <qais.yousef@arm.com>
CC: Ingo Molnar <mingo@redhat.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Vincent Guittot <vincent.guittot@linaro.org>
CC: linux-kernel@vger.kernel.org
---
 include/trace/events/sched.h |  4 ++++
 kernel/sched/core.c          |  9 ++++-----
 kernel/sched/fair.c          |  2 --
 kernel/sched/pelt.c          |  2 --
 kernel/sched/sched.h         | 12 ++++++++++++
 5 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index ed168b0e2c53..a6d9fe5a68cf 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -634,6 +634,10 @@ DECLARE_TRACE(sched_overutilized_tp,
 	TP_PROTO(struct root_domain *rd, bool overutilized),
 	TP_ARGS(rd, overutilized));
 
+DECLARE_TRACE(sched_update_nr_running_tp,
+	TP_PROTO(int cpu, int change, unsigned int nr_running),
+	TP_ARGS(cpu, change, nr_running));
+
 #endif /* _TRACE_SCHED_H */
 
 /* This part must be outside protection */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9a2fbf98fd6f..6f28fdff1d48 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6,7 +6,10 @@
  *
  *  Copyright (C) 1991-2002  Linus Torvalds
  */
+
+#define SCHED_CREATE_TRACE_POINTS
 #include "sched.h"
+#undef SCHED_CREATE_TRACE_POINTS
 
 #include <linux/nospec.h>
 
@@ -21,9 +24,6 @@
 
 #include "pelt.h"
 
-#define CREATE_TRACE_POINTS
-#include <trace/events/sched.h>
-
 /*
  * Export tracepoints that act as a bare tracehook (ie: have no trace event
  * associated with them) to allow external modules to probe them.
@@ -34,6 +34,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
@@ -7969,5 +7970,3 @@ const u32 sched_prio_to_wmult[40] = {
  /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
  /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
 };
-
-#undef CREATE_TRACE_POINTS
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index da3e5b54715b..fe5d9b6db8f7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -22,8 +22,6 @@
  */
 #include "sched.h"
 
-#include <trace/events/sched.h>
-
 /*
  * Targeted preemption latency for CPU-bound tasks:
  *
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index b647d04d9c8b..bb69a0ae8d6c 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -28,8 +28,6 @@
 #include "sched.h"
 #include "pelt.h"
 
-#include <trace/events/sched.h>
-
 /*
  * Approximate:
  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index db3a57675ccf..6ae96679c169 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -75,6 +75,15 @@
 #include "cpupri.h"
 #include "cpudeadline.h"
 
+#ifdef SCHED_CREATE_TRACE_POINTS
+#define CREATE_TRACE_POINTS
+#endif
+#include <trace/events/sched.h>
+
+#ifdef SCHED_CREATE_TRACE_POINTS
+#undef CREATE_TRACE_POINTS
+#endif
+
 #ifdef CONFIG_SCHED_DEBUG
 # define SCHED_WARN_ON(x)	WARN_ONCE(x, #x)
 #else
@@ -1959,6 +1968,7 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
 	unsigned prev_nr = rq->nr_running;
 
 	rq->nr_running = prev_nr + count;
+	trace_sched_update_nr_running_tp(cpu_of(rq), count, rq->nr_running);
 
 #ifdef CONFIG_SMP
 	if (prev_nr < 2 && rq->nr_running >= 2) {
@@ -1973,6 +1983,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
 static inline void sub_nr_running(struct rq *rq, unsigned count)
 {
 	rq->nr_running -= count;
+	trace_sched_update_nr_running_tp(cpu_of(rq), -count, rq->nr_running);
+
 	/* Check if we still need preemption */
 	sched_update_tick_dependency(rq);
 }
-- 
2.18.0



* Re: [PATCH] Sched: Add a tracepoint to track rq->nr_running
  2020-06-19 14:11 [PATCH] Sched: Add a tracepoint to track rq->nr_running Phil Auld
@ 2020-06-19 16:46 ` Steven Rostedt
  2020-06-19 17:34   ` Phil Auld
  2020-06-22 12:17 ` Qais Yousef
  2020-06-29 19:23 ` [PATCH v2] " Phil Auld
  2 siblings, 1 reply; 9+ messages in thread
From: Steven Rostedt @ 2020-06-19 16:46 UTC (permalink / raw)
  To: Phil Auld
  Cc: linux-kernel, Qais Yousef, Ingo Molnar, Peter Zijlstra,
	Vincent Guittot, Juri Lelli, Mel Gorman

On Fri, 19 Jun 2020 10:11:20 -0400
Phil Auld <pauld@redhat.com> wrote:

> 
> diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
> index ed168b0e2c53..a6d9fe5a68cf 100644
> --- a/include/trace/events/sched.h
> +++ b/include/trace/events/sched.h
> @@ -634,6 +634,10 @@ DECLARE_TRACE(sched_overutilized_tp,
>  	TP_PROTO(struct root_domain *rd, bool overutilized),
>  	TP_ARGS(rd, overutilized));
>  
> +DECLARE_TRACE(sched_update_nr_running_tp,
> +	TP_PROTO(int cpu, int change, unsigned int nr_running),
> +	TP_ARGS(cpu, change, nr_running));
> +
>  #endif /* _TRACE_SCHED_H */
>  
>  /* This part must be outside protection */
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 9a2fbf98fd6f..6f28fdff1d48 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -6,7 +6,10 @@
>   *
>   *  Copyright (C) 1991-2002  Linus Torvalds
>   */
> +
> +#define SCHED_CREATE_TRACE_POINTS
>  #include "sched.h"
> +#undef SCHED_CREATE_TRACE_POINTS

Because of the macro magic, we really try not to have trace events
defined in any headers. Otherwise we end up with weird defines like the
ones you are adding, and it still doesn't fully protect against a C file
that includes this header with CREATE_TRACE_POINTS already defined.


>  
>  #include <linux/nospec.h>
>  
> @@ -21,9 +24,6 @@
>  
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -75,6 +75,15 @@
>  #include "cpupri.h"
>  #include "cpudeadline.h"
>  
> +#ifdef SCHED_CREATE_TRACE_POINTS
> +#define CREATE_TRACE_POINTS
> +#endif
> +#include <trace/events/sched.h>
> +
> +#ifdef SCHED_CREATE_TRACE_POINTS
> +#undef CREATE_TRACE_POINTS
> +#endif
> +
>  #ifdef CONFIG_SCHED_DEBUG
>  # define SCHED_WARN_ON(x)	WARN_ONCE(x, #x)
>  #else
> @@ -1959,6 +1968,7 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
>  	unsigned prev_nr = rq->nr_running;
>  
>  	rq->nr_running = prev_nr + count;
> +	trace_sched_update_nr_running_tp(cpu_of(rq), count, rq->nr_running);

Instead of having sched.h define CREATE_TRACE_POINTS, I would have the
following:

	if (trace_sched_update_nr_running_tp_enabled()) {
		call_trace_sched_update_nr_running(rq, count);
	}

Then in sched/core.c:

void call_trace_sched_update_nr_running(struct rq *rq, int count)
{
	trace_sched_update_nr_running_tp(cpu_of(rq), count, rq->nr_running);
}

The trace_*_enabled() above uses static branches, where the if turns to
a nop (pass through) when disabled and a jmp when enabled (same logic
that trace points use themselves).
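
Roughly, the _enabled() helper generated by DECLARE_TRACE() looks like
this (simplified sketch of the linux/tracepoint.h internals):

	static inline bool trace_sched_update_nr_running_tp_enabled(void)
	{
		return static_key_false(&__tracepoint_sched_update_nr_running_tp.key);
	}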

Then you don't need this macro dance, nor do you risk having another
C file define CREATE_TRACE_POINTS and someone spending hours debugging
why it suddenly broke.

-- Steve


* Re: [PATCH] Sched: Add a tracepoint to track rq->nr_running
  2020-06-19 16:46 ` Steven Rostedt
@ 2020-06-19 17:34   ` Phil Auld
  0 siblings, 0 replies; 9+ messages in thread
From: Phil Auld @ 2020-06-19 17:34 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: linux-kernel, Qais Yousef, Ingo Molnar, Peter Zijlstra,
	Vincent Guittot, Juri Lelli, Mel Gorman

On Fri, Jun 19, 2020 at 12:46:41PM -0400 Steven Rostedt wrote:
> On Fri, 19 Jun 2020 10:11:20 -0400
> Phil Auld <pauld@redhat.com> wrote:
> 
> > 
> > diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
> > index ed168b0e2c53..a6d9fe5a68cf 100644
> > --- a/include/trace/events/sched.h
> > +++ b/include/trace/events/sched.h
> > @@ -634,6 +634,10 @@ DECLARE_TRACE(sched_overutilized_tp,
> >  	TP_PROTO(struct root_domain *rd, bool overutilized),
> >  	TP_ARGS(rd, overutilized));
> >  
> > +DECLARE_TRACE(sched_update_nr_running_tp,
> > +	TP_PROTO(int cpu, int change, unsigned int nr_running),
> > +	TP_ARGS(cpu, change, nr_running));
> > +
> >  #endif /* _TRACE_SCHED_H */
> >  
> >  /* This part must be outside protection */
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index 9a2fbf98fd6f..6f28fdff1d48 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -6,7 +6,10 @@
> >   *
> >   *  Copyright (C) 1991-2002  Linus Torvalds
> >   */
> > +
> > +#define SCHED_CREATE_TRACE_POINTS
> >  #include "sched.h"
> > +#undef SCHED_CREATE_TRACE_POINTS
> 
> Because of the macro magic, we really try not to have trace events
> defined in any headers. Otherwise we end up with weird defines like the
> ones you are adding, and it still doesn't fully protect against a C file
> that includes this header with CREATE_TRACE_POINTS already defined.
> 
> 
> >  
> >  #include <linux/nospec.h>
> >  
> > @@ -21,9 +24,6 @@
> >  
> > --- a/kernel/sched/sched.h
> > +++ b/kernel/sched/sched.h
> > @@ -75,6 +75,15 @@
> >  #include "cpupri.h"
> >  #include "cpudeadline.h"
> >  
> > +#ifdef SCHED_CREATE_TRACE_POINTS
> > +#define CREATE_TRACE_POINTS
> > +#endif
> > +#include <trace/events/sched.h>
> > +
> > +#ifdef SCHED_CREATE_TRACE_POINTS
> > +#undef CREATE_TRACE_POINTS
> > +#endif
> > +
> >  #ifdef CONFIG_SCHED_DEBUG
> >  # define SCHED_WARN_ON(x)	WARN_ONCE(x, #x)
> >  #else
> > @@ -1959,6 +1968,7 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
> >  	unsigned prev_nr = rq->nr_running;
> >  
> >  	rq->nr_running = prev_nr + count;
> > +	trace_sched_update_nr_running_tp(cpu_of(rq), count, rq->nr_running);
> 
> Instead of having sched.h define CREATE_TRACE_POINTS, I would have the
> following:
> 
> 	if (trace_sched_update_nr_running_tp_enabled()) {
> 		call_trace_sched_update_nr_running(rq, count);
> 	}
> 
> Then in sched/core.c:
> 
> void call_trace_sched_update_nr_running(struct rq *rq, int count)
> {
> 	trace_sched_update_nr_running_tp(cpu_of(rq), count, rq->nr_running);
> }
> 
> The trace_*_enabled() above uses static branches, where the if turns to
> a nop (pass through) when disabled and a jmp when enabled (same logic
> that trace points use themselves).
> 
> Then you don't need this macro dance, nor do you risk having another
> C file define CREATE_TRACE_POINTS and someone spending hours debugging
> why it suddenly broke.
>

Awesome, thanks Steve. I was really hoping there was a better way to do
that. I'll try it this way.


Cheers,
Phil

> -- Steve
> 

-- 



* Re: [PATCH] Sched: Add a tracepoint to track rq->nr_running
  2020-06-19 14:11 [PATCH] Sched: Add a tracepoint to track rq->nr_running Phil Auld
  2020-06-19 16:46 ` Steven Rostedt
@ 2020-06-22 12:17 ` Qais Yousef
  2020-06-23 19:38   ` Phil Auld
  2020-06-29 19:23 ` [PATCH v2] " Phil Auld
  2 siblings, 1 reply; 9+ messages in thread
From: Qais Yousef @ 2020-06-22 12:17 UTC (permalink / raw)
  To: Phil Auld
  Cc: linux-kernel, Ingo Molnar, Peter Zijlstra, Vincent Guittot,
	Steven Rostedt, Juri Lelli, Mel Gorman

On 06/19/20 10:11, Phil Auld wrote:
> Add a bare tracepoint trace_sched_update_nr_running_tp which tracks
> the ->nr_running count of a CPU's rq. This is used to accurately
> trace this data and
> provide a visualization of scheduler imbalances in, for example, the
> form of a heat map.  The tracepoint is accessed by loading an external
> kernel module. An example module (forked from Qais' module and including
> the pelt related tracepoints) can be found at:
> 
>   https://github.com/auldp/tracepoints-helpers.git
> 
> A script to turn the trace-cmd report output into a heatmap plot can be
> found at:
> 
>   https://github.com/jirvoz/plot-nr-running
> 
> The tracepoints are added to add_nr_running() and sub_nr_running() which
> are in kernel/sched/sched.h. Since sched.h includes trace/events/tlb.h
> via mmu_context.h we had to limit when CREATE_TRACE_POINTS is defined.
> 
> Signed-off-by: Phil Auld <pauld@redhat.com>
> CC: Qais Yousef <qais.yousef@arm.com>
> CC: Ingo Molnar <mingo@redhat.com>
> CC: Peter Zijlstra <peterz@infradead.org>
> CC: Vincent Guittot <vincent.guittot@linaro.org>
> CC: linux-kernel@vger.kernel.org
> ---
>  include/trace/events/sched.h |  4 ++++
>  kernel/sched/core.c          |  9 ++++-----
>  kernel/sched/fair.c          |  2 --
>  kernel/sched/pelt.c          |  2 --
>  kernel/sched/sched.h         | 12 ++++++++++++
>  5 files changed, 20 insertions(+), 9 deletions(-)
> 
> diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
> index ed168b0e2c53..a6d9fe5a68cf 100644
> --- a/include/trace/events/sched.h
> +++ b/include/trace/events/sched.h
> @@ -634,6 +634,10 @@ DECLARE_TRACE(sched_overutilized_tp,
>  	TP_PROTO(struct root_domain *rd, bool overutilized),
>  	TP_ARGS(rd, overutilized));
>  
> +DECLARE_TRACE(sched_update_nr_running_tp,
> +	TP_PROTO(int cpu, int change, unsigned int nr_running),
> +	TP_ARGS(cpu, change, nr_running));
> +
>  #endif /* _TRACE_SCHED_H */
>  
>  /* This part must be outside protection */
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 9a2fbf98fd6f..6f28fdff1d48 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -6,7 +6,10 @@
>   *
>   *  Copyright (C) 1991-2002  Linus Torvalds
>   */
> +
> +#define SCHED_CREATE_TRACE_POINTS
>  #include "sched.h"
> +#undef SCHED_CREATE_TRACE_POINTS
>  
>  #include <linux/nospec.h>
>  
> @@ -21,9 +24,6 @@
>  
>  #include "pelt.h"
>  
> -#define CREATE_TRACE_POINTS
> -#include <trace/events/sched.h>
> -
>  /*
>   * Export tracepoints that act as a bare tracehook (ie: have no trace event
>   * associated with them) to allow external modules to probe them.
> @@ -34,6 +34,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
>  EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
>  EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
>  EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
> +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
>  
>  DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
>  
> @@ -7969,5 +7970,3 @@ const u32 sched_prio_to_wmult[40] = {
>   /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
>   /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
>  };
> -
> -#undef CREATE_TRACE_POINTS
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index da3e5b54715b..fe5d9b6db8f7 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -22,8 +22,6 @@
>   */
>  #include "sched.h"
>  
> -#include <trace/events/sched.h>
> -
>  /*
>   * Targeted preemption latency for CPU-bound tasks:
>   *
> diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
> index b647d04d9c8b..bb69a0ae8d6c 100644
> --- a/kernel/sched/pelt.c
> +++ b/kernel/sched/pelt.c
> @@ -28,8 +28,6 @@
>  #include "sched.h"
>  #include "pelt.h"
>  
> -#include <trace/events/sched.h>
> -
>  /*
>   * Approximate:
>   *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index db3a57675ccf..6ae96679c169 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -75,6 +75,15 @@
>  #include "cpupri.h"
>  #include "cpudeadline.h"
>  
> +#ifdef SCHED_CREATE_TRACE_POINTS
> +#define CREATE_TRACE_POINTS
> +#endif
> +#include <trace/events/sched.h>
> +
> +#ifdef SCHED_CREATE_TRACE_POINTS
> +#undef CREATE_TRACE_POINTS
> +#endif
> +
>  #ifdef CONFIG_SCHED_DEBUG
>  # define SCHED_WARN_ON(x)	WARN_ONCE(x, #x)
>  #else
> @@ -1959,6 +1968,7 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
>  	unsigned prev_nr = rq->nr_running;
>  
>  	rq->nr_running = prev_nr + count;
> +	trace_sched_update_nr_running_tp(cpu_of(rq), count, rq->nr_running);

This is a very specific call site, so I guess it looks fine to pass very
specific info too.

But I think we can do better by just passing struct rq and adding a new
helper sched_trace_rq_nr_running() (see the bottom of fair.c for similar
helper functions for tracepoints).

This will allow the user to extract cpu, nr_running, and potentially
other info while passing only a single argument to the tracepoint,
potentially extending its future usefulness.

The count can be inferred by storing the last nr_running and taking the diff
when a new call happens.

	...

	cpu = sched_trace_rq_cpu(rq);
	nr_running = sched_trace_rq_nr_running(rq);
	count = nr_running - last_nr_running[cpu];
	last_nr_running[cpu] = nr_running;

	...
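
Fleshed out, a hypothetical module-side probe along those lines (using
the helper names proposed here; purely illustrative) could be:

	static DEFINE_PER_CPU(unsigned int, last_nr_running);

	/* Probe for the proposed rq-only signature; infer the change per CPU */
	static void probe_nr_running(void *data, struct rq *rq)
	{
		int cpu = sched_trace_rq_cpu(rq);
		unsigned int nr = sched_trace_rq_nr_running(rq);
		int count = (int)nr - (int)per_cpu(last_nr_running, cpu);

		per_cpu(last_nr_running, cpu) = nr;
		trace_printk("cpu=%d nr_running=%u change=%d\n", cpu, nr, count);
	}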

I haven't looked at BTF, but it could potentially allow us to access members of
unexported structs reliably without having to export all these helper
functions. It's been something I wanted to look into but no time yet.
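
An untested sketch of that idea using BPF CO-RE with the rq-based
signature proposed above (libbpf-style section and macro names; assumes
the bare tracepoint is visible in BTF, and CONFIG_SMP for rq->cpu):

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	SEC("tp_btf/sched_update_nr_running_tp")
	int BPF_PROG(on_nr_running, struct rq *rq, int change)
	{
		/* BTF lets the verifier check these direct dereferences */
		bpf_printk("cpu=%d nr_running=%u change=%d",
			   rq->cpu, rq->nr_running, change);
		return 0;
	}

	char LICENSE[] SEC("license") = "GPL";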

Thanks

--
Qais Yousef

>  
>  #ifdef CONFIG_SMP
>  	if (prev_nr < 2 && rq->nr_running >= 2) {
> @@ -1973,6 +1983,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
>  static inline void sub_nr_running(struct rq *rq, unsigned count)
>  {
>  	rq->nr_running -= count;
> +	trace_sched_update_nr_running_tp(cpu_of(rq), -count, rq->nr_running);
> +
>  	/* Check if we still need preemption */
>  	sched_update_tick_dependency(rq);
>  }
> -- 
> 2.18.0
> 


* Re: [PATCH] Sched: Add a tracepoint to track rq->nr_running
  2020-06-22 12:17 ` Qais Yousef
@ 2020-06-23 19:38   ` Phil Auld
  2020-06-24 12:10     ` Qais Yousef
  0 siblings, 1 reply; 9+ messages in thread
From: Phil Auld @ 2020-06-23 19:38 UTC (permalink / raw)
  To: Qais Yousef
  Cc: linux-kernel, Ingo Molnar, Peter Zijlstra, Vincent Guittot,
	Steven Rostedt, Juri Lelli, Mel Gorman

Hi Qais,

On Mon, Jun 22, 2020 at 01:17:47PM +0100 Qais Yousef wrote:
> On 06/19/20 10:11, Phil Auld wrote:
> > Add a bare tracepoint trace_sched_update_nr_running_tp which tracks
> > the ->nr_running count of a CPU's rq. This is used to accurately
> > trace this data and
> > provide a visualization of scheduler imbalances in, for example, the
> > form of a heat map.  The tracepoint is accessed by loading an external
> > kernel module. An example module (forked from Qais' module and including
> > the pelt related tracepoints) can be found at:
> > 
> >   https://github.com/auldp/tracepoints-helpers.git
> > 
> > A script to turn the trace-cmd report output into a heatmap plot can be
> > found at:
> > 
> >   https://github.com/jirvoz/plot-nr-running
> > 
> > The tracepoints are added to add_nr_running() and sub_nr_running() which
> > are in kernel/sched/sched.h. Since sched.h includes trace/events/tlb.h
> > via mmu_context.h we had to limit when CREATE_TRACE_POINTS is defined.
> > 
> > Signed-off-by: Phil Auld <pauld@redhat.com>
> > CC: Qais Yousef <qais.yousef@arm.com>
> > CC: Ingo Molnar <mingo@redhat.com>
> > CC: Peter Zijlstra <peterz@infradead.org>
> > CC: Vincent Guittot <vincent.guittot@linaro.org>
> > CC: linux-kernel@vger.kernel.org
> > ---
> >  include/trace/events/sched.h |  4 ++++
> >  kernel/sched/core.c          |  9 ++++-----
> >  kernel/sched/fair.c          |  2 --
> >  kernel/sched/pelt.c          |  2 --
> >  kernel/sched/sched.h         | 12 ++++++++++++
> >  5 files changed, 20 insertions(+), 9 deletions(-)
> > 
> > diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
> > index ed168b0e2c53..a6d9fe5a68cf 100644
> > --- a/include/trace/events/sched.h
> > +++ b/include/trace/events/sched.h
> > @@ -634,6 +634,10 @@ DECLARE_TRACE(sched_overutilized_tp,
> >  	TP_PROTO(struct root_domain *rd, bool overutilized),
> >  	TP_ARGS(rd, overutilized));
> >  
> > +DECLARE_TRACE(sched_update_nr_running_tp,
> > +	TP_PROTO(int cpu, int change, unsigned int nr_running),
> > +	TP_ARGS(cpu, change, nr_running));
> > +
> >  #endif /* _TRACE_SCHED_H */
> >  
> >  /* This part must be outside protection */
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index 9a2fbf98fd6f..6f28fdff1d48 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -6,7 +6,10 @@
> >   *
> >   *  Copyright (C) 1991-2002  Linus Torvalds
> >   */
> > +
> > +#define SCHED_CREATE_TRACE_POINTS
> >  #include "sched.h"
> > +#undef SCHED_CREATE_TRACE_POINTS
> >  
> >  #include <linux/nospec.h>
> >  
> > @@ -21,9 +24,6 @@
> >  
> >  #include "pelt.h"
> >  
> > -#define CREATE_TRACE_POINTS
> > -#include <trace/events/sched.h>
> > -
> >  /*
> >   * Export tracepoints that act as a bare tracehook (ie: have no trace event
> >   * associated with them) to allow external modules to probe them.
> > @@ -34,6 +34,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
> >  EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
> >  EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
> >  EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
> > +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
> >  
> >  DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
> >  
> > @@ -7969,5 +7970,3 @@ const u32 sched_prio_to_wmult[40] = {
> >   /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
> >   /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
> >  };
> > -
> > -#undef CREATE_TRACE_POINTS
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index da3e5b54715b..fe5d9b6db8f7 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -22,8 +22,6 @@
> >   */
> >  #include "sched.h"
> >  
> > -#include <trace/events/sched.h>
> > -
> >  /*
> >   * Targeted preemption latency for CPU-bound tasks:
> >   *
> > diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
> > index b647d04d9c8b..bb69a0ae8d6c 100644
> > --- a/kernel/sched/pelt.c
> > +++ b/kernel/sched/pelt.c
> > @@ -28,8 +28,6 @@
> >  #include "sched.h"
> >  #include "pelt.h"
> >  
> > -#include <trace/events/sched.h>
> > -
> >  /*
> >   * Approximate:
> >   *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
> > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> > index db3a57675ccf..6ae96679c169 100644
> > --- a/kernel/sched/sched.h
> > +++ b/kernel/sched/sched.h
> > @@ -75,6 +75,15 @@
> >  #include "cpupri.h"
> >  #include "cpudeadline.h"
> >  
> > +#ifdef SCHED_CREATE_TRACE_POINTS
> > +#define CREATE_TRACE_POINTS
> > +#endif
> > +#include <trace/events/sched.h>
> > +
> > +#ifdef SCHED_CREATE_TRACE_POINTS
> > +#undef CREATE_TRACE_POINTS
> > +#endif
> > +
> >  #ifdef CONFIG_SCHED_DEBUG
> >  # define SCHED_WARN_ON(x)	WARN_ONCE(x, #x)
> >  #else
> > @@ -1959,6 +1968,7 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
> >  	unsigned prev_nr = rq->nr_running;
> >  
> >  	rq->nr_running = prev_nr + count;
> > +	trace_sched_update_nr_running_tp(cpu_of(rq), count, rq->nr_running);
> 
> This is a very specific call site, so I guess it looks fine to pass very
> specific info too.
> 
> But I think we can do better by just passing struct rq and adding a new
> helper sched_trace_rq_nr_running() (see the bottom of fair.c for similar
> helper functions for tracepoints).
> 
> This will allow the user to extract cpu, nr_running, and potentially
> other info while passing only a single argument to the tracepoint,
> potentially extending its future usefulness.

I can certainly add a sched_trace_rq_nr_running helper and pass the *rq if
you think that is really important. 

I'd prefer to keep the count field though as that is the only way to tell
if this is an add_nr_running or sub_nr_running from looking at a single
trace event.

I could make it two different tracepoints.  Would that be better? To me
that seemed more complicated though. The tooling would need to look at
two different events and there would be more kernel change.

Thanks,
Phil

> 
> The count can be inferred by storing the last nr_running and taking the diff
> when a new call happens.
> 
> 	...
> 
> 	cpu = sched_trace_rq_cpu(rq);
> 	nr_running = sched_trace_rq_nr_running(rq);
> 	count = nr_running - last_nr_running[cpu];
> 	last_nr_running[cpu] = nr_running;
> 
> 	...
> 
> I haven't looked at BTF, but it could potentially allow us to access members of
> unexported structs reliably without having to export all these helper
> functions. It's been something I wanted to look into but no time yet.
> 
> Thanks
> 
> --
> Qais Yousef
> 
> >  
> >  #ifdef CONFIG_SMP
> >  	if (prev_nr < 2 && rq->nr_running >= 2) {
> > @@ -1973,6 +1983,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
> >  static inline void sub_nr_running(struct rq *rq, unsigned count)
> >  {
> >  	rq->nr_running -= count;
> > +	trace_sched_update_nr_running_tp(cpu_of(rq), -count, rq->nr_running);
> > +
> >  	/* Check if we still need preemption */
> >  	sched_update_tick_dependency(rq);
> >  }
> > -- 
> > 2.18.0
> > 
> 

-- 



* Re: [PATCH] Sched: Add a tracepoint to track rq->nr_running
  2020-06-23 19:38   ` Phil Auld
@ 2020-06-24 12:10     ` Qais Yousef
  0 siblings, 0 replies; 9+ messages in thread
From: Qais Yousef @ 2020-06-24 12:10 UTC (permalink / raw)
  To: Phil Auld
  Cc: linux-kernel, Ingo Molnar, Peter Zijlstra, Vincent Guittot,
	Steven Rostedt, Juri Lelli, Mel Gorman

Hi Phil

On 06/23/20 15:38, Phil Auld wrote:

[...]

> > This is a very specific call site, so I guess it looks fine to pass very
> > specific info too.
> > 
> > But I think we can do better by just passing struct rq and adding a new
> > helper sched_trace_rq_nr_running() (see the bottom of fair.c for similar
> > helper functions for tracepoints).
> > 
> > This will allow the user to extract cpu, nr_running, and potentially
> > other info while passing only a single argument to the tracepoint,
> > potentially extending its future usefulness.
> 
> I can certainly add a sched_trace_rq_nr_running helper and pass the *rq if
> you think that is really important. 

As I said, this is a very specific call site, so passing specific info
should be fine; it's not really important.

My general view on this (which is influenced by what Peter asked for when we
first introduced this) is that it's better to allow a trace point to
extract more signals from this specific call site by passing generic info and
letting the event code/module do what it wants.

But the idea behind these tracepoints is that they can evolve when they need
to. So I don't think we should get hung up on this if it makes things
unnecessarily complex.

> 
> I'd prefer to keep the count field though as that is the only way to tell
> if this is an add_nr_running or sub_nr_running from looking at a single
> trace event.

Passing the count field is fine by me...

> 
> I could make it two different tracepoints.  Would that be better? To me that
> seemed more complicated though. The tooling would need to look at it
> different events and there would be more kernel change.

... but splitting the tracepoint doesn't look pretty.

If passing the rq and the count is enough for you, I'd vote this is better. If
not, then I won't insist on twisting things too much for the sake of it.

Thanks

--
Qais Yousef


* Re: [PATCH v2] Sched: Add a tracepoint to track rq->nr_running
  2020-06-19 14:11 [PATCH] Sched: Add a tracepoint to track rq->nr_running Phil Auld
  2020-06-19 16:46 ` Steven Rostedt
  2020-06-22 12:17 ` Qais Yousef
@ 2020-06-29 19:23 ` Phil Auld
  2020-07-02 10:54   ` Qais Yousef
  2020-07-09  8:45   ` [tip: sched/core] sched: " tip-bot2 for Phil Auld
  2 siblings, 2 replies; 9+ messages in thread
From: Phil Auld @ 2020-06-29 19:23 UTC (permalink / raw)
  To: linux-kernel
  Cc: Qais Yousef, Ingo Molnar, Peter Zijlstra, Vincent Guittot,
	Steven Rostedt, Juri Lelli, Mel Gorman

Add a bare tracepoint trace_sched_update_nr_running_tp which tracks
the ->nr_running count of a CPU's rq. This is used to accurately
trace this data and
provide a visualization of scheduler imbalances in, for example, the
form of a heat map.  The tracepoint is accessed by loading an external
kernel module. An example module (forked from Qais' module and including
the pelt related tracepoints) can be found at:

  https://github.com/auldp/tracepoints-helpers.git

A script to turn the trace-cmd report output into a heatmap plot can be
found at:

  https://github.com/jirvoz/plot-nr-running

The tracepoints are added to add_nr_running() and sub_nr_running() which
are in kernel/sched/sched.h. In order to avoid CREATE_TRACE_POINTS in
the header, a wrapper call is used and the trace/events/sched.h include
is moved before sched.h in kernel/sched/core.c.

Signed-off-by: Phil Auld <pauld@redhat.com>
CC: Qais Yousef <qais.yousef@arm.com>
CC: Ingo Molnar <mingo@redhat.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Vincent Guittot <vincent.guittot@linaro.org>
CC: Steven Rostedt <rostedt@goodmis.org>
CC: linux-kernel@vger.kernel.org
---

V2: Fix use of the tracepoint in the header, per Steven. Pass rq* and
use a helper to get the nr_running field, per Qais.


 include/linux/sched.h        |  1 +
 include/trace/events/sched.h |  4 ++++
 kernel/sched/core.c          | 13 +++++++++----
 kernel/sched/fair.c          |  8 ++++++--
 kernel/sched/pelt.c          |  2 --
 kernel/sched/sched.h         | 10 ++++++++++
 6 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4418f5cb8324..5f114faf2247 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2015,6 +2015,7 @@ const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq);
 const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq);
 
 int sched_trace_rq_cpu(struct rq *rq);
+int sched_trace_rq_nr_running(struct rq *rq);
 
 const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
 
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index ed168b0e2c53..8c72f9113694 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -634,6 +634,10 @@ DECLARE_TRACE(sched_overutilized_tp,
 	TP_PROTO(struct root_domain *rd, bool overutilized),
 	TP_ARGS(rd, overutilized));
 
+DECLARE_TRACE(sched_update_nr_running_tp,
+	TP_PROTO(struct rq *rq, int change),
+	TP_ARGS(rq, change));
+
 #endif /* _TRACE_SCHED_H */
 
 /* This part must be outside protection */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9a2fbf98fd6f..0d35d7c4c330 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6,6 +6,10 @@
  *
  *  Copyright (C) 1991-2002  Linus Torvalds
  */
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched.h>
+#undef CREATE_TRACE_POINTS
+
 #include "sched.h"
 
 #include <linux/nospec.h>
@@ -21,9 +25,6 @@
 
 #include "pelt.h"
 
-#define CREATE_TRACE_POINTS
-#include <trace/events/sched.h>
-
 /*
  * Export tracepoints that act as a bare tracehook (ie: have no trace event
  * associated with them) to allow external modules to probe them.
@@ -34,6 +35,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
@@ -7970,4 +7972,7 @@ const u32 sched_prio_to_wmult[40] = {
  /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
 };
 
-#undef CREATE_TRACE_POINTS
+void call_trace_sched_update_nr_running(struct rq *rq, int count)
+{
+        trace_sched_update_nr_running_tp(rq, count);
+}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index da3e5b54715b..2e2f3f68e318 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -22,8 +22,6 @@
  */
 #include "sched.h"
 
-#include <trace/events/sched.h>
-
 /*
  * Targeted preemption latency for CPU-bound tasks:
  *
@@ -11293,3 +11291,9 @@ const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
 #endif
 }
 EXPORT_SYMBOL_GPL(sched_trace_rd_span);
+
+int sched_trace_rq_nr_running(struct rq *rq)
+{
+        return rq ? rq->nr_running : -1;
+}
+EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running);
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index b647d04d9c8b..bb69a0ae8d6c 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -28,8 +28,6 @@
 #include "sched.h"
 #include "pelt.h"
 
-#include <trace/events/sched.h>
-
 /*
  * Approximate:
  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index db3a57675ccf..e621eaa44474 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -75,6 +75,8 @@
 #include "cpupri.h"
 #include "cpudeadline.h"
 
+#include <trace/events/sched.h>
+
 #ifdef CONFIG_SCHED_DEBUG
 # define SCHED_WARN_ON(x)	WARN_ONCE(x, #x)
 #else
@@ -96,6 +98,7 @@ extern atomic_long_t calc_load_tasks;
 extern void calc_global_load_tick(struct rq *this_rq);
 extern long calc_load_fold_active(struct rq *this_rq, long adjust);
 
+extern void call_trace_sched_update_nr_running(struct rq *rq, int count);
 /*
  * Helpers for converting nanosecond timing to jiffy resolution
  */
@@ -1959,6 +1962,9 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
 	unsigned prev_nr = rq->nr_running;
 
 	rq->nr_running = prev_nr + count;
+	if (trace_sched_update_nr_running_tp_enabled()) {
+		call_trace_sched_update_nr_running(rq, count);
+	}
 
 #ifdef CONFIG_SMP
 	if (prev_nr < 2 && rq->nr_running >= 2) {
@@ -1973,6 +1979,10 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
 static inline void sub_nr_running(struct rq *rq, unsigned count)
 {
 	rq->nr_running -= count;
+	if (trace_sched_update_nr_running_tp_enabled()) {
+		call_trace_sched_update_nr_running(rq, count);
+	}
+
 	/* Check if we still need preemption */
 	sched_update_tick_dependency(rq);
 }
-- 
2.18.0


-- 



* Re: [PATCH v2] Sched: Add a tracepoint to track rq->nr_running
  2020-06-29 19:23 ` [PATCH v2] " Phil Auld
@ 2020-07-02 10:54   ` Qais Yousef
  2020-07-09  8:45   ` [tip: sched/core] sched: " tip-bot2 for Phil Auld
  1 sibling, 0 replies; 9+ messages in thread
From: Qais Yousef @ 2020-07-02 10:54 UTC (permalink / raw)
  To: Phil Auld
  Cc: linux-kernel, Ingo Molnar, Peter Zijlstra, Vincent Guittot,
	Steven Rostedt, Juri Lelli, Mel Gorman

On 06/29/20 15:23, Phil Auld wrote:
> Add a bare tracepoint trace_sched_update_nr_running_tp which tracks
> the ->nr_running count of a CPU's rq. This is used to accurately
> trace this data and
> provide a visualization of scheduler imbalances in, for example, the
> form of a heat map.  The tracepoint is accessed by loading an external
> kernel module. An example module (forked from Qais' module and including
> the pelt related tracepoints) can be found at:
> 
>   https://github.com/auldp/tracepoints-helpers.git
> 
> A script to turn the trace-cmd report output into a heatmap plot can be
> found at:
> 
>   https://github.com/jirvoz/plot-nr-running
> 
> The tracepoints are added to add_nr_running() and sub_nr_running() which
> are in kernel/sched/sched.h. In order to avoid CREATE_TRACE_POINTS in
> the header, a wrapper call is used and the trace/events/sched.h include
> is moved before sched.h in kernel/sched/core.c.
> 
> Signed-off-by: Phil Auld <pauld@redhat.com>
> CC: Qais Yousef <qais.yousef@arm.com>
> CC: Ingo Molnar <mingo@redhat.com>
> CC: Peter Zijlstra <peterz@infradead.org>
> CC: Vincent Guittot <vincent.guittot@linaro.org>
> CC: Steven Rostedt <rostedt@goodmis.org>
> CC: linux-kernel@vger.kernel.org
> ---

LGTM.

Reviewed-by: Qais Yousef <qais.yousef@arm.com>

Thanks

--
Qais Yousef


* [tip: sched/core] sched: Add a tracepoint to track rq->nr_running
  2020-06-29 19:23 ` [PATCH v2] " Phil Auld
  2020-07-02 10:54   ` Qais Yousef
@ 2020-07-09  8:45   ` tip-bot2 for Phil Auld
  1 sibling, 0 replies; 9+ messages in thread
From: tip-bot2 for Phil Auld @ 2020-07-09  8:45 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: Phil Auld, Peter Zijlstra (Intel), x86, LKML

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     9d246053a69196c7c27068870e9b4b66ac536f68
Gitweb:        https://git.kernel.org/tip/9d246053a69196c7c27068870e9b4b66ac536f68
Author:        Phil Auld <pauld@redhat.com>
AuthorDate:    Mon, 29 Jun 2020 15:23:03 -04:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Wed, 08 Jul 2020 11:39:02 +02:00

sched: Add a tracepoint to track rq->nr_running

Add a bare tracepoint trace_sched_update_nr_running_tp which tracks
the ->nr_running count of a CPU's rq. This is used to accurately
trace this data and
provide a visualization of scheduler imbalances in, for example, the
form of a heat map.  The tracepoint is accessed by loading an external
kernel module. An example module (forked from Qais' module and including
the pelt related tracepoints) can be found at:

  https://github.com/auldp/tracepoints-helpers.git

A script to turn the trace-cmd report output into a heatmap plot can be
found at:

  https://github.com/jirvoz/plot-nr-running

The tracepoints are added to add_nr_running() and sub_nr_running() which
are in kernel/sched/sched.h. In order to avoid CREATE_TRACE_POINTS in
the header, a wrapper call is used and the trace/events/sched.h include
is moved before sched.h in kernel/sched/core.c.

Signed-off-by: Phil Auld <pauld@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200629192303.GC120228@lorien.usersys.redhat.com
---
 include/linux/sched.h        |  1 +
 include/trace/events/sched.h |  4 ++++
 kernel/sched/core.c          | 13 +++++++++----
 kernel/sched/fair.c          |  8 ++++++--
 kernel/sched/pelt.c          |  2 --
 kernel/sched/sched.h         | 10 ++++++++++
 6 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6833729..12b10ce 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2044,6 +2044,7 @@ const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq);
 const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq);
 
 int sched_trace_rq_cpu(struct rq *rq);
+int sched_trace_rq_nr_running(struct rq *rq);
 
 const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
 
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 04f9a4c..0d5ff09 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -642,6 +642,10 @@ DECLARE_TRACE(sched_util_est_se_tp,
 	TP_PROTO(struct sched_entity *se),
 	TP_ARGS(se));
 
+DECLARE_TRACE(sched_update_nr_running_tp,
+	TP_PROTO(struct rq *rq, int change),
+	TP_ARGS(rq, change));
+
 #endif /* _TRACE_SCHED_H */
 
 /* This part must be outside protection */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4cf30e4..ff05195 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6,6 +6,10 @@
  *
  *  Copyright (C) 1991-2002  Linus Torvalds
  */
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched.h>
+#undef CREATE_TRACE_POINTS
+
 #include "sched.h"
 
 #include <linux/nospec.h>
@@ -23,9 +27,6 @@
 #include "pelt.h"
 #include "smp.h"
 
-#define CREATE_TRACE_POINTS
-#include <trace/events/sched.h>
-
 /*
  * Export tracepoints that act as a bare tracehook (ie: have no trace event
  * associated with them) to allow external modules to probe them.
@@ -38,6 +39,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
@@ -8195,4 +8197,7 @@ const u32 sched_prio_to_wmult[40] = {
  /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
 };
 
-#undef CREATE_TRACE_POINTS
+void call_trace_sched_update_nr_running(struct rq *rq, int count)
+{
+        trace_sched_update_nr_running_tp(rq, count);
+}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6fab1d1..3213cb2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -22,8 +22,6 @@
  */
 #include "sched.h"
 
-#include <trace/events/sched.h>
-
 /*
  * Targeted preemption latency for CPU-bound tasks:
  *
@@ -11296,3 +11294,9 @@ const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
 #endif
 }
 EXPORT_SYMBOL_GPL(sched_trace_rd_span);
+
+int sched_trace_rq_nr_running(struct rq *rq)
+{
+        return rq ? rq->nr_running : -1;
+}
+EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running);
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 11bea3b..2c613e1 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -28,8 +28,6 @@
 #include "sched.h"
 #include "pelt.h"
 
-#include <trace/events/sched.h>
-
 /*
  * Approximate:
  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b1432f6..65b72e0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -76,6 +76,8 @@
 #include "cpupri.h"
 #include "cpudeadline.h"
 
+#include <trace/events/sched.h>
+
 #ifdef CONFIG_SCHED_DEBUG
 # define SCHED_WARN_ON(x)	WARN_ONCE(x, #x)
 #else
@@ -97,6 +99,7 @@ extern atomic_long_t calc_load_tasks;
 extern void calc_global_load_tick(struct rq *this_rq);
 extern long calc_load_fold_active(struct rq *this_rq, long adjust);
 
+extern void call_trace_sched_update_nr_running(struct rq *rq, int count);
 /*
  * Helpers for converting nanosecond timing to jiffy resolution
  */
@@ -1973,6 +1976,9 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
 	unsigned prev_nr = rq->nr_running;
 
 	rq->nr_running = prev_nr + count;
+	if (trace_sched_update_nr_running_tp_enabled()) {
+		call_trace_sched_update_nr_running(rq, count);
+	}
 
 #ifdef CONFIG_SMP
 	if (prev_nr < 2 && rq->nr_running >= 2) {
@@ -1987,6 +1993,10 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
 static inline void sub_nr_running(struct rq *rq, unsigned count)
 {
 	rq->nr_running -= count;
+	if (trace_sched_update_nr_running_tp_enabled()) {
+		call_trace_sched_update_nr_running(rq, count);
+	}
+
 	/* Check if we still need preemption */
 	sched_update_tick_dependency(rq);
 }


Thread overview: 9 messages
2020-06-19 14:11 [PATCH] Sched: Add a tracepoint to track rq->nr_running Phil Auld
2020-06-19 16:46 ` Steven Rostedt
2020-06-19 17:34   ` Phil Auld
2020-06-22 12:17 ` Qais Yousef
2020-06-23 19:38   ` Phil Auld
2020-06-24 12:10     ` Qais Yousef
2020-06-29 19:23 ` [PATCH v2] " Phil Auld
2020-07-02 10:54   ` Qais Yousef
2020-07-09  8:45   ` [tip: sched/core] sched: " tip-bot2 for Phil Auld
