All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] perf/core: Add a tracepoint for perf sampling
@ 2016-07-19 23:20 Brendan Gregg
  2016-07-29 18:05 ` Brendan Gregg
  2016-07-29 19:21 ` Arnaldo Carvalho de Melo
  0 siblings, 2 replies; 6+ messages in thread
From: Brendan Gregg @ 2016-07-19 23:20 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin
  Cc: linux-kernel, Alexei Starovoitov, Wang Nan, Brendan Gregg

When perf is performing hrtimer-based sampling, this tracepoint can be used
by BPF to run additional logic on each sample. For example, BPF can fetch
stack traces and frequency count them in kernel context, for an efficient
profiler.

Signed-off-by: Brendan Gregg <bgregg@netflix.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Wang Nan <wangnan0@huawei.com>
---
 include/trace/events/perf.h | 29 +++++++++++++++++++++++++++++
 kernel/events/core.c        |  5 +++++
 2 files changed, 34 insertions(+)
 create mode 100644 include/trace/events/perf.h

diff --git a/include/trace/events/perf.h b/include/trace/events/perf.h
new file mode 100644
index 0000000..461770d
--- /dev/null
+++ b/include/trace/events/perf.h
@@ -0,0 +1,29 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM perf
+
+#if !defined(_TRACE_PERF_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_PERF_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(perf_hrtimer,
+	TP_PROTO(struct pt_regs *regs, struct perf_event *event),
+
+	TP_ARGS(regs, event),
+
+	TP_STRUCT__entry(
+		__field(struct pt_regs *, regs)
+		__field(struct perf_event *, event)
+	),
+
+	TP_fast_assign(
+		__entry->regs = regs;
+		__entry->event = event;
+	),
+
+	TP_printk("regs=%p evt=%p", __entry->regs, __entry->event)
+);
+#endif /* _TRACE_PERF_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 79dae18..0d843a7 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -51,6 +51,9 @@
 
 #include <asm/irq_regs.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/perf.h>
+
 typedef int (*remote_function_f)(void *);
 
 struct remote_function_call {
@@ -8036,6 +8039,8 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
 	perf_sample_data_init(&data, 0, event->hw.last_period);
 	regs = get_irq_regs();
 
+	trace_perf_hrtimer(regs, event);
+
 	if (regs && !perf_exclude_event(event, regs)) {
 		if (!(event->attr.exclude_idle && is_idle_task(current)))
 			if (__perf_event_overflow(event, 1, &data, regs))
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH] perf/core: Add a tracepoint for perf sampling
  2016-07-19 23:20 [PATCH] perf/core: Add a tracepoint for perf sampling Brendan Gregg
@ 2016-07-29 18:05 ` Brendan Gregg
  2016-07-30  3:34   ` Wangnan (F)
  2016-07-29 19:21 ` Arnaldo Carvalho de Melo
  1 sibling, 1 reply; 6+ messages in thread
From: Brendan Gregg @ 2016-07-29 18:05 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin
  Cc: linux-kernel, Alexei Starovoitov, Wang Nan, Brendan Gregg

On Tue, Jul 19, 2016 at 4:20 PM, Brendan Gregg <bgregg@netflix.com> wrote:
> When perf is performing hrtimer-based sampling, this tracepoint can be used
> by BPF to run additional logic on each sample. For example, BPF can fetch
> stack traces and frequency count them in kernel context, for an efficient
> profiler.

Any comments on this patch? Thanks,

Brendan

>
> Signed-off-by: Brendan Gregg <bgregg@netflix.com>
> Cc: Alexei Starovoitov <ast@kernel.org>
> Cc: Wang Nan <wangnan0@huawei.com>
> ---
>  include/trace/events/perf.h | 29 +++++++++++++++++++++++++++++
>  kernel/events/core.c        |  5 +++++
>  2 files changed, 34 insertions(+)
>  create mode 100644 include/trace/events/perf.h
>
> diff --git a/include/trace/events/perf.h b/include/trace/events/perf.h
> new file mode 100644
> index 0000000..461770d
> --- /dev/null
> +++ b/include/trace/events/perf.h
> @@ -0,0 +1,29 @@
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM perf
> +
> +#if !defined(_TRACE_PERF_H) || defined(TRACE_HEADER_MULTI_READ)
> +#define _TRACE_PERF_H
> +
> +#include <linux/tracepoint.h>
> +
> +TRACE_EVENT(perf_hrtimer,
> +       TP_PROTO(struct pt_regs *regs, struct perf_event *event),
> +
> +       TP_ARGS(regs, event),
> +
> +       TP_STRUCT__entry(
> +               __field(struct pt_regs *, regs)
> +               __field(struct perf_event *, event)
> +       ),
> +
> +       TP_fast_assign(
> +               __entry->regs = regs;
> +               __entry->event = event;
> +       ),
> +
> +       TP_printk("regs=%p evt=%p", __entry->regs, __entry->event)
> +);
> +#endif /* _TRACE_PERF_H */
> +
> +/* This part must be outside protection */
> +#include <trace/define_trace.h>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 79dae18..0d843a7 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -51,6 +51,9 @@
>
>  #include <asm/irq_regs.h>
>
> +#define CREATE_TRACE_POINTS
> +#include <trace/events/perf.h>
> +
>  typedef int (*remote_function_f)(void *);
>
>  struct remote_function_call {
> @@ -8036,6 +8039,8 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
>         perf_sample_data_init(&data, 0, event->hw.last_period);
>         regs = get_irq_regs();
>
> +       trace_perf_hrtimer(regs, event);
> +
>         if (regs && !perf_exclude_event(event, regs)) {
>                 if (!(event->attr.exclude_idle && is_idle_task(current)))
>                         if (__perf_event_overflow(event, 1, &data, regs))
> --
> 2.7.4
>

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] perf/core: Add a tracepoint for perf sampling
  2016-07-19 23:20 [PATCH] perf/core: Add a tracepoint for perf sampling Brendan Gregg
  2016-07-29 18:05 ` Brendan Gregg
@ 2016-07-29 19:21 ` Arnaldo Carvalho de Melo
  2016-07-29 19:55   ` Brendan Gregg
  1 sibling, 1 reply; 6+ messages in thread
From: Arnaldo Carvalho de Melo @ 2016-07-29 19:21 UTC (permalink / raw)
  To: Brendan Gregg
  Cc: Peter Zijlstra, Ingo Molnar, Alexander Shishkin, linux-kernel,
	Alexei Starovoitov, Wang Nan

Em Tue, Jul 19, 2016 at 11:20:48PM +0000, Brendan Gregg escreveu:
> When perf is performing hrtimer-based sampling, this tracepoint can be used
> by BPF to run additional logic on each sample. For example, BPF can fetch
> stack traces and frequency count them in kernel context, for an efficient
> profiler.

Could you provide a complete experience? I.e. together with this patch a
bpf script that could then run, with the full set of steps needed to
show it in use.

Also, what would be the value when BPF is not used?

- Arnaldo
 
> Signed-off-by: Brendan Gregg <bgregg@netflix.com>
> Cc: Alexei Starovoitov <ast@kernel.org>
> Cc: Wang Nan <wangnan0@huawei.com>
> ---
>  include/trace/events/perf.h | 29 +++++++++++++++++++++++++++++
>  kernel/events/core.c        |  5 +++++
>  2 files changed, 34 insertions(+)
>  create mode 100644 include/trace/events/perf.h
> 
> diff --git a/include/trace/events/perf.h b/include/trace/events/perf.h
> new file mode 100644
> index 0000000..461770d
> --- /dev/null
> +++ b/include/trace/events/perf.h
> @@ -0,0 +1,29 @@
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM perf
> +
> +#if !defined(_TRACE_PERF_H) || defined(TRACE_HEADER_MULTI_READ)
> +#define _TRACE_PERF_H
> +
> +#include <linux/tracepoint.h>
> +
> +TRACE_EVENT(perf_hrtimer,
> +	TP_PROTO(struct pt_regs *regs, struct perf_event *event),
> +
> +	TP_ARGS(regs, event),
> +
> +	TP_STRUCT__entry(
> +		__field(struct pt_regs *, regs)
> +		__field(struct perf_event *, event)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->regs = regs;
> +		__entry->event = event;
> +	),
> +
> +	TP_printk("regs=%p evt=%p", __entry->regs, __entry->event)
> +);
> +#endif /* _TRACE_PERF_H */
> +
> +/* This part must be outside protection */
> +#include <trace/define_trace.h>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 79dae18..0d843a7 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -51,6 +51,9 @@
>  
>  #include <asm/irq_regs.h>
>  
> +#define CREATE_TRACE_POINTS
> +#include <trace/events/perf.h>
> +
>  typedef int (*remote_function_f)(void *);
>  
>  struct remote_function_call {
> @@ -8036,6 +8039,8 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
>  	perf_sample_data_init(&data, 0, event->hw.last_period);
>  	regs = get_irq_regs();
>  
> +	trace_perf_hrtimer(regs, event);
> +
>  	if (regs && !perf_exclude_event(event, regs)) {
>  		if (!(event->attr.exclude_idle && is_idle_task(current)))
>  			if (__perf_event_overflow(event, 1, &data, regs))
> -- 
> 2.7.4

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] perf/core: Add a tracepoint for perf sampling
  2016-07-29 19:21 ` Arnaldo Carvalho de Melo
@ 2016-07-29 19:55   ` Brendan Gregg
  0 siblings, 0 replies; 6+ messages in thread
From: Brendan Gregg @ 2016-07-29 19:55 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: Peter Zijlstra, Ingo Molnar, Alexander Shishkin, linux-kernel,
	Alexei Starovoitov, Wang Nan

On Fri, Jul 29, 2016 at 12:21 PM, Arnaldo Carvalho de Melo
<acme@kernel.org> wrote:
> Em Tue, Jul 19, 2016 at 11:20:48PM +0000, Brendan Gregg escreveu:
>> When perf is performing hrtimer-based sampling, this tracepoint can be used
>> by BPF to run additional logic on each sample. For example, BPF can fetch
>> stack traces and frequency count them in kernel context, for an efficient
>> profiler.
>
> Could you provide a complete experience? I.e. together with this patch a
> bpf script that could then run, with the full set of steps needed to
> show it in use.

There's currently profile.py, in bcc, which will either use this
tracepoint or use a kprobe if it doesn't exist (although the kprobe is
unreliable). profile samples stack traces and shows stack traces with
their occurrence counts. Eg:

# ./profile
Sampling at 49 Hertz of all threads by user + kernel stack... Hit Ctrl-C to end.
^C
    ffffffff81189249 filemap_map_pages
    ffffffff811bd3f5 handle_mm_fault
    ffffffff81065990 __do_page_fault
    ffffffff81065caf do_page_fault
    ffffffff817ce228 page_fault
    00007fed989afcc0 [unknown]
    -                cp (9036)
        1
[...]

    ffffffff8105eb66 native_safe_halt
    ffffffff8103659e default_idle
    ffffffff81036d1f arch_cpu_idle
    ffffffff810bba5a default_idle_call
    ffffffff810bbd07 cpu_startup_entry
    ffffffff817bf4a7 rest_init
    ffffffff81d65f58 start_kernel
    ffffffff81d652db x86_64_start_reservations
    ffffffff81d65418 x86_64_start_kernel
    -                swapper/0 (0)
        72

    ffffffff8105eb66 native_safe_halt
    ffffffff8103659e default_idle
    ffffffff81036d1f arch_cpu_idle
    ffffffff810bba5a default_idle_call
    ffffffff810bbd07 cpu_startup_entry
    ffffffff8104df55 start_secondary
    -                swapper/1 (0)
        75

Tool and examples are on github [1][2]. Is this sufficient for this
patch? If not, I could rewrite something for samples/bpf (eg, an IP
sampler, or a task priority sampler), which I may do anyway as a
follow-on if they turn out to be nice examples.

>
> Also, what would be the value when BPF is not used?
>

No big reason comes to mind. I could imagine it might be useful when
debugging perf's sampling behavior, and there might be uses with
ftrace as well. But the big reason is extending perf's existing
sampling capabilities for in-kernel frequency counts of stack traces
(which could include custom BPF-based stack walkers), IP, task
priority, etc. Thanks,

Brendan

[1] https://github.com/iovisor/bcc/blob/master/tools/profile.py
[2] https://github.com/iovisor/bcc/blob/master/tools/profile_example.txt

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] perf/core: Add a tracepoint for perf sampling
  2016-07-29 18:05 ` Brendan Gregg
@ 2016-07-30  3:34   ` Wangnan (F)
  2016-08-03  2:44     ` Brendan Gregg
  0 siblings, 1 reply; 6+ messages in thread
From: Wangnan (F) @ 2016-07-30  3:34 UTC (permalink / raw)
  To: Brendan Gregg, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Alexander Shishkin
  Cc: linux-kernel, Alexei Starovoitov



On 2016/7/30 2:05, Brendan Gregg wrote:
> On Tue, Jul 19, 2016 at 4:20 PM, Brendan Gregg <bgregg@netflix.com> wrote:
>> When perf is performing hrtimer-based sampling, this tracepoint can be used
>> by BPF to run additional logic on each sample. For example, BPF can fetch
>> stack traces and frequency count them in kernel context, for an efficient
>> profiler.
> Any comments on this patch? Thanks,
>
> Brendan

Sorry for the late reply.

I think it is a useful feature. Could you please provide an example
to show how to use it in perf?

If I understand correctly, I can have a BPF script run 99 times per
second using

   # perf -e cpu-clock/freq=99/ -e mybpf.c ...

And in mybpf.c, attach a BPF script on the new tracepoint. Right?

Also, since we already have timer:hrtimer_expire_entry, please provide
some further information about why we need a new tracepoint.

Thank you.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] perf/core: Add a tracepoint for perf sampling
  2016-07-30  3:34   ` Wangnan (F)
@ 2016-08-03  2:44     ` Brendan Gregg
  0 siblings, 0 replies; 6+ messages in thread
From: Brendan Gregg @ 2016-08-03  2:44 UTC (permalink / raw)
  To: Wangnan (F)
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, linux-kernel, Alexei Starovoitov

On Fri, Jul 29, 2016 at 8:34 PM, Wangnan (F) <wangnan0@huawei.com> wrote:
>
>
> On 2016/7/30 2:05, Brendan Gregg wrote:
>>
>> On Tue, Jul 19, 2016 at 4:20 PM, Brendan Gregg <bgregg@netflix.com> wrote:
>>>
>>> When perf is performing hrtimer-based sampling, this tracepoint can be
>>> used
>>> by BPF to run additional logic on each sample. For example, BPF can fetch
>>> stack traces and frequency count them in kernel context, for an efficient
>>> profiler.
>>
>> Any comments on this patch? Thanks,
>>
>> Brendan
>
>
> Sorry for the late reply.
>
> I think it is a useful feature. Could you please provide an example
> to show how to use it in perf?

Yes, the following example samples at 999 Hertz, and emits the
instruction pointer only when it is within a custom address range, as
checked by BPF. Eg:

# ./perf record -e bpf-output/no-inherit,name=evt/ \
    -e ./sampleip_range.c/map:channel.event=evt/ \
    -a ./perf record -F 999 -e cpu-clock -N -a -o /dev/null sleep 5
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.000 MB /dev/null ]
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.134 MB perf.data (222 samples) ]

# ./perf script -F comm,pid,time,bpf-output
'bpf-output' not valid for hardware events. Ignoring.
'bpf-output' not valid for unknown events. Ignoring.
'bpf-output' not valid for unknown events. Ignoring.
              dd  6501  3058.117379:
      BPF output: 0000: 3c 4c 21 81 ff ff ff ff  <L!.....
                  0008: 00 00 00 00              ....

              dd  6501  3058.130392:
      BPF output: 0000: 55 4c 21 81 ff ff ff ff  UL!.....
                  0008: 00 00 00 00              ....

              dd  6501  3058.131393:
      BPF output: 0000: 55 4c 21 81 ff ff ff ff  UL!.....
                  0008: 00 00 00 00              ....

              dd  6501  3058.149411:
      BPF output: 0000: e1 4b 21 81 ff ff ff ff  .K!.....
                  0008: 00 00 00 00              ....

              dd  6501  3058.155417:
      BPF output: 0000: 76 4c 21 81 ff ff ff ff  vL!.....
                  0008: 00 00 00 00              ....

For that example, perf is running a BPF program to emit filtered
details, and running a second perf to configure sampling. We can
certainly improve how this works. And this will be much more
interesting once perf can emit maps, and a perf BPF program can
populate a map.

Here's sampleip_range.c:

/************************ BEGIN **************************/
#include <uapi/linux/bpf.h>
#include <uapi/linux/ptrace.h>

/* Place the annotated object in ELF section NAME and mark it as used. */
#define SEC(NAME) __attribute__((section(NAME), used))

/*
 * Edit the following to match the instruction address range you want to
 * sample. Eg, look in /proc/kallsyms. The addresses will change for each
 * kernel version and build.
 *
 * The range is half-open: func() matches RANGE_START <= ip < RANGE_END.
 */
#define RANGE_START  0xffffffff81214b90
#define RANGE_END    0xffffffff81214cd0

/*
 * Describes one BPF map. The "channel" instance of this struct is the one
 * placed in the "maps" section of this program.
 */
struct bpf_map_def {
	unsigned int type;		/* BPF_MAP_TYPE_* value */
	unsigned int key_size;		/* size of one key, in bytes */
	unsigned int value_size;	/* size of one value, in bytes */
	unsigned int max_entries;	/* number of entries in the map */
};

/*
 * BPF helper entry points used by func(). Each pointer is initialised from
 * the helper's BPF_FUNC_* identifier cast to a function pointer, not from a
 * real function address.
 */
static int (*probe_read)(void *dst, int size, void *src) =
	(void *)BPF_FUNC_probe_read;

static int (*get_smp_processor_id)(void) =
	(void *)BPF_FUNC_get_smp_processor_id;

static int (*perf_event_output)(void *ctx, struct bpf_map_def *map,
				int index, void *data,
				unsigned long size) =
	(void *)BPF_FUNC_perf_event_output;

/*
 * Perf-event array that func() emits samples into; it has __NR_CPUS__
 * entries and is indexed by func() with get_smp_processor_id().
 */
struct bpf_map_def SEC("maps") channel = {
	.type		= BPF_MAP_TYPE_PERF_EVENT_ARRAY,
	.key_size	= sizeof(int),
	.value_size	= sizeof(u32),
	.max_entries	= __NR_CPUS__,
};

/*
 * Context layout passed to func(), taken
 * from /sys/kernel/debug/tracing/events/perf/perf_hrtimer/format
 */
struct perf_hrtimer_args {
	unsigned long long pad;		/* presumably the common trace-entry header - TODO confirm */
	struct pt_regs *regs;		/* "regs" field recorded by the tracepoint */
	struct perf_event *event;	/* "event" field recorded by the tracepoint */
};
/*
 * Runs on each perf:perf_hrtimer tracepoint hit. Copies the sampled
 * registers out of kernel memory and, when the instruction pointer lies in
 * [RANGE_START, RANGE_END), emits it on the "channel" map at the index of
 * the current CPU.
 *
 * The tracepoint passes through whatever get_irq_regs() returned, before
 * the kernel side tests it for NULL, so ctx->regs may be NULL here. A failed
 * copy is therefore treated as "no sample" explicitly instead of relying on
 * the zero-initialised ip falling outside the range.
 *
 * Always returns 0.
 */
SEC("perf:perf_hrtimer")
int func(struct perf_hrtimer_args *ctx)
{
	struct pt_regs regs = {};

	/* Non-zero return means the registers could not be read. */
	if (probe_read(&regs, sizeof(regs), ctx->regs))
		return 0;

	/* Only report instruction pointers inside the configured range. */
	if (regs.ip < RANGE_START || regs.ip >= RANGE_END)
		return 0;

	perf_event_output(ctx, &channel, get_smp_processor_id(),
			  &regs.ip, sizeof(regs.ip));
	return 0;
}

/* License string, placed in the "license" section. */
char _license[] SEC("license") = "GPL";
/* LINUX_VERSION_CODE of the build, placed in the "version" section. */
int _version SEC("version") = LINUX_VERSION_CODE;
/************************* END ***************************/

>
> If I understand correctly, I can have a BPF script run 99 times per
> second using
>
>   # perf -e cpu-clock/freq=99/ -e mybpf.c ...
>
> And in mybpf.c, attach a BPF script on the new tracepoint. Right?
>
> Also, since we already have timer:hrtimer_expire_entry, please provide
> some further information about why we need a new tracepoint.

timer:hrtimer_expire_entry fires for much more than just the perf
timer. The perf:perf_hrtimer tracepoint also has registers and perf
context as arguments, which can be used for profiling programs.

Thanks for the comments,

Brendan

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2016-08-03  2:51 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-07-19 23:20 [PATCH] perf/core: Add a tracepoint for perf sampling Brendan Gregg
2016-07-29 18:05 ` Brendan Gregg
2016-07-30  3:34   ` Wangnan (F)
2016-08-03  2:44     ` Brendan Gregg
2016-07-29 19:21 ` Arnaldo Carvalho de Melo
2016-07-29 19:55   ` Brendan Gregg

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.