[RFC] perf/x86: make perf callchain work without CONFIG_FRAME_POINTER
diff mbox series

Message ID 20190404172545.20928-1-kasong@redhat.com
State New, archived
Headers show
Series
  • [RFC] perf/x86: make perf callchain work without CONFIG_FRAME_POINTER
Related show

Commit Message

Kairui Song April 4, 2019, 5:25 p.m. UTC
Currently perf callchain is not working properly with ORC unwinder,
we'll get useless in kernel callchain like this:

perf  6429 [000]    22.498450:             kmem:mm_page_alloc: page=0x176a17 pfn=1534487 order=0 migratetype=0 gfp_flags=GFP_KERNEL
        ffffffffbe23e32e __alloc_pages_nodemask+0x22e (/lib/modules/5.1.0-rc3+/build/vmlinux)
            7efdf7f7d3e8 __poll+0x18 (/usr/lib64/libc-2.28.so)
            5651468729c1 [unknown] (/usr/bin/perf)
            5651467ee82a main+0x69a (/usr/bin/perf)
            7efdf7eaf413 __libc_start_main+0xf3 (/usr/lib64/libc-2.28.so)
        5541f689495641d7 [unknown] ([unknown])

Without CONFIG_FRAME_POINTER, bp is not reserved as the frame pointer, so
we can't get the caller's frame pointer; instead, the current frame pointer
is returned when trying to fetch caller registers. The unwinder will error
out early and end the stacktrace prematurely.

So instead of letting the unwinder start with the dumped registers, we start
it right where the unwinding started when the stacktrace is triggered by a
trace event directly, and skip frames until the frame pointer is reached.

This makes the callchain get the full in kernel stacktrace again:

perf  6503 [000]  1567.570191:             kmem:mm_page_alloc: page=0x16c904 pfn=1493252 order=0 migratetype=0 gfp_flags=GFP_KERNEL
        ffffffffb523e2ae __alloc_pages_nodemask+0x22e (/lib/modules/5.1.0-rc3+/build/vmlinux)
        ffffffffb52383bd __get_free_pages+0xd (/lib/modules/5.1.0-rc3+/build/vmlinux)
        ffffffffb52fd28a __pollwait+0x8a (/lib/modules/5.1.0-rc3+/build/vmlinux)
        ffffffffb521426f perf_poll+0x2f (/lib/modules/5.1.0-rc3+/build/vmlinux)
        ffffffffb52fe3e2 do_sys_poll+0x252 (/lib/modules/5.1.0-rc3+/build/vmlinux)
        ffffffffb52ff027 __x64_sys_poll+0x37 (/lib/modules/5.1.0-rc3+/build/vmlinux)
        ffffffffb500418b do_syscall_64+0x5b (/lib/modules/5.1.0-rc3+/build/vmlinux)
        ffffffffb5a0008c entry_SYSCALL_64_after_hwframe+0x44 (/lib/modules/5.1.0-rc3+/build/vmlinux)
            7f71e92d03e8 __poll+0x18 (/usr/lib64/libc-2.28.so)
            55a22960d9c1 [unknown] (/usr/bin/perf)
            55a22958982a main+0x69a (/usr/bin/perf)
            7f71e9202413 __libc_start_main+0xf3 (/usr/lib64/libc-2.28.so)
        5541f689495641d7 [unknown] ([unknown])

----

Just found that with the ORC unwinder the perf callchain is unusable, and this
seems to fix it well; any suggestion is welcome, thanks!

---
 arch/x86/events/core.c     | 34 ++++++++++++++++++++++++++++------
 include/linux/perf_event.h |  3 ++-
 kernel/bpf/stackmap.c      |  4 ++--
 kernel/events/callchain.c  | 13 +++++++++++--
 kernel/events/core.c       |  2 +-
 5 files changed, 44 insertions(+), 12 deletions(-)

Comments

Peter Zijlstra April 5, 2019, 7:17 a.m. UTC | #1
And you forgot to Cc Josh..

On Fri, Apr 05, 2019 at 01:25:45AM +0800, Kairui Song wrote:
> Currently perf callchain is not working properly with ORC unwinder,
> we'll get useless in kernel callchain like this:
> 
> perf  6429 [000]    22.498450:             kmem:mm_page_alloc: page=0x176a17 pfn=1534487 order=0 migratetype=0 gfp_flags=GFP_KERNEL
>         ffffffffbe23e32e __alloc_pages_nodemask+0x22e (/lib/modules/5.1.0-rc3+/build/vmlinux)
>             7efdf7f7d3e8 __poll+0x18 (/usr/lib64/libc-2.28.so)
>             5651468729c1 [unknown] (/usr/bin/perf)
>             5651467ee82a main+0x69a (/usr/bin/perf)
>             7efdf7eaf413 __libc_start_main+0xf3 (/usr/lib64/libc-2.28.so)
>         5541f689495641d7 [unknown] ([unknown])
> 
> Without CONFIG_FRAME_POINTER, bp is not reserved as frame pointer so
> can't get callers frame pointer, instead current frame pointer is
> returned when trying to fetch caller registers. The unwinder will error
> out early, and end the stacktrace early.
> 
> So instead of let the unwinder start with the dumped register, we start
> it right where the unwinding started when the stacktrace is triggered by
> trace event directly. And skip until the frame pointer is reached.
> 
> This makes the callchain get the full in kernel stacktrace again:
> 
> perf  6503 [000]  1567.570191:             kmem:mm_page_alloc: page=0x16c904 pfn=1493252 order=0 migratetype=0 gfp_flags=GFP_KERNEL
>         ffffffffb523e2ae __alloc_pages_nodemask+0x22e (/lib/modules/5.1.0-rc3+/build/vmlinux)
>         ffffffffb52383bd __get_free_pages+0xd (/lib/modules/5.1.0-rc3+/build/vmlinux)
>         ffffffffb52fd28a __pollwait+0x8a (/lib/modules/5.1.0-rc3+/build/vmlinux)
>         ffffffffb521426f perf_poll+0x2f (/lib/modules/5.1.0-rc3+/build/vmlinux)
>         ffffffffb52fe3e2 do_sys_poll+0x252 (/lib/modules/5.1.0-rc3+/build/vmlinux)
>         ffffffffb52ff027 __x64_sys_poll+0x37 (/lib/modules/5.1.0-rc3+/build/vmlinux)
>         ffffffffb500418b do_syscall_64+0x5b (/lib/modules/5.1.0-rc3+/build/vmlinux)
>         ffffffffb5a0008c entry_SYSCALL_64_after_hwframe+0x44 (/lib/modules/5.1.0-rc3+/build/vmlinux)
>             7f71e92d03e8 __poll+0x18 (/usr/lib64/libc-2.28.so)
>             55a22960d9c1 [unknown] (/usr/bin/perf)
>             55a22958982a main+0x69a (/usr/bin/perf)
>             7f71e9202413 __libc_start_main+0xf3 (/usr/lib64/libc-2.28.so)
>         5541f689495641d7 [unknown] ([unknown])
> 
> ----
> 
> Just found with ORC unwinder the perf callchain is unusable, and this
> seems fixes it well, any suggestion is welcome, thanks!

That whole .direct stuff is horrible crap.

> ---
>  arch/x86/events/core.c     | 34 ++++++++++++++++++++++++++++------
>  include/linux/perf_event.h |  3 ++-
>  kernel/bpf/stackmap.c      |  4 ++--
>  kernel/events/callchain.c  | 13 +++++++++++--
>  kernel/events/core.c       |  2 +-
>  5 files changed, 44 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
> index e2b1447192a8..3f3e110794ac 100644
> --- a/arch/x86/events/core.c
> +++ b/arch/x86/events/core.c
> @@ -2355,8 +2355,9 @@ void arch_perf_update_userpage(struct perf_event *event,
>  	cyc2ns_read_end();
>  }
>  
> -void
> -perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
> +static void
> +__perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs,
> +			bool direct_call)
>  {
>  	struct unwind_state state;
>  	unsigned long addr;
> @@ -2366,17 +2367,38 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re
>  		return;
>  	}
>  
> -	if (perf_callchain_store(entry, regs->ip))
> -		return;
> +	/*
> +	 * Without frame pointer, we can't get a reliable caller bp value.
> +	 * If this is called directly from a trace point, just start the
> +	 * unwind from here and skip until the frame is reached.
> +	 */
> +	if (IS_ENABLED(CONFIG_FRAME_POINTER) || !direct_call) {
> +		if (perf_callchain_store(entry, regs->ip))
> +			return;
> +		unwind_start(&state, current, regs, NULL);
> +	} else {
> +		unwind_start(&state, current, NULL, (unsigned long*)regs->bp);
> +	}
>  
> -	for (unwind_start(&state, current, regs, NULL); !unwind_done(&state);
> -	     unwind_next_frame(&state)) {
> +	for (; !unwind_done(&state); unwind_next_frame(&state)) {
>  		addr = unwind_get_return_address(&state);
>  		if (!addr || perf_callchain_store(entry, addr))
>  			return;
>  	}
>  }
>  
> +void
> +perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
> +{
> +	__perf_callchain_kernel(entry, regs, false);
> +}
> +
> +void
> +perf_callchain_kernel_direct(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
> +{
> +	__perf_callchain_kernel(entry, regs, true);
> +}
> +
>  static inline int
>  valid_user_frame(const void __user *fp, unsigned long size)
>  {
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index e47ef764f613..b0e33ba36695 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -1154,9 +1154,10 @@ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);
>  
>  extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
>  extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
> +extern void perf_callchain_kernel_direct(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
>  extern struct perf_callchain_entry *
>  get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
> -		   u32 max_stack, bool crosstask, bool add_mark);
> +		   u32 max_stack, bool crosstask, bool add_mark, bool direct_call);
>  extern struct perf_callchain_entry *perf_callchain(struct perf_event *event, struct pt_regs *regs);
>  extern int get_callchain_buffers(int max_stack);
>  extern void put_callchain_buffers(void);
> diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
> index 950ab2f28922..376d774da6b5 100644
> --- a/kernel/bpf/stackmap.c
> +++ b/kernel/bpf/stackmap.c
> @@ -368,7 +368,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
>  		return -EINVAL;
>  
>  	trace = get_perf_callchain(regs, init_nr, kernel, user,
> -				   sysctl_perf_event_max_stack, false, false);
> +				   sysctl_perf_event_max_stack, false, false, false);
>  
>  	if (unlikely(!trace))
>  		/* couldn't fetch the stack trace */
> @@ -476,7 +476,7 @@ BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
>  	else
>  		init_nr = sysctl_perf_event_max_stack - num_elem;
>  	trace = get_perf_callchain(regs, init_nr, kernel, user,
> -				   sysctl_perf_event_max_stack, false, false);
> +				   sysctl_perf_event_max_stack, false, false, false);
>  	if (unlikely(!trace))
>  		goto err_fault;
>  
> diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
> index c2b41a263166..4cb5861f57ee 100644
> --- a/kernel/events/callchain.c
> +++ b/kernel/events/callchain.c
> @@ -40,6 +40,12 @@ __weak void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
>  {
>  }
>  
> +__weak void perf_callchain_kernel_direct(struct perf_callchain_entry_ctx *entry,
> +				  struct pt_regs *regs)
> +{
> +	perf_callchain_kernel(entry, regs);
> +}
> +
>  __weak void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
>  				struct pt_regs *regs)
>  {
> @@ -176,7 +182,7 @@ put_callchain_entry(int rctx)
>  
>  struct perf_callchain_entry *
>  get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
> -		   u32 max_stack, bool crosstask, bool add_mark)
> +		   u32 max_stack, bool crosstask, bool add_mark, bool direct_call)
>  {
>  	struct perf_callchain_entry *entry;
>  	struct perf_callchain_entry_ctx ctx;
> @@ -198,7 +204,10 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
>  	if (kernel && !user_mode(regs)) {
>  		if (add_mark)
>  			perf_callchain_store_context(&ctx, PERF_CONTEXT_KERNEL);
> -		perf_callchain_kernel(&ctx, regs);
> +		if (direct_call)
> +			perf_callchain_kernel_direct(&ctx, regs);
> +		else
> +			perf_callchain_kernel(&ctx, regs);
>  	}
>  
>  	if (user) {
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 72d06e302e99..af9fb9b03283 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -6377,7 +6377,7 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
>  		return &__empty_callchain;
>  
>  	callchain = get_perf_callchain(regs, 0, kernel, user,
> -				       max_stack, crosstask, true);
> +				       max_stack, crosstask, true, true);
>  	return callchain ?: &__empty_callchain;
>  }
>  
> -- 
> 2.20.1
>
Josh Poimboeuf April 5, 2019, 2:09 p.m. UTC | #2
On Fri, Apr 05, 2019 at 01:25:45AM +0800, Kairui Song wrote:
> Currently perf callchain is not working properly with ORC unwinder,
> we'll get useless in kernel callchain like this:
> 
> perf  6429 [000]    22.498450:             kmem:mm_page_alloc: page=0x176a17 pfn=1534487 order=0 migratetype=0 gfp_flags=GFP_KERNEL
>         ffffffffbe23e32e __alloc_pages_nodemask+0x22e (/lib/modules/5.1.0-rc3+/build/vmlinux)
>             7efdf7f7d3e8 __poll+0x18 (/usr/lib64/libc-2.28.so)
>             5651468729c1 [unknown] (/usr/bin/perf)
>             5651467ee82a main+0x69a (/usr/bin/perf)
>             7efdf7eaf413 __libc_start_main+0xf3 (/usr/lib64/libc-2.28.so)
>         5541f689495641d7 [unknown] ([unknown])
> 
> Without CONFIG_FRAME_POINTER, bp is not reserved as frame pointer so
> can't get callers frame pointer, instead current frame pointer is
> returned when trying to fetch caller registers. The unwinder will error
> out early, and end the stacktrace early.
> 
> So instead of let the unwinder start with the dumped register, we start
> it right where the unwinding started when the stacktrace is triggered by
> trace event directly. And skip until the frame pointer is reached.
> 
> This makes the callchain get the full in kernel stacktrace again:
> 
> perf  6503 [000]  1567.570191:             kmem:mm_page_alloc: page=0x16c904 pfn=1493252 order=0 migratetype=0 gfp_flags=GFP_KERNEL
>         ffffffffb523e2ae __alloc_pages_nodemask+0x22e (/lib/modules/5.1.0-rc3+/build/vmlinux)
>         ffffffffb52383bd __get_free_pages+0xd (/lib/modules/5.1.0-rc3+/build/vmlinux)
>         ffffffffb52fd28a __pollwait+0x8a (/lib/modules/5.1.0-rc3+/build/vmlinux)
>         ffffffffb521426f perf_poll+0x2f (/lib/modules/5.1.0-rc3+/build/vmlinux)
>         ffffffffb52fe3e2 do_sys_poll+0x252 (/lib/modules/5.1.0-rc3+/build/vmlinux)
>         ffffffffb52ff027 __x64_sys_poll+0x37 (/lib/modules/5.1.0-rc3+/build/vmlinux)
>         ffffffffb500418b do_syscall_64+0x5b (/lib/modules/5.1.0-rc3+/build/vmlinux)
>         ffffffffb5a0008c entry_SYSCALL_64_after_hwframe+0x44 (/lib/modules/5.1.0-rc3+/build/vmlinux)
>             7f71e92d03e8 __poll+0x18 (/usr/lib64/libc-2.28.so)
>             55a22960d9c1 [unknown] (/usr/bin/perf)
>             55a22958982a main+0x69a (/usr/bin/perf)
>             7f71e9202413 __libc_start_main+0xf3 (/usr/lib64/libc-2.28.so)
>         5541f689495641d7 [unknown] ([unknown])
> 
> ----
> 
> Just found with ORC unwinder the perf callchain is unusable, and this
> seems fixes it well, any suggestion is welcome, thanks!

Hi Kairui,

Without CONFIG_FRAME_POINTER, the BP register has no meaning, so I don't
see how this patch could work.

Also, perf stack traces seem to work fine for me with ORC.  Can you give
some details on how to recreate the issue?
Kairui Song April 5, 2019, 3:13 p.m. UTC | #3
On Fri, Apr 5, 2019 at 10:09 PM Josh Poimboeuf <jpoimboe@redhat.com> wrote:
>
> On Fri, Apr 05, 2019 at 01:25:45AM +0800, Kairui Song wrote:
> > Currently perf callchain is not working properly with ORC unwinder,
> > we'll get useless in kernel callchain like this:
> >
> > perf  6429 [000]    22.498450:             kmem:mm_page_alloc: page=0x176a17 pfn=1534487 order=0 migratetype=0 gfp_flags=GFP_KERNEL
> >         ffffffffbe23e32e __alloc_pages_nodemask+0x22e (/lib/modules/5.1.0-rc3+/build/vmlinux)
> >             7efdf7f7d3e8 __poll+0x18 (/usr/lib64/libc-2.28.so)
> >             5651468729c1 [unknown] (/usr/bin/perf)
> >             5651467ee82a main+0x69a (/usr/bin/perf)
> >             7efdf7eaf413 __libc_start_main+0xf3 (/usr/lib64/libc-2.28.so)
> >         5541f689495641d7 [unknown] ([unknown])
> >
> > Without CONFIG_FRAME_POINTER, bp is not reserved as frame pointer so
> > can't get callers frame pointer, instead current frame pointer is
> > returned when trying to fetch caller registers. The unwinder will error
> > out early, and end the stacktrace early.
> >
> > So instead of let the unwinder start with the dumped register, we start
> > it right where the unwinding started when the stacktrace is triggered by
> > trace event directly. And skip until the frame pointer is reached.
> >
> > This makes the callchain get the full in kernel stacktrace again:
> >
> > perf  6503 [000]  1567.570191:             kmem:mm_page_alloc: page=0x16c904 pfn=1493252 order=0 migratetype=0 gfp_flags=GFP_KERNEL
> >         ffffffffb523e2ae __alloc_pages_nodemask+0x22e (/lib/modules/5.1.0-rc3+/build/vmlinux)
> >         ffffffffb52383bd __get_free_pages+0xd (/lib/modules/5.1.0-rc3+/build/vmlinux)
> >         ffffffffb52fd28a __pollwait+0x8a (/lib/modules/5.1.0-rc3+/build/vmlinux)
> >         ffffffffb521426f perf_poll+0x2f (/lib/modules/5.1.0-rc3+/build/vmlinux)
> >         ffffffffb52fe3e2 do_sys_poll+0x252 (/lib/modules/5.1.0-rc3+/build/vmlinux)
> >         ffffffffb52ff027 __x64_sys_poll+0x37 (/lib/modules/5.1.0-rc3+/build/vmlinux)
> >         ffffffffb500418b do_syscall_64+0x5b (/lib/modules/5.1.0-rc3+/build/vmlinux)
> >         ffffffffb5a0008c entry_SYSCALL_64_after_hwframe+0x44 (/lib/modules/5.1.0-rc3+/build/vmlinux)
> >             7f71e92d03e8 __poll+0x18 (/usr/lib64/libc-2.28.so)
> >             55a22960d9c1 [unknown] (/usr/bin/perf)
> >             55a22958982a main+0x69a (/usr/bin/perf)
> >             7f71e9202413 __libc_start_main+0xf3 (/usr/lib64/libc-2.28.so)
> >         5541f689495641d7 [unknown] ([unknown])
> >
> > ----
> >
> > Just found with ORC unwinder the perf callchain is unusable, and this
> > seems fixes it well, any suggestion is welcome, thanks!
>
> Hi Kairui,
>
> Without CONFIG_FRAME_POINTER, the BP register has no meaning, so I don't
> see how this patch could work.
>
> Also, perf stack traces seem to work fine for me with ORC.  Can you give
> some details on how to recreate the issue?
>
> --
> Josh


Hi Josh, thanks for the review, I tried again, using latest upstream
kernel commit ea2cec24c8d429ee6f99040e4eb6c7ad627fe777:
# uname -a
Linux localhost.localdomain 5.1.0-rc3+ #29 SMP Fri Apr 5 22:53:05 CST
2019 x86_64 x86_64 x86_64 GNU/Linux

Having following config:
> CONFIG_UNWINDER_ORC=y
> # CONFIG_UNWINDER_FRAME_POINTER is not set
and CONFIG_FRAME_POINTER is off too.

Then record something with perf (also latest upstream version):
./perf record -g -e kmem:* -c 1

Interrupt it, then view the output:
perf script | less

Then I noticed the stacktrace in the kernel is incomplete, like the following.
Did I miss anything?
--------------
lvmetad   617 [000]    55.600786:                     kmem:kfree:
call_site=ffffffffb219e269 ptr=(nil)
        ffffffffb22b2d1c kfree+0x11c (/lib/modules/5.1.0-rc3+/build/vmlinux)
            7fba7e58fd0f __select+0x5f (/usr/lib64/libc-2.28.so)

kworker/u2:5-rp   171 [000]    55.628529:
kmem:kmem_cache_alloc: call_site=ffffffffb20e963d
ptr=0xffffa07f39c581e0 bytes_req=80 bytes_alloc=80
gfp_flags=GFP_ATOMIC
        ffffffffb22b0dec kmem_cache_alloc+0x13c
(/lib/modules/5.1.0-rc3+/build/vmlinux)
-------------

And for the patch, I debugged the problem and found how it happened:
The reason is that we use the following code for fetching the registers on
a trace point:
...snip...
#define perf_arch_fetch_caller_regs(regs, __ip) { \
(regs)->ip = (__ip); \
(regs)->bp = caller_frame_pointer(); \
(regs)->cs = __KERNEL_CS;
...snip...

It tries to dump the registers of caller, but in the definition of
caller_frame_pointer:
static inline unsigned long caller_frame_pointer(void)
{
struct stack_frame *frame;

frame = __builtin_frame_address(0);

#ifdef CONFIG_FRAME_POINTER
frame = frame->next_frame;
#endif

return (unsigned long)frame;
}

Without CONFIG_FRAME_POINTER, current frame pointer is returned instead.

The fetched BP value should still be valid in this case. Maybe it's
not the real BP value, but it should be the frame pointer, as it's
returned by __builtin_frame_address. But this will cause the regs
variable to contain the IP of the caller function (the function actually
trying to trace) and the BP of the current function (an event probe
function). The unwinder will error out.

I think there is no way to get the caller's frame without calling the
unwinder, but issuing an unwinding call on every event tracing point would be
overkill.
For the call chain, regs here is only used to tell the unwinder to start from
the given stack, IP and frame. If it's unusable, just start the
unwinder directly when the unwinder is invoked directly inside the event
trace point. That would guarantee we will unwind along the event path, and
we just skip all the parts we don't want.

With this patch, it worked — I got the full stacktrace again, so I
assume I got it right... let me know if I'm making any mistake, thank
you!
Kairui Song April 5, 2019, 3:21 p.m. UTC | #4
On Fri, Apr 5, 2019 at 3:17 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> And you forgot to Cc Josh..
>

Hi, thanks for the reply and for Cc'ing more people. I just copied the list
from ./scripts/get_maintainer.pl; I will pay more attention next time.

> >
> > Just found with ORC unwinder the perf callchain is unusable, and this
> > seems fixes it well, any suggestion is welcome, thanks!
>
> That whole .direct stuff is horrible crap.
>

Sorry if I did anything dumb, but I didn't find a better way to make
it work, so I sent this RFC... Would you mind telling me what I'm doing
wrong, or giving any suggestion about how I should improve it?

--
Best Regards,
Kairui Song
Josh Poimboeuf April 5, 2019, 4:57 p.m. UTC | #5
On Fri, Apr 05, 2019 at 11:13:02PM +0800, Kairui Song wrote:
> Hi Josh, thanks for the review, I tried again, using latest upstream
> kernel commit ea2cec24c8d429ee6f99040e4eb6c7ad627fe777:
> # uname -a
> Linux localhost.localdomain 5.1.0-rc3+ #29 SMP Fri Apr 5 22:53:05 CST
> 2019 x86_64 x86_64 x86_64 GNU/Linux
> 
> Having following config:
> > CONFIG_UNWINDER_ORC=y
> > # CONFIG_UNWINDER_FRAME_POINTER is not set
> and CONFIG_FRAME_POINTER is off too.
> 
> Then record something with perf (also latest upstream version):
> ./perf record -g -e kmem:* -c 1
> 
> Interrupt it, then view the output:
> perf script | less
> 
> Then I notice the stacktrace in kernle is incomplete like following.
> Did I miss anything?
> --------------
> lvmetad   617 [000]    55.600786:                     kmem:kfree:
> call_site=ffffffffb219e269 ptr=(nil)
>         ffffffffb22b2d1c kfree+0x11c (/lib/modules/5.1.0-rc3+/build/vmlinux)
>             7fba7e58fd0f __select+0x5f (/usr/lib64/libc-2.28.so)
> 
> kworker/u2:5-rp   171 [000]    55.628529:
> kmem:kmem_cache_alloc: call_site=ffffffffb20e963d
> ptr=0xffffa07f39c581e0 bytes_req=80 bytes_alloc=80
> gfp_flags=GFP_ATOMIC
>         ffffffffb22b0dec kmem_cache_alloc+0x13c
> (/lib/modules/5.1.0-rc3+/build/vmlinux)
> -------------
> 
> And for the patch, I debugged the problem, and found how it happend:
> The reason is that we use following code for fetching the registers on
> a trace point:
> ...snip...
> #define perf_arch_fetch_caller_regs(regs, __ip) { \
> (regs)->ip = (__ip); \
> (regs)->bp = caller_frame_pointer(); \
> (regs)->cs = __KERNEL_CS;
> ...snip...

Thanks, I was able to recreate.  It only happens when unwinding from a
tracepoint.  I haven't investigated yet, but
perf_arch_fetch_caller_regs() looks highly suspect, since it's doing
(regs)->bp = caller_frame_pointer(), even for ORC.

My only explanation for how your patch works is that RBP just happens to
point to somewhere higher on the stack, causing the unwinder to start at
a semi-random location.  I suspect the real "fix" is that you're no
longer passing the regs to unwind_start().
Kairui Song April 5, 2019, 5:05 p.m. UTC | #6
On Sat, Apr 6, 2019 at 12:57 AM Josh Poimboeuf <jpoimboe@redhat.com> wrote:
>
> On Fri, Apr 05, 2019 at 11:13:02PM +0800, Kairui Song wrote:
> > Hi Josh, thanks for the review, I tried again, using latest upstream
> > kernel commit ea2cec24c8d429ee6f99040e4eb6c7ad627fe777:
> > # uname -a
> > Linux localhost.localdomain 5.1.0-rc3+ #29 SMP Fri Apr 5 22:53:05 CST
> > 2019 x86_64 x86_64 x86_64 GNU/Linux
> >
> > Having following config:
> > > CONFIG_UNWINDER_ORC=y
> > > # CONFIG_UNWINDER_FRAME_POINTER is not set
> > and CONFIG_FRAME_POINTER is off too.
> >
> > Then record something with perf (also latest upstream version):
> > ./perf record -g -e kmem:* -c 1
> >
> > Interrupt it, then view the output:
> > perf script | less
> >
> > Then I notice the stacktrace in kernle is incomplete like following.
> > Did I miss anything?
> > --------------
> > lvmetad   617 [000]    55.600786:                     kmem:kfree:
> > call_site=ffffffffb219e269 ptr=(nil)
> >         ffffffffb22b2d1c kfree+0x11c (/lib/modules/5.1.0-rc3+/build/vmlinux)
> >             7fba7e58fd0f __select+0x5f (/usr/lib64/libc-2.28.so)
> >
> > kworker/u2:5-rp   171 [000]    55.628529:
> > kmem:kmem_cache_alloc: call_site=ffffffffb20e963d
> > ptr=0xffffa07f39c581e0 bytes_req=80 bytes_alloc=80
> > gfp_flags=GFP_ATOMIC
> >         ffffffffb22b0dec kmem_cache_alloc+0x13c
> > (/lib/modules/5.1.0-rc3+/build/vmlinux)
> > -------------
> >
> > And for the patch, I debugged the problem, and found how it happend:
> > The reason is that we use following code for fetching the registers on
> > a trace point:
> > ...snip...
> > #define perf_arch_fetch_caller_regs(regs, __ip) { \
> > (regs)->ip = (__ip); \
> > (regs)->bp = caller_frame_pointer(); \
> > (regs)->cs = __KERNEL_CS;
> > ...snip...
>
> Thanks, I was able to recreate.  It only happens when unwinding from a
> tracepoint.  I haven't investigated yet, but
> perf_arch_fetch_caller_regs() looks highly suspect, since it's doing
> (regs)->bp = caller_frame_pointer(), even for ORC.
>
> My only explanation for how your patch works is that RBP just happens to
> point to somewhere higher on the stack, causing the unwinder to start at
> a semi-random location.  I suspect the real "fix" is that you're no
> longer passing the regs to unwind_start().
>

Yes, that's right. Simply not passing regs to unwind_start will let the
unwind start from the perf sample handling functions and introduce a
lot of "noise", so I let it skip the frames until it reaches the
frame of the trace point. The regs->bp should still point to the
stack base of the function which gets called in the tracepoint that
triggers the perf sample, so letting the unwinder skip all the frames above
it made it work.
Josh Poimboeuf April 5, 2019, 5:26 p.m. UTC | #7
On Sat, Apr 06, 2019 at 01:05:55AM +0800, Kairui Song wrote:
> On Sat, Apr 6, 2019 at 12:57 AM Josh Poimboeuf <jpoimboe@redhat.com> wrote:
> >
> > On Fri, Apr 05, 2019 at 11:13:02PM +0800, Kairui Song wrote:
> > > Hi Josh, thanks for the review, I tried again, using latest upstream
> > > kernel commit ea2cec24c8d429ee6f99040e4eb6c7ad627fe777:
> > > # uname -a
> > > Linux localhost.localdomain 5.1.0-rc3+ #29 SMP Fri Apr 5 22:53:05 CST
> > > 2019 x86_64 x86_64 x86_64 GNU/Linux
> > >
> > > Having following config:
> > > > CONFIG_UNWINDER_ORC=y
> > > > # CONFIG_UNWINDER_FRAME_POINTER is not set
> > > and CONFIG_FRAME_POINTER is off too.
> > >
> > > Then record something with perf (also latest upstream version):
> > > ./perf record -g -e kmem:* -c 1
> > >
> > > Interrupt it, then view the output:
> > > perf script | less
> > >
> > > Then I notice the stacktrace in kernle is incomplete like following.
> > > Did I miss anything?
> > > --------------
> > > lvmetad   617 [000]    55.600786:                     kmem:kfree:
> > > call_site=ffffffffb219e269 ptr=(nil)
> > >         ffffffffb22b2d1c kfree+0x11c (/lib/modules/5.1.0-rc3+/build/vmlinux)
> > >             7fba7e58fd0f __select+0x5f (/usr/lib64/libc-2.28.so)
> > >
> > > kworker/u2:5-rp   171 [000]    55.628529:
> > > kmem:kmem_cache_alloc: call_site=ffffffffb20e963d
> > > ptr=0xffffa07f39c581e0 bytes_req=80 bytes_alloc=80
> > > gfp_flags=GFP_ATOMIC
> > >         ffffffffb22b0dec kmem_cache_alloc+0x13c
> > > (/lib/modules/5.1.0-rc3+/build/vmlinux)
> > > -------------
> > >
> > > And for the patch, I debugged the problem, and found how it happend:
> > > The reason is that we use following code for fetching the registers on
> > > a trace point:
> > > ...snip...
> > > #define perf_arch_fetch_caller_regs(regs, __ip) { \
> > > (regs)->ip = (__ip); \
> > > (regs)->bp = caller_frame_pointer(); \
> > > (regs)->cs = __KERNEL_CS;
> > > ...snip...
> >
> > Thanks, I was able to recreate.  It only happens when unwinding from a
> > tracepoint.  I haven't investigated yet, but
> > perf_arch_fetch_caller_regs() looks highly suspect, since it's doing
> > (regs)->bp = caller_frame_pointer(), even for ORC.
> >
> > My only explanation for how your patch works is that RBP just happens to
> > point to somewhere higher on the stack, causing the unwinder to start at
> > a semi-random location.  I suspect the real "fix" is that you're no
> > longer passing the regs to unwind_start().
> >
> 
> Yes that's right. Simply not passing regs to unwind_start will let the
> unwind start from the perf sample handling functions, and introduce a
> lot of "noise", so I let it skipped the frames until it reached the
> frame of the trace point. The regs->bp should still points to the
> stack base of the function which get called in the tracepoint that
> trigger perf sample, so let unwinder skip all the frames above it made
> it work.

Ah, now I think I understand, thanks.  perf_arch_fetch_caller_regs()
puts it in regs->bp, and then perf_callchain_kernel() reads that value
to tell the unwinder where to start dumping the stack trace.  I guess
that explains why your patch works, though it still seems very odd that
perf_arch_fetch_caller_regs() is using regs->bp to store the frame
address.  Maybe regs->sp would be more appropriate.
Kairui Song April 5, 2019, 6:55 p.m. UTC | #8
On Sat, Apr 6, 2019 at 1:27 AM Josh Poimboeuf <jpoimboe@redhat.com> wrote:
>
> On Sat, Apr 06, 2019 at 01:05:55AM +0800, Kairui Song wrote:
> > On Sat, Apr 6, 2019 at 12:57 AM Josh Poimboeuf <jpoimboe@redhat.com> wrote:
> > >
> > > On Fri, Apr 05, 2019 at 11:13:02PM +0800, Kairui Song wrote:
> > > > Hi Josh, thanks for the review, I tried again, using latest upstream
> > > > kernel commit ea2cec24c8d429ee6f99040e4eb6c7ad627fe777:
> > > > # uname -a
> > > > Linux localhost.localdomain 5.1.0-rc3+ #29 SMP Fri Apr 5 22:53:05 CST
> > > > 2019 x86_64 x86_64 x86_64 GNU/Linux
> > > >
> > > > Having following config:
> > > > > CONFIG_UNWINDER_ORC=y
> > > > > # CONFIG_UNWINDER_FRAME_POINTER is not set
> > > > and CONFIG_FRAME_POINTER is off too.
> > > >
> > > > Then record something with perf (also latest upstream version):
> > > > ./perf record -g -e kmem:* -c 1
> > > >
> > > > Interrupt it, then view the output:
> > > > perf script | less
> > > >
> > > > Then I notice the stacktrace in kernle is incomplete like following.
> > > > Did I miss anything?
> > > > --------------
> > > > lvmetad   617 [000]    55.600786:                     kmem:kfree:
> > > > call_site=ffffffffb219e269 ptr=(nil)
> > > >         ffffffffb22b2d1c kfree+0x11c (/lib/modules/5.1.0-rc3+/build/vmlinux)
> > > >             7fba7e58fd0f __select+0x5f (/usr/lib64/libc-2.28.so)
> > > >
> > > > kworker/u2:5-rp   171 [000]    55.628529:
> > > > kmem:kmem_cache_alloc: call_site=ffffffffb20e963d
> > > > ptr=0xffffa07f39c581e0 bytes_req=80 bytes_alloc=80
> > > > gfp_flags=GFP_ATOMIC
> > > >         ffffffffb22b0dec kmem_cache_alloc+0x13c
> > > > (/lib/modules/5.1.0-rc3+/build/vmlinux)
> > > > -------------
> > > >
> > > > And for the patch, I debugged the problem, and found how it happend:
> > > > The reason is that we use following code for fetching the registers on
> > > > a trace point:
> > > > ...snip...
> > > > #define perf_arch_fetch_caller_regs(regs, __ip) { \
> > > > (regs)->ip = (__ip); \
> > > > (regs)->bp = caller_frame_pointer(); \
> > > > (regs)->cs = __KERNEL_CS;
> > > > ...snip...
> > >
> > > Thanks, I was able to recreate.  It only happens when unwinding from a
> > > tracepoint.  I haven't investigated yet, but
> > > perf_arch_fetch_caller_regs() looks highly suspect, since it's doing
> > > (regs)->bp = caller_frame_pointer(), even for ORC.
> > >
> > > My only explanation for how your patch works is that RBP just happens to
> > > point to somewhere higher on the stack, causing the unwinder to start at
> > > a semi-random location.  I suspect the real "fix" is that you're no
> > > longer passing the regs to unwind_start().
> > >
> >
> > Yes that's right. Simply not passing regs to unwind_start will let the
> > unwind start from the perf sample handling functions, and introduce a
> > lot of "noise", so I let it skipped the frames until it reached the
> > frame of the trace point. The regs->bp should still points to the
> > stack base of the function which get called in the tracepoint that
> > trigger perf sample, so let unwinder skip all the frames above it made
> > it work.
>
> Ah, now I think I understand, thanks.  perf_arch_fetch_caller_regs()
> puts it in regs->bp, and then perf_callchain_kernel() reads that value
> to tell the unwinder where to start dumping the stack trace.  I guess
> that explains why your patch works, though it still seems very odd that
> perf_arch_fetch_caller_regs() is using regs->bp to store the frame
> address.  Maybe regs->sp would be more appropriate.
>
> --
> Josh

Right, thanks for the comment. And on second thought, there are indeed some
other issues in the patch: it still won't fix the problem
when used with eBPF and tracepoints. I made some mistakes in handling
the callchain in the different paths; I will rethink this and post an
update later.


--
Best Regards,
Kairui Song

Patch
diff mbox series

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index e2b1447192a8..3f3e110794ac 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2355,8 +2355,9 @@  void arch_perf_update_userpage(struct perf_event *event,
 	cyc2ns_read_end();
 }
 
-void
-perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
+static void
+__perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs,
+			bool direct_call)
 {
 	struct unwind_state state;
 	unsigned long addr;
@@ -2366,17 +2367,38 @@  perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re
 		return;
 	}
 
-	if (perf_callchain_store(entry, regs->ip))
-		return;
+	/*
+	 * Without a frame pointer, we can't get a reliable caller bp value.
+	 * If this is called directly from a tracepoint, start the unwind
+	 * from here instead and skip frames until regs->bp is reached.
+	 */
+	if (IS_ENABLED(CONFIG_FRAME_POINTER) || !direct_call) {
+		if (perf_callchain_store(entry, regs->ip))
+			return;
+		unwind_start(&state, current, regs, NULL);
+	} else {
+		unwind_start(&state, current, NULL, (unsigned long*)regs->bp);
+	}
 
-	for (unwind_start(&state, current, regs, NULL); !unwind_done(&state);
-	     unwind_next_frame(&state)) {
+	for (; !unwind_done(&state); unwind_next_frame(&state)) {
 		addr = unwind_get_return_address(&state);
 		if (!addr || perf_callchain_store(entry, addr))
 			return;
 	}
 }
 
+void
+perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
+{
+	__perf_callchain_kernel(entry, regs, false);
+}
+
+void
+perf_callchain_kernel_direct(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
+{
+	__perf_callchain_kernel(entry, regs, true);
+}
+
 static inline int
 valid_user_frame(const void __user *fp, unsigned long size)
 {
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e47ef764f613..b0e33ba36695 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1154,9 +1154,10 @@  DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);
 
 extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
 extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
+extern void perf_callchain_kernel_direct(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
 extern struct perf_callchain_entry *
 get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
-		   u32 max_stack, bool crosstask, bool add_mark);
+		   u32 max_stack, bool crosstask, bool add_mark, bool direct_call);
 extern struct perf_callchain_entry *perf_callchain(struct perf_event *event, struct pt_regs *regs);
 extern int get_callchain_buffers(int max_stack);
 extern void put_callchain_buffers(void);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 950ab2f28922..376d774da6b5 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -368,7 +368,7 @@  BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 		return -EINVAL;
 
 	trace = get_perf_callchain(regs, init_nr, kernel, user,
-				   sysctl_perf_event_max_stack, false, false);
+				   sysctl_perf_event_max_stack, false, false, false);
 
 	if (unlikely(!trace))
 		/* couldn't fetch the stack trace */
@@ -476,7 +476,7 @@  BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
 	else
 		init_nr = sysctl_perf_event_max_stack - num_elem;
 	trace = get_perf_callchain(regs, init_nr, kernel, user,
-				   sysctl_perf_event_max_stack, false, false);
+				   sysctl_perf_event_max_stack, false, false, false);
 	if (unlikely(!trace))
 		goto err_fault;
 
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index c2b41a263166..4cb5861f57ee 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -40,6 +40,12 @@  __weak void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
 {
 }
 
+__weak void perf_callchain_kernel_direct(struct perf_callchain_entry_ctx *entry,
+				  struct pt_regs *regs)
+{
+	perf_callchain_kernel(entry, regs);
+}
+
 __weak void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
 				struct pt_regs *regs)
 {
@@ -176,7 +182,7 @@  put_callchain_entry(int rctx)
 
 struct perf_callchain_entry *
 get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
-		   u32 max_stack, bool crosstask, bool add_mark)
+		   u32 max_stack, bool crosstask, bool add_mark, bool direct_call)
 {
 	struct perf_callchain_entry *entry;
 	struct perf_callchain_entry_ctx ctx;
@@ -198,7 +204,10 @@  get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 	if (kernel && !user_mode(regs)) {
 		if (add_mark)
 			perf_callchain_store_context(&ctx, PERF_CONTEXT_KERNEL);
-		perf_callchain_kernel(&ctx, regs);
+		if (direct_call)
+			perf_callchain_kernel_direct(&ctx, regs);
+		else
+			perf_callchain_kernel(&ctx, regs);
 	}
 
 	if (user) {
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 72d06e302e99..af9fb9b03283 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6377,7 +6377,7 @@  perf_callchain(struct perf_event *event, struct pt_regs *regs)
 		return &__empty_callchain;
 
 	callchain = get_perf_callchain(regs, 0, kernel, user,
-				       max_stack, crosstask, true);
+				       max_stack, crosstask, true, true);
 	return callchain ?: &__empty_callchain;
 }