kretprobe: produce sane stack traces
diff mbox series

Message ID 20181026132210.12569-1-cyphar@cyphar.com
State Superseded
Headers show
Series
  • kretprobe: produce sane stack traces
Related show

Commit Message

Aleksa Sarai Oct. 26, 2018, 1:22 p.m. UTC
Historically, kretprobe has always produced unusable stack traces
(kretprobe_trampoline is the only entry in most cases, because of the
funky stack pointer overwriting). This has caused quite a few annoyances
when using tracing to debug problems[1] -- since return values are only
available with kretprobes but stack traces were only usable for kprobes,
users had to probe both and then manually associate them.

With the advent of bpf_trace, users would have been able to do this
association in bpf, but this was less than ideal (because
bpf_get_stackid would still produce rubbish and programs that didn't
know better would get silly results). The main usecase for stack traces
(at least with bpf_trace) is for DTrace-style aggregation on stack
traces (both entry and exit). Therefore we cannot simply correct the
stack trace on exit -- we must stash away the stack trace and return the
entry stack trace when it is requested.

In theory, patches like commit 76094a2cf46e ("ftrace: distinguish
kretprobe'd functions in trace logs") are no longer necessary *for
tracing* because now all kretprobe traces should produce sane stack
traces. However it's not clear whether removing them completely is
reasonable.

[1]: https://github.com/iovisor/bpftrace/issues/101

Cc: Brendan Gregg <bgregg@netflix.com>
Cc: Christian Brauner <christian@brauner.io>
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 include/linux/kprobes.h   |  15 ++++++
 kernel/events/callchain.c |   8 ++-
 kernel/kprobes.c          | 108 +++++++++++++++++++++++++++++++++++++-
 kernel/trace/trace.c      |  11 +++-
 4 files changed, 138 insertions(+), 4 deletions(-)

Comments

Masami Hiramatsu Oct. 30, 2018, 1:12 a.m. UTC | #1
Hi Aleksa,

On Sat, 27 Oct 2018 00:22:10 +1100
Aleksa Sarai <cyphar@cyphar.com> wrote:

> Historically, kretprobe has always produced unusable stack traces
> (kretprobe_trampoline is the only entry in most cases, because of the
> funky stack pointer overwriting). This has caused quite a few annoyances
> when using tracing to debug problems[1] -- since return values are only
> available with kretprobes but stack traces were only usable for kprobes,
> users had to probe both and then manually associate them.

Yes, this unfortunately still happens. I once tried to fix it by
replacing current "kretprobe instance" with graph-tracer's per-thread
return stack. (https://lkml.org/lkml/2017/8/21/553)

I still believe that direction is the best solution to solve this kind
of issues, otherwise, we have to have 2 different stack fixups for
kretprobe and ftrace graph tracer. (I will have a talk with Steve at
plumbers next month)

Anyway, until that merge happens, this patch looks good to avoid
this issue for generic solution (e.g. for the arch which doesn't
supports retstack).


> 
> With the advent of bpf_trace, users would have been able to do this
> association in bpf, but this was less than ideal (because
> bpf_get_stackid would still produce rubbish and programs that didn't
> know better would get silly results). The main usecase for stack traces
> (at least with bpf_trace) is for DTrace-style aggregation on stack
> traces (both entry and exit). Therefore we cannot simply correct the
> stack trace on exit -- we must stash away the stack trace and return the
> entry stack trace when it is requested.
> 
> In theory, patches like commit 76094a2cf46e ("ftrace: distinguish
> kretprobe'd functions in trace logs") are no longer necessary *for
> tracing* because now all kretprobe traces should produce sane stack
> traces. However it's not clear whether removing them completely is
> reasonable.

Then, let's try to revert it :)

BTW, could you also add a test case for ftrace too?
also, I have some comments below.

> 
> [1]: https://github.com/iovisor/bpftrace/issues/101
> 
> Cc: Brendan Gregg <bgregg@netflix.com>
> Cc: Christian Brauner <christian@brauner.io>
> Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
> ---
>  include/linux/kprobes.h   |  15 ++++++
>  kernel/events/callchain.c |   8 ++-
>  kernel/kprobes.c          | 108 +++++++++++++++++++++++++++++++++++++-
>  kernel/trace/trace.c      |  11 +++-
>  4 files changed, 138 insertions(+), 4 deletions(-)
> 
> diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
> index e909413e4e38..8a4f78a0c990 100644
> --- a/include/linux/kprobes.h
> +++ b/include/linux/kprobes.h
> @@ -40,6 +40,8 @@
>  #include <linux/rcupdate.h>
>  #include <linux/mutex.h>
>  #include <linux/ftrace.h>
> +#include <linux/stacktrace.h>
> +#include <linux/perf_event.h>
>  #include <asm/kprobes.h>
>  
>  #ifdef CONFIG_KPROBES
> @@ -168,11 +170,18 @@ struct kretprobe {
>  	raw_spinlock_t lock;
>  };
>  
> +#define KRETPROBE_TRACE_SIZE 1024
> +struct kretprobe_trace {
> +	int nr_entries;
> +	unsigned long entries[KRETPROBE_TRACE_SIZE];
> +};

Hmm, do we really need all entries? It takes 8KB for each instances.
Note that the number of instances can be big if the system core number
is larger.

> +
>  struct kretprobe_instance {
>  	struct hlist_node hlist;
>  	struct kretprobe *rp;
>  	kprobe_opcode_t *ret_addr;
>  	struct task_struct *task;
> +	struct kretprobe_trace entry;
>  	char data[0];
>  };
>  
> @@ -371,6 +380,12 @@ void unregister_kretprobe(struct kretprobe *rp);
>  int register_kretprobes(struct kretprobe **rps, int num);
>  void unregister_kretprobes(struct kretprobe **rps, int num);
>  
> +struct kretprobe_instance *current_kretprobe_instance(void);
> +void kretprobe_save_stack_trace(struct kretprobe_instance *ri,
> +				struct stack_trace *trace);
> +void kretprobe_perf_callchain_kernel(struct kretprobe_instance *ri,
> +				     struct perf_callchain_entry_ctx *ctx);
> +
>  void kprobe_flush_task(struct task_struct *tk);
>  void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head);
>  
> diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
> index 24a77c34e9ad..98edcd8a6987 100644
> --- a/kernel/events/callchain.c
> +++ b/kernel/events/callchain.c
> @@ -12,6 +12,7 @@
>  #include <linux/perf_event.h>
>  #include <linux/slab.h>
>  #include <linux/sched/task_stack.h>
> +#include <linux/kprobes.h>
>  
>  #include "internal.h"
>  
> @@ -197,9 +198,14 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
>  	ctx.contexts_maxed = false;
>  
>  	if (kernel && !user_mode(regs)) {
> +		struct kretprobe_instance *ri = current_kretprobe_instance();
> +
>  		if (add_mark)
>  			perf_callchain_store_context(&ctx, PERF_CONTEXT_KERNEL);
> -		perf_callchain_kernel(&ctx, regs);
> +		if (ri)
> +			kretprobe_perf_callchain_kernel(ri, &ctx);
> +		else
> +			perf_callchain_kernel(&ctx, regs);
>  	}
>  
>  	if (user) {
> diff --git a/kernel/kprobes.c b/kernel/kprobes.c
> index 90e98e233647..a0776a7d1244 100644
> --- a/kernel/kprobes.c
> +++ b/kernel/kprobes.c
> @@ -48,6 +48,7 @@
>  #include <linux/ftrace.h>
>  #include <linux/cpu.h>
>  #include <linux/jump_label.h>
> +#include <linux/sched/debug.h>
>  
>  #include <asm/sections.h>
>  #include <asm/cacheflush.h>
> @@ -1206,6 +1207,16 @@ __releases(hlist_lock)
>  }
>  NOKPROBE_SYMBOL(kretprobe_table_unlock);
>  
> +static bool kretprobe_hash_is_locked(struct task_struct *tsk)
> +{
> +	unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
> +	raw_spinlock_t *hlist_lock;
> +
> +	hlist_lock = kretprobe_table_lock_ptr(hash);
> +	return raw_spin_is_locked(hlist_lock);
> +}
> +NOKPROBE_SYMBOL(kretprobe_hash_is_locked);
> +
>  /*
>   * This function is called from finish_task_switch when task tk becomes dead,
>   * so that we can recycle any function-return probe instances associated
> @@ -1800,6 +1811,13 @@ unsigned long __weak arch_deref_entry_point(void *entry)
>  	return (unsigned long)entry;
>  }
>  
> +static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs);
> +
> +static inline bool kprobe_is_retprobe(struct kprobe *kp)
> +{
> +	return kp->pre_handler == pre_handler_kretprobe;
> +}
> +
>  #ifdef CONFIG_KRETPROBES
>  /*
>   * This kprobe pre_handler is registered with every kretprobe. When probe
> @@ -1826,6 +1844,8 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
>  	hash = hash_ptr(current, KPROBE_HASH_BITS);
>  	raw_spin_lock_irqsave(&rp->lock, flags);
>  	if (!hlist_empty(&rp->free_instances)) {
> +		struct stack_trace trace = {};
> +
>  		ri = hlist_entry(rp->free_instances.first,
>  				struct kretprobe_instance, hlist);
>  		hlist_del(&ri->hlist);
> @@ -1834,6 +1854,11 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
>  		ri->rp = rp;
>  		ri->task = current;
>  
> +		trace.entries = &ri->entry.entries[0];
> +		trace.max_entries = KRETPROBE_TRACE_SIZE;
> +		save_stack_trace_regs(regs, &trace);
> +		ri->entry.nr_entries = trace.nr_entries;
> +
>  		if (rp->entry_handler && rp->entry_handler(ri, regs)) {
>  			raw_spin_lock_irqsave(&rp->lock, flags);
>  			hlist_add_head(&ri->hlist, &rp->free_instances);
> @@ -1856,6 +1881,70 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
>  }
>  NOKPROBE_SYMBOL(pre_handler_kretprobe);
>  
> +/*
> + * Return the kretprobe_instance associated with the current_kprobe. Calling
> + * this is only reasonable from within a kretprobe handler context (otherwise
> + * return NULL).
> + *
> + * Must be called within a kretprobe_hash_lock(current, ...) context.
> + */
> +struct kretprobe_instance *current_kretprobe_instance(void)
> +{
> +	struct kprobe *kp;
> +	struct kretprobe *rp;
> +	struct kretprobe_instance *iter, *ri = NULL;
> +	struct hlist_head *head;
> +	struct hlist_node *next;
> +	unsigned long hash = hash_ptr(current, KPROBE_HASH_BITS);
> +
> +	kp = kprobe_running();
> +	if (!kp || !kprobe_is_retprobe(kp))
> +		return NULL;
> +	if (WARN_ON(!kretprobe_hash_is_locked(current)))

Yes, good catch :)

> +		return NULL;
> +
> +	rp = container_of(kp, struct kretprobe, kp);
> +	head = &kretprobe_inst_table[hash];
> +
> +	hlist_for_each_entry_safe(iter, next, head, hlist) {

Why would you use "_safe" variant here? if you don't modify the hlist,
you don't need to use it.

> +		if (iter->task == current && iter->rp == rp) {
> +			ri = iter;
> +			break;
> +		}
> +	}
> +	return ri;
> +}
> +EXPORT_SYMBOL_GPL(current_kretprobe_instance);
> +NOKPROBE_SYMBOL(current_kretprobe_instance);
> +
> +void kretprobe_save_stack_trace(struct kretprobe_instance *ri,
> +				struct stack_trace *trace)
> +{
> +	int i;
> +	struct kretprobe_trace *krt = &ri->entry;
> +
> +	for (i = trace->skip; i < krt->nr_entries; i++) {
> +		if (trace->nr_entries >= trace->max_entries)
> +			break;
> +		trace->entries[trace->nr_entries++] = krt->entries[i];
> +	}
> +}
> +EXPORT_SYMBOL_GPL(kretprobe_save_stack_trace);
> +
> +void kretprobe_perf_callchain_kernel(struct kretprobe_instance *ri,
> +				     struct perf_callchain_entry_ctx *ctx)
> +{
> +	int i;
> +	struct kretprobe_trace *krt = &ri->entry;
> +
> +	for (i = 0; i < krt->nr_entries; i++) {
> +		if (krt->entries[i] == ULONG_MAX)
> +			break;
> +		perf_callchain_store(ctx, (u64) krt->entries[i]);
> +	}
> +}
> +EXPORT_SYMBOL_GPL(kretprobe_perf_callchain_kernel);


Why do we need to export these functions?

Thank you,

> +
>  bool __weak arch_kprobe_on_func_entry(unsigned long offset)
>  {
>  	return !offset;
> @@ -2005,6 +2094,23 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
>  }
>  NOKPROBE_SYMBOL(pre_handler_kretprobe);
>  
> +struct kretprobe_instance *current_kretprobe_instance(void)
> +{
> +	return NULL;
> +}
> +NOKPROBE_SYMBOL(current_kretprobe_instance);
> +
> +void kretprobe_save_stack_trace(struct kretprobe_instance *ri,
> +				struct stack_trace *trace)
> +{
> +}
> +EXPORT_SYMBOL_GPL(kretprobe_save_stack_trace);
> +
> +void kretprobe_perf_callchain_kernel(struct kretprobe_instance *ri,
> +				     struct perf_callchain_entry_ctx *ctx)
> +{
> +}
> +EXPORT_SYMBOL_GPL(kretprobe_perf_callchain_kernel);
>  #endif /* CONFIG_KRETPROBES */
>  
>  /* Set the kprobe gone and remove its instruction buffer. */
> @@ -2241,7 +2347,7 @@ static void report_probe(struct seq_file *pi, struct kprobe *p,
>  	char *kprobe_type;
>  	void *addr = p->addr;
>  
> -	if (p->pre_handler == pre_handler_kretprobe)
> +	if (kprobe_is_retprobe(p))
>  		kprobe_type = "r";
>  	else
>  		kprobe_type = "k";
> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> index bf6f1d70484d..2210d38a4dbf 100644
> --- a/kernel/trace/trace.c
> +++ b/kernel/trace/trace.c
> @@ -42,6 +42,7 @@
>  #include <linux/nmi.h>
>  #include <linux/fs.h>
>  #include <linux/trace.h>
> +#include <linux/kprobes.h>
>  #include <linux/sched/clock.h>
>  #include <linux/sched/rt.h>
>  
> @@ -2590,6 +2591,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
>  	struct ring_buffer_event *event;
>  	struct stack_entry *entry;
>  	struct stack_trace trace;
> +	struct kretprobe_instance *ri = current_kretprobe_instance();
>  	int use_stack;
>  	int size = FTRACE_STACK_ENTRIES;
>  
> @@ -2626,7 +2628,9 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
>  		trace.entries		= this_cpu_ptr(ftrace_stack.calls);
>  		trace.max_entries	= FTRACE_STACK_MAX_ENTRIES;
>  
> -		if (regs)
> +		if (ri)
> +			kretprobe_save_stack_trace(ri, &trace);
> +		else if (regs)
>  			save_stack_trace_regs(regs, &trace);
>  		else
>  			save_stack_trace(&trace);
> @@ -2653,7 +2657,10 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
>  	else {
>  		trace.max_entries	= FTRACE_STACK_ENTRIES;
>  		trace.entries		= entry->caller;
> -		if (regs)
> +
> +		if (ri)
> +			kretprobe_save_stack_trace(ri, &trace);
> +		else if (regs)
>  			save_stack_trace_regs(regs, &trace);
>  		else
>  			save_stack_trace(&trace);
> -- 
> 2.19.1
>
Aleksa Sarai Oct. 30, 2018, 3:19 a.m. UTC | #2
On 2018-10-30, Masami Hiramatsu <mhiramat@kernel.org> wrote:
> > Historically, kretprobe has always produced unusable stack traces
> > (kretprobe_trampoline is the only entry in most cases, because of the
> > funky stack pointer overwriting). This has caused quite a few annoyances
> > when using tracing to debug problems[1] -- since return values are only
> > available with kretprobes but stack traces were only usable for kprobes,
> > users had to probe both and then manually associate them.
> 
> Yes, this unfortunately still happens. I once tried to fix it by
> replacing current "kretprobe instance" with graph-tracer's per-thread
> return stack. (https://lkml.org/lkml/2017/8/21/553)

I played with graph-tracer a while ago and it didn't appear to have
associated return values? Is this hidden somewhere or did I just miss
it?

> I still believe that direction is the best solution to solve this kind
> of issues, otherwise, we have to have 2 different stack fixups for
> kretprobe and ftrace graph tracer. (I will have a talk with Steve at
> plumbers next month)

I'm definitely :+1: on removing the duplication of the stack fixups, my
first instinct was to try to refactor all of the stack_trace code so
that we didn't have multiple arch-specific "get the stack trace" paths
(and so we could generically add current_kretprobe_instance() to one
codepath). But after looking into it, I was convinced this would be more
than a little ugly to do.

> > With the advent of bpf_trace, users would have been able to do this
> > association in bpf, but this was less than ideal (because
> > bpf_get_stackid would still produce rubbish and programs that didn't
> > know better would get silly results). The main usecase for stack traces
> > (at least with bpf_trace) is for DTrace-style aggregation on stack
> > traces (both entry and exit). Therefore we cannot simply correct the
> > stack trace on exit -- we must stash away the stack trace and return the
> > entry stack trace when it is requested.
> > 
> > In theory, patches like commit 76094a2cf46e ("ftrace: distinguish
> > kretprobe'd functions in trace logs") are no longer necessary *for
> > tracing* because now all kretprobe traces should produce sane stack
> > traces. However it's not clear whether removing them completely is
> > reasonable.
> 
> Then, let's try to revert it :)

Sure. :P

> BTW, could you also add a test case for ftrace too?
> also, I have some comments below.

Yup, will do.

> > +#define KRETPROBE_TRACE_SIZE 1024
> > +struct kretprobe_trace {
> > +	int nr_entries;
> > +	unsigned long entries[KRETPROBE_TRACE_SIZE];
> > +};
> 
> Hmm, do we really need all entries? It takes 8KB for each instances.
> Note that the number of instances can be big if the system core number
> is larger.

Yeah, you're right this is too large for a default.

But the problem is that we need it to be large enough for any of the
tracers to be happy -- otherwise we'd have to dynamically allocate it
and I had a feeling this would be seen as a Bad Idea™ in the kprobe
paths.

  * ftrace uses PAGE_SIZE/sizeof(u64) == 512 (on x86_64).
  * perf_events (and thus BPF) uses 127 as the default but can be
    configured via sysctl -- and thus can be unbounded.
  * show_stack(...) doesn't appear to have a limit, but I might just be
    misreading the x86-specific code.

As mentioned above, the lack of consensus on a single structure for
storing stack traces also means that there is a lack of consensus on
what the largest reasonable stack is.

But maybe just doing 127 would be "reasonable"?

(Athough, dynamically allocating would allow us to just use 'struct
stack_trace' directly without needing to embed a different structure.)

> > +	hlist_for_each_entry_safe(iter, next, head, hlist) {
> 
> Why would you use "_safe" variant here? if you don't modify the hlist,
> you don't need to use it.

Yup, my mistake.

> > +void kretprobe_save_stack_trace(struct kretprobe_instance *ri,
> > +				struct stack_trace *trace)
> > +{
> > +	int i;
> > +	struct kretprobe_trace *krt = &ri->entry;
> > +
> > +	for (i = trace->skip; i < krt->nr_entries; i++) {
> > +		if (trace->nr_entries >= trace->max_entries)
> > +			break;
> > +		trace->entries[trace->nr_entries++] = krt->entries[i];
> > +	}
> > +}
> > +EXPORT_SYMBOL_GPL(kretprobe_save_stack_trace);
> > +
> > +void kretprobe_perf_callchain_kernel(struct kretprobe_instance *ri,
> > +				     struct perf_callchain_entry_ctx *ctx)
> > +{
> > +	int i;
> > +	struct kretprobe_trace *krt = &ri->entry;
> > +
> > +	for (i = 0; i < krt->nr_entries; i++) {
> > +		if (krt->entries[i] == ULONG_MAX)
> > +			break;
> > +		perf_callchain_store(ctx, (u64) krt->entries[i]);
> > +	}
> > +}
> > +EXPORT_SYMBOL_GPL(kretprobe_perf_callchain_kernel);
> 
> 
> Why do we need to export these functions?

That's a good question -- I must've just banged out the EXPORT
statements without thinking. I'll remove them in v2.
Steven Rostedt Oct. 31, 2018, 1:03 p.m. UTC | #3
On Tue, 30 Oct 2018 10:12:06 +0900
Masami Hiramatsu <mhiramat@kernel.org> wrote:

> Anyway, until that merge happens, this patch looks good to avoid
> this issue for generic solution (e.g. for the arch which doesn't
> supports retstack).

I think its time to come up with an algorithm that makes function graph
work with multiple users, and have kretprobes be able to hook into it
just like kprobes hooks into function tracer.

I have some ideas on how to get this done, and will try to have an RFC
patch set ready by plumbers.

-- Steve
Aleksa Sarai Oct. 31, 2018, 1:39 p.m. UTC | #4
On 2018-10-31, Steven Rostedt <rostedt@goodmis.org> wrote:
> > Anyway, until that merge happens, this patch looks good to avoid
> > this issue for generic solution (e.g. for the arch which doesn't
> > supports retstack).
> 
> I think its time to come up with an algorithm that makes function graph
> work with multiple users, and have kretprobes be able to hook into it
> just like kprobes hooks into function tracer.
> 
> I have some ideas on how to get this done, and will try to have an RFC
> patch set ready by plumbers.

Should I continue working on this patchset?
Masami Hiramatsu Nov. 1, 2018, 8:06 a.m. UTC | #5
On Tue, 30 Oct 2018 14:19:53 +1100
Aleksa Sarai <cyphar@cyphar.com> wrote:

> On 2018-10-30, Masami Hiramatsu <mhiramat@kernel.org> wrote:
> > > Historically, kretprobe has always produced unusable stack traces
> > > (kretprobe_trampoline is the only entry in most cases, because of the
> > > funky stack pointer overwriting). This has caused quite a few annoyances
> > > when using tracing to debug problems[1] -- since return values are only
> > > available with kretprobes but stack traces were only usable for kprobes,
> > > users had to probe both and then manually associate them.
> > 
> > Yes, this unfortunately still happens. I once tried to fix it by
> > replacing current "kretprobe instance" with graph-tracer's per-thread
> > return stack. (https://lkml.org/lkml/2017/8/21/553)
> 
> I played with graph-tracer a while ago and it didn't appear to have
> associated return values? Is this hidden somewhere or did I just miss
> it?

Graph tracer just doesn't trace it. We still can access it.

> 
> > I still believe that direction is the best solution to solve this kind
> > of issues, otherwise, we have to have 2 different stack fixups for
> > kretprobe and ftrace graph tracer. (I will have a talk with Steve at
> > plumbers next month)
> 
> I'm definitely :+1: on removing the duplication of the stack fixups, my
> first instinct was to try to refactor all of the stack_trace code so
> that we didn't have multiple arch-specific "get the stack trace" paths
> (and so we could generically add current_kretprobe_instance() to one
> codepath). But after looking into it, I was convinced this would be more
> than a little ugly to do.

Yes, it would take a time to fix it up all, but should be done.

> > > With the advent of bpf_trace, users would have been able to do this
> > > association in bpf, but this was less than ideal (because
> > > bpf_get_stackid would still produce rubbish and programs that didn't
> > > know better would get silly results). The main usecase for stack traces
> > > (at least with bpf_trace) is for DTrace-style aggregation on stack
> > > traces (both entry and exit). Therefore we cannot simply correct the
> > > stack trace on exit -- we must stash away the stack trace and return the
> > > entry stack trace when it is requested.
> > > 
> > > In theory, patches like commit 76094a2cf46e ("ftrace: distinguish
> > > kretprobe'd functions in trace logs") are no longer necessary *for
> > > tracing* because now all kretprobe traces should produce sane stack
> > > traces. However it's not clear whether removing them completely is
> > > reasonable.
> > 
> > Then, let's try to revert it :)
> 
> Sure. :P
> 
> > BTW, could you also add a test case for ftrace too?
> > also, I have some comments below.
> 
> Yup, will do.
> 
> > > +#define KRETPROBE_TRACE_SIZE 1024
> > > +struct kretprobe_trace {
> > > +	int nr_entries;
> > > +	unsigned long entries[KRETPROBE_TRACE_SIZE];
> > > +};
> > 
> > Hmm, do we really need all entries? It takes 8KB for each instances.
> > Note that the number of instances can be big if the system core number
> > is larger.
> 
> Yeah, you're right this is too large for a default.
> 
> But the problem is that we need it to be large enough for any of the
> tracers to be happy -- otherwise we'd have to dynamically allocate it
> and I had a feeling this would be seen as a Bad Idea™ in the kprobe
> paths.

Note that we can skip if it is not enough with just nmissed+1

> 
>   * ftrace uses PAGE_SIZE/sizeof(u64) == 512 (on x86_64).
>   * perf_events (and thus BPF) uses 127 as the default but can be
>     configured via sysctl -- and thus can be unbounded.
>   * show_stack(...) doesn't appear to have a limit, but I might just be
>     misreading the x86-specific code.
> 
> As mentioned above, the lack of consensus on a single structure for
> storing stack traces also means that there is a lack of consensus on
> what the largest reasonable stack is.
> 
> But maybe just doing 127 would be "reasonable"?

Yeah, I think that is reasonable size.

> 
> (Athough, dynamically allocating would allow us to just use 'struct
> stack_trace' directly without needing to embed a different structure.)
> 
> > > +	hlist_for_each_entry_safe(iter, next, head, hlist) {
> > 
> > Why would you use "_safe" variant here? if you don't modify the hlist,
> > you don't need to use it.
> 
> Yup, my mistake.
> 
> > > +void kretprobe_save_stack_trace(struct kretprobe_instance *ri,
> > > +				struct stack_trace *trace)
> > > +{
> > > +	int i;
> > > +	struct kretprobe_trace *krt = &ri->entry;
> > > +
> > > +	for (i = trace->skip; i < krt->nr_entries; i++) {
> > > +		if (trace->nr_entries >= trace->max_entries)
> > > +			break;
> > > +		trace->entries[trace->nr_entries++] = krt->entries[i];
> > > +	}
> > > +}
> > > +EXPORT_SYMBOL_GPL(kretprobe_save_stack_trace);
> > > +
> > > +void kretprobe_perf_callchain_kernel(struct kretprobe_instance *ri,
> > > +				     struct perf_callchain_entry_ctx *ctx)
> > > +{
> > > +	int i;
> > > +	struct kretprobe_trace *krt = &ri->entry;
> > > +
> > > +	for (i = 0; i < krt->nr_entries; i++) {
> > > +		if (krt->entries[i] == ULONG_MAX)
> > > +			break;
> > > +		perf_callchain_store(ctx, (u64) krt->entries[i]);
> > > +	}
> > > +}
> > > +EXPORT_SYMBOL_GPL(kretprobe_perf_callchain_kernel);
> > 
> > 
> > Why do we need to export these functions?
> 
> That's a good question -- I must've just banged out the EXPORT
> statements without thinking. I'll remove them in v2.

OK.

Thank you,
Masami Hiramatsu Nov. 1, 2018, 10:13 a.m. UTC | #6
On Thu, 1 Nov 2018 00:39:12 +1100
Aleksa Sarai <cyphar@cyphar.com> wrote:

> On 2018-10-31, Steven Rostedt <rostedt@goodmis.org> wrote:
> > > Anyway, until that merge happens, this patch looks good to avoid
> > > this issue for generic solution (e.g. for the arch which doesn't
> > > supports retstack).
> > 
> > I think its time to come up with an algorithm that makes function graph
> > work with multiple users, and have kretprobes be able to hook into it
> > just like kprobes hooks into function tracer.
> > 
> > I have some ideas on how to get this done, and will try to have an RFC
> > patch set ready by plumbers.
> 
> Should I continue working on this patchset?

Yes, until we finally introduce Steven's algorithm on all arch (yeah, we still
have some archs which don't support graph-tracer but supports kprobes),
I think your patch is the best fix for this issue.

Thank you,
Aleksa Sarai Nov. 1, 2018, 10:49 a.m. UTC | #7
On 2018-11-01, Masami Hiramatsu <mhiramat@kernel.org> wrote:
> > > > Anyway, until that merge happens, this patch looks good to avoid
> > > > this issue for generic solution (e.g. for the arch which doesn't
> > > > supports retstack).
> > > 
> > > I think its time to come up with an algorithm that makes function graph
> > > work with multiple users, and have kretprobes be able to hook into it
> > > just like kprobes hooks into function tracer.
> > > 
> > > I have some ideas on how to get this done, and will try to have an RFC
> > > patch set ready by plumbers.
> > 
> > Should I continue working on this patchset?
> 
> Yes, until we finally introduce Steven's algorithm on all arch (yeah, we still
> have some archs which don't support graph-tracer but supports kprobes),
> I think your patch is the best fix for this issue.

Thanks, I just sent a v3.

Though, even with Steven's hooking of kprobes I think you'd still need
to stash away the stack trace somewhere (or am I misunderstanding the
proposal?).
Steven Rostedt Nov. 1, 2018, 1:22 p.m. UTC | #8
On Thu, 1 Nov 2018 21:49:48 +1100
Aleksa Sarai <cyphar@cyphar.com> wrote:


> > > Should I continue working on this patchset?  
> > 
> > Yes, until we finally introduce Steven's algorithm on all arch (yeah, we still
> > have some archs which don't support graph-tracer but supports kprobes),
> > I think your patch is the best fix for this issue.  

Agreed.

> 
> Thanks, I just sent a v3.

Thanks. Hopefully I can take a look at it today or tomorrow. I'm still
trying to get home (sitting at yet another airport as I write this).

> 
> Though, even with Steven's hooking of kprobes I think you'd still need
> to stash away the stack trace somewhere (or am I misunderstanding the
> proposal?).

I'll start working on it and then we can see what is needed when there
is actual code. I need to probably change the func graph generic
interface which will require changes in all the archs that support
function graph tracer.

-- Steve
Masami Hiramatsu Nov. 1, 2018, 3:01 p.m. UTC | #9
On Thu, 1 Nov 2018 21:49:48 +1100
Aleksa Sarai <cyphar@cyphar.com> wrote:

> On 2018-11-01, Masami Hiramatsu <mhiramat@kernel.org> wrote:
> > > > > Anyway, until that merge happens, this patch looks good to avoid
> > > > > this issue for generic solution (e.g. for the arch which doesn't
> > > > > supports retstack).
> > > > 
> > > > I think its time to come up with an algorithm that makes function graph
> > > > work with multiple users, and have kretprobes be able to hook into it
> > > > just like kprobes hooks into function tracer.
> > > > 
> > > > I have some ideas on how to get this done, and will try to have an RFC
> > > > patch set ready by plumbers.
> > > 
> > > Should I continue working on this patchset?
> > 
> > Yes, until we finally introduce Steven's algorithm on all arch (yeah, we still
> > have some archs which don't support graph-tracer but supports kprobes),
> > I think your patch is the best fix for this issue.
> 
> Thanks, I just sent a v3.
> 
> Though, even with Steven's hooking of kprobes I think you'd still need
> to stash away the stack trace somewhere (or am I misunderstanding the
> proposal?).

Wait, I might miss something. kretprobe and func-graph tracer just swap the
return address on the stack but no change on other stuffs on the stack.
In that case, if we can restore the stack, isn't it enough?

And anyway, even if using func-graph tracer, stack unwinding works correctly.
I thought it means we don't need to backup whole the stack, doesn't it?

Thank you,
Aleksa Sarai Nov. 1, 2018, 8:25 p.m. UTC | #10
On 2018-11-02, Masami Hiramatsu <mhiramat@kernel.org> wrote:
> On Thu, 1 Nov 2018 21:49:48 +1100
> Aleksa Sarai <cyphar@cyphar.com> wrote:
> 
> > On 2018-11-01, Masami Hiramatsu <mhiramat@kernel.org> wrote:
> > > > > > Anyway, until that merge happens, this patch looks good to avoid
> > > > > > this issue for generic solution (e.g. for the arch which doesn't
> > > > > > supports retstack).
> > > > > 
> > > > > I think its time to come up with an algorithm that makes function graph
> > > > > work with multiple users, and have kretprobes be able to hook into it
> > > > > just like kprobes hooks into function tracer.
> > > > > 
> > > > > I have some ideas on how to get this done, and will try to have an RFC
> > > > > patch set ready by plumbers.
> > > > 
> > > > Should I continue working on this patchset?
> > > 
> > > Yes, until we finally introduce Steven's algorithm on all arch (yeah, we still
> > > have some archs which don't support graph-tracer but supports kprobes),
> > > I think your patch is the best fix for this issue.
> > 
> > Thanks, I just sent a v3.
> > 
> > Though, even with Steven's hooking of kprobes I think you'd still need
> > to stash away the stack trace somewhere (or am I misunderstanding the
> > proposal?).
> 
> Wait, I might miss something. kretprobe and func-graph tracer just swap the
> return address on the stack but no change on other stuffs on the stack.
> In that case, if we can restore the stack, isn't it enough?

If stack traces works within the func-graph tracer context, then that
would be enough.

However, this is not currently the case in regular kretprobes -- the
issue isn't that kretprobe_trampoline is *on* the stack trace but rather
that the stack trace is *only* kretprobe_trampoline.

I spent several days (and got nowhere) trying to understand why the
stack unwinder cannot go past kretprobe_trampoline. You can see this if
you try to use bpftrace (which just runs bpf_get_stackid ->
perf_callchain_kernel):

  % bpftrace -e 'kretprobe:do_filp_open { @x = stack; }'
  Attaching 1 probe...
  ^C

  @x:
          kretprobe_trampoline+1

With my patch, you see the full stack trace you expect. It should be
noted the same problem exists when adding a kretprobe with ftrace's
kprobe_events and options/stacktrace.

Weirdly, after adding show_stack(NULL, NULL) from within a kretprobe
context you get an answer beyond kretprobe_trampoline (though it is
quite ugly and contains lots of information you don't see in
get_perf_callchain or save_stack_trace -- for some reason). I will take
a closer look at this.

But it seems to me that some aspect of the stack trace handling with
kretprobe_trampoline is broken. I wouldn't mind working on fixing this,
but it might require lots of architecture-specific work (and it should
be noted that ultimately we want to have an identical stack trace
between kprobes and kretprobe so that you can DTrace-style aggregate
them).

> And anyway, even if using func-graph tracer, stack unwinding works correctly.
> I thought it means we don't need to backup whole the stack, doesn't it?

Currently, for kretprobes, the entire stack needs to be backed up
because the unwinder stops when it hits kretprobe_trampoline.

Patch
diff mbox series

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index e909413e4e38..8a4f78a0c990 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -40,6 +40,8 @@ 
 #include <linux/rcupdate.h>
 #include <linux/mutex.h>
 #include <linux/ftrace.h>
+#include <linux/stacktrace.h>
+#include <linux/perf_event.h>
 #include <asm/kprobes.h>
 
 #ifdef CONFIG_KPROBES
@@ -168,11 +170,18 @@  struct kretprobe {
 	raw_spinlock_t lock;
 };
 
+#define KRETPROBE_TRACE_SIZE 1024
+struct kretprobe_trace {
+	int nr_entries;
+	unsigned long entries[KRETPROBE_TRACE_SIZE];
+};
+
 struct kretprobe_instance {
 	struct hlist_node hlist;
 	struct kretprobe *rp;
 	kprobe_opcode_t *ret_addr;
 	struct task_struct *task;
+	struct kretprobe_trace entry;
 	char data[0];
 };
 
@@ -371,6 +380,12 @@  void unregister_kretprobe(struct kretprobe *rp);
 int register_kretprobes(struct kretprobe **rps, int num);
 void unregister_kretprobes(struct kretprobe **rps, int num);
 
+struct kretprobe_instance *current_kretprobe_instance(void);
+void kretprobe_save_stack_trace(struct kretprobe_instance *ri,
+				struct stack_trace *trace);
+void kretprobe_perf_callchain_kernel(struct kretprobe_instance *ri,
+				     struct perf_callchain_entry_ctx *ctx);
+
 void kprobe_flush_task(struct task_struct *tk);
 void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head);
 
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 24a77c34e9ad..98edcd8a6987 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -12,6 +12,7 @@ 
 #include <linux/perf_event.h>
 #include <linux/slab.h>
 #include <linux/sched/task_stack.h>
+#include <linux/kprobes.h>
 
 #include "internal.h"
 
@@ -197,9 +198,14 @@  get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 	ctx.contexts_maxed = false;
 
 	if (kernel && !user_mode(regs)) {
+		struct kretprobe_instance *ri = current_kretprobe_instance();
+
 		if (add_mark)
 			perf_callchain_store_context(&ctx, PERF_CONTEXT_KERNEL);
-		perf_callchain_kernel(&ctx, regs);
+		if (ri)
+			kretprobe_perf_callchain_kernel(ri, &ctx);
+		else
+			perf_callchain_kernel(&ctx, regs);
 	}
 
 	if (user) {
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 90e98e233647..a0776a7d1244 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -48,6 +48,7 @@ 
 #include <linux/ftrace.h>
 #include <linux/cpu.h>
 #include <linux/jump_label.h>
+#include <linux/sched/debug.h>
 
 #include <asm/sections.h>
 #include <asm/cacheflush.h>
@@ -1206,6 +1207,16 @@  __releases(hlist_lock)
 }
 NOKPROBE_SYMBOL(kretprobe_table_unlock);
 
+static bool kretprobe_hash_is_locked(struct task_struct *tsk)
+{
+	unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
+	raw_spinlock_t *hlist_lock;
+
+	hlist_lock = kretprobe_table_lock_ptr(hash);
+	return raw_spin_is_locked(hlist_lock);
+}
+NOKPROBE_SYMBOL(kretprobe_hash_is_locked);
+
 /*
  * This function is called from finish_task_switch when task tk becomes dead,
  * so that we can recycle any function-return probe instances associated
@@ -1800,6 +1811,13 @@  unsigned long __weak arch_deref_entry_point(void *entry)
 	return (unsigned long)entry;
 }
 
+static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs);
+
+static inline bool kprobe_is_retprobe(struct kprobe *kp)
+{
+	return kp->pre_handler == pre_handler_kretprobe;
+}
+
 #ifdef CONFIG_KRETPROBES
 /*
  * This kprobe pre_handler is registered with every kretprobe. When probe
@@ -1826,6 +1844,8 @@  static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
 	hash = hash_ptr(current, KPROBE_HASH_BITS);
 	raw_spin_lock_irqsave(&rp->lock, flags);
 	if (!hlist_empty(&rp->free_instances)) {
+		struct stack_trace trace = {};
+
 		ri = hlist_entry(rp->free_instances.first,
 				struct kretprobe_instance, hlist);
 		hlist_del(&ri->hlist);
@@ -1834,6 +1854,11 @@  static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
 		ri->rp = rp;
 		ri->task = current;
 
+		trace.entries = &ri->entry.entries[0];
+		trace.max_entries = KRETPROBE_TRACE_SIZE;
+		save_stack_trace_regs(regs, &trace);
+		ri->entry.nr_entries = trace.nr_entries;
+
 		if (rp->entry_handler && rp->entry_handler(ri, regs)) {
 			raw_spin_lock_irqsave(&rp->lock, flags);
 			hlist_add_head(&ri->hlist, &rp->free_instances);
@@ -1856,6 +1881,70 @@  static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
 }
 NOKPROBE_SYMBOL(pre_handler_kretprobe);
 
+/*
+ * Return the kretprobe_instance associated with the current_kprobe. Calling
+ * this is only reasonable from within a kretprobe handler context (otherwise
+ * return NULL).
+ *
+ * Must be called within a kretprobe_hash_lock(current, ...) context.
+ */
+struct kretprobe_instance *current_kretprobe_instance(void)
+{
+	struct kprobe *kp;
+	struct kretprobe *rp;
+	struct kretprobe_instance *iter, *ri = NULL;
+	struct hlist_head *head;
+	struct hlist_node *next;
+	unsigned long hash = hash_ptr(current, KPROBE_HASH_BITS);
+
+	kp = kprobe_running();
+	if (!kp || !kprobe_is_retprobe(kp))
+		return NULL;
+	if (WARN_ON(!kretprobe_hash_is_locked(current)))
+		return NULL;
+
+	rp = container_of(kp, struct kretprobe, kp);
+	head = &kretprobe_inst_table[hash];
+
+	hlist_for_each_entry_safe(iter, next, head, hlist) {
+		if (iter->task == current && iter->rp == rp) {
+			ri = iter;
+			break;
+		}
+	}
+	return ri;
+}
+EXPORT_SYMBOL_GPL(current_kretprobe_instance);
+NOKPROBE_SYMBOL(current_kretprobe_instance);
+
+void kretprobe_save_stack_trace(struct kretprobe_instance *ri,
+				struct stack_trace *trace)
+{
+	int i;
+	struct kretprobe_trace *krt = &ri->entry;
+
+	for (i = trace->skip; i < krt->nr_entries; i++) {
+		if (trace->nr_entries >= trace->max_entries)
+			break;
+		trace->entries[trace->nr_entries++] = krt->entries[i];
+	}
+}
+EXPORT_SYMBOL_GPL(kretprobe_save_stack_trace);
+
+void kretprobe_perf_callchain_kernel(struct kretprobe_instance *ri,
+				     struct perf_callchain_entry_ctx *ctx)
+{
+	int i;
+	struct kretprobe_trace *krt = &ri->entry;
+
+	for (i = 0; i < krt->nr_entries; i++) {
+		if (krt->entries[i] == ULONG_MAX)
+			break;
+		perf_callchain_store(ctx, (u64) krt->entries[i]);
+	}
+}
+EXPORT_SYMBOL_GPL(kretprobe_perf_callchain_kernel);
+
 bool __weak arch_kprobe_on_func_entry(unsigned long offset)
 {
 	return !offset;
@@ -2005,6 +2094,23 @@  static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
 }
 NOKPROBE_SYMBOL(pre_handler_kretprobe);
 
+struct kretprobe_instance *current_kretprobe_instance(void)
+{
+	return NULL;
+}
+NOKPROBE_SYMBOL(current_kretprobe_instance);
+
+void kretprobe_save_stack_trace(struct kretprobe_instance *ri,
+				struct stack_trace *trace)
+{
+}
+EXPORT_SYMBOL_GPL(kretprobe_save_stack_trace);
+
+void kretprobe_perf_callchain_kernel(struct kretprobe_instance *ri,
+				     struct perf_callchain_entry_ctx *ctx)
+{
+}
+EXPORT_SYMBOL_GPL(kretprobe_perf_callchain_kernel);
 #endif /* CONFIG_KRETPROBES */
 
 /* Set the kprobe gone and remove its instruction buffer. */
@@ -2241,7 +2347,7 @@  static void report_probe(struct seq_file *pi, struct kprobe *p,
 	char *kprobe_type;
 	void *addr = p->addr;
 
-	if (p->pre_handler == pre_handler_kretprobe)
+	if (kprobe_is_retprobe(p))
 		kprobe_type = "r";
 	else
 		kprobe_type = "k";
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bf6f1d70484d..2210d38a4dbf 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -42,6 +42,7 @@ 
 #include <linux/nmi.h>
 #include <linux/fs.h>
 #include <linux/trace.h>
+#include <linux/kprobes.h>
 #include <linux/sched/clock.h>
 #include <linux/sched/rt.h>
 
@@ -2590,6 +2591,7 @@  static void __ftrace_trace_stack(struct ring_buffer *buffer,
 	struct ring_buffer_event *event;
 	struct stack_entry *entry;
 	struct stack_trace trace;
+	struct kretprobe_instance *ri = current_kretprobe_instance();
 	int use_stack;
 	int size = FTRACE_STACK_ENTRIES;
 
@@ -2626,7 +2628,9 @@  static void __ftrace_trace_stack(struct ring_buffer *buffer,
 		trace.entries		= this_cpu_ptr(ftrace_stack.calls);
 		trace.max_entries	= FTRACE_STACK_MAX_ENTRIES;
 
-		if (regs)
+		if (ri)
+			kretprobe_save_stack_trace(ri, &trace);
+		else if (regs)
 			save_stack_trace_regs(regs, &trace);
 		else
 			save_stack_trace(&trace);
@@ -2653,7 +2657,10 @@  static void __ftrace_trace_stack(struct ring_buffer *buffer,
 	else {
 		trace.max_entries	= FTRACE_STACK_ENTRIES;
 		trace.entries		= entry->caller;
-		if (regs)
+
+		if (ri)
+			kretprobe_save_stack_trace(ri, &trace);
+		else if (regs)
 			save_stack_trace_regs(regs, &trace);
 		else
 			save_stack_trace(&trace);