Re: [RFC PATCH bpf-next 2/6] bpf: add bpf_get_stack helper

From: Alexei Starovoitov <ast@fb.com>
To: Yonghong Song <yhs@fb.com>, <daniel@iogearbox.net>,
	<netdev@vger.kernel.org>
Cc: <kernel-team@fb.com>
Subject: Re: [RFC PATCH bpf-next 2/6] bpf: add bpf_get_stack helper
Date: Sun, 8 Apr 2018 20:34:07 -0700	[thread overview]
Message-ID: <e532530c-d5ea-a889-6467-1d8dd2b4f098@fb.com> (raw)
In-Reply-To: <20180406214846.916265-3-yhs@fb.com>

On 4/6/18 2:48 PM, Yonghong Song wrote:
> Currently, stackmap and bpf_get_stackid helper are provided
> for bpf program to get the stack trace. This approach has
> a limitation though. If two stack traces have the same hash,
> only one will get stored in the stackmap table,
> so some stack traces are missing from user perspective.
>
> This patch implements a new helper, bpf_get_stack, will
> send stack traces directly to bpf program. The bpf program
> is able to see all stack traces, and then can do in-kernel
> processing or send stack traces to user space through
> shared map or bpf_perf_event_output.
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>  include/linux/bpf.h      |  1 +
>  include/linux/filter.h   |  3 ++-
>  include/uapi/linux/bpf.h | 17 +++++++++++++--
>  kernel/bpf/stackmap.c    | 56 ++++++++++++++++++++++++++++++++++++++++++++++++
>  kernel/bpf/syscall.c     | 12 ++++++++++-
>  kernel/bpf/verifier.c    |  3 +++
>  kernel/trace/bpf_trace.c | 50 +++++++++++++++++++++++++++++++++++++++++-
>  7 files changed, 137 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 95a7abd..72ccb9a 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -676,6 +676,7 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto;
>  extern const struct bpf_func_proto bpf_skb_vlan_push_proto;
>  extern const struct bpf_func_proto bpf_skb_vlan_pop_proto;
>  extern const struct bpf_func_proto bpf_get_stackid_proto;
> +extern const struct bpf_func_proto bpf_get_stack_proto;
>  extern const struct bpf_func_proto bpf_sock_map_update_proto;
>
>  /* Shared helpers among cBPF and eBPF. */
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index fc4e8f9..9b64f63 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -467,7 +467,8 @@ struct bpf_prog {
>  				dst_needed:1,	/* Do we need dst entry? */
>  				blinded:1,	/* Was blinded */
>  				is_func:1,	/* program is a bpf function */
> -				kprobe_override:1; /* Do we override a kprobe? */
> +				kprobe_override:1, /* Do we override a kprobe? */
> +				need_callchain_buf:1; /* Needs callchain buffer? */
>  	enum bpf_prog_type	type;		/* Type of BPF program */
>  	enum bpf_attach_type	expected_attach_type; /* For some prog types */
>  	u32			len;		/* Number of filter blocks */
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index c5ec897..a4ff5b7 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -517,6 +517,17 @@ union bpf_attr {
>   *             other bits - reserved
>   *     Return: >= 0 stackid on success or negative error
>   *
> + * int bpf_get_stack(ctx, buf, size, flags)
> + *     walk user or kernel stack and store the ips in buf
> + *     @ctx: struct pt_regs*
> + *     @buf: user buffer to fill stack
> + *     @size: the buf size
> + *     @flags: bits 0-7 - numer of stack frames to skip
> + *             bit 8 - collect user stack instead of kernel
> + *             bit 11 - get build-id as well if user stack
> + *             other bits - reserved
> + *     Return: >= 0 size copied on success or negative error
> + *
>   * s64 bpf_csum_diff(from, from_size, to, to_size, seed)
>   *     calculate csum diff
>   *     @from: raw from buffer
> @@ -821,7 +832,8 @@ union bpf_attr {
>  	FN(msg_apply_bytes),		\
>  	FN(msg_cork_bytes),		\
>  	FN(msg_pull_data),		\
> -	FN(bind),
> +	FN(bind),			\
> +	FN(get_stack),
>
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
> @@ -855,11 +867,12 @@ enum bpf_func_id {
>  /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
>  #define BPF_F_TUNINFO_IPV6		(1ULL << 0)
>
> -/* BPF_FUNC_get_stackid flags. */
> +/* BPF_FUNC_get_stackid and BPF_FUNC_get_stack flags. */
>  #define BPF_F_SKIP_FIELD_MASK		0xffULL
>  #define BPF_F_USER_STACK		(1ULL << 8)
>  #define BPF_F_FAST_STACK_CMP		(1ULL << 9)
>  #define BPF_F_REUSE_STACKID		(1ULL << 10)
> +#define BPF_F_USER_BUILD_ID		(1ULL << 11)

the comment above is not quite correct.
This new flag is only available for new helper.

>
>  /* BPF_FUNC_skb_set_tunnel_key flags. */
>  #define BPF_F_ZERO_CSUM_TX		(1ULL << 1)
> diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
> index 04f6ec1..371c72e 100644
> --- a/kernel/bpf/stackmap.c
> +++ b/kernel/bpf/stackmap.c
> @@ -402,6 +402,62 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
>  	.arg3_type	= ARG_ANYTHING,
>  };
>
> +BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
> +	   u64, flags)
> +{
> +	u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
> +	bool user_build_id = flags & BPF_F_USER_BUILD_ID;
> +	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
> +	bool user = flags & BPF_F_USER_STACK;
> +	struct perf_callchain_entry *trace;
> +	bool kernel = !user;
> +	u64 *ips;
> +
> +	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
> +			       BPF_F_USER_BUILD_ID)))
> +		return -EINVAL;
> +
> +	elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id)
> +					    : sizeof(u64);
> +	if (unlikely(size % elem_size))
> +		return -EINVAL;
> +
> +	num_elem = size / elem_size;
> +	if (sysctl_perf_event_max_stack < num_elem)
> +		init_nr = 0;

prog's buffer should be zero padded in this case since it
points to uninit_mem.

> +	else
> +		init_nr = sysctl_perf_event_max_stack - num_elem;
> +	trace = get_perf_callchain(regs, init_nr, kernel, user,
> +				   sysctl_perf_event_max_stack, false, false);
> +	if (unlikely(!trace))
> +		return -EFAULT;
> +
> +	trace_nr = trace->nr - init_nr;
> +	if (trace_nr <= skip)
> +		return -EFAULT;
> +
> +	trace_nr -= skip;
> +	trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
> +	copy_len = trace_nr * elem_size;
> +	ips = trace->ip + skip + init_nr;
> +	if (user && user_build_id)

the combination of kern + user_build_id should probably be rejected
earlier with einval.

> +		stack_map_get_build_id_offset(buf, ips, trace_nr, user);
> +	else
> +		memcpy(buf, ips, copy_len);
> +
> +	return copy_len;
> +}
> +
> +const struct bpf_func_proto bpf_get_stack_proto = {
> +	.func		= bpf_get_stack,
> +	.gpl_only	= true,
> +	.ret_type	= RET_INTEGER,
> +	.arg1_type	= ARG_PTR_TO_CTX,
> +	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
> +	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,

why allow zero size?
I'm not sure the helper will work correctly when size=0

> +	.arg4_type	= ARG_ANYTHING,
> +};
> +
>  /* Called from eBPF program */
>  static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
>  {
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 0244973..2aa3a65 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -984,10 +984,13 @@ void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
>  static void __bpf_prog_put_rcu(struct rcu_head *rcu)
>  {
>  	struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
> +	bool need_callchain_buf = aux->prog->need_callchain_buf;
>
>  	free_used_maps(aux);
>  	bpf_prog_uncharge_memlock(aux->prog);
>  	security_bpf_prog_free(aux);
> +	if (need_callchain_buf)
> +		put_callchain_buffers();
>  	bpf_prog_free(aux->prog);
>  }
>
> @@ -1004,7 +1007,8 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
>  			bpf_prog_kallsyms_del(prog->aux->func[i]);
>  		bpf_prog_kallsyms_del(prog);
>
> -		call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
> +		synchronize_rcu();
> +		__bpf_prog_put_rcu(&prog->aux->rcu);

there should have been lockdep splat.
We cannot call synchronize_rcu here, since we cannot sleep
in some cases.