linux-kernel.vger.kernel.org archive mirror
* [PATCH bpf v2] bpf: preallocate a perf_sample_data per event fd
@ 2019-05-31 22:37 Matt Mullins
  2019-06-01  1:27 ` Song Liu
  2019-06-03 13:08 ` Daniel Borkmann
  0 siblings, 2 replies; 17+ messages in thread
From: Matt Mullins @ 2019-05-31 22:37 UTC (permalink / raw)
  To: hall, mmullins, ast, bpf, netdev
  Cc: linux-kernel, Daniel Borkmann, Martin KaFai Lau, Song Liu,
	Yonghong Song, Steven Rostedt, Ingo Molnar

It is possible that a BPF program can be called while another BPF
program is executing bpf_perf_event_output.  This has been observed with
I/O completion occurring as a result of an interrupt:

	bpf_prog_247fd1341cddaea4_trace_req_end+0x8d7/0x1000
	? trace_call_bpf+0x82/0x100
	? sch_direct_xmit+0xe2/0x230
	? blk_mq_end_request+0x1/0x100
	? blk_mq_end_request+0x5/0x100
	? kprobe_perf_func+0x19b/0x240
	? __qdisc_run+0x86/0x520
	? blk_mq_end_request+0x1/0x100
	? blk_mq_end_request+0x5/0x100
	? kprobe_ftrace_handler+0x90/0xf0
	? ftrace_ops_assist_func+0x6e/0xe0
	? ip6_input_finish+0xbf/0x460
	? 0xffffffffa01e80bf
	? nbd_dbg_flags_show+0xc0/0xc0 [nbd]
	? blkdev_issue_zeroout+0x200/0x200
	? blk_mq_end_request+0x1/0x100
	? blk_mq_end_request+0x5/0x100
	? flush_smp_call_function_queue+0x6c/0xe0
	? smp_call_function_single_interrupt+0x32/0xc0
	? call_function_single_interrupt+0xf/0x20
	? call_function_single_interrupt+0xa/0x20
	? swiotlb_map_page+0x140/0x140
	? refcount_sub_and_test+0x1a/0x50
	? tcp_wfree+0x20/0xf0
	? skb_release_head_state+0x62/0xc0
	? skb_release_all+0xe/0x30
	? napi_consume_skb+0xb5/0x100
	? mlx5e_poll_tx_cq+0x1df/0x4e0
	? mlx5e_poll_tx_cq+0x38c/0x4e0
	? mlx5e_napi_poll+0x58/0xc30
	? mlx5e_napi_poll+0x232/0xc30
	? net_rx_action+0x128/0x340
	? __do_softirq+0xd4/0x2ad
	? irq_exit+0xa5/0xb0
	? do_IRQ+0x7d/0xc0
	? common_interrupt+0xf/0xf
	</IRQ>
	? __rb_free_aux+0xf0/0xf0
	? perf_output_sample+0x28/0x7b0
	? perf_prepare_sample+0x54/0x4a0
	? perf_event_output+0x43/0x60
	? bpf_perf_event_output_raw_tp+0x15f/0x180
	? blk_mq_start_request+0x1/0x120
	? bpf_prog_411a64a706fc6044_should_trace+0xad4/0x1000
	? bpf_trace_run3+0x2c/0x80
	? nbd_send_cmd+0x4c2/0x690 [nbd]

This also cannot be alleviated by further splitting the per-cpu
perf_sample_data structs (as in commit 283ca526a9bd ("bpf: fix
corruption on concurrent perf_event_output calls")), as a raw_tp could
be attached to the block:block_rq_complete tracepoint and execute during
another raw_tp.  Instead, keep a pre-allocated perf_sample_data
structure per perf_event_array element and fail a bpf_perf_event_output
if that element is concurrently being used.

Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data")
Signed-off-by: Matt Mullins <mmullins@fb.com>
---
v1->v2:
	keep a pointer to the struct perf_sample_data rather than directly
	embedding it in the structure, avoiding the circular include and
	removing the need for in_use.  Suggested by Song.

 include/linux/bpf.h      |  1 +
 kernel/bpf/arraymap.c    |  3 ++-
 kernel/trace/bpf_trace.c | 29 ++++++++++++++++-------------
 3 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4fb3aa2dc975..47fd85cfbbaf 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -472,6 +472,7 @@ struct bpf_event_entry {
 	struct file *perf_file;
 	struct file *map_file;
 	struct rcu_head rcu;
+	struct perf_sample_data *sd;
 };
 
 bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 584636c9e2eb..c7f5d593e04f 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -654,11 +654,12 @@ static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
 {
 	struct bpf_event_entry *ee;
 
-	ee = kzalloc(sizeof(*ee), GFP_ATOMIC);
+	ee = kzalloc(sizeof(*ee) + sizeof(struct perf_sample_data), GFP_ATOMIC);
 	if (ee) {
 		ee->event = perf_file->private_data;
 		ee->perf_file = perf_file;
 		ee->map_file = map_file;
+		ee->sd = (void *)ee + sizeof(*ee);
 	}
 
 	return ee;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index f92d6ad5e080..076f8e987355 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -410,17 +410,17 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
 	.arg4_type	= ARG_CONST_SIZE,
 };
 
-static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd);
-
 static __always_inline u64
 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
-			u64 flags, struct perf_sample_data *sd)
+			u64 flags, struct perf_raw_record *raw)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	unsigned int cpu = smp_processor_id();
 	u64 index = flags & BPF_F_INDEX_MASK;
 	struct bpf_event_entry *ee;
 	struct perf_event *event;
+	struct perf_sample_data *sd;
+	u64 ret;
 
 	if (index == BPF_F_CURRENT_CPU)
 		index = cpu;
@@ -439,13 +439,22 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
 	if (unlikely(event->oncpu != cpu))
 		return -EOPNOTSUPP;
 
-	return perf_event_output(event, sd, regs);
+	sd = xchg(&ee->sd, NULL);
+	if (!sd)
+		return -EBUSY;
+
+	perf_sample_data_init(sd, 0, 0);
+	sd->raw = raw;
+
+	ret = perf_event_output(event, sd, regs);
+
+	xchg(&ee->sd, sd);
+	return ret;
 }
 
 BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
 	   u64, flags, void *, data, u64, size)
 {
-	struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd);
 	struct perf_raw_record raw = {
 		.frag = {
 			.size = size,
@@ -456,10 +465,8 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
 	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
 		return -EINVAL;
 
-	perf_sample_data_init(sd, 0, 0);
-	sd->raw = &raw;
 
-	return __bpf_perf_event_output(regs, map, flags, sd);
+	return __bpf_perf_event_output(regs, map, flags, &raw);
 }
 
 static const struct bpf_func_proto bpf_perf_event_output_proto = {
@@ -474,12 +481,10 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
 };
 
 static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
-static DEFINE_PER_CPU(struct perf_sample_data, bpf_misc_sd);
 
 u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
 		     void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
 {
-	struct perf_sample_data *sd = this_cpu_ptr(&bpf_misc_sd);
 	struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs);
 	struct perf_raw_frag frag = {
 		.copy		= ctx_copy,
@@ -497,10 +502,8 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
 	};
 
 	perf_fetch_caller_regs(regs);
-	perf_sample_data_init(sd, 0, 0);
-	sd->raw = &raw;
 
-	return __bpf_perf_event_output(regs, map, flags, sd);
+	return __bpf_perf_event_output(regs, map, flags, &raw);
 }
 
 BPF_CALL_0(bpf_get_current_task)
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH bpf v2] bpf: preallocate a perf_sample_data per event fd
  2019-05-31 22:37 [PATCH bpf v2] bpf: preallocate a perf_sample_data per event fd Matt Mullins
@ 2019-06-01  1:27 ` Song Liu
  2019-06-01  3:12   ` Alexei Starovoitov
  2019-06-03 13:08 ` Daniel Borkmann
  1 sibling, 1 reply; 17+ messages in thread
From: Song Liu @ 2019-06-01  1:27 UTC (permalink / raw)
  To: Matt Mullins
  Cc: Andrew Hall, Alexei Starovoitov, bpf, Networking, LKML,
	Daniel Borkmann, Martin Lau, Yonghong Song, Steven Rostedt,
	Ingo Molnar



> On May 31, 2019, at 3:37 PM, Matt Mullins <mmullins@fb.com> wrote:
> 
> It is possible that a BPF program can be called while another BPF
> program is executing bpf_perf_event_output.  This has been observed with
> I/O completion occurring as a result of an interrupt:
> 
> 	bpf_prog_247fd1341cddaea4_trace_req_end+0x8d7/0x1000
> 	? trace_call_bpf+0x82/0x100
> 	? sch_direct_xmit+0xe2/0x230
> 	? blk_mq_end_request+0x1/0x100
> 	? blk_mq_end_request+0x5/0x100
> 	? kprobe_perf_func+0x19b/0x240
> 	? __qdisc_run+0x86/0x520
> 	? blk_mq_end_request+0x1/0x100
> 	? blk_mq_end_request+0x5/0x100
> 	? kprobe_ftrace_handler+0x90/0xf0
> 	? ftrace_ops_assist_func+0x6e/0xe0
> 	? ip6_input_finish+0xbf/0x460
> 	? 0xffffffffa01e80bf
> 	? nbd_dbg_flags_show+0xc0/0xc0 [nbd]
> 	? blkdev_issue_zeroout+0x200/0x200
> 	? blk_mq_end_request+0x1/0x100
> 	? blk_mq_end_request+0x5/0x100
> 	? flush_smp_call_function_queue+0x6c/0xe0
> 	? smp_call_function_single_interrupt+0x32/0xc0
> 	? call_function_single_interrupt+0xf/0x20
> 	? call_function_single_interrupt+0xa/0x20
> 	? swiotlb_map_page+0x140/0x140
> 	? refcount_sub_and_test+0x1a/0x50
> 	? tcp_wfree+0x20/0xf0
> 	? skb_release_head_state+0x62/0xc0
> 	? skb_release_all+0xe/0x30
> 	? napi_consume_skb+0xb5/0x100
> 	? mlx5e_poll_tx_cq+0x1df/0x4e0
> 	? mlx5e_poll_tx_cq+0x38c/0x4e0
> 	? mlx5e_napi_poll+0x58/0xc30
> 	? mlx5e_napi_poll+0x232/0xc30
> 	? net_rx_action+0x128/0x340
> 	? __do_softirq+0xd4/0x2ad
> 	? irq_exit+0xa5/0xb0
> 	? do_IRQ+0x7d/0xc0
> 	? common_interrupt+0xf/0xf
> 	</IRQ>
> 	? __rb_free_aux+0xf0/0xf0
> 	? perf_output_sample+0x28/0x7b0
> 	? perf_prepare_sample+0x54/0x4a0
> 	? perf_event_output+0x43/0x60
> 	? bpf_perf_event_output_raw_tp+0x15f/0x180
> 	? blk_mq_start_request+0x1/0x120
> 	? bpf_prog_411a64a706fc6044_should_trace+0xad4/0x1000
> 	? bpf_trace_run3+0x2c/0x80
> 	? nbd_send_cmd+0x4c2/0x690 [nbd]
> 
> This also cannot be alleviated by further splitting the per-cpu
> perf_sample_data structs (as in commit 283ca526a9bd ("bpf: fix
> corruption on concurrent perf_event_output calls")), as a raw_tp could
> be attached to the block:block_rq_complete tracepoint and execute during
> another raw_tp.  Instead, keep a pre-allocated perf_sample_data
> structure per perf_event_array element and fail a bpf_perf_event_output
> if that element is concurrently being used.
> 
> Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data")
> Signed-off-by: Matt Mullins <mmullins@fb.com>

This looks great. Thanks for the fix. 

Acked-by: Song Liu <songliubraving@fb.com>

> ---
> v1->v2:
> 	keep a pointer to the struct perf_sample_data rather than directly
> 	embedding it in the structure, avoiding the circular include and
> 	removing the need for in_use.  Suggested by Song.
> 
> include/linux/bpf.h      |  1 +
> kernel/bpf/arraymap.c    |  3 ++-
> kernel/trace/bpf_trace.c | 29 ++++++++++++++++-------------
> 3 files changed, 19 insertions(+), 14 deletions(-)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 4fb3aa2dc975..47fd85cfbbaf 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -472,6 +472,7 @@ struct bpf_event_entry {
> 	struct file *perf_file;
> 	struct file *map_file;
> 	struct rcu_head rcu;
> +	struct perf_sample_data *sd;
> };
> 
> bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
> diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
> index 584636c9e2eb..c7f5d593e04f 100644
> --- a/kernel/bpf/arraymap.c
> +++ b/kernel/bpf/arraymap.c
> @@ -654,11 +654,12 @@ static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
> {
> 	struct bpf_event_entry *ee;
> 
> -	ee = kzalloc(sizeof(*ee), GFP_ATOMIC);
> +	ee = kzalloc(sizeof(*ee) + sizeof(struct perf_sample_data), GFP_ATOMIC);
> 	if (ee) {
> 		ee->event = perf_file->private_data;
> 		ee->perf_file = perf_file;
> 		ee->map_file = map_file;
> +		ee->sd = (void *)ee + sizeof(*ee);
> 	}
> 
> 	return ee;
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index f92d6ad5e080..076f8e987355 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -410,17 +410,17 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
> 	.arg4_type	= ARG_CONST_SIZE,
> };
> 
> -static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd);
> -
> static __always_inline u64
> __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
> -			u64 flags, struct perf_sample_data *sd)
> +			u64 flags, struct perf_raw_record *raw)
> {
> 	struct bpf_array *array = container_of(map, struct bpf_array, map);
> 	unsigned int cpu = smp_processor_id();
> 	u64 index = flags & BPF_F_INDEX_MASK;
> 	struct bpf_event_entry *ee;
> 	struct perf_event *event;
> +	struct perf_sample_data *sd;
> +	u64 ret;
> 
> 	if (index == BPF_F_CURRENT_CPU)
> 		index = cpu;
> @@ -439,13 +439,22 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
> 	if (unlikely(event->oncpu != cpu))
> 		return -EOPNOTSUPP;
> 
> -	return perf_event_output(event, sd, regs);
> +	sd = xchg(&ee->sd, NULL);
> +	if (!sd)
> +		return -EBUSY;
> +
> +	perf_sample_data_init(sd, 0, 0);
> +	sd->raw = raw;
> +
> +	ret = perf_event_output(event, sd, regs);
> +
> +	xchg(&ee->sd, sd);
> +	return ret;
> }
> 
> BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
> 	   u64, flags, void *, data, u64, size)
> {
> -	struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd);
> 	struct perf_raw_record raw = {
> 		.frag = {
> 			.size = size,
> @@ -456,10 +465,8 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
> 	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
> 		return -EINVAL;
> 
> -	perf_sample_data_init(sd, 0, 0);
> -	sd->raw = &raw;
> 
> -	return __bpf_perf_event_output(regs, map, flags, sd);
> +	return __bpf_perf_event_output(regs, map, flags, &raw);
> }
> 
> static const struct bpf_func_proto bpf_perf_event_output_proto = {
> @@ -474,12 +481,10 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
> };
> 
> static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
> -static DEFINE_PER_CPU(struct perf_sample_data, bpf_misc_sd);
> 
> u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
> 		     void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
> {
> -	struct perf_sample_data *sd = this_cpu_ptr(&bpf_misc_sd);
> 	struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs);
> 	struct perf_raw_frag frag = {
> 		.copy		= ctx_copy,
> @@ -497,10 +502,8 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
> 	};
> 
> 	perf_fetch_caller_regs(regs);
> -	perf_sample_data_init(sd, 0, 0);
> -	sd->raw = &raw;
> 
> -	return __bpf_perf_event_output(regs, map, flags, sd);
> +	return __bpf_perf_event_output(regs, map, flags, &raw);
> }
> 
> BPF_CALL_0(bpf_get_current_task)
> -- 
> 2.17.1
> 


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH bpf v2] bpf: preallocate a perf_sample_data per event fd
  2019-06-01  1:27 ` Song Liu
@ 2019-06-01  3:12   ` Alexei Starovoitov
  0 siblings, 0 replies; 17+ messages in thread
From: Alexei Starovoitov @ 2019-06-01  3:12 UTC (permalink / raw)
  To: Song Liu
  Cc: Matt Mullins, Andrew Hall, Alexei Starovoitov, bpf, Networking,
	LKML, Daniel Borkmann, Martin Lau, Yonghong Song, Steven Rostedt,
	Ingo Molnar

On Fri, May 31, 2019 at 6:28 PM Song Liu <songliubraving@fb.com> wrote:
>
>
>
> > On May 31, 2019, at 3:37 PM, Matt Mullins <mmullins@fb.com> wrote:
> >
> > It is possible that a BPF program can be called while another BPF
> > program is executing bpf_perf_event_output.  This has been observed with
> > I/O completion occurring as a result of an interrupt:
> >
> >       bpf_prog_247fd1341cddaea4_trace_req_end+0x8d7/0x1000
> >       ? trace_call_bpf+0x82/0x100
> >       ? sch_direct_xmit+0xe2/0x230
> >       ? blk_mq_end_request+0x1/0x100
> >       ? blk_mq_end_request+0x5/0x100
> >       ? kprobe_perf_func+0x19b/0x240
> >       ? __qdisc_run+0x86/0x520
> >       ? blk_mq_end_request+0x1/0x100
> >       ? blk_mq_end_request+0x5/0x100
> >       ? kprobe_ftrace_handler+0x90/0xf0
> >       ? ftrace_ops_assist_func+0x6e/0xe0
> >       ? ip6_input_finish+0xbf/0x460
> >       ? 0xffffffffa01e80bf
> >       ? nbd_dbg_flags_show+0xc0/0xc0 [nbd]
> >       ? blkdev_issue_zeroout+0x200/0x200
> >       ? blk_mq_end_request+0x1/0x100
> >       ? blk_mq_end_request+0x5/0x100
> >       ? flush_smp_call_function_queue+0x6c/0xe0
> >       ? smp_call_function_single_interrupt+0x32/0xc0
> >       ? call_function_single_interrupt+0xf/0x20
> >       ? call_function_single_interrupt+0xa/0x20
> >       ? swiotlb_map_page+0x140/0x140
> >       ? refcount_sub_and_test+0x1a/0x50
> >       ? tcp_wfree+0x20/0xf0
> >       ? skb_release_head_state+0x62/0xc0
> >       ? skb_release_all+0xe/0x30
> >       ? napi_consume_skb+0xb5/0x100
> >       ? mlx5e_poll_tx_cq+0x1df/0x4e0
> >       ? mlx5e_poll_tx_cq+0x38c/0x4e0
> >       ? mlx5e_napi_poll+0x58/0xc30
> >       ? mlx5e_napi_poll+0x232/0xc30
> >       ? net_rx_action+0x128/0x340
> >       ? __do_softirq+0xd4/0x2ad
> >       ? irq_exit+0xa5/0xb0
> >       ? do_IRQ+0x7d/0xc0
> >       ? common_interrupt+0xf/0xf
> >       </IRQ>
> >       ? __rb_free_aux+0xf0/0xf0
> >       ? perf_output_sample+0x28/0x7b0
> >       ? perf_prepare_sample+0x54/0x4a0
> >       ? perf_event_output+0x43/0x60
> >       ? bpf_perf_event_output_raw_tp+0x15f/0x180
> >       ? blk_mq_start_request+0x1/0x120
> >       ? bpf_prog_411a64a706fc6044_should_trace+0xad4/0x1000
> >       ? bpf_trace_run3+0x2c/0x80
> >       ? nbd_send_cmd+0x4c2/0x690 [nbd]
> >
> > This also cannot be alleviated by further splitting the per-cpu
> > perf_sample_data structs (as in commit 283ca526a9bd ("bpf: fix
> > corruption on concurrent perf_event_output calls")), as a raw_tp could
> > be attached to the block:block_rq_complete tracepoint and execute during
> > another raw_tp.  Instead, keep a pre-allocated perf_sample_data
> > structure per perf_event_array element and fail a bpf_perf_event_output
> > if that element is concurrently being used.
> >
> > Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data")
> > Signed-off-by: Matt Mullins <mmullins@fb.com>
>
> This looks great. Thanks for the fix.
>
> Acked-by: Song Liu <songliubraving@fb.com>
>
> > ---
> > v1->v2:
> >       keep a pointer to the struct perf_sample_data rather than directly
> >       embedding it in the structure, avoiding the circular include and
> >       removing the need for in_use.  Suggested by Song.
> >
> > include/linux/bpf.h      |  1 +
> > kernel/bpf/arraymap.c    |  3 ++-
> > kernel/trace/bpf_trace.c | 29 ++++++++++++++++-------------
> > 3 files changed, 19 insertions(+), 14 deletions(-)
> >
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index 4fb3aa2dc975..47fd85cfbbaf 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -472,6 +472,7 @@ struct bpf_event_entry {
> >       struct file *perf_file;
> >       struct file *map_file;
> >       struct rcu_head rcu;
> > +     struct perf_sample_data *sd;
> > };
> >
> > bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
> > diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
> > index 584636c9e2eb..c7f5d593e04f 100644
> > --- a/kernel/bpf/arraymap.c
> > +++ b/kernel/bpf/arraymap.c
> > @@ -654,11 +654,12 @@ static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
> > {
> >       struct bpf_event_entry *ee;
> >
> > -     ee = kzalloc(sizeof(*ee), GFP_ATOMIC);
> > +     ee = kzalloc(sizeof(*ee) + sizeof(struct perf_sample_data), GFP_ATOMIC);
> >       if (ee) {
> >               ee->event = perf_file->private_data;
> >               ee->perf_file = perf_file;
> >               ee->map_file = map_file;
> > +             ee->sd = (void *)ee + sizeof(*ee);

This bit looks quite weird, but I don't have better ideas
to avoid circular .h pain.

Applied to bpf tree. Thanks

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH bpf v2] bpf: preallocate a perf_sample_data per event fd
  2019-05-31 22:37 [PATCH bpf v2] bpf: preallocate a perf_sample_data per event fd Matt Mullins
  2019-06-01  1:27 ` Song Liu
@ 2019-06-03 13:08 ` Daniel Borkmann
  2019-06-03 13:22   ` Daniel Borkmann
  1 sibling, 1 reply; 17+ messages in thread
From: Daniel Borkmann @ 2019-06-03 13:08 UTC (permalink / raw)
  To: Matt Mullins, hall, ast, bpf, netdev
  Cc: linux-kernel, Martin KaFai Lau, Song Liu, Yonghong Song,
	Steven Rostedt, Ingo Molnar

On 06/01/2019 12:37 AM, Matt Mullins wrote:
> It is possible that a BPF program can be called while another BPF
> program is executing bpf_perf_event_output.  This has been observed with
> I/O completion occurring as a result of an interrupt:
> 
> 	bpf_prog_247fd1341cddaea4_trace_req_end+0x8d7/0x1000
> 	? trace_call_bpf+0x82/0x100
> 	? sch_direct_xmit+0xe2/0x230
> 	? blk_mq_end_request+0x1/0x100
> 	? blk_mq_end_request+0x5/0x100
> 	? kprobe_perf_func+0x19b/0x240
> 	? __qdisc_run+0x86/0x520
> 	? blk_mq_end_request+0x1/0x100
> 	? blk_mq_end_request+0x5/0x100
> 	? kprobe_ftrace_handler+0x90/0xf0
> 	? ftrace_ops_assist_func+0x6e/0xe0
> 	? ip6_input_finish+0xbf/0x460
> 	? 0xffffffffa01e80bf
> 	? nbd_dbg_flags_show+0xc0/0xc0 [nbd]
> 	? blkdev_issue_zeroout+0x200/0x200
> 	? blk_mq_end_request+0x1/0x100
> 	? blk_mq_end_request+0x5/0x100
> 	? flush_smp_call_function_queue+0x6c/0xe0
> 	? smp_call_function_single_interrupt+0x32/0xc0
> 	? call_function_single_interrupt+0xf/0x20
> 	? call_function_single_interrupt+0xa/0x20
> 	? swiotlb_map_page+0x140/0x140
> 	? refcount_sub_and_test+0x1a/0x50
> 	? tcp_wfree+0x20/0xf0
> 	? skb_release_head_state+0x62/0xc0
> 	? skb_release_all+0xe/0x30
> 	? napi_consume_skb+0xb5/0x100
> 	? mlx5e_poll_tx_cq+0x1df/0x4e0
> 	? mlx5e_poll_tx_cq+0x38c/0x4e0
> 	? mlx5e_napi_poll+0x58/0xc30
> 	? mlx5e_napi_poll+0x232/0xc30
> 	? net_rx_action+0x128/0x340
> 	? __do_softirq+0xd4/0x2ad
> 	? irq_exit+0xa5/0xb0
> 	? do_IRQ+0x7d/0xc0
> 	? common_interrupt+0xf/0xf
> 	</IRQ>
> 	? __rb_free_aux+0xf0/0xf0
> 	? perf_output_sample+0x28/0x7b0
> 	? perf_prepare_sample+0x54/0x4a0
> 	? perf_event_output+0x43/0x60
> 	? bpf_perf_event_output_raw_tp+0x15f/0x180
> 	? blk_mq_start_request+0x1/0x120
> 	? bpf_prog_411a64a706fc6044_should_trace+0xad4/0x1000
> 	? bpf_trace_run3+0x2c/0x80
> 	? nbd_send_cmd+0x4c2/0x690 [nbd]
> 
> This also cannot be alleviated by further splitting the per-cpu
> perf_sample_data structs (as in commit 283ca526a9bd ("bpf: fix
> corruption on concurrent perf_event_output calls")), as a raw_tp could
> be attached to the block:block_rq_complete tracepoint and execute during
> another raw_tp.  Instead, keep a pre-allocated perf_sample_data
> structure per perf_event_array element and fail a bpf_perf_event_output
> if that element is concurrently being used.
> 
> Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data")
> Signed-off-by: Matt Mullins <mmullins@fb.com>

You do not elaborate why this is needed for all the networking programs that
use this functionality. The bpf_misc_sd should therefore be kept as-is. There
cannot be nested occurrences there (xdp, tc ingress/egress). Please explain why
non-tracing should be affected here...

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH bpf v2] bpf: preallocate a perf_sample_data per event fd
  2019-06-03 13:08 ` Daniel Borkmann
@ 2019-06-03 13:22   ` Daniel Borkmann
  2019-06-03 22:58     ` Matt Mullins
  0 siblings, 1 reply; 17+ messages in thread
From: Daniel Borkmann @ 2019-06-03 13:22 UTC (permalink / raw)
  To: Matt Mullins, hall, ast, bpf, netdev
  Cc: linux-kernel, Martin KaFai Lau, Song Liu, Yonghong Song,
	Steven Rostedt, Ingo Molnar

On 06/03/2019 03:08 PM, Daniel Borkmann wrote:
> On 06/01/2019 12:37 AM, Matt Mullins wrote:
>> It is possible that a BPF program can be called while another BPF
>> program is executing bpf_perf_event_output.  This has been observed with
>> I/O completion occurring as a result of an interrupt:
>>
>> 	bpf_prog_247fd1341cddaea4_trace_req_end+0x8d7/0x1000
>> 	? trace_call_bpf+0x82/0x100
>> 	? sch_direct_xmit+0xe2/0x230
>> 	? blk_mq_end_request+0x1/0x100
>> 	? blk_mq_end_request+0x5/0x100
>> 	? kprobe_perf_func+0x19b/0x240
>> 	? __qdisc_run+0x86/0x520
>> 	? blk_mq_end_request+0x1/0x100
>> 	? blk_mq_end_request+0x5/0x100
>> 	? kprobe_ftrace_handler+0x90/0xf0
>> 	? ftrace_ops_assist_func+0x6e/0xe0
>> 	? ip6_input_finish+0xbf/0x460
>> 	? 0xffffffffa01e80bf
>> 	? nbd_dbg_flags_show+0xc0/0xc0 [nbd]
>> 	? blkdev_issue_zeroout+0x200/0x200
>> 	? blk_mq_end_request+0x1/0x100
>> 	? blk_mq_end_request+0x5/0x100
>> 	? flush_smp_call_function_queue+0x6c/0xe0
>> 	? smp_call_function_single_interrupt+0x32/0xc0
>> 	? call_function_single_interrupt+0xf/0x20
>> 	? call_function_single_interrupt+0xa/0x20
>> 	? swiotlb_map_page+0x140/0x140
>> 	? refcount_sub_and_test+0x1a/0x50
>> 	? tcp_wfree+0x20/0xf0
>> 	? skb_release_head_state+0x62/0xc0
>> 	? skb_release_all+0xe/0x30
>> 	? napi_consume_skb+0xb5/0x100
>> 	? mlx5e_poll_tx_cq+0x1df/0x4e0
>> 	? mlx5e_poll_tx_cq+0x38c/0x4e0
>> 	? mlx5e_napi_poll+0x58/0xc30
>> 	? mlx5e_napi_poll+0x232/0xc30
>> 	? net_rx_action+0x128/0x340
>> 	? __do_softirq+0xd4/0x2ad
>> 	? irq_exit+0xa5/0xb0
>> 	? do_IRQ+0x7d/0xc0
>> 	? common_interrupt+0xf/0xf
>> 	</IRQ>
>> 	? __rb_free_aux+0xf0/0xf0
>> 	? perf_output_sample+0x28/0x7b0
>> 	? perf_prepare_sample+0x54/0x4a0
>> 	? perf_event_output+0x43/0x60
>> 	? bpf_perf_event_output_raw_tp+0x15f/0x180
>> 	? blk_mq_start_request+0x1/0x120
>> 	? bpf_prog_411a64a706fc6044_should_trace+0xad4/0x1000
>> 	? bpf_trace_run3+0x2c/0x80
>> 	? nbd_send_cmd+0x4c2/0x690 [nbd]
>>
>> This also cannot be alleviated by further splitting the per-cpu
>> perf_sample_data structs (as in commit 283ca526a9bd ("bpf: fix
>> corruption on concurrent perf_event_output calls")), as a raw_tp could
>> be attached to the block:block_rq_complete tracepoint and execute during
>> another raw_tp.  Instead, keep a pre-allocated perf_sample_data
>> structure per perf_event_array element and fail a bpf_perf_event_output
>> if that element is concurrently being used.
>>
>> Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data")
>> Signed-off-by: Matt Mullins <mmullins@fb.com>
> 
> You do not elaborate why this is needed for all the networking programs that
> use this functionality. The bpf_misc_sd should therefore be kept as-is. There
> cannot be nested occurrences there (xdp, tc ingress/egress). Please explain why
> non-tracing should be affected here...

Aside from that it's also really bad to miss events like this as exporting
through rb is critical. Why can't you have a per-CPU counter that selects a
sample data context based on nesting level in tracing? (I don't see a discussion
of this in your commit message.)

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH bpf v2] bpf: preallocate a perf_sample_data per event fd
  2019-06-03 13:22   ` Daniel Borkmann
@ 2019-06-03 22:58     ` Matt Mullins
  2019-06-03 23:27       ` Alexei Starovoitov
  0 siblings, 1 reply; 17+ messages in thread
From: Matt Mullins @ 2019-06-03 22:58 UTC (permalink / raw)
  To: daniel, netdev, Andrew Hall, bpf, ast
  Cc: linux-kernel, Martin Lau, Yonghong Song, rostedt, mingo, Song Liu

On Mon, 2019-06-03 at 15:22 +0200, Daniel Borkmann wrote:
> On 06/03/2019 03:08 PM, Daniel Borkmann wrote:
> > On 06/01/2019 12:37 AM, Matt Mullins wrote:
> > > It is possible that a BPF program can be called while another BPF
> > > program is executing bpf_perf_event_output.  This has been observed with
> > > I/O completion occurring as a result of an interrupt:
> > > 
> > > 	bpf_prog_247fd1341cddaea4_trace_req_end+0x8d7/0x1000
> > > 	? trace_call_bpf+0x82/0x100
> > > 	? sch_direct_xmit+0xe2/0x230
> > > 	? blk_mq_end_request+0x1/0x100
> > > 	? blk_mq_end_request+0x5/0x100
> > > 	? kprobe_perf_func+0x19b/0x240
> > > 	? __qdisc_run+0x86/0x520
> > > 	? blk_mq_end_request+0x1/0x100
> > > 	? blk_mq_end_request+0x5/0x100
> > > 	? kprobe_ftrace_handler+0x90/0xf0
> > > 	? ftrace_ops_assist_func+0x6e/0xe0
> > > 	? ip6_input_finish+0xbf/0x460
> > > 	? 0xffffffffa01e80bf
> > > 	? nbd_dbg_flags_show+0xc0/0xc0 [nbd]
> > > 	? blkdev_issue_zeroout+0x200/0x200
> > > 	? blk_mq_end_request+0x1/0x100
> > > 	? blk_mq_end_request+0x5/0x100
> > > 	? flush_smp_call_function_queue+0x6c/0xe0
> > > 	? smp_call_function_single_interrupt+0x32/0xc0
> > > 	? call_function_single_interrupt+0xf/0x20
> > > 	? call_function_single_interrupt+0xa/0x20
> > > 	? swiotlb_map_page+0x140/0x140
> > > 	? refcount_sub_and_test+0x1a/0x50
> > > 	? tcp_wfree+0x20/0xf0
> > > 	? skb_release_head_state+0x62/0xc0
> > > 	? skb_release_all+0xe/0x30
> > > 	? napi_consume_skb+0xb5/0x100
> > > 	? mlx5e_poll_tx_cq+0x1df/0x4e0
> > > 	? mlx5e_poll_tx_cq+0x38c/0x4e0
> > > 	? mlx5e_napi_poll+0x58/0xc30
> > > 	? mlx5e_napi_poll+0x232/0xc30
> > > 	? net_rx_action+0x128/0x340
> > > 	? __do_softirq+0xd4/0x2ad
> > > 	? irq_exit+0xa5/0xb0
> > > 	? do_IRQ+0x7d/0xc0
> > > 	? common_interrupt+0xf/0xf
> > > 	</IRQ>
> > > 	? __rb_free_aux+0xf0/0xf0
> > > 	? perf_output_sample+0x28/0x7b0
> > > 	? perf_prepare_sample+0x54/0x4a0
> > > 	? perf_event_output+0x43/0x60
> > > 	? bpf_perf_event_output_raw_tp+0x15f/0x180
> > > 	? blk_mq_start_request+0x1/0x120
> > > 	? bpf_prog_411a64a706fc6044_should_trace+0xad4/0x1000
> > > 	? bpf_trace_run3+0x2c/0x80
> > > 	? nbd_send_cmd+0x4c2/0x690 [nbd]
> > > 
> > > This also cannot be alleviated by further splitting the per-cpu
> > > perf_sample_data structs (as in commit 283ca526a9bd ("bpf: fix
> > > corruption on concurrent perf_event_output calls")), as a raw_tp could
> > > be attached to the block:block_rq_complete tracepoint and execute during
> > > another raw_tp.  Instead, keep a pre-allocated perf_sample_data
> > > structure per perf_event_array element and fail a bpf_perf_event_output
> > > if that element is concurrently being used.
> > > 
> > > Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data")
> > > Signed-off-by: Matt Mullins <mmullins@fb.com>
> > 
> > You do not elaborate why this is needed for all the networking programs that
> > use this functionality. The bpf_misc_sd should therefore be kept as-is. There
> > cannot be nested occurrences there (xdp, tc ingress/egress). Please explain why
> > non-tracing should be affected here...

If these are invariably non-nested, I can easily keep bpf_misc_sd when
I resubmit.  There was no technical reason other than keeping the two
codepaths as similar as possible.

What resource gives you worry about doing this for the networking
codepath?

> Aside from that it's also really bad to miss events like this as exporting
> through rb is critical. Why can't you have a per-CPU counter that selects a
> sample data context based on nesting level in tracing? (I don't see a discussion
> of this in your commit message.)

This change would only drop messages if the same perf_event is
attempted to be used recursively (i.e. the same CPU on the same
PERF_EVENT_ARRAY map, as I haven't observed anything use index !=
BPF_F_CURRENT_CPU in testing).

I'll try to accomplish the same with a percpu nesting level and
allocating 2 or 3 perf_sample_data per cpu.  I think that'll solve the
same problem -- a local patch keeping track of the nesting level is how
I got the above stack trace, too.
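
Roughly, I'm thinking of something like this (untested sketch; the exact
depth and the names are placeholders):

struct bpf_trace_sample_data {
	struct perf_sample_data sds[3];
};
static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds);
static DEFINE_PER_CPU(int, bpf_trace_nest_level);

BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
	   u64, flags, void *, data, u64, size)
{
	struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds);
	int nest_level = this_cpu_inc_return(bpf_trace_nest_level);
	struct perf_sample_data *sd;
	int err = -EBUSY;

	/* deeper than normal + irq + nmi should not happen */
	if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds)))
		goto out;

	sd = &sds->sds[nest_level - 1];
	/* ... build the perf_raw_record from data/size,
	 * perf_sample_data_init(sd, 0, 0), sd->raw = &raw, then
	 * err = __bpf_perf_event_output(regs, map, flags, sd) as today ...
	 */
out:
	this_cpu_dec(bpf_trace_nest_level);
	return err;
}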

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH bpf v2] bpf: preallocate a perf_sample_data per event fd
  2019-06-03 22:58     ` Matt Mullins
@ 2019-06-03 23:27       ` Alexei Starovoitov
  2019-06-03 23:48         ` Daniel Borkmann
  0 siblings, 1 reply; 17+ messages in thread
From: Alexei Starovoitov @ 2019-06-03 23:27 UTC (permalink / raw)
  To: Matt Mullins
  Cc: daniel, netdev, Andrew Hall, bpf, ast, linux-kernel, Martin Lau,
	Yonghong Song, rostedt, mingo, Song Liu

On Mon, Jun 3, 2019 at 3:59 PM Matt Mullins <mmullins@fb.com> wrote:
>
> If these are invariably non-nested, I can easily keep bpf_misc_sd when
> I resubmit.  There was no technical reason other than keeping the two
> codepaths as similar as possible.
>
> What resource gives you worry about doing this for the networking
> codepath?

my preference would be to keep tracing and networking the same.
there is already minimal nesting in networking and probably we see
more when reuseport progs will start running from xdp and clsbpf

> > Aside from that it's also really bad to miss events like this as exporting
> > through rb is critical. Why can't you have a per-CPU counter that selects a
> > sample data context based on nesting level in tracing? (I don't see a discussion
> > of this in your commit message.)
>
> This change would only drop messages if the same perf_event is
> attempted to be used recursively (i.e. the same CPU on the same
> PERF_EVENT_ARRAY map, as I haven't observed anything use index !=
> BPF_F_CURRENT_CPU in testing).
>
> I'll try to accomplish the same with a percpu nesting level and
> allocating 2 or 3 perf_sample_data per cpu.  I think that'll solve the
> same problem -- a local patch keeping track of the nesting level is how
> I got the above stack trace, too.

I don't think counter approach works. The amount of nesting is unknown.
imo the approach taken in this patch is good.
I don't see any issue when event_outputs will be dropped for valid progs.
Only when user called the helper incorrectly without BPF_F_CURRENT_CPU.
But that's an error anyway.
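
For reference, the normal pattern from the program side is just (minimal
sketch; the 'events' map and 'evt' struct are made up):

/* every cpu writes into its own slot of the PERF_EVENT_ARRAY;
 * an explicit index instead of BPF_F_CURRENT_CPU only works when that
 * event fd is bound to the current cpu anyway (the helper returns
 * -EOPNOTSUPP otherwise)
 */
bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &evt, sizeof(evt));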

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH bpf v2] bpf: preallocate a perf_sample_data per event fd
  2019-06-03 23:27       ` Alexei Starovoitov
@ 2019-06-03 23:48         ` Daniel Borkmann
  2019-06-03 23:54           ` Alexei Starovoitov
  0 siblings, 1 reply; 17+ messages in thread
From: Daniel Borkmann @ 2019-06-03 23:48 UTC (permalink / raw)
  To: Alexei Starovoitov, Matt Mullins
  Cc: netdev, Andrew Hall, bpf, ast, linux-kernel, Martin Lau,
	Yonghong Song, rostedt, mingo, Song Liu

On 06/04/2019 01:27 AM, Alexei Starovoitov wrote:
> On Mon, Jun 3, 2019 at 3:59 PM Matt Mullins <mmullins@fb.com> wrote:
>>
>> If these are invariably non-nested, I can easily keep bpf_misc_sd when
>> I resubmit.  There was no technical reason other than keeping the two
>> codepaths as similar as possible.
>>
>> What resource gives you worry about doing this for the networking
>> codepath?
> 
> my preference would be to keep tracing and networking the same.
> there is already minimal nesting in networking and probably we see
> more when reuseport progs will start running from xdp and clsbpf
> 
>>> Aside from that it's also really bad to miss events like this as exporting
>>> through rb is critical. Why can't you have a per-CPU counter that selects a
>>> sample data context based on nesting level in tracing? (I don't see a discussion
>>> of this in your commit message.)
>>
>> This change would only drop messages if the same perf_event is
>> attempted to be used recursively (i.e. the same CPU on the same
>> PERF_EVENT_ARRAY map, as I haven't observed anything use index !=
>> BPF_F_CURRENT_CPU in testing).
>>
>> I'll try to accomplish the same with a percpu nesting level and
>> allocating 2 or 3 perf_sample_data per cpu.  I think that'll solve the
>> same problem -- a local patch keeping track of the nesting level is how
>> I got the above stack trace, too.
> 
> I don't think counter approach works. The amount of nesting is unknown.
> imo the approach taken in this patch is good.
> I don't see any issue when event_outputs will be dropped for valid progs.
> Only when user called the helper incorrectly without BPF_F_CURRENT_CPU.
> But that's an error anyway.

My main worry with this xchg() trick is that we'll miss to export crucial
data with the EBUSY bailing out especially given nesting could increase in
future as you state, so users might have a hard time debugging this kind of
issue if they share the same perf event map among these programs, and no
option to get to this data otherwise. Supporting nesting up to a certain
level would still be better than a lost event which is also not reported
through the usual way aka perf rb.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH bpf v2] bpf: preallocate a perf_sample_data per event fd
  2019-06-03 23:48         ` Daniel Borkmann
@ 2019-06-03 23:54           ` Alexei Starovoitov
  2019-06-04  0:43             ` Daniel Borkmann
  0 siblings, 1 reply; 17+ messages in thread
From: Alexei Starovoitov @ 2019-06-03 23:54 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Matt Mullins, netdev, Andrew Hall, bpf, ast, linux-kernel,
	Martin Lau, Yonghong Song, rostedt, mingo, Song Liu

On Mon, Jun 3, 2019 at 4:48 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 06/04/2019 01:27 AM, Alexei Starovoitov wrote:
> > On Mon, Jun 3, 2019 at 3:59 PM Matt Mullins <mmullins@fb.com> wrote:
> >>
> >> If these are invariably non-nested, I can easily keep bpf_misc_sd when
> >> I resubmit.  There was no technical reason other than keeping the two
> >> codepaths as similar as possible.
> >>
> >> What resource gives you worry about doing this for the networking
> >> codepath?
> >
> > my preference would be to keep tracing and networking the same.
> > there is already minimal nesting in networking and probably we see
> > more when reuseport progs will start running from xdp and clsbpf
> >
> >>> Aside from that it's also really bad to miss events like this as exporting
> >>> through rb is critical. Why can't you have a per-CPU counter that selects a
> >>> sample data context based on nesting level in tracing? (I don't see a discussion
> >>> of this in your commit message.)
> >>
> >> This change would only drop messages if the same perf_event is
> >> attempted to be used recursively (i.e. the same CPU on the same
> >> PERF_EVENT_ARRAY map, as I haven't observed anything use index !=
> >> BPF_F_CURRENT_CPU in testing).
> >>
> >> I'll try to accomplish the same with a percpu nesting level and
> >> allocating 2 or 3 perf_sample_data per cpu.  I think that'll solve the
> >> same problem -- a local patch keeping track of the nesting level is how
> >> I got the above stack trace, too.
> >
> > I don't think counter approach works. The amount of nesting is unknown.
> > imo the approach taken in this patch is good.
> > I don't see any issue when event_outputs will be dropped for valid progs.
> > Only when user called the helper incorrectly without BPF_F_CURRENT_CPU.
> > But that's an error anyway.
>
> My main worry with this xchg() trick is that we'll miss to export crucial
> data with the EBUSY bailing out especially given nesting could increase in
> future as you state, so users might have a hard time debugging this kind of
> issue if they share the same perf event map among these programs, and no
> option to get to this data otherwise. Supporting nesting up to a certain
> level would still be better than a lost event which is also not reported
> through the usual way aka perf rb.

I simply don't see this 'miss to export data' in all but contrived conditions.
Say two progs share the same perf event array.
One prog calls event_output and while rb logic is working
another prog needs to start executing and use the same event array
slot. Today it's only possible for tracing prog combined with networking,
but having two progs use the same event output array is pretty much
a user bug. Just like not passing BPF_F_CURRENT_CPU.
Same kind of a bug.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH bpf v2] bpf: preallocate a perf_sample_data per event fd
  2019-06-03 23:54           ` Alexei Starovoitov
@ 2019-06-04  0:43             ` Daniel Borkmann
  2019-06-04  0:56               ` Alexei Starovoitov
  2019-06-04  3:17               ` Matt Mullins
  0 siblings, 2 replies; 17+ messages in thread
From: Daniel Borkmann @ 2019-06-04  0:43 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Matt Mullins, netdev, Andrew Hall, bpf, ast, linux-kernel,
	Martin Lau, Yonghong Song, rostedt, mingo, Song Liu

On 06/04/2019 01:54 AM, Alexei Starovoitov wrote:
> On Mon, Jun 3, 2019 at 4:48 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>> On 06/04/2019 01:27 AM, Alexei Starovoitov wrote:
>>> On Mon, Jun 3, 2019 at 3:59 PM Matt Mullins <mmullins@fb.com> wrote:
>>>>
>>>> If these are invariably non-nested, I can easily keep bpf_misc_sd when
>>>> I resubmit.  There was no technical reason other than keeping the two
>>>> codepaths as similar as possible.
>>>>
>>>> What resource gives you worry about doing this for the networking
>>>> codepath?
>>>
>>> my preference would be to keep tracing and networking the same.
>>> there is already minimal nesting in networking and probably we see
>>> more when reuseport progs will start running from xdp and clsbpf
>>>
>>>>> Aside from that it's also really bad to miss events like this as exporting
>>>>> through rb is critical. Why can't you have a per-CPU counter that selects a
>>>>> sample data context based on nesting level in tracing? (I don't see a discussion
>>>>> of this in your commit message.)
>>>>
>>>> This change would only drop messages if the same perf_event is
>>>> attempted to be used recursively (i.e. the same CPU on the same
>>>> PERF_EVENT_ARRAY map, as I haven't observed anything use index !=
>>>> BPF_F_CURRENT_CPU in testing).
>>>>
>>>> I'll try to accomplish the same with a percpu nesting level and
>>>> allocating 2 or 3 perf_sample_data per cpu.  I think that'll solve the
>>>> same problem -- a local patch keeping track of the nesting level is how
>>>> I got the above stack trace, too.
>>>
>>> I don't think counter approach works. The amount of nesting is unknown.
>>> imo the approach taken in this patch is good.
>>> I don't see any issue when event_outputs will be dropped for valid progs.
>>> Only when user called the helper incorrectly without BPF_F_CURRENT_CPU.
>>> But that's an error anyway.
>>
>> My main worry with this xchg() trick is that we'll miss to export crucial
>> data with the EBUSY bailing out especially given nesting could increase in
>> future as you state, so users might have a hard time debugging this kind of
>> issue if they share the same perf event map among these programs, and no
>> option to get to this data otherwise. Supporting nesting up to a certain
>> level would still be better than a lost event which is also not reported
>> through the usual way aka perf rb.
> 
> I simply don't see this 'miss to export data' in all but contrived conditions.
> Say two progs share the same perf event array.
> One prog calls event_output and while rb logic is working
> another prog needs to start executing and use the same event array

Correct.

> slot. Today it's only possible for tracing prog combined with networking,
> but having two progs use the same event output array is pretty much
> a user bug. Just like not passing BPF_F_CURRENT_CPU.

I don't see the user bug part, why should that be a user bug? It's the same
as if we would say that sharing a BPF hash map between networking programs
attached to different hooks or networking and tracing would be a user bug
which it is not. One concrete example would be cilium monitor where we
currently expose skb trace and drop events as well as debug data through
the same rb. This should be usable from any type that has perf_event_output
helper enabled (e.g. XDP and tc/BPF) w/o requiring to walk yet another per
cpu mmap rb from user space.
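
Schematically, that kind of sharing looks like this on the BPF side (heavily
simplified sketch, not the actual cilium code; struct, map and section names
are made up):

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include "bpf_helpers.h"

struct event {
	__u32 type;		/* e.g. trace vs drop vs debug */
	__u32 ifindex;
};

struct bpf_map_def SEC("maps") events = {
	.type        = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
	.key_size    = sizeof(int),
	.value_size  = sizeof(__u32),
	.max_entries = 128,	/* >= number of possible cpus */
};

SEC("classifier")		/* tc ingress/egress */
int tc_monitor(struct __sk_buff *skb)
{
	struct event evt = { .type = 1, .ifindex = skb->ifindex };

	bpf_perf_event_output(skb, &events, BPF_F_CURRENT_CPU,
			      &evt, sizeof(evt));
	return TC_ACT_OK;
}

SEC("xdp")
int xdp_monitor(struct xdp_md *ctx)
{
	struct event evt = { .type = 2, .ifindex = ctx->ingress_ifindex };

	bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
			      &evt, sizeof(evt));
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";

Both programs feed the same per-cpu rb, so user space only has to poll one
set of mmap'ed buffers.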

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH bpf v2] bpf: preallocate a perf_sample_data per event fd
  2019-06-04  0:43             ` Daniel Borkmann
@ 2019-06-04  0:56               ` Alexei Starovoitov
  2019-06-04  3:17               ` Matt Mullins
  1 sibling, 0 replies; 17+ messages in thread
From: Alexei Starovoitov @ 2019-06-04  0:56 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Matt Mullins, netdev, Andrew Hall, bpf, ast, linux-kernel,
	Martin Lau, Yonghong Song, rostedt, mingo, Song Liu

On Mon, Jun 3, 2019 at 5:44 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 06/04/2019 01:54 AM, Alexei Starovoitov wrote:
> > On Mon, Jun 3, 2019 at 4:48 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >> On 06/04/2019 01:27 AM, Alexei Starovoitov wrote:
> >>> On Mon, Jun 3, 2019 at 3:59 PM Matt Mullins <mmullins@fb.com> wrote:
> >>>>
> >>>> If these are invariably non-nested, I can easily keep bpf_misc_sd when
> >>>> I resubmit.  There was no technical reason other than keeping the two
> >>>> codepaths as similar as possible.
> >>>>
> >>>> What resource gives you worry about doing this for the networking
> >>>> codepath?
> >>>
> >>> my preference would be to keep tracing and networking the same.
> >>> there is already minimal nesting in networking and probably we see
> >>> more when reuseport progs will start running from xdp and clsbpf
> >>>
> >>>>> Aside from that it's also really bad to miss events like this as exporting
> >>>>> through rb is critical. Why can't you have a per-CPU counter that selects a
> >>>>> sample data context based on nesting level in tracing? (I don't see a discussion
> >>>>> of this in your commit message.)
> >>>>
> >>>> This change would only drop messages if the same perf_event is
> >>>> attempted to be used recursively (i.e. the same CPU on the same
> >>>> PERF_EVENT_ARRAY map, as I haven't observed anything use index !=
> >>>> BPF_F_CURRENT_CPU in testing).
> >>>>
> >>>> I'll try to accomplish the same with a percpu nesting level and
> >>>> allocating 2 or 3 perf_sample_data per cpu.  I think that'll solve the
> >>>> same problem -- a local patch keeping track of the nesting level is how
> >>>> I got the above stack trace, too.
> >>>
> >>> I don't think counter approach works. The amount of nesting is unknown.
> >>> imo the approach taken in this patch is good.
> >>> I don't see any issue when event_outputs will be dropped for valid progs.
> >>> Only when user called the helper incorrectly without BPF_F_CURRENT_CPU.
> >>> But that's an error anyway.
> >>
> >> My main worry with this xchg() trick is that we'll miss to export crucial
> >> data with the EBUSY bailing out especially given nesting could increase in
> >> future as you state, so users might have a hard time debugging this kind of
> >> issue if they share the same perf event map among these programs, and no
> >> option to get to this data otherwise. Supporting nesting up to a certain
> >> level would still be better than a lost event which is also not reported
> >> through the usual way aka perf rb.
> >
> > I simply don't see this 'miss to export data' in all but contrived conditions.
> > Say two progs share the same perf event array.
> > One prog calls event_output and while rb logic is working
> > another prog needs to start executing and use the same event array
>
> Correct.
>
> > slot. Today it's only possible for tracing prog combined with networking,
> > but having two progs use the same event output array is pretty much
> > a user bug. Just like not passing BPF_F_CURRENT_CPU.
>
> I don't see the user bug part, why should that be a user bug?

because I suspect that 'struct bpf_event_entry' is not reentrant
(even w/o issues with 'struct perf_sample_data').

Even if we always use 'struct perf_sample_data' on stack, I suspect
the same 'struct bpf_event_entry' cannot be reused in the nested way.

> It's the same
> as if we would say that sharing a BPF hash map between networking programs
> attached to different hooks or networking and tracing would be a user bug
> which it is not. One concrete example would be cilium monitor where we
> currently expose skb trace and drop events as well as debug data through
> the same rb. This should be usable from any type that has perf_event_output
> helper enabled (e.g. XDP and tc/BPF) w/o requiring to walk yet another per
> cpu mmap rb from user space.

sure. those are valid use cases, but in all cases bpf_event_output() on a
particular 'struct bpf_event_entry' will end before another one begins, no?

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH bpf v2] bpf: preallocate a perf_sample_data per event fd
  2019-06-04  0:43             ` Daniel Borkmann
  2019-06-04  0:56               ` Alexei Starovoitov
@ 2019-06-04  3:17               ` Matt Mullins
  2019-06-06 18:54                 ` [PATCH bpf] bpf: fix nested bpf tracepoints with per-cpu data Matt Mullins
  1 sibling, 1 reply; 17+ messages in thread
From: Matt Mullins @ 2019-06-04  3:17 UTC (permalink / raw)
  To: daniel, alexei.starovoitov
  Cc: Song Liu, linux-kernel, bpf, ast, rostedt, Andrew Hall, mingo,
	netdev, Martin Lau, Yonghong Song

On Tue, 2019-06-04 at 02:43 +0200, Daniel Borkmann wrote:
> On 06/04/2019 01:54 AM, Alexei Starovoitov wrote:
> > On Mon, Jun 3, 2019 at 4:48 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
> > > On 06/04/2019 01:27 AM, Alexei Starovoitov wrote:
> > > > On Mon, Jun 3, 2019 at 3:59 PM Matt Mullins <mmullins@fb.com> wrote:
> > > > > 
> > > > > If these are invariably non-nested, I can easily keep bpf_misc_sd when
> > > > > I resubmit.  There was no technical reason other than keeping the two
> > > > > codepaths as similar as possible.
> > > > > 
> > > > > What resource gives you worry about doing this for the networking
> > > > > codepath?
> > > > 
> > > > my preference would be to keep tracing and networking the same.
> > > > there is already minimal nesting in networking and probably we see
> > > > more when reuseport progs will start running from xdp and clsbpf
> > > > 
> > > > > > Aside from that it's also really bad to miss events like this as exporting
> > > > > > through rb is critical. Why can't you have a per-CPU counter that selects a
> > > > > > sample data context based on nesting level in tracing? (I don't see a discussion
> > > > > > of this in your commit message.)
> > > > > 
> > > > > This change would only drop messages if the same perf_event is
> > > > > attempted to be used recursively (i.e. the same CPU on the same
> > > > > PERF_EVENT_ARRAY map, as I haven't observed anything use index !=
> > > > > BPF_F_CURRENT_CPU in testing).
> > > > > 
> > > > > I'll try to accomplish the same with a percpu nesting level and
> > > > > allocating 2 or 3 perf_sample_data per cpu.  I think that'll solve the
> > > > > same problem -- a local patch keeping track of the nesting level is how
> > > > > I got the above stack trace, too.
> > > > 
> > > > I don't think counter approach works. The amount of nesting is unknown.
> > > > imo the approach taken in this patch is good.
> > > > I don't see any issue when event_outputs will be dropped for valid progs.
> > > > Only when user called the helper incorrectly without BPF_F_CURRENT_CPU.
> > > > But that's an error anyway.
> > > 
> > > My main worry with this xchg() trick is that we'll miss to export crucial
> > > data with the EBUSY bailing out especially given nesting could increase in
> > > future as you state, so users might have a hard time debugging this kind of
> > > issue if they share the same perf event map among these programs, and no
> > > option to get to this data otherwise. Supporting nesting up to a certain
> > > level would still be better than a lost event which is also not reported
> > > through the usual way aka perf rb.

Tracing can already be lossy: trace_call_bpf() silently skips calling the
prog and instead returns zero if bpf_prog_active != 1.
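
For reference, the guard in trace_call_bpf() looks roughly like this
(paraphrased sketch of the relevant lines, not a verbatim copy):

unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
{
	unsigned int ret;

	if (in_nmi())	/* not supported yet */
		return 1;

	preempt_disable();
	if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
		/*
		 * some bpf program is already running on this cpu, so don't
		 * call into another one (same or different) and don't send
		 * the event into the ring buffer; just return zero
		 */
		ret = 0;
		goto out;
	}
	/* ... ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, BPF_PROG_RUN) ... */
out:
	__this_cpu_dec(bpf_prog_active);
	preempt_enable();
	return ret;
}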

> > 
> > I simply don't see this 'miss to export data' in all but contrived conditions.
> > Say two progs share the same perf event array.
> > One prog calls event_output and while rb logic is working
> > another prog needs to start executing and use the same event array
> 
> Correct.
> 
> > slot. Today it's only possible for tracing prog combined with networking,
> > but having two progs use the same event output array is pretty much
> > a user bug. Just like not passing BPF_F_CURRENT_CPU.
> 
> I don't see the user bug part, why should that be a user bug? It's the same
> as if we would say that sharing a BPF hash map between networking programs
> attached to different hooks or networking and tracing would be a user bug
> which it is not. One concrete example would be cilium monitor where we
> currently expose skb trace and drop events as well as debug data through
> the same rb. This should be usable from any type that has perf_event_output
> helper enabled (e.g. XDP and tc/BPF) w/o requiring to walk yet another per
> cpu mmap rb from user space.

Neither of these solutions would affect the behavior of sharing the
perf array between networking programs -- since they're never called in
a nested fashion, you'll never hit the "xchg() returned NULL" case at
all.

That said, I think I can logically limit nesting in tracing to 3
levels:
  - a kprobe or raw tp or perf event,
  - another one of the above that irq context happens to run, and
  - if we're really unlucky, the same in nmi
at most one of which can be a kprobe or perf event.

There is also a comment

/*
 * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp
 * to avoid potential recursive reuse issue when/if tracepoints are added
 * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack
 */

that suggests that one day bpf_perf_event_output might grow a static
tracepoint.  However, if the program attached to such a hypothetical
tracepoint were to call bpf_perf_event_output, that would infinitely
recurse ... it seems fine to let that case return -EBUSY as well.  It
does make me wonder if I should do the same nesting for the pt_regs.

I've now got an experiment running with the counter approach, and the
workload that I hit the original crash with seems to be fine with 2
layers' worth -- though if we decide that's the one I should move
forward with, I'll probably bump it to a third to be safe for an NMI.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH bpf] bpf: fix nested bpf tracepoints with per-cpu data
  2019-06-04  3:17               ` Matt Mullins
@ 2019-06-06 18:54                 ` Matt Mullins
  2019-06-06 22:13                   ` Jakub Kicinski
  2019-06-07  2:59                   ` Andrii Nakryiko
  0 siblings, 2 replies; 17+ messages in thread
From: Matt Mullins @ 2019-06-06 18:54 UTC (permalink / raw)
  To: hall, mmullins, ast, daniel, bpf, netdev
  Cc: linux-kernel, Steven Rostedt, Ingo Molnar, Martin KaFai Lau,
	Song Liu, Yonghong Song

BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as
they do not increment bpf_prog_active while executing.

This enables three levels of nesting, to support
  - a kprobe or raw tp or perf event,
  - another one of the above that irq context happens to call, and
  - another one in nmi context
(at most one of which may be a kprobe or perf event).

Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data")
---
This is more lines of code, but possibly less intrusive than the
per-array-element approach.

I don't necessarily like that I duplicated the nest_level logic in two
places, but I don't see a way to unify them:
  - kprobes' bpf_perf_event_output doesn't use bpf_raw_tp_regs, and does
    use the perf_sample_data,
  - raw tracepoints' bpf_get_stackid uses bpf_raw_tp_regs, but not
    the perf_sample_data, and
  - raw tracepoints' bpf_perf_event_output uses both...

 kernel/trace/bpf_trace.c | 95 +++++++++++++++++++++++++++++++++-------
 1 file changed, 80 insertions(+), 15 deletions(-)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index f92d6ad5e080..4f5419837ddd 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -410,8 +410,6 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
 	.arg4_type	= ARG_CONST_SIZE,
 };
 
-static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd);
-
 static __always_inline u64
 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
 			u64 flags, struct perf_sample_data *sd)
@@ -442,24 +440,47 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
 	return perf_event_output(event, sd, regs);
 }
 
+/*
+ * Support executing tracepoints in normal, irq, and nmi context that each call
+ * bpf_perf_event_output
+ */
+struct bpf_trace_sample_data {
+	struct perf_sample_data sds[3];
+};
+
+static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds);
+static DEFINE_PER_CPU(int, bpf_trace_nest_level);
 BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
 	   u64, flags, void *, data, u64, size)
 {
-	struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd);
+	struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds);
+	struct perf_sample_data *sd;
+	int nest_level = this_cpu_inc_return(bpf_trace_nest_level);
 	struct perf_raw_record raw = {
 		.frag = {
 			.size = size,
 			.data = data,
 		},
 	};
+	int err = -EBUSY;
 
+	if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds)))
+		goto out;
+
+	sd = &sds->sds[nest_level - 1];
+
+	err = -EINVAL;
 	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
-		return -EINVAL;
+		goto out;
 
 	perf_sample_data_init(sd, 0, 0);
 	sd->raw = &raw;
 
-	return __bpf_perf_event_output(regs, map, flags, sd);
+	err = __bpf_perf_event_output(regs, map, flags, sd);
+
+out:
+	this_cpu_dec(bpf_trace_nest_level);
+	return err;
 }
 
 static const struct bpf_func_proto bpf_perf_event_output_proto = {
@@ -822,16 +843,48 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 /*
  * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp
  * to avoid potential recursive reuse issue when/if tracepoints are added
- * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack
+ * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack.
+ *
+ * Since raw tracepoints run despite bpf_prog_active, support concurrent usage
+ * in normal, irq, and nmi context.
  */
-static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs);
+struct bpf_raw_tp_regs {
+	struct pt_regs regs[3];
+};
+static DEFINE_PER_CPU(struct bpf_raw_tp_regs, bpf_raw_tp_regs);
+static DEFINE_PER_CPU(int, bpf_raw_tp_nest_level);
+static struct pt_regs *get_bpf_raw_tp_regs(void)
+{
+	struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs);
+	int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level);
+
+	if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) {
+		this_cpu_dec(bpf_raw_tp_nest_level);
+		return ERR_PTR(-EBUSY);
+	}
+
+	return &tp_regs->regs[nest_level - 1];
+}
+
+static void put_bpf_raw_tp_regs(void)
+{
+	this_cpu_dec(bpf_raw_tp_nest_level);
+}
+
 BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args,
 	   struct bpf_map *, map, u64, flags, void *, data, u64, size)
 {
-	struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
+	struct pt_regs *regs = get_bpf_raw_tp_regs();
+	int ret;
+
+	if (IS_ERR(regs))
+		return PTR_ERR(regs);
 
 	perf_fetch_caller_regs(regs);
-	return ____bpf_perf_event_output(regs, map, flags, data, size);
+	ret = ____bpf_perf_event_output(regs, map, flags, data, size);
+
+	put_bpf_raw_tp_regs();
+	return ret;
 }
 
 static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
@@ -848,12 +901,18 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
 BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args,
 	   struct bpf_map *, map, u64, flags)
 {
-	struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
+	struct pt_regs *regs = get_bpf_raw_tp_regs();
+	int ret;
+
+	if (IS_ERR(regs))
+		return PTR_ERR(regs);
 
 	perf_fetch_caller_regs(regs);
 	/* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */
-	return bpf_get_stackid((unsigned long) regs, (unsigned long) map,
-			       flags, 0, 0);
+	ret = bpf_get_stackid((unsigned long) regs, (unsigned long) map,
+			      flags, 0, 0);
+	put_bpf_raw_tp_regs();
+	return ret;
 }
 
 static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
@@ -868,11 +927,17 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
 BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args,
 	   void *, buf, u32, size, u64, flags)
 {
-	struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
+	struct pt_regs *regs = get_bpf_raw_tp_regs();
+	int ret;
+
+	if (IS_ERR(regs))
+		return PTR_ERR(regs);
 
 	perf_fetch_caller_regs(regs);
-	return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
-			     (unsigned long) size, flags, 0);
+	ret = bpf_get_stack((unsigned long) regs, (unsigned long) buf,
+			    (unsigned long) size, flags, 0);
+	put_bpf_raw_tp_regs();
+	return ret;
 }
 
 static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
-- 
2.17.1



* Re: [PATCH bpf] bpf: fix nested bpf tracepoints with per-cpu data
  2019-06-06 18:54                 ` [PATCH bpf] bpf: fix nested bpf tracepoints with per-cpu data Matt Mullins
@ 2019-06-06 22:13                   ` Jakub Kicinski
  2019-06-06 22:39                     ` Matt Mullins
  2019-06-07  2:59                   ` Andrii Nakryiko
  1 sibling, 1 reply; 17+ messages in thread
From: Jakub Kicinski @ 2019-06-06 22:13 UTC (permalink / raw)
  To: Matt Mullins
  Cc: hall, ast, daniel, bpf, netdev, linux-kernel, Steven Rostedt,
	Ingo Molnar, Martin KaFai Lau, Song Liu, Yonghong Song

On Thu, 6 Jun 2019 11:54:27 -0700, Matt Mullins wrote:
> BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as
> they do not increment bpf_prog_active while executing.
> 
> This enables three levels of nesting, to support
>   - a kprobe or raw tp or perf event,
>   - another one of the above that irq context happens to call, and
>   - another one in nmi context
> (at most one of which may be a kprobe or perf event).
> 
> Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data")

No comment on the code, but you're definitely missing a sign-off.

> ---
> This is more lines of code, but possibly less intrusive than the
> per-array-element approach.
> 
> I don't necessarily like that I duplicated the nest_level logic in two
> places, but I don't see a way to unify them:
>   - kprobes' bpf_perf_event_output doesn't use bpf_raw_tp_regs, and does
>     use the perf_sample_data,
>   - raw tracepoints' bpf_get_stackid uses bpf_raw_tp_regs, but not
>     the perf_sample_data, and
>   - raw tracepoints' bpf_perf_event_output uses both...



* Re: [PATCH bpf] bpf: fix nested bpf tracepoints with per-cpu data
  2019-06-06 22:13                   ` Jakub Kicinski
@ 2019-06-06 22:39                     ` Matt Mullins
  0 siblings, 0 replies; 17+ messages in thread
From: Matt Mullins @ 2019-06-06 22:39 UTC (permalink / raw)
  To: jakub.kicinski
  Cc: Song Liu, linux-kernel, daniel, bpf, rostedt, ast, Andrew Hall,
	mingo, netdev, Martin Lau, Yonghong Song

On Thu, 2019-06-06 at 15:13 -0700, Jakub Kicinski wrote:
> On Thu, 6 Jun 2019 11:54:27 -0700, Matt Mullins wrote:
> > BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as
> > they do not increment bpf_prog_active while executing.
> > 
> > This enables three levels of nesting, to support
> >   - a kprobe or raw tp or perf event,
> >   - another one of the above that irq context happens to call, and
> >   - another one in nmi context
> > (at most one of which may be a kprobe or perf event).
> > 
> > Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data")
> 
> No comment on the code, but you're definitely missing a sign-off.

Oops, I totally am.  I'll give it some more time for opinions to roll
in, and I'll fix that before I resubmit :)

> 
> > ---
> > This is more lines of code, but possibly less intrusive than the
> > per-array-element approach.
> > 
> > I don't necessarily like that I duplicated the nest_level logic in two
> > places, but I don't see a way to unify them:
> >   - kprobes' bpf_perf_event_output doesn't use bpf_raw_tp_regs, and does
> >     use the perf_sample_data,
> >   - raw tracepoints' bpf_get_stackid uses bpf_raw_tp_regs, but not
> >     the perf_sample_data, and
> >   - raw tracepoints' bpf_perf_event_output uses both...
> 
> 


* Re: [PATCH bpf] bpf: fix nested bpf tracepoints with per-cpu data
  2019-06-06 18:54                 ` [PATCH bpf] bpf: fix nested bpf tracepoints with per-cpu data Matt Mullins
  2019-06-06 22:13                   ` Jakub Kicinski
@ 2019-06-07  2:59                   ` Andrii Nakryiko
  2019-06-07  7:55                     ` Steven Rostedt
  1 sibling, 1 reply; 17+ messages in thread
From: Andrii Nakryiko @ 2019-06-07  2:59 UTC (permalink / raw)
  To: Matt Mullins
  Cc: hall, Alexei Starovoitov, Daniel Borkmann, bpf, Networking,
	open list, Steven Rostedt, Ingo Molnar, Martin KaFai Lau,
	Song Liu, Yonghong Song

On Thu, Jun 6, 2019 at 1:17 PM Matt Mullins <mmullins@fb.com> wrote:
>
> BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as
> they do not increment bpf_prog_active while executing.
>
> This enables three levels of nesting, to support
>   - a kprobe or raw tp or perf event,
>   - another one of the above that irq context happens to call, and
>   - another one in nmi context

Can NMIs be nested?

> (at most one of which may be a kprobe or perf event).
>
> Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data")
> ---
> This is more lines of code, but possibly less intrusive than the
> per-array-element approach.
>
> I don't necessarily like that I duplicated the nest_level logic in two
> places, but I don't see a way to unify them:
>   - kprobes' bpf_perf_event_output doesn't use bpf_raw_tp_regs, and does
>     use the perf_sample_data,
>   - raw tracepoints' bpf_get_stackid uses bpf_raw_tp_regs, but not
>     the perf_sample_data, and
>   - raw tracepoints' bpf_perf_event_output uses both...
>
>  kernel/trace/bpf_trace.c | 95 +++++++++++++++++++++++++++++++++-------
>  1 file changed, 80 insertions(+), 15 deletions(-)
>
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index f92d6ad5e080..4f5419837ddd 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -410,8 +410,6 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
>         .arg4_type      = ARG_CONST_SIZE,
>  };
>
> -static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd);
> -
>  static __always_inline u64
>  __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
>                         u64 flags, struct perf_sample_data *sd)
> @@ -442,24 +440,47 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
>         return perf_event_output(event, sd, regs);
>  }
>
> +/*
> + * Support executing tracepoints in normal, irq, and nmi context that each call
> + * bpf_perf_event_output
> + */
> +struct bpf_trace_sample_data {
> +       struct perf_sample_data sds[3];
> +};
> +
> +static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds);
> +static DEFINE_PER_CPU(int, bpf_trace_nest_level);
>  BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
>            u64, flags, void *, data, u64, size)
>  {
> -       struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd);
> +       struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds);
> +       struct perf_sample_data *sd;
> +       int nest_level = this_cpu_inc_return(bpf_trace_nest_level);

reverse Christmas tree?

>         struct perf_raw_record raw = {
>                 .frag = {
>                         .size = size,
>                         .data = data,
>                 },
>         };
> +       int err = -EBUSY;
>
> +       if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds)))
> +               goto out;

consider this a nit, but I find it much simpler to follow when err is
set just before goto, so that it's clear what's going to be returned:

int err;

if (something_bad) {
    err = -EBAD_ERR_CODE1;
    goto out;
}


> +
> +       sd = &sds->sds[nest_level - 1];
> +
> +       err = -EINVAL;
>         if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
> -               return -EINVAL;
> +               goto out;

Same here.

>
>         perf_sample_data_init(sd, 0, 0);
>         sd->raw = &raw;
>
> -       return __bpf_perf_event_output(regs, map, flags, sd);
> +       err = __bpf_perf_event_output(regs, map, flags, sd);
> +
> +out:
> +       this_cpu_dec(bpf_trace_nest_level);
> +       return err;
>  }
>
>  static const struct bpf_func_proto bpf_perf_event_output_proto = {
> @@ -822,16 +843,48 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>  /*
>   * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp
>   * to avoid potential recursive reuse issue when/if tracepoints are added
> - * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack
> + * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack.
> + *
> + * Since raw tracepoints run despite bpf_prog_active, support concurrent usage
> + * in normal, irq, and nmi context.
>   */
> -static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs);
> +struct bpf_raw_tp_regs {
> +       struct pt_regs regs[3];
> +};
> +static DEFINE_PER_CPU(struct bpf_raw_tp_regs, bpf_raw_tp_regs);
> +static DEFINE_PER_CPU(int, bpf_raw_tp_nest_level);
> +static struct pt_regs *get_bpf_raw_tp_regs(void)
> +{
> +       struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs);
> +       int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level);
> +
> +       if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) {
> +               this_cpu_dec(bpf_raw_tp_nest_level);
> +               return ERR_PTR(-EBUSY);
> +       }
> +
> +       return &tp_regs->regs[nest_level - 1];
> +}
> +
> +static void put_bpf_raw_tp_regs(void)
> +{
> +       this_cpu_dec(bpf_raw_tp_nest_level);
> +}
> +
>  BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args,
>            struct bpf_map *, map, u64, flags, void *, data, u64, size)
>  {
> -       struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
> +       struct pt_regs *regs = get_bpf_raw_tp_regs();
> +       int ret;
> +
> +       if (IS_ERR(regs))
> +               return PTR_ERR(regs);
>
>         perf_fetch_caller_regs(regs);
> -       return ____bpf_perf_event_output(regs, map, flags, data, size);
> +       ret = ____bpf_perf_event_output(regs, map, flags, data, size);
> +
> +       put_bpf_raw_tp_regs();
> +       return ret;
>  }
>
>  static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
> @@ -848,12 +901,18 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
>  BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args,
>            struct bpf_map *, map, u64, flags)
>  {
> -       struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
> +       struct pt_regs *regs = get_bpf_raw_tp_regs();
> +       int ret;
> +
> +       if (IS_ERR(regs))
> +               return PTR_ERR(regs);
>
>         perf_fetch_caller_regs(regs);
>         /* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */
> -       return bpf_get_stackid((unsigned long) regs, (unsigned long) map,
> -                              flags, 0, 0);
> +       ret = bpf_get_stackid((unsigned long) regs, (unsigned long) map,
> +                             flags, 0, 0);
> +       put_bpf_raw_tp_regs();
> +       return ret;
>  }
>
>  static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
> @@ -868,11 +927,17 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
>  BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args,
>            void *, buf, u32, size, u64, flags)
>  {
> -       struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
> +       struct pt_regs *regs = get_bpf_raw_tp_regs();
> +       int ret;
> +
> +       if (IS_ERR(regs))
> +               return PTR_ERR(regs);
>
>         perf_fetch_caller_regs(regs);
> -       return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
> -                            (unsigned long) size, flags, 0);
> +       ret = bpf_get_stack((unsigned long) regs, (unsigned long) buf,
> +                           (unsigned long) size, flags, 0);
> +       put_bpf_raw_tp_regs();
> +       return ret;
>  }
>
>  static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
> --
> 2.17.1
>


* Re: [PATCH bpf] bpf: fix nested bpf tracepoints with per-cpu data
  2019-06-07  2:59                   ` Andrii Nakryiko
@ 2019-06-07  7:55                     ` Steven Rostedt
  0 siblings, 0 replies; 17+ messages in thread
From: Steven Rostedt @ 2019-06-07  7:55 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Matt Mullins, hall, Alexei Starovoitov, Daniel Borkmann, bpf,
	Networking, open list, Ingo Molnar, Martin KaFai Lau, Song Liu,
	Yonghong Song

On Thu, 6 Jun 2019 19:59:18 -0700
Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:

> On Thu, Jun 6, 2019 at 1:17 PM Matt Mullins <mmullins@fb.com> wrote:
> >
> > BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as
> > they do not increment bpf_prog_active while executing.
> >
> > This enables three levels of nesting, to support
> >   - a kprobe or raw tp or perf event,
> >   - another one of the above that irq context happens to call, and
> >   - another one in nmi context  
> 
> Can NMIs be nested?

No, otherwise several things in the kernel will break.

-- Steve



Thread overview: 17+ messages
2019-05-31 22:37 [PATCH bpf v2] bpf: preallocate a perf_sample_data per event fd Matt Mullins
2019-06-01  1:27 ` Song Liu
2019-06-01  3:12   ` Alexei Starovoitov
2019-06-03 13:08 ` Daniel Borkmann
2019-06-03 13:22   ` Daniel Borkmann
2019-06-03 22:58     ` Matt Mullins
2019-06-03 23:27       ` Alexei Starovoitov
2019-06-03 23:48         ` Daniel Borkmann
2019-06-03 23:54           ` Alexei Starovoitov
2019-06-04  0:43             ` Daniel Borkmann
2019-06-04  0:56               ` Alexei Starovoitov
2019-06-04  3:17               ` Matt Mullins
2019-06-06 18:54                 ` [PATCH bpf] bpf: fix nested bpf tracepoints with per-cpu data Matt Mullins
2019-06-06 22:13                   ` Jakub Kicinski
2019-06-06 22:39                     ` Matt Mullins
2019-06-07  2:59                   ` Andrii Nakryiko
2019-06-07  7:55                     ` Steven Rostedt
