* [PATCH bpf v2] bpf: fix nested bpf tracepoints with per-cpu data @ 2019-06-11 21:53 Matt Mullins 2019-06-12 5:00 ` Andrii Nakryiko 2019-06-15 23:35 ` Alexei Starovoitov 0 siblings, 2 replies; 9+ messages in thread From: Matt Mullins @ 2019-06-11 21:53 UTC (permalink / raw) To: hall, mmullins, ast, daniel, bpf, netdev Cc: linux-kernel, Steven Rostedt, Ingo Molnar, Martin KaFai Lau, Song Liu, Yonghong Song BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as they do not increment bpf_prog_active while executing. This enables three levels of nesting, to support - a kprobe or raw tp or perf event, - another one of the above that irq context happens to call, and - another one in nmi context (at most one of which may be a kprobe or perf event). Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data") Signed-off-by: Matt Mullins <mmullins@fb.com> --- v1->v2: * reverse-Christmas-tree-ize the declarations in bpf_perf_event_output * instantiate err more readably I've done additional testing with the original workload that hit the irq+raw-tp reentrancy problem, and as far as I can tell, it's still solved with this solution (as opposed to my earlier per-map-element version). kernel/trace/bpf_trace.c | 100 ++++++++++++++++++++++++++++++++------- 1 file changed, 84 insertions(+), 16 deletions(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f92d6ad5e080..1c9a4745e596 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -410,8 +410,6 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = { .arg4_type = ARG_CONST_SIZE, }; -static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd); - static __always_inline u64 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, u64 flags, struct perf_sample_data *sd) @@ -442,24 +440,50 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, return perf_event_output(event, sd, regs); } +/* + * Support executing tracepoints in normal, irq, and nmi context that each call + * bpf_perf_event_output + */ +struct bpf_trace_sample_data { + struct perf_sample_data sds[3]; +}; + +static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds); +static DEFINE_PER_CPU(int, bpf_trace_nest_level); BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, u64, flags, void *, data, u64, size) { - struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd); + struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds); + int nest_level = this_cpu_inc_return(bpf_trace_nest_level); struct perf_raw_record raw = { .frag = { .size = size, .data = data, }, }; + struct perf_sample_data *sd; + int err; - if (unlikely(flags & ~(BPF_F_INDEX_MASK))) - return -EINVAL; + if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) { + err = -EBUSY; + goto out; + } + + sd = &sds->sds[nest_level - 1]; + + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) { + err = -EINVAL; + goto out; + } perf_sample_data_init(sd, 0, 0); sd->raw = &raw; - return __bpf_perf_event_output(regs, map, flags, sd); + err = __bpf_perf_event_output(regs, map, flags, sd); + +out: + this_cpu_dec(bpf_trace_nest_level); + return err; } static const struct bpf_func_proto bpf_perf_event_output_proto = { @@ -822,16 +846,48 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) /* * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp * to avoid potential recursive reuse issue when/if tracepoints are added - * inside bpf_*_event_output, bpf_get_stackid 
and/or bpf_get_stack + * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack. + * + * Since raw tracepoints run despite bpf_prog_active, support concurrent usage + * in normal, irq, and nmi context. */ -static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs); +struct bpf_raw_tp_regs { + struct pt_regs regs[3]; +}; +static DEFINE_PER_CPU(struct bpf_raw_tp_regs, bpf_raw_tp_regs); +static DEFINE_PER_CPU(int, bpf_raw_tp_nest_level); +static struct pt_regs *get_bpf_raw_tp_regs(void) +{ + struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs); + int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level); + + if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) { + this_cpu_dec(bpf_raw_tp_nest_level); + return ERR_PTR(-EBUSY); + } + + return &tp_regs->regs[nest_level - 1]; +} + +static void put_bpf_raw_tp_regs(void) +{ + this_cpu_dec(bpf_raw_tp_nest_level); +} + BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args, struct bpf_map *, map, u64, flags, void *, data, u64, size) { - struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); + struct pt_regs *regs = get_bpf_raw_tp_regs(); + int ret; + + if (IS_ERR(regs)) + return PTR_ERR(regs); perf_fetch_caller_regs(regs); - return ____bpf_perf_event_output(regs, map, flags, data, size); + ret = ____bpf_perf_event_output(regs, map, flags, data, size); + + put_bpf_raw_tp_regs(); + return ret; } static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { @@ -848,12 +904,18 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, struct bpf_map *, map, u64, flags) { - struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); + struct pt_regs *regs = get_bpf_raw_tp_regs(); + int ret; + + if (IS_ERR(regs)) + return PTR_ERR(regs); perf_fetch_caller_regs(regs); /* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */ - return bpf_get_stackid((unsigned long) regs, (unsigned long) map, - flags, 0, 0); + ret = bpf_get_stackid((unsigned long) regs, (unsigned long) map, + flags, 0, 0); + put_bpf_raw_tp_regs(); + return ret; } static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { @@ -868,11 +930,17 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args, void *, buf, u32, size, u64, flags) { - struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); + struct pt_regs *regs = get_bpf_raw_tp_regs(); + int ret; + + if (IS_ERR(regs)) + return PTR_ERR(regs); perf_fetch_caller_regs(regs); - return bpf_get_stack((unsigned long) regs, (unsigned long) buf, - (unsigned long) size, flags, 0); + ret = bpf_get_stack((unsigned long) regs, (unsigned long) buf, + (unsigned long) size, flags, 0); + put_bpf_raw_tp_regs(); + return ret; } static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = { -- 2.17.1 ^ permalink raw reply related [flat|nested] 9+ messages in thread
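The core of the fix is one pattern applied twice: a per-CPU nesting counter selects one of three per-CPU scratch slots, so normal, irq, and nmi context each get a private buffer instead of clobbering a shared one. Below is a condensed sketch of that pattern for the perf_sample_data side; the get/put helper names are illustrative only (the patch open-codes this logic in bpf_perf_event_output() and uses get_bpf_raw_tp_regs()/put_bpf_raw_tp_regs() for the analogous pt_regs side).

struct bpf_trace_sample_data {
        struct perf_sample_data sds[3];         /* normal, irq, nmi */
};

static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds);
static DEFINE_PER_CPU(int, bpf_trace_nest_level);

/* Hypothetical helper illustrating the pattern; not part of the patch. */
static struct perf_sample_data *get_trace_sd(void)
{
        struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds);
        /* this_cpu_inc_return() is interrupt-safe on the local CPU, so a
         * nested irq/nmi invocation observes a larger nest_level and is
         * handed a different slot.
         */
        int nest_level = this_cpu_inc_return(bpf_trace_nest_level);

        if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) {
                this_cpu_dec(bpf_trace_nest_level);
                return NULL;    /* nested too deeply; caller reports -EBUSY */
        }
        return &sds->sds[nest_level - 1];
}

static void put_trace_sd(void)
{
        this_cpu_dec(bpf_trace_nest_level);
}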
* Re: [PATCH bpf v2] bpf: fix nested bpf tracepoints with per-cpu data 2019-06-11 21:53 [PATCH bpf v2] bpf: fix nested bpf tracepoints with per-cpu data Matt Mullins @ 2019-06-12 5:00 ` Andrii Nakryiko 2019-06-13 22:47 ` Daniel Borkmann 2019-06-15 23:35 ` Alexei Starovoitov 1 sibling, 1 reply; 9+ messages in thread From: Andrii Nakryiko @ 2019-06-12 5:00 UTC (permalink / raw) To: Matt Mullins Cc: hall, Alexei Starovoitov, Daniel Borkmann, bpf, Networking, open list, Steven Rostedt, Ingo Molnar, Martin KaFai Lau, Song Liu, Yonghong Song On Tue, Jun 11, 2019 at 8:48 PM Matt Mullins <mmullins@fb.com> wrote: > > BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as > they do not increment bpf_prog_active while executing. > > This enables three levels of nesting, to support > - a kprobe or raw tp or perf event, > - another one of the above that irq context happens to call, and > - another one in nmi context > (at most one of which may be a kprobe or perf event). > > Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data") > Signed-off-by: Matt Mullins <mmullins@fb.com> > --- LGTM, minor nit below. Acked-by: Andrii Nakryiko <andriin@fb.com> > v1->v2: > * reverse-Christmas-tree-ize the declarations in bpf_perf_event_output > * instantiate err more readably > > I've done additional testing with the original workload that hit the > irq+raw-tp reentrancy problem, and as far as I can tell, it's still > solved with this solution (as opposed to my earlier per-map-element > version). > > kernel/trace/bpf_trace.c | 100 ++++++++++++++++++++++++++++++++------- > 1 file changed, 84 insertions(+), 16 deletions(-) > > diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c > index f92d6ad5e080..1c9a4745e596 100644 > --- a/kernel/trace/bpf_trace.c > +++ b/kernel/trace/bpf_trace.c > @@ -410,8 +410,6 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = { > .arg4_type = ARG_CONST_SIZE, > }; > > -static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd); > - > static __always_inline u64 > __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, > u64 flags, struct perf_sample_data *sd) > @@ -442,24 +440,50 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, > return perf_event_output(event, sd, regs); > } > > +/* > + * Support executing tracepoints in normal, irq, and nmi context that each call > + * bpf_perf_event_output > + */ > +struct bpf_trace_sample_data { > + struct perf_sample_data sds[3]; > +}; > + > +static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds); > +static DEFINE_PER_CPU(int, bpf_trace_nest_level); > BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, > u64, flags, void *, data, u64, size) > { > - struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd); > + struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds); > + int nest_level = this_cpu_inc_return(bpf_trace_nest_level); > struct perf_raw_record raw = { > .frag = { > .size = size, > .data = data, > }, > }; > + struct perf_sample_data *sd; > + int err; > > - if (unlikely(flags & ~(BPF_F_INDEX_MASK))) > - return -EINVAL; > + if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) { > + err = -EBUSY; > + goto out; > + } > + > + sd = &sds->sds[nest_level - 1]; > + > + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) { > + err = -EINVAL; > + goto out; > + } Feel free to ignore, but just stylistically, given this check doesn't depend on sd, I'd move it either to the very top or right after `nest_level > 
ARRAY_SIZE(sds->sds)` check, so that all the error checking is grouped without interspersed assignment. > > perf_sample_data_init(sd, 0, 0); > sd->raw = &raw; > > - return __bpf_perf_event_output(regs, map, flags, sd); > + err = __bpf_perf_event_output(regs, map, flags, sd); > + > +out: > + this_cpu_dec(bpf_trace_nest_level); > + return err; > } > > static const struct bpf_func_proto bpf_perf_event_output_proto = { > @@ -822,16 +846,48 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) > /* > * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp > * to avoid potential recursive reuse issue when/if tracepoints are added > - * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack > + * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack. > + * > + * Since raw tracepoints run despite bpf_prog_active, support concurrent usage > + * in normal, irq, and nmi context. > */ > -static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs); > +struct bpf_raw_tp_regs { > + struct pt_regs regs[3]; > +}; > +static DEFINE_PER_CPU(struct bpf_raw_tp_regs, bpf_raw_tp_regs); > +static DEFINE_PER_CPU(int, bpf_raw_tp_nest_level); > +static struct pt_regs *get_bpf_raw_tp_regs(void) > +{ > + struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs); > + int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level); > + > + if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) { > + this_cpu_dec(bpf_raw_tp_nest_level); > + return ERR_PTR(-EBUSY); > + } > + > + return &tp_regs->regs[nest_level - 1]; > +} > + > +static void put_bpf_raw_tp_regs(void) > +{ > + this_cpu_dec(bpf_raw_tp_nest_level); > +} > + > BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args, > struct bpf_map *, map, u64, flags, void *, data, u64, size) > { > - struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); > + struct pt_regs *regs = get_bpf_raw_tp_regs(); > + int ret; > + > + if (IS_ERR(regs)) > + return PTR_ERR(regs); > > perf_fetch_caller_regs(regs); > - return ____bpf_perf_event_output(regs, map, flags, data, size); > + ret = ____bpf_perf_event_output(regs, map, flags, data, size); > + > + put_bpf_raw_tp_regs(); > + return ret; > } > > static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { > @@ -848,12 +904,18 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { > BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, > struct bpf_map *, map, u64, flags) > { > - struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); > + struct pt_regs *regs = get_bpf_raw_tp_regs(); > + int ret; > + > + if (IS_ERR(regs)) > + return PTR_ERR(regs); > > perf_fetch_caller_regs(regs); > /* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */ > - return bpf_get_stackid((unsigned long) regs, (unsigned long) map, > - flags, 0, 0); > + ret = bpf_get_stackid((unsigned long) regs, (unsigned long) map, > + flags, 0, 0); > + put_bpf_raw_tp_regs(); > + return ret; > } > > static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { > @@ -868,11 +930,17 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { > BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args, > void *, buf, u32, size, u64, flags) > { > - struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); > + struct pt_regs *regs = get_bpf_raw_tp_regs(); > + int ret; > + > + if (IS_ERR(regs)) > + return PTR_ERR(regs); > > perf_fetch_caller_regs(regs); > - return bpf_get_stack((unsigned long) regs, 
(unsigned long) buf, > - (unsigned long) size, flags, 0); > + ret = bpf_get_stack((unsigned long) regs, (unsigned long) buf, > + (unsigned long) size, flags, 0); > + put_bpf_raw_tp_regs(); > + return ret; > } > > static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = { > -- > 2.17.1 > ^ permalink raw reply [flat|nested] 9+ messages in thread
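For concreteness, the reordering suggested in the nit above would look roughly like this inside the patched bpf_perf_event_output() (a sketch, not a tested diff): the flags test only inspects function arguments, so it can sit directly after the nest-level check, keeping the error checks grouped and the sample setup contiguous.

        if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) {
                err = -EBUSY;
                goto out;
        }

        if (unlikely(flags & ~(BPF_F_INDEX_MASK))) {
                err = -EINVAL;
                goto out;
        }

        sd = &sds->sds[nest_level - 1];
        perf_sample_data_init(sd, 0, 0);
        sd->raw = &raw;

        err = __bpf_perf_event_output(regs, map, flags, sd);
out:
        this_cpu_dec(bpf_trace_nest_level);
        return err;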
* Re: [PATCH bpf v2] bpf: fix nested bpf tracepoints with per-cpu data 2019-06-12 5:00 ` Andrii Nakryiko @ 2019-06-13 22:47 ` Daniel Borkmann 2019-06-14 0:51 ` Matt Mullins 0 siblings, 1 reply; 9+ messages in thread From: Daniel Borkmann @ 2019-06-13 22:47 UTC (permalink / raw) To: Andrii Nakryiko, Matt Mullins Cc: hall, Alexei Starovoitov, bpf, Networking, open list, Steven Rostedt, Ingo Molnar, Martin KaFai Lau, Song Liu, Yonghong Song On 06/12/2019 07:00 AM, Andrii Nakryiko wrote: > On Tue, Jun 11, 2019 at 8:48 PM Matt Mullins <mmullins@fb.com> wrote: >> >> BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as >> they do not increment bpf_prog_active while executing. >> >> This enables three levels of nesting, to support >> - a kprobe or raw tp or perf event, >> - another one of the above that irq context happens to call, and >> - another one in nmi context >> (at most one of which may be a kprobe or perf event). >> >> Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data") Generally, looks good to me. Two things below: Nit, for stable, shouldn't fixes tag be c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT") instead of the one you currently have? One more question / clarification: we have __bpf_trace_run() vs trace_call_bpf(). Only raw tracepoints can be nested since the rest has the bpf_prog_active per-CPU counter via trace_call_bpf() and would bail out otherwise, iiuc. And raw ones use the __bpf_trace_run() added in c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT"). 1) I tried to recall and find a rationale for mentioned trace_call_bpf() split in the c4f6699dfcb8 log, but couldn't find any. Is the raison d'être purely because of performance overhead (and desire to not miss events as a result of nesting)? (This also means we're not protected by bpf_prog_active in all the map ops, of course.) 2) Wouldn't this also mean that we only need to fix the raw tp programs via get_bpf_raw_tp_regs() / put_bpf_raw_tp_regs() and won't need this duplication for the rest which relies upon trace_call_bpf()? I'm probably missing something, but given they have separate pt_regs there, how could they be affected then? Thanks, Daniel >> Signed-off-by: Matt Mullins <mmullins@fb.com> >> --- > > LGTM, minor nit below. > > Acked-by: Andrii Nakryiko <andriin@fb.com> > >> v1->v2: >> * reverse-Christmas-tree-ize the declarations in bpf_perf_event_output >> * instantiate err more readably >> >> I've done additional testing with the original workload that hit the >> irq+raw-tp reentrancy problem, and as far as I can tell, it's still >> solved with this solution (as opposed to my earlier per-map-element >> version). 
>> >> kernel/trace/bpf_trace.c | 100 ++++++++++++++++++++++++++++++++------- >> 1 file changed, 84 insertions(+), 16 deletions(-) >> >> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c >> index f92d6ad5e080..1c9a4745e596 100644 >> --- a/kernel/trace/bpf_trace.c >> +++ b/kernel/trace/bpf_trace.c >> @@ -410,8 +410,6 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = { >> .arg4_type = ARG_CONST_SIZE, >> }; >> >> -static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd); >> - >> static __always_inline u64 >> __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, >> u64 flags, struct perf_sample_data *sd) >> @@ -442,24 +440,50 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, >> return perf_event_output(event, sd, regs); >> } >> >> +/* >> + * Support executing tracepoints in normal, irq, and nmi context that each call >> + * bpf_perf_event_output >> + */ >> +struct bpf_trace_sample_data { >> + struct perf_sample_data sds[3]; >> +}; >> + >> +static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds); >> +static DEFINE_PER_CPU(int, bpf_trace_nest_level); >> BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, >> u64, flags, void *, data, u64, size) >> { >> - struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd); >> + struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds); >> + int nest_level = this_cpu_inc_return(bpf_trace_nest_level); >> struct perf_raw_record raw = { >> .frag = { >> .size = size, >> .data = data, >> }, >> }; >> + struct perf_sample_data *sd; >> + int err; >> >> - if (unlikely(flags & ~(BPF_F_INDEX_MASK))) >> - return -EINVAL; >> + if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) { >> + err = -EBUSY; >> + goto out; >> + } >> + >> + sd = &sds->sds[nest_level - 1]; >> + >> + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) { >> + err = -EINVAL; >> + goto out; >> + } > > Feel free to ignore, but just stylistically, given this check doesn't > depend on sd, I'd move it either to the very top or right after > `nest_level > ARRAY_SIZE(sds->sds)` check, so that all the error > checking is grouped without interspersed assignment. Makes sense. >> perf_sample_data_init(sd, 0, 0); >> sd->raw = &raw; >> >> - return __bpf_perf_event_output(regs, map, flags, sd); >> + err = __bpf_perf_event_output(regs, map, flags, sd); >> + >> +out: >> + this_cpu_dec(bpf_trace_nest_level); >> + return err; >> } >> >> static const struct bpf_func_proto bpf_perf_event_output_proto = { >> @@ -822,16 +846,48 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) >> /* >> * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp >> * to avoid potential recursive reuse issue when/if tracepoints are added >> - * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack >> + * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack. >> + * >> + * Since raw tracepoints run despite bpf_prog_active, support concurrent usage >> + * in normal, irq, and nmi context. 
>> */ >> -static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs); >> +struct bpf_raw_tp_regs { >> + struct pt_regs regs[3]; >> +}; >> +static DEFINE_PER_CPU(struct bpf_raw_tp_regs, bpf_raw_tp_regs); >> +static DEFINE_PER_CPU(int, bpf_raw_tp_nest_level); >> +static struct pt_regs *get_bpf_raw_tp_regs(void) >> +{ >> + struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs); >> + int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level); >> + >> + if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) { >> + this_cpu_dec(bpf_raw_tp_nest_level); >> + return ERR_PTR(-EBUSY); >> + } >> + >> + return &tp_regs->regs[nest_level - 1]; >> +} >> + >> +static void put_bpf_raw_tp_regs(void) >> +{ >> + this_cpu_dec(bpf_raw_tp_nest_level); >> +} >> + >> BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args, >> struct bpf_map *, map, u64, flags, void *, data, u64, size) >> { >> - struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); >> + struct pt_regs *regs = get_bpf_raw_tp_regs(); >> + int ret; >> + >> + if (IS_ERR(regs)) >> + return PTR_ERR(regs); >> >> perf_fetch_caller_regs(regs); >> - return ____bpf_perf_event_output(regs, map, flags, data, size); >> + ret = ____bpf_perf_event_output(regs, map, flags, data, size); >> + >> + put_bpf_raw_tp_regs(); >> + return ret; >> } >> >> static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { >> @@ -848,12 +904,18 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { >> BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, >> struct bpf_map *, map, u64, flags) >> { >> - struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); >> + struct pt_regs *regs = get_bpf_raw_tp_regs(); >> + int ret; >> + >> + if (IS_ERR(regs)) >> + return PTR_ERR(regs); >> >> perf_fetch_caller_regs(regs); >> /* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */ >> - return bpf_get_stackid((unsigned long) regs, (unsigned long) map, >> - flags, 0, 0); >> + ret = bpf_get_stackid((unsigned long) regs, (unsigned long) map, >> + flags, 0, 0); >> + put_bpf_raw_tp_regs(); >> + return ret; >> } >> >> static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { >> @@ -868,11 +930,17 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { >> BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args, >> void *, buf, u32, size, u64, flags) >> { >> - struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); >> + struct pt_regs *regs = get_bpf_raw_tp_regs(); >> + int ret; >> + >> + if (IS_ERR(regs)) >> + return PTR_ERR(regs); >> >> perf_fetch_caller_regs(regs); >> - return bpf_get_stack((unsigned long) regs, (unsigned long) buf, >> - (unsigned long) size, flags, 0); >> + ret = bpf_get_stack((unsigned long) regs, (unsigned long) buf, >> + (unsigned long) size, flags, 0); >> + put_bpf_raw_tp_regs(); >> + return ret; >> } >> >> static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = { >> -- >> 2.17.1 >> ^ permalink raw reply [flat|nested] 9+ messages in thread
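For readers following the trace_call_bpf() vs. __bpf_trace_run() distinction raised above: the former (used for kprobes, "classic" tracepoints, and perf events) silently drops nested invocations via the per-CPU bpf_prog_active counter, roughly as sketched below (a simplified rendering of kernel/trace/bpf_trace.c from this era; the in_nmi() special case and RCU details are omitted). __bpf_trace_run() for raw tracepoints has no such guard, which is why only the raw-tp entry points and the helpers they share with other program types can observe nesting in per-CPU scratch data.

/* Simplified sketch of the recursion guard in trace_call_bpf(). */
unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
{
        unsigned int ret;

        preempt_disable();
        if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
                /* a tracing prog is already running on this CPU: skip event */
                ret = 0;
                goto out;
        }
        ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, BPF_PROG_RUN);
out:
        __this_cpu_dec(bpf_prog_active);
        preempt_enable();
        return ret;
}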
* Re: [PATCH bpf v2] bpf: fix nested bpf tracepoints with per-cpu data 2019-06-13 22:47 ` Daniel Borkmann @ 2019-06-14 0:51 ` Matt Mullins 2019-06-14 0:55 ` Alexei Starovoitov 2019-06-14 14:50 ` Daniel Borkmann 0 siblings, 2 replies; 9+ messages in thread From: Matt Mullins @ 2019-06-14 0:51 UTC (permalink / raw) To: daniel, andrii.nakryiko Cc: Song Liu, linux-kernel, bpf, rostedt, ast, Andrew Hall, mingo, netdev, Martin Lau, Yonghong Song On Fri, 2019-06-14 at 00:47 +0200, Daniel Borkmann wrote: > On 06/12/2019 07:00 AM, Andrii Nakryiko wrote: > > On Tue, Jun 11, 2019 at 8:48 PM Matt Mullins <mmullins@fb.com> wrote: > > > > > > BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as > > > they do not increment bpf_prog_active while executing. > > > > > > This enables three levels of nesting, to support > > > - a kprobe or raw tp or perf event, > > > - another one of the above that irq context happens to call, and > > > - another one in nmi context > > > (at most one of which may be a kprobe or perf event). > > > > > > Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data") > > Generally, looks good to me. Two things below: > > Nit, for stable, shouldn't fixes tag be c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT") > instead of the one you currently have? Ah, yeah, that's probably more reasonable; I haven't managed to come up with a scenario where one could hit this without raw tracepoints. I'll fix up the nits that've accumulated since v2. > One more question / clarification: we have __bpf_trace_run() vs trace_call_bpf(). > > Only raw tracepoints can be nested since the rest has the bpf_prog_active per-CPU > counter via trace_call_bpf() and would bail out otherwise, iiuc. And raw ones use > the __bpf_trace_run() added in c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT"). > > 1) I tried to recall and find a rationale for mentioned trace_call_bpf() split in > the c4f6699dfcb8 log, but couldn't find any. Is the raison d'être purely because of > performance overhead (and desire to not miss events as a result of nesting)? (This > also means we're not protected by bpf_prog_active in all the map ops, of course.) > 2) Wouldn't this also mean that we only need to fix the raw tp programs via > get_bpf_raw_tp_regs() / put_bpf_raw_tp_regs() and won't need this duplication for > the rest which relies upon trace_call_bpf()? I'm probably missing something, but > given they have separate pt_regs there, how could they be affected then? For the pt_regs, you're correct: I only used get/put_raw_tp_regs for the _raw_tp variants. However, consider the following nesting: trace_nest_level raw_tp_nest_level (kprobe) bpf_perf_event_output 1 0 (raw_tp) bpf_perf_event_output_raw_tp 2 1 (raw_tp) bpf_get_stackid_raw_tp 2 2 I need to increment a nest level (and ideally increment it only once) between the kprobe and the first raw_tp, because they would otherwise share the struct perf_sample_data. But I also need to increment a nest level between the two raw_tps, since they share the pt_regs -- I can't use trace_nest_level for everything because it's not used by get_stackid, and I can't use raw_tp_nest_level for everything because it's not incremented by kprobes. If raw tracepoints were to bump bpf_prog_active, then I could get away with just using that count in these callsites -- I'm reluctant to do that, though, since it would prevent kprobes from ever running inside a raw_tp. I'd like to retain the ability to (e.g.) 
trace.py -K htab_map_update_elem and get some stack traces from at least within raw tracepoints. That said, as I wrote up this example, bpf_trace_nest_level seems to be wildly misnamed; I should name those after the structure they're protecting... > Thanks, > Daniel > > > > Signed-off-by: Matt Mullins <mmullins@fb.com> > > > --- > > > > LGTM, minor nit below. > > > > Acked-by: Andrii Nakryiko <andriin@fb.com> > > > > > v1->v2: > > > * reverse-Christmas-tree-ize the declarations in bpf_perf_event_output > > > * instantiate err more readably > > > > > > I've done additional testing with the original workload that hit the > > > irq+raw-tp reentrancy problem, and as far as I can tell, it's still > > > solved with this solution (as opposed to my earlier per-map-element > > > version). > > > > > > kernel/trace/bpf_trace.c | 100 ++++++++++++++++++++++++++++++++------- > > > 1 file changed, 84 insertions(+), 16 deletions(-) > > > > > > diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c > > > index f92d6ad5e080..1c9a4745e596 100644 > > > --- a/kernel/trace/bpf_trace.c > > > +++ b/kernel/trace/bpf_trace.c > > > @@ -410,8 +410,6 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = { > > > .arg4_type = ARG_CONST_SIZE, > > > }; > > > > > > -static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd); > > > - > > > static __always_inline u64 > > > __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, > > > u64 flags, struct perf_sample_data *sd) > > > @@ -442,24 +440,50 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, > > > return perf_event_output(event, sd, regs); > > > } > > > > > > +/* > > > + * Support executing tracepoints in normal, irq, and nmi context that each call > > > + * bpf_perf_event_output > > > + */ > > > +struct bpf_trace_sample_data { > > > + struct perf_sample_data sds[3]; > > > +}; > > > + > > > +static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds); > > > +static DEFINE_PER_CPU(int, bpf_trace_nest_level); > > > BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, > > > u64, flags, void *, data, u64, size) > > > { > > > - struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd); > > > + struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds); > > > + int nest_level = this_cpu_inc_return(bpf_trace_nest_level); > > > struct perf_raw_record raw = { > > > .frag = { > > > .size = size, > > > .data = data, > > > }, > > > }; > > > + struct perf_sample_data *sd; > > > + int err; > > > > > > - if (unlikely(flags & ~(BPF_F_INDEX_MASK))) > > > - return -EINVAL; > > > + if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) { > > > + err = -EBUSY; > > > + goto out; > > > + } > > > + > > > + sd = &sds->sds[nest_level - 1]; > > > + > > > + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) { > > > + err = -EINVAL; > > > + goto out; > > > + } > > > > Feel free to ignore, but just stylistically, given this check doesn't > > depend on sd, I'd move it either to the very top or right after > > `nest_level > ARRAY_SIZE(sds->sds)` check, so that all the error > > checking is grouped without interspersed assignment. > > Makes sense. 
> > > > perf_sample_data_init(sd, 0, 0); > > > sd->raw = &raw; > > > > > > - return __bpf_perf_event_output(regs, map, flags, sd); > > > + err = __bpf_perf_event_output(regs, map, flags, sd); > > > + > > > +out: > > > + this_cpu_dec(bpf_trace_nest_level); > > > + return err; > > > } > > > > > > static const struct bpf_func_proto bpf_perf_event_output_proto = { > > > @@ -822,16 +846,48 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) > > > /* > > > * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp > > > * to avoid potential recursive reuse issue when/if tracepoints are added > > > - * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack > > > + * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack. > > > + * > > > + * Since raw tracepoints run despite bpf_prog_active, support concurrent usage > > > + * in normal, irq, and nmi context. > > > */ > > > -static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs); > > > +struct bpf_raw_tp_regs { > > > + struct pt_regs regs[3]; > > > +}; > > > +static DEFINE_PER_CPU(struct bpf_raw_tp_regs, bpf_raw_tp_regs); > > > +static DEFINE_PER_CPU(int, bpf_raw_tp_nest_level); > > > +static struct pt_regs *get_bpf_raw_tp_regs(void) > > > +{ > > > + struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs); > > > + int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level); > > > + > > > + if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) { > > > + this_cpu_dec(bpf_raw_tp_nest_level); > > > + return ERR_PTR(-EBUSY); > > > + } > > > + > > > + return &tp_regs->regs[nest_level - 1]; > > > +} > > > + > > > +static void put_bpf_raw_tp_regs(void) > > > +{ > > > + this_cpu_dec(bpf_raw_tp_nest_level); > > > +} > > > + > > > BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args, > > > struct bpf_map *, map, u64, flags, void *, data, u64, size) > > > { > > > - struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); > > > + struct pt_regs *regs = get_bpf_raw_tp_regs(); > > > + int ret; > > > + > > > + if (IS_ERR(regs)) > > > + return PTR_ERR(regs); > > > > > > perf_fetch_caller_regs(regs); > > > - return ____bpf_perf_event_output(regs, map, flags, data, size); > > > + ret = ____bpf_perf_event_output(regs, map, flags, data, size); > > > + > > > + put_bpf_raw_tp_regs(); > > > + return ret; > > > } > > > > > > static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { > > > @@ -848,12 +904,18 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { > > > BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, > > > struct bpf_map *, map, u64, flags) > > > { > > > - struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); > > > + struct pt_regs *regs = get_bpf_raw_tp_regs(); > > > + int ret; > > > + > > > + if (IS_ERR(regs)) > > > + return PTR_ERR(regs); > > > > > > perf_fetch_caller_regs(regs); > > > /* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */ > > > - return bpf_get_stackid((unsigned long) regs, (unsigned long) map, > > > - flags, 0, 0); > > > + ret = bpf_get_stackid((unsigned long) regs, (unsigned long) map, > > > + flags, 0, 0); > > > + put_bpf_raw_tp_regs(); > > > + return ret; > > > } > > > > > > static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { > > > @@ -868,11 +930,17 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { > > > BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args, > > > void *, buf, u32, size, u64, flags) > > > { > 
> > - struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); > > > + struct pt_regs *regs = get_bpf_raw_tp_regs(); > > > + int ret; > > > + > > > + if (IS_ERR(regs)) > > > + return PTR_ERR(regs); > > > > > > perf_fetch_caller_regs(regs); > > > - return bpf_get_stack((unsigned long) regs, (unsigned long) buf, > > > - (unsigned long) size, flags, 0); > > > + ret = bpf_get_stack((unsigned long) regs, (unsigned long) buf, > > > + (unsigned long) size, flags, 0); > > > + put_bpf_raw_tp_regs(); > > > + return ret; > > > } > > > > > > static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = { > > > -- > > > 2.17.1 > > > > > ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH bpf v2] bpf: fix nested bpf tracepoints with per-cpu data 2019-06-14 0:51 ` Matt Mullins @ 2019-06-14 0:55 ` Alexei Starovoitov 2019-06-14 15:16 ` Daniel Borkmann 2019-06-14 14:50 ` Daniel Borkmann 1 sibling, 1 reply; 9+ messages in thread From: Alexei Starovoitov @ 2019-06-14 0:55 UTC (permalink / raw) To: Matt Mullins Cc: daniel, andrii.nakryiko, Song Liu, linux-kernel, bpf, rostedt, ast, Andrew Hall, mingo, netdev, Martin Lau, Yonghong Song On Thu, Jun 13, 2019 at 5:52 PM Matt Mullins <mmullins@fb.com> wrote: > > On Fri, 2019-06-14 at 00:47 +0200, Daniel Borkmann wrote: > > On 06/12/2019 07:00 AM, Andrii Nakryiko wrote: > > > On Tue, Jun 11, 2019 at 8:48 PM Matt Mullins <mmullins@fb.com> wrote: > > > > > > > > BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as > > > > they do not increment bpf_prog_active while executing. > > > > > > > > This enables three levels of nesting, to support > > > > - a kprobe or raw tp or perf event, > > > > - another one of the above that irq context happens to call, and > > > > - another one in nmi context > > > > (at most one of which may be a kprobe or perf event). > > > > > > > > Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data") > > > > Generally, looks good to me. Two things below: > > > > Nit, for stable, shouldn't fixes tag be c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT") > > instead of the one you currently have? > > Ah, yeah, that's probably more reasonable; I haven't managed to come up > with a scenario where one could hit this without raw tracepoints. I'll > fix up the nits that've accumulated since v2. > > > One more question / clarification: we have __bpf_trace_run() vs trace_call_bpf(). > > > > Only raw tracepoints can be nested since the rest has the bpf_prog_active per-CPU > > counter via trace_call_bpf() and would bail out otherwise, iiuc. And raw ones use > > the __bpf_trace_run() added in c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT"). > > > > 1) I tried to recall and find a rationale for mentioned trace_call_bpf() split in > > the c4f6699dfcb8 log, but couldn't find any. Is the raison d'être purely because of > > performance overhead (and desire to not miss events as a result of nesting)? (This > > also means we're not protected by bpf_prog_active in all the map ops, of course.) > > 2) Wouldn't this also mean that we only need to fix the raw tp programs via > > get_bpf_raw_tp_regs() / put_bpf_raw_tp_regs() and won't need this duplication for > > the rest which relies upon trace_call_bpf()? I'm probably missing something, but > > given they have separate pt_regs there, how could they be affected then? > > For the pt_regs, you're correct: I only used get/put_raw_tp_regs for > the _raw_tp variants. However, consider the following nesting: > > trace_nest_level raw_tp_nest_level > (kprobe) bpf_perf_event_output 1 0 > (raw_tp) bpf_perf_event_output_raw_tp 2 1 > (raw_tp) bpf_get_stackid_raw_tp 2 2 > > I need to increment a nest level (and ideally increment it only once) > between the kprobe and the first raw_tp, because they would otherwise > share the struct perf_sample_data. But I also need to increment a nest > level between the two raw_tps, since they share the pt_regs -- I can't > use trace_nest_level for everything because it's not used by > get_stackid, and I can't use raw_tp_nest_level for everything because > it's not incremented by kprobes. 
> > If raw tracepoints were to bump bpf_prog_active, then I could get away > with just using that count in these callsites -- I'm reluctant to do > that, though, since it would prevent kprobes from ever running inside a > raw_tp. I'd like to retain the ability to (e.g.) > trace.py -K htab_map_update_elem > and get some stack traces from at least within raw tracepoints. > > That said, as I wrote up this example, bpf_trace_nest_level seems to be > wildly misnamed; I should name those after the structure they're > protecting... I still don't get what's wrong with the previous approach. Didn't I manage to convince both of you that perf_sample_data inside bpf_perf_event_array doesn't have any issue that Daniel brought up? I think this refcnting approach is inferior. ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH bpf v2] bpf: fix nested bpf tracepoints with per-cpu data 2019-06-14 0:55 ` Alexei Starovoitov @ 2019-06-14 15:16 ` Daniel Borkmann 0 siblings, 0 replies; 9+ messages in thread From: Daniel Borkmann @ 2019-06-14 15:16 UTC (permalink / raw) To: Alexei Starovoitov, Matt Mullins Cc: andrii.nakryiko, Song Liu, linux-kernel, bpf, rostedt, ast, Andrew Hall, mingo, netdev, Martin Lau, Yonghong Song On 06/14/2019 02:55 AM, Alexei Starovoitov wrote: > On Thu, Jun 13, 2019 at 5:52 PM Matt Mullins <mmullins@fb.com> wrote: >> On Fri, 2019-06-14 at 00:47 +0200, Daniel Borkmann wrote: >>> On 06/12/2019 07:00 AM, Andrii Nakryiko wrote: >>>> On Tue, Jun 11, 2019 at 8:48 PM Matt Mullins <mmullins@fb.com> wrote: >>>>> >>>>> BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as >>>>> they do not increment bpf_prog_active while executing. >>>>> >>>>> This enables three levels of nesting, to support >>>>> - a kprobe or raw tp or perf event, >>>>> - another one of the above that irq context happens to call, and >>>>> - another one in nmi context >>>>> (at most one of which may be a kprobe or perf event). >>>>> >>>>> Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data") >>> >>> Generally, looks good to me. Two things below: >>> >>> Nit, for stable, shouldn't fixes tag be c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT") >>> instead of the one you currently have? >> >> Ah, yeah, that's probably more reasonable; I haven't managed to come up >> with a scenario where one could hit this without raw tracepoints. I'll >> fix up the nits that've accumulated since v2. >> >>> One more question / clarification: we have __bpf_trace_run() vs trace_call_bpf(). >>> >>> Only raw tracepoints can be nested since the rest has the bpf_prog_active per-CPU >>> counter via trace_call_bpf() and would bail out otherwise, iiuc. And raw ones use >>> the __bpf_trace_run() added in c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT"). >>> >>> 1) I tried to recall and find a rationale for mentioned trace_call_bpf() split in >>> the c4f6699dfcb8 log, but couldn't find any. Is the raison d'être purely because of >>> performance overhead (and desire to not miss events as a result of nesting)? (This >>> also means we're not protected by bpf_prog_active in all the map ops, of course.) >>> 2) Wouldn't this also mean that we only need to fix the raw tp programs via >>> get_bpf_raw_tp_regs() / put_bpf_raw_tp_regs() and won't need this duplication for >>> the rest which relies upon trace_call_bpf()? I'm probably missing something, but >>> given they have separate pt_regs there, how could they be affected then? >> >> For the pt_regs, you're correct: I only used get/put_raw_tp_regs for >> the _raw_tp variants. However, consider the following nesting: >> >> trace_nest_level raw_tp_nest_level >> (kprobe) bpf_perf_event_output 1 0 >> (raw_tp) bpf_perf_event_output_raw_tp 2 1 >> (raw_tp) bpf_get_stackid_raw_tp 2 2 >> >> I need to increment a nest level (and ideally increment it only once) >> between the kprobe and the first raw_tp, because they would otherwise >> share the struct perf_sample_data. But I also need to increment a nest >> level between the two raw_tps, since they share the pt_regs -- I can't >> use trace_nest_level for everything because it's not used by >> get_stackid, and I can't use raw_tp_nest_level for everything because >> it's not incremented by kprobes. 
>> >> If raw tracepoints were to bump bpf_prog_active, then I could get away >> with just using that count in these callsites -- I'm reluctant to do >> that, though, since it would prevent kprobes from ever running inside a >> raw_tp. I'd like to retain the ability to (e.g.) >> trace.py -K htab_map_update_elem >> and get some stack traces from at least within raw tracepoints. >> >> That said, as I wrote up this example, bpf_trace_nest_level seems to be >> wildly misnamed; I should name those after the structure they're >> protecting... > > I still don't get what's wrong with the previous approach. > Didn't I manage to convince both of you that perf_sample_data > inside bpf_perf_event_array doesn't have any issue that Daniel brought up? > I think this refcnting approach is inferior. Hm, but looking at perf RB handling code, it can deal with nesting situation just fine. I think this was your main concern with prior email: because I suspect that 'struct bpf_event_entry' is not reentrable (even w/o issues with 'struct perf_sample_data'). Even if we always use 'struct perf_sample_data' on stack, I suspect the same 'struct bpf_event_entry' cannot be reused in the nested way. Check the perf_output_{get,put}_handle() and the way it updates head pointer and does the final propagation to user_page. So if it's designed to handle these situations, then bailing out doesn't make sense from BPF side. ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH bpf v2] bpf: fix nested bpf tracepoints with per-cpu data 2019-06-14 0:51 ` Matt Mullins 2019-06-14 0:55 ` Alexei Starovoitov @ 2019-06-14 14:50 ` Daniel Borkmann 2019-06-14 17:25 ` Matt Mullins 1 sibling, 1 reply; 9+ messages in thread From: Daniel Borkmann @ 2019-06-14 14:50 UTC (permalink / raw) To: Matt Mullins, andrii.nakryiko Cc: Song Liu, linux-kernel, bpf, rostedt, ast, Andrew Hall, mingo, netdev, Martin Lau, Yonghong Song On 06/14/2019 02:51 AM, Matt Mullins wrote: > On Fri, 2019-06-14 at 00:47 +0200, Daniel Borkmann wrote: >> On 06/12/2019 07:00 AM, Andrii Nakryiko wrote: >>> On Tue, Jun 11, 2019 at 8:48 PM Matt Mullins <mmullins@fb.com> wrote: >>>> >>>> BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as >>>> they do not increment bpf_prog_active while executing. >>>> >>>> This enables three levels of nesting, to support >>>> - a kprobe or raw tp or perf event, >>>> - another one of the above that irq context happens to call, and >>>> - another one in nmi context >>>> (at most one of which may be a kprobe or perf event). >>>> >>>> Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data") >> >> Generally, looks good to me. Two things below: >> >> Nit, for stable, shouldn't fixes tag be c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT") >> instead of the one you currently have? > > Ah, yeah, that's probably more reasonable; I haven't managed to come up > with a scenario where one could hit this without raw tracepoints. I'll > fix up the nits that've accumulated since v2. > >> One more question / clarification: we have __bpf_trace_run() vs trace_call_bpf(). >> >> Only raw tracepoints can be nested since the rest has the bpf_prog_active per-CPU >> counter via trace_call_bpf() and would bail out otherwise, iiuc. And raw ones use >> the __bpf_trace_run() added in c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT"). >> >> 1) I tried to recall and find a rationale for mentioned trace_call_bpf() split in >> the c4f6699dfcb8 log, but couldn't find any. Is the raison d'être purely because of >> performance overhead (and desire to not miss events as a result of nesting)? (This >> also means we're not protected by bpf_prog_active in all the map ops, of course.) >> 2) Wouldn't this also mean that we only need to fix the raw tp programs via >> get_bpf_raw_tp_regs() / put_bpf_raw_tp_regs() and won't need this duplication for >> the rest which relies upon trace_call_bpf()? I'm probably missing something, but >> given they have separate pt_regs there, how could they be affected then? > > For the pt_regs, you're correct: I only used get/put_raw_tp_regs for > the _raw_tp variants. However, consider the following nesting: > > trace_nest_level raw_tp_nest_level > (kprobe) bpf_perf_event_output 1 0 > (raw_tp) bpf_perf_event_output_raw_tp 2 1 > (raw_tp) bpf_get_stackid_raw_tp 2 2 > > I need to increment a nest level (and ideally increment it only once) > between the kprobe and the first raw_tp, because they would otherwise > share the struct perf_sample_data. But I also need to increment a nest I'm not sure I follow on this one: the former would still keep using the bpf_trace_sd as-is today since only ever /one/ can be active on a given CPU as we otherwise bail out in trace_call_bpf() due to bpf_prog_active counter. Given these two are /not/ shared, you only need the code you have below for nesting to deal with the raw_tps via get_bpf_raw_tp_regs() / put_bpf_raw_tp_regs() which should also simplify the code quite a bit. 
> level between the two raw_tps, since they share the pt_regs -- I can't > use trace_nest_level for everything because it's not used by > get_stackid, and I can't use raw_tp_nest_level for everything because > it's not incremented by kprobes. (See above wrt kprobes.) > If raw tracepoints were to bump bpf_prog_active, then I could get away > with just using that count in these callsites -- I'm reluctant to do > that, though, since it would prevent kprobes from ever running inside a > raw_tp. I'd like to retain the ability to (e.g.) > trace.py -K htab_map_update_elem > and get some stack traces from at least within raw tracepoints. > > That said, as I wrote up this example, bpf_trace_nest_level seems to be > wildly misnamed; I should name those after the structure they're > protecting... ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH bpf v2] bpf: fix nested bpf tracepoints with per-cpu data 2019-06-14 14:50 ` Daniel Borkmann @ 2019-06-14 17:25 ` Matt Mullins 0 siblings, 0 replies; 9+ messages in thread From: Matt Mullins @ 2019-06-14 17:25 UTC (permalink / raw) To: daniel, andrii.nakryiko Cc: Song Liu, linux-kernel, bpf, rostedt, ast, Andrew Hall, mingo, netdev, Martin Lau, Yonghong Song On Fri, 2019-06-14 at 16:50 +0200, Daniel Borkmann wrote: > On 06/14/2019 02:51 AM, Matt Mullins wrote: > > On Fri, 2019-06-14 at 00:47 +0200, Daniel Borkmann wrote: > > > On 06/12/2019 07:00 AM, Andrii Nakryiko wrote: > > > > On Tue, Jun 11, 2019 at 8:48 PM Matt Mullins <mmullins@fb.com> wrote: > > > > > > > > > > BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as > > > > > they do not increment bpf_prog_active while executing. > > > > > > > > > > This enables three levels of nesting, to support > > > > > - a kprobe or raw tp or perf event, > > > > > - another one of the above that irq context happens to call, and > > > > > - another one in nmi context > > > > > (at most one of which may be a kprobe or perf event). > > > > > > > > > > Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data") > > > > > > Generally, looks good to me. Two things below: > > > > > > Nit, for stable, shouldn't fixes tag be c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT") > > > instead of the one you currently have? > > > > Ah, yeah, that's probably more reasonable; I haven't managed to come up > > with a scenario where one could hit this without raw tracepoints. I'll > > fix up the nits that've accumulated since v2. > > > > > One more question / clarification: we have __bpf_trace_run() vs trace_call_bpf(). > > > > > > Only raw tracepoints can be nested since the rest has the bpf_prog_active per-CPU > > > counter via trace_call_bpf() and would bail out otherwise, iiuc. And raw ones use > > > the __bpf_trace_run() added in c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT"). > > > > > > 1) I tried to recall and find a rationale for mentioned trace_call_bpf() split in > > > the c4f6699dfcb8 log, but couldn't find any. Is the raison d'être purely because of > > > performance overhead (and desire to not miss events as a result of nesting)? (This > > > also means we're not protected by bpf_prog_active in all the map ops, of course.) > > > 2) Wouldn't this also mean that we only need to fix the raw tp programs via > > > get_bpf_raw_tp_regs() / put_bpf_raw_tp_regs() and won't need this duplication for > > > the rest which relies upon trace_call_bpf()? I'm probably missing something, but > > > given they have separate pt_regs there, how could they be affected then? > > > > For the pt_regs, you're correct: I only used get/put_raw_tp_regs for > > the _raw_tp variants. However, consider the following nesting: > > > > trace_nest_level raw_tp_nest_level > > (kprobe) bpf_perf_event_output 1 0 > > (raw_tp) bpf_perf_event_output_raw_tp 2 1 > > (raw_tp) bpf_get_stackid_raw_tp 2 2 > > > > I need to increment a nest level (and ideally increment it only once) > > between the kprobe and the first raw_tp, because they would otherwise > > share the struct perf_sample_data. But I also need to increment a nest > > I'm not sure I follow on this one: the former would still keep using the > bpf_trace_sd as-is today since only ever /one/ can be active on a given CPU > as we otherwise bail out in trace_call_bpf() due to bpf_prog_active counter. 
> Given these two are /not/ shared, you only need the code you have below for > nesting to deal with the raw_tps via get_bpf_raw_tp_regs() / put_bpf_raw_tp_regs() > which should also simplify the code quite a bit. bpf_perf_event_output_raw_tp calls ____bpf_perf_event_output, so it currently shares bpf_trace_sd with kprobes -- it _can_ be nested. > > > level between the two raw_tps, since they share the pt_regs -- I can't > > use trace_nest_level for everything because it's not used by > > get_stackid, and I can't use raw_tp_nest_level for everything because > > it's not incremented by kprobes. > > (See above wrt kprobes.) > > > If raw tracepoints were to bump bpf_prog_active, then I could get away > > with just using that count in these callsites -- I'm reluctant to do > > that, though, since it would prevent kprobes from ever running inside a > > raw_tp. I'd like to retain the ability to (e.g.) > > trace.py -K htab_map_update_elem > > and get some stack traces from at least within raw tracepoints. > > > > That said, as I wrote up this example, bpf_trace_nest_level seems to be > > wildly misnamed; I should name those after the structure they're > > protecting... ^ permalink raw reply [flat|nested] 9+ messages in thread
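On the quadruple-underscore call mentioned above: the BPF_CALL_x() macros in include/linux/filter.h generate a typed inner ____bpf_<name>() plus a u64-argument wrapper bpf_<name>() that serves as the helper entry point, so another helper such as bpf_perf_event_output_raw_tp() can invoke the typed inner function directly. Roughly (a simplified sketch, not the literal macro expansion):

static __always_inline
u64 ____bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
                              u64 flags, void *data, u64 size);

/* entry point referenced by bpf_perf_event_output_proto */
u64 bpf_perf_event_output(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
        return ____bpf_perf_event_output((struct pt_regs *)(unsigned long)r1,
                                         (struct bpf_map *)(unsigned long)r2,
                                         r3, (void *)(unsigned long)r4, r5);
}

/* ...which is why bpf_perf_event_output_raw_tp() reaches the same body, and
 * hence the same per-CPU perf_sample_data storage, as the kprobe path.
 */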
* Re: [PATCH bpf v2] bpf: fix nested bpf tracepoints with per-cpu data 2019-06-11 21:53 [PATCH bpf v2] bpf: fix nested bpf tracepoints with per-cpu data Matt Mullins 2019-06-12 5:00 ` Andrii Nakryiko @ 2019-06-15 23:35 ` Alexei Starovoitov 1 sibling, 0 replies; 9+ messages in thread From: Alexei Starovoitov @ 2019-06-15 23:35 UTC (permalink / raw) To: Matt Mullins Cc: Andrew Hall, Alexei Starovoitov, Daniel Borkmann, bpf, Network Development, LKML, Steven Rostedt, Ingo Molnar, Martin KaFai Lau, Song Liu, Yonghong Song On Tue, Jun 11, 2019 at 2:54 PM Matt Mullins <mmullins@fb.com> wrote: > > BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as > they do not increment bpf_prog_active while executing. > > This enables three levels of nesting, to support > - a kprobe or raw tp or perf event, > - another one of the above that irq context happens to call, and > - another one in nmi context > (at most one of which may be a kprobe or perf event). > > Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data") > Signed-off-by: Matt Mullins <mmullins@fb.com> Applied. Thanks ^ permalink raw reply [flat|nested] 9+ messages in thread