From mboxrd@z Thu Jan 1 00:00:00 1970 From: Martin KaFai Lau Subject: Re: [PATCH bpf-next v3 2/7] bpf: introduce bpf subcommand BPF_TASK_FD_QUERY Date: Wed, 23 May 2018 10:13:22 -0700 Message-ID: <20180523171320.ziswsvpsyelxb6fz@kafai-mbp> References: <20180522163048.3128924-1-yhs@fb.com> <20180522163048.3128924-3-yhs@fb.com> Mime-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Cc: , , , , To: Yonghong Song Return-path: Received: from mx0a-00082601.pphosted.com ([67.231.145.42]:56096 "EHLO mx0a-00082601.pphosted.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S932667AbeEWROL (ORCPT ); Wed, 23 May 2018 13:14:11 -0400 Content-Disposition: inline In-Reply-To: <20180522163048.3128924-3-yhs@fb.com> Sender: netdev-owner@vger.kernel.org List-ID: On Tue, May 22, 2018 at 09:30:46AM -0700, Yonghong Song wrote: > Currently, suppose a userspace application has loaded a bpf program > and attached it to a tracepoint/kprobe/uprobe, and a bpf > introspection tool, e.g., bpftool, wants to show which bpf program > is attached to which tracepoint/kprobe/uprobe. Such attachment > information will be really useful to understand the overall bpf > deployment in the system. > > There is a name field (16 bytes) for each program, which could > be used to encode the attachment point. There are some drawbacks > for this approaches. First, bpftool user (e.g., an admin) may not > really understand the association between the name and the > attachment point. Second, if one program is attached to multiple > places, encoding a proper name which can imply all these > attachments becomes difficult. > > This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY. > Given a pid and fd, if the is associated with a > tracepoint/kprobe/uprobe perf event, BPF_TASK_FD_QUERY will return > . prog_id > . tracepoint name, or > . k[ret]probe funcname + offset or kernel addr, or > . u[ret]probe filename + offset > to the userspace. > The user can use "bpftool prog" to find more information about > bpf program itself with prog_id. LGTM, some comments inline. > > Signed-off-by: Yonghong Song > --- > include/linux/trace_events.h | 16 ++++++ > include/uapi/linux/bpf.h | 27 ++++++++++ > kernel/bpf/syscall.c | 124 +++++++++++++++++++++++++++++++++++++++++++ > kernel/trace/bpf_trace.c | 48 +++++++++++++++++ > kernel/trace/trace_kprobe.c | 29 ++++++++++ > kernel/trace/trace_uprobe.c | 22 ++++++++ > 6 files changed, 266 insertions(+) > > diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h > index 2bde3ef..eab806d 100644 > --- a/include/linux/trace_events.h > +++ b/include/linux/trace_events.h > @@ -473,6 +473,9 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info); > int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog); > int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog); > struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name); > +int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, > + u32 *attach_info, const char **buf, > + u64 *probe_offset, u64 *probe_addr); The first arg is 'const struct perf_event *event' while... > #else > static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) > { > @@ -504,6 +507,12 @@ static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name > { > return NULL; > } > +static inline int bpf_get_perf_event_info(const struct file *file, u32 *prog_id, this one has 'const struct file *file'? > + u32 *attach_info, const char **buf, > + u64 *probe_offset, u64 *probe_addr) > +{ > + return -EOPNOTSUPP; > +} > #endif > > enum { > @@ -560,10 +569,17 @@ extern void perf_trace_del(struct perf_event *event, int flags); > #ifdef CONFIG_KPROBE_EVENTS > extern int perf_kprobe_init(struct perf_event *event, bool is_retprobe); > extern void perf_kprobe_destroy(struct perf_event *event); > +extern int bpf_get_kprobe_info(const struct perf_event *event, > + u32 *attach_info, const char **symbol, > + u64 *probe_offset, u64 *probe_addr, > + bool perf_type_tracepoint); > #endif > #ifdef CONFIG_UPROBE_EVENTS > extern int perf_uprobe_init(struct perf_event *event, bool is_retprobe); > extern void perf_uprobe_destroy(struct perf_event *event); > +extern int bpf_get_uprobe_info(const struct perf_event *event, > + u32 *attach_info, const char **filename, > + u64 *probe_offset, bool perf_type_tracepoint); > #endif > extern int ftrace_profile_set_filter(struct perf_event *event, int event_id, > char *filter_str); > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index 97446bb..a602150 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -97,6 +97,7 @@ enum bpf_cmd { > BPF_RAW_TRACEPOINT_OPEN, > BPF_BTF_LOAD, > BPF_BTF_GET_FD_BY_ID, > + BPF_TASK_FD_QUERY, > }; > > enum bpf_map_type { > @@ -379,6 +380,22 @@ union bpf_attr { > __u32 btf_log_size; > __u32 btf_log_level; > }; > + > + struct { > + int pid; /* input: pid */ > + int fd; /* input: fd */ Should fd and pid be always positive? The current fd (like map_fd) in bpf_attr is using __u32. > + __u32 flags; /* input: flags */ > + __u32 buf_len; /* input: buf len */ > + __aligned_u64 buf; /* input/output: > + * tp_name for tracepoint > + * symbol for kprobe > + * filename for uprobe > + */ > + __u32 prog_id; /* output: prod_id */ > + __u32 attach_info; /* output: BPF_ATTACH_* */ > + __u64 probe_offset; /* output: probe_offset */ > + __u64 probe_addr; /* output: probe_addr */ > + } task_fd_query; > } __attribute__((aligned(8))); > > /* The description below is an attempt at providing documentation to eBPF > @@ -2458,4 +2475,14 @@ struct bpf_fib_lookup { > __u8 dmac[6]; /* ETH_ALEN */ > }; > > +/* used by based query */ > +enum { Nit. Instead of a comment, is it better to give this enum a descriptive name? > + BPF_ATTACH_RAW_TRACEPOINT, /* tp name */ > + BPF_ATTACH_TRACEPOINT, /* tp name */ > + BPF_ATTACH_KPROBE, /* (symbol + offset) or addr */ > + BPF_ATTACH_KRETPROBE, /* (symbol + offset) or addr */ > + BPF_ATTACH_UPROBE, /* filename + offset */ > + BPF_ATTACH_URETPROBE, /* filename + offset */ > +}; > + > #endif /* _UAPI__LINUX_BPF_H__ */ > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > index bfcde94..9356f0e 100644 > --- a/kernel/bpf/syscall.c > +++ b/kernel/bpf/syscall.c > @@ -18,7 +18,9 @@ > #include > #include > #include > +#include > #include > +#include > #include > #include > #include > @@ -2102,6 +2104,125 @@ static int bpf_btf_get_fd_by_id(const union bpf_attr *attr) > return btf_get_fd_by_id(attr->btf_id); > } > > +static int bpf_task_fd_query_copy(const union bpf_attr *attr, > + union bpf_attr __user *uattr, > + u32 prog_id, u32 attach_info, > + const char *buf, u64 probe_offset, > + u64 probe_addr) > +{ > + __u64 __user *ubuf; Nit. ubuf is a string instead of an array of __u64? > + int len; > + > + ubuf = u64_to_user_ptr(attr->task_fd_query.buf); > + if (buf) { > + len = strlen(buf); > + if (attr->task_fd_query.buf_len < len + 1) I think the current convention is to take the min, copy whatever it can to buf and return the real len/size in buf_len. F.e., the prog_ids and prog_cnt in __cgroup_bpf_query(). Should the same be done here or it does not make sense to truncate the string? The user may/may not need the tailing char though if its pretty print has limited width anyway. The user still needs to know what the buf_len should be to retry also but I guess any reasonable buf_len should work? > + return -ENOSPC; > + if (copy_to_user(ubuf, buf, len + 1)) > + return -EFAULT; > + } else if (attr->task_fd_query.buf_len) { > + /* copy '\0' to ubuf */ > + __u8 zero = 0; > + > + if (copy_to_user(ubuf, &zero, 1)) > + return -EFAULT; > + } > + > + if (copy_to_user(&uattr->task_fd_query.prog_id, &prog_id, > + sizeof(prog_id)) || > + copy_to_user(&uattr->task_fd_query.attach_info, &attach_info, > + sizeof(attach_info)) || > + copy_to_user(&uattr->task_fd_query.probe_offset, &probe_offset, > + sizeof(probe_offset)) || > + copy_to_user(&uattr->task_fd_query.probe_addr, &probe_addr, > + sizeof(probe_addr))) Nit. put_user() may be able to shorten them. > + return -EFAULT; > + > + return 0; > +} > + > +#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr > + > +static int bpf_task_fd_query(const union bpf_attr *attr, > + union bpf_attr __user *uattr) > +{ > + pid_t pid = attr->task_fd_query.pid; > + int fd = attr->task_fd_query.fd; > + const struct perf_event *event; > + struct files_struct *files; > + struct task_struct *task; > + struct file *file; > + int err; > + > + if (CHECK_ATTR(BPF_TASK_FD_QUERY)) > + return -EINVAL; > + > + if (!capable(CAP_SYS_ADMIN)) > + return -EPERM; > + > + if (attr->task_fd_query.flags != 0) How flags is used? > + return -EINVAL; > + > + task = get_pid_task(find_vpid(pid), PIDTYPE_PID); > + if (!task) > + return -ENOENT; > + > + files = get_files_struct(task); > + put_task_struct(task); > + if (!files) > + return -ENOENT; > + > + err = 0; > + spin_lock(&files->file_lock); > + file = fcheck_files(files, fd); > + if (!file) > + err = -EBADF; > + else > + get_file(file); > + spin_unlock(&files->file_lock); > + put_files_struct(files); > + > + if (err) > + goto out; > + > + if (file->f_op == &bpf_raw_tp_fops) { > + struct bpf_raw_tracepoint *raw_tp = file->private_data; > + struct bpf_raw_event_map *btp = raw_tp->btp; > + > + if (!raw_tp->prog) > + err = -ENOENT; > + else > + err = bpf_task_fd_query_copy(attr, uattr, > + raw_tp->prog->aux->id, > + BPF_ATTACH_RAW_TRACEPOINT, > + btp->tp->name, 0, 0); > + goto put_file; > + } > + > + event = perf_get_event(file); > + if (!IS_ERR(event)) { > + u64 probe_offset, probe_addr; > + u32 prog_id, attach_info; > + const char *buf; > + > + err = bpf_get_perf_event_info(event, &prog_id, &attach_info, > + &buf, &probe_offset, > + &probe_addr); > + if (!err) > + err = bpf_task_fd_query_copy(attr, uattr, prog_id, > + attach_info, buf, > + probe_offset, > + probe_addr); > + goto put_file; > + } > + > + err = -ENOTSUPP; > +put_file: > + fput(file); > +out: > + return err; > +} > + > SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) > { > union bpf_attr attr = {}; > @@ -2188,6 +2309,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz > case BPF_BTF_GET_FD_BY_ID: > err = bpf_btf_get_fd_by_id(&attr); > break; > + case BPF_TASK_FD_QUERY: > + err = bpf_task_fd_query(&attr, uattr); > + break; > default: > err = -EINVAL; > break; > diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c > index ce2cbbf..323c80e 100644 > --- a/kernel/trace/bpf_trace.c > +++ b/kernel/trace/bpf_trace.c > @@ -14,6 +14,7 @@ > #include > #include > #include > +#include > #include > > #include "trace_probe.h" > @@ -1163,3 +1164,50 @@ int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog) > mutex_unlock(&bpf_event_mutex); > return err; > } > + > +int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, > + u32 *attach_info, const char **buf, > + u64 *probe_offset, u64 *probe_addr) > +{ > + bool is_tracepoint, is_syscall_tp; > + struct bpf_prog *prog; > + int flags, err = 0; > + > + prog = event->prog; > + if (!prog) > + return -ENOENT; > + > + /* not supporting BPF_PROG_TYPE_PERF_EVENT yet */ > + if (prog->type == BPF_PROG_TYPE_PERF_EVENT) > + return -EOPNOTSUPP; > + > + *prog_id = prog->aux->id; > + flags = event->tp_event->flags; > + is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT; > + is_syscall_tp = is_syscall_trace_event(event->tp_event); > + > + if (is_tracepoint || is_syscall_tp) { > + *buf = is_tracepoint ? event->tp_event->tp->name > + : event->tp_event->name; > + *attach_info = BPF_ATTACH_TRACEPOINT; > + *probe_offset = 0x0; > + *probe_addr = 0x0; > + } else { > + /* kprobe/uprobe */ > + err = -EOPNOTSUPP; > +#ifdef CONFIG_KPROBE_EVENTS > + if (flags & TRACE_EVENT_FL_KPROBE) > + err = bpf_get_kprobe_info(event, attach_info, buf, > + probe_offset, probe_addr, > + event->attr.type == PERF_TYPE_TRACEPOINT); > +#endif > +#ifdef CONFIG_UPROBE_EVENTS > + if (flags & TRACE_EVENT_FL_UPROBE) > + err = bpf_get_uprobe_info(event, attach_info, buf, > + probe_offset, > + event->attr.type == PERF_TYPE_TRACEPOINT); > +#endif > + } > + > + return err; > +} > diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c > index 02aed76..32e9190 100644 > --- a/kernel/trace/trace_kprobe.c > +++ b/kernel/trace/trace_kprobe.c > @@ -1287,6 +1287,35 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, > head, NULL); > } > NOKPROBE_SYMBOL(kretprobe_perf_func); > + > +int bpf_get_kprobe_info(const struct perf_event *event, u32 *attach_info, > + const char **symbol, u64 *probe_offset, > + u64 *probe_addr, bool perf_type_tracepoint) > +{ > + const char *pevent = trace_event_name(event->tp_event); > + const char *group = event->tp_event->class->system; > + struct trace_kprobe *tk; > + > + if (perf_type_tracepoint) > + tk = find_trace_kprobe(pevent, group); > + else > + tk = event->tp_event->data; > + if (!tk) > + return -EINVAL; > + > + *attach_info = trace_kprobe_is_return(tk) ? BPF_ATTACH_KRETPROBE > + : BPF_ATTACH_KPROBE; > + if (tk->symbol) { > + *symbol = tk->symbol; > + *probe_offset = tk->rp.kp.offset; > + *probe_addr = 0; > + } else { > + *symbol = NULL; > + *probe_offset = 0; > + *probe_addr = (unsigned long)tk->rp.kp.addr; > + } > + return 0; > +} > #endif /* CONFIG_PERF_EVENTS */ > > /* > diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c > index ac89287..12a3667 100644 > --- a/kernel/trace/trace_uprobe.c > +++ b/kernel/trace/trace_uprobe.c > @@ -1161,6 +1161,28 @@ static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, > { > __uprobe_perf_func(tu, func, regs, ucb, dsize); > } > + > +int bpf_get_uprobe_info(const struct perf_event *event, u32 *attach_info, > + const char **filename, u64 *probe_offset, > + bool perf_type_tracepoint) > +{ > + const char *pevent = trace_event_name(event->tp_event); > + const char *group = event->tp_event->class->system; > + struct trace_uprobe *tu; > + > + if (perf_type_tracepoint) > + tu = find_probe_event(pevent, group); > + else > + tu = event->tp_event->data; > + if (!tu) > + return -EINVAL; > + > + *attach_info = is_ret_probe(tu) ? BPF_ATTACH_URETPROBE > + : BPF_ATTACH_UPROBE; > + *filename = tu->filename; > + *probe_offset = tu->offset; > + return 0; > +} > #endif /* CONFIG_PERF_EVENTS */ > > static int > -- > 2.9.5 >