From: Yonghong Song
To: Martin KaFai Lau
Subject: Re: [PATCH bpf-next v3 2/7] bpf: introduce bpf subcommand BPF_TASK_FD_QUERY
Date: Wed, 23 May 2018 14:25:34 -0700
Message-ID: <0937648b-89b5-729c-0089-f0df2ed5e5b4@fb.com>
In-Reply-To: <20180523171320.ziswsvpsyelxb6fz@kafai-mbp>
References: <20180522163048.3128924-1-yhs@fb.com> <20180522163048.3128924-3-yhs@fb.com> <20180523171320.ziswsvpsyelxb6fz@kafai-mbp>

On 5/23/18 10:13 AM, Martin KaFai Lau wrote:
> On Tue, May 22, 2018 at 09:30:46AM -0700, Yonghong Song wrote:
>> Currently, suppose a userspace application has loaded a bpf program
>> and attached it to a tracepoint/kprobe/uprobe, and a bpf
>> introspection tool, e.g., bpftool, wants to show which bpf program
>> is attached to which tracepoint/kprobe/uprobe. Such attachment
>> information will be really useful to understand the overall bpf
>> deployment in the system.
>>
>> There is a name field (16 bytes) for each program, which could
>> be used to encode the attachment point. There are some drawbacks
>> to this approach. First, a bpftool user (e.g., an admin) may not
>> really understand the association between the name and the
>> attachment point. Second, if one program is attached to multiple
>> places, encoding a proper name which can imply all these
>> attachments becomes difficult.
>>
>> This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY.
>> Given a pid and fd, if the <pid, fd> is associated with a
>> tracepoint/kprobe/uprobe perf event, BPF_TASK_FD_QUERY will return
>>    . prog_id
>>    . tracepoint name, or
>>    . k[ret]probe funcname + offset or kernel addr, or
>>    . u[ret]probe filename + offset
>> to the userspace.
>> The user can use "bpftool prog" to find more information about
>> the bpf program itself with prog_id.
> LGTM, some comments inline.
>
>>
>> Signed-off-by: Yonghong Song
>> ---
>>  include/linux/trace_events.h |  16 ++++++
>>  include/uapi/linux/bpf.h     |  27 ++++++++++
>>  kernel/bpf/syscall.c         | 124 +++++++++++++++++++++++++++++++++++++++++++
>>  kernel/trace/bpf_trace.c     |  48 +++++++++++++++++
>>  kernel/trace/trace_kprobe.c  |  29 ++++++++++
>>  kernel/trace/trace_uprobe.c  |  22 ++++++++
>>  6 files changed, 266 insertions(+)
>>
>> diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
>> index 2bde3ef..eab806d 100644
>> --- a/include/linux/trace_events.h
>> +++ b/include/linux/trace_events.h
>> @@ -473,6 +473,9 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info);
>>  int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
>>  int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
>>  struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name);
>> +int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
>> +                            u32 *attach_info, const char **buf,
>> +                            u64 *probe_offset, u64 *probe_addr);
> The first arg is 'const struct perf_event *event' while...
>
>>  #else
>>  static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
>>  {
>> @@ -504,6 +507,12 @@ static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name
>>  {
>>          return NULL;
>>  }
>> +static inline int bpf_get_perf_event_info(const struct file *file, u32 *prog_id,
> this one has 'const struct file *file'?

Thanks for catching this. Will correct this in the next revision.

>
>> +                                          u32 *attach_info, const char **buf,
>> +                                          u64 *probe_offset, u64 *probe_addr)
>> +{
>> +        return -EOPNOTSUPP;
>> +}
>>  #endif
>>
>>  enum {
>> @@ -560,10 +569,17 @@ extern void perf_trace_del(struct perf_event *event, int flags);
>>  #ifdef CONFIG_KPROBE_EVENTS
>>  extern int perf_kprobe_init(struct perf_event *event, bool is_retprobe);
>>  extern void perf_kprobe_destroy(struct perf_event *event);
>> +extern int bpf_get_kprobe_info(const struct perf_event *event,
>> +                               u32 *attach_info, const char **symbol,
>> +                               u64 *probe_offset, u64 *probe_addr,
>> +                               bool perf_type_tracepoint);
>>  #endif
>>  #ifdef CONFIG_UPROBE_EVENTS
>>  extern int perf_uprobe_init(struct perf_event *event, bool is_retprobe);
>>  extern void perf_uprobe_destroy(struct perf_event *event);
>> +extern int bpf_get_uprobe_info(const struct perf_event *event,
>> +                               u32 *attach_info, const char **filename,
>> +                               u64 *probe_offset, bool perf_type_tracepoint);
>>  #endif
>>  extern int ftrace_profile_set_filter(struct perf_event *event, int event_id,
>>                                       char *filter_str);
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 97446bb..a602150 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -97,6 +97,7 @@ enum bpf_cmd {
>>          BPF_RAW_TRACEPOINT_OPEN,
>>          BPF_BTF_LOAD,
>>          BPF_BTF_GET_FD_BY_ID,
>> +        BPF_TASK_FD_QUERY,
>>  };
>>
>>  enum bpf_map_type {
>> @@ -379,6 +380,22 @@ union bpf_attr {
>>          __u32           btf_log_size;
>>          __u32           btf_log_level;
>>  };
>> +
>> +        struct {
>> +                int             pid;            /* input: pid */
>> +                int             fd;             /* input: fd */
> Should fd and pid be always positive?
> The current fd (like map_fd) in bpf_attr is using __u32.

Will change both pid and fd to __u32.

In the kernel, fd is unsigned, but pid_t is actually an int. A negative
pid usually refers to the process group the pid is in. Since we are only
concerned with an actual process pid here, __u32 should be okay.

>
>> +                __u32           flags;          /* input: flags */
>> +                __u32           buf_len;        /* input: buf len */
>> +                __aligned_u64   buf;            /* input/output:
>> +                                                 *   tp_name for tracepoint
>> +                                                 *   symbol for kprobe
>> +                                                 *   filename for uprobe
>> +                                                 */
>> +                __u32           prog_id;        /* output: prod_id */
>> +                __u32           attach_info;    /* output: BPF_ATTACH_* */
>> +                __u64           probe_offset;   /* output: probe_offset */
>> +                __u64           probe_addr;     /* output: probe_addr */
>> +        } task_fd_query;
>>  } __attribute__((aligned(8)));
>>
>>  /* The description below is an attempt at providing documentation to eBPF
>> @@ -2458,4 +2475,14 @@ struct bpf_fib_lookup {
>>          __u8    dmac[6];        /* ETH_ALEN */
>>  };
>>
>> +/* used by <pid, fd> based query */
>> +enum {
> Nit. Instead of a comment, is it better to give this
> enum a descriptive name?

Yes, will add an enum name bpf_task_fd_info to make it easier to
correlate with task_fd_query.
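For reference, here is a minimal userspace sketch against the v3 uapi
above, roughly what an introspection tool would do (untested; it assumes
headers that already carry the new BPF_TASK_FD_QUERY command and the
task_fd_query fields, and the wrapper name query_task_fd() is only
illustrative -- names may still change in the next revision):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* query which bpf program, if any, sits behind <pid, fd> */
static int query_task_fd(int pid, int fd, char *buf, __u32 buf_len)
{
        union bpf_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.task_fd_query.pid = pid;
        attr.task_fd_query.fd = fd;
        attr.task_fd_query.buf = (__u64)(unsigned long)buf;
        attr.task_fd_query.buf_len = buf_len;

        if (syscall(__NR_bpf, BPF_TASK_FD_QUERY, &attr, sizeof(attr)) < 0)
                return -errno;

        printf("prog_id %u attach_info %u offset 0x%llx addr 0x%llx buf %s\n",
               attr.task_fd_query.prog_id, attr.task_fd_query.attach_info,
               (unsigned long long)attr.task_fd_query.probe_offset,
               (unsigned long long)attr.task_fd_query.probe_addr, buf);
        return 0;
}

The prog_id printed here is what "bpftool prog" would then be used with
to get the rest of the program information.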
>
>> +        BPF_ATTACH_RAW_TRACEPOINT,      /* tp name */
>> +        BPF_ATTACH_TRACEPOINT,          /* tp name */
>> +        BPF_ATTACH_KPROBE,              /* (symbol + offset) or addr */
>> +        BPF_ATTACH_KRETPROBE,           /* (symbol + offset) or addr */
>> +        BPF_ATTACH_UPROBE,              /* filename + offset */
>> +        BPF_ATTACH_URETPROBE,           /* filename + offset */
>> +};
>> +
>>  #endif /* _UAPI__LINUX_BPF_H__ */
>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index bfcde94..9356f0e 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -18,7 +18,9 @@
>>  #include
>>  #include
>>  #include
>> +#include
>>  #include
>> +#include
>>  #include
>>  #include
>>  #include
>> @@ -2102,6 +2104,125 @@ static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
>>          return btf_get_fd_by_id(attr->btf_id);
>>  }
>>
>> +static int bpf_task_fd_query_copy(const union bpf_attr *attr,
>> +                                  union bpf_attr __user *uattr,
>> +                                  u32 prog_id, u32 attach_info,
>> +                                  const char *buf, u64 probe_offset,
>> +                                  u64 probe_addr)
>> +{
>> +        __u64 __user *ubuf;
> Nit. ubuf is a string instead of an array of __u64?

Will change it to void *.

>
>> +        int len;
>> +
>> +        ubuf = u64_to_user_ptr(attr->task_fd_query.buf);
>> +        if (buf) {
>> +                len = strlen(buf);
>> +                if (attr->task_fd_query.buf_len < len + 1)
> I think the current convention is to take the min,
> copy whatever it can to buf and return the real
> len/size in buf_len. F.e., the prog_ids and prog_cnt in
> __cgroup_bpf_query().
>
> Should the same be done here or it does not make sense to
> truncate the string? The user may/may not need the trailing
> char though if its pretty print has limited width anyway.
> The user still needs to know what the buf_len should be to
> retry also, but I guess any reasonable buf_len should
> work?

Makes sense. Will make buf_len input/output and copy the actually
needed length to buf_len and back to the user.

>
>> +                        return -ENOSPC;
>> +                if (copy_to_user(ubuf, buf, len + 1))
>> +                        return -EFAULT;
>> +        } else if (attr->task_fd_query.buf_len) {
>> +                /* copy '\0' to ubuf */
>> +                __u8 zero = 0;
>> +
>> +                if (copy_to_user(ubuf, &zero, 1))
>> +                        return -EFAULT;
>> +        }
>> +
>> +        if (copy_to_user(&uattr->task_fd_query.prog_id, &prog_id,
>> +                         sizeof(prog_id)) ||
>> +            copy_to_user(&uattr->task_fd_query.attach_info, &attach_info,
>> +                         sizeof(attach_info)) ||
>> +            copy_to_user(&uattr->task_fd_query.probe_offset, &probe_offset,
>> +                         sizeof(probe_offset)) ||
>> +            copy_to_user(&uattr->task_fd_query.probe_addr, &probe_addr,
>> +                         sizeof(probe_addr)))
> Nit. put_user() may be able to shorten them.

Indeed, thanks for the suggestion.

>
>> +                return -EFAULT;
>> +
>> +        return 0;
>> +}
>> +
>> +#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr
>> +
>> +static int bpf_task_fd_query(const union bpf_attr *attr,
>> +                             union bpf_attr __user *uattr)
>> +{
>> +        pid_t pid = attr->task_fd_query.pid;
>> +        int fd = attr->task_fd_query.fd;
>> +        const struct perf_event *event;
>> +        struct files_struct *files;
>> +        struct task_struct *task;
>> +        struct file *file;
>> +        int err;
>> +
>> +        if (CHECK_ATTR(BPF_TASK_FD_QUERY))
>> +                return -EINVAL;
>> +
>> +        if (!capable(CAP_SYS_ADMIN))
>> +                return -EPERM;
>> +
>> +        if (attr->task_fd_query.flags != 0)
> How is flags used?

flags is not used yet, but is there for future extension. For example,
it could be used to query a specific type of file (raw_tracepoint vs.
perf-event based, etc.).
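Going back to the buf_len convention discussed above, roughly what I
have in mind for the string copy in the next revision is something like
the helper below. This is only a sketch, not the final code: the helper
name task_fd_query_copy_str() is illustrative, and it also picks up the
put_user() suggestion for the reported length. buf_len acts as both
input (user buffer size) and output (length actually needed):

static int task_fd_query_copy_str(const union bpf_attr *attr,
                                  union bpf_attr __user *uattr,
                                  const char *buf)
{
        void __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf);
        u32 input_len = attr->task_fd_query.buf_len;
        u32 len = buf ? strlen(buf) : 0;

        /* always report the length the user actually needs (incl. '\0') */
        if (put_user(len + 1, &uattr->task_fd_query.buf_len))
                return -EFAULT;

        if (input_len) {
                u32 copy_len = min_t(u32, input_len, len + 1);

                /* copy what fits, keep the (possibly truncated) string
                 * NUL terminated
                 */
                if (len && copy_to_user(ubuf, buf, copy_len - 1))
                        return -EFAULT;
                if (put_user((char)0, (char __user *)ubuf + copy_len - 1))
                        return -EFAULT;
        }

        return input_len < len + 1 ? -ENOSPC : 0;
}

bpf_task_fd_query_copy() would call this before filling in the scalar
fields, and the caller can then decide whether a truncated string is
still useful. Will sort out the exact shape in the next revision.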
>
>> +                return -EINVAL;
>> +
>> +        task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
>> +        if (!task)
>> +                return -ENOENT;
>> +
>> +        files = get_files_struct(task);
>> +        put_task_struct(task);
>> +        if (!files)
>> +                return -ENOENT;
>> +
>> +        err = 0;
>> +        spin_lock(&files->file_lock);
>> +        file = fcheck_files(files, fd);
>> +        if (!file)
>> +                err = -EBADF;
>> +        else
>> +                get_file(file);
>> +        spin_unlock(&files->file_lock);
>> +        put_files_struct(files);
>> +
>> +        if (err)
>> +                goto out;
>> +
>> +        if (file->f_op == &bpf_raw_tp_fops) {
>> +                struct bpf_raw_tracepoint *raw_tp = file->private_data;
>> +                struct bpf_raw_event_map *btp = raw_tp->btp;
>> +
>> +                if (!raw_tp->prog)
>> +                        err = -ENOENT;
>> +                else
>> +                        err = bpf_task_fd_query_copy(attr, uattr,
>> +                                                     raw_tp->prog->aux->id,
>> +                                                     BPF_ATTACH_RAW_TRACEPOINT,
>> +                                                     btp->tp->name, 0, 0);
>> +                goto put_file;
>> +        }
>> +
>> +        event = perf_get_event(file);
>> +        if (!IS_ERR(event)) {
>> +                u64 probe_offset, probe_addr;
>> +                u32 prog_id, attach_info;
>> +                const char *buf;
>> +
>> +                err = bpf_get_perf_event_info(event, &prog_id, &attach_info,
>> +                                              &buf, &probe_offset,
>> +                                              &probe_addr);
>> +                if (!err)
>> +                        err = bpf_task_fd_query_copy(attr, uattr, prog_id,
>> +                                                     attach_info, buf,
>> +                                                     probe_offset,
>> +                                                     probe_addr);
>> +                goto put_file;
>> +        }
>> +
>> +        err = -ENOTSUPP;
>> +put_file:
>> +        fput(file);
>> +out:
>> +        return err;
>> +}
>> +
>>  SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
>>  {
>>          union bpf_attr attr = {};
>> @@ -2188,6 +2309,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
>>          case BPF_BTF_GET_FD_BY_ID:
>>                  err = bpf_btf_get_fd_by_id(&attr);
>>                  break;
>> +        case BPF_TASK_FD_QUERY:
>> +                err = bpf_task_fd_query(&attr, uattr);
>> +                break;
>>          default:
>>                  err = -EINVAL;
>>                  break;
>> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
>> index ce2cbbf..323c80e 100644
>> --- a/kernel/trace/bpf_trace.c
>> +++ b/kernel/trace/bpf_trace.c
>> @@ -14,6 +14,7 @@
>>  #include
>>  #include
>>  #include
>> +#include
>>  #include
>>
>>  #include "trace_probe.h"
>> @@ -1163,3 +1164,50 @@ int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
>>          mutex_unlock(&bpf_event_mutex);
>>          return err;
>>  }
>> +
>> +int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
>> +                            u32 *attach_info, const char **buf,
>> +                            u64 *probe_offset, u64 *probe_addr)
>> +{
>> +        bool is_tracepoint, is_syscall_tp;
>> +        struct bpf_prog *prog;
>> +        int flags, err = 0;
>> +
>> +        prog = event->prog;
>> +        if (!prog)
>> +                return -ENOENT;
>> +
>> +        /* not supporting BPF_PROG_TYPE_PERF_EVENT yet */
>> +        if (prog->type == BPF_PROG_TYPE_PERF_EVENT)
>> +                return -EOPNOTSUPP;
>> +
>> +        *prog_id = prog->aux->id;
>> +        flags = event->tp_event->flags;
>> +        is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT;
>> +        is_syscall_tp = is_syscall_trace_event(event->tp_event);
>> +
>> +        if (is_tracepoint || is_syscall_tp) {
>> +                *buf = is_tracepoint ? event->tp_event->tp->name
>> +                                     : event->tp_event->name;
>> +                *attach_info = BPF_ATTACH_TRACEPOINT;
>> +                *probe_offset = 0x0;
>> +                *probe_addr = 0x0;
>> +        } else {
>> +                /* kprobe/uprobe */
>> +                err = -EOPNOTSUPP;
>> +#ifdef CONFIG_KPROBE_EVENTS
>> +                if (flags & TRACE_EVENT_FL_KPROBE)
>> +                        err = bpf_get_kprobe_info(event, attach_info, buf,
>> +                                                  probe_offset, probe_addr,
>> +                                                  event->attr.type == PERF_TYPE_TRACEPOINT);
>> +#endif
>> +#ifdef CONFIG_UPROBE_EVENTS
>> +                if (flags & TRACE_EVENT_FL_UPROBE)
>> +                        err = bpf_get_uprobe_info(event, attach_info, buf,
>> +                                                  probe_offset,
>> +                                                  event->attr.type == PERF_TYPE_TRACEPOINT);
>> +#endif
>> +        }
>> +
>> +        return err;
>> +}
>> diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
>> index 02aed76..32e9190 100644
>> --- a/kernel/trace/trace_kprobe.c
>> +++ b/kernel/trace/trace_kprobe.c
>> @@ -1287,6 +1287,35 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
>>                        head, NULL);
>>  }
>>  NOKPROBE_SYMBOL(kretprobe_perf_func);
>> +
>> +int bpf_get_kprobe_info(const struct perf_event *event, u32 *attach_info,
>> +                        const char **symbol, u64 *probe_offset,
>> +                        u64 *probe_addr, bool perf_type_tracepoint)
>> +{
>> +        const char *pevent = trace_event_name(event->tp_event);
>> +        const char *group = event->tp_event->class->system;
>> +        struct trace_kprobe *tk;
>> +
>> +        if (perf_type_tracepoint)
>> +                tk = find_trace_kprobe(pevent, group);
>> +        else
>> +                tk = event->tp_event->data;
>> +        if (!tk)
>> +                return -EINVAL;
>> +
>> +        *attach_info = trace_kprobe_is_return(tk) ? BPF_ATTACH_KRETPROBE
>> +                                                  : BPF_ATTACH_KPROBE;
>> +        if (tk->symbol) {
>> +                *symbol = tk->symbol;
>> +                *probe_offset = tk->rp.kp.offset;
>> +                *probe_addr = 0;
>> +        } else {
>> +                *symbol = NULL;
>> +                *probe_offset = 0;
>> +                *probe_addr = (unsigned long)tk->rp.kp.addr;
>> +        }
>> +        return 0;
>> +}
>>  #endif /* CONFIG_PERF_EVENTS */
>>
>>  /*
>> diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
>> index ac89287..12a3667 100644
>> --- a/kernel/trace/trace_uprobe.c
>> +++ b/kernel/trace/trace_uprobe.c
>> @@ -1161,6 +1161,28 @@ static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
>>  {
>>          __uprobe_perf_func(tu, func, regs, ucb, dsize);
>>  }
>> +
>> +int bpf_get_uprobe_info(const struct perf_event *event, u32 *attach_info,
>> +                        const char **filename, u64 *probe_offset,
>> +                        bool perf_type_tracepoint)
>> +{
>> +        const char *pevent = trace_event_name(event->tp_event);
>> +        const char *group = event->tp_event->class->system;
>> +        struct trace_uprobe *tu;
>> +
>> +        if (perf_type_tracepoint)
>> +                tu = find_probe_event(pevent, group);
>> +        else
>> +                tu = event->tp_event->data;
>> +        if (!tu)
>> +                return -EINVAL;
>> +
>> +        *attach_info = is_ret_probe(tu) ? BPF_ATTACH_URETPROBE
>> +                                        : BPF_ATTACH_UPROBE;
>> +        *filename = tu->filename;
>> +        *probe_offset = tu->offset;
>> +        return 0;
>> +}
>>  #endif /* CONFIG_PERF_EVENTS */
>>
>>  static int
>> --
>> 2.9.5
>>