From: Yonghong Song
To: Martin KaFai Lau
Subject: Re: [PATCH bpf-next v3 2/7] bpf: introduce bpf subcommand BPF_TASK_FD_QUERY
Date: Wed, 23 May 2018 14:25:34 -0700
Message-ID: <0937648b-89b5-729c-0089-f0df2ed5e5b4@fb.com>
In-Reply-To: <20180523171320.ziswsvpsyelxb6fz@kafai-mbp>
References: <20180522163048.3128924-1-yhs@fb.com> <20180522163048.3128924-3-yhs@fb.com> <20180523171320.ziswsvpsyelxb6fz@kafai-mbp>

On 5/23/18 10:13 AM, Martin KaFai Lau wrote:
> On Tue, May 22, 2018 at 09:30:46AM -0700, Yonghong Song wrote:
>> Currently, suppose a userspace application has loaded a bpf program
>> and attached it to a tracepoint/kprobe/uprobe, and a bpf
>> introspection tool, e.g., bpftool, wants to show which bpf program
>> is attached to which tracepoint/kprobe/uprobe. Such attachment
>> information will be really useful to understand the overall bpf
>> deployment in the system.
>>
>> There is a name field (16 bytes) for each program, which could
>> be used to encode the attachment point. There are some drawbacks
>> to this approach. First, a bpftool user (e.g., an admin) may not
>> really understand the association between the name and the
>> attachment point. Second, if one program is attached to multiple
>> places, encoding a proper name which can imply all these
>> attachments becomes difficult.
>>
>> This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY.
>> Given a pid and fd, if the <pid, fd> is associated with a
>> tracepoint/kprobe/uprobe perf event, BPF_TASK_FD_QUERY will return
>>    . prog_id
>>    . tracepoint name, or
>>    . k[ret]probe funcname + offset or kernel addr, or
>>    . u[ret]probe filename + offset
>> to the userspace.
>> The user can use "bpftool prog" to find more information about
>> the bpf program itself with prog_id.
> LGTM, some comments inline.
>
>>
>> Signed-off-by: Yonghong Song
>> ---
>>  include/linux/trace_events.h |  16 ++++++
>>  include/uapi/linux/bpf.h     |  27 ++++++++++
>>  kernel/bpf/syscall.c         | 124 +++++++++++++++++++++++++++++++++++++++++++
>>  kernel/trace/bpf_trace.c     |  48 +++++++++++++++++
>>  kernel/trace/trace_kprobe.c  |  29 ++++++++++
>>  kernel/trace/trace_uprobe.c  |  22 ++++++++
>>  6 files changed, 266 insertions(+)
>>
>> diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
>> index 2bde3ef..eab806d 100644
>> --- a/include/linux/trace_events.h
>> +++ b/include/linux/trace_events.h
>> @@ -473,6 +473,9 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info);
>>  int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
>>  int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
>>  struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name);
>> +int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
>> +                            u32 *attach_info, const char **buf,
>> +                            u64 *probe_offset, u64 *probe_addr);
> The first arg is 'const struct perf_event *event' while...
>
>>  #else
>>  static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
>>  {
>> @@ -504,6 +507,12 @@ static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name
>>  {
>>          return NULL;
>>  }
>> +static inline int bpf_get_perf_event_info(const struct file *file, u32 *prog_id,
> this one has 'const struct file *file'?

Thanks for catching this. Will correct this in the next revision.

>
>> +                                          u32 *attach_info, const char **buf,
>> +                                          u64 *probe_offset, u64 *probe_addr)
>> +{
>> +        return -EOPNOTSUPP;
>> +}
>>  #endif
>>
>>  enum {
>> @@ -560,10 +569,17 @@ extern void perf_trace_del(struct perf_event *event, int flags);
>>  #ifdef CONFIG_KPROBE_EVENTS
>>  extern int perf_kprobe_init(struct perf_event *event, bool is_retprobe);
>>  extern void perf_kprobe_destroy(struct perf_event *event);
>> +extern int bpf_get_kprobe_info(const struct perf_event *event,
>> +                               u32 *attach_info, const char **symbol,
>> +                               u64 *probe_offset, u64 *probe_addr,
>> +                               bool perf_type_tracepoint);
>>  #endif
>>  #ifdef CONFIG_UPROBE_EVENTS
>>  extern int perf_uprobe_init(struct perf_event *event, bool is_retprobe);
>>  extern void perf_uprobe_destroy(struct perf_event *event);
>> +extern int bpf_get_uprobe_info(const struct perf_event *event,
>> +                               u32 *attach_info, const char **filename,
>> +                               u64 *probe_offset, bool perf_type_tracepoint);
>>  #endif
>>  extern int ftrace_profile_set_filter(struct perf_event *event, int event_id,
>>                                       char *filter_str);
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 97446bb..a602150 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -97,6 +97,7 @@ enum bpf_cmd {
>>          BPF_RAW_TRACEPOINT_OPEN,
>>          BPF_BTF_LOAD,
>>          BPF_BTF_GET_FD_BY_ID,
>> +        BPF_TASK_FD_QUERY,
>>  };
>>
>>  enum bpf_map_type {
>> @@ -379,6 +380,22 @@ union bpf_attr {
>>          __u32           btf_log_size;
>>          __u32           btf_log_level;
>>  };
>> +
>> +        struct {
>> +                int             pid;            /* input: pid */
>> +                int             fd;             /* input: fd */
> Should fd and pid be always positive?
> The current fd (like map_fd) in bpf_attr is using __u32.

Will change both pid and fd to __u32.

In the kernel, fd is unsigned, but pid_t is actually an int. A negative
pid usually refers to the process group the pid is in. Since we are only
concerned with an actual process pid here, __u32 should be okay.

>
>> +                __u32           flags;          /* input: flags */
>> +                __u32           buf_len;        /* input: buf len */
>> +                __aligned_u64   buf;            /* input/output:
>> +                                                 *   tp_name for tracepoint
>> +                                                 *   symbol for kprobe
>> +                                                 *   filename for uprobe
>> +                                                 */
>> +                __u32           prog_id;        /* output: prod_id */
>> +                __u32           attach_info;    /* output: BPF_ATTACH_* */
>> +                __u64           probe_offset;   /* output: probe_offset */
>> +                __u64           probe_addr;     /* output: probe_addr */
>> +        } task_fd_query;
>>  } __attribute__((aligned(8)));
>>
>>  /* The description below is an attempt at providing documentation to eBPF
>> @@ -2458,4 +2475,14 @@ struct bpf_fib_lookup {
>>          __u8    dmac[6];        /* ETH_ALEN */
>>  };
>>
>> +/* used by <pid, fd> based query */
>> +enum {
> Nit. Instead of a comment, is it better to give this
> enum a descriptive name?

Yes, will add an enum name bpf_task_fd_info to make it easier to
correlate with task_fd_query.
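For reference, here is a minimal userspace sketch against the v3 uapi
above, roughly what an introspection tool would do (untested; it assumes
headers that already carry the new BPF_TASK_FD_QUERY command and the
task_fd_query fields, and the wrapper name query_task_fd() is only
illustrative -- names may still change in the next revision):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* query which bpf program, if any, sits behind <pid, fd> */
static int query_task_fd(int pid, int fd, char *buf, __u32 buf_len)
{
        union bpf_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.task_fd_query.pid = pid;
        attr.task_fd_query.fd = fd;
        attr.task_fd_query.buf = (__u64)(unsigned long)buf;
        attr.task_fd_query.buf_len = buf_len;

        if (syscall(__NR_bpf, BPF_TASK_FD_QUERY, &attr, sizeof(attr)) < 0)
                return -errno;

        printf("prog_id %u attach_info %u offset 0x%llx addr 0x%llx buf %s\n",
               attr.task_fd_query.prog_id, attr.task_fd_query.attach_info,
               (unsigned long long)attr.task_fd_query.probe_offset,
               (unsigned long long)attr.task_fd_query.probe_addr, buf);
        return 0;
}

The prog_id printed here is what "bpftool prog" would then be used with
to get the rest of the program information.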
>
>> +        BPF_ATTACH_RAW_TRACEPOINT,      /* tp name */
>> +        BPF_ATTACH_TRACEPOINT,          /* tp name */
>> +        BPF_ATTACH_KPROBE,              /* (symbol + offset) or addr */
>> +        BPF_ATTACH_KRETPROBE,           /* (symbol + offset) or addr */
>> +        BPF_ATTACH_UPROBE,              /* filename + offset */
>> +        BPF_ATTACH_URETPROBE,           /* filename + offset */
>> +};
>> +
>>  #endif /* _UAPI__LINUX_BPF_H__ */
>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index bfcde94..9356f0e 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -18,7 +18,9 @@
>>  #include
>>  #include
>>  #include
>> +#include
>>  #include
>> +#include
>>  #include
>>  #include
>>  #include
>> @@ -2102,6 +2104,125 @@ static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
>>          return btf_get_fd_by_id(attr->btf_id);
>>  }
>>
>> +static int bpf_task_fd_query_copy(const union bpf_attr *attr,
>> +                                  union bpf_attr __user *uattr,
>> +                                  u32 prog_id, u32 attach_info,
>> +                                  const char *buf, u64 probe_offset,
>> +                                  u64 probe_addr)
>> +{
>> +        __u64 __user *ubuf;
> Nit. ubuf is a string instead of an array of __u64?

Will change it to void *.

>
>> +        int len;
>> +
>> +        ubuf = u64_to_user_ptr(attr->task_fd_query.buf);
>> +        if (buf) {
>> +                len = strlen(buf);
>> +                if (attr->task_fd_query.buf_len < len + 1)
> I think the current convention is to take the min,
> copy whatever it can to buf and return the real
> len/size in buf_len. F.e., the prog_ids and prog_cnt in
> __cgroup_bpf_query().
>
> Should the same be done here or it does not make sense to
> truncate the string? The user may/may not need the trailing
> char though if its pretty print has limited width anyway.
> The user still needs to know what the buf_len should be to
> retry also, but I guess any reasonable buf_len should
> work?

Makes sense. Will make buf_len input/output and copy the actually
needed length to buf_len and back to the user.

>
>> +                        return -ENOSPC;
>> +                if (copy_to_user(ubuf, buf, len + 1))
>> +                        return -EFAULT;
>> +        } else if (attr->task_fd_query.buf_len) {
>> +                /* copy '\0' to ubuf */
>> +                __u8 zero = 0;
>> +
>> +                if (copy_to_user(ubuf, &zero, 1))
>> +                        return -EFAULT;
>> +        }
>> +
>> +        if (copy_to_user(&uattr->task_fd_query.prog_id, &prog_id,
>> +                         sizeof(prog_id)) ||
>> +            copy_to_user(&uattr->task_fd_query.attach_info, &attach_info,
>> +                         sizeof(attach_info)) ||
>> +            copy_to_user(&uattr->task_fd_query.probe_offset, &probe_offset,
>> +                         sizeof(probe_offset)) ||
>> +            copy_to_user(&uattr->task_fd_query.probe_addr, &probe_addr,
>> +                         sizeof(probe_addr)))
> Nit. put_user() may be able to shorten them.

Indeed, thanks for the suggestion.

>
>> +                return -EFAULT;
>> +
>> +        return 0;
>> +}
>> +
>> +#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr
>> +
>> +static int bpf_task_fd_query(const union bpf_attr *attr,
>> +                             union bpf_attr __user *uattr)
>> +{
>> +        pid_t pid = attr->task_fd_query.pid;
>> +        int fd = attr->task_fd_query.fd;
>> +        const struct perf_event *event;
>> +        struct files_struct *files;
>> +        struct task_struct *task;
>> +        struct file *file;
>> +        int err;
>> +
>> +        if (CHECK_ATTR(BPF_TASK_FD_QUERY))
>> +                return -EINVAL;
>> +
>> +        if (!capable(CAP_SYS_ADMIN))
>> +                return -EPERM;
>> +
>> +        if (attr->task_fd_query.flags != 0)
> How is flags used?

flags is not used yet, but is there for future extension. For example,
it could be used to query a specific type of file (raw_tracepoint vs.
perf-event based, etc.).
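Going back to the buf_len convention discussed above, roughly what I
have in mind for the string copy in the next revision is something like
the helper below. This is only a sketch, not the final code: the helper
name task_fd_query_copy_str() is illustrative, and it also picks up the
put_user() suggestion for the reported length. buf_len acts as both
input (user buffer size) and output (length actually needed):

static int task_fd_query_copy_str(const union bpf_attr *attr,
                                  union bpf_attr __user *uattr,
                                  const char *buf)
{
        void __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf);
        u32 input_len = attr->task_fd_query.buf_len;
        u32 len = buf ? strlen(buf) : 0;

        /* always report the length the user actually needs (incl. '\0') */
        if (put_user(len + 1, &uattr->task_fd_query.buf_len))
                return -EFAULT;

        if (input_len) {
                u32 copy_len = min_t(u32, input_len, len + 1);

                /* copy what fits, keep the (possibly truncated) string
                 * NUL terminated
                 */
                if (len && copy_to_user(ubuf, buf, copy_len - 1))
                        return -EFAULT;
                if (put_user((char)0, (char __user *)ubuf + copy_len - 1))
                        return -EFAULT;
        }

        return input_len < len + 1 ? -ENOSPC : 0;
}

bpf_task_fd_query_copy() would call this before filling in the scalar
fields, and the caller can then decide whether a truncated string is
still useful. Will sort out the exact shape in the next revision.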
>
>> +                return -EINVAL;
>> +
>> +        task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
>> +        if (!task)
>> +                return -ENOENT;
>> +
>> +        files = get_files_struct(task);
>> +        put_task_struct(task);
>> +        if (!files)
>> +                return -ENOENT;
>> +
>> +        err = 0;
>> +        spin_lock(&files->file_lock);
>> +        file = fcheck_files(files, fd);
>> +        if (!file)
>> +                err = -EBADF;
>> +        else
>> +                get_file(file);
>> +        spin_unlock(&files->file_lock);
>> +        put_files_struct(files);
>> +
>> +        if (err)
>> +                goto out;
>> +
>> +        if (file->f_op == &bpf_raw_tp_fops) {
>> +                struct bpf_raw_tracepoint *raw_tp = file->private_data;
>> +                struct bpf_raw_event_map *btp = raw_tp->btp;
>> +
>> +                if (!raw_tp->prog)
>> +                        err = -ENOENT;
>> +                else
>> +                        err = bpf_task_fd_query_copy(attr, uattr,
>> +                                                     raw_tp->prog->aux->id,
>> +                                                     BPF_ATTACH_RAW_TRACEPOINT,
>> +                                                     btp->tp->name, 0, 0);
>> +                goto put_file;
>> +        }
>> +
>> +        event = perf_get_event(file);
>> +        if (!IS_ERR(event)) {
>> +                u64 probe_offset, probe_addr;
>> +                u32 prog_id, attach_info;
>> +                const char *buf;
>> +
>> +                err = bpf_get_perf_event_info(event, &prog_id, &attach_info,
>> +                                              &buf, &probe_offset,
>> +                                              &probe_addr);
>> +                if (!err)
>> +                        err = bpf_task_fd_query_copy(attr, uattr, prog_id,
>> +                                                     attach_info, buf,
>> +                                                     probe_offset,
>> +                                                     probe_addr);
>> +                goto put_file;
>> +        }
>> +
>> +        err = -ENOTSUPP;
>> +put_file:
>> +        fput(file);
>> +out:
>> +        return err;
>> +}
>> +
>>  SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
>>  {
>>          union bpf_attr attr = {};
>> @@ -2188,6 +2309,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
>>          case BPF_BTF_GET_FD_BY_ID:
>>                  err = bpf_btf_get_fd_by_id(&attr);
>>                  break;
>> +        case BPF_TASK_FD_QUERY:
>> +                err = bpf_task_fd_query(&attr, uattr);
>> +                break;
>>          default:
>>                  err = -EINVAL;
>>                  break;
>> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
>> index ce2cbbf..323c80e 100644
>> --- a/kernel/trace/bpf_trace.c
>> +++ b/kernel/trace/bpf_trace.c
>> @@ -14,6 +14,7 @@
>>  #include
>>  #include
>>  #include
>> +#include
>>  #include
>>
>>  #include "trace_probe.h"
>> @@ -1163,3 +1164,50 @@ int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
>>          mutex_unlock(&bpf_event_mutex);
>>          return err;
>>  }
>> +
>> +int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
>> +                            u32 *attach_info, const char **buf,
>> +                            u64 *probe_offset, u64 *probe_addr)
>> +{
>> +        bool is_tracepoint, is_syscall_tp;
>> +        struct bpf_prog *prog;
>> +        int flags, err = 0;
>> +
>> +        prog = event->prog;
>> +        if (!prog)
>> +                return -ENOENT;
>> +
>> +        /* not supporting BPF_PROG_TYPE_PERF_EVENT yet */
>> +        if (prog->type == BPF_PROG_TYPE_PERF_EVENT)
>> +                return -EOPNOTSUPP;
>> +
>> +        *prog_id = prog->aux->id;
>> +        flags = event->tp_event->flags;
>> +        is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT;
>> +        is_syscall_tp = is_syscall_trace_event(event->tp_event);
>> +
>> +        if (is_tracepoint || is_syscall_tp) {
>> +                *buf = is_tracepoint ? event->tp_event->tp->name
>> +                                     : event->tp_event->name;
>> +                *attach_info = BPF_ATTACH_TRACEPOINT;
>> +                *probe_offset = 0x0;
>> +                *probe_addr = 0x0;
>> +        } else {
>> +                /* kprobe/uprobe */
>> +                err = -EOPNOTSUPP;
>> +#ifdef CONFIG_KPROBE_EVENTS
>> +                if (flags & TRACE_EVENT_FL_KPROBE)
>> +                        err = bpf_get_kprobe_info(event, attach_info, buf,
>> +                                                  probe_offset, probe_addr,
>> +                                                  event->attr.type == PERF_TYPE_TRACEPOINT);
>> +#endif
>> +#ifdef CONFIG_UPROBE_EVENTS
>> +                if (flags & TRACE_EVENT_FL_UPROBE)
>> +                        err = bpf_get_uprobe_info(event, attach_info, buf,
>> +                                                  probe_offset,
>> +                                                  event->attr.type == PERF_TYPE_TRACEPOINT);
>> +#endif
>> +        }
>> +
>> +        return err;
>> +}
>> diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
>> index 02aed76..32e9190 100644
>> --- a/kernel/trace/trace_kprobe.c
>> +++ b/kernel/trace/trace_kprobe.c
>> @@ -1287,6 +1287,35 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
>>                        head, NULL);
>>  }
>>  NOKPROBE_SYMBOL(kretprobe_perf_func);
>> +
>> +int bpf_get_kprobe_info(const struct perf_event *event, u32 *attach_info,
>> +                        const char **symbol, u64 *probe_offset,
>> +                        u64 *probe_addr, bool perf_type_tracepoint)
>> +{
>> +        const char *pevent = trace_event_name(event->tp_event);
>> +        const char *group = event->tp_event->class->system;
>> +        struct trace_kprobe *tk;
>> +
>> +        if (perf_type_tracepoint)
>> +                tk = find_trace_kprobe(pevent, group);
>> +        else
>> +                tk = event->tp_event->data;
>> +        if (!tk)
>> +                return -EINVAL;
>> +
>> +        *attach_info = trace_kprobe_is_return(tk) ? BPF_ATTACH_KRETPROBE
>> +                                                  : BPF_ATTACH_KPROBE;
>> +        if (tk->symbol) {
>> +                *symbol = tk->symbol;
>> +                *probe_offset = tk->rp.kp.offset;
>> +                *probe_addr = 0;
>> +        } else {
>> +                *symbol = NULL;
>> +                *probe_offset = 0;
>> +                *probe_addr = (unsigned long)tk->rp.kp.addr;
>> +        }
>> +        return 0;
>> +}
>>  #endif /* CONFIG_PERF_EVENTS */
>>
>>  /*
>> diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
>> index ac89287..12a3667 100644
>> --- a/kernel/trace/trace_uprobe.c
>> +++ b/kernel/trace/trace_uprobe.c
>> @@ -1161,6 +1161,28 @@ static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
>>  {
>>          __uprobe_perf_func(tu, func, regs, ucb, dsize);
>>  }
>> +
>> +int bpf_get_uprobe_info(const struct perf_event *event, u32 *attach_info,
>> +                        const char **filename, u64 *probe_offset,
>> +                        bool perf_type_tracepoint)
>> +{
>> +        const char *pevent = trace_event_name(event->tp_event);
>> +        const char *group = event->tp_event->class->system;
>> +        struct trace_uprobe *tu;
>> +
>> +        if (perf_type_tracepoint)
>> +                tu = find_probe_event(pevent, group);
>> +        else
>> +                tu = event->tp_event->data;
>> +        if (!tu)
>> +                return -EINVAL;
>> +
>> +        *attach_info = is_ret_probe(tu) ? BPF_ATTACH_URETPROBE
>> +                                        : BPF_ATTACH_UPROBE;
>> +        *filename = tu->filename;
>> +        *probe_offset = tu->offset;
>> +        return 0;
>> +}
>>  #endif /* CONFIG_PERF_EVENTS */
>>
>>  static int
>> --
>> 2.9.5
>>