Re: [PATCH bpf-next 1/4] bpf: unprivileged BPF access via /dev/bpf

From: Song Liu <songliubraving@fb.com>
To: Daniel Borkmann <daniel@iogearbox.net>
Cc: Networking <netdev@vger.kernel.org>, bpf <bpf@vger.kernel.org>,
	"Alexei Starovoitov" <ast@kernel.org>,
	Kernel Team <Kernel-team@fb.com>,
	"jannh@google.com" <jannh@google.com>
Subject: Re: [PATCH bpf-next 1/4] bpf: unprivileged BPF access via /dev/bpf
Date: Wed, 26 Jun 2019 15:17:47 +0000	[thread overview]
Message-ID: <5A472047-F329-43C3-9DBC-9BCFC0A19F1C@fb.com> (raw)
In-Reply-To: <9bc166ca-1ef0-ee1e-6306-6850d4008174@iogearbox.net>

> On Jun 26, 2019, at 6:32 AM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> 
> On 06/25/2019 08:23 PM, Song Liu wrote:
>> This patch introduce unprivileged BPF access. The access control is
>> achieved via device /dev/bpf. Users with access to /dev/bpf are able
>> to access BPF syscall.
>> 
>> Two ioctl command are added to /dev/bpf:
>> 
>> The first two commands get/put permission to access sys_bpf. This
>> permission is noted by setting bit TASK_BPF_FLAG_PERMITTED of
>> current->bpf_flags. This permission cannot be inherited via fork().
>> 
>> Helper function bpf_capable() is added to check whether the task has got
>> permission via /dev/bpf.
>> 
>> Signed-off-by: Song Liu <songliubraving@fb.com>
> 
> [ Lets Cc Jann so he has a chance to review as he was the one who suggested
>  the idea. ]
> 
>> ---
>> Documentation/ioctl/ioctl-number.txt |  1 +
>> include/linux/bpf.h                  | 12 +++++
>> include/linux/sched.h                |  8 ++++
>> include/uapi/linux/bpf.h             |  5 ++
>> kernel/bpf/arraymap.c                |  2 +-
>> kernel/bpf/cgroup.c                  |  2 +-
>> kernel/bpf/core.c                    |  4 +-
>> kernel/bpf/cpumap.c                  |  2 +-
>> kernel/bpf/devmap.c                  |  2 +-
>> kernel/bpf/hashtab.c                 |  4 +-
>> kernel/bpf/lpm_trie.c                |  2 +-
>> kernel/bpf/offload.c                 |  2 +-
>> kernel/bpf/queue_stack_maps.c        |  2 +-
>> kernel/bpf/reuseport_array.c         |  2 +-
>> kernel/bpf/stackmap.c                |  2 +-
>> kernel/bpf/syscall.c                 | 72 +++++++++++++++++++++-------
>> kernel/bpf/verifier.c                |  2 +-
>> kernel/bpf/xskmap.c                  |  2 +-
>> kernel/fork.c                        |  4 ++
>> net/core/filter.c                    |  6 +--
>> 20 files changed, 104 insertions(+), 34 deletions(-)
>> 
>> diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
>> index c9558146ac58..19998b99d603 100644
>> --- a/Documentation/ioctl/ioctl-number.txt
>> +++ b/Documentation/ioctl/ioctl-number.txt
>> @@ -327,6 +327,7 @@ Code  Seq#(hex)	Include File		Comments
>> 0xB4	00-0F	linux/gpio.h		<mailto:linux-gpio@vger.kernel.org>
>> 0xB5	00-0F	uapi/linux/rpmsg.h	<mailto:linux-remoteproc@vger.kernel.org>
>> 0xB6	all	linux/fpga-dfl.h
>> +0xBP	01-02	uapi/linux/bpf.h	<mailto:bpf@vger.kernel.org>
>> 0xC0	00-0F	linux/usb/iowarrior.h
>> 0xCA	00-0F	uapi/misc/cxl.h
>> 0xCA	10-2F	uapi/misc/ocxl.h
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index a62e7889b0b6..dbba7870f6df 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -14,6 +14,10 @@
>> #include <linux/numa.h>
>> #include <linux/wait.h>
>> #include <linux/u64_stats_sync.h>
>> +#include <linux/sched.h>
>> +#include <linux/capability.h>
>> +
>> +#include <asm/current.h>
>> 
>> struct bpf_verifier_env;
>> struct perf_event;
>> @@ -742,6 +746,12 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
>> int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
>> 				     const union bpf_attr *kattr,
>> 				     union bpf_attr __user *uattr);
>> +
>> +static inline bool bpf_capable(int cap)
>> +{
>> +	return test_bit(TASK_BPF_FLAG_PERMITTED, &current->bpf_flags) ||
>> +		capable(cap);
>> +}
>> #else /* !CONFIG_BPF_SYSCALL */
>> static inline struct bpf_prog *bpf_prog_get(u32 ufd)
>> {
>> @@ -874,6 +884,8 @@ static inline int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
>> {
>> 	return -ENOTSUPP;
>> }
>> +
>> +#define bpf_capable(cap) capable((cap))
>> #endif /* CONFIG_BPF_SYSCALL */
>> 
>> static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index 11837410690f..ddd33d4476c5 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -1200,6 +1200,10 @@ struct task_struct {
>> 	unsigned long			prev_lowest_stack;
>> #endif
>> 
>> +#ifdef CONFIG_BPF_SYSCALL
>> +	unsigned long			bpf_flags;
>> +#endif
> 
> There are plenty of bits available here:
> 
>        /* --- cacheline 14 boundary (896 bytes) --- */
>        unsigned int               in_execve:1;          /*   896:31  4 */
>        unsigned int               in_iowait:1;          /*   896:30  4 */
>        unsigned int               restore_sigmask:1;    /*   896:29  4 */
>        unsigned int               in_user_fault:1;      /*   896:28  4 */
>        unsigned int               no_cgroup_migration:1; /*   896:27  4 */
>        unsigned int               frozen:1;             /*   896:26  4 */
>        unsigned int               use_memdelay:1;       /*   896:25  4 */
> 
>        /* XXX 25 bits hole, try to pack */
>        /* XXX 4 bytes hole, try to pack */
> 
> Given that bpf is pretty much enabled by default everywhere, I don't think we
> should waste so much space in task_struct just for this flag (pretty sure that
> task_struct is the equivalent of sk_buff that rather needs a diet). Other options
> could be to add to atomic_flags which also still has space.

Good point. Let me find a free bit for it. 

> 
>> 	/*
>> 	 * New fields for task_struct should be added above here, so that
>> 	 * they are included in the randomized portion of task_struct.
>> @@ -1772,6 +1776,10 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
>> 
>> #endif /* CONFIG_SMP */
[...]
>> +
>> +static long bpf_dev_ioctl(struct file *filp,
>> +			  unsigned int ioctl, unsigned long arg)
>> +{
>> +	switch (ioctl) {
>> +	case BPF_DEV_IOCTL_GET_PERM:
>> +		set_bit(TASK_BPF_FLAG_PERMITTED, &current->bpf_flags);
>> +		break;
>> +	case BPF_DEV_IOCTL_PUT_PERM:
>> +		clear_bit(TASK_BPF_FLAG_PERMITTED, &current->bpf_flags);
> 
> I think the get/put for uapi is a bit misleading, first thought at least for
> me is on get/put_user() when I read the name.

I am not good at naming things. What would be better names here? 

> 
>> +		break;
>> +	default:
>> +		return -EINVAL;
>> +	}
>> +	return 0;
>> +}
>> +
>> +static const struct file_operations bpf_chardev_ops = {
>> +	.unlocked_ioctl = bpf_dev_ioctl,
>> +};
>> +
>> +static struct miscdevice bpf_dev = {
>> +	.minor		= MISC_DYNAMIC_MINOR,
>> +	.name		= "bpf",
>> +	.fops		= &bpf_chardev_ops,
>> +	.mode		= 0440,
>> +	.nodename	= "bpf",
> 
> Here's what kvm does:
> 
> static struct miscdevice kvm_dev = {
>        KVM_MINOR,
>        "kvm",
>        &kvm_chardev_ops,
> };
> 
> Is there an actual reason that mode is not 0 by default in bpf case? Why
> we need to define nodename?

Based on my understanding, mode of 0440 is what we want. If we leave it 
as 0, it will use default value of 0600. I guess we can just set it to 
0440, as user space can change it later anyway. 

I guess we really don't need nodename. I will remove it. 

Thanks,
Song