Re: [PATCH v2 bpf-next 1/3] bpf: Consolidate inner-map-compatible properties into bpf_types.h

From: Daniel Borkmann <daniel@iogearbox.net>
To: Martin KaFai Lau <kafai@fb.com>, bpf@vger.kernel.org
Cc: Alexei Starovoitov <ast@kernel.org>,
	kernel-team@fb.com, netdev@vger.kernel.org,
	Andrey Ignatov <rdna@fb.com>
Subject: Re: [PATCH v2 bpf-next 1/3] bpf: Consolidate inner-map-compatible properties into bpf_types.h
Date: Sat, 23 May 2020 00:22:48 +0200	[thread overview]
Message-ID: <9c00ced2-983f-ad59-d805-777ebd1f1cab@iogearbox.net> (raw)
In-Reply-To: <20200522022342.899756-1-kafai@fb.com>

On 5/22/20 4:23 AM, Martin KaFai Lau wrote:
[...]
>   };
>   
> +/* Cannot be used as an inner map */
> +#define BPF_MAP_NO_INNER_MAP (1 << 0)
> +
>   struct bpf_map {
>   	/* The first two cachelines with read-mostly members of which some
>   	 * are also accessed in fast-path (e.g. ops, max_entries).
> @@ -120,6 +123,7 @@ struct bpf_map {
>   	struct bpf_map_memory memory;
>   	char name[BPF_OBJ_NAME_LEN];
>   	u32 btf_vmlinux_value_type_id;
> +	u32 properties;
>   	bool bypass_spec_v1;
>   	bool frozen; /* write-once; write-protected by freeze_mutex */
>   	/* 22 bytes hole */
> @@ -1037,12 +1041,12 @@ extern const struct file_operations bpf_iter_fops;
>   #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
>   	extern const struct bpf_prog_ops _name ## _prog_ops; \
>   	extern const struct bpf_verifier_ops _name ## _verifier_ops;
> -#define BPF_MAP_TYPE(_id, _ops) \
> +#define BPF_MAP_TYPE_FL(_id, _ops, properties) \
>   	extern const struct bpf_map_ops _ops;
>   #define BPF_LINK_TYPE(_id, _name)
>   #include <linux/bpf_types.h>
>   #undef BPF_PROG_TYPE
> -#undef BPF_MAP_TYPE
> +#undef BPF_MAP_TYPE_FL
>   #undef BPF_LINK_TYPE
>   
>   extern const struct bpf_prog_ops bpf_offload_prog_ops;
> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> index 29d22752fc87..3f32702c9bf4 100644
> --- a/include/linux/bpf_types.h
> +++ b/include/linux/bpf_types.h
> @@ -76,16 +76,25 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LSM, lsm,
>   #endif /* CONFIG_BPF_LSM */
>   #endif
>   
> +#define BPF_MAP_TYPE(x, y) BPF_MAP_TYPE_FL(x, y, 0)
> +
>   BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
>   BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
> -BPF_MAP_TYPE(BPF_MAP_TYPE_PROG_ARRAY, prog_array_map_ops)
> +/* prog_array->aux->{type,jited} is a runtime binding.
> + * Doing static check alone in the verifier is not enough,
> + * so BPF_MAP_NO_INNTER_MAP is needed.

typo: INNTER

> + */
> +BPF_MAP_TYPE_FL(BPF_MAP_TYPE_PROG_ARRAY, prog_array_map_ops,
> +		BPF_MAP_NO_INNER_MAP)

Probably nit, but what is "FL"? flags? We do have map_flags already, but here the
BPF_MAP_NO_INNER_MAP ends up in 'properties' instead. To avoid confusion, it would
probably be better to name it 'map_flags_fixed' since this is what it really means;
fixed flags that cannot be changed/controlled when creating a map.

>   BPF_MAP_TYPE(BPF_MAP_TYPE_PERF_EVENT_ARRAY, perf_event_array_map_ops)
>   #ifdef CONFIG_CGROUPS
>   BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops)
>   #endif
>   #ifdef CONFIG_CGROUP_BPF
> -BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops)
> -BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, cgroup_storage_map_ops)
> +BPF_MAP_TYPE_FL(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops,
> +		BPF_MAP_NO_INNER_MAP)
> +BPF_MAP_TYPE_FL(BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, cgroup_storage_map_ops,
> +		BPF_MAP_NO_INNER_MAP)
>   #endif
>   BPF_MAP_TYPE(BPF_MAP_TYPE_HASH, htab_map_ops)
>   BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_HASH, htab_percpu_map_ops)
> @@ -116,8 +125,10 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, reuseport_array_ops)
>   BPF_MAP_TYPE(BPF_MAP_TYPE_QUEUE, queue_map_ops)
>   BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops)
>   #if defined(CONFIG_BPF_JIT)
> -BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops)
> +BPF_MAP_TYPE_FL(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops,
> +		BPF_MAP_NO_INNER_MAP)
>   #endif
> +#undef BPF_MAP_TYPE
>   
>   BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint)
>   BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing)
[...]
> diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
> index 17738c93bec8..d965a1d328a9 100644
> --- a/kernel/bpf/map_in_map.c
> +++ b/kernel/bpf/map_in_map.c
> @@ -17,13 +17,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
>   	if (IS_ERR(inner_map))
>   		return inner_map;
>   
> -	/* prog_array->aux->{type,jited} is a runtime binding.
> -	 * Doing static check alone in the verifier is not enough.
> -	 */
> -	if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
> -	    inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE ||
> -	    inner_map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE ||
> -	    inner_map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
> +	if (inner_map->properties & BPF_MAP_NO_INNER_MAP) {
>   		fdput(f);
>   		return ERR_PTR(-ENOTSUPP);
>   	}

This whole check here is currently very fragile. For example, given we forbid cgroup
local storage here, why do we not forbid socket local storage? What about other maps
like stackmap? It's quite unclear if it even works as expected and if there's also a
use-case we are aware of. Why not making this an explicit opt-in?

Like explicit annotating via struct bpf_map_ops where everything is visible in one
single place where the map is defined:

const struct bpf_map_ops array_map_ops = {
         .map_alloc_check = array_map_alloc_check,
         [...]
         .map_flags_fixed = BPF_MAP_IN_MAP_OK,
};

That way, if someone forgets to add .map_flags_fixed to a new map type, it's okay since
it's _safe_ to forget to add these flags (and okay to add in future uapi-wise) as opposed
to the other way round where one can easily miss the opt-out case and potentially crash
the machine worst case.

Thanks,
Daniel