Re: [PATCH bpf-next 05/13] bpf: Introduce BPF_PROG_TYPE_STRUCT_OPS

From: Yonghong Song <yhs@fb.com>
To: Martin Lau <kafai@fb.com>, "bpf@vger.kernel.org" <bpf@vger.kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>,
	Daniel Borkmann <daniel@iogearbox.net>,
	David Miller <davem@davemloft.net>,
	Kernel Team <Kernel-team@fb.com>,
	"netdev@vger.kernel.org" <netdev@vger.kernel.org>
Subject: Re: [PATCH bpf-next 05/13] bpf: Introduce BPF_PROG_TYPE_STRUCT_OPS
Date: Tue, 17 Dec 2019 06:14:09 +0000	[thread overview]
Message-ID: <5864844f-db69-d025-1eb3-f856257415be@fb.com> (raw)
In-Reply-To: <20191214004748.1652668-1-kafai@fb.com>

On 12/13/19 4:47 PM, Martin KaFai Lau wrote:
> This patch allows the kernel's struct ops (i.e. func ptr) to be
> implemented in BPF.  The first use case in this series is the
> "struct tcp_congestion_ops" which will be introduced in a
> latter patch.
> 
> This patch introduces a new prog type BPF_PROG_TYPE_STRUCT_OPS.
> The BPF_PROG_TYPE_STRUCT_OPS prog is verified against a particular
> func ptr of a kernel struct.  The attr->attach_btf_id is the btf id
> of a kernel struct.  The attr->expected_attach_type is the member
> "index" of that kernel struct.  The first member of a struct starts
> with member index 0.  That will avoid ambiguity when a kernel struct
> has multiple func ptrs with the same func signature.
> 
> For example, a BPF_PROG_TYPE_STRUCT_OPS prog is written
> to implement the "init" func ptr of the "struct tcp_congestion_ops".
> The attr->attach_btf_id is the btf id of the "struct tcp_congestion_ops"
> of the _running_ kernel.  The attr->expected_attach_type is 3.
> 
> The ctx of BPF_PROG_TYPE_STRUCT_OPS is an array of u64 args saved
> by arch_prepare_bpf_trampoline that will be done in the next
> patch when introducing BPF_MAP_TYPE_STRUCT_OPS.
> 
> "struct bpf_struct_ops" is introduced as a common interface for the kernel
> struct that supports BPF_PROG_TYPE_STRUCT_OPS prog.  The supporting kernel
> struct will need to implement an instance of the "struct bpf_struct_ops".
> 
> The supporting kernel struct also needs to implement a bpf_verifier_ops.
> During BPF_PROG_LOAD, bpf_struct_ops_find() will find the right
> bpf_verifier_ops by searching the attr->attach_btf_id.
> 
> A new "btf_struct_access" is also added to the bpf_verifier_ops such
> that the supporting kernel struct can optionally provide its own specific
> check on accessing the func arg (e.g. provide limited write access).
> 
> After btf_vmlinux is parsed, the new bpf_struct_ops_init() is called
> to initialize some values (e.g. the btf id of the supporting kernel
> struct) and it can only be done once the btf_vmlinux is available.
> 
> The R0 checks at BPF_EXIT is excluded for the BPF_PROG_TYPE_STRUCT_OPS prog
> if the return type of the prog->aux->attach_func_proto is "void".
> 
> Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> ---
>   include/linux/bpf.h               |  30 +++++++
>   include/linux/bpf_types.h         |   4 +
>   include/linux/btf.h               |  34 ++++++++
>   include/uapi/linux/bpf.h          |   1 +
>   kernel/bpf/Makefile               |   2 +-
>   kernel/bpf/bpf_struct_ops.c       | 124 +++++++++++++++++++++++++++
>   kernel/bpf/bpf_struct_ops_types.h |   4 +
>   kernel/bpf/btf.c                  |  88 ++++++++++++++------
>   kernel/bpf/syscall.c              |  17 ++--
>   kernel/bpf/verifier.c             | 134 +++++++++++++++++++++++-------
>   10 files changed, 374 insertions(+), 64 deletions(-)
>   create mode 100644 kernel/bpf/bpf_struct_ops.c
>   create mode 100644 kernel/bpf/bpf_struct_ops_types.h
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index d467983e61bb..1f0a5fc8c5ee 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -349,6 +349,10 @@ struct bpf_verifier_ops {
>   				  const struct bpf_insn *src,
>   				  struct bpf_insn *dst,
>   				  struct bpf_prog *prog, u32 *target_size);
> +	int (*btf_struct_access)(struct bpf_verifier_log *log,
> +				 const struct btf_type *t, int off, int size,
> +				 enum bpf_access_type atype,
> +				 u32 *next_btf_id);
>   };
>   
>   struct bpf_prog_offload_ops {
> @@ -667,6 +671,32 @@ struct bpf_array_aux {
>   	struct work_struct work;
>   };
>   
> +struct btf_type;
> +struct btf_member;
> +
> +#define BPF_STRUCT_OPS_MAX_NR_MEMBERS 64
> +struct bpf_struct_ops {
> +	const struct bpf_verifier_ops *verifier_ops;
> +	int (*init)(struct btf *_btf_vmlinux);
> +	int (*check_member)(const struct btf_type *t,
> +			    const struct btf_member *member);
> +	const struct btf_type *type;
> +	const char *name;
> +	struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS];
> +	u32 type_id;
> +};
> +
> +#if defined(CONFIG_BPF_JIT)
> +const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id);
> +void bpf_struct_ops_init(struct btf *_btf_vmlinux);
> +#else
> +static inline const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id)
> +{
> +	return NULL;
> +}
> +static inline void bpf_struct_ops_init(struct btf *_btf_vmlinux) { }
> +#endif
> +
>   struct bpf_array {
>   	struct bpf_map map;
>   	u32 elem_size;
[...]
> +const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = {
> +};
> +
> +const struct bpf_prog_ops bpf_struct_ops_prog_ops = {
> +};
> +
> +void bpf_struct_ops_init(struct btf *_btf_vmlinux)
> +{
> +	const struct btf_member *member;
> +	struct bpf_struct_ops *st_ops;
> +	struct bpf_verifier_log log = {};
> +	const struct btf_type *t;
> +	const char *mname;
> +	s32 type_id;
> +	u32 i, j;
> +
> +	for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
> +		st_ops = bpf_struct_ops[i];
> +
> +		type_id = btf_find_by_name_kind(_btf_vmlinux, st_ops->name,
> +						BTF_KIND_STRUCT);
> +		if (type_id < 0) {
> +			pr_warn("Cannot find struct %s in btf_vmlinux\n",
> +				st_ops->name);
> +			continue;
> +		}
> +		t = btf_type_by_id(_btf_vmlinux, type_id);
> +		if (btf_type_vlen(t) > BPF_STRUCT_OPS_MAX_NR_MEMBERS) {
> +			pr_warn("Cannot support #%u members in struct %s\n",
> +				btf_type_vlen(t), st_ops->name);
> +			continue;
> +		}
> +
> +		for_each_member(j, t, member) {
> +			const struct btf_type *func_proto;
> +
> +			mname = btf_name_by_offset(_btf_vmlinux,
> +						   member->name_off);
> +			if (!*mname) {
> +				pr_warn("anon member in struct %s is not supported\n",
> +					st_ops->name);
> +				break;
> +			}
> +
> +			if (btf_member_bitfield_size(t, member)) {
> +				pr_warn("bit field member %s in struct %s is not supported\n",
> +					mname, st_ops->name);
> +				break;
> +			}
> +
> +			func_proto = btf_type_resolve_func_ptr(_btf_vmlinux,
> +							       member->type,
> +							       NULL);
> +			if (func_proto &&
> +			    btf_distill_func_proto(&log, _btf_vmlinux,
> +						   func_proto, mname,
> +						   &st_ops->func_models[j])) {
> +				pr_warn("Error in parsing func ptr %s in struct %s\n",
> +					mname, st_ops->name);
> +				break;
> +			}
> +		}
> +
> +		if (j == btf_type_vlen(t)) {
> +			if (st_ops->init(_btf_vmlinux)) {

is it possible that st_ops->init might be a NULL pointer?

> +				pr_warn("Error in init bpf_struct_ops %s\n",
> +					st_ops->name);
> +			} else {
> +				st_ops->type_id = type_id;
> +				st_ops->type = t;
> +			}
> +		}
> +	}
> +}
> +
> +extern struct btf *btf_vmlinux;
> +
[...]
> index 408264c1d55b..4c1eaa1a2965 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -2858,11 +2858,6 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
>   	u32 btf_id;
>   	int ret;
>   
> -	if (atype != BPF_READ) {
> -		verbose(env, "only read is supported\n");
> -		return -EACCES;
> -	}
> -
>   	if (off < 0) {
>   		verbose(env,
>   			"R%d is ptr_%s invalid negative access: off=%d\n",
> @@ -2879,17 +2874,32 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
>   		return -EACCES;
>   	}
>   
> -	ret = btf_struct_access(&env->log, t, off, size, atype, &btf_id);
> +	if (env->ops->btf_struct_access) {
> +		ret = env->ops->btf_struct_access(&env->log, t, off, size,
> +						  atype, &btf_id);
> +	} else {
> +		if (atype != BPF_READ) {
> +			verbose(env, "only read is supported\n");
> +			return -EACCES;
> +		}
> +
> +		ret = btf_struct_access(&env->log, t, off, size, atype,
> +					&btf_id);
> +	}
> +
>   	if (ret < 0)
>   		return ret;
>   
> -	if (ret == SCALAR_VALUE) {
> -		mark_reg_unknown(env, regs, value_regno);
> -		return 0;
> +	if (atype == BPF_READ) {
> +		if (ret == SCALAR_VALUE) {
> +			mark_reg_unknown(env, regs, value_regno);
> +			return 0;
> +		}
> +		mark_reg_known_zero(env, regs, value_regno);
> +		regs[value_regno].type = PTR_TO_BTF_ID;
> +		regs[value_regno].btf_id = btf_id;
>   	}
> -	mark_reg_known_zero(env, regs, value_regno);
> -	regs[value_regno].type = PTR_TO_BTF_ID;
> -	regs[value_regno].btf_id = btf_id;
> +
>   	return 0;
>   }
>   
> @@ -6343,8 +6353,30 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
>   static int check_return_code(struct bpf_verifier_env *env)
>   {
>   	struct tnum enforce_attach_type_range = tnum_unknown;
> +	const struct bpf_prog *prog = env->prog;
>   	struct bpf_reg_state *reg;
>   	struct tnum range = tnum_range(0, 1);
> +	int err;
> +
> +	/* The struct_ops func-ptr's return type could be "void" */
> +	if (env->prog->type == BPF_PROG_TYPE_STRUCT_OPS &&
> +	    !prog->aux->attach_func_proto->type)
> +		return 0;
> +
> +	/* eBPF calling convetion is such that R0 is used
> +	 * to return the value from eBPF program.
> +	 * Make sure that it's readable at this time
> +	 * of bpf_exit, which means that program wrote
> +	 * something into it earlier
> +	 */
> +	err = check_reg_arg(env, BPF_REG_0, SRC_OP);
> +	if (err)
> +		return err;
> +
> +	if (is_pointer_value(env, BPF_REG_0)) {
> +		verbose(env, "R0 leaks addr as return value\n");
> +		return -EACCES;
> +	}
>   
>   	switch (env->prog->type) {
>   	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
> @@ -8010,21 +8042,6 @@ static int do_check(struct bpf_verifier_env *env)
>   				if (err)
>   					return err;
>   
> -				/* eBPF calling convetion is such that R0 is used
> -				 * to return the value from eBPF program.
> -				 * Make sure that it's readable at this time
> -				 * of bpf_exit, which means that program wrote
> -				 * something into it earlier
> -				 */
> -				err = check_reg_arg(env, BPF_REG_0, SRC_OP);
> -				if (err)
> -					return err;
> -
> -				if (is_pointer_value(env, BPF_REG_0)) {
> -					verbose(env, "R0 leaks addr as return value\n");
> -					return -EACCES;
> -				}
> -
>   				err = check_return_code(env);
>   				if (err)
>   					return err;
> @@ -8833,12 +8850,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
>   			convert_ctx_access = bpf_xdp_sock_convert_ctx_access;
>   			break;
>   		case PTR_TO_BTF_ID:
> -			if (type == BPF_WRITE) {
> +			if (type == BPF_READ) {
> +				insn->code = BPF_LDX | BPF_PROBE_MEM |
> +					BPF_SIZE((insn)->code);
> +				env->prog->aux->num_exentries++;
> +			} else if (env->prog->type != BPF_PROG_TYPE_STRUCT_OPS) {
>   				verbose(env, "Writes through BTF pointers are not allowed\n");
>   				return -EINVAL;
>   			}
> -			insn->code = BPF_LDX | BPF_PROBE_MEM | BPF_SIZE((insn)->code);
> -			env->prog->aux->num_exentries++;

Do we need to increase num_exentries for BPF_WRITE as well?

>   			continue;
>   		default:
>   			continue;
> @@ -9505,6 +9524,58 @@ static void print_verification_stats(struct bpf_verifier_env *env)
>   		env->peak_states, env->longest_mark_read_walk);
>   }
>   
[...]