From: Song Liu <songliubraving@fb.com>
To: Alexei Starovoitov <ast@kernel.org>
Cc: "davem@davemloft.net" <davem@davemloft.net>,
	"daniel@iogearbox.net" <daniel@iogearbox.net>,
	"x86@kernel.org" <x86@kernel.org>,
	"netdev@vger.kernel.org" <netdev@vger.kernel.org>,
	"bpf@vger.kernel.org" <bpf@vger.kernel.org>,
	Kernel Team <Kernel-team@fb.com>
Subject: Re: [PATCH v2 bpf-next 03/17] bpf: Introduce BPF trampoline
Date: Thu, 7 Nov 2019 22:37:19 +0000
Message-ID: <5967F93A-235B-447E-9B70-E7768998B718@fb.com>
In-Reply-To: <20191107054644.1285697-4-ast@kernel.org>



> On Nov 6, 2019, at 9:46 PM, Alexei Starovoitov <ast@kernel.org> wrote:
> 

[...]

> 
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> Acked-by: Andrii Nakryiko <andriin@fb.com>
> ---
> arch/x86/net/bpf_jit_comp.c | 227 ++++++++++++++++++++++++++++++--
> include/linux/bpf.h         |  98 ++++++++++++++
> include/uapi/linux/bpf.h    |   2 +
> kernel/bpf/Makefile         |   1 +
> kernel/bpf/btf.c            |  77 ++++++++++-
> kernel/bpf/core.c           |   1 +
> kernel/bpf/syscall.c        |  53 +++++++-
> kernel/bpf/trampoline.c     | 252 ++++++++++++++++++++++++++++++++++++
> kernel/bpf/verifier.c       |  39 ++++++
> 9 files changed, 732 insertions(+), 18 deletions(-)
> create mode 100644 kernel/bpf/trampoline.c
> 
> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> index 8631d3bd637f..44169e8bffc0 100644
> --- a/arch/x86/net/bpf_jit_comp.c
> +++ b/arch/x86/net/bpf_jit_comp.c
> @@ -98,6 +98,7 @@ static int bpf_size_to_x86_bytes(int bpf_size)
> 
> /* Pick a register outside of BPF range for JIT internal work */
> #define AUX_REG (MAX_BPF_JIT_REG + 1)
> +#define X86_REG_R9 (MAX_BPF_JIT_REG + 2)
> 
> /*
>  * The following table maps BPF registers to x86-64 registers.
> @@ -123,6 +124,7 @@ static const int reg2hex[] = {
> 	[BPF_REG_FP] = 5, /* RBP readonly */
> 	[BPF_REG_AX] = 2, /* R10 temp register */
> 	[AUX_REG] = 3,    /* R11 temp register */
> +	[X86_REG_R9] = 1, /* R9 register, 6th function argument */

We should update the comment above this table, since it still says:

 * Also x86-64 register R9 is unused. ...
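
Maybe something along these lines (rough wording, just a suggestion):

 * Also x86-64 register R9 is only used by the trampoline code
 * (as X86_REG_R9) to pass the 6th function argument; it is not
 * mapped to any BPF register.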

> };
> 
> static const int reg2pt_regs[] = {
> @@ -150,6 +152,7 @@ static bool is_ereg(u32 reg)
> 			     BIT(BPF_REG_7) |
> 			     BIT(BPF_REG_8) |
> 			     BIT(BPF_REG_9) |
> +			     BIT(X86_REG_R9) |
> 			     BIT(BPF_REG_AX));
> }
> 
> @@ -492,8 +495,8 @@ static int emit_call(u8 **pprog, void *func, void *ip)
> int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
> 		       void *old_addr, void *new_addr)
> {
> -	u8 old_insn[NOP_ATOMIC5] = {};
> -	u8 new_insn[NOP_ATOMIC5] = {};
> +	u8 old_insn[X86_CALL_SIZE] = {};
> +	u8 new_insn[X86_CALL_SIZE] = {};
> 	u8 *prog;
> 	int ret;
> 
> @@ -507,9 +510,9 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
> 		if (ret)
> 			return ret;
> 	}
> -	if (old_addr) {
> +	if (new_addr) {
> 		prog = new_insn;
> -		ret = emit_call(&prog, old_addr, (void *)ip);
> +		ret = emit_call(&prog, new_addr, (void *)ip);
> 		if (ret)
> 			return ret;
> 	}
> @@ -517,19 +520,19 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
> 	mutex_lock(&text_mutex);
> 	switch (t) {
> 	case BPF_MOD_NOP_TO_CALL:
> -		if (memcmp(ip, ideal_nops, 5))
> +		if (memcmp(ip, ideal_nops[NOP_ATOMIC5], X86_CALL_SIZE))
> 			goto out;
> -		text_poke(ip, new_insn, 5);
> +		text_poke(ip, new_insn, X86_CALL_SIZE);
> 		break;
> 	case BPF_MOD_CALL_TO_CALL:
> -		if (memcmp(ip, old_insn, 5))
> +		if (memcmp(ip, old_insn, X86_CALL_SIZE))
> 			goto out;
> -		text_poke(ip, new_insn, 5);
> +		text_poke(ip, new_insn, X86_CALL_SIZE);
> 		break;
> 	case BPF_MOD_CALL_TO_NOP:
> -		if (memcmp(ip, old_insn, 5))
> +		if (memcmp(ip, old_insn, X86_CALL_SIZE))
> 			goto out;
> -		text_poke(ip, ideal_nops, 5);
> +		text_poke(ip, ideal_nops[NOP_ATOMIC5], X86_CALL_SIZE);
> 		break;
> 	}
> 	ret = 0;
> @@ -1234,6 +1237,210 @@ xadd:			if (is_imm8(insn->off))
> 	return proglen;
> }
> 
> +static void save_regs(struct btf_func_model *m, u8 **prog, int nr_args,
> +		      int stack_size)
> +{
> +	int i;
> +	/* Store function arguments to stack.
> +	 * For a function that accepts two pointers the sequence will be:
> +	 * mov QWORD PTR [rbp-0x10],rdi
> +	 * mov QWORD PTR [rbp-0x8],rsi
> +	 */
> +	for (i = 0; i < min(nr_args, 6); i++)
> +		emit_stx(prog, bytes_to_bpf_size(m->arg_size[i]),
> +			 BPF_REG_FP,
> +			 i == 5 ? X86_REG_R9 : BPF_REG_1 + i,
> +			 -(stack_size - i * 8));
> +}
> +
> +static void restore_regs(struct btf_func_model *m, u8 **prog, int nr_args,
> +			 int stack_size)
> +{
> +	int i;
> +
> +	/* Restore function arguments from stack.
> +	 * For a function that accepts two pointers the sequence will be:
> +	 * EMIT4(0x48, 0x8B, 0x7D, 0xF0); mov rdi,QWORD PTR [rbp-0x10]
> +	 * EMIT4(0x48, 0x8B, 0x75, 0xF8); mov rsi,QWORD PTR [rbp-0x8]
> +	 */
> +	for (i = 0; i < min(nr_args, 6); i++)
> +		emit_ldx(prog, bytes_to_bpf_size(m->arg_size[i]),
> +			 i == 5 ? X86_REG_R9 : BPF_REG_1 + i,
> +			 BPF_REG_FP,
> +			 -(stack_size - i * 8));
> +}
> +
> +static int invoke_bpf(struct btf_func_model *m, u8 **pprog,
> +		      struct bpf_prog **progs, int prog_cnt, int stack_size)
> +{
> +	u8 *prog = *pprog;
> +	int cnt = 0, i;
> +
> +	for (i = 0; i < prog_cnt; i++) {
> +		if (emit_call(&prog, __bpf_prog_enter, prog))
> +			return -EINVAL;
> +		/* remember prog start time returned by __bpf_prog_enter */
> +		emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);
> +
> +		/* arg1: lea rdi, [rbp - stack_size] */
> +		EMIT4(0x48, 0x8D, 0x7D, -stack_size);
> +		/* arg2: progs[i]->insnsi for interpreter */
> +		if (!progs[i]->jited)
> +			emit_mov_imm64(&prog, BPF_REG_2,
> +				       (long) progs[i]->insnsi >> 32,
> +				       (u32) (long) progs[i]->insnsi);
> +		/* call JITed bpf program or interpreter */
> +		if (emit_call(&prog, progs[i]->bpf_func, prog))
> +			return -EINVAL;
> +
> +		/* arg1: mov rdi, progs[i] */
> +		emit_mov_imm64(&prog, BPF_REG_1, (long) progs[i] >> 32,
> +			       (u32) (long) progs[i]);
> +		/* arg2: mov rsi, rbx <- start time in nsec */
> +		emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
> +		if (emit_call(&prog, __bpf_prog_exit, prog))
> +			return -EINVAL;
> +	}
> +	*pprog = prog;
> +	return 0;
> +}
> +
> +/* Example:
> + * __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev);
> + * its 'struct btf_func_model' will be nr_args=2
> + * The assembly code when eth_type_trans is executing after trampoline:
> + *
> + * push rbp
> + * mov rbp, rsp
> + * sub rsp, 16                     // space for skb and dev
> + * push rbx                        // temp regs to pass start time
> + * mov qword ptr [rbp - 16], rdi   // save skb pointer to stack
> + * mov qword ptr [rbp - 8], rsi    // save dev pointer to stack
> + * call __bpf_prog_enter           // rcu_read_lock and preempt_disable
> + * mov rbx, rax                    // remember start time if bpf stats are enabled
> + * lea rdi, [rbp - 16]             // R1==ctx of bpf prog
> + * call addr_of_jited_FENTRY_prog
> + * movabsq rdi, 64bit_addr_of_struct_bpf_prog  // unused if bpf stats are off
> + * mov rsi, rbx                    // prog start time
> + * call __bpf_prog_exit            // rcu_read_unlock, preempt_enable and stats math
> + * mov rdi, qword ptr [rbp - 16]   // restore skb pointer from stack
> + * mov rsi, qword ptr [rbp - 8]    // restore dev pointer from stack
> + * pop rbx
> + * leave
> + * ret
> + *
> + * eth_type_trans has 5 byte nop at the beginning. These 5 bytes will be
> + * replaced with 'call generated_bpf_trampoline'. When it returns
> + * eth_type_trans will continue executing with original skb and dev pointers.
> + *
> + * The assembly code when eth_type_trans is called from trampoline:
> + *
> + * push rbp
> + * mov rbp, rsp
> + * sub rsp, 24                     // space for skb, dev, return value
> + * push rbx                        // temp regs to pass start time
> + * mov qword ptr [rbp - 24], rdi   // save skb pointer to stack
> + * mov qword ptr [rbp - 16], rsi   // save dev pointer to stack
> + * call __bpf_prog_enter           // rcu_read_lock and preempt_disable
> + * mov rbx, rax                    // remember start time if bpf stats are enabled
> + * lea rdi, [rbp - 24]             // R1==ctx of bpf prog
> + * call addr_of_jited_FENTRY_prog  // bpf prog can access skb and dev
> + * movabsq rdi, 64bit_addr_of_struct_bpf_prog  // unused if bpf stats are off
> + * mov rsi, rbx                    // prog start time
> + * call __bpf_prog_exit            // rcu_read_unlock, preempt_enable and stats math
> + * mov rdi, qword ptr [rbp - 24]   // restore skb pointer from stack
> + * mov rsi, qword ptr [rbp - 16]   // restore dev pointer from stack
> + * call eth_type_trans+5           // execute body of eth_type_trans
> + * mov qword ptr [rbp - 8], rax    // save return value
> + * call __bpf_prog_enter           // rcu_read_lock and preempt_disable
> + * mov rbx, rax                    // remember start time if bpf stats are enabled
> + * lea rdi, [rbp - 24]             // R1==ctx of bpf prog
> + * call addr_of_jited_FEXIT_prog   // bpf prog can access skb, dev, return value
> + * movabsq rdi, 64bit_addr_of_struct_bpf_prog  // unused if bpf stats are off
> + * mov rsi, rbx                    // prog start time
> + * call __bpf_prog_exit            // rcu_read_unlock, preempt_enable and stats math
> + * mov rax, qword ptr [rbp - 8]    // restore eth_type_trans's return value
> + * pop rbx
> + * leave
> + * add rsp, 8                      // skip eth_type_trans's frame
> + * ret                             // return to its caller
> + */
> +int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags,
> +				struct bpf_prog **fentry_progs, int fentry_cnt,
> +				struct bpf_prog **fexit_progs, int fexit_cnt,
> +				void *orig_call)
> +{
> +	int cnt = 0, nr_args = m->nr_args;
> +	int stack_size = nr_args * 8;
> +	u8 *prog;
> +
> +	/* x86-64 supports up to 6 arguments. 7+ can be added in the future */
> +	if (nr_args > 6)
> +		return -ENOTSUPP;
> +
> +	if ((flags & BPF_TRAMP_F_RESTORE_REGS) &&
> +	    (flags & BPF_TRAMP_F_SKIP_FRAME))
> +		return -EINVAL;
> +
> +	if (flags & BPF_TRAMP_F_CALL_ORIG)
> +		stack_size += 8; /* room for return value of orig_call */
> +
> +	if (flags & BPF_TRAMP_F_SKIP_FRAME)
> +		/* skip patched call instruction and point orig_call to actual
> +		 * body of the kernel function.
> +		 */
> +		orig_call += X86_CALL_SIZE;
> +
> +	prog = image;
> +
> +	EMIT1(0x55);		 /* push rbp */
> +	EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
> +	EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */
> +	EMIT1(0x53);		 /* push rbx */
> +
> +	save_regs(m, &prog, nr_args, stack_size);
> +
> +	if (fentry_cnt)
> +		if (invoke_bpf(m, &prog, fentry_progs, fentry_cnt, stack_size))
> +			return -EINVAL;
> +
> +	if (flags & BPF_TRAMP_F_CALL_ORIG) {
> +		if (fentry_cnt)
> +			restore_regs(m, &prog, nr_args, stack_size);
> +
> +		/* call original function */
> +		if (emit_call(&prog, orig_call, prog))
> +			return -EINVAL;
> +		/* remember return value in a stack for bpf prog to access */
> +		emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
> +	}
> +
> +	if (fexit_cnt)
> +		if (invoke_bpf(m, &prog, fexit_progs, fexit_cnt, stack_size))
> +			return -EINVAL;
> +
> +	if (flags & BPF_TRAMP_F_RESTORE_REGS)
> +		restore_regs(m, &prog, nr_args, stack_size);
> +
> +	if (flags & BPF_TRAMP_F_CALL_ORIG)
> +		/* restore original return value back into RAX */
> +		emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8);
> +
> +	EMIT1(0x5B); /* pop rbx */
> +	EMIT1(0xC9); /* leave */
> +	if (flags & BPF_TRAMP_F_SKIP_FRAME)
> +		/* skip our return address and return to parent */
> +		EMIT4(0x48, 0x83, 0xC4, 8); /* add rsp, 8 */
> +	EMIT1(0xC3); /* ret */
> +	/* One half of the page has active running trampoline.
> +	 * Another half is an area for next trampoline.
> +	 * Make sure the trampoline generation logic doesn't overflow.
> +	 */
> +	if (WARN_ON_ONCE(prog - (u8 *)image > PAGE_SIZE / 2 - BPF_INSN_SAFETY))
> +		return -EFAULT;

Given that the max number of args is bounded, can we catch this overflow at compile time?
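
For example, something along these lines might work, assuming BPF_MAX_TRAMP_PROGS
(private to trampoline.c in this patch) gets moved somewhere the JIT can see it;
the byte estimates below are my rough guesses, not measured:

	/* Illustrative worst-case size of the generated code:
	 * ~96 bytes of prologue/epilogue + arg save/restore (guess),
	 * plus ~45 bytes of enter/call/exit per attached prog (guess).
	 */
	#define X86_TRAMP_MAX_BYTES	(96 + BPF_MAX_TRAMP_PROGS * 45)

	/* e.g. at the top of arch_prepare_bpf_trampoline(): */
	BUILD_BUG_ON(X86_TRAMP_MAX_BYTES > PAGE_SIZE / 2 - BPF_INSN_SAFETY);

The runtime WARN_ON_ONCE() could then stay as a backstop in case the emit
logic changes.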

> +	return 0;
> +}
> +
> struct x64_jit_data {
> 	struct bpf_binary_header *header;
> 	int *addrs;
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 8b90db25348a..9206b7e86dde 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -14,6 +14,7 @@
> #include <linux/numa.h>
> #include <linux/wait.h>
> #include <linux/u64_stats_sync.h>
> +#include <linux/refcount.h>
> 
> struct bpf_verifier_env;
> struct bpf_verifier_log;
> @@ -384,6 +385,94 @@ struct bpf_prog_stats {
> 	struct u64_stats_sync syncp;
> } __aligned(2 * sizeof(u64));
> 
> +struct btf_func_model {
> +	u8 ret_size;
> +	u8 nr_args;
> +	u8 arg_size[MAX_BPF_FUNC_ARGS];
> +};
> +
> +/* Restore arguments before returning from trampoline to let original function
> + * continue executing. This flag is used for fentry progs when there are no
> + * fexit progs.
> + */
> +#define BPF_TRAMP_F_RESTORE_REGS	BIT(0)
> +/* Call original function after fentry progs, but before fexit progs.
> + * Makes sense for fentry/fexit, normal calls and indirect calls.
> + */
> +#define BPF_TRAMP_F_CALL_ORIG		BIT(1)
> +/* Skip current frame and return to parent.  Makes sense for fentry/fexit
> + * programs only. Should not be used with normal calls and indirect calls.
> + */
> +#define BPF_TRAMP_F_SKIP_FRAME		BIT(2)
> +
> +/* Different use cases for BPF trampoline:
> + * 1. replace nop at the function entry (kprobe equivalent)
> + *    flags = BPF_TRAMP_F_RESTORE_REGS
> + *    fentry = a set of programs to run before returning from trampoline
> + *
> + * 2. replace nop at the function entry (kprobe + kretprobe equivalent)
> + *    flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME
> + *    orig_call = fentry_ip + MCOUNT_INSN_SIZE
> + *    fentry = a set of program to run before calling original function
> + *    fexit = a set of program to run after original function
> + *
> + * 3. replace direct call instruction anywhere in the function body
> + *    or assign a function pointer for indirect call (like tcp_congestion_ops->cong_avoid)
> + *    With flags = 0
> + *      fentry = a set of programs to run before returning from trampoline
> + *    With flags = BPF_TRAMP_F_CALL_ORIG
> + *      orig_call = original callback addr or direct function addr
> + *      fentry = a set of program to run before calling original function
> + *      fexit = a set of program to run after original function
> + */
> +int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags,
> +				struct bpf_prog **fentry_progs, int fentry_cnt,
> +				struct bpf_prog **fexit_progs, int fexit_cnt,
> +				void *orig_call);
> +/* these two functions are called from generated trampoline */
> +u64 notrace __bpf_prog_enter(void);
> +void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start);
> +
> +enum bpf_tramp_prog_type {
> +	BPF_TRAMP_FENTRY,
> +	BPF_TRAMP_FEXIT,
> +	BPF_TRAMP_MAX
> +};
> +
> +struct bpf_trampoline {
> +	struct hlist_node hlist;
> +	refcount_t refcnt;
> +	u64 key;
> +	struct {
> +		struct btf_func_model model;
> +		void *addr;
> +	} func;
> +	struct hlist_head progs_hlist[BPF_TRAMP_MAX];
> +	int progs_cnt[BPF_TRAMP_MAX];
> +	void *image;
> +	u64 selector;
> +};
> +#ifdef CONFIG_BPF_JIT
> +struct bpf_trampoline *bpf_trampoline_lookup(u64 key);
> +int bpf_trampoline_link_prog(struct bpf_prog *prog);
> +int bpf_trampoline_unlink_prog(struct bpf_prog *prog);
> +void bpf_trampoline_put(struct bpf_trampoline *tr);
> +#else
> +static inline struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
> +{
> +	return NULL;
> +}
> +static inline int bpf_trampoline_link_prog(struct bpf_prog *prog)
> +{
> +	return -ENOTSUPP;
> +}
> +static inline int bpf_trampoline_unlink_prog(struct bpf_prog *prog)
> +{
> +	return -ENOTSUPP;
> +}
> +static inline void bpf_trampoline_put(struct bpf_trampoline *tr) {}
> +#endif
> +
> struct bpf_prog_aux {
> 	atomic_t refcnt;
> 	u32 used_map_cnt;
> @@ -398,6 +487,9 @@ struct bpf_prog_aux {
> 	bool verifier_zext; /* Zero extensions has been inserted by verifier. */
> 	bool offload_requested;
> 	bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */
> +	enum bpf_tramp_prog_type trampoline_prog_type;
> +	struct bpf_trampoline *trampoline;
> +	struct hlist_node tramp_hlist;
> 	/* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
> 	const struct btf_type *attach_func_proto;
> 	/* function name for valid attach_btf_id */
> @@ -784,6 +876,12 @@ int btf_struct_access(struct bpf_verifier_log *log,
> 		      u32 *next_btf_id);
> u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *, int);
> 
> +int btf_distill_func_proto(struct bpf_verifier_log *log,
> +			   struct btf *btf,
> +			   const struct btf_type *func_proto,
> +			   const char *func_name,
> +			   struct btf_func_model *m);
> +
> #else /* !CONFIG_BPF_SYSCALL */
> static inline struct bpf_prog *bpf_prog_get(u32 ufd)
> {
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index df6809a76404..69c200e6e696 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -201,6 +201,8 @@ enum bpf_attach_type {
> 	BPF_CGROUP_GETSOCKOPT,
> 	BPF_CGROUP_SETSOCKOPT,
> 	BPF_TRACE_RAW_TP,
> +	BPF_TRACE_FENTRY,
> +	BPF_TRACE_FEXIT,
> 	__MAX_BPF_ATTACH_TYPE
> };
> 
> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
> index e1d9adb212f9..3f671bf617e8 100644
> --- a/kernel/bpf/Makefile
> +++ b/kernel/bpf/Makefile
> @@ -6,6 +6,7 @@ obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
> obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
> obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
> obj-$(CONFIG_BPF_SYSCALL) += disasm.o
> +obj-$(CONFIG_BPF_JIT) += trampoline.o
> obj-$(CONFIG_BPF_SYSCALL) += btf.o
> ifeq ($(CONFIG_NET),y)
> obj-$(CONFIG_BPF_SYSCALL) += devmap.o
> diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> index 128d89601d73..78d9ceaabc00 100644
> --- a/kernel/bpf/btf.c
> +++ b/kernel/bpf/btf.c
> @@ -3441,13 +3441,18 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
> 		args++;
> 		nr_args--;
> 	}
> -	if (arg >= nr_args) {
> +
> +	if (prog->expected_attach_type == BPF_TRACE_FEXIT &&
> +	    arg == nr_args) {
> +		/* function return type */
> +		t = btf_type_by_id(btf_vmlinux, t->type);
> +	} else if (arg >= nr_args) {
> 		bpf_log(log, "func '%s' doesn't have %d-th argument\n",
> -			tname, arg);
> +			tname, arg + 1);
> 		return false;
> +	} else {
> +		t = btf_type_by_id(btf_vmlinux, args[arg].type);
> 	}
> -
> -	t = btf_type_by_id(btf_vmlinux, args[arg].type);
> 	/* skip modifiers */
> 	while (btf_type_is_modifier(t))
> 		t = btf_type_by_id(btf_vmlinux, t->type);
> @@ -3647,6 +3652,70 @@ u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *fn, int arg)
> 	return btf_id;
> }
> 
> +static int __get_type_size(struct btf *btf, u32 btf_id,
> +			   const struct btf_type **bad_type)
> +{
> +	const struct btf_type *t;
> +
> +	if (!btf_id)
> +		/* void */
> +		return 0;
> +	t = btf_type_by_id(btf, btf_id);
> +	while (t && btf_type_is_modifier(t))
> +		t = btf_type_by_id(btf, t->type);
> +	if (!t)
> +		return -EINVAL;
> +	if (btf_type_is_ptr(t))
> +		/* kernel size of pointer. Not BPF's size of pointer*/
> +		return sizeof(void *);
> +	if (btf_type_is_int(t) || btf_type_is_enum(t))
> +		return t->size;
> +	*bad_type = t;
> +	return -EINVAL;
> +}
> +
> +int btf_distill_func_proto(struct bpf_verifier_log *log,
> +			   struct btf *btf,
> +			   const struct btf_type *func,
> +			   const char *tname,
> +			   struct btf_func_model *m)
> +{
> +	const struct btf_param *args;
> +	const struct btf_type *t;
> +	u32 i, nargs;
> +	int ret;
> +
> +	args = (const struct btf_param *)(func + 1);
> +	nargs = btf_type_vlen(func);
> +	if (nargs >= MAX_BPF_FUNC_ARGS) {
> +		bpf_log(log,
> +			"The function %s has %d arguments. Too many.\n",
> +			tname, nargs);
> +		return -EINVAL;
> +	}
> +	ret = __get_type_size(btf, func->type, &t);
> +	if (ret < 0) {
> +		bpf_log(log,
> +			"The function %s return type %s is unsupported.\n",
> +			tname, btf_kind_str[BTF_INFO_KIND(t->info)]);
> +		return -EINVAL;
> +	}
> +	m->ret_size = ret;
> +
> +	for (i = 0; i < nargs; i++) {
> +		ret = __get_type_size(btf, args[i].type, &t);
> +		if (ret < 0) {
> +			bpf_log(log,
> +				"The function %s arg%d type %s is unsupported.\n",
> +				tname, i, btf_kind_str[BTF_INFO_KIND(t->info)]);
> +			return -EINVAL;
> +		}
> +		m->arg_size[i] = ret;
> +	}
> +	m->nr_args = nargs;
> +	return 0;
> +}
> +
> void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
> 		       struct seq_file *m)
> {
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 856564e97943..7352f5514e57 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -2011,6 +2011,7 @@ static void bpf_prog_free_deferred(struct work_struct *work)
> 	if (aux->prog->has_callchain_buf)
> 		put_callchain_buffers();
> #endif
> +	bpf_trampoline_put(aux->trampoline);
> 	for (i = 0; i < aux->func_cnt; i++)
> 		bpf_jit_free(aux->func[i]);
> 	if (aux->func_cnt) {
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 6d9ce95e5a8d..e2e37bea86bc 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -1799,6 +1799,49 @@ static int bpf_obj_get(const union bpf_attr *attr)
> 				attr->file_flags);
> }
> 
> +static int bpf_tracing_prog_release(struct inode *inode, struct file *filp)
> +{
> +	struct bpf_prog *prog = filp->private_data;
> +
> +	WARN_ON_ONCE(bpf_trampoline_unlink_prog(prog));
> +	bpf_prog_put(prog);
> +	return 0;
> +}
> +
> +static const struct file_operations bpf_tracing_prog_fops = {
> +	.release	= bpf_tracing_prog_release,
> +	.read		= bpf_dummy_read,
> +	.write		= bpf_dummy_write,
> +};
> +
> +static int bpf_tracing_prog_attach(struct bpf_prog *prog)
> +{
> +	int tr_fd, err;
> +
> +	if (prog->expected_attach_type != BPF_TRACE_FENTRY &&
> +	    prog->expected_attach_type != BPF_TRACE_FEXIT) {
> +		err = -EINVAL;
> +		goto out_put_prog;
> +	}
> +
> +	err = bpf_trampoline_link_prog(prog);
> +	if (err)
> +		goto out_put_prog;
> +
> +	tr_fd = anon_inode_getfd("bpf-tracing-prog", &bpf_tracing_prog_fops,
> +				 prog, O_CLOEXEC);
> +	if (tr_fd < 0) {
> +		WARN_ON_ONCE(bpf_trampoline_unlink_prog(prog));
> +		err = tr_fd;
> +		goto out_put_prog;
> +	}
> +	return tr_fd;
> +
> +out_put_prog:
> +	bpf_prog_put(prog);
> +	return err;
> +}
> +
> struct bpf_raw_tracepoint {
> 	struct bpf_raw_event_map *btp;
> 	struct bpf_prog *prog;
> @@ -1850,14 +1893,16 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
> 
> 	if (prog->type == BPF_PROG_TYPE_TRACING) {
> 		if (attr->raw_tracepoint.name) {
> -			/* raw_tp name should not be specified in raw_tp
> -			 * programs that were verified via in-kernel BTF info
> +			/* The attach point for this category of programs
> +			 * should be specified via btf_id during program load.
> 			 */
> 			err = -EINVAL;
> 			goto out_put_prog;
> 		}
> -		/* raw_tp name is taken from type name instead */
> -		tp_name = prog->aux->attach_func_name;
> +		if (prog->expected_attach_type == BPF_TRACE_RAW_TP)
> +			tp_name = prog->aux->attach_func_name;
> +		else
> +			return bpf_tracing_prog_attach(prog);
> 	} else {
> 		if (strncpy_from_user(buf,
> 				      u64_to_user_ptr(attr->raw_tracepoint.name),
> diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
> new file mode 100644
> index 000000000000..214c964cfd72
> --- /dev/null
> +++ b/kernel/bpf/trampoline.c
> @@ -0,0 +1,252 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (c) 2019 Facebook */
> +#include <linux/hash.h>
> +#include <linux/bpf.h>
> +#include <linux/filter.h>
> +
> +/* btf_vmlinux has ~22k attachable functions. 1k htab is enough. */
> +#define TRAMPOLINE_HASH_BITS 10
> +#define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS)
> +
> +static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
> +
> +static DEFINE_MUTEX(trampoline_mutex);
> +
> +struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
> +{
> +	struct bpf_trampoline *tr;
> +	struct hlist_head *head;
> +	void *image;
> +	int i;
> +
> +	mutex_lock(&trampoline_mutex);
> +	head = &trampoline_table[hash_64(key, TRAMPOLINE_HASH_BITS)];
> +	hlist_for_each_entry(tr, head, hlist) {
> +		if (tr->key == key) {
> +			refcount_inc(&tr->refcnt);
> +			goto out;
> +		}
> +	}
> +	tr = kzalloc(sizeof(*tr), GFP_KERNEL);
> +	if (!tr)
> +		goto out;
> +
> +	/* is_root was checked earlier. No need for bpf_jit_charge_modmem() */
> +	image = bpf_jit_alloc_exec(PAGE_SIZE);
> +	if (!image) {
> +		kfree(tr);
> +		tr = NULL;
> +		goto out;
> +	}
> +
> +	tr->key = key;
> +	INIT_HLIST_NODE(&tr->hlist);
> +	hlist_add_head(&tr->hlist, head);
> +	refcount_set(&tr->refcnt, 1);
> +	for (i = 0; i < BPF_TRAMP_MAX; i++)
> +		INIT_HLIST_HEAD(&tr->progs_hlist[i]);
> +
> +	set_vm_flush_reset_perms(image);
> +	/* Keep image as writeable. The alternative is to keep flipping ro/rw
> +	 * every time a new program is attached or detached.
> +	 */
> +	set_memory_x((long)image, 1);
> +	tr->image = image;
> +out:
> +	mutex_unlock(&trampoline_mutex);
> +	return tr;
> +}
> +
> +/* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50
> + * bytes on x86.  Pick a number to fit into PAGE_SIZE / 2
> + */
> +#define BPF_MAX_TRAMP_PROGS 40
> +
> +static int bpf_trampoline_update(struct bpf_prog *prog)

It seems the argument "prog" is only used here to fetch prog->aux->trampoline. Should we pass the trampoline directly instead?
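
For example (untested, just to illustrate):

	static int bpf_trampoline_update(struct bpf_trampoline *tr)
	{
		void *old_image = tr->image + ((tr->selector + 1) & 1) * PAGE_SIZE / 2;
		void *new_image = tr->image + (tr->selector & 1) * PAGE_SIZE / 2;
		/* ... rest unchanged, the body only dereferences tr ... */
	}

with link/unlink calling:

		err = bpf_trampoline_update(prog->aux->trampoline);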

> +{
> +	struct bpf_trampoline *tr = prog->aux->trampoline;
> +	void *old_image = tr->image + ((tr->selector + 1) & 1) * PAGE_SIZE/2;
> +	void *new_image = tr->image + (tr->selector & 1) * PAGE_SIZE/2;
> +	struct bpf_prog *progs_to_run[BPF_MAX_TRAMP_PROGS];
> +	int fentry_cnt = tr->progs_cnt[BPF_TRAMP_FENTRY];
> +	int fexit_cnt = tr->progs_cnt[BPF_TRAMP_FEXIT];
> +	struct bpf_prog **progs, **fentry, **fexit;
> +	u32 flags = BPF_TRAMP_F_RESTORE_REGS;
> +	struct bpf_prog_aux *aux;
> +	int err;
> +
> +	if (fentry_cnt + fexit_cnt == 0) {
> +		err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL_TO_NOP,
> +					 old_image, NULL);
> +		tr->selector = 0;
> +		goto out;
> +	}
> +
> +	/* populate fentry progs */
> +	fentry = progs = progs_to_run;
> +	hlist_for_each_entry(aux, &tr->progs_hlist[BPF_TRAMP_FENTRY], tramp_hlist)
> +		*progs++ = aux->prog;
> +
> +	/* populate fexit progs */
> +	fexit = progs;
> +	hlist_for_each_entry(aux, &tr->progs_hlist[BPF_TRAMP_FEXIT], tramp_hlist)
> +		*progs++ = aux->prog;
> +
> +	if (fexit_cnt)
> +		flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
> +
> +	err = arch_prepare_bpf_trampoline(new_image, &tr->func.model, flags,
> +					  fentry, fentry_cnt,
> +					  fexit, fexit_cnt,
> +					  tr->func.addr);
> +	if (err)
> +		goto out;
> +
> +	if (tr->selector)
> +		/* progs already running at this address */
> +		err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL_TO_CALL,
> +					 old_image, new_image);
> +	else
> +		/* first time registering */
> +		err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_NOP_TO_CALL,
> +					 NULL, new_image);
> +
> +	if (err)
> +		goto out;
> +	tr->selector++;

Shall we do selector-- for unlink?
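
Something like this, just to make the question concrete (I am not sure it
plays well with the old/new image halves):

	/* in bpf_trampoline_unlink_prog(), after the update succeeds */
	if (tr->selector)
		tr->selector--;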

> +out:
> +	return err;
> +}
> +
> +static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(enum bpf_attach_type t)
> +{
> +	switch (t) {
> +	case BPF_TRACE_FENTRY:
> +		return BPF_TRAMP_FENTRY;
> +	default:
> +		return BPF_TRAMP_FEXIT;
> +	}
> +}
> +
> +int bpf_trampoline_link_prog(struct bpf_prog *prog)
> +{
> +	enum bpf_tramp_prog_type kind;
> +	struct bpf_trampoline *tr;
> +	int err = 0;
> +
> +	tr = prog->aux->trampoline;
> +	kind = bpf_attach_type_to_tramp(prog->expected_attach_type);
> +	mutex_lock(&trampoline_mutex);
> +	if (tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT]
> +	    >= BPF_MAX_TRAMP_PROGS) {
> +		err = -E2BIG;
> +		goto out;
> +	}
> +	if (!hlist_unhashed(&prog->aux->tramp_hlist)) {
> +		/* prog already linked */
> +		err = -EBUSY;
> +		goto out;
> +	}
> +	hlist_add_head(&prog->aux->tramp_hlist, &tr->progs_hlist[kind]);
> +	tr->progs_cnt[kind]++;
> +	err = bpf_trampoline_update(prog);
> +	if (err) {
> +		hlist_del(&prog->aux->tramp_hlist);
> +		tr->progs_cnt[kind]--;
> +	}
> +out:
> +	mutex_unlock(&trampoline_mutex);
> +	return err;
> +}
> +
> +/* bpf_trampoline_unlink_prog() should never fail. */
> +int bpf_trampoline_unlink_prog(struct bpf_prog *prog)
> +{
> +	enum bpf_tramp_prog_type kind;
> +	struct bpf_trampoline *tr;
> +	int err;
> +
> +	tr = prog->aux->trampoline;
> +	kind = bpf_attach_type_to_tramp(prog->expected_attach_type);
> +	mutex_lock(&trampoline_mutex);
> +	hlist_del(&prog->aux->tramp_hlist);
> +	tr->progs_cnt[kind]--;
> +	err = bpf_trampoline_update(prog);
> +	mutex_unlock(&trampoline_mutex);
> +	return err;
> +}
> +
> +void bpf_trampoline_put(struct bpf_trampoline *tr)
> +{
> +	if (!tr)
> +		return;
> +	mutex_lock(&trampoline_mutex);
> +	if (!refcount_dec_and_test(&tr->refcnt))
> +		goto out;
> +	if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FENTRY])))
> +		goto out;
> +	if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT])))
> +		goto out;
> +	bpf_jit_free_exec(tr->image);
> +	hlist_del(&tr->hlist);
> +	kfree(tr);
> +out:
> +	mutex_unlock(&trampoline_mutex);
> +}
> +
> +/* The logic is similar to BPF_PROG_RUN, but with explicit rcu and preempt that
> + * are needed for trampoline. The macro is split into
> + * call __bpf_prog_enter
> + * call prog->bpf_func
> + * call __bpf_prog_exit
> + */
> +u64 notrace __bpf_prog_enter(void)
> +{
> +	u64 start = 0;
> +
> +	rcu_read_lock();
> +	preempt_disable();
> +	if (static_branch_unlikely(&bpf_stats_enabled_key))
> +		start = sched_clock();
> +	return start;
> +}
> +
> +void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
> +{
> +	struct bpf_prog_stats *stats;
> +
> +	if (static_branch_unlikely(&bpf_stats_enabled_key) &&
> +	    /* static_key could be enabled in __bpf_prog_enter
> +	     * and disabled in __bpf_prog_exit.
> +	     * And vice versa.
> +	     * Hence check that 'start' is not zero.
> +	     */
> +	    start) {
> +		stats = this_cpu_ptr(prog->aux->stats);
> +		u64_stats_update_begin(&stats->syncp);
> +		stats->cnt++;
> +		stats->nsecs += sched_clock() - start;
> +		u64_stats_update_end(&stats->syncp);
> +	}
> +	preempt_enable();
> +	rcu_read_unlock();
> +}
> +
> +int __weak
> +arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags,
> +			    struct bpf_prog **fentry_progs, int fentry_cnt,
> +			    struct bpf_prog **fexit_progs, int fexit_cnt,
> +			    void *orig_call)
> +{
> +	return -ENOTSUPP;
> +}
> +
> +static int __init init_trampolines(void)
> +{
> +	int i;
> +
> +	for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++)
> +		INIT_HLIST_HEAD(&trampoline_table[i]);
> +	return 0;
> +}
> +late_initcall(init_trampolines);
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 2f2374967b36..36542edd4936 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -9382,8 +9382,11 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
> 	struct bpf_prog *prog = env->prog;
> 	u32 btf_id = prog->aux->attach_btf_id;
> 	const char prefix[] = "btf_trace_";
> +	struct bpf_trampoline *tr;
> 	const struct btf_type *t;
> 	const char *tname;
> +	long addr;
> +	int ret;
> 
> 	if (prog->type != BPF_PROG_TYPE_TRACING)
> 		return 0;
> @@ -9432,6 +9435,42 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
> 		prog->aux->attach_func_proto = t;
> 		prog->aux->attach_btf_trace = true;
> 		return 0;
> +	case BPF_TRACE_FENTRY:
> +	case BPF_TRACE_FEXIT:
> +		if (!btf_type_is_func(t)) {
> +			verbose(env, "attach_btf_id %u is not a function\n",
> +				btf_id);
> +			return -EINVAL;
> +		}
> +		t = btf_type_by_id(btf_vmlinux, t->type);
> +		if (!btf_type_is_func_proto(t))
> +			return -EINVAL;
> +		tr = bpf_trampoline_lookup(btf_id);
> +		if (!tr)
> +			return -ENOMEM;
> +		prog->aux->attach_func_name = tname;
> +		prog->aux->attach_func_proto = t;
> +		if (tr->func.addr) {
> +			prog->aux->trampoline = tr;
> +			return 0;
> +		}
> +		ret = btf_distill_func_proto(&env->log, btf_vmlinux, t,
> +					     tname, &tr->func.model);
> +		if (ret < 0) {
> +			bpf_trampoline_put(tr);
> +			return ret;
> +		}
> +		addr = kallsyms_lookup_name(tname);
> +		if (!addr) {
> +			verbose(env,
> +				"The address of function %s cannot be found\n",
> +				tname);
> +			bpf_trampoline_put(tr);
> +			return -ENOENT;
> +		}
> +		tr->func.addr = (void *)addr;
> +		prog->aux->trampoline = tr;
> +		return 0;
> 	default:
> 		return -EINVAL;
> 	}
> -- 
> 2.23.0
> 

