From: Song Liu <songliubraving@fb.com>
To: Alexei Starovoitov <ast@kernel.org>
Cc: "davem@davemloft.net" <davem@davemloft.net>,
	"daniel@iogearbox.net" <daniel@iogearbox.net>,
	"x86@kernel.org" <x86@kernel.org>,
	"netdev@vger.kernel.org" <netdev@vger.kernel.org>,
	"bpf@vger.kernel.org" <bpf@vger.kernel.org>,
	Kernel Team <Kernel-team@fb.com>
Subject: Re: [PATCH v2 bpf-next 03/17] bpf: Introduce BPF trampoline
Date: Thu, 7 Nov 2019 22:37:19 +0000
Message-ID: <5967F93A-235B-447E-9B70-E7768998B718@fb.com>
In-Reply-To: <20191107054644.1285697-4-ast@kernel.org>



> On Nov 6, 2019, at 9:46 PM, Alexei Starovoitov <ast@kernel.org> wrote:
> 

[...]

> 
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> Acked-by: Andrii Nakryiko <andriin@fb.com>
> ---
> arch/x86/net/bpf_jit_comp.c | 227 ++++++++++++++++++++++++++++++--
> include/linux/bpf.h         |  98 ++++++++++++++
> include/uapi/linux/bpf.h    |   2 +
> kernel/bpf/Makefile         |   1 +
> kernel/bpf/btf.c            |  77 ++++++++++-
> kernel/bpf/core.c           |   1 +
> kernel/bpf/syscall.c        |  53 +++++++-
> kernel/bpf/trampoline.c     | 252 ++++++++++++++++++++++++++++++++++++
> kernel/bpf/verifier.c       |  39 ++++++
> 9 files changed, 732 insertions(+), 18 deletions(-)
> create mode 100644 kernel/bpf/trampoline.c
> 
> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> index 8631d3bd637f..44169e8bffc0 100644
> --- a/arch/x86/net/bpf_jit_comp.c
> +++ b/arch/x86/net/bpf_jit_comp.c
> @@ -98,6 +98,7 @@ static int bpf_size_to_x86_bytes(int bpf_size)
> 
> /* Pick a register outside of BPF range for JIT internal work */
> #define AUX_REG (MAX_BPF_JIT_REG + 1)
> +#define X86_REG_R9 (MAX_BPF_JIT_REG + 2)
> 
> /*
>  * The following table maps BPF registers to x86-64 registers.
> @@ -123,6 +124,7 @@ static const int reg2hex[] = {
> 	[BPF_REG_FP] = 5, /* RBP readonly */
> 	[BPF_REG_AX] = 2, /* R10 temp register */
> 	[AUX_REG] = 3,    /* R11 temp register */
> +	[X86_REG_R9] = 1, /* R9 register, 6th function argument */

We should also update the comment above this table, which currently says:

 * Also x86-64 register R9 is unused. ...
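
Maybe something along these lines (rough wording, feel free to adjust):

 * Also x86-64 register R9 is not used by BPF programs, but the BPF
 * trampoline uses it as a temporary register for the 6th argument of
 * the traced function.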

> };
> 
> static const int reg2pt_regs[] = {
> @@ -150,6 +152,7 @@ static bool is_ereg(u32 reg)
> 			     BIT(BPF_REG_7) |
> 			     BIT(BPF_REG_8) |
> 			     BIT(BPF_REG_9) |
> +			     BIT(X86_REG_R9) |
> 			     BIT(BPF_REG_AX));
> }
> 
> @@ -492,8 +495,8 @@ static int emit_call(u8 **pprog, void *func, void *ip)
> int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
> 		       void *old_addr, void *new_addr)
> {
> -	u8 old_insn[NOP_ATOMIC5] = {};
> -	u8 new_insn[NOP_ATOMIC5] = {};
> +	u8 old_insn[X86_CALL_SIZE] = {};
> +	u8 new_insn[X86_CALL_SIZE] = {};
> 	u8 *prog;
> 	int ret;
> 
> @@ -507,9 +510,9 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
> 		if (ret)
> 			return ret;
> 	}
> -	if (old_addr) {
> +	if (new_addr) {
> 		prog = new_insn;
> -		ret = emit_call(&prog, old_addr, (void *)ip);
> +		ret = emit_call(&prog, new_addr, (void *)ip);
> 		if (ret)
> 			return ret;
> 	}
> @@ -517,19 +520,19 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
> 	mutex_lock(&text_mutex);
> 	switch (t) {
> 	case BPF_MOD_NOP_TO_CALL:
> -		if (memcmp(ip, ideal_nops, 5))
> +		if (memcmp(ip, ideal_nops[NOP_ATOMIC5], X86_CALL_SIZE))
> 			goto out;
> -		text_poke(ip, new_insn, 5);
> +		text_poke(ip, new_insn, X86_CALL_SIZE);
> 		break;
> 	case BPF_MOD_CALL_TO_CALL:
> -		if (memcmp(ip, old_insn, 5))
> +		if (memcmp(ip, old_insn, X86_CALL_SIZE))
> 			goto out;
> -		text_poke(ip, new_insn, 5);
> +		text_poke(ip, new_insn, X86_CALL_SIZE);
> 		break;
> 	case BPF_MOD_CALL_TO_NOP:
> -		if (memcmp(ip, old_insn, 5))
> +		if (memcmp(ip, old_insn, X86_CALL_SIZE))
> 			goto out;
> -		text_poke(ip, ideal_nops, 5);
> +		text_poke(ip, ideal_nops[NOP_ATOMIC5], X86_CALL_SIZE);
> 		break;
> 	}
> 	ret = 0;
> @@ -1234,6 +1237,210 @@ xadd:			if (is_imm8(insn->off))
> 	return proglen;
> }
> 
> +static void save_regs(struct btf_func_model *m, u8 **prog, int nr_args,
> +		      int stack_size)
> +{
> +	int i;
> +	/* Store function arguments to stack.
> +	 * For a function that accepts two pointers the sequence will be:
> +	 * mov QWORD PTR [rbp-0x10],rdi
> +	 * mov QWORD PTR [rbp-0x8],rsi
> +	 */
> +	for (i = 0; i < min(nr_args, 6); i++)
> +		emit_stx(prog, bytes_to_bpf_size(m->arg_size[i]),
> +			 BPF_REG_FP,
> +			 i == 5 ? X86_REG_R9 : BPF_REG_1 + i,
> +			 -(stack_size - i * 8));
> +}
> +
> +static void restore_regs(struct btf_func_model *m, u8 **prog, int nr_args,
> +			 int stack_size)
> +{
> +	int i;
> +
> +	/* Restore function arguments from stack.
> +	 * For a function that accepts two pointers the sequence will be:
> +	 * EMIT4(0x48, 0x8B, 0x7D, 0xF0); mov rdi,QWORD PTR [rbp-0x10]
> +	 * EMIT4(0x48, 0x8B, 0x75, 0xF8); mov rsi,QWORD PTR [rbp-0x8]
> +	 */
> +	for (i = 0; i < min(nr_args, 6); i++)
> +		emit_ldx(prog, bytes_to_bpf_size(m->arg_size[i]),
> +			 i == 5 ? X86_REG_R9 : BPF_REG_1 + i,
> +			 BPF_REG_FP,
> +			 -(stack_size - i * 8));
> +}
> +
> +static int invoke_bpf(struct btf_func_model *m, u8 **pprog,
> +		      struct bpf_prog **progs, int prog_cnt, int stack_size)
> +{
> +	u8 *prog = *pprog;
> +	int cnt = 0, i;
> +
> +	for (i = 0; i < prog_cnt; i++) {
> +		if (emit_call(&prog, __bpf_prog_enter, prog))
> +			return -EINVAL;
> +		/* remember prog start time returned by __bpf_prog_enter */
> +		emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);
> +
> +		/* arg1: lea rdi, [rbp - stack_size] */
> +		EMIT4(0x48, 0x8D, 0x7D, -stack_size);
> +		/* arg2: progs[i]->insnsi for interpreter */
> +		if (!progs[i]->jited)
> +			emit_mov_imm64(&prog, BPF_REG_2,
> +				       (long) progs[i]->insnsi >> 32,
> +				       (u32) (long) progs[i]->insnsi);
> +		/* call JITed bpf program or interpreter */
> +		if (emit_call(&prog, progs[i]->bpf_func, prog))
> +			return -EINVAL;
> +
> +		/* arg1: mov rdi, progs[i] */
> +		emit_mov_imm64(&prog, BPF_REG_1, (long) progs[i] >> 32,
> +			       (u32) (long) progs[i]);
> +		/* arg2: mov rsi, rbx <- start time in nsec */
> +		emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
> +		if (emit_call(&prog, __bpf_prog_exit, prog))
> +			return -EINVAL;
> +	}
> +	*pprog = prog;
> +	return 0;
> +}
> +
> +/* Example:
> + * __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev);
> + * its 'struct btf_func_model' will be nr_args=2
> + * The assembly code when eth_type_trans is executing after trampoline:
> + *
> + * push rbp
> + * mov rbp, rsp
> + * sub rsp, 16                     // space for skb and dev
> + * push rbx                        // temp regs to pass start time
> + * mov qword ptr [rbp - 16], rdi   // save skb pointer to stack
> + * mov qword ptr [rbp - 8], rsi    // save dev pointer to stack
> + * call __bpf_prog_enter           // rcu_read_lock and preempt_disable
> + * mov rbx, rax                    // remember start time if bpf stats are enabled
> + * lea rdi, [rbp - 16]             // R1==ctx of bpf prog
> + * call addr_of_jited_FENTRY_prog
> + * movabsq rdi, 64bit_addr_of_struct_bpf_prog  // unused if bpf stats are off
> + * mov rsi, rbx                    // prog start time
> + * call __bpf_prog_exit            // rcu_read_unlock, preempt_enable and stats math
> + * mov rdi, qword ptr [rbp - 16]   // restore skb pointer from stack
> + * mov rsi, qword ptr [rbp - 8]    // restore dev pointer from stack
> + * pop rbx
> + * leave
> + * ret
> + *
> + * eth_type_trans has 5 byte nop at the beginning. These 5 bytes will be
> + * replaced with 'call generated_bpf_trampoline'. When it returns
> + * eth_type_trans will continue executing with original skb and dev pointers.
> + *
> + * The assembly code when eth_type_trans is called from trampoline:
> + *
> + * push rbp
> + * mov rbp, rsp
> + * sub rsp, 24                     // space for skb, dev, return value
> + * push rbx                        // temp regs to pass start time
> + * mov qword ptr [rbp - 24], rdi   // save skb pointer to stack
> + * mov qword ptr [rbp - 16], rsi   // save dev pointer to stack
> + * call __bpf_prog_enter           // rcu_read_lock and preempt_disable
> + * mov rbx, rax                    // remember start time if bpf stats are enabled
> + * lea rdi, [rbp - 24]             // R1==ctx of bpf prog
> + * call addr_of_jited_FENTRY_prog  // bpf prog can access skb and dev
> + * movabsq rdi, 64bit_addr_of_struct_bpf_prog  // unused if bpf stats are off
> + * mov rsi, rbx                    // prog start time
> + * call __bpf_prog_exit            // rcu_read_unlock, preempt_enable and stats math
> + * mov rdi, qword ptr [rbp - 24]   // restore skb pointer from stack
> + * mov rsi, qword ptr [rbp - 16]   // restore dev pointer from stack
> + * call eth_type_trans+5           // execute body of eth_type_trans
> + * mov qword ptr [rbp - 8], rax    // save return value
> + * call __bpf_prog_enter           // rcu_read_lock and preempt_disable
> + * mov rbx, rax                    // remember start time if bpf stats are enabled
> + * lea rdi, [rbp - 24]             // R1==ctx of bpf prog
> + * call addr_of_jited_FEXIT_prog   // bpf prog can access skb, dev, return value
> + * movabsq rdi, 64bit_addr_of_struct_bpf_prog  // unused if bpf stats are off
> + * mov rsi, rbx                    // prog start time
> + * call __bpf_prog_exit            // rcu_read_unlock, preempt_enable and stats math
> + * mov rax, qword ptr [rbp - 8]    // restore eth_type_trans's return value
> + * pop rbx
> + * leave
> + * add rsp, 8                      // skip eth_type_trans's frame
> + * ret                             // return to its caller
> + */
> +int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags,
> +				struct bpf_prog **fentry_progs, int fentry_cnt,
> +				struct bpf_prog **fexit_progs, int fexit_cnt,
> +				void *orig_call)
> +{
> +	int cnt = 0, nr_args = m->nr_args;
> +	int stack_size = nr_args * 8;
> +	u8 *prog;
> +
> +	/* x86-64 supports up to 6 arguments. 7+ can be added in the future */
> +	if (nr_args > 6)
> +		return -ENOTSUPP;
> +
> +	if ((flags & BPF_TRAMP_F_RESTORE_REGS) &&
> +	    (flags & BPF_TRAMP_F_SKIP_FRAME))
> +		return -EINVAL;
> +
> +	if (flags & BPF_TRAMP_F_CALL_ORIG)
> +		stack_size += 8; /* room for return value of orig_call */
> +
> +	if (flags & BPF_TRAMP_F_SKIP_FRAME)
> +		/* skip patched call instruction and point orig_call to actual
> +		 * body of the kernel function.
> +		 */
> +		orig_call += X86_CALL_SIZE;
> +
> +	prog = image;
> +
> +	EMIT1(0x55);		 /* push rbp */
> +	EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
> +	EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */
> +	EMIT1(0x53);		 /* push rbx */
> +
> +	save_regs(m, &prog, nr_args, stack_size);
> +
> +	if (fentry_cnt)
> +		if (invoke_bpf(m, &prog, fentry_progs, fentry_cnt, stack_size))
> +			return -EINVAL;
> +
> +	if (flags & BPF_TRAMP_F_CALL_ORIG) {
> +		if (fentry_cnt)
> +			restore_regs(m, &prog, nr_args, stack_size);
> +
> +		/* call original function */
> +		if (emit_call(&prog, orig_call, prog))
> +			return -EINVAL;
> +		/* remember return value in a stack for bpf prog to access */
> +		emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
> +	}
> +
> +	if (fexit_cnt)
> +		if (invoke_bpf(m, &prog, fexit_progs, fexit_cnt, stack_size))
> +			return -EINVAL;
> +
> +	if (flags & BPF_TRAMP_F_RESTORE_REGS)
> +		restore_regs(m, &prog, nr_args, stack_size);
> +
> +	if (flags & BPF_TRAMP_F_CALL_ORIG)
> +		/* restore original return value back into RAX */
> +		emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8);
> +
> +	EMIT1(0x5B); /* pop rbx */
> +	EMIT1(0xC9); /* leave */
> +	if (flags & BPF_TRAMP_F_SKIP_FRAME)
> +		/* skip our return address and return to parent */
> +		EMIT4(0x48, 0x83, 0xC4, 8); /* add rsp, 8 */
> +	EMIT1(0xC3); /* ret */
> +	/* One half of the page has active running trampoline.
> +	 * Another half is an area for next trampoline.
> +	 * Make sure the trampoline generation logic doesn't overflow.
> +	 */
> +	if (WARN_ON_ONCE(prog - (u8 *)image > PAGE_SIZE / 2 - BPF_INSN_SAFETY))
> +		return -EFAULT;

Given the max number of args, can we catch this error at compile time?
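E.g. something roughly like this in init_trampolines() (sketch only: 50 is
the hand-estimated bytes-per-prog from the comment above
BPF_MAX_TRAMP_PROGS, and it ignores the prologue/epilogue and
BPF_INSN_SAFETY, so the runtime check is probably still worth keeping):

	/* ~50 bytes emitted per attached prog on x86 (rough estimate) */
	BUILD_BUG_ON(BPF_MAX_TRAMP_PROGS * 50 > PAGE_SIZE / 2);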

> +	return 0;
> +}
> +
> struct x64_jit_data {
> 	struct bpf_binary_header *header;
> 	int *addrs;
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 8b90db25348a..9206b7e86dde 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -14,6 +14,7 @@
> #include <linux/numa.h>
> #include <linux/wait.h>
> #include <linux/u64_stats_sync.h>
> +#include <linux/refcount.h>
> 
> struct bpf_verifier_env;
> struct bpf_verifier_log;
> @@ -384,6 +385,94 @@ struct bpf_prog_stats {
> 	struct u64_stats_sync syncp;
> } __aligned(2 * sizeof(u64));
> 
> +struct btf_func_model {
> +	u8 ret_size;
> +	u8 nr_args;
> +	u8 arg_size[MAX_BPF_FUNC_ARGS];
> +};
> +
> +/* Restore arguments before returning from trampoline to let original function
> + * continue executing. This flag is used for fentry progs when there are no
> + * fexit progs.
> + */
> +#define BPF_TRAMP_F_RESTORE_REGS	BIT(0)
> +/* Call original function after fentry progs, but before fexit progs.
> + * Makes sense for fentry/fexit, normal calls and indirect calls.
> + */
> +#define BPF_TRAMP_F_CALL_ORIG		BIT(1)
> +/* Skip current frame and return to parent.  Makes sense for fentry/fexit
> + * programs only. Should not be used with normal calls and indirect calls.
> + */
> +#define BPF_TRAMP_F_SKIP_FRAME		BIT(2)
> +
> +/* Different use cases for BPF trampoline:
> + * 1. replace nop at the function entry (kprobe equivalent)
> + *    flags = BPF_TRAMP_F_RESTORE_REGS
> + *    fentry = a set of programs to run before returning from trampoline
> + *
> + * 2. replace nop at the function entry (kprobe + kretprobe equivalent)
> + *    flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME
> + *    orig_call = fentry_ip + MCOUNT_INSN_SIZE
> + *    fentry = a set of programs to run before calling original function
> + *    fexit = a set of programs to run after original function
> + *
> + * 3. replace direct call instruction anywhere in the function body
> + *    or assign a function pointer for indirect call (like tcp_congestion_ops->cong_avoid)
> + *    With flags = 0
> + *      fentry = a set of programs to run before returning from trampoline
> + *    With flags = BPF_TRAMP_F_CALL_ORIG
> + *      orig_call = original callback addr or direct function addr
> + *      fentry = a set of programs to run before calling original function
> + *      fexit = a set of programs to run after original function
> + */
> +int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags,
> +				struct bpf_prog **fentry_progs, int fentry_cnt,
> +				struct bpf_prog **fexit_progs, int fexit_cnt,
> +				void *orig_call);
> +/* these two functions are called from generated trampoline */
> +u64 notrace __bpf_prog_enter(void);
> +void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start);
> +
> +enum bpf_tramp_prog_type {
> +	BPF_TRAMP_FENTRY,
> +	BPF_TRAMP_FEXIT,
> +	BPF_TRAMP_MAX
> +};
> +
> +struct bpf_trampoline {
> +	struct hlist_node hlist;
> +	refcount_t refcnt;
> +	u64 key;
> +	struct {
> +		struct btf_func_model model;
> +		void *addr;
> +	} func;
> +	struct hlist_head progs_hlist[BPF_TRAMP_MAX];
> +	int progs_cnt[BPF_TRAMP_MAX];
> +	void *image;
> +	u64 selector;
> +};
> +#ifdef CONFIG_BPF_JIT
> +struct bpf_trampoline *bpf_trampoline_lookup(u64 key);
> +int bpf_trampoline_link_prog(struct bpf_prog *prog);
> +int bpf_trampoline_unlink_prog(struct bpf_prog *prog);
> +void bpf_trampoline_put(struct bpf_trampoline *tr);
> +#else
> +static inline struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
> +{
> +	return NULL;
> +}
> +static inline int bpf_trampoline_link_prog(struct bpf_prog *prog)
> +{
> +	return -ENOTSUPP;
> +}
> +static inline int bpf_trampoline_unlink_prog(struct bpf_prog *prog)
> +{
> +	return -ENOTSUPP;
> +}
> +static inline void bpf_trampoline_put(struct bpf_trampoline *tr) {}
> +#endif
> +
> struct bpf_prog_aux {
> 	atomic_t refcnt;
> 	u32 used_map_cnt;
> @@ -398,6 +487,9 @@ struct bpf_prog_aux {
> 	bool verifier_zext; /* Zero extensions has been inserted by verifier. */
> 	bool offload_requested;
> 	bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */
> +	enum bpf_tramp_prog_type trampoline_prog_type;
> +	struct bpf_trampoline *trampoline;
> +	struct hlist_node tramp_hlist;
> 	/* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
> 	const struct btf_type *attach_func_proto;
> 	/* function name for valid attach_btf_id */
> @@ -784,6 +876,12 @@ int btf_struct_access(struct bpf_verifier_log *log,
> 		      u32 *next_btf_id);
> u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *, int);
> 
> +int btf_distill_func_proto(struct bpf_verifier_log *log,
> +			   struct btf *btf,
> +			   const struct btf_type *func_proto,
> +			   const char *func_name,
> +			   struct btf_func_model *m);
> +
> #else /* !CONFIG_BPF_SYSCALL */
> static inline struct bpf_prog *bpf_prog_get(u32 ufd)
> {
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index df6809a76404..69c200e6e696 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -201,6 +201,8 @@ enum bpf_attach_type {
> 	BPF_CGROUP_GETSOCKOPT,
> 	BPF_CGROUP_SETSOCKOPT,
> 	BPF_TRACE_RAW_TP,
> +	BPF_TRACE_FENTRY,
> +	BPF_TRACE_FEXIT,
> 	__MAX_BPF_ATTACH_TYPE
> };
> 
> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
> index e1d9adb212f9..3f671bf617e8 100644
> --- a/kernel/bpf/Makefile
> +++ b/kernel/bpf/Makefile
> @@ -6,6 +6,7 @@ obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
> obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
> obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
> obj-$(CONFIG_BPF_SYSCALL) += disasm.o
> +obj-$(CONFIG_BPF_JIT) += trampoline.o
> obj-$(CONFIG_BPF_SYSCALL) += btf.o
> ifeq ($(CONFIG_NET),y)
> obj-$(CONFIG_BPF_SYSCALL) += devmap.o
> diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> index 128d89601d73..78d9ceaabc00 100644
> --- a/kernel/bpf/btf.c
> +++ b/kernel/bpf/btf.c
> @@ -3441,13 +3441,18 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
> 		args++;
> 		nr_args--;
> 	}
> -	if (arg >= nr_args) {
> +
> +	if (prog->expected_attach_type == BPF_TRACE_FEXIT &&
> +	    arg == nr_args) {
> +		/* function return type */
> +		t = btf_type_by_id(btf_vmlinux, t->type);
> +	} else if (arg >= nr_args) {
> 		bpf_log(log, "func '%s' doesn't have %d-th argument\n",
> -			tname, arg);
> +			tname, arg + 1);
> 		return false;
> +	} else {
> +		t = btf_type_by_id(btf_vmlinux, args[arg].type);
> 	}
> -
> -	t = btf_type_by_id(btf_vmlinux, args[arg].type);
> 	/* skip modifiers */
> 	while (btf_type_is_modifier(t))
> 		t = btf_type_by_id(btf_vmlinux, t->type);
> @@ -3647,6 +3652,70 @@ u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *fn, int arg)
> 	return btf_id;
> }
> 
> +static int __get_type_size(struct btf *btf, u32 btf_id,
> +			   const struct btf_type **bad_type)
> +{
> +	const struct btf_type *t;
> +
> +	if (!btf_id)
> +		/* void */
> +		return 0;
> +	t = btf_type_by_id(btf, btf_id);
> +	while (t && btf_type_is_modifier(t))
> +		t = btf_type_by_id(btf, t->type);
> +	if (!t)
> +		return -EINVAL;
> +	if (btf_type_is_ptr(t))
> +		/* kernel size of pointer. Not BPF's size of pointer*/
> +		return sizeof(void *);
> +	if (btf_type_is_int(t) || btf_type_is_enum(t))
> +		return t->size;
> +	*bad_type = t;
> +	return -EINVAL;
> +}
> +
> +int btf_distill_func_proto(struct bpf_verifier_log *log,
> +			   struct btf *btf,
> +			   const struct btf_type *func,
> +			   const char *tname,
> +			   struct btf_func_model *m)
> +{
> +	const struct btf_param *args;
> +	const struct btf_type *t;
> +	u32 i, nargs;
> +	int ret;
> +
> +	args = (const struct btf_param *)(func + 1);
> +	nargs = btf_type_vlen(func);
> +	if (nargs >= MAX_BPF_FUNC_ARGS) {
> +		bpf_log(log,
> +			"The function %s has %d arguments. Too many.\n",
> +			tname, nargs);
> +		return -EINVAL;
> +	}
> +	ret = __get_type_size(btf, func->type, &t);
> +	if (ret < 0) {
> +		bpf_log(log,
> +			"The function %s return type %s is unsupported.\n",
> +			tname, btf_kind_str[BTF_INFO_KIND(t->info)]);
> +		return -EINVAL;
> +	}
> +	m->ret_size = ret;
> +
> +	for (i = 0; i < nargs; i++) {
> +		ret = __get_type_size(btf, args[i].type, &t);
> +		if (ret < 0) {
> +			bpf_log(log,
> +				"The function %s arg%d type %s is unsupported.\n",
> +				tname, i, btf_kind_str[BTF_INFO_KIND(t->info)]);
> +			return -EINVAL;
> +		}
> +		m->arg_size[i] = ret;
> +	}
> +	m->nr_args = nargs;
> +	return 0;
> +}
> +
> void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
> 		       struct seq_file *m)
> {
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 856564e97943..7352f5514e57 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -2011,6 +2011,7 @@ static void bpf_prog_free_deferred(struct work_struct *work)
> 	if (aux->prog->has_callchain_buf)
> 		put_callchain_buffers();
> #endif
> +	bpf_trampoline_put(aux->trampoline);
> 	for (i = 0; i < aux->func_cnt; i++)
> 		bpf_jit_free(aux->func[i]);
> 	if (aux->func_cnt) {
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 6d9ce95e5a8d..e2e37bea86bc 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -1799,6 +1799,49 @@ static int bpf_obj_get(const union bpf_attr *attr)
> 				attr->file_flags);
> }
> 
> +static int bpf_tracing_prog_release(struct inode *inode, struct file *filp)
> +{
> +	struct bpf_prog *prog = filp->private_data;
> +
> +	WARN_ON_ONCE(bpf_trampoline_unlink_prog(prog));
> +	bpf_prog_put(prog);
> +	return 0;
> +}
> +
> +static const struct file_operations bpf_tracing_prog_fops = {
> +	.release	= bpf_tracing_prog_release,
> +	.read		= bpf_dummy_read,
> +	.write		= bpf_dummy_write,
> +};
> +
> +static int bpf_tracing_prog_attach(struct bpf_prog *prog)
> +{
> +	int tr_fd, err;
> +
> +	if (prog->expected_attach_type != BPF_TRACE_FENTRY &&
> +	    prog->expected_attach_type != BPF_TRACE_FEXIT) {
> +		err = -EINVAL;
> +		goto out_put_prog;
> +	}
> +
> +	err = bpf_trampoline_link_prog(prog);
> +	if (err)
> +		goto out_put_prog;
> +
> +	tr_fd = anon_inode_getfd("bpf-tracing-prog", &bpf_tracing_prog_fops,
> +				 prog, O_CLOEXEC);
> +	if (tr_fd < 0) {
> +		WARN_ON_ONCE(bpf_trampoline_unlink_prog(prog));
> +		err = tr_fd;
> +		goto out_put_prog;
> +	}
> +	return tr_fd;
> +
> +out_put_prog:
> +	bpf_prog_put(prog);
> +	return err;
> +}
> +
> struct bpf_raw_tracepoint {
> 	struct bpf_raw_event_map *btp;
> 	struct bpf_prog *prog;
> @@ -1850,14 +1893,16 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
> 
> 	if (prog->type == BPF_PROG_TYPE_TRACING) {
> 		if (attr->raw_tracepoint.name) {
> -			/* raw_tp name should not be specified in raw_tp
> -			 * programs that were verified via in-kernel BTF info
> +			/* The attach point for this category of programs
> +			 * should be specified via btf_id during program load.
> 			 */
> 			err = -EINVAL;
> 			goto out_put_prog;
> 		}
> -		/* raw_tp name is taken from type name instead */
> -		tp_name = prog->aux->attach_func_name;
> +		if (prog->expected_attach_type == BPF_TRACE_RAW_TP)
> +			tp_name = prog->aux->attach_func_name;
> +		else
> +			return bpf_tracing_prog_attach(prog);
> 	} else {
> 		if (strncpy_from_user(buf,
> 				      u64_to_user_ptr(attr->raw_tracepoint.name),
> diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
> new file mode 100644
> index 000000000000..214c964cfd72
> --- /dev/null
> +++ b/kernel/bpf/trampoline.c
> @@ -0,0 +1,252 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (c) 2019 Facebook */
> +#include <linux/hash.h>
> +#include <linux/bpf.h>
> +#include <linux/filter.h>
> +
> +/* btf_vmlinux has ~22k attachable functions. 1k htab is enough. */
> +#define TRAMPOLINE_HASH_BITS 10
> +#define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS)
> +
> +static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
> +
> +static DEFINE_MUTEX(trampoline_mutex);
> +
> +struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
> +{
> +	struct bpf_trampoline *tr;
> +	struct hlist_head *head;
> +	void *image;
> +	int i;
> +
> +	mutex_lock(&trampoline_mutex);
> +	head = &trampoline_table[hash_64(key, TRAMPOLINE_HASH_BITS)];
> +	hlist_for_each_entry(tr, head, hlist) {
> +		if (tr->key == key) {
> +			refcount_inc(&tr->refcnt);
> +			goto out;
> +		}
> +	}
> +	tr = kzalloc(sizeof(*tr), GFP_KERNEL);
> +	if (!tr)
> +		goto out;
> +
> +	/* is_root was checked earlier. No need for bpf_jit_charge_modmem() */
> +	image = bpf_jit_alloc_exec(PAGE_SIZE);
> +	if (!image) {
> +		kfree(tr);
> +		tr = NULL;
> +		goto out;
> +	}
> +
> +	tr->key = key;
> +	INIT_HLIST_NODE(&tr->hlist);
> +	hlist_add_head(&tr->hlist, head);
> +	refcount_set(&tr->refcnt, 1);
> +	for (i = 0; i < BPF_TRAMP_MAX; i++)
> +		INIT_HLIST_HEAD(&tr->progs_hlist[i]);
> +
> +	set_vm_flush_reset_perms(image);
> +	/* Keep image as writeable. The alternative is to keep flipping ro/rw
> +	 * everytime new program is attached or detached.
> +	 */
> +	set_memory_x((long)image, 1);
> +	tr->image = image;
> +out:
> +	mutex_unlock(&trampoline_mutex);
> +	return tr;
> +}
> +
> +/* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50
> + * bytes on x86.  Pick a number to fit into PAGE_SIZE / 2
> + */
> +#define BPF_MAX_TRAMP_PROGS 40
> +
> +static int bpf_trampoline_update(struct bpf_prog *prog)

It looks like "prog" is only used here to fetch prog->aux->trampoline. Could we pass the trampoline in directly?
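E.g. (just a sketch of what I mean; callers would pass prog->aux->trampoline):

	static int bpf_trampoline_update(struct bpf_trampoline *tr)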

> +{
> +	struct bpf_trampoline *tr = prog->aux->trampoline;
> +	void *old_image = tr->image + ((tr->selector + 1) & 1) * PAGE_SIZE/2;
> +	void *new_image = tr->image + (tr->selector & 1) * PAGE_SIZE/2;
> +	struct bpf_prog *progs_to_run[BPF_MAX_TRAMP_PROGS];
> +	int fentry_cnt = tr->progs_cnt[BPF_TRAMP_FENTRY];
> +	int fexit_cnt = tr->progs_cnt[BPF_TRAMP_FEXIT];
> +	struct bpf_prog **progs, **fentry, **fexit;
> +	u32 flags = BPF_TRAMP_F_RESTORE_REGS;
> +	struct bpf_prog_aux *aux;
> +	int err;
> +
> +	if (fentry_cnt + fexit_cnt == 0) {
> +		err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL_TO_NOP,
> +					 old_image, NULL);
> +		tr->selector = 0;
> +		goto out;
> +	}
> +
> +	/* populate fentry progs */
> +	fentry = progs = progs_to_run;
> +	hlist_for_each_entry(aux, &tr->progs_hlist[BPF_TRAMP_FENTRY], tramp_hlist)
> +		*progs++ = aux->prog;
> +
> +	/* populate fexit progs */
> +	fexit = progs;
> +	hlist_for_each_entry(aux, &tr->progs_hlist[BPF_TRAMP_FEXIT], tramp_hlist)
> +		*progs++ = aux->prog;
> +
> +	if (fexit_cnt)
> +		flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
> +
> +	err = arch_prepare_bpf_trampoline(new_image, &tr->func.model, flags,
> +					  fentry, fentry_cnt,
> +					  fexit, fexit_cnt,
> +					  tr->func.addr);
> +	if (err)
> +		goto out;
> +
> +	if (tr->selector)
> +		/* progs already running at this address */
> +		err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL_TO_CALL,
> +					 old_image, new_image);
> +	else
> +		/* first time registering */
> +		err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_NOP_TO_CALL,
> +					 NULL, new_image);
> +
> +	if (err)
> +		goto out;
> +	tr->selector++;

Shall we do selector-- for unlink?

> +out:
> +	return err;
> +}
> +
> +static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(enum bpf_attach_type t)
> +{
> +	switch (t) {
> +	case BPF_TRACE_FENTRY:
> +		return BPF_TRAMP_FENTRY;
> +	default:
> +		return BPF_TRAMP_FEXIT;
> +	}
> +}
> +
> +int bpf_trampoline_link_prog(struct bpf_prog *prog)
> +{
> +	enum bpf_tramp_prog_type kind;
> +	struct bpf_trampoline *tr;
> +	int err = 0;
> +
> +	tr = prog->aux->trampoline;
> +	kind = bpf_attach_type_to_tramp(prog->expected_attach_type);
> +	mutex_lock(&trampoline_mutex);
> +	if (tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT]
> +	    >= BPF_MAX_TRAMP_PROGS) {
> +		err = -E2BIG;
> +		goto out;
> +	}
> +	if (!hlist_unhashed(&prog->aux->tramp_hlist)) {
> +		/* prog already linked */
> +		err = -EBUSY;
> +		goto out;
> +	}
> +	hlist_add_head(&prog->aux->tramp_hlist, &tr->progs_hlist[kind]);
> +	tr->progs_cnt[kind]++;
> +	err = bpf_trampoline_update(prog);
> +	if (err) {
> +		hlist_del(&prog->aux->tramp_hlist);
> +		tr->progs_cnt[kind]--;
> +	}
> +out:
> +	mutex_unlock(&trampoline_mutex);
> +	return err;
> +}
> +
> +/* bpf_trampoline_unlink_prog() should never fail. */
> +int bpf_trampoline_unlink_prog(struct bpf_prog *prog)
> +{
> +	enum bpf_tramp_prog_type kind;
> +	struct bpf_trampoline *tr;
> +	int err;
> +
> +	tr = prog->aux->trampoline;
> +	kind = bpf_attach_type_to_tramp(prog->expected_attach_type);
> +	mutex_lock(&trampoline_mutex);
> +	hlist_del(&prog->aux->tramp_hlist);
> +	tr->progs_cnt[kind]--;
> +	err = bpf_trampoline_update(prog);
> +	mutex_unlock(&trampoline_mutex);
> +	return err;
> +}
> +
> +void bpf_trampoline_put(struct bpf_trampoline *tr)
> +{
> +	if (!tr)
> +		return;
> +	mutex_lock(&trampoline_mutex);
> +	if (!refcount_dec_and_test(&tr->refcnt))
> +		goto out;
> +	if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FENTRY])))
> +		goto out;
> +	if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT])))
> +		goto out;
> +	bpf_jit_free_exec(tr->image);
> +	hlist_del(&tr->hlist);
> +	kfree(tr);
> +out:
> +	mutex_unlock(&trampoline_mutex);
> +}
> +
> +/* The logic is similar to BPF_PROG_RUN, but with explicit rcu and preempt that
> + * are needed for trampoline. The macro is split into
> + * call _bpf_prog_enter
> + * call prog->bpf_func
> + * call __bpf_prog_exit
> + */
> +u64 notrace __bpf_prog_enter(void)
> +{
> +	u64 start = 0;
> +
> +	rcu_read_lock();
> +	preempt_disable();
> +	if (static_branch_unlikely(&bpf_stats_enabled_key))
> +		start = sched_clock();
> +	return start;
> +}
> +
> +void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
> +{
> +	struct bpf_prog_stats *stats;
> +
> +	if (static_branch_unlikely(&bpf_stats_enabled_key) &&
> +	    /* static_key could be enabled in __bpf_prog_enter
> +	     * and disabled in __bpf_prog_exit.
> +	     * And vice versa.
> +	     * Hence check that 'start' is not zero.
> +	     */
> +	    start) {
> +		stats = this_cpu_ptr(prog->aux->stats);
> +		u64_stats_update_begin(&stats->syncp);
> +		stats->cnt++;
> +		stats->nsecs += sched_clock() - start;
> +		u64_stats_update_end(&stats->syncp);
> +	}
> +	preempt_enable();
> +	rcu_read_unlock();
> +}
> +
> +int __weak
> +arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags,
> +			    struct bpf_prog **fentry_progs, int fentry_cnt,
> +			    struct bpf_prog **fexit_progs, int fexit_cnt,
> +			    void *orig_call)
> +{
> +	return -ENOTSUPP;
> +}
> +
> +static int __init init_trampolines(void)
> +{
> +	int i;
> +
> +	for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++)
> +		INIT_HLIST_HEAD(&trampoline_table[i]);
> +	return 0;
> +}
> +late_initcall(init_trampolines);
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 2f2374967b36..36542edd4936 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -9382,8 +9382,11 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
> 	struct bpf_prog *prog = env->prog;
> 	u32 btf_id = prog->aux->attach_btf_id;
> 	const char prefix[] = "btf_trace_";
> +	struct bpf_trampoline *tr;
> 	const struct btf_type *t;
> 	const char *tname;
> +	long addr;
> +	int ret;
> 
> 	if (prog->type != BPF_PROG_TYPE_TRACING)
> 		return 0;
> @@ -9432,6 +9435,42 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
> 		prog->aux->attach_func_proto = t;
> 		prog->aux->attach_btf_trace = true;
> 		return 0;
> +	case BPF_TRACE_FENTRY:
> +	case BPF_TRACE_FEXIT:
> +		if (!btf_type_is_func(t)) {
> +			verbose(env, "attach_btf_id %u is not a function\n",
> +				btf_id);
> +			return -EINVAL;
> +		}
> +		t = btf_type_by_id(btf_vmlinux, t->type);
> +		if (!btf_type_is_func_proto(t))
> +			return -EINVAL;
> +		tr = bpf_trampoline_lookup(btf_id);
> +		if (!tr)
> +			return -ENOMEM;
> +		prog->aux->attach_func_name = tname;
> +		prog->aux->attach_func_proto = t;
> +		if (tr->func.addr) {
> +			prog->aux->trampoline = tr;
> +			return 0;
> +		}
> +		ret = btf_distill_func_proto(&env->log, btf_vmlinux, t,
> +					     tname, &tr->func.model);
> +		if (ret < 0) {
> +			bpf_trampoline_put(tr);
> +			return ret;
> +		}
> +		addr = kallsyms_lookup_name(tname);
> +		if (!addr) {
> +			verbose(env,
> +				"The address of function %s cannot be found\n",
> +				tname);
> +			bpf_trampoline_put(tr);
> +			return -ENOENT;
> +		}
> +		tr->func.addr = (void *)addr;
> +		prog->aux->trampoline = tr;
> +		return 0;
> 	default:
> 		return -EINVAL;
> 	}
> -- 
> 2.23.0
> 


Thread overview: 45+ messages
2019-11-07  5:46 [PATCH v2 bpf-next 00/17] Introduce BPF trampoline Alexei Starovoitov
2019-11-07  5:46 ` [PATCH v2 bpf-next 01/17] bpf: refactor x86 JIT into helpers Alexei Starovoitov
2019-11-07 17:05   ` Song Liu
2019-11-07  5:46 ` [PATCH v2 bpf-next 02/17] bpf: Add bpf_arch_text_poke() helper Alexei Starovoitov
2019-11-07 17:20   ` Song Liu
2019-11-07 17:50     ` Alexei Starovoitov
2019-11-07 17:54       ` Alexei Starovoitov
2019-11-07 18:04         ` Song Liu
2019-11-07  5:46 ` [PATCH v2 bpf-next 03/17] bpf: Introduce BPF trampoline Alexei Starovoitov
2019-11-07 22:37   ` Song Liu [this message]
2019-11-07 22:55     ` Alexei Starovoitov
2019-11-07 23:07       ` Song Liu
2019-11-07 23:09         ` Alexei Starovoitov
2019-11-07 23:16           ` Song Liu
2019-11-08  0:09             ` Alexei Starovoitov
2019-11-08  1:10               ` Song Liu
2019-11-08  3:11                 ` Alexei Starovoitov
2019-11-08  4:06                   ` Song Liu
2019-11-08  4:08                     ` Alexei Starovoitov
2019-11-07  5:46 ` [PATCH v2 bpf-next 04/17] libbpf: Add support to attach to fentry/fexit tracing progs Alexei Starovoitov
2019-11-07 22:51   ` Song Liu
2019-11-07 23:07     ` Alexei Starovoitov
2019-11-07 23:13       ` Song Liu
2019-11-07  5:46 ` [PATCH v2 bpf-next 05/17] selftest/bpf: Simple test for fentry/fexit Alexei Starovoitov
2019-11-07 23:22   ` Song Liu
2019-11-07  5:46 ` [PATCH v2 bpf-next 06/17] bpf: Add kernel test functions for fentry testing Alexei Starovoitov
2019-11-07 23:28   ` Song Liu
2019-11-07  5:46 ` [PATCH v2 bpf-next 07/17] selftests/bpf: Add test for BPF trampoline Alexei Starovoitov
2019-11-08  1:17   ` Song Liu
2019-11-08  2:33     ` Alexei Starovoitov
2019-11-07  5:46 ` [PATCH v2 bpf-next 08/17] selftests/bpf: Add fexit tests " Alexei Starovoitov
2019-11-08  1:22   ` Song Liu
2019-11-07  5:46 ` [PATCH v2 bpf-next 09/17] selftests/bpf: Add combined fentry/fexit test Alexei Starovoitov
2019-11-07  5:46 ` [PATCH v2 bpf-next 10/17] selftests/bpf: Add stress test for maximum number of progs Alexei Starovoitov
2019-11-07  5:46 ` [PATCH v2 bpf-next 11/17] bpf: Reserver space for BPF trampoline in BPF programs Alexei Starovoitov
2019-11-08  5:03   ` Andrii Nakryiko
2019-11-07  5:46 ` [PATCH v2 bpf-next 12/17] bpf: Fix race in btf_resolve_helper_id() Alexei Starovoitov
2019-11-08  5:13   ` Andrii Nakryiko
2019-11-08  5:31     ` Alexei Starovoitov
2019-11-07  5:46 ` [PATCH v2 bpf-next 13/17] bpf: Compare BTF types of functions arguments with actual types Alexei Starovoitov
2019-11-07  5:46 ` [PATCH v2 bpf-next 14/17] bpf: Support attaching tracing BPF program to other BPF programs Alexei Starovoitov
2019-11-07  5:46 ` [PATCH v2 bpf-next 15/17] selftests/bpf: Extend test_pkt_access test Alexei Starovoitov
2019-11-07  5:46 ` [PATCH v2 bpf-next 16/17] libbpf: Add support for attaching BPF programs to other BPF programs Alexei Starovoitov
2019-11-07 14:33   ` Alexei Starovoitov
2019-11-07  5:46 ` [PATCH v2 bpf-next 17/17] selftests/bpf: Add a test for attaching BPF prog to another BPF prog and subprog Alexei Starovoitov
