bpf.vger.kernel.org archive mirror
* [PATCH rfc bpf-next 0/8] Optimize BPF tail calls for direct jumps
@ 2019-11-15  1:03 Daniel Borkmann
  2019-11-15  1:03 ` [PATCH rfc bpf-next 1/8] bpf, x86: generalize and extend bpf_arch_text_poke " Daniel Borkmann
                   ` (7 more replies)
  0 siblings, 8 replies; 28+ messages in thread
From: Daniel Borkmann @ 2019-11-15  1:03 UTC (permalink / raw)
  To: ast; +Cc: john.fastabend, netdev, bpf, Daniel Borkmann

This gets rid of indirect jumps for BPF tail calls whenever possible.
See patch 7/8 for the general details. This is on top of Alexei's
'[v4,bpf-next,00/20] Introduce BPF trampoline' series [0]. For the
non-RFC version I'll still massage the commit messages a bit and expand
the existing set of tail call tests with a few more kselftest cases.
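
To give an idea of the case this series targets, a typical constant-key
tail call site in BPF C looks roughly like the below (sketch only,
assuming the usual selftests-style bpf_helpers.h; names are made up):

  #include <linux/bpf.h>
  #include "bpf_helpers.h"

  struct bpf_map_def SEC("maps") jmp_table = {
          .type        = BPF_MAP_TYPE_PROG_ARRAY,
          .key_size    = sizeof(__u32),
          .value_size  = sizeof(__u32),
          .max_entries = 8,
  };

  SEC("classifier")
  int entry(struct __sk_buff *skb)
  {
          /* Key 3 is constant and known at verification time, so the
           * retpolined indirect jump can become a direct jmp (or a nop
           * when the slot is empty).
           */
          bpf_tail_call(skb, &jmp_table, 3);
          /* Fall-through when slot 3 is empty. */
          return 0;
  }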

Thanks,
Daniel

  [0] https://patchwork.ozlabs.org/project/netdev/list/?series=142923

Daniel Borkmann (8):
  bpf, x86: generalize and extend bpf_arch_text_poke for direct jumps
  bpf: add bpf_prog_under_eviction helper
  bpf: move bpf_free_used_maps into sleepable section
  bpf: move owner type,jited info into array auxiliary data
  bpf: add jit poke descriptor mock-up for jit images
  bpf: add poke dependency tracking for prog array maps
  bpf, x86: emit patchable direct jump as tail call
  bpf: constant map key tracking for prog array pokes

 arch/x86/net/bpf_jit_comp.c  | 234 ++++++++++++++++++++++++-----------
 include/linux/bpf.h          |  85 +++++++++++--
 include/linux/bpf_verifier.h |   1 +
 include/linux/filter.h       |  10 ++
 kernel/bpf/arraymap.c        | 152 ++++++++++++++++++++++-
 kernel/bpf/core.c            |  73 ++++++++++-
 kernel/bpf/map_in_map.c      |   5 +-
 kernel/bpf/syscall.c         |  41 ++----
 kernel/bpf/verifier.c        |  98 +++++++++++++++
 9 files changed, 578 insertions(+), 121 deletions(-)

-- 
2.21.0



* [PATCH rfc bpf-next 1/8] bpf, x86: generalize and extend bpf_arch_text_poke for direct jumps
  2019-11-15  1:03 [PATCH rfc bpf-next 0/8] Optimize BPF tail calls for direct jumps Daniel Borkmann
@ 2019-11-15  1:03 ` Daniel Borkmann
  2019-11-15 23:22   ` Andrii Nakryiko
  2019-11-15  1:03 ` [PATCH rfc bpf-next 2/8] bpf: add bpf_prog_under_eviction helper Daniel Borkmann
                   ` (6 subsequent siblings)
  7 siblings, 1 reply; 28+ messages in thread
From: Daniel Borkmann @ 2019-11-15  1:03 UTC (permalink / raw)
  To: ast; +Cc: john.fastabend, netdev, bpf, Daniel Borkmann

Add BPF_MOD_{NOP_TO_JUMP,JUMP_TO_JUMP,JUMP_TO_NOP} patching for the
x86 JIT in order to be able to patch direct jumps or nop them out. We
need this facility in order to patch tail call jumps and, in later
work, also BPF static keys.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
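Usage sketch (not part of this patch): flipping a previously emitted
5-byte patchable nop inside an image to a direct jump and back could
look roughly like this, with ip pointing to the nop location:

  int err;

  /* nop -> jmp new_addr; for the NOP_TO_* cases old_addr is unused. */
  err = bpf_arch_text_poke(ip, BPF_MOD_NOP_TO_JUMP, NULL, new_addr);
  if (err)
          return err;
  /* ... direct jump is live at this point ... */

  /* jmp new_addr -> nop again; old_addr is verified against the text. */
  err = bpf_arch_text_poke(ip, BPF_MOD_JUMP_TO_NOP, new_addr, NULL);
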
 arch/x86/net/bpf_jit_comp.c | 64 ++++++++++++++++++++++++++-----------
 include/linux/bpf.h         |  6 ++++
 2 files changed, 52 insertions(+), 18 deletions(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 2e586f579945..66921f2aeece 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -203,8 +203,9 @@ struct jit_context {
 /* Maximum number of bytes emitted while JITing one eBPF insn */
 #define BPF_MAX_INSN_SIZE	128
 #define BPF_INSN_SAFETY		64
-/* number of bytes emit_call() needs to generate call instruction */
-#define X86_CALL_SIZE		5
+
+/* Number of bytes emit_patchable() needs to generate instructions */
+#define X86_PATCHABLE_SIZE	5
 
 #define PROLOGUE_SIZE		25
 
@@ -215,7 +216,7 @@ struct jit_context {
 static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
 {
 	u8 *prog = *pprog;
-	int cnt = X86_CALL_SIZE;
+	int cnt = X86_PATCHABLE_SIZE;
 
 	/* BPF trampoline can be made to work without these nops,
 	 * but let's waste 5 bytes for now and optimize later
@@ -480,64 +481,91 @@ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
 	*pprog = prog;
 }
 
-static int emit_call(u8 **pprog, void *func, void *ip)
+static int emit_patchable(u8 **pprog, void *func, void *ip, u8 b1)
 {
 	u8 *prog = *pprog;
 	int cnt = 0;
 	s64 offset;
 
-	offset = func - (ip + X86_CALL_SIZE);
+	offset = func - (ip + X86_PATCHABLE_SIZE);
 	if (!is_simm32(offset)) {
 		pr_err("Target call %p is out of range\n", func);
 		return -EINVAL;
 	}
-	EMIT1_off32(0xE8, offset);
+	EMIT1_off32(b1, offset);
 	*pprog = prog;
 	return 0;
 }
 
+static int emit_call(u8 **pprog, void *func, void *ip)
+{
+	return emit_patchable(pprog, func, ip, 0xE8);
+}
+
+static int emit_jump(u8 **pprog, void *func, void *ip)
+{
+	return emit_patchable(pprog, func, ip, 0xE9);
+}
+
 int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
 		       void *old_addr, void *new_addr)
 {
-	u8 old_insn[X86_CALL_SIZE] = {};
-	u8 new_insn[X86_CALL_SIZE] = {};
+	int (*emit_patch)(u8 **pprog, void *func, void *ip);
+	u8 old_insn[X86_PATCHABLE_SIZE] = {};
+	u8 new_insn[X86_PATCHABLE_SIZE] = {};
 	u8 *prog;
 	int ret;
 
 	if (!is_kernel_text((long)ip) &&
 	    !is_bpf_text_address((long)ip))
-		/* BPF trampoline in modules is not supported */
+		/* BPF poking in modules is not supported */
 		return -EINVAL;
 
+	switch (t) {
+	case BPF_MOD_NOP_TO_CALL ... BPF_MOD_CALL_TO_NOP:
+		emit_patch = emit_call;
+		break;
+	case BPF_MOD_NOP_TO_JUMP ... BPF_MOD_JUMP_TO_NOP:
+		emit_patch = emit_jump;
+		break;
+	default:
+		return -ENOTSUPP;
+	}
+
 	if (old_addr) {
 		prog = old_insn;
-		ret = emit_call(&prog, old_addr, (void *)ip);
+		ret = emit_patch(&prog, old_addr, (void *)ip);
 		if (ret)
 			return ret;
 	}
 	if (new_addr) {
 		prog = new_insn;
-		ret = emit_call(&prog, new_addr, (void *)ip);
+		ret = emit_patch(&prog, new_addr, (void *)ip);
 		if (ret)
 			return ret;
 	}
+
 	ret = -EBUSY;
 	mutex_lock(&text_mutex);
 	switch (t) {
 	case BPF_MOD_NOP_TO_CALL:
-		if (memcmp(ip, ideal_nops[NOP_ATOMIC5], X86_CALL_SIZE))
+	case BPF_MOD_NOP_TO_JUMP:
+		if (memcmp(ip, ideal_nops[NOP_ATOMIC5], X86_PATCHABLE_SIZE))
 			goto out;
-		text_poke_bp(ip, new_insn, X86_CALL_SIZE, NULL);
+		text_poke_bp(ip, new_insn, X86_PATCHABLE_SIZE, NULL);
 		break;
 	case BPF_MOD_CALL_TO_CALL:
-		if (memcmp(ip, old_insn, X86_CALL_SIZE))
+	case BPF_MOD_JUMP_TO_JUMP:
+		if (memcmp(ip, old_insn, X86_PATCHABLE_SIZE))
 			goto out;
-		text_poke_bp(ip, new_insn, X86_CALL_SIZE, NULL);
+		text_poke_bp(ip, new_insn, X86_PATCHABLE_SIZE, NULL);
 		break;
 	case BPF_MOD_CALL_TO_NOP:
-		if (memcmp(ip, old_insn, X86_CALL_SIZE))
+	case BPF_MOD_JUMP_TO_NOP:
+		if (memcmp(ip, old_insn, X86_PATCHABLE_SIZE))
 			goto out;
-		text_poke_bp(ip, ideal_nops[NOP_ATOMIC5], X86_CALL_SIZE, NULL);
+		text_poke_bp(ip, ideal_nops[NOP_ATOMIC5], X86_PATCHABLE_SIZE,
+			     NULL);
 		break;
 	}
 	ret = 0;
@@ -1394,7 +1422,7 @@ int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags
 		/* skip patched call instruction and point orig_call to actual
 		 * body of the kernel function.
 		 */
-		orig_call += X86_CALL_SIZE;
+		orig_call += X86_PATCHABLE_SIZE;
 
 	prog = image;
 
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 42c112d2b3a9..562d9ade2926 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1283,10 +1283,16 @@ static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
 #endif /* CONFIG_INET */
 
 enum bpf_text_poke_type {
+	/* All call-related pokes. */
 	BPF_MOD_NOP_TO_CALL,
 	BPF_MOD_CALL_TO_CALL,
 	BPF_MOD_CALL_TO_NOP,
+	/* All jump-related pokes. */
+	BPF_MOD_NOP_TO_JUMP,
+	BPF_MOD_JUMP_TO_JUMP,
+	BPF_MOD_JUMP_TO_NOP,
 };
+
 int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
 		       void *addr1, void *addr2);
 
-- 
2.21.0



* [PATCH rfc bpf-next 2/8] bpf: add bpf_prog_under_eviction helper
  2019-11-15  1:03 [PATCH rfc bpf-next 0/8] Optimize BPF tail calls for direct jumps Daniel Borkmann
  2019-11-15  1:03 ` [PATCH rfc bpf-next 1/8] bpf, x86: generalize and extend bpf_arch_text_poke " Daniel Borkmann
@ 2019-11-15  1:03 ` Daniel Borkmann
  2019-11-15  1:03 ` [PATCH rfc bpf-next 3/8] bpf: move bpf_free_used_maps into sleepable section Daniel Borkmann
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 28+ messages in thread
From: Daniel Borkmann @ 2019-11-15  1:03 UTC (permalink / raw)
  To: ast; +Cc: john.fastabend, netdev, bpf, Daniel Borkmann

Add a small helper which can be used to check whether we're
currently tearing down a BPF program.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
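Intended usage is along the lines of the below sketch, i.e. skipping
programs whose last reference is already gone while walking a tracking
list; the actual user is added later in this series (patch 6/8), the
list and element names here are just placeholders:

  list_for_each_entry(elem, &tracked_progs, list) {
          struct bpf_prog *prog = elem->aux->prog;

          /* Skip programs that are currently being torn down. */
          if (bpf_prog_under_eviction(prog))
                  continue;
          /* ... safe to look at prog's poke entries here ... */
  }
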
 include/linux/bpf.h  | 1 +
 kernel/bpf/syscall.c | 5 +++++
 2 files changed, 6 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 562d9ade2926..0a20618fd8e2 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -774,6 +774,7 @@ struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i);
 void bpf_prog_sub(struct bpf_prog *prog, int i);
 struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog);
 struct bpf_prog * __must_check bpf_prog_inc_not_zero(struct bpf_prog *prog);
+bool bpf_prog_under_eviction(struct bpf_prog *prog);
 void bpf_prog_put(struct bpf_prog *prog);
 int __bpf_prog_charge(struct user_struct *user, u32 pages);
 void __bpf_prog_uncharge(struct user_struct *user, u32 pages);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c88c815c2154..2a687cb9b0d2 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1482,6 +1482,11 @@ struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_inc);
 
+bool bpf_prog_under_eviction(struct bpf_prog *prog)
+{
+	return atomic_read(&prog->aux->refcnt) == 0;
+}
+
 /* prog_idr_lock should have been held */
 struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
 {
-- 
2.21.0



* [PATCH rfc bpf-next 3/8] bpf: move bpf_free_used_maps into sleepable section
  2019-11-15  1:03 [PATCH rfc bpf-next 0/8] Optimize BPF tail calls for direct jumps Daniel Borkmann
  2019-11-15  1:03 ` [PATCH rfc bpf-next 1/8] bpf, x86: generalize and extend bpf_arch_text_poke " Daniel Borkmann
  2019-11-15  1:03 ` [PATCH rfc bpf-next 2/8] bpf: add bpf_prog_under_eviction helper Daniel Borkmann
@ 2019-11-15  1:03 ` Daniel Borkmann
  2019-11-15 23:32   ` Andrii Nakryiko
  2019-11-15  1:03 ` [PATCH rfc bpf-next 4/8] bpf: move owner type,jited info into array auxiliary data Daniel Borkmann
                   ` (4 subsequent siblings)
  7 siblings, 1 reply; 28+ messages in thread
From: Daniel Borkmann @ 2019-11-15  1:03 UTC (permalink / raw)
  To: ast; +Cc: john.fastabend, netdev, bpf, Daniel Borkmann

Later on we are going to need a sleepable context, as opposed to a
plain RCU callback, in order to untrack programs we need to poke at
runtime, given that tracking as well as image updates are performed
under a mutex.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
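For reference, the resulting teardown flow after this change, as a
simplified sketch of the call chain:

  /*
   * bpf_prog_put()
   *   call_rcu(..., __bpf_prog_put_rcu)     RCU callback, atomic context
   *     __bpf_prog_put_rcu()
   *       bpf_prog_free()
   *         schedule_work(&aux->work)       defer to process context
   *           bpf_prog_free_deferred()      sleepable, may take mutexes
   *             bpf_free_used_maps()        drop map refs (later in this
   *                                         series: also untrack pokes)
   */
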
 include/linux/bpf.h  |  4 ++++
 kernel/bpf/core.c    | 23 +++++++++++++++++++++++
 kernel/bpf/syscall.c | 20 --------------------
 3 files changed, 27 insertions(+), 20 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0a20618fd8e2..ed1725539891 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1030,6 +1030,10 @@ static inline int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
 {
 	return -ENOTSUPP;
 }
+
+static inline void bpf_map_put(struct bpf_map *map)
+{
+}
 #endif /* CONFIG_BPF_SYSCALL */
 
 static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 2e319e4a8aae..516ef5923eea 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1999,12 +1999,35 @@ int bpf_prog_array_copy_info(struct bpf_prog_array *array,
 								     : 0;
 }
 
+static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux)
+{
+	enum bpf_cgroup_storage_type stype;
+
+	for_each_cgroup_storage_type(stype) {
+		if (!aux->cgroup_storage[stype])
+			continue;
+		bpf_cgroup_storage_release(aux->prog,
+					   aux->cgroup_storage[stype]);
+	}
+}
+
+static void bpf_free_used_maps(struct bpf_prog_aux *aux)
+{
+	int i;
+
+	bpf_free_cgroup_storage(aux);
+	for (i = 0; i < aux->used_map_cnt; i++)
+		bpf_map_put(aux->used_maps[i]);
+	kfree(aux->used_maps);
+}
+
 static void bpf_prog_free_deferred(struct work_struct *work)
 {
 	struct bpf_prog_aux *aux;
 	int i;
 
 	aux = container_of(work, struct bpf_prog_aux, work);
+	bpf_free_used_maps(aux);
 	if (bpf_prog_is_dev_bound(aux))
 		bpf_prog_offload_destroy(aux->prog);
 #ifdef CONFIG_PERF_EVENTS
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2a687cb9b0d2..1b311742a2ea 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1216,25 +1216,6 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
 	return 0;
 }
 
-/* drop refcnt on maps used by eBPF program and free auxilary data */
-static void free_used_maps(struct bpf_prog_aux *aux)
-{
-	enum bpf_cgroup_storage_type stype;
-	int i;
-
-	for_each_cgroup_storage_type(stype) {
-		if (!aux->cgroup_storage[stype])
-			continue;
-		bpf_cgroup_storage_release(aux->prog,
-					   aux->cgroup_storage[stype]);
-	}
-
-	for (i = 0; i < aux->used_map_cnt; i++)
-		bpf_map_put(aux->used_maps[i]);
-
-	kfree(aux->used_maps);
-}
-
 int __bpf_prog_charge(struct user_struct *user, u32 pages)
 {
 	unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
@@ -1329,7 +1310,6 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
 
 	kvfree(aux->func_info);
 	kfree(aux->func_info_aux);
-	free_used_maps(aux);
 	bpf_prog_uncharge_memlock(aux->prog);
 	security_bpf_prog_free(aux);
 	bpf_prog_free(aux->prog);
-- 
2.21.0



* [PATCH rfc bpf-next 4/8] bpf: move owner type,jited info into array auxiliary data
  2019-11-15  1:03 [PATCH rfc bpf-next 0/8] Optimize BPF tail calls for direct jumps Daniel Borkmann
                   ` (2 preceding siblings ...)
  2019-11-15  1:03 ` [PATCH rfc bpf-next 3/8] bpf: move bpf_free_used_maps into sleepable section Daniel Borkmann
@ 2019-11-15  1:03 ` Daniel Borkmann
  2019-11-15 23:19   ` Andrii Nakryiko
  2019-11-15  1:03 ` [PATCH rfc bpf-next 5/8] bpf: add jit poke descriptor mock-up for jit images Daniel Borkmann
                   ` (3 subsequent siblings)
  7 siblings, 1 reply; 28+ messages in thread
From: Daniel Borkmann @ 2019-11-15  1:03 UTC (permalink / raw)
  To: ast; +Cc: john.fastabend, netdev, bpf, Daniel Borkmann

We're going to extend this with further information which is only
relevant for prog arrays at this point. Given this info is not used
in the critical path, move it into its own structure such that the
main array map structure can be kept on a diet.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h     | 18 +++++++++++-------
 kernel/bpf/arraymap.c   | 32 ++++++++++++++++++++++++++++++--
 kernel/bpf/core.c       | 11 +++++------
 kernel/bpf/map_in_map.c |  5 ++---
 kernel/bpf/syscall.c    | 16 ++++++----------
 5 files changed, 54 insertions(+), 28 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ed1725539891..40337fa0e463 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -556,17 +556,21 @@ struct bpf_prog_aux {
 	};
 };
 
+struct bpf_array_aux {
+	/* 'Ownership' of prog array is claimed by the first program that
+	 * is going to use this map or by the first program which FD is
+	 * stored in the map to make sure that all callers and callees have
+	 * the same prog type and JITed flag.
+	 */
+	enum bpf_prog_type type;
+	bool jited;
+};
+
 struct bpf_array {
 	struct bpf_map map;
 	u32 elem_size;
 	u32 index_mask;
-	/* 'ownership' of prog_array is claimed by the first program that
-	 * is going to use this map or by the first program which FD is stored
-	 * in the map to make sure that all callers and callees have the same
-	 * prog_type and JITed flag
-	 */
-	enum bpf_prog_type owner_prog_type;
-	bool owner_jited;
+	struct bpf_array_aux *aux;
 	union {
 		char value[0] __aligned(8);
 		void *ptrs[0] __aligned(8);
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 1c65ce0098a9..88c1363b2925 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -625,10 +625,38 @@ static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
 	rcu_read_unlock();
 }
 
+static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_array_aux *aux;
+	struct bpf_map *map;
+
+	aux = kzalloc(sizeof(*aux), GFP_KERNEL);
+	if (!aux)
+		return ERR_PTR(-ENOMEM);
+
+	map = array_map_alloc(attr);
+	if (IS_ERR(map)) {
+		kfree(aux);
+		return map;
+	}
+
+	container_of(map, struct bpf_array, map)->aux = aux;
+	return map;
+}
+
+static void prog_array_map_free(struct bpf_map *map)
+{
+	struct bpf_array_aux *aux;
+
+	aux = container_of(map, struct bpf_array, map)->aux;
+	kfree(aux);
+	fd_array_map_free(map);
+}
+
 const struct bpf_map_ops prog_array_map_ops = {
 	.map_alloc_check = fd_array_map_alloc_check,
-	.map_alloc = array_map_alloc,
-	.map_free = fd_array_map_free,
+	.map_alloc = prog_array_map_alloc,
+	.map_free = prog_array_map_free,
 	.map_get_next_key = array_map_get_next_key,
 	.map_lookup_elem = fd_array_map_lookup_elem,
 	.map_delete_elem = fd_array_map_delete_elem,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 516ef5923eea..c359d0a94896 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1687,18 +1687,17 @@ bool bpf_prog_array_compatible(struct bpf_array *array,
 	if (fp->kprobe_override)
 		return false;
 
-	if (!array->owner_prog_type) {
+	if (!array->aux->type) {
 		/* There's no owner yet where we could check for
 		 * compatibility.
 		 */
-		array->owner_prog_type = fp->type;
-		array->owner_jited = fp->jited;
-
+		array->aux->type  = fp->type;
+		array->aux->jited = fp->jited;
 		return true;
 	}
 
-	return array->owner_prog_type == fp->type &&
-	       array->owner_jited == fp->jited;
+	return array->aux->type  == fp->type &&
+	       array->aux->jited == fp->jited;
 }
 
 static int bpf_check_tail_call(const struct bpf_prog *fp)
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index fab4fb134547..296f4ac76211 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -17,9 +17,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
 	if (IS_ERR(inner_map))
 		return inner_map;
 
-	/* prog_array->owner_prog_type and owner_jited
-	 * is a runtime binding.  Doing static check alone
-	 * in the verifier is not enough.
+	/* prog_array->aux->{type,jited} is a runtime binding.
+	 * Doing static check alone in the verifier is not enough.
 	 */
 	if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
 	    inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE ||
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 1b311742a2ea..81057e5922d5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -371,13 +371,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
 {
 	const struct bpf_map *map = filp->private_data;
 	const struct bpf_array *array;
-	u32 owner_prog_type = 0;
-	u32 owner_jited = 0;
+	u32 type = 0, jited = 0;
 
 	if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
 		array = container_of(map, struct bpf_array, map);
-		owner_prog_type = array->owner_prog_type;
-		owner_jited = array->owner_jited;
+		type  = array->aux->type;
+		jited = array->aux->jited;
 	}
 
 	seq_printf(m,
@@ -397,12 +396,9 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
 		   map->memory.pages * 1ULL << PAGE_SHIFT,
 		   map->id,
 		   READ_ONCE(map->frozen));
-
-	if (owner_prog_type) {
-		seq_printf(m, "owner_prog_type:\t%u\n",
-			   owner_prog_type);
-		seq_printf(m, "owner_jited:\t%u\n",
-			   owner_jited);
+	if (type) {
+		seq_printf(m, "owner_prog_type:\t%u\n", type);
+		seq_printf(m, "owner_jited:\t%u\n", jited);
 	}
 }
 #endif
-- 
2.21.0



* [PATCH rfc bpf-next 5/8] bpf: add jit poke descriptor mock-up for jit images
  2019-11-15  1:03 [PATCH rfc bpf-next 0/8] Optimize BPF tail calls for direct jumps Daniel Borkmann
                   ` (3 preceding siblings ...)
  2019-11-15  1:03 ` [PATCH rfc bpf-next 4/8] bpf: move owner type,jited info into array auxiliary data Daniel Borkmann
@ 2019-11-15  1:03 ` Daniel Borkmann
  2019-11-18 18:17   ` Andrii Nakryiko
  2019-11-15  1:04 ` [PATCH rfc bpf-next 6/8] bpf: add poke dependency tracking for prog array maps Daniel Borkmann
                   ` (2 subsequent siblings)
  7 siblings, 1 reply; 28+ messages in thread
From: Daniel Borkmann @ 2019-11-15  1:03 UTC (permalink / raw)
  To: ast; +Cc: john.fastabend, netdev, bpf, Daniel Borkmann

Add initial poke table data structures and management to the BPF
prog that can later be used by JITs. Also add an instance of
poke-specific data for tail call maps. The plan for later work is to
also extend this for BPF static keys.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
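For reference, a user of this interface fills in and registers a
descriptor roughly as in the sketch below; the actual users are added
in the later patches of this series:

  struct bpf_jit_poke_descriptor desc = {};
  int slot;

  desc.reason = BPF_POKE_REASON_TAIL_CALL;
  desc.tc.map = map;    /* prog array the tail call goes through */
  desc.tc.key = key;    /* constant index into that prog array   */

  /* ip, adj_off and ip_stable must be left zeroed here; they get
   * filled in by the JIT later. Returns the slot in poke_tab or a
   * negative error.
   */
  slot = bpf_jit_add_poke_descriptor(prog, &desc);
  if (slot < 0)
          return slot;
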
 include/linux/bpf.h    | 20 ++++++++++++++++++++
 include/linux/filter.h | 10 ++++++++++
 kernel/bpf/core.c      | 34 ++++++++++++++++++++++++++++++++++
 3 files changed, 64 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 40337fa0e463..0ff06a0d0058 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -484,6 +484,24 @@ struct bpf_func_info_aux {
 	bool unreliable;
 };
 
+enum bpf_jit_poke_reason {
+	BPF_POKE_REASON_TAIL_CALL,
+};
+
+/* Descriptor of pokes pointing /into/ the JITed image. */
+struct bpf_jit_poke_descriptor {
+	void *ip;
+	union {
+		struct {
+			struct bpf_map *map;
+			u32 key;
+		} tc;
+	};
+	u8 ip_stable;
+	u8 adj_off;
+	u16 reason;
+};
+
 struct bpf_prog_aux {
 	atomic_t refcnt;
 	u32 used_map_cnt;
@@ -509,6 +527,8 @@ struct bpf_prog_aux {
 	const char *attach_func_name;
 	struct bpf_prog **func;
 	void *jit_data; /* JIT specific data. arch dependent */
+	struct bpf_jit_poke_descriptor *poke_tab;
+	u32 size_poke_tab;
 	struct latch_tree_node ksym_tnode;
 	struct list_head ksym_lnode;
 	const struct bpf_prog_ops *ops;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 7a6f8f6f1da4..98dc2be7abc9 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -950,6 +950,9 @@ void *bpf_jit_alloc_exec(unsigned long size);
 void bpf_jit_free_exec(void *addr);
 void bpf_jit_free(struct bpf_prog *fp);
 
+int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
+				struct bpf_jit_poke_descriptor *poke);
+
 int bpf_jit_get_func_addr(const struct bpf_prog *prog,
 			  const struct bpf_insn *insn, bool extra_pass,
 			  u64 *func_addr, bool *func_addr_fixed);
@@ -1053,6 +1056,13 @@ static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp)
 	return false;
 }
 
+static inline int
+bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
+			    struct bpf_jit_poke_descriptor *poke)
+{
+	return -ENOTSUPP;
+}
+
 static inline void bpf_jit_free(struct bpf_prog *fp)
 {
 	bpf_prog_unlock_free(fp);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index c359d0a94896..853dc4538442 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -255,6 +255,7 @@ void __bpf_prog_free(struct bpf_prog *fp)
 {
 	if (fp->aux) {
 		free_percpu(fp->aux->stats);
+		kfree(fp->aux->poke_tab);
 		kfree(fp->aux);
 	}
 	vfree(fp);
@@ -755,6 +756,39 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 	return ret;
 }
 
+int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
+				struct bpf_jit_poke_descriptor *poke)
+{
+	struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab;
+	static const u32 poke_tab_max = 1024;
+	u32 slot = prog->aux->size_poke_tab;
+	u32 size = slot + 1;
+
+	if (size > poke_tab_max)
+		return -ENOSPC;
+	if (poke->ip || poke->ip_stable || poke->adj_off)
+		return -EINVAL;
+
+	switch (poke->reason) {
+	case BPF_POKE_REASON_TAIL_CALL:
+		if (!poke->tc.map)
+			return -EINVAL;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	tab = krealloc(tab, size * sizeof(*poke), GFP_KERNEL);
+	if (!tab)
+		return -ENOMEM;
+
+	memcpy(&tab[slot], poke, sizeof(*poke));
+	prog->aux->size_poke_tab = size;
+	prog->aux->poke_tab = tab;
+
+	return slot;
+}
+
 static atomic_long_t bpf_jit_current;
 
 /* Can be overridden by an arch's JIT compiler if it has a custom,
-- 
2.21.0



* [PATCH rfc bpf-next 6/8] bpf: add poke dependency tracking for prog array maps
  2019-11-15  1:03 [PATCH rfc bpf-next 0/8] Optimize BPF tail calls for direct jumps Daniel Borkmann
                   ` (4 preceding siblings ...)
  2019-11-15  1:03 ` [PATCH rfc bpf-next 5/8] bpf: add jit poke descriptor mock-up for jit images Daniel Borkmann
@ 2019-11-15  1:04 ` Daniel Borkmann
  2019-11-18 17:39   ` Andrii Nakryiko
  2019-11-15  1:04 ` [PATCH rfc bpf-next 7/8] bpf, x86: emit patchable direct jump as tail call Daniel Borkmann
  2019-11-15  1:04 ` [PATCH rfc bpf-next 8/8] bpf: constant map key tracking for prog array pokes Daniel Borkmann
  7 siblings, 1 reply; 28+ messages in thread
From: Daniel Borkmann @ 2019-11-15  1:04 UTC (permalink / raw)
  To: ast; +Cc: john.fastabend, netdev, bpf, Daniel Borkmann

This work adds program tracking to prog array maps. This is needed such
that upon prog array updates/deletions we can fix up all programs which
make use of this tail call map. We add ops->map_poke_{un,}track() helpers
to maps to maintain the list of programs and ops->map_poke_run() for
triggering the actual update. bpf_array_aux is extended to contain the
list head and poke_mutex in order to serialize program patching during
updates/deletions. bpf_free_used_maps() will untrack the program shortly
before dropping the reference to the map.

prog_array_map_poke_run() is triggered during updates/deletions and
walks the maintained prog list. It checks each program's poke_tab for
entries whose map and key match and runs the actual bpf_arch_text_poke()
to patch in the nop or the new jmp location. Depending on the type of
update, we use one of BPF_MOD_{NOP_TO_JUMP,JUMP_TO_NOP,JUMP_TO_JUMP}.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
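Rough sketch of the control flow on a prog array update with this in
place (deletion is analogous with new == NULL):

  /*
   * bpf_fd_array_map_update_elem()
   *   bpf_map_poke_lock(map)                  take aux->poke_mutex
   *   old = xchg(array->ptrs + index, new)
   *   map->ops->map_poke_run(map, index, old, new)
   *     prog_array_map_poke_run()
   *       for each tracked prog, for each poke entry matching
   *       (map, key): bpf_arch_text_poke(poke->ip, NOP_TO_JUMP /
   *       JUMP_TO_JUMP / JUMP_TO_NOP, old target, new target)
   *   bpf_map_poke_unlock(map)
   */
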
 include/linux/bpf.h   |  36 +++++++++++++
 kernel/bpf/arraymap.c | 120 +++++++++++++++++++++++++++++++++++++++++-
 kernel/bpf/core.c     |   9 +++-
 3 files changed, 162 insertions(+), 3 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0ff06a0d0058..62a369fb8d98 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -21,6 +21,7 @@ struct bpf_verifier_env;
 struct bpf_verifier_log;
 struct perf_event;
 struct bpf_prog;
+struct bpf_prog_aux;
 struct bpf_map;
 struct sock;
 struct seq_file;
@@ -63,6 +64,12 @@ struct bpf_map_ops {
 			     const struct btf_type *key_type,
 			     const struct btf_type *value_type);
 
+	/* Prog poke tracking helpers. */
+	int (*map_poke_track)(struct bpf_map *map, struct bpf_prog_aux *aux);
+	void (*map_poke_untrack)(struct bpf_map *map, struct bpf_prog_aux *aux);
+	void (*map_poke_run)(struct bpf_map *map, u32 key, struct bpf_prog *old,
+			     struct bpf_prog *new);
+
 	/* Direct value access helpers. */
 	int (*map_direct_value_addr)(const struct bpf_map *map,
 				     u64 *imm, u32 off);
@@ -584,6 +591,9 @@ struct bpf_array_aux {
 	 */
 	enum bpf_prog_type type;
 	bool jited;
+	/* Programs with direct jumps into programs part of this array. */
+	struct list_head poke_progs;
+	struct mutex poke_mutex;
 };
 
 struct bpf_array {
@@ -1325,4 +1335,30 @@ enum bpf_text_poke_type {
 int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
 		       void *addr1, void *addr2);
 
+static inline void bpf_map_poke_lock(struct bpf_map *map)
+	__acquires(&container_of(map, struct bpf_array, map)->aux->poke_mutex)
+{
+#ifdef CONFIG_BPF_SYSCALL
+	struct bpf_array_aux *aux;
+
+	aux = container_of(map, struct bpf_array, map)->aux;
+	if (aux)
+		mutex_lock(&aux->poke_mutex);
+#endif
+	__acquire(&aux->poke_mutex);
+}
+
+static inline void bpf_map_poke_unlock(struct bpf_map *map)
+	__releases(&container_of(map, struct bpf_array, map)->aux->poke_mutex)
+{
+#ifdef CONFIG_BPF_SYSCALL
+	struct bpf_array_aux *aux;
+
+	aux = container_of(map, struct bpf_array, map)->aux;
+	if (aux)
+		mutex_unlock(&aux->poke_mutex);
+#endif
+	__release(&aux->poke_mutex);
+}
+
 #endif /* _LINUX_BPF_H */
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 88c1363b2925..b9ef993278c6 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -540,10 +540,14 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
 	if (IS_ERR(new_ptr))
 		return PTR_ERR(new_ptr);
 
+	bpf_map_poke_lock(map);
 	old_ptr = xchg(array->ptrs + index, new_ptr);
+	if (map->ops->map_poke_run)
+		map->ops->map_poke_run(map, index, old_ptr, new_ptr);
+	bpf_map_poke_unlock(map);
+
 	if (old_ptr)
 		map->ops->map_fd_put_ptr(old_ptr);
-
 	return 0;
 }
 
@@ -556,7 +560,12 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
 	if (index >= array->map.max_entries)
 		return -E2BIG;
 
+	bpf_map_poke_lock(map);
 	old_ptr = xchg(array->ptrs + index, NULL);
+	if (map->ops->map_poke_run)
+		map->ops->map_poke_run(map, index, old_ptr, NULL);
+	bpf_map_poke_unlock(map);
+
 	if (old_ptr) {
 		map->ops->map_fd_put_ptr(old_ptr);
 		return 0;
@@ -625,6 +634,104 @@ static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
 	rcu_read_unlock();
 }
 
+struct prog_poke_elem {
+	struct list_head list;
+	struct bpf_prog_aux *aux;
+};
+
+static int prog_array_map_poke_track(struct bpf_map *map,
+				     struct bpf_prog_aux *prog_aux)
+{
+	struct bpf_array_aux *aux;
+	struct prog_poke_elem *elem;
+
+	elem = kmalloc(sizeof(*elem), GFP_KERNEL);
+	if (!elem)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&elem->list);
+	elem->aux = prog_aux;
+
+	aux = container_of(map, struct bpf_array, map)->aux;
+	mutex_lock(&aux->poke_mutex);
+	list_add_tail(&elem->list, &aux->poke_progs);
+	mutex_unlock(&aux->poke_mutex);
+	return 0;
+}
+
+static void prog_array_map_poke_untrack(struct bpf_map *map,
+					struct bpf_prog_aux *prog_aux)
+{
+	struct prog_poke_elem *elem, *tmp;
+	struct bpf_array_aux *aux;
+
+	aux = container_of(map, struct bpf_array, map)->aux;
+	mutex_lock(&aux->poke_mutex);
+	list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
+		if (elem->aux == prog_aux) {
+			list_del_init(&elem->list);
+			kfree(elem);
+		}
+	}
+	mutex_unlock(&aux->poke_mutex);
+}
+
+static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
+				    struct bpf_prog *old,
+				    struct bpf_prog *new)
+{
+	enum bpf_text_poke_type type;
+	struct prog_poke_elem *elem;
+	struct bpf_array_aux *aux;
+
+	if (!old && new)
+		type = BPF_MOD_NOP_TO_JUMP;
+	else if (old && !new)
+		type = BPF_MOD_JUMP_TO_NOP;
+	else if (old && new)
+		type = BPF_MOD_JUMP_TO_JUMP;
+	else
+		return;
+
+	aux = container_of(map, struct bpf_array, map)->aux;
+	list_for_each_entry(elem, &aux->poke_progs, list) {
+		struct bpf_prog *prog = elem->aux->prog;
+		struct bpf_jit_poke_descriptor *poke;
+		int i, ret;
+
+		/* The prog's kallsym entry was removed out of RCU callback,
+		 * but we can only untrack from sleepable context, therefore
+		 * bpf_arch_text_poke() might not see that this is in BPF text
+		 * section. Given these programs are unreachable, we can skip
+		 * them here. Also programs reaching refcount of zero while
+		 * patching is in progress is okay since we're protected under
+		 * poke_mutex and untrack the programs before the JIT buffer
+		 * is freed. If that happens and if also the RCU callback did
+		 * remove the kallsyms entry in the meantime, bpf_arch_text_poke()
+		 * will bail out with -EINVAL without patching, which is totally
+		 * fine. Any other error happening at that point is a bug.
+		 */
+		if (bpf_prog_under_eviction(prog))
+			continue;
+		for (i = 0; i < prog->aux->size_poke_tab; i++) {
+			poke = &prog->aux->poke_tab[i];
+			if (!READ_ONCE(poke->ip_stable))
+				continue;
+			if (poke->reason != BPF_POKE_REASON_TAIL_CALL)
+				continue;
+			if (poke->tc.map != map || poke->tc.key != key)
+				continue;
+			ret = bpf_arch_text_poke(poke->ip, type,
+						 old ? (u8 *)old->bpf_func +
+						 poke->adj_off : NULL,
+						 new ? (u8 *)new->bpf_func +
+						 poke->adj_off : NULL);
+			/* See comment above. */
+			BUG_ON(ret < 0 && ret != -EINVAL);
+		}
+	}
+}
+
 static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
 {
 	struct bpf_array_aux *aux;
@@ -634,6 +741,9 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
 	if (!aux)
 		return ERR_PTR(-ENOMEM);
 
+	INIT_LIST_HEAD(&aux->poke_progs);
+	mutex_init(&aux->poke_mutex);
+
 	map = array_map_alloc(attr);
 	if (IS_ERR(map)) {
 		kfree(aux);
@@ -646,9 +756,14 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
 
 static void prog_array_map_free(struct bpf_map *map)
 {
+	struct prog_poke_elem *elem, *tmp;
 	struct bpf_array_aux *aux;
 
 	aux = container_of(map, struct bpf_array, map)->aux;
+	list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
+		list_del_init(&elem->list);
+		kfree(elem);
+	}
 	kfree(aux);
 	fd_array_map_free(map);
 }
@@ -657,6 +772,9 @@ const struct bpf_map_ops prog_array_map_ops = {
 	.map_alloc_check = fd_array_map_alloc_check,
 	.map_alloc = prog_array_map_alloc,
 	.map_free = prog_array_map_free,
+	.map_poke_track = prog_array_map_poke_track,
+	.map_poke_untrack = prog_array_map_poke_untrack,
+	.map_poke_run = prog_array_map_poke_run,
 	.map_get_next_key = array_map_get_next_key,
 	.map_lookup_elem = fd_array_map_lookup_elem,
 	.map_delete_elem = fd_array_map_delete_elem,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 853dc4538442..39750ef3ef31 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2046,11 +2046,16 @@ static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux)
 
 static void bpf_free_used_maps(struct bpf_prog_aux *aux)
 {
+	struct bpf_map *map;
 	int i;
 
 	bpf_free_cgroup_storage(aux);
-	for (i = 0; i < aux->used_map_cnt; i++)
-		bpf_map_put(aux->used_maps[i]);
+	for (i = 0; i < aux->used_map_cnt; i++) {
+		map = aux->used_maps[i];
+		if (map->ops->map_poke_untrack)
+			map->ops->map_poke_untrack(map, aux);
+		bpf_map_put(map);
+	}
 	kfree(aux->used_maps);
 }
 
-- 
2.21.0



* [PATCH rfc bpf-next 7/8] bpf, x86: emit patchable direct jump as tail call
  2019-11-15  1:03 [PATCH rfc bpf-next 0/8] Optimize BPF tail calls for direct jumps Daniel Borkmann
                   ` (5 preceding siblings ...)
  2019-11-15  1:04 ` [PATCH rfc bpf-next 6/8] bpf: add poke dependency tracking for prog array maps Daniel Borkmann
@ 2019-11-15  1:04 ` Daniel Borkmann
  2019-11-15  3:23   ` Alexei Starovoitov
  2019-11-15  1:04 ` [PATCH rfc bpf-next 8/8] bpf: constant map key tracking for prog array pokes Daniel Borkmann
  7 siblings, 1 reply; 28+ messages in thread
From: Daniel Borkmann @ 2019-11-15  1:04 UTC (permalink / raw)
  To: ast; +Cc: john.fastabend, netdev, bpf, Daniel Borkmann

Add initial code emission for *direct* jumps for tail call maps in
order to avoid the retpoline overhead from a493a87f38cf ("bpf, x64:
implement retpoline for tail call") for situations that allow for
it, meaning, for known constant keys at verification time which are
used as index into the tail call map. In the case of Cilium, which
makes heavy use of tail calls, constant keys are used in the vast
majority of cases; only a single occurrence uses a dynamic key.

High level outline is that if the target prog is NULL in the map, we
emit a 5-byte nop for the fall-through case and if not we emit a
5-byte direct relative jmp to the target bpf_func + skipped prologue
offset. Later during runtime, we patch these 5-byte nop/jmps upon
tail call map update or deletions dynamically. Note that on x86-64
the direct jmp works as we reuse the same stack frame and skip
prologue (as opposed to some other JIT implementations).

One of the issues is that the tail call map slots can change at any
given time even during JITing. Therefore, we have two passes: i) emit
nops for all patchable locations during the main JITing phase until we
declare prog->jited = 1 eventually. At this point the image is stable,
not public yet and with all jmps disabled. While JITing, we collect
additional info like poke->ip in order to remember the patch location
for later modifications. In ii) fixup_bpf_tail_call_direct() walks
over the prog's poke_tab, locks the tail call map's poke_mutex to
prevent parallel updates and patches in the right locations via
__bpf_arch_text_poke(). Note, the main bpf_arch_text_poke() cannot
be used at this point since we're not yet exposed to kallsyms. After
patching, we activate that poke entry through poke->ip_stable.
Meaning, from this point on, tail call map updates/deletions will no
longer ignore that poke entry.

Example prog:

  # ./bpftool p d x i 1655
   0: (b7) r3 = 0
   1: (18) r2 = map[id:526]
   3: (85) call bpf_tail_call#12
   4: (b7) r0 = 1
   5: (95) exit

Before:

  # ./bpftool p d j i 1655
  0xffffffffc076e55c:
   0:   nopl   0x0(%rax,%rax,1)
   5:   push   %rbp
   6:   mov    %rsp,%rbp
   9:   sub    $0x200,%rsp
  10:   push   %rbx
  11:   push   %r13
  13:   push   %r14
  15:   push   %r15
  17:   pushq  $0x0                      _
  19:   xor    %edx,%edx                |_ index (arg 3)
  1b:   movabs $0xffff88d95cc82600,%rsi |_ map (arg 2)
  25:   mov    %edx,%edx                |  index >= array->map.max_entries
  27:   cmp    %edx,0x24(%rsi)          |
  2a:   jbe    0x0000000000000066       |_
  2c:   mov    -0x224(%rbp),%eax        |  tail call limit check
  32:   cmp    $0x20,%eax               |
  35:   ja     0x0000000000000066       |
  37:   add    $0x1,%eax                |
  3a:   mov    %eax,-0x224(%rbp)        |_
  40:   mov    0xd0(%rsi,%rdx,8),%rax   |_ prog = array->ptrs[index]
  48:   test   %rax,%rax                |  prog == NULL check
  4b:   je     0x0000000000000066       |_
  4d:   mov    0x30(%rax),%rax          |  goto *(prog->bpf_func + prologue_size)
  51:   add    $0x19,%rax               |
  55:   callq  0x0000000000000061       |  retpoline for indirect jump
  5a:   pause                           |
  5c:   lfence                          |
  5f:   jmp    0x000000000000005a       |
  61:   mov    %rax,(%rsp)              |
  65:   retq                            |_
  66:   mov    $0x1,%eax
  6b:   pop    %rbx
  6c:   pop    %r15
  6e:   pop    %r14
  70:   pop    %r13
  72:   pop    %rbx
  73:   leaveq
  74:   retq

After; state after JIT:

  # ./bpftool p d j i 1655
  0xffffffffc08e8930:
   0:   nopl   0x0(%rax,%rax,1)
   5:   push   %rbp
   6:   mov    %rsp,%rbp
   9:   sub    $0x200,%rsp
  10:   push   %rbx
  11:   push   %r13
  13:   push   %r14
  15:   push   %r15
  17:   pushq  $0x0                      _
  19:   xor    %edx,%edx                |_ index (arg 3)
  1b:   movabs $0xffff9d8afd74c000,%rsi |_ map (arg 2)
  25:   mov    -0x224(%rbp),%eax        |  tail call limit check
  2b:   cmp    $0x20,%eax               |
  2e:   ja     0x000000000000003e       |
  30:   add    $0x1,%eax                |
  33:   mov    %eax,-0x224(%rbp)        |_
  39:   jmpq   0xfffffffffffd1785       |_ [direct] goto *(prog->bpf_func + prologue_size)
  3e:   mov    $0x1,%eax
  43:   pop    %rbx
  44:   pop    %r15
  46:   pop    %r14
  48:   pop    %r13
  4a:   pop    %rbx
  4b:   leaveq
  4c:   retq

After; state after map update (target prog):

  # ./bpftool p d j i 1655
  0xffffffffc08e8930:
   0:   nopl   0x0(%rax,%rax,1)
   5:   push   %rbp
   6:   mov    %rsp,%rbp
   9:   sub    $0x200,%rsp
  10:   push   %rbx
  11:   push   %r13
  13:   push   %r14
  15:   push   %r15
  17:   pushq  $0x0
  19:   xor    %edx,%edx
  1b:   movabs $0xffff9d8afd74c000,%rsi
  25:   mov    -0x224(%rbp),%eax
  2b:   cmp    $0x20,%eax               .
  2e:   ja     0x000000000000003e       .
  30:   add    $0x1,%eax                .
  33:   mov    %eax,-0x224(%rbp)        |_
  39:   jmpq   0xffffffffffb09f55       |_ goto *(prog->bpf_func + prologue_size)
  3e:   mov    $0x1,%eax
  43:   pop    %rbx
  44:   pop    %r15
  46:   pop    %r14
  48:   pop    %r13
  4a:   pop    %rbx
  4b:   leaveq
  4c:   retq

After; state after map update (no prog):

  # ./bpftool p d j i 1655
  0xffffffffc08e8930:
   0:   nopl   0x0(%rax,%rax,1)
   5:   push   %rbp
   6:   mov    %rsp,%rbp
   9:   sub    $0x200,%rsp
  10:   push   %rbx
  11:   push   %r13
  13:   push   %r14
  15:   push   %r15
  17:   pushq  $0x0
  19:   xor    %edx,%edx
  1b:   movabs $0xffff9d8afd74c000,%rsi
  25:   mov    -0x224(%rbp),%eax
  2b:   cmp    $0x20,%eax               .
  2e:   ja     0x000000000000003e       .
  30:   add    $0x1,%eax                .
  33:   mov    %eax,-0x224(%rbp)        |_
  39:   nopl   0x0(%rax,%rax,1)         |_ fall-through nop
  3e:   mov    $0x1,%eax
  43:   pop    %rbx
  44:   pop    %r15
  46:   pop    %r14
  48:   pop    %r13
  4a:   pop    %rbx
  4b:   leaveq
  4c:   retq

A nice side effect is that this also shrinks the code emission quite
a bit for every tail call invocation.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
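For reference, the two 5-byte states such a patched site toggles
between, written out at the byte level (sketch; target and ip are just
placeholders, the offset is computed as in emit_patchable()):

  u8 insn[5];
  s32 rel;

  /* Fall-through state: the 5-byte atomic nop, typically
   * nopl 0x0(%rax,%rax,1) as seen in the dumps above.
   */
  static const u8 nop5[5] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 };

  /* Direct jump state: jmp rel32, displacement relative to the end
   * of the 5-byte instruction.
   */
  rel = (s32)(target - (ip + 5));
  insn[0] = 0xE9;
  memcpy(&insn[1], &rel, sizeof(rel));
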
 arch/x86/net/bpf_jit_comp.c | 252 ++++++++++++++++++++++--------------
 1 file changed, 157 insertions(+), 95 deletions(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 66921f2aeece..f4d294b0af2b 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -239,6 +239,105 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
 	*pprog = prog;
 }
 
+static int emit_patchable(u8 **pprog, void *func, void *ip, u8 b1)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	s64 offset;
+
+	offset = func - (ip + X86_PATCHABLE_SIZE);
+	if (!is_simm32(offset)) {
+		pr_err("Target call %p is out of range\n", func);
+		return -EINVAL;
+	}
+	EMIT1_off32(b1, offset);
+	*pprog = prog;
+	return 0;
+}
+
+static int emit_call(u8 **pprog, void *func, void *ip)
+{
+	return emit_patchable(pprog, func, ip, 0xE8);
+}
+
+static int emit_jump(u8 **pprog, void *func, void *ip)
+{
+	return emit_patchable(pprog, func, ip, 0xE9);
+}
+
+static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
+				void *old_addr, void *new_addr)
+{
+	int (*emit_patch)(u8 **pprog, void *func, void *ip);
+	u8 old_insn[X86_PATCHABLE_SIZE] = {};
+	u8 new_insn[X86_PATCHABLE_SIZE] = {};
+	u8 *prog;
+	int ret;
+
+	switch (t) {
+	case BPF_MOD_NOP_TO_CALL ... BPF_MOD_CALL_TO_NOP:
+		emit_patch = emit_call;
+		break;
+	case BPF_MOD_NOP_TO_JUMP ... BPF_MOD_JUMP_TO_NOP:
+		emit_patch = emit_jump;
+		break;
+	default:
+		return -ENOTSUPP;
+	}
+
+	if (old_addr) {
+		prog = old_insn;
+		ret = emit_patch(&prog, old_addr, (void *)ip);
+		if (ret)
+			return ret;
+	}
+	if (new_addr) {
+		prog = new_insn;
+		ret = emit_patch(&prog, new_addr, (void *)ip);
+		if (ret)
+			return ret;
+	}
+
+	ret = -EBUSY;
+	mutex_lock(&text_mutex);
+	switch (t) {
+	case BPF_MOD_NOP_TO_CALL:
+	case BPF_MOD_NOP_TO_JUMP:
+		if (memcmp(ip, ideal_nops[NOP_ATOMIC5], X86_PATCHABLE_SIZE))
+			goto out;
+		text_poke_bp(ip, new_insn, X86_PATCHABLE_SIZE, NULL);
+		break;
+	case BPF_MOD_CALL_TO_CALL:
+	case BPF_MOD_JUMP_TO_JUMP:
+		if (memcmp(ip, old_insn, X86_PATCHABLE_SIZE))
+			goto out;
+		text_poke_bp(ip, new_insn, X86_PATCHABLE_SIZE, NULL);
+		break;
+	case BPF_MOD_CALL_TO_NOP:
+	case BPF_MOD_JUMP_TO_NOP:
+		if (memcmp(ip, old_insn, X86_PATCHABLE_SIZE))
+			goto out;
+		text_poke_bp(ip, ideal_nops[NOP_ATOMIC5], X86_PATCHABLE_SIZE,
+			     NULL);
+		break;
+	}
+	ret = 0;
+out:
+	mutex_unlock(&text_mutex);
+	return ret;
+}
+
+int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
+		       void *old_addr, void *new_addr)
+{
+	if (!is_kernel_text((long)ip) &&
+	    !is_bpf_text_address((long)ip))
+		/* BPF poking in modules is not supported */
+		return -EINVAL;
+
+	return __bpf_arch_text_poke(ip, t, old_addr, new_addr);
+}
+
 /*
  * Generate the following code:
  *
@@ -253,7 +352,7 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
  *   goto *(prog->bpf_func + prologue_size);
  * out:
  */
-static void emit_bpf_tail_call(u8 **pprog)
+static void emit_bpf_tail_call_indirect(u8 **pprog)
 {
 	u8 *prog = *pprog;
 	int label1, label2, label3;
@@ -320,6 +419,57 @@ static void emit_bpf_tail_call(u8 **pprog)
 	*pprog = prog;
 }
 
+static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke,
+				      u8 **pprog, int addr, u8 *image)
+{
+	u8 *prog = *pprog;
+	int i, cnt = 0;
+
+	/*
+	 * if (tail_call_cnt > MAX_TAIL_CALL_CNT)
+	 *	goto out;
+	 */
+	EMIT2_off32(0x8B, 0x85, -36 - MAX_BPF_STACK); /* mov eax, dword ptr [rbp - 548] */
+	EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT);         /* cmp eax, MAX_TAIL_CALL_CNT */
+	EMIT2(X86_JA, 14);                            /* ja out */
+	EMIT3(0x83, 0xC0, 0x01);                      /* add eax, 1 */
+	EMIT2_off32(0x89, 0x85, -36 - MAX_BPF_STACK); /* mov dword ptr [rbp -548], eax */
+
+	poke->ip = image + (addr - X86_PATCHABLE_SIZE);
+	poke->adj_off = PROLOGUE_SIZE;
+
+	BUILD_BUG_ON(X86_PATCHABLE_SIZE != 5);
+	for (i = 0; i < X86_PATCHABLE_SIZE; i++)
+		EMIT1(ideal_nops[NOP_ATOMIC5][i]);
+
+	*pprog = prog;
+}
+
+static void fixup_bpf_tail_call_direct(struct bpf_prog *prog)
+{
+	static const enum bpf_text_poke_type type = BPF_MOD_NOP_TO_JUMP;
+	struct bpf_jit_poke_descriptor *poke;
+	struct bpf_prog *target;
+	int i, ret;
+
+	for (i = 0; i < prog->aux->size_poke_tab; i++) {
+		poke = &prog->aux->poke_tab[i];
+		if (poke->reason != BPF_POKE_REASON_TAIL_CALL)
+			continue;
+		bpf_map_poke_lock(poke->tc.map);
+		target = container_of(poke->tc.map, struct bpf_array,
+				      map)->ptrs[poke->tc.key];
+		if (target) {
+			ret = __bpf_arch_text_poke(poke->ip, type, NULL,
+						   (u8 *)target->bpf_func +
+						   poke->adj_off);
+			BUG_ON(ret < 0);
+		}
+		WRITE_ONCE(poke->ip_stable, 1);
+		bpf_map_poke_unlock(poke->tc.map);
+	}
+}
+
 static void emit_mov_imm32(u8 **pprog, bool sign_propagate,
 			   u32 dst_reg, const u32 imm32)
 {
@@ -481,99 +631,6 @@ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
 	*pprog = prog;
 }
 
-static int emit_patchable(u8 **pprog, void *func, void *ip, u8 b1)
-{
-	u8 *prog = *pprog;
-	int cnt = 0;
-	s64 offset;
-
-	offset = func - (ip + X86_PATCHABLE_SIZE);
-	if (!is_simm32(offset)) {
-		pr_err("Target call %p is out of range\n", func);
-		return -EINVAL;
-	}
-	EMIT1_off32(b1, offset);
-	*pprog = prog;
-	return 0;
-}
-
-static int emit_call(u8 **pprog, void *func, void *ip)
-{
-	return emit_patchable(pprog, func, ip, 0xE8);
-}
-
-static int emit_jump(u8 **pprog, void *func, void *ip)
-{
-	return emit_patchable(pprog, func, ip, 0xE9);
-}
-
-int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
-		       void *old_addr, void *new_addr)
-{
-	int (*emit_patch)(u8 **pprog, void *func, void *ip);
-	u8 old_insn[X86_PATCHABLE_SIZE] = {};
-	u8 new_insn[X86_PATCHABLE_SIZE] = {};
-	u8 *prog;
-	int ret;
-
-	if (!is_kernel_text((long)ip) &&
-	    !is_bpf_text_address((long)ip))
-		/* BPF poking in modules is not supported */
-		return -EINVAL;
-
-	switch (t) {
-	case BPF_MOD_NOP_TO_CALL ... BPF_MOD_CALL_TO_NOP:
-		emit_patch = emit_call;
-		break;
-	case BPF_MOD_NOP_TO_JUMP ... BPF_MOD_JUMP_TO_NOP:
-		emit_patch = emit_jump;
-		break;
-	default:
-		return -ENOTSUPP;
-	}
-
-	if (old_addr) {
-		prog = old_insn;
-		ret = emit_patch(&prog, old_addr, (void *)ip);
-		if (ret)
-			return ret;
-	}
-	if (new_addr) {
-		prog = new_insn;
-		ret = emit_patch(&prog, new_addr, (void *)ip);
-		if (ret)
-			return ret;
-	}
-
-	ret = -EBUSY;
-	mutex_lock(&text_mutex);
-	switch (t) {
-	case BPF_MOD_NOP_TO_CALL:
-	case BPF_MOD_NOP_TO_JUMP:
-		if (memcmp(ip, ideal_nops[NOP_ATOMIC5], X86_PATCHABLE_SIZE))
-			goto out;
-		text_poke_bp(ip, new_insn, X86_PATCHABLE_SIZE, NULL);
-		break;
-	case BPF_MOD_CALL_TO_CALL:
-	case BPF_MOD_JUMP_TO_JUMP:
-		if (memcmp(ip, old_insn, X86_PATCHABLE_SIZE))
-			goto out;
-		text_poke_bp(ip, new_insn, X86_PATCHABLE_SIZE, NULL);
-		break;
-	case BPF_MOD_CALL_TO_NOP:
-	case BPF_MOD_JUMP_TO_NOP:
-		if (memcmp(ip, old_insn, X86_PATCHABLE_SIZE))
-			goto out;
-		text_poke_bp(ip, ideal_nops[NOP_ATOMIC5], X86_PATCHABLE_SIZE,
-			     NULL);
-		break;
-	}
-	ret = 0;
-out:
-	mutex_unlock(&text_mutex);
-	return ret;
-}
-
 static bool ex_handler_bpf(const struct exception_table_entry *x,
 			   struct pt_regs *regs, int trapnr,
 			   unsigned long error_code, unsigned long fault_addr)
@@ -1041,7 +1098,11 @@ xadd:			if (is_imm8(insn->off))
 			break;
 
 		case BPF_JMP | BPF_TAIL_CALL:
-			emit_bpf_tail_call(&prog);
+			if (imm32)
+				emit_bpf_tail_call_direct(&bpf_prog->aux->poke_tab[imm32 - 1],
+							  &prog, addrs[i], image);
+			else
+				emit_bpf_tail_call_indirect(&prog);
 			break;
 
 			/* cond jump */
@@ -1610,6 +1671,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 		prog->bpf_func = (void *)image;
 		prog->jited = 1;
 		prog->jited_len = proglen;
+		fixup_bpf_tail_call_direct(prog);
 	} else {
 		prog = orig_prog;
 	}
-- 
2.21.0



* [PATCH rfc bpf-next 8/8] bpf: constant map key tracking for prog array pokes
  2019-11-15  1:03 [PATCH rfc bpf-next 0/8] Optimize BPF tail calls for direct jumps Daniel Borkmann
                   ` (6 preceding siblings ...)
  2019-11-15  1:04 ` [PATCH rfc bpf-next 7/8] bpf, x86: emit patchable direct jump as tail call Daniel Borkmann
@ 2019-11-15  1:04 ` Daniel Borkmann
  2019-11-15  4:29   ` Alexei Starovoitov
  2019-11-18 18:11   ` Andrii Nakryiko
  7 siblings, 2 replies; 28+ messages in thread
From: Daniel Borkmann @ 2019-11-15  1:04 UTC (permalink / raw)
  To: ast; +Cc: john.fastabend, netdev, bpf, Daniel Borkmann

Add tracking of constant keys into tail call maps. The signature of
bpf_tail_call_proto is that arg1 is ctx, arg2 map pointer and arg3
is an index key. The direct call approach for tail calls can be enabled
if the verifier asserted that for all branches leading to the tail call
helper invocation, the map pointer and index key were both constant
and the same. Tracking of map pointers we already do from prior work
via c93552c443eb ("bpf: properly enforce index mask to prevent out-of-bounds
speculation") and 09772d92cd5a ("bpf: avoid retpoline for lookup/update/
delete calls on maps"). Given the tail call map index key is not on
stack but directly in the register, we can add a similar tracking approach
and later in fixup_bpf_calls() add a poke descriptor to the prog's poke_tab
with the relevant information for the JITing phase. We internally reuse
insn->imm for the rewritten BPF_JMP | BPF_TAIL_CALL instruction in order
to point into the prog's poke_tab and keep insn->imm == 0 as an indicator
that current indirect tail call emission must be used.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
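To give an idea what ends up as a trackable constant key, a sketch in
BPF C (names made up): the first call site keeps a single constant key
on all paths and becomes eligible for the direct jump, the second one
is reachable with two different constants and therefore gets its key
state poisoned, keeping the indirect tail call:

  /* Case 1: single constant key on all paths, direct jump eligible. */
  bpf_tail_call(skb, &jmp_table, 3);

  /* Case 2: same call site seen with key 1 on one path and key 2 on
   * another, key state gets poisoned, indirect tail call is kept.
   */
  __u32 key = skb->len > 128 ? 1 : 2;

  bpf_tail_call(skb, &jmp_table, key);
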
 include/linux/bpf_verifier.h |  1 +
 kernel/bpf/verifier.c        | 98 ++++++++++++++++++++++++++++++++++++
 2 files changed, 99 insertions(+)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index cdd08bf0ec06..f494f0c9ac13 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -301,6 +301,7 @@ struct bpf_insn_aux_data {
 			u32 map_off;		/* offset from value base address */
 		};
 	};
+	u64 key_state; /* constant key tracking for maps */
 	int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
 	int sanitize_stack_off; /* stack slot to be cleared */
 	bool seen; /* this insn was processed by the verifier */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e9dc95a18d44..48d5c9030d60 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -171,6 +171,9 @@ struct bpf_verifier_stack_elem {
 #define BPF_COMPLEXITY_LIMIT_JMP_SEQ	8192
 #define BPF_COMPLEXITY_LIMIT_STATES	64
 
+#define BPF_MAP_KEY_POISON	(1ULL << 63)
+#define BPF_MAP_KEY_SEEN	(1ULL << 62)
+
 #define BPF_MAP_PTR_UNPRIV	1UL
 #define BPF_MAP_PTR_POISON	((void *)((0xeB9FUL << 1) +	\
 					  POISON_POINTER_DELTA))
@@ -195,6 +198,29 @@ static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux,
 			 (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL);
 }
 
+static bool bpf_map_key_poisoned(const struct bpf_insn_aux_data *aux)
+{
+	return aux->key_state & BPF_MAP_KEY_POISON;
+}
+
+static bool bpf_map_key_unseen(const struct bpf_insn_aux_data *aux)
+{
+	return !(aux->key_state & BPF_MAP_KEY_SEEN);
+}
+
+static u64 bpf_map_key_immediate(const struct bpf_insn_aux_data *aux)
+{
+	return aux->key_state & ~BPF_MAP_KEY_SEEN;
+}
+
+static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state)
+{
+	bool poisoned = bpf_map_key_poisoned(aux);
+
+	aux->key_state = state | BPF_MAP_KEY_SEEN |
+			 (poisoned ? BPF_MAP_KEY_POISON : 0ULL);
+}
+
 struct bpf_call_arg_meta {
 	struct bpf_map *map_ptr;
 	bool raw_mode;
@@ -4088,6 +4114,37 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
 	return 0;
 }
 
+static int
+record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
+		int func_id, int insn_idx)
+{
+	struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
+	struct bpf_reg_state *regs = cur_regs(env), *reg;
+	struct tnum range = tnum_range(0, U32_MAX);
+	struct bpf_map *map = meta->map_ptr;
+	u64 val;
+
+	if (func_id != BPF_FUNC_tail_call)
+		return 0;
+	if (!map || map->map_type != BPF_MAP_TYPE_PROG_ARRAY) {
+		verbose(env, "kernel subsystem misconfigured verifier\n");
+		return -EINVAL;
+	}
+
+	reg = &regs[BPF_REG_3];
+	if (!register_is_const(reg) || !tnum_in(range, reg->var_off)) {
+		bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
+		return 0;
+	}
+
+	val = reg->var_off.value;
+	if (bpf_map_key_unseen(aux))
+		bpf_map_key_store(aux, val);
+	else if (bpf_map_key_immediate(aux) != val)
+		bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
+	return 0;
+}
+
 static int check_reference_leak(struct bpf_verifier_env *env)
 {
 	struct bpf_func_state *state = cur_func(env);
@@ -4162,6 +4219,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 	if (err)
 		return err;
 
+	err = record_func_key(env, &meta, func_id, insn_idx);
+	if (err)
+		return err;
+
 	/* Mark slots with STACK_MISC in case of raw mode, stack offset
 	 * is inferred from register state.
 	 */
@@ -9202,6 +9263,43 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			insn->code = BPF_JMP | BPF_TAIL_CALL;
 
 			aux = &env->insn_aux_data[i + delta];
+			if (prog->jit_requested &&
+			    !bpf_map_key_poisoned(aux) &&
+			    !bpf_map_ptr_poisoned(aux) &&
+			    !bpf_map_ptr_unpriv(aux)) {
+				struct bpf_jit_poke_descriptor desc;
+				u32 map_key;
+				int ret;
+
+				map_key = bpf_map_key_immediate(aux);
+				map_ptr = BPF_MAP_PTR(aux->map_state);
+
+				if (map_key >= map_ptr->max_entries)
+					continue;
+				if (!map_ptr->ops->map_poke_track) {
+					verbose(env, "bpf verifier is misconfigured\n");
+					return -EINVAL;
+				}
+
+				memset(&desc, 0, sizeof(desc));
+				desc.reason = BPF_POKE_REASON_TAIL_CALL;
+				desc.tc.map = map_ptr;
+				desc.tc.key = map_key;
+
+				ret = bpf_jit_add_poke_descriptor(prog, &desc);
+				if (ret < 0) {
+					verbose(env, "adding tail call poke descriptor failed\n");
+					return ret;
+				}
+
+				insn->imm = ret + 1;
+
+				ret = map_ptr->ops->map_poke_track(map_ptr, prog->aux);
+				if (ret < 0) {
+					verbose(env, "tracking tail call prog failed\n");
+					return ret;
+				}
+			}
 			if (!bpf_map_ptr_unpriv(aux))
 				continue;
 
-- 
2.21.0



* Re: [PATCH rfc bpf-next 7/8] bpf, x86: emit patchable direct jump as tail call
  2019-11-15  1:04 ` [PATCH rfc bpf-next 7/8] bpf, x86: emit patchable direct jump as tail call Daniel Borkmann
@ 2019-11-15  3:23   ` Alexei Starovoitov
  2019-11-15  7:27     ` Daniel Borkmann
  0 siblings, 1 reply; 28+ messages in thread
From: Alexei Starovoitov @ 2019-11-15  3:23 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: ast, john.fastabend, netdev, bpf

On Fri, Nov 15, 2019 at 02:04:01AM +0100, Daniel Borkmann wrote:
> for later modifications. In ii) fixup_bpf_tail_call_direct() walks
> over the prog's poke_tab, locks the tail call map's poke_mutex to
> prevent parallel updates and patches in the right locations via
...
> @@ -1610,6 +1671,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
>  		prog->bpf_func = (void *)image;
>  		prog->jited = 1;
>  		prog->jited_len = proglen;
> +		fixup_bpf_tail_call_direct(prog);

Why not move fixup_bpf_tail_call_direct() just before
bpf_jit_binary_lock_ro() and use a simple memcpy instead of text_poke?

imo this logic in patch 7:
case BPF_JMP | BPF_TAIL_CALL:
+   if (imm32)
+            emit_bpf_tail_call_direct(&bpf_prog->aux->poke_tab[imm32 - 1],
would have been easier to understand if patches 7 and 8 were swapped.


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH rfc bpf-next 8/8] bpf: constant map key tracking for prog array pokes
  2019-11-15  1:04 ` [PATCH rfc bpf-next 8/8] bpf: constant map key tracking for prog array pokes Daniel Borkmann
@ 2019-11-15  4:29   ` Alexei Starovoitov
  2019-11-15  7:13     ` Daniel Borkmann
  2019-11-18 18:11   ` Andrii Nakryiko
  1 sibling, 1 reply; 28+ messages in thread
From: Alexei Starovoitov @ 2019-11-15  4:29 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: ast, john.fastabend, netdev, bpf

On Fri, Nov 15, 2019 at 02:04:02AM +0100, Daniel Borkmann wrote:
> Add tracking of constant keys into tail call maps. The signature of
> bpf_tail_call_proto is that arg1 is ctx, arg2 map pointer and arg3
> is a index key. The direct call approach for tail calls can be enabled
> if the verifier asserted that for all branches leading to the tail call
> helper invocation, the map pointer and index key were both constant
> and the same. Tracking of map pointers we already do from prior work
> via c93552c443eb ("bpf: properly enforce index mask to prevent out-of-bounds
> speculation") and 09772d92cd5a ("bpf: avoid retpoline for lookup/update/
> delete calls on maps"). Given the tail call map index key is not on
> stack but directly in the register, we can add similar tracking approach
> and later in fixup_bpf_calls() add a poke descriptor to the progs poke_tab
> with the relevant information for the JITing phase. We internally reuse
> insn->imm for the rewritten BPF_JMP | BPF_TAIL_CALL instruction in order
> to point into the prog's poke_tab and keep insn->imm == 0 as indicator
> that current indirect tail call emission must be used.
> 
> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> ---
>  include/linux/bpf_verifier.h |  1 +
>  kernel/bpf/verifier.c        | 98 ++++++++++++++++++++++++++++++++++++
>  2 files changed, 99 insertions(+)
> 
> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
> index cdd08bf0ec06..f494f0c9ac13 100644
> --- a/include/linux/bpf_verifier.h
> +++ b/include/linux/bpf_verifier.h
> @@ -301,6 +301,7 @@ struct bpf_insn_aux_data {
>  			u32 map_off;		/* offset from value base address */
>  		};
>  	};
> +	u64 key_state; /* constant key tracking for maps */

may be map_key_state ?
key_state is a bit ambiguous in the bpf_insn_aux_data.

> +static int
> +record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
> +		int func_id, int insn_idx)
> +{
> +	struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
> +	struct bpf_reg_state *regs = cur_regs(env), *reg;
> +	struct tnum range = tnum_range(0, U32_MAX);
> +	struct bpf_map *map = meta->map_ptr;
> +	u64 val;
> +
> +	if (func_id != BPF_FUNC_tail_call)
> +		return 0;
> +	if (!map || map->map_type != BPF_MAP_TYPE_PROG_ARRAY) {
> +		verbose(env, "kernel subsystem misconfigured verifier\n");
> +		return -EINVAL;
> +	}
> +
> +	reg = &regs[BPF_REG_3];
> +	if (!register_is_const(reg) || !tnum_in(range, reg->var_off)) {
> +		bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
> +		return 0;
> +	}
> +
> +	val = reg->var_off.value;
> +	if (bpf_map_key_unseen(aux))
> +		bpf_map_key_store(aux, val);
> +	else if (bpf_map_key_immediate(aux) != val)
> +		bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
> +	return 0;
> +}

I think this analysis is very useful in other cases as well. Could you
generalize it for array map lookups ? The key used in bpf_map_lookup_elem() for
arrays is often constant. In such cases we can optimize array_map_gen_lookup()
into absolute pointer. It will be possible to do
if (idx < max_entries) ptr += idx * elem_size;
during verification instead of runtime and the whole
bpf_map_lookup_elem(map, &key); will become single instruction that
assigns &array[idx] into R0.
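
Roughly, just to illustrate the idea (a sketch only, not code from this
series; assumes the verifier proved a constant idx with idx < max_entries,
and the names here are made up):

  struct bpf_array *array = container_of(map_ptr, struct bpf_array, map);
  u64 addr = (u64)(unsigned long)(array->value + array->elem_size * idx);
  struct bpf_insn load_elem[] = {
          /* R0 = &array[idx], replacing the bpf_map_lookup_elem() call */
          BPF_LD_IMM64(BPF_REG_0, addr),
  };
  /* ...spliced in via bpf_patch_insn_data() during the fixup pass... */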


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH rfc bpf-next 8/8] bpf: constant map key tracking for prog array pokes
  2019-11-15  4:29   ` Alexei Starovoitov
@ 2019-11-15  7:13     ` Daniel Borkmann
  2019-11-15 16:33       ` Alexei Starovoitov
  0 siblings, 1 reply; 28+ messages in thread
From: Daniel Borkmann @ 2019-11-15  7:13 UTC (permalink / raw)
  To: Alexei Starovoitov; +Cc: ast, john.fastabend, netdev, bpf

On Thu, Nov 14, 2019 at 08:29:41PM -0800, Alexei Starovoitov wrote:
> On Fri, Nov 15, 2019 at 02:04:02AM +0100, Daniel Borkmann wrote:
> > Add tracking of constant keys into tail call maps. The signature of
> > bpf_tail_call_proto is that arg1 is ctx, arg2 map pointer and arg3
> > is a index key. The direct call approach for tail calls can be enabled
> > if the verifier asserted that for all branches leading to the tail call
> > helper invocation, the map pointer and index key were both constant
> > and the same. Tracking of map pointers we already do from prior work
> > via c93552c443eb ("bpf: properly enforce index mask to prevent out-of-bounds
> > speculation") and 09772d92cd5a ("bpf: avoid retpoline for lookup/update/
> > delete calls on maps"). Given the tail call map index key is not on
> > stack but directly in the register, we can add similar tracking approach
> > and later in fixup_bpf_calls() add a poke descriptor to the progs poke_tab
> > with the relevant information for the JITing phase. We internally reuse
> > insn->imm for the rewritten BPF_JMP | BPF_TAIL_CALL instruction in order
> > to point into the prog's poke_tab and keep insn->imm == 0 as indicator
> > that current indirect tail call emission must be used.
> > 
> > Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> > ---
> >  include/linux/bpf_verifier.h |  1 +
> >  kernel/bpf/verifier.c        | 98 ++++++++++++++++++++++++++++++++++++
> >  2 files changed, 99 insertions(+)
> > 
> > diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
> > index cdd08bf0ec06..f494f0c9ac13 100644
> > --- a/include/linux/bpf_verifier.h
> > +++ b/include/linux/bpf_verifier.h
> > @@ -301,6 +301,7 @@ struct bpf_insn_aux_data {
> >  			u32 map_off;		/* offset from value base address */
> >  		};
> >  	};
> > +	u64 key_state; /* constant key tracking for maps */
> 
> may be map_key_state ?
> key_state is a bit ambiguous in the bpf_insn_aux_data.

Could be, alternatively could also be idx_state or map_idx_state since
it's really just for u32 type key indices.

> > +static int
> > +record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
> > +		int func_id, int insn_idx)
> > +{
> > +	struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
> > +	struct bpf_reg_state *regs = cur_regs(env), *reg;
> > +	struct tnum range = tnum_range(0, U32_MAX);
> > +	struct bpf_map *map = meta->map_ptr;
> > +	u64 val;
> > +
> > +	if (func_id != BPF_FUNC_tail_call)
> > +		return 0;
> > +	if (!map || map->map_type != BPF_MAP_TYPE_PROG_ARRAY) {
> > +		verbose(env, "kernel subsystem misconfigured verifier\n");
> > +		return -EINVAL;
> > +	}
> > +
> > +	reg = &regs[BPF_REG_3];
> > +	if (!register_is_const(reg) || !tnum_in(range, reg->var_off)) {
> > +		bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
> > +		return 0;
> > +	}
> > +
> > +	val = reg->var_off.value;
> > +	if (bpf_map_key_unseen(aux))
> > +		bpf_map_key_store(aux, val);
> > +	else if (bpf_map_key_immediate(aux) != val)
> > +		bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
> > +	return 0;
> > +}
> 
> I think this analysis is very useful in other cases as well. Could you
> generalize it for array map lookups ? The key used in bpf_map_lookup_elem() for
> arrays is often constant. In such cases we can optimize array_map_gen_lookup()
> into absolute pointer. It will be possible to do
> if (idx < max_entries) ptr += idx * elem_size;
> during verification instead of runtime and the whole
> bpf_map_lookup_elem(map, &key); will become single instruction that
> assigns &array[idx] into R0.

Was thinking exactly the same. ;-) I started coding this yesterday night [0],
but then had the (in hindsight obvious) realization that as-is the key_state
holds the address but not the index for plain array map lookup. Hence I'd need
to go a step further there to look at the const stack content. Will proceed on
this as a separate set on top.

  [0] https://git.kernel.org/pub/scm/linux/kernel/git/dborkman/bpf.git/commit/?h=pr/bpf-tail-call-rebased2&id=b86b7eae4646d8233e3e9058e68fef27536bf0c4

Thanks,
Daniel

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH rfc bpf-next 7/8] bpf, x86: emit patchable direct jump as tail call
  2019-11-15  3:23   ` Alexei Starovoitov
@ 2019-11-15  7:27     ` Daniel Borkmann
  0 siblings, 0 replies; 28+ messages in thread
From: Daniel Borkmann @ 2019-11-15  7:27 UTC (permalink / raw)
  To: Alexei Starovoitov; +Cc: ast, john.fastabend, netdev, bpf

On Thu, Nov 14, 2019 at 07:23:46PM -0800, Alexei Starovoitov wrote:
> On Fri, Nov 15, 2019 at 02:04:01AM +0100, Daniel Borkmann wrote:
> > for later modifications. In ii) fixup_bpf_tail_call_direct() walks
> > over the progs poke_tab, locks the tail call maps poke_mutex to
> > prevent from parallel updates and patches in the right locations via
> ...
> > @@ -1610,6 +1671,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
> >  		prog->bpf_func = (void *)image;
> >  		prog->jited = 1;
> >  		prog->jited_len = proglen;
> > +		fixup_bpf_tail_call_direct(prog);
> 
> Why not to move fixup_bpf_tail_call_direct() just before
> bpf_jit_binary_lock_ro() and use simple memcpy instead of text_poke ?

Thinking about it, I'll move it right into the branch before we lock ...

  if (!prog->is_func || extra_pass) {
    bpf_tail_call_fixup_direct(prog);
    bpf_jit_binary_lock_ro(header);
  } else { [...]

... and I'll add a __bpf_arch_text_poke() handler which passes in
a plain memcpy() callback instead of text_poke_bp(), so it keeps reusing
most of the logic/checks from __bpf_arch_text_poke() which we also have
at a later point once the program is live.
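
Roughly along these lines as a sketch only (the exact callback signature
and names here are made up, not taken from the actual follow-up):

  /* Writer used at JIT time while the image is still writable; once the
   * prog is live, a text_poke_bp() based writer gets passed in instead.
   */
  static void jit_plain_copy(void *ip, const void *insn, size_t len)
  {
          memcpy(ip, insn, len);
  }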

> imo this logic in patch 7:
> case BPF_JMP | BPF_TAIL_CALL:
> +   if (imm32)
> +            emit_bpf_tail_call_direct(&bpf_prog->aux->poke_tab[imm32 - 1],
> would have been easier to understand if patch 7 and 8 were swapped.

Makes sense, it's totally fine to swap them, so I'll go do that. Thanks
for the feedback!

Cheers,
Daniel

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH rfc bpf-next 8/8] bpf: constant map key tracking for prog array pokes
  2019-11-15  7:13     ` Daniel Borkmann
@ 2019-11-15 16:33       ` Alexei Starovoitov
  2019-11-15 16:49         ` Daniel Borkmann
  0 siblings, 1 reply; 28+ messages in thread
From: Alexei Starovoitov @ 2019-11-15 16:33 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: ast, john.fastabend, netdev, bpf

On Fri, Nov 15, 2019 at 08:13:58AM +0100, Daniel Borkmann wrote:
> On Thu, Nov 14, 2019 at 08:29:41PM -0800, Alexei Starovoitov wrote:
> > On Fri, Nov 15, 2019 at 02:04:02AM +0100, Daniel Borkmann wrote:
> > > Add tracking of constant keys into tail call maps. The signature of
> > > bpf_tail_call_proto is that arg1 is ctx, arg2 map pointer and arg3
> > > is a index key. The direct call approach for tail calls can be enabled
> > > if the verifier asserted that for all branches leading to the tail call
> > > helper invocation, the map pointer and index key were both constant
> > > and the same. Tracking of map pointers we already do from prior work
> > > via c93552c443eb ("bpf: properly enforce index mask to prevent out-of-bounds
> > > speculation") and 09772d92cd5a ("bpf: avoid retpoline for lookup/update/
> > > delete calls on maps"). Given the tail call map index key is not on
> > > stack but directly in the register, we can add similar tracking approach
> > > and later in fixup_bpf_calls() add a poke descriptor to the progs poke_tab
> > > with the relevant information for the JITing phase. We internally reuse
> > > insn->imm for the rewritten BPF_JMP | BPF_TAIL_CALL instruction in order
> > > to point into the prog's poke_tab and keep insn->imm == 0 as indicator
> > > that current indirect tail call emission must be used.
> > > 
> > > Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> > > ---
> > >  include/linux/bpf_verifier.h |  1 +
> > >  kernel/bpf/verifier.c        | 98 ++++++++++++++++++++++++++++++++++++
> > >  2 files changed, 99 insertions(+)
> > > 
> > > diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
> > > index cdd08bf0ec06..f494f0c9ac13 100644
> > > --- a/include/linux/bpf_verifier.h
> > > +++ b/include/linux/bpf_verifier.h
> > > @@ -301,6 +301,7 @@ struct bpf_insn_aux_data {
> > >  			u32 map_off;		/* offset from value base address */
> > >  		};
> > >  	};
> > > +	u64 key_state; /* constant key tracking for maps */
> > 
> > may be map_key_state ?
> > key_state is a bit ambiguous in the bpf_insn_aux_data.
> 
> Could be, alternatively could also be idx_state or map_idx_state since
> it's really just for u32 type key indices.
> 
> > > +static int
> > > +record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
> > > +		int func_id, int insn_idx)
> > > +{
> > > +	struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
> > > +	struct bpf_reg_state *regs = cur_regs(env), *reg;
> > > +	struct tnum range = tnum_range(0, U32_MAX);
> > > +	struct bpf_map *map = meta->map_ptr;
> > > +	u64 val;
> > > +
> > > +	if (func_id != BPF_FUNC_tail_call)
> > > +		return 0;
> > > +	if (!map || map->map_type != BPF_MAP_TYPE_PROG_ARRAY) {
> > > +		verbose(env, "kernel subsystem misconfigured verifier\n");
> > > +		return -EINVAL;
> > > +	}
> > > +
> > > +	reg = &regs[BPF_REG_3];
> > > +	if (!register_is_const(reg) || !tnum_in(range, reg->var_off)) {
> > > +		bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
> > > +		return 0;
> > > +	}
> > > +
> > > +	val = reg->var_off.value;
> > > +	if (bpf_map_key_unseen(aux))
> > > +		bpf_map_key_store(aux, val);
> > > +	else if (bpf_map_key_immediate(aux) != val)
> > > +		bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
> > > +	return 0;
> > > +}
> > 
> > I think this analysis is very useful in other cases as well. Could you
> > generalize it for array map lookups ? The key used in bpf_map_lookup_elem() for
> > arrays is often constant. In such cases we can optimize array_map_gen_lookup()
> > into absolute pointer. It will be possible to do
> > if (idx < max_entries) ptr += idx * elem_size;
> > during verification instead of runtime and the whole
> > bpf_map_lookup_elem(map, &key); will become single instruction that
> > assigns &array[idx] into R0.
> 
> Was thinking exactly the same. ;-) I started coding this yesterday night [0],
> but then had the (in hindsight obvious) realization that as-is the key_state
> holds the address but not the index for plain array map lookup. Hence I'd need
> to go a step further there to look at the const stack content. Will proceed on
> this as a separate set on top.
> 
>   [0] https://git.kernel.org/pub/scm/linux/kernel/git/dborkman/bpf.git/commit/?h=pr/bpf-tail-call-rebased2&id=b86b7eae4646d8233e3e9058e68fef27536bf0c4

yeah. good point. For map_lookup it's obvious that the verifier needs to
compare both map ptr and *key, but that is the case for bpf_tail_call too, no?
It seems tracking 'key_state' only is not enough. Consider:
if (..)
  map = mapA;
else
  map = mapB;
bpf_tail_call(ctx, map, 1);

Maybe to generalize the logic the verifier should remember bpf_reg_state
instead of a specific part of it like u32? The verifier keeps
insn_aux_data[insn_idx].ptr_type to prevent incorrect ctx access. That can
also be generalized? Probably later, but conceptually it's the same category of
tracking that the verifier needs to do. For a bpf_map_lookup and bpf_tail_call
callsite it can remember the bpf_reg_state of r1,r2,r3. The bpf_reg_state should be
saved in insn_aux_data the first time the verifier goes through the callsite; then
every time the verifier goes through the callsite again, additional per-helper
logic is invoked. Like for bpf_tail_call it will check:
if (tnum_is_const(insn_aux_data[callsite]->r3_reg_state->var_off))
  // good. may be can optimize later.
and will use insn_aux_data[callsite]->r2_reg_state->map_ptr plus
insn_aux_data[callsite]->r3_reg_state->var_off to compute bpf_prog's jited address
inside that prog_array.
Similarly for bpf_map_lookup... check that r1_reg_state->map_ptr is the same map
for the saved insn_aux_data->r1_reg_state and for the current r1.
The r2_reg_state should be PTR_TO_STACK and that stack value should be a u32 const.
Should be a bit more generic and extensible... instead of the specific 'key_state'?
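
Loosely sketched (field names are made up purely for illustration, not a
concrete layout proposal):

  /* e.g. hung off bpf_insn_aux_data for helper call sites: */
  struct bpf_callsite_args {
          struct bpf_reg_state r1, r2, r3; /* arg regs saved on first pass */
          bool recorded;
  };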


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH rfc bpf-next 8/8] bpf: constant map key tracking for prog array pokes
  2019-11-15 16:33       ` Alexei Starovoitov
@ 2019-11-15 16:49         ` Daniel Borkmann
  0 siblings, 0 replies; 28+ messages in thread
From: Daniel Borkmann @ 2019-11-15 16:49 UTC (permalink / raw)
  To: Alexei Starovoitov; +Cc: ast, john.fastabend, netdev, bpf

On 11/15/19 5:33 PM, Alexei Starovoitov wrote:
> On Fri, Nov 15, 2019 at 08:13:58AM +0100, Daniel Borkmann wrote:
>> On Thu, Nov 14, 2019 at 08:29:41PM -0800, Alexei Starovoitov wrote:
>>> On Fri, Nov 15, 2019 at 02:04:02AM +0100, Daniel Borkmann wrote:
>>>> Add tracking of constant keys into tail call maps. The signature of
>>>> bpf_tail_call_proto is that arg1 is ctx, arg2 map pointer and arg3
>>>> is a index key. The direct call approach for tail calls can be enabled
>>>> if the verifier asserted that for all branches leading to the tail call
>>>> helper invocation, the map pointer and index key were both constant
>>>> and the same. Tracking of map pointers we already do from prior work
>>>> via c93552c443eb ("bpf: properly enforce index mask to prevent out-of-bounds
>>>> speculation") and 09772d92cd5a ("bpf: avoid retpoline for lookup/update/
>>>> delete calls on maps"). Given the tail call map index key is not on
>>>> stack but directly in the register, we can add similar tracking approach
>>>> and later in fixup_bpf_calls() add a poke descriptor to the progs poke_tab
>>>> with the relevant information for the JITing phase. We internally reuse
>>>> insn->imm for the rewritten BPF_JMP | BPF_TAIL_CALL instruction in order
>>>> to point into the prog's poke_tab and keep insn->imm == 0 as indicator
>>>> that current indirect tail call emission must be used.
>>>>
>>>> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
>>>> ---
>>>>   include/linux/bpf_verifier.h |  1 +
>>>>   kernel/bpf/verifier.c        | 98 ++++++++++++++++++++++++++++++++++++
>>>>   2 files changed, 99 insertions(+)
>>>>
>>>> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
>>>> index cdd08bf0ec06..f494f0c9ac13 100644
>>>> --- a/include/linux/bpf_verifier.h
>>>> +++ b/include/linux/bpf_verifier.h
>>>> @@ -301,6 +301,7 @@ struct bpf_insn_aux_data {
>>>>   			u32 map_off;		/* offset from value base address */
>>>>   		};
>>>>   	};
>>>> +	u64 key_state; /* constant key tracking for maps */
>>>
>>> may be map_key_state ?
>>> key_state is a bit ambiguous in the bpf_insn_aux_data.
>>
>> Could be, alternatively could also be idx_state or map_idx_state since
>> it's really just for u32 type key indices.
>>
>>>> +static int
>>>> +record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
>>>> +		int func_id, int insn_idx)
>>>> +{
>>>> +	struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
>>>> +	struct bpf_reg_state *regs = cur_regs(env), *reg;
>>>> +	struct tnum range = tnum_range(0, U32_MAX);
>>>> +	struct bpf_map *map = meta->map_ptr;
>>>> +	u64 val;
>>>> +
>>>> +	if (func_id != BPF_FUNC_tail_call)
>>>> +		return 0;
>>>> +	if (!map || map->map_type != BPF_MAP_TYPE_PROG_ARRAY) {
>>>> +		verbose(env, "kernel subsystem misconfigured verifier\n");
>>>> +		return -EINVAL;
>>>> +	}
>>>> +
>>>> +	reg = &regs[BPF_REG_3];
>>>> +	if (!register_is_const(reg) || !tnum_in(range, reg->var_off)) {
>>>> +		bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
>>>> +		return 0;
>>>> +	}
>>>> +
>>>> +	val = reg->var_off.value;
>>>> +	if (bpf_map_key_unseen(aux))
>>>> +		bpf_map_key_store(aux, val);
>>>> +	else if (bpf_map_key_immediate(aux) != val)
>>>> +		bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
>>>> +	return 0;
>>>> +}
>>>
>>> I think this analysis is very useful in other cases as well. Could you
>>> generalize it for array map lookups ? The key used in bpf_map_lookup_elem() for
>>> arrays is often constant. In such cases we can optimize array_map_gen_lookup()
>>> into absolute pointer. It will be possible to do
>>> if (idx < max_entries) ptr += idx * elem_size;
>>> during verification instead of runtime and the whole
>>> bpf_map_lookup_elem(map, &key); will become single instruction that
>>> assigns &array[idx] into R0.
>>
>> Was thinking exactly the same. ;-) I started coding this yesterday night [0],
>> but then had the (in hindsight obvious) realization that as-is the key_state
>> holds the address but not the index for plain array map lookup. Hence I'd need
>> to go a step further there to look at the const stack content. Will proceed on
>> this as a separate set on top.
>>
>>    [0] https://git.kernel.org/pub/scm/linux/kernel/git/dborkman/bpf.git/commit/?h=pr/bpf-tail-call-rebased2&id=b86b7eae4646d8233e3e9058e68fef27536bf0c4
> 
> yeah. good point. For map_lookup it's obvious that the verifier needs to
> compare both map ptr and *key, but that is the case for bpf_tail_call too, no?

It's covered in this patch, more below.

> It seems tracking 'key_state' only is not enough. Consider:
> if (..)
>    map = mapA;
> else
>    map = mapB;
> bpf_tail_call(ctx, map, 1);
> 
> May be to generalize the logic the verifier should remember bpf_reg_state
> instead of specific part of it like u32 ? The verifier keeps
> insn_aux_data[insn_idx].ptr_type; to prevent incorrect ctx access. That can
> also be generalized? Probably later, but conceptually it's the same category of
> tracking that the verifier needs to do.

In fixup_bpf_calls(), it checks for !bpf_map_key_poisoned(aux) &&
!bpf_map_ptr_poisoned(aux), so coming from different paths, this can
only be enabled if mapA == mapB in your above example.

> For bpf_map_lookup and bpf_tail_call
> callsite it can remember bpf_reg_state of r1,r2,r3. The bpf_reg_state should be
> saved in insn_aux_data the first time the verifier goes through the callsite than
> everytime the verifier goes through the callsite again additional per-helper
> logic is invoked. Like for bpf_tail_call it will check:
> if (tnum_is_const(insn_aux_data[callsite]->r3_reg_state->var_off))
>    // good. may be can optimize later.
> and will use insn_aux_data[callsite]->r2_reg_state->map_ptr plus
> insn_aux_data[callsite]->r3_reg_state->var_off to compute bpf_prog's jited address
> inside that prog_array.

Yeah, that's what I'm doing for the tail call case.

> Similarly for bpf_map_lookup... r1_reg_state->map_ptr is the same map
> for saved insn_aux_data->r1_reg_state and for current->r1.
> The r2_reg_state should be PTR_TO_STACK and that stack value should be u32 const.
> Should be a bit more generic and extensible... instead of specific 'key_state' ?

Remembering all of the reg state could be an option to make it more generic
for the case of also covering plain array map lookup, though it might come with
more memory overhead than necessary. I'll experiment a bit with the various
options for improving that patch [0] from above.

Thanks,
Daniel

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH rfc bpf-next 4/8] bpf: move owner type,jited info into array auxillary data
  2019-11-15  1:03 ` [PATCH rfc bpf-next 4/8] bpf: move owner type,jited info into array auxillary data Daniel Borkmann
@ 2019-11-15 23:19   ` Andrii Nakryiko
  0 siblings, 0 replies; 28+ messages in thread
From: Andrii Nakryiko @ 2019-11-15 23:19 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: Alexei Starovoitov, john fastabend, Networking, bpf

On Thu, Nov 14, 2019 at 5:04 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> We're going to extend this with further information which is only
> relevant for prog array at this point. Given this info is not used
> in critical path, move it into its own structure such that the main
> array map structure can be kept on diet.
>
> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> ---

Looks good.

Acked-by: Andrii Nakryiko <andriin@fb.com>


>  include/linux/bpf.h     | 18 +++++++++++-------
>  kernel/bpf/arraymap.c   | 32 ++++++++++++++++++++++++++++++--
>  kernel/bpf/core.c       | 11 +++++------
>  kernel/bpf/map_in_map.c |  5 ++---
>  kernel/bpf/syscall.c    | 16 ++++++----------
>  5 files changed, 54 insertions(+), 28 deletions(-)
>

[...]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH rfc bpf-next 1/8] bpf, x86: generalize and extend bpf_arch_text_poke for direct jumps
  2019-11-15  1:03 ` [PATCH rfc bpf-next 1/8] bpf, x86: generalize and extend bpf_arch_text_poke " Daniel Borkmann
@ 2019-11-15 23:22   ` Andrii Nakryiko
  2019-11-15 23:42     ` Daniel Borkmann
  0 siblings, 1 reply; 28+ messages in thread
From: Andrii Nakryiko @ 2019-11-15 23:22 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: Alexei Starovoitov, john fastabend, Networking, bpf

On Thu, Nov 14, 2019 at 5:04 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> Add BPF_MOD_{NOP_TO_JUMP,JUMP_TO_JUMP,JUMP_TO_NOP} patching for x86
> JIT in order to be able to patch direct jumps or nop them out. We need
> this facility in order to patch tail call jumps and in later work also
> BPF static keys.
>
> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> ---

just naming nits, looks good otherwise


>  arch/x86/net/bpf_jit_comp.c | 64 ++++++++++++++++++++++++++-----------
>  include/linux/bpf.h         |  6 ++++
>  2 files changed, 52 insertions(+), 18 deletions(-)
>
> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> index 2e586f579945..66921f2aeece 100644
> --- a/arch/x86/net/bpf_jit_comp.c
> +++ b/arch/x86/net/bpf_jit_comp.c
> @@ -203,8 +203,9 @@ struct jit_context {
>  /* Maximum number of bytes emitted while JITing one eBPF insn */
>  #define BPF_MAX_INSN_SIZE      128
>  #define BPF_INSN_SAFETY                64
> -/* number of bytes emit_call() needs to generate call instruction */
> -#define X86_CALL_SIZE          5
> +
> +/* Number of bytes emit_patchable() needs to generate instructions */
> +#define X86_PATCHABLE_SIZE     5
>
>  #define PROLOGUE_SIZE          25
>
> @@ -215,7 +216,7 @@ struct jit_context {
>  static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
>  {
>         u8 *prog = *pprog;
> -       int cnt = X86_CALL_SIZE;
> +       int cnt = X86_PATCHABLE_SIZE;
>
>         /* BPF trampoline can be made to work without these nops,
>          * but let's waste 5 bytes for now and optimize later
> @@ -480,64 +481,91 @@ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
>         *pprog = prog;
>  }
>
> -static int emit_call(u8 **pprog, void *func, void *ip)
> +static int emit_patchable(u8 **pprog, void *func, void *ip, u8 b1)

I'd strongly prefer opcode instead of b1 :) also would emit_patch() be
a terrible name?

>  {
>         u8 *prog = *pprog;
>         int cnt = 0;
>         s64 offset;
>

[...]

>         case BPF_MOD_CALL_TO_NOP:
> -               if (memcmp(ip, old_insn, X86_CALL_SIZE))
> +       case BPF_MOD_JUMP_TO_NOP:
> +               if (memcmp(ip, old_insn, X86_PATCHABLE_SIZE))
>                         goto out;
> -               text_poke_bp(ip, ideal_nops[NOP_ATOMIC5], X86_CALL_SIZE, NULL);
> +               text_poke_bp(ip, ideal_nops[NOP_ATOMIC5], X86_PATCHABLE_SIZE,


maybe keep it shorter with X86_PATCH_SIZE?

> +                            NULL);
>                 break;
>         }
>         ret = 0;

[...]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH rfc bpf-next 3/8] bpf: move bpf_free_used_maps into sleepable section
  2019-11-15  1:03 ` [PATCH rfc bpf-next 3/8] bpf: move bpf_free_used_maps into sleepable section Daniel Borkmann
@ 2019-11-15 23:32   ` Andrii Nakryiko
  0 siblings, 0 replies; 28+ messages in thread
From: Andrii Nakryiko @ 2019-11-15 23:32 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: Alexei Starovoitov, john fastabend, Networking, bpf

On Thu, Nov 14, 2019 at 5:04 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> We later on are going to need a sleepable context as opposed to plain
> RCU callback in order to untrack programs we need to poke at runtime
> and tracking as well as image update is performed under mutex.
>
> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> ---

LGTM.

Acked-by: Andrii Nakryiko <andriin@fb.com>

>  include/linux/bpf.h  |  4 ++++
>  kernel/bpf/core.c    | 23 +++++++++++++++++++++++
>  kernel/bpf/syscall.c | 20 --------------------
>  3 files changed, 27 insertions(+), 20 deletions(-)
>

[...]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH rfc bpf-next 1/8] bpf, x86: generalize and extend bpf_arch_text_poke for direct jumps
  2019-11-15 23:22   ` Andrii Nakryiko
@ 2019-11-15 23:42     ` Daniel Borkmann
  2019-11-16  0:01       ` Andrii Nakryiko
  0 siblings, 1 reply; 28+ messages in thread
From: Daniel Borkmann @ 2019-11-15 23:42 UTC (permalink / raw)
  To: Andrii Nakryiko; +Cc: Alexei Starovoitov, john fastabend, Networking, bpf

On 11/16/19 12:22 AM, Andrii Nakryiko wrote:
> On Thu, Nov 14, 2019 at 5:04 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>>
>> Add BPF_MOD_{NOP_TO_JUMP,JUMP_TO_JUMP,JUMP_TO_NOP} patching for x86
>> JIT in order to be able to patch direct jumps or nop them out. We need
>> this facility in order to patch tail call jumps and in later work also
>> BPF static keys.
>>
>> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
>> ---
> 
> just naming nits, looks good otherwise
> 
>>   arch/x86/net/bpf_jit_comp.c | 64 ++++++++++++++++++++++++++-----------
>>   include/linux/bpf.h         |  6 ++++
>>   2 files changed, 52 insertions(+), 18 deletions(-)
>>
>> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
>> index 2e586f579945..66921f2aeece 100644
>> --- a/arch/x86/net/bpf_jit_comp.c
>> +++ b/arch/x86/net/bpf_jit_comp.c
>> @@ -203,8 +203,9 @@ struct jit_context {
>>   /* Maximum number of bytes emitted while JITing one eBPF insn */
>>   #define BPF_MAX_INSN_SIZE      128
>>   #define BPF_INSN_SAFETY                64
>> -/* number of bytes emit_call() needs to generate call instruction */
>> -#define X86_CALL_SIZE          5
>> +
>> +/* Number of bytes emit_patchable() needs to generate instructions */
>> +#define X86_PATCHABLE_SIZE     5
>>
>>   #define PROLOGUE_SIZE          25
>>
>> @@ -215,7 +216,7 @@ struct jit_context {
>>   static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
>>   {
>>          u8 *prog = *pprog;
>> -       int cnt = X86_CALL_SIZE;
>> +       int cnt = X86_PATCHABLE_SIZE;
>>
>>          /* BPF trampoline can be made to work without these nops,
>>           * but let's waste 5 bytes for now and optimize later
>> @@ -480,64 +481,91 @@ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
>>          *pprog = prog;
>>   }
>>
>> -static int emit_call(u8 **pprog, void *func, void *ip)
>> +static int emit_patchable(u8 **pprog, void *func, void *ip, u8 b1)
> 
> I'd strongly prefer opcode instead of b1 :) also would emit_patch() be
> a terrible name?

Hmm, been using b1 since throughout the JIT we use u8 b1/b2/b3/.. for our
EMIT*() macros to denote the encoding positions. So I thought it would be
more conventional, but could also change to op if preferred.

>>   {
>>          u8 *prog = *pprog;
>>          int cnt = 0;
>>          s64 offset;
>>
> 
> [...]
> 
>>          case BPF_MOD_CALL_TO_NOP:
>> -               if (memcmp(ip, old_insn, X86_CALL_SIZE))
>> +       case BPF_MOD_JUMP_TO_NOP:
>> +               if (memcmp(ip, old_insn, X86_PATCHABLE_SIZE))
>>                          goto out;
>> -               text_poke_bp(ip, ideal_nops[NOP_ATOMIC5], X86_CALL_SIZE, NULL);
>> +               text_poke_bp(ip, ideal_nops[NOP_ATOMIC5], X86_PATCHABLE_SIZE,
> 
> maybe keep it shorter with X86_PATCH_SIZE?

Sure, then X86_PATCH_SIZE and emit_patch() it is.

>> +                            NULL);
>>                  break;
>>          }
>>          ret = 0;
> 
> [...]
> 


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH rfc bpf-next 1/8] bpf, x86: generalize and extend bpf_arch_text_poke for direct jumps
  2019-11-15 23:42     ` Daniel Borkmann
@ 2019-11-16  0:01       ` Andrii Nakryiko
  0 siblings, 0 replies; 28+ messages in thread
From: Andrii Nakryiko @ 2019-11-16  0:01 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: Alexei Starovoitov, john fastabend, Networking, bpf

On Fri, Nov 15, 2019 at 3:42 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 11/16/19 12:22 AM, Andrii Nakryiko wrote:
> > On Thu, Nov 14, 2019 at 5:04 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >>
> >> Add BPF_MOD_{NOP_TO_JUMP,JUMP_TO_JUMP,JUMP_TO_NOP} patching for x86
> >> JIT in order to be able to patch direct jumps or nop them out. We need
> >> this facility in order to patch tail call jumps and in later work also
> >> BPF static keys.
> >>
> >> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> >> ---
> >
> > just naming nits, looks good otherwise
> >
> >>   arch/x86/net/bpf_jit_comp.c | 64 ++++++++++++++++++++++++++-----------
> >>   include/linux/bpf.h         |  6 ++++
> >>   2 files changed, 52 insertions(+), 18 deletions(-)
> >>
> >> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> >> index 2e586f579945..66921f2aeece 100644
> >> --- a/arch/x86/net/bpf_jit_comp.c
> >> +++ b/arch/x86/net/bpf_jit_comp.c
> >> @@ -203,8 +203,9 @@ struct jit_context {
> >>   /* Maximum number of bytes emitted while JITing one eBPF insn */
> >>   #define BPF_MAX_INSN_SIZE      128
> >>   #define BPF_INSN_SAFETY                64
> >> -/* number of bytes emit_call() needs to generate call instruction */
> >> -#define X86_CALL_SIZE          5
> >> +
> >> +/* Number of bytes emit_patchable() needs to generate instructions */
> >> +#define X86_PATCHABLE_SIZE     5
> >>
> >>   #define PROLOGUE_SIZE          25
> >>
> >> @@ -215,7 +216,7 @@ struct jit_context {
> >>   static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
> >>   {
> >>          u8 *prog = *pprog;
> >> -       int cnt = X86_CALL_SIZE;
> >> +       int cnt = X86_PATCHABLE_SIZE;
> >>
> >>          /* BPF trampoline can be made to work without these nops,
> >>           * but let's waste 5 bytes for now and optimize later
> >> @@ -480,64 +481,91 @@ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
> >>          *pprog = prog;
> >>   }
> >>
> >> -static int emit_call(u8 **pprog, void *func, void *ip)
> >> +static int emit_patchable(u8 **pprog, void *func, void *ip, u8 b1)
> >
> > I'd strongly prefer opcode instead of b1 :) also would emit_patch() be
> > a terrible name?
>
> Hmm, been using b1 since throughout the JIT we use u8 b1/b2/b3/.. for our
> EMIT*() macros to denote the encoding positions. So I thought it would be
> more conventional, but could also change to op if preferred.

Well, I've been looking through text_poke_bp() recently; that one
consistently used opcode terminology. See for yourself, the function
is small, so it's not too confusing to figure out what it really is.

>
> >>   {
> >>          u8 *prog = *pprog;
> >>          int cnt = 0;
> >>          s64 offset;
> >>
> >
> > [...]
> >
> >>          case BPF_MOD_CALL_TO_NOP:
> >> -               if (memcmp(ip, old_insn, X86_CALL_SIZE))
> >> +       case BPF_MOD_JUMP_TO_NOP:
> >> +               if (memcmp(ip, old_insn, X86_PATCHABLE_SIZE))
> >>                          goto out;
> >> -               text_poke_bp(ip, ideal_nops[NOP_ATOMIC5], X86_CALL_SIZE, NULL);
> >> +               text_poke_bp(ip, ideal_nops[NOP_ATOMIC5], X86_PATCHABLE_SIZE,
> >
> > maybe keep it shorter with X86_PATCH_SIZE?
>
> Sure, then X86_PATCH_SIZE and emit_patch() it is.
>
> >> +                            NULL);
> >>                  break;
> >>          }
> >>          ret = 0;
> >
> > [...]
> >
>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH rfc bpf-next 6/8] bpf: add poke dependency tracking for prog array maps
  2019-11-15  1:04 ` [PATCH rfc bpf-next 6/8] bpf: add poke dependency tracking for prog array maps Daniel Borkmann
@ 2019-11-18 17:39   ` Andrii Nakryiko
  2019-11-18 18:39     ` Daniel Borkmann
  0 siblings, 1 reply; 28+ messages in thread
From: Andrii Nakryiko @ 2019-11-18 17:39 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: Alexei Starovoitov, john fastabend, Networking, bpf

On Thu, Nov 14, 2019 at 5:04 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> This work adds program tracking to prog array maps. This is needed such
> that upon prog array updates/deletions we can fix up all programs which
> make use of this tail call map. We add ops->map_poke_{un,}track() helpers
> to maps to maintain the list of programs and ops->map_poke_run() for
> triggering the actual update. bpf_array_aux is extended to contain the
> list head and poke_mutex in order to serialize program patching during
> updates/deletions. bpf_free_used_maps() will untrack the program shortly
> before dropping the reference to the map.
>
> The prog_array_map_poke_run() is triggered during updates/deletions and
> walks the maintained prog list. It checks in their poke_tabs whether the
> map and key is matching and runs the actual bpf_arch_text_poke() for
> patching in the nop or new jmp location. Depending on the type of update,
> we use one of BPF_MOD_{NOP_TO_JUMP,JUMP_TO_NOP,JUMP_TO_JUMP}.
>
> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> ---
>  include/linux/bpf.h   |  36 +++++++++++++
>  kernel/bpf/arraymap.c | 120 +++++++++++++++++++++++++++++++++++++++++-
>  kernel/bpf/core.c     |   9 +++-
>  3 files changed, 162 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 0ff06a0d0058..62a369fb8d98 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -21,6 +21,7 @@ struct bpf_verifier_env;
>  struct bpf_verifier_log;
>  struct perf_event;
>  struct bpf_prog;
> +struct bpf_prog_aux;
>  struct bpf_map;
>  struct sock;
>  struct seq_file;
> @@ -63,6 +64,12 @@ struct bpf_map_ops {
>                              const struct btf_type *key_type,
>                              const struct btf_type *value_type);
>
> +       /* Prog poke tracking helpers. */
> +       int (*map_poke_track)(struct bpf_map *map, struct bpf_prog_aux *aux);
> +       void (*map_poke_untrack)(struct bpf_map *map, struct bpf_prog_aux *aux);
> +       void (*map_poke_run)(struct bpf_map *map, u32 key, struct bpf_prog *old,
> +                            struct bpf_prog *new);

You are passing bpf_prog_aux for track/untrack, but bpf_prog itself
for run. Maybe stick to just bpf_prog everywhere?

> +
>         /* Direct value access helpers. */
>         int (*map_direct_value_addr)(const struct bpf_map *map,
>                                      u64 *imm, u32 off);
> @@ -584,6 +591,9 @@ struct bpf_array_aux {
>          */
>         enum bpf_prog_type type;
>         bool jited;
> +       /* Programs with direct jumps into programs part of this array. */
> +       struct list_head poke_progs;
> +       struct mutex poke_mutex;
>  };
>

[...]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH rfc bpf-next 8/8] bpf: constant map key tracking for prog array pokes
  2019-11-15  1:04 ` [PATCH rfc bpf-next 8/8] bpf: constant map key tracking for prog array pokes Daniel Borkmann
  2019-11-15  4:29   ` Alexei Starovoitov
@ 2019-11-18 18:11   ` Andrii Nakryiko
  2019-11-18 21:45     ` Daniel Borkmann
  1 sibling, 1 reply; 28+ messages in thread
From: Andrii Nakryiko @ 2019-11-18 18:11 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: Alexei Starovoitov, john fastabend, Networking, bpf

On Thu, Nov 14, 2019 at 5:04 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> Add tracking of constant keys into tail call maps. The signature of
> bpf_tail_call_proto is that arg1 is ctx, arg2 map pointer and arg3
> is a index key. The direct call approach for tail calls can be enabled
> if the verifier asserted that for all branches leading to the tail call
> helper invocation, the map pointer and index key were both constant
> and the same. Tracking of map pointers we already do from prior work
> via c93552c443eb ("bpf: properly enforce index mask to prevent out-of-bounds
> speculation") and 09772d92cd5a ("bpf: avoid retpoline for lookup/update/
> delete calls on maps"). Given the tail call map index key is not on
> stack but directly in the register, we can add similar tracking approach
> and later in fixup_bpf_calls() add a poke descriptor to the progs poke_tab
> with the relevant information for the JITing phase. We internally reuse
> insn->imm for the rewritten BPF_JMP | BPF_TAIL_CALL instruction in order
> to point into the prog's poke_tab and keep insn->imm == 0 as indicator
> that current indirect tail call emission must be used.
>
> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> ---
>  include/linux/bpf_verifier.h |  1 +
>  kernel/bpf/verifier.c        | 98 ++++++++++++++++++++++++++++++++++++
>  2 files changed, 99 insertions(+)
>
> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
> index cdd08bf0ec06..f494f0c9ac13 100644
> --- a/include/linux/bpf_verifier.h
> +++ b/include/linux/bpf_verifier.h
> @@ -301,6 +301,7 @@ struct bpf_insn_aux_data {
>                         u32 map_off;            /* offset from value base address */
>                 };
>         };
> +       u64 key_state; /* constant key tracking for maps */
>         int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
>         int sanitize_stack_off; /* stack slot to be cleared */
>         bool seen; /* this insn was processed by the verifier */
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index e9dc95a18d44..48d5c9030d60 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -171,6 +171,9 @@ struct bpf_verifier_stack_elem {
>  #define BPF_COMPLEXITY_LIMIT_JMP_SEQ   8192
>  #define BPF_COMPLEXITY_LIMIT_STATES    64
>
> +#define BPF_MAP_KEY_POISON     (1ULL << 63)
> +#define BPF_MAP_KEY_SEEN       (1ULL << 62)
> +
>  #define BPF_MAP_PTR_UNPRIV     1UL
>  #define BPF_MAP_PTR_POISON     ((void *)((0xeB9FUL << 1) +     \
>                                           POISON_POINTER_DELTA))
> @@ -195,6 +198,29 @@ static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux,
>                          (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL);
>  }
>
> +static bool bpf_map_key_poisoned(const struct bpf_insn_aux_data *aux)
> +{
> +       return aux->key_state & BPF_MAP_KEY_POISON;
> +}
> +
> +static bool bpf_map_key_unseen(const struct bpf_insn_aux_data *aux)
> +{
> +       return !(aux->key_state & BPF_MAP_KEY_SEEN);
> +}
> +
> +static u64 bpf_map_key_immediate(const struct bpf_insn_aux_data *aux)
> +{
> +       return aux->key_state & ~BPF_MAP_KEY_SEEN;
> +}

This works out for the current logic you've implemented, but it's a bit
misleading that bpf_map_key_immediate is also going to return the POISON
bit; was this intentional?
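
If the intent is to return just the key value, a sketch of the alternative
(illustration only):

  static u64 bpf_map_key_immediate(const struct bpf_insn_aux_data *aux)
  {
          return aux->key_state & ~(BPF_MAP_KEY_SEEN | BPF_MAP_KEY_POISON);
  }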

> +
> +static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state)
> +{
> +       bool poisoned = bpf_map_key_poisoned(aux);
> +
> +       aux->key_state = state | BPF_MAP_KEY_SEEN |
> +                        (poisoned ? BPF_MAP_KEY_POISON : 0ULL);
> +}
> +
>  struct bpf_call_arg_meta {
>         struct bpf_map *map_ptr;
>         bool raw_mode;
> @@ -4088,6 +4114,37 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
>         return 0;
>  }
>
> +static int
> +record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
> +               int func_id, int insn_idx)
> +{
> +       struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
> +       struct bpf_reg_state *regs = cur_regs(env), *reg;
> +       struct tnum range = tnum_range(0, U32_MAX);

why U32_MAX instead of the actual size of the map?
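
For illustration (sketch only, assuming the range is computed after the
map NULL/type checks), something like:

  struct tnum range = tnum_range(0, map->max_entries - 1);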

> +       struct bpf_map *map = meta->map_ptr;
> +       u64 val;
> +
> +       if (func_id != BPF_FUNC_tail_call)
> +               return 0;
> +       if (!map || map->map_type != BPF_MAP_TYPE_PROG_ARRAY) {
> +               verbose(env, "kernel subsystem misconfigured verifier\n");
> +               return -EINVAL;
> +       }
> +
> +       reg = &regs[BPF_REG_3];
> +       if (!register_is_const(reg) || !tnum_in(range, reg->var_off)) {
> +               bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
> +               return 0;
> +       }
> +
> +       val = reg->var_off.value;
> +       if (bpf_map_key_unseen(aux))
> +               bpf_map_key_store(aux, val);
> +       else if (bpf_map_key_immediate(aux) != val)
> +               bpf_map_key_store(aux, BPF_MAP_KEY_POISON);

imo, checking for poison first would make this logic a bit more
straightforward (and would avoid unnecessary key_store calls, but
that's minor)
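
E.g. a sketch of that ordering (illustration only):

  if (bpf_map_key_poisoned(aux))
          return 0;
  if (bpf_map_key_unseen(aux))
          bpf_map_key_store(aux, val);
  else if (bpf_map_key_immediate(aux) != val)
          bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
  return 0;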

> +       return 0;
> +}
> +
>  static int check_reference_leak(struct bpf_verifier_env *env)
>  {
>         struct bpf_func_state *state = cur_func(env);

[...]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH rfc bpf-next 5/8] bpf: add jit poke descriptor mock-up for jit images
  2019-11-15  1:03 ` [PATCH rfc bpf-next 5/8] bpf: add jit poke descriptor mock-up for jit images Daniel Borkmann
@ 2019-11-18 18:17   ` Andrii Nakryiko
  2019-11-18 18:43     ` Daniel Borkmann
  0 siblings, 1 reply; 28+ messages in thread
From: Andrii Nakryiko @ 2019-11-18 18:17 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: Alexei Starovoitov, john fastabend, Networking, bpf

On Thu, Nov 14, 2019 at 5:05 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> Add initial poke table data structures and management to the BPF
> prog that can later be used by JITs. Also add an instance of poke
> specific data for tail call maps. Plan for later work is to extend
> this also for BPF static keys.
>
> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> ---

looks good, just one more minor naming nit

Acked-by: Andrii Nakryiko <andriin@fb.com>

>  include/linux/bpf.h    | 20 ++++++++++++++++++++
>  include/linux/filter.h | 10 ++++++++++
>  kernel/bpf/core.c      | 34 ++++++++++++++++++++++++++++++++++
>  3 files changed, 64 insertions(+)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 40337fa0e463..0ff06a0d0058 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -484,6 +484,24 @@ struct bpf_func_info_aux {
>         bool unreliable;
>  };
>
> +enum bpf_jit_poke_reason {
> +       BPF_POKE_REASON_TAIL_CALL,
> +};
> +
> +/* Descriptor of pokes pointing /into/ the JITed image. */
> +struct bpf_jit_poke_descriptor {
> +       void *ip;
> +       union {
> +               struct {
> +                       struct bpf_map *map;
> +                       u32 key;
> +               } tc;

tc is a bit of an overloaded abbreviation; tail_call would be super-clear, though ;)

> +       };
> +       u8 ip_stable;
> +       u8 adj_off;
> +       u16 reason;
> +};
> +

[...]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH rfc bpf-next 6/8] bpf: add poke dependency tracking for prog array maps
  2019-11-18 17:39   ` Andrii Nakryiko
@ 2019-11-18 18:39     ` Daniel Borkmann
  2019-11-18 18:46       ` Andrii Nakryiko
  0 siblings, 1 reply; 28+ messages in thread
From: Daniel Borkmann @ 2019-11-18 18:39 UTC (permalink / raw)
  To: Andrii Nakryiko; +Cc: Alexei Starovoitov, john fastabend, Networking, bpf

On 11/18/19 6:39 PM, Andrii Nakryiko wrote:
> On Thu, Nov 14, 2019 at 5:04 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>>
>> This work adds program tracking to prog array maps. This is needed such
>> that upon prog array updates/deletions we can fix up all programs which
>> make use of this tail call map. We add ops->map_poke_{un,}track() helpers
>> to maps to maintain the list of programs and ops->map_poke_run() for
>> triggering the actual update. bpf_array_aux is extended to contain the
>> list head and poke_mutex in order to serialize program patching during
>> updates/deletions. bpf_free_used_maps() will untrack the program shortly
>> before dropping the reference to the map.
>>
>> The prog_array_map_poke_run() is triggered during updates/deletions and
>> walks the maintained prog list. It checks in their poke_tabs whether the
>> map and key is matching and runs the actual bpf_arch_text_poke() for
>> patching in the nop or new jmp location. Depending on the type of update,
>> we use one of BPF_MOD_{NOP_TO_JUMP,JUMP_TO_NOP,JUMP_TO_JUMP}.
>>
>> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
>> ---
>>   include/linux/bpf.h   |  36 +++++++++++++
>>   kernel/bpf/arraymap.c | 120 +++++++++++++++++++++++++++++++++++++++++-
>>   kernel/bpf/core.c     |   9 +++-
>>   3 files changed, 162 insertions(+), 3 deletions(-)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index 0ff06a0d0058..62a369fb8d98 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -21,6 +21,7 @@ struct bpf_verifier_env;
>>   struct bpf_verifier_log;
>>   struct perf_event;
>>   struct bpf_prog;
>> +struct bpf_prog_aux;
>>   struct bpf_map;
>>   struct sock;
>>   struct seq_file;
>> @@ -63,6 +64,12 @@ struct bpf_map_ops {
>>                               const struct btf_type *key_type,
>>                               const struct btf_type *value_type);
>>
>> +       /* Prog poke tracking helpers. */
>> +       int (*map_poke_track)(struct bpf_map *map, struct bpf_prog_aux *aux);
>> +       void (*map_poke_untrack)(struct bpf_map *map, struct bpf_prog_aux *aux);
>> +       void (*map_poke_run)(struct bpf_map *map, u32 key, struct bpf_prog *old,
>> +                            struct bpf_prog *new);
> 
> You are passing bpf_prog_aux for track/untrack, but bpf_prog itself
> for run. Maybe stick to just bpf_prog everywhere?

This needs to be bpf_prog_aux as prog itself is not stable yet and can still
change, but aux itself is stable.

>> +
>>          /* Direct value access helpers. */
>>          int (*map_direct_value_addr)(const struct bpf_map *map,
>>                                       u64 *imm, u32 off);
>> @@ -584,6 +591,9 @@ struct bpf_array_aux {
>>           */
>>          enum bpf_prog_type type;
>>          bool jited;
>> +       /* Programs with direct jumps into programs part of this array. */
>> +       struct list_head poke_progs;
>> +       struct mutex poke_mutex;
>>   };
>>
> 
> [...]
> 


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH rfc bpf-next 5/8] bpf: add jit poke descriptor mock-up for jit images
  2019-11-18 18:17   ` Andrii Nakryiko
@ 2019-11-18 18:43     ` Daniel Borkmann
  0 siblings, 0 replies; 28+ messages in thread
From: Daniel Borkmann @ 2019-11-18 18:43 UTC (permalink / raw)
  To: Andrii Nakryiko; +Cc: Alexei Starovoitov, john fastabend, Networking, bpf

On 11/18/19 7:17 PM, Andrii Nakryiko wrote:
> On Thu, Nov 14, 2019 at 5:05 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>>
>> Add initial poke table data structures and management to the BPF
>> prog that can later be used by JITs. Also add an instance of poke
>> specific data for tail call maps. Plan for later work is to extend
>> this also for BPF static keys.
>>
>> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
>> ---
> 
> looks good, just one more minor naming nit
> 
> Acked-by: Andrii Nakryiko <andriin@fb.com>
> 
>>   include/linux/bpf.h    | 20 ++++++++++++++++++++
>>   include/linux/filter.h | 10 ++++++++++
>>   kernel/bpf/core.c      | 34 ++++++++++++++++++++++++++++++++++
>>   3 files changed, 64 insertions(+)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index 40337fa0e463..0ff06a0d0058 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -484,6 +484,24 @@ struct bpf_func_info_aux {
>>          bool unreliable;
>>   };
>>
>> +enum bpf_jit_poke_reason {
>> +       BPF_POKE_REASON_TAIL_CALL,
>> +};
>> +
>> +/* Descriptor of pokes pointing /into/ the JITed image. */
>> +struct bpf_jit_poke_descriptor {
>> +       void *ip;
>> +       union {
>> +               struct {
>> +                       struct bpf_map *map;
>> +                       u32 key;
>> +               } tc;
> 
> tc is a bit overloaded abbreviation, tail_call would be super-clear, though ;)

Ok, sure, will include it into the non-rfc version.

>> +       };
>> +       u8 ip_stable;
>> +       u8 adj_off;
>> +       u16 reason;
>> +};
>> +
> 
> [...]
> 


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH rfc bpf-next 6/8] bpf: add poke dependency tracking for prog array maps
  2019-11-18 18:39     ` Daniel Borkmann
@ 2019-11-18 18:46       ` Andrii Nakryiko
  2019-11-18 21:35         ` Daniel Borkmann
  0 siblings, 1 reply; 28+ messages in thread
From: Andrii Nakryiko @ 2019-11-18 18:46 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: Alexei Starovoitov, john fastabend, Networking, bpf

On Mon, Nov 18, 2019 at 10:39 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 11/18/19 6:39 PM, Andrii Nakryiko wrote:
> > On Thu, Nov 14, 2019 at 5:04 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >>
> >> This work adds program tracking to prog array maps. This is needed such
> >> that upon prog array updates/deletions we can fix up all programs which
> >> make use of this tail call map. We add ops->map_poke_{un,}track() helpers
> >> to maps to maintain the list of programs and ops->map_poke_run() for
> >> triggering the actual update. bpf_array_aux is extended to contain the
> >> list head and poke_mutex in order to serialize program patching during
> >> updates/deletions. bpf_free_used_maps() will untrack the program shortly
> >> before dropping the reference to the map.
> >>
> >> The prog_array_map_poke_run() is triggered during updates/deletions and
> >> walks the maintained prog list. It checks in their poke_tabs whether the
> >> map and key is matching and runs the actual bpf_arch_text_poke() for
> >> patching in the nop or new jmp location. Depending on the type of update,
> >> we use one of BPF_MOD_{NOP_TO_JUMP,JUMP_TO_NOP,JUMP_TO_JUMP}.
> >>
> >> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> >> ---
> >>   include/linux/bpf.h   |  36 +++++++++++++
> >>   kernel/bpf/arraymap.c | 120 +++++++++++++++++++++++++++++++++++++++++-
> >>   kernel/bpf/core.c     |   9 +++-
> >>   3 files changed, 162 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> >> index 0ff06a0d0058..62a369fb8d98 100644
> >> --- a/include/linux/bpf.h
> >> +++ b/include/linux/bpf.h
> >> @@ -21,6 +21,7 @@ struct bpf_verifier_env;
> >>   struct bpf_verifier_log;
> >>   struct perf_event;
> >>   struct bpf_prog;
> >> +struct bpf_prog_aux;
> >>   struct bpf_map;
> >>   struct sock;
> >>   struct seq_file;
> >> @@ -63,6 +64,12 @@ struct bpf_map_ops {
> >>                               const struct btf_type *key_type,
> >>                               const struct btf_type *value_type);
> >>
> >> +       /* Prog poke tracking helpers. */
> >> +       int (*map_poke_track)(struct bpf_map *map, struct bpf_prog_aux *aux);
> >> +       void (*map_poke_untrack)(struct bpf_map *map, struct bpf_prog_aux *aux);
> >> +       void (*map_poke_run)(struct bpf_map *map, u32 key, struct bpf_prog *old,
> >> +                            struct bpf_prog *new);
> >
> > You are passing bpf_prog_aux for track/untrack, but bpf_prog itself
> > for run. Maybe stick to just bpf_prog everywhere?
>
> This needs to be bpf_prog_aux as prog itself is not stable yet and can still
> change, but aux itself is stable.

nothing prevents doing container_of() to get bpf_prog itself, so
it's just implicit knowledge that bpf_prog might still be incomplete,
which has to be remembered (btw, might be good to add a brief comment
stating that). But I don't feel strongly either way.

>
> >> +
> >>          /* Direct value access helpers. */
> >>          int (*map_direct_value_addr)(const struct bpf_map *map,
> >>                                       u64 *imm, u32 off);
> >> @@ -584,6 +591,9 @@ struct bpf_array_aux {
> >>           */
> >>          enum bpf_prog_type type;
> >>          bool jited;
> >> +       /* Programs with direct jumps into programs part of this array. */
> >> +       struct list_head poke_progs;
> >> +       struct mutex poke_mutex;
> >>   };
> >>
> >
> > [...]
> >
>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH rfc bpf-next 6/8] bpf: add poke dependency tracking for prog array maps
  2019-11-18 18:46       ` Andrii Nakryiko
@ 2019-11-18 21:35         ` Daniel Borkmann
  0 siblings, 0 replies; 28+ messages in thread
From: Daniel Borkmann @ 2019-11-18 21:35 UTC (permalink / raw)
  To: Andrii Nakryiko; +Cc: Alexei Starovoitov, john fastabend, Networking, bpf

On 11/18/19 7:46 PM, Andrii Nakryiko wrote:
> On Mon, Nov 18, 2019 at 10:39 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>> On 11/18/19 6:39 PM, Andrii Nakryiko wrote:
>>> On Thu, Nov 14, 2019 at 5:04 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>>>>
>>>> This work adds program tracking to prog array maps. This is needed such
>>>> that upon prog array updates/deletions we can fix up all programs which
>>>> make use of this tail call map. We add ops->map_poke_{un,}track() helpers
>>>> to maps to maintain the list of programs and ops->map_poke_run() for
>>>> triggering the actual update. bpf_array_aux is extended to contain the
>>>> list head and poke_mutex in order to serialize program patching during
>>>> updates/deletions. bpf_free_used_maps() will untrack the program shortly
>>>> before dropping the reference to the map.
>>>>
>>>> The prog_array_map_poke_run() is triggered during updates/deletions and
>>>> walks the maintained prog list. It checks in their poke_tabs whether the
>>>> map and key match and runs the actual bpf_arch_text_poke() for
>>>> patching in the nop or new jmp location. Depending on the type of update,
>>>> we use one of BPF_MOD_{NOP_TO_JUMP,JUMP_TO_NOP,JUMP_TO_JUMP}.
>>>>
>>>> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
>>>> ---
>>>>    include/linux/bpf.h   |  36 +++++++++++++
>>>>    kernel/bpf/arraymap.c | 120 +++++++++++++++++++++++++++++++++++++++++-
>>>>    kernel/bpf/core.c     |   9 +++-
>>>>    3 files changed, 162 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>>>> index 0ff06a0d0058..62a369fb8d98 100644
>>>> --- a/include/linux/bpf.h
>>>> +++ b/include/linux/bpf.h
>>>> @@ -21,6 +21,7 @@ struct bpf_verifier_env;
>>>>    struct bpf_verifier_log;
>>>>    struct perf_event;
>>>>    struct bpf_prog;
>>>> +struct bpf_prog_aux;
>>>>    struct bpf_map;
>>>>    struct sock;
>>>>    struct seq_file;
>>>> @@ -63,6 +64,12 @@ struct bpf_map_ops {
>>>>                                const struct btf_type *key_type,
>>>>                                const struct btf_type *value_type);
>>>>
>>>> +       /* Prog poke tracking helpers. */
>>>> +       int (*map_poke_track)(struct bpf_map *map, struct bpf_prog_aux *aux);
>>>> +       void (*map_poke_untrack)(struct bpf_map *map, struct bpf_prog_aux *aux);
>>>> +       void (*map_poke_run)(struct bpf_map *map, u32 key, struct bpf_prog *old,
>>>> +                            struct bpf_prog *new);
>>>
>>> You are passing bpf_prog_aux for track/untrack, but bpf_prog itself
>>> for run. Maybe stick to just bpf_prog everywhere?
>>
>> This needs to be bpf_prog_aux as prog itself is not stable yet and can still
>> change, but aux itself is stable.
> 
> no one will prevent doing container_of() and getting bpf_prog itself, so
> it's just implicit knowledge that bpf_prog might still be incomplete,
> which has to be remembered (btw, might be good to add a brief comment
> stating that). But I don't feel strongly either way.

Makes sense, expanded the comment to state this.

Thanks,
Daniel

^ permalink raw reply	[flat|nested] 28+ messages in thread
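
To make the bpf_prog_aux point above concrete, a minimal sketch of what the
track side could look like with such a comment added; prog_poke_elem, the
GFP flags, and the exact locking are illustrative rather than copied from
the posted patch:

struct prog_poke_elem {
        struct list_head list;
        struct bpf_prog_aux *aux;
};

static int prog_array_map_poke_track(struct bpf_map *map,
                                     struct bpf_prog_aux *prog_aux)
{
        struct bpf_array_aux *aux = container_of(map, struct bpf_array, map)->aux;
        struct prog_poke_elem *elem;
        int ret = 0;

        mutex_lock(&aux->poke_mutex);
        /* Nothing to do if this program is already tracked. */
        list_for_each_entry(elem, &aux->poke_progs, list)
                if (elem->aux == prog_aux)
                        goto out;

        elem = kmalloc(sizeof(*elem), GFP_KERNEL);
        if (!elem) {
                ret = -ENOMEM;
                goto out;
        }

        /* Track the program through its aux: at this point the bpf_prog
         * itself is not stable yet and may still be replaced by the JIT,
         * so do not container_of() back to the prog and stash that pointer.
         */
        INIT_LIST_HEAD(&elem->list);
        elem->aux = prog_aux;
        list_add_tail(&elem->list, &aux->poke_progs);
out:
        mutex_unlock(&aux->poke_mutex);
        return ret;
}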

* Re: [PATCH rfc bpf-next 8/8] bpf: constant map key tracking for prog array pokes
  2019-11-18 18:11   ` Andrii Nakryiko
@ 2019-11-18 21:45     ` Daniel Borkmann
  0 siblings, 0 replies; 28+ messages in thread
From: Daniel Borkmann @ 2019-11-18 21:45 UTC (permalink / raw)
  To: Andrii Nakryiko; +Cc: Alexei Starovoitov, john fastabend, Networking, bpf

On 11/18/19 7:11 PM, Andrii Nakryiko wrote:
> On Thu, Nov 14, 2019 at 5:04 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>>
>> Add tracking of constant keys into tail call maps. The signature of
>> bpf_tail_call_proto is that arg1 is ctx, arg2 is the map pointer, and arg3
>> is an index key. The direct call approach for tail calls can be enabled
>> if the verifier asserted that for all branches leading to the tail call
>> helper invocation, the map pointer and index key were both constant
>> and the same. Tracking of map pointers we already do from prior work
>> via c93552c443eb ("bpf: properly enforce index mask to prevent out-of-bounds
>> speculation") and 09772d92cd5a ("bpf: avoid retpoline for lookup/update/
>> delete calls on maps"). Given the tail call map index key is not on
>> stack but directly in the register, we can add a similar tracking approach
>> and later in fixup_bpf_calls() add a poke descriptor to the progs poke_tab
>> with the relevant information for the JITing phase. We internally reuse
>> insn->imm for the rewritten BPF_JMP | BPF_TAIL_CALL instruction in order
>> to point into the prog's poke_tab and keep insn->imm == 0 as an indicator
>> that the current indirect tail call emission must be used.
>>
>> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
>> ---
>>   include/linux/bpf_verifier.h |  1 +
>>   kernel/bpf/verifier.c        | 98 ++++++++++++++++++++++++++++++++++++
>>   2 files changed, 99 insertions(+)
>>
>> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
>> index cdd08bf0ec06..f494f0c9ac13 100644
>> --- a/include/linux/bpf_verifier.h
>> +++ b/include/linux/bpf_verifier.h
>> @@ -301,6 +301,7 @@ struct bpf_insn_aux_data {
>>                          u32 map_off;            /* offset from value base address */
>>                  };
>>          };
>> +       u64 key_state; /* constant key tracking for maps */
>>          int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
>>          int sanitize_stack_off; /* stack slot to be cleared */
>>          bool seen; /* this insn was processed by the verifier */
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index e9dc95a18d44..48d5c9030d60 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -171,6 +171,9 @@ struct bpf_verifier_stack_elem {
>>   #define BPF_COMPLEXITY_LIMIT_JMP_SEQ   8192
>>   #define BPF_COMPLEXITY_LIMIT_STATES    64
>>
>> +#define BPF_MAP_KEY_POISON     (1ULL << 63)
>> +#define BPF_MAP_KEY_SEEN       (1ULL << 62)
>> +
>>   #define BPF_MAP_PTR_UNPRIV     1UL
>>   #define BPF_MAP_PTR_POISON     ((void *)((0xeB9FUL << 1) +     \
>>                                            POISON_POINTER_DELTA))
>> @@ -195,6 +198,29 @@ static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux,
>>                           (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL);
>>   }
>>
>> +static bool bpf_map_key_poisoned(const struct bpf_insn_aux_data *aux)
>> +{
>> +       return aux->key_state & BPF_MAP_KEY_POISON;
>> +}
>> +
>> +static bool bpf_map_key_unseen(const struct bpf_insn_aux_data *aux)
>> +{
>> +       return !(aux->key_state & BPF_MAP_KEY_SEEN);
>> +}
>> +
>> +static u64 bpf_map_key_immediate(const struct bpf_insn_aux_data *aux)
>> +{
>> +       return aux->key_state & ~BPF_MAP_KEY_SEEN;
>> +}
> 
> This works out for the current logic you've implemented, but it's a bit
> misleading that bpf_map_key_immediate is also going to return the POISON
> bit. Was this intentional?

It was intentional, but fair enough, I'll mask it out to make it clearer.

>> +static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state)
>> +{
>> +       bool poisoned = bpf_map_key_poisoned(aux);
>> +
>> +       aux->key_state = state | BPF_MAP_KEY_SEEN |
>> +                        (poisoned ? BPF_MAP_KEY_POISON : 0ULL);
>> +}
>> +
>>   struct bpf_call_arg_meta {
>>          struct bpf_map *map_ptr;
>>          bool raw_mode;
>> @@ -4088,6 +4114,37 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
>>          return 0;
>>   }
>>
>> +static int
>> +record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
>> +               int func_id, int insn_idx)
>> +{
>> +       struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
>> +       struct bpf_reg_state *regs = cur_regs(env), *reg;
>> +       struct tnum range = tnum_range(0, U32_MAX);
> 
> why U32_MAX, instead of actual size of a map?

Hm, good point. That works given we poison that value and then skip it later
when we add the poke entry.

>> +       struct bpf_map *map = meta->map_ptr;
>> +       u64 val;
>> +
>> +       if (func_id != BPF_FUNC_tail_call)
>> +               return 0;
>> +       if (!map || map->map_type != BPF_MAP_TYPE_PROG_ARRAY) {
>> +               verbose(env, "kernel subsystem misconfigured verifier\n");
>> +               return -EINVAL;
>> +       }
>> +
>> +       reg = &regs[BPF_REG_3];
>> +       if (!register_is_const(reg) || !tnum_in(range, reg->var_off)) {
>> +               bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
>> +               return 0;
>> +       }
>> +
>> +       val = reg->var_off.value;
>> +       if (bpf_map_key_unseen(aux))
>> +               bpf_map_key_store(aux, val);
>> +       else if (bpf_map_key_immediate(aux) != val)
>> +               bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
> 
> imo, checking for poison first would make this logic a bit more
> straightforward (and would avoid unnecessary key_store calls, but
> that's minor)

Makes sense.

>> +       return 0;
>> +}
>> +
>>   static int check_reference_leak(struct bpf_verifier_env *env)
>>   {
>>          struct bpf_func_state *state = cur_func(env);
> 
> [...]

Thanks,
Daniel

^ permalink raw reply	[flat|nested] 28+ messages in thread
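
Folding in both review points, masking the poison bit out of
bpf_map_key_immediate, bounding the range by the map size, and checking for
poison before re-storing, the reworked helpers could look roughly like the
sketch below; the final non-RFC revision may differ in detail. All helper
names are taken from the hunk quoted above.

static u64 bpf_map_key_immediate(const struct bpf_insn_aux_data *aux)
{
        /* Mask out both flag bits so callers only ever see the key value. */
        return aux->key_state & ~(BPF_MAP_KEY_SEEN | BPF_MAP_KEY_POISON);
}

static int
record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
                int func_id, int insn_idx)
{
        struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
        struct bpf_reg_state *regs = cur_regs(env), *reg;
        struct bpf_map *map = meta->map_ptr;
        struct tnum range;
        u64 val;

        if (func_id != BPF_FUNC_tail_call)
                return 0;
        if (!map || map->map_type != BPF_MAP_TYPE_PROG_ARRAY) {
                verbose(env, "kernel subsystem misconfigured verifier\n");
                return -EINVAL;
        }

        /* Only keys that can hit an existing slot are interesting; anything
         * else poisons the state so no poke descriptor is added later on.
         */
        range = tnum_range(0, map->max_entries - 1);
        reg = &regs[BPF_REG_3];
        if (!register_is_const(reg) || !tnum_in(range, reg->var_off)) {
                bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
                return 0;
        }

        val = reg->var_off.value;
        if (bpf_map_key_unseen(aux))
                bpf_map_key_store(aux, val);
        else if (!bpf_map_key_poisoned(aux) &&
                 bpf_map_key_immediate(aux) != val)
                bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
        return 0;
}

With the poison bit masked out, the extra bpf_map_key_poisoned() check is
purely about skipping redundant stores, as suggested above.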

end of thread, other threads:[~2019-11-18 21:45 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-11-15  1:03 [PATCH rfc bpf-next 0/8] Optimize BPF tail calls for direct jumps Daniel Borkmann
2019-11-15  1:03 ` [PATCH rfc bpf-next 1/8] bpf, x86: generalize and extend bpf_arch_text_poke " Daniel Borkmann
2019-11-15 23:22   ` Andrii Nakryiko
2019-11-15 23:42     ` Daniel Borkmann
2019-11-16  0:01       ` Andrii Nakryiko
2019-11-15  1:03 ` [PATCH rfc bpf-next 2/8] bpf: add bpf_prog_under_eviction helper Daniel Borkmann
2019-11-15  1:03 ` [PATCH rfc bpf-next 3/8] bpf: move bpf_free_used_maps into sleepable section Daniel Borkmann
2019-11-15 23:32   ` Andrii Nakryiko
2019-11-15  1:03 ` [PATCH rfc bpf-next 4/8] bpf: move owner type,jited info into array auxillary data Daniel Borkmann
2019-11-15 23:19   ` Andrii Nakryiko
2019-11-15  1:03 ` [PATCH rfc bpf-next 5/8] bpf: add jit poke descriptor mock-up for jit images Daniel Borkmann
2019-11-18 18:17   ` Andrii Nakryiko
2019-11-18 18:43     ` Daniel Borkmann
2019-11-15  1:04 ` [PATCH rfc bpf-next 6/8] bpf: add poke dependency tracking for prog array maps Daniel Borkmann
2019-11-18 17:39   ` Andrii Nakryiko
2019-11-18 18:39     ` Daniel Borkmann
2019-11-18 18:46       ` Andrii Nakryiko
2019-11-18 21:35         ` Daniel Borkmann
2019-11-15  1:04 ` [PATCH rfc bpf-next 7/8] bpf, x86: emit patchable direct jump as tail call Daniel Borkmann
2019-11-15  3:23   ` Alexei Starovoitov
2019-11-15  7:27     ` Daniel Borkmann
2019-11-15  1:04 ` [PATCH rfc bpf-next 8/8] bpf: constant map key tracking for prog array pokes Daniel Borkmann
2019-11-15  4:29   ` Alexei Starovoitov
2019-11-15  7:13     ` Daniel Borkmann
2019-11-15 16:33       ` Alexei Starovoitov
2019-11-15 16:49         ` Daniel Borkmann
2019-11-18 18:11   ` Andrii Nakryiko
2019-11-18 21:45     ` Daniel Borkmann
