* [PATCH v3 bpf-next 0/5] net: netfilter: add kfunc helper to update ct timeout
@ 2022-05-18 10:43 Lorenzo Bianconi
  2022-05-18 10:43 ` [PATCH v3 bpf-next 1/5] bpf: Add support for forcing kfunc args to be referenced Lorenzo Bianconi
                   ` (4 more replies)
  0 siblings, 5 replies; 24+ messages in thread
From: Lorenzo Bianconi @ 2022-05-18 10:43 UTC (permalink / raw)
  To: bpf
  Cc: netdev, ast, daniel, andrii, davem, kuba, edumazet, pabeni,
	pablo, fw, netfilter-devel, lorenzo.bianconi, brouer, toke,
	memxor

Changes since v2:
- add bpf_xdp_ct_add and bpf_ct_refresh_timeout kfunc helpers
- remove conntrack dependency from selftests
- add support for forcing kfunc args to be referenced and related selftests

Changes since v1:
- add bpf_ct_refresh_timeout kfunc selftest

Kumar Kartikeya Dwivedi (2):
  bpf: Add support for forcing kfunc args to be referenced
  selftests/bpf: Add verifier selftests for forced kfunc ref args

Lorenzo Bianconi (3):
  net: netfilter: add kfunc helper to update ct timeout
  net: netfilter: add kfunc helper to add a new ct entry
  selftests/bpf: add selftest for bpf_xdp_ct_add and
    bpf_ct_refresh_timeout kfunc

 include/net/netfilter/nf_conntrack.h          |   1 +
 kernel/bpf/btf.c                              |  40 ++-
 net/bpf/test_run.c                            |   5 +
 net/netfilter/nf_conntrack_bpf.c              | 232 ++++++++++++++++--
 net/netfilter/nf_conntrack_core.c             |  21 +-
 .../testing/selftests/bpf/prog_tests/bpf_nf.c |   4 +
 .../testing/selftests/bpf/progs/test_bpf_nf.c |  72 +++++-
 tools/testing/selftests/bpf/verifier/calls.c  |  53 ++++
 8 files changed, 375 insertions(+), 53 deletions(-)

-- 
2.35.3


* [PATCH v3 bpf-next 1/5] bpf: Add support for forcing kfunc args to be referenced
  2022-05-18 10:43 [PATCH v3 bpf-next 0/5] net: netfilter: add kfunc helper to update ct timeout Lorenzo Bianconi
@ 2022-05-18 10:43 ` Lorenzo Bianconi
  2022-05-18 17:58   ` Yonghong Song
  2022-05-18 10:43 ` [PATCH v3 bpf-next 2/5] selftests/bpf: Add verifier selftests for forced kfunc ref args Lorenzo Bianconi
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 24+ messages in thread
From: Lorenzo Bianconi @ 2022-05-18 10:43 UTC (permalink / raw)
  To: bpf
  Cc: netdev, ast, daniel, andrii, davem, kuba, edumazet, pabeni,
	pablo, fw, netfilter-devel, lorenzo.bianconi, brouer, toke,
	memxor

From: Kumar Kartikeya Dwivedi <memxor@gmail.com>

Similar to how we detect mem, size pairs in kfuncs, teach the verifier
to treat a __ref suffix on an argument name to imply that it must be a
referenced pointer when passed to the kfunc. This is required to ensure
that kfuncs that operate on some object only work on acquired pointers,
and not on a normal PTR_TO_BTF_ID of the same type that can be obtained
by pointer walking. Release functions need not specify such a suffix on
release arguments, as they are already expected to receive one
referenced argument.
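
A kfunc opts into this check purely through the argument's name; as a
minimal sketch (the function name below is hypothetical, only the
__ref suffix is meaningful):

/* The __ref suffix on p__ref makes the verifier demand a pointer with
 * a non-zero ref_obj_id, i.e. one returned by an acquire kfunc, rather
 * than one obtained by walking another PTR_TO_BTF_ID.
 */
noinline void bpf_kfunc_touch_obj(struct prog_test_ref_kfunc *p__ref)
{
	/* body may rely on p__ref being an acquired reference */
}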

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 kernel/bpf/btf.c   | 40 ++++++++++++++++++++++++++++++----------
 net/bpf/test_run.c |  5 +++++
 2 files changed, 35 insertions(+), 10 deletions(-)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 2f0b0440131c..83a354732d96 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6021,18 +6021,13 @@ static bool __btf_type_is_scalar_struct(struct bpf_verifier_log *log,
 	return true;
 }
 
-static bool is_kfunc_arg_mem_size(const struct btf *btf,
-				  const struct btf_param *arg,
-				  const struct bpf_reg_state *reg)
+static bool btf_param_match_suffix(const struct btf *btf,
+				   const struct btf_param *arg,
+				   const char *suffix)
 {
-	int len, sfx_len = sizeof("__sz") - 1;
-	const struct btf_type *t;
+	int len, sfx_len = strlen(suffix);
 	const char *param_name;
 
-	t = btf_type_skip_modifiers(btf, arg->type, NULL);
-	if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
-		return false;
-
 	/* In the future, this can be ported to use BTF tagging */
 	param_name = btf_name_by_offset(btf, arg->name_off);
 	if (str_is_empty(param_name))
@@ -6041,12 +6036,31 @@ static bool is_kfunc_arg_mem_size(const struct btf *btf,
 	if (len < sfx_len)
 		return false;
 	param_name += len - sfx_len;
-	if (strncmp(param_name, "__sz", sfx_len))
+	if (strncmp(param_name, suffix, sfx_len))
 		return false;
 
 	return true;
 }
 
+static bool is_kfunc_arg_ref(const struct btf *btf,
+			     const struct btf_param *arg)
+{
+	return btf_param_match_suffix(btf, arg, "__ref");
+}
+
+static bool is_kfunc_arg_mem_size(const struct btf *btf,
+				  const struct btf_param *arg,
+				  const struct bpf_reg_state *reg)
+{
+	const struct btf_type *t;
+
+	t = btf_type_skip_modifiers(btf, arg->type, NULL);
+	if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
+		return false;
+
+	return btf_param_match_suffix(btf, arg, "__sz");
+}
+
 static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 				    const struct btf *btf, u32 func_id,
 				    struct bpf_reg_state *regs,
@@ -6115,6 +6129,12 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 			return -EINVAL;
 		}
 
+		/* Check if argument must be a referenced pointer */
+		if (is_kfunc && is_kfunc_arg_ref(btf, args + i) && !reg->ref_obj_id) {
+			bpf_log(log, "R%d must be referenced\n", regno);
+			return -EINVAL;
+		}
+
 		ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
 		ref_tname = btf_name_by_offset(btf, ref_t->name_off);
 
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 4d08cca771c7..adbc7dd18511 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -690,6 +690,10 @@ noinline void bpf_kfunc_call_test_mem_len_fail2(u64 *mem, int len)
 {
 }
 
+noinline void bpf_kfunc_call_test_ref(struct prog_test_ref_kfunc *p__ref)
+{
+}
+
 __diag_pop();
 
 ALLOW_ERROR_INJECTION(bpf_modify_return_test, ERRNO);
@@ -713,6 +717,7 @@ BTF_ID(func, bpf_kfunc_call_test_fail3)
 BTF_ID(func, bpf_kfunc_call_test_mem_len_pass1)
 BTF_ID(func, bpf_kfunc_call_test_mem_len_fail1)
 BTF_ID(func, bpf_kfunc_call_test_mem_len_fail2)
+BTF_ID(func, bpf_kfunc_call_test_ref)
 BTF_SET_END(test_sk_check_kfunc_ids)
 
 BTF_SET_START(test_sk_acquire_kfunc_ids)
-- 
2.35.3


* [PATCH v3 bpf-next 2/5] selftests/bpf: Add verifier selftests for forced kfunc ref args
  2022-05-18 10:43 [PATCH v3 bpf-next 0/5] net: netfilter: add kfunc helper to update ct timeout Lorenzo Bianconi
  2022-05-18 10:43 ` [PATCH v3 bpf-next 1/5] bpf: Add support for forcing kfunc args to be referenced Lorenzo Bianconi
@ 2022-05-18 10:43 ` Lorenzo Bianconi
  2022-05-18 18:25   ` Yonghong Song
  2022-05-18 10:43 ` [PATCH v3 bpf-next 3/5] net: netfilter: add kfunc helper to update ct timeout Lorenzo Bianconi
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 24+ messages in thread
From: Lorenzo Bianconi @ 2022-05-18 10:43 UTC (permalink / raw)
  To: bpf
  Cc: netdev, ast, daniel, andrii, davem, kuba, edumazet, pabeni,
	pablo, fw, netfilter-devel, lorenzo.bianconi, brouer, toke,
	memxor

From: Kumar Kartikeya Dwivedi <memxor@gmail.com>

Make sure the verifier rejects the bad cases and that the good case
keeps working. The selftests make use of the bpf_kfunc_call_test_ref
kfunc added in the previous patch purely for verification.
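
In C terms, the rejected and accepted cases correspond roughly to the
following sketch (the selftests encode this directly as BPF
instructions; the member access is illustrative):

unsigned long sp = 0;
struct prog_test_ref_kfunc *p;

p = bpf_kfunc_call_test_acquire(&sp);	/* referenced pointer */
if (!p)
	return 0;
/* rejected: a pointer obtained by walking has no ref_obj_id, so the
 * verifier complains "R1 must be referenced" */
bpf_kfunc_call_test_ref(p->next);
/* accepted: the acquired pointer itself */
bpf_kfunc_call_test_ref(p);
bpf_kfunc_call_test_release(p);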

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 tools/testing/selftests/bpf/verifier/calls.c | 53 ++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c
index 743ed34c1238..3fb4f69b1962 100644
--- a/tools/testing/selftests/bpf/verifier/calls.c
+++ b/tools/testing/selftests/bpf/verifier/calls.c
@@ -218,6 +218,59 @@
 	.result = REJECT,
 	.errstr = "variable ptr_ access var_off=(0x0; 0x7) disallowed",
 },
+{
+	"calls: invalid kfunc call: referenced arg needs refcounted PTR_TO_BTF_ID",
+	.insns = {
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+	BPF_ST_MEM(BPF_DW, BPF_REG_1, 0, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0),
+	BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+	BPF_EXIT_INSN(),
+	BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6, 16),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	.fixup_kfunc_btf_id = {
+		{ "bpf_kfunc_call_test_acquire", 3 },
+		{ "bpf_kfunc_call_test_ref", 8 },
+		{ "bpf_kfunc_call_test_ref", 10 },
+	},
+	.result_unpriv = REJECT,
+	.result = REJECT,
+	.errstr = "R1 must be referenced",
+},
+{
+	"calls: valid kfunc call: referenced arg needs refcounted PTR_TO_BTF_ID",
+	.insns = {
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+	BPF_ST_MEM(BPF_DW, BPF_REG_1, 0, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0),
+	BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+	BPF_EXIT_INSN(),
+	BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	.fixup_kfunc_btf_id = {
+		{ "bpf_kfunc_call_test_acquire", 3 },
+		{ "bpf_kfunc_call_test_ref", 8 },
+		{ "bpf_kfunc_call_test_release", 10 },
+	},
+	.result_unpriv = REJECT,
+	.result = ACCEPT,
+},
 {
 	"calls: basic sanity",
 	.insns = {
-- 
2.35.3


* [PATCH v3 bpf-next 3/5] net: netfilter: add kfunc helper to update ct timeout
  2022-05-18 10:43 [PATCH v3 bpf-next 0/5] net: netfilter: add kfunc helper to update ct timeout Lorenzo Bianconi
  2022-05-18 10:43 ` [PATCH v3 bpf-next 1/5] bpf: Add support for forcing kfunc args to be referenced Lorenzo Bianconi
  2022-05-18 10:43 ` [PATCH v3 bpf-next 2/5] selftests/bpf: Add verifier selftests for forced kfunc ref args Lorenzo Bianconi
@ 2022-05-18 10:43 ` Lorenzo Bianconi
  2022-05-18 20:42   ` Toke Høiland-Jørgensen
  2022-05-18 10:43 ` [PATCH v3 bpf-next 4/5] net: netfilter: add kfunc helper to add a new ct entry Lorenzo Bianconi
  2022-05-18 10:43 ` [PATCH v3 bpf-next 5/5] selftests/bpf: add selftest for bpf_xdp_ct_add and bpf_ct_refresh_timeout kfunc Lorenzo Bianconi
  4 siblings, 1 reply; 24+ messages in thread
From: Lorenzo Bianconi @ 2022-05-18 10:43 UTC (permalink / raw)
  To: bpf
  Cc: netdev, ast, daniel, andrii, davem, kuba, edumazet, pabeni,
	pablo, fw, netfilter-devel, lorenzo.bianconi, brouer, toke,
	memxor

Introduce a bpf_ct_refresh_timeout kfunc helper in order to update the
nf_conn lifetime. Move the timeout update logic into a new
nf_ct_refresh_timeout utility routine.
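
From a BPF program, the helper is meant to be used on an acquired
entry; a rough usage sketch (tuple/opts setup as in the existing
bpf_xdp_ct_lookup users):

struct nf_conn *ct;

ct = bpf_xdp_ct_lookup(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4),
		       &opts, sizeof(opts));
if (ct) {
	/* set the entry's remaining lifetime to 30000 msecs from now */
	bpf_ct_refresh_timeout(ct, 30000);
	bpf_ct_release(ct);
}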

Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 include/net/netfilter/nf_conntrack.h |  1 +
 net/netfilter/nf_conntrack_bpf.c     | 20 ++++++++++++++++++++
 net/netfilter/nf_conntrack_core.c    | 21 +++++++++++++--------
 3 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 69e6c6a218be..02b7115b92d0 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -205,6 +205,7 @@ bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
 		       u_int16_t l3num, struct net *net,
 		       struct nf_conntrack_tuple *tuple);
 
+void nf_ct_refresh_timeout(struct nf_conn *ct, u32 extra_jiffies);
 void __nf_ct_refresh_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
 			  const struct sk_buff *skb,
 			  u32 extra_jiffies, bool do_acct);
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
index bc4d5cd63a94..a9271418db88 100644
--- a/net/netfilter/nf_conntrack_bpf.c
+++ b/net/netfilter/nf_conntrack_bpf.c
@@ -217,16 +217,36 @@ void bpf_ct_release(struct nf_conn *nfct)
 	nf_ct_put(nfct);
 }
 
+/* bpf_ct_refresh_timeout - Refresh the timeout of an nf_conn object
+ *
+ * Refresh the timeout associated with the provided connection tracking
+ * entry. This must be invoked on a referenced PTR_TO_BTF_ID.
+ *
+ * Parameters:
+ * @nfct__ref    - Pointer to referenced nf_conn object, obtained using
+ *		   bpf_xdp_ct_lookup or bpf_skb_ct_lookup.
+ * @timeout      - Delta time in msecs used to increase the ct entry lifetime.
+ */
+void bpf_ct_refresh_timeout(struct nf_conn *nfct__ref, u32 timeout)
+{
+	if (!nfct__ref)
+		return;
+
+	nf_ct_refresh_timeout(nfct__ref, msecs_to_jiffies(timeout));
+}
+
 __diag_pop()
 
 BTF_SET_START(nf_ct_xdp_check_kfunc_ids)
 BTF_ID(func, bpf_xdp_ct_lookup)
 BTF_ID(func, bpf_ct_release)
+BTF_ID(func, bpf_ct_refresh_timeout)
 BTF_SET_END(nf_ct_xdp_check_kfunc_ids)
 
 BTF_SET_START(nf_ct_tc_check_kfunc_ids)
 BTF_ID(func, bpf_skb_ct_lookup)
 BTF_ID(func, bpf_ct_release)
+BTF_ID(func, bpf_ct_refresh_timeout)
 BTF_SET_END(nf_ct_tc_check_kfunc_ids)
 
 BTF_SET_START(nf_ct_acquire_kfunc_ids)
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 0164e5f522e8..f43e743728bd 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -2030,16 +2030,11 @@ void nf_conntrack_alter_reply(struct nf_conn *ct,
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);
 
-/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
-void __nf_ct_refresh_acct(struct nf_conn *ct,
-			  enum ip_conntrack_info ctinfo,
-			  const struct sk_buff *skb,
-			  u32 extra_jiffies,
-			  bool do_acct)
+void nf_ct_refresh_timeout(struct nf_conn *ct, u32 extra_jiffies)
 {
 	/* Only update if this is not a fixed timeout */
 	if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
-		goto acct;
+		return;
 
 	/* If not in hash table, timer will not be active yet */
 	if (nf_ct_is_confirmed(ct))
@@ -2047,7 +2042,17 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
 
 	if (READ_ONCE(ct->timeout) != extra_jiffies)
 		WRITE_ONCE(ct->timeout, extra_jiffies);
-acct:
+}
+
+/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
+void __nf_ct_refresh_acct(struct nf_conn *ct,
+			  enum ip_conntrack_info ctinfo,
+			  const struct sk_buff *skb,
+			  u32 extra_jiffies,
+			  bool do_acct)
+{
+	nf_ct_refresh_timeout(ct, extra_jiffies);
+
 	if (do_acct)
 		nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len);
 }
-- 
2.35.3


* [PATCH v3 bpf-next 4/5] net: netfilter: add kfunc helper to add a new ct entry
  2022-05-18 10:43 [PATCH v3 bpf-next 0/5] net: netfilter: add kfunc helper to update ct timeout Lorenzo Bianconi
                   ` (2 preceding siblings ...)
  2022-05-18 10:43 ` [PATCH v3 bpf-next 3/5] net: netfilter: add kfunc helper to update ct timeout Lorenzo Bianconi
@ 2022-05-18 10:43 ` Lorenzo Bianconi
  2022-05-18 20:47   ` Toke Høiland-Jørgensen
  2022-05-19  2:06   ` kernel test robot
  2022-05-18 10:43 ` [PATCH v3 bpf-next 5/5] selftests/bpf: add selftest for bpf_xdp_ct_add and bpf_ct_refresh_timeout kfunc Lorenzo Bianconi
  4 siblings, 2 replies; 24+ messages in thread
From: Lorenzo Bianconi @ 2022-05-18 10:43 UTC (permalink / raw)
  To: bpf
  Cc: netdev, ast, daniel, andrii, davem, kuba, edumazet, pabeni,
	pablo, fw, netfilter-devel, lorenzo.bianconi, brouer, toke,
	memxor

Introduce bpf_xdp_ct_add and bpf_skb_ct_add kfunc helpers in order to
add a new entry to the connection tracking table from an eBPF program.
Introduce a bpf_nf_ct_tuple_parse utility routine.
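
For example, an XDP program can insert an entry along these lines
(sketch only; in a real program bpf_ct_opts is typically re-declared
locally with preserve_access_index, as the selftest later in this
series does):

struct bpf_ct_opts opts = { .l4proto = IPPROTO_TCP, .netns_id = -1 };
struct bpf_sock_tuple tup = {};
struct nf_conn *ct;

tup.ipv4.saddr = bpf_get_prandom_u32();	/* arbitrary 4-tuple */
tup.ipv4.daddr = bpf_get_prandom_u32();
tup.ipv4.sport = bpf_get_prandom_u32();
tup.ipv4.dport = bpf_get_prandom_u32();

ct = bpf_xdp_ct_add(ctx, &tup, sizeof(tup.ipv4), &opts, sizeof(opts));
if (!ct)
	return XDP_PASS;	/* opts.error holds the failure reason */
/* drop our reference; the new entry stays in the conntrack table */
bpf_ct_release(ct);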

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 net/netfilter/nf_conntrack_bpf.c | 212 +++++++++++++++++++++++++++----
 1 file changed, 189 insertions(+), 23 deletions(-)

diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
index a9271418db88..3d31b602fdf1 100644
--- a/net/netfilter/nf_conntrack_bpf.c
+++ b/net/netfilter/nf_conntrack_bpf.c
@@ -55,41 +55,114 @@ enum {
 	NF_BPF_CT_OPTS_SZ = 12,
 };
 
-static struct nf_conn *__bpf_nf_ct_lookup(struct net *net,
-					  struct bpf_sock_tuple *bpf_tuple,
-					  u32 tuple_len, u8 protonum,
-					  s32 netns_id, u8 *dir)
+static int bpf_nf_ct_tuple_parse(struct bpf_sock_tuple *bpf_tuple,
+				 u32 tuple_len, u8 protonum, u8 dir,
+				 struct nf_conntrack_tuple *tuple)
 {
-	struct nf_conntrack_tuple_hash *hash;
-	struct nf_conntrack_tuple tuple;
-	struct nf_conn *ct;
+	union nf_inet_addr *src = dir ? &tuple->dst.u3 : &tuple->src.u3;
+	union nf_inet_addr *dst = dir ? &tuple->src.u3 : &tuple->dst.u3;
+	union nf_conntrack_man_proto *sport = dir ? (void *)&tuple->dst.u
+						  : &tuple->src.u;
+	union nf_conntrack_man_proto *dport = dir ? &tuple->src.u
+						  : (void *)&tuple->dst.u;
 
 	if (unlikely(protonum != IPPROTO_TCP && protonum != IPPROTO_UDP))
-		return ERR_PTR(-EPROTO);
-	if (unlikely(netns_id < BPF_F_CURRENT_NETNS))
-		return ERR_PTR(-EINVAL);
+		return -EPROTO;
+
+	memset(tuple, 0, sizeof(*tuple));
 
-	memset(&tuple, 0, sizeof(tuple));
 	switch (tuple_len) {
 	case sizeof(bpf_tuple->ipv4):
-		tuple.src.l3num = AF_INET;
-		tuple.src.u3.ip = bpf_tuple->ipv4.saddr;
-		tuple.src.u.tcp.port = bpf_tuple->ipv4.sport;
-		tuple.dst.u3.ip = bpf_tuple->ipv4.daddr;
-		tuple.dst.u.tcp.port = bpf_tuple->ipv4.dport;
+		tuple->src.l3num = AF_INET;
+		src->ip = bpf_tuple->ipv4.saddr;
+		sport->tcp.port = bpf_tuple->ipv4.sport;
+		dst->ip = bpf_tuple->ipv4.daddr;
+		dport->tcp.port = bpf_tuple->ipv4.dport;
 		break;
 	case sizeof(bpf_tuple->ipv6):
-		tuple.src.l3num = AF_INET6;
-		memcpy(tuple.src.u3.ip6, bpf_tuple->ipv6.saddr, sizeof(bpf_tuple->ipv6.saddr));
-		tuple.src.u.tcp.port = bpf_tuple->ipv6.sport;
-		memcpy(tuple.dst.u3.ip6, bpf_tuple->ipv6.daddr, sizeof(bpf_tuple->ipv6.daddr));
-		tuple.dst.u.tcp.port = bpf_tuple->ipv6.dport;
+		tuple->src.l3num = AF_INET6;
+		memcpy(src->ip6, bpf_tuple->ipv6.saddr, sizeof(bpf_tuple->ipv6.saddr));
+		sport->tcp.port = bpf_tuple->ipv6.sport;
+		memcpy(dst->ip6, bpf_tuple->ipv6.daddr, sizeof(bpf_tuple->ipv6.daddr));
+		dport->tcp.port = bpf_tuple->ipv6.dport;
 		break;
 	default:
-		return ERR_PTR(-EAFNOSUPPORT);
+		return -EAFNOSUPPORT;
 	}
+	tuple->dst.protonum = protonum;
+	tuple->dst.dir = dir;
+
+	return 0;
+}
 
-	tuple.dst.protonum = protonum;
+struct nf_conn *
+__bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple,
+			u32 tuple_len, u8 protonum, s32 netns_id, u32 timeout)
+{
+	struct nf_conntrack_tuple otuple, rtuple;
+	struct nf_conn *ct;
+	int err;
+
+	if (unlikely(netns_id < BPF_F_CURRENT_NETNS))
+		return ERR_PTR(-EINVAL);
+
+	err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, protonum,
+				    IP_CT_DIR_ORIGINAL, &otuple);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, protonum,
+				    IP_CT_DIR_REPLY, &rtuple);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	if (netns_id >= 0) {
+		net = get_net_ns_by_id(net, netns_id);
+		if (unlikely(!net))
+			return ERR_PTR(-ENONET);
+	}
+
+	ct = nf_conntrack_alloc(net, &nf_ct_zone_dflt, &otuple, &rtuple,
+				GFP_ATOMIC);
+	if (IS_ERR(ct))
+		goto out;
+
+	ct->timeout = timeout * HZ + jiffies;
+	ct->status |= IPS_CONFIRMED;
+
+	memset(&ct->proto, 0, sizeof(ct->proto));
+	if (protonum == IPPROTO_TCP)
+		ct->proto.tcp.state = TCP_CONNTRACK_ESTABLISHED;
+
+	err = nf_conntrack_hash_check_insert(ct);
+	if (err < 0) {
+		nf_conntrack_free(ct);
+		ct = ERR_PTR(err);
+	}
+out:
+	if (netns_id >= 0)
+		put_net(net);
+
+	return ct;
+}
+
+static struct nf_conn *__bpf_nf_ct_lookup(struct net *net,
+					  struct bpf_sock_tuple *bpf_tuple,
+					  u32 tuple_len, u8 protonum,
+					  s32 netns_id, u8 *dir)
+{
+	struct nf_conntrack_tuple_hash *hash;
+	struct nf_conntrack_tuple tuple;
+	struct nf_conn *ct;
+	int err;
+
+	if (unlikely(netns_id < BPF_F_CURRENT_NETNS))
+		return ERR_PTR(-EINVAL);
+
+	err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, protonum,
+				    IP_CT_DIR_ORIGINAL, &tuple);
+	if (err < 0)
+		return ERR_PTR(err);
 
 	if (netns_id >= 0) {
 		net = get_net_ns_by_id(net, netns_id);
@@ -114,6 +187,50 @@ __diag_push();
 __diag_ignore_all("-Wmissing-prototypes",
 		  "Global functions as their definitions will be in nf_conntrack BTF");
 
+/* bpf_xdp_ct_add - Add a new CT entry for the given tuple and acquire a
+ *		    reference to it
+ *
+ * Parameters:
+ * @xdp_ctx	- Pointer to ctx (xdp_md) in XDP program
+ *		    Cannot be NULL
+ * @bpf_tuple	- Pointer to memory representing the tuple to add
+ *		    Cannot be NULL
+ * @tuple__sz	- Length of the tuple structure
+ *		    Must be one of sizeof(bpf_tuple->ipv4) or
+ *		    sizeof(bpf_tuple->ipv6)
+ * @opts	- Additional options for entry creation (documented above)
+ *		    Cannot be NULL
+ * @opts__sz	- Length of the bpf_ct_opts structure
+ *		    Must be NF_BPF_CT_OPTS_SZ (12)
+ */
+struct nf_conn *
+bpf_xdp_ct_add(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
+	       u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
+{
+	struct xdp_buff *ctx = (struct xdp_buff *)xdp_ctx;
+	struct nf_conn *nfct;
+
+	BUILD_BUG_ON(sizeof(struct bpf_ct_opts) != NF_BPF_CT_OPTS_SZ);
+	if (!opts)
+		return NULL;
+
+	if (!bpf_tuple || opts->reserved[0] || opts->reserved[1] ||
+	    opts__sz != NF_BPF_CT_OPTS_SZ) {
+		opts->error = -EINVAL;
+		return NULL;
+	}
+
+	nfct =  __bpf_nf_ct_alloc_entry(dev_net(ctx->rxq->dev), bpf_tuple,
+					tuple__sz, opts->l4proto,
+					opts->netns_id, 10);
+	if (IS_ERR_OR_NULL(nfct)) {
+		opts->error = PTR_ERR(nfct);
+		return NULL;
+	}
+
+	return nfct;
+}
+
 /* bpf_xdp_ct_lookup - Lookup CT entry for the given tuple, and acquire a
  *		       reference to it
  *
@@ -157,6 +274,51 @@ bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
 	return nfct;
 }
 
+/* bpf_skb_ct_add - Add a new CT entry for the given tuple and acquire a
+ *		    reference to it
+ *
+ * Parameters:
+ * @skb_ctx	- Pointer to ctx (__sk_buff) in TC program
+ *		    Cannot be NULL
+ * @bpf_tuple	- Pointer to memory representing the tuple to add
+ *		    Cannot be NULL
+ * @tuple__sz	- Length of the tuple structure
+ *		    Must be one of sizeof(bpf_tuple->ipv4) or
+ *		    sizeof(bpf_tuple->ipv6)
+ * @opts	- Additional options for entry creation (documented above)
+ *		    Cannot be NULL
+ * @opts__sz	- Length of the bpf_ct_opts structure
+ *		    Must be NF_BPF_CT_OPTS_SZ (12)
+ */
+struct nf_conn *
+bpf_skb_ct_add(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
+	       u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
+{
+	struct sk_buff *skb = (struct sk_buff *)skb_ctx;
+	struct nf_conn *nfct;
+	struct net *net;
+
+	BUILD_BUG_ON(sizeof(struct bpf_ct_opts) != NF_BPF_CT_OPTS_SZ);
+	if (!opts)
+		return NULL;
+
+	if (!bpf_tuple || opts->reserved[0] || opts->reserved[1] ||
+	    opts__sz != NF_BPF_CT_OPTS_SZ) {
+		opts->error = -EINVAL;
+		return NULL;
+	}
+
+	net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk);
+	nfct = __bpf_nf_ct_alloc_entry(net, bpf_tuple, tuple__sz,
+				       opts->l4proto, opts->netns_id, 10);
+	if (IS_ERR_OR_NULL(nfct)) {
+		opts->error = PTR_ERR(nfct);
+		return NULL;
+	}
+
+	return nfct;
+}
+
 /* bpf_skb_ct_lookup - Lookup CT entry for the given tuple, and acquire a
  *		       reference to it
  *
@@ -238,19 +400,23 @@ void bpf_ct_refresh_timeout(struct nf_conn *nfct__ref, u32 timeout)
 __diag_pop()
 
 BTF_SET_START(nf_ct_xdp_check_kfunc_ids)
+BTF_ID(func, bpf_xdp_ct_add)
 BTF_ID(func, bpf_xdp_ct_lookup)
 BTF_ID(func, bpf_ct_release)
 BTF_ID(func, bpf_ct_refresh_timeout)
 BTF_SET_END(nf_ct_xdp_check_kfunc_ids)
 
 BTF_SET_START(nf_ct_tc_check_kfunc_ids)
+BTF_ID(func, bpf_skb_ct_add)
 BTF_ID(func, bpf_skb_ct_lookup)
 BTF_ID(func, bpf_ct_release)
 BTF_ID(func, bpf_ct_refresh_timeout)
 BTF_SET_END(nf_ct_tc_check_kfunc_ids)
 
 BTF_SET_START(nf_ct_acquire_kfunc_ids)
+BTF_ID(func, bpf_xdp_ct_add)
 BTF_ID(func, bpf_xdp_ct_lookup)
+BTF_ID(func, bpf_skb_ct_add)
 BTF_ID(func, bpf_skb_ct_lookup)
 BTF_SET_END(nf_ct_acquire_kfunc_ids)
 
-- 
2.35.3


* [PATCH v3 bpf-next 5/5] selftests/bpf: add selftest for bpf_xdp_ct_add and bpf_ct_refresh_timeout kfunc
  2022-05-18 10:43 [PATCH v3 bpf-next 0/5] net: netfilter: add kfunc helper to update ct timeout Lorenzo Bianconi
                   ` (3 preceding siblings ...)
  2022-05-18 10:43 ` [PATCH v3 bpf-next 4/5] net: netfilter: add kfunc helper to add a new ct entry Lorenzo Bianconi
@ 2022-05-18 10:43 ` Lorenzo Bianconi
  2022-05-18 18:55   ` Yonghong Song
  2022-05-20 22:04   ` Andrii Nakryiko
  4 siblings, 2 replies; 24+ messages in thread
From: Lorenzo Bianconi @ 2022-05-18 10:43 UTC (permalink / raw)
  To: bpf
  Cc: netdev, ast, daniel, andrii, davem, kuba, edumazet, pabeni,
	pablo, fw, netfilter-devel, lorenzo.bianconi, brouer, toke,
	memxor

Introduce selftests for the following kfunc helpers:
- bpf_xdp_ct_add
- bpf_skb_ct_add
- bpf_ct_refresh_timeout
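
The expected value in the ct timeout check follows from the refresh
semantics; as a sketch of the arithmetic (with CONFIG_HZ jiffies per
second):

/* bpf_ct_refresh_timeout(ct_lk, 10000) sets
 *   ct_lk->timeout ~= jiffies + msecs_to_jiffies(10000)
 * so on the BPF side
 *   test_delta_timeout = (ct_lk->timeout - bpf_jiffies64()) / CONFIG_HZ
 * evaluates to ~10 seconds, minus the few jiffies that elapse between
 * the refresh and the read-back; hence the (9, 10] window accepted by
 * the userspace assert.
 */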

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 .../testing/selftests/bpf/prog_tests/bpf_nf.c |  4 ++
 .../testing/selftests/bpf/progs/test_bpf_nf.c | 72 +++++++++++++++----
 2 files changed, 64 insertions(+), 12 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c
index dd30b1e3a67c..be6c5650892f 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c
@@ -39,6 +39,10 @@ void test_bpf_nf_ct(int mode)
 	ASSERT_EQ(skel->bss->test_enonet_netns_id, -ENONET, "Test ENONET for bad but valid netns_id");
 	ASSERT_EQ(skel->bss->test_enoent_lookup, -ENOENT, "Test ENOENT for failed lookup");
 	ASSERT_EQ(skel->bss->test_eafnosupport, -EAFNOSUPPORT, "Test EAFNOSUPPORT for invalid len__tuple");
+	ASSERT_EQ(skel->bss->test_add_entry, 0, "Test for adding new entry");
+	ASSERT_EQ(skel->bss->test_succ_lookup, 0, "Test for successful lookup");
+	ASSERT_TRUE(skel->bss->test_delta_timeout > 9 && skel->bss->test_delta_timeout <= 10,
+		    "Test for ct timeout update");
 end:
 	test_bpf_nf__destroy(skel);
 }
diff --git a/tools/testing/selftests/bpf/progs/test_bpf_nf.c b/tools/testing/selftests/bpf/progs/test_bpf_nf.c
index f00a9731930e..361430dde3f7 100644
--- a/tools/testing/selftests/bpf/progs/test_bpf_nf.c
+++ b/tools/testing/selftests/bpf/progs/test_bpf_nf.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
 
 #define EAFNOSUPPORT 97
 #define EPROTO 71
@@ -8,6 +9,8 @@
 #define EINVAL 22
 #define ENOENT 2
 
+extern unsigned long CONFIG_HZ __kconfig;
+
 int test_einval_bpf_tuple = 0;
 int test_einval_reserved = 0;
 int test_einval_netns_id = 0;
@@ -16,6 +19,9 @@ int test_eproto_l4proto = 0;
 int test_enonet_netns_id = 0;
 int test_enoent_lookup = 0;
 int test_eafnosupport = 0;
+int test_add_entry = 0;
+int test_succ_lookup = 0;
+u32 test_delta_timeout = 0;
 
 struct nf_conn;
 
@@ -26,31 +32,40 @@ struct bpf_ct_opts___local {
 	u8 reserved[3];
 } __attribute__((preserve_access_index));
 
+struct nf_conn *bpf_xdp_ct_add(struct xdp_md *, struct bpf_sock_tuple *, u32,
+			       struct bpf_ct_opts___local *, u32) __ksym;
 struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *, struct bpf_sock_tuple *, u32,
 				  struct bpf_ct_opts___local *, u32) __ksym;
+struct nf_conn *bpf_skb_ct_add(struct __sk_buff *, struct bpf_sock_tuple *, u32,
+			       struct bpf_ct_opts___local *, u32) __ksym;
 struct nf_conn *bpf_skb_ct_lookup(struct __sk_buff *, struct bpf_sock_tuple *, u32,
 				  struct bpf_ct_opts___local *, u32) __ksym;
 void bpf_ct_release(struct nf_conn *) __ksym;
+void bpf_ct_refresh_timeout(struct nf_conn *, u32) __ksym;
 
 static __always_inline void
-nf_ct_test(struct nf_conn *(*func)(void *, struct bpf_sock_tuple *, u32,
-				   struct bpf_ct_opts___local *, u32),
+nf_ct_test(struct nf_conn *(*look_fn)(void *, struct bpf_sock_tuple *, u32,
+				      struct bpf_ct_opts___local *, u32),
+	   struct nf_conn *(*add_fn)(void *, struct bpf_sock_tuple *, u32,
+				     struct bpf_ct_opts___local *, u32),
 	   void *ctx)
 {
 	struct bpf_ct_opts___local opts_def = { .l4proto = IPPROTO_TCP, .netns_id = -1 };
 	struct bpf_sock_tuple bpf_tuple;
 	struct nf_conn *ct;
+	int err;
 
 	__builtin_memset(&bpf_tuple, 0, sizeof(bpf_tuple.ipv4));
 
-	ct = func(ctx, NULL, 0, &opts_def, sizeof(opts_def));
+	ct = look_fn(ctx, NULL, 0, &opts_def, sizeof(opts_def));
 	if (ct)
 		bpf_ct_release(ct);
 	else
 		test_einval_bpf_tuple = opts_def.error;
 
 	opts_def.reserved[0] = 1;
-	ct = func(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, sizeof(opts_def));
+	ct = look_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def,
+		     sizeof(opts_def));
 	opts_def.reserved[0] = 0;
 	opts_def.l4proto = IPPROTO_TCP;
 	if (ct)
@@ -59,21 +74,24 @@ nf_ct_test(struct nf_conn *(*func)(void *, struct bpf_sock_tuple *, u32,
 		test_einval_reserved = opts_def.error;
 
 	opts_def.netns_id = -2;
-	ct = func(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, sizeof(opts_def));
+	ct = look_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def,
+		     sizeof(opts_def));
 	opts_def.netns_id = -1;
 	if (ct)
 		bpf_ct_release(ct);
 	else
 		test_einval_netns_id = opts_def.error;
 
-	ct = func(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, sizeof(opts_def) - 1);
+	ct = look_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def,
+		     sizeof(opts_def) - 1);
 	if (ct)
 		bpf_ct_release(ct);
 	else
 		test_einval_len_opts = opts_def.error;
 
 	opts_def.l4proto = IPPROTO_ICMP;
-	ct = func(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, sizeof(opts_def));
+	ct = look_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def,
+		     sizeof(opts_def));
 	opts_def.l4proto = IPPROTO_TCP;
 	if (ct)
 		bpf_ct_release(ct);
@@ -81,37 +99,67 @@ nf_ct_test(struct nf_conn *(*func)(void *, struct bpf_sock_tuple *, u32,
 		test_eproto_l4proto = opts_def.error;
 
 	opts_def.netns_id = 0xf00f;
-	ct = func(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, sizeof(opts_def));
+	ct = look_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def,
+		     sizeof(opts_def));
 	opts_def.netns_id = -1;
 	if (ct)
 		bpf_ct_release(ct);
 	else
 		test_enonet_netns_id = opts_def.error;
 
-	ct = func(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, sizeof(opts_def));
+	ct = look_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def,
+		     sizeof(opts_def));
 	if (ct)
 		bpf_ct_release(ct);
 	else
 		test_enoent_lookup = opts_def.error;
 
-	ct = func(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4) - 1, &opts_def, sizeof(opts_def));
+	ct = look_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4) - 1, &opts_def,
+		     sizeof(opts_def));
 	if (ct)
 		bpf_ct_release(ct);
 	else
 		test_eafnosupport = opts_def.error;
+
+	bpf_tuple.ipv4.saddr = bpf_get_prandom_u32(); /* src IP */
+	bpf_tuple.ipv4.daddr = bpf_get_prandom_u32(); /* dst IP */
+	bpf_tuple.ipv4.sport = bpf_htons(bpf_get_prandom_u32()); /* src port */
+	bpf_tuple.ipv4.dport = bpf_htons(bpf_get_prandom_u32()); /* dst port */
+
+	ct = add_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def,
+		    sizeof(opts_def));
+	if (ct) {
+		struct nf_conn *ct_lk;
+
+		ct_lk = look_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4),
+				&opts_def, sizeof(opts_def));
+		if (ct_lk) {
+			/* update ct entry timeout */
+			bpf_ct_refresh_timeout(ct_lk, 10000);
+			test_delta_timeout = ct_lk->timeout - bpf_jiffies64();
+			test_delta_timeout /= CONFIG_HZ;
+			bpf_ct_release(ct_lk);
+		} else {
+			test_succ_lookup = opts_def.error;
+		}
+		bpf_ct_release(ct);
+	} else {
+		test_add_entry = opts_def.error;
+		test_succ_lookup = opts_def.error;
+	}
 }
 
 SEC("xdp")
 int nf_xdp_ct_test(struct xdp_md *ctx)
 {
-	nf_ct_test((void *)bpf_xdp_ct_lookup, ctx);
+	nf_ct_test((void *)bpf_xdp_ct_lookup, (void *)bpf_xdp_ct_add, ctx);
 	return 0;
 }
 
 SEC("tc")
 int nf_skb_ct_test(struct __sk_buff *ctx)
 {
-	nf_ct_test((void *)bpf_skb_ct_lookup, ctx);
+	nf_ct_test((void *)bpf_skb_ct_lookup, (void *)bpf_skb_ct_add, ctx);
 	return 0;
 }
 
-- 
2.35.3


* Re: [PATCH v3 bpf-next 1/5] bpf: Add support for forcing kfunc args to be referenced
  2022-05-18 10:43 ` [PATCH v3 bpf-next 1/5] bpf: Add support for forcing kfunc args to be referenced Lorenzo Bianconi
@ 2022-05-18 17:58   ` Yonghong Song
  2022-05-18 18:06     ` Kumar Kartikeya Dwivedi
  0 siblings, 1 reply; 24+ messages in thread
From: Yonghong Song @ 2022-05-18 17:58 UTC (permalink / raw)
  To: Lorenzo Bianconi, bpf
  Cc: netdev, ast, daniel, andrii, davem, kuba, edumazet, pabeni,
	pablo, fw, netfilter-devel, lorenzo.bianconi, brouer, toke,
	memxor



On 5/18/22 3:43 AM, Lorenzo Bianconi wrote:
> From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
> 
> Similar to how we detect mem, size pairs in kfuncs, teach the verifier
> to treat a __ref suffix on an argument name to imply that it must be a
> referenced pointer when passed to the kfunc. This is required to ensure
> that kfuncs that operate on some object only work on acquired pointers,
> and not on a normal PTR_TO_BTF_ID of the same type that can be obtained
> by pointer walking. Release functions need not specify such a suffix on
> release arguments, as they are already expected to receive one
> referenced argument.
> 
> Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> ---
>   kernel/bpf/btf.c   | 40 ++++++++++++++++++++++++++++++----------
>   net/bpf/test_run.c |  5 +++++
>   2 files changed, 35 insertions(+), 10 deletions(-)
> 
> diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> index 2f0b0440131c..83a354732d96 100644
> --- a/kernel/bpf/btf.c
> +++ b/kernel/bpf/btf.c
> @@ -6021,18 +6021,13 @@ static bool __btf_type_is_scalar_struct(struct bpf_verifier_log *log,
>   	return true;
>   }
>   
> -static bool is_kfunc_arg_mem_size(const struct btf *btf,
> -				  const struct btf_param *arg,
> -				  const struct bpf_reg_state *reg)
> +static bool btf_param_match_suffix(const struct btf *btf,
> +				   const struct btf_param *arg,
> +				   const char *suffix)
>   {
> -	int len, sfx_len = sizeof("__sz") - 1;
> -	const struct btf_type *t;
> +	int len, sfx_len = strlen(suffix);
>   	const char *param_name;
>   
> -	t = btf_type_skip_modifiers(btf, arg->type, NULL);
> -	if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
> -		return false;
> -
>   	/* In the future, this can be ported to use BTF tagging */
>   	param_name = btf_name_by_offset(btf, arg->name_off);
>   	if (str_is_empty(param_name))
> @@ -6041,12 +6036,31 @@ static bool is_kfunc_arg_mem_size(const struct btf *btf,
>   	if (len < sfx_len)
>   		return false;
>   	param_name += len - sfx_len;
> -	if (strncmp(param_name, "__sz", sfx_len))
> +	if (strncmp(param_name, suffix, sfx_len))
>   		return false;
>   
>   	return true;
>   }
>   
> +static bool is_kfunc_arg_ref(const struct btf *btf,
> +			     const struct btf_param *arg)
> +{
> +	return btf_param_match_suffix(btf, arg, "__ref");

Do we also need to do btf_type_skip_modifiers and to ensure
that the type after skipping modifiers is a pointer type?
The current implementation should work for
bpf_kfunc_call_test_ref(), but with the additional checking
we may avoid some accidental mistakes.
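
Something like the following sketch inside is_kfunc_arg_ref() is what
I have in mind (untested):

	const struct btf_type *t;

	t = btf_type_skip_modifiers(btf, arg->type, NULL);
	if (!btf_type_is_ptr(t))
		return false;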

> +}
> +
> +static bool is_kfunc_arg_mem_size(const struct btf *btf,
> +				  const struct btf_param *arg,
> +				  const struct bpf_reg_state *reg)
> +{
> +	const struct btf_type *t;
> +
> +	t = btf_type_skip_modifiers(btf, arg->type, NULL);
> +	if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
> +		return false;
> +
> +	return btf_param_match_suffix(btf, arg, "__sz");
> +}
> +
>   static int btf_check_func_arg_match(struct bpf_verifier_env *env,
>   				    const struct btf *btf, u32 func_id,
>   				    struct bpf_reg_state *regs,
> @@ -6115,6 +6129,12 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
>   			return -EINVAL;
>   		}
>   
> +		/* Check if argument must be a referenced pointer */
> +		if (is_kfunc && is_kfunc_arg_ref(btf, args + i) && !reg->ref_obj_id) {
> +			bpf_log(log, "R%d must be referenced\n", regno);
> +			return -EINVAL;
> +		}
> +
>   		ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
>   		ref_tname = btf_name_by_offset(btf, ref_t->name_off);
>   
> diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
> index 4d08cca771c7..adbc7dd18511 100644
> --- a/net/bpf/test_run.c
> +++ b/net/bpf/test_run.c
> @@ -690,6 +690,10 @@ noinline void bpf_kfunc_call_test_mem_len_fail2(u64 *mem, int len)
>   {
>   }
>   
> +noinline void bpf_kfunc_call_test_ref(struct prog_test_ref_kfunc *p__ref)
> +{
> +}
> +
>   __diag_pop();
>   
[...]

* Re: [PATCH v3 bpf-next 1/5] bpf: Add support for forcing kfunc args to be referenced
  2022-05-18 17:58   ` Yonghong Song
@ 2022-05-18 18:06     ` Kumar Kartikeya Dwivedi
  2022-05-18 18:23       ` Yonghong Song
  0 siblings, 1 reply; 24+ messages in thread
From: Kumar Kartikeya Dwivedi @ 2022-05-18 18:06 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Lorenzo Bianconi, bpf, netdev, ast, daniel, andrii, davem, kuba,
	edumazet, pabeni, pablo, fw, netfilter-devel, lorenzo.bianconi,
	brouer, toke

On Wed, May 18, 2022 at 11:28:12PM IST, Yonghong Song wrote:
>
>
> On 5/18/22 3:43 AM, Lorenzo Bianconi wrote:
> > From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
> >
> > Similar to how we detect mem, size pairs in kfuncs, teach the verifier
> > to treat a __ref suffix on an argument name to imply that it must be a
> > referenced pointer when passed to the kfunc. This is required to ensure
> > that kfuncs that operate on some object only work on acquired pointers,
> > and not on a normal PTR_TO_BTF_ID of the same type that can be obtained
> > by pointer walking. Release functions need not specify such a suffix on
> > release arguments, as they are already expected to receive one
> > referenced argument.
> >
> > Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
> > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> > ---
> >   kernel/bpf/btf.c   | 40 ++++++++++++++++++++++++++++++----------
> >   net/bpf/test_run.c |  5 +++++
> >   2 files changed, 35 insertions(+), 10 deletions(-)
> >
> > diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> > index 2f0b0440131c..83a354732d96 100644
> > --- a/kernel/bpf/btf.c
> > +++ b/kernel/bpf/btf.c
> > @@ -6021,18 +6021,13 @@ static bool __btf_type_is_scalar_struct(struct bpf_verifier_log *log,
> >   	return true;
> >   }
> > -static bool is_kfunc_arg_mem_size(const struct btf *btf,
> > -				  const struct btf_param *arg,
> > -				  const struct bpf_reg_state *reg)
> > +static bool btf_param_match_suffix(const struct btf *btf,
> > +				   const struct btf_param *arg,
> > +				   const char *suffix)
> >   {
> > -	int len, sfx_len = sizeof("__sz") - 1;
> > -	const struct btf_type *t;
> > +	int len, sfx_len = strlen(suffix);
> >   	const char *param_name;
> > -	t = btf_type_skip_modifiers(btf, arg->type, NULL);
> > -	if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
> > -		return false;
> > -
> >   	/* In the future, this can be ported to use BTF tagging */
> >   	param_name = btf_name_by_offset(btf, arg->name_off);
> >   	if (str_is_empty(param_name))
> > @@ -6041,12 +6036,31 @@ static bool is_kfunc_arg_mem_size(const struct btf *btf,
> >   	if (len < sfx_len)
> >   		return false;
> >   	param_name += len - sfx_len;
> > -	if (strncmp(param_name, "__sz", sfx_len))
> > +	if (strncmp(param_name, suffix, sfx_len))
> >   		return false;
> >   	return true;
> >   }
> > +static bool is_kfunc_arg_ref(const struct btf *btf,
> > +			     const struct btf_param *arg)
> > +{
> > +	return btf_param_match_suffix(btf, arg, "__ref");
>
> Do we also need to do btf_type_skip_modifiers and to ensure
> that the type after skipping modifiers is a pointer type?
> The current implementation should work for
> bpf_kfunc_call_test_ref(), but with the additional checking
> we may avoid some accidental mistakes.
>

At the point where this check happens, arg[i].type is already known to be a
pointer type, after skipping modifiers.

> > +}
> > +
> > +static bool is_kfunc_arg_mem_size(const struct btf *btf,
> > +				  const struct btf_param *arg,
> > +				  const struct bpf_reg_state *reg)
> > +{
> > +	const struct btf_type *t;
> > +
> > +	t = btf_type_skip_modifiers(btf, arg->type, NULL);
> > +	if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
> > +		return false;
> > +
> > +	return btf_param_match_suffix(btf, arg, "__sz");
> > +}
> > +
> >   static int btf_check_func_arg_match(struct bpf_verifier_env *env,
> >   				    const struct btf *btf, u32 func_id,
> >   				    struct bpf_reg_state *regs,
> > @@ -6115,6 +6129,12 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
> >   			return -EINVAL;
> >   		}
> > +		/* Check if argument must be a referenced pointer */
> > +		if (is_kfunc && is_kfunc_arg_ref(btf, args + i) && !reg->ref_obj_id) {
> > +			bpf_log(log, "R%d must be referenced\n", regno);
> > +			return -EINVAL;
> > +		}
> > +
> >   		ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
> >   		ref_tname = btf_name_by_offset(btf, ref_t->name_off);
> > diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
> > index 4d08cca771c7..adbc7dd18511 100644
> > --- a/net/bpf/test_run.c
> > +++ b/net/bpf/test_run.c
> > @@ -690,6 +690,10 @@ noinline void bpf_kfunc_call_test_mem_len_fail2(u64 *mem, int len)
> >   {
> >   }
> > +noinline void bpf_kfunc_call_test_ref(struct prog_test_ref_kfunc *p__ref)
> > +{
> > +}
> > +
> >   __diag_pop();
> [...]

--
Kartikeya

* Re: [PATCH v3 bpf-next 1/5] bpf: Add support for forcing kfunc args to be referenced
  2022-05-18 18:06     ` Kumar Kartikeya Dwivedi
@ 2022-05-18 18:23       ` Yonghong Song
  0 siblings, 0 replies; 24+ messages in thread
From: Yonghong Song @ 2022-05-18 18:23 UTC (permalink / raw)
  To: Kumar Kartikeya Dwivedi
  Cc: Lorenzo Bianconi, bpf, netdev, ast, daniel, andrii, davem, kuba,
	edumazet, pabeni, pablo, fw, netfilter-devel, lorenzo.bianconi,
	brouer, toke



On 5/18/22 11:06 AM, Kumar Kartikeya Dwivedi wrote:
> On Wed, May 18, 2022 at 11:28:12PM IST, Yonghong Song wrote:
>>
>>
>> On 5/18/22 3:43 AM, Lorenzo Bianconi wrote:
>>> From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
>>>
>>> Similar to how we detect mem, size pairs in kfuncs, teach the verifier
>>> to treat a __ref suffix on an argument name to imply that it must be a
>>> referenced pointer when passed to the kfunc. This is required to ensure
>>> that kfuncs that operate on some object only work on acquired pointers,
>>> and not on a normal PTR_TO_BTF_ID of the same type that can be obtained
>>> by pointer walking. Release functions need not specify such a suffix on
>>> release arguments, as they are already expected to receive one
>>> referenced argument.
>>>
>>> Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
>>> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
>>> ---
>>>    kernel/bpf/btf.c   | 40 ++++++++++++++++++++++++++++++----------
>>>    net/bpf/test_run.c |  5 +++++
>>>    2 files changed, 35 insertions(+), 10 deletions(-)
>>>
>>> diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
>>> index 2f0b0440131c..83a354732d96 100644
>>> --- a/kernel/bpf/btf.c
>>> +++ b/kernel/bpf/btf.c
>>> @@ -6021,18 +6021,13 @@ static bool __btf_type_is_scalar_struct(struct bpf_verifier_log *log,
>>>    	return true;
>>>    }
>>> -static bool is_kfunc_arg_mem_size(const struct btf *btf,
>>> -				  const struct btf_param *arg,
>>> -				  const struct bpf_reg_state *reg)
>>> +static bool btf_param_match_suffix(const struct btf *btf,
>>> +				   const struct btf_param *arg,
>>> +				   const char *suffix)
>>>    {
>>> -	int len, sfx_len = sizeof("__sz") - 1;
>>> -	const struct btf_type *t;
>>> +	int len, sfx_len = strlen(suffix);
>>>    	const char *param_name;
>>> -	t = btf_type_skip_modifiers(btf, arg->type, NULL);
>>> -	if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
>>> -		return false;
>>> -
>>>    	/* In the future, this can be ported to use BTF tagging */
>>>    	param_name = btf_name_by_offset(btf, arg->name_off);
>>>    	if (str_is_empty(param_name))
>>> @@ -6041,12 +6036,31 @@ static bool is_kfunc_arg_mem_size(const struct btf *btf,
>>>    	if (len < sfx_len)
>>>    		return false;
>>>    	param_name += len - sfx_len;
>>> -	if (strncmp(param_name, "__sz", sfx_len))
>>> +	if (strncmp(param_name, suffix, sfx_len))
>>>    		return false;
>>>    	return true;
>>>    }
>>> +static bool is_kfunc_arg_ref(const struct btf *btf,
>>> +			     const struct btf_param *arg)
>>> +{
>>> +	return btf_param_match_suffix(btf, arg, "__ref");
>>
>> Do we also need to do btf_type_skip_modifiers and to ensure
>> that the type after skipping modifiers is a pointer type?
>> The current implementation should work for
>> bpf_kfunc_call_test_ref(), but with the additional checking
>> we may avoid some accidental mistakes.
>>
> 
> At the point where this check happens, arg[i].type is already known to be a
> pointer type, after skipping modifiers.

Okay, maybe we can add a comment here like
	/* the arg has been verified as a pointer */
But anyway, this is minor.

Acked-by: Yonghong Song <yhs@fb.com>

> 
>>> +}
>>> +
>>> +static bool is_kfunc_arg_mem_size(const struct btf *btf,
>>> +				  const struct btf_param *arg,
>>> +				  const struct bpf_reg_state *reg)
>>> +{
>>> +	const struct btf_type *t;
>>> +
>>> +	t = btf_type_skip_modifiers(btf, arg->type, NULL);
>>> +	if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
>>> +		return false;
>>> +
>>> +	return btf_param_match_suffix(btf, arg, "__sz");
>>> +}
>>> +
>>>    static int btf_check_func_arg_match(struct bpf_verifier_env *env,
>>>    				    const struct btf *btf, u32 func_id,
>>>    				    struct bpf_reg_state *regs,
>>> @@ -6115,6 +6129,12 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
>>>    			return -EINVAL;
>>>    		}
>>> +		/* Check if argument must be a referenced pointer */
>>> +		if (is_kfunc && is_kfunc_arg_ref(btf, args + i) && !reg->ref_obj_id) {
>>> +			bpf_log(log, "R%d must be referenced\n", regno);
>>> +			return -EINVAL;
>>> +		}
>>> +
>>>    		ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
>>>    		ref_tname = btf_name_by_offset(btf, ref_t->name_off);
>>> diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
>>> index 4d08cca771c7..adbc7dd18511 100644
>>> --- a/net/bpf/test_run.c
>>> +++ b/net/bpf/test_run.c
>>> @@ -690,6 +690,10 @@ noinline void bpf_kfunc_call_test_mem_len_fail2(u64 *mem, int len)
>>>    {
>>>    }
>>> +noinline void bpf_kfunc_call_test_ref(struct prog_test_ref_kfunc *p__ref)
>>> +{
>>> +}
>>> +
>>>    __diag_pop();
>> [...]
> 
> --
> Kartikeya

* Re: [PATCH v3 bpf-next 2/5] selftests/bpf: Add verifier selftests for forced kfunc ref args
  2022-05-18 10:43 ` [PATCH v3 bpf-next 2/5] selftests/bpf: Add verifier selftests for forced kfunc ref args Lorenzo Bianconi
@ 2022-05-18 18:25   ` Yonghong Song
  0 siblings, 0 replies; 24+ messages in thread
From: Yonghong Song @ 2022-05-18 18:25 UTC (permalink / raw)
  To: Lorenzo Bianconi, bpf
  Cc: netdev, ast, daniel, andrii, davem, kuba, edumazet, pabeni,
	pablo, fw, netfilter-devel, lorenzo.bianconi, brouer, toke,
	memxor



On 5/18/22 3:43 AM, Lorenzo Bianconi wrote:
> From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
> 
> Make sure the verifier rejects the bad cases and that the good case
> keeps working. The selftests make use of the bpf_kfunc_call_test_ref
> kfunc added in the previous patch purely for verification.
> 
> Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>

Acked-by: Yonghong Song <yhs@fb.com>

* Re: [PATCH v3 bpf-next 5/5] selftests/bpf: add selftest for bpf_xdp_ct_add and bpf_ct_refresh_timeout kfunc
  2022-05-18 10:43 ` [PATCH v3 bpf-next 5/5] selftests/bpf: add selftest for bpf_xdp_ct_add and bpf_ct_refresh_timeout kfunc Lorenzo Bianconi
@ 2022-05-18 18:55   ` Yonghong Song
  2022-05-18 20:38     ` Lorenzo Bianconi
  2022-05-20 22:04   ` Andrii Nakryiko
  1 sibling, 1 reply; 24+ messages in thread
From: Yonghong Song @ 2022-05-18 18:55 UTC (permalink / raw)
  To: Lorenzo Bianconi, bpf
  Cc: netdev, ast, daniel, andrii, davem, kuba, edumazet, pabeni,
	pablo, fw, netfilter-devel, lorenzo.bianconi, brouer, toke,
	memxor



On 5/18/22 3:43 AM, Lorenzo Bianconi wrote:
> Introduce selftests for the following kfunc helpers:
> - bpf_xdp_ct_add
> - bpf_skb_ct_add
> - bpf_ct_refresh_timeout
> 
> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> ---
>   .../testing/selftests/bpf/prog_tests/bpf_nf.c |  4 ++
>   .../testing/selftests/bpf/progs/test_bpf_nf.c | 72 +++++++++++++++----
>   2 files changed, 64 insertions(+), 12 deletions(-)
> 
> diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c
> index dd30b1e3a67c..be6c5650892f 100644
> --- a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c
> +++ b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c
> @@ -39,6 +39,10 @@ void test_bpf_nf_ct(int mode)
>   	ASSERT_EQ(skel->bss->test_enonet_netns_id, -ENONET, "Test ENONET for bad but valid netns_id");
>   	ASSERT_EQ(skel->bss->test_enoent_lookup, -ENOENT, "Test ENOENT for failed lookup");
>   	ASSERT_EQ(skel->bss->test_eafnosupport, -EAFNOSUPPORT, "Test EAFNOSUPPORT for invalid len__tuple");
> +	ASSERT_EQ(skel->bss->test_add_entry, 0, "Test for adding new entry");
> +	ASSERT_EQ(skel->bss->test_succ_lookup, 0, "Test for successful lookup");

The default values of test_add_entry/test_succ_lookup are 0, so even
if the program didn't execute, the above asserts still succeed.
Testing against a non-default (non-zero) initial value might be a
better choice.

> +	ASSERT_TRUE(skel->bss->test_delta_timeout > 9 && skel->bss->test_delta_timeout <= 10,
> +		    "Test for ct timeout update");

ASSERT_TRUE(skel->bss->test_delta_timeout == 10, ...)? Could you add 
some comments on why the value should be 10. It is not obvious by
inspecting the code.

>   end:
>   	test_bpf_nf__destroy(skel);
>   }
> diff --git a/tools/testing/selftests/bpf/progs/test_bpf_nf.c b/tools/testing/selftests/bpf/progs/test_bpf_nf.c
> index f00a9731930e..361430dde3f7 100644
> --- a/tools/testing/selftests/bpf/progs/test_bpf_nf.c
> +++ b/tools/testing/selftests/bpf/progs/test_bpf_nf.c
> @@ -1,6 +1,7 @@
>   // SPDX-License-Identifier: GPL-2.0
>   #include <vmlinux.h>
>   #include <bpf/bpf_helpers.h>
> +#include <bpf/bpf_endian.h>
>   
>   #define EAFNOSUPPORT 97
>   #define EPROTO 71
> @@ -8,6 +9,8 @@
>   #define EINVAL 22
>   #define ENOENT 2
>   
> +extern unsigned long CONFIG_HZ __kconfig;
> +
>   int test_einval_bpf_tuple = 0;
>   int test_einval_reserved = 0;
>   int test_einval_netns_id = 0;
> @@ -16,6 +19,9 @@ int test_eproto_l4proto = 0;
>   int test_enonet_netns_id = 0;
>   int test_enoent_lookup = 0;
>   int test_eafnosupport = 0;
> +int test_add_entry = 0;
> +int test_succ_lookup = 0;
> +u32 test_delta_timeout = 0;
>   
>   struct nf_conn;
>   
> @@ -26,31 +32,40 @@ struct bpf_ct_opts___local {
>   	u8 reserved[3];
>   } __attribute__((preserve_access_index));
>   
> +struct nf_conn *bpf_xdp_ct_add(struct xdp_md *, struct bpf_sock_tuple *, u32,
> +			       struct bpf_ct_opts___local *, u32) __ksym;
>   struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *, struct bpf_sock_tuple *, u32,
>   				  struct bpf_ct_opts___local *, u32) __ksym;
> +struct nf_conn *bpf_skb_ct_add(struct __sk_buff *, struct bpf_sock_tuple *, u32,
> +			       struct bpf_ct_opts___local *, u32) __ksym;
>   struct nf_conn *bpf_skb_ct_lookup(struct __sk_buff *, struct bpf_sock_tuple *, u32,
>   				  struct bpf_ct_opts___local *, u32) __ksym;
>   void bpf_ct_release(struct nf_conn *) __ksym;
> +void bpf_ct_refresh_timeout(struct nf_conn *, u32) __ksym;
>   
>   static __always_inline void
> -nf_ct_test(struct nf_conn *(*func)(void *, struct bpf_sock_tuple *, u32,
> -				   struct bpf_ct_opts___local *, u32),
> +nf_ct_test(struct nf_conn *(*look_fn)(void *, struct bpf_sock_tuple *, u32,
> +				      struct bpf_ct_opts___local *, u32),
> +	   struct nf_conn *(*add_fn)(void *, struct bpf_sock_tuple *, u32,
> +				     struct bpf_ct_opts___local *, u32),
>   	   void *ctx)
>   {
>   	struct bpf_ct_opts___local opts_def = { .l4proto = IPPROTO_TCP, .netns_id = -1 };
>   	struct bpf_sock_tuple bpf_tuple;
>   	struct nf_conn *ct;
> +	int err;
>   
>   	__builtin_memset(&bpf_tuple, 0, sizeof(bpf_tuple.ipv4));
>   
> -	ct = func(ctx, NULL, 0, &opts_def, sizeof(opts_def));
> +	ct = look_fn(ctx, NULL, 0, &opts_def, sizeof(opts_def));
>   	if (ct)
>   		bpf_ct_release(ct);
>   	else
>   		test_einval_bpf_tuple = opts_def.error;
>   
>   	opts_def.reserved[0] = 1;
> -	ct = func(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, sizeof(opts_def));
> +	ct = look_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def,
> +		     sizeof(opts_def));
>   	opts_def.reserved[0] = 0;
>   	opts_def.l4proto = IPPROTO_TCP;
>   	if (ct)
> @@ -59,21 +74,24 @@ nf_ct_test(struct nf_conn *(*func)(void *, struct bpf_sock_tuple *, u32,
>   		test_einval_reserved = opts_def.error;
>   
>   	opts_def.netns_id = -2;
> -	ct = func(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, sizeof(opts_def));
> +	ct = look_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def,
> +		     sizeof(opts_def));
>   	opts_def.netns_id = -1;
>   	if (ct)
>   		bpf_ct_release(ct);
>   	else
>   		test_einval_netns_id = opts_def.error;
>   
> -	ct = func(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, sizeof(opts_def) - 1);
> +	ct = look_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def,
> +		     sizeof(opts_def) - 1);
>   	if (ct)
>   		bpf_ct_release(ct);
>   	else
>   		test_einval_len_opts = opts_def.error;
>   
>   	opts_def.l4proto = IPPROTO_ICMP;
> -	ct = func(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, sizeof(opts_def));
> +	ct = look_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def,
> +		     sizeof(opts_def));
>   	opts_def.l4proto = IPPROTO_TCP;
>   	if (ct)
>   		bpf_ct_release(ct);
> @@ -81,37 +99,67 @@ nf_ct_test(struct nf_conn *(*func)(void *, struct bpf_sock_tuple *, u32,
>   		test_eproto_l4proto = opts_def.error;
>   
>   	opts_def.netns_id = 0xf00f;
> -	ct = func(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, sizeof(opts_def));
> +	ct = look_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def,
> +		     sizeof(opts_def));
>   	opts_def.netns_id = -1;
>   	if (ct)
>   		bpf_ct_release(ct);
>   	else
>   		test_enonet_netns_id = opts_def.error;
>   
> -	ct = func(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, sizeof(opts_def));
> +	ct = look_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def,
> +		     sizeof(opts_def));
>   	if (ct)
>   		bpf_ct_release(ct);
>   	else
>   		test_enoent_lookup = opts_def.error;
>   
> -	ct = func(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4) - 1, &opts_def, sizeof(opts_def));
> +	ct = look_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4) - 1, &opts_def,
> +		     sizeof(opts_def));
>   	if (ct)
>   		bpf_ct_release(ct);
>   	else
>   		test_eafnosupport = opts_def.error;
> +
> +	bpf_tuple.ipv4.saddr = bpf_get_prandom_u32(); /* src IP */
> +	bpf_tuple.ipv4.daddr = bpf_get_prandom_u32(); /* dst IP */
> +	bpf_tuple.ipv4.sport = bpf_htons(bpf_get_prandom_u32()); /* src port */
> +	bpf_tuple.ipv4.dport = bpf_htons(bpf_get_prandom_u32()); /* dst port */

Since it is already a random number, bpf_htons is not needed here;
the bpf_endian.h include can also be removed.

> +
> +	ct = add_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def,
> +		    sizeof(opts_def));
> +	if (ct) {
> +		struct nf_conn *ct_lk;
> +
> +		ct_lk = look_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4),
> +				&opts_def, sizeof(opts_def));
> +		if (ct_lk) {
> +			/* update ct entry timeout */
> +			bpf_ct_refresh_timeout(ct_lk, 10000);
> +			test_delta_timeout = ct_lk->timeout - bpf_jiffies64();
> +			test_delta_timeout /= CONFIG_HZ;
> +			bpf_ct_release(ct_lk);
> +		} else {
> +			test_succ_lookup = opts_def.error;
> +		}
> +		bpf_ct_release(ct);
> +	} else {
> +		test_add_entry = opts_def.error;
> +		test_succ_lookup = opts_def.error;
> +	}
>   }
>   
[...]

* Re: [PATCH v3 bpf-next 5/5] selftests/bpf: add selftest for bpf_xdp_ct_add and bpf_ct_refresh_timeout kfunc
  2022-05-18 18:55   ` Yonghong Song
@ 2022-05-18 20:38     ` Lorenzo Bianconi
  0 siblings, 0 replies; 24+ messages in thread
From: Lorenzo Bianconi @ 2022-05-18 20:38 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Lorenzo Bianconi, bpf, netdev, ast, daniel, andrii, davem, kuba,
	edumazet, pabeni, pablo, fw, netfilter-devel, brouer, toke,
	memxor

[-- Attachment #1: Type: text/plain, Size: 8467 bytes --]

> 
> 
> On 5/18/22 3:43 AM, Lorenzo Bianconi wrote:
> > Introduce selftests for the following kfunc helpers:
> > - bpf_xdp_ct_add
> > - bpf_skb_ct_add
> > - bpf_ct_refresh_timeout
> > 
> > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> > ---
> >   .../testing/selftests/bpf/prog_tests/bpf_nf.c |  4 ++
> >   .../testing/selftests/bpf/progs/test_bpf_nf.c | 72 +++++++++++++++----
> >   2 files changed, 64 insertions(+), 12 deletions(-)
> > 
> > diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c
> > index dd30b1e3a67c..be6c5650892f 100644
> > --- a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c
> > +++ b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c
> > @@ -39,6 +39,10 @@ void test_bpf_nf_ct(int mode)
> >   	ASSERT_EQ(skel->bss->test_enonet_netns_id, -ENONET, "Test ENONET for bad but valid netns_id");
> >   	ASSERT_EQ(skel->bss->test_enoent_lookup, -ENOENT, "Test ENOENT for failed lookup");
> >   	ASSERT_EQ(skel->bss->test_eafnosupport, -EAFNOSUPPORT, "Test EAFNOSUPPORT for invalid len__tuple");
> > +	ASSERT_EQ(skel->bss->test_add_entry, 0, "Test for adding new entry");
> > +	ASSERT_EQ(skel->bss->test_succ_lookup, 0, "Test for successful lookup");
> 
> The default values for test_add_entry/test_succ_lookup are 0, so even if the
> program didn't execute, the above asserts still succeed. Initializing them
> with a non-default (non-zero) value might be a better choice.

ack, I will fix it in v4.
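
e.g. by initializing them with a non-zero sentinel (just a sketch, any
non-zero value works):

	int test_add_entry = -1;
	int test_succ_lookup = -1;

and clearing them explicitly on the success path, so the asserts can only
pass if the program actually ran.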

> 
> > +	ASSERT_TRUE(skel->bss->test_delta_timeout > 9 && skel->bss->test_delta_timeout <= 10,
> > +		    "Test for ct timeout update");
> 
> ASSERT_TRUE(skel->bss->test_delta_timeout == 10, ...)? Could you add a
> comment on why the value should be 10? It is not obvious from
> inspecting the code.

I added some tolerance to avoid races with the jiffies update in
test_bpf_nf.c: a refresh of 10000 corresponds to 10 * CONFIG_HZ jiffies, and
if a tick elapses between bpf_ct_refresh_timeout() and the bpf_jiffies64()
read, the integer division rounds the delta down to 9.

> 
> >   end:
> >   	test_bpf_nf__destroy(skel);
> >   }
> > diff --git a/tools/testing/selftests/bpf/progs/test_bpf_nf.c b/tools/testing/selftests/bpf/progs/test_bpf_nf.c
> > index f00a9731930e..361430dde3f7 100644
> > --- a/tools/testing/selftests/bpf/progs/test_bpf_nf.c
> > +++ b/tools/testing/selftests/bpf/progs/test_bpf_nf.c
> > @@ -1,6 +1,7 @@
> >   // SPDX-License-Identifier: GPL-2.0
> >   #include <vmlinux.h>
> >   #include <bpf/bpf_helpers.h>
> > +#include <bpf/bpf_endian.h>
> >   #define EAFNOSUPPORT 97
> >   #define EPROTO 71
> > @@ -8,6 +9,8 @@
> >   #define EINVAL 22
> >   #define ENOENT 2
> > +extern unsigned long CONFIG_HZ __kconfig;
> > +
> >   int test_einval_bpf_tuple = 0;
> >   int test_einval_reserved = 0;
> >   int test_einval_netns_id = 0;
> > @@ -16,6 +19,9 @@ int test_eproto_l4proto = 0;
> >   int test_enonet_netns_id = 0;
> >   int test_enoent_lookup = 0;
> >   int test_eafnosupport = 0;
> > +int test_add_entry = 0;
> > +int test_succ_lookup = 0;
> > +u32 test_delta_timeout = 0;
> >   struct nf_conn;
> > @@ -26,31 +32,40 @@ struct bpf_ct_opts___local {
> >   	u8 reserved[3];
> >   } __attribute__((preserve_access_index));
> > +struct nf_conn *bpf_xdp_ct_add(struct xdp_md *, struct bpf_sock_tuple *, u32,
> > +			       struct bpf_ct_opts___local *, u32) __ksym;
> >   struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *, struct bpf_sock_tuple *, u32,
> >   				  struct bpf_ct_opts___local *, u32) __ksym;
> > +struct nf_conn *bpf_skb_ct_add(struct __sk_buff *, struct bpf_sock_tuple *, u32,
> > +			       struct bpf_ct_opts___local *, u32) __ksym;
> >   struct nf_conn *bpf_skb_ct_lookup(struct __sk_buff *, struct bpf_sock_tuple *, u32,
> >   				  struct bpf_ct_opts___local *, u32) __ksym;
> >   void bpf_ct_release(struct nf_conn *) __ksym;
> > +void bpf_ct_refresh_timeout(struct nf_conn *, u32) __ksym;
> >   static __always_inline void
> > -nf_ct_test(struct nf_conn *(*func)(void *, struct bpf_sock_tuple *, u32,
> > -				   struct bpf_ct_opts___local *, u32),
> > +nf_ct_test(struct nf_conn *(*look_fn)(void *, struct bpf_sock_tuple *, u32,
> > +				      struct bpf_ct_opts___local *, u32),
> > +	   struct nf_conn *(*add_fn)(void *, struct bpf_sock_tuple *, u32,
> > +				     struct bpf_ct_opts___local *, u32),
> >   	   void *ctx)
> >   {
> >   	struct bpf_ct_opts___local opts_def = { .l4proto = IPPROTO_TCP, .netns_id = -1 };
> >   	struct bpf_sock_tuple bpf_tuple;
> >   	struct nf_conn *ct;
> > +	int err;
> >   	__builtin_memset(&bpf_tuple, 0, sizeof(bpf_tuple.ipv4));
> > -	ct = func(ctx, NULL, 0, &opts_def, sizeof(opts_def));
> > +	ct = look_fn(ctx, NULL, 0, &opts_def, sizeof(opts_def));
> >   	if (ct)
> >   		bpf_ct_release(ct);
> >   	else
> >   		test_einval_bpf_tuple = opts_def.error;
> >   	opts_def.reserved[0] = 1;
> > -	ct = func(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, sizeof(opts_def));
> > +	ct = look_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def,
> > +		     sizeof(opts_def));
> >   	opts_def.reserved[0] = 0;
> >   	opts_def.l4proto = IPPROTO_TCP;
> >   	if (ct)
> > @@ -59,21 +74,24 @@ nf_ct_test(struct nf_conn *(*func)(void *, struct bpf_sock_tuple *, u32,
> >   		test_einval_reserved = opts_def.error;
> >   	opts_def.netns_id = -2;
> > -	ct = func(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, sizeof(opts_def));
> > +	ct = look_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def,
> > +		     sizeof(opts_def));
> >   	opts_def.netns_id = -1;
> >   	if (ct)
> >   		bpf_ct_release(ct);
> >   	else
> >   		test_einval_netns_id = opts_def.error;
> > -	ct = func(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, sizeof(opts_def) - 1);
> > +	ct = look_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def,
> > +		     sizeof(opts_def) - 1);
> >   	if (ct)
> >   		bpf_ct_release(ct);
> >   	else
> >   		test_einval_len_opts = opts_def.error;
> >   	opts_def.l4proto = IPPROTO_ICMP;
> > -	ct = func(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, sizeof(opts_def));
> > +	ct = look_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def,
> > +		     sizeof(opts_def));
> >   	opts_def.l4proto = IPPROTO_TCP;
> >   	if (ct)
> >   		bpf_ct_release(ct);
> > @@ -81,37 +99,67 @@ nf_ct_test(struct nf_conn *(*func)(void *, struct bpf_sock_tuple *, u32,
> >   		test_eproto_l4proto = opts_def.error;
> >   	opts_def.netns_id = 0xf00f;
> > -	ct = func(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, sizeof(opts_def));
> > +	ct = look_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def,
> > +		     sizeof(opts_def));
> >   	opts_def.netns_id = -1;
> >   	if (ct)
> >   		bpf_ct_release(ct);
> >   	else
> >   		test_enonet_netns_id = opts_def.error;
> > -	ct = func(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, sizeof(opts_def));
> > +	ct = look_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def,
> > +		     sizeof(opts_def));
> >   	if (ct)
> >   		bpf_ct_release(ct);
> >   	else
> >   		test_enoent_lookup = opts_def.error;
> > -	ct = func(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4) - 1, &opts_def, sizeof(opts_def));
> > +	ct = look_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4) - 1, &opts_def,
> > +		     sizeof(opts_def));
> >   	if (ct)
> >   		bpf_ct_release(ct);
> >   	else
> >   		test_eafnosupport = opts_def.error;
> > +
> > +	bpf_tuple.ipv4.saddr = bpf_get_prandom_u32(); /* src IP */
> > +	bpf_tuple.ipv4.daddr = bpf_get_prandom_u32(); /* dst IP */
> > +	bpf_tuple.ipv4.sport = bpf_htons(bpf_get_prandom_u32()); /* src port */
> > +	bpf_tuple.ipv4.dport = bpf_htons(bpf_get_prandom_u32()); /* dst port */
> 
> Since it is already a random number, bpf_htons is not needed here.
> The bpf_endian.h include can then also be removed.

ack, I will fix it in v4.

Regards,
Lorenzo

> 
> > +
> > +	ct = add_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def,
> > +		    sizeof(opts_def));
> > +	if (ct) {
> > +		struct nf_conn *ct_lk;
> > +
> > +		ct_lk = look_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4),
> > +				&opts_def, sizeof(opts_def));
> > +		if (ct_lk) {
> > +			/* update ct entry timeout */
> > +			bpf_ct_refresh_timeout(ct_lk, 10000);
> > +			test_delta_timeout = ct_lk->timeout - bpf_jiffies64();
> > +			test_delta_timeout /= CONFIG_HZ;
> > +			bpf_ct_release(ct_lk);
> > +		} else {
> > +			test_succ_lookup = opts_def.error;
> > +		}
> > +		bpf_ct_release(ct);
> > +	} else {
> > +		test_add_entry = opts_def.error;
> > +		test_succ_lookup = opts_def.error;
> > +	}
> >   }
> [...]
> 

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 bpf-next 3/5] net: netfilter: add kfunc helper to update ct timeout
  2022-05-18 10:43 ` [PATCH v3 bpf-next 3/5] net: netfilter: add kfunc helper to update ct timeout Lorenzo Bianconi
@ 2022-05-18 20:42   ` Toke Høiland-Jørgensen
  0 siblings, 0 replies; 24+ messages in thread
From: Toke Høiland-Jørgensen @ 2022-05-18 20:42 UTC (permalink / raw)
  To: Lorenzo Bianconi, bpf
  Cc: netdev, ast, daniel, andrii, davem, kuba, edumazet, pabeni,
	pablo, fw, netfilter-devel, lorenzo.bianconi, brouer, memxor

Lorenzo Bianconi <lorenzo@kernel.org> writes:

> Introduce bpf_ct_refresh_timeout kfunc helper in order to update
> nf_conn lifetime. Move timeout update logic in nf_ct_refresh_timeout
> utility routine.
>
> Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>

Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 bpf-next 4/5] net: netfilter: add kfunc helper to add a new ct entry
  2022-05-18 10:43 ` [PATCH v3 bpf-next 4/5] net: netfilter: add kfunc helper to add a new ct entry Lorenzo Bianconi
@ 2022-05-18 20:47   ` Toke Høiland-Jørgensen
  2022-05-18 21:08     ` Lorenzo Bianconi
  2022-05-19  2:06   ` kernel test robot
  1 sibling, 1 reply; 24+ messages in thread
From: Toke Høiland-Jørgensen @ 2022-05-18 20:47 UTC (permalink / raw)
  To: Lorenzo Bianconi, bpf
  Cc: netdev, ast, daniel, andrii, davem, kuba, edumazet, pabeni,
	pablo, fw, netfilter-devel, lorenzo.bianconi, brouer, memxor

Lorenzo Bianconi <lorenzo@kernel.org> writes:

> Introduce bpf_xdp_ct_add and bpf_skb_ct_add kfunc helpers in order to
> add a new entry to ct map from an ebpf program.
> Introduce bpf_nf_ct_tuple_parse utility routine.
>
> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> ---
>  net/netfilter/nf_conntrack_bpf.c | 212 +++++++++++++++++++++++++++----
>  1 file changed, 189 insertions(+), 23 deletions(-)
>
> diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
> index a9271418db88..3d31b602fdf1 100644
> --- a/net/netfilter/nf_conntrack_bpf.c
> +++ b/net/netfilter/nf_conntrack_bpf.c
> @@ -55,41 +55,114 @@ enum {
>  	NF_BPF_CT_OPTS_SZ = 12,
>  };
>  
> -static struct nf_conn *__bpf_nf_ct_lookup(struct net *net,
> -					  struct bpf_sock_tuple *bpf_tuple,
> -					  u32 tuple_len, u8 protonum,
> -					  s32 netns_id, u8 *dir)
> +static int bpf_nf_ct_tuple_parse(struct bpf_sock_tuple *bpf_tuple,
> +				 u32 tuple_len, u8 protonum, u8 dir,
> +				 struct nf_conntrack_tuple *tuple)
>  {
> -	struct nf_conntrack_tuple_hash *hash;
> -	struct nf_conntrack_tuple tuple;
> -	struct nf_conn *ct;
> +	union nf_inet_addr *src = dir ? &tuple->dst.u3 : &tuple->src.u3;
> +	union nf_inet_addr *dst = dir ? &tuple->src.u3 : &tuple->dst.u3;
> +	union nf_conntrack_man_proto *sport = dir ? (void *)&tuple->dst.u
> +						  : &tuple->src.u;
> +	union nf_conntrack_man_proto *dport = dir ? &tuple->src.u
> +						  : (void *)&tuple->dst.u;
>  
>  	if (unlikely(protonum != IPPROTO_TCP && protonum != IPPROTO_UDP))
> -		return ERR_PTR(-EPROTO);
> -	if (unlikely(netns_id < BPF_F_CURRENT_NETNS))
> -		return ERR_PTR(-EINVAL);
> +		return -EPROTO;
> +
> +	memset(tuple, 0, sizeof(*tuple));
>  
> -	memset(&tuple, 0, sizeof(tuple));
>  	switch (tuple_len) {
>  	case sizeof(bpf_tuple->ipv4):
> -		tuple.src.l3num = AF_INET;
> -		tuple.src.u3.ip = bpf_tuple->ipv4.saddr;
> -		tuple.src.u.tcp.port = bpf_tuple->ipv4.sport;
> -		tuple.dst.u3.ip = bpf_tuple->ipv4.daddr;
> -		tuple.dst.u.tcp.port = bpf_tuple->ipv4.dport;
> +		tuple->src.l3num = AF_INET;
> +		src->ip = bpf_tuple->ipv4.saddr;
> +		sport->tcp.port = bpf_tuple->ipv4.sport;
> +		dst->ip = bpf_tuple->ipv4.daddr;
> +		dport->tcp.port = bpf_tuple->ipv4.dport;
>  		break;
>  	case sizeof(bpf_tuple->ipv6):
> -		tuple.src.l3num = AF_INET6;
> -		memcpy(tuple.src.u3.ip6, bpf_tuple->ipv6.saddr, sizeof(bpf_tuple->ipv6.saddr));
> -		tuple.src.u.tcp.port = bpf_tuple->ipv6.sport;
> -		memcpy(tuple.dst.u3.ip6, bpf_tuple->ipv6.daddr, sizeof(bpf_tuple->ipv6.daddr));
> -		tuple.dst.u.tcp.port = bpf_tuple->ipv6.dport;
> +		tuple->src.l3num = AF_INET6;
> +		memcpy(src->ip6, bpf_tuple->ipv6.saddr, sizeof(bpf_tuple->ipv6.saddr));
> +		sport->tcp.port = bpf_tuple->ipv6.sport;
> +		memcpy(dst->ip6, bpf_tuple->ipv6.daddr, sizeof(bpf_tuple->ipv6.daddr));
> +		dport->tcp.port = bpf_tuple->ipv6.dport;
>  		break;
>  	default:
> -		return ERR_PTR(-EAFNOSUPPORT);
> +		return -EAFNOSUPPORT;
>  	}
> +	tuple->dst.protonum = protonum;
> +	tuple->dst.dir = dir;
> +
> +	return 0;
> +}
>  
> -	tuple.dst.protonum = protonum;
> +struct nf_conn *
> +__bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple,
> +			u32 tuple_len, u8 protonum, s32 netns_id, u32 timeout)
> +{
> +	struct nf_conntrack_tuple otuple, rtuple;
> +	struct nf_conn *ct;
> +	int err;
> +
> +	if (unlikely(netns_id < BPF_F_CURRENT_NETNS))
> +		return ERR_PTR(-EINVAL);
> +
> +	err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, protonum,
> +				    IP_CT_DIR_ORIGINAL, &otuple);
> +	if (err < 0)
> +		return ERR_PTR(err);
> +
> +	err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, protonum,
> +				    IP_CT_DIR_REPLY, &rtuple);
> +	if (err < 0)
> +		return ERR_PTR(err);
> +
> +	if (netns_id >= 0) {
> +		net = get_net_ns_by_id(net, netns_id);
> +		if (unlikely(!net))
> +			return ERR_PTR(-ENONET);
> +	}
> +
> +	ct = nf_conntrack_alloc(net, &nf_ct_zone_dflt, &otuple, &rtuple,
> +				GFP_ATOMIC);
> +	if (IS_ERR(ct))
> +		goto out;
> +
> +	ct->timeout = timeout * HZ + jiffies;
> +	ct->status |= IPS_CONFIRMED;
> +
> +	memset(&ct->proto, 0, sizeof(ct->proto));
> +	if (protonum == IPPROTO_TCP)
> +		ct->proto.tcp.state = TCP_CONNTRACK_ESTABLISHED;

Hmm, isn't it a bit limiting to hard-code this to ESTABLISHED
connections? Presumably for TCP you'd want to use this when you see a
SYN and then rely on conntrack to help with the subsequent state
tracking for when the SYN-ACK comes back? What's the usecase for
creating an entry in ESTABLISHED state, exactly?

(Of course, we'd need to be able to update the state as well, then...)
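
To make the question concrete, something like this is what I had in mind
(purely illustrative; the tcp_state member doesn't exist in bpf_ct_opts
today):

	struct bpf_ct_opts___local opts = {
		.l4proto = IPPROTO_TCP,
		.netns_id = -1,
		.tcp_state = TCP_CONNTRACK_SYN_SENT, /* hypothetical new field */
	};

	/* seed the entry when the SYN is seen, instead of hard-coding
	 * ESTABLISHED, and let conntrack track the rest of the handshake
	 */
	ct = bpf_xdp_ct_add(ctx, &tuple, sizeof(tuple.ipv4), &opts, sizeof(opts));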

-Toke


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 bpf-next 4/5] net: netfilter: add kfunc helper to add a new ct entry
  2022-05-18 20:47   ` Toke Høiland-Jørgensen
@ 2022-05-18 21:08     ` Lorenzo Bianconi
  2022-05-18 22:14       ` Alexei Starovoitov
  0 siblings, 1 reply; 24+ messages in thread
From: Lorenzo Bianconi @ 2022-05-18 21:08 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: Lorenzo Bianconi, bpf, netdev, ast, daniel, andrii, davem, kuba,
	edumazet, pabeni, pablo, fw, netfilter-devel, brouer, memxor

[-- Attachment #1: Type: text/plain, Size: 5117 bytes --]

> Lorenzo Bianconi <lorenzo@kernel.org> writes:
> 
> > Introduce bpf_xdp_ct_add and bpf_skb_ct_add kfunc helpers in order to
> > add a new entry to ct map from an ebpf program.
> > Introduce bpf_nf_ct_tuple_parse utility routine.
> >
> > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> > ---
> >  net/netfilter/nf_conntrack_bpf.c | 212 +++++++++++++++++++++++++++----
> >  1 file changed, 189 insertions(+), 23 deletions(-)
> >
> > diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
> > index a9271418db88..3d31b602fdf1 100644
> > --- a/net/netfilter/nf_conntrack_bpf.c
> > +++ b/net/netfilter/nf_conntrack_bpf.c
> > @@ -55,41 +55,114 @@ enum {
> >  	NF_BPF_CT_OPTS_SZ = 12,
> >  };
> >  
> > -static struct nf_conn *__bpf_nf_ct_lookup(struct net *net,
> > -					  struct bpf_sock_tuple *bpf_tuple,
> > -					  u32 tuple_len, u8 protonum,
> > -					  s32 netns_id, u8 *dir)
> > +static int bpf_nf_ct_tuple_parse(struct bpf_sock_tuple *bpf_tuple,
> > +				 u32 tuple_len, u8 protonum, u8 dir,
> > +				 struct nf_conntrack_tuple *tuple)
> >  {
> > -	struct nf_conntrack_tuple_hash *hash;
> > -	struct nf_conntrack_tuple tuple;
> > -	struct nf_conn *ct;
> > +	union nf_inet_addr *src = dir ? &tuple->dst.u3 : &tuple->src.u3;
> > +	union nf_inet_addr *dst = dir ? &tuple->src.u3 : &tuple->dst.u3;
> > +	union nf_conntrack_man_proto *sport = dir ? (void *)&tuple->dst.u
> > +						  : &tuple->src.u;
> > +	union nf_conntrack_man_proto *dport = dir ? &tuple->src.u
> > +						  : (void *)&tuple->dst.u;
> >  
> >  	if (unlikely(protonum != IPPROTO_TCP && protonum != IPPROTO_UDP))
> > -		return ERR_PTR(-EPROTO);
> > -	if (unlikely(netns_id < BPF_F_CURRENT_NETNS))
> > -		return ERR_PTR(-EINVAL);
> > +		return -EPROTO;
> > +
> > +	memset(tuple, 0, sizeof(*tuple));
> >  
> > -	memset(&tuple, 0, sizeof(tuple));
> >  	switch (tuple_len) {
> >  	case sizeof(bpf_tuple->ipv4):
> > -		tuple.src.l3num = AF_INET;
> > -		tuple.src.u3.ip = bpf_tuple->ipv4.saddr;
> > -		tuple.src.u.tcp.port = bpf_tuple->ipv4.sport;
> > -		tuple.dst.u3.ip = bpf_tuple->ipv4.daddr;
> > -		tuple.dst.u.tcp.port = bpf_tuple->ipv4.dport;
> > +		tuple->src.l3num = AF_INET;
> > +		src->ip = bpf_tuple->ipv4.saddr;
> > +		sport->tcp.port = bpf_tuple->ipv4.sport;
> > +		dst->ip = bpf_tuple->ipv4.daddr;
> > +		dport->tcp.port = bpf_tuple->ipv4.dport;
> >  		break;
> >  	case sizeof(bpf_tuple->ipv6):
> > -		tuple.src.l3num = AF_INET6;
> > -		memcpy(tuple.src.u3.ip6, bpf_tuple->ipv6.saddr, sizeof(bpf_tuple->ipv6.saddr));
> > -		tuple.src.u.tcp.port = bpf_tuple->ipv6.sport;
> > -		memcpy(tuple.dst.u3.ip6, bpf_tuple->ipv6.daddr, sizeof(bpf_tuple->ipv6.daddr));
> > -		tuple.dst.u.tcp.port = bpf_tuple->ipv6.dport;
> > +		tuple->src.l3num = AF_INET6;
> > +		memcpy(src->ip6, bpf_tuple->ipv6.saddr, sizeof(bpf_tuple->ipv6.saddr));
> > +		sport->tcp.port = bpf_tuple->ipv6.sport;
> > +		memcpy(dst->ip6, bpf_tuple->ipv6.daddr, sizeof(bpf_tuple->ipv6.daddr));
> > +		dport->tcp.port = bpf_tuple->ipv6.dport;
> >  		break;
> >  	default:
> > -		return ERR_PTR(-EAFNOSUPPORT);
> > +		return -EAFNOSUPPORT;
> >  	}
> > +	tuple->dst.protonum = protonum;
> > +	tuple->dst.dir = dir;
> > +
> > +	return 0;
> > +}
> >  
> > -	tuple.dst.protonum = protonum;
> > +struct nf_conn *
> > +__bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple,
> > +			u32 tuple_len, u8 protonum, s32 netns_id, u32 timeout)
> > +{
> > +	struct nf_conntrack_tuple otuple, rtuple;
> > +	struct nf_conn *ct;
> > +	int err;
> > +
> > +	if (unlikely(netns_id < BPF_F_CURRENT_NETNS))
> > +		return ERR_PTR(-EINVAL);
> > +
> > +	err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, protonum,
> > +				    IP_CT_DIR_ORIGINAL, &otuple);
> > +	if (err < 0)
> > +		return ERR_PTR(err);
> > +
> > +	err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, protonum,
> > +				    IP_CT_DIR_REPLY, &rtuple);
> > +	if (err < 0)
> > +		return ERR_PTR(err);
> > +
> > +	if (netns_id >= 0) {
> > +		net = get_net_ns_by_id(net, netns_id);
> > +		if (unlikely(!net))
> > +			return ERR_PTR(-ENONET);
> > +	}
> > +
> > +	ct = nf_conntrack_alloc(net, &nf_ct_zone_dflt, &otuple, &rtuple,
> > +				GFP_ATOMIC);
> > +	if (IS_ERR(ct))
> > +		goto out;
> > +
> > +	ct->timeout = timeout * HZ + jiffies;
> > +	ct->status |= IPS_CONFIRMED;
> > +
> > +	memset(&ct->proto, 0, sizeof(ct->proto));
> > +	if (protonum == IPPROTO_TCP)
> > +		ct->proto.tcp.state = TCP_CONNTRACK_ESTABLISHED;
> 
> Hmm, isn't it a bit limiting to hard-code this to ESTABLISHED
> connections? Presumably for TCP you'd want to use this when you see a
> SYN and then rely on conntrack to help with the subsequent state
> tracking for when the SYN-ACK comes back? What's the usecase for
> creating an entry in ESTABLISHED state, exactly?

I guess we can even add a parameter and pass the state in from the caller;
I was not sure whether that is actually needed.
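
e.g. something along these lines (just a sketch):

	struct nf_conn *
	bpf_xdp_ct_add(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
		       u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz,
		       u8 tcp_state);

with tcp_state (or a new field in bpf_ct_opts) validated against
TCP_CONNTRACK_MAX before it is applied to the entry.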

Regards,
Lorenzo

> 
> (Of course, we'd need to be able to update the state as well, then...)
> 
> -Toke
> 

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 bpf-next 4/5] net: netfilter: add kfunc helper to add a new ct entry
  2022-05-18 21:08     ` Lorenzo Bianconi
@ 2022-05-18 22:14       ` Alexei Starovoitov
  2022-05-18 22:43         ` Kumar Kartikeya Dwivedi
  0 siblings, 1 reply; 24+ messages in thread
From: Alexei Starovoitov @ 2022-05-18 22:14 UTC (permalink / raw)
  To: Lorenzo Bianconi
  Cc: Toke Høiland-Jørgensen, Lorenzo Bianconi, bpf,
	Network Development, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, David S. Miller, Jakub Kicinski, Eric Dumazet,
	Paolo Abeni, Pablo Neira Ayuso, Florian Westphal,
	netfilter-devel, Jesper Dangaard Brouer, Kumar Kartikeya Dwivedi

On Wed, May 18, 2022 at 2:09 PM Lorenzo Bianconi
<lorenzo.bianconi@redhat.com> wrote:
>
> > Lorenzo Bianconi <lorenzo@kernel.org> writes:
> >
> > > Introduce bpf_xdp_ct_add and bpf_skb_ct_add kfunc helpers in order to
> > > add a new entry to ct map from an ebpf program.
> > > Introduce bpf_nf_ct_tuple_parse utility routine.
> > >
> > > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> > > ---
> > >  net/netfilter/nf_conntrack_bpf.c | 212 +++++++++++++++++++++++++++----
> > >  1 file changed, 189 insertions(+), 23 deletions(-)
> > >
> > > diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
> > > index a9271418db88..3d31b602fdf1 100644
> > > --- a/net/netfilter/nf_conntrack_bpf.c
> > > +++ b/net/netfilter/nf_conntrack_bpf.c
> > > @@ -55,41 +55,114 @@ enum {
> > >     NF_BPF_CT_OPTS_SZ = 12,
> > >  };
> > >
> > > -static struct nf_conn *__bpf_nf_ct_lookup(struct net *net,
> > > -                                     struct bpf_sock_tuple *bpf_tuple,
> > > -                                     u32 tuple_len, u8 protonum,
> > > -                                     s32 netns_id, u8 *dir)
> > > +static int bpf_nf_ct_tuple_parse(struct bpf_sock_tuple *bpf_tuple,
> > > +                            u32 tuple_len, u8 protonum, u8 dir,
> > > +                            struct nf_conntrack_tuple *tuple)
> > >  {
> > > -   struct nf_conntrack_tuple_hash *hash;
> > > -   struct nf_conntrack_tuple tuple;
> > > -   struct nf_conn *ct;
> > > +   union nf_inet_addr *src = dir ? &tuple->dst.u3 : &tuple->src.u3;
> > > +   union nf_inet_addr *dst = dir ? &tuple->src.u3 : &tuple->dst.u3;
> > > +   union nf_conntrack_man_proto *sport = dir ? (void *)&tuple->dst.u
> > > +                                             : &tuple->src.u;
> > > +   union nf_conntrack_man_proto *dport = dir ? &tuple->src.u
> > > +                                             : (void *)&tuple->dst.u;
> > >
> > >     if (unlikely(protonum != IPPROTO_TCP && protonum != IPPROTO_UDP))
> > > -           return ERR_PTR(-EPROTO);
> > > -   if (unlikely(netns_id < BPF_F_CURRENT_NETNS))
> > > -           return ERR_PTR(-EINVAL);
> > > +           return -EPROTO;
> > > +
> > > +   memset(tuple, 0, sizeof(*tuple));
> > >
> > > -   memset(&tuple, 0, sizeof(tuple));
> > >     switch (tuple_len) {
> > >     case sizeof(bpf_tuple->ipv4):
> > > -           tuple.src.l3num = AF_INET;
> > > -           tuple.src.u3.ip = bpf_tuple->ipv4.saddr;
> > > -           tuple.src.u.tcp.port = bpf_tuple->ipv4.sport;
> > > -           tuple.dst.u3.ip = bpf_tuple->ipv4.daddr;
> > > -           tuple.dst.u.tcp.port = bpf_tuple->ipv4.dport;
> > > +           tuple->src.l3num = AF_INET;
> > > +           src->ip = bpf_tuple->ipv4.saddr;
> > > +           sport->tcp.port = bpf_tuple->ipv4.sport;
> > > +           dst->ip = bpf_tuple->ipv4.daddr;
> > > +           dport->tcp.port = bpf_tuple->ipv4.dport;
> > >             break;
> > >     case sizeof(bpf_tuple->ipv6):
> > > -           tuple.src.l3num = AF_INET6;
> > > -           memcpy(tuple.src.u3.ip6, bpf_tuple->ipv6.saddr, sizeof(bpf_tuple->ipv6.saddr));
> > > -           tuple.src.u.tcp.port = bpf_tuple->ipv6.sport;
> > > -           memcpy(tuple.dst.u3.ip6, bpf_tuple->ipv6.daddr, sizeof(bpf_tuple->ipv6.daddr));
> > > -           tuple.dst.u.tcp.port = bpf_tuple->ipv6.dport;
> > > +           tuple->src.l3num = AF_INET6;
> > > +           memcpy(src->ip6, bpf_tuple->ipv6.saddr, sizeof(bpf_tuple->ipv6.saddr));
> > > +           sport->tcp.port = bpf_tuple->ipv6.sport;
> > > +           memcpy(dst->ip6, bpf_tuple->ipv6.daddr, sizeof(bpf_tuple->ipv6.daddr));
> > > +           dport->tcp.port = bpf_tuple->ipv6.dport;
> > >             break;
> > >     default:
> > > -           return ERR_PTR(-EAFNOSUPPORT);
> > > +           return -EAFNOSUPPORT;
> > >     }
> > > +   tuple->dst.protonum = protonum;
> > > +   tuple->dst.dir = dir;
> > > +
> > > +   return 0;
> > > +}
> > >
> > > -   tuple.dst.protonum = protonum;
> > > +struct nf_conn *
> > > +__bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple,
> > > +                   u32 tuple_len, u8 protonum, s32 netns_id, u32 timeout)
> > > +{
> > > +   struct nf_conntrack_tuple otuple, rtuple;
> > > +   struct nf_conn *ct;
> > > +   int err;
> > > +
> > > +   if (unlikely(netns_id < BPF_F_CURRENT_NETNS))
> > > +           return ERR_PTR(-EINVAL);
> > > +
> > > +   err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, protonum,
> > > +                               IP_CT_DIR_ORIGINAL, &otuple);
> > > +   if (err < 0)
> > > +           return ERR_PTR(err);
> > > +
> > > +   err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, protonum,
> > > +                               IP_CT_DIR_REPLY, &rtuple);
> > > +   if (err < 0)
> > > +           return ERR_PTR(err);
> > > +
> > > +   if (netns_id >= 0) {
> > > +           net = get_net_ns_by_id(net, netns_id);
> > > +           if (unlikely(!net))
> > > +                   return ERR_PTR(-ENONET);
> > > +   }
> > > +
> > > +   ct = nf_conntrack_alloc(net, &nf_ct_zone_dflt, &otuple, &rtuple,
> > > +                           GFP_ATOMIC);
> > > +   if (IS_ERR(ct))
> > > +           goto out;
> > > +
> > > +   ct->timeout = timeout * HZ + jiffies;
> > > +   ct->status |= IPS_CONFIRMED;
> > > +
> > > +   memset(&ct->proto, 0, sizeof(ct->proto));
> > > +   if (protonum == IPPROTO_TCP)
> > > +           ct->proto.tcp.state = TCP_CONNTRACK_ESTABLISHED;
> >
> > Hmm, isn't it a bit limiting to hard-code this to ESTABLISHED
> > connections? Presumably for TCP you'd want to use this when you see a
> > SYN and then rely on conntrack to help with the subsequent state
> > tracking for when the SYN-ACK comes back? What's the usecase for
> > creating an entry in ESTABLISHED state, exactly?
>
> I guess we can even add a parameter and pass the state from the caller.
> I was not sure if it is mandatory.

It's probably cleaner and more flexible to split
_alloc and _insert into two kfuncs and let bpf
prog populate ct directly.
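
Roughly (names are placeholders):

	/* returns an allocated, not yet inserted entry */
	struct nf_conn *bpf_xdp_ct_alloc(struct xdp_md *, struct bpf_sock_tuple *,
					 u32, struct bpf_ct_opts *, u32) __ksym;
	/* consumes the ref from _alloc, returns the inserted, read-only entry */
	struct nf_conn *bpf_ct_insert_entry(struct nf_conn *) __ksym;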

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 bpf-next 4/5] net: netfilter: add kfunc helper to add a new ct entry
  2022-05-18 22:14       ` Alexei Starovoitov
@ 2022-05-18 22:43         ` Kumar Kartikeya Dwivedi
  2022-05-19 10:45           ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 24+ messages in thread
From: Kumar Kartikeya Dwivedi @ 2022-05-18 22:43 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Lorenzo Bianconi, Toke Høiland-Jørgensen,
	Lorenzo Bianconi, bpf, Network Development, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, David S. Miller,
	Jakub Kicinski, Eric Dumazet, Paolo Abeni, Pablo Neira Ayuso,
	Florian Westphal, netfilter-devel, Jesper Dangaard Brouer

On Thu, May 19, 2022 at 03:44:58AM IST, Alexei Starovoitov wrote:
> On Wed, May 18, 2022 at 2:09 PM Lorenzo Bianconi
> <lorenzo.bianconi@redhat.com> wrote:
> >
> > > Lorenzo Bianconi <lorenzo@kernel.org> writes:
> > >
> > > > Introduce bpf_xdp_ct_add and bpf_skb_ct_add kfunc helpers in order to
> > > > add a new entry to ct map from an ebpf program.
> > > > Introduce bpf_nf_ct_tuple_parse utility routine.
> > > >
> > > > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> > > > ---
> > > >  net/netfilter/nf_conntrack_bpf.c | 212 +++++++++++++++++++++++++++----
> > > >  1 file changed, 189 insertions(+), 23 deletions(-)
> > > >
> > > > diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
> > > > index a9271418db88..3d31b602fdf1 100644
> > > > --- a/net/netfilter/nf_conntrack_bpf.c
> > > > +++ b/net/netfilter/nf_conntrack_bpf.c
> > > > @@ -55,41 +55,114 @@ enum {
> > > >     NF_BPF_CT_OPTS_SZ = 12,
> > > >  };
> > > >
> > > > -static struct nf_conn *__bpf_nf_ct_lookup(struct net *net,
> > > > -                                     struct bpf_sock_tuple *bpf_tuple,
> > > > -                                     u32 tuple_len, u8 protonum,
> > > > -                                     s32 netns_id, u8 *dir)
> > > > +static int bpf_nf_ct_tuple_parse(struct bpf_sock_tuple *bpf_tuple,
> > > > +                            u32 tuple_len, u8 protonum, u8 dir,
> > > > +                            struct nf_conntrack_tuple *tuple)
> > > >  {
> > > > -   struct nf_conntrack_tuple_hash *hash;
> > > > -   struct nf_conntrack_tuple tuple;
> > > > -   struct nf_conn *ct;
> > > > +   union nf_inet_addr *src = dir ? &tuple->dst.u3 : &tuple->src.u3;
> > > > +   union nf_inet_addr *dst = dir ? &tuple->src.u3 : &tuple->dst.u3;
> > > > +   union nf_conntrack_man_proto *sport = dir ? (void *)&tuple->dst.u
> > > > +                                             : &tuple->src.u;
> > > > +   union nf_conntrack_man_proto *dport = dir ? &tuple->src.u
> > > > +                                             : (void *)&tuple->dst.u;
> > > >
> > > >     if (unlikely(protonum != IPPROTO_TCP && protonum != IPPROTO_UDP))
> > > > -           return ERR_PTR(-EPROTO);
> > > > -   if (unlikely(netns_id < BPF_F_CURRENT_NETNS))
> > > > -           return ERR_PTR(-EINVAL);
> > > > +           return -EPROTO;
> > > > +
> > > > +   memset(tuple, 0, sizeof(*tuple));
> > > >
> > > > -   memset(&tuple, 0, sizeof(tuple));
> > > >     switch (tuple_len) {
> > > >     case sizeof(bpf_tuple->ipv4):
> > > > -           tuple.src.l3num = AF_INET;
> > > > -           tuple.src.u3.ip = bpf_tuple->ipv4.saddr;
> > > > -           tuple.src.u.tcp.port = bpf_tuple->ipv4.sport;
> > > > -           tuple.dst.u3.ip = bpf_tuple->ipv4.daddr;
> > > > -           tuple.dst.u.tcp.port = bpf_tuple->ipv4.dport;
> > > > +           tuple->src.l3num = AF_INET;
> > > > +           src->ip = bpf_tuple->ipv4.saddr;
> > > > +           sport->tcp.port = bpf_tuple->ipv4.sport;
> > > > +           dst->ip = bpf_tuple->ipv4.daddr;
> > > > +           dport->tcp.port = bpf_tuple->ipv4.dport;
> > > >             break;
> > > >     case sizeof(bpf_tuple->ipv6):
> > > > -           tuple.src.l3num = AF_INET6;
> > > > -           memcpy(tuple.src.u3.ip6, bpf_tuple->ipv6.saddr, sizeof(bpf_tuple->ipv6.saddr));
> > > > -           tuple.src.u.tcp.port = bpf_tuple->ipv6.sport;
> > > > -           memcpy(tuple.dst.u3.ip6, bpf_tuple->ipv6.daddr, sizeof(bpf_tuple->ipv6.daddr));
> > > > -           tuple.dst.u.tcp.port = bpf_tuple->ipv6.dport;
> > > > +           tuple->src.l3num = AF_INET6;
> > > > +           memcpy(src->ip6, bpf_tuple->ipv6.saddr, sizeof(bpf_tuple->ipv6.saddr));
> > > > +           sport->tcp.port = bpf_tuple->ipv6.sport;
> > > > +           memcpy(dst->ip6, bpf_tuple->ipv6.daddr, sizeof(bpf_tuple->ipv6.daddr));
> > > > +           dport->tcp.port = bpf_tuple->ipv6.dport;
> > > >             break;
> > > >     default:
> > > > -           return ERR_PTR(-EAFNOSUPPORT);
> > > > +           return -EAFNOSUPPORT;
> > > >     }
> > > > +   tuple->dst.protonum = protonum;
> > > > +   tuple->dst.dir = dir;
> > > > +
> > > > +   return 0;
> > > > +}
> > > >
> > > > -   tuple.dst.protonum = protonum;
> > > > +struct nf_conn *
> > > > +__bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple,
> > > > +                   u32 tuple_len, u8 protonum, s32 netns_id, u32 timeout)
> > > > +{
> > > > +   struct nf_conntrack_tuple otuple, rtuple;
> > > > +   struct nf_conn *ct;
> > > > +   int err;
> > > > +
> > > > +   if (unlikely(netns_id < BPF_F_CURRENT_NETNS))
> > > > +           return ERR_PTR(-EINVAL);
> > > > +
> > > > +   err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, protonum,
> > > > +                               IP_CT_DIR_ORIGINAL, &otuple);
> > > > +   if (err < 0)
> > > > +           return ERR_PTR(err);
> > > > +
> > > > +   err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, protonum,
> > > > +                               IP_CT_DIR_REPLY, &rtuple);
> > > > +   if (err < 0)
> > > > +           return ERR_PTR(err);
> > > > +
> > > > +   if (netns_id >= 0) {
> > > > +           net = get_net_ns_by_id(net, netns_id);
> > > > +           if (unlikely(!net))
> > > > +                   return ERR_PTR(-ENONET);
> > > > +   }
> > > > +
> > > > +   ct = nf_conntrack_alloc(net, &nf_ct_zone_dflt, &otuple, &rtuple,
> > > > +                           GFP_ATOMIC);
> > > > +   if (IS_ERR(ct))
> > > > +           goto out;
> > > > +
> > > > +   ct->timeout = timeout * HZ + jiffies;
> > > > +   ct->status |= IPS_CONFIRMED;
> > > > +
> > > > +   memset(&ct->proto, 0, sizeof(ct->proto));
> > > > +   if (protonum == IPPROTO_TCP)
> > > > +           ct->proto.tcp.state = TCP_CONNTRACK_ESTABLISHED;
> > >
> > > Hmm, isn't it a bit limiting to hard-code this to ESTABLISHED
> > > connections? Presumably for TCP you'd want to use this when you see a
> > > SYN and then rely on conntrack to help with the subsequent state
> > > tracking for when the SYN-ACK comes back? What's the usecase for
> > > creating an entry in ESTABLISHED state, exactly?
> >
> > I guess we can even add a parameter and pass the state from the caller.
> > I was not sure if it is mandatory.
>
> It's probably cleaner and more flexible to split
> _alloc and _insert into two kfuncs and let bpf
> prog populate ct directly.

Right, so we can just whitelist a few fields and allow assignments into those.
One small problem is that we should probably only permit this for nf_conn
PTR_TO_BTF_ID obtained from _alloc, and make it rdonly on _insert.

We can do the rw->ro conversion by taking in ref from alloc, and releasing on
_insert, then returning ref from _insert.

For the other part, either return a different shadow PTR_TO_BTF_ID with only the
fields that can be set, convert insns for it, and then on insert return the
rdonly PTR_TO_BTF_ID of struct nf_conn, or otherwise store the source func in
the per-register state and use that to deny BPF_WRITE for normal nf_conn.
Thoughts?

--
Kartikeya

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 bpf-next 4/5] net: netfilter: add kfunc helper to add a new ct entry
  2022-05-18 10:43 ` [PATCH v3 bpf-next 4/5] net: netfilter: add kfunc helper to add a new ct entry Lorenzo Bianconi
  2022-05-18 20:47   ` Toke Høiland-Jørgensen
@ 2022-05-19  2:06   ` kernel test robot
  1 sibling, 0 replies; 24+ messages in thread
From: kernel test robot @ 2022-05-19  2:06 UTC (permalink / raw)
  To: Lorenzo Bianconi, bpf
  Cc: kbuild-all, netdev, ast, daniel, andrii, davem, kuba, edumazet,
	pabeni, pablo, fw, netfilter-devel, lorenzo.bianconi, brouer,
	toke, memxor

Hi Lorenzo,

I love your patch! Perhaps something to improve:

[auto build test WARNING on bpf-next/master]

url:    https://github.com/intel-lab-lkp/linux/commits/Lorenzo-Bianconi/net-netfilter-add-kfunc-helper-to-update-ct-timeout/20220518-184654
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: s390-defconfig (https://download.01.org/0day-ci/archive/20220519/202205191006.OH1ukt9R-lkp@intel.com/config)
compiler: s390-linux-gcc (GCC) 11.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/7427419a3d3ae771c69eed7318a9b5f5d582b488
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Lorenzo-Bianconi/net-netfilter-add-kfunc-helper-to-update-ct-timeout/20220518-184654
        git checkout 7427419a3d3ae771c69eed7318a9b5f5d582b488
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.3.0 make.cross W=1 O=build_dir ARCH=s390 SHELL=/bin/bash net/netfilter/

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> net/netfilter/nf_conntrack_bpf.c:99:1: warning: no previous prototype for '__bpf_nf_ct_alloc_entry' [-Wmissing-prototypes]
      99 | __bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple,
         | ^~~~~~~~~~~~~~~~~~~~~~~


vim +/__bpf_nf_ct_alloc_entry +99 net/netfilter/nf_conntrack_bpf.c

    97	
    98	struct nf_conn *
  > 99	__bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple,
   100				u32 tuple_len, u8 protonum, s32 netns_id, u32 timeout)
   101	{
   102		struct nf_conntrack_tuple otuple, rtuple;
   103		struct nf_conn *ct;
   104		int err;
   105	
   106		if (unlikely(netns_id < BPF_F_CURRENT_NETNS))
   107			return ERR_PTR(-EINVAL);
   108	
   109		err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, protonum,
   110					    IP_CT_DIR_ORIGINAL, &otuple);
   111		if (err < 0)
   112			return ERR_PTR(err);
   113	
   114		err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, protonum,
   115					    IP_CT_DIR_REPLY, &rtuple);
   116		if (err < 0)
   117			return ERR_PTR(err);
   118	
   119		if (netns_id >= 0) {
   120			net = get_net_ns_by_id(net, netns_id);
   121			if (unlikely(!net))
   122				return ERR_PTR(-ENONET);
   123		}
   124	
   125		ct = nf_conntrack_alloc(net, &nf_ct_zone_dflt, &otuple, &rtuple,
   126					GFP_ATOMIC);
   127		if (IS_ERR(ct))
   128			goto out;
   129	
   130		ct->timeout = timeout * HZ + jiffies;
   131		ct->status |= IPS_CONFIRMED;
   132	
   133		memset(&ct->proto, 0, sizeof(ct->proto));
   134		if (protonum == IPPROTO_TCP)
   135			ct->proto.tcp.state = TCP_CONNTRACK_ESTABLISHED;
   136	
   137		err = nf_conntrack_hash_check_insert(ct);
   138		if (err < 0) {
   139			nf_conntrack_free(ct);
   140			ct = ERR_PTR(err);
   141		}
   142	out:
   143		if (netns_id >= 0)
   144			put_net(net);
   145	
   146		return ct;
   147	}
   148	

-- 
0-DAY CI Kernel Test Service
https://01.org/lkp

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 bpf-next 4/5] net: netfilter: add kfunc helper to add a new ct entry
  2022-05-18 22:43         ` Kumar Kartikeya Dwivedi
@ 2022-05-19 10:45           ` Toke Høiland-Jørgensen
  2022-05-19 11:23             ` Kumar Kartikeya Dwivedi
  0 siblings, 1 reply; 24+ messages in thread
From: Toke Høiland-Jørgensen @ 2022-05-19 10:45 UTC (permalink / raw)
  To: Kumar Kartikeya Dwivedi, Alexei Starovoitov
  Cc: Lorenzo Bianconi, Lorenzo Bianconi, bpf, Network Development,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	David S. Miller, Jakub Kicinski, Eric Dumazet, Paolo Abeni,
	Pablo Neira Ayuso, Florian Westphal, netfilter-devel,
	Jesper Dangaard Brouer

Kumar Kartikeya Dwivedi <memxor@gmail.com> writes:

> On Thu, May 19, 2022 at 03:44:58AM IST, Alexei Starovoitov wrote:
>> On Wed, May 18, 2022 at 2:09 PM Lorenzo Bianconi
>> <lorenzo.bianconi@redhat.com> wrote:
>> >
>> > > Lorenzo Bianconi <lorenzo@kernel.org> writes:
>> > >
>> > > > Introduce bpf_xdp_ct_add and bpf_skb_ct_add kfunc helpers in order to
>> > > > add a new entry to ct map from an ebpf program.
>> > > > Introduce bpf_nf_ct_tuple_parse utility routine.
>> > > >
>> > > > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
>> > > > ---
>> > > >  net/netfilter/nf_conntrack_bpf.c | 212 +++++++++++++++++++++++++++----
>> > > >  1 file changed, 189 insertions(+), 23 deletions(-)
>> > > >
>> > > > diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
>> > > > index a9271418db88..3d31b602fdf1 100644
>> > > > --- a/net/netfilter/nf_conntrack_bpf.c
>> > > > +++ b/net/netfilter/nf_conntrack_bpf.c
>> > > > @@ -55,41 +55,114 @@ enum {
>> > > >     NF_BPF_CT_OPTS_SZ = 12,
>> > > >  };
>> > > >
>> > > > -static struct nf_conn *__bpf_nf_ct_lookup(struct net *net,
>> > > > -                                     struct bpf_sock_tuple *bpf_tuple,
>> > > > -                                     u32 tuple_len, u8 protonum,
>> > > > -                                     s32 netns_id, u8 *dir)
>> > > > +static int bpf_nf_ct_tuple_parse(struct bpf_sock_tuple *bpf_tuple,
>> > > > +                            u32 tuple_len, u8 protonum, u8 dir,
>> > > > +                            struct nf_conntrack_tuple *tuple)
>> > > >  {
>> > > > -   struct nf_conntrack_tuple_hash *hash;
>> > > > -   struct nf_conntrack_tuple tuple;
>> > > > -   struct nf_conn *ct;
>> > > > +   union nf_inet_addr *src = dir ? &tuple->dst.u3 : &tuple->src.u3;
>> > > > +   union nf_inet_addr *dst = dir ? &tuple->src.u3 : &tuple->dst.u3;
>> > > > +   union nf_conntrack_man_proto *sport = dir ? (void *)&tuple->dst.u
>> > > > +                                             : &tuple->src.u;
>> > > > +   union nf_conntrack_man_proto *dport = dir ? &tuple->src.u
>> > > > +                                             : (void *)&tuple->dst.u;
>> > > >
>> > > >     if (unlikely(protonum != IPPROTO_TCP && protonum != IPPROTO_UDP))
>> > > > -           return ERR_PTR(-EPROTO);
>> > > > -   if (unlikely(netns_id < BPF_F_CURRENT_NETNS))
>> > > > -           return ERR_PTR(-EINVAL);
>> > > > +           return -EPROTO;
>> > > > +
>> > > > +   memset(tuple, 0, sizeof(*tuple));
>> > > >
>> > > > -   memset(&tuple, 0, sizeof(tuple));
>> > > >     switch (tuple_len) {
>> > > >     case sizeof(bpf_tuple->ipv4):
>> > > > -           tuple.src.l3num = AF_INET;
>> > > > -           tuple.src.u3.ip = bpf_tuple->ipv4.saddr;
>> > > > -           tuple.src.u.tcp.port = bpf_tuple->ipv4.sport;
>> > > > -           tuple.dst.u3.ip = bpf_tuple->ipv4.daddr;
>> > > > -           tuple.dst.u.tcp.port = bpf_tuple->ipv4.dport;
>> > > > +           tuple->src.l3num = AF_INET;
>> > > > +           src->ip = bpf_tuple->ipv4.saddr;
>> > > > +           sport->tcp.port = bpf_tuple->ipv4.sport;
>> > > > +           dst->ip = bpf_tuple->ipv4.daddr;
>> > > > +           dport->tcp.port = bpf_tuple->ipv4.dport;
>> > > >             break;
>> > > >     case sizeof(bpf_tuple->ipv6):
>> > > > -           tuple.src.l3num = AF_INET6;
>> > > > -           memcpy(tuple.src.u3.ip6, bpf_tuple->ipv6.saddr, sizeof(bpf_tuple->ipv6.saddr));
>> > > > -           tuple.src.u.tcp.port = bpf_tuple->ipv6.sport;
>> > > > -           memcpy(tuple.dst.u3.ip6, bpf_tuple->ipv6.daddr, sizeof(bpf_tuple->ipv6.daddr));
>> > > > -           tuple.dst.u.tcp.port = bpf_tuple->ipv6.dport;
>> > > > +           tuple->src.l3num = AF_INET6;
>> > > > +           memcpy(src->ip6, bpf_tuple->ipv6.saddr, sizeof(bpf_tuple->ipv6.saddr));
>> > > > +           sport->tcp.port = bpf_tuple->ipv6.sport;
>> > > > +           memcpy(dst->ip6, bpf_tuple->ipv6.daddr, sizeof(bpf_tuple->ipv6.daddr));
>> > > > +           dport->tcp.port = bpf_tuple->ipv6.dport;
>> > > >             break;
>> > > >     default:
>> > > > -           return ERR_PTR(-EAFNOSUPPORT);
>> > > > +           return -EAFNOSUPPORT;
>> > > >     }
>> > > > +   tuple->dst.protonum = protonum;
>> > > > +   tuple->dst.dir = dir;
>> > > > +
>> > > > +   return 0;
>> > > > +}
>> > > >
>> > > > -   tuple.dst.protonum = protonum;
>> > > > +struct nf_conn *
>> > > > +__bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple,
>> > > > +                   u32 tuple_len, u8 protonum, s32 netns_id, u32 timeout)
>> > > > +{
>> > > > +   struct nf_conntrack_tuple otuple, rtuple;
>> > > > +   struct nf_conn *ct;
>> > > > +   int err;
>> > > > +
>> > > > +   if (unlikely(netns_id < BPF_F_CURRENT_NETNS))
>> > > > +           return ERR_PTR(-EINVAL);
>> > > > +
>> > > > +   err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, protonum,
>> > > > +                               IP_CT_DIR_ORIGINAL, &otuple);
>> > > > +   if (err < 0)
>> > > > +           return ERR_PTR(err);
>> > > > +
>> > > > +   err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, protonum,
>> > > > +                               IP_CT_DIR_REPLY, &rtuple);
>> > > > +   if (err < 0)
>> > > > +           return ERR_PTR(err);
>> > > > +
>> > > > +   if (netns_id >= 0) {
>> > > > +           net = get_net_ns_by_id(net, netns_id);
>> > > > +           if (unlikely(!net))
>> > > > +                   return ERR_PTR(-ENONET);
>> > > > +   }
>> > > > +
>> > > > +   ct = nf_conntrack_alloc(net, &nf_ct_zone_dflt, &otuple, &rtuple,
>> > > > +                           GFP_ATOMIC);
>> > > > +   if (IS_ERR(ct))
>> > > > +           goto out;
>> > > > +
>> > > > +   ct->timeout = timeout * HZ + jiffies;
>> > > > +   ct->status |= IPS_CONFIRMED;
>> > > > +
>> > > > +   memset(&ct->proto, 0, sizeof(ct->proto));
>> > > > +   if (protonum == IPPROTO_TCP)
>> > > > +           ct->proto.tcp.state = TCP_CONNTRACK_ESTABLISHED;
>> > >
>> > > Hmm, isn't it a bit limiting to hard-code this to ESTABLISHED
>> > > connections? Presumably for TCP you'd want to use this when you see a
>> > > SYN and then rely on conntrack to help with the subsequent state
>> > > tracking for when the SYN-ACK comes back? What's the usecase for
>> > > creating an entry in ESTABLISHED state, exactly?
>> >
>> > I guess we can even add a parameter and pass the state from the caller.
>> > I was not sure if it is mandatory.
>>
>> It's probably cleaner and more flexible to split
>> _alloc and _insert into two kfuncs and let bpf
>> prog populate ct directly.
>
> Right, so we can just whitelist a few fields and allow assignments into those.
> One small problem is that we should probably only permit this for nf_conn
> PTR_TO_BTF_ID obtained from _alloc, and make it rdonly on _insert.
>
> We can do the rw->ro conversion by taking in ref from alloc, and releasing on
> _insert, then returning ref from _insert.

Sounds reasonable enough; I guess _insert would also need to
sanity-check some of the values to prevent injecting invalid state into
the conntrack table.
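
Something like this in the _insert kfunc, I suppose (just a sketch):

	/* don't let BPF inject a TCP state conntrack can't represent */
	if (nf_ct_protonum(ct) == IPPROTO_TCP &&
	    ct->proto.tcp.state >= TCP_CONNTRACK_MAX)
		return ERR_PTR(-EINVAL);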

> For the other part, either return a different shadow PTR_TO_BTF_ID
> with only the fields that can be set, convert insns for it, and then
> on insert return the rdonly PTR_TO_BTF_ID of struct nf_conn, or
> otherwise store the source func in the per-register state and use that
> to deny BPF_WRITE for normal nf_conn. Thoughts?

Hmm, if they're different BTF IDs wouldn't the BPF program have to be
aware of this and use two different structs for the pointer storage?
That seems a bit awkward from an API PoV?

Also, what about updating? For this to be useful with TCP, you'd really
want to be able to update the CT state as the connection is going
through the handshake state transitions...

-Toke


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 bpf-next 4/5] net: netfilter: add kfunc helper to add a new ct entry
  2022-05-19 10:45           ` Toke Høiland-Jørgensen
@ 2022-05-19 11:23             ` Kumar Kartikeya Dwivedi
  2022-05-19 17:00               ` Yonghong Song
  0 siblings, 1 reply; 24+ messages in thread
From: Kumar Kartikeya Dwivedi @ 2022-05-19 11:23 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: Alexei Starovoitov, Lorenzo Bianconi, Lorenzo Bianconi, bpf,
	Network Development, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, David S. Miller, Jakub Kicinski, Eric Dumazet,
	Paolo Abeni, Pablo Neira Ayuso, Florian Westphal,
	netfilter-devel, Jesper Dangaard Brouer

On Thu, May 19, 2022 at 04:15:51PM IST, Toke Høiland-Jørgensen wrote:
> Kumar Kartikeya Dwivedi <memxor@gmail.com> writes:
>
> > On Thu, May 19, 2022 at 03:44:58AM IST, Alexei Starovoitov wrote:
> >> On Wed, May 18, 2022 at 2:09 PM Lorenzo Bianconi
> >> <lorenzo.bianconi@redhat.com> wrote:
> >> >
> >> > > Lorenzo Bianconi <lorenzo@kernel.org> writes:
> >> > >
> >> > > > Introduce bpf_xdp_ct_add and bpf_skb_ct_add kfunc helpers in order to
> >> > > > add a new entry to ct map from an ebpf program.
> >> > > > Introduce bpf_nf_ct_tuple_parse utility routine.
> >> > > >
> >> > > > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> >> > > > ---
> >> > > >  net/netfilter/nf_conntrack_bpf.c | 212 +++++++++++++++++++++++++++----
> >> > > >  1 file changed, 189 insertions(+), 23 deletions(-)
> >> > > >
> >> > > > diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
> >> > > > index a9271418db88..3d31b602fdf1 100644
> >> > > > --- a/net/netfilter/nf_conntrack_bpf.c
> >> > > > +++ b/net/netfilter/nf_conntrack_bpf.c
> >> > > > @@ -55,41 +55,114 @@ enum {
> >> > > >     NF_BPF_CT_OPTS_SZ = 12,
> >> > > >  };
> >> > > >
> >> > > > -static struct nf_conn *__bpf_nf_ct_lookup(struct net *net,
> >> > > > -                                     struct bpf_sock_tuple *bpf_tuple,
> >> > > > -                                     u32 tuple_len, u8 protonum,
> >> > > > -                                     s32 netns_id, u8 *dir)
> >> > > > +static int bpf_nf_ct_tuple_parse(struct bpf_sock_tuple *bpf_tuple,
> >> > > > +                            u32 tuple_len, u8 protonum, u8 dir,
> >> > > > +                            struct nf_conntrack_tuple *tuple)
> >> > > >  {
> >> > > > -   struct nf_conntrack_tuple_hash *hash;
> >> > > > -   struct nf_conntrack_tuple tuple;
> >> > > > -   struct nf_conn *ct;
> >> > > > +   union nf_inet_addr *src = dir ? &tuple->dst.u3 : &tuple->src.u3;
> >> > > > +   union nf_inet_addr *dst = dir ? &tuple->src.u3 : &tuple->dst.u3;
> >> > > > +   union nf_conntrack_man_proto *sport = dir ? (void *)&tuple->dst.u
> >> > > > +                                             : &tuple->src.u;
> >> > > > +   union nf_conntrack_man_proto *dport = dir ? &tuple->src.u
> >> > > > +                                             : (void *)&tuple->dst.u;
> >> > > >
> >> > > >     if (unlikely(protonum != IPPROTO_TCP && protonum != IPPROTO_UDP))
> >> > > > -           return ERR_PTR(-EPROTO);
> >> > > > -   if (unlikely(netns_id < BPF_F_CURRENT_NETNS))
> >> > > > -           return ERR_PTR(-EINVAL);
> >> > > > +           return -EPROTO;
> >> > > > +
> >> > > > +   memset(tuple, 0, sizeof(*tuple));
> >> > > >
> >> > > > -   memset(&tuple, 0, sizeof(tuple));
> >> > > >     switch (tuple_len) {
> >> > > >     case sizeof(bpf_tuple->ipv4):
> >> > > > -           tuple.src.l3num = AF_INET;
> >> > > > -           tuple.src.u3.ip = bpf_tuple->ipv4.saddr;
> >> > > > -           tuple.src.u.tcp.port = bpf_tuple->ipv4.sport;
> >> > > > -           tuple.dst.u3.ip = bpf_tuple->ipv4.daddr;
> >> > > > -           tuple.dst.u.tcp.port = bpf_tuple->ipv4.dport;
> >> > > > +           tuple->src.l3num = AF_INET;
> >> > > > +           src->ip = bpf_tuple->ipv4.saddr;
> >> > > > +           sport->tcp.port = bpf_tuple->ipv4.sport;
> >> > > > +           dst->ip = bpf_tuple->ipv4.daddr;
> >> > > > +           dport->tcp.port = bpf_tuple->ipv4.dport;
> >> > > >             break;
> >> > > >     case sizeof(bpf_tuple->ipv6):
> >> > > > -           tuple.src.l3num = AF_INET6;
> >> > > > -           memcpy(tuple.src.u3.ip6, bpf_tuple->ipv6.saddr, sizeof(bpf_tuple->ipv6.saddr));
> >> > > > -           tuple.src.u.tcp.port = bpf_tuple->ipv6.sport;
> >> > > > -           memcpy(tuple.dst.u3.ip6, bpf_tuple->ipv6.daddr, sizeof(bpf_tuple->ipv6.daddr));
> >> > > > -           tuple.dst.u.tcp.port = bpf_tuple->ipv6.dport;
> >> > > > +           tuple->src.l3num = AF_INET6;
> >> > > > +           memcpy(src->ip6, bpf_tuple->ipv6.saddr, sizeof(bpf_tuple->ipv6.saddr));
> >> > > > +           sport->tcp.port = bpf_tuple->ipv6.sport;
> >> > > > +           memcpy(dst->ip6, bpf_tuple->ipv6.daddr, sizeof(bpf_tuple->ipv6.daddr));
> >> > > > +           dport->tcp.port = bpf_tuple->ipv6.dport;
> >> > > >             break;
> >> > > >     default:
> >> > > > -           return ERR_PTR(-EAFNOSUPPORT);
> >> > > > +           return -EAFNOSUPPORT;
> >> > > >     }
> >> > > > +   tuple->dst.protonum = protonum;
> >> > > > +   tuple->dst.dir = dir;
> >> > > > +
> >> > > > +   return 0;
> >> > > > +}
> >> > > >
> >> > > > -   tuple.dst.protonum = protonum;
> >> > > > +struct nf_conn *
> >> > > > +__bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple,
> >> > > > +                   u32 tuple_len, u8 protonum, s32 netns_id, u32 timeout)
> >> > > > +{
> >> > > > +   struct nf_conntrack_tuple otuple, rtuple;
> >> > > > +   struct nf_conn *ct;
> >> > > > +   int err;
> >> > > > +
> >> > > > +   if (unlikely(netns_id < BPF_F_CURRENT_NETNS))
> >> > > > +           return ERR_PTR(-EINVAL);
> >> > > > +
> >> > > > +   err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, protonum,
> >> > > > +                               IP_CT_DIR_ORIGINAL, &otuple);
> >> > > > +   if (err < 0)
> >> > > > +           return ERR_PTR(err);
> >> > > > +
> >> > > > +   err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, protonum,
> >> > > > +                               IP_CT_DIR_REPLY, &rtuple);
> >> > > > +   if (err < 0)
> >> > > > +           return ERR_PTR(err);
> >> > > > +
> >> > > > +   if (netns_id >= 0) {
> >> > > > +           net = get_net_ns_by_id(net, netns_id);
> >> > > > +           if (unlikely(!net))
> >> > > > +                   return ERR_PTR(-ENONET);
> >> > > > +   }
> >> > > > +
> >> > > > +   ct = nf_conntrack_alloc(net, &nf_ct_zone_dflt, &otuple, &rtuple,
> >> > > > +                           GFP_ATOMIC);
> >> > > > +   if (IS_ERR(ct))
> >> > > > +           goto out;
> >> > > > +
> >> > > > +   ct->timeout = timeout * HZ + jiffies;
> >> > > > +   ct->status |= IPS_CONFIRMED;
> >> > > > +
> >> > > > +   memset(&ct->proto, 0, sizeof(ct->proto));
> >> > > > +   if (protonum == IPPROTO_TCP)
> >> > > > +           ct->proto.tcp.state = TCP_CONNTRACK_ESTABLISHED;
> >> > >
> >> > > Hmm, isn't it a bit limiting to hard-code this to ESTABLISHED
> >> > > connections? Presumably for TCP you'd want to use this when you see a
> >> > > SYN and then rely on conntrack to help with the subsequent state
> >> > > tracking for when the SYN-ACK comes back? What's the usecase for
> >> > > creating an entry in ESTABLISHED state, exactly?
> >> >
> >> > I guess we can even add a parameter and pass the state from the caller.
> >> > I was not sure if it is mandatory.
> >>
> >> It's probably cleaner and more flexible to split
> >> _alloc and _insert into two kfuncs and let bpf
> >> prog populate ct directly.
> >
> > Right, so we can just whitelist a few fields and allow assignments into those.
> > One small problem is that we should probably only permit this for nf_conn
> > PTR_TO_BTF_ID obtained from _alloc, and make it rdonly on _insert.
> >
> > We can do the rw->ro conversion by taking in ref from alloc, and releasing on
> > _insert, then returning ref from _insert.
>
> Sounds reasonable enough; I guess _insert would also need to
> sanity-check some of the values to prevent injecting invalid state into
> the conntrack table.
>
> > For the other part, either return a different shadow PTR_TO_BTF_ID
> > with only the fields that can be set, convert insns for it, and then
> > on insert return the rdonly PTR_TO_BTF_ID of struct nf_conn, or
> > otherwise store the source func in the per-register state and use that
> > to deny BPF_WRITE for normal nf_conn. Thoughts?
>
> Hmm, if they're different BTF IDs wouldn't the BPF program have to be
> aware of this and use two different structs for the pointer storage?
> That seems a bit awkward from an API PoV?
>

You only need to use a different pointer after _alloc and pass it into _insert.

Like:
	struct nf_conn_alloc *nfa = nf_alloc(...);
	if (!nfa) { ... }
	nfa->status = ...; // gets converted to nf_conn access
	nfa->tcp_status = ...; // ditto
	struct nf_conn *nf = nf_insert(nfa, ...); // nfa released, nf acquired

The problem is that if I whitelist writes for nf_conn as a whole, so that we
can assign after _alloc, there is no way to prevent BPF_WRITE for an nf_conn
obtained from other functions. We can fix that, though, by remembering which
function a pointer came from; then you wouldn't need a different struct. I was
just soliciting opinions on the different options. I too am leaning towards
not using a separate struct.

> Also, what about updating? For this to be useful with TCP, you'd really
> want to be able to update the CT state as the connection is going
> through the handshake state transitions...
>

I think updates should be done using dedicated functions, like the timeout
helper. Whatever synchronization is needed to update the CT can go into that
function, instead of allowing direct writes after _insert.
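
For example, something like this (purely illustrative: only the timeout
refresh helper exists in this series, and bpf_ct_change_status is a made-up
name for such a dedicated setter):

	// kernel side: whatever locking the update needs lives in here
	int bpf_ct_change_status(struct nf_conn *ct, u32 status);

	// BPF side, after _insert, instead of a direct ct->status store:
	err = bpf_ct_change_status(nf, IPS_SEEN_REPLY);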

> -Toke
>

--
Kartikeya

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 bpf-next 4/5] net: netfilter: add kfunc helper to add a new ct entry
  2022-05-19 11:23             ` Kumar Kartikeya Dwivedi
@ 2022-05-19 17:00               ` Yonghong Song
  2022-05-19 17:10                 ` Kumar Kartikeya Dwivedi
  0 siblings, 1 reply; 24+ messages in thread
From: Yonghong Song @ 2022-05-19 17:00 UTC (permalink / raw)
  To: Kumar Kartikeya Dwivedi, Toke Høiland-Jørgensen
  Cc: Alexei Starovoitov, Lorenzo Bianconi, Lorenzo Bianconi, bpf,
	Network Development, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, David S. Miller, Jakub Kicinski, Eric Dumazet,
	Paolo Abeni, Pablo Neira Ayuso, Florian Westphal,
	netfilter-devel, Jesper Dangaard Brouer



On 5/19/22 4:23 AM, Kumar Kartikeya Dwivedi wrote:
> On Thu, May 19, 2022 at 04:15:51PM IST, Toke Høiland-Jørgensen wrote:
>> Kumar Kartikeya Dwivedi <memxor@gmail.com> writes:
>>
>>> On Thu, May 19, 2022 at 03:44:58AM IST, Alexei Starovoitov wrote:
>>>> [...]
>>>>
>>>> It's probably cleaner and more flexible to split
>>>> _alloc and _insert into two kfuncs and let bpf
>>>> prog populate ct directly.
>>>
>>> Right, so we can just whitelist a few fields and allow assignments into those.
>>> One small problem is that we should probably only permit this for nf_conn
>>> PTR_TO_BTF_ID obtained from _alloc, and make it rdonly on _insert.
>>>
>>> We can do the rw->ro conversion by taking in ref from alloc, and releasing on
>>> _insert, then returning ref from _insert.
>>
>> Sounds reasonable enough; I guess _insert would also need to
>> sanity-check some of the values to prevent injecting invalid state into
>> the conntrack table.
>>
>>> For the other part, either return a different shadow PTR_TO_BTF_ID
>>> with only the fields that can be set, convert insns for it, and then
>>> on insert return the rdonly PTR_TO_BTF_ID of struct nf_conn, or
>>> otherwise store the source func in the per-register state and use that
>>> to deny BPF_WRITE for normal nf_conn. Thoughts?
>>
>> Hmm, if they're different BTF IDs wouldn't the BPF program have to be
>> aware of this and use two different structs for the pointer storage?
>> That seems a bit awkward from an API PoV?
>>
> 
> You only need to use a different pointer after _alloc and pass it into _insert.
> 
> Like:
> 	struct nf_conn_alloc *nfa = nf_alloc(...);
> 	if (!nfa) { ... }
> 	nfa->status = ...; // gets converted to nf_conn access
> 	nfa->tcp_status = ...; // ditto
> 	struct nf_conn *nf = nf_insert(nfa, ...); // nfa released, nf acquired
> 
> > The problem is that if I whitelist writes for nf_conn as a whole, so that we
> > can assign after _alloc, there is no way to prevent BPF_WRITE for an nf_conn
> > obtained from other functions. We can fix that, though, by remembering which
> > function a pointer came from; then you wouldn't need a different struct. I was
> > just soliciting opinions on the different options. I too am leaning towards
> > not using a separate struct.

Is it possible to define the signature of nf_insert() as
   const struct nf_conn *nf_insert(...);
so that for
   const struct nf_conn *nf = nf_insert(nfa, ...);

any assignment like nf->status = ... makes the compiler emit a warning?

The verifier can also learn that the return value of nf_insert() is
read-only and reject any overwrite of it.

Maybe I missed some context, but the above is based on what
I understood so far.
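
Putting it together with the example earlier in the thread (a sketch reusing
the same made-up names), the BPF program side would then be:

   struct nf_conn_alloc *nfa = nf_alloc(...);
   if (!nfa) { ... }
   nfa->status = ...;                       /* writable before insert */
   const struct nf_conn *nf = nf_insert(nfa, ...);
   nf->status = ...;                        /* compile error: nf is const */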

> 
>> Also, what about updating? For this to be useful with TCP, you'd really
>> want to be able to update the CT state as the connection is going
>> through the handshake state transitions...
>>
> 
> I think updates should be done using dedicated functions, like the timeout
> helper. Whatever synchronization is needed to update the CT can go into that
> function, instead of allowing direct writes after _insert.
> 
>> -Toke
>>
> 
> --
> Kartikeya

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 bpf-next 4/5] net: netfilter: add kfunc helper to add a new ct entry
  2022-05-19 17:00               ` Yonghong Song
@ 2022-05-19 17:10                 ` Kumar Kartikeya Dwivedi
  0 siblings, 0 replies; 24+ messages in thread
From: Kumar Kartikeya Dwivedi @ 2022-05-19 17:10 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Toke Høiland-Jørgensen, Alexei Starovoitov,
	Lorenzo Bianconi, Lorenzo Bianconi, bpf, Network Development,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	David S. Miller, Jakub Kicinski, Eric Dumazet, Paolo Abeni,
	Pablo Neira Ayuso, Florian Westphal, netfilter-devel,
	Jesper Dangaard Brouer

On Thu, May 19, 2022 at 10:30:45PM IST, Yonghong Song wrote:
>
>
> On 5/19/22 4:23 AM, Kumar Kartikeya Dwivedi wrote:
> > On Thu, May 19, 2022 at 04:15:51PM IST, Toke Høiland-Jørgensen wrote:
> > > Kumar Kartikeya Dwivedi <memxor@gmail.com> writes:
> > >
> > > > On Thu, May 19, 2022 at 03:44:58AM IST, Alexei Starovoitov wrote:
> > > > > [...]
> > > > >
> > > > > It's probably cleaner and more flexible to split
> > > > > _alloc and _insert into two kfuncs and let bpf
> > > > > prog populate ct directly.
> > > >
> > > > Right, so we can just whitelist a few fields and allow assignments into those.
> > > > One small problem is that we should probably only permit this for nf_conn
> > > > PTR_TO_BTF_ID obtained from _alloc, and make it rdonly on _insert.
> > > >
> > > > We can do the rw->ro conversion by taking in ref from alloc, and releasing on
> > > > _insert, then returning ref from _insert.
> > >
> > > Sounds reasonable enough; I guess _insert would also need to
> > > sanity-check some of the values to prevent injecting invalid state into
> > > the conntrack table.
> > >
> > > > For the other part, either return a different shadow PTR_TO_BTF_ID
> > > > with only the fields that can be set, convert insns for it, and then
> > > > on insert return the rdonly PTR_TO_BTF_ID of struct nf_conn, or
> > > > otherwise store the source func in the per-register state and use that
> > > > to deny BPF_WRITE for normal nf_conn. Thoughts?
> > >
> > > Hmm, if they're different BTF IDs wouldn't the BPF program have to be
> > > aware of this and use two different structs for the pointer storage?
> > > That seems a bit awkward from an API PoV?
> > >
> >
> > You only need to use a different pointer after _alloc and pass it into _insert.
> >
> > Like:
> > 	struct nf_conn_alloc *nfa = nf_alloc(...);
> > 	if (!nfa) { ... }
> > 	nfa->status = ...; // gets converted to nf_conn access
> > 	nfa->tcp_status = ...; // ditto
> > 	struct nf_conn *nf = nf_insert(nfa, ...); // nfa released, nf acquired
> >
> > The problem is that if I whitelist writes for nf_conn as a whole, so that we
> > can assign after _alloc, there is no way to prevent BPF_WRITE for an nf_conn
> > obtained from other functions. We can fix that, though, by remembering which
> > function a pointer came from; then you wouldn't need a different struct. I was
> > just soliciting opinions on the different options. I too am leaning towards
> > not using a separate struct.
>
> Is it possible to define the signature of nf_insert() as
>   const struct nf_conn *nf_insert(...);
> so that for
>   const struct nf_conn *nf = nf_insert(nfa, ...);
>
> any assignment like nf->status = ... makes the compiler emit a warning?
>
> The verifier can also learn that the return value of nf_insert() is
> read-only and reject any overwrite of it.
>
> Maybe I missed some context, but the above is based on what
> I understood so far.
>

That's a great idea. We should certainly use const to force the R0
PTR_TO_BTF_ID to be read-only (even if some fields are still allowed to be
written to). Thanks!
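
Declaration-wise that would roughly be (a sketch with placeholder names,
assuming we keep a single struct as discussed):

	// acquires a writable ref:
	struct nf_conn *bpf_ct_alloc_entry(...);
	// releases nfa, returns a read-only (const) ref:
	const struct nf_conn *bpf_ct_insert_entry(struct nf_conn *nfa);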

> >
> > > Also, what about updating? For this to be useful with TCP, you'd really
> > > want to be able to update the CT state as the connection is going
> > > through the handshake state transitions...
> > >
> >
> > I think updates should be done using dedicated functions, like the timeout
> > helper. Whatever synchronization is needed to update the CT can go into that
> > function, instead of allowing direct writes after _insert.
> >
> > > -Toke
> > >
> >
> > --
> > Kartikeya

--
Kartikeya

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 bpf-next 5/5] selftests/bpf: add selftest for bpf_xdp_ct_add and bpf_ct_refresh_timeout kfunc
  2022-05-18 10:43 ` [PATCH v3 bpf-next 5/5] selftests/bpf: add selftest for bpf_xdp_ct_add and bpf_ct_refresh_timeout kfunc Lorenzo Bianconi
  2022-05-18 18:55   ` Yonghong Song
@ 2022-05-20 22:04   ` Andrii Nakryiko
  2022-05-21  9:56     ` Lorenzo Bianconi
  1 sibling, 1 reply; 24+ messages in thread
From: Andrii Nakryiko @ 2022-05-20 22:04 UTC (permalink / raw)
  To: Lorenzo Bianconi
  Cc: bpf, Networking, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, David S. Miller, Jakub Kicinski, Eric Dumazet,
	pabeni, Pablo Neira Ayuso, Florian Westphal, netfilter-devel,
	Lorenzo Bianconi, Jesper Dangaard Brouer,
	Toke Høiland-Jørgensen, Kumar Kartikeya Dwivedi

On Wed, May 18, 2022 at 3:44 AM Lorenzo Bianconi <lorenzo@kernel.org> wrote:
>
> Introduce selftests for the following kfunc helpers:
> - bpf_xdp_ct_add
> - bpf_skb_ct_add
> - bpf_ct_refresh_timeout
>
> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> ---
>  .../testing/selftests/bpf/prog_tests/bpf_nf.c |  4 ++
>  .../testing/selftests/bpf/progs/test_bpf_nf.c | 72 +++++++++++++++----
>  2 files changed, 64 insertions(+), 12 deletions(-)
>
> diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c
> index dd30b1e3a67c..be6c5650892f 100644
> --- a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c
> +++ b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c
> @@ -39,6 +39,10 @@ void test_bpf_nf_ct(int mode)
>         ASSERT_EQ(skel->bss->test_enonet_netns_id, -ENONET, "Test ENONET for bad but valid netns_id");
>         ASSERT_EQ(skel->bss->test_enoent_lookup, -ENOENT, "Test ENOENT for failed lookup");
>         ASSERT_EQ(skel->bss->test_eafnosupport, -EAFNOSUPPORT, "Test EAFNOSUPPORT for invalid len__tuple");
> +       ASSERT_EQ(skel->bss->test_add_entry, 0, "Test for adding new entry");
> +       ASSERT_EQ(skel->bss->test_succ_lookup, 0, "Test for successful lookup");
> +       ASSERT_TRUE(skel->bss->test_delta_timeout > 9 && skel->bss->test_delta_timeout <= 10,
> +                   "Test for ct timeout update");

If/when this fails, we'll get a "true != false" message without knowing the
actual value of skel->bss->test_delta_timeout.

The following is equivalent and much more informative:

ASSERT_GT(skel->bss->test_delta_timeout, 9, "ct_timeout1");
ASSERT_LE(skel->bss->test_delta_timeout, 10, "ct_timeout2");

>  end:
>         test_bpf_nf__destroy(skel);
>  }


[...]

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 bpf-next 5/5] selftests/bpf: add selftest for bpf_xdp_ct_add and bpf_ct_refresh_timeout kfunc
  2022-05-20 22:04   ` Andrii Nakryiko
@ 2022-05-21  9:56     ` Lorenzo Bianconi
  0 siblings, 0 replies; 24+ messages in thread
From: Lorenzo Bianconi @ 2022-05-21  9:56 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: bpf, Networking, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, David S. Miller, Jakub Kicinski, Eric Dumazet,
	pabeni, Pablo Neira Ayuso, Florian Westphal, netfilter-devel,
	Lorenzo Bianconi, Jesper Dangaard Brouer,
	Toke Høiland-Jørgensen, Kumar Kartikeya Dwivedi

> On Wed, May 18, 2022 at 3:44 AM Lorenzo Bianconi <lorenzo@kernel.org> wrote:
> >
> > Introduce selftests for the following kfunc helpers:
> > - bpf_xdp_ct_add
> > - bpf_skb_ct_add
> > - bpf_ct_refresh_timeout
> >
> > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> > [...]
> > +       ASSERT_EQ(skel->bss->test_add_entry, 0, "Test for adding new entry");
> > +       ASSERT_EQ(skel->bss->test_succ_lookup, 0, "Test for successful lookup");
> > +       ASSERT_TRUE(skel->bss->test_delta_timeout > 9 && skel->bss->test_delta_timeout <= 10,
> > +                   "Test for ct timeout update");
> 
> If/when this fails, we'll get a "true != false" message without knowing the
> actual value of skel->bss->test_delta_timeout.
>
> The following is equivalent and much more informative:
> 
> ASSERT_GT(skel->bss->test_delta_timeout, 9, "ct_timeout1");
> ASSERT_LE(skel->bss->test_delta_timeout, 10, "ct_timeout2");

ack, I will fix it in the next version.

Regards,
Lorenzo

> 
> >  end:
> >         test_bpf_nf__destroy(skel);
> >  }
> 
> 
> [...]

^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread

Thread overview: 24+ messages
2022-05-18 10:43 [PATCH v3 bpf-next 0/5] net: netfilter: add kfunc helper to update ct timeout Lorenzo Bianconi
2022-05-18 10:43 ` [PATCH v3 bpf-next 1/5] bpf: Add support for forcing kfunc args to be referenced Lorenzo Bianconi
2022-05-18 17:58   ` Yonghong Song
2022-05-18 18:06     ` Kumar Kartikeya Dwivedi
2022-05-18 18:23       ` Yonghong Song
2022-05-18 10:43 ` [PATCH v3 bpf-next 2/5] selftests/bpf: Add verifier selftests for forced kfunc ref args Lorenzo Bianconi
2022-05-18 18:25   ` Yonghong Song
2022-05-18 10:43 ` [PATCH v3 bpf-next 3/5] net: netfilter: add kfunc helper to update ct timeout Lorenzo Bianconi
2022-05-18 20:42   ` Toke Høiland-Jørgensen
2022-05-18 10:43 ` [PATCH v3 bpf-next 4/5] net: netfilter: add kfunc helper to add a new ct entry Lorenzo Bianconi
2022-05-18 20:47   ` Toke Høiland-Jørgensen
2022-05-18 21:08     ` Lorenzo Bianconi
2022-05-18 22:14       ` Alexei Starovoitov
2022-05-18 22:43         ` Kumar Kartikeya Dwivedi
2022-05-19 10:45           ` Toke Høiland-Jørgensen
2022-05-19 11:23             ` Kumar Kartikeya Dwivedi
2022-05-19 17:00               ` Yonghong Song
2022-05-19 17:10                 ` Kumar Kartikeya Dwivedi
2022-05-19  2:06   ` kernel test robot
2022-05-18 10:43 ` [PATCH v3 bpf-next 5/5] selftests/bpf: add selftest for bpf_xdp_ct_add and bpf_ct_refresh_timeout kfunc Lorenzo Bianconi
2022-05-18 18:55   ` Yonghong Song
2022-05-18 20:38     ` Lorenzo Bianconi
2022-05-20 22:04   ` Andrii Nakryiko
2022-05-21  9:56     ` Lorenzo Bianconi
