* [PATCH bpf-next] bpf: add bpf_ct_lookup_{tcp,udp}() helpers
@ 2020-01-18  0:01 Matthew Cover
  2020-01-18 11:37 ` kbuild test robot
                   ` (3 more replies)
  0 siblings, 4 replies; 22+ messages in thread
From: Matthew Cover @ 2020-01-18  0:01 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau, Song Liu,
	Yonghong Song, Andrii Nakryiko, David S. Miller, Shuah Khan,
	Jakub Kicinski, Jesper Dangaard Brouer, John Fastabend,
	Jakub Sitnicki, Quentin Monnet, Matthew Cover,
	Stanislav Fomichev, Andrey Ignatov, Lorenz Bauer, Jiong Wang,
	netdev, bpf, linux-kernel, linux-kselftest

Allow looking up an nf_conn. This lets eBPF programs leverage
nf_conntrack state for purposes similar to the socket state use
cases served by the socket lookup helpers. It is particularly
useful when nf_conntrack state is locally available but socket
state is not.
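
For illustration, a minimal sketch of a TC classifier using the new
helpers. It assumes helper declarations generated from the updated
bpf.h (e.g. a regenerated bpf_helper_defs.h); the program name is
illustrative, and BPF_F_CURRENT_NETNS (-1L, already in the uapi for
the socket lookup helpers) serves as the negative netns value:

	#include <linux/bpf.h>
	#include <linux/if_ether.h>
	#include <linux/in.h>
	#include <linux/ip.h>
	#include <linux/tcp.h>
	#include <linux/pkt_cls.h>
	#include <bpf/bpf_endian.h>
	#include <bpf/bpf_helpers.h>

	SEC("classifier")
	int ct_gate(struct __sk_buff *skb)
	{
		void *data = (void *)(long)skb->data;
		void *data_end = (void *)(long)skb->data_end;
		struct bpf_nf_conntrack_tuple tuple = {};
		struct ethhdr *eth = data;
		struct bpf_nf_conn *ct;
		struct tcphdr *tcp;
		struct iphdr *iph;

		if ((void *)(eth + 1) > data_end ||
		    eth->h_proto != bpf_htons(ETH_P_IP))
			return TC_ACT_OK;
		iph = (void *)(eth + 1);
		if ((void *)(iph + 1) > data_end ||
		    iph->protocol != IPPROTO_TCP)
			return TC_ACT_OK;
		tcp = (void *)(iph + 1);	/* assumes no IP options */
		if ((void *)(tcp + 1) > data_end)
			return TC_ACT_OK;

		tuple.ipv4.saddr = iph->saddr;
		tuple.ipv4.daddr = iph->daddr;
		tuple.ipv4.sport = tcp->source;
		tuple.ipv4.dport = tcp->dest;

		/* a negative 32-bit netns value selects the netns of
		 * skb->dev
		 */
		ct = bpf_ct_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
				       BPF_F_CURRENT_NETNS, 0);
		if (!ct)
			return TC_ACT_SHOT;	/* no conntrack entry */
		bpf_ct_release(ct);		/* must release before exit */
		return TC_ACT_OK;
	}

	/* the new helpers are gpl_only */
	char _license[] SEC("license") = "GPL";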

Signed-off-by: Matthew Cover <matthew.cover@stackpath.com>
---
 include/linux/bpf.h                                |  28 +++
 include/uapi/linux/bpf.h                           | 111 ++++++++-
 kernel/bpf/verifier.c                              | 105 +++++++-
 net/core/filter.c                                  | 277 +++++++++++++++++++++
 scripts/bpf_helpers_doc.py                         |   4 +
 tools/include/uapi/linux/bpf.h                     | 111 ++++++++-
 tools/testing/selftests/bpf/test_verifier.c        |  18 ++
 .../testing/selftests/bpf/verifier/ref_tracking.c  |  48 ++++
 8 files changed, 694 insertions(+), 8 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8e3b8f4..28d35c3 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -239,6 +239,7 @@ enum bpf_arg_type {
 	ARG_PTR_TO_LONG,	/* pointer to long */
 	ARG_PTR_TO_SOCKET,	/* pointer to bpf_sock (fullsock) */
 	ARG_PTR_TO_BTF_ID,	/* pointer to in-kernel struct */
+	ARG_PTR_TO_NF_CONN,	/* pointer to bpf_nf_conn */
 };
 
 /* type of values returned from helper functions */
@@ -250,6 +251,7 @@ enum bpf_return_type {
 	RET_PTR_TO_SOCKET_OR_NULL,	/* returns a pointer to a socket or NULL */
 	RET_PTR_TO_TCP_SOCK_OR_NULL,	/* returns a pointer to a tcp_sock or NULL */
 	RET_PTR_TO_SOCK_COMMON_OR_NULL,	/* returns a pointer to a sock_common or NULL */
+	RET_PTR_TO_NF_CONN_OR_NULL,	/* returns a pointer to a nf_conn or NULL */
 };
 
 /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs
@@ -316,6 +318,8 @@ enum bpf_reg_type {
 	PTR_TO_TP_BUFFER,	 /* reg points to a writable raw tp's buffer */
 	PTR_TO_XDP_SOCK,	 /* reg points to struct xdp_sock */
 	PTR_TO_BTF_ID,		 /* reg points to kernel struct */
+	PTR_TO_NF_CONN,		 /* reg points to struct nf_conn */
+	PTR_TO_NF_CONN_OR_NULL,	 /* reg points to struct nf_conn or NULL */
 };
 
 /* The information passed from prog-specific *_is_valid_access
@@ -1513,4 +1517,28 @@ enum bpf_text_poke_type {
 int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
 		       void *addr1, void *addr2);
 
+#if IS_BUILTIN(CONFIG_NF_CONNTRACK)
+bool bpf_nf_conn_is_valid_access(int off, int size, enum bpf_access_type type,
+				 struct bpf_insn_access_aux *info);
+
+u32 bpf_nf_conn_convert_ctx_access(enum bpf_access_type type,
+				   const struct bpf_insn *si,
+				   struct bpf_insn *insn_buf,
+				   struct bpf_prog *prog, u32 *target_size);
+#else
+bool bpf_nf_conn_is_valid_access(int off, int size, enum bpf_access_type type,
+				 struct bpf_insn_access_aux *info)
+{
+	return false;
+}
+
+u32 bpf_nf_conn_convert_ctx_access(enum bpf_access_type type,
+				   const struct bpf_insn *si,
+				   struct bpf_insn *insn_buf,
+				   struct bpf_prog *prog, u32 *target_size)
+{
+	return 0;
+}
+#endif /* CONFIG_NF_CONNTRACK */
+
 #endif /* _LINUX_BPF_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 033d90a..12e16ad 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2885,6 +2885,88 @@ struct bpf_stack_build_id {
  *		**-EPERM** if no permission to send the *sig*.
  *
  *		**-EAGAIN** if bpf program can try again.
+ *
+ * struct bpf_nf_conn *bpf_ct_lookup_tcp(void *ctx, struct bpf_nf_conntrack_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
+ *	Description
+ *		Look for TCP nf_conntrack entry matching *tuple*, optionally in
+ *		a child network namespace *netns*. The return value must be
+ *		checked, and if non-**NULL**, released via
+ *		**bpf_ct_release**\ ().
+ *
+ *		The *ctx* should point to the context of the program, such as
+ *		the skb or xdp_md (depending on the hook in use). This is used
+ *		to determine the base network namespace for the lookup.
+ *
+ *		*tuple_size* must be one of:
+ *
+ *		**sizeof**\ (*tuple*\ **->ipv4**)
+ *			Look for an IPv4 nf_conn.
+ *		**sizeof**\ (*tuple*\ **->ipv6**)
+ *			Look for an IPv6 nf_conn.
+ *
+ *		If *netns* is a negative signed 32-bit integer, then the
+ *		nf_conn lookup table in the netns associated with the *ctx*
+ *		will be used. For the TC hooks, this is the netns of the device
+ *		in the skb. For XDP hooks, this is the netns of the device in
+ *		the xdp_md. If *netns* is any other signed 32-bit value greater
+ *		than or equal to zero then it specifies the ID of the netns
+ *		relative to the netns associated with the *ctx*. *netns* values
+ *		beyond the range of 32-bit integers are reserved for future
+ *		use.
+ *
+ *		All values for *flags* are reserved for future usage, and must
+ *		be left at zero.
+ *
+ *		This helper is available only if the kernel was compiled with
+ *		**CONFIG_NF_CONNTRACK=y** configuration option.
+ *	Return
+ *		Pointer to **struct bpf_nf_conn**, or **NULL** in case of
+ *		failure.
+ *
+ * struct bpf_nf_conn *bpf_ct_lookup_udp(void *ctx, struct bpf_nf_conntrack_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
+ *	Description
+ *		Look for UDP nf_conntrack entry matching *tuple*, optionally in
+ *		a child network namespace *netns*. The return value must be
+ *		checked, and if non-**NULL**, released via
+ *		**bpf_ct_release**\ ().
+ *
+ *		The *ctx* should point to the context of the program, such as
+ *		the skb or xdp_md (depending on the hook in use). This is used
+ *		to determine the base network namespace for the lookup.
+ *
+ *		*tuple_size* must be one of:
+ *
+ *		**sizeof**\ (*tuple*\ **->ipv4**)
+ *			Look for an IPv4 nf_conn.
+ *		**sizeof**\ (*tuple*\ **->ipv6**)
+ *			Look for an IPv6 nf_conn.
+ *
+ *		If *netns* is a negative signed 32-bit integer, then the
+ *		nf_conn lookup table in the netns associated with the *ctx*
+ *		will be used. For the TC hooks, this is the netns of the device
+ *		in the skb. For XDP hooks, this is the netns of the device in
+ *		the xdp_md. If *netns* is any other signed 32-bit value greater
+ *		than or equal to zero then it specifies the ID of the netns
+ *		relative to the netns associated with the *ctx*. *netns* values
+ *		beyond the range of 32-bit integers are reserved for future
+ *		use.
+ *
+ *		All values for *flags* are reserved for future usage, and must
+ *		be left at zero.
+ *
+ *		This helper is available only if the kernel was compiled with
+ *		**CONFIG_NF_CONNTRACK=y** configuration option.
+ *	Return
+ *		Pointer to **struct bpf_nf_conn**, or **NULL** in case of
+ *		failure.
+ *
+ * int bpf_ct_release(struct bpf_nf_conn *ct)
+ *	Description
+ *		Release the reference held by *ct*. *ct* must be a
+ *		non-**NULL** pointer that was returned from
+ *		**bpf_ct_lookup_xxx**\ ().
+ *	Return
+ *		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3004,7 +3086,10 @@ struct bpf_stack_build_id {
 	FN(probe_read_user_str),	\
 	FN(probe_read_kernel_str),	\
 	FN(tcp_send_ack),		\
-	FN(send_signal_thread),
+	FN(send_signal_thread),		\
+	FN(ct_lookup_tcp),		\
+	FN(ct_lookup_udp),		\
+	FN(ct_release),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -3278,6 +3363,30 @@ struct bpf_sock_tuple {
 	};
 };
 
+struct bpf_nf_conn {
+	__u32 cpu;
+	__u32 mark;
+	__u32 status;
+	__u32 timeout;
+};
+
+struct bpf_nf_conntrack_tuple {
+	union {
+		struct {
+			__be32 saddr;
+			__be32 daddr;
+			__be16 sport;
+			__be16 dport;
+		} ipv4;
+		struct {
+			__be32 saddr[4];
+			__be32 daddr[4];
+			__be16 sport;
+			__be16 dport;
+		} ipv6;
+	};
+};
+
 struct bpf_xdp_sock {
 	__u32 queue_id;
 };
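
To make the *netns* encoding documented above concrete, a short
hedged sketch (same includes as the sketch in the commit message; the
tuple is assumed to be already populated, and the child netns ID of 1
is purely illustrative):

	/* Caller must bpf_ct_release() any non-NULL return value. */
	static __always_inline struct bpf_nf_conn *
	ct_lookup_example(struct xdp_md *ctx,
			  struct bpf_nf_conntrack_tuple *tuple)
	{
		struct bpf_nf_conn *ct;

		/* any negative s32 (e.g. BPF_F_CURRENT_NETNS == -1L)
		 * selects the netns of the device in the xdp_md
		 */
		ct = bpf_ct_lookup_udp(ctx, tuple, sizeof(tuple->ipv4),
				       BPF_F_CURRENT_NETNS, 0);
		if (ct)
			return ct;

		/* values in [0, S32_MAX] name a child netns ID relative
		 * to the netns of the ctx; 1 here is illustrative
		 */
		return bpf_ct_lookup_udp(ctx, tuple, sizeof(tuple->ipv4),
					 1, 0);
	}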
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ca17dccc..0ea0ee7 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -362,6 +362,11 @@ static const char *ltrim(const char *s)
 	env->prev_linfo = linfo;
 }
 
+static bool type_is_nf_ct_pointer(enum bpf_reg_type type)
+{
+	return type == PTR_TO_NF_CONN;
+}
+
 static bool type_is_pkt_pointer(enum bpf_reg_type type)
 {
 	return type == PTR_TO_PACKET ||
@@ -381,7 +386,8 @@ static bool reg_type_may_be_null(enum bpf_reg_type type)
 	return type == PTR_TO_MAP_VALUE_OR_NULL ||
 	       type == PTR_TO_SOCKET_OR_NULL ||
 	       type == PTR_TO_SOCK_COMMON_OR_NULL ||
-	       type == PTR_TO_TCP_SOCK_OR_NULL;
+	       type == PTR_TO_TCP_SOCK_OR_NULL ||
+	       type == PTR_TO_NF_CONN_OR_NULL;
 }
 
 static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
@@ -395,12 +401,15 @@ static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type)
 	return type == PTR_TO_SOCKET ||
 		type == PTR_TO_SOCKET_OR_NULL ||
 		type == PTR_TO_TCP_SOCK ||
-		type == PTR_TO_TCP_SOCK_OR_NULL;
+		type == PTR_TO_TCP_SOCK_OR_NULL ||
+		type == PTR_TO_NF_CONN ||
+		type == PTR_TO_NF_CONN_OR_NULL;
 }
 
 static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
 {
-	return type == ARG_PTR_TO_SOCK_COMMON;
+	return type == ARG_PTR_TO_SOCK_COMMON ||
+		type == ARG_PTR_TO_NF_CONN;
 }
 
 /* Determine whether the function releases some resources allocated by another
@@ -409,14 +418,17 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
  */
 static bool is_release_function(enum bpf_func_id func_id)
 {
-	return func_id == BPF_FUNC_sk_release;
+	return func_id == BPF_FUNC_sk_release ||
+		func_id == BPF_FUNC_ct_release;
 }
 
 static bool is_acquire_function(enum bpf_func_id func_id)
 {
 	return func_id == BPF_FUNC_sk_lookup_tcp ||
 		func_id == BPF_FUNC_sk_lookup_udp ||
-		func_id == BPF_FUNC_skc_lookup_tcp;
+		func_id == BPF_FUNC_skc_lookup_tcp ||
+		func_id == BPF_FUNC_ct_lookup_tcp ||
+		func_id == BPF_FUNC_ct_lookup_udp;
 }
 
 static bool is_ptr_cast_function(enum bpf_func_id func_id)
@@ -447,6 +459,8 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id)
 	[PTR_TO_TP_BUFFER]	= "tp_buffer",
 	[PTR_TO_XDP_SOCK]	= "xdp_sock",
 	[PTR_TO_BTF_ID]		= "ptr_",
+	[PTR_TO_NF_CONN]	= "nf_conn",
+	[PTR_TO_NF_CONN_OR_NULL] = "nf_conn_or_null",
 };
 
 static char slot_type_char[] = {
@@ -1913,6 +1927,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 	case PTR_TO_TCP_SOCK_OR_NULL:
 	case PTR_TO_XDP_SOCK:
 	case PTR_TO_BTF_ID:
+	case PTR_TO_NF_CONN:
+	case PTR_TO_NF_CONN_OR_NULL:
 		return true;
 	default:
 		return false;
@@ -2440,6 +2456,35 @@ static int check_flow_keys_access(struct bpf_verifier_env *env, int off,
 	return 0;
 }
 
+static int check_nf_ct_access(struct bpf_verifier_env *env, int insn_idx,
+			     u32 regno, int off, int size,
+			     enum bpf_access_type t)
+{
+	struct bpf_reg_state *regs = cur_regs(env);
+	struct bpf_reg_state *reg = &regs[regno];
+	struct bpf_insn_access_aux info = {};
+	bool valid;
+
+	switch (reg->type) {
+	case PTR_TO_NF_CONN:
+		valid = bpf_nf_conn_is_valid_access(off, size, t, &info);
+		break;
+	default:
+		valid = false;
+	}
+
+	if (valid) {
+		env->insn_aux_data[insn_idx].ctx_field_size =
+			info.ctx_field_size;
+		return 0;
+	}
+
+	verbose(env, "R%d invalid %s access off=%d size=%d\n",
+		regno, reg_type_str[reg->type], off, size);
+
+	return -EACCES;
+}
+
 static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
 			     u32 regno, int off, int size,
 			     enum bpf_access_type t)
@@ -2511,6 +2556,13 @@ static bool is_ctx_reg(struct bpf_verifier_env *env, int regno)
 	return reg->type == PTR_TO_CTX;
 }
 
+static bool is_nf_ct_reg(struct bpf_verifier_env *env, int regno)
+{
+	const struct bpf_reg_state *reg = reg_state(env, regno);
+
+	return type_is_nf_ct_pointer(reg->type);
+}
+
 static bool is_sk_reg(struct bpf_verifier_env *env, int regno)
 {
 	const struct bpf_reg_state *reg = reg_state(env, regno);
@@ -2635,6 +2687,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
 	case PTR_TO_XDP_SOCK:
 		pointer_desc = "xdp_sock ";
 		break;
+	case PTR_TO_NF_CONN:
+		pointer_desc = "nf_conn ";
+		break;
 	default:
 		break;
 	}
@@ -3050,6 +3105,15 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		err = check_sock_access(env, insn_idx, regno, off, size, t);
 		if (!err && value_regno >= 0)
 			mark_reg_unknown(env, regs, value_regno);
+	} else if (type_is_nf_ct_pointer(reg->type)) {
+		if (t == BPF_WRITE) {
+			verbose(env, "R%d cannot write into %s\n",
+				regno, reg_type_str[reg->type]);
+			return -EACCES;
+		}
+		err = check_nf_ct_access(env, insn_idx, regno, off, size, t);
+		if (!err && value_regno >= 0)
+			mark_reg_unknown(env, regs, value_regno);
 	} else if (reg->type == PTR_TO_TP_BUFFER) {
 		err = check_tp_buffer_access(env, reg, regno, off, size);
 		if (!err && t == BPF_READ && value_regno >= 0)
@@ -3099,7 +3163,8 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
 	if (is_ctx_reg(env, insn->dst_reg) ||
 	    is_pkt_reg(env, insn->dst_reg) ||
 	    is_flow_key_reg(env, insn->dst_reg) ||
-	    is_sk_reg(env, insn->dst_reg)) {
+	    is_sk_reg(env, insn->dst_reg) ||
+	    is_nf_ct_reg(env, insn->dst_reg)) {
 		verbose(env, "BPF_XADD stores into R%d %s is not allowed\n",
 			insn->dst_reg,
 			reg_type_str[reg_state(env, insn->dst_reg)->type]);
@@ -3501,6 +3566,19 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 				regno);
 			return -EACCES;
 		}
+	} else if (arg_type == ARG_PTR_TO_NF_CONN) {
+		expected_type = PTR_TO_NF_CONN;
+		if (!type_is_nf_ct_pointer(type))
+			goto err_type;
+		if (reg->ref_obj_id) {
+			if (meta->ref_obj_id) {
+				verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
+					regno, reg->ref_obj_id,
+					meta->ref_obj_id);
+				return -EFAULT;
+			}
+			meta->ref_obj_id = reg->ref_obj_id;
+		}
 	} else if (arg_type == ARG_PTR_TO_SPIN_LOCK) {
 		if (meta->func_id == BPF_FUNC_spin_lock) {
 			if (process_spin_lock(env, regno, true))
@@ -4368,6 +4446,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 		mark_reg_known_zero(env, regs, BPF_REG_0);
 		regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL;
 		regs[BPF_REG_0].id = ++env->id_gen;
+	} else if (fn->ret_type == RET_PTR_TO_NF_CONN_OR_NULL) {
+		mark_reg_known_zero(env, regs, BPF_REG_0);
+		regs[BPF_REG_0].type = PTR_TO_NF_CONN_OR_NULL;
+		regs[BPF_REG_0].id = ++env->id_gen;
 	} else {
 		verbose(env, "unknown return type %d of func %s#%d\n",
 			fn->ret_type, func_id_name(func_id), func_id);
@@ -4649,6 +4731,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 	case PTR_TO_TCP_SOCK:
 	case PTR_TO_TCP_SOCK_OR_NULL:
 	case PTR_TO_XDP_SOCK:
+	case PTR_TO_NF_CONN:
+	case PTR_TO_NF_CONN_OR_NULL:
 		verbose(env, "R%d pointer arithmetic on %s prohibited\n",
 			dst, reg_type_str[ptr_reg->type]);
 		return -EACCES;
@@ -5915,6 +5999,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
 			reg->type = PTR_TO_SOCK_COMMON;
 		} else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) {
 			reg->type = PTR_TO_TCP_SOCK;
+		} else if (reg->type == PTR_TO_NF_CONN_OR_NULL) {
+			reg->type = PTR_TO_NF_CONN;
 		}
 		if (is_null) {
 			/* We don't need id and ref_obj_id from this point
@@ -7232,6 +7318,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 	case PTR_TO_TCP_SOCK:
 	case PTR_TO_TCP_SOCK_OR_NULL:
 	case PTR_TO_XDP_SOCK:
+	case PTR_TO_NF_CONN:
+	case PTR_TO_NF_CONN_OR_NULL:
 		/* Only valid matches are exact, which memcmp() above
 		 * would have accepted
 		 */
@@ -7760,6 +7848,8 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type)
 	case PTR_TO_TCP_SOCK_OR_NULL:
 	case PTR_TO_XDP_SOCK:
 	case PTR_TO_BTF_ID:
+	case PTR_TO_NF_CONN:
+	case PTR_TO_NF_CONN_OR_NULL:
 		return false;
 	default:
 		return true;
@@ -8867,6 +8957,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 				return -EINVAL;
 			}
 			continue;
+		case PTR_TO_NF_CONN:
+			convert_ctx_access = bpf_nf_conn_convert_ctx_access;
+			break;
 		default:
 			continue;
 		}
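
Taken together, these verifier changes reject programs like the
following hedged sketch at load time (same includes as the sketch in
the commit message; the exact verifier messages also carry register
numbers):

	SEC("classifier")
	int ct_bad_write(struct __sk_buff *skb)
	{
		struct bpf_nf_conntrack_tuple tuple = {};
		struct bpf_nf_conn *ct;

		ct = bpf_ct_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
				       BPF_F_CURRENT_NETNS, 0);
		if (!ct)	/* NULL check: nf_conn_or_null -> nf_conn */
			return TC_ACT_OK;
		ct->mark = 1;	/* rejected: "cannot write into nf_conn" */
		bpf_ct_release(ct); /* omitted: "Unreleased reference" */
		return TC_ACT_OK;
	}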
diff --git a/net/core/filter.c b/net/core/filter.c
index 17de674..39ba965 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -74,6 +74,12 @@
 #include <net/ipv6_stubs.h>
 #include <net/bpf_sk_storage.h>
 
+#if IS_BUILTIN(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack.h>
+#endif
+
 /**
  *	sk_filter_trim_cap - run a packet through a socket filter
  *	@sk: sock associated with &sk_buff
@@ -5122,6 +5128,253 @@ static void bpf_update_srh_state(struct sk_buff *skb)
 };
 #endif /* CONFIG_IPV6_SEG6_BPF */
 
+#if IS_BUILTIN(CONFIG_NF_CONNTRACK)
+bool bpf_nf_conn_is_valid_access(int off, int size, enum bpf_access_type type,
+				 struct bpf_insn_access_aux *info)
+{
+	if (off < 0 || off >= offsetofend(struct bpf_nf_conn,
+					  timeout))
+		return false;
+
+	if (off % size != 0)
+		return false;
+
+	return size == sizeof(__u32);
+}
+
+u32 bpf_nf_conn_convert_ctx_access(enum bpf_access_type type,
+				   const struct bpf_insn *si,
+				   struct bpf_insn *insn_buf,
+				   struct bpf_prog *prog, u32 *target_size)
+{
+	struct bpf_insn *insn = insn_buf;
+
+	switch (si->off) {
+	case offsetof(struct bpf_nf_conn, cpu):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct nf_conn, cpu) != 2);
+
+		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+				      offsetof(struct nf_conn, cpu));
+
+		break;
+
+	case offsetof(struct bpf_nf_conn, mark):
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
+		BUILD_BUG_ON(FIELD_SIZEOF(struct nf_conn, mark) != 4);
+
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+				      offsetof(struct nf_conn, mark));
+#else
+		*target_size = 4;
+		*insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
+#endif
+		break;
+
+	case offsetof(struct bpf_nf_conn, status):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct nf_conn, status) < 4 ||
+			     __IPS_MAX_BIT > 32);
+
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+				      offsetof(struct nf_conn, status));
+
+		break;
+
+	case offsetof(struct bpf_nf_conn, timeout):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct nf_conn, timeout) != 4);
+
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+				      offsetof(struct nf_conn, timeout));
+
+		break;
+	}
+
+	return insn - insn_buf;
+}
+
+static struct nf_conn *
+ct_lookup(struct net *net, struct bpf_nf_conntrack_tuple *tuple,
+	  u8 family, u8 proto)
+{
+	struct nf_conntrack_tuple_hash *hash;
+	struct nf_conntrack_tuple tup;
+	struct nf_conn *ct = NULL;
+
+	memset(&tup, 0, sizeof(tup));
+
+	tup.dst.protonum = proto;
+	tup.src.l3num = family;
+
+	if (family == AF_INET) {
+		tup.src.u3.ip = tuple->ipv4.saddr;
+		tup.dst.u3.ip = tuple->ipv4.daddr;
+		tup.src.u.tcp.port = tuple->ipv4.sport;
+		tup.dst.u.tcp.port = tuple->ipv4.dport;
+#if IS_ENABLED(CONFIG_IPV6)
+	} else {
+		memcpy(tup.src.u3.ip6, tuple->ipv6.saddr, sizeof(tup.src.u3.ip6));
+		memcpy(tup.dst.u3.ip6, tuple->ipv6.daddr, sizeof(tup.dst.u3.ip6));
+		tup.src.u.tcp.port = tuple->ipv6.sport;
+		tup.dst.u.tcp.port = tuple->ipv6.dport;
+#endif
+	}
+
+	hash = nf_conntrack_find_get(net, &nf_ct_zone_dflt, &tup);
+	if (!hash)
+		goto out;
+	ct = nf_ct_tuplehash_to_ctrack(hash);
+
+out:
+	return ct;
+}
+
+static struct nf_conn *
+__bpf_ct_lookup(struct sk_buff *skb, struct bpf_nf_conntrack_tuple *tuple, u32 len,
+		struct net *caller_net, u8 proto, u64 netns_id, u64 flags)
+{
+	struct nf_conn *ct = NULL;
+	u8 family = AF_UNSPEC;
+	struct net *net;
+
+	if (len == sizeof(tuple->ipv4))
+		family = AF_INET;
+	else if (len == sizeof(tuple->ipv6))
+		family = AF_INET6;
+	else
+		goto out;
+
+	if (unlikely(family == AF_UNSPEC || flags ||
+		     !((s32)netns_id < 0 || netns_id <= S32_MAX)))
+		goto out;
+
+	if ((s32)netns_id < 0) {
+		net = caller_net;
+		ct = ct_lookup(net, tuple, family, proto);
+	} else {
+		net = get_net_ns_by_id(caller_net, netns_id);
+		if (unlikely(!net))
+			goto out;
+		ct = ct_lookup(net, tuple, family, proto);
+		put_net(net);
+	}
+
+out:
+	return ct;
+}
+
+static struct nf_conn *
+bpf_ct_lookup(struct sk_buff *skb, struct bpf_nf_conntrack_tuple *tuple, u32 len,
+	      u8 proto, u64 netns_id, u64 flags)
+{
+	struct net *caller_net;
+
+	if (skb->dev) {
+		caller_net = dev_net(skb->dev);
+	} else {
+		caller_net = sock_net(skb->sk);
+	}
+
+	return __bpf_ct_lookup(skb, tuple, len, caller_net, proto,
+			       netns_id, flags);
+}
+
+BPF_CALL_5(bpf_ct_lookup_tcp, struct sk_buff *, skb,
+	   struct bpf_nf_conntrack_tuple *, tuple, u32, len, u64, netns_id,
+	   u64, flags)
+{
+	return (unsigned long)bpf_ct_lookup(skb, tuple, len, IPPROTO_TCP,
+					     netns_id, flags);
+}
+
+static const struct bpf_func_proto bpf_ct_lookup_tcp_proto = {
+	.func		= bpf_ct_lookup_tcp,
+	.gpl_only	= true,
+	.pkt_access	= true,
+	.ret_type	= RET_PTR_TO_NF_CONN_OR_NULL,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+	.arg5_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_xdp_ct_lookup_tcp, struct xdp_buff *, ctx,
+	   struct bpf_nf_conntrack_tuple *, tuple, u32, len, u32, netns_id,
+	   u64, flags)
+{
+	struct net *caller_net = dev_net(ctx->rxq->dev);
+
+	return (unsigned long)__bpf_ct_lookup(NULL, tuple, len, caller_net,
+					      IPPROTO_TCP, netns_id, flags);
+}
+
+static const struct bpf_func_proto bpf_xdp_ct_lookup_tcp_proto = {
+	.func		= bpf_xdp_ct_lookup_tcp,
+	.gpl_only	= true,
+	.pkt_access	= true,
+	.ret_type	= RET_PTR_TO_NF_CONN_OR_NULL,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+	.arg5_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_ct_lookup_udp, struct sk_buff *, skb,
+	   struct bpf_nf_conntrack_tuple *, tuple, u32, len, u64, netns_id,
+	   u64, flags)
+{
+	return (unsigned long)bpf_ct_lookup(skb, tuple, len, IPPROTO_UDP,
+					     netns_id, flags);
+}
+
+static const struct bpf_func_proto bpf_ct_lookup_udp_proto = {
+	.func		= bpf_ct_lookup_udp,
+	.gpl_only	= true,
+	.pkt_access	= true,
+	.ret_type	= RET_PTR_TO_NF_CONN_OR_NULL,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+	.arg5_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_xdp_ct_lookup_udp, struct xdp_buff *, ctx,
+	   struct bpf_nf_conntrack_tuple *, tuple, u32, len, u32, netns_id,
+	   u64, flags)
+{
+	struct net *caller_net = dev_net(ctx->rxq->dev);
+
+	return (unsigned long)__bpf_ct_lookup(NULL, tuple, len, caller_net,
+					      IPPROTO_UDP, netns_id, flags);
+}
+
+static const struct bpf_func_proto bpf_xdp_ct_lookup_udp_proto = {
+	.func		= bpf_xdp_ct_lookup_udp,
+	.gpl_only	= true,
+	.pkt_access	= true,
+	.ret_type	= RET_PTR_TO_NF_CONN_OR_NULL,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+	.arg5_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_1(bpf_ct_release, struct nf_conn *, ct)
+{
+	nf_conntrack_put(&ct->ct_general);
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_ct_release_proto = {
+	.func		= bpf_ct_release,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_NF_CONN,
+};
+#endif
+
 #ifdef CONFIG_INET
 static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
 			      int dif, int sdif, u8 family, u8 proto)
@@ -6139,6 +6392,14 @@ bool bpf_helper_changes_pkt_data(void *func)
 	case BPF_FUNC_tcp_gen_syncookie:
 		return &bpf_tcp_gen_syncookie_proto;
 #endif
+#if IS_BUILTIN(CONFIG_NF_CONNTRACK)
+	case BPF_FUNC_ct_lookup_tcp:
+		return &bpf_ct_lookup_tcp_proto;
+	case BPF_FUNC_ct_lookup_udp:
+		return &bpf_ct_lookup_udp_proto;
+	case BPF_FUNC_ct_release:
+		return &bpf_ct_release_proto;
+#endif
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -6180,6 +6441,14 @@ bool bpf_helper_changes_pkt_data(void *func)
 	case BPF_FUNC_tcp_gen_syncookie:
 		return &bpf_tcp_gen_syncookie_proto;
 #endif
+#if IS_BUILTIN(CONFIG_NF_CONNTRACK)
+	case BPF_FUNC_ct_lookup_tcp:
+		return &bpf_xdp_ct_lookup_tcp_proto;
+	case BPF_FUNC_ct_lookup_udp:
+		return &bpf_xdp_ct_lookup_udp_proto;
+	case BPF_FUNC_ct_release:
+		return &bpf_ct_release_proto;
+#endif
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -6284,6 +6553,14 @@ bool bpf_helper_changes_pkt_data(void *func)
 	case BPF_FUNC_skc_lookup_tcp:
 		return &bpf_skc_lookup_tcp_proto;
 #endif
+#if IS_BUILTIN(CONFIG_NF_CONNTRACK)
+	case BPF_FUNC_ct_lookup_tcp:
+		return &bpf_ct_lookup_tcp_proto;
+	case BPF_FUNC_ct_lookup_udp:
+		return &bpf_ct_lookup_udp_proto;
+	case BPF_FUNC_ct_release:
+		return &bpf_ct_release_proto;
+#endif
 	default:
 		return bpf_base_func_proto(func_id);
 	}
diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py
index 90baf7d..26f0c2a 100755
--- a/scripts/bpf_helpers_doc.py
+++ b/scripts/bpf_helpers_doc.py
@@ -398,6 +398,8 @@ class PrinterHelpers(Printer):
 
     type_fwds = [
             'struct bpf_fib_lookup',
+            'struct bpf_nf_conn',
+            'struct bpf_nf_conntrack_tuple',
             'struct bpf_perf_event_data',
             'struct bpf_perf_event_value',
             'struct bpf_sock',
@@ -433,6 +435,8 @@ class PrinterHelpers(Printer):
             '__wsum',
 
             'struct bpf_fib_lookup',
+            'struct bpf_nf_conn',
+            'struct bpf_nf_conntrack_tuple',
             'struct bpf_perf_event_data',
             'struct bpf_perf_event_value',
             'struct bpf_sock',
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 033d90a..12e16ad 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2885,6 +2885,88 @@ struct bpf_stack_build_id {
  *		**-EPERM** if no permission to send the *sig*.
  *
  *		**-EAGAIN** if bpf program can try again.
+ *
+ * struct bpf_nf_conn *bpf_ct_lookup_tcp(void *ctx, struct bpf_nf_conntrack_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
+ *	Description
+ *		Look for TCP nf_conntrack entry matching *tuple*, optionally in
+ *		a child network namespace *netns*. The return value must be
+ *		checked, and if non-**NULL**, released via
+ *		**bpf_ct_release**\ ().
+ *
+ *		The *ctx* should point to the context of the program, such as
+ *		the skb or xdp_md (depending on the hook in use). This is used
+ *		to determine the base network namespace for the lookup.
+ *
+ *		*tuple_size* must be one of:
+ *
+ *		**sizeof**\ (*tuple*\ **->ipv4**)
+ *			Look for an IPv4 nf_conn.
+ *		**sizeof**\ (*tuple*\ **->ipv6**)
+ *			Look for an IPv6 nf_conn.
+ *
+ *		If *netns* is a negative signed 32-bit integer, then the
+ *		nf_conn lookup table in the netns associated with the *ctx*
+ *		will be used. For the TC hooks, this is the netns of the device
+ *		in the skb. For XDP hooks, this is the netns of the device in
+ *		the xdp_md. If *netns* is any other signed 32-bit value greater
+ *		than or equal to zero then it specifies the ID of the netns
+ *		relative to the netns associated with the *ctx*. *netns* values
+ *		beyond the range of 32-bit integers are reserved for future
+ *		use.
+ *
+ *		All values for *flags* are reserved for future usage, and must
+ *		be left at zero.
+ *
+ *		This helper is available only if the kernel was compiled with
+ *		**CONFIG_NF_CONNTRACK=y** configuration option.
+ *	Return
+ *		Pointer to **struct bpf_nf_conn**, or **NULL** in case of
+ *		failure.
+ *
+ * struct bpf_nf_conn *bpf_ct_lookup_udp(void *ctx, struct bpf_nf_conntrack_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
+ *	Description
+ *		Look for UDP nf_conntrack entry matching *tuple*, optionally in
+ *		a child network namespace *netns*. The return value must be
+ *		checked, and if non-**NULL**, released via
+ *		**bpf_ct_release**\ ().
+ *
+ *		The *ctx* should point to the context of the program, such as
+ *		the skb or xdp_md (depending on the hook in use). This is used
+ *		to determine the base network namespace for the lookup.
+ *
+ *		*tuple_size* must be one of:
+ *
+ *		**sizeof**\ (*tuple*\ **->ipv4**)
+ *			Look for an IPv4 nf_conn.
+ *		**sizeof**\ (*tuple*\ **->ipv6**)
+ *			Look for an IPv6 nf_conn.
+ *
+ *		If *netns* is a negative signed 32-bit integer, then the
+ *		nf_conn lookup table in the netns associated with the *ctx*
+ *		will be used. For the TC hooks, this is the netns of the device
+ *		in the skb. For XDP hooks, this is the netns of the device in
+ *		the xdp_md. If *netns* is any other signed 32-bit value greater
+ *		than or equal to zero then it specifies the ID of the netns
+ *		relative to the netns associated with the *ctx*. *netns* values
+ *		beyond the range of 32-bit integers are reserved for future
+ *		use.
+ *
+ *		All values for *flags* are reserved for future usage, and must
+ *		be left at zero.
+ *
+ *		This helper is available only if the kernel was compiled with
+ *		**CONFIG_NF_CONNTRACK=y** configuration option.
+ *	Return
+ *		Pointer to **struct bpf_nf_conn**, or **NULL** in case of
+ *		failure.
+ *
+ * int bpf_ct_release(struct bpf_nf_conn *ct)
+ *	Description
+ *		Release the reference held by *ct*. *ct* must be a
+ *		non-**NULL** pointer that was returned from
+ *		**bpf_ct_lookup_xxx**\ ().
+ *	Return
+ *		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3004,7 +3086,10 @@ struct bpf_stack_build_id {
 	FN(probe_read_user_str),	\
 	FN(probe_read_kernel_str),	\
 	FN(tcp_send_ack),		\
-	FN(send_signal_thread),
+	FN(send_signal_thread),		\
+	FN(ct_lookup_tcp),		\
+	FN(ct_lookup_udp),		\
+	FN(ct_release),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -3278,6 +3363,30 @@ struct bpf_sock_tuple {
 	};
 };
 
+struct bpf_nf_conn {
+	__u32 cpu;
+	__u32 mark;
+	__u32 status;
+	__u32 timeout;
+};
+
+struct bpf_nf_conntrack_tuple {
+	union {
+		struct {
+			__be32 saddr;
+			__be32 daddr;
+			__be16 sport;
+			__be16 dport;
+		} ipv4;
+		struct {
+			__be32 saddr[4];
+			__be32 daddr[4];
+			__be16 sport;
+			__be16 dport;
+		} ipv6;
+	};
+};
+
 struct bpf_xdp_sock {
 	__u32 queue_id;
 };
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 87eaa49..7569db2 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -294,6 +294,24 @@ static void bpf_fill_scale(struct bpf_test *self)
 	}
 }
 
+/* BPF_CT_LOOKUP contains 13 instructions, if you need to fix up maps */
+#define BPF_CT_LOOKUP(func)						\
+	/* struct bpf_nf_conntrack_tuple tuple = {} */			\
+	BPF_MOV64_IMM(BPF_REG_2, 0),					\
+	BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8),			\
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -16),		\
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -24),		\
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -32),		\
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -40),		\
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -48),		\
+	/* ct = func(ctx, &tuple, sizeof tuple, 0, 0) */		\
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),				\
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -48),				\
+	BPF_MOV64_IMM(BPF_REG_3, sizeof(struct bpf_nf_conntrack_tuple)),\
+	BPF_MOV64_IMM(BPF_REG_4, 0),					\
+	BPF_MOV64_IMM(BPF_REG_5, 0),					\
+	BPF_EMIT_CALL(BPF_FUNC_ ## func)
+
 /* BPF_SK_LOOKUP contains 13 instructions, if you need to fix up maps */
 #define BPF_SK_LOOKUP(func)						\
 	/* struct bpf_sock_tuple tuple = {} */				\
diff --git a/tools/testing/selftests/bpf/verifier/ref_tracking.c b/tools/testing/selftests/bpf/verifier/ref_tracking.c
index 604b461..de5c550a 100644
--- a/tools/testing/selftests/bpf/verifier/ref_tracking.c
+++ b/tools/testing/selftests/bpf/verifier/ref_tracking.c
@@ -21,6 +21,17 @@
 	.result = REJECT,
 },
 {
+	"reference tracking: leak potential reference to nf_conn",
+	.insns = {
+	BPF_CT_LOOKUP(ct_lookup_tcp),
+	BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* leak reference */
+	BPF_EXIT_INSN(),
+	},
+	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	.errstr = "Unreleased reference",
+	.result = REJECT,
+},
+{
 	"reference tracking: leak potential reference on stack",
 	.insns = {
 	BPF_SK_LOOKUP(sk_lookup_tcp),
@@ -72,6 +83,17 @@
 	.result = REJECT,
 },
 {
+	"reference tracking: zero potential reference to nf_conn",
+	.insns = {
+	BPF_CT_LOOKUP(ct_lookup_tcp),
+	BPF_MOV64_IMM(BPF_REG_0, 0), /* leak reference */
+	BPF_EXIT_INSN(),
+	},
+	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	.errstr = "Unreleased reference",
+	.result = REJECT,
+},
+{
 	"reference tracking: copy and zero potential references",
 	.insns = {
 	BPF_SK_LOOKUP(sk_lookup_tcp),
@@ -113,6 +135,20 @@
 	.result = REJECT,
 },
 {
+	"reference tracking: release reference to nf_conn without check",
+	.insns = {
+	BPF_CT_LOOKUP(ct_lookup_tcp),
+	/* reference in r0 may be NULL */
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+	BPF_MOV64_IMM(BPF_REG_2, 0),
+	BPF_EMIT_CALL(BPF_FUNC_ct_release),
+	BPF_EXIT_INSN(),
+	},
+	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	.errstr = "type=nf_conn_or_null expected=nf_conn",
+	.result = REJECT,
+},
+{
 	"reference tracking: release reference",
 	.insns = {
 	BPF_SK_LOOKUP(sk_lookup_tcp),
@@ -137,6 +173,18 @@
 	.result = ACCEPT,
 },
 {
+	"reference tracking: release reference to nf_conn",
+	.insns = {
+	BPF_CT_LOOKUP(ct_lookup_tcp),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+	BPF_EMIT_CALL(BPF_FUNC_ct_release),
+	BPF_EXIT_INSN(),
+	},
+	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	.result = ACCEPT,
+},
+{
 	"reference tracking: release reference 2",
 	.insns = {
 	BPF_SK_LOOKUP(sk_lookup_tcp),
-- 
1.8.3.1



* Re: [PATCH bpf-next] bpf: add bpf_ct_lookup_{tcp,udp}() helpers
  2020-01-18  0:01 [PATCH bpf-next] bpf: add bpf_ct_lookup_{tcp,udp}() helpers Matthew Cover
@ 2020-01-18 11:37 ` kbuild test robot
  2020-01-18 11:58 ` kbuild test robot
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 22+ messages in thread
From: kbuild test robot @ 2020-01-18 11:37 UTC (permalink / raw)
  To: Matthew Cover
  Cc: kbuild-all, Alexei Starovoitov, Daniel Borkmann,
	Martin KaFai Lau, Song Liu, Yonghong Song, Andrii Nakryiko,
	David S. Miller, Shuah Khan, Jakub Kicinski,
	Jesper Dangaard Brouer, John Fastabend, Jakub Sitnicki,
	Quentin Monnet, Matthew Cover, Stanislav Fomichev,
	Andrey Ignatov, Lorenz Bauer, Jiong Wang, netdev, bpf,
	linux-kernel, linux-kselftest

[-- Attachment #1: Type: text/plain, Size: 12160 bytes --]

Hi Matthew,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on bpf-next/master]
[also build test ERROR on next-20200117]
[cannot apply to bpf/master net-next/master net/master linus/master sparc-next/master v5.5-rc6]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:    https://github.com/0day-ci/linux/commits/Matthew-Cover/bpf-add-bpf_ct_lookup_-tcp-udp-helpers/20200118-153032
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: i386-alldefconfig (attached as .config)
compiler: gcc-7 (Debian 7.5.0-3) 7.5.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   ld: init/do_mounts.o: in function `bpf_nf_conn_is_valid_access':
>> do_mounts.c:(.text+0x70): multiple definition of `bpf_nf_conn_is_valid_access'; init/main.o:main.c:(.text+0x80): first defined here
   ld: init/do_mounts.o: in function `bpf_nf_conn_convert_ctx_access':
>> do_mounts.c:(.text+0x80): multiple definition of `bpf_nf_conn_convert_ctx_access'; init/main.o:main.c:(.text+0x90): first defined here
   ld: init/noinitramfs.o: in function `bpf_nf_conn_is_valid_access':
   noinitramfs.c:(.text+0x0): multiple definition of `bpf_nf_conn_is_valid_access'; init/main.o:main.c:(.text+0x80): first defined here
   ld: init/noinitramfs.o: in function `bpf_nf_conn_convert_ctx_access':
   noinitramfs.c:(.text+0x10): multiple definition of `bpf_nf_conn_convert_ctx_access'; init/main.o:main.c:(.text+0x90): first defined here
   ld: arch/x86/entry/common.o: in function `bpf_nf_conn_is_valid_access':
   common.c:(.text+0x2b0): multiple definition of `bpf_nf_conn_is_valid_access'; init/main.o:main.c:(.text+0x80): first defined here
   ld: arch/x86/entry/common.o: in function `bpf_nf_conn_convert_ctx_access':
   common.c:(.text+0x2c0): multiple definition of `bpf_nf_conn_convert_ctx_access'; init/main.o:main.c:(.text+0x90): first defined here
   ld: arch/x86/events/core.o: in function `bpf_nf_conn_is_valid_access':
   core.c:(.text+0xbe0): multiple definition of `bpf_nf_conn_is_valid_access'; init/main.o:main.c:(.text+0x80): first defined here
   ld: arch/x86/events/core.o: in function `bpf_nf_conn_convert_ctx_access':
   core.c:(.text+0xbf0): multiple definition of `bpf_nf_conn_convert_ctx_access'; init/main.o:main.c:(.text+0x90): first defined here
   ld: arch/x86/events/amd/core.o: in function `bpf_nf_conn_is_valid_access':
   core.c:(.text+0x8f0): multiple definition of `bpf_nf_conn_is_valid_access'; init/main.o:main.c:(.text+0x80): first defined here
   ld: arch/x86/events/amd/core.o: in function `bpf_nf_conn_convert_ctx_access':
   core.c:(.text+0x900): multiple definition of `bpf_nf_conn_convert_ctx_access'; init/main.o:main.c:(.text+0x90): first defined here
   ld: arch/x86/events/amd/uncore.o: in function `bpf_nf_conn_is_valid_access':
   uncore.c:(.text+0x8d0): multiple definition of `bpf_nf_conn_is_valid_access'; init/main.o:main.c:(.text+0x80): first defined here
   ld: arch/x86/events/amd/uncore.o: in function `bpf_nf_conn_convert_ctx_access':
   uncore.c:(.text+0x8e0): multiple definition of `bpf_nf_conn_convert_ctx_access'; init/main.o:main.c:(.text+0x90): first defined here
   ld: arch/x86/events/intel/core.o: in function `bpf_nf_conn_is_valid_access':
   core.c:(.text+0x1d40): multiple definition of `bpf_nf_conn_is_valid_access'; init/main.o:main.c:(.text+0x80): first defined here
   ld: arch/x86/events/intel/core.o: in function `bpf_nf_conn_convert_ctx_access':
   core.c:(.text+0x1d50): multiple definition of `bpf_nf_conn_convert_ctx_access'; init/main.o:main.c:(.text+0x90): first defined here
   ld: arch/x86/events/intel/bts.o: in function `bpf_nf_conn_is_valid_access':
   bts.c:(.text+0x9c0): multiple definition of `bpf_nf_conn_is_valid_access'; init/main.o:main.c:(.text+0x80): first defined here
   ld: arch/x86/events/intel/bts.o: in function `bpf_nf_conn_convert_ctx_access':
   bts.c:(.text+0x9d0): multiple definition of `bpf_nf_conn_convert_ctx_access'; init/main.o:main.c:(.text+0x90): first defined here
   ld: arch/x86/events/intel/ds.o: in function `bpf_nf_conn_is_valid_access':
   ds.c:(.text+0x1920): multiple definition of `bpf_nf_conn_is_valid_access'; init/main.o:main.c:(.text+0x80): first defined here
   ld: arch/x86/events/intel/ds.o: in function `bpf_nf_conn_convert_ctx_access':
   ds.c:(.text+0x1930): multiple definition of `bpf_nf_conn_convert_ctx_access'; init/main.o:main.c:(.text+0x90): first defined here
   ld: arch/x86/events/intel/knc.o: in function `bpf_nf_conn_is_valid_access':
   knc.c:(.text+0x340): multiple definition of `bpf_nf_conn_is_valid_access'; init/main.o:main.c:(.text+0x80): first defined here
   ld: arch/x86/events/intel/knc.o: in function `bpf_nf_conn_convert_ctx_access':
   knc.c:(.text+0x350): multiple definition of `bpf_nf_conn_convert_ctx_access'; init/main.o:main.c:(.text+0x90): first defined here
   ld: arch/x86/events/intel/lbr.o: in function `bpf_nf_conn_is_valid_access':
   lbr.c:(.text+0x680): multiple definition of `bpf_nf_conn_is_valid_access'; init/main.o:main.c:(.text+0x80): first defined here
   ld: arch/x86/events/intel/lbr.o: in function `bpf_nf_conn_convert_ctx_access':
   lbr.c:(.text+0x690): multiple definition of `bpf_nf_conn_convert_ctx_access'; init/main.o:main.c:(.text+0x90): first defined here
   ld: arch/x86/events/intel/p4.o: in function `bpf_nf_conn_is_valid_access':
   p4.c:(.text+0x7d0): multiple definition of `bpf_nf_conn_is_valid_access'; init/main.o:main.c:(.text+0x80): first defined here
   ld: arch/x86/events/intel/p4.o: in function `bpf_nf_conn_convert_ctx_access':
   p4.c:(.text+0x7e0): multiple definition of `bpf_nf_conn_convert_ctx_access'; init/main.o:main.c:(.text+0x90): first defined here
   ld: arch/x86/events/intel/p6.o: in function `bpf_nf_conn_is_valid_access':
   p6.c:(.text+0x170): multiple definition of `bpf_nf_conn_is_valid_access'; init/main.o:main.c:(.text+0x80): first defined here
   ld: arch/x86/events/intel/p6.o: in function `bpf_nf_conn_convert_ctx_access':
   p6.c:(.text+0x180): multiple definition of `bpf_nf_conn_convert_ctx_access'; init/main.o:main.c:(.text+0x90): first defined here
   ld: arch/x86/events/intel/pt.o: in function `bpf_nf_conn_is_valid_access':
   pt.c:(.text+0x1a70): multiple definition of `bpf_nf_conn_is_valid_access'; init/main.o:main.c:(.text+0x80): first defined here
   ld: arch/x86/events/intel/pt.o: in function `bpf_nf_conn_convert_ctx_access':
   pt.c:(.text+0x1a80): multiple definition of `bpf_nf_conn_convert_ctx_access'; init/main.o:main.c:(.text+0x90): first defined here
   ld: arch/x86/kernel/process_32.o: in function `bpf_nf_conn_is_valid_access':
   process_32.c:(.text+0x0): multiple definition of `bpf_nf_conn_is_valid_access'; init/main.o:main.c:(.text+0x80): first defined here
   ld: arch/x86/kernel/process_32.o: in function `bpf_nf_conn_convert_ctx_access':
   process_32.c:(.text+0x10): multiple definition of `bpf_nf_conn_convert_ctx_access'; init/main.o:main.c:(.text+0x90): first defined here
   ld: arch/x86/kernel/signal.o: in function `bpf_nf_conn_is_valid_access':
   signal.c:(.text+0x270): multiple definition of `bpf_nf_conn_is_valid_access'; init/main.o:main.c:(.text+0x80): first defined here
   ld: arch/x86/kernel/signal.o: in function `bpf_nf_conn_convert_ctx_access':
   signal.c:(.text+0x280): multiple definition of `bpf_nf_conn_convert_ctx_access'; init/main.o:main.c:(.text+0x90): first defined here
   ld: arch/x86/kernel/ioport.o: in function `bpf_nf_conn_is_valid_access':
   ioport.c:(.text+0x40): multiple definition of `bpf_nf_conn_is_valid_access'; init/main.o:main.c:(.text+0x80): first defined here
   ld: arch/x86/kernel/ioport.o: in function `bpf_nf_conn_convert_ctx_access':
   ioport.c:(.text+0x50): multiple definition of `bpf_nf_conn_convert_ctx_access'; init/main.o:main.c:(.text+0x90): first defined here
   ld: arch/x86/kernel/ldt.o: in function `bpf_nf_conn_is_valid_access':
   ldt.c:(.text+0x4c0): multiple definition of `bpf_nf_conn_is_valid_access'; init/main.o:main.c:(.text+0x80): first defined here
   ld: arch/x86/kernel/ldt.o: in function `bpf_nf_conn_convert_ctx_access':
   ldt.c:(.text+0x4d0): multiple definition of `bpf_nf_conn_convert_ctx_access'; init/main.o:main.c:(.text+0x90): first defined here
   ld: arch/x86/kernel/setup.o: in function `bpf_nf_conn_is_valid_access':
   setup.c:(.text+0x60): multiple definition of `bpf_nf_conn_is_valid_access'; init/main.o:main.c:(.text+0x80): first defined here
   ld: arch/x86/kernel/setup.o: in function `bpf_nf_conn_convert_ctx_access':
   setup.c:(.text+0x70): multiple definition of `bpf_nf_conn_convert_ctx_access'; init/main.o:main.c:(.text+0x90): first defined here
   ld: arch/x86/kernel/e820.o: in function `bpf_nf_conn_is_valid_access':
   e820.c:(.text+0x0): multiple definition of `bpf_nf_conn_is_valid_access'; init/main.o:main.c:(.text+0x80): first defined here
   ld: arch/x86/kernel/e820.o: in function `bpf_nf_conn_convert_ctx_access':
   e820.c:(.text+0x10): multiple definition of `bpf_nf_conn_convert_ctx_access'; init/main.o:main.c:(.text+0x90): first defined here
   ld: arch/x86/kernel/hw_breakpoint.o: in function `bpf_nf_conn_is_valid_access':
   hw_breakpoint.c:(.text+0x0): multiple definition of `bpf_nf_conn_is_valid_access'; init/main.o:main.c:(.text+0x80): first defined here
   ld: arch/x86/kernel/hw_breakpoint.o: in function `bpf_nf_conn_convert_ctx_access':
   hw_breakpoint.c:(.text+0x10): multiple definition of `bpf_nf_conn_convert_ctx_access'; init/main.o:main.c:(.text+0x90): first defined here
   ld: arch/x86/kernel/process.o: in function `bpf_nf_conn_is_valid_access':
   process.c:(.text+0xe0): multiple definition of `bpf_nf_conn_is_valid_access'; init/main.o:main.c:(.text+0x80): first defined here
   ld: arch/x86/kernel/process.o: in function `bpf_nf_conn_convert_ctx_access':
   process.c:(.text+0xf0): multiple definition of `bpf_nf_conn_convert_ctx_access'; init/main.o:main.c:(.text+0x90): first defined here
   ld: arch/x86/kernel/ptrace.o: in function `bpf_nf_conn_is_valid_access':
   ptrace.c:(.text+0x690): multiple definition of `bpf_nf_conn_is_valid_access'; init/main.o:main.c:(.text+0x80): first defined here
   ld: arch/x86/kernel/ptrace.o: in function `bpf_nf_conn_convert_ctx_access':
   ptrace.c:(.text+0x6a0): multiple definition of `bpf_nf_conn_convert_ctx_access'; init/main.o:main.c:(.text+0x90): first defined here
   ld: arch/x86/kernel/tls.o: in function `bpf_nf_conn_is_valid_access':
   tls.c:(.text+0x2b0): multiple definition of `bpf_nf_conn_is_valid_access'; init/main.o:main.c:(.text+0x80): first defined here
   ld: arch/x86/kernel/tls.o: in function `bpf_nf_conn_convert_ctx_access':
   tls.c:(.text+0x2c0): multiple definition of `bpf_nf_conn_convert_ctx_access'; init/main.o:main.c:(.text+0x90): first defined here
   ld: arch/x86/kernel/cpu/umwait.o: in function `bpf_nf_conn_is_valid_access':
   umwait.c:(.text+0x210): multiple definition of `bpf_nf_conn_is_valid_access'; init/main.o:main.c:(.text+0x80): first defined here
   ld: arch/x86/kernel/cpu/umwait.o: in function `bpf_nf_conn_convert_ctx_access':
   umwait.c:(.text+0x220): multiple definition of `bpf_nf_conn_convert_ctx_access'; init/main.o:main.c:(.text+0x90): first defined here
   ld: arch/x86/kernel/reboot.o: in function `bpf_nf_conn_is_valid_access':
   reboot.c:(.text+0xb0): multiple definition of `bpf_nf_conn_is_valid_access'; init/main.o:main.c:(.text+0x80): first defined here
   ld: arch/x86/kernel/reboot.o: in function `bpf_nf_conn_convert_ctx_access':
   reboot.c:(.text+0xc0): multiple definition of `bpf_nf_conn_convert_ctx_access'; init/main.o:main.c:(.text+0x90): first defined here

---
0-DAY kernel test infrastructure                 Open Source Technology Center
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 10539 bytes --]


* Re: [PATCH bpf-next] bpf: add bpf_ct_lookup_{tcp,udp}() helpers
  2020-01-18  0:01 [PATCH bpf-next] bpf: add bpf_ct_lookup_{tcp,udp}() helpers Matthew Cover
  2020-01-18 11:37 ` kbuild test robot
@ 2020-01-18 11:58 ` kbuild test robot
  2020-01-19  3:05 ` John Fastabend
  2020-01-21 20:20 ` [PATCH bpf-next v2 1/2] " Matthew Cover
  3 siblings, 0 replies; 22+ messages in thread
From: kbuild test robot @ 2020-01-18 11:58 UTC (permalink / raw)
  To: Matthew Cover
  Cc: kbuild-all, Alexei Starovoitov, Daniel Borkmann,
	Martin KaFai Lau, Song Liu, Yonghong Song, Andrii Nakryiko,
	David S. Miller, Shuah Khan, Jakub Kicinski,
	Jesper Dangaard Brouer, John Fastabend, Jakub Sitnicki,
	Quentin Monnet, Matthew Cover, Stanislav Fomichev,
	Andrey Ignatov, Lorenz Bauer, Jiong Wang, netdev, bpf,
	linux-kernel, linux-kselftest

[-- Attachment #1: Type: text/plain, Size: 8807 bytes --]

Hi Matthew,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on bpf-next/master]
[also build test ERROR on next-20200117]
[cannot apply to bpf/master net-next/master net/master linus/master sparc-next/master v5.5-rc6]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:    https://github.com/0day-ci/linux/commits/Matthew-Cover/bpf-add-bpf_ct_lookup_-tcp-udp-helpers/20200118-153032
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: powerpc-warp_defconfig (attached as .config)
compiler: powerpc-linux-gcc (GCC) 7.5.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        GCC_VERSION=7.5.0 make.cross ARCH=powerpc 

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   init/do_mounts.o: In function `bpf_nf_conn_is_valid_access':
>> do_mounts.c:(.text+0x554): multiple definition of `bpf_nf_conn_is_valid_access'
   init/main.o:main.c:(.text+0x198): first defined here
   init/do_mounts.o: In function `bpf_nf_conn_convert_ctx_access':
>> do_mounts.c:(.text+0x564): multiple definition of `bpf_nf_conn_convert_ctx_access'
   init/main.o:main.c:(.text+0x1a8): first defined here
   init/do_mounts_rd.o: In function `bpf_nf_conn_is_valid_access':
   do_mounts_rd.c:(.text+0x0): multiple definition of `bpf_nf_conn_is_valid_access'
   init/main.o:main.c:(.text+0x198): first defined here
   init/do_mounts_rd.o: In function `bpf_nf_conn_convert_ctx_access':
   do_mounts_rd.c:(.text+0x10): multiple definition of `bpf_nf_conn_convert_ctx_access'
   init/main.o:main.c:(.text+0x1a8): first defined here
   init/do_mounts_initrd.o: In function `bpf_nf_conn_is_valid_access':
   do_mounts_initrd.c:(.text+0x70): multiple definition of `bpf_nf_conn_is_valid_access'
   init/main.o:main.c:(.text+0x198): first defined here
   init/do_mounts_initrd.o: In function `bpf_nf_conn_convert_ctx_access':
   do_mounts_initrd.c:(.text+0x80): multiple definition of `bpf_nf_conn_convert_ctx_access'
   init/main.o:main.c:(.text+0x1a8): first defined here
   init/initramfs.o: In function `bpf_nf_conn_is_valid_access':
   initramfs.c:(.text+0x0): multiple definition of `bpf_nf_conn_is_valid_access'
   init/main.o:main.c:(.text+0x198): first defined here
   init/initramfs.o: In function `bpf_nf_conn_convert_ctx_access':
   initramfs.c:(.text+0x10): multiple definition of `bpf_nf_conn_convert_ctx_access'
   init/main.o:main.c:(.text+0x1a8): first defined here
   arch/powerpc/kernel/ptrace.o: In function `bpf_nf_conn_is_valid_access':
   ptrace.c:(.text+0x87c): multiple definition of `bpf_nf_conn_is_valid_access'
   init/main.o:main.c:(.text+0x198): first defined here
   arch/powerpc/kernel/ptrace.o: In function `bpf_nf_conn_convert_ctx_access':
   ptrace.c:(.text+0x88c): multiple definition of `bpf_nf_conn_convert_ctx_access'
   init/main.o:main.c:(.text+0x1a8): first defined here
   arch/powerpc/kernel/syscalls.o: In function `bpf_nf_conn_is_valid_access':
   syscalls.c:(.text+0x0): multiple definition of `bpf_nf_conn_is_valid_access'
   init/main.o:main.c:(.text+0x198): first defined here
   arch/powerpc/kernel/syscalls.o: In function `bpf_nf_conn_convert_ctx_access':
   syscalls.c:(.text+0x10): multiple definition of `bpf_nf_conn_convert_ctx_access'
   init/main.o:main.c:(.text+0x1a8): first defined here
   arch/powerpc/kernel/align.o: In function `bpf_nf_conn_is_valid_access':
   align.c:(.text+0x0): multiple definition of `bpf_nf_conn_is_valid_access'
   init/main.o:main.c:(.text+0x198): first defined here
   arch/powerpc/kernel/align.o: In function `bpf_nf_conn_convert_ctx_access':
   align.c:(.text+0x10): multiple definition of `bpf_nf_conn_convert_ctx_access'
   init/main.o:main.c:(.text+0x1a8): first defined here
   arch/powerpc/kernel/signal_32.o: In function `bpf_nf_conn_is_valid_access':
   signal_32.c:(.text+0x2fc): multiple definition of `bpf_nf_conn_is_valid_access'
   init/main.o:main.c:(.text+0x198): first defined here
   arch/powerpc/kernel/signal_32.o: In function `bpf_nf_conn_convert_ctx_access':
   signal_32.c:(.text+0x30c): multiple definition of `bpf_nf_conn_convert_ctx_access'
   init/main.o:main.c:(.text+0x1a8): first defined here
   arch/powerpc/kernel/process.o: In function `bpf_nf_conn_is_valid_access':
   process.c:(.text+0x55c): multiple definition of `bpf_nf_conn_is_valid_access'
   init/main.o:main.c:(.text+0x198): first defined here
   arch/powerpc/kernel/process.o: In function `bpf_nf_conn_convert_ctx_access':
   process.c:(.text+0x56c): multiple definition of `bpf_nf_conn_convert_ctx_access'
   init/main.o:main.c:(.text+0x1a8): first defined here
   arch/powerpc/kernel/signal.o: In function `bpf_nf_conn_is_valid_access':
   signal.c:(.text+0x0): multiple definition of `bpf_nf_conn_is_valid_access'
   init/main.o:main.c:(.text+0x198): first defined here
   arch/powerpc/kernel/signal.o: In function `bpf_nf_conn_convert_ctx_access':
   signal.c:(.text+0x10): multiple definition of `bpf_nf_conn_convert_ctx_access'
   init/main.o:main.c:(.text+0x1a8): first defined here
   arch/powerpc/kernel/time.o: In function `bpf_nf_conn_is_valid_access':
   time.c:(.text+0x43c): multiple definition of `bpf_nf_conn_is_valid_access'
   init/main.o:main.c:(.text+0x198): first defined here
   arch/powerpc/kernel/time.o: In function `bpf_nf_conn_convert_ctx_access':
   time.c:(.text+0x44c): multiple definition of `bpf_nf_conn_convert_ctx_access'
   init/main.o:main.c:(.text+0x1a8): first defined here
   arch/powerpc/kernel/traps.o: In function `bpf_nf_conn_is_valid_access':
   traps.c:(.text+0x148): multiple definition of `bpf_nf_conn_is_valid_access'
   init/main.o:main.c:(.text+0x198): first defined here
   arch/powerpc/kernel/traps.o: In function `bpf_nf_conn_convert_ctx_access':
   traps.c:(.text+0x158): multiple definition of `bpf_nf_conn_convert_ctx_access'
   init/main.o:main.c:(.text+0x1a8): first defined here
   arch/powerpc/kernel/setup-common.o: In function `bpf_nf_conn_is_valid_access':
   setup-common.c:(.text+0x66c): multiple definition of `bpf_nf_conn_is_valid_access'
   init/main.o:main.c:(.text+0x198): first defined here
   arch/powerpc/kernel/setup-common.o: In function `bpf_nf_conn_convert_ctx_access':
   setup-common.c:(.text+0x67c): multiple definition of `bpf_nf_conn_convert_ctx_access'
   init/main.o:main.c:(.text+0x1a8): first defined here
   arch/powerpc/kernel/prom_parse.o: In function `bpf_nf_conn_is_valid_access':
   prom_parse.c:(.text+0x0): multiple definition of `bpf_nf_conn_is_valid_access'
   init/main.o:main.c:(.text+0x198): first defined here
   arch/powerpc/kernel/prom_parse.o: In function `bpf_nf_conn_convert_ctx_access':
   prom_parse.c:(.text+0x10): multiple definition of `bpf_nf_conn_convert_ctx_access'
   init/main.o:main.c:(.text+0x1a8): first defined here
   arch/powerpc/mm/fault.o: In function `bpf_nf_conn_is_valid_access':
   fault.c:(.text+0x148): multiple definition of `bpf_nf_conn_is_valid_access'
   init/main.o:main.c:(.text+0x198): first defined here
   arch/powerpc/mm/fault.o: In function `bpf_nf_conn_convert_ctx_access':
   fault.c:(.text+0x158): multiple definition of `bpf_nf_conn_convert_ctx_access'
   init/main.o:main.c:(.text+0x1a8): first defined here
   arch/powerpc/mm/mem.o: In function `bpf_nf_conn_is_valid_access':
   mem.c:(.text+0x208): multiple definition of `bpf_nf_conn_is_valid_access'
   init/main.o:main.c:(.text+0x198): first defined here
   arch/powerpc/mm/mem.o: In function `bpf_nf_conn_convert_ctx_access':
   mem.c:(.text+0x218): multiple definition of `bpf_nf_conn_convert_ctx_access'
   init/main.o:main.c:(.text+0x1a8): first defined here
   arch/powerpc/mm/pgtable.o: In function `bpf_nf_conn_is_valid_access':
   pgtable.c:(.text+0xa0): multiple definition of `bpf_nf_conn_is_valid_access'
   init/main.o:main.c:(.text+0x198): first defined here
   arch/powerpc/mm/pgtable.o: In function `bpf_nf_conn_convert_ctx_access':
   pgtable.c:(.text+0xb0): multiple definition of `bpf_nf_conn_convert_ctx_access'
   init/main.o:main.c:(.text+0x1a8): first defined here
   arch/powerpc/mm/init_32.o: In function `bpf_nf_conn_is_valid_access':
   init_32.c:(.text+0x0): multiple definition of `bpf_nf_conn_is_valid_access'
   init/main.o:main.c:(.text+0x198): first defined here

---
0-DAY kernel test infrastructure                 Open Source Technology Center
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 16158 bytes --]

^ permalink raw reply	[flat|nested] 22+ messages in thread

* RE: [PATCH bpf-next] bpf: add bpf_ct_lookup_{tcp,udp}() helpers
  2020-01-18  0:01 [PATCH bpf-next] bpf: add bpf_ct_lookup_{tcp,udp}() helpers Matthew Cover
  2020-01-18 11:37 ` kbuild test robot
  2020-01-18 11:58 ` kbuild test robot
@ 2020-01-19  3:05 ` John Fastabend
  2020-01-20 18:11   ` Matt Cover
  2020-01-21 20:20 ` [PATCH bpf-next v2 1/2] " Matthew Cover
  3 siblings, 1 reply; 22+ messages in thread
From: John Fastabend @ 2020-01-19  3:05 UTC (permalink / raw)
  To: Matthew Cover, Alexei Starovoitov, Daniel Borkmann,
	Martin KaFai Lau, Song Liu, Yonghong Song, Andrii Nakryiko,
	David S. Miller, Shuah Khan, Jakub Kicinski,
	Jesper Dangaard Brouer, John Fastabend, Jakub Sitnicki,
	Quentin Monnet, Matthew Cover, Stanislav Fomichev,
	Andrey Ignatov, Lorenz Bauer, Jiong Wang, netdev, bpf,
	linux-kernel, linux-kselftest

Matthew Cover wrote:
> Allow looking up an nf_conn. This allows eBPF programs to leverage
> nf_conntrack state for similar purposes to socket state use cases,
> as provided by the socket lookup helpers. This is particularly
> useful when nf_conntrack state is locally available, but socket
> state is not.
> 
> Signed-off-by: Matthew Cover <matthew.cover@stackpath.com>
> ---

Couple coding comments below. Also looks like a couple build errors
so fix those up. I'm still thinking over this though.

Also I prefer the tests in their own patch. So make it a two patch
series.

fwiw I think we could build a native xdp lib for connection tracking
but maybe there are reasons to pull in core conn tracking. Seems like 
a separate discussion.

> + * struct bpf_nf_conn *bpf_ct_lookup_udp(void *ctx, struct bpf_nf_conntrack_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
> + *	Description
> + *		Look for UDP nf_conntrack entry matching *tuple*, optionally in
> + *		a child network namespace *netns*. The return value must be
> + *		checked, and if non-**NULL**, released via
> + *		**bpf_ct_release**\ ().
> + *
> + *		The *ctx* should point to the context of the program, such as
> + *		the skb or xdp_md (depending on the hook in use). This is used
> + *		to determine the base network namespace for the lookup.
> + *
> + *		*tuple_size* must be one of:
> + *
> + *		**sizeof**\ (*tuple*\ **->ipv4**)
> + *			Look for an IPv4 nf_conn.
> + *		**sizeof**\ (*tuple*\ **->ipv6**)
> + *			Look for an IPv6 nf_conn.
> + *
> + *		If *netns* is a negative signed 32-bit integer, then the
> + *		nf_conn lookup table in the netns associated with the *ctx*
> + *		will be used. For the TC hooks, this is the netns of the device
> + *		in the skb. For XDP hooks, this is the netns of the device in
> + *		the xdp_md. If *netns* is any other signed 32-bit value greater
> + *		than or equal to zero then it specifies the ID of the netns
> + *		relative to the netns associated with the *ctx*. *netns* values
> + *		beyond the range of 32-bit integers are reserved for future
> + *		use.

I find the usage of netns a bit awkward. It's being passed as a u64 and
then used as a signed int, with the pivot depending on it being negative?

How about pivoting on a flag instead of the sign bit of netns here?
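For illustration, a rough sketch of that flag-based pivot (BPF_F_CT_NETNS_ID
is a made-up name here, nothing defines it), reusing the locals from
__bpf_ct_lookup() below:

	#define BPF_F_CT_NETNS_ID	(1ULL << 0)

	/* reject any unknown flag bits */
	if (flags & ~BPF_F_CT_NETNS_ID)
		goto out;

	if (flags & BPF_F_CT_NETNS_ID) {
		/* netns_id is a plain namespace id, no sign-bit games */
		net = get_net_ns_by_id(caller_net, netns_id);
		if (unlikely(!net))
			goto out;
		ct = ct_lookup(net, tuple, family, proto);
		put_net(net);
	} else {
		ct = ct_lookup(caller_net, tuple, family, proto);
	}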

> + *
> + *		All values for *flags* are reserved for future usage, and must
> + *		be left at zero.
> + *
> + *		This helper is available only if the kernel was compiled with
> + *		**CONFIG_NF_CONNTRACK=y** configuration option.

I suspect this should be,

"This helper will return NULL if the kernel was compiled with ..."

Same comment for the earlier _tcp helper.

> + *	Return
> + *		Pointer to **struct bpf_nf_conn**, or **NULL** in case of
> + *		failure.
> + *
> + * int bpf_ct_release(struct bpf_nf_conn *ct)
> + *	Description
> + *		Release the reference held by *ct*. *ct* must be a
> + *		non-**NULL** pointer that was returned from
> + *		**bpf_ct_lookup_xxx**\ ().
> + *	Return
> + *		0 on success, or a negative error in case of failure.
>   */
>  #define __BPF_FUNC_MAPPER(FN)		\
>  	FN(unspec),			\

[...]
  
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
> @@ -3278,6 +3363,30 @@ struct bpf_sock_tuple {
>  	};
>  };
>  
> +struct bpf_nf_conn {
> +	__u32 cpu;
> +	__u32 mark;
> +	__u32 status;
> +	__u32 timeout;
> +};
> +
> +struct bpf_nf_conntrack_tuple {
> +	union {
> +		struct {
> +			__be32 saddr;
> +			__be32 daddr;
> +			__be16 sport;
> +			__be16 dport;
> +		} ipv4;
> +		struct {
> +			__be32 saddr[4];
> +			__be32 daddr[4];
> +			__be16 sport;
> +			__be16 dport;
> +		} ipv6;
> +	};
> +};
> +

[...]

> +static int check_nf_ct_access(struct bpf_verifier_env *env, int insn_idx,
> +			     u32 regno, int off, int size,
> +			     enum bpf_access_type t)
> +{
> +	struct bpf_reg_state *regs = cur_regs(env);
> +	struct bpf_reg_state *reg = &regs[regno];
> +	struct bpf_insn_access_aux info = {};
> +	bool valid;
> +
> +	switch (reg->type) {
> +	case PTR_TO_NF_CONN:
> +		valid = bpf_nf_conn_is_valid_access(off, size, t, &info);
> +		break;
> +	default:
> +		valid = false;
> +	}
> +
> +	if (valid) {
> +		env->insn_aux_data[insn_idx].ctx_field_size =
> +			info.ctx_field_size;
> +		return 0;
> +	}
> +
> +	verbose(env, "R%d invalid %s access off=%d size=%d\n",
> +		regno, reg_type_str[reg->type], off, size);
> +
> +	return -EACCES;

nit, but this construction feels odd to me. How about,

 if (reg->type != PTR_TO_NF_CONN) {
	verbose(...)
	return -EACCES;
 }

 env-> ...
 return 0;

The switch sort of implies you have some ideas on future types? What would
those be?

> +}
> +
>  static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
>  			     u32 regno, int off, int size,
>  			     enum bpf_access_type t)
> @@ -2511,6 +2556,13 @@ static bool is_ctx_reg(struct bpf_verifier_env *env, int regno)
>  	return reg->type == PTR_TO_CTX;
>  }

[...]


> diff --git a/net/core/filter.c b/net/core/filter.c
> index 17de674..39ba965 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -74,6 +74,12 @@

[...]

> +static struct nf_conn *
> +__bpf_ct_lookup(struct sk_buff *skb, struct bpf_nf_conntrack_tuple *tuple, u32 len,
> +		struct net *caller_net, u8 proto, u64 netns_id, u64 flags)

Why not just make netns an int instead of pulling an unsigned value from the
helper and then converting it into an int?

> +{
> +	struct nf_conn *ct = NULL;
> +	u8 family = AF_UNSPEC;
> +	struct net *net;
> +
> +	if (len == sizeof(tuple->ipv4))
> +		family = AF_INET;
> +	else if (len == sizeof(tuple->ipv6))
> +		family = AF_INET6;
> +	else
> +		goto out;
> +
> +	if (unlikely(family == AF_UNSPEC || flags ||
> +		     !((s32)netns_id < 0 || netns_id <= S32_MAX)))
                                            ^^^^^^^^^^^^^^^^^^^^
If you pass an int here and use flags to set the type, I think you avoid this
check.

> +		goto out;
> +
> +	if ((s32)netns_id < 0) {

I don't like this casting here, again fallout from the u64->int conversion.

> +		net = caller_net;
> +		ct = ct_lookup(net, tuple, family, proto);
> +	} else {
> +		net = get_net_ns_by_id(caller_net, netns_id);
> +		if (unlikely(!net))
> +			goto out;
> +		ct = ct_lookup(net, tuple, family, proto);
> +		put_net(net);
> +	}
> +
> +out:
> +	return ct;
> +}
> +

[...]

Thanks!
John

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH bpf-next] bpf: add bpf_ct_lookup_{tcp,udp}() helpers
  2020-01-19  3:05 ` John Fastabend
@ 2020-01-20 18:11   ` Matt Cover
  2020-01-20 20:10     ` Matt Cover
  0 siblings, 1 reply; 22+ messages in thread
From: Matt Cover @ 2020-01-20 18:11 UTC (permalink / raw)
  To: John Fastabend
  Cc: Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau, Song Liu,
	Yonghong Song, Andrii Nakryiko, David S. Miller, Shuah Khan,
	Jakub Kicinski, Jesper Dangaard Brouer, Jakub Sitnicki,
	Quentin Monnet, Matthew Cover, Stanislav Fomichev,
	Andrey Ignatov, Lorenz Bauer, Jiong Wang, netdev, bpf,
	linux-kernel, linux-kselftest

On Sat, Jan 18, 2020 at 8:05 PM John Fastabend <john.fastabend@gmail.com> wrote:
>
> Matthew Cover wrote:
> > Allow looking up an nf_conn. This allows eBPF programs to leverage
> > nf_conntrack state for similar purposes to socket state use cases,
> > as provided by the socket lookup helpers. This is particularly
> > useful when nf_conntrack state is locally available, but socket
> > state is not.
> >
> > Signed-off-by: Matthew Cover <matthew.cover@stackpath.com>
> > ---
>
> Couple coding comments below. Also looks like a couple build errors
> so fix those up. I'm still thinking over this though.

Thank you for taking the time to look this over. I will be looking
into the build issues.

>
> Also I prefer the tests in their own patch. So make it a two patch
> series.

Sounds good. I will submit as a series for v2.

>
> fwiw I think we could build a native xdp lib for connection tracking
> but maybe there are reasons to pull in core conn tracking. Seems like
> a separate discussion.

Native xdp connection tracking would be cool as well. Cilium seems to
have ebpf conntrack; perhaps it can provide some useful insights into
that effort.

Even with native xdp connection tracking available, I see value in
these helpers, particularly when core conntrack is already in use.

>
> > + * struct bpf_nf_conn *bpf_ct_lookup_udp(void *ctx, struct bpf_nf_conntrack_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
> > + *   Description
> > + *           Look for UDP nf_conntrack entry matching *tuple*, optionally in
> > + *           a child network namespace *netns*. The return value must be
> > + *           checked, and if non-**NULL**, released via
> > + *           **bpf_ct_release**\ ().
> > + *
> > + *           The *ctx* should point to the context of the program, such as
> > + *           the skb or xdp_md (depending on the hook in use). This is used
> > + *           to determine the base network namespace for the lookup.
> > + *
> > + *           *tuple_size* must be one of:
> > + *
> > + *           **sizeof**\ (*tuple*\ **->ipv4**)
> > + *                   Look for an IPv4 nf_conn.
> > + *           **sizeof**\ (*tuple*\ **->ipv6**)
> > + *                   Look for an IPv6 nf_conn.
> > + *
> > + *           If *netns* is a negative signed 32-bit integer, then the
> > + *           nf_conn lookup table in the netns associated with the *ctx*
> > + *           will be used. For the TC hooks, this is the netns of the device
> > + *           in the skb. For XDP hooks, this is the netns of the device in
> > + *           the xdp_md. If *netns* is any other signed 32-bit value greater
> > + *           than or equal to zero then it specifies the ID of the netns
> > + *           relative to the netns associated with the *ctx*. *netns* values
> > + *           beyond the range of 32-bit integers are reserved for future
> > + *           use.
>
> I find the usage of netns a bit awkward. It's being passed as a u64 and
> then used as a signed int, with the pivot depending on it being negative?
>
> How about pivoting on a flag instead of the sign bit of netns here?

The interface (and much of the code) is a clone of the
bpf_sk_lookup_xxx helper functions. I figured having it match would
both make it familiar and give this patch a better chance of being
applied.

I'd prefer not to diverge from bpf_sk_lookup_xxx helpers here. That
is my only objection to what you propose.
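
For reference, the existing uapi signature being mirrored:

	struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple,
					   u32 tuple_size, u64 netns, u64 flags)

bpf_ct_lookup_{tcp,udp}() keep the same argument order and the same u64
netns/flags convention, so callers can carry their sk_lookup habits over.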

>
> > + *
> > + *           All values for *flags* are reserved for future usage, and must
> > + *           be left at zero.
> > + *
> > + *           This helper is available only if the kernel was compiled with
> > + *           **CONFIG_NF_CONNTRACK=y** configuration option.
>
> I suspect this should be,
>
> "This helper will return NULL if the kernel was compiled with ..."
>

Good idea. I'll work this into v2 for additional clarity.

> Same comment for the earlier _tcp helper.
>
> > + *   Return
> > + *           Pointer to **struct bpf_nf_conn**, or **NULL** in case of
> > + *           failure.
> > + *
> > + * int bpf_ct_release(struct bpf_nf_conn *ct)
> > + *   Description
> > + *           Release the reference held by *ct*. *ct* must be a
> > + *           non-**NULL** pointer that was returned from
> > + *           **bpf_ct_lookup_xxx**\ ().
> > + *   Return
> > + *           0 on success, or a negative error in case of failure.
> >   */
> >  #define __BPF_FUNC_MAPPER(FN)                \
> >       FN(unspec),                     \
>
> [...]
>
> >  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> >   * function eBPF program intends to call
> > @@ -3278,6 +3363,30 @@ struct bpf_sock_tuple {
> >       };
> >  };
> >
> > +struct bpf_nf_conn {
> > +     __u32 cpu;
> > +     __u32 mark;
> > +     __u32 status;
> > +     __u32 timeout;
> > +};
> > +
> > +struct bpf_nf_conntrack_tuple {
> > +     union {
> > +             struct {
> > +                     __be32 saddr;
> > +                     __be32 daddr;
> > +                     __be16 sport;
> > +                     __be16 dport;
> > +             } ipv4;
> > +             struct {
> > +                     __be32 saddr[4];
> > +                     __be32 daddr[4];
> > +                     __be16 sport;
> > +                     __be16 dport;
> > +             } ipv6;
> > +     };
> > +};
> > +
>
> [...]
>
> > +static int check_nf_ct_access(struct bpf_verifier_env *env, int insn_idx,
> > +                          u32 regno, int off, int size,
> > +                          enum bpf_access_type t)
> > +{
> > +     struct bpf_reg_state *regs = cur_regs(env);
> > +     struct bpf_reg_state *reg = &regs[regno];
> > +     struct bpf_insn_access_aux info = {};
> > +     bool valid;
> > +
> > +     switch (reg->type) {
> > +     case PTR_TO_NF_CONN:
> > +             valid = bpf_nf_conn_is_valid_access(off, size, t, &info);
> > +             break;
> > +     default:
> > +             valid = false;
> > +     }
> > +
> > +     if (valid) {
> > +             env->insn_aux_data[insn_idx].ctx_field_size =
> > +                     info.ctx_field_size;
> > +             return 0;
> > +     }
> > +
> > +     verbose(env, "R%d invalid %s access off=%d size=%d\n",
> > +             regno, reg_type_str[reg->type], off, size);
> > +
> > +     return -EACCES;
>
> nit, but this construction feels odd to me. How about,
>
>  if (reg->type != PTR_TO_NF_CONN) {
>         verbose(...)
>         return -EACCES;
>  }
>
>  env-> ...
>  return 0;
>
> The switch sort of implies you have some ideas on future types? What would
> those be?

Sure, I can reduce this down if desired. I was viewing it more as
following the pattern seen in other check access functions.

I do plan to introduce a "tcp_nf_conn" in another series, akin to
"tcp_sock". When that happens, this construct may make more sense.

e.g.
       case offsetof(struct bpf_tcp_nf_conn, state):
...
               *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg,
                               offsetof(struct nf_conn, proto) +
                               offsetof(union nf_conntrack_proto, tcp) +
                               offsetof(struct ip_ct_tcp, state));

>
> > +}
> > +
> >  static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
> >                            u32 regno, int off, int size,
> >                            enum bpf_access_type t)
> > @@ -2511,6 +2556,13 @@ static bool is_ctx_reg(struct bpf_verifier_env *env, int regno)
> >       return reg->type == PTR_TO_CTX;
> >  }
>
> [...]
>
>
> > diff --git a/net/core/filter.c b/net/core/filter.c
> > index 17de674..39ba965 100644
> > --- a/net/core/filter.c
> > +++ b/net/core/filter.c
> > @@ -74,6 +74,12 @@
>
> [...]
>
> > +static struct nf_conn *
> > +__bpf_ct_lookup(struct sk_buff *skb, struct bpf_nf_conntrack_tuple *tuple, u32 len,
> > +             struct net *caller_net, u8 proto, u64 netns_id, u64 flags)
>
> Why not just make netns an int instead of pulling an unsigned value from
> the helper and then converting it into an int?

These three are mostly a question of whether we want to diverge from
__bpf_sk_lookup. If we choose to do so, then do we want to update
__bpf_sk_lookup to match? I think there is benefit to having the
uapi-exposed interfaces match.
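
To sketch what that buys a caller: a program can use exactly the
conventions it already uses for sockets, e.g. (iph/tcph stand in for
headers parsed earlier in the program; BPF_F_CURRENT_NETNS is the
existing uapi value (-1L) used the same way with bpf_sk_lookup_*()):

	struct bpf_nf_conntrack_tuple tuple = {
		.ipv4.saddr = iph->saddr,
		.ipv4.daddr = iph->daddr,
		.ipv4.sport = tcph->source,
		.ipv4.dport = tcph->dest,
	};
	struct bpf_nf_conn *ct;

	ct = bpf_ct_lookup_tcp(ctx, &tuple, sizeof(tuple.ipv4),
			       BPF_F_CURRENT_NETNS, 0);
	if (ct) {
		__u32 mark = ct->mark;	/* read-only fields per this patch */

		bpf_ct_release(ct);	/* mandatory for non-NULL returns */
	}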

>
> > +{
> > +     struct nf_conn *ct = NULL;
> > +     u8 family = AF_UNSPEC;
> > +     struct net *net;
> > +
> > +     if (len == sizeof(tuple->ipv4))
> > +             family = AF_INET;
> > +     else if (len == sizeof(tuple->ipv6))
> > +             family = AF_INET6;
> > +     else
> > +             goto out;
> > +
> > +     if (unlikely(family == AF_UNSPEC || flags ||
> > +                  !((s32)netns_id < 0 || netns_id <= S32_MAX)))
>                                             ^^^^^^^^^^^^^^^^^^^^
> If you pass an int here and use flags to set the type, I think you avoid this
> check.

See previous.

>
> > +             goto out;
> > +
> > +     if ((s32)netns_id < 0) {
>
> I don't like this casting here, again fallout from the u64->int conversion.

See previous.

>
> > +             net = caller_net;
> > +             ct = ct_lookup(net, tuple, family, proto);
> > +     } else {
> > +             net = get_net_ns_by_id(caller_net, netns_id);
> > +             if (unlikely(!net))
> > +                     goto out;
> > +             ct = ct_lookup(net, tuple, family, proto);
> > +             put_net(net);
> > +     }
> > +
> > +out:
> > +     return ct;
> > +}
> > +
>
> [...]
>
> Thanks!
> John

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH bpf-next] bpf: add bpf_ct_lookup_{tcp,udp}() helpers
  2020-01-20 18:11   ` Matt Cover
@ 2020-01-20 20:10     ` Matt Cover
  2020-01-20 21:11       ` Daniel Borkmann
  0 siblings, 1 reply; 22+ messages in thread
From: Matt Cover @ 2020-01-20 20:10 UTC (permalink / raw)
  To: John Fastabend
  Cc: Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau, Song Liu,
	Yonghong Song, Andrii Nakryiko, David S. Miller, Shuah Khan,
	Jakub Kicinski, Jesper Dangaard Brouer, Jakub Sitnicki,
	Quentin Monnet, Matthew Cover, Stanislav Fomichev,
	Andrey Ignatov, Lorenz Bauer, Jiong Wang, netdev, bpf,
	linux-kernel, linux-kselftest

On Mon, Jan 20, 2020 at 11:11 AM Matt Cover <werekraken@gmail.com> wrote:
>
> On Sat, Jan 18, 2020 at 8:05 PM John Fastabend <john.fastabend@gmail.com> wrote:
> >
> > Matthew Cover wrote:
> > > Allow looking up an nf_conn. This allows eBPF programs to leverage
> > > nf_conntrack state for similar purposes to socket state use cases,
> > > as provided by the socket lookup helpers. This is particularly
> > > useful when nf_conntrack state is locally available, but socket
> > > state is not.
> > >
> > > Signed-off-by: Matthew Cover <matthew.cover@stackpath.com>
> > > ---
> >
> > Couple coding comments below. Also looks like a couple build errors
> > so fix those up. I'm still thinking over this though.
>
> Thank you for taking the time to look this over. I will be looking
> into the build issues.

Looks like I missed static inline on a couple functions when
nf_conntrack isn't builtin. I'll include the fix in v2.
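
Concretely, the !CONFIG_NF_CONNTRACK stubs in bpf.h need static inline
so each includer stops emitting its own out-of-line definition; the v2
stub becomes:

	static inline bool bpf_nf_conn_is_valid_access(int off, int size,
					enum bpf_access_type type,
					struct bpf_insn_access_aux *info)
	{
		return false;
	}

(and likewise for the bpf_nf_conn_convert_ctx_access() stub).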

[...]

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH bpf-next] bpf: add bpf_ct_lookup_{tcp,udp}() helpers
  2020-01-20 20:10     ` Matt Cover
@ 2020-01-20 21:11       ` Daniel Borkmann
  2020-01-20 21:21         ` Matt Cover
  2020-01-23 21:28         ` Matt Cover
  0 siblings, 2 replies; 22+ messages in thread
From: Daniel Borkmann @ 2020-01-20 21:11 UTC (permalink / raw)
  To: Matt Cover, John Fastabend
  Cc: Alexei Starovoitov, Martin KaFai Lau, Song Liu, Yonghong Song,
	Andrii Nakryiko, David S. Miller, Shuah Khan, Jakub Kicinski,
	Jesper Dangaard Brouer, Jakub Sitnicki, Quentin Monnet,
	Matthew Cover, Stanislav Fomichev, Andrey Ignatov, Lorenz Bauer,
	Jiong Wang, netdev, bpf, linux-kernel, linux-kselftest

On 1/20/20 9:10 PM, Matt Cover wrote:
> [...]
> 
> Looks like I missed static inline on a couple functions when
> nf_conntrack isn't builtin. I'll include the fix in v2.

One of the big issues I'd see with this integration is that literally no one
will be able to use it unless they manually recompile their distro kernel with
ct as builtin instead of as a module... Have you considered writing a tcp/udp
ct in plain bpf? Perhaps it would make sense to have some sort of
tools/lib/bpf/util/ with bpf prog library code that can be included.
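
Very roughly, such a library could keep its own table and state machine
entirely inside the program; names and layout purely illustrative:

	struct ct_key {
		__be32 saddr, daddr;
		__be16 sport, dport;
		__u8   proto;
	};

	struct ct_val {
		__u64 last_seen;
		__u32 state;
	};

	struct {
		__uint(type, BPF_MAP_TYPE_LRU_HASH);
		__uint(max_entries, 1 << 20);
		__type(key, struct ct_key);
		__type(value, struct ct_val);
	} ct_table SEC(".maps");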

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH bpf-next] bpf: add bpf_ct_lookup_{tcp,udp}() helpers
  2020-01-20 21:11       ` Daniel Borkmann
@ 2020-01-20 21:21         ` Matt Cover
  2020-01-23 21:28         ` Matt Cover
  1 sibling, 0 replies; 22+ messages in thread
From: Matt Cover @ 2020-01-20 21:21 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: John Fastabend, Alexei Starovoitov, Martin KaFai Lau, Song Liu,
	Yonghong Song, Andrii Nakryiko, David S. Miller, Shuah Khan,
	Jakub Kicinski, Jesper Dangaard Brouer, Jakub Sitnicki,
	Quentin Monnet, Matthew Cover, Stanislav Fomichev,
	Andrey Ignatov, Lorenz Bauer, Jiong Wang, netdev, bpf,
	linux-kernel, linux-kselftest

On Mon, Jan 20, 2020 at 2:11 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 1/20/20 9:10 PM, Matt Cover wrote:
> > [...]
> > Looks like I missed static inline on a couple functions when
> > nf_conntrack isn't builtin. I'll include the fix in v2.
>
> One of the big issues I'd see with this integration is that literally no one
> will be able to use it unless they manually recompile their distro kernel with
> ct as builtin instead of as a module... Have you considered writing a tcp/udp
> ct in plain bpf? Perhaps it would make sense to have some sort of
> tools/lib/bpf/util/ with bpf prog library code that can be included.

I don't believe the builtin requirement is permanent. Currently, that
requirement comes from an undefined reference to
nf_conntrack_find_get() during linking. As a future improvement, I am
planning to propose a function pointer which ct_lookup() uses. The
kernel proper would point this at a stub that always returns NULL;
nf_conntrack would populate it with the real function when builtin or
when loaded as a module.

If there is a better way for the kernel proper to use an exported
symbol provided by a module, I'd be happy to hear of it.
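
The shape I have in mind mirrors the existing nf_ct_get_tuple_skb()
wrapper in net/netfilter/core.c, roughly:

	struct nf_conntrack_tuple_hash *
	nf_ct_find_get(struct net *net, const struct nf_conntrack_zone *zone,
		       const struct nf_conntrack_tuple *tuple)
	{
		struct nf_conntrack_tuple_hash *hash = NULL;
		struct nf_ct_hook *ct_hook;

		rcu_read_lock();
		ct_hook = rcu_dereference(nf_ct_hook);
		if (ct_hook)
			hash = ct_hook->find_get(net, zone, tuple);
		rcu_read_unlock();

		return hash;
	}

with nf_conntrack filling in ->find_get when it registers its hooks.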

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH bpf-next v2 1/2] bpf: add bpf_ct_lookup_{tcp,udp}() helpers
  2020-01-18  0:01 [PATCH bpf-next] bpf: add bpf_ct_lookup_{tcp,udp}() helpers Matthew Cover
                   ` (2 preceding siblings ...)
  2020-01-19  3:05 ` John Fastabend
@ 2020-01-21 20:20 ` Matthew Cover
  2020-01-21 20:22   ` [PATCH bpf-next v2 2/2] selftests/bpf: test references to nf_conn Matthew Cover
  2020-01-21 20:35   ` [PATCH bpf-next v2 1/2] bpf: add bpf_ct_lookup_{tcp,udp}() helpers Matt Cover
  3 siblings, 2 replies; 22+ messages in thread
From: Matthew Cover @ 2020-01-21 20:20 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau, Song Liu,
	Yonghong Song, Andrii Nakryiko, David S. Miller, Shuah Khan,
	Jesper Dangaard Brouer, John Fastabend, Jakub Sitnicki,
	Quentin Monnet, Matthew Cover, Stanislav Fomichev,
	Andrey Ignatov, Lorenz Bauer, netdev, bpf, linux-kernel,
	linux-kselftest

Allow looking up an nf_conn. This allows eBPF programs to leverage
nf_conntrack state for similar purposes to socket state use cases,
as provided by the socket lookup helpers. This is particularly
useful when nf_conntrack state is locally available, but socket
state is not.

v2:
  - Add missing static inline to stub functions (kbuild)
  - Move tests to separate patch and submit as a series (John)
  - Improve clarity in helper documentation (John)
  - Add CONFIG_NF_CONNTRACK=m support (Daniel)

Signed-off-by: Matthew Cover <matthew.cover@stackpath.com>
---
 include/linux/bpf.h               |  29 ++++
 include/linux/netfilter.h         |  12 ++
 include/uapi/linux/bpf.h          | 111 ++++++++++++++-
 kernel/bpf/verifier.c             | 105 ++++++++++++++-
 net/core/filter.c                 | 277 ++++++++++++++++++++++++++++++++++++++
 net/netfilter/core.c              |  16 +++
 net/netfilter/nf_conntrack_core.c |   1 +
 scripts/bpf_helpers_doc.py        |   4 +
 tools/include/uapi/linux/bpf.h    | 111 ++++++++++++++-
 9 files changed, 658 insertions(+), 8 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8e3b8f4..f502e1f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -239,6 +239,7 @@ enum bpf_arg_type {
 	ARG_PTR_TO_LONG,	/* pointer to long */
 	ARG_PTR_TO_SOCKET,	/* pointer to bpf_sock (fullsock) */
 	ARG_PTR_TO_BTF_ID,	/* pointer to in-kernel struct */
+	ARG_PTR_TO_NF_CONN,	/* pointer to bpf_nf_conn */
 };
 
 /* type of values returned from helper functions */
@@ -250,6 +251,7 @@ enum bpf_return_type {
 	RET_PTR_TO_SOCKET_OR_NULL,	/* returns a pointer to a socket or NULL */
 	RET_PTR_TO_TCP_SOCK_OR_NULL,	/* returns a pointer to a tcp_sock or NULL */
 	RET_PTR_TO_SOCK_COMMON_OR_NULL,	/* returns a pointer to a sock_common or NULL */
+	RET_PTR_TO_NF_CONN_OR_NULL,	/* returns a pointer to a nf_conn or NULL */
 };
 
 /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs
@@ -316,6 +318,8 @@ enum bpf_reg_type {
 	PTR_TO_TP_BUFFER,	 /* reg points to a writable raw tp's buffer */
 	PTR_TO_XDP_SOCK,	 /* reg points to struct xdp_sock */
 	PTR_TO_BTF_ID,		 /* reg points to kernel struct */
+	PTR_TO_NF_CONN,		 /* reg points to struct nf_conn */
+	PTR_TO_NF_CONN_OR_NULL,	 /* reg points to struct nf_conn or NULL */
 };
 
 /* The information passed from prog-specific *_is_valid_access
@@ -1513,4 +1517,29 @@ enum bpf_text_poke_type {
 int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
 		       void *addr1, void *addr2);
 
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+bool bpf_nf_conn_is_valid_access(int off, int size, enum bpf_access_type type,
+				 struct bpf_insn_access_aux *info);
+
+u32 bpf_nf_conn_convert_ctx_access(enum bpf_access_type type,
+				   const struct bpf_insn *si,
+				   struct bpf_insn *insn_buf,
+				   struct bpf_prog *prog, u32 *target_size);
+#else
+static inline bool bpf_nf_conn_is_valid_access(int off, int size,
+				enum bpf_access_type type,
+				struct bpf_insn_access_aux *info)
+{
+	return false;
+}
+
+static inline u32 bpf_nf_conn_convert_ctx_access(enum bpf_access_type type,
+				const struct bpf_insn *si,
+				struct bpf_insn *insn_buf,
+				struct bpf_prog *prog, u32 *target_size)
+{
+	return 0;
+}
+#endif /* CONFIG_NF_CONNTRACK */
+
 #endif /* _LINUX_BPF_H */
diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index eb312e7..a360ced 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -451,6 +451,9 @@ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
 struct nf_conntrack_tuple;
 bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
 			 const struct sk_buff *skb);
+struct nf_conntrack_tuple_hash *
+nf_ct_find_get(struct net *net, const struct nf_conntrack_zone *zone,
+	       const struct nf_conntrack_tuple *tuple);
 #else
 static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {}
 struct nf_conntrack_tuple;
@@ -459,6 +462,12 @@ static inline bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
 {
 	return false;
 }
+static inline struct nf_conntrack_tuple_hash *
+nf_ct_find_get(struct net *net, const struct nf_conntrack_zone *zone,
+	       const struct nf_conntrack_tuple *tuple)
+{
+	return NULL;
+}
 #endif
 
 struct nf_conn;
@@ -469,6 +478,9 @@ struct nf_ct_hook {
 	void (*destroy)(struct nf_conntrack *);
 	bool (*get_tuple_skb)(struct nf_conntrack_tuple *,
 			      const struct sk_buff *);
+	struct nf_conntrack_tuple_hash *
+	(*find_get)(struct net *net, const struct nf_conntrack_zone *zone,
+                    const struct nf_conntrack_tuple *tuple);
 };
 extern struct nf_ct_hook __rcu *nf_ct_hook;
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 033d90a..85c4b3f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2885,6 +2885,88 @@ struct bpf_stack_build_id {
  *		**-EPERM** if no permission to send the *sig*.
  *
  *		**-EAGAIN** if bpf program can try again.
+ *
+ * struct bpf_nf_conn *bpf_ct_lookup_tcp(void *ctx, struct bpf_nf_conntrack_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
+ *	Description
+ *		Look for TCP nf_conntrack entry matching *tuple*, optionally in
+ *		a child network namespace *netns*. The return value must be
+ *		checked, and if non-**NULL**, released via
+ *		**bpf_ct_release**\ ().
+ *
+ *		The *ctx* should point to the context of the program, such as
+ *		the skb or xdp_md (depending on the hook in use). This is used
+ *		to determine the base network namespace for the lookup.
+ *
+ *		*tuple_size* must be one of:
+ *
+ *		**sizeof**\ (*tuple*\ **->ipv4**)
+ *			Look for an IPv4 nf_conn.
+ *		**sizeof**\ (*tuple*\ **->ipv6**)
+ *			Look for an IPv6 nf_conn.
+ *
+ *		If *netns* is a negative signed 32-bit integer, then the
+ *		nf_conn lookup table in the netns associated with the *ctx*
+ *		will be used. For the TC hooks, this is the netns of the device
+ *		in the skb. For XDP hooks, this is the netns of the device in
+ *		the xdp_md. If *netns* is any other signed 32-bit value greater
+ *		than or equal to zero then it specifies the ID of the netns
+ *		relative to the netns associated with the *ctx*. *netns* values
+ *		beyond the range of 32-bit integers are reserved for future
+ *		use.
+ *
+ *		All values for *flags* are reserved for future usage, and must
+ *		be left at zero.
+ *
+ * 		This helper will always return NULL if the kernel was compiled
+ * 		without **CONFIG_NF_CONNTRACK**.
+ *	Return
+ *		Pointer to **struct bpf_nf_conn**, or **NULL** in case of
+ *		failure.
+ *
+ * struct bpf_nf_conn *bpf_ct_lookup_udp(void *ctx, struct bpf_nf_conntrack_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
+ *	Description
+ *		Look for UDP nf_conntrack entry matching *tuple*, optionally in
+ *		a child network namespace *netns*. The return value must be
+ *		checked, and if non-**NULL**, released via
+ *		**bpf_ct_release**\ ().
+ *
+ *		The *ctx* should point to the context of the program, such as
+ *		the skb or xdp_md (depending on the hook in use). This is used
+ *		to determine the base network namespace for the lookup.
+ *
+ *		*tuple_size* must be one of:
+ *
+ *		**sizeof**\ (*tuple*\ **->ipv4**)
+ *			Look for an IPv4 nf_conn.
+ *		**sizeof**\ (*tuple*\ **->ipv6**)
+ *			Look for an IPv6 nf_conn.
+ *
+ *		If *netns* is a negative signed 32-bit integer, then the
+ *		nf_conn lookup table in the netns associated with the *ctx*
+ *		will be used. For the TC hooks, this is the netns of the device
+ *		in the skb. For XDP hooks, this is the netns of the device in
+ *		the xdp_md. If *netns* is any other signed 32-bit value greater
+ *		than or equal to zero then it specifies the ID of the netns
+ *		relative to the netns associated with the *ctx*. *netns* values
+ *		beyond the range of 32-bit integers are reserved for future
+ *		use.
+ *
+ *		All values for *flags* are reserved for future usage, and must
+ *		be left at zero.
+ *
+ * 		This helper will always return NULL if the kernel was compiled
+ * 		without **CONFIG_NF_CONNTRACK**.
+ *	Return
+ *		Pointer to **struct bpf_nf_conn**, or **NULL** in case of
+ *		failure.
+ *
+ * int bpf_ct_release(struct bpf_nf_conn *ct)
+ *	Description
+ *		Release the reference held by *ct*. *ct* must be a
+ *		non-**NULL** pointer that was returned from
+ *		**bpf_ct_lookup_xxx**\ ().
+ *	Return
+ *		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3004,7 +3086,10 @@ struct bpf_stack_build_id {
 	FN(probe_read_user_str),	\
 	FN(probe_read_kernel_str),	\
 	FN(tcp_send_ack),		\
-	FN(send_signal_thread),
+	FN(send_signal_thread),		\
+	FN(ct_lookup_tcp),		\
+	FN(ct_lookup_udp),		\
+	FN(ct_release),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -3278,6 +3363,30 @@ struct bpf_sock_tuple {
 	};
 };
 
+struct bpf_nf_conn {
+	__u32 cpu;
+	__u32 mark;
+	__u32 status;
+	__u32 timeout;
+};
+
+struct bpf_nf_conntrack_tuple {
+	union {
+		struct {
+			__be32 saddr;
+			__be32 daddr;
+			__be16 sport;
+			__be16 dport;
+		} ipv4;
+		struct {
+			__be32 saddr[4];
+			__be32 daddr[4];
+			__be16 sport;
+			__be16 dport;
+		} ipv6;
+	};
+};
+
 struct bpf_xdp_sock {
 	__u32 queue_id;
 };
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ca17dccc..0ea0ee7 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -362,6 +362,11 @@ static const char *ltrim(const char *s)
 	env->prev_linfo = linfo;
 }
 
+static bool type_is_nf_ct_pointer(enum bpf_reg_type type)
+{
+	return type == PTR_TO_NF_CONN;
+}
+
 static bool type_is_pkt_pointer(enum bpf_reg_type type)
 {
 	return type == PTR_TO_PACKET ||
@@ -381,7 +386,8 @@ static bool reg_type_may_be_null(enum bpf_reg_type type)
 	return type == PTR_TO_MAP_VALUE_OR_NULL ||
 	       type == PTR_TO_SOCKET_OR_NULL ||
 	       type == PTR_TO_SOCK_COMMON_OR_NULL ||
-	       type == PTR_TO_TCP_SOCK_OR_NULL;
+	       type == PTR_TO_TCP_SOCK_OR_NULL ||
+	       type == PTR_TO_NF_CONN_OR_NULL;
 }
 
 static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
@@ -395,12 +401,15 @@ static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type)
 	return type == PTR_TO_SOCKET ||
 		type == PTR_TO_SOCKET_OR_NULL ||
 		type == PTR_TO_TCP_SOCK ||
-		type == PTR_TO_TCP_SOCK_OR_NULL;
+		type == PTR_TO_TCP_SOCK_OR_NULL ||
+		type == PTR_TO_NF_CONN ||
+		type == PTR_TO_NF_CONN_OR_NULL;
 }
 
 static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
 {
-	return type == ARG_PTR_TO_SOCK_COMMON;
+	return type == ARG_PTR_TO_SOCK_COMMON ||
+		type == ARG_PTR_TO_NF_CONN;
 }
 
 /* Determine whether the function releases some resources allocated by another
@@ -409,14 +418,17 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
  */
 static bool is_release_function(enum bpf_func_id func_id)
 {
-	return func_id == BPF_FUNC_sk_release;
+	return func_id == BPF_FUNC_sk_release ||
+		func_id == BPF_FUNC_ct_release;
 }
 
 static bool is_acquire_function(enum bpf_func_id func_id)
 {
 	return func_id == BPF_FUNC_sk_lookup_tcp ||
 		func_id == BPF_FUNC_sk_lookup_udp ||
-		func_id == BPF_FUNC_skc_lookup_tcp;
+		func_id == BPF_FUNC_skc_lookup_tcp ||
+		func_id == BPF_FUNC_ct_lookup_tcp ||
+		func_id == BPF_FUNC_ct_lookup_udp;
 }
 
 static bool is_ptr_cast_function(enum bpf_func_id func_id)
@@ -447,6 +459,8 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id)
 	[PTR_TO_TP_BUFFER]	= "tp_buffer",
 	[PTR_TO_XDP_SOCK]	= "xdp_sock",
 	[PTR_TO_BTF_ID]		= "ptr_",
+	[PTR_TO_NF_CONN]	= "nf_conn",
+	[PTR_TO_NF_CONN_OR_NULL] = "nf_conn_or_null",
 };
 
 static char slot_type_char[] = {
@@ -1913,6 +1927,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 	case PTR_TO_TCP_SOCK_OR_NULL:
 	case PTR_TO_XDP_SOCK:
 	case PTR_TO_BTF_ID:
+	case PTR_TO_NF_CONN:
+	case PTR_TO_NF_CONN_OR_NULL:
 		return true;
 	default:
 		return false;
@@ -2440,6 +2456,35 @@ static int check_flow_keys_access(struct bpf_verifier_env *env, int off,
 	return 0;
 }
 
+static int check_nf_ct_access(struct bpf_verifier_env *env, int insn_idx,
+			     u32 regno, int off, int size,
+			     enum bpf_access_type t)
+{
+	struct bpf_reg_state *regs = cur_regs(env);
+	struct bpf_reg_state *reg = &regs[regno];
+	struct bpf_insn_access_aux info = {};
+	bool valid;
+
+	switch (reg->type) {
+	case PTR_TO_NF_CONN:
+		valid = bpf_nf_conn_is_valid_access(off, size, t, &info);
+		break;
+	default:
+		valid = false;
+	}
+
+	if (valid) {
+		env->insn_aux_data[insn_idx].ctx_field_size =
+			info.ctx_field_size;
+		return 0;
+	}
+
+	verbose(env, "R%d invalid %s access off=%d size=%d\n",
+		regno, reg_type_str[reg->type], off, size);
+
+	return -EACCES;
+}
+
 static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
 			     u32 regno, int off, int size,
 			     enum bpf_access_type t)
@@ -2511,6 +2556,13 @@ static bool is_ctx_reg(struct bpf_verifier_env *env, int regno)
 	return reg->type == PTR_TO_CTX;
 }
 
+static bool is_nf_ct_reg(struct bpf_verifier_env *env, int regno)
+{
+	const struct bpf_reg_state *reg = reg_state(env, regno);
+
+	return type_is_nf_ct_pointer(reg->type);
+}
+
 static bool is_sk_reg(struct bpf_verifier_env *env, int regno)
 {
 	const struct bpf_reg_state *reg = reg_state(env, regno);
@@ -2635,6 +2687,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
 	case PTR_TO_XDP_SOCK:
 		pointer_desc = "xdp_sock ";
 		break;
+	case PTR_TO_NF_CONN:
+		pointer_desc = "nf_conn ";
+		break;
 	default:
 		break;
 	}
@@ -3050,6 +3105,15 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		err = check_sock_access(env, insn_idx, regno, off, size, t);
 		if (!err && value_regno >= 0)
 			mark_reg_unknown(env, regs, value_regno);
+	} else if (type_is_nf_ct_pointer(reg->type)) {
+		if (t == BPF_WRITE) {
+			verbose(env, "R%d cannot write into %s\n",
+				regno, reg_type_str[reg->type]);
+			return -EACCES;
+		}
+		err = check_nf_ct_access(env, insn_idx, regno, off, size, t);
+		if (!err && value_regno >= 0)
+			mark_reg_unknown(env, regs, value_regno);
 	} else if (reg->type == PTR_TO_TP_BUFFER) {
 		err = check_tp_buffer_access(env, reg, regno, off, size);
 		if (!err && t == BPF_READ && value_regno >= 0)
@@ -3099,7 +3163,8 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
 	if (is_ctx_reg(env, insn->dst_reg) ||
 	    is_pkt_reg(env, insn->dst_reg) ||
 	    is_flow_key_reg(env, insn->dst_reg) ||
-	    is_sk_reg(env, insn->dst_reg)) {
+	    is_sk_reg(env, insn->dst_reg) ||
+	    is_nf_ct_reg(env, insn->dst_reg)) {
 		verbose(env, "BPF_XADD stores into R%d %s is not allowed\n",
 			insn->dst_reg,
 			reg_type_str[reg_state(env, insn->dst_reg)->type]);
@@ -3501,6 +3566,19 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 				regno);
 			return -EACCES;
 		}
+	} else if (arg_type == ARG_PTR_TO_NF_CONN) {
+		expected_type = PTR_TO_NF_CONN;
+		if (!type_is_nf_ct_pointer(type))
+			goto err_type;
+		if (reg->ref_obj_id) {
+			if (meta->ref_obj_id) {
+				verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
+					regno, reg->ref_obj_id,
+					meta->ref_obj_id);
+				return -EFAULT;
+			}
+			meta->ref_obj_id = reg->ref_obj_id;
+		}
 	} else if (arg_type == ARG_PTR_TO_SPIN_LOCK) {
 		if (meta->func_id == BPF_FUNC_spin_lock) {
 			if (process_spin_lock(env, regno, true))
@@ -4368,6 +4446,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 		mark_reg_known_zero(env, regs, BPF_REG_0);
 		regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL;
 		regs[BPF_REG_0].id = ++env->id_gen;
+	} else if (fn->ret_type == RET_PTR_TO_NF_CONN_OR_NULL) {
+		mark_reg_known_zero(env, regs, BPF_REG_0);
+		regs[BPF_REG_0].type = PTR_TO_NF_CONN_OR_NULL;
+		regs[BPF_REG_0].id = ++env->id_gen;
 	} else {
 		verbose(env, "unknown return type %d of func %s#%d\n",
 			fn->ret_type, func_id_name(func_id), func_id);
@@ -4649,6 +4731,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 	case PTR_TO_TCP_SOCK:
 	case PTR_TO_TCP_SOCK_OR_NULL:
 	case PTR_TO_XDP_SOCK:
+	case PTR_TO_NF_CONN:
+	case PTR_TO_NF_CONN_OR_NULL:
 		verbose(env, "R%d pointer arithmetic on %s prohibited\n",
 			dst, reg_type_str[ptr_reg->type]);
 		return -EACCES;
@@ -5915,6 +5999,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
 			reg->type = PTR_TO_SOCK_COMMON;
 		} else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) {
 			reg->type = PTR_TO_TCP_SOCK;
+		} else if (reg->type == PTR_TO_NF_CONN_OR_NULL) {
+			reg->type = PTR_TO_NF_CONN;
 		}
 		if (is_null) {
 			/* We don't need id and ref_obj_id from this point
@@ -7232,6 +7318,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 	case PTR_TO_TCP_SOCK:
 	case PTR_TO_TCP_SOCK_OR_NULL:
 	case PTR_TO_XDP_SOCK:
+	case PTR_TO_NF_CONN:
+	case PTR_TO_NF_CONN_OR_NULL:
 		/* Only valid matches are exact, which memcmp() above
 		 * would have accepted
 		 */
@@ -7760,6 +7848,8 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type)
 	case PTR_TO_TCP_SOCK_OR_NULL:
 	case PTR_TO_XDP_SOCK:
 	case PTR_TO_BTF_ID:
+	case PTR_TO_NF_CONN:
+	case PTR_TO_NF_CONN_OR_NULL:
 		return false;
 	default:
 		return true;
@@ -8867,6 +8957,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 				return -EINVAL;
 			}
 			continue;
+		case PTR_TO_NF_CONN:
+			convert_ctx_access = bpf_nf_conn_convert_ctx_access;
+			break;
 		default:
 			continue;
 		}
diff --git a/net/core/filter.c b/net/core/filter.c
index 17de674..80319d3 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -74,6 +74,12 @@
 #include <net/ipv6_stubs.h>
 #include <net/bpf_sk_storage.h>
 
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack.h>
+#endif
+
 /**
  *	sk_filter_trim_cap - run a packet through a socket filter
  *	@sk: sock associated with &sk_buff
@@ -5122,6 +5128,253 @@ static void bpf_update_srh_state(struct sk_buff *skb)
 };
 #endif /* CONFIG_IPV6_SEG6_BPF */
 
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+bool bpf_nf_conn_is_valid_access(int off, int size, enum bpf_access_type type,
+				 struct bpf_insn_access_aux *info)
+{
+	if (off < 0 || off >= offsetofend(struct bpf_nf_conn,
+					  timeout))
+		return false;
+
+	if (off % size != 0)
+		return false;
+
+	return size == sizeof(__u32);
+}
+
+u32 bpf_nf_conn_convert_ctx_access(enum bpf_access_type type,
+				   const struct bpf_insn *si,
+				   struct bpf_insn *insn_buf,
+				   struct bpf_prog *prog, u32 *target_size)
+{
+	struct bpf_insn *insn = insn_buf;
+
+	switch (si->off) {
+	case offsetof(struct bpf_nf_conn, cpu):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct nf_conn, cpu) != 2);
+
+		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+				      offsetof(struct nf_conn, cpu));
+
+		break;
+
+	case offsetof(struct bpf_nf_conn, mark):
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
+		BUILD_BUG_ON(FIELD_SIZEOF(struct nf_conn, mark) != 4);
+
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+				      offsetof(struct nf_conn, mark));
+#else
+		*target_size = 4;
+		*insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
+#endif
+		break;
+
+	case offsetof(struct bpf_nf_conn, status):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct nf_conn, status) < 4 ||
+			     __IPS_MAX_BIT > 32);
+
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+				      offsetof(struct nf_conn, status));
+
+		break;
+
+	case offsetof(struct bpf_nf_conn, timeout):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct nf_conn, timeout) != 4);
+
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+				      offsetof(struct nf_conn, timeout));
+
+		break;
+	}
+
+	return insn - insn_buf;
+}
+
+static struct nf_conn *
+ct_lookup(struct net *net, struct bpf_nf_conntrack_tuple *tuple,
+	  u8 family, u8 proto)
+{
+	struct nf_conntrack_tuple_hash *hash;
+	struct nf_conntrack_tuple tup;
+	struct nf_conn *ct = NULL;
+
+	memset(&tup, 0, sizeof(tup));
+
+	tup.dst.protonum = proto;
+	tup.src.l3num = family;
+
+	if (family == AF_INET) {
+		tup.src.u3.ip = tuple->ipv4.saddr;
+		tup.dst.u3.ip = tuple->ipv4.daddr;
+		tup.src.u.tcp.port = tuple->ipv4.sport;
+		tup.dst.u.tcp.port = tuple->ipv4.dport;
+#if IS_ENABLED(CONFIG_IPV6)
+	} else {
+		memcpy(tup.src.u3.ip6, tuple->ipv6.saddr, sizeof(tup.src.u3.ip6));
+		memcpy(tup.dst.u3.ip6, tuple->ipv6.daddr, sizeof(tup.dst.u3.ip6));
+		tup.src.u.tcp.port = tuple->ipv6.sport;
+		tup.dst.u.tcp.port = tuple->ipv6.dport;
+#endif
+	}
+
+	hash = nf_ct_find_get(net, &nf_ct_zone_dflt, &tup);
+	if (!hash)
+		goto out;
+	ct = nf_ct_tuplehash_to_ctrack(hash);
+
+out:
+	return ct;
+}
+
+static struct nf_conn *
+__bpf_ct_lookup(struct sk_buff *skb, struct bpf_nf_conntrack_tuple *tuple, u32 len,
+		struct net *caller_net, u8 proto, u64 netns_id, u64 flags)
+{
+	struct nf_conn *ct = NULL;
+	u8 family = AF_UNSPEC;
+	struct net *net;
+
+	if (len == sizeof(tuple->ipv4))
+		family = AF_INET;
+	else if (len == sizeof(tuple->ipv6))
+		family = AF_INET6;
+	else
+		goto out;
+
+	if (unlikely(family == AF_UNSPEC || flags ||
+		     !((s32)netns_id < 0 || netns_id <= S32_MAX)))
+		goto out;
+
+	if ((s32)netns_id < 0) {
+		net = caller_net;
+		ct = ct_lookup(net, tuple, family, proto);
+	} else {
+		net = get_net_ns_by_id(caller_net, netns_id);
+		if (unlikely(!net))
+			goto out;
+		ct = ct_lookup(net, tuple, family, proto);
+		put_net(net);
+	}
+
+out:
+	return ct;
+}
+
+static struct nf_conn *
+bpf_ct_lookup(struct sk_buff *skb, struct bpf_nf_conntrack_tuple *tuple, u32 len,
+	      u8 proto, u64 netns_id, u64 flags)
+{
+	struct net *caller_net;
+
+	if (skb->dev) {
+		caller_net = dev_net(skb->dev);
+	} else {
+		caller_net = sock_net(skb->sk);
+	}
+
+	return __bpf_ct_lookup(skb, tuple, len, caller_net, proto,
+			       netns_id, flags);
+}
+
+BPF_CALL_5(bpf_ct_lookup_tcp, struct sk_buff *, skb,
+	   struct bpf_nf_conntrack_tuple *, tuple, u32, len, u64, netns_id,
+	   u64, flags)
+{
+	return (unsigned long)bpf_ct_lookup(skb, tuple, len, IPPROTO_TCP,
+					     netns_id, flags);
+}
+
+static const struct bpf_func_proto bpf_ct_lookup_tcp_proto = {
+	.func		= bpf_ct_lookup_tcp,
+	.gpl_only	= true,
+	.pkt_access	= true,
+	.ret_type	= RET_PTR_TO_NF_CONN_OR_NULL,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+	.arg5_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_xdp_ct_lookup_tcp, struct xdp_buff *, ctx,
+	   struct bpf_nf_conntrack_tuple *, tuple, u32, len, u32, netns_id,
+	   u64, flags)
+{
+	struct net *caller_net = dev_net(ctx->rxq->dev);
+
+	return (unsigned long)__bpf_ct_lookup(NULL, tuple, len, caller_net,
+					      IPPROTO_TCP, netns_id, flags);
+}
+
+static const struct bpf_func_proto bpf_xdp_ct_lookup_tcp_proto = {
+	.func		= bpf_xdp_ct_lookup_tcp,
+	.gpl_only	= true,
+	.pkt_access	= true,
+	.ret_type	= RET_PTR_TO_NF_CONN_OR_NULL,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+	.arg5_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_ct_lookup_udp, struct sk_buff *, skb,
+	   struct bpf_nf_conntrack_tuple *, tuple, u32, len, u64, netns_id,
+	   u64, flags)
+{
+	return (unsigned long)bpf_ct_lookup(skb, tuple, len, IPPROTO_UDP,
+					     netns_id, flags);
+}
+
+static const struct bpf_func_proto bpf_ct_lookup_udp_proto = {
+	.func		= bpf_ct_lookup_udp,
+	.gpl_only	= true,
+	.pkt_access	= true,
+	.ret_type	= RET_PTR_TO_NF_CONN_OR_NULL,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+	.arg5_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_xdp_ct_lookup_udp, struct xdp_buff *, ctx,
+	   struct bpf_nf_conntrack_tuple *, tuple, u32, len, u32, netns_id,
+	   u64, flags)
+{
+	struct net *caller_net = dev_net(ctx->rxq->dev);
+
+	return (unsigned long)__bpf_ct_lookup(NULL, tuple, len, caller_net,
+					      IPPROTO_UDP, netns_id, flags);
+}
+
+static const struct bpf_func_proto bpf_xdp_ct_lookup_udp_proto = {
+	.func		= bpf_xdp_ct_lookup_udp,
+	.gpl_only	= true,
+	.pkt_access	= true,
+	.ret_type	= RET_PTR_TO_NF_CONN_OR_NULL,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+	.arg5_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_1(bpf_ct_release, struct nf_conn *, ct)
+{
+	nf_conntrack_put(&ct->ct_general);
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_ct_release_proto = {
+	.func		= bpf_ct_release,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_NF_CONN,
+};
+#endif
+
 #ifdef CONFIG_INET
 static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
 			      int dif, int sdif, u8 family, u8 proto)
@@ -6139,6 +6392,14 @@ bool bpf_helper_changes_pkt_data(void *func)
 	case BPF_FUNC_tcp_gen_syncookie:
 		return &bpf_tcp_gen_syncookie_proto;
 #endif
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+	case BPF_FUNC_ct_lookup_tcp:
+		return &bpf_ct_lookup_tcp_proto;
+	case BPF_FUNC_ct_lookup_udp:
+		return &bpf_ct_lookup_udp_proto;
+	case BPF_FUNC_ct_release:
+		return &bpf_ct_release_proto;
+#endif
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -6180,6 +6441,14 @@ bool bpf_helper_changes_pkt_data(void *func)
 	case BPF_FUNC_tcp_gen_syncookie:
 		return &bpf_tcp_gen_syncookie_proto;
 #endif
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+	case BPF_FUNC_ct_lookup_tcp:
+		return &bpf_xdp_ct_lookup_tcp_proto;
+	case BPF_FUNC_ct_lookup_udp:
+		return &bpf_xdp_ct_lookup_udp_proto;
+	case BPF_FUNC_ct_release:
+		return &bpf_ct_release_proto;
+#endif
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -6284,6 +6553,14 @@ bool bpf_helper_changes_pkt_data(void *func)
 	case BPF_FUNC_skc_lookup_tcp:
 		return &bpf_skc_lookup_tcp_proto;
 #endif
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+	case BPF_FUNC_ct_lookup_tcp:
+		return &bpf_ct_lookup_tcp_proto;
+	case BPF_FUNC_ct_lookup_udp:
+		return &bpf_ct_lookup_udp_proto;
+	case BPF_FUNC_ct_release:
+		return &bpf_ct_release_proto;
+#endif
 	default:
 		return bpf_base_func_proto(func_id);
 	}
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 78f046e..855c6b0 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -617,6 +617,22 @@ bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
 }
 EXPORT_SYMBOL(nf_ct_get_tuple_skb);
 
+struct nf_conntrack_tuple_hash *
+nf_ct_find_get(struct net *net, const struct nf_conntrack_zone *zone,
+	       const struct nf_conntrack_tuple *tuple)
+{
+	struct nf_ct_hook *ct_hook;
+	struct nf_conntrack_tuple_hash *ret = NULL;
+
+	rcu_read_lock();
+	ct_hook = rcu_dereference(nf_ct_hook);
+	if (ct_hook)
+		ret = ct_hook->find_get(net, zone, tuple);
+	rcu_read_unlock();
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nf_ct_find_get);
+
 /* Built-in default zone used e.g. by modules. */
 const struct nf_conntrack_zone nf_ct_zone_dflt = {
 	.id	= NF_CT_DEFAULT_ZONE_ID,
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index f4c4b46..a44df88 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -2484,6 +2484,7 @@ int nf_conntrack_init_start(void)
 	.update		= nf_conntrack_update,
 	.destroy	= destroy_conntrack,
 	.get_tuple_skb  = nf_conntrack_get_tuple_skb,
+	.find_get	= nf_conntrack_find_get,
 };
 
 void nf_conntrack_init_end(void)
diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py
index 90baf7d..26f0c2a 100755
--- a/scripts/bpf_helpers_doc.py
+++ b/scripts/bpf_helpers_doc.py
@@ -398,6 +398,8 @@ class PrinterHelpers(Printer):
 
     type_fwds = [
             'struct bpf_fib_lookup',
+            'struct bpf_nf_conn',
+            'struct bpf_nf_conntrack_tuple',
             'struct bpf_perf_event_data',
             'struct bpf_perf_event_value',
             'struct bpf_sock',
@@ -433,6 +435,8 @@ class PrinterHelpers(Printer):
             '__wsum',
 
             'struct bpf_fib_lookup',
+            'struct bpf_nf_conn',
+            'struct bpf_nf_conntrack_tuple',
             'struct bpf_perf_event_data',
             'struct bpf_perf_event_value',
             'struct bpf_sock',
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 033d90a..85c4b3f 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2885,6 +2885,88 @@ struct bpf_stack_build_id {
  *		**-EPERM** if no permission to send the *sig*.
  *
  *		**-EAGAIN** if bpf program can try again.
+ *
+ * struct bpf_nf_conn *bpf_ct_lookup_tcp(void *ctx, struct bpf_nf_conntrack_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
+ *	Description
+ *		Look for a TCP nf_conntrack entry matching *tuple*, optionally in
+ *		a child network namespace *netns*. The return value must be
+ *		checked, and if non-**NULL**, released via
+ *		**bpf_ct_release**\ ().
+ *
+ *		The *ctx* should point to the context of the program, such as
+ *		the skb or xdp_md (depending on the hook in use). This is used
+ *		to determine the base network namespace for the lookup.
+ *
+ *		*tuple_size* must be one of:
+ *
+ *		**sizeof**\ (*tuple*\ **->ipv4**)
+ *			Look for an IPv4 nf_conn.
+ *		**sizeof**\ (*tuple*\ **->ipv6**)
+ *			Look for an IPv6 nf_conn.
+ *
+ *		If *netns* is a negative signed 32-bit integer, then the
+ *		nf_conn lookup table in the netns associated with the *ctx*
+ *		will be used. For the TC hooks, this is the netns of the device
+ *		in the skb. For XDP hooks, this is the netns of the device in
+ *		the xdp_md. If *netns* is any other signed 32-bit value greater
+ *		than or equal to zero then it specifies the ID of the netns
+ *		relative to the netns associated with the *ctx*. *netns* values
+ *		beyond the range of 32-bit integers are reserved for future
+ *		use.
+ *
+ *		All values for *flags* are reserved for future usage, and must
+ *		be left at zero.
+ *
+ *		This helper will always return **NULL** if the kernel was
+ *		compiled without **CONFIG_NF_CONNTRACK**.
+ *	Return
+ *		Pointer to **struct bpf_nf_conn**, or **NULL** in case of
+ *		failure.
+ *
+ * struct bpf_nf_conn *bpf_ct_lookup_udp(void *ctx, struct bpf_nf_conntrack_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
+ *	Description
+ *		Look for a UDP nf_conntrack entry matching *tuple*, optionally in
+ *		a child network namespace *netns*. The return value must be
+ *		checked, and if non-**NULL**, released via
+ *		**bpf_ct_release**\ ().
+ *
+ *		The *ctx* should point to the context of the program, such as
+ *		the skb or xdp_md (depending on the hook in use). This is used
+ *		to determine the base network namespace for the lookup.
+ *
+ *		*tuple_size* must be one of:
+ *
+ *		**sizeof**\ (*tuple*\ **->ipv4**)
+ *			Look for an IPv4 nf_conn.
+ *		**sizeof**\ (*tuple*\ **->ipv6**)
+ *			Look for an IPv6 nf_conn.
+ *
+ *		If *netns* is a negative signed 32-bit integer, then the
+ *		nf_conn lookup table in the netns associated with the *ctx*
+ *		will be used. For the TC hooks, this is the netns of the device
+ *		in the skb. For XDP hooks, this is the netns of the device in
+ *		the xdp_md. If *netns* is any other signed 32-bit value greater
+ *		than or equal to zero then it specifies the ID of the netns
+ *		relative to the netns associated with the *ctx*. *netns* values
+ *		beyond the range of 32-bit integers are reserved for future
+ *		use.
+ *
+ *		All values for *flags* are reserved for future usage, and must
+ *		be left at zero.
+ *
+ *		This helper will always return **NULL** if the kernel was
+ *		compiled without **CONFIG_NF_CONNTRACK**.
+ *	Return
+ *		Pointer to **struct bpf_nf_conn**, or **NULL** in case of
+ *		failure.
+ *
+ * int bpf_ct_release(struct bpf_nf_conn *ct)
+ *	Description
+ *		Release the reference held by *ct*. *ct* must be a
+ *		non-**NULL** pointer that was returned from
+ *		**bpf_ct_lookup_xxx**\ ().
+ *	Return
+ *		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3004,7 +3086,10 @@ struct bpf_stack_build_id {
 	FN(probe_read_user_str),	\
 	FN(probe_read_kernel_str),	\
 	FN(tcp_send_ack),		\
-	FN(send_signal_thread),
+	FN(send_signal_thread),		\
+	FN(ct_lookup_tcp),		\
+	FN(ct_lookup_udp),		\
+	FN(ct_release),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -3278,6 +3363,30 @@ struct bpf_sock_tuple {
 	};
 };
 
+struct bpf_nf_conn {
+	__u32 cpu;
+	__u32 mark;
+	__u32 status;
+	__u32 timeout;
+};
+
+struct bpf_nf_conntrack_tuple {
+	union {
+		struct {
+			__be32 saddr;
+			__be32 daddr;
+			__be16 sport;
+			__be16 dport;
+		} ipv4;
+		struct {
+			__be32 saddr[4];
+			__be32 daddr[4];
+			__be16 sport;
+			__be16 dport;
+		} ipv6;
+	};
+};
+
 struct bpf_xdp_sock {
 	__u32 queue_id;
 };
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 22+ messages in thread
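
For context, a minimal and untested sketch of how a tc classifier might
consume these helpers once the series lands. It assumes helper
declarations generated from the updated bpf_helpers_doc.py (e.g. via
libbpf's bpf_helper_defs.h) and reuses BPF_F_CURRENT_NETNS (-1) with the
same meaning it already has for the sk_lookup helpers:

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("classifier")
int ct_mark_filter(struct __sk_buff *skb)
{
	struct bpf_nf_conntrack_tuple tuple = {};
	struct bpf_nf_conn *ct;
	__u32 mark = 0;

	/* Normally filled from the parsed packet headers. */
	tuple.ipv4.saddr = bpf_htonl(0x0a000001);	/* 10.0.0.1 */
	tuple.ipv4.daddr = bpf_htonl(0x0a000002);	/* 10.0.0.2 */
	tuple.ipv4.sport = bpf_htons(32768);
	tuple.ipv4.dport = bpf_htons(443);

	/* Negative netns: use the netns of the device in the skb. */
	ct = bpf_ct_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
			       BPF_F_CURRENT_NETNS, 0);
	if (ct) {
		mark = ct->mark;	/* read-only mirrored field */
		bpf_ct_release(ct);	/* required for any non-NULL result */
	}

	return mark == 42 ? TC_ACT_OK : TC_ACT_SHOT;
}

char _license[] SEC("license") = "GPL";

Both lookup helpers and bpf_ct_release() are gpl_only, hence the GPL
license section above.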

* [PATCH bpf-next v2 2/2] selftests/bpf: test references to nf_conn
  2020-01-21 20:20 ` [PATCH bpf-next v2 1/2] " Matthew Cover
@ 2020-01-21 20:22   ` Matthew Cover
  2020-01-21 20:35   ` [PATCH bpf-next v2 1/2] bpf: add bpf_ct_lookup_{tcp,udp}() helpers Matt Cover
  1 sibling, 0 replies; 22+ messages in thread
From: Matthew Cover @ 2020-01-21 20:22 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau, Song Liu,
	Yonghong Song, Andrii Nakryiko, David S. Miller, Shuah Khan,
	Jesper Dangaard Brouer, John Fastabend, Jakub Sitnicki,
	Quentin Monnet, Matthew Cover, Stanislav Fomichev,
	Andrey Ignatov, Lorenz Bauer, netdev, bpf, linux-kernel,
	linux-kselftest

Make sure that returning a struct nf_conn * reference invokes
the reference tracking machinery in the verifier.

Signed-off-by: Matthew Cover <matthew.cover@stackpath.com>
---
 tools/testing/selftests/bpf/test_verifier.c        | 18 ++++++++
 .../testing/selftests/bpf/verifier/ref_tracking.c  | 48 ++++++++++++++++++++++
 2 files changed, 66 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 87eaa49..7569db2 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -294,6 +294,24 @@ static void bpf_fill_scale(struct bpf_test *self)
 	}
 }
 
+/* BPF_CT_LOOKUP contains 13 instructions, if you need to fix up maps */
+#define BPF_CT_LOOKUP(func)						\
+	/* struct bpf_nf_conntrack_tuple tuple = {} */			\
+	BPF_MOV64_IMM(BPF_REG_2, 0),					\
+	BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8),			\
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -16),		\
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -24),		\
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -32),		\
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -40),		\
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -48),		\
+	/* ct = func(ctx, &tuple, sizeof tuple, 0, 0) */		\
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),				\
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -48),				\
+	BPF_MOV64_IMM(BPF_REG_3, sizeof(struct bpf_nf_conntrack_tuple)),\
+	BPF_MOV64_IMM(BPF_REG_4, 0),					\
+	BPF_MOV64_IMM(BPF_REG_5, 0),					\
+	BPF_EMIT_CALL(BPF_FUNC_ ## func)
+
 /* BPF_SK_LOOKUP contains 13 instructions, if you need to fix up maps */
 #define BPF_SK_LOOKUP(func)						\
 	/* struct bpf_sock_tuple tuple = {} */				\
diff --git a/tools/testing/selftests/bpf/verifier/ref_tracking.c b/tools/testing/selftests/bpf/verifier/ref_tracking.c
index 604b461..de5c550a 100644
--- a/tools/testing/selftests/bpf/verifier/ref_tracking.c
+++ b/tools/testing/selftests/bpf/verifier/ref_tracking.c
@@ -21,6 +21,17 @@
 	.result = REJECT,
 },
 {
+	"reference tracking: leak potential reference to nf_conn",
+	.insns = {
+	BPF_CT_LOOKUP(ct_lookup_tcp),
+	BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* leak reference */
+	BPF_EXIT_INSN(),
+	},
+	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	.errstr = "Unreleased reference",
+	.result = REJECT,
+},
+{
 	"reference tracking: leak potential reference on stack",
 	.insns = {
 	BPF_SK_LOOKUP(sk_lookup_tcp),
@@ -72,6 +83,17 @@
 	.result = REJECT,
 },
 {
+	"reference tracking: zero potential reference to nf_conn",
+	.insns = {
+	BPF_CT_LOOKUP(ct_lookup_tcp),
+	BPF_MOV64_IMM(BPF_REG_0, 0), /* leak reference */
+	BPF_EXIT_INSN(),
+	},
+	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	.errstr = "Unreleased reference",
+	.result = REJECT,
+},
+{
 	"reference tracking: copy and zero potential references",
 	.insns = {
 	BPF_SK_LOOKUP(sk_lookup_tcp),
@@ -113,6 +135,20 @@
 	.result = REJECT,
 },
 {
+	"reference tracking: release reference to nf_conn without check",
+	.insns = {
+	BPF_CT_LOOKUP(ct_lookup_tcp),
+	/* reference in r0 may be NULL */
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+	BPF_MOV64_IMM(BPF_REG_2, 0),
+	BPF_EMIT_CALL(BPF_FUNC_ct_release),
+	BPF_EXIT_INSN(),
+	},
+	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	.errstr = "type=nf_conn_or_null expected=nf_conn",
+	.result = REJECT,
+},
+{
 	"reference tracking: release reference",
 	.insns = {
 	BPF_SK_LOOKUP(sk_lookup_tcp),
@@ -137,6 +173,18 @@
 	.result = ACCEPT,
 },
 {
+	"reference tracking: release reference to nf_conn",
+	.insns = {
+	BPF_CT_LOOKUP(ct_lookup_tcp),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+	BPF_EMIT_CALL(BPF_FUNC_ct_release),
+	BPF_EXIT_INSN(),
+	},
+	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	.result = ACCEPT,
+},
+{
 	"reference tracking: release reference 2",
 	.insns = {
 	BPF_SK_LOOKUP(sk_lookup_tcp),
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 22+ messages in thread
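
In C, the ACCEPT case above ("release reference to nf_conn") encodes the
pattern the verifier enforces for acquire/release helpers: the
possibly-NULL return value must be checked, and every non-NULL nf_conn
must be released on all paths. A hand-written, untested sketch of the
same logic (helper names from patch 1/2, includes as in the tc example
after that patch):

SEC("classifier")
int ct_ref_ok(struct __sk_buff *skb)
{
	struct bpf_nf_conntrack_tuple tuple = {};
	struct bpf_nf_conn *ct;

	/* Mirrors BPF_CT_LOOKUP(ct_lookup_tcp): zeroed stack tuple,
	 * netns_id 0, flags 0.
	 */
	ct = bpf_ct_lookup_tcp(skb, &tuple, sizeof(tuple), 0, 0);
	if (ct)				/* nf_conn_or_null -> nf_conn */
		bpf_ct_release(ct);	/* no unreleased reference at exit */

	return 0;
}

Dropping either the NULL check or the release turns this into one of
the REJECT cases above.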

* Re: [PATCH bpf-next v2 1/2] bpf: add bpf_ct_lookup_{tcp,udp}() helpers
  2020-01-21 20:20 ` [PATCH bpf-next v2 1/2] " Matthew Cover
  2020-01-21 20:22   ` [PATCH bpf-next v2 2/2] selftests/bpf: test references to nf_conn Matthew Cover
@ 2020-01-21 20:35   ` Matt Cover
  2020-01-21 21:31     ` Matt Cover
  2020-01-24 19:11     ` Joe Stringer
  1 sibling, 2 replies; 22+ messages in thread
From: Matt Cover @ 2020-01-21 20:35 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau, Song Liu,
	Yonghong Song, Andrii Nakryiko, David S. Miller, Shuah Khan,
	Jesper Dangaard Brouer, John Fastabend, Jakub Sitnicki,
	Quentin Monnet, Matthew Cover, Stanislav Fomichev,
	Andrey Ignatov, Lorenz Bauer, netdev, bpf, linux-kernel,
	linux-kselftest, Pablo Neira Ayuso, Jozsef Kadlecsik,
	Florian Westphal, coreteam

On Tue, Jan 21, 2020 at 1:20 PM Matthew Cover <werekraken@gmail.com> wrote:
>
> Allow looking up an nf_conn. This allows eBPF programs to leverage
> nf_conntrack state for similar purposes to socket state use cases,
> as provided by the socket lookup helpers. This is particularly
> useful when nf_conntrack state is locally available, but socket
> state is not.
>
> v2:
>   - Fix functions in need of and missing static inline (kbuild)
>   - Move tests to separate patch and submit as a series (John)
>   - Improve clarity in helper documentation (John)
>   - Add CONFIG_NF_CONNTRACK=m support (Daniel)

Sorry, missed additional maintainers for v2 changes.

+Pablo Neira Ayuso <pablo@netfilter.org>
+Jozsef Kadlecsik <kadlec@netfilter.org>
+Florian Westphal <fw@strlen.de>
+coreteam@netfilter.org

>
> Signed-off-by: Matthew Cover <matthew.cover@stackpath.com>
> ---
>  include/linux/bpf.h               |  29 ++++
>  include/linux/netfilter.h         |  12 ++
>  include/uapi/linux/bpf.h          | 111 ++++++++++++++-
>  kernel/bpf/verifier.c             | 105 ++++++++++++++-
>  net/core/filter.c                 | 277 ++++++++++++++++++++++++++++++++++++++
>  net/netfilter/core.c              |  16 +++
>  net/netfilter/nf_conntrack_core.c |   1 +
>  scripts/bpf_helpers_doc.py        |   4 +
>  tools/include/uapi/linux/bpf.h    | 111 ++++++++++++++-
>  9 files changed, 658 insertions(+), 8 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 8e3b8f4..f502e1f 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -239,6 +239,7 @@ enum bpf_arg_type {
>         ARG_PTR_TO_LONG,        /* pointer to long */
>         ARG_PTR_TO_SOCKET,      /* pointer to bpf_sock (fullsock) */
>         ARG_PTR_TO_BTF_ID,      /* pointer to in-kernel struct */
> +       ARG_PTR_TO_NF_CONN,     /* pointer to bpf_nf_conn */
>  };
>
>  /* type of values returned from helper functions */
> @@ -250,6 +251,7 @@ enum bpf_return_type {
>         RET_PTR_TO_SOCKET_OR_NULL,      /* returns a pointer to a socket or NULL */
>         RET_PTR_TO_TCP_SOCK_OR_NULL,    /* returns a pointer to a tcp_sock or NULL */
>         RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */
> +       RET_PTR_TO_NF_CONN_OR_NULL,     /* returns a pointer to a nf_conn or NULL */
>  };
>
>  /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs
> @@ -316,6 +318,8 @@ enum bpf_reg_type {
>         PTR_TO_TP_BUFFER,        /* reg points to a writable raw tp's buffer */
>         PTR_TO_XDP_SOCK,         /* reg points to struct xdp_sock */
>         PTR_TO_BTF_ID,           /* reg points to kernel struct */
> +       PTR_TO_NF_CONN,          /* reg points to struct nf_conn */
> +       PTR_TO_NF_CONN_OR_NULL,  /* reg points to struct nf_conn or NULL */
>  };
>
>  /* The information passed from prog-specific *_is_valid_access
> @@ -1513,4 +1517,29 @@ enum bpf_text_poke_type {
>  int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
>                        void *addr1, void *addr2);
>
> +#if IS_ENABLED(CONFIG_NF_CONNTRACK)
> +bool bpf_nf_conn_is_valid_access(int off, int size, enum bpf_access_type type,
> +                                struct bpf_insn_access_aux *info);
> +
> +u32 bpf_nf_conn_convert_ctx_access(enum bpf_access_type type,
> +                                  const struct bpf_insn *si,
> +                                  struct bpf_insn *insn_buf,
> +                                  struct bpf_prog *prog, u32 *target_size);
> +#else
> +static inline bool bpf_nf_conn_is_valid_access(int off, int size,
> +                               enum bpf_access_type type,
> +                               struct bpf_insn_access_aux *info)
> +{
> +       return false;
> +}
> +
> +static inline u32 bpf_nf_conn_convert_ctx_access(enum bpf_access_type type,
> +                               const struct bpf_insn *si,
> +                               struct bpf_insn *insn_buf,
> +                               struct bpf_prog *prog, u32 *target_size)
> +{
> +       return 0;
> +}
> +#endif /* CONFIG_NF_CONNTRACK */
> +
>  #endif /* _LINUX_BPF_H */
> diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
> index eb312e7..a360ced 100644
> --- a/include/linux/netfilter.h
> +++ b/include/linux/netfilter.h
> @@ -451,6 +451,9 @@ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
>  struct nf_conntrack_tuple;
>  bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
>                          const struct sk_buff *skb);
> +struct nf_conntrack_tuple_hash *
> +nf_ct_find_get(struct net *net, const struct nf_conntrack_zone *zone,
> +              const struct nf_conntrack_tuple *tuple);
>  #else
>  static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {}
>  struct nf_conntrack_tuple;
> @@ -459,6 +462,12 @@ static inline bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
>  {
>         return false;
>  }
> +static inline struct nf_conntrack_tuple_hash *
> +nf_ct_find_get(struct net *net, const struct nf_conntrack_zone *zone,
> +              const struct nf_conntrack_tuple *tuple)
> +{
> +       return NULL;
> +}
>  #endif
>
>  struct nf_conn;
> @@ -469,6 +478,9 @@ struct nf_ct_hook {
>         void (*destroy)(struct nf_conntrack *);
>         bool (*get_tuple_skb)(struct nf_conntrack_tuple *,
>                               const struct sk_buff *);
> +       struct nf_conntrack_tuple_hash *
> +       (*find_get)(struct net *net, const struct nf_conntrack_zone *zone,
> +                    const struct nf_conntrack_tuple *tuple);
>  };
>  extern struct nf_ct_hook __rcu *nf_ct_hook;
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 033d90a..85c4b3f 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -2885,6 +2885,88 @@ struct bpf_stack_build_id {
>   *             **-EPERM** if no permission to send the *sig*.
>   *
>   *             **-EAGAIN** if bpf program can try again.
> + *
> + * struct bpf_nf_conn *bpf_ct_lookup_tcp(void *ctx, struct bpf_nf_conntrack_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
> + *     Description
> + *             Look for a TCP nf_conntrack entry matching *tuple*, optionally in
> + *             a child network namespace *netns*. The return value must be
> + *             checked, and if non-**NULL**, released via
> + *             **bpf_ct_release**\ ().
> + *
> + *             The *ctx* should point to the context of the program, such as
> + *             the skb or xdp_md (depending on the hook in use). This is used
> + *             to determine the base network namespace for the lookup.
> + *
> + *             *tuple_size* must be one of:
> + *
> + *             **sizeof**\ (*tuple*\ **->ipv4**)
> + *                     Look for an IPv4 nf_conn.
> + *             **sizeof**\ (*tuple*\ **->ipv6**)
> + *                     Look for an IPv6 nf_conn.
> + *
> + *             If *netns* is a negative signed 32-bit integer, then the
> + *             nf_conn lookup table in the netns associated with the *ctx*
> + *             will be used. For the TC hooks, this is the netns of the device
> + *             in the skb. For XDP hooks, this is the netns of the device in
> + *             the xdp_md. If *netns* is any other signed 32-bit value greater
> + *             than or equal to zero then it specifies the ID of the netns
> + *             relative to the netns associated with the *ctx*. *netns* values
> + *             beyond the range of 32-bit integers are reserved for future
> + *             use.
> + *
> + *             All values for *flags* are reserved for future usage, and must
> + *             be left at zero.
> + *
> + *             This helper will always return **NULL** if the kernel was compiled
> + *             without **CONFIG_NF_CONNTRACK**.
> + *     Return
> + *             Pointer to **struct bpf_nf_conn**, or **NULL** in case of
> + *             failure.
> + *
> + * struct bpf_nf_conn *bpf_ct_lookup_udp(void *ctx, struct bpf_nf_conntrack_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
> + *     Description
> + *             Look for a UDP nf_conntrack entry matching *tuple*, optionally in
> + *             a child network namespace *netns*. The return value must be
> + *             checked, and if non-**NULL**, released via
> + *             **bpf_ct_release**\ ().
> + *
> + *             The *ctx* should point to the context of the program, such as
> + *             the skb or xdp_md (depending on the hook in use). This is used
> + *             to determine the base network namespace for the lookup.
> + *
> + *             *tuple_size* must be one of:
> + *
> + *             **sizeof**\ (*tuple*\ **->ipv4**)
> + *                     Look for an IPv4 nf_conn.
> + *             **sizeof**\ (*tuple*\ **->ipv6**)
> + *                     Look for an IPv6 nf_conn.
> + *
> + *             If *netns* is a negative signed 32-bit integer, then the
> + *             nf_conn lookup table in the netns associated with the *ctx*
> + *             will be used. For the TC hooks, this is the netns of the device
> + *             in the skb. For XDP hooks, this is the netns of the device in
> + *             the xdp_md. If *netns* is any other signed 32-bit value greater
> + *             than or equal to zero then it specifies the ID of the netns
> + *             relative to the netns associated with the *ctx*. *netns* values
> + *             beyond the range of 32-bit integers are reserved for future
> + *             use.
> + *
> + *             All values for *flags* are reserved for future usage, and must
> + *             be left at zero.
> + *
> + *             This helper will always return **NULL** if the kernel was compiled
> + *             without **CONFIG_NF_CONNTRACK**.
> + *     Return
> + *             Pointer to **struct bpf_nf_conn**, or **NULL** in case of
> + *             failure.
> + *
> + * int bpf_ct_release(struct bpf_nf_conn *ct)
> + *     Description
> + *             Release the reference held by *ct*. *ct* must be a
> + *             non-**NULL** pointer that was returned from
> + *             **bpf_ct_lookup_xxx**\ ().
> + *     Return
> + *             0 on success, or a negative error in case of failure.
>   */
>  #define __BPF_FUNC_MAPPER(FN)          \
>         FN(unspec),                     \
> @@ -3004,7 +3086,10 @@ struct bpf_stack_build_id {
>         FN(probe_read_user_str),        \
>         FN(probe_read_kernel_str),      \
>         FN(tcp_send_ack),               \
> -       FN(send_signal_thread),
> +       FN(send_signal_thread),         \
> +       FN(ct_lookup_tcp),              \
> +       FN(ct_lookup_udp),              \
> +       FN(ct_release),
>
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
> @@ -3278,6 +3363,30 @@ struct bpf_sock_tuple {
>         };
>  };
>
> +struct bpf_nf_conn {
> +       __u32 cpu;
> +       __u32 mark;
> +       __u32 status;
> +       __u32 timeout;
> +};
> +
> +struct bpf_nf_conntrack_tuple {
> +       union {
> +               struct {
> +                       __be32 saddr;
> +                       __be32 daddr;
> +                       __be16 sport;
> +                       __be16 dport;
> +               } ipv4;
> +               struct {
> +                       __be32 saddr[4];
> +                       __be32 daddr[4];
> +                       __be16 sport;
> +                       __be16 dport;
> +               } ipv6;
> +       };
> +};
> +
>  struct bpf_xdp_sock {
>         __u32 queue_id;
>  };
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index ca17dccc..0ea0ee7 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -362,6 +362,11 @@ static const char *ltrim(const char *s)
>         env->prev_linfo = linfo;
>  }
>
> +static bool type_is_nf_ct_pointer(enum bpf_reg_type type)
> +{
> +       return type == PTR_TO_NF_CONN;
> +}
> +
>  static bool type_is_pkt_pointer(enum bpf_reg_type type)
>  {
>         return type == PTR_TO_PACKET ||
> @@ -381,7 +386,8 @@ static bool reg_type_may_be_null(enum bpf_reg_type type)
>         return type == PTR_TO_MAP_VALUE_OR_NULL ||
>                type == PTR_TO_SOCKET_OR_NULL ||
>                type == PTR_TO_SOCK_COMMON_OR_NULL ||
> -              type == PTR_TO_TCP_SOCK_OR_NULL;
> +              type == PTR_TO_TCP_SOCK_OR_NULL ||
> +              type == PTR_TO_NF_CONN_OR_NULL;
>  }
>
>  static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
> @@ -395,12 +401,15 @@ static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type)
>         return type == PTR_TO_SOCKET ||
>                 type == PTR_TO_SOCKET_OR_NULL ||
>                 type == PTR_TO_TCP_SOCK ||
> -               type == PTR_TO_TCP_SOCK_OR_NULL;
> +               type == PTR_TO_TCP_SOCK_OR_NULL ||
> +               type == PTR_TO_NF_CONN ||
> +               type == PTR_TO_NF_CONN_OR_NULL;
>  }
>
>  static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
>  {
> -       return type == ARG_PTR_TO_SOCK_COMMON;
> +       return type == ARG_PTR_TO_SOCK_COMMON ||
> +               type == ARG_PTR_TO_NF_CONN;
>  }
>
>  /* Determine whether the function releases some resources allocated by another
> @@ -409,14 +418,17 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
>   */
>  static bool is_release_function(enum bpf_func_id func_id)
>  {
> -       return func_id == BPF_FUNC_sk_release;
> +       return func_id == BPF_FUNC_sk_release ||
> +               func_id == BPF_FUNC_ct_release;
>  }
>
>  static bool is_acquire_function(enum bpf_func_id func_id)
>  {
>         return func_id == BPF_FUNC_sk_lookup_tcp ||
>                 func_id == BPF_FUNC_sk_lookup_udp ||
> -               func_id == BPF_FUNC_skc_lookup_tcp;
> +               func_id == BPF_FUNC_skc_lookup_tcp ||
> +               func_id == BPF_FUNC_ct_lookup_tcp ||
> +               func_id == BPF_FUNC_ct_lookup_udp;
>  }
>
>  static bool is_ptr_cast_function(enum bpf_func_id func_id)
> @@ -447,6 +459,8 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id)
>         [PTR_TO_TP_BUFFER]      = "tp_buffer",
>         [PTR_TO_XDP_SOCK]       = "xdp_sock",
>         [PTR_TO_BTF_ID]         = "ptr_",
> +       [PTR_TO_NF_CONN]        = "nf_conn",
> +       [PTR_TO_NF_CONN_OR_NULL] = "nf_conn_or_null",
>  };
>
>  static char slot_type_char[] = {
> @@ -1913,6 +1927,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
>         case PTR_TO_TCP_SOCK_OR_NULL:
>         case PTR_TO_XDP_SOCK:
>         case PTR_TO_BTF_ID:
> +       case PTR_TO_NF_CONN:
> +       case PTR_TO_NF_CONN_OR_NULL:
>                 return true;
>         default:
>                 return false;
> @@ -2440,6 +2456,35 @@ static int check_flow_keys_access(struct bpf_verifier_env *env, int off,
>         return 0;
>  }
>
> +static int check_nf_ct_access(struct bpf_verifier_env *env, int insn_idx,
> +                            u32 regno, int off, int size,
> +                            enum bpf_access_type t)
> +{
> +       struct bpf_reg_state *regs = cur_regs(env);
> +       struct bpf_reg_state *reg = &regs[regno];
> +       struct bpf_insn_access_aux info = {};
> +       bool valid;
> +
> +       switch (reg->type) {
> +       case PTR_TO_NF_CONN:
> +               valid = bpf_nf_conn_is_valid_access(off, size, t, &info);
> +               break;
> +       default:
> +               valid = false;
> +       }
> +
> +       if (valid) {
> +               env->insn_aux_data[insn_idx].ctx_field_size =
> +                       info.ctx_field_size;
> +               return 0;
> +       }
> +
> +       verbose(env, "R%d invalid %s access off=%d size=%d\n",
> +               regno, reg_type_str[reg->type], off, size);
> +
> +       return -EACCES;
> +}
> +
>  static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
>                              u32 regno, int off, int size,
>                              enum bpf_access_type t)
> @@ -2511,6 +2556,13 @@ static bool is_ctx_reg(struct bpf_verifier_env *env, int regno)
>         return reg->type == PTR_TO_CTX;
>  }
>
> +static bool is_nf_ct_reg(struct bpf_verifier_env *env, int regno)
> +{
> +       const struct bpf_reg_state *reg = reg_state(env, regno);
> +
> +       return type_is_nf_ct_pointer(reg->type);
> +}
> +
>  static bool is_sk_reg(struct bpf_verifier_env *env, int regno)
>  {
>         const struct bpf_reg_state *reg = reg_state(env, regno);
> @@ -2635,6 +2687,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
>         case PTR_TO_XDP_SOCK:
>                 pointer_desc = "xdp_sock ";
>                 break;
> +       case PTR_TO_NF_CONN:
> +               pointer_desc = "nf_conn ";
> +               break;
>         default:
>                 break;
>         }
> @@ -3050,6 +3105,15 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
>                 err = check_sock_access(env, insn_idx, regno, off, size, t);
>                 if (!err && value_regno >= 0)
>                         mark_reg_unknown(env, regs, value_regno);
> +       } else if (type_is_nf_ct_pointer(reg->type)) {
> +               if (t == BPF_WRITE) {
> +                       verbose(env, "R%d cannot write into %s\n",
> +                               regno, reg_type_str[reg->type]);
> +                       return -EACCES;
> +               }
> +               err = check_nf_ct_access(env, insn_idx, regno, off, size, t);
> +               if (!err && value_regno >= 0)
> +                       mark_reg_unknown(env, regs, value_regno);
>         } else if (reg->type == PTR_TO_TP_BUFFER) {
>                 err = check_tp_buffer_access(env, reg, regno, off, size);
>                 if (!err && t == BPF_READ && value_regno >= 0)
> @@ -3099,7 +3163,8 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
>         if (is_ctx_reg(env, insn->dst_reg) ||
>             is_pkt_reg(env, insn->dst_reg) ||
>             is_flow_key_reg(env, insn->dst_reg) ||
> -           is_sk_reg(env, insn->dst_reg)) {
> +           is_sk_reg(env, insn->dst_reg) ||
> +           is_nf_ct_reg(env, insn->dst_reg)) {
>                 verbose(env, "BPF_XADD stores into R%d %s is not allowed\n",
>                         insn->dst_reg,
>                         reg_type_str[reg_state(env, insn->dst_reg)->type]);
> @@ -3501,6 +3566,19 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
>                                 regno);
>                         return -EACCES;
>                 }
> +       } else if (arg_type == ARG_PTR_TO_NF_CONN) {
> +               expected_type = PTR_TO_NF_CONN;
> +               if (!type_is_nf_ct_pointer(type))
> +                       goto err_type;
> +               if (reg->ref_obj_id) {
> +                       if (meta->ref_obj_id) {
> +                               verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
> +                                       regno, reg->ref_obj_id,
> +                                       meta->ref_obj_id);
> +                               return -EFAULT;
> +                       }
> +                       meta->ref_obj_id = reg->ref_obj_id;
> +               }
>         } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) {
>                 if (meta->func_id == BPF_FUNC_spin_lock) {
>                         if (process_spin_lock(env, regno, true))
> @@ -4368,6 +4446,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
>                 mark_reg_known_zero(env, regs, BPF_REG_0);
>                 regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL;
>                 regs[BPF_REG_0].id = ++env->id_gen;
> +       } else if (fn->ret_type == RET_PTR_TO_NF_CONN_OR_NULL) {
> +               mark_reg_known_zero(env, regs, BPF_REG_0);
> +               regs[BPF_REG_0].type = PTR_TO_NF_CONN_OR_NULL;
> +               regs[BPF_REG_0].id = ++env->id_gen;
>         } else {
>                 verbose(env, "unknown return type %d of func %s#%d\n",
>                         fn->ret_type, func_id_name(func_id), func_id);
> @@ -4649,6 +4731,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
>         case PTR_TO_TCP_SOCK:
>         case PTR_TO_TCP_SOCK_OR_NULL:
>         case PTR_TO_XDP_SOCK:
> +       case PTR_TO_NF_CONN:
> +       case PTR_TO_NF_CONN_OR_NULL:
>                 verbose(env, "R%d pointer arithmetic on %s prohibited\n",
>                         dst, reg_type_str[ptr_reg->type]);
>                 return -EACCES;
> @@ -5915,6 +5999,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
>                         reg->type = PTR_TO_SOCK_COMMON;
>                 } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) {
>                         reg->type = PTR_TO_TCP_SOCK;
> +               } else if (reg->type == PTR_TO_NF_CONN_OR_NULL) {
> +                       reg->type = PTR_TO_NF_CONN;
>                 }
>                 if (is_null) {
>                         /* We don't need id and ref_obj_id from this point
> @@ -7232,6 +7318,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
>         case PTR_TO_TCP_SOCK:
>         case PTR_TO_TCP_SOCK_OR_NULL:
>         case PTR_TO_XDP_SOCK:
> +       case PTR_TO_NF_CONN:
> +       case PTR_TO_NF_CONN_OR_NULL:
>                 /* Only valid matches are exact, which memcmp() above
>                  * would have accepted
>                  */
> @@ -7760,6 +7848,8 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type)
>         case PTR_TO_TCP_SOCK_OR_NULL:
>         case PTR_TO_XDP_SOCK:
>         case PTR_TO_BTF_ID:
> +       case PTR_TO_NF_CONN:
> +       case PTR_TO_NF_CONN_OR_NULL:
>                 return false;
>         default:
>                 return true;
> @@ -8867,6 +8957,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
>                                 return -EINVAL;
>                         }
>                         continue;
> +               case PTR_TO_NF_CONN:
> +                       convert_ctx_access = bpf_nf_conn_convert_ctx_access;
> +                       break;
>                 default:
>                         continue;
>                 }
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 17de674..80319d3 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -74,6 +74,12 @@
>  #include <net/ipv6_stubs.h>
>  #include <net/bpf_sk_storage.h>
>
> +#if IS_ENABLED(CONFIG_NF_CONNTRACK)
> +#include <net/netfilter/nf_conntrack_tuple.h>
> +#include <net/netfilter/nf_conntrack_core.h>
> +#include <net/netfilter/nf_conntrack.h>
> +#endif
> +
>  /**
>   *     sk_filter_trim_cap - run a packet through a socket filter
>   *     @sk: sock associated with &sk_buff
> @@ -5122,6 +5128,253 @@ static void bpf_update_srh_state(struct sk_buff *skb)
>  };
>  #endif /* CONFIG_IPV6_SEG6_BPF */
>
> +#if IS_ENABLED(CONFIG_NF_CONNTRACK)
> +bool bpf_nf_conn_is_valid_access(int off, int size, enum bpf_access_type type,
> +                                struct bpf_insn_access_aux *info)
> +{
> +       if (off < 0 || off >= offsetofend(struct bpf_nf_conn,
> +                                         timeout))
> +               return false;
> +
> +       if (off % size != 0)
> +               return false;
> +
> +       return size == sizeof(__u32);
> +}
> +
> +u32 bpf_nf_conn_convert_ctx_access(enum bpf_access_type type,
> +                                  const struct bpf_insn *si,
> +                                  struct bpf_insn *insn_buf,
> +                                  struct bpf_prog *prog, u32 *target_size)
> +{
> +       struct bpf_insn *insn = insn_buf;
> +
> +       switch (si->off) {
> +       case offsetof(struct bpf_nf_conn, cpu):
> +               BUILD_BUG_ON(FIELD_SIZEOF(struct nf_conn, cpu) != 2);
> +
> +               *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
> +                                     offsetof(struct nf_conn, cpu));
> +
> +               break;
> +
> +       case offsetof(struct bpf_nf_conn, mark):
> +#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
> +               BUILD_BUG_ON(FIELD_SIZEOF(struct nf_conn, mark) != 4);
> +
> +               *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
> +                                     offsetof(struct nf_conn, mark));
> +#else
> +               *target_size = 4;
> +               *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
> +#endif
> +               break;
> +
> +       case offsetof(struct bpf_nf_conn, status):
> +               BUILD_BUG_ON(FIELD_SIZEOF(struct nf_conn, status) < 4 ||
> +                            __IPS_MAX_BIT > 32);
> +
> +               *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
> +                                     offsetof(struct nf_conn, status));
> +
> +               break;
> +
> +       case offsetof(struct bpf_nf_conn, timeout):
> +               BUILD_BUG_ON(FIELD_SIZEOF(struct nf_conn, timeout) != 4);
> +
> +               *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
> +                                     offsetof(struct nf_conn, timeout));
> +
> +               break;
> +       }
> +
> +       return insn - insn_buf;
> +}
> +
> +static struct nf_conn *
> +ct_lookup(struct net *net, struct bpf_nf_conntrack_tuple *tuple,
> +         u8 family, u8 proto)
> +{
> +       struct nf_conntrack_tuple_hash *hash;
> +       struct nf_conntrack_tuple tup;
> +       struct nf_conn *ct = NULL;
> +
> +       memset(&tup, 0, sizeof(tup));
> +
> +       tup.dst.protonum = proto;
> +       tup.src.l3num = family;
> +
> +       if (family == AF_INET) {
> +               tup.src.u3.ip = tuple->ipv4.saddr;
> +               tup.dst.u3.ip = tuple->ipv4.daddr;
> +               tup.src.u.tcp.port = tuple->ipv4.sport;
> +               tup.dst.u.tcp.port = tuple->ipv4.dport;
> +#if IS_ENABLED(CONFIG_IPV6)
> +       } else {
> +               memcpy(tup.src.u3.ip6, tuple->ipv6.saddr, sizeof(tup.src.u3.ip6));
> +               memcpy(tup.dst.u3.ip6, tuple->ipv6.daddr, sizeof(tup.dst.u3.ip6));
> +               tup.src.u.tcp.port = tuple->ipv6.sport;
> +               tup.dst.u.tcp.port = tuple->ipv6.dport;
> +#endif
> +       }
> +
> +       hash = nf_ct_find_get(net, &nf_ct_zone_dflt, &tup);
> +       if (!hash)
> +               goto out;
> +       ct = nf_ct_tuplehash_to_ctrack(hash);
> +
> +out:
> +       return ct;
> +}
> +
> +static struct nf_conn *
> +__bpf_ct_lookup(struct sk_buff *skb, struct bpf_nf_conntrack_tuple *tuple, u32 len,
> +               struct net *caller_net, u8 proto, u64 netns_id, u64 flags)
> +{
> +       struct nf_conn *ct = NULL;
> +       u8 family = AF_UNSPEC;
> +       struct net *net;
> +
> +       if (len == sizeof(tuple->ipv4))
> +               family = AF_INET;
> +       else if (len == sizeof(tuple->ipv6))
> +               family = AF_INET6;
> +       else
> +               goto out;
> +
> +       if (unlikely(family == AF_UNSPEC || flags ||
> +                    !((s32)netns_id < 0 || netns_id <= S32_MAX)))
> +               goto out;
> +
> +       if ((s32)netns_id < 0) {
> +               net = caller_net;
> +               ct = ct_lookup(net, tuple, family, proto);
> +       } else {
> +               net = get_net_ns_by_id(caller_net, netns_id);
> +               if (unlikely(!net))
> +                       goto out;
> +               ct = ct_lookup(net, tuple, family, proto);
> +               put_net(net);
> +       }
> +
> +out:
> +       return ct;
> +}
> +
> +static struct nf_conn *
> +bpf_ct_lookup(struct sk_buff *skb, struct bpf_nf_conntrack_tuple *tuple, u32 len,
> +             u8 proto, u64 netns_id, u64 flags)
> +{
> +       struct net *caller_net;
> +
> +       if (skb->dev) {
> +               caller_net = dev_net(skb->dev);
> +       } else {
> +               caller_net = sock_net(skb->sk);
> +       }
> +
> +       return __bpf_ct_lookup(skb, tuple, len, caller_net, proto,
> +                              netns_id, flags);
> +}
> +
> +BPF_CALL_5(bpf_ct_lookup_tcp, struct sk_buff *, skb,
> +          struct bpf_nf_conntrack_tuple *, tuple, u32, len, u64, netns_id,
> +          u64, flags)
> +{
> +       return (unsigned long)bpf_ct_lookup(skb, tuple, len, IPPROTO_TCP,
> +                                            netns_id, flags);
> +}
> +
> +static const struct bpf_func_proto bpf_ct_lookup_tcp_proto = {
> +       .func           = bpf_ct_lookup_tcp,
> +       .gpl_only       = true,
> +       .pkt_access     = true,
> +       .ret_type       = RET_PTR_TO_NF_CONN_OR_NULL,
> +       .arg1_type      = ARG_PTR_TO_CTX,
> +       .arg2_type      = ARG_PTR_TO_MEM,
> +       .arg3_type      = ARG_CONST_SIZE,
> +       .arg4_type      = ARG_ANYTHING,
> +       .arg5_type      = ARG_ANYTHING,
> +};
> +
> +BPF_CALL_5(bpf_xdp_ct_lookup_tcp, struct xdp_buff *, ctx,
> +          struct bpf_nf_conntrack_tuple *, tuple, u32, len, u32, netns_id,
> +          u64, flags)
> +{
> +       struct net *caller_net = dev_net(ctx->rxq->dev);
> +
> +       return (unsigned long)__bpf_ct_lookup(NULL, tuple, len, caller_net,
> +                                             IPPROTO_TCP, netns_id, flags);
> +}
> +
> +static const struct bpf_func_proto bpf_xdp_ct_lookup_tcp_proto = {
> +       .func           = bpf_xdp_ct_lookup_tcp,
> +       .gpl_only       = true,
> +       .pkt_access     = true,
> +       .ret_type       = RET_PTR_TO_NF_CONN_OR_NULL,
> +       .arg1_type      = ARG_PTR_TO_CTX,
> +       .arg2_type      = ARG_PTR_TO_MEM,
> +       .arg3_type      = ARG_CONST_SIZE,
> +       .arg4_type      = ARG_ANYTHING,
> +       .arg5_type      = ARG_ANYTHING,
> +};
> +
> +BPF_CALL_5(bpf_ct_lookup_udp, struct sk_buff *, skb,
> +          struct bpf_nf_conntrack_tuple *, tuple, u32, len, u64, netns_id,
> +          u64, flags)
> +{
> +       return (unsigned long)bpf_ct_lookup(skb, tuple, len, IPPROTO_UDP,
> +                                            netns_id, flags);
> +}
> +
> +static const struct bpf_func_proto bpf_ct_lookup_udp_proto = {
> +       .func           = bpf_ct_lookup_udp,
> +       .gpl_only       = true,
> +       .pkt_access     = true,
> +       .ret_type       = RET_PTR_TO_NF_CONN_OR_NULL,
> +       .arg1_type      = ARG_PTR_TO_CTX,
> +       .arg2_type      = ARG_PTR_TO_MEM,
> +       .arg3_type      = ARG_CONST_SIZE,
> +       .arg4_type      = ARG_ANYTHING,
> +       .arg5_type      = ARG_ANYTHING,
> +};
> +
> +BPF_CALL_5(bpf_xdp_ct_lookup_udp, struct xdp_buff *, ctx,
> +          struct bpf_nf_conntrack_tuple *, tuple, u32, len, u32, netns_id,
> +          u64, flags)
> +{
> +       struct net *caller_net = dev_net(ctx->rxq->dev);
> +
> +       return (unsigned long)__bpf_ct_lookup(NULL, tuple, len, caller_net,
> +                                             IPPROTO_UDP, netns_id, flags);
> +}
> +
> +static const struct bpf_func_proto bpf_xdp_ct_lookup_udp_proto = {
> +       .func           = bpf_xdp_ct_lookup_udp,
> +       .gpl_only       = true,
> +       .pkt_access     = true,
> +       .ret_type       = RET_PTR_TO_NF_CONN_OR_NULL,
> +       .arg1_type      = ARG_PTR_TO_CTX,
> +       .arg2_type      = ARG_PTR_TO_MEM,
> +       .arg3_type      = ARG_CONST_SIZE,
> +       .arg4_type      = ARG_ANYTHING,
> +       .arg5_type      = ARG_ANYTHING,
> +};
> +
> +BPF_CALL_1(bpf_ct_release, struct nf_conn *, ct)
> +{
> +       nf_conntrack_put(&ct->ct_general);
> +       return 0;
> +}
> +
> +static const struct bpf_func_proto bpf_ct_release_proto = {
> +       .func           = bpf_ct_release,
> +       .gpl_only       = true,
> +       .ret_type       = RET_INTEGER,
> +       .arg1_type      = ARG_PTR_TO_NF_CONN,
> +};
> +#endif
> +
>  #ifdef CONFIG_INET
>  static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
>                               int dif, int sdif, u8 family, u8 proto)
> @@ -6139,6 +6392,14 @@ bool bpf_helper_changes_pkt_data(void *func)
>         case BPF_FUNC_tcp_gen_syncookie:
>                 return &bpf_tcp_gen_syncookie_proto;
>  #endif
> +#if IS_ENABLED(CONFIG_NF_CONNTRACK)
> +       case BPF_FUNC_ct_lookup_tcp:
> +               return &bpf_ct_lookup_tcp_proto;
> +       case BPF_FUNC_ct_lookup_udp:
> +               return &bpf_ct_lookup_udp_proto;
> +       case BPF_FUNC_ct_release:
> +               return &bpf_ct_release_proto;
> +#endif
>         default:
>                 return bpf_base_func_proto(func_id);
>         }
> @@ -6180,6 +6441,14 @@ bool bpf_helper_changes_pkt_data(void *func)
>         case BPF_FUNC_tcp_gen_syncookie:
>                 return &bpf_tcp_gen_syncookie_proto;
>  #endif
> +#if IS_ENABLED(CONFIG_NF_CONNTRACK)
> +       case BPF_FUNC_ct_lookup_tcp:
> +               return &bpf_xdp_ct_lookup_tcp_proto;
> +       case BPF_FUNC_ct_lookup_udp:
> +               return &bpf_xdp_ct_lookup_udp_proto;
> +       case BPF_FUNC_ct_release:
> +               return &bpf_ct_release_proto;
> +#endif
>         default:
>                 return bpf_base_func_proto(func_id);
>         }
> @@ -6284,6 +6553,14 @@ bool bpf_helper_changes_pkt_data(void *func)
>         case BPF_FUNC_skc_lookup_tcp:
>                 return &bpf_skc_lookup_tcp_proto;
>  #endif
> +#if IS_ENABLED(CONFIG_NF_CONNTRACK)
> +       case BPF_FUNC_ct_lookup_tcp:
> +               return &bpf_ct_lookup_tcp_proto;
> +       case BPF_FUNC_ct_lookup_udp:
> +               return &bpf_ct_lookup_udp_proto;
> +       case BPF_FUNC_ct_release:
> +               return &bpf_ct_release_proto;
> +#endif
>         default:
>                 return bpf_base_func_proto(func_id);
>         }
> diff --git a/net/netfilter/core.c b/net/netfilter/core.c
> index 78f046e..855c6b0 100644
> --- a/net/netfilter/core.c
> +++ b/net/netfilter/core.c
> @@ -617,6 +617,22 @@ bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
>  }
>  EXPORT_SYMBOL(nf_ct_get_tuple_skb);
>
> +struct nf_conntrack_tuple_hash *
> +nf_ct_find_get(struct net *net, const struct nf_conntrack_zone *zone,
> +              const struct nf_conntrack_tuple *tuple)
> +{
> +       struct nf_ct_hook *ct_hook;
> +       struct nf_conntrack_tuple_hash *ret = NULL;
> +
> +       rcu_read_lock();
> +       ct_hook = rcu_dereference(nf_ct_hook);
> +       if (ct_hook)
> +               ret = ct_hook->find_get(net, zone, tuple);
> +       rcu_read_unlock();
> +       return ret;
> +}
> +EXPORT_SYMBOL_GPL(nf_ct_find_get);
> +
>  /* Built-in default zone used e.g. by modules. */
>  const struct nf_conntrack_zone nf_ct_zone_dflt = {
>         .id     = NF_CT_DEFAULT_ZONE_ID,
> diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> index f4c4b46..a44df88 100644
> --- a/net/netfilter/nf_conntrack_core.c
> +++ b/net/netfilter/nf_conntrack_core.c
> @@ -2484,6 +2484,7 @@ int nf_conntrack_init_start(void)
>         .update         = nf_conntrack_update,
>         .destroy        = destroy_conntrack,
>         .get_tuple_skb  = nf_conntrack_get_tuple_skb,
> +       .find_get       = nf_conntrack_find_get,
>  };
>
>  void nf_conntrack_init_end(void)
> diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py
> index 90baf7d..26f0c2a 100755
> --- a/scripts/bpf_helpers_doc.py
> +++ b/scripts/bpf_helpers_doc.py
> @@ -398,6 +398,8 @@ class PrinterHelpers(Printer):
>
>      type_fwds = [
>              'struct bpf_fib_lookup',
> +            'struct bpf_nf_conn',
> +            'struct bpf_nf_conntrack_tuple',
>              'struct bpf_perf_event_data',
>              'struct bpf_perf_event_value',
>              'struct bpf_sock',
> @@ -433,6 +435,8 @@ class PrinterHelpers(Printer):
>              '__wsum',
>
>              'struct bpf_fib_lookup',
> +            'struct bpf_nf_conn',
> +            'struct bpf_nf_conntrack_tuple',
>              'struct bpf_perf_event_data',
>              'struct bpf_perf_event_value',
>              'struct bpf_sock',
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index 033d90a..85c4b3f 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -2885,6 +2885,88 @@ struct bpf_stack_build_id {
>   *             **-EPERM** if no permission to send the *sig*.
>   *
>   *             **-EAGAIN** if bpf program can try again.
> + *
> + * struct bpf_nf_conn *bpf_ct_lookup_tcp(void *ctx, struct bpf_nf_conntrack_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
> + *     Description
> + *             Look for a TCP nf_conntrack entry matching *tuple*, optionally in
> + *             a child network namespace *netns*. The return value must be
> + *             checked, and if non-**NULL**, released via
> + *             **bpf_ct_release**\ ().
> + *
> + *             The *ctx* should point to the context of the program, such as
> + *             the skb or xdp_md (depending on the hook in use). This is used
> + *             to determine the base network namespace for the lookup.
> + *
> + *             *tuple_size* must be one of:
> + *
> + *             **sizeof**\ (*tuple*\ **->ipv4**)
> + *                     Look for an IPv4 nf_conn.
> + *             **sizeof**\ (*tuple*\ **->ipv6**)
> + *                     Look for an IPv6 nf_conn.
> + *
> + *             If *netns* is a negative signed 32-bit integer, then the
> + *             nf_conn lookup table in the netns associated with the *ctx*
> + *             will be used. For the TC hooks, this is the netns of the device
> + *             in the skb. For XDP hooks, this is the netns of the device in
> + *             the xdp_md. If *netns* is any other signed 32-bit value greater
> + *             than or equal to zero then it specifies the ID of the netns
> + *             relative to the netns associated with the *ctx*. *netns* values
> + *             beyond the range of 32-bit integers are reserved for future
> + *             use.
> + *
> + *             All values for *flags* are reserved for future usage, and must
> + *             be left at zero.
> + *
> + *             This helper will always return **NULL** if the kernel was compiled
> + *             without **CONFIG_NF_CONNTRACK**.
> + *     Return
> + *             Pointer to **struct bpf_nf_conn**, or **NULL** in case of
> + *             failure.
> + *
> + * struct bpf_nf_conn *bpf_ct_lookup_udp(void *ctx, struct bpf_nf_conntrack_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
> + *     Description
> + *             Look for a UDP nf_conntrack entry matching *tuple*, optionally in
> + *             a child network namespace *netns*. The return value must be
> + *             checked, and if non-**NULL**, released via
> + *             **bpf_ct_release**\ ().
> + *
> + *             The *ctx* should point to the context of the program, such as
> + *             the skb or xdp_md (depending on the hook in use). This is used
> + *             to determine the base network namespace for the lookup.
> + *
> + *             *tuple_size* must be one of:
> + *
> + *             **sizeof**\ (*tuple*\ **->ipv4**)
> + *                     Look for an IPv4 nf_conn.
> + *             **sizeof**\ (*tuple*\ **->ipv6**)
> + *                     Look for an IPv6 nf_conn.
> + *
> + *             If *netns* is a negative signed 32-bit integer, then the
> + *             nf_conn lookup table in the netns associated with the *ctx*
> + *             will be used. For the TC hooks, this is the netns of the device
> + *             in the skb. For XDP hooks, this is the netns of the device in
> + *             the xdp_md. If *netns* is any other signed 32-bit value greater
> + *             than or equal to zero then it specifies the ID of the netns
> + *             relative to the netns associated with the *ctx*. *netns* values
> + *             beyond the range of 32-bit integers are reserved for future
> + *             use.
> + *
> + *             All values for *flags* are reserved for future usage, and must
> + *             be left at zero.
> + *
> + *             This helper will always return **NULL** if the kernel was compiled
> + *             without **CONFIG_NF_CONNTRACK**.
> + *     Return
> + *             Pointer to **struct bpf_nf_conn**, or **NULL** in case of
> + *             failure.
> + *
> + * int bpf_ct_release(struct bpf_nf_conn *ct)
> + *     Description
> + *             Release the reference held by *ct*. *ct* must be a
> + *             non-**NULL** pointer that was returned from
> + *             **bpf_ct_lookup_xxx**\ ().
> + *     Return
> + *             0 on success, or a negative error in case of failure.
>   */
>  #define __BPF_FUNC_MAPPER(FN)          \
>         FN(unspec),                     \
> @@ -3004,7 +3086,10 @@ struct bpf_stack_build_id {
>         FN(probe_read_user_str),        \
>         FN(probe_read_kernel_str),      \
>         FN(tcp_send_ack),               \
> -       FN(send_signal_thread),
> +       FN(send_signal_thread),         \
> +       FN(ct_lookup_tcp),              \
> +       FN(ct_lookup_udp),              \
> +       FN(ct_release),
>
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
> @@ -3278,6 +3363,30 @@ struct bpf_sock_tuple {
>         };
>  };
>
> +struct bpf_nf_conn {
> +       __u32 cpu;
> +       __u32 mark;
> +       __u32 status;
> +       __u32 timeout;
> +};
> +
> +struct bpf_nf_conntrack_tuple {
> +       union {
> +               struct {
> +                       __be32 saddr;
> +                       __be32 daddr;
> +                       __be16 sport;
> +                       __be16 dport;
> +               } ipv4;
> +               struct {
> +                       __be32 saddr[4];
> +                       __be32 daddr[4];
> +                       __be16 sport;
> +                       __be16 dport;
> +               } ipv6;
> +       };
> +};
> +
>  struct bpf_xdp_sock {
>         __u32 queue_id;
>  };
> --
> 1.8.3.1
>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH bpf-next v2 1/2] bpf: add bpf_ct_lookup_{tcp,udp}() helpers
  2020-01-21 20:35   ` [PATCH bpf-next v2 1/2] bpf: add bpf_ct_lookup_{tcp,udp}() helpers Matt Cover
@ 2020-01-21 21:31     ` Matt Cover
  2020-01-24 19:11     ` Joe Stringer
  1 sibling, 0 replies; 22+ messages in thread
From: Matt Cover @ 2020-01-21 21:31 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau, Song Liu,
	Yonghong Song, Andrii Nakryiko, David S. Miller, Shuah Khan,
	Jesper Dangaard Brouer, John Fastabend, Jakub Sitnicki,
	Quentin Monnet, Matthew Cover, Stanislav Fomichev,
	Andrey Ignatov, Lorenz Bauer, netdev, bpf, linux-kernel,
	linux-kselftest, Pablo Neira Ayuso, Jozsef Kadlecsik,
	Florian Westphal, coreteam

On Tue, Jan 21, 2020 at 1:35 PM Matt Cover <werekraken@gmail.com> wrote:
>
> On Tue, Jan 21, 2020 at 1:20 PM Matthew Cover <werekraken@gmail.com> wrote:
> >
> > Allow looking up an nf_conn. This allows eBPF programs to leverage
> > nf_conntrack state for similar purposes to socket state use cases,
> > as provided by the socket lookup helpers. This is particularly
> > useful when nf_conntrack state is locally available, but socket
> > state is not.
> >
> > v2:
> >   - Fix functions in need of and missing static inline (kbuild)
> >   - Move tests to separate patch and submit as a series (John)
> >   - Improve clarity in helper documentation (John)
> >   - Add CONFIG_NF_CONNTRACK=m support (Daniel)
>
> Sorry, missed additional maintainers for v2 changes.
>
> +Pablo Neira Ayuso <pablo@netfilter.org>
> +Jozsef Kadlecsik <kadlec@netfilter.org>
> +Florian Westphal <fw@strlen.de>
> +coreteam@netfilter.org
>
> >
> > Signed-off-by: Matthew Cover <matthew.cover@stackpath.com>
> > ---
> >  include/linux/bpf.h               |  29 ++++
> >  include/linux/netfilter.h         |  12 ++
> >  include/uapi/linux/bpf.h          | 111 ++++++++++++++-
> >  kernel/bpf/verifier.c             | 105 ++++++++++++++-
> >  net/core/filter.c                 | 277 ++++++++++++++++++++++++++++++++++++++
> >  net/netfilter/core.c              |  16 +++
> >  net/netfilter/nf_conntrack_core.c |   1 +
> >  scripts/bpf_helpers_doc.py        |   4 +
> >  tools/include/uapi/linux/bpf.h    | 111 ++++++++++++++-
> >  9 files changed, 658 insertions(+), 8 deletions(-)
> >
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index 8e3b8f4..f502e1f 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -239,6 +239,7 @@ enum bpf_arg_type {
> >         ARG_PTR_TO_LONG,        /* pointer to long */
> >         ARG_PTR_TO_SOCKET,      /* pointer to bpf_sock (fullsock) */
> >         ARG_PTR_TO_BTF_ID,      /* pointer to in-kernel struct */
> > +       ARG_PTR_TO_NF_CONN,     /* pointer to bpf_nf_conn */
> >  };
> >
> >  /* type of values returned from helper functions */
> > @@ -250,6 +251,7 @@ enum bpf_return_type {
> >         RET_PTR_TO_SOCKET_OR_NULL,      /* returns a pointer to a socket or NULL */
> >         RET_PTR_TO_TCP_SOCK_OR_NULL,    /* returns a pointer to a tcp_sock or NULL */
> >         RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */
> > +       RET_PTR_TO_NF_CONN_OR_NULL,     /* returns a pointer to a nf_conn or NULL */
> >  };
> >
> >  /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs
> > @@ -316,6 +318,8 @@ enum bpf_reg_type {
> >         PTR_TO_TP_BUFFER,        /* reg points to a writable raw tp's buffer */
> >         PTR_TO_XDP_SOCK,         /* reg points to struct xdp_sock */
> >         PTR_TO_BTF_ID,           /* reg points to kernel struct */
> > +       PTR_TO_NF_CONN,          /* reg points to struct nf_conn */
> > +       PTR_TO_NF_CONN_OR_NULL,  /* reg points to struct nf_conn or NULL */
> >  };
> >
> >  /* The information passed from prog-specific *_is_valid_access
> > @@ -1513,4 +1517,29 @@ enum bpf_text_poke_type {
> >  int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
> >                        void *addr1, void *addr2);
> >
> > +#if IS_ENABLED(CONFIG_NF_CONNTRACK)
> > +bool bpf_nf_conn_is_valid_access(int off, int size, enum bpf_access_type type,
> > +                                struct bpf_insn_access_aux *info);
> > +
> > +u32 bpf_nf_conn_convert_ctx_access(enum bpf_access_type type,
> > +                                  const struct bpf_insn *si,
> > +                                  struct bpf_insn *insn_buf,
> > +                                  struct bpf_prog *prog, u32 *target_size);
> > +#else
> > +static inline bool bpf_nf_conn_is_valid_access(int off, int size,
> > +                               enum bpf_access_type type,
> > +                               struct bpf_insn_access_aux *info)
> > +{
> > +       return false;
> > +}
> > +
> > +static inline u32 bpf_nf_conn_convert_ctx_access(enum bpf_access_type type,
> > +                               const struct bpf_insn *si,
> > +                               struct bpf_insn *insn_buf,
> > +                               struct bpf_prog *prog, u32 *target_size)
> > +{
> > +       return 0;
> > +}
> > +#endif /* CONFIG_NF_CONNTRACK */
> > +
> >  #endif /* _LINUX_BPF_H */
> > diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
> > index eb312e7..a360ced 100644
> > --- a/include/linux/netfilter.h
> > +++ b/include/linux/netfilter.h
> > @@ -451,6 +451,9 @@ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
> >  struct nf_conntrack_tuple;
> >  bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
> >                          const struct sk_buff *skb);
> > +struct nf_conntrack_tuple_hash *
> > +nf_ct_find_get(struct net *net, const struct nf_conntrack_zone *zone,
> > +              const struct nf_conntrack_tuple *tuple);
> >  #else
> >  static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {}
> >  struct nf_conntrack_tuple;
> > @@ -459,6 +462,12 @@ static inline bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
> >  {
> >         return false;
> >  }
> > +static inline struct nf_conntrack_tuple_hash *
> > +nf_ct_find_get(struct net *net, const struct nf_conntrack_zone *zone,
> > +              const struct nf_conntrack_tuple *tuple)
> > +{
> > +       return NULL;
> > +}
> >  #endif
> >
> >  struct nf_conn;
> > @@ -469,6 +478,9 @@ struct nf_ct_hook {
> >         void (*destroy)(struct nf_conntrack *);
> >         bool (*get_tuple_skb)(struct nf_conntrack_tuple *,
> >                               const struct sk_buff *);
> > +       struct nf_conntrack_tuple_hash *
> > +       (*find_get)(struct net *net, const struct nf_conntrack_zone *zone,
> > +                    const struct nf_conntrack_tuple *tuple);
> >  };
> >  extern struct nf_ct_hook __rcu *nf_ct_hook;
> >
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index 033d90a..85c4b3f 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -2885,6 +2885,88 @@ struct bpf_stack_build_id {
> >   *             **-EPERM** if no permission to send the *sig*.
> >   *
> >   *             **-EAGAIN** if bpf program can try again.
> > + *
> > + * struct bpf_nf_conn *bpf_ct_lookup_tcp(void *ctx, struct bpf_nf_conntrack_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
> > + *     Description
> > + *             Look for TCP nf_conntrack entry matching *tuple*, optionally in
> > + *             a child network namespace *netns*. The return value must be
> > + *             checked, and if non-**NULL**, released via
> > + *             **bpf_ct_release**\ ().
> > + *
> > + *             The *ctx* should point to the context of the program, such as
> > + *             the skb or xdp_md (depending on the hook in use). This is used
> > + *             to determine the base network namespace for the lookup.
> > + *
> > + *             *tuple_size* must be one of:
> > + *
> > + *             **sizeof**\ (*tuple*\ **->ipv4**)
> > + *                     Look for an IPv4 nf_conn.
> > + *             **sizeof**\ (*tuple*\ **->ipv6**)
> > + *                     Look for an IPv6 nf_conn.
> > + *
> > + *             If *netns* is a negative signed 32-bit integer, then the
> > + *             nf_conn lookup table in the netns associated with the *ctx*
> > + *             will be used. For TC hooks, this is the netns of the device
> > + *             in the skb. For XDP hooks, this is the netns of the device in
> > + *             the xdp_md. If *netns* is any other signed 32-bit value greater
> > + *             than or equal to zero then it specifies the ID of the netns
> > + *             relative to the netns associated with the *ctx*. *netns* values
> > + *             beyond the range of 32-bit integers are reserved for future
> > + *             use.
> > + *
> > + *             All values for *flags* are reserved for future usage, and must
> > + *             be left at zero.
> > + *
> > + *             This helper will always return **NULL** if the kernel was
> > + *             compiled without **CONFIG_NF_CONNTRACK**.
> > + *     Return
> > + *             Pointer to **struct bpf_nf_conn**, or **NULL** in case of
> > + *             failure.
> > + *
> > + * struct bpf_nf_conn *bpf_ct_lookup_udp(void *ctx, struct bpf_nf_conntrack_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
> > + *     Description
> > + *             Look for UDP nf_conntrack entry matching *tuple*, optionally in
> > + *             a child network namespace *netns*. The return value must be
> > + *             checked, and if non-**NULL**, released via
> > + *             **bpf_ct_release**\ ().
> > + *
> > + *             The *ctx* should point to the context of the program, such as
> > + *             the skb or xdp_md (depending on the hook in use). This is used
> > + *             to determine the base network namespace for the lookup.
> > + *
> > + *             *tuple_size* must be one of:
> > + *
> > + *             **sizeof**\ (*tuple*\ **->ipv4**)
> > + *                     Look for an IPv4 nf_conn.
> > + *             **sizeof**\ (*tuple*\ **->ipv6**)
> > + *                     Look for an IPv6 nf_conn.
> > + *
> > + *             If *netns* is a negative signed 32-bit integer, then the
> > + *             nf_conn lookup table in the netns associated with the *ctx*
> > + *             will be used. For TC hooks, this is the netns of the device
> > + *             in the skb. For XDP hooks, this is the netns of the device in
> > + *             the xdp_md. If *netns* is any other signed 32-bit value greater
> > + *             than or equal to zero then it specifies the ID of the netns
> > + *             relative to the netns associated with the *ctx*. *netns* values
> > + *             beyond the range of 32-bit integers are reserved for future
> > + *             use.
> > + *
> > + *             All values for *flags* are reserved for future usage, and must
> > + *             be left at zero.
> > + *
> > + *             This helper will always return **NULL** if the kernel was
> > + *             compiled without **CONFIG_NF_CONNTRACK**.
> > + *     Return
> > + *             Pointer to **struct bpf_nf_conn**, or **NULL** in case of
> > + *             failure.
> > + *
> > + * int bpf_ct_release(struct bpf_nf_conn *ct)
> > + *     Description
> > + *             Release the reference held by *ct*. *ct* must be a
> > + *             non-**NULL** pointer that was returned from
> > + *             **bpf_ct_lookup_xxx**\ ().
> > + *     Return
> > + *             0 on success, or a negative error in case of failure.
> >   */
> >  #define __BPF_FUNC_MAPPER(FN)          \
> >         FN(unspec),                     \
> > @@ -3004,7 +3086,10 @@ struct bpf_stack_build_id {
> >         FN(probe_read_user_str),        \
> >         FN(probe_read_kernel_str),      \
> >         FN(tcp_send_ack),               \
> > -       FN(send_signal_thread),
> > +       FN(send_signal_thread),         \
> > +       FN(ct_lookup_tcp),              \
> > +       FN(ct_lookup_udp),              \
> > +       FN(ct_release),
> >
> >  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> >   * function eBPF program intends to call
> > @@ -3278,6 +3363,30 @@ struct bpf_sock_tuple {
> >         };
> >  };
> >
> > +struct bpf_nf_conn {
> > +       __u32 cpu;
> > +       __u32 mark;
> > +       __u32 status;
> > +       __u32 timeout;
> > +};
> > +
> > +struct bpf_nf_conntrack_tuple {
> > +       union {
> > +               struct {
> > +                       __be32 saddr;
> > +                       __be32 daddr;
> > +                       __be16 sport;
> > +                       __be16 dport;
> > +               } ipv4;
> > +               struct {
> > +                       __be32 saddr[4];
> > +                       __be32 daddr[4];
> > +                       __be16 sport;
> > +                       __be16 dport;
> > +               } ipv6;
> > +       };
> > +};
> > +
> >  struct bpf_xdp_sock {
> >         __u32 queue_id;
> >  };
> > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > index ca17dccc..0ea0ee7 100644
> > --- a/kernel/bpf/verifier.c
> > +++ b/kernel/bpf/verifier.c
> > @@ -362,6 +362,11 @@ static const char *ltrim(const char *s)
> >         env->prev_linfo = linfo;
> >  }
> >
> > +static bool type_is_nf_ct_pointer(enum bpf_reg_type type)
> > +{
> > +       return type == PTR_TO_NF_CONN;
> > +}
> > +
> >  static bool type_is_pkt_pointer(enum bpf_reg_type type)
> >  {
> >         return type == PTR_TO_PACKET ||
> > @@ -381,7 +386,8 @@ static bool reg_type_may_be_null(enum bpf_reg_type type)
> >         return type == PTR_TO_MAP_VALUE_OR_NULL ||
> >                type == PTR_TO_SOCKET_OR_NULL ||
> >                type == PTR_TO_SOCK_COMMON_OR_NULL ||
> > -              type == PTR_TO_TCP_SOCK_OR_NULL;
> > +              type == PTR_TO_TCP_SOCK_OR_NULL ||
> > +              type == PTR_TO_NF_CONN_OR_NULL;
> >  }
> >
> >  static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
> > @@ -395,12 +401,15 @@ static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type)
> >         return type == PTR_TO_SOCKET ||
> >                 type == PTR_TO_SOCKET_OR_NULL ||
> >                 type == PTR_TO_TCP_SOCK ||
> > -               type == PTR_TO_TCP_SOCK_OR_NULL;
> > +               type == PTR_TO_TCP_SOCK_OR_NULL ||
> > +               type == PTR_TO_NF_CONN ||
> > +               type == PTR_TO_NF_CONN_OR_NULL;
> >  }
> >
> >  static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
> >  {
> > -       return type == ARG_PTR_TO_SOCK_COMMON;
> > +       return type == ARG_PTR_TO_SOCK_COMMON ||
> > +               type == ARG_PTR_TO_NF_CONN;
> >  }
> >
> >  /* Determine whether the function releases some resources allocated by another
> > @@ -409,14 +418,17 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
> >   */
> >  static bool is_release_function(enum bpf_func_id func_id)
> >  {
> > -       return func_id == BPF_FUNC_sk_release;
> > +       return func_id == BPF_FUNC_sk_release ||
> > +               func_id == BPF_FUNC_ct_release;
> >  }
> >
> >  static bool is_acquire_function(enum bpf_func_id func_id)
> >  {
> >         return func_id == BPF_FUNC_sk_lookup_tcp ||
> >                 func_id == BPF_FUNC_sk_lookup_udp ||
> > -               func_id == BPF_FUNC_skc_lookup_tcp;
> > +               func_id == BPF_FUNC_skc_lookup_tcp ||
> > +               func_id == BPF_FUNC_ct_lookup_tcp ||
> > +               func_id == BPF_FUNC_ct_lookup_udp;
> >  }
> >
> >  static bool is_ptr_cast_function(enum bpf_func_id func_id)
> > @@ -447,6 +459,8 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id)
> >         [PTR_TO_TP_BUFFER]      = "tp_buffer",
> >         [PTR_TO_XDP_SOCK]       = "xdp_sock",
> >         [PTR_TO_BTF_ID]         = "ptr_",
> > +       [PTR_TO_NF_CONN]        = "nf_conn",
> > +       [PTR_TO_NF_CONN_OR_NULL] = "nf_conn_or_null",
> >  };
> >
> >  static char slot_type_char[] = {
> > @@ -1913,6 +1927,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
> >         case PTR_TO_TCP_SOCK_OR_NULL:
> >         case PTR_TO_XDP_SOCK:
> >         case PTR_TO_BTF_ID:
> > +       case PTR_TO_NF_CONN:
> > +       case PTR_TO_NF_CONN_OR_NULL:
> >                 return true;
> >         default:
> >                 return false;
> > @@ -2440,6 +2456,35 @@ static int check_flow_keys_access(struct bpf_verifier_env *env, int off,
> >         return 0;
> >  }
> >
> > +static int check_nf_ct_access(struct bpf_verifier_env *env, int insn_idx,
> > +                            u32 regno, int off, int size,
> > +                            enum bpf_access_type t)
> > +{
> > +       struct bpf_reg_state *regs = cur_regs(env);
> > +       struct bpf_reg_state *reg = &regs[regno];
> > +       struct bpf_insn_access_aux info = {};
> > +       bool valid;
> > +
> > +       switch (reg->type) {
> > +       case PTR_TO_NF_CONN:
> > +               valid = bpf_nf_conn_is_valid_access(off, size, t, &info);
> > +               break;
> > +       default:
> > +               valid = false;
> > +       }
> > +
> > +       if (valid) {
> > +               env->insn_aux_data[insn_idx].ctx_field_size =
> > +                       info.ctx_field_size;
> > +               return 0;
> > +       }
> > +
> > +       verbose(env, "R%d invalid %s access off=%d size=%d\n",
> > +               regno, reg_type_str[reg->type], off, size);
> > +
> > +       return -EACCES;
> > +}

John, when I began to address your nit I realized that the -EACCES
return happens in multiple cases: when reg->type != PTR_TO_NF_CONN
and when bpf_nf_conn_is_valid_access() returns false. I decided to
leave this as-is, since the gains from a refactor are minimal and
tcp_nf_conn is planned.

> > +
> >  static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
> >                              u32 regno, int off, int size,
> >                              enum bpf_access_type t)
> > @@ -2511,6 +2556,13 @@ static bool is_ctx_reg(struct bpf_verifier_env *env, int regno)
> >         return reg->type == PTR_TO_CTX;
> >  }
> >
> > +static bool is_nf_ct_reg(struct bpf_verifier_env *env, int regno)
> > +{
> > +       const struct bpf_reg_state *reg = reg_state(env, regno);
> > +
> > +       return type_is_nf_ct_pointer(reg->type);
> > +}
> > +
> >  static bool is_sk_reg(struct bpf_verifier_env *env, int regno)
> >  {
> >         const struct bpf_reg_state *reg = reg_state(env, regno);
> > @@ -2635,6 +2687,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
> >         case PTR_TO_XDP_SOCK:
> >                 pointer_desc = "xdp_sock ";
> >                 break;
> > +       case PTR_TO_NF_CONN:
> > +               pointer_desc = "nf_conn ";
> > +               break;
> >         default:
> >                 break;
> >         }
> > @@ -3050,6 +3105,15 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
> >                 err = check_sock_access(env, insn_idx, regno, off, size, t);
> >                 if (!err && value_regno >= 0)
> >                         mark_reg_unknown(env, regs, value_regno);
> > +       } else if (type_is_nf_ct_pointer(reg->type)) {
> > +               if (t == BPF_WRITE) {
> > +                       verbose(env, "R%d cannot write into %s\n",
> > +                               regno, reg_type_str[reg->type]);
> > +                       return -EACCES;
> > +               }
> > +               err = check_nf_ct_access(env, insn_idx, regno, off, size, t);
> > +               if (!err && value_regno >= 0)
> > +                       mark_reg_unknown(env, regs, value_regno);
> >         } else if (reg->type == PTR_TO_TP_BUFFER) {
> >                 err = check_tp_buffer_access(env, reg, regno, off, size);
> >                 if (!err && t == BPF_READ && value_regno >= 0)
> > @@ -3099,7 +3163,8 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
> >         if (is_ctx_reg(env, insn->dst_reg) ||
> >             is_pkt_reg(env, insn->dst_reg) ||
> >             is_flow_key_reg(env, insn->dst_reg) ||
> > -           is_sk_reg(env, insn->dst_reg)) {
> > +           is_sk_reg(env, insn->dst_reg) ||
> > +           is_nf_ct_reg(env, insn->dst_reg)) {
> >                 verbose(env, "BPF_XADD stores into R%d %s is not allowed\n",
> >                         insn->dst_reg,
> >                         reg_type_str[reg_state(env, insn->dst_reg)->type]);
> > @@ -3501,6 +3566,19 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
> >                                 regno);
> >                         return -EACCES;
> >                 }
> > +       } else if (arg_type == ARG_PTR_TO_NF_CONN) {
> > +               expected_type = PTR_TO_NF_CONN;
> > +               if (!type_is_nf_ct_pointer(type))
> > +                       goto err_type;
> > +               if (reg->ref_obj_id) {
> > +                       if (meta->ref_obj_id) {
> > +                               verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
> > +                                       regno, reg->ref_obj_id,
> > +                                       meta->ref_obj_id);
> > +                               return -EFAULT;
> > +                       }
> > +                       meta->ref_obj_id = reg->ref_obj_id;
> > +               }
> >         } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) {
> >                 if (meta->func_id == BPF_FUNC_spin_lock) {
> >                         if (process_spin_lock(env, regno, true))
> > @@ -4368,6 +4446,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
> >                 mark_reg_known_zero(env, regs, BPF_REG_0);
> >                 regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL;
> >                 regs[BPF_REG_0].id = ++env->id_gen;
> > +       } else if (fn->ret_type == RET_PTR_TO_NF_CONN_OR_NULL) {
> > +               mark_reg_known_zero(env, regs, BPF_REG_0);
> > +               regs[BPF_REG_0].type = PTR_TO_NF_CONN_OR_NULL;
> > +               regs[BPF_REG_0].id = ++env->id_gen;
> >         } else {
> >                 verbose(env, "unknown return type %d of func %s#%d\n",
> >                         fn->ret_type, func_id_name(func_id), func_id);
> > @@ -4649,6 +4731,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
> >         case PTR_TO_TCP_SOCK:
> >         case PTR_TO_TCP_SOCK_OR_NULL:
> >         case PTR_TO_XDP_SOCK:
> > +       case PTR_TO_NF_CONN:
> > +       case PTR_TO_NF_CONN_OR_NULL:
> >                 verbose(env, "R%d pointer arithmetic on %s prohibited\n",
> >                         dst, reg_type_str[ptr_reg->type]);
> >                 return -EACCES;
> > @@ -5915,6 +5999,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
> >                         reg->type = PTR_TO_SOCK_COMMON;
> >                 } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) {
> >                         reg->type = PTR_TO_TCP_SOCK;
> > +               } else if (reg->type == PTR_TO_NF_CONN_OR_NULL) {
> > +                       reg->type = PTR_TO_NF_CONN;
> >                 }
> >                 if (is_null) {
> >                         /* We don't need id and ref_obj_id from this point
> > @@ -7232,6 +7318,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
> >         case PTR_TO_TCP_SOCK:
> >         case PTR_TO_TCP_SOCK_OR_NULL:
> >         case PTR_TO_XDP_SOCK:
> > +       case PTR_TO_NF_CONN:
> > +       case PTR_TO_NF_CONN_OR_NULL:
> >                 /* Only valid matches are exact, which memcmp() above
> >                  * would have accepted
> >                  */
> > @@ -7760,6 +7848,8 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type)
> >         case PTR_TO_TCP_SOCK_OR_NULL:
> >         case PTR_TO_XDP_SOCK:
> >         case PTR_TO_BTF_ID:
> > +       case PTR_TO_NF_CONN:
> > +       case PTR_TO_NF_CONN_OR_NULL:
> >                 return false;
> >         default:
> >                 return true;
> > @@ -8867,6 +8957,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
> >                                 return -EINVAL;
> >                         }
> >                         continue;
> > +               case PTR_TO_NF_CONN:
> > +                       convert_ctx_access = bpf_nf_conn_convert_ctx_access;
> > +                       break;
> >                 default:
> >                         continue;
> >                 }
> > diff --git a/net/core/filter.c b/net/core/filter.c
> > index 17de674..80319d3 100644
> > --- a/net/core/filter.c
> > +++ b/net/core/filter.c
> > @@ -74,6 +74,12 @@
> >  #include <net/ipv6_stubs.h>
> >  #include <net/bpf_sk_storage.h>
> >
> > +#if IS_ENABLED(CONFIG_NF_CONNTRACK)
> > +#include <net/netfilter/nf_conntrack_tuple.h>
> > +#include <net/netfilter/nf_conntrack_core.h>
> > +#include <net/netfilter/nf_conntrack.h>
> > +#endif
> > +
> >  /**
> >   *     sk_filter_trim_cap - run a packet through a socket filter
> >   *     @sk: sock associated with &sk_buff
> > @@ -5122,6 +5128,253 @@ static void bpf_update_srh_state(struct sk_buff *skb)
> >  };
> >  #endif /* CONFIG_IPV6_SEG6_BPF */
> >
> > +#if IS_ENABLED(CONFIG_NF_CONNTRACK)
> > +bool bpf_nf_conn_is_valid_access(int off, int size, enum bpf_access_type type,
> > +                                struct bpf_insn_access_aux *info)
> > +{
> > +       if (off < 0 || off >= offsetofend(struct bpf_nf_conn,
> > +                                         timeout))
> > +               return false;
> > +
> > +       if (off % size != 0)
> > +               return false;
> > +
> > +       return size == sizeof(__u32);
> > +}
> > +
> > +u32 bpf_nf_conn_convert_ctx_access(enum bpf_access_type type,
> > +                                  const struct bpf_insn *si,
> > +                                  struct bpf_insn *insn_buf,
> > +                                  struct bpf_prog *prog, u32 *target_size)
> > +{
> > +       struct bpf_insn *insn = insn_buf;
> > +
> > +       switch (si->off) {
> > +       case offsetof(struct bpf_nf_conn, cpu):
> > +               BUILD_BUG_ON(FIELD_SIZEOF(struct nf_conn, cpu) != 2);
> > +
> > +               *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
> > +                                     offsetof(struct nf_conn, cpu));
> > +
> > +               break;
> > +
> > +       case offsetof(struct bpf_nf_conn, mark):
> > +#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
> > +               BUILD_BUG_ON(FIELD_SIZEOF(struct nf_conn, mark) != 4);
> > +
> > +               *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
> > +                                     offsetof(struct nf_conn, mark));
> > +#else
> > +               *target_size = 4;
> > +               *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
> > +#endif
> > +               break;
> > +
> > +       case offsetof(struct bpf_nf_conn, status):
> > +               BUILD_BUG_ON(FIELD_SIZEOF(struct nf_conn, status) < 4 ||
> > +                            __IPS_MAX_BIT > 32);
> > +
> > +               *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
> > +                                     offsetof(struct nf_conn, status));
> > +
> > +               break;
> > +
> > +       case offsetof(struct bpf_nf_conn, timeout):
> > +               BUILD_BUG_ON(FIELD_SIZEOF(struct nf_conn, timeout) != 4);
> > +
> > +               *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
> > +                                     offsetof(struct nf_conn, timeout));
> > +
> > +               break;
> > +       }
> > +
> > +       return insn - insn_buf;
> > +}
> > +
> > +static struct nf_conn *
> > +ct_lookup(struct net *net, struct bpf_nf_conntrack_tuple *tuple,
> > +         u8 family, u8 proto)
> > +{
> > +       struct nf_conntrack_tuple_hash *hash;
> > +       struct nf_conntrack_tuple tup;
> > +       struct nf_conn *ct = NULL;
> > +
> > +       memset(&tup, 0, sizeof(tup));
> > +
> > +       tup.dst.protonum = proto;
> > +       tup.src.l3num = family;
> > +
> > +       if (family == AF_INET) {
> > +               tup.src.u3.ip = tuple->ipv4.saddr;
> > +               tup.dst.u3.ip = tuple->ipv4.daddr;
> > +               tup.src.u.tcp.port = tuple->ipv4.sport;
> > +               tup.dst.u.tcp.port = tuple->ipv4.dport;
> > +#if IS_ENABLED(CONFIG_IPV6)
> > +       } else {
> > +               memcpy(tup.src.u3.ip6, tuple->ipv6.saddr, sizeof(tup.src.u3.ip6));
> > +               memcpy(tup.dst.u3.ip6, tuple->ipv6.daddr, sizeof(tup.dst.u3.ip6));
> > +               tup.src.u.tcp.port = tuple->ipv6.sport;
> > +               tup.dst.u.tcp.port = tuple->ipv6.dport;
> > +#endif
> > +       }
> > +
> > +       hash = nf_ct_find_get(net, &nf_ct_zone_dflt, &tup);
> > +       if (!hash)
> > +               goto out;
> > +       ct = nf_ct_tuplehash_to_ctrack(hash);
> > +
> > +out:
> > +       return ct;
> > +}
> > +
> > +static struct nf_conn *
> > +__bpf_ct_lookup(struct sk_buff *skb, struct bpf_nf_conntrack_tuple *tuple, u32 len,
> > +               struct net *caller_net, u8 proto, u64 netns_id, u64 flags)

I also left the uapi (and the underlying casts) the same as the
sk_lookup helpers, in favor of a familiar experience and
well-traversed code. I'm happy to discuss further if this isn't
suitable.

> > +{
> > +       struct nf_conn *ct = NULL;
> > +       u8 family = AF_UNSPEC;
> > +       struct net *net;
> > +
> > +       if (len == sizeof(tuple->ipv4))
> > +               family = AF_INET;
> > +       else if (len == sizeof(tuple->ipv6))
> > +               family = AF_INET6;
> > +       else
> > +               goto out;
> > +
> > +       if (unlikely(family == AF_UNSPEC || flags ||
> > +                    !((s32)netns_id < 0 || netns_id <= S32_MAX)))
> > +               goto out;
> > +
> > +       if ((s32)netns_id < 0) {
> > +               net = caller_net;
> > +               ct = ct_lookup(net, tuple, family, proto);
> > +       } else {
> > +               net = get_net_ns_by_id(caller_net, netns_id);
> > +               if (unlikely(!net))
> > +                       goto out;
> > +               ct = ct_lookup(net, tuple, family, proto);
> > +               put_net(net);
> > +       }
> > +
> > +out:
> > +       return ct;
> > +}
> > +
> > +static struct nf_conn *
> > +bpf_ct_lookup(struct sk_buff *skb, struct bpf_nf_conntrack_tuple *tuple, u32 len,
> > +             u8 proto, u64 netns_id, u64 flags)
> > +{
> > +       struct net *caller_net;
> > +
> > +       if (skb->dev) {
> > +               caller_net = dev_net(skb->dev);
> > +       } else {
> > +               caller_net = sock_net(skb->sk);
> > +       }
> > +
> > +       return __bpf_ct_lookup(skb, tuple, len, caller_net, proto,
> > +                              netns_id, flags);
> > +}
> > +
> > +BPF_CALL_5(bpf_ct_lookup_tcp, struct sk_buff *, skb,
> > +          struct bpf_nf_conntrack_tuple *, tuple, u32, len, u64, netns_id,
> > +          u64, flags)
> > +{
> > +       return (unsigned long)bpf_ct_lookup(skb, tuple, len, IPPROTO_TCP,
> > +                                            netns_id, flags);
> > +}
> > +
> > +static const struct bpf_func_proto bpf_ct_lookup_tcp_proto = {
> > +       .func           = bpf_ct_lookup_tcp,
> > +       .gpl_only       = true,
> > +       .pkt_access     = true,
> > +       .ret_type       = RET_PTR_TO_NF_CONN_OR_NULL,
> > +       .arg1_type      = ARG_PTR_TO_CTX,
> > +       .arg2_type      = ARG_PTR_TO_MEM,
> > +       .arg3_type      = ARG_CONST_SIZE,
> > +       .arg4_type      = ARG_ANYTHING,
> > +       .arg5_type      = ARG_ANYTHING,
> > +};
> > +
> > +BPF_CALL_5(bpf_xdp_ct_lookup_tcp, struct xdp_buff *, ctx,
> > +          struct bpf_nf_conntrack_tuple *, tuple, u32, len, u32, netns_id,
> > +          u64, flags)
> > +{
> > +       struct net *caller_net = dev_net(ctx->rxq->dev);
> > +
> > +       return (unsigned long)__bpf_ct_lookup(NULL, tuple, len, caller_net,
> > +                                             IPPROTO_TCP, netns_id, flags);
> > +}
> > +
> > +static const struct bpf_func_proto bpf_xdp_ct_lookup_tcp_proto = {
> > +       .func           = bpf_xdp_ct_lookup_tcp,
> > +       .gpl_only       = true,
> > +       .pkt_access     = true,
> > +       .ret_type       = RET_PTR_TO_NF_CONN_OR_NULL,
> > +       .arg1_type      = ARG_PTR_TO_CTX,
> > +       .arg2_type      = ARG_PTR_TO_MEM,
> > +       .arg3_type      = ARG_CONST_SIZE,
> > +       .arg4_type      = ARG_ANYTHING,
> > +       .arg5_type      = ARG_ANYTHING,
> > +};
> > +
> > +BPF_CALL_5(bpf_ct_lookup_udp, struct sk_buff *, skb,
> > +          struct bpf_nf_conntrack_tuple *, tuple, u32, len, u64, netns_id,
> > +          u64, flags)
> > +{
> > +       return (unsigned long)bpf_ct_lookup(skb, tuple, len, IPPROTO_UDP,
> > +                                            netns_id, flags);
> > +}
> > +
> > +static const struct bpf_func_proto bpf_ct_lookup_udp_proto = {
> > +       .func           = bpf_ct_lookup_udp,
> > +       .gpl_only       = true,
> > +       .pkt_access     = true,
> > +       .ret_type       = RET_PTR_TO_NF_CONN_OR_NULL,
> > +       .arg1_type      = ARG_PTR_TO_CTX,
> > +       .arg2_type      = ARG_PTR_TO_MEM,
> > +       .arg3_type      = ARG_CONST_SIZE,
> > +       .arg4_type      = ARG_ANYTHING,
> > +       .arg5_type      = ARG_ANYTHING,
> > +};
> > +
> > +BPF_CALL_5(bpf_xdp_ct_lookup_udp, struct xdp_buff *, ctx,
> > +          struct bpf_nf_conntrack_tuple *, tuple, u32, len, u32, netns_id,
> > +          u64, flags)
> > +{
> > +       struct net *caller_net = dev_net(ctx->rxq->dev);
> > +
> > +       return (unsigned long)__bpf_ct_lookup(NULL, tuple, len, caller_net,
> > +                                             IPPROTO_UDP, netns_id, flags);
> > +}
> > +
> > +static const struct bpf_func_proto bpf_xdp_ct_lookup_udp_proto = {
> > +       .func           = bpf_xdp_ct_lookup_udp,
> > +       .gpl_only       = true,
> > +       .pkt_access     = true,
> > +       .ret_type       = RET_PTR_TO_NF_CONN_OR_NULL,
> > +       .arg1_type      = ARG_PTR_TO_CTX,
> > +       .arg2_type      = ARG_PTR_TO_MEM,
> > +       .arg3_type      = ARG_CONST_SIZE,
> > +       .arg4_type      = ARG_ANYTHING,
> > +       .arg5_type      = ARG_ANYTHING,
> > +};
> > +
> > +BPF_CALL_1(bpf_ct_release, struct nf_conn *, ct)
> > +{
> > +       nf_conntrack_put(&ct->ct_general);
> > +       return 0;
> > +}
> > +
> > +static const struct bpf_func_proto bpf_ct_release_proto = {
> > +       .func           = bpf_ct_release,
> > +       .gpl_only       = true,
> > +       .ret_type       = RET_INTEGER,
> > +       .arg1_type      = ARG_PTR_TO_NF_CONN,
> > +};
> > +#endif
> > +
> >  #ifdef CONFIG_INET
> >  static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
> >                               int dif, int sdif, u8 family, u8 proto)
> > @@ -6139,6 +6392,14 @@ bool bpf_helper_changes_pkt_data(void *func)
> >         case BPF_FUNC_tcp_gen_syncookie:
> >                 return &bpf_tcp_gen_syncookie_proto;
> >  #endif
> > +#if IS_ENABLED(CONFIG_NF_CONNTRACK)
> > +       case BPF_FUNC_ct_lookup_tcp:
> > +               return &bpf_ct_lookup_tcp_proto;
> > +       case BPF_FUNC_ct_lookup_udp:
> > +               return &bpf_ct_lookup_udp_proto;
> > +       case BPF_FUNC_ct_release:
> > +               return &bpf_ct_release_proto;
> > +#endif
> >         default:
> >                 return bpf_base_func_proto(func_id);
> >         }
> > @@ -6180,6 +6441,14 @@ bool bpf_helper_changes_pkt_data(void *func)
> >         case BPF_FUNC_tcp_gen_syncookie:
> >                 return &bpf_tcp_gen_syncookie_proto;
> >  #endif
> > +#if IS_ENABLED(CONFIG_NF_CONNTRACK)
> > +       case BPF_FUNC_ct_lookup_tcp:
> > +               return &bpf_xdp_ct_lookup_tcp_proto;
> > +       case BPF_FUNC_ct_lookup_udp:
> > +               return &bpf_xdp_ct_lookup_udp_proto;
> > +       case BPF_FUNC_ct_release:
> > +               return &bpf_ct_release_proto;
> > +#endif
> >         default:
> >                 return bpf_base_func_proto(func_id);
> >         }
> > @@ -6284,6 +6553,14 @@ bool bpf_helper_changes_pkt_data(void *func)
> >         case BPF_FUNC_skc_lookup_tcp:
> >                 return &bpf_skc_lookup_tcp_proto;
> >  #endif
> > +#if IS_ENABLED(CONFIG_NF_CONNTRACK)
> > +       case BPF_FUNC_ct_lookup_tcp:
> > +               return &bpf_ct_lookup_tcp_proto;
> > +       case BPF_FUNC_ct_lookup_udp:
> > +               return &bpf_ct_lookup_udp_proto;
> > +       case BPF_FUNC_ct_release:
> > +               return &bpf_ct_release_proto;
> > +#endif
> >         default:
> >                 return bpf_base_func_proto(func_id);
> >         }
> > diff --git a/net/netfilter/core.c b/net/netfilter/core.c
> > index 78f046e..855c6b0 100644
> > --- a/net/netfilter/core.c
> > +++ b/net/netfilter/core.c
> > @@ -617,6 +617,22 @@ bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
> >  }
> >  EXPORT_SYMBOL(nf_ct_get_tuple_skb);
> >
> > +struct nf_conntrack_tuple_hash *
> > +nf_ct_find_get(struct net *net, const struct nf_conntrack_zone *zone,
> > +              const struct nf_conntrack_tuple *tuple)
> > +{
> > +       struct nf_ct_hook *ct_hook;
> > +       struct nf_conntrack_tuple_hash *ret = NULL;
> > +
> > +       rcu_read_lock();
> > +       ct_hook = rcu_dereference(nf_ct_hook);
> > +       if (ct_hook)
> > +               ret = ct_hook->find_get(net, zone, tuple);
> > +       rcu_read_unlock();
> > +       return ret;
> > +}
> > +EXPORT_SYMBOL_GPL(nf_ct_find_get);
> > +
> >  /* Built-in default zone used e.g. by modules. */
> >  const struct nf_conntrack_zone nf_ct_zone_dflt = {
> >         .id     = NF_CT_DEFAULT_ZONE_ID,
> > diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> > index f4c4b46..a44df88 100644
> > --- a/net/netfilter/nf_conntrack_core.c
> > +++ b/net/netfilter/nf_conntrack_core.c
> > @@ -2484,6 +2484,7 @@ int nf_conntrack_init_start(void)
> >         .update         = nf_conntrack_update,
> >         .destroy        = destroy_conntrack,
> >         .get_tuple_skb  = nf_conntrack_get_tuple_skb,
> > +       .find_get       = nf_conntrack_find_get,
> >  };
> >
> >  void nf_conntrack_init_end(void)
> > diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py
> > index 90baf7d..26f0c2a 100755
> > --- a/scripts/bpf_helpers_doc.py
> > +++ b/scripts/bpf_helpers_doc.py
> > @@ -398,6 +398,8 @@ class PrinterHelpers(Printer):
> >
> >      type_fwds = [
> >              'struct bpf_fib_lookup',
> > +            'struct bpf_nf_conn',
> > +            'struct bpf_nf_conntrack_tuple',
> >              'struct bpf_perf_event_data',
> >              'struct bpf_perf_event_value',
> >              'struct bpf_sock',
> > @@ -433,6 +435,8 @@ class PrinterHelpers(Printer):
> >              '__wsum',
> >
> >              'struct bpf_fib_lookup',
> > +            'struct bpf_nf_conn',
> > +            'struct bpf_nf_conntrack_tuple',
> >              'struct bpf_perf_event_data',
> >              'struct bpf_perf_event_value',
> >              'struct bpf_sock',
> > diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> > index 033d90a..85c4b3f 100644
> > --- a/tools/include/uapi/linux/bpf.h
> > +++ b/tools/include/uapi/linux/bpf.h
> > @@ -2885,6 +2885,88 @@ struct bpf_stack_build_id {
> >   *             **-EPERM** if no permission to send the *sig*.
> >   *
> >   *             **-EAGAIN** if bpf program can try again.
> > + *
> > + * struct bpf_nf_conn *bpf_ct_lookup_tcp(void *ctx, struct bpf_nf_conntrack_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
> > + *     Description
> > + *             Look for TCP nf_conntrack entry matching *tuple*, optionally in
> > + *             a child network namespace *netns*. The return value must be
> > + *             checked, and if non-**NULL**, released via
> > + *             **bpf_ct_release**\ ().
> > + *
> > + *             The *ctx* should point to the context of the program, such as
> > + *             the skb or xdp_md (depending on the hook in use). This is used
> > + *             to determine the base network namespace for the lookup.
> > + *
> > + *             *tuple_size* must be one of:
> > + *
> > + *             **sizeof**\ (*tuple*\ **->ipv4**)
> > + *                     Look for an IPv4 nf_conn.
> > + *             **sizeof**\ (*tuple*\ **->ipv6**)
> > + *                     Look for an IPv6 nf_conn.
> > + *
> > + *             If *netns* is a negative signed 32-bit integer, then the
> > + *             nf_conn lookup table in the netns associated with the *ctx*
> > + *             will be used. For TC hooks, this is the netns of the device
> > + *             in the skb. For XDP hooks, this is the netns of the device in
> > + *             the xdp_md. If *netns* is any other signed 32-bit value greater
> > + *             than or equal to zero then it specifies the ID of the netns
> > + *             relative to the netns associated with the *ctx*. *netns* values
> > + *             beyond the range of 32-bit integers are reserved for future
> > + *             use.
> > + *
> > + *             All values for *flags* are reserved for future usage, and must
> > + *             be left at zero.
> > + *
> > + *             This helper will always return **NULL** if the kernel was
> > + *             compiled without **CONFIG_NF_CONNTRACK**.
> > + *     Return
> > + *             Pointer to **struct bpf_nf_conn**, or **NULL** in case of
> > + *             failure.
> > + *
> > + * struct bpf_nf_conn *bpf_ct_lookup_udp(void *ctx, struct bpf_nf_conntrack_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
> > + *     Description
> > + *             Look for UDP nf_conntrack entry matching *tuple*, optionally in
> > + *             a child network namespace *netns*. The return value must be
> > + *             checked, and if non-**NULL**, released via
> > + *             **bpf_ct_release**\ ().
> > + *
> > + *             The *ctx* should point to the context of the program, such as
> > + *             the skb or xdp_md (depending on the hook in use). This is used
> > + *             to determine the base network namespace for the lookup.
> > + *
> > + *             *tuple_size* must be one of:
> > + *
> > + *             **sizeof**\ (*tuple*\ **->ipv4**)
> > + *                     Look for an IPv4 nf_conn.
> > + *             **sizeof**\ (*tuple*\ **->ipv6**)
> > + *                     Look for an IPv6 nf_conn.
> > + *
> > + *             If *netns* is a negative signed 32-bit integer, then the
> > + *             nf_conn lookup table in the netns associated with the *ctx*
> > + *             will be used. For TC hooks, this is the netns of the device
> > + *             in the skb. For XDP hooks, this is the netns of the device in
> > + *             the xdp_md. If *netns* is any other signed 32-bit value greater
> > + *             than or equal to zero then it specifies the ID of the netns
> > + *             relative to the netns associated with the *ctx*. *netns* values
> > + *             beyond the range of 32-bit integers are reserved for future
> > + *             use.
> > + *
> > + *             All values for *flags* are reserved for future usage, and must
> > + *             be left at zero.
> > + *
> > + *             This helper will always return **NULL** if the kernel was
> > + *             compiled without **CONFIG_NF_CONNTRACK**.
> > + *     Return
> > + *             Pointer to **struct bpf_nf_conn**, or **NULL** in case of
> > + *             failure.
> > + *
> > + * int bpf_ct_release(struct bpf_nf_conn *ct)
> > + *     Description
> > + *             Release the reference held by *ct*. *ct* must be a
> > + *             non-**NULL** pointer that was returned from
> > + *             **bpf_ct_lookup_xxx**\ ().
> > + *     Return
> > + *             0 on success, or a negative error in case of failure.
> >   */
> >  #define __BPF_FUNC_MAPPER(FN)          \
> >         FN(unspec),                     \
> > @@ -3004,7 +3086,10 @@ struct bpf_stack_build_id {
> >         FN(probe_read_user_str),        \
> >         FN(probe_read_kernel_str),      \
> >         FN(tcp_send_ack),               \
> > -       FN(send_signal_thread),
> > +       FN(send_signal_thread),         \
> > +       FN(ct_lookup_tcp),              \
> > +       FN(ct_lookup_udp),              \
> > +       FN(ct_release),
> >
> >  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> >   * function eBPF program intends to call
> > @@ -3278,6 +3363,30 @@ struct bpf_sock_tuple {
> >         };
> >  };
> >
> > +struct bpf_nf_conn {
> > +       __u32 cpu;
> > +       __u32 mark;
> > +       __u32 status;
> > +       __u32 timeout;
> > +};
> > +
> > +struct bpf_nf_conntrack_tuple {
> > +       union {
> > +               struct {
> > +                       __be32 saddr;
> > +                       __be32 daddr;
> > +                       __be16 sport;
> > +                       __be16 dport;
> > +               } ipv4;
> > +               struct {
> > +                       __be32 saddr[4];
> > +                       __be32 daddr[4];
> > +                       __be16 sport;
> > +                       __be16 dport;
> > +               } ipv6;
> > +       };
> > +};
> > +
> >  struct bpf_xdp_sock {
> >         __u32 queue_id;
> >  };
> > --
> > 1.8.3.1
> >

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH bpf-next] bpf: add bpf_ct_lookup_{tcp,udp}() helpers
  2020-01-20 21:11       ` Daniel Borkmann
  2020-01-20 21:21         ` Matt Cover
@ 2020-01-23 21:28         ` Matt Cover
  1 sibling, 0 replies; 22+ messages in thread
From: Matt Cover @ 2020-01-23 21:28 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: John Fastabend, Alexei Starovoitov, Martin KaFai Lau, Song Liu,
	Yonghong Song, Andrii Nakryiko, David S. Miller, Shuah Khan,
	Jakub Kicinski, Jesper Dangaard Brouer, Jakub Sitnicki,
	Quentin Monnet, Matthew Cover, Stanislav Fomichev,
	Andrey Ignatov, Lorenz Bauer, Jiong Wang, netdev, bpf,
	linux-kernel, linux-kselftest

On Mon, Jan 20, 2020 at 2:11 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 1/20/20 9:10 PM, Matt Cover wrote:
> > On Mon, Jan 20, 2020 at 11:11 AM Matt Cover <werekraken@gmail.com> wrote:
> >> On Sat, Jan 18, 2020 at 8:05 PM John Fastabend <john.fastabend@gmail.com> wrote:
> >>> Matthew Cover wrote:
> >>>> Allow looking up an nf_conn. This allows eBPF programs to leverage
> >>>> nf_conntrack state for similar purposes to socket state use cases,
> >>>> as provided by the socket lookup helpers. This is particularly
> >>>> useful when nf_conntrack state is locally available, but socket
> >>>> state is not.
> >>>>
> >>>> Signed-off-by: Matthew Cover <matthew.cover@stackpath.com>
> >>>> ---
> >>>
> >>> Couple coding comments below. Also looks like a couple build errors
> >>> so fix those up. I'm still thinking over this though.
> >>
> >> Thank you for taking the time to look this over. I will be looking
> >> into the build issues.
> >
> > Looks like I missed static inline on a couple functions when
> > nf_conntrack isn't builtin. I'll include the fix in v2.
>
> One of the big issues I'd see with this integration is that literally no-one
> will be able to use it unless they manually recompile their distro kernel with
> ct as builtin instead of module .. Have you considered writing a tcp/udp ct in
> plain bpf? Perhaps would make sense to have some sort of tools/lib/bpf/util/
> with bpf prog library code that can be included.

Daniel, sorry, I missed addressing your second point in my previous
response. I agree that plain bpf ct is of interest. However, I still
see value in these helpers, particularly when nf_conntrack is
already in use. Reuse of info already in nf_conntrack avoids the
memory cost of another ct table.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH bpf-next v2 1/2] bpf: add bpf_ct_lookup_{tcp,udp}() helpers
  2020-01-21 20:35   ` [PATCH bpf-next v2 1/2] bpf: add bpf_ct_lookup_{tcp,udp}() helpers Matt Cover
  2020-01-21 21:31     ` Matt Cover
@ 2020-01-24 19:11     ` Joe Stringer
  2020-01-24 21:46       ` Matt Cover
  1 sibling, 1 reply; 22+ messages in thread
From: Joe Stringer @ 2020-01-24 19:11 UTC (permalink / raw)
  To: Matt Cover
  Cc: Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau, Song Liu,
	Yonghong Song, Andrii Nakryiko, David S. Miller, Shuah Khan,
	Jesper Dangaard Brouer, John Fastabend, Jakub Sitnicki,
	Quentin Monnet, Matthew Cover, Stanislav Fomichev,
	Andrey Ignatov, Lorenz Bauer, netdev, bpf, LKML, linux-kselftest,
	Pablo Neira Ayuso, Jozsef Kadlecsik, Florian Westphal, coreteam

On Tue, Jan 21, 2020 at 12:36 PM Matt Cover <werekraken@gmail.com> wrote:
>
> On Tue, Jan 21, 2020 at 1:20 PM Matthew Cover <werekraken@gmail.com> wrote:
> >
> > Allow looking up an nf_conn. This allows eBPF programs to leverage
> > nf_conntrack state for similar purposes to socket state use cases,
> > as provided by the socket lookup helpers. This is particularly
> > useful when nf_conntrack state is locally available, but socket
> > state is not.

I think there's an important distinction between accessing sockets and
accessing the connection tracker: Sockets are inherently tied to local
processes. They consume resources regardless of what kind of fancy
networking behaviour you desire out of the stack. Connection-tracking
on the other hand only consumes resources if you enable features that
explicitly require that functionality. This raises some interesting
questions.

The kernel disables nf_conntrack by default to alleviate the costs
associated with it[0]. In the case of this proposal, the BPF program
itself is trying to use nf_conntrack, so does that mean that the
kernel should auto-enable nf_conntrack hooks for the current namespace
(or all namespaces, given that the helper provides access into other
namespaces as well) whenever a BPF program is loaded that uses this
helper?

Related side note: What if you wanted to mitigate the performance
penalty of turning on nf_conntrack by programmatically choosing
whether to populate the ct table? Do we then need to define an
interface that allows a BPF program to tell nf_conntrack whether or
not to track a given connection?

More importantly, nf_conntrack has a particular view in mind of what a
connection is and the metadata that can be associated with a
connection. On the other hand, one of the big pulls for building
networking functionality in BPF is to allow flexibility. Over time,
more complex use cases will arise that demand additional metadata to
be stored with their connections. Cilium's connection tracking entries
provides a glimpse of this[1]. I'm sure that the OVS-BPF project would
have similar demands. Longer term, do we encourage such projects to
migrate to this implementation, proposing metadata extensions that are
programmable from BPF?

Taking the metadata question further, there is not only the metadata
that arbitrary BPF programs wish to associate with nf_conntrack. There
is also the various extensions that nf_conntrack itself has which
could be interesting for users that depend on that state. Would we
draw a line before providing access into those aspects of nf_conntrack
from BPF?

Beyond metadata, there is the question of write access to
nf_conntrack. Presumably if a read helper like this is added to the
BPF API, it is only balanced to also add create, update and delete
operations? No doubt if someone wants to build NAT or firewall
functionality in BPF using nf_conntrack, they will want this. Does
this take us on the track of eventually exporting the entire
nf_conntrack module (or even nf_nat) internal kernel APIs as external
BPF API?

If the BPF API is going to provide a connection tracker, I feel that
it should aim to solve connection tracking for various potential
users. This takes us from not just what this patch does, but to the
full vision of where this API goes with a connection tracker
implementation that could be reused by e.g. OVS-BPF or Cilium. At this
point, I'm not convinced why such an implementation should exist in
the BPF API rather than as a common library that can be forked and
tweaked for anyone's uses.

What do you see as the split of responsibility between BPF and other
subsystems long-term for your use case that motivates relying upon
nf_conntrack always running?

[0] https://github.com/torvalds/linux/commit/4d3a57f23dec59f0a2362e63540b2d01b37afe0a
[1] https://github.com/cilium/cilium/blob/v1.6.5/bpf/lib/common.h#L510

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH bpf-next v2 1/2] bpf: add bpf_ct_lookup_{tcp,udp}() helpers
  2020-01-24 19:11     ` Joe Stringer
@ 2020-01-24 21:46       ` Matt Cover
  2020-01-30 21:53         ` unstable bpf helpers proposal. Was: " Alexei Starovoitov
  0 siblings, 1 reply; 22+ messages in thread
From: Matt Cover @ 2020-01-24 21:46 UTC (permalink / raw)
  To: Joe Stringer
  Cc: Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau, Song Liu,
	Yonghong Song, Andrii Nakryiko, David S. Miller, Shuah Khan,
	Jesper Dangaard Brouer, John Fastabend, Jakub Sitnicki,
	Quentin Monnet, Matthew Cover, Stanislav Fomichev,
	Andrey Ignatov, Lorenz Bauer, netdev, bpf, LKML, linux-kselftest,
	Pablo Neira Ayuso, Jozsef Kadlecsik, Florian Westphal, coreteam

On Fri, Jan 24, 2020 at 12:11 PM Joe Stringer <joe@wand.net.nz> wrote:

Joe, thank you for taking the time to respond. And thank you for your
efforts on the sk helpers; I both use them and borrowed heavily from
them in coding this submission.

>
> On Tue, Jan 21, 2020 at 12:36 PM Matt Cover <werekraken@gmail.com> wrote:
> >
> > On Tue, Jan 21, 2020 at 1:20 PM Matthew Cover <werekraken@gmail.com> wrote:
> > >
> > > Allow looking up an nf_conn. This allows eBPF programs to leverage
> > > nf_conntrack state for similar purposes to socket state use cases,
> > > as provided by the socket lookup helpers. This is particularly
> > > useful when nf_conntrack state is locally available, but socket
> > > state is not.
>
> I think there's an important distinction between accessing sockets and
> accessing the connection tracker: Sockets are inherently tied to local
> processes. They consume resources regardless of what kind of fancy
> networking behaviour you desire out of the stack. Connection-tracking
> on the other hand only consumes resources if you enable features that
> explicitly require that functionality. This raises some interesting
> questions.

Sockets require local configuration to exist: a service must be
listening. nf_conntrack entries likewise require local configuration
to exist. To me, this is not so different.

In addition to the nf_conntrack helpers, I'm hoping to add helpers for
lookups to the ipvs connection table via ip_vs_conn_in_get(). From my
perspective, this is again similar. The connection is locally
known/owned, but there is no socket. The state lives elsewhere in the
kernel, but already exists.

There is no need to pay the memory cost for native bpf ct in these
cases (i.e. when nf_conntrack is already in use or when the flow
traverses ipvs rather than terminating at a socket).

>
> The kernel disables nf_conntrack by default to alleviate the costs
> associated with it[0]. In the case of this proposal, the BPF program
> itself is trying to use nf_conntrack, so does that mean that the
> kernel should auto-enable nf_conntrack hooks for the current namespace
> (or all namespaces, given that the helper provides access into other
> namespaces as well) whenever a BPF program is loaded that uses this
> helper?

I see no reason to auto-enable nf_conntrack. When nf_conntrack is
not present, unloaded, or not configured, the helpers return NULL.
This is similar to the sk helpers when no service is listening on
<daddr>:<dport>.

>
> Related side note: What if you wanted to mitigate the performance
> penalty of turning on nf_conntrack by programmatically choosing
> whether to populate the ct table? Do we then need to define an
> interface that allows a BPF program to tell nf_conntrack whether or
> not to track a given connection?

This certainly could be of interest for certain use cases. Adding
such functionality is neither included nor precluded by this
submission.

Writing to an existing nf_conn could be added to this helper in the
future. Then, as an example, an XDP program could populate ct->mark
and a restore mark rule could be used to apply the mark to the skb.
This is conceptually similar to the XDP/tc interaction example.

https://github.com/xdp-project/xdp-tutorial/tree/master/advanced01-xdp-tc-interact
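
To make that concrete, here is a rough sketch (not working code: it
assumes this series' bpf_ct_lookup_tcp()/bpf_ct_release() with
bpf_sk_lookup_tcp()-style arguments, plus the hypothetical future
write access to ct->mark described above):

  SEC("xdp")
  int mark_ct(struct xdp_md *ctx)
  {
      struct bpf_sock_tuple tuple = {};
      struct nf_conn *ct;

      /* ... parse headers and fill tuple.ipv4 ... */

      ct = bpf_ct_lookup_tcp(ctx, &tuple, sizeof(tuple.ipv4),
                             BPF_F_CURRENT_NETNS, 0);
      if (!ct)
          return XDP_PASS;

      ct->mark = 0xcafe;      /* hypothetical write support */
      bpf_ct_release(ct);

      return XDP_PASS;
  }

A rule like 'iptables -t mangle -A PREROUTING -j CONNMARK
--restore-mark' would then copy ct->mark into skb->mark once the
packet reaches the netfilter hooks.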

Adding new entries would be another helper if desired. Again, there
is nothing I see in this submission to preclude the addition of such
a helper; it simply isn't part of my current use case.

>
> More importantly, nf_conntrack has a particular view in mind of what a
> connection is and the metadata that can be associated with a
> connection. On the other hand, one of the big pulls for building
> networking functionality in BPF is to allow flexibility. Over time,
> more complex use cases will arise that demand additional metadata to
> be stored with their connections. Cilium's connection tracking entries
> provide a glimpse of this[1]. I'm sure that the OVS-BPF project would
> have similar demands. Longer term, do we encourage such projects to
> migrate to this implementation, proposing metadata extensions that are
> programmable from BPF?

Presumably, such a push would require a helper to add new ct entries
which I see as beyond the scope of this submission. However, I would
imagine that, as long as a metadata-extensions approach wasn't overly
cumbersome to use, the performance of the two ct solutions would be
the deciding factor.

>
> Taking the metadata question further, there is not only the metadata
> that arbitrary BPF programs wish to associate with nf_conntrack. There
> are also the various extensions that nf_conntrack itself has which
> could be interesting for users that depend on that state. Would we
> draw a line before providing access into those aspects of nf_conntrack
> from BPF?

I'm planning to add a bpf_tcp_nf_conn() helper which gives access to
members of ip_ct_tcp. This is similar to bpf_tcp_sock() in my mind.
Are these the types of extensions you mean?
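
As a purely hypothetical sketch of what I have in mind (neither
bpf_tcp_nf_conn() nor struct bpf_tcp_nf_conn exist yet;
TCP_CONNTRACK_ESTABLISHED is the existing conntrack TCP state value):

  SEC("tc")
  int count_established(struct __sk_buff *skb)
  {
      struct bpf_sock_tuple tuple = {};
      struct nf_conn *ct;

      /* ... fill tuple from the packet ... */
      ct = bpf_ct_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
                             BPF_F_CURRENT_NETNS, 0);
      if (ct) {
          /* narrow the acquired nf_conn to its TCP state, by
           * analogy with bpf_tcp_sock()
           */
          struct bpf_tcp_nf_conn *tcp_ct = bpf_tcp_nf_conn(ct);

          if (tcp_ct && tcp_ct->state == TCP_CONNTRACK_ESTABLISHED)
              ; /* e.g. bump a per-state counter in a map */
          bpf_ct_release(ct);
      }
      return TC_ACT_OK;
  }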

>
> Beyond metadata, there is the question of write access to
> nf_conntrack. Presumably if a read helper like this is added to the
> BPF API, it is only balanced to also add create, update and delete
> operations? No doubt if someone wants to build NAT or firewall
> functionality in BPF using nf_conntrack, they will want this. Does
> this take us on the track of eventually exporting the entire
> nf_conntrack module (or even nf_nat) internal kernel APIs as external
> BPF API?

I touched on create and update above. Delete, like create, would
almost certainly be a separate helper. This submission is not
intended to put us on that track. I do not believe it hinders an
effort such as that either. Are you worried that adding nf_conn to
bpf is a slippery slope?

>
> If the BPF API is going to provide a connection tracker, I feel that
> it should aim to solve connection tracking for various potential
> users. This takes us beyond just what this patch does, to the
> full vision of where this API goes with a connection tracker
> implementation that could be reused by e.g. OVS-BPF or Cilium. At this
> point, I'm not convinced why such an implementation should exist in
> the BPF API rather than as a common library that can be forked and
> tweaked for anyone's uses.
>
> What do you see as the split of responsibility between BPF and other
> subsystems long-term for your use case that motivates relying upon
> nf_conntrack always running?

I do not see this as relying on nf_conntrack always running; I see
it as not always assuming that flow/connection ownership is determined
by socket presence/state. Sockets, nf_conns, and ip_vs_conns are all
of interest for different workloads.

>
> [0] https://github.com/torvalds/linux/commit/4d3a57f23dec59f0a2362e63540b2d01b37afe0a
> [1] https://github.com/cilium/cilium/blob/v1.6.5/bpf/lib/common.h#L510

^ permalink raw reply	[flat|nested] 22+ messages in thread

* unstable bpf helpers proposal. Was: [PATCH bpf-next v2 1/2] bpf: add bpf_ct_lookup_{tcp,udp}() helpers
  2020-01-24 21:46       ` Matt Cover
@ 2020-01-30 21:53         ` Alexei Starovoitov
  2020-02-06  6:13           ` Matt Cover
  0 siblings, 1 reply; 22+ messages in thread
From: Alexei Starovoitov @ 2020-01-30 21:53 UTC (permalink / raw)
  To: Matt Cover; +Cc: daniel, davem, Matthew Cover, netdev, bpf, kernel-team

On Fri, Jan 24, 2020 at 02:46:30PM -0700, Matt Cover wrote:
> 
> In addition to the nf_conntrack helpers, I'm hoping to add helpers for
> lookups to the ipvs connection table via ip_vs_conn_in_get(). From my
> perspective, this is again similar. 

...

> Writing to an existing nf_conn could be added to this helper in the
> future. Then, as an example, an XDP program could populate ct->mark
> and a restore mark rule could be used to apply the mark to the skb.
> This is conceptually similar to the XDP/tc interaction example.

...

> I'm planning to add a bpf_tcp_nf_conn() helper which gives access to
> members of ip_ct_tcp. This is similar to bpf_tcp_sock() in my mind.

...

> I touched on create and update above. Delete, like create, would
> almost certainly be a separate helper. This submission is not
> intended to put us on that track. I do not believe it hinders an
> effort such as that either. Are you worried that adding nf_conn to
> bpf is a slippery slope?

Looks like there is a need to access quite a bit of ct, ipvs internal states.
I bet neigh, route and other kernel internal tables will be next. The
lookup/update/delete to these tables is necessary. If somebody wants to do a
fast bridge in XDP they may want to reuse icmp_send(). I've seen folks
reimplementing it purely on BPF side, but kernel's icmp_send() is clearly
superior, so exposing it as a helper will be useful too. And so on and so
forth. There are lots of kernel bits that BPF progs want to interact with.

If we expose all of that via existing bpf helper mechanism we will freeze a
large chunk of networking stack. I agree that accessing these data structures
from BPF side is useful, but I don't think we can risk hardening the kernel so
much. We need new helper mechanism that will be unstable api. It needs to be
obviously unstable to both kernel developers and bpf users. Yet such a mechanism
should allow bpf progs to access all these things without sacrificing safety.

I think such new mechanism can be modeled similar to kernel modules and
EXPORT_SYMBOL[_GPL] convention. The kernel has established policy that
these functions do change and in-tree kernel modules get updated along the way
while out-of-tree modules get broken periodically. I propose to do the same for BPF.
Certain kernel functions can be marked as EXPORT_SYMBOL_BPF and they will be
eligible to be called from BPF program. The verifier will do safety checks and
type matching based on BTF. The same way it already does for tracing progs.
For example the ct lookup can be:
struct nf_conn *
bpf_ct_lookup(struct __sk_buff *skb, struct nf_conntrack_tuple *tuple, u32 len,
              u8 proto, u64 netns_id, u64 flags)
{
}
EXPORT_SYMBOL_BPF(bpf_ct_lookup);
The first argument 'skb' has stable api and type. It's __sk_buff and it's
context for all skb-based progs, so any program that got __sk_buff from
somewhere can pass it into this helper.
The second argument is 'struct nf_conntrack_tuple *'. It's unstable and
kernel internal. Currently the verifier recognizes it as PTR_TO_BTF_ID
for tracing progs and can do the same for networking. It cannot recognize
it on stack though. Like:
int bpf_prog(struct __sk_buff *skb)
{
  struct nf_conntrack_tuple my_tupple = { ...};
  bpf_ct_lookup(skb, &my_tupple, ...);
}
won't work yet. The verifier needs to be taught to deal with PTR_TO_BTF_ID
where it points to the stack.
The last three arguments are scalars and already recognized as SCALAR_VALUE by
the verifier. So with minor extensions the verifier will be able to prove the
safety of argument passing.

The return value is trickier. It can be solved with appropriate type
annotations like:
struct nf_conn *
bpf_ct_lookup(struct __sk_buff *skb, struct nf_conntrack_tuple *tuple, u32 len,
             u8 proto, u64 netns_id, u64 flags)
{ ...
}
EXPORT_SYMBOL_BPF__acquires(bpf_ct_lookup);
int bpf_ct_release(struct nf_conn * ct)
{ ...
}
EXPORT_SYMBOL_BPF__releases(bpf_ct_release);
By convention the return value is acquired and the first argument is released.
Then the verifier will be able to pair them the same way it does
bpf_sk_lookup()/bpf_sk_release(), but in declarative way. So the verifier code
doesn't need to be touched for every such function pair in the future.

Note struct nf_conn and struct nf_conntrack_tuple stay kernel internal.
BPF program can define fields it wants to access as:
struct nf_conn {
  u32 timeout;
  u64 status;
  u32 mark;
} __attribute__((preserve_access_index));
int bpf_prog()
{
  struct nf_conn *ct = bpf_ct_lookup(...);
  if (ct) {
       ct->timeout;
  }
}
and CO-RE logic will deal with kernel specific relocations.
The same way it does for tracing progs that access all kernel data.

I think it's plenty obvious that such bpf helpers are unstable api. The
networking programs will have access to all kernel data structures, receive
them from white listed set of EXPORT_SYMBOL_BPF() functions and pass them into
those functions back. Just like tracing progs that have access to everything.
They can read all fields of kernel internal struct sk_buff and pass it into
bpf_skb_output().
The same way kernel modules can access all kernel data structures and call
white listed set of EXPORT_SYMBOL() helpers.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: unstable bpf helpers proposal. Was: [PATCH bpf-next v2 1/2] bpf: add bpf_ct_lookup_{tcp,udp}() helpers
  2020-01-30 21:53         ` unstable bpf helpers proposal. Was: " Alexei Starovoitov
@ 2020-02-06  6:13           ` Matt Cover
  2020-02-20  4:45             ` Alexei Starovoitov
  0 siblings, 1 reply; 22+ messages in thread
From: Matt Cover @ 2020-02-06  6:13 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Daniel Borkmann, David S. Miller, Matthew Cover, netdev, bpf,
	kernel-team

On Thu, Jan 30, 2020 at 2:53 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Fri, Jan 24, 2020 at 02:46:30PM -0700, Matt Cover wrote:
> >
> > In addition to the nf_conntrack helpers, I'm hoping to add helpers for
> > lookups to the ipvs connection table via ip_vs_conn_in_get(). From my
> > perspective, this is again similar.
>
> ...
>
> > Writing to an existing nf_conn could be added to this helper in the
> > future. Then, as an example, an XDP program could populate ct->mark
> > and a restore mark rule could be used to apply the mark to the skb.
> > This is conceptually similar to the XDP/tc interaction example.
>
> ...
>
> > I'm planning to add a bpf_tcp_nf_conn() helper which gives access to
> > members of ip_ct_tcp. This is similar to bpf_tcp_sock() in my mind.
>
> ...
>
> > I touched on create and update above. Delete, like create, would
> > almost certainly be a separate helper. This submission is not
> > intended to put us on that track. I do not believe it hinders an
> > effort such as that either. Are you worried that adding nf_conn to
> > bpf is a slippery slope?
>
> Looks like there is a need to access quite a bit of ct, ipvs internal states.
> I bet neigh, route and other kernel internal tables will be next. The
> lookup/update/delete to these tables is necessary. If somebody wants to do a
> fast bridge in XDP they may want to reuse icmp_send(). I've seen folks
> reimplementing it purely on BPF side, but kernel's icmp_send() is clearly
> superior, so exposing it as a helper will be useful too. And so on and so
> forth. There are lots of kernel bits that BPF progs want to interact with.
>
> If we expose all of that via existing bpf helper mechanism we will freeze a
> large chunk of networking stack. I agree that accessing these data structures
> from BPF side is useful, but I don't think we can risk hardening the kernel so
> much. We need new helper mechanism that will be unstable api. It needs to be
> obviously unstable to both kernel developers and bpf users. Yet such a mechanism
> should allow bpf progs to access all these things without sacrificing safety.
>
> I think such new mechanism can be modeled similar to kernel modules and
> EXPORT_SYMBOL[_GPL] convention. The kernel has established policy that
> these functions do change and in-tree kernel modules get updated along the way
> while out-of-tree modules get broken periodically. I propose to do the same for BPF.
> Certain kernel functions can be marked as EXPORT_SYMBOL_BPF and they will be
> eligible to be called from BPF program. The verifier will do safety checks and
> type matching based on BTF. The same way it already does for tracing progs.
> For example the ct lookup can be:
> struct nf_conn *
> bpf_ct_lookup(struct __sk_buff *skb, struct nf_conntrack_tuple *tuple, u32 len,
>               u8 proto, u64 netns_id, u64 flags)
> {
> }
> EXPORT_SYMBOL_BPF(bpf_ct_lookup);
> The first argument 'skb' has stable api and type. It's __sk_buff and it's
> context for all skb-based progs, so any program that got __sk_buff from
> somewhere can pass it into this helper.
> The second argument is 'struct nf_conntrack_tuple *'. It's unstable and
> kernel internal. Currently the verifier recognizes it as PTR_TO_BTF_ID
> for tracing progs and can do the same for networking. It cannot recognize
> it on stack though. Like:
> int bpf_prog(struct __sk_buff *skb)
> {
>   struct nf_conntrack_tuple my_tupple = { ...};
>   bpf_ct_lookup(skb, &my_tupple, ...);
> }
> won't work yet. The verifier needs to be taught to deal with PTR_TO_BTF_ID
> where it points to the stack.
> The last three arguments are scalars and already recognized as SCALAR_VALUE by
> the verifier. So with minor extensions the verifier will be able to prove the
> safety of argument passing.
>
> The return value is trickier. It can be solved with appropriate type
> annotations like:
> struct nf_conn *
> bpf_ct_lookup(struct __sk_buff *skb, struct nf_conntrack_tuple *tuple, u32 len,
>              u8 proto, u64 netns_id, u64 flags)
> { ...
> }
> EXPORT_SYMBOL_BPF__acquires(bpf_ct_lookup);
> int bpf_ct_release(struct nf_conn * ct)
> { ...
> }
> EXPORT_SYMBOL_BPF__releases(bpf_ct_release);
> By convention the return value is acquired and the first argument is released.
> Then the verifier will be able to pair them the same way it does
> bpf_sk_lookup()/bpf_sk_release(), but in declarative way. So the verifier code
> doesn't need to be touched for every such function pair in the future.
>
> Note struct nf_conn and struct nf_conntrack_tuple stay kernel internal.
> BPF program can define fields it wants to access as:
> struct nf_conn {
>   u32 timeout;
>   u64 status;
>   u32 mark;
> } __attribute__((preserve_access_index));
> int bpf_prog()
> {
>   struct nf_conn *ct = bpf_ct_lookup(...);
>   if (ct) {
>        ct->timeout;
>   }
> }
> and CO-RE logic will deal with kernel specific relocations.
> The same way it does for tracing progs that access all kernel data.
>
> I think it's plenty obvious that such bpf helpers are unstable api. The
> networking programs will have access to all kernel data structures, receive
> them from white listed set of EXPORT_SYMBOL_BPF() functions and pass them into
> those functions back. Just like tracing progs that have access to everything.
> They can read all fields of kernel internal struct sk_buff and pass it into
> bpf_skb_output().
> The same way kernel modules can access all kernel data structures and call
> white listed set of EXPORT_SYMBOL() helpers.

I think this sounds great. Looking forward to hearing what others
think of this proposal.

I've started looking into how the exported symbols portion of this
might look. These are just some thoughts on how we could do things
if Alexei's proposal is accepted.

Presumably we want all of
EXPORT_SYMBOL_BPF{,_GPL}{,__acquires,__releases}() as part of the
initial effort.

EXPORT_SYMBOL_BPF(bpf_icmp_send);
EXPORT_SYMBOL_BPF__acquires(bpf_ipvs_conn_in_lookup);
EXPORT_SYMBOL_BPF__releases(bpf_ipvs_conn_release);

EXPORT_SYMBOL_BPF_GPL(bpf_ct_delete);
EXPORT_SYMBOL_BPF_GPL__acquires(bpf_ct_lookup);
EXPORT_SYMBOL_BPF_GPL__releases(bpf_ct_release);

Do we also need a __must_hold type annotation (e.g.
EXPORT_SYMBOL_BPF_GPL__must_hold(bpf_ct_delete))? Would we expect all
unstable helpers to handle being passed NULL? Or will the existing
verifier rule that returned values must be checked non-NULL before
use extend to calls of these functions even without the annotation?

We can optionally include
EXPORT_UNUSED_SYMBOL_BPF{,_GPL}{,__acquires,__releases}() and
EXPORT_SYMBOL_BPF_GPL_FUTURE{,__acquires,__releases}() in the initial
effort, but they aren't needed for the helpers proposed so far. Given
that they won't be used right away, I'd just as soon leave them for a
follow up, when the need arises.

In addition to reusing the EXPORT_SYMBOL* convention, I think reusing
the existing symvers implementation might be a reasonable choice.

Module.symvers already contains an "Export Type" field which
categorizes exported symbols. Exported symbols of each type are
placed in a separate ELF section within a module (e.g.
EXPORT_SYMBOL_GPL maps to __ksymtab_gpl). Given that bpf progs can
and often do exist as ELF files, it seems like this could work for
them as well (at least on the surface).

scripts/mod/modpost.c contains check_for_gpl_usage() and
check_for_unused() which enforce policy on how modules use exported
symbols by type (via containing section). Adding new policy (e.g.
check_for_bpf_usage()) to prevent modules from using bpf exported
symbols is one way exported symbols for modules and bpf can coexist.
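
A sketch of how that check might look, modeled on the existing modpost
checks (the export_bpf/export_bpf_gpl enum values and this function
are assumptions on my part, not existing code):

  static void check_for_bpf_usage(enum export exp, const char *m,
                                  const char *s)
  {
      switch (exp) {
      case export_bpf:
      case export_bpf_gpl:
          /* same error style as check_for_gpl_usage() */
          fatal("BPF-only symbol '%s' is used by module %s\n", s, m);
          break;
      default:
          break;
      }
  }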

We'd also need policy checking on the bpf prog side; the same
categorization mechanisms should work, but we need a util which
actually does it.

Additionally, distros can leverage symvers in the same manner for bpf
progs as is already done for modules. For example, icmpv6_send() is
in the el7 kabi whitelists; perhaps bpf_icmpv6_send() would be a
candidate for kabi whitelisting as well. Greylists can be optionally
generated and shipped with bpf progs which use unstable helpers.
Scripts which provide similar functionality for bpf progs as
/sbin/weak-modules does for modules could be added. Even with
existing el7 kernel rpms (including variants like kernel-ml),
/etc/kernel/p{ostinst,rerm}.d can be used to run a "weak-bpf-progs"
script on install/removal. The point is that a lot of distro
conventions could be reused; el7 is just a concrete example.

Some additional thoughts:
* Do we want to be able to export the exact same function to modules
    and bpf (i.e. without error: redefinition of '__kstrtab_xxx')?
* Do we want asm versions of EXPORT_SYMBOL_BPF*() (e.g. in
    include/asm-generic/export.h)?
* If a function with more than 5 parameters is exported via
    EXPORT_SYMBOL_BPF*(), should we have the build fail?

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: unstable bpf helpers proposal. Was: [PATCH bpf-next v2 1/2] bpf: add bpf_ct_lookup_{tcp,udp}() helpers
  2020-02-06  6:13           ` Matt Cover
@ 2020-02-20  4:45             ` Alexei Starovoitov
  2020-04-03 23:56               ` Matt Cover
  0 siblings, 1 reply; 22+ messages in thread
From: Alexei Starovoitov @ 2020-02-20  4:45 UTC (permalink / raw)
  To: Matt Cover
  Cc: Daniel Borkmann, David S. Miller, Matthew Cover, netdev, bpf,
	kernel-team

On Wed, Feb 05, 2020 at 11:13:55PM -0700, Matt Cover wrote:
> On Thu, Jan 30, 2020 at 2:53 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Fri, Jan 24, 2020 at 02:46:30PM -0700, Matt Cover wrote:
> > >
> > > In addition to the nf_conntrack helpers, I'm hoping to add helpers for
> > > lookups to the ipvs connection table via ip_vs_conn_in_get(). From my
> > > perspective, this is again similar.
> >
> > ...
> >
> > > Writing to an existing nf_conn could be added to this helper in the
> > > future. Then, as an example, an XDP program could populate ct->mark
> > > and a restore mark rule could be used to apply the mark to the skb.
> > > This is conceptually similar to the XDP/tc interaction example.
> >
> > ...
> >
> > > I'm planning to add a bpf_tcp_nf_conn() helper which gives access to
> > > members of ip_ct_tcp. This is similar to bpf_tcp_sock() in my mind.
> >
> > ...
> >
> > > I touched on create and update above. Delete, like create, would
> > > almost certainly be a separate helper. This submission is not
> > > intended to put us on that track. I do not believe it hinders an
> > > effort such as that either. Are you worried that adding nf_conn to
> > > bpf is a slippery slope?
> >
> > Looks like there is a need to access quite a bit of ct, ipvs internal states.
> > I bet neigh, route and other kernel internal tables will be next. The
> > lookup/update/delete to these tables is necessary. If somebody wants to do a
> > fast bridge in XDP they may want to reuse icmp_send(). I've seen folks
> > reimplementing it purely on BPF side, but kernel's icmp_send() is clearly
> > superior, so exposing it as a helper will be useful too. And so on and so
> > forth. There are lots of kernel bits that BPF progs want to interact with.
> >
> > If we expose all of that via existing bpf helper mechanism we will freeze a
> > large chunk of networking stack. I agree that accessing these data structures
> > from BPF side is useful, but I don't think we can risk hardening the kernel so
> > much. We need new helper mechanism that will be unstable api. It needs to be
> > obviously unstable to both kernel developers and bpf users. Yet such a mechanism
> > should allow bpf progs to access all these things without sacrificing safety.
> >
> > I think such new mechanism can be modeled similar to kernel modules and
> > EXPORT_SYMBOL[_GPL] convention. The kernel has established policy that
> > these functions do change and in-tree kernel modules get updated along the way
> > while out-of-tree modules get broken periodically. I propose to do the same for BPF.
> > Certain kernel functions can be marked as EXPORT_SYMBOL_BPF and they will be
> > eligible to be called from BPF program. The verifier will do safety checks and
> > type matching based on BTF. The same way it already does for tracing progs.
> > For example the ct lookup can be:
> > struct nf_conn *
> > bpf_ct_lookup(struct __sk_buff *skb, struct nf_conntrack_tuple *tuple, u32 len,
> >               u8 proto, u64 netns_id, u64 flags)
> > {
> > }
> > EXPORT_SYMBOL_BPF(bpf_ct_lookup);
> > The first argument 'skb' has stable api and type. It's __sk_buff and it's
> > context for all skb-based progs, so any program that got __sk_buff from
> > somewhere can pass it into this helper.
> > The second argument is 'struct nf_conntrack_tuple *'. It's unstable and
> > kernel internal. Currently the verifier recognizes it as PTR_TO_BTF_ID
> > for tracing progs and can do the same for networking. It cannot recognize
> > it on stack though. Like:
> > int bpf_prog(struct __sk_buff *skb)
> > {
> >   struct nf_conntrack_tuple my_tupple = { ...};
> >   bpf_ct_lookup(skb, &my_tupple, ...);
> > }
> > won't work yet. The verifier needs to be taught to deal with PTR_TO_BTF_ID
> > where it points to the stack.
> > The last three arguments are scalars and already recognized as SCALAR_VALUE by
> > the verifier. So with minor extensions the verifier will be able to prove the
> > safety of argument passing.
> >
> > The return value is trickier. It can be solved with appropriate type
> > annotations like:
> > struct nf_conn *
> > bpf_ct_lookup(struct __sk_buff *skb, struct nf_conntrack_tuple *tuple, u32 len,
> >              u8 proto, u64 netns_id, u64 flags)
> > { ...
> > }
> > EXPORT_SYMBOL_BPF__acquires(bpf_ct_lookup);
> > int bpf_ct_release(struct nf_conn * ct)
> > { ...
> > }
> > EXPORT_SYMBOL_BPF__releases(bpf_ct_release);
> > By convention the return value is acquired and the first argument is released.
> > Then the verifier will be able to pair them the same way it does
> > bpf_sk_lookup()/bpf_sk_release(), but in declarative way. So the verifier code
> > doesn't need to be touched for every such function pair in the future.
> >
> > Note struct nf_conn and struct nf_conntrack_tuple stay kernel internal.
> > BPF program can define fields it wants to access as:
> > struct nf_conn {
> >   u32 timeout;
> >   u64 status;
> >   u32 mark;
> > } __attribute__((preserve_access_index));
> > int bpf_prog()
> > {
> >   struct nf_conn *ct = bpf_ct_lookup(...);
> >   if (ct) {
> >        ct->timeout;
> >   }
> > }
> > and CO-RE logic will deal with kernel specific relocations.
> > The same way it does for tracing progs that access all kernel data.
> >
> > I think it's plenty obvious that such bpf helpers are unstable api. The
> > networking programs will have access to all kernel data structures, receive
> > them from white listed set of EXPORT_SYMBOL_BPF() functions and pass them into
> > those functions back. Just like tracing progs that have access to everything.
> > They can read all fields of kernel internal struct sk_buff and pass it into
> > bpf_skb_output().
> > The same way kernel modules can access all kernel data structures and call
> > white listed set of EXPORT_SYMBOL() helpers.
> 
> I think this sounds great. Looking forward to hearing what others
> think of this proposal.

No further comments typically means either no objections or lack of
understanding :) In both cases the patches have to do the talking.

> Presumably we want all of
> EXPORT_SYMBOL_BPF{,_GPL}{,__acquires,__releases}() as part of the
> initial effort.

I was thinking that _GPL suffix will be implicit. All such unstable helpers
would require GPL license because they can appear in modules and I want to make
sure people don't use this method as a way to extend BPF without sharing the
code.

> EXPORT_SYMBOL_BPF(bpf_icmp_send);
> EXPORT_SYMBOL_BPF__acquires(bpf_ipvs_conn_in_lookup);
> EXPORT_SYMBOL_BPF__releases(bpf_ipvs_conn_release);
> 
> EXPORT_SYMBOL_BPF_GPL(bpf_ct_delete);
> EXPORT_SYMBOL_BPF_GPL__acquires(bpf_ct_lookup);
> EXPORT_SYMBOL_BPF_GPL__releases(bpf_ct_release);
> 
> Do we also need a __must_hold type annotation (e.g.
> EXPORT_SYMBOL_BPF_GPL__must_hold(bpf_ct_delete))? 

I don't see how 'must_hold' helps safety.
It's purely a sparse annotation. What is the verifier supposed to do?

> Would we expect all
> unstable helpers to handle being passed NULL? Or will the existing
> verifier rule that returned values must be checked non-NULL before
> use extend to calls of these functions even without the annotation?

I think both ways will be possible.

> We can optionally include
> EXPORT_UNUSED_SYMBOL_BPF{,_GPL}{,__acquires,__releases}() and
> EXPORT_SYMBOL_BPF_GPL_FUTURE{,__acquires,__releases}() in the initial
> effort, but they aren't needed for the helpers proposed so far. Given
> that they won't be used right away, I'd just as soon leave them for a
> follow up, when the need arises.

Not sure what you mean by UNUSED and FUTURE.
All such helpers will likely be 'unused' by the core kernel.
Just like all existing stable bpf helpers are rarely called directly
by the kernel code.

> In addition to reusing the EXPORT_SYMBOL* convention, I think reusing
> the existing symvers implementation might be a reasonable choice.

I don't think symvers will work, since they're lacking type info.
I was thinking that simple BTF annotation will do the trick.

> Some additional thoughts:
> * Do we want to be able to export the exact same function to modules
>     and bpf (i.e. without error: redefinition of '__kstrtab_xxx')?

Same function to modules is already done via EXPORT_SYMBOL.
Let's not mess with that.

> * Do we want asm versions of EXPORT_SYMBOL_BPF*() (e.g. in
>     include/asm-generic/export.h)?

No. kernel's asm is typeless. bpf verifier cannot call arbitrary functions.
It has to do type match. Otherwise we'll let bpf crash left and right.

> * If a function with more than 5 parameters is exported via
>     EXPORT_SYMBOL_BPF*(), should we have the build fail?

Some kind of error is necessary. Build time would be preferred, but I wouldn't
worry about it at this point.

I think doing BTF annotation for EXPORT_SYMBOL_BPF(bpf_icmp_send); is trivial.
The interesting implementation detail is how to do BPF_CALL_x() in a generic
way. The BPF trampoline solved the problem of calling BPF programs from the
kernel. It translates kernel calling convention into BPF. To do unstable
helpers the reverse BPF trampoline is needed. It would need to translate BPF
calling convention into kernel. The nice part is that on x86-64 no reverse
trampoline is necessary. JITed BPF code can call arbitrary (with <=5 args)
functions without any tricks, but other architectures are not that lucky. Hence
infrastructure has to be built to support all archs. It's ok to have it as a nop
for x86-64, but infra should be there from the start. Take a look at
btf_func_model. Kernel function needs to be distilled into it and reverse BPF
trampoline generated.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: unstable bpf helpers proposal. Was: [PATCH bpf-next v2 1/2] bpf: add bpf_ct_lookup_{tcp,udp}() helpers
  2020-02-20  4:45             ` Alexei Starovoitov
@ 2020-04-03 23:56               ` Matt Cover
  2020-04-07  3:03                 ` Alexei Starovoitov
  0 siblings, 1 reply; 22+ messages in thread
From: Matt Cover @ 2020-04-03 23:56 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Daniel Borkmann, David S. Miller, Matthew Cover, netdev, bpf,
	kernel-team

On Wed, Feb 19, 2020 at 9:45 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Wed, Feb 05, 2020 at 11:13:55PM -0700, Matt Cover wrote:
> > On Thu, Jan 30, 2020 at 2:53 PM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Fri, Jan 24, 2020 at 02:46:30PM -0700, Matt Cover wrote:
> > > >
> > > > In addition to the nf_conntrack helpers, I'm hoping to add helpers for
> > > > lookups to the ipvs connection table via ip_vs_conn_in_get(). From my
> > > > perspective, this is again similar.
> > >
> > > ...
> > >
> > > > Writing to an existing nf_conn could be added to this helper in the
> > > > future. Then, as an example, an XDP program could populate ct->mark
> > > > and a restore mark rule could be used to apply the mark to the skb.
> > > > This is conceptually similar to the XDP/tc interaction example.
> > >
> > > ...
> > >
> > > > I'm planning to add a bpf_tcp_nf_conn() helper which gives access to
> > > > members of ip_ct_tcp. This is similar to bpf_tcp_sock() in my mind.
> > >
> > > ...
> > >
> > > > I touched on create and update above. Delete, like create, would
> > > > almost certainly be a separate helper. This submission is not
> > > > intended to put us on that track. I do not believe it hinders an
> > > > effort such as that either. Are you worried that adding nf_conn to
> > > > bpf is a slippery slope?
> > >
> > > Looks like there is a need to access quite a bit of ct, ipvs internal states.
> > > I bet neigh, route and other kernel internal tables will be next. The
> > > lookup/update/delete to these tables is necessary. If somebody wants to do a
> > > fast bridge in XDP they may want to reuse icmp_send(). I've seen folks
> > > reimplementing it purely on BPF side, but kernel's icmp_send() is clearly
> > > superior, so exposing it as a helper will be useful too. And so on and so
> > > forth. There are lots of kernel bits that BPF progs want to interact with.
> > >
> > > If we expose all of that via existing bpf helper mechanism we will freeze a
> > > large chunk of networking stack. I agree that accessing these data structures
> > > from BPF side is useful, but I don't think we can risk hardening the kernel so
> > > much. We need new helper mechanism that will be unstable api. It needs to be
> > > obviously unstable to both kernel developers and bpf users. Yet such a mechanism
> > > should allow bpf progs to access all these things without sacrificing safety.
> > >
> > > I think such new mechanism can be modeled similar to kernel modules and
> > > EXPORT_SYMBOL[_GPL] convention. The kernel has established policy that
> > > these functions do change and in-tree kernel modules get updated along the way
> > > while out-of-tree modules get broken periodically. I propose to do the same for BPF.
> > > Certain kernel functions can be marked as EXPORT_SYMBOL_BPF and they will be
> > > eligible to be called from BPF program. The verifier will do safety checks and
> > > type matching based on BTF. The same way it already does for tracing progs.
> > > For example the ct lookup can be:
> > > struct nf_conn *
> > > bpf_ct_lookup(struct __sk_buff *skb, struct nf_conntrack_tuple *tuple, u32 len,
> > >               u8 proto, u64 netns_id, u64 flags)
> > > {
> > > }
> > > EXPORT_SYMBOL_BPF(bpf_ct_lookup);
> > > The first argument 'skb' has stable api and type. It's __sk_buff and it's
> > > context for all skb-based progs, so any program that got __sk_buff from
> > > somewhere can pass it into this helper.
> > > The second argument is 'struct nf_conntrack_tuple *'. It's unstable and
> > > kernel internal. Currently the verifier recognizes it as PTR_TO_BTF_ID
> > > for tracing progs and can do the same for networking. It cannot recognize
> > > it on stack though. Like:
> > > int bpf_prog(struct __sk_buff *skb)
> > > {
> > >   struct nf_conntrack_tuple my_tupple = { ...};
> > >   bpf_ct_lookup(skb, &my_tupple, ...);
> > > }
> > > won't work yet. The verifier needs to be taught to deal with PTR_TO_BTF_ID
> > > where it points to the stack.
> > > The last three arguments are scalars and already recognized as SCALAR_VALUE by
> > > the verifier. So with minor extensions the verifier will be able to prove the
> > > safety of argument passing.
> > >
> > > The return value is trickier. It can be solved with appropriate type
> > > annotations like:
> > > struct nf_conn *
> > > bpf_ct_lookup(struct __sk_buff *skb, struct nf_conntrack_tuple *tuple, u32 len,
> > >              u8 proto, u64 netns_id, u64 flags)
> > > { ...
> > > }
> > > EXPORT_SYMBOL_BPF__acquires(bpf_ct_lookup);
> > > int bpf_ct_release(struct nf_conn * ct)
> > > { ...
> > > }
> > > EXPORT_SYMBOL_BPF__releases(bpf_ct_release);
> > > By convention the return value is acquired and the first argument is released.
> > > Then the verifier will be able to pair them the same way it does
> > > bpf_sk_lookup()/bpf_sk_release(), but in declarative way. So the verifier code
> > > doesn't need to be touched for every such function pair in the future.
> > >
> > > Note struct nf_conn and struct nf_conntrack_tuple stay kernel internal.
> > > BPF program can define fields it wants to access as:
> > > struct nf_conn {
> > >   u32 timeout;
> > >   u64 status;
> > >   u32 mark;
> > > } __attribute__((preserve_access_index));
> > > int bpf_prog()
> > > {
> > >   struct nf_conn *ct = bpf_ct_lookup(...);
> > >   if (ct) {
> > >        ct->timeout;
> > >   }
> > > }
> > > and CO-RE logic will deal with kernel specific relocations.
> > > The same way it does for tracing progs that access all kernel data.
> > >
> > > I think it's plenty obvious that such bpf helpers are unstable api. The
> > > networking programs will have access to all kernel data structures, receive
> > > them from white listed set of EXPORT_SYMBOL_BPF() functions and pass them into
> > > those functions back. Just like tracing progs that have access to everything.
> > > They can read all fields of kernel internal struct sk_buff and pass it into
> > > bpf_skb_output().
> > > The same way kernel modules can access all kernel data structures and call
> > > white listed set of EXPORT_SYMBOL() helpers.
> >
> > I think this sounds great. Looking forward to hearing what others
> > think of this proposal.
>
> No further comments typically means either no objections or lack of
> understanding :) In both cases the patches have to do the talking.

Cool. No patches yet, but a little code to show.

>
> > Presumably we want all of
> > EXPORT_SYMBOL_BPF{,_GPL}{,__acquires,__releases}() as part of the
> > initial effort.
>
> I was thinking that _GPL suffix will be implicit. All such unstable helpers
> would require GPL license because they can appear in modules and I want to make
> sure people don't use this method as a way to extend BPF without sharing the
> code.

Implicitly requiring all bpf exported symbols to be gpl sounds
reasonable. However, the _GPL suffix denotes a GPL license
requirement on the caller (i.e. the bpf prog), not the exporter. It's
the same as bpf_func_proto's gpl_only member. Stable helper code is
all GPL, but calling some stable helpers is permitted from non-GPL
bpf progs. bpf_icmp_send is a good example of an unstable helper which
really doesn't need the calling bpf prog to be GPL licensed (the same
way __icmp_send permits calls from non-GPL modules).

>
> > EXPORT_SYMBOL_BPF(bpf_icmp_send);
> > EXPORT_SYMBOL_BPF__acquires(bpf_ipvs_conn_in_lookup);
> > EXPORT_SYMBOL_BPF__releases(bpf_ipvs_conn_release);
> >
> > EXPORT_SYMBOL_BPF_GPL(bpf_ct_delete);
> > EXPORT_SYMBOL_BPF_GPL__acquires(bpf_ct_lookup);
> > EXPORT_SYMBOL_BPF_GPL__releases(bpf_ct_release);
> >
> > Do we also need a __must_hold type annotation (e.g.
> > EXPORT_SYMBOL_BPF_GPL__must_hold(bpf_ct_delete))?
>
> I don't see how 'must_hold' helps safety.
> It's purely a sparse annotation. What is the verifier supposed to do?

Probably a hasty question on my part. I was thinking must_hold would
mark an unstable helper as requiring the first argument be previously
acquired and not yet released.

>
> > Would we expect all
> > unstable helpers to handle being passed NULL? Or will the existing
> > verifier rule that returned values must be checked non-NULL before
> > use extend to calls of these functions even without the annotation?
>
> I think both ways will be possible.
>
> > We can optionally include
> > EXPORT_UNUSED_SYMBOL_BPF{,_GPL}{,__acquires,__releases}() and
> > EXPORT_SYMBOL_BPF_GPL_FUTURE{,__acquires,__releases}() in the initial
> > effort, but they aren't needed for the helpers proposed so far. Given
> > that they won't be used right away, I'd just as soon leave them for a
> > follow up, when the need arises.
>
> Not sure what you mean by UNUSED and FUTURE.
> All such helpers will likely be 'unused' by the core kernel.
> Just like all existing stable bpf helpers are rarely called directly
> by the kernel code.

Good point, UNUSED makes no sense here. FUTURE denotes an exported
symbol as soon-to-be GPL only. I plan to omit them both.

>
> > In addition to reusing the EXPORT_SYMBOL* convention, I think reusing
> > the existing symvers implementation might be a reasonable choice.
>
> I don't think symvers will work, since they're lacking type info.
> I was thinking that simple BTF annotation will do the trick.
>
> > Some additional thoughts:
> > * Do we want to be able to export the exact same function to modules
> >     and bpf (i.e. without error: redefinition of '__kstrtab_xxx')?
>
> Same function to modules is already done via EXPORT_SYMBOL.
> Let's not mess with that.
>
> > * Do we want asm versions of EXPORT_SYMBOL_BPF*() (e.g. in
> >     include/asm-generic/export.h)?
>
> No. kernel's asm is typeless. bpf verifier cannot call arbitrary functions.
> It has to do type match. Otherwise we'll let bpf crash left and right.
>
> > * If a function with more than 5 parameters is exported via
> >     EXPORT_SYMBOL_BPF*(), should we have the build fail?
>
> Some kind of error is necessary. Build time would be preferred, but I wouldn't
> worry about it at this point.
>
> I think doing BTF annotation for EXPORT_SYMBOL_BPF(bpf_icmp_send); is trivial.

I've been looking into this more; here is what I'm thinking.

1. Export symbols for bpf the same as modules, but into one or more
   special namespaces.

   Exported symbols recently gained namespaces.
     https://lore.kernel.org/linux-usb/20190906103235.197072-1-maennich@google.com/
     Documentation/kbuild/namespaces.rst

   This makes the in-kernel changes needed for export super simple.

     #define EXPORT_SYMBOL_BPF(sym)     EXPORT_SYMBOL_NS(sym, BPF_PROG)
     #define EXPORT_SYMBOL_BPF_GPL(sym) EXPORT_SYMBOL_NS_GPL(sym, BPF_PROG)

   BPF_PROG is our special namespace above. We can easily add
   BPF_PROG_ACQUIRES and BPF_PROG_RELEASES for those types of
   unstable helpers.

   Exports for bpf progs are then as simple as for modules.

     EXPORT_SYMBOL_BPF(bpf_icmp_send);

   Documenting these namespaces as not for use by modules should be
   enough; an explicit import statement to use namespaced symbols is
   already required. Explicitly preventing module use in
   MODULE_IMPORT_NS or modpost are also options if we feel more is
   needed.

2. Teach pahole's (dwarves') dwarf loader to parse __ksymtab*.

   I've got a functional wip which retrieves the namespace from the
   __kstrtab ELF section. Working to differentiate between __ksymtab
   and __ksymtab_gpl symbols next. Good news is this info is readily
   available in vmlinux and module .o files. The interface here will
   probably end up similar to dwarves' elf_symtab__*, but with a
   struct elf_ksymtab per __ksymtab* section (all pointing to the
   same __kstrtab section though).

3. Teach pahole's btf encoder to encode the following bools: export,
   gpl_only, acquires, releases.

   I'm envisioning this info will end up in a new struct
   btf_func_proto in btf.h. Perhaps like this.

     struct btf_func_proto {
         /* "info" bits arrangement
          * bit     0: exported (callable by bpf prog)
          * bit     1: gpl_only (only callable from GPL licensed bpf prog)
          * bit     2: acquires (acquires and returns a refcounted pointer)
          * bit     3: releases (first argument, a refcounted pointer,
          *            is released)
          * bits 4-31: unused
          */
         __u32    info;
     };

   Currently, a "struct btf_type" of type BTF_KIND_FUNC_PROTO is
   directly followed by vlen struct btf_param/s. I'm hoping we can
   insert btf_func_proto before the first btf_param or after the
   last. If that's not workable, adding a new type,
   BTF_KIND_FUNC_EXPORT, is another idea.

4. Teach btf consumers to work with the new info.

   Haven't started looking at this yet, but I'd imagine it'll be
   straightforward once the new info has been encoded in btf.

With those items in place, we gain some cool properties for exports
to bpf programs.

  - Exporting new symbols to bpf programs will be trivial.
  - Our exports will be obvious to the kernel community at large.
  - We'll be leveraging namespaces as intended to categorize our
      exports.
  - genksyms will generate a crc to identify our exports.

The crcs could be used to improve the developer experience when
using unstable helpers.

This concept is pulled from the trace/CO-RE work; it's just like in
"Dealing with compile-time #if's in BCC".
  https://facebookmicrosites.github.io/bpf/blog/2020/02/20/bcc-to-libbpf-howto-guide.html#dealing-with-compile-time-ifs-in-bcc

I believe we can make something like this possible.

  extern uint32_t crc_of_bpf_icmp_send __kcrctab;

  switch(crc_of_bpf_icmp_send) {
      case 0x12345678:
          /* use struct/call flavors associated with 0x12345678 */
          break;
      case 0x87654321:
          /* use struct/call flavors associated with 0x87654321 */
          break;
  }

Thoughts on this export mechanism?

> The interesting implementation detail is how to do BPF_CALL_x() in a generic
> way. The BPF trampoline solved the problem of calling BPF programs from the
> kernel. It translates kernel calling convention into BPF. To do unstable
> helpers the reverse BPF trampoline is needed. It would need to translate BPF
> calling convention into kernel. The nice part is that on x86-64 no reverse
> trampoline is necessary. JITed BPF code can call arbitrary (with <=5 args)
> functions without any tricks, but other architectures are not that lucky. Hence
> infrastructure has to be built to support all archs. It's ok to have it as a nop
> for x86-64, but infra should be there from the start. Take a look at
> btf_func_model. Kernel function needs to be distilled into it and reverse BPF
> trampoline generated.

I'll keep working on this as time permits. If anyone has time to
collaborate on unstable helpers (or even answer questions as I get
familiarized with some of the deeper topics), please let me know.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: unstable bpf helpers proposal. Was: [PATCH bpf-next v2 1/2] bpf: add bpf_ct_lookup_{tcp,udp}() helpers
  2020-04-03 23:56               ` Matt Cover
@ 2020-04-07  3:03                 ` Alexei Starovoitov
  2020-04-07  5:28                   ` Matt Cover
  0 siblings, 1 reply; 22+ messages in thread
From: Alexei Starovoitov @ 2020-04-07  3:03 UTC (permalink / raw)
  To: Matt Cover
  Cc: Daniel Borkmann, David S. Miller, Matthew Cover, netdev, bpf,
	kernel-team

On Fri, Apr 03, 2020 at 04:56:01PM -0700, Matt Cover wrote:
> > I think doing BTF annotation for EXPORT_SYMBOL_BPF(bpf_icmp_send); is trivial.
> 
> I've been looking into this more; here is what I'm thinking.
> 
> 1. Export symbols for bpf the same as modules, but into one or more
>    special namespaces.
> 
>    Exported symbols recently gained namespaces.
>      https://lore.kernel.org/linux-usb/20190906103235.197072-1-maennich@google.com/
>      Documentation/kbuild/namespaces.rst
> 
>    This makes the in-kernel changes needed for export super simple.
> 
>      #define EXPORT_SYMBOL_BPF(sym)     EXPORT_SYMBOL_NS(sym, BPF_PROG)
>      #define EXPORT_SYMBOL_BPF_GPL(sym) EXPORT_SYMBOL_NS_GPL(sym, BPF_PROG)
> 
>    BPF_PROG is our special namespace above. We can easily add
>    BPF_PROG_ACQUIRES and BPF_PROG_RELEASES for those types of
>    unstable helpers.
> 
>    Exports for bpf progs are then as simple as for modules.
> 
>      EXPORT_SYMBOL_BPF(bpf_icmp_send);
> 
>    Documenting these namespaces as not for use by modules should be
>    enough; an explicit import statement to use namespaced symbols is
>    already required. Explicitly preventing module use in
>    MODULE_IMPORT_NS or modpost are also options if we feel more is
>    needed.
> 
> 2. Teach pahole's (dwarves') dwarf loader to parse __ksymtab*.
> 
>    I've got a functional wip which retrieves the namespace from the
>    __kstrtab ELF section. Working to differentiate between __ksymtab
>    and __ksymtab_gpl symbols next. Good news is this info is readily
>    available in vmlinux and module .o files. The interface here will
>    probably end up similar to dwarves' elf_symtab__*, but with a
>    struct elf_ksymtab per __ksymtab* section (all pointing to the
>    same __kstrtab section though).
> 
> 3. Teach pahole's btf encoder to encode the following bools: export,
>    gpl_only, acquires, releases.
> 
>    I'm envisioning this info will end up in a new struct
>    btf_func_proto in btf.h. Perhaps like this.
> 
>      struct btf_func_proto {
>          /* "info" bits arrangement
>           * bit     0: exported (callable by bpf prog)
>           * bit     1: gpl_only (only callable from GPL licensed bpf prog)
>           * bit     2: acquires (acquires and returns a refcounted pointer)
>           * bit     3: releases (first argument, a refcounted pointer,
>           *            is released)
>           * bits 4-31: unused
>           */
>          __u32    info;
>      };
> 
>    Currently, a "struct btf_type" of type BTF_KIND_FUNC_PROTO is
>    directly followed by vlen struct btf_param/s. I'm hoping we can
>    insert btf_func_proto before the first btf_param or after the
>    last. If that's not workable, adding a new type,
>    BTF_KIND_FUNC_EXPORT, is another idea.

I don't see why 1 and 2 are necessary.
What is the value of true export_symbol here?
What is the value of namespaced true export_symbol?
Imo it only adds memory overhead to vmlinux.
The same information is available in BTF as a _name_.
What is the point of replicating it into kcrc?
Imo kcrc is a poor protection mechanism that is already worse
than BTF. I really don't see value in going that route.

I think just encoding the intent to export into BTF is enough.
Option 3 above looks like overkill too. Just name convention would do.
We already use different prefixes to encode certain BTFs
(see struct_ops and btf_trace).
Just say when BTF func_proto starts with "export_" it means it's exported.
It would be trivial for users to grep as well:
bpftool btf dump file ./vmlinux |grep export_

> 
> The crcs could be used to improve the developer experience when
> using unstable helpers.

crcs don't add any additional value on top of BTF. BTF types have to match exactly.
It's like a C compiler checking that you can call a function with the correct proto.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: unstable bpf helpers proposal. Was: [PATCH bpf-next v2 1/2] bpf: add bpf_ct_lookup_{tcp,udp}() helpers
  2020-04-07  3:03                 ` Alexei Starovoitov
@ 2020-04-07  5:28                   ` Matt Cover
  2020-04-07 17:34                     ` Alexei Starovoitov
  0 siblings, 1 reply; 22+ messages in thread
From: Matt Cover @ 2020-04-07  5:28 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Daniel Borkmann, David S. Miller, Matthew Cover, netdev, bpf,
	kernel-team

On Mon, Apr 6, 2020 at 8:03 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Fri, Apr 03, 2020 at 04:56:01PM -0700, Matt Cover wrote:
> > > I think doing BTF annotation for EXPORT_SYMBOL_BPF(bpf_icmp_send); is trivial.
> >
> > I've been looking into this more; here is what I'm thinking.
> >
> > 1. Export symbols for bpf the same as modules, but into one or more
> >    special namespaces.
> >
> >    Exported symbols recently gained namespaces.
> >      https://lore.kernel.org/linux-usb/20190906103235.197072-1-maennich@google.com/
> >      Documentation/kbuild/namespaces.rst
> >
> >    This makes the in-kernel changes needed for export super simple.
> >
> >      #define EXPORT_SYMBOL_BPF(sym)     EXPORT_SYMBOL_NS(sym, BPF_PROG)
> >      #define EXPORT_SYMBOL_BPF_GPL(sym) EXPORT_SYMBOL_NS_GPL(sym, BPF_PROG)
> >
> >    BPF_PROG is our special namespace above. We can easily add
> >    BPF_PROG_ACQUIRES and BPF_PROG_RELEASES for those types of
> >    unstable helpers.
> >
> >    Exports for bpf progs are then as simple as for modules.
> >
> >      EXPORT_SYMBOL_BPF(bpf_icmp_send);
> >
> >    Documenting these namespaces as not for use by modules should be
> >    enough; an explicit import statement to use namespaced symbols is
> >    already required. Explicitly preventing module use in
> >    MODULE_IMPORT_NS or modpost are also options if we feel more is
> >    needed.
> >
> > 2. Teach pahole's (dwarves') dwarf loader to parse __ksymtab*.
> >
> >    I've got a functional wip which retrieves the namespace from the
> >    __kstrtab ELF section. Working to differentiate between __ksymtab
> >    and __ksymtab_gpl symbols next. Good news is this info is readily
> >    available in vmlinux and module .o files. The interface here will
> >    probably end up similar to dwarves' elf_symtab__*, but with a
> >    struct elf_ksymtab per __ksymtab* section (all pointing to the
> >    same __kstrtab section though).
> >
> > 3. Teach pahole's btf encoder to encode the following bools: export,
> >    gpl_only, acquires, releases.
> >
> >    I'm envisioning this info will end up in a new struct
> >    btf_func_proto in btf.h. Perhaps like this.
> >
> >      struct btf_func_proto {
> >          /* "info" bits arrangement
> >           * bit     0: exported (callable by bpf prog)
> >           * bit     1: gpl_only (only callable from GPL licensed bpf prog)
> >           * bit     2: acquires (acquires and returns a refcounted pointer)
> >           * bit     3: releases (first argument, a refcounted pointer,
> >           *            is released)
> >           * bits 4-31: unused
> >           */
> >          __u32    info;
> >      };
> >
> >    Currently, a "struct btf_type" of type BTF_KIND_FUNC_PROTO is
> >    directly followed by vlen struct btf_param/s. I'm hoping we can
> >    insert btf_func_proto before the first btf_param or after the
> >    last. If that's not workable, adding a new type,
> >    BTF_KIND_FUNC_EXPORT, is another idea.
>
> I don't see why 1 and 2 are necessary.
> What is the value of true export_symbol here?

Hmm... I was under the impression that these functions had to be
exported to be eligible for BTF. Perhaps I'm misunderstanding this
dwarves commit:

  3c5f2a224aa1 ("btf_encoder: Preserve and encode exported functions
as BTF_KIND_FUNC")

Looking briefly I can see that the functions in symvers and BTF are
not an exact match. Does "exported functions" in the above commit
message not mean "exported symbols"?

It looks like BTF FUNCs line up perfectly with symbols marked 'T' and
'W' in kallsyms. I'll look into what adds a [TW] marked symbol to
kallsyms and see how this differs from symvers.

> What is the value of namespaced true export_symbol?

This simply seemed like a clean way to group the symbols under the
premise these functions already needed to be exported via
EXPORT_SYMBOL*.

> Imo it only adds memory overhead to vmlinux.
> The same information is available in BTF as a _name_.
> What is the point of replicating it into kcrc?

See below.

> Imo kcrc is a poor protection mechanism that is already worse
> than BTF. I really don't see value in going that route.
>

See below.

> I think just encoding the intent to export into BTF is enough.
> Option 3 above looks like overkill too. Just name convention would do.
> We already use different prefixes to encode certain BTFs
> (see struct_ops and btf_trace).
> Just say when BTF func_proto starts with "export_" it means it's exported.
> It would be trivial for users to grep as well:
> bpftool btf dump file ./vmlinux |grep export_

Ok, cool. A naming convention could work (even if it turns out we do
have to EXPORT_SYMBOL).

>
> >
> > The crcs could be used to improve the developer experience when
> > using unstable helpers.
>
> crcs don't add any additional value on top of BTF. BTF types have to match exactly.
> It's like a C compiler checking that you can call a function with the correct proto.

I can see that for the verifier BTF is much superior to crc. The
safety of the program is not improved by the crc. I was simply
thinking the crc could be used in struct variant selection instead
of kernel version. In some environments this could be useful since
distros often backport patches while leaving the version old (often
meaning a second distro-specific version must also be considered).

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: unstable bpf helpers proposal. Was: [PATCH bpf-next v2 1/2] bpf: add bpf_ct_lookup_{tcp,udp}() helpers
  2020-04-07  5:28                   ` Matt Cover
@ 2020-04-07 17:34                     ` Alexei Starovoitov
  0 siblings, 0 replies; 22+ messages in thread
From: Alexei Starovoitov @ 2020-04-07 17:34 UTC (permalink / raw)
  To: Matt Cover
  Cc: Daniel Borkmann, David S. Miller, Matthew Cover, netdev, bpf,
	kernel-team

On Mon, Apr 06, 2020 at 10:28:13PM -0700, Matt Cover wrote:
> >
> > I don't see why 1 and 2 are necessary.
> > What is the value of true export_symbol here?
> 
> Hmm... I was under the impression that these functions had to be
> exported to be eligible for BTF. Perhaps I'm misunderstanding this
> dwarves commit:
> 
>   3c5f2a224aa1 ("btf_encoder: Preserve and encode exported functions
> as BTF_KIND_FUNC")
> 
> Looking briefly I can see that the functions in symvers and BTF are
> not an exact match. Does "exported functions" in the above commit
> message not mean "exported symbols"?

Yeah. That pahole commit log is confusing.
It meant to say that all exported symbols will be in BTF
along with all other global functions.
$ bpftool btf dump file ./bld_x64/vmlinux|grep __icmp_send
[71784] FUNC '__icmp_send' type_id=71783
$ bpftool btf dump file ./bld_x64/vmlinux|grep bpf_prog_alloc_no_stats
[17945] FUNC 'bpf_prog_alloc_no_stats' type_id=17943
First one is exported. Second is a simple global.
There is no difference between them from BTF pov.

pahole can be improved too.
If it turns out that certain static functions have to be in BTF
we can easily make it so.

> >
> > crcs don't add any additional value on top of BTF. BTF types have to match exactly.
> > It's like a C compiler checking that you can call a function with the correct proto.
> 
> I can see that for the verifier BTF is much superior to crc. The
> safety of the program is not improved by the crc. I was simply
> thinking the crc could be used in struct variant selection instead
> of kernel version. In some environments this could be useful since
> distros often backport patches while leaving the version old (often
> meaning a second distro-specific version must also be considered).

The kernel version should not be used in any kind of logic. Lots of folks
backport bpf patches to older versions of the kernel. The kernel version is
meaningless from bpf pov. We even removed the kernel_version check from kprobe
programs, because it was useless. vmlinux BTF is a complete description of the
running kernel. It's the one that the verifier uses to do its safety checks.
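
For example, on kernels built with CONFIG_DEBUG_INFO_BTF the running
kernel's BTF can be inspected directly:
$ bpftool btf dump file /sys/kernel/btf/vmlinux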

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2020-04-07 17:34 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-01-18  0:01 [PATCH bpf-next] bpf: add bpf_ct_lookup_{tcp,udp}() helpers Matthew Cover
2020-01-18 11:37 ` kbuild test robot
2020-01-18 11:58 ` kbuild test robot
2020-01-19  3:05 ` John Fastabend
2020-01-20 18:11   ` Matt Cover
2020-01-20 20:10     ` Matt Cover
2020-01-20 21:11       ` Daniel Borkmann
2020-01-20 21:21         ` Matt Cover
2020-01-23 21:28         ` Matt Cover
2020-01-21 20:20 ` [PATCH bpf-next v2 1/2] " Matthew Cover
2020-01-21 20:22   ` [PATCH bpf-next v2 2/2] selftests/bpf: test references to nf_conn Matthew Cover
2020-01-21 20:35   ` [PATCH bpf-next v2 1/2] bpf: add bpf_ct_lookup_{tcp,udp}() helpers Matt Cover
2020-01-21 21:31     ` Matt Cover
2020-01-24 19:11     ` Joe Stringer
2020-01-24 21:46       ` Matt Cover
2020-01-30 21:53         ` unstable bpf helpers proposal. Was: " Alexei Starovoitov
2020-02-06  6:13           ` Matt Cover
2020-02-20  4:45             ` Alexei Starovoitov
2020-04-03 23:56               ` Matt Cover
2020-04-07  3:03                 ` Alexei Starovoitov
2020-04-07  5:28                   ` Matt Cover
2020-04-07 17:34                     ` Alexei Starovoitov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).