linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] netfilter: nf_conntrack: Add conntrack helper for ESP/IPsec
@ 2021-04-14  3:53 Cole Dishington
  2021-04-14 15:40 ` Florian Westphal
  0 siblings, 1 reply; 10+ messages in thread
From: Cole Dishington @ 2021-04-14  3:53 UTC (permalink / raw)
  To: pablo
  Cc: kadlec, fw, davem, kuba, linux-kernel, netfilter-devel, coreteam,
	netdev, Cole Dishington

Introduce changes to add ESP connection tracking helper to netfilter
conntrack. The connection tracking of ESP is based on IPsec SPIs. The
underlying motivation for this patch was to allow multiple VPN ESP
clients to be distinguished when using NAT.

Added config flag CONFIG_NF_CT_PROTO_ESP to enable the ESP/IPsec
conntrack helper.

Signed-off-by: Cole Dishington <Cole.Dishington@alliedtelesis.co.nz>
---
 .../linux/netfilter/nf_conntrack_proto_esp.h  |  25 +
 .../net/netfilter/ipv4/nf_conntrack_ipv4.h    |   3 +
 include/net/netfilter/nf_conntrack.h          |   2 +
 include/net/netfilter/nf_conntrack_l4proto.h  |  15 +
 include/net/netfilter/nf_conntrack_tuple.h    |   3 +
 include/net/netns/conntrack.h                 |  24 +
 .../netfilter/nf_conntrack_tuple_common.h     |   3 +
 .../linux/netfilter/nfnetlink_conntrack.h     |   2 +
 net/netfilter/Kconfig                         |  10 +
 net/netfilter/Makefile                        |   1 +
 net/netfilter/nf_conntrack_core.c             |  23 +
 net/netfilter/nf_conntrack_netlink.c          |   4 +-
 net/netfilter/nf_conntrack_proto.c            |   6 +
 net/netfilter/nf_conntrack_proto_esp.c        | 535 ++++++++++++++++++
 net/netfilter/nf_conntrack_standalone.c       |   5 +
 net/netfilter/nf_internals.h                  |   4 +-
 16 files changed, 663 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/netfilter/nf_conntrack_proto_esp.h
 create mode 100644 net/netfilter/nf_conntrack_proto_esp.c

diff --git a/include/linux/netfilter/nf_conntrack_proto_esp.h b/include/linux/netfilter/nf_conntrack_proto_esp.h
new file mode 100644
index 000000000000..2441e031c68e
--- /dev/null
+++ b/include/linux/netfilter/nf_conntrack_proto_esp.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _CONNTRACK_PROTO_ESP_H
+#define _CONNTRACK_PROTO_ESP_H
+#include <asm/byteorder.h>
+
+/* ESP PROTOCOL HEADER */
+
+struct esphdr {
+	__u32 spi;
+};
+
+struct nf_ct_esp {
+	unsigned int stream_timeout;
+	unsigned int timeout;
+};
+
+#ifdef __KERNEL__
+#include <net/netfilter/nf_conntrack_tuple.h>
+
+void destroy_esp_conntrack_entry(struct nf_conn *ct);
+
+bool esp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
+		      struct net *net, struct nf_conntrack_tuple *tuple);
+#endif /* __KERNEL__ */
+#endif /* _CONNTRACK_PROTO_ESP_H */
diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
index 2c8c2b023848..1aee91592639 100644
--- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
+++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
@@ -25,5 +25,8 @@ extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite;
 #ifdef CONFIG_NF_CT_PROTO_GRE
 extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre;
 #endif
+#ifdef CONFIG_NF_CT_PROTO_ESP
+extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_esp;
+#endif
 
 #endif /*_NF_CONNTRACK_IPV4_H*/
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 439379ca9ffa..2bd1d94de138 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -21,6 +21,7 @@
 #include <linux/netfilter/nf_conntrack_dccp.h>
 #include <linux/netfilter/nf_conntrack_sctp.h>
 #include <linux/netfilter/nf_conntrack_proto_gre.h>
+#include <linux/netfilter/nf_conntrack_proto_esp.h>
 
 #include <net/netfilter/nf_conntrack_tuple.h>
 
@@ -36,6 +37,7 @@ union nf_conntrack_proto {
 	struct ip_ct_tcp tcp;
 	struct nf_ct_udp udp;
 	struct nf_ct_gre gre;
+	struct nf_ct_esp esp;
 	unsigned int tmpl_padto;
 };
 
diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
index 96f9cf81f46b..ec89e83ff20e 100644
--- a/include/net/netfilter/nf_conntrack_l4proto.h
+++ b/include/net/netfilter/nf_conntrack_l4proto.h
@@ -75,6 +75,8 @@ bool nf_conntrack_invert_icmp_tuple(struct nf_conntrack_tuple *tuple,
 				    const struct nf_conntrack_tuple *orig);
 bool nf_conntrack_invert_icmpv6_tuple(struct nf_conntrack_tuple *tuple,
 				      const struct nf_conntrack_tuple *orig);
+bool nf_conntrack_invert_esp_tuple(struct nf_conntrack_tuple *tuple,
+				   const struct nf_conntrack_tuple *orig);
 
 int nf_conntrack_inet_error(struct nf_conn *tmpl, struct sk_buff *skb,
 			    unsigned int dataoff,
@@ -132,6 +134,11 @@ int nf_conntrack_gre_packet(struct nf_conn *ct,
 			    unsigned int dataoff,
 			    enum ip_conntrack_info ctinfo,
 			    const struct nf_hook_state *state);
+int nf_conntrack_esp_packet(struct nf_conn *ct,
+			    struct sk_buff *skb,
+			    unsigned int dataoff,
+			    enum ip_conntrack_info ctinfo,
+			    const struct nf_hook_state *state);
 
 void nf_conntrack_generic_init_net(struct net *net);
 void nf_conntrack_tcp_init_net(struct net *net);
@@ -141,6 +148,7 @@ void nf_conntrack_dccp_init_net(struct net *net);
 void nf_conntrack_sctp_init_net(struct net *net);
 void nf_conntrack_icmp_init_net(struct net *net);
 void nf_conntrack_icmpv6_init_net(struct net *net);
+void nf_conntrack_esp_init_net(struct net *net);
 
 /* Existing built-in generic protocol */
 extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic;
@@ -240,4 +248,11 @@ static inline struct nf_gre_net *nf_gre_pernet(struct net *net)
 }
 #endif
 
+#ifdef CONFIG_NF_CT_PROTO_ESP
+static inline struct nf_esp_net *nf_esp_pernet(struct net *net)
+{
+	return &net->ct.nf_ct_proto.esp;
+}
+#endif
+
 #endif /*_NF_CONNTRACK_PROTOCOL_H*/
diff --git a/include/net/netfilter/nf_conntrack_tuple.h b/include/net/netfilter/nf_conntrack_tuple.h
index 9334371c94e2..7b9c3b5ae8cc 100644
--- a/include/net/netfilter/nf_conntrack_tuple.h
+++ b/include/net/netfilter/nf_conntrack_tuple.h
@@ -62,6 +62,9 @@ struct nf_conntrack_tuple {
 			struct {
 				__be16 key;
 			} gre;
+			struct {
+				__be16 spi;
+			} esp;
 		} u;
 
 		/* The protocol. */
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index 806454e767bf..c6f21c6316f0 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -69,6 +69,27 @@ struct nf_gre_net {
 };
 #endif
 
+#ifdef CONFIG_NF_CT_PROTO_ESP
+#define ESP_MAX_PORTS      1000
+#define HASH_TAB_SIZE  ESP_MAX_PORTS
+
+enum esp_conntrack {
+	ESP_CT_UNREPLIED,
+	ESP_CT_REPLIED,
+	ESP_CT_MAX
+};
+
+struct nf_esp_net {
+	rwlock_t esp_table_lock;
+	struct hlist_head ltable[HASH_TAB_SIZE];
+	struct hlist_head rtable[HASH_TAB_SIZE];
+	/* Initial lookup for remote end until rspi is known */
+	struct hlist_head incmpl_rtable[HASH_TAB_SIZE];
+	struct _esp_table *esp_table[ESP_MAX_PORTS];
+	unsigned int esp_timeouts[ESP_CT_MAX];
+};
+#endif
+
 struct nf_ip_net {
 	struct nf_generic_net   generic;
 	struct nf_tcp_net	tcp;
@@ -84,6 +105,9 @@ struct nf_ip_net {
 #ifdef CONFIG_NF_CT_PROTO_GRE
 	struct nf_gre_net	gre;
 #endif
+#ifdef CONFIG_NF_CT_PROTO_ESP
+	struct nf_esp_net	esp;
+#endif
 };
 
 struct ct_pcpu {
diff --git a/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h b/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h
index 64390fac6f7e..9bbd76c325d2 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h
@@ -39,6 +39,9 @@ union nf_conntrack_man_proto {
 	struct {
 		__be16 key;	/* GRE key is 32bit, PPtP only uses 16bit */
 	} gre;
+	struct {
+		__be16 spi;
+	} esp;
 };
 
 #define CTINFO2DIR(ctinfo) ((ctinfo) >= IP_CT_IS_REPLY ? IP_CT_DIR_REPLY : IP_CT_DIR_ORIGINAL)
diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h
index d8484be72fdc..f9f81be7a163 100644
--- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h
+++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h
@@ -90,6 +90,8 @@ enum ctattr_l4proto {
 	CTA_PROTO_ICMPV6_ID,
 	CTA_PROTO_ICMPV6_TYPE,
 	CTA_PROTO_ICMPV6_CODE,
+	CTA_PROTO_SRC_ESP_SPI,
+	CTA_PROTO_DST_ESP_SPI,
 	__CTA_PROTO_MAX
 };
 #define CTA_PROTO_MAX (__CTA_PROTO_MAX - 1)
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 1a92063c73a4..7269312d322e 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -199,6 +199,16 @@ config NF_CT_PROTO_UDPLITE
 
 	  If unsure, say Y.
 
+config NF_CT_PROTO_ESP
+	bool "ESP protocol support"
+	depends on NETFILTER_ADVANCED
+	help
+	  ESP connection tracking helper. Provides connection tracking for IPsec
+	  clients behind this device based on SPI, especially useful for
+	  distinguishing multiple clients when using NAT.
+
+	  If unsure, say N.
+
 config NF_CONNTRACK_AMANDA
 	tristate "Amanda backup protocol support"
 	depends on NETFILTER_ADVANCED
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 33da7bf1b68e..0942f2c48ddb 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -14,6 +14,7 @@ nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o
 nf_conntrack-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o
 nf_conntrack-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o
 nf_conntrack-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o
+nf_conntrack-$(CONFIG_NF_CT_PROTO_ESP) += nf_conntrack_proto_esp.o
 
 obj-$(CONFIG_NETFILTER) = netfilter.o
 
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index ff0168736f6e..3bef361d19ce 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -295,6 +295,10 @@ nf_ct_get_tuple(const struct sk_buff *skb,
 #ifdef CONFIG_NF_CT_PROTO_GRE
 	case IPPROTO_GRE:
 		return gre_pkt_to_tuple(skb, dataoff, net, tuple);
+#endif
+#ifdef CONFIG_NF_CT_PROTO_ESP
+	case IPPROTO_ESP:
+		return esp_pkt_to_tuple(skb, dataoff, net, tuple);
 #endif
 	case IPPROTO_TCP:
 	case IPPROTO_UDP: /* fallthrough */
@@ -439,6 +443,10 @@ nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
 #if IS_ENABLED(CONFIG_IPV6)
 	case IPPROTO_ICMPV6:
 		return nf_conntrack_invert_icmpv6_tuple(inverse, orig);
+#endif
+#ifdef CONFIG_NF_CT_PROTO_ESP
+	case IPPROTO_ESP:
+		return nf_conntrack_invert_esp_tuple(inverse, orig);
 #endif
 	}
 
@@ -593,6 +601,13 @@ static void destroy_gre_conntrack(struct nf_conn *ct)
 #endif
 }
 
+static void destroy_esp_conntrack(struct nf_conn *ct)
+{
+#ifdef CONFIG_NF_CT_PROTO_ESP
+	destroy_esp_conntrack_entry(ct);
+#endif
+}
+
 static void
 destroy_conntrack(struct nf_conntrack *nfct)
 {
@@ -609,6 +624,9 @@ destroy_conntrack(struct nf_conntrack *nfct)
 	if (unlikely(nf_ct_protonum(ct) == IPPROTO_GRE))
 		destroy_gre_conntrack(ct);
 
+	if (unlikely(nf_ct_protonum(ct) == IPPROTO_ESP))
+		destroy_esp_conntrack(ct);
+
 	local_bh_disable();
 	/* Expectations will have been removed in clean_from_lists,
 	 * except TFTP can create an expectation on the first packet,
@@ -1783,6 +1801,11 @@ static int nf_conntrack_handle_packet(struct nf_conn *ct,
 	case IPPROTO_GRE:
 		return nf_conntrack_gre_packet(ct, skb, dataoff,
 					       ctinfo, state);
+#endif
+#ifdef CONFIG_NF_CT_PROTO_ESP
+	case IPPROTO_ESP:
+		return nf_conntrack_esp_packet(ct, skb, dataoff,
+					       ctinfo, state);
 #endif
 	}
 
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 1d519b0e51a5..f4a18a9c8ad4 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1382,7 +1382,9 @@ static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = {
    CTA_FILTER_F_CTA_PROTO_ICMP_ID | \
    CTA_FILTER_F_CTA_PROTO_ICMPV6_TYPE | \
    CTA_FILTER_F_CTA_PROTO_ICMPV6_CODE | \
-   CTA_FILTER_F_CTA_PROTO_ICMPV6_ID)
+   CTA_FILTER_F_CTA_PROTO_ICMPV6_ID | \
+   CTA_FILTER_F_CTA_PROTO_SRC_ESP_SPI | \
+   CTA_FILTER_F_CTA_PROTO_DST_ESP_SPI)
 
 static int
 ctnetlink_parse_tuple_filter(const struct nlattr * const cda[],
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 47e9319d2cf3..37beb8ce085c 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -112,6 +112,9 @@ const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto)
 #ifdef CONFIG_NF_CT_PROTO_GRE
 	case IPPROTO_GRE: return &nf_conntrack_l4proto_gre;
 #endif
+#ifdef CONFIG_NF_CT_PROTO_ESP
+	case IPPROTO_ESP: return &nf_conntrack_l4proto_esp;
+#endif
 #if IS_ENABLED(CONFIG_IPV6)
 	case IPPROTO_ICMPV6: return &nf_conntrack_l4proto_icmpv6;
 #endif /* CONFIG_IPV6 */
@@ -691,6 +694,9 @@ void nf_conntrack_proto_pernet_init(struct net *net)
 #ifdef CONFIG_NF_CT_PROTO_GRE
 	nf_conntrack_gre_init_net(net);
 #endif
+#ifdef CONFIG_NF_CT_PROTO_ESP
+	nf_conntrack_esp_init_net(net);
+#endif
 }
 
 void nf_conntrack_proto_pernet_fini(struct net *net)
diff --git a/net/netfilter/nf_conntrack_proto_esp.c b/net/netfilter/nf_conntrack_proto_esp.c
new file mode 100644
index 000000000000..2924bd82c78c
--- /dev/null
+++ b/net/netfilter/nf_conntrack_proto_esp.c
@@ -0,0 +1,535 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * <:copyright-gpl
+ * Copyright 2008 Broadcom Corp. All Rights Reserved.
+ * Copyright (C) 2021 Allied Telesis Labs NZ
+ *
+ * This program is free software; you can distribute it and/or modify it
+ * under the terms of the GNU General Public License (Version 2) as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program.
+ * :>
+ */
+/******************************************************************************
+ * Filename:       nf_conntrack_proto_esp.c
+ * Author:         Pavan Kumar
+ * Creation Date:  05/27/04
+ *
+ * Description:
+ * Implements the ESP ALG connectiontracking.
+ * Migrated to kernel 2.6.21.5 on April 16, 2008 by Dan-Han Tsai.
+ * Migrated to kernel 5.11.0-rc2+ on March 3, 2021 by Allied Telesis Labs NZ (Cole Dishington).
+ *
+ * Updates to ESP conntracking on October,2010,by Manamohan,Lantiq Deutschland GmbH:
+ *	- Added the support for sessions with two or more different remote servers
+ *    from single or multiple lan clients with same lan and remote SPI Ids
+ *	- Support for associating the multiple LAN side sessions waiting
+ *    for the reply from same remote server with the one which is created first
+ * Updates to ESP conntracking on August,2015,by Allied Telesis Labs NZ:
+ *	- Improve ESP entry lookup performance by adding hashtable. (Anthony Lineham)
+ *	- Add locking around ESP connection table. (Anthony Lineham)
+ *	- Fixups including adding destroy function, endian-safe SPIs and IPs,
+ *	  replace prinks with DEBUGs. (Anthony Lineham)
+ *	- Extend ESP connection tracking to allow conntrack ESP entry matching
+ *	  of tuple values. (Matt Bennett)
+ ****************************************************************************/
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/list.h>
+#include <linux/seq_file.h>
+#include <linux/in.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/dst.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
+#include <linux/netfilter/nf_conntrack_proto_esp.h>
+
+#include "nf_internals.h"
+
+#if 0
+#define ESP_DEBUG 1
+#define DEBUGP(format, args...) printk(KERN_DEBUG "%s: " format, __func__, ## args)
+#else
+#undef ESP_DEBUG
+#define DEBUGP(x, args...)
+#endif
+
+#define TEMP_SPI_START 1500
+#define TEMP_SPI_MAX   (TEMP_SPI_START + ESP_MAX_PORTS - 1)
+
+struct _esp_table {
+	/* Hash table nodes for each required lookup
+	 * lnode: l_spi, l_ip, r_ip
+	 * rnode: r_spi, r_ip
+	 * incmpl_rnode: r_ip
+	 */
+	struct hlist_node lnode;
+	struct hlist_node rnode;
+	struct hlist_node incmpl_rnode;
+
+	u32 l_spi;
+	u32 r_spi;
+	u32 l_ip;
+	u32 r_ip;
+	u16 tspi;
+	unsigned long allocation_time;
+	struct net *net;
+};
+
+static unsigned int esp_timeouts[ESP_CT_MAX] = {
+	[ESP_CT_UNREPLIED] = 60 * HZ,
+	[ESP_CT_REPLIED] = 3600 * HZ,
+};
+
+static inline struct nf_esp_net *esp_pernet(struct net *net)
+{
+	return &net->ct.nf_ct_proto.esp;
+}
+
+static void esp_init_esp_tables(struct nf_esp_net *net_esp)
+{
+	struct _esp_table **esp_table;
+	int i;
+
+	rwlock_init(&net_esp->esp_table_lock);
+
+	write_lock_bh(&net_esp->esp_table_lock);
+	esp_table = net_esp->esp_table;
+	for (i = 0; i < ESP_MAX_PORTS; i++)
+		memset(&esp_table[i], 0, sizeof(struct _esp_table *));
+
+	for (i = 0; i < HASH_TAB_SIZE; i++) {
+		INIT_HLIST_HEAD(&net_esp->ltable[i]);
+		INIT_HLIST_HEAD(&net_esp->rtable[i]);
+		INIT_HLIST_HEAD(&net_esp->incmpl_rtable[i]);
+	}
+	DEBUGP("Initialized %i ESP table entries\n", i);
+	write_unlock_bh(&net_esp->esp_table_lock);
+}
+
+void nf_conntrack_esp_init_net(struct net *net)
+{
+	struct nf_esp_net *net_esp = esp_pernet(net);
+	int i;
+
+	esp_init_esp_tables(net_esp);
+	for (i = 0; i < ESP_CT_MAX; i++)
+		net_esp->esp_timeouts[i] = esp_timeouts[i];
+}
+
+/* Free an entry referred to by TSPI.
+ * Entry table locking and unlocking is the responsibility of the calling function.
+ * Range checking is the responsibility of the calling function.
+ */
+static void esp_table_free_entry_by_tspi(struct net *net, u16 tspi)
+{
+	struct nf_esp_net *esp_net = esp_pernet(net);
+	struct _esp_table *esp_entry = NULL;
+
+	esp_entry = esp_net->esp_table[tspi - TEMP_SPI_START];
+	if (esp_entry) {
+		/* Remove from all the hash tables. Hlist utility can handle items
+		 * that aren't actually in the list, so just try removing from
+		 * each list
+		 */
+		DEBUGP("Removing entry %x (%p) from all tables",
+		       esp_entry->tspi, esp_entry);
+		hlist_del_init(&esp_entry->lnode);
+		hlist_del_init(&esp_entry->incmpl_rnode);
+		hlist_del_init(&esp_entry->rnode);
+		kfree(esp_entry);
+		esp_net->esp_table[tspi - TEMP_SPI_START] = NULL;
+	}
+}
+
+/* Allocate a free IPSEC table entry.
+ * NOTE: The ESP entry table must be locked prior to calling this function.
+ */
+struct _esp_table *alloc_esp_entry(struct net *net)
+{
+	struct nf_esp_net *net_esp = esp_pernet(net);
+	struct _esp_table **esp_table = net_esp->esp_table;
+	struct _esp_table *esp_entry = NULL;
+	int idx = 0;
+
+	/* Find the first unused slot */
+	for (; idx < ESP_MAX_PORTS; idx++) {
+		if (esp_table[idx])
+			continue;
+
+		esp_table[idx] = kmalloc(sizeof(*esp_entry), GFP_ATOMIC);
+		memset(esp_table[idx], 0, sizeof(struct _esp_table));
+		esp_table[idx]->tspi = idx + TEMP_SPI_START;
+
+		DEBUGP("   New esp_entry (%p) at idx %d tspi %u\n",
+		       esp_table[idx], idx, esp_table[idx]->tspi);
+
+		esp_table[idx]->allocation_time = jiffies;
+		esp_table[idx]->net = net;
+		esp_entry = esp_table[idx];
+		break;
+	}
+	return esp_entry;
+}
+
+static u32 calculate_hash(const u32 spi, const u32 src_ip,
+			  const u32 dst_ip)
+{
+	u32 hash;
+
+	/* Simple combination */
+	hash = spi + src_ip + dst_ip;
+	/* Reduce to an index to fit the table size */
+	hash %= HASH_TAB_SIZE;
+
+	DEBUGP("Generated hash %x from spi %x srcIP %x dstIP %x\n", hash, spi,
+	       src_ip, dst_ip);
+	return hash;
+}
+
+/*	Search for an ESP entry in the initial state based the IP address of the
+ *	remote peer.
+ *	NOTE: The ESP entry table must be locked prior to calling this function.
+ */
+static struct _esp_table *search_esp_entry_init_remote(struct nf_esp_net *net_esp,
+						       const u32 src_ip)
+{
+	struct _esp_table **esp_table = net_esp->esp_table;
+	struct _esp_table *esp_entry = NULL;
+	u32 hash = 0;
+	int first_entry = -1;
+
+	hash = calculate_hash(0, src_ip, 0);
+	hlist_for_each_entry(esp_entry, &net_esp->incmpl_rtable[hash],
+			     incmpl_rnode) {
+		DEBUGP("Checking against incmpl_rtable entry %x (%p) with l_spi %x r_spi %x r_ip %x\n",
+		       esp_entry->tspi, esp_entry, esp_entry->l_spi,
+		       esp_entry->r_spi, esp_entry->r_ip);
+		if (src_ip == esp_entry->r_ip && esp_entry->l_spi != 0 &&
+		    esp_entry->r_spi == 0) {
+			DEBUGP("Matches entry %x", esp_entry->tspi);
+			if (first_entry == -1) {
+				DEBUGP("First match\n");
+				first_entry = esp_entry->tspi - TEMP_SPI_START;
+			} else if (esp_table[first_entry]->allocation_time >
+				   esp_entry->allocation_time) {
+				/* This entry is older than the last one found so treat this
+				 * as a better match.
+				 */
+				DEBUGP("Older/better match\n");
+				first_entry = esp_entry->tspi - TEMP_SPI_START;
+			}
+		}
+	}
+
+	if (first_entry != -1) {
+		DEBUGP("returning esp entry\n");
+		esp_entry = esp_table[first_entry];
+		return esp_entry;
+	}
+
+	DEBUGP("No init entry found\n");
+	return NULL;
+}
+
+/*	Search for an ESP entry by SPI and source and destination IP addresses.
+ *	NOTE: The ESP entry table must be locked prior to calling this function.
+ */
+struct _esp_table *search_esp_entry_by_spi(struct net *net, const __u32 spi,
+					   const __u32 src_ip, const __u32 dst_ip)
+{
+	struct nf_esp_net *net_esp = esp_pernet(net);
+	struct _esp_table *esp_entry = NULL;
+	u32 hash = 0;
+
+	/* Check for matching established session or repeated initial LAN side */
+	/* LAN side first */
+	hash = calculate_hash(spi, src_ip, dst_ip);
+	hlist_for_each_entry(esp_entry, &net_esp->ltable[hash], lnode) {
+		DEBUGP
+		    ("Checking against ltable entry %x (%p) with l_spi %x l_ip %x r_ip %x\n",
+		     esp_entry->tspi, esp_entry, esp_entry->l_spi,
+		     esp_entry->l_ip, esp_entry->r_ip);
+		if (spi == esp_entry->l_spi && src_ip == esp_entry->l_ip &&
+		    dst_ip == esp_entry->r_ip) {
+			/* When r_spi is set this is an established session. When not set it's
+			 * a repeated initial packet from LAN side. But both cases are treated
+			 * the same.
+			 */
+			DEBUGP("Matches entry %x", esp_entry->tspi);
+			return esp_entry;
+		}
+	}
+
+	/* Established remote side */
+	hash = calculate_hash(spi, src_ip, 0);
+	hlist_for_each_entry(esp_entry, &net_esp->rtable[hash], rnode) {
+		DEBUGP
+		    ("Checking against rtable entry %x (%p) with l_spi %x r_spi %x r_ip %x\n",
+		     esp_entry->tspi, esp_entry, esp_entry->l_spi,
+		     esp_entry->r_spi, esp_entry->r_ip);
+		if (spi == esp_entry->r_spi && src_ip == esp_entry->r_ip &&
+		    esp_entry->l_spi != 0) {
+			DEBUGP("Matches entry %x", esp_entry->tspi);
+			return esp_entry;
+		}
+	}
+
+	/* Incomplete remote side */
+	esp_entry = search_esp_entry_init_remote(net_esp, src_ip);
+	if (esp_entry) {
+		esp_entry->r_spi = spi;
+		/* Remove entry from incmpl_rtable and add to rtable */
+		DEBUGP("Completing entry %x with remote SPI info",
+		       esp_entry->tspi);
+		hlist_del_init(&esp_entry->incmpl_rnode);
+		hash = calculate_hash(spi, src_ip, 0);
+		hlist_add_head(&esp_entry->rnode, &net_esp->rtable[hash]);
+		return esp_entry;
+	}
+
+	DEBUGP("No Entry\n");
+	return NULL;
+}
+
+/* invert esp part of tuple */
+bool nf_conntrack_invert_esp_tuple(struct nf_conntrack_tuple *tuple,
+				   const struct nf_conntrack_tuple *orig)
+{
+	tuple->dst.u.esp.spi = orig->dst.u.esp.spi;
+	tuple->src.u.esp.spi = orig->src.u.esp.spi;
+	return true;
+}
+
+/* esp hdr info to tuple */
+bool esp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
+		      struct net *net, struct nf_conntrack_tuple *tuple)
+{
+	struct nf_esp_net *net_esp = esp_pernet(net);
+	struct esphdr _esphdr, *esphdr;
+	struct _esp_table *esp_entry = NULL;
+	u32 spi = 0;
+
+	esphdr = skb_header_pointer(skb, dataoff, sizeof(_esphdr), &_esphdr);
+	if (!esphdr) {
+		/* try to behave like "nf_conntrack_proto_generic" */
+		tuple->src.u.all = 0;
+		tuple->dst.u.all = 0;
+		return true;
+	}
+	spi = ntohl(esphdr->spi);
+
+	DEBUGP("Enter pkt_to_tuple() with spi %x\n", spi);
+	/* check if esphdr has a new SPI:
+	 *   if no, update tuple with correct tspi;
+	 *   if yes, check if we have seen the source IP:
+	 *             if yes, update the ESP tables update the tuple with correct tspi
+	 *             if no, create a new entry
+	 */
+	write_lock_bh(&net_esp->esp_table_lock);
+	esp_entry = search_esp_entry_by_spi(net, spi, tuple->src.u3.ip,
+					    tuple->dst.u3.ip);
+	if (!esp_entry) {
+		u32 hash = 0;
+
+		esp_entry = alloc_esp_entry(net);
+		if (!esp_entry) {
+			DEBUGP("All entries in use\n");
+			write_unlock_bh(&net_esp->esp_table_lock);
+			return false;
+		}
+		esp_entry->l_spi = spi;
+		esp_entry->l_ip = tuple->src.u3.ip;
+		esp_entry->r_ip = tuple->dst.u3.ip;
+		/* Add entries to the hash tables */
+		hash = calculate_hash(spi, esp_entry->l_ip, esp_entry->r_ip);
+		hlist_add_head(&esp_entry->lnode, &net_esp->ltable[hash]);
+		hash = calculate_hash(0, 0, esp_entry->r_ip);
+		hlist_add_head(&esp_entry->incmpl_rnode,
+			       &net_esp->incmpl_rtable[hash]);
+	}
+
+	DEBUGP
+	    ("entry_info: tspi %u l_spi 0x%x r_spi 0x%x l_ip %x r_ip %x srcIP %x dstIP %x\n",
+	     esp_entry->tspi, esp_entry->l_spi, esp_entry->r_spi,
+	     esp_entry->l_ip, esp_entry->r_ip, tuple->src.u3.ip,
+	     tuple->dst.u3.ip);
+
+	tuple->dst.u.esp.spi = esp_entry->tspi;
+	tuple->src.u.esp.spi = esp_entry->tspi;
+	write_unlock_bh(&net_esp->esp_table_lock);
+	return true;
+}
+
+#ifdef CONFIG_NF_CONNTRACK_PROCFS
+/* print private data for conntrack */
+static void esp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
+{
+	seq_printf(s, "timeout=%u, stream_timeout=%u ",
+		   (ct->proto.esp.timeout / HZ),
+		   (ct->proto.esp.stream_timeout / HZ));
+}
+#endif
+
+/* Returns verdict for packet, and may modify conntrack */
+int nf_conntrack_esp_packet(struct nf_conn *ct, struct sk_buff *skb,
+			    unsigned int dataoff,
+			    enum ip_conntrack_info ctinfo,
+			    const struct nf_hook_state *state)
+{
+	unsigned int *timeouts = nf_ct_timeout_lookup(ct);
+#ifdef ESP_DEBUG
+	const struct iphdr *iph;
+	struct esphdr _esphdr, *esphdr;
+
+	iph = ip_hdr(skb);
+	esphdr = skb_header_pointer(skb, dataoff, sizeof(_esphdr), &_esphdr);
+	if (iph && esphdr) {
+		u32 spi;
+
+		spi = ntohl(esphdr->spi);
+		DEBUGP("(0x%x) %x <-> %x status %s info %d %s\n",
+		       spi, iph->saddr, iph->daddr,
+		       (ct->status & IPS_SEEN_REPLY) ? "SEEN" : "NOT_SEEN",
+		       ctinfo, (ctinfo == IP_CT_NEW) ? "CT_NEW" : "SEEN_REPLY");
+	}
+#endif /* ESP_DEBUG */
+
+	if (!timeouts)
+		timeouts = esp_pernet(nf_ct_net(ct))->esp_timeouts;
+
+	if (!nf_ct_is_confirmed(ct)) {
+		ct->proto.esp.stream_timeout = timeouts[ESP_CT_REPLIED];
+		ct->proto.esp.timeout = timeouts[ESP_CT_UNREPLIED];
+	}
+
+	/* If we've seen traffic both ways, this is some kind of ESP
+	 * stream.  Extend timeout.
+	 */
+	if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+		nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[ESP_CT_REPLIED]);
+		/* Also, more likely to be important, and not a probe */
+		if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
+			/* Was originally IPCT_STATUS but this is no longer an option.
+			 * GRE uses assured for same purpose
+			 */
+			nf_conntrack_event_cache(IPCT_ASSURED, ct);
+	} else {
+		nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[ESP_CT_UNREPLIED]);
+	}
+
+	return NF_ACCEPT;
+}
+
+/* Called when a conntrack entry has already been removed from the hashes
+ * and is about to be deleted from memory
+ */
+void destroy_esp_conntrack_entry(struct nf_conn *ct)
+{
+	struct nf_conntrack_tuple *tuple = NULL;
+	enum ip_conntrack_dir dir;
+	u16 tspi = 0;
+	struct net *net = nf_ct_net(ct);
+	struct nf_esp_net *net_esp = esp_pernet(net);
+
+	write_lock_bh(&net_esp->esp_table_lock);
+
+	/* Probably all the ESP entries referenced in this connection are the same,
+	 * but the free function handles repeated frees, so best to do them all.
+	 */
+	for (dir = IP_CT_DIR_ORIGINAL; dir < IP_CT_DIR_MAX; dir++) {
+		tuple = nf_ct_tuple(ct, dir);
+
+		tspi = tuple->src.u.esp.spi;
+		if (tspi >= TEMP_SPI_START && tspi <= TEMP_SPI_MAX) {
+			DEBUGP("Deleting src tspi %x (dir %i)\n", tspi, dir);
+			esp_table_free_entry_by_tspi(net, tspi);
+		}
+		tuple->src.u.esp.spi = 0;
+		tspi = tuple->dst.u.esp.spi;
+		if (tspi >= TEMP_SPI_START && tspi <= TEMP_SPI_MAX) {
+			DEBUGP("Deleting dst tspi %x (dir %i)\n", tspi, dir);
+			esp_table_free_entry_by_tspi(net, tspi);
+		}
+		tuple->dst.u.esp.spi = 0;
+	}
+
+	write_unlock_bh(&net_esp->esp_table_lock);
+}
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
+static int esp_tuple_to_nlattr(struct sk_buff *skb,
+			       const struct nf_conntrack_tuple *t)
+{
+	if (nla_put_be16(skb, CTA_PROTO_SRC_ESP_SPI, t->src.u.esp.spi) ||
+	    nla_put_be16(skb, CTA_PROTO_DST_ESP_SPI, t->dst.u.esp.spi))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static const struct nla_policy esp_nla_policy[CTA_PROTO_MAX + 1] = {
+	[CTA_PROTO_SRC_ESP_SPI] = { .type = NLA_U16 },
+	[CTA_PROTO_DST_ESP_SPI] = { .type = NLA_U16 },
+};
+
+static int esp_nlattr_to_tuple(struct nlattr *tb[],
+			       struct nf_conntrack_tuple *t,
+				   u32 flags)
+{
+	if (flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_ESP_SPI)) {
+		if (!tb[CTA_PROTO_SRC_ESP_SPI])
+			return -EINVAL;
+
+		t->src.u.esp.spi = nla_get_be16(tb[CTA_PROTO_SRC_ESP_SPI]);
+	}
+
+	if (flags & CTA_FILTER_FLAG(CTA_PROTO_DST_ESP_SPI)) {
+		if (!tb[CTA_PROTO_DST_ESP_SPI])
+			return -EINVAL;
+
+		t->dst.u.esp.spi = nla_get_be16(tb[CTA_PROTO_DST_ESP_SPI]);
+	}
+
+	return 0;
+}
+
+static unsigned int esp_nlattr_tuple_size(void)
+{
+	return nla_policy_len(esp_nla_policy, CTA_PROTO_MAX + 1);
+}
+#endif
+
+/* protocol helper struct */
+const struct nf_conntrack_l4proto nf_conntrack_l4proto_esp = {
+	.l4proto = IPPROTO_ESP,
+#ifdef CONFIG_NF_CONNTRACK_PROCFS
+	.print_conntrack = esp_print_conntrack,
+#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+	.tuple_to_nlattr = esp_tuple_to_nlattr,
+	.nlattr_tuple_size = esp_nlattr_tuple_size,
+	.nlattr_to_tuple = esp_nlattr_to_tuple,
+	.nla_policy = esp_nla_policy,
+#endif
+};
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index c6c0cb465664..e8cd28b4e602 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -88,6 +88,11 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
 			   ntohs(tuple->src.u.gre.key),
 			   ntohs(tuple->dst.u.gre.key));
 		break;
+	case IPPROTO_ESP:
+		seq_printf(s, "srcspi=0x%x dstspi=0x%x ",
+			   ntohs(tuple->src.u.esp.spi),
+			   ntohs(tuple->dst.u.esp.spi));
+		break;
 	default:
 		break;
 	}
diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h
index 832ae64179f0..26db7333c801 100644
--- a/net/netfilter/nf_internals.h
+++ b/net/netfilter/nf_internals.h
@@ -19,7 +19,9 @@
 #define CTA_FILTER_F_CTA_PROTO_ICMPV6_TYPE	(1 << 9)
 #define CTA_FILTER_F_CTA_PROTO_ICMPV6_CODE	(1 << 10)
 #define CTA_FILTER_F_CTA_PROTO_ICMPV6_ID	(1 << 11)
-#define CTA_FILTER_F_MAX			(1 << 12)
+#define CTA_FILTER_F_CTA_PROTO_SRC_ESP_SPI	(1 << 12)
+#define CTA_FILTER_F_CTA_PROTO_DST_ESP_SPI	(1 << 13)
+#define CTA_FILTER_F_MAX			(1 << 14)
 #define CTA_FILTER_F_ALL			(CTA_FILTER_F_MAX-1)
 #define CTA_FILTER_FLAG(ctattr) CTA_FILTER_F_ ## ctattr
 
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] netfilter: nf_conntrack: Add conntrack helper for ESP/IPsec
  2021-04-14  3:53 [PATCH] netfilter: nf_conntrack: Add conntrack helper for ESP/IPsec Cole Dishington
@ 2021-04-14 15:40 ` Florian Westphal
  2021-04-20 22:35   ` Cole Dishington
  2021-05-05 12:16   ` [PATCH] " Jan Engelhardt
  0 siblings, 2 replies; 10+ messages in thread
From: Florian Westphal @ 2021-04-14 15:40 UTC (permalink / raw)
  To: Cole Dishington
  Cc: pablo, kadlec, fw, davem, kuba, linux-kernel, netfilter-devel,
	coreteam, netdev

Cole Dishington <Cole.Dishington@alliedtelesis.co.nz> wrote:
> Introduce changes to add ESP connection tracking helper to netfilter
> conntrack. The connection tracking of ESP is based on IPsec SPIs. The
> underlying motivation for this patch was to allow multiple VPN ESP
> clients to be distinguished when using NAT.
>
> Added config flag CONFIG_NF_CT_PROTO_ESP to enable the ESP/IPsec
> conntrack helper.

Thanks for the effort to upstream out of tree code.

A couple of comments and questions below.

Preface: AFAIU this tracker aims to 'soft-splice' two independent
ESP connections, i.e.:

saddr:spi1 -> daddr
daddr:spi2 <- saddr

So that we basically get this conntrack:

saddr,daddr,spi1 (original)   daddr,saddr,spi2 (remote)

This can't be done as-is, because we don't know spi2 at the time the
first ESP packet is received.

The solution implemented here is introduction of a 'virtual esp id',
computed when first ESP packet is received, so conntrack really stores:

saddr,daddr,ID (original)   daddr,saddr,ID (remote)

Because the ID is never carried on the wire, this tracker hooks into
pkt_to_tuple() infra so that the conntrack tuple gets populated
as-needed.

If I got that right, I think it would be good to place some description
like this in the source code, this is unlike all the other trackers.

> index 000000000000..2441e031c68e
> --- /dev/null
> +++ b/include/linux/netfilter/nf_conntrack_proto_esp.h
> @@ -0,0 +1,25 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _CONNTRACK_PROTO_ESP_H
> +#define _CONNTRACK_PROTO_ESP_H
> +#include <asm/byteorder.h>
> +
> +/* ESP PROTOCOL HEADER */
> +
> +struct esphdr {
> +	__u32 spi;
> +};
> +
> +struct nf_ct_esp {
> +	unsigned int stream_timeout;
> +	unsigned int timeout;
> +};
> +
> +#ifdef __KERNEL__
> +#include <net/netfilter/nf_conntrack_tuple.h>
> +
> +void destroy_esp_conntrack_entry(struct nf_conn *ct);
> +
> +bool esp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
> +		      struct net *net, struct nf_conntrack_tuple *tuple);
> +#endif /* __KERNEL__ */

No need for the __KERNEL__, this header is not exposed to userspace
(only those in include/uapi/).

>  			struct {
>  				__be16 key;
>  			} gre;
> +			struct {
> +				__be16 spi;

__be32 ?

I now see that this "spi" seems to be allocated by the esp tracker.
Maybe 'esp_id' or something like that?

It doesn't appear to be related to the ESP header SPI value.

> --- a/include/net/netns/conntrack.h
> +++ b/include/net/netns/conntrack.h
> @@ -69,6 +69,27 @@ struct nf_gre_net {
>  };
>  #endif
>  
> +#ifdef CONFIG_NF_CT_PROTO_ESP
> +#define ESP_MAX_PORTS      1000
> +#define HASH_TAB_SIZE  ESP_MAX_PORTS

ESP? Ports?  Should this be 'slots'?  Maybe a comment helps, I don't
expect to see ports in an ESP tracker.

> +enum esp_conntrack {
> +	ESP_CT_UNREPLIED,
> +	ESP_CT_REPLIED,
> +	ESP_CT_MAX
> +};
> +
> +struct nf_esp_net {
> +	rwlock_t esp_table_lock;

This uses a rwlock but i only see writer locks being taken.
So this either should use a spinlock, or reader-parts should
take readlock, not wrlock.

(but also see below).

> +	struct hlist_head ltable[HASH_TAB_SIZE];
> +	struct hlist_head rtable[HASH_TAB_SIZE];
> +	/* Initial lookup for remote end until rspi is known */
> +	struct hlist_head incmpl_rtable[HASH_TAB_SIZE];
> +	struct _esp_table *esp_table[ESP_MAX_PORTS];
> +	unsigned int esp_timeouts[ESP_CT_MAX];
> +};

This is large structure -- >32kb.

Could this be moved to nf_conntrack_net?

It would also be good to not allocate these hash slots until after conntrack
is needed.

The esp_timeouts[] can be kept to avoid module dep problems.

(But also see below, I'm not sure homegrown hash table is the way to go).

>  struct ct_pcpu {
> diff --git a/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h b/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h
> index 64390fac6f7e..9bbd76c325d2 100644
> --- a/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h
> +++ b/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h
> @@ -39,6 +39,9 @@ union nf_conntrack_man_proto {
..
> +#if 0
> +#define ESP_DEBUG 1
> +#define DEBUGP(format, args...) printk(KERN_DEBUG "%s: " format, __func__, ## args)
> +#else
> +#undef ESP_DEBUG
> +#define DEBUGP(x, args...)
> +#endif

I suggest to get rid of all of DEBUGP(), either drop them, or, in cases
where they are useful, switch to pr_debug().

> +#define TEMP_SPI_START 1500
> +#define TEMP_SPI_MAX   (TEMP_SPI_START + ESP_MAX_PORTS - 1)

I think this could use an explanation.

> +struct _esp_table {
> +	/* Hash table nodes for each required lookup
> +	 * lnode: l_spi, l_ip, r_ip
> +	 * rnode: r_spi, r_ip
> +	 * incmpl_rnode: r_ip
> +	 */
> +	struct hlist_node lnode;
> +	struct hlist_node rnode;
> +	struct hlist_node incmpl_rnode;
> +
> +	u32 l_spi;
> +	u32 r_spi;
> +	u32 l_ip;
> +	u32 r_ip;

Hmm, ipv4 only.  Could this be changed to also support ipv6?

At least this should use 'union nf_inet_addr' or add a full
struct nf_conntrack_tuple to get src/dst/+SPIs in one entry.

> +	u16 tspi;

Whats the tspi? (Might be clear later after reading the entire
file, but for sure could use a comment).

> +	unsigned long allocation_time;

Perhaps use alloc_time_jiffies to make it clear what units are expected
here.  Or, better yet, use 'u32 alloc_time_jiffies' and nfct_time_stamp,
which is just 32bit jiffies (needs less space and is sufficient for
'older/more recent than' logic.

> +/* Allocate a free IPSEC table entry.
> + * NOTE: The ESP entry table must be locked prior to calling this function.
> + */
> +struct _esp_table *alloc_esp_entry(struct net *net)
> +{
> +	struct nf_esp_net *net_esp = esp_pernet(net);
> +	struct _esp_table **esp_table = net_esp->esp_table;
> +	struct _esp_table *esp_entry = NULL;
> +	int idx = 0;
> +
> +	/* Find the first unused slot */
> +	for (; idx < ESP_MAX_PORTS; idx++) {
> +		if (esp_table[idx])
> +			continue;
> +
> +		esp_table[idx] = kmalloc(sizeof(*esp_entry), GFP_ATOMIC);
> +		memset(esp_table[idx], 0, sizeof(struct _esp_table));

Missing NULL/ENOMEM check.

However, it would be nice to avoid this kmalloc completely, but so far I
can't find a way, since this gets inserted/retrieved from pkt_to_tuple, before
nf_conn is inserted/looked up.

1. spi * 2 (local, remote)
2. 3 node pointers for the tables (might be able to use only 2?)

Thats 56 bytes total on x86_64, and would fit into the ct->proto storage
(its a union, currently @ 60 bytes).

The addresses are already stored in the nf_conn entry itself.

Unfortunately I don't see a way to leverage this (we can't recurse/do
lookups into the conntrack table either).

> +static struct _esp_table *search_esp_entry_init_remote(struct nf_esp_net *net_esp,
> +						       const u32 src_ip)
> +{
> +	struct _esp_table **esp_table = net_esp->esp_table;
> +	struct _esp_table *esp_entry = NULL;
> +	u32 hash = 0;
> +	int first_entry = -1;
> +
> +	hash = calculate_hash(0, src_ip, 0);
> +	hlist_for_each_entry(esp_entry, &net_esp->incmpl_rtable[hash],
> +			     incmpl_rnode) {
> +		DEBUGP("Checking against incmpl_rtable entry %x (%p) with l_spi %x r_spi %x r_ip %x\n",
> +		       esp_entry->tspi, esp_entry, esp_entry->l_spi,
> +		       esp_entry->r_spi, esp_entry->r_ip);
> +		if (src_ip == esp_entry->r_ip && esp_entry->l_spi != 0 &&
> +		    esp_entry->r_spi == 0) {
> +			DEBUGP("Matches entry %x", esp_entry->tspi);
> +			if (first_entry == -1) {
> +				DEBUGP("First match\n");
> +				first_entry = esp_entry->tspi - TEMP_SPI_START;
> +			} else if (esp_table[first_entry]->allocation_time >
> +				   esp_entry->allocation_time) {

This needs time_after() etc. to avoid errors when jiffy counter wraps.
Alternatively, look at nf_ct_is_expired(), copy that and use "nfct_time_stamp"
instead of jiffies (its 32bit jiffies, even on 64bit to save space in
nf_conn struct).

> +struct _esp_table *search_esp_entry_by_spi(struct net *net, const __u32 spi,
> +					   const __u32 src_ip, const __u32 dst_ip)
> +{
> +	struct nf_esp_net *net_esp = esp_pernet(net);
> +	struct _esp_table *esp_entry = NULL;
> +	u32 hash = 0;
> +
> +	/* Check for matching established session or repeated initial LAN side */
> +	/* LAN side first */
> +	hash = calculate_hash(spi, src_ip, dst_ip);
> +	hlist_for_each_entry(esp_entry, &net_esp->ltable[hash], lnode) {
> +		DEBUGP
> +		    ("Checking against ltable entry %x (%p) with l_spi %x l_ip %x r_ip %x\n",
> +		     esp_entry->tspi, esp_entry, esp_entry->l_spi,
> +		     esp_entry->l_ip, esp_entry->r_ip);
> +		if (spi == esp_entry->l_spi && src_ip == esp_entry->l_ip &&
> +		    dst_ip == esp_entry->r_ip) {
> +			/* When r_spi is set this is an established session. When not set it's
> +			 * a repeated initial packet from LAN side. But both cases are treated
> +			 * the same.
> +			 */
> +			DEBUGP("Matches entry %x", esp_entry->tspi);
> +			return esp_entry;
> +		}
> +	}

The first lookup should normally find an entry, correct?

> +	/* Established remote side */
> +	hash = calculate_hash(spi, src_ip, 0);
> +	hlist_for_each_entry(esp_entry, &net_esp->rtable[hash], rnode) {
> +		DEBUGP
> +		    ("Checking against rtable entry %x (%p) with l_spi %x r_spi %x r_ip %x\n",
> +		     esp_entry->tspi, esp_entry, esp_entry->l_spi,
> +		     esp_entry->r_spi, esp_entry->r_ip);
> +		if (spi == esp_entry->r_spi && src_ip == esp_entry->r_ip &&
> +		    esp_entry->l_spi != 0) {
[..]

> +	/* Incomplete remote side */
> +	esp_entry = search_esp_entry_init_remote(net_esp, src_ip);
> +	if (esp_entry) {
> +		esp_entry->r_spi = spi;
> +		/* Remove entry from incmpl_rtable and add to rtable */
> +		DEBUGP("Completing entry %x with remote SPI info",
> +		       esp_entry->tspi);
> +		hlist_del_init(&esp_entry->incmpl_rnode);
> +		hash = calculate_hash(spi, src_ip, 0);
> +		hlist_add_head(&esp_entry->rnode, &net_esp->rtable[hash]);
> +		return esp_entry;
> +	}
> +
> +	DEBUGP("No Entry\n");
> +	return NULL;
> +}
> +
> +/* invert esp part of tuple */
> +bool nf_conntrack_invert_esp_tuple(struct nf_conntrack_tuple *tuple,
> +				   const struct nf_conntrack_tuple *orig)
> +{
> +	tuple->dst.u.esp.spi = orig->dst.u.esp.spi;
> +	tuple->src.u.esp.spi = orig->src.u.esp.spi;
> +	return true;
> +}
> +
> +/* esp hdr info to tuple */
> +bool esp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
> +		      struct net *net, struct nf_conntrack_tuple *tuple)
> +{
> +	struct nf_esp_net *net_esp = esp_pernet(net);
> +	struct esphdr _esphdr, *esphdr;
> +	struct _esp_table *esp_entry = NULL;
> +	u32 spi = 0;
> +
> +	esphdr = skb_header_pointer(skb, dataoff, sizeof(_esphdr), &_esphdr);
> +	if (!esphdr) {
> +		/* try to behave like "nf_conntrack_proto_generic" */
> +		tuple->src.u.all = 0;
> +		tuple->dst.u.all = 0;
> +		return true;
> +	}
> +	spi = ntohl(esphdr->spi);
> +
> +	DEBUGP("Enter pkt_to_tuple() with spi %x\n", spi);
> +	/* check if esphdr has a new SPI:
> +	 *   if no, update tuple with correct tspi;
> +	 *   if yes, check if we have seen the source IP:
> +	 *             if yes, update the ESP tables update the tuple with correct tspi
> +	 *             if no, create a new entry
> +	 */
> +	write_lock_bh(&net_esp->esp_table_lock);

So all CPUs serialize on this lock.  I'm concerned this will cause
performance regression, ATM ESP is handled by generic tracker; after
this is applied it will be handled here.

At the very least this should use read lock only and upgrade to wrlock
if needed only.

> +	esp_entry = search_esp_entry_by_spi(net, spi, tuple->src.u3.ip,
> +					    tuple->dst.u3.ip);
> +	if (!esp_entry) {
> +		u32 hash = 0;
> +
> +		esp_entry = alloc_esp_entry(net);
> +		if (!esp_entry) {
> +			DEBUGP("All entries in use\n");
> +			write_unlock_bh(&net_esp->esp_table_lock);
> +			return false;
> +		}
> +		esp_entry->l_spi = spi;
> +		esp_entry->l_ip = tuple->src.u3.ip;
> +		esp_entry->r_ip = tuple->dst.u3.ip;
> +		/* Add entries to the hash tables */
> +		hash = calculate_hash(spi, esp_entry->l_ip, esp_entry->r_ip);
> +		hlist_add_head(&esp_entry->lnode, &net_esp->ltable[hash]);
> +		hash = calculate_hash(0, 0, esp_entry->r_ip);
> +		hlist_add_head(&esp_entry->incmpl_rnode,
> +			       &net_esp->incmpl_rtable[hash]);
> +	}
> +
> +	DEBUGP
> +	    ("entry_info: tspi %u l_spi 0x%x r_spi 0x%x l_ip %x r_ip %x srcIP %x dstIP %x\n",
> +	     esp_entry->tspi, esp_entry->l_spi, esp_entry->r_spi,
> +	     esp_entry->l_ip, esp_entry->r_ip, tuple->src.u3.ip,
> +	     tuple->dst.u3.ip);
> +
> +	tuple->dst.u.esp.spi = esp_entry->tspi;
> +	tuple->src.u.esp.spi = esp_entry->tspi;
> +	write_unlock_bh(&net_esp->esp_table_lock);
> +	return true;
> +}
> +
> +#ifdef CONFIG_NF_CONNTRACK_PROCFS
> +/* print private data for conntrack */
> +static void esp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
> +{
> +	seq_printf(s, "timeout=%u, stream_timeout=%u ",
> +		   (ct->proto.esp.timeout / HZ),
> +		   (ct->proto.esp.stream_timeout / HZ));

What is ct->proto.esp.{timeout,stream_timeout} for?
I don't see where/how its used.

> +/* Returns verdict for packet, and may modify conntrack */
> +int nf_conntrack_esp_packet(struct nf_conn *ct, struct sk_buff *skb,
> +			    unsigned int dataoff,
> +			    enum ip_conntrack_info ctinfo,
> +			    const struct nf_hook_state *state)
> +{
> +	unsigned int *timeouts = nf_ct_timeout_lookup(ct);
> +#ifdef ESP_DEBUG
> +	const struct iphdr *iph;
> +	struct esphdr _esphdr, *esphdr;
> +
> +	iph = ip_hdr(skb);

This will treat ipv6 as ipv4 header, no?

> +	if (!timeouts)
> +		timeouts = esp_pernet(nf_ct_net(ct))->esp_timeouts;
> +
> +	if (!nf_ct_is_confirmed(ct)) {
> +		ct->proto.esp.stream_timeout = timeouts[ESP_CT_REPLIED];
> +		ct->proto.esp.timeout = timeouts[ESP_CT_UNREPLIED];

So first packet inits these, but apart from esp_print_conntrack() i see
no readers.

> +/* Called when a conntrack entry has already been removed from the hashes
> + * and is about to be deleted from memory
> + */
> +void destroy_esp_conntrack_entry(struct nf_conn *ct)
> +{
> +	struct nf_conntrack_tuple *tuple = NULL;
> +	enum ip_conntrack_dir dir;
> +	u16 tspi = 0;
> +	struct net *net = nf_ct_net(ct);
> +	struct nf_esp_net *net_esp = esp_pernet(net);
> +
> +	write_lock_bh(&net_esp->esp_table_lock);
> +
> +	/* Probably all the ESP entries referenced in this connection are the same,
> +	 * but the free function handles repeated frees, so best to do them all.
> +	 */
> +	for (dir = IP_CT_DIR_ORIGINAL; dir < IP_CT_DIR_MAX; dir++) {
> +		tuple = nf_ct_tuple(ct, dir);
> +
> +		tspi = tuple->src.u.esp.spi;
> +		if (tspi >= TEMP_SPI_START && tspi <= TEMP_SPI_MAX) {
> +			DEBUGP("Deleting src tspi %x (dir %i)\n", tspi, dir);
> +			esp_table_free_entry_by_tspi(net, tspi);
> +		}
> +		tuple->src.u.esp.spi = 0;
> +		tspi = tuple->dst.u.esp.spi;
> +		if (tspi >= TEMP_SPI_START && tspi <= TEMP_SPI_MAX) {
> +			DEBUGP("Deleting dst tspi %x (dir %i)\n", tspi, dir);
> +			esp_table_free_entry_by_tspi(net, tspi);
> +		}
> +		tuple->dst.u.esp.spi = 0;
> +	}
> +

Questions:
Could this use rhashtable(s) instead of the homegrown table?

This would allow:
 1. use of shared rhashtables for the namespaces, since netns could be
 part of key. That in turn avoids the need to allocate memory for each
 different netns.
 2. automatically provides rcu-guarded read access & parallel inserts.

Could this tracker store the current local and remove SPI as seen on
wire?  It could be stored in the proto.esp stash so its not part of the
hash key.

That would allow to export/print it to userspace, so conntrack tool or
/proc could show the real SPI used by local and remote.

3. How hard is it to add ipv6 support?

If its out-of-scope it could be restricted to NFPROTO_IPV4 and have
generic-tracker behaviour for ipv6.

I might have more questions later, I need to spend a bit more time on
this.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH] netfilter: nf_conntrack: Add conntrack helper for ESP/IPsec
  2021-04-14 15:40 ` Florian Westphal
@ 2021-04-20 22:35   ` Cole Dishington
  2021-04-26 11:54     ` Florian Westphal
  2021-05-05 12:16   ` [PATCH] " Jan Engelhardt
  1 sibling, 1 reply; 10+ messages in thread
From: Cole Dishington @ 2021-04-20 22:35 UTC (permalink / raw)
  To: fw
  Cc: pablo, kadlec, davem, kuba, Cole.Dishington, linux-kernel,
	netfilter-devel, coreteam, netdev

Introduce changes to add ESP connection tracking helper to netfilter
conntrack. The connection tracking of ESP is based on IPsec SPIs. The
underlying motivation for this patch was to allow multiple VPN ESP
clients to be distinguished when using NAT.

Added config flag CONFIG_NF_CT_PROTO_ESP to enable the ESP/IPsec
conntrack helper.

Signed-off-by: Cole Dishington <Cole.Dishington@alliedtelesis.co.nz>
---

Notes:
    changes in v2:
    - Move from homegrown hashtables to rhashtables and rhltable.
    - Add net_hash_mix to hashtable key to share hashtables over netns.
    - Move the _esp_table and hashtables from per net nf_esp_net structure to
      static within nf_conntrack_proto_esp.c.
    - Move from rwlock_t for _esp_table to spinlock, as no read locks were taken.
    - Add IPv6 support.
    - Print the local and remote SPIs (as seen on the wire) for proc in esp_print_conntrack().
    - Removed ct->proto.esp.{timeout,stream_timeout} as it was only used by esp_print_conntrack().
      It looks like it may have been copied from gre but gre's is used by pptp.
    - Use 32-bit jiffies and fix counter wrap in search_esp_entry_init_remote().
    - Add NULL check on alloc_esp_entry() kmalloc().
    - Replace custom DEBUGP macro with pr_debug().
    - Rename spi on tuple and tspi to esp.id and esp_id, respectively.
    - Remove __KERNEL__ ifdef from header as it is not in include/uapi/

 .../linux/netfilter/nf_conntrack_proto_esp.h  |  21 +
 .../net/netfilter/ipv4/nf_conntrack_ipv4.h    |   3 +
 include/net/netfilter/nf_conntrack.h          |   2 +
 include/net/netfilter/nf_conntrack_l4proto.h  |  16 +
 include/net/netfilter/nf_conntrack_tuple.h    |   3 +
 include/net/netns/conntrack.h                 |  15 +
 .../netfilter/nf_conntrack_tuple_common.h     |   3 +
 .../linux/netfilter/nfnetlink_conntrack.h     |   2 +
 net/netfilter/Kconfig                         |  10 +
 net/netfilter/Makefile                        |   1 +
 net/netfilter/nf_conntrack_core.c             |  23 +
 net/netfilter/nf_conntrack_netlink.c          |   4 +-
 net/netfilter/nf_conntrack_proto.c            |  12 +
 net/netfilter/nf_conntrack_proto_esp.c        | 736 ++++++++++++++++++
 net/netfilter/nf_conntrack_standalone.c       |   8 +
 net/netfilter/nf_internals.h                  |   4 +-
 16 files changed, 861 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/netfilter/nf_conntrack_proto_esp.h
 create mode 100644 net/netfilter/nf_conntrack_proto_esp.c

diff --git a/include/linux/netfilter/nf_conntrack_proto_esp.h b/include/linux/netfilter/nf_conntrack_proto_esp.h
new file mode 100644
index 000000000000..2e8aa99c5fcc
--- /dev/null
+++ b/include/linux/netfilter/nf_conntrack_proto_esp.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _CONNTRACK_PROTO_ESP_H
+#define _CONNTRACK_PROTO_ESP_H
+#include <asm/byteorder.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+
+/* ESP PROTOCOL HEADER */
+
+struct esphdr {
+	__u32 spi;
+};
+
+struct nf_ct_esp {
+	__u32 l_spi, r_spi;
+};
+
+void destroy_esp_conntrack_entry(struct nf_conn *ct);
+
+bool esp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
+		      struct net *net, struct nf_conntrack_tuple *tuple);
+#endif /* _CONNTRACK_PROTO_ESP_H */
diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
index 2c8c2b023848..1aee91592639 100644
--- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
+++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
@@ -25,5 +25,8 @@ extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite;
 #ifdef CONFIG_NF_CT_PROTO_GRE
 extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre;
 #endif
+#ifdef CONFIG_NF_CT_PROTO_ESP
+extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_esp;
+#endif
 
 #endif /*_NF_CONNTRACK_IPV4_H*/
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 439379ca9ffa..2bd1d94de138 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -21,6 +21,7 @@
 #include <linux/netfilter/nf_conntrack_dccp.h>
 #include <linux/netfilter/nf_conntrack_sctp.h>
 #include <linux/netfilter/nf_conntrack_proto_gre.h>
+#include <linux/netfilter/nf_conntrack_proto_esp.h>
 
 #include <net/netfilter/nf_conntrack_tuple.h>
 
@@ -36,6 +37,7 @@ union nf_conntrack_proto {
 	struct ip_ct_tcp tcp;
 	struct nf_ct_udp udp;
 	struct nf_ct_gre gre;
+	struct nf_ct_esp esp;
 	unsigned int tmpl_padto;
 };
 
diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
index 96f9cf81f46b..f700de0b9059 100644
--- a/include/net/netfilter/nf_conntrack_l4proto.h
+++ b/include/net/netfilter/nf_conntrack_l4proto.h
@@ -75,6 +75,8 @@ bool nf_conntrack_invert_icmp_tuple(struct nf_conntrack_tuple *tuple,
 				    const struct nf_conntrack_tuple *orig);
 bool nf_conntrack_invert_icmpv6_tuple(struct nf_conntrack_tuple *tuple,
 				      const struct nf_conntrack_tuple *orig);
+bool nf_conntrack_invert_esp_tuple(struct nf_conntrack_tuple *tuple,
+				   const struct nf_conntrack_tuple *orig);
 
 int nf_conntrack_inet_error(struct nf_conn *tmpl, struct sk_buff *skb,
 			    unsigned int dataoff,
@@ -132,6 +134,11 @@ int nf_conntrack_gre_packet(struct nf_conn *ct,
 			    unsigned int dataoff,
 			    enum ip_conntrack_info ctinfo,
 			    const struct nf_hook_state *state);
+int nf_conntrack_esp_packet(struct nf_conn *ct,
+			    struct sk_buff *skb,
+			    unsigned int dataoff,
+			    enum ip_conntrack_info ctinfo,
+			    const struct nf_hook_state *state);
 
 void nf_conntrack_generic_init_net(struct net *net);
 void nf_conntrack_tcp_init_net(struct net *net);
@@ -141,6 +148,8 @@ void nf_conntrack_dccp_init_net(struct net *net);
 void nf_conntrack_sctp_init_net(struct net *net);
 void nf_conntrack_icmp_init_net(struct net *net);
 void nf_conntrack_icmpv6_init_net(struct net *net);
+int nf_conntrack_esp_init(void);
+void nf_conntrack_esp_init_net(struct net *net);
 
 /* Existing built-in generic protocol */
 extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic;
@@ -240,4 +249,11 @@ static inline struct nf_gre_net *nf_gre_pernet(struct net *net)
 }
 #endif
 
+#ifdef CONFIG_NF_CT_PROTO_ESP
+static inline struct nf_esp_net *nf_esp_pernet(struct net *net)
+{
+	return &net->ct.nf_ct_proto.esp;
+}
+#endif
+
 #endif /*_NF_CONNTRACK_PROTOCOL_H*/
diff --git a/include/net/netfilter/nf_conntrack_tuple.h b/include/net/netfilter/nf_conntrack_tuple.h
index 9334371c94e2..60279ffabe36 100644
--- a/include/net/netfilter/nf_conntrack_tuple.h
+++ b/include/net/netfilter/nf_conntrack_tuple.h
@@ -62,6 +62,9 @@ struct nf_conntrack_tuple {
 			struct {
 				__be16 key;
 			} gre;
+			struct {
+				__be16 id;
+			} esp;
 		} u;
 
 		/* The protocol. */
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index 806454e767bf..29f7e779265a 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -69,6 +69,18 @@ struct nf_gre_net {
 };
 #endif
 
+#ifdef CONFIG_NF_CT_PROTO_ESP
+enum esp_conntrack {
+	ESP_CT_UNREPLIED,
+	ESP_CT_REPLIED,
+	ESP_CT_MAX
+};
+
+struct nf_esp_net {
+	unsigned int esp_timeouts[ESP_CT_MAX];
+};
+#endif
+
 struct nf_ip_net {
 	struct nf_generic_net   generic;
 	struct nf_tcp_net	tcp;
@@ -84,6 +96,9 @@ struct nf_ip_net {
 #ifdef CONFIG_NF_CT_PROTO_GRE
 	struct nf_gre_net	gre;
 #endif
+#ifdef CONFIG_NF_CT_PROTO_ESP
+	struct nf_esp_net	esp;
+#endif
 };
 
 struct ct_pcpu {
diff --git a/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h b/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h
index 64390fac6f7e..78600cb4bfff 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h
@@ -39,6 +39,9 @@ union nf_conntrack_man_proto {
 	struct {
 		__be16 key;	/* GRE key is 32bit, PPtP only uses 16bit */
 	} gre;
+	struct {
+		__be16 id;
+	} esp;
 };
 
 #define CTINFO2DIR(ctinfo) ((ctinfo) >= IP_CT_IS_REPLY ? IP_CT_DIR_REPLY : IP_CT_DIR_ORIGINAL)
diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h
index d8484be72fdc..744d8931adeb 100644
--- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h
+++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h
@@ -90,6 +90,8 @@ enum ctattr_l4proto {
 	CTA_PROTO_ICMPV6_ID,
 	CTA_PROTO_ICMPV6_TYPE,
 	CTA_PROTO_ICMPV6_CODE,
+	CTA_PROTO_SRC_ESP_ID,
+	CTA_PROTO_DST_ESP_ID,
 	__CTA_PROTO_MAX
 };
 #define CTA_PROTO_MAX (__CTA_PROTO_MAX - 1)
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 1a92063c73a4..7269312d322e 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -199,6 +199,16 @@ config NF_CT_PROTO_UDPLITE
 
 	  If unsure, say Y.
 
+config NF_CT_PROTO_ESP
+	bool "ESP protocol support"
+	depends on NETFILTER_ADVANCED
+	help
+	  ESP connection tracking helper. Provides connection tracking for IPsec
+	  clients behind this device based on SPI, especially useful for
+	  distinguishing multiple clients when using NAT.
+
+	  If unsure, say N.
+
 config NF_CONNTRACK_AMANDA
 	tristate "Amanda backup protocol support"
 	depends on NETFILTER_ADVANCED
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 33da7bf1b68e..0942f2c48ddb 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -14,6 +14,7 @@ nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o
 nf_conntrack-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o
 nf_conntrack-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o
 nf_conntrack-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o
+nf_conntrack-$(CONFIG_NF_CT_PROTO_ESP) += nf_conntrack_proto_esp.o
 
 obj-$(CONFIG_NETFILTER) = netfilter.o
 
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index ff0168736f6e..3bef361d19ce 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -295,6 +295,10 @@ nf_ct_get_tuple(const struct sk_buff *skb,
 #ifdef CONFIG_NF_CT_PROTO_GRE
 	case IPPROTO_GRE:
 		return gre_pkt_to_tuple(skb, dataoff, net, tuple);
+#endif
+#ifdef CONFIG_NF_CT_PROTO_ESP
+	case IPPROTO_ESP:
+		return esp_pkt_to_tuple(skb, dataoff, net, tuple);
 #endif
 	case IPPROTO_TCP:
 	case IPPROTO_UDP: /* fallthrough */
@@ -439,6 +443,10 @@ nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
 #if IS_ENABLED(CONFIG_IPV6)
 	case IPPROTO_ICMPV6:
 		return nf_conntrack_invert_icmpv6_tuple(inverse, orig);
+#endif
+#ifdef CONFIG_NF_CT_PROTO_ESP
+	case IPPROTO_ESP:
+		return nf_conntrack_invert_esp_tuple(inverse, orig);
 #endif
 	}
 
@@ -593,6 +601,13 @@ static void destroy_gre_conntrack(struct nf_conn *ct)
 #endif
 }
 
+static void destroy_esp_conntrack(struct nf_conn *ct)
+{
+#ifdef CONFIG_NF_CT_PROTO_ESP
+	destroy_esp_conntrack_entry(ct);
+#endif
+}
+
 static void
 destroy_conntrack(struct nf_conntrack *nfct)
 {
@@ -609,6 +624,9 @@ destroy_conntrack(struct nf_conntrack *nfct)
 	if (unlikely(nf_ct_protonum(ct) == IPPROTO_GRE))
 		destroy_gre_conntrack(ct);
 
+	if (unlikely(nf_ct_protonum(ct) == IPPROTO_ESP))
+		destroy_esp_conntrack(ct);
+
 	local_bh_disable();
 	/* Expectations will have been removed in clean_from_lists,
 	 * except TFTP can create an expectation on the first packet,
@@ -1783,6 +1801,11 @@ static int nf_conntrack_handle_packet(struct nf_conn *ct,
 	case IPPROTO_GRE:
 		return nf_conntrack_gre_packet(ct, skb, dataoff,
 					       ctinfo, state);
+#endif
+#ifdef CONFIG_NF_CT_PROTO_ESP
+	case IPPROTO_ESP:
+		return nf_conntrack_esp_packet(ct, skb, dataoff,
+					       ctinfo, state);
 #endif
 	}
 
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 1d519b0e51a5..8df33dbbf5a3 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1382,7 +1382,9 @@ static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = {
    CTA_FILTER_F_CTA_PROTO_ICMP_ID | \
    CTA_FILTER_F_CTA_PROTO_ICMPV6_TYPE | \
    CTA_FILTER_F_CTA_PROTO_ICMPV6_CODE | \
-   CTA_FILTER_F_CTA_PROTO_ICMPV6_ID)
+   CTA_FILTER_F_CTA_PROTO_ICMPV6_ID | \
+   CTA_FILTER_F_CTA_PROTO_SRC_ESP_ID | \
+   CTA_FILTER_F_CTA_PROTO_DST_ESP_ID)
 
 static int
 ctnetlink_parse_tuple_filter(const struct nlattr * const cda[],
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 47e9319d2cf3..abba94f782c1 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -112,6 +112,9 @@ const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto)
 #ifdef CONFIG_NF_CT_PROTO_GRE
 	case IPPROTO_GRE: return &nf_conntrack_l4proto_gre;
 #endif
+#ifdef CONFIG_NF_CT_PROTO_ESP
+	case IPPROTO_ESP: return &nf_conntrack_l4proto_esp;
+#endif
 #if IS_ENABLED(CONFIG_IPV6)
 	case IPPROTO_ICMPV6: return &nf_conntrack_l4proto_icmpv6;
 #endif /* CONFIG_IPV6 */
@@ -656,6 +659,12 @@ int nf_conntrack_proto_init(void)
 		goto cleanup_sockopt;
 #endif
 
+#ifdef CONFIG_NF_CT_PROTO_ESP
+	ret = nf_conntrack_esp_init();
+	if (ret < 0)
+		goto cleanup_sockopt;
+#endif
+
 	return ret;
 
 #if IS_ENABLED(CONFIG_IPV6)
@@ -691,6 +700,9 @@ void nf_conntrack_proto_pernet_init(struct net *net)
 #ifdef CONFIG_NF_CT_PROTO_GRE
 	nf_conntrack_gre_init_net(net);
 #endif
+#ifdef CONFIG_NF_CT_PROTO_ESP
+	nf_conntrack_esp_init_net(net);
+#endif
 }
 
 void nf_conntrack_proto_pernet_fini(struct net *net)
diff --git a/net/netfilter/nf_conntrack_proto_esp.c b/net/netfilter/nf_conntrack_proto_esp.c
new file mode 100644
index 000000000000..f17ce8a9439f
--- /dev/null
+++ b/net/netfilter/nf_conntrack_proto_esp.c
@@ -0,0 +1,736 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * <:copyright-gpl
+ * Copyright 2008 Broadcom Corp. All Rights Reserved.
+ * Copyright (C) 2021 Allied Telesis Labs NZ
+ *
+ * This program is free software; you can distribute it and/or modify it
+ * under the terms of the GNU General Public License (Version 2) as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program.
+ * :>
+ */
+/******************************************************************************
+ * Filename:       nf_conntrack_proto_esp.c
+ * Author:         Pavan Kumar
+ * Creation Date:  05/27/04
+ *
+ * Description:
+ * Implements the ESP ALG connectiontracking via soft-splicing two ESP
+ * connections together using each connections SPI. The connection is
+ * then identified with a generated esp id.
+ *
+ * Migrated to kernel 2.6.21.5 on April 16, 2008 by Dan-Han Tsai.
+ * Migrated to kernel 5.11.0-rc2+ on March 3, 2021 by Allied Telesis Labs NZ (Cole Dishington).
+ *
+ * Updates to ESP conntracking on October,2010,by Manamohan,Lantiq Deutschland GmbH:
+ *	- Added the support for sessions with two or more different remote servers
+ *    from single or multiple lan clients with same lan and remote SPI Ids
+ *	- Support for associating the multiple LAN side sessions waiting
+ *    for the reply from same remote server with the one which is created first
+ * Updates to ESP conntracking on August,2015,by Allied Telesis Labs NZ:
+ *	- Improve ESP entry lookup performance by adding hashtable. (Anthony Lineham)
+ *	- Add locking around ESP connection table. (Anthony Lineham)
+ *	- Fixups including adding destroy function, endian-safe SPIs and IPs,
+ *	  replace prinks with DEBUGPs. (Anthony Lineham)
+ *	- Extend ESP connection tracking to allow conntrack ESP entry matching
+ *	  of tuple values. (Matt Bennett)
+ * Updates to ESP conntracking on March 3,2021 by Allied Telesis Labs NZ (Cole Dishington):
+ *	- Migrate from homegrown hashtables per netns to rhashtable and
+ *	  rhltables shared between all netns.
+ *	- Move esp_table and hashtables to be shared over netns, rather than one per netns.
+ *	- Added IPv6 support.
+ *	- Added fixups for upstream including comments, DEBUGP -> pr_debug,
+ *	  64b -> 32 bit jiffies, and l_spi/r_spi exposed to procfs.
+ ****************************************************************************/
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/list.h>
+#include <linux/seq_file.h>
+#include <linux/in.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/dst.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
+#include <linux/netfilter/nf_conntrack_proto_esp.h>
+#include <net/netns/hash.h>
+#include <linux/rhashtable.h>
+#include <net/ipv6.h>
+
+#include "nf_internals.h"
+
+#define ESP_MAX_CONNECTIONS      1000
+#define HASH_TAB_MAX_SIZE  ESP_MAX_CONNECTIONS
+/* esp_id of 0 is left for unassigned values */
+#define TEMP_SPI_START 1
+#define TEMP_SPI_MAX   (TEMP_SPI_START + ESP_MAX_CONNECTIONS - 1)
+
+struct _esp_table {
+       /* Hash table nodes for each required lookup
+	* lnode: net->hash_mix, l_spi, l_ip, r_ip
+	* rnode: net->hash_mix, r_spi, r_ip
+	* incmpl_rlist: net->hash_mix, r_ip
+	*/
+	struct rhash_head lnode;
+	struct rhash_head rnode;
+	struct rhlist_head incmpl_rlist;
+
+	u16 esp_id;
+
+	u32 l_spi;
+	u32 r_spi;
+
+	u16 l3num;
+	union nf_inet_addr l_ip;
+	union nf_inet_addr r_ip;
+
+	u32 alloc_time_jiffies;
+	struct net *net;
+};
+
+struct _esp_hkey {
+	u16 l3num;
+	union nf_inet_addr src_ip;
+	union nf_inet_addr dst_ip;
+	u32 net_hmix;
+	u32 spi;
+};
+
+static DEFINE_SPINLOCK(esp_table_lock);
+static struct _esp_table *esp_table[ESP_MAX_CONNECTIONS];
+static struct rhashtable ltable;
+static struct rhashtable rtable;
+static struct rhltable incmpl_rtable;
+static unsigned int esp_timeouts[ESP_CT_MAX] = {
+	[ESP_CT_UNREPLIED] = 60 * HZ,
+	[ESP_CT_REPLIED] = 3600 * HZ,
+};
+
+static inline void esp_ip_addr_set_any(int af, union nf_inet_addr *a)
+{
+	if (af == AF_INET6)
+		ipv6_addr_set(&a->in6, 0, 0, 0, 0);
+	else
+		a->ip = 0;
+}
+
+static inline void esp_ip_addr_copy(int af, union nf_inet_addr *dst,
+				    const union nf_inet_addr *src)
+{
+	if (af == AF_INET6)
+		ipv6_addr_prefix_copy(&dst->in6, &src->in6, 128);
+	else
+		dst->ip = src->ip;
+}
+
+static inline int esp_ip_addr_equal(int af, const union nf_inet_addr *a,
+				    const union nf_inet_addr *b)
+{
+	if (af == AF_INET6)
+		return ipv6_addr_equal(&a->in6, &b->in6);
+	return a->ip == b->ip;
+}
+
+static inline struct nf_esp_net *esp_pernet(struct net *net)
+{
+	return &net->ct.nf_ct_proto.esp;
+}
+
+static inline void calculate_key(const u32 net_hmix, const u32 spi,
+				 const u16 l3num,
+				 const union nf_inet_addr *src_ip,
+				 const union nf_inet_addr *dst_ip,
+				 struct _esp_hkey *key)
+{
+	key->net_hmix = net_hmix;
+	key->spi = spi;
+	key->l3num = l3num;
+	esp_ip_addr_copy(l3num, &key->src_ip, src_ip);
+	esp_ip_addr_copy(l3num, &key->dst_ip, dst_ip);
+}
+
+static inline u32 calculate_hash(const void *data, u32 len, u32 seed)
+{
+	return jhash(data, len, seed);
+}
+
+static int ltable_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
+{
+	struct _esp_hkey obj_key = {};
+	const struct _esp_hkey *key = (const struct _esp_hkey *)arg->key;
+	const struct _esp_table *eobj = (const struct _esp_table *)obj;
+	u32 net_hmix = net_hash_mix(eobj->net);
+
+	calculate_key(net_hmix, eobj->l_spi, eobj->l3num, &eobj->l_ip,
+		      &eobj->r_ip, &obj_key);
+	return memcmp(key, &obj_key, sizeof(struct _esp_hkey));
+}
+
+static int rtable_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
+{
+	union nf_inet_addr any;
+	struct _esp_hkey obj_key = {};
+	const struct _esp_hkey *key = (const struct _esp_hkey *)arg->key;
+	const struct _esp_table *eobj = (const struct _esp_table *)obj;
+	u32 net_hmix = net_hash_mix(eobj->net);
+
+	esp_ip_addr_set_any(eobj->l3num, &any);
+	calculate_key(net_hmix, eobj->r_spi, eobj->l3num, &any, &eobj->r_ip,
+		      &obj_key);
+	return memcmp(key, &obj_key, sizeof(struct _esp_hkey));
+}
+
+static int incmpl_table_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
+{
+	union nf_inet_addr any;
+	struct _esp_hkey obj_key = {};
+	const struct _esp_hkey *key = (const struct _esp_hkey *)arg->key;
+	const struct _esp_table *eobj = (const struct _esp_table *)obj;
+	u32 net_hmix = net_hash_mix(eobj->net);
+
+	esp_ip_addr_set_any(eobj->l3num, &any);
+	calculate_key(net_hmix, 0, eobj->l3num, &any, &eobj->r_ip, &obj_key);
+	return memcmp(key, &obj_key, sizeof(struct _esp_hkey));
+}
+
+static u32 ltable_obj_hashfn(const void *data, u32 len, u32 seed)
+{
+	struct _esp_hkey key = {};
+	const struct _esp_table *eobj = (const struct _esp_table *)data;
+	u32 net_hmix = net_hash_mix(eobj->net);
+
+	calculate_key(net_hmix, eobj->l_spi, eobj->l3num, &eobj->l_ip,
+		      &eobj->r_ip, &key);
+	return calculate_hash(&key, len, seed);
+}
+
+static u32 rtable_obj_hashfn(const void *data, u32 len, u32 seed)
+{
+	union nf_inet_addr any;
+	struct _esp_hkey key = {};
+	const struct _esp_table *eobj = (const struct _esp_table *)data;
+	u32 net_hmix = net_hash_mix(eobj->net);
+
+	esp_ip_addr_set_any(eobj->l3num, &any);
+	calculate_key(net_hmix, eobj->r_spi, eobj->l3num, &any, &eobj->r_ip, &key);
+	return calculate_hash(&key, len, seed);
+}
+
+static u32 incmpl_table_obj_hashfn(const void *data, u32 len, u32 seed)
+{
+	union nf_inet_addr any;
+	struct _esp_hkey key = {};
+	const struct _esp_table *eobj = (const struct _esp_table *)data;
+	u32 net_hmix = net_hash_mix(eobj->net);
+
+	esp_ip_addr_set_any(eobj->l3num, &any);
+	calculate_key(net_hmix, 0, eobj->l3num, &any, &eobj->r_ip, &key);
+	return calculate_hash(&key, len, seed);
+}
+
+static const struct rhashtable_params ltable_params = {
+	.key_len     = sizeof(struct _esp_hkey),
+	.head_offset = offsetof(struct _esp_table, lnode),
+	.max_size = HASH_TAB_MAX_SIZE,
+	.hashfn      = calculate_hash,
+	.obj_hashfn = ltable_obj_hashfn,
+	.obj_cmpfn   = ltable_obj_cmpfn,
+};
+
+static const struct rhashtable_params rtable_params = {
+	.key_len     = sizeof(struct _esp_hkey),
+	.head_offset = offsetof(struct _esp_table, rnode),
+	.max_size = HASH_TAB_MAX_SIZE,
+	.hashfn      = calculate_hash,
+	.obj_hashfn = rtable_obj_hashfn,
+	.obj_cmpfn   = rtable_obj_cmpfn,
+};
+
+static const struct rhashtable_params incmpl_rtable_params = {
+	.key_len     = sizeof(struct _esp_hkey),
+	.head_offset = offsetof(struct _esp_table, incmpl_rlist),
+	.max_size = HASH_TAB_MAX_SIZE,
+	.hashfn      = calculate_hash,
+	.obj_hashfn = incmpl_table_obj_hashfn,
+	.obj_cmpfn   = incmpl_table_obj_cmpfn,
+};
+
+int nf_conntrack_esp_init(void)
+{
+	int i;
+	int ret = 0;
+
+	spin_lock_bh(&esp_table_lock);
+
+	for (i = 0; i < ESP_MAX_CONNECTIONS; i++)
+		memset(&esp_table[i], 0, sizeof(struct _esp_table *));
+
+	ret = rhashtable_init(&ltable, &ltable_params);
+	if (ret)
+		return ret;
+
+	ret = rhashtable_init(&rtable, &rtable_params);
+	if (ret)
+		goto err_free_ltable;
+
+	ret = rhltable_init(&incmpl_rtable, &incmpl_rtable_params);
+	if (ret)
+		goto err_free_rtable;
+
+	spin_unlock_bh(&esp_table_lock);
+
+	return ret;
+
+err_free_rtable:
+	rhashtable_destroy(&rtable);
+err_free_ltable:
+	rhashtable_destroy(&ltable);
+
+	spin_unlock_bh(&esp_table_lock);
+	return ret;
+}
+
+void nf_conntrack_esp_init_net(struct net *net)
+{
+	int i;
+	struct nf_esp_net *net_esp = esp_pernet(net);
+
+	for (i = 0; i < ESP_CT_MAX; i++)
+		net_esp->esp_timeouts[i] = esp_timeouts[i];
+}
+
+/* Free an entry referred to by esp_id.
+ *
+ * NOTE:
+ * Entry table locking and unlocking is the responsibility of the calling function.
+ * Range checking is the responsibility of the calling function.
+ */
+static void esp_table_free_entry_by_esp_id(struct net *net, u16 esp_id)
+{
+	struct _esp_table *esp_entry;
+
+	esp_entry = esp_table[esp_id - TEMP_SPI_START];
+	if (esp_entry) {
+		/* Remove from all the hash tables.
+		 */
+		pr_debug("Removing entry %x from all tables", esp_entry->esp_id);
+		rhashtable_remove_fast(&ltable, &esp_entry->lnode, ltable_params);
+		rhashtable_remove_fast(&rtable, &esp_entry->rnode, rtable_params);
+		rhltable_remove(&incmpl_rtable, &esp_entry->incmpl_rlist, incmpl_rtable_params);
+		esp_table[esp_id - TEMP_SPI_START] = NULL;
+		kfree(esp_entry);
+	}
+}
+
+/* Allocate the first available IPSEC table entry.
+ * NOTE: The ESP entry table must be locked prior to calling this function.
+ */
+struct _esp_table *alloc_esp_entry(struct net *net)
+{
+	int idx;
+	struct _esp_table *esp_entry = NULL;
+
+	/* Find the first unused slot */
+	for (idx = 0; idx < ESP_MAX_CONNECTIONS; idx++) {
+		if (esp_table[idx])
+			continue;
+
+		esp_table[idx] = kmalloc(sizeof(*esp_entry), GFP_ATOMIC);
+		if (!esp_table[idx])
+			return NULL;
+		memset(esp_table[idx], 0, sizeof(struct _esp_table));
+		esp_table[idx]->esp_id = idx + TEMP_SPI_START;
+		esp_table[idx]->alloc_time_jiffies = nfct_time_stamp;
+		esp_table[idx]->net = net;
+		esp_entry = esp_table[idx];
+		break;
+	}
+	return esp_entry;
+}
+
+/* Search for an ESP entry in the initial state based on the IP address of
+ * the remote peer.
+ * NOTE: The ESP entry table must be locked prior to calling this function.
+ */
+static struct _esp_table *search_esp_entry_init_remote(struct net *net,
+						       u16 l3num,
+						       const union nf_inet_addr *src_ip)
+{
+	union nf_inet_addr any;
+	u32 net_hmix = net_hash_mix(net);
+	struct _esp_table *esp_entry = NULL;
+	struct _esp_hkey key = {};
+	int first_entry = -1;
+	struct rhlist_head *pos, *list;
+
+	esp_ip_addr_set_any(l3num, &any);
+
+	calculate_key(net_hmix, 0, l3num, &any, src_ip, &key);
+	list = rhltable_lookup(&incmpl_rtable, (const void *)&key, incmpl_rtable_params);
+	rhl_for_each_entry_rcu(esp_entry, pos, list, incmpl_rlist) {
+		if (net_eq(net, esp_entry->net) &&
+		    l3num == esp_entry->l3num &&
+		    esp_ip_addr_equal(l3num, src_ip, &esp_entry->r_ip) &&
+		    esp_entry->r_spi == 0) {
+			if (first_entry == -1) {
+				first_entry = esp_entry->esp_id - TEMP_SPI_START;
+			} else if (esp_table[first_entry]->alloc_time_jiffies - esp_entry->alloc_time_jiffies <= 0) {
+				/* This entry is older than the last one found so treat this
+				 * as a better match.
+				 */
+				first_entry = esp_entry->esp_id - TEMP_SPI_START;
+			}
+		}
+	}
+
+	if (first_entry != -1) {
+		if (esp_entry->l3num == AF_INET) {
+			pr_debug("Matches incmpl_rtable entry %x with l_spi %x r_spi %x r_ip %pI4\n",
+				 esp_entry->esp_id, esp_entry->l_spi, esp_entry->r_spi,
+				 &esp_entry->r_ip.in);
+		} else {
+			pr_debug("Matches incmpl_rtable entry %x with l_spi %x r_spi %x r_ip %pI6\n",
+				 esp_entry->esp_id, esp_entry->l_spi, esp_entry->r_spi,
+				 &esp_entry->r_ip.in6);
+		}
+		esp_entry = esp_table[first_entry];
+		return esp_entry;
+	}
+
+	return NULL;
+}
+
+/* Search for an ESP entry by SPI, source and destination IP addresses.
+ * NOTE: The ESP entry table must be locked prior to calling this function.
+ */
+static struct _esp_table *search_esp_entry_by_spi(struct net *net, const __u32 spi,
+						  u16 l3num,
+						  const union nf_inet_addr *src_ip,
+						  const union nf_inet_addr *dst_ip)
+{
+	union nf_inet_addr any;
+	u32 net_hmix = net_hash_mix(net);
+	struct _esp_table *esp_entry;
+	struct _esp_hkey key = {};
+
+	esp_ip_addr_set_any(l3num, &any);
+
+	/* Check for matching established session or repeated initial LAN side */
+	/* LAN side first */
+	calculate_key(net_hmix, spi, l3num, src_ip, dst_ip, &key);
+	esp_entry = rhashtable_lookup_fast(&ltable, (const void *)&key, ltable_params);
+	if (esp_entry) {
+		/* When r_spi is set this is an established session. When not set it's
+		 * a repeated initial packet from LAN side. But both cases are treated
+		 * the same.
+		 */
+		if (esp_entry->l3num == AF_INET) {
+			pr_debug("Matches ltable entry %x with l_spi %x l_ip %pI4 r_ip %pI4\n",
+				 esp_entry->esp_id, esp_entry->l_spi,
+				 &esp_entry->l_ip.in, &esp_entry->r_ip.in);
+		} else {
+			pr_debug("Matches ltable entry %x with l_spi %x l_ip %pI6 r_ip %pI6\n",
+				 esp_entry->esp_id, esp_entry->l_spi,
+				 &esp_entry->l_ip.in6, &esp_entry->r_ip.in6);
+		}
+		return esp_entry;
+	}
+
+	/* Established remote side */
+	calculate_key(net_hmix, spi, l3num, &any, src_ip, &key);
+	esp_entry = rhashtable_lookup_fast(&rtable, (const void *)&key, rtable_params);
+	if (esp_entry) {
+		if (esp_entry->l3num == AF_INET) {
+			pr_debug("Matches rtable entry %x with l_spi %x r_spi %x l_ip %pI4 r_ip %pI4\n",
+				 esp_entry->esp_id, esp_entry->l_spi, esp_entry->r_spi,
+				 &esp_entry->l_ip.in, &esp_entry->r_ip.in);
+		} else {
+			pr_debug("Matches rtable entry %x with l_spi %x r_spi %x l_ip %pI6 r_ip %pI6\n",
+				 esp_entry->esp_id, esp_entry->l_spi, esp_entry->r_spi,
+				 &esp_entry->l_ip.in6, &esp_entry->r_ip.in6);
+		}
+		return esp_entry;
+	}
+
+	/* Incomplete remote side, check if packet has a missing r_spi */
+	esp_entry = search_esp_entry_init_remote(net, l3num, src_ip);
+	if (esp_entry) {
+		int err;
+
+		esp_entry->r_spi = spi;
+		/* Remove entry from incmpl_rtable and add to rtable */
+		rhltable_remove(&incmpl_rtable, &esp_entry->incmpl_rlist, incmpl_rtable_params);
+		/* Error will not be due to duplicate as established remote side lookup
+		 * above would have found it. Delete entry.
+		 */
+		err = rhashtable_insert_fast(&rtable, &esp_entry->rnode, rtable_params);
+		if (err) {
+			esp_table_free_entry_by_esp_id(net, esp_entry->esp_id);
+			return NULL;
+		}
+		return esp_entry;
+	}
+
+	if (l3num == AF_INET) {
+		pr_debug("No entry matches for spi %x src_ip %pI4 dst_ip %pI4\n",
+			 spi, &src_ip->in, &dst_ip->in);
+	} else {
+		pr_debug("No entry matches for spi %x src_ip %pI6 dst_ip %pI6\n",
+			 spi, &src_ip->in6, &dst_ip->in6);
+	}
+	return NULL;
+}
+
+/* invert esp part of tuple */
+bool nf_conntrack_invert_esp_tuple(struct nf_conntrack_tuple *tuple,
+				   const struct nf_conntrack_tuple *orig)
+{
+	tuple->dst.u.esp.id = orig->dst.u.esp.id;
+	tuple->src.u.esp.id = orig->src.u.esp.id;
+	return true;
+}
+
+/* esp hdr info to tuple */
+bool esp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
+		      struct net *net, struct nf_conntrack_tuple *tuple)
+{
+	struct esphdr _esphdr, *esphdr;
+	struct _esp_table *esp_entry = NULL;
+	u32 spi = 0;
+
+	esphdr = skb_header_pointer(skb, dataoff, sizeof(_esphdr), &_esphdr);
+	if (!esphdr) {
+		/* try to behave like "nf_conntrack_proto_generic" */
+		tuple->src.u.all = 0;
+		tuple->dst.u.all = 0;
+		return true;
+	}
+	spi = ntohl(esphdr->spi);
+
+	/* Check if esphdr already associated with a pre-existing connection:
+	 *   if no, create a new connection, missing the r_spi;
+	 *   if yes, check if we have seen the source IP:
+	 *             if no, fill in r_spi in the pre-existing connection.
+	 */
+	spin_lock_bh(&esp_table_lock);
+	esp_entry = search_esp_entry_by_spi(net, spi, tuple->src.l3num,
+					    &tuple->src.u3, &tuple->dst.u3);
+	if (!esp_entry) {
+		struct _esp_hkey key = {};
+		union nf_inet_addr any;
+		u32 net_hmix = net_hash_mix(net);
+		int err;
+
+		esp_entry = alloc_esp_entry(net);
+		if (!esp_entry) {
+			pr_debug("All esp connection slots in use\n");
+			spin_unlock_bh(&esp_table_lock);
+			return false;
+		}
+		esp_entry->l_spi = spi;
+		esp_entry->l3num = tuple->src.l3num;
+		esp_ip_addr_copy(esp_entry->l3num, &esp_entry->l_ip, &tuple->src.u3);
+		esp_ip_addr_copy(esp_entry->l3num, &esp_entry->r_ip, &tuple->dst.u3);
+
+		/* Add entries to the hash tables */
+
+		err = rhashtable_insert_fast(&ltable, &esp_entry->lnode, ltable_params);
+		if (err) {
+			esp_table_free_entry_by_esp_id(net, esp_entry->esp_id);
+			return false;
+		}
+
+		esp_ip_addr_set_any(esp_entry->l3num, &any);
+		calculate_key(net_hmix, 0, esp_entry->l3num, &any, &esp_entry->r_ip, &key);
+		err = rhltable_insert_key(&incmpl_rtable, (const void *)&key,
+					  &esp_entry->incmpl_rlist, incmpl_rtable_params);
+		if (err) {
+			esp_table_free_entry_by_esp_id(net, esp_entry->esp_id);
+			return false;
+		}
+
+		if (esp_entry->l3num == AF_INET) {
+			pr_debug("New entry %x with l_spi %x l_ip %pI4 r_ip %pI4\n",
+				 esp_entry->esp_id, esp_entry->l_spi,
+				 &esp_entry->l_ip.in, &esp_entry->r_ip.in);
+		} else {
+			pr_debug("New entry %x with l_spi %x l_ip %pI6 r_ip %pI6\n",
+				 esp_entry->esp_id, esp_entry->l_spi,
+				 &esp_entry->l_ip.in6, &esp_entry->r_ip.in6);
+		}
+	}
+
+	tuple->dst.u.esp.id = esp_entry->esp_id;
+	tuple->src.u.esp.id = esp_entry->esp_id;
+	spin_unlock_bh(&esp_table_lock);
+	return true;
+}
+
+#ifdef CONFIG_NF_CONNTRACK_PROCFS
+/* print private data for conntrack */
+static void esp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
+{
+	seq_printf(s, "l_spi=%x, r_spi=%x ", ct->proto.esp.l_spi, ct->proto.esp.r_spi);
+}
+#endif
+
+/* Returns verdict for packet, and may modify conntrack */
+int nf_conntrack_esp_packet(struct nf_conn *ct, struct sk_buff *skb,
+			    unsigned int dataoff,
+			    enum ip_conntrack_info ctinfo,
+			    const struct nf_hook_state *state)
+{
+	u16 esp_id;
+	struct nf_conntrack_tuple *tuple;
+	struct _esp_table *esp_entry;
+	unsigned int *timeouts = nf_ct_timeout_lookup(ct);
+
+	if (!timeouts)
+		timeouts = esp_pernet(nf_ct_net(ct))->esp_timeouts;
+
+	/* If we've seen traffic both ways, this is some kind of ESP
+	 * stream.  Extend timeout.
+	 */
+	if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+		nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[ESP_CT_REPLIED]);
+		/* Also, more likely to be important, and not a probe */
+		if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) {
+			/* Was originally IPCT_STATUS but this is no longer an option.
+			 * GRE uses assured for same purpose
+			 */
+			nf_conntrack_event_cache(IPCT_ASSURED, ct);
+
+			/* Retrieve SPIs of original and reply from esp_entry.
+			 * Both directions should contain the same esp_entry,
+			 * so just check the first one.
+			 */
+			tuple = nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL);
+			esp_id = tuple->src.u.esp.id;
+			if (esp_id >= TEMP_SPI_START && esp_id <= TEMP_SPI_MAX) {
+				spin_lock_bh(&esp_table_lock);
+				esp_entry = esp_table[esp_id - TEMP_SPI_START];
+				if (esp_entry) {
+					ct->proto.esp.l_spi = esp_entry->l_spi;
+					ct->proto.esp.r_spi = esp_entry->r_spi;
+				}
+				spin_unlock_bh(&esp_table_lock);
+			}
+		}
+	} else {
+		nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[ESP_CT_UNREPLIED]);
+	}
+
+	return NF_ACCEPT;
+}
+
+/* Called when a conntrack entry has already been removed from the hashes
+ * and is about to be deleted from memory
+ */
+void destroy_esp_conntrack_entry(struct nf_conn *ct)
+{
+	struct nf_conntrack_tuple *tuple = NULL;
+	enum ip_conntrack_dir dir;
+	u16 esp_id = 0;
+	struct net *net = nf_ct_net(ct);
+
+	spin_lock_bh(&esp_table_lock);
+
+	/* Probably all the ESP entries referenced in this connection are the same,
+	 * but the free function handles repeated frees, so best to do them all.
+	 */
+	for (dir = IP_CT_DIR_ORIGINAL; dir < IP_CT_DIR_MAX; dir++) {
+		tuple = nf_ct_tuple(ct, dir);
+
+		esp_id = tuple->src.u.esp.id;
+		if (esp_id >= TEMP_SPI_START && esp_id <= TEMP_SPI_MAX) {
+			pr_debug("Deleting src esp_id %x (dir %i)\n", esp_id, dir);
+			esp_table_free_entry_by_esp_id(net, esp_id);
+		}
+		tuple->src.u.esp.id = 0;
+		esp_id = tuple->dst.u.esp.id;
+		if (esp_id >= TEMP_SPI_START && esp_id <= TEMP_SPI_MAX) {
+			pr_debug("Deleting dst esp_id %x (dir %i)\n", esp_id, dir);
+			esp_table_free_entry_by_esp_id(net, esp_id);
+		}
+		tuple->dst.u.esp.id = 0;
+	}
+
+	spin_unlock_bh(&esp_table_lock);
+}
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
+static int esp_tuple_to_nlattr(struct sk_buff *skb,
+			       const struct nf_conntrack_tuple *t)
+{
+	if (nla_put_be16(skb, CTA_PROTO_SRC_ESP_ID, t->src.u.esp.id) ||
+	    nla_put_be16(skb, CTA_PROTO_DST_ESP_ID, t->dst.u.esp.id))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static const struct nla_policy esp_nla_policy[CTA_PROTO_MAX + 1] = {
+	[CTA_PROTO_SRC_ESP_ID] = { .type = NLA_U16 },
+	[CTA_PROTO_DST_ESP_ID] = { .type = NLA_U16 },
+};
+
+static int esp_nlattr_to_tuple(struct nlattr *tb[],
+			       struct nf_conntrack_tuple *t,
+			       u32 flags)
+{
+	if (flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_ESP_ID)) {
+		if (!tb[CTA_PROTO_SRC_ESP_ID])
+			return -EINVAL;
+
+		t->src.u.esp.id = nla_get_be16(tb[CTA_PROTO_SRC_ESP_ID]);
+	}
+
+	if (flags & CTA_FILTER_FLAG(CTA_PROTO_DST_ESP_ID)) {
+		if (!tb[CTA_PROTO_DST_ESP_ID])
+			return -EINVAL;
+
+		t->dst.u.esp.id = nla_get_be16(tb[CTA_PROTO_DST_ESP_ID]);
+	}
+
+	return 0;
+}
+
+static unsigned int esp_nlattr_tuple_size(void)
+{
+	return nla_policy_len(esp_nla_policy, CTA_PROTO_MAX + 1);
+}
+#endif
+
+/* protocol helper struct */
+const struct nf_conntrack_l4proto nf_conntrack_l4proto_esp = {
+	.l4proto = IPPROTO_ESP,
+#ifdef CONFIG_NF_CONNTRACK_PROCFS
+	.print_conntrack = esp_print_conntrack,
+#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+	.tuple_to_nlattr = esp_tuple_to_nlattr,
+	.nlattr_tuple_size = esp_nlattr_tuple_size,
+	.nlattr_to_tuple = esp_nlattr_to_tuple,
+	.nla_policy = esp_nla_policy,
+#endif
+};
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index c6c0cb465664..7922ff6cf5a4 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -88,6 +88,14 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
 			   ntohs(tuple->src.u.gre.key),
 			   ntohs(tuple->dst.u.gre.key));
 		break;
+	case IPPROTO_ESP:
+		/* Both src and dest esp.id should be equal but showing both
+		 * will help find errors.
+		 */
+		seq_printf(s, "srcid=0x%x dstid=0x%x ",
+			   ntohs(tuple->src.u.esp.id),
+			   ntohs(tuple->dst.u.esp.id));
+		break;
 	default:
 		break;
 	}
diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h
index 832ae64179f0..4fd8956aec65 100644
--- a/net/netfilter/nf_internals.h
+++ b/net/netfilter/nf_internals.h
@@ -19,7 +19,9 @@
 #define CTA_FILTER_F_CTA_PROTO_ICMPV6_TYPE	(1 << 9)
 #define CTA_FILTER_F_CTA_PROTO_ICMPV6_CODE	(1 << 10)
 #define CTA_FILTER_F_CTA_PROTO_ICMPV6_ID	(1 << 11)
-#define CTA_FILTER_F_MAX			(1 << 12)
+#define CTA_FILTER_F_CTA_PROTO_SRC_ESP_ID	(1 << 12)
+#define CTA_FILTER_F_CTA_PROTO_DST_ESP_ID	(1 << 13)
+#define CTA_FILTER_F_MAX			(1 << 14)
 #define CTA_FILTER_F_ALL			(CTA_FILTER_F_MAX-1)
 #define CTA_FILTER_FLAG(ctattr) CTA_FILTER_F_ ## ctattr
 
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] netfilter: nf_conntrack: Add conntrack helper for ESP/IPsec
  2021-04-20 22:35   ` Cole Dishington
@ 2021-04-26 11:54     ` Florian Westphal
  2021-04-26 12:37       ` Florian Westphal
  0 siblings, 1 reply; 10+ messages in thread
From: Florian Westphal @ 2021-04-26 11:54 UTC (permalink / raw)
  To: Cole Dishington
  Cc: fw, pablo, kadlec, davem, kuba, linux-kernel, netfilter-devel,
	coreteam, netdev

Cole Dishington <Cole.Dishington@alliedtelesis.co.nz> wrote:
> @@ -90,6 +90,8 @@ enum ctattr_l4proto {
>  	CTA_PROTO_ICMPV6_ID,
>  	CTA_PROTO_ICMPV6_TYPE,
>  	CTA_PROTO_ICMPV6_CODE,
> +	CTA_PROTO_SRC_ESP_ID,
> +	CTA_PROTO_DST_ESP_ID,
>  	__CTA_PROTO_MAX
>  };


> diff --git a/net/netfilter/nf_conntrack_proto_esp.c b/net/netfilter/nf_conntrack_proto_esp.c
> new file mode 100644
> index 000000000000..f17ce8a9439f
> --- /dev/null
> +++ b/net/netfilter/nf_conntrack_proto_esp.c
> @@ -0,0 +1,736 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * <:copyright-gpl
> + * Copyright 2008 Broadcom Corp. All Rights Reserved.
> + * Copyright (C) 2021 Allied Telesis Labs NZ
> + *
> + * This program is free software; you can distribute it and/or modify it
> + * under the terms of the GNU General Public License (Version 2) as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
> + * for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program.
> + * :>
> + */
> +/******************************************************************************
> + * Filename:       nf_conntrack_proto_esp.c
> + * Author:         Pavan Kumar
> + * Creation Date:  05/27/04
> + *

Can you remove this changelog?  The history isn't relevant for upstream.
You can add credits to the commit message if you like.

> +	struct rhash_head lnode;
> +	struct rhash_head rnode;
> +	struct rhlist_head incmpl_rlist;
> +
> +	u16 esp_id;
> +
> +	u32 l_spi;
> +	u32 r_spi;
> +
> +	u16 l3num;

Minor nit: you can save a few bytes by placing the two u16 next to each
other.

> +	union nf_inet_addr l_ip;
> +	union nf_inet_addr r_ip;
> +
> +	u32 alloc_time_jiffies;
> +	struct net *net;
> +};
> +
> +struct _esp_hkey {
> +	u16 l3num;

Nit: l3num can be u8.

> +static inline void esp_ip_addr_set_any(int af, union nf_inet_addr *a)
> +{
> +	if (af == AF_INET6)
> +		ipv6_addr_set(&a->in6, 0, 0, 0, 0);

Alternative is a->in6 = IN6ADDR_ANY_INIT , up to you.

You could also remove the if (af ... conditional and just zero
everything.

Also, with very few exceptions, we try to avoid 'inline' keyword in .c
files.

> +static inline void esp_ip_addr_copy(int af, union nf_inet_addr *dst,
> +				    const union nf_inet_addr *src)
> +{
> +	if (af == AF_INET6)
> +		ipv6_addr_prefix_copy(&dst->in6, &src->in6, 128);

Alternative is to dst->in6 = src->in6.

> +static inline void calculate_key(const u32 net_hmix, const u32 spi,
> +				 const u16 l3num,

l3num can be u8.

> +int nf_conntrack_esp_init(void)
> +{
> +	int i;
> +	int ret = 0;
> +
> +	spin_lock_bh(&esp_table_lock);

This lock isn't needed.  There is no way this function
can be executed concurrently.

> +	/* Check if esphdr already associated with a pre-existing connection:
> +	 *   if no, create a new connection, missing the r_spi;
> +	 *   if yes, check if we have seen the source IP:
> +	 *             if no, fill in r_spi in the pre-existing connection.
> +	 */
> +	spin_lock_bh(&esp_table_lock);

Can you remove this lock?

It would be very unfortunate if we lose rhashtable ability of parallel
insert & lockless lookups.

> +	esp_entry = search_esp_entry_by_spi(net, spi, tuple->src.l3num,
> +					    &tuple->src.u3, &tuple->dst.u3);
> +	if (!esp_entry) {
> +		struct _esp_hkey key = {};
> +		union nf_inet_addr any;
> +		u32 net_hmix = net_hash_mix(net);
> +		int err;
> +
> +		esp_entry = alloc_esp_entry(net);
> +		if (!esp_entry) {
> +			pr_debug("All esp connection slots in use\n");
> +			spin_unlock_bh(&esp_table_lock);
> +			return false;
> +		}
> +		esp_entry->l_spi = spi;
> +		esp_entry->l3num = tuple->src.l3num;
> +		esp_ip_addr_copy(esp_entry->l3num, &esp_entry->l_ip, &tuple->src.u3);
> +		esp_ip_addr_copy(esp_entry->l3num, &esp_entry->r_ip, &tuple->dst.u3);
> +
> +		/* Add entries to the hash tables */
> +
> +		err = rhashtable_insert_fast(&ltable, &esp_entry->lnode, ltable_params);
> +		if (err) {

... without lock, this can fail with -EEXIST.

You could remove the esp_table_lock and change the above
rhashtable_insert_fast() to something like:

esp_entry_old = rhashtable_lookup_get_insert_fast(&ltable, &esp_entry->lnode ltable_params);
if (esp_entry_old) {
	if (IS_ERR(esp_entry_old)) {
		esp_table_free_entry_by_esp_id(net, esp_entry->esp_id);
		return false;
	}

	esp_table_free_entry_by_esp_id(net, esp_entry->esp_id);
	/* insertion raced, use existing entry */
	esp_entry = esp_entry_old;
}
/* esp_entry_old == NULL -- insertion successful */

This should allow removal of the esp_table_lock spinlock.

> +#ifdef CONFIG_NF_CONNTRACK_PROCFS
> +/* print private data for conntrack */
> +static void esp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
> +{
> +	seq_printf(s, "l_spi=%x, r_spi=%x ", ct->proto.esp.l_spi, ct->proto.esp.r_spi);

Thanks, this looks good.

> +			nf_conntrack_event_cache(IPCT_ASSURED, ct);
> +
> +			/* Retrieve SPIs of original and reply from esp_entry.
> +			 * Both directions should contain the same esp_entry,
> +			 * so just check the first one.
> +			 */
> +			tuple = nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL);
> +			esp_id = tuple->src.u.esp.id;
> +			if (esp_id >= TEMP_SPI_START && esp_id <= TEMP_SPI_MAX) {
> +				spin_lock_bh(&esp_table_lock);
> +				esp_entry = esp_table[esp_id - TEMP_SPI_START];

This esp_table[] has to be removed.

1. It breaks netns isolation
2. It forces contention on a single spinlock.

As far as I understand, this table also serves as a resource limiter to
avoid eating up too much resources.

So, how about adding a espid bitmap to struct nf_conntrack_net?

Something like this:

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -63,6 +63,9 @@ struct nf_conntrack_net {
 	struct delayed_work ecache_dwork;
 	struct netns_ct *ct_net;
 #endif
+#ifdef CONFIG_NF_CT_PROTO_ESP
+	DECLARE_BITMAP(esp_id_map, 1024);
+#endif
 };
 
 #include <linux/types.h>
diff --git a/net/netfilter/nf_conntrack_proto_esp.c b/net/netfilter/nf_conntrack_proto_esp.c
index f17ce8a9439f..ce4d5864c480 100644
--- a/net/netfilter/nf_conntrack_proto_esp.c
+++ b/net/netfilter/nf_conntrack_proto_esp.c
@@ -341,24 +340,28 @@ static void esp_table_free_entry_by_esp_id(struct net *net, u16 esp_id)
  */
 struct _esp_table *alloc_esp_entry(struct net *net)
 {
-	int idx;
+	struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id);
 	struct _esp_table *esp_entry = NULL;
+	int idx;
 
-	/* Find the first unused slot */
-	for (idx = 0; idx < ESP_MAX_CONNECTIONS; idx++) {
-		if (esp_table[idx])
-			continue;
+again:
+	idx = find_first_zero_bit(cnet->esp_id_map, 1024);
+	if (idx >= 1024)
+		return NULL;
 
-		esp_table[idx] = kmalloc(sizeof(*esp_entry), GFP_ATOMIC);
-		if (!esp_table[idx])
-			return NULL;
-		memset(esp_table[idx], 0, sizeof(struct _esp_table));
-		esp_table[idx]->esp_id = idx + TEMP_SPI_START;
-		esp_table[idx]->alloc_time_jiffies = nfct_time_stamp;
-		esp_table[idx]->net = net;
-		esp_entry = esp_table[idx];
-		break;
+	if (test_and_set_bit(cnet->esp_id_map, idx))
+		goto again; /* raced */
+
+	esp_entry = kmalloc(sizeof(*esp_entry), GFP_ATOMIC);
+	if (!esp_entry) {
+		clear_bit(cnet->esp_id_map, idx);
+		return NULL;
 	}
+
+	esp_entry->esp_id = idx + TEMP_SPI_START;
+	esp_entry->alloc_time_jiffies = nfct_time_stamp;
+	esp_entry->net = net;
+
 	return esp_entry;
 }


I have a few more concerns:

AFAICS there is no guarantee that an allocated esp table entry is backed
by a conntrack entry.

So, there must be a way to reap all allocated esp_entry structs
when a network namespace goes down.

Perhaps you could add a pernet (nf_conntrack_net) spinlock+list head
that appends each allocated entry to that list.

Then, on conntrack removal, in addition to removal from the rhash
tables, add a list_del().

On network namespace destruction, walk this list and remove all
remaining entries (those that are still around after removal of all
the conntrack objects).

Does that make sense to you?

> +static int esp_tuple_to_nlattr(struct sk_buff *skb,
> +			       const struct nf_conntrack_tuple *t)
> +{
> +	if (nla_put_be16(skb, CTA_PROTO_SRC_ESP_ID, t->src.u.esp.id) ||
> +	    nla_put_be16(skb, CTA_PROTO_DST_ESP_ID, t->dst.u.esp.id))
> +		goto nla_put_failure;

This exposes the 16 bit kernel-generated IDs, right?
Should this dump the real on-wire SPIs instead?

Or is there are reason why the internal IDs need exposure?

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] netfilter: nf_conntrack: Add conntrack helper for ESP/IPsec
  2021-04-26 11:54     ` Florian Westphal
@ 2021-04-26 12:37       ` Florian Westphal
  2021-05-03  1:06         ` [PATCH v3] " Cole Dishington
  0 siblings, 1 reply; 10+ messages in thread
From: Florian Westphal @ 2021-04-26 12:37 UTC (permalink / raw)
  To: Florian Westphal
  Cc: Cole Dishington, pablo, kadlec, davem, kuba, linux-kernel,
	netfilter-devel, coreteam, netdev

Florian Westphal <fw@strlen.de> wrote:
> I have a few more concerns:
[..]

Forgot to add that it would be good to have add a selftest for this feature
to tools/testing/selftests/netfilter/

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH v3] netfilter: nf_conntrack: Add conntrack helper for ESP/IPsec
  2021-04-26 12:37       ` Florian Westphal
@ 2021-05-03  1:06         ` Cole Dishington
  2021-05-04 19:22           ` Florian Westphal
  2021-05-05 11:10           ` Florian Westphal
  0 siblings, 2 replies; 10+ messages in thread
From: Cole Dishington @ 2021-05-03  1:06 UTC (permalink / raw)
  To: fw
  Cc: Cole Dishington, Pablo Neira Ayuso, Jozsef Kadlecsik,
	David S. Miller, Jakub Kicinski, Shuah Khan, open list,
	open list:NETFILTER, open list:NETFILTER,
	open list:NETWORKING [GENERAL],
	open list:KERNEL SELFTEST FRAMEWORK

Introduce changes to add ESP connection tracking helper to netfilter
conntrack. The connection tracking of ESP is based on IPsec SPIs. The
underlying motivation for this patch was to allow multiple VPN ESP
clients to be distinguished when using NAT.

Added config flag CONFIG_NF_CT_PROTO_ESP to enable the ESP/IPsec
conntrack helper.

Signed-off-by: Cole Dishington <Cole.Dishington@alliedtelesis.co.nz>
---

Notes:
    Thanks for your time reviewing!
    
    Q.
    > +static int esp_tuple_to_nlattr(struct sk_buff *skb,
    > +                            const struct nf_conntrack_tuple *t)
    > +{
    > +     if (nla_put_be16(skb, CTA_PROTO_SRC_ESP_ID, t->src.u.esp.id) ||
    > +         nla_put_be16(skb, CTA_PROTO_DST_ESP_ID, t->dst.u.esp.id))
    > +             goto nla_put_failure;
    
    This exposes the 16 bit kernel-generated IDs, right?
    Should this dump the real on-wire SPIs instead?
    
    Or is there are reason why the internal IDs need exposure?
    
    A.
    I think I need to expose the internal esp ids here due to esp_nlattr_to_tuple().
    If esp id was changed to real SPIs here I would be unable to lookup the correct
    tuple (without IP addresses too).
    
    changes in v3:
    - Flush all esp entries for a given netns on nf_conntrack_proto_pernet_fini
    - Replace _esp_table (and its spinlock) shared over netns with per netns linked lists and bitmap (for esp ids)
    - Init IPv6 any address with IN6ADDR_ANY_INIT rather than ipv6_addr_set()
    - Change l3num on hash key from u16 to u8
    - Add selftests file for testing tracker with ipv4 and ipv6
    - Removed credits

 .../linux/netfilter/nf_conntrack_proto_esp.h  |  23 +
 .../net/netfilter/ipv4/nf_conntrack_ipv4.h    |   3 +
 include/net/netfilter/nf_conntrack.h          |   6 +
 include/net/netfilter/nf_conntrack_l4proto.h  |  16 +
 include/net/netfilter/nf_conntrack_tuple.h    |   3 +
 include/net/netns/conntrack.h                 |  17 +
 .../netfilter/nf_conntrack_tuple_common.h     |   3 +
 .../linux/netfilter/nfnetlink_conntrack.h     |   2 +
 net/netfilter/Kconfig                         |  10 +
 net/netfilter/Makefile                        |   1 +
 net/netfilter/nf_conntrack_core.c             |  23 +
 net/netfilter/nf_conntrack_netlink.c          |   4 +-
 net/netfilter/nf_conntrack_proto.c            |  15 +
 net/netfilter/nf_conntrack_proto_esp.c        | 741 ++++++++++++++++++
 net/netfilter/nf_conntrack_standalone.c       |   8 +
 net/netfilter/nf_internals.h                  |   4 +-
 .../netfilter/conntrack_esp_related.sh        | 268 +++++++
 17 files changed, 1145 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/netfilter/nf_conntrack_proto_esp.h
 create mode 100644 net/netfilter/nf_conntrack_proto_esp.c
 create mode 100755 tools/testing/selftests/netfilter/conntrack_esp_related.sh

diff --git a/include/linux/netfilter/nf_conntrack_proto_esp.h b/include/linux/netfilter/nf_conntrack_proto_esp.h
new file mode 100644
index 000000000000..96888669edd7
--- /dev/null
+++ b/include/linux/netfilter/nf_conntrack_proto_esp.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _CONNTRACK_PROTO_ESP_H
+#define _CONNTRACK_PROTO_ESP_H
+#include <asm/byteorder.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+
+/* ESP PROTOCOL HEADER */
+
+struct esphdr {
+	__u32 spi;
+};
+
+struct nf_ct_esp {
+	__u32 l_spi, r_spi;
+};
+
+void nf_ct_esp_pernet_flush(struct net *net);
+
+void destroy_esp_conntrack_entry(struct nf_conn *ct);
+
+bool esp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
+		      struct net *net, struct nf_conntrack_tuple *tuple);
+#endif /* _CONNTRACK_PROTO_ESP_H */
diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
index 2c8c2b023848..1aee91592639 100644
--- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
+++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
@@ -25,5 +25,8 @@ extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite;
 #ifdef CONFIG_NF_CT_PROTO_GRE
 extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre;
 #endif
+#ifdef CONFIG_NF_CT_PROTO_ESP
+extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_esp;
+#endif
 
 #endif /*_NF_CONNTRACK_IPV4_H*/
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 439379ca9ffa..4011be8c5e39 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -21,6 +21,7 @@
 #include <linux/netfilter/nf_conntrack_dccp.h>
 #include <linux/netfilter/nf_conntrack_sctp.h>
 #include <linux/netfilter/nf_conntrack_proto_gre.h>
+#include <linux/netfilter/nf_conntrack_proto_esp.h>
 
 #include <net/netfilter/nf_conntrack_tuple.h>
 
@@ -36,6 +37,7 @@ union nf_conntrack_proto {
 	struct ip_ct_tcp tcp;
 	struct nf_ct_udp udp;
 	struct nf_ct_gre gre;
+	struct nf_ct_esp esp;
 	unsigned int tmpl_padto;
 };
 
@@ -47,6 +49,10 @@ struct nf_conntrack_net {
 	unsigned int users4;
 	unsigned int users6;
 	unsigned int users_bridge;
+
+#ifdef CONFIG_NF_CT_PROTO_ESP
+	DECLARE_BITMAP(esp_id_map, 1024);
+#endif
 };
 
 #include <linux/types.h>
diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
index 96f9cf81f46b..f700de0b9059 100644
--- a/include/net/netfilter/nf_conntrack_l4proto.h
+++ b/include/net/netfilter/nf_conntrack_l4proto.h
@@ -75,6 +75,8 @@ bool nf_conntrack_invert_icmp_tuple(struct nf_conntrack_tuple *tuple,
 				    const struct nf_conntrack_tuple *orig);
 bool nf_conntrack_invert_icmpv6_tuple(struct nf_conntrack_tuple *tuple,
 				      const struct nf_conntrack_tuple *orig);
+bool nf_conntrack_invert_esp_tuple(struct nf_conntrack_tuple *tuple,
+				   const struct nf_conntrack_tuple *orig);
 
 int nf_conntrack_inet_error(struct nf_conn *tmpl, struct sk_buff *skb,
 			    unsigned int dataoff,
@@ -132,6 +134,11 @@ int nf_conntrack_gre_packet(struct nf_conn *ct,
 			    unsigned int dataoff,
 			    enum ip_conntrack_info ctinfo,
 			    const struct nf_hook_state *state);
+int nf_conntrack_esp_packet(struct nf_conn *ct,
+			    struct sk_buff *skb,
+			    unsigned int dataoff,
+			    enum ip_conntrack_info ctinfo,
+			    const struct nf_hook_state *state);
 
 void nf_conntrack_generic_init_net(struct net *net);
 void nf_conntrack_tcp_init_net(struct net *net);
@@ -141,6 +148,8 @@ void nf_conntrack_dccp_init_net(struct net *net);
 void nf_conntrack_sctp_init_net(struct net *net);
 void nf_conntrack_icmp_init_net(struct net *net);
 void nf_conntrack_icmpv6_init_net(struct net *net);
+int nf_conntrack_esp_init(void);
+void nf_conntrack_esp_init_net(struct net *net);
 
 /* Existing built-in generic protocol */
 extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic;
@@ -240,4 +249,11 @@ static inline struct nf_gre_net *nf_gre_pernet(struct net *net)
 }
 #endif
 
+#ifdef CONFIG_NF_CT_PROTO_ESP
+static inline struct nf_esp_net *nf_esp_pernet(struct net *net)
+{
+	return &net->ct.nf_ct_proto.esp;
+}
+#endif
+
 #endif /*_NF_CONNTRACK_PROTOCOL_H*/
diff --git a/include/net/netfilter/nf_conntrack_tuple.h b/include/net/netfilter/nf_conntrack_tuple.h
index 9334371c94e2..60279ffabe36 100644
--- a/include/net/netfilter/nf_conntrack_tuple.h
+++ b/include/net/netfilter/nf_conntrack_tuple.h
@@ -62,6 +62,9 @@ struct nf_conntrack_tuple {
 			struct {
 				__be16 key;
 			} gre;
+			struct {
+				__be16 id;
+			} esp;
 		} u;
 
 		/* The protocol. */
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index 806454e767bf..43cd1e78f790 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -69,6 +69,20 @@ struct nf_gre_net {
 };
 #endif
 
+#ifdef CONFIG_NF_CT_PROTO_ESP
+enum esp_conntrack {
+	ESP_CT_UNREPLIED,
+	ESP_CT_REPLIED,
+	ESP_CT_MAX
+};
+
+struct nf_esp_net {
+	spinlock_t id_list_lock;
+	struct list_head id_list;
+	unsigned int esp_timeouts[ESP_CT_MAX];
+};
+#endif
+
 struct nf_ip_net {
 	struct nf_generic_net   generic;
 	struct nf_tcp_net	tcp;
@@ -84,6 +98,9 @@ struct nf_ip_net {
 #ifdef CONFIG_NF_CT_PROTO_GRE
 	struct nf_gre_net	gre;
 #endif
+#ifdef CONFIG_NF_CT_PROTO_ESP
+	struct nf_esp_net	esp;
+#endif
 };
 
 struct ct_pcpu {
diff --git a/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h b/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h
index 64390fac6f7e..78600cb4bfff 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h
@@ -39,6 +39,9 @@ union nf_conntrack_man_proto {
 	struct {
 		__be16 key;	/* GRE key is 32bit, PPtP only uses 16bit */
 	} gre;
+	struct {
+		__be16 id;
+	} esp;
 };
 
 #define CTINFO2DIR(ctinfo) ((ctinfo) >= IP_CT_IS_REPLY ? IP_CT_DIR_REPLY : IP_CT_DIR_ORIGINAL)
diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h
index d8484be72fdc..744d8931adeb 100644
--- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h
+++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h
@@ -90,6 +90,8 @@ enum ctattr_l4proto {
 	CTA_PROTO_ICMPV6_ID,
 	CTA_PROTO_ICMPV6_TYPE,
 	CTA_PROTO_ICMPV6_CODE,
+	CTA_PROTO_SRC_ESP_ID,
+	CTA_PROTO_DST_ESP_ID,
 	__CTA_PROTO_MAX
 };
 #define CTA_PROTO_MAX (__CTA_PROTO_MAX - 1)
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 1a92063c73a4..7269312d322e 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -199,6 +199,16 @@ config NF_CT_PROTO_UDPLITE
 
 	  If unsure, say Y.
 
+config NF_CT_PROTO_ESP
+	bool "ESP protocol support"
+	depends on NETFILTER_ADVANCED
+	help
+	  ESP connection tracking helper. Provides connection tracking for IPsec
+	  clients behind this device based on SPI, especially useful for
+	  distinguishing multiple clients when using NAT.
+
+	  If unsure, say N.
+
 config NF_CONNTRACK_AMANDA
 	tristate "Amanda backup protocol support"
 	depends on NETFILTER_ADVANCED
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 33da7bf1b68e..0942f2c48ddb 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -14,6 +14,7 @@ nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o
 nf_conntrack-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o
 nf_conntrack-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o
 nf_conntrack-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o
+nf_conntrack-$(CONFIG_NF_CT_PROTO_ESP) += nf_conntrack_proto_esp.o
 
 obj-$(CONFIG_NETFILTER) = netfilter.o
 
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index ff0168736f6e..3bef361d19ce 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -295,6 +295,10 @@ nf_ct_get_tuple(const struct sk_buff *skb,
 #ifdef CONFIG_NF_CT_PROTO_GRE
 	case IPPROTO_GRE:
 		return gre_pkt_to_tuple(skb, dataoff, net, tuple);
+#endif
+#ifdef CONFIG_NF_CT_PROTO_ESP
+	case IPPROTO_ESP:
+		return esp_pkt_to_tuple(skb, dataoff, net, tuple);
 #endif
 	case IPPROTO_TCP:
 	case IPPROTO_UDP: /* fallthrough */
@@ -439,6 +443,10 @@ nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
 #if IS_ENABLED(CONFIG_IPV6)
 	case IPPROTO_ICMPV6:
 		return nf_conntrack_invert_icmpv6_tuple(inverse, orig);
+#endif
+#ifdef CONFIG_NF_CT_PROTO_ESP
+	case IPPROTO_ESP:
+		return nf_conntrack_invert_esp_tuple(inverse, orig);
 #endif
 	}
 
@@ -593,6 +601,13 @@ static void destroy_gre_conntrack(struct nf_conn *ct)
 #endif
 }
 
+static void destroy_esp_conntrack(struct nf_conn *ct)
+{
+#ifdef CONFIG_NF_CT_PROTO_ESP
+	destroy_esp_conntrack_entry(ct);
+#endif
+}
+
 static void
 destroy_conntrack(struct nf_conntrack *nfct)
 {
@@ -609,6 +624,9 @@ destroy_conntrack(struct nf_conntrack *nfct)
 	if (unlikely(nf_ct_protonum(ct) == IPPROTO_GRE))
 		destroy_gre_conntrack(ct);
 
+	if (unlikely(nf_ct_protonum(ct) == IPPROTO_ESP))
+		destroy_esp_conntrack(ct);
+
 	local_bh_disable();
 	/* Expectations will have been removed in clean_from_lists,
 	 * except TFTP can create an expectation on the first packet,
@@ -1783,6 +1801,11 @@ static int nf_conntrack_handle_packet(struct nf_conn *ct,
 	case IPPROTO_GRE:
 		return nf_conntrack_gre_packet(ct, skb, dataoff,
 					       ctinfo, state);
+#endif
+#ifdef CONFIG_NF_CT_PROTO_ESP
+	case IPPROTO_ESP:
+		return nf_conntrack_esp_packet(ct, skb, dataoff,
+					       ctinfo, state);
 #endif
 	}
 
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 1d519b0e51a5..8df33dbbf5a3 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1382,7 +1382,9 @@ static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = {
    CTA_FILTER_F_CTA_PROTO_ICMP_ID | \
    CTA_FILTER_F_CTA_PROTO_ICMPV6_TYPE | \
    CTA_FILTER_F_CTA_PROTO_ICMPV6_CODE | \
-   CTA_FILTER_F_CTA_PROTO_ICMPV6_ID)
+   CTA_FILTER_F_CTA_PROTO_ICMPV6_ID | \
+   CTA_FILTER_F_CTA_PROTO_SRC_ESP_ID | \
+   CTA_FILTER_F_CTA_PROTO_DST_ESP_ID)
 
 static int
 ctnetlink_parse_tuple_filter(const struct nlattr * const cda[],
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 47e9319d2cf3..e71ddb4e33cc 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -112,6 +112,9 @@ const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto)
 #ifdef CONFIG_NF_CT_PROTO_GRE
 	case IPPROTO_GRE: return &nf_conntrack_l4proto_gre;
 #endif
+#ifdef CONFIG_NF_CT_PROTO_ESP
+	case IPPROTO_ESP: return &nf_conntrack_l4proto_esp;
+#endif
 #if IS_ENABLED(CONFIG_IPV6)
 	case IPPROTO_ICMPV6: return &nf_conntrack_l4proto_icmpv6;
 #endif /* CONFIG_IPV6 */
@@ -656,6 +659,12 @@ int nf_conntrack_proto_init(void)
 		goto cleanup_sockopt;
 #endif
 
+#ifdef CONFIG_NF_CT_PROTO_ESP
+	ret = nf_conntrack_esp_init();
+	if (ret < 0)
+		goto cleanup_sockopt;
+#endif
+
 	return ret;
 
 #if IS_ENABLED(CONFIG_IPV6)
@@ -691,6 +700,9 @@ void nf_conntrack_proto_pernet_init(struct net *net)
 #ifdef CONFIG_NF_CT_PROTO_GRE
 	nf_conntrack_gre_init_net(net);
 #endif
+#ifdef CONFIG_NF_CT_PROTO_ESP
+	nf_conntrack_esp_init_net(net);
+#endif
 }
 
 void nf_conntrack_proto_pernet_fini(struct net *net)
@@ -698,6 +710,9 @@ void nf_conntrack_proto_pernet_fini(struct net *net)
 #ifdef CONFIG_NF_CT_PROTO_GRE
 	nf_ct_gre_keymap_flush(net);
 #endif
+#ifdef CONFIG_NF_CT_PROTO_ESP
+	nf_ct_esp_pernet_flush(net);
+#endif
 }
 
 module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
diff --git a/net/netfilter/nf_conntrack_proto_esp.c b/net/netfilter/nf_conntrack_proto_esp.c
new file mode 100644
index 000000000000..1bc0cb879bfd
--- /dev/null
+++ b/net/netfilter/nf_conntrack_proto_esp.c
@@ -0,0 +1,741 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * <:copyright-gpl
+ * Copyright 2008 Broadcom Corp. All Rights Reserved.
+ * Copyright (C) 2021 Allied Telesis Labs NZ
+ *
+ * This program is free software; you can distribute it and/or modify it
+ * under the terms of the GNU General Public License (Version 2) as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program.
+ * :>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/list.h>
+#include <linux/seq_file.h>
+#include <linux/in.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/dst.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
+#include <linux/netfilter/nf_conntrack_proto_esp.h>
+#include <net/netns/hash.h>
+#include <linux/rhashtable.h>
+#include <net/ipv6.h>
+
+#include "nf_internals.h"
+
+/* esp_id of 0 is left for unassigned values */
+#define TEMP_SPI_START 1
+#define TEMP_SPI_MAX   (TEMP_SPI_START + 1024 - 1)
+
+struct _esp_entry {
+	/* linked list node for per net lookup via esp_id */
+	struct list_head net_node;
+
+       /* Hash table nodes for each required lookup
+	* lnode: net->hash_mix, l_spi, l_ip, r_ip
+	* rnode: net->hash_mix, r_spi, r_ip
+	* incmpl_rlist: net->hash_mix, r_ip
+	*/
+	struct rhash_head lnode;
+	struct rhash_head rnode;
+	struct rhlist_head incmpl_rlist;
+
+	u16 esp_id;
+
+	u16 l3num;
+
+	u32 l_spi;
+	u32 r_spi;
+
+	union nf_inet_addr l_ip;
+	union nf_inet_addr r_ip;
+
+	u32 alloc_time_jiffies;
+	struct net *net;
+};
+
+struct _esp_hkey {
+	u8 l3num;
+	union nf_inet_addr src_ip;
+	union nf_inet_addr dst_ip;
+	u32 net_hmix;
+	u32 spi;
+};
+
+extern unsigned int nf_conntrack_net_id;
+
+static struct rhashtable ltable;
+static struct rhashtable rtable;
+static struct rhltable incmpl_rtable;
+static unsigned int esp_timeouts[ESP_CT_MAX] = {
+	[ESP_CT_UNREPLIED] = 60 * HZ,
+	[ESP_CT_REPLIED] = 3600 * HZ,
+};
+
+static void esp_ip_addr_copy(int af, union nf_inet_addr *dst,
+			     const union nf_inet_addr *src)
+{
+	if (af == AF_INET6)
+		dst->in6 = src->in6;
+	else
+		dst->ip = src->ip;
+}
+
+static int esp_ip_addr_equal(int af, const union nf_inet_addr *a,
+			     const union nf_inet_addr *b)
+{
+	if (af == AF_INET6)
+		return ipv6_addr_equal(&a->in6, &b->in6);
+	return a->ip == b->ip;
+}
+
+static inline struct nf_esp_net *esp_pernet(struct net *net)
+{
+	return &net->ct.nf_ct_proto.esp;
+}
+
+static inline void calculate_key(const u32 net_hmix, const u32 spi,
+				 const u8 l3num,
+				 const union nf_inet_addr *src_ip,
+				 const union nf_inet_addr *dst_ip,
+				 struct _esp_hkey *key)
+{
+	key->net_hmix = net_hmix;
+	key->spi = spi;
+	key->l3num = l3num;
+	esp_ip_addr_copy(l3num, &key->src_ip, src_ip);
+	esp_ip_addr_copy(l3num, &key->dst_ip, dst_ip);
+}
+
+static inline u32 calculate_hash(const void *data, u32 len, u32 seed)
+{
+	return jhash(data, len, seed);
+}
+
+static int ltable_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
+{
+	struct _esp_hkey obj_key = {};
+	const struct _esp_hkey *key = (const struct _esp_hkey *)arg->key;
+	const struct _esp_entry *eobj = (const struct _esp_entry *)obj;
+	u32 net_hmix = net_hash_mix(eobj->net);
+
+	calculate_key(net_hmix, eobj->l_spi, eobj->l3num, &eobj->l_ip,
+		      &eobj->r_ip, &obj_key);
+	return memcmp(key, &obj_key, sizeof(struct _esp_hkey));
+}
+
+static int rtable_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
+{
+	const union nf_inet_addr any = { .in6 = IN6ADDR_ANY_INIT };
+	struct _esp_hkey obj_key = {};
+	const struct _esp_hkey *key = (const struct _esp_hkey *)arg->key;
+	const struct _esp_entry *eobj = (const struct _esp_entry *)obj;
+	u32 net_hmix = net_hash_mix(eobj->net);
+
+	calculate_key(net_hmix, eobj->r_spi, eobj->l3num, &any, &eobj->r_ip,
+		      &obj_key);
+	return memcmp(key, &obj_key, sizeof(struct _esp_hkey));
+}
+
+static int incmpl_table_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
+{
+	const union nf_inet_addr any = { .in6 = IN6ADDR_ANY_INIT };
+	struct _esp_hkey obj_key = {};
+	const struct _esp_hkey *key = (const struct _esp_hkey *)arg->key;
+	const struct _esp_entry *eobj = (const struct _esp_entry *)obj;
+	u32 net_hmix = net_hash_mix(eobj->net);
+
+	calculate_key(net_hmix, 0, eobj->l3num, &any, &eobj->r_ip, &obj_key);
+	return memcmp(key, &obj_key, sizeof(struct _esp_hkey));
+}
+
+static u32 ltable_obj_hashfn(const void *data, u32 len, u32 seed)
+{
+	struct _esp_hkey key = {};
+	const struct _esp_entry *eobj = (const struct _esp_entry *)data;
+	u32 net_hmix = net_hash_mix(eobj->net);
+
+	calculate_key(net_hmix, eobj->l_spi, eobj->l3num, &eobj->l_ip,
+		      &eobj->r_ip, &key);
+	return calculate_hash(&key, len, seed);
+}
+
+static u32 rtable_obj_hashfn(const void *data, u32 len, u32 seed)
+{
+	const union nf_inet_addr any = { .in6 = IN6ADDR_ANY_INIT };
+	struct _esp_hkey key = {};
+	const struct _esp_entry *eobj = (const struct _esp_entry *)data;
+	u32 net_hmix = net_hash_mix(eobj->net);
+
+	calculate_key(net_hmix, eobj->r_spi, eobj->l3num, &any, &eobj->r_ip, &key);
+	return calculate_hash(&key, len, seed);
+}
+
+static u32 incmpl_table_obj_hashfn(const void *data, u32 len, u32 seed)
+{
+	const union nf_inet_addr any = { .in6 = IN6ADDR_ANY_INIT };
+	struct _esp_hkey key = {};
+	const struct _esp_entry *eobj = (const struct _esp_entry *)data;
+	u32 net_hmix = net_hash_mix(eobj->net);
+
+	calculate_key(net_hmix, 0, eobj->l3num, &any, &eobj->r_ip, &key);
+	return calculate_hash(&key, len, seed);
+}
+
+static const struct rhashtable_params ltable_params = {
+	.key_len     = sizeof(struct _esp_hkey),
+	.head_offset = offsetof(struct _esp_entry, lnode),
+	.hashfn      = calculate_hash,
+	.obj_hashfn = ltable_obj_hashfn,
+	.obj_cmpfn   = ltable_obj_cmpfn,
+};
+
+static const struct rhashtable_params rtable_params = {
+	.key_len     = sizeof(struct _esp_hkey),
+	.head_offset = offsetof(struct _esp_entry, rnode),
+	.hashfn      = calculate_hash,
+	.obj_hashfn = rtable_obj_hashfn,
+	.obj_cmpfn   = rtable_obj_cmpfn,
+};
+
+static const struct rhashtable_params incmpl_rtable_params = {
+	.key_len     = sizeof(struct _esp_hkey),
+	.head_offset = offsetof(struct _esp_entry, incmpl_rlist),
+	.hashfn      = calculate_hash,
+	.obj_hashfn = incmpl_table_obj_hashfn,
+	.obj_cmpfn   = incmpl_table_obj_cmpfn,
+};
+
+int nf_conntrack_esp_init(void)
+{
+	int ret;
+
+	ret = rhashtable_init(&ltable, &ltable_params);
+	if (ret)
+		return ret;
+
+	ret = rhashtable_init(&rtable, &rtable_params);
+	if (ret)
+		goto err_free_ltable;
+
+	ret = rhltable_init(&incmpl_rtable, &incmpl_rtable_params);
+	if (ret)
+		goto err_free_rtable;
+
+	return ret;
+
+err_free_rtable:
+	rhashtable_destroy(&rtable);
+err_free_ltable:
+	rhashtable_destroy(&ltable);
+
+	return ret;
+}
+
+void nf_conntrack_esp_init_net(struct net *net)
+{
+	int i;
+	struct nf_esp_net *net_esp = esp_pernet(net);
+
+	spin_lock_init(&net_esp->id_list_lock);
+	INIT_LIST_HEAD(&net_esp->id_list);
+
+	for (i = 0; i < ESP_CT_MAX; i++)
+		net_esp->esp_timeouts[i] = esp_timeouts[i];
+}
+
+static struct _esp_entry *find_esp_entry_by_id(struct nf_esp_net *esp_net, int esp_id)
+{
+	struct list_head *pos, *head;
+	struct _esp_entry *esp_entry;
+
+	head = &esp_net->id_list;
+	list_for_each(pos, head) {
+		esp_entry = list_entry(pos, struct _esp_entry, net_node);
+		if (esp_entry->esp_id == esp_id)
+			return esp_entry;
+	}
+	return NULL;
+}
+
+static void free_esp_entry(struct nf_conntrack_net *cnet, struct _esp_entry *esp_entry)
+{
+	if (esp_entry) {
+		/* Remove from all the hash tables */
+		pr_debug("Removing entry %x from all tables", esp_entry->esp_id);
+		list_del(&esp_entry->net_node);
+		rhashtable_remove_fast(&ltable, &esp_entry->lnode, ltable_params);
+		rhashtable_remove_fast(&rtable, &esp_entry->rnode, rtable_params);
+		rhltable_remove(&incmpl_rtable, &esp_entry->incmpl_rlist, incmpl_rtable_params);
+		clear_bit(esp_entry->esp_id - TEMP_SPI_START, cnet->esp_id_map);
+		kfree(esp_entry);
+	}
+}
+
+/* Free an entry referred to by esp_id.
+ *
+ * NOTE:
+ * Per net linked list locking and unlocking is the responsibility of the calling function.
+ * Range checking is the responsibility of the calling function.
+ */
+static void free_esp_entry_by_id(struct net *net, int esp_id)
+{
+	struct nf_esp_net *esp_net = esp_pernet(net);
+	struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id);
+	struct _esp_entry *esp_entry = find_esp_entry_by_id(esp_net, esp_id);
+
+	free_esp_entry(cnet, esp_entry);
+}
+
+/* Allocate the first available IPSEC table entry.
+ * NOTE: This function may block on per net list lock.
+ */
+struct _esp_entry *alloc_esp_entry(struct net *net)
+{
+	struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id);
+	struct nf_esp_net *esp_net = esp_pernet(net);
+	struct _esp_entry *esp_entry;
+	int id;
+
+again:
+	id = find_first_zero_bit(cnet->esp_id_map, 1024);
+	if (id >= 1024)
+		return NULL;
+
+	if (test_and_set_bit(id, cnet->esp_id_map))
+		goto again; /* raced */
+
+	esp_entry = kmalloc(sizeof(*esp_entry), GFP_ATOMIC);
+	if (!esp_entry) {
+		clear_bit(id, cnet->esp_id_map);
+		return NULL;
+	}
+
+	esp_entry->esp_id = id + TEMP_SPI_START;
+	esp_entry->alloc_time_jiffies = nfct_time_stamp;
+	esp_entry->net = net;
+
+	spin_lock(&esp_net->id_list_lock);
+	list_add(&esp_entry->net_node, &esp_net->id_list);
+	spin_unlock(&esp_net->id_list_lock);
+
+	return esp_entry;
+}
+
+/* Search for an ESP entry in the initial state based on the IP address of
+ * the remote peer.
+ */
+static struct _esp_entry *search_esp_entry_init_remote(struct net *net,
+						       u16 l3num,
+						       const union nf_inet_addr *src_ip)
+{
+	const union nf_inet_addr any = { .in6 = IN6ADDR_ANY_INIT };
+	u32 net_hmix = net_hash_mix(net);
+	struct _esp_entry *first_esp_entry = NULL;
+	struct _esp_entry *esp_entry;
+	struct _esp_hkey key = {};
+	struct rhlist_head *pos, *list;
+
+	calculate_key(net_hmix, 0, l3num, &any, src_ip, &key);
+	list = rhltable_lookup(&incmpl_rtable, (const void *)&key, incmpl_rtable_params);
+	rhl_for_each_entry_rcu(esp_entry, pos, list, incmpl_rlist) {
+		if (net_eq(net, esp_entry->net) &&
+		    l3num == esp_entry->l3num &&
+		    esp_ip_addr_equal(l3num, src_ip, &esp_entry->r_ip)) {
+			if (!first_esp_entry) {
+				first_esp_entry = esp_entry;
+			} else if (first_esp_entry->alloc_time_jiffies - esp_entry->alloc_time_jiffies <= 0) {
+				/* This entry is older than the last one found so treat this
+				 * as a better match.
+				 */
+				first_esp_entry = esp_entry;
+			}
+		}
+	}
+
+	if (first_esp_entry) {
+		if (first_esp_entry->l3num == AF_INET) {
+			pr_debug("Matches incmpl_rtable entry %x with l_spi %x r_ip %pI4\n",
+				 first_esp_entry->esp_id, first_esp_entry->l_spi,
+				 &first_esp_entry->r_ip.in);
+		} else {
+			pr_debug("Matches incmpl_rtable entry %x with l_spi %x r_ip %pI6\n",
+				 first_esp_entry->esp_id, first_esp_entry->l_spi,
+				 &first_esp_entry->r_ip.in6);
+		}
+	}
+
+	return first_esp_entry;
+}
+
+/* Search for an ESP entry by SPI, source and destination IP addresses.
+ * NOTE: This function may block on per net list lock.
+ */
+static struct _esp_entry *search_esp_entry_by_spi(struct net *net, const __u32 spi,
+						  u16 l3num,
+						  const union nf_inet_addr *src_ip,
+						  const union nf_inet_addr *dst_ip)
+{
+	const union nf_inet_addr any = { .in6 = IN6ADDR_ANY_INIT };
+	u32 net_hmix = net_hash_mix(net);
+	struct _esp_entry *esp_entry;
+	struct _esp_hkey key = {};
+
+	/* Check for matching established session or repeated initial LAN side */
+	/* LAN side first */
+	calculate_key(net_hmix, spi, l3num, src_ip, dst_ip, &key);
+	esp_entry = rhashtable_lookup_fast(&ltable, (const void *)&key, ltable_params);
+	if (esp_entry) {
+		/* When r_spi is set this is an established session. When not set it's
+		 * a repeated initial packet from LAN side. But both cases are treated
+		 * the same.
+		 */
+		if (esp_entry->l3num == AF_INET) {
+			pr_debug("Matches ltable entry %x with l_spi %x l_ip %pI4 r_ip %pI4\n",
+				 esp_entry->esp_id, esp_entry->l_spi,
+				 &esp_entry->l_ip.in, &esp_entry->r_ip.in);
+		} else {
+			pr_debug("Matches ltable entry %x with l_spi %x l_ip %pI6 r_ip %pI6\n",
+				 esp_entry->esp_id, esp_entry->l_spi,
+				 &esp_entry->l_ip.in6, &esp_entry->r_ip.in6);
+		}
+		return esp_entry;
+	}
+
+	/* Established remote side */
+	calculate_key(net_hmix, spi, l3num, &any, src_ip, &key);
+	esp_entry = rhashtable_lookup_fast(&rtable, (const void *)&key, rtable_params);
+	if (esp_entry) {
+		if (esp_entry->l3num == AF_INET) {
+			pr_debug("Matches rtable entry %x with l_spi %x r_spi %x l_ip %pI4 r_ip %pI4\n",
+				 esp_entry->esp_id, esp_entry->l_spi, esp_entry->r_spi,
+				 &esp_entry->l_ip.in, &esp_entry->r_ip.in);
+		} else {
+			pr_debug("Matches rtable entry %x with l_spi %x r_spi %x l_ip %pI6 r_ip %pI6\n",
+				 esp_entry->esp_id, esp_entry->l_spi, esp_entry->r_spi,
+				 &esp_entry->l_ip.in6, &esp_entry->r_ip.in6);
+		}
+		return esp_entry;
+	}
+
+	/* Incomplete remote side, check if packet has a missing r_spi */
+	esp_entry = search_esp_entry_init_remote(net, l3num, src_ip);
+	if (esp_entry) {
+		int err;
+
+		esp_entry->r_spi = spi;
+		/* Remove entry from incmpl_rtable and add to rtable */
+		rhltable_remove(&incmpl_rtable, &esp_entry->incmpl_rlist, incmpl_rtable_params);
+		/* Error will not be due to duplicate as established remote side lookup
+		 * above would have found it. Delete entry.
+		 */
+		err = rhashtable_insert_fast(&rtable, &esp_entry->rnode, rtable_params);
+		if (err) {
+			struct nf_esp_net *esp_net = esp_pernet(net);
+
+			spin_lock(&esp_net->id_list_lock);
+			free_esp_entry_by_id(net, esp_entry->esp_id);
+			spin_unlock(&esp_net->id_list_lock);
+			return NULL;
+		}
+		return esp_entry;
+	}
+
+	if (l3num == AF_INET) {
+		pr_debug("No entry matches for spi %x src_ip %pI4 dst_ip %pI4\n",
+			 spi, &src_ip->in, &dst_ip->in);
+	} else {
+		pr_debug("No entry matches for spi %x src_ip %pI6 dst_ip %pI6\n",
+			 spi, &src_ip->in6, &dst_ip->in6);
+	}
+	return NULL;
+}
+
+/* invert esp part of tuple */
+bool nf_conntrack_invert_esp_tuple(struct nf_conntrack_tuple *tuple,
+				   const struct nf_conntrack_tuple *orig)
+{
+	tuple->dst.u.esp.id = orig->dst.u.esp.id;
+	tuple->src.u.esp.id = orig->src.u.esp.id;
+	return true;
+}
+
+/* esp hdr info to tuple */
+bool esp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
+		      struct net *net, struct nf_conntrack_tuple *tuple)
+{
+	struct esphdr _esphdr, *esphdr;
+	struct _esp_entry *esp_entry;
+	u32 spi;
+
+	esphdr = skb_header_pointer(skb, dataoff, sizeof(_esphdr), &_esphdr);
+	if (!esphdr) {
+		/* try to behave like "nf_conntrack_proto_generic" */
+		tuple->src.u.all = 0;
+		tuple->dst.u.all = 0;
+		return true;
+	}
+	spi = ntohl(esphdr->spi);
+
+	/* Check if esphdr already associated with a pre-existing connection:
+	 *   if no, create a new connection, missing the r_spi;
+	 *   if yes, check if we have seen the source IP:
+	 *             if no, fill in r_spi in the pre-existing connection.
+	 */
+	esp_entry = search_esp_entry_by_spi(net, spi, tuple->src.l3num,
+					    &tuple->src.u3, &tuple->dst.u3);
+	if (!esp_entry) {
+		struct _esp_hkey key = {};
+		const union nf_inet_addr any = { .in6 = IN6ADDR_ANY_INIT };
+		u32 net_hmix = net_hash_mix(net);
+		struct nf_esp_net *esp_net = esp_pernet(net);
+		struct _esp_entry *esp_entry_old;
+		int err;
+
+		esp_entry = alloc_esp_entry(net);
+		if (!esp_entry) {
+			pr_debug("All esp connection slots in use\n");
+			return false;
+		}
+		esp_entry->l_spi = spi;
+		esp_entry->l3num = tuple->src.l3num;
+		esp_ip_addr_copy(esp_entry->l3num, &esp_entry->l_ip, &tuple->src.u3);
+		esp_ip_addr_copy(esp_entry->l3num, &esp_entry->r_ip, &tuple->dst.u3);
+
+		/* Add entries to the hash tables */
+
+		calculate_key(net_hmix, esp_entry->l_spi, esp_entry->l3num, &esp_entry->l_ip,
+			      &esp_entry->r_ip, &key);
+		esp_entry_old = rhashtable_lookup_get_insert_key(&ltable, &key, &esp_entry->lnode,
+								 ltable_params);
+		if (esp_entry_old) {
+			spin_lock(&esp_net->id_list_lock);
+
+			if (IS_ERR(esp_entry_old)) {
+				free_esp_entry_by_id(net, esp_entry->esp_id);
+				spin_unlock(&esp_net->id_list_lock);
+				return false;
+			}
+
+			free_esp_entry_by_id(net, esp_entry->esp_id);
+			spin_unlock(&esp_net->id_list_lock);
+
+			/* insertion raced, use existing entry */
+			esp_entry = esp_entry_old;
+		}
+		/* esp_entry_old == NULL -- insertion successful */
+
+		calculate_key(net_hmix, 0, esp_entry->l3num, &any, &esp_entry->r_ip, &key);
+		err = rhltable_insert_key(&incmpl_rtable, (const void *)&key,
+					  &esp_entry->incmpl_rlist, incmpl_rtable_params);
+		if (err) {
+			spin_lock(&esp_net->id_list_lock);
+			free_esp_entry_by_id(net, esp_entry->esp_id);
+			spin_unlock(&esp_net->id_list_lock);
+			return false;
+		}
+
+		if (esp_entry->l3num == AF_INET) {
+			pr_debug("New entry %x with l_spi %x l_ip %pI4 r_ip %pI4\n",
+				 esp_entry->esp_id, esp_entry->l_spi,
+				 &esp_entry->l_ip.in, &esp_entry->r_ip.in);
+		} else {
+			pr_debug("New entry %x with l_spi %x l_ip %pI6 r_ip %pI6\n",
+				 esp_entry->esp_id, esp_entry->l_spi,
+				 &esp_entry->l_ip.in6, &esp_entry->r_ip.in6);
+		}
+	}
+
+	tuple->dst.u.esp.id = esp_entry->esp_id;
+	tuple->src.u.esp.id = esp_entry->esp_id;
+	return true;
+}
+
+#ifdef CONFIG_NF_CONNTRACK_PROCFS
+/* print private data for conntrack */
+static void esp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
+{
+	seq_printf(s, "l_spi=%x, r_spi=%x ", ct->proto.esp.l_spi, ct->proto.esp.r_spi);
+}
+#endif
+
+/* Returns verdict for packet, and may modify conntrack */
+int nf_conntrack_esp_packet(struct nf_conn *ct, struct sk_buff *skb,
+			    unsigned int dataoff,
+			    enum ip_conntrack_info ctinfo,
+			    const struct nf_hook_state *state)
+{
+	int esp_id;
+	struct nf_conntrack_tuple *tuple;
+	unsigned int *timeouts = nf_ct_timeout_lookup(ct);
+	struct nf_esp_net *esp_net = esp_pernet(nf_ct_net(ct));
+
+	if (!timeouts)
+		timeouts = esp_net->esp_timeouts;
+
+	/* If we've seen traffic both ways, this is some kind of ESP
+	 * stream.  Extend timeout.
+	 */
+	if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+		nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[ESP_CT_REPLIED]);
+		/* Also, more likely to be important, and not a probe */
+		if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) {
+			/* Was originally IPCT_STATUS but this is no longer an option.
+			 * GRE uses assured for same purpose
+			 */
+			nf_conntrack_event_cache(IPCT_ASSURED, ct);
+
+			/* Retrieve SPIs of original and reply from esp_entry.
+			 * Both directions should contain the same esp_entry,
+			 * so just check the first one.
+			 */
+			tuple = nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL);
+
+			esp_id = tuple->src.u.esp.id;
+			if (esp_id >= TEMP_SPI_START && esp_id <= TEMP_SPI_MAX) {
+				struct _esp_entry *esp_entry;
+
+				spin_lock(&esp_net->id_list_lock);
+				esp_entry = find_esp_entry_by_id(esp_net, esp_id);
+				spin_unlock(&esp_net->id_list_lock);
+
+				if (esp_entry) {
+					ct->proto.esp.l_spi = esp_entry->l_spi;
+					ct->proto.esp.r_spi = esp_entry->r_spi;
+				}
+			}
+		}
+	} else {
+		nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[ESP_CT_UNREPLIED]);
+	}
+
+	return NF_ACCEPT;
+}
+
+void nf_ct_esp_pernet_flush(struct net *net)
+{
+	struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id);
+	struct nf_esp_net *esp_net = esp_pernet(net);
+	struct list_head *pos, *tmp, *head = &esp_net->id_list;
+	struct _esp_entry *esp_entry;
+
+	spin_lock(&esp_net->id_list_lock);
+	list_for_each_safe(pos, tmp, head) {
+		esp_entry = list_entry(pos, struct _esp_entry, net_node);
+		free_esp_entry(cnet, esp_entry);
+	}
+	spin_unlock(&esp_net->id_list_lock);
+}
+
+/* Called when a conntrack entry has already been removed from the hashes
+ * and is about to be deleted from memory
+ */
+void destroy_esp_conntrack_entry(struct nf_conn *ct)
+{
+	struct nf_conntrack_tuple *tuple;
+	enum ip_conntrack_dir dir;
+	int esp_id;
+	struct net *net = nf_ct_net(ct);
+	struct nf_esp_net *esp_net = esp_pernet(net);
+
+	/* Probably all the ESP entries referenced in this connection are the same,
+	 * but the free function handles repeated frees, so best to do them all.
+	 */
+	for (dir = IP_CT_DIR_ORIGINAL; dir < IP_CT_DIR_MAX; dir++) {
+		tuple = nf_ct_tuple(ct, dir);
+
+		spin_lock(&esp_net->id_list_lock);
+
+		esp_id = tuple->src.u.esp.id;
+		if (esp_id >= TEMP_SPI_START && esp_id <= TEMP_SPI_MAX)
+			free_esp_entry_by_id(net, esp_id);
+		tuple->src.u.esp.id = 0;
+
+		esp_id = tuple->dst.u.esp.id;
+		if (esp_id >= TEMP_SPI_START && esp_id <= TEMP_SPI_MAX)
+			free_esp_entry_by_id(net, esp_id);
+		tuple->dst.u.esp.id = 0;
+
+		spin_unlock(&esp_net->id_list_lock);
+	}
+}
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
+static int esp_tuple_to_nlattr(struct sk_buff *skb,
+			       const struct nf_conntrack_tuple *t)
+{
+	if (nla_put_be16(skb, CTA_PROTO_SRC_ESP_ID, t->src.u.esp.id) ||
+	    nla_put_be16(skb, CTA_PROTO_DST_ESP_ID, t->dst.u.esp.id))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static const struct nla_policy esp_nla_policy[CTA_PROTO_MAX + 1] = {
+	[CTA_PROTO_SRC_ESP_ID] = { .type = NLA_U16 },
+	[CTA_PROTO_DST_ESP_ID] = { .type = NLA_U16 },
+};
+
+static int esp_nlattr_to_tuple(struct nlattr *tb[],
+			       struct nf_conntrack_tuple *t,
+			       u32 flags)
+{
+	if (flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_ESP_ID)) {
+		if (!tb[CTA_PROTO_SRC_ESP_ID])
+			return -EINVAL;
+
+		t->src.u.esp.id = nla_get_be16(tb[CTA_PROTO_SRC_ESP_ID]);
+	}
+
+	if (flags & CTA_FILTER_FLAG(CTA_PROTO_DST_ESP_ID)) {
+		if (!tb[CTA_PROTO_DST_ESP_ID])
+			return -EINVAL;
+
+		t->dst.u.esp.id = nla_get_be16(tb[CTA_PROTO_DST_ESP_ID]);
+	}
+
+	return 0;
+}
+
+static unsigned int esp_nlattr_tuple_size(void)
+{
+	return nla_policy_len(esp_nla_policy, CTA_PROTO_MAX + 1);
+}
+#endif
+
+/* protocol helper struct */
+const struct nf_conntrack_l4proto nf_conntrack_l4proto_esp = {
+	.l4proto = IPPROTO_ESP,
+#ifdef CONFIG_NF_CONNTRACK_PROCFS
+	.print_conntrack = esp_print_conntrack,
+#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+	.tuple_to_nlattr = esp_tuple_to_nlattr,
+	.nlattr_tuple_size = esp_nlattr_tuple_size,
+	.nlattr_to_tuple = esp_nlattr_to_tuple,
+	.nla_policy = esp_nla_policy,
+#endif
+};
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index c6c0cb465664..7922ff6cf5a4 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -88,6 +88,14 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
 			   ntohs(tuple->src.u.gre.key),
 			   ntohs(tuple->dst.u.gre.key));
 		break;
+	case IPPROTO_ESP:
+		/* Both src and dest esp.id should be equal but showing both
+		 * will help find errors.
+		 */
+		seq_printf(s, "srcid=0x%x dstid=0x%x ",
+			   ntohs(tuple->src.u.esp.id),
+			   ntohs(tuple->dst.u.esp.id));
+		break;
 	default:
 		break;
 	}
diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h
index 832ae64179f0..4fd8956aec65 100644
--- a/net/netfilter/nf_internals.h
+++ b/net/netfilter/nf_internals.h
@@ -19,7 +19,9 @@
 #define CTA_FILTER_F_CTA_PROTO_ICMPV6_TYPE	(1 << 9)
 #define CTA_FILTER_F_CTA_PROTO_ICMPV6_CODE	(1 << 10)
 #define CTA_FILTER_F_CTA_PROTO_ICMPV6_ID	(1 << 11)
-#define CTA_FILTER_F_MAX			(1 << 12)
+#define CTA_FILTER_F_CTA_PROTO_SRC_ESP_ID	(1 << 12)
+#define CTA_FILTER_F_CTA_PROTO_DST_ESP_ID	(1 << 13)
+#define CTA_FILTER_F_MAX			(1 << 14)
 #define CTA_FILTER_F_ALL			(CTA_FILTER_F_MAX-1)
 #define CTA_FILTER_FLAG(ctattr) CTA_FILTER_F_ ## ctattr
 
diff --git a/tools/testing/selftests/netfilter/conntrack_esp_related.sh b/tools/testing/selftests/netfilter/conntrack_esp_related.sh
new file mode 100755
index 000000000000..88b0f164664f
--- /dev/null
+++ b/tools/testing/selftests/netfilter/conntrack_esp_related.sh
@@ -0,0 +1,268 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# <:copyright-gpl
+# Copyright (C) 2021 Allied Telesis Labs NZ
+#
+# check that related ESP connections are tracked via spi.
+#
+# Setup is:
+#
+# nsclient3(veth0) -> (veth2)
+#                            (br0)nsrouter1(veth1) -> (veth1)nsrouter2 -> (veth0)nsclient2
+# nsclient1(veth0) -> (veth0)
+# Setup xfrm esp connections for IPv4 and IPv6 and check they are tracked.
+#
+# In addition, nsrouter1 will perform IP masquerading. If nsrouter1 does not support esp
+# connection tracking, it will be unable to tell the difference between packets from nsclient2 to
+# either nsclient1 or nsclient3.
+#
+# ESP connections (for IPv6) need to use tunnel mode, as ICMPv6 computes checksum over encapsulating
+# IP header addresses.
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+ret=0
+ns_all="nsclient1 nsclient3 nsrouter1 nsrouter2 nsclient2"
+
+conntrack -V > /dev/null 2>&1
+if [ $? -ne 0 ];then
+	echo "SKIP: Could not run test without conntrack tool"
+	exit $ksft_skip
+fi
+
+nft --version > /dev/null 2>&1
+if [ $? -ne 0 ];then
+	echo "SKIP: Could not run test without nft tool"
+	exit $ksft_skip
+fi
+
+ip -Version > /dev/null 2>&1
+if [ $? -ne 0 ];then
+	echo "SKIP: Could not run test without ip tool"
+	exit $ksft_skip
+fi
+
+ipv4() {
+	echo -n 192.168.$1.$2
+}
+
+ipv6 () {
+	echo -n dead:$1::$2
+}
+
+cleanup() {
+	for n in $ns_all; do ip netns del $n;done
+}
+
+check_counter()
+{
+	local ns_name=$1
+	local name="unknown"
+	local expect="packets 0 bytes 0"
+	local lret=0
+
+	cnt=$(ip netns exec $ns_name nft list counter inet filter "$name" | grep -q "$expect")
+	if [ $? -ne 0 ]; then
+		echo "ERROR: counter $name in $ns_name has unexpected value (expected $expect)" 1>&2
+		ip netns exec $ns_name nft list counter inet filter "$name" 1>&2
+		lret=1
+	fi
+	return $lret
+}
+
+check_unknown()
+{
+	for n in nsrouter1 nsrouter2; do
+		check_counter $n
+		if [ $? -ne 0 ] ;then
+			return 1
+		fi
+	done
+	return 0
+}
+
+check_conntrack()
+{
+	local ret=0
+
+	for p in ipv4 ipv6; do
+		cnt=$(ip netns exec nsrouter1 conntrack -f $p -L 2>&1)
+		# Check tracked connection was esp by port (conntrack shows unknown at the moment)
+		local num=$(echo -e "$cnt" | grep -cE "[a-zA-Z]+ +50")
+		if [ $? -ne 0 ] || [ "x$num" != "x2" ]; then
+			echo -e "ERROR: expect to see two conntrack esp flows for $p:\n $cnt" 1>&2
+			ret=1
+		fi
+	done
+	return $ret
+}
+
+for n in $ns_all; do
+	ip netns add $n
+	ip -net $n link set lo up
+done
+
+ip link add veth0 netns nsclient1 type veth peer name veth0 netns nsrouter1
+ip link add veth0 netns nsclient3 type veth peer name veth2 netns nsrouter1
+ip link add br0 netns nsrouter1 type bridge
+ip -net nsrouter1 link set veth0 master br0
+ip -net nsrouter1 link set veth2 master br0
+ip link add veth1 netns nsrouter1 type veth peer name veth1 netns nsrouter2
+ip link add veth0 netns nsrouter2 type veth peer name veth0 netns nsclient2
+
+for n in $ns_all; do
+	ip -net $n link set veth0 up
+done
+ip -net nsrouter1 link set veth1 up
+ip -net nsrouter1 link set veth2 up
+ip -net nsrouter1 link set br0 up
+ip -net nsrouter2 link set veth1 up
+
+for i in 1 2; do
+	ip -net nsclient$i addr add $(ipv4 $i 2)/24 dev veth0
+	ip -net nsclient$i addr add $(ipv6 $i 2)/64 dev veth0
+	ip -net nsclient$i route add default via $(ipv4 $i 1)
+	ip -net nsclient$i -6 route add default via $(ipv6 $i 1)
+
+	ip -net nsrouter$i addr add $(ipv4 3 $i)/24 dev veth1
+	ip -net nsrouter$i addr add $(ipv6 3 $i)/64 dev veth1
+done
+ip -net nsrouter1 addr add $(ipv4 1 1)/24 dev br0
+ip -net nsrouter1 addr add $(ipv6 1 1)/64 dev br0
+ip -net nsrouter2 addr add $(ipv4 2 1)/24 dev veth0
+ip -net nsrouter2 addr add $(ipv6 2 1)/64 dev veth0
+
+ip -net nsclient3 addr add $(ipv4 1 3)/24 dev veth0
+ip -net nsclient3 addr add $(ipv6 1 3)/64 dev veth0
+ip -net nsclient3 route add default via $(ipv4 1 1)
+ip -net nsclient3 -6 route add default via $(ipv6 1 1)
+
+ip -net nsrouter1 route add default via $(ipv4 3 2)
+ip -net nsrouter1 -6 route add default via $(ipv6 3 2)
+ip -net nsrouter2 route add default via $(ipv4 3 1)
+ip -net nsrouter2 -6 route add default via $(ipv6 3 1)
+
+for i in 1 2; do
+	ip netns exec nsrouter$i sysctl -q net.ipv4.conf.all.forwarding=1
+	ip netns exec nsrouter$i sysctl -q net.ipv6.conf.all.forwarding=1
+done
+
+for i in 1 2; do
+	ip netns exec nsrouter$i nft -f - <<-EOF
+	table inet filter {
+		counter unknown { }
+		chain forward {
+			type filter hook forward priority 0; policy accept;
+			meta l4proto esp ct state new,established accept
+			counter name "unknown" accept
+		}
+	}
+	EOF
+done
+
+for i in 1 2; do
+	ip netns exec nsrouter1 nft -f - <<-EOF
+	table ip nat {
+		chain postrouting {
+			type nat hook postrouting priority 0; policy accept;
+		oifname "veth1" counter masquerade
+		}
+	}
+	table ip6 nat {
+		chain postrouting {
+			type nat hook postrouting priority 0; policy accept;
+		oifname "veth1" counter masquerade
+		}
+	}
+	EOF
+done
+sleep 2
+
+ip_tunnel() {
+	ip -net nsclient$2 tunnel add tunnel$1 mode vti${1%4} local $3 remote $4 key 0x$1
+	ip -net nsclient$2 link set tunnel$1 up
+}
+
+ip_xfrm() {
+	ip -net nsclient$2 xfrm state add src $4 dst $5 \
+	 proto esp spi 0x$1$2$3 mode tunnel mark 0x$1 \
+	 sel src $6 dst $7 \
+	 auth-trunc 'hmac(sha256)' \
+	  0x0000000000000000000000000000000000000000000000000000000000000$1$2$3 128 \
+	 enc 'cbc(aes)' \
+	  0x0000000000000000000000000000000000000000000000000000000000000$1$2$3
+
+	ip -net nsclient$2 xfrm state add src $5 dst $4 \
+	 proto esp spi 0x$1$3$2 mode tunnel mark 0x$1 \
+	 sel src $7 dst $6 \
+	 auth-trunc 'hmac(sha256)' \
+	  0x0000000000000000000000000000000000000000000000000000000000000$1$3$2 128 \
+	 enc 'cbc(aes)' \
+	  0x0000000000000000000000000000000000000000000000000000000000000$1$3$2
+
+	ip -net nsclient$2 xfrm policy add src $7 dst $6 dir in mark 0x$1 \
+	 tmpl src $5 dst $4 proto esp mode tunnel
+	ip -net nsclient$2 xfrm policy add src $6 dst $7 dir out mark 0x$1 \
+	 tmpl src $4 dst $5 proto esp mode tunnel
+}
+
+ip_tunnel 4 1 $(ipv4 1 2) $(ipv4 2 2)
+ip -net nsclient1 addr add $(ipv4 250 1)/24 dev tunnel4
+ip_xfrm 4 1 2 $(ipv4 1 2) $(ipv4 2 2) $(ipv4 250 1) $(ipv4 250 2)
+
+ip_tunnel 4 3 $(ipv4 1 3) $(ipv4 2 2)
+ip -net nsclient3 addr add $(ipv4 251 1)/24 dev tunnel4
+ip_xfrm 4 3 2 $(ipv4 1 3) $(ipv4 2 2) $(ipv4 251 1) $(ipv4 251 2)
+
+ip_tunnel 4 2 $(ipv4 2 2) $(ipv4 3 1)
+ip -net nsclient2 addr add $(ipv4 250 2)/24 dev tunnel4
+ip -net nsclient2 addr add $(ipv4 251 2)/24 dev tunnel4
+ip_xfrm 4 2 1 $(ipv4 2 2) $(ipv4 3 1) $(ipv4 250 2) $(ipv4 250 1)
+ip_xfrm 4 2 3 $(ipv4 2 2) $(ipv4 3 1) $(ipv4 251 2) $(ipv4 251 1)
+
+
+ip_tunnel 6 1 $(ipv6 1 2) $(ipv6 2 2)
+ip -net nsclient1 addr add $(ipv6 250 1)/64 dev tunnel6
+ip_xfrm 6 1 2 $(ipv6 1 2) $(ipv6 2 2) $(ipv6 250 1) $(ipv6 250 2)
+
+ip_tunnel 6 3 $(ipv6 1 3) $(ipv6 2 2)
+ip -net nsclient3 addr add $(ipv6 251 1)/64 dev tunnel6
+ip_xfrm 6 3 2 $(ipv6 1 3) $(ipv6 2 2) $(ipv6 251 1) $(ipv6 251 2)
+
+ip_tunnel 6 2 $(ipv6 2 2) $(ipv6 3 1)
+ip -net nsclient2 addr add $(ipv6 250 2)/64 dev tunnel6
+ip -net nsclient2 addr add $(ipv6 251 2)/64 dev tunnel6
+ip_xfrm 6 2 1 $(ipv6 2 2) $(ipv6 3 1) $(ipv6 250 2) $(ipv6 250 1)
+ip_xfrm 6 2 3 $(ipv6 2 2) $(ipv6 3 1) $(ipv6 251 2) $(ipv6 251 1)
+
+test_ping() {
+	ip netns exec $1 ping -q -c 1 $2 >/dev/null 2>&1
+	if [ $? -ne 0 ]; then
+		echo "ERROR: netns ip routing/connectivity broken from $1 to $2" 1>&2
+	fi
+}
+
+test_ping nsclient1 $(ipv4 250 2)
+test_ping nsclient3 $(ipv4 251 2)
+test_ping nsclient1 $(ipv6 250 2)
+test_ping nsclient3 $(ipv6 251 2)
+
+check_conntrack
+if [ $? -ne 0 ]; then
+	ret=1
+fi
+
+check_unknown
+if [ $? -ne 0 ]; then
+	ret=1
+fi
+
+if [ $ret -eq 0 ];then
+	echo "PASS: ESP connections were tracked via SPIs"
+else
+	echo "ERROR: ESP connections were not tracked"
+fi
+
+cleanup
+exit $ret
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH v3] netfilter: nf_conntrack: Add conntrack helper for ESP/IPsec
  2021-05-03  1:06         ` [PATCH v3] " Cole Dishington
@ 2021-05-04 19:22           ` Florian Westphal
  2021-05-05 11:10           ` Florian Westphal
  1 sibling, 0 replies; 10+ messages in thread
From: Florian Westphal @ 2021-05-04 19:22 UTC (permalink / raw)
  To: Cole Dishington
  Cc: fw, Pablo Neira Ayuso, Jozsef Kadlecsik, David S. Miller,
	Jakub Kicinski, Shuah Khan, open list, open list:NETFILTER,
	open list:NETFILTER, open list:NETWORKING [GENERAL],
	open list:KERNEL SELFTEST FRAMEWORK

Cole Dishington <Cole.Dishington@alliedtelesis.co.nz> wrote:
> Introduce changes to add ESP connection tracking helper to netfilter
> conntrack. The connection tracking of ESP is based on IPsec SPIs. The
> underlying motivation for this patch was to allow multiple VPN ESP
> clients to be distinguished when using NAT.
> 
> Added config flag CONFIG_NF_CT_PROTO_ESP to enable the ESP/IPsec
> conntrack helper.
> 
> Signed-off-by: Cole Dishington <Cole.Dishington@alliedtelesis.co.nz>
> ---
> 
> Notes:
>     Thanks for your time reviewing!
>
>     Q.
>     > +static int esp_tuple_to_nlattr(struct sk_buff *skb,
>     > +                            const struct nf_conntrack_tuple *t)
>     > +{
>     > +     if (nla_put_be16(skb, CTA_PROTO_SRC_ESP_ID, t->src.u.esp.id) ||
>     > +         nla_put_be16(skb, CTA_PROTO_DST_ESP_ID, t->dst.u.esp.id))
>     > +             goto nla_put_failure;
>     
>     This exposes the 16 bit kernel-generated IDs, right?
>     Should this dump the real on-wire SPIs instead?
>     
>     Or is there are reason why the internal IDs need exposure?
>     
>     A.
>     I think I need to expose the internal esp ids here due to esp_nlattr_to_tuple().
>     If esp id was changed to real SPIs here I would be unable to lookup the correct
>     tuple (without IP addresses too).

Oh, right. I keep forgetting the ESP tracker hooks into the tuple
creation function.  In that case I think it would be good to include the
raw/on-wire ESP IDs as well when dumping so conntrack -L can print them.

The internal numbers are seemingly pointless, except that you need them
to populate the tuple, and can't obtain the internal numbers based on
the on-wire ESP ID.

The only other solution I see is to check presence of
CTA_IP_DST|CTA_IP_SRC in 'flags', then take the ip addresses from the
provided tuple, and do a lookup in the rhashtable with the addresses
and the raw esp values.

Obvious downside: This will force users to provide the ip address as
well, search based on ESP value alone won't work anymore.

[ Unless a full table walk is done, but that might be ambiguous
  without the ip addresses, as on-wire ESP may not be unique ].

>    changes in v3:
>     - Flush all esp entries for a given netns on nf_conntrack_proto_pernet_fini
>     - Replace _esp_table (and its spinlock) shared over netns with per netns linked lists and bitmap (for esp ids)
>     - Init IPv6 any address with IN6ADDR_ANY_INIT rather than ipv6_addr_set()
>     - Change l3num on hash key from u16 to u8
>     - Add selftests file for testing tracker with ipv4 and ipv6

Thanks for this.  Can you place the selftest in a 2/2 patch in v4?

checkpatch doesn't like some whitespace, but I did not see any critical
warnings.

> diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
> index 439379ca9ffa..4011be8c5e39 100644
> --- a/include/net/netfilter/nf_conntrack.h
> @@ -47,6 +49,10 @@ struct nf_conntrack_net {
>  	unsigned int users4;
>  	unsigned int users6;
>  	unsigned int users_bridge;
> +
> +#ifdef CONFIG_NF_CT_PROTO_ESP
> +	DECLARE_BITMAP(esp_id_map, 1024);

Can we avoid the magic number?

Either make this 1024 and then have the esp.c file use a define
based on ARRAY_SIZE + BITS_PER_TYPE to 'recompute' the 1024 (or whatever
the exact size give is), or add a #define and use that for the bitmap
declaration, then use that consistently in the esp.c file.

Or come up with an alternative solution.

>  #include <linux/types.h>
> diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
> diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
> index 806454e767bf..43cd1e78f790 100644
> --- a/include/net/netns/conntrack.h
> +++ b/include/net/netns/conntrack.h
> @@ -69,6 +69,20 @@ struct nf_gre_net {
>  };
>  #endif
>  
> +#ifdef CONFIG_NF_CT_PROTO_ESP
> +enum esp_conntrack {
> +	ESP_CT_UNREPLIED,
> +	ESP_CT_REPLIED,
> +	ESP_CT_MAX
> +};
> +
> +struct nf_esp_net {
> +	spinlock_t id_list_lock;
> +	struct list_head id_list;

Can you place the list_id/_lock in nf_conntrack_net structure?

The nf_esp_net is placed in 'struct net', the other one is allocated
only when the conntrack module is loaded.

> +	unsigned int esp_timeouts[ESP_CT_MAX];

This is fine.

>  	CTA_PROTO_ICMPV6_TYPE,
>  	CTA_PROTO_ICMPV6_CODE,
> +	CTA_PROTO_SRC_ESP_ID,
> +	CTA_PROTO_DST_ESP_ID,

See above, if the internal IDs have to be exposed,
this should be something like:

> +	CTA_PROTO_SRC_ESP_ID,
> +	CTA_PROTO_DST_ESP_ID,
> +	CTA_PROTO_SRC_ESP_SPI,
> +	CTA_PROTO_DST_ESP_SPI,

... with the latter two exposing the __be32 of the ESP tunnel.
You could also just re-use existing CTA_SRC_PORT/DST_PORT for the
internal esp ids given that ESP has no ports.

I will leave that up to you though, we don't have to avoid new
enums here.

> +#ifdef CONFIG_NF_CT_PROTO_ESP
> +	ret = nf_conntrack_esp_init();
> +	if (ret < 0)
> +		goto cleanup_sockopt;

Ouch, thats a bug in the existing code, I will send a patch.

> diff --git a/net/netfilter/nf_conntrack_proto_esp.c b/net/netfilter/nf_conntrack_proto_esp.c
> new file mode 100644
> index 000000000000..1bc0cb879bfd
> --- /dev/null
> +++ b/net/netfilter/nf_conntrack_proto_esp.c
> @@ -0,0 +1,741 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * <:copyright-gpl
> + * Copyright 2008 Broadcom Corp. All Rights Reserved.
> + * Copyright (C) 2021 Allied Telesis Labs NZ
> + *
> + * This program is free software; you can distribute it and/or modify it
> + * under the terms of the GNU General Public License (Version 2) as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
> + * for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program.
> + * :>

The SPDX tag is enough, the GPL boilerplate can be removed.

> +static void free_esp_entry(struct nf_conntrack_net *cnet, struct _esp_entry *esp_entry)
> +{
> +	if (esp_entry) {
> +		/* Remove from all the hash tables */
> +		pr_debug("Removing entry %x from all tables", esp_entry->esp_id);
> +		list_del(&esp_entry->net_node);
> +		rhashtable_remove_fast(&ltable, &esp_entry->lnode, ltable_params);
> +		rhashtable_remove_fast(&rtable, &esp_entry->rnode, rtable_params);
> +		rhltable_remove(&incmpl_rtable, &esp_entry->incmpl_rlist, incmpl_rtable_params);
> +		clear_bit(esp_entry->esp_id - TEMP_SPI_START, cnet->esp_id_map);
> +		kfree(esp_entry);
> +	}
> +}
> +
> +/* Free an entry referred to by esp_id.
> + *
> + * NOTE:
> + * Per net linked list locking and unlocking is the responsibility of the calling function.

Why? I think it makes more sense to lock/unlock in free_esp_entry, when
the 'list_del(->net_node)' is done.

> + * NOTE: This function may block on per net list lock.

Is that important?  Its a spinlock, so noone should hold it for
long period. search_esp_entry_by_spi() has same comment and in that
case its not used in most cases.

> +	spin_lock(&esp_net->id_list_lock);
> +	list_add(&esp_entry->net_node, &esp_net->id_list);

spin_lock_bh() ?

> +		err = rhashtable_insert_fast(&rtable, &esp_entry->rnode, rtable_params);
> +		if (err) {
> +			struct nf_esp_net *esp_net = esp_pernet(net);
> +
> +			spin_lock(&esp_net->id_list_lock);

spin_lock_bh?  And why does the entire free_esp_entry_by_id() need this
lock?

> +		}
> +		esp_entry->l_spi = spi;
> +		esp_entry->l3num = tuple->src.l3num;
> +		esp_ip_addr_copy(esp_entry->l3num, &esp_entry->l_ip, &tuple->src.u3);
> +		esp_ip_addr_copy(esp_entry->l3num, &esp_entry->r_ip, &tuple->dst.u3);
> +
> +		/* Add entries to the hash tables */
> +
> +		calculate_key(net_hmix, esp_entry->l_spi, esp_entry->l3num, &esp_entry->l_ip,
> +			      &esp_entry->r_ip, &key);
> +		esp_entry_old = rhashtable_lookup_get_insert_key(&ltable, &key, &esp_entry->lnode,
> +								 ltable_params);
> +		if (esp_entry_old) {
> +			spin_lock(&esp_net->id_list_lock);
> +
> +			if (IS_ERR(esp_entry_old)) {
> +				free_esp_entry_by_id(net, esp_entry->esp_id);
> +				spin_unlock(&esp_net->id_list_lock);
> +				return false;
> +			}
> +
> +			free_esp_entry_by_id(net, esp_entry->esp_id);

This looks weird.  Both branches contain the same free_esp_entry_by_id() call.

I suspect this should be something like this:

	if (esp_entry_old) {
		free_esp_entry(net, esp_entry);

		if (IS_ERR(esp_entry_old))
			return false;

		esp_entry = esp_entry_old;

... because we want to remove the entry we allocated right before in the
same function, so why would be have to search by id?

> +		}
> +		/* esp_entry_old == NULL -- insertion successful */

Probably better to avoid this comment, and, if needed, add a 'insertion
raced, other CPU added same entry' or similar, in the if (esp_entry_old)
case.

Up to you.

> +/* Returns verdict for packet, and may modify conntrack */
> +int nf_conntrack_esp_packet(struct nf_conn *ct, struct sk_buff *skb,
> +			    unsigned int dataoff,
> +			    enum ip_conntrack_info ctinfo,
> +			    const struct nf_hook_state *state)
> +{
> +	int esp_id;
> +	struct nf_conntrack_tuple *tuple;
> +	unsigned int *timeouts = nf_ct_timeout_lookup(ct);
> +	struct nf_esp_net *esp_net = esp_pernet(nf_ct_net(ct));
> +
> +	if (!timeouts)
> +		timeouts = esp_net->esp_timeouts;
> +
> +	/* If we've seen traffic both ways, this is some kind of ESP
> +	 * stream.  Extend timeout.
> +	 */
> +	if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
> +		nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[ESP_CT_REPLIED]);
> +		/* Also, more likely to be important, and not a probe */
> +		if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) {
> +			/* Was originally IPCT_STATUS but this is no longer an option.
> +			 * GRE uses assured for same purpose
> +			 */

Please remove the above comment, almost noone remembers what
IPCT_STATUS was 8-)

> +			esp_id = tuple->src.u.esp.id;
> +			if (esp_id >= TEMP_SPI_START && esp_id <= TEMP_SPI_MAX) {
> +				struct _esp_entry *esp_entry;
> +
> +				spin_lock(&esp_net->id_list_lock);
> +				esp_entry = find_esp_entry_by_id(esp_net, esp_id);

There should be no list walk from packet path.  I would suggest to add
another rhashtable for this, or switch entire id allocation to IDR.

If you go for idr, you can place the idr root in nf_esp_net since its
going to be used in lookup path too.  idr can be used with rcu (for
read accesses).

This is my mistake, I did not realize the need for per-id lookup via
this list, I thought the existing rhashtables already covered this.

I thought the only need for this list was to quickly remove all the
allocated entries on netns teardown.

[ walking a (shared across all netns) rhashtable on netns destruction
  can be expensive ]

> +void nf_ct_esp_pernet_flush(struct net *net)
> +{
> +	struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id);
> +	struct nf_esp_net *esp_net = esp_pernet(net);
> +	struct list_head *pos, *tmp, *head = &esp_net->id_list;
> +	struct _esp_entry *esp_entry;
> +
> +	spin_lock(&esp_net->id_list_lock);
> +	list_for_each_safe(pos, tmp, head) {

list_for_each_entry_safe()?

> +		esp_entry = list_entry(pos, struct _esp_entry, net_node);
> +		free_esp_entry(cnet, esp_entry);
> +	}
> +	spin_unlock(&esp_net->id_list_lock);

I think it would be better to move the list lock/unlock to only cover
the single list_del operation to make it clear that this lock only
guards the list and nothing else.

> +/* Called when a conntrack entry has already been removed from the hashes
> + * and is about to be deleted from memory
> + */
> +void destroy_esp_conntrack_entry(struct nf_conn *ct)
> +{
> +	struct nf_conntrack_tuple *tuple;
> +	enum ip_conntrack_dir dir;
> +	int esp_id;
> +	struct net *net = nf_ct_net(ct);
> +	struct nf_esp_net *esp_net = esp_pernet(net);

Nit: some people (not me) are very obsessed with reverse xmas tree
ordering, i.e.

	struct nf_esp_net *esp_net = esp_pernet(net);
	struct net *net = nf_ct_net(ct);
	struct nf_conntrack_tuple *tuple;
	enum ip_conntrack_dir dir;
	int esp_id;

In addition, could you please use 'unsigned' except when you need to
store numbers < 0?

> +	 * but the free function handles repeated frees, so best to do them all.
> +	 */
> +	for (dir = IP_CT_DIR_ORIGINAL; dir < IP_CT_DIR_MAX; dir++) {
> +		tuple = nf_ct_tuple(ct, dir);
> +
> +		spin_lock(&esp_net->id_list_lock);
> +
> +		esp_id = tuple->src.u.esp.id;
> +		if (esp_id >= TEMP_SPI_START && esp_id <= TEMP_SPI_MAX)
> +			free_esp_entry_by_id(net, esp_id);

I find this repeated use of esp_id >= && <= ... confusing. Why is this
needed?

Could you move this down to where you need to protect illegal memory
accesses or similar, so this is just

   free_esp_entry_by_id(net, esp_id) ?

Also, I think it might be better to instead do this:

esp_id_src = tuple->src.u.esp.id;
tuple = nf_ct_tuple(ct, IP_CT_DIR_REPLY);
esp_id_repl = tuple->src.u.esp.id;

free_esp_entry_by_id(net, esp_id_orig);
if (esp_id_orig != esp_id_repl)
  free_esp_entry_by_id(net, esp_id_repl);

This avoids race with esp_id reallocation after the first
clear_bit().

I wonder if it would be possible to change how the assocation of the
reverse direction works.

1. Only use on-wire SPI value (no internal 'esp id' allocation anymore)

2. Use expectation infrastructure to associate the 'reverse' direction
   with the existing/orignal conntrack entry instead.

For ORIGINAL, the ESP Id gets split in two 16 bit halves, stored in the tuple
(so instead of tcp sport x dport y), it would be spi & 0xffff  && spi << 16.

This avoids size increase of the tuple, and we don't change the way tuples work,
i.e. all contents of the tuple are derived from packet header fields.

1. esp_pkt_to_tuple extracts the 32bit SPI, fills the tuple
   with the two 16bit halves.

2. esp_pkt_to_tuple does SPI lookup (just like now).
   found full match -> done

3. no match?
   Basically do what search_esp_entry_init_remote() does, i.e.
   check if its a 'reverse direction'.

Otherwise, assume the new connection is the reverse tunnel of an existing
connection.

Add an expectation for this connection, so it will be picked up as RELATED
rather than new. 

As far as I can see, all that is needed for the expectations can be
found using info stored in the rhashtable(s).

nf_conntrack_find_get() finds us the existing conntrack entry
that we need to pass to nf_ct_expect_alloc().

The needed tuple could just be stored in the rhltable that gets
searched in search_esp_entry_init_remote().

Everything else we need for nf_ct_expect_init() is already present
in the tuple.

This creates a related conntrack entry that is linked to the existing
connection.

Do you see anything that prevents this from working or has other,
undesireable properties vs. the existing proposal?

This would also be easier to review, since it could be layered:

First patch would just add a really simple ESP tracker that just
extracts the SPI, without any rhashtable usage.

Just enough code to make it so 'conntrack -L' or the older /proc file show each
address/spi as independent connection, rather than just the single
'generic' entry.

Then, add the rhashtable code and the hooks you already have to clear
out entries on netns removal and when a conntrack gets destroyed.

Then, in a 3rd patch, add the expectation code, so that a 'new'
connection turns into a related one if the rhtable/rhlist finds
something relevant.

What do you think?

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v3] netfilter: nf_conntrack: Add conntrack helper for ESP/IPsec
  2021-05-03  1:06         ` [PATCH v3] " Cole Dishington
  2021-05-04 19:22           ` Florian Westphal
@ 2021-05-05 11:10           ` Florian Westphal
  1 sibling, 0 replies; 10+ messages in thread
From: Florian Westphal @ 2021-05-05 11:10 UTC (permalink / raw)
  To: Cole Dishington
  Cc: fw, Pablo Neira Ayuso, Jozsef Kadlecsik, David S. Miller,
	Jakub Kicinski, Shuah Khan, open list, open list:NETFILTER,
	open list:NETFILTER, open list:NETWORKING [GENERAL],
	open list:KERNEL SELFTEST FRAMEWORK

Cole Dishington <Cole.Dishington@alliedtelesis.co.nz> wrote:
> +/* esp hdr info to tuple */
> +bool esp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
> +		      struct net *net, struct nf_conntrack_tuple *tuple)
> +{
[..]

> +	tuple->dst.u.esp.id = esp_entry->esp_id;
> +	tuple->src.u.esp.id = esp_entry->esp_id;
> +	return true;
> +}

Did not consider this before, and doesn't matter if we'd follow this
approach or expectation-based solution:

Do we need to be mindful about hole-punching?

The above will automatically treat the incoming (never-seen-before)
ESP packet as being part of the outgoing one, i.e. this will match
ESTABLISHED rule, not NEW.

With expectation based approach, this will auto-match a RELATED rule.

With normal expectations as used by helpers (ftp, sip and so on),
we nowadays don't do such auto-accept schemes anymore but instead
require explicit configuation, e.g. something like

iptables -t raw -p tcp -A PREROUTING -s $allowed  -d $ftpserver -j CT --helper "ftp"

... to make it explicit that the kernel may automatically permit
incoming connection requests to $allowed from $ftpserver.

Do we need to worry about this for ESP too?

If the expectation-based route is taken, another patch could be piled on
top that adds a fake ESP helper, whose only function is to let
esp_pkt_to_tuple() check if the 'outgoing/seen-before' ESP connection
has been configured with the "esp" helper, and then allow the expectation
(or, not allow it in case the existing esp ct doesn't have the esp helper).

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] netfilter: nf_conntrack: Add conntrack helper for ESP/IPsec
  2021-04-14 15:40 ` Florian Westphal
  2021-04-20 22:35   ` Cole Dishington
@ 2021-05-05 12:16   ` Jan Engelhardt
  2021-05-06  2:59     ` Cole Dishington
  1 sibling, 1 reply; 10+ messages in thread
From: Jan Engelhardt @ 2021-05-05 12:16 UTC (permalink / raw)
  To: Florian Westphal
  Cc: Cole Dishington, pablo, kadlec, davem, kuba, linux-kernel,
	netfilter-devel, coreteam, netdev


On Wednesday 2021-04-14 17:40, Florian Westphal wrote:
>
>Preface: AFAIU this tracker aims to 'soft-splice' two independent ESP
>connections, i.e.: saddr:spi1 -> daddr, daddr:spi2 <- saddr. [...] This can't
>be done as-is, because we don't know spi2 at the time the first ESP packet is
>received. The solution implemented here is introduction of a 'virtual esp id',
>computed when first ESP packet is received,[...]

I can't imagine this working reliably.

1. The IKE daemons could do an exchange whereby just one ESP flow is set up (from
daddr to saddr). It's unusual to do a one-way tunnel, but it's a possibility.
Then you only ever have ESP packets going from daddr to saddr.

2. Even if the IKE daemons set up what we would consider a normal tunnel,
i.e. one ESP flow per direction, there is no obligation that saddr has to
send anything. daddr could be contacting saddr solely with a protocol
that is both connectionless at L4 and which does not demand any L7 responses
either. Like ... syslog-over-udp?

3. Even under best conditions, what if two clients on the saddr network
simultaneously initiate a connection to daddr, how will you decide
which of the daddr ESP SPIs belongs to which saddr?

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] netfilter: nf_conntrack: Add conntrack helper for ESP/IPsec
  2021-05-05 12:16   ` [PATCH] " Jan Engelhardt
@ 2021-05-06  2:59     ` Cole Dishington
  0 siblings, 0 replies; 10+ messages in thread
From: Cole Dishington @ 2021-05-06  2:59 UTC (permalink / raw)
  To: fw, jengelh
  Cc: linux-kernel, pablo, kuba, netdev, netfilter-devel, davem,
	kadlec, coreteam

On Wed, 2021-05-05 at 14:16 +0200, Jan Engelhardt wrote:
> On Wednesday 2021-04-14 17:40, Florian Westphal wrote:
> > 
> > Preface: AFAIU this tracker aims to 'soft-splice' two independent
> > ESP
> > connections, i.e.: saddr:spi1 -> daddr, daddr:spi2 <- saddr. [...]
> > This can't
> > be done as-is, because we don't know spi2 at the time the first ESP
> > packet is
> > received. The solution implemented here is introduction of a
> > 'virtual esp id',
> > computed when first ESP packet is received,[...]
> 
> I can't imagine this working reliably.
> 
> 1. The IKE daemons could do an exchange whereby just one ESP flow is
> set up (from
> daddr to saddr). It's unusual to do a one-way tunnel, but it's a
> possibility.
> Then you only ever have ESP packets going from daddr to saddr.
> 
> 2. Even if the IKE daemons set up what we would consider a normal
> tunnel,
> i.e. one ESP flow per direction, there is no obligation that saddr
> has to
> send anything. daddr could be contacting saddr solely with a protocol
> that is both connectionless at L4 and which does not demand any L7
> responses
> either. Like ... syslog-over-udp?
> 
> 3. Even under best conditions, what if two clients on the saddr
> network
> simultaneously initiate a connection to daddr, how will you decide
> which of the daddr ESP SPIs belongs to which saddr?

1 and 2 are limitations of treating two one-way ESP SAs as a single
connection. I think 1 and 2 would be less of an issue with Florian
Westphal's latest comments requesting expectations (although an
expectation for the other side would still be setup). 3 is handled by
assuming the first ESP packet will get the first ESP response. I think
the only way past 1 (and a more reliable approach to 3) would be by
processing ISAKMP messages.

However, considering that the ESP connection tracker's primary use is
to allow clients behind a NAT that doesn't support (or use) NAT-T a
method of establishing a connection wihout manually configuring
specific NAT rules, these limitations might be acceptable.

Thanks

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2021-05-06  3:00 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-04-14  3:53 [PATCH] netfilter: nf_conntrack: Add conntrack helper for ESP/IPsec Cole Dishington
2021-04-14 15:40 ` Florian Westphal
2021-04-20 22:35   ` Cole Dishington
2021-04-26 11:54     ` Florian Westphal
2021-04-26 12:37       ` Florian Westphal
2021-05-03  1:06         ` [PATCH v3] " Cole Dishington
2021-05-04 19:22           ` Florian Westphal
2021-05-05 11:10           ` Florian Westphal
2021-05-05 12:16   ` [PATCH] " Jan Engelhardt
2021-05-06  2:59     ` Cole Dishington

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).