netfilter-devel.vger.kernel.org archive mirror
* [PATCH nf 0/2] netfilter: nf_nat_masquerade: don't block rtnl lock
@ 2021-09-15 14:46 Florian Westphal
  2021-09-15 14:46 ` [PATCH nf 1/2] netfilter: nf_nat_masquerade: make async masq_inet6_event handling generic Florian Westphal
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: Florian Westphal @ 2021-09-15 14:46 UTC
  To: netfilter-devel; +Cc: Florian Westphal

nf_nat_masquerade registers conntrack notifiers to early-expire
conntracks that have been using the downed device/removed address.

With a large number of disappearing devices (e.g., ppp), iterating the table
for every notification can block the rtnl lock for multiple seconds.

This change unconditionally defers the walk to the system work queue
so that the rtnl lock is not held longer than needed.

This is not a regression; the notifier and cleanup walk have existed
since the functionality was added more than 20 years ago.
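
The fix follows a standard kernel pattern: capture the notifier arguments
in a small heap-allocated object and run the expensive walk later from the
system work queue. A minimal sketch of that pattern, with illustrative
names (cleanup_ctx, netdev_down_cb) rather than the actual patch code:

	#include <linux/netdevice.h>
	#include <linux/notifier.h>
	#include <linux/slab.h>
	#include <linux/workqueue.h>

	struct cleanup_ctx {
		struct work_struct work;
		int ifindex;		/* captured notifier argument */
	};

	static void cleanup_fn(struct work_struct *work)
	{
		struct cleanup_ctx *ctx = container_of(work, struct cleanup_ctx, work);

		/* the expensive conntrack walk would run here, in process
		 * context, without the rtnl lock held */
		kfree(ctx);
	}

	static int netdev_down_cb(struct notifier_block *nb,
				  unsigned long event, void *ptr)
	{
		struct cleanup_ctx *ctx;

		if (event != NETDEV_DOWN)
			return NOTIFY_DONE;

		ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC);
		if (!ctx)
			return NOTIFY_DONE; /* skip; entries time out anyway */

		INIT_WORK(&ctx->work, cleanup_fn);
		ctx->ifindex = netdev_notifier_info_to_dev(ptr)->ifindex;
		schedule_work(&ctx->work); /* defer to the system workqueue */
		return NOTIFY_DONE;
	}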

Florian Westphal (2):
  netfilter: nf_nat_masquerade: make async masq_inet6_event handling
    generic
  netfilter: nf_nat_masquerade: defer conntrack walk to work queue

 net/netfilter/nf_nat_masquerade.c | 168 +++++++++++++++++-------------
 1 file changed, 97 insertions(+), 71 deletions(-)

-- 
2.32.0



* [PATCH nf 1/2] netfilter: nf_nat_masquerade: make async masq_inet6_event handling generic
  2021-09-15 14:46 [PATCH nf 0/2] netfilter: nf_nat_masquerade: don't block rtnl lock Florian Westphal
@ 2021-09-15 14:46 ` Florian Westphal
  2021-09-15 14:46 ` [PATCH nf 2/2] netfilter: nf_nat_masquerade: defer conntrack walk to work queue Florian Westphal
  2021-09-21  1:48 ` [PATCH nf 0/2] netfilter: nf_nat_masquerade: don't block rtnl lock Pablo Neira Ayuso
  2 siblings, 0 replies; 4+ messages in thread
From: Florian Westphal @ 2021-09-15 14:46 UTC
  To: netfilter-devel; +Cc: Florian Westphal

masq_inet6_event is called asynchronously from the system work queue,
because the inet6 notifier is atomic and nf_ct_iterate_cleanup_net can
sleep.

The ipv4 and device notifiers call nf_ct_iterate_cleanup_net directly.

This is legal, but these notifiers are called with the RTNL mutex held.
A large conntrack table with many devices coming and going has a severe
impact on system usability, with 'ip a' blocking for several seconds.

This change moves the defer code into a helper and makes it generic so
that the ipv4 and ifdown notifiers can be converted to defer the cleanup
walk as well in a follow-up patch.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
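The helper introduced below centralizes the defer logic, so each notifier
only has to supply its compare callback and allocation flags. A condensed
view of the resulting call sites (simplified; the real signatures are in
the diff that follows):

	/* atomic inet6 notifier: cannot sleep, so GFP_ATOMIC */
	nf_nat_masq_schedule(dev_net(dev), &addr, dev->ifindex,
			     inet6_cmp, GFP_ATOMIC);

	/* the ipv4/device notifiers run in process context under the
	 * RTNL mutex, so GFP_KERNEL is fine (converted in patch 2) */
	nf_nat_masq_schedule(net, NULL, dev->ifindex,
			     device_cmp, GFP_KERNEL);
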
 net/netfilter/nf_nat_masquerade.c | 122 ++++++++++++++++++------------
 1 file changed, 75 insertions(+), 47 deletions(-)

diff --git a/net/netfilter/nf_nat_masquerade.c b/net/netfilter/nf_nat_masquerade.c
index 8e8a65d46345..415919a6ac1a 100644
--- a/net/netfilter/nf_nat_masquerade.c
+++ b/net/netfilter/nf_nat_masquerade.c
@@ -9,8 +9,19 @@
 
 #include <net/netfilter/nf_nat_masquerade.h>
 
+struct masq_dev_work {
+	struct work_struct work;
+	struct net *net;
+	union nf_inet_addr addr;
+	int ifindex;
+	int (*iter)(struct nf_conn *i, void *data);
+};
+
+#define MAX_MASQ_WORKER_COUNT	16
+
 static DEFINE_MUTEX(masq_mutex);
 static unsigned int masq_refcnt __read_mostly;
+static atomic_t masq_worker_count __read_mostly;
 
 unsigned int
 nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
@@ -63,6 +74,63 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
 }
 EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4);
 
+static void iterate_cleanup_work(struct work_struct *work)
+{
+	struct masq_dev_work *w;
+
+	w = container_of(work, struct masq_dev_work, work);
+
+	nf_ct_iterate_cleanup_net(w->net, w->iter, (void *)w, 0, 0);
+
+	put_net(w->net);
+	kfree(w);
+	atomic_dec(&masq_worker_count);
+	module_put(THIS_MODULE);
+}
+
+/* Iterate conntrack table in the background and remove conntrack entries
+ * that use the device/address being removed.
+ *
+ * If too many work items are queued already or memory allocation fails,
+ * the iteration is skipped; conntrack entries will time out eventually.
+ */
+static void nf_nat_masq_schedule(struct net *net, union nf_inet_addr *addr,
+				 int ifindex,
+				 int (*iter)(struct nf_conn *i, void *data),
+				 gfp_t gfp_flags)
+{
+	struct masq_dev_work *w;
+
+	if (atomic_read(&masq_worker_count) > MAX_MASQ_WORKER_COUNT)
+		return;
+
+	net = maybe_get_net(net);
+	if (!net)
+		return;
+
+	if (!try_module_get(THIS_MODULE))
+		goto err_module;
+
+	w = kzalloc(sizeof(*w), gfp_flags);
+	if (w) {
+		/* We can overshoot MAX_MASQ_WORKER_COUNT, no big deal */
+		atomic_inc(&masq_worker_count);
+
+		INIT_WORK(&w->work, iterate_cleanup_work);
+		w->ifindex = ifindex;
+		w->net = net;
+		w->iter = iter;
+		if (addr)
+			w->addr = *addr;
+		schedule_work(&w->work);
+		return;
+	}
+
+	module_put(THIS_MODULE);
+ err_module:
+	put_net(net);
+}
+
 static int device_cmp(struct nf_conn *i, void *ifindex)
 {
 	const struct nf_conn_nat *nat = nfct_nat(i);
@@ -136,8 +204,6 @@ static struct notifier_block masq_inet_notifier = {
 };
 
 #if IS_ENABLED(CONFIG_IPV6)
-static atomic_t v6_worker_count __read_mostly;
-
 static int
 nat_ipv6_dev_get_saddr(struct net *net, const struct net_device *dev,
 		       const struct in6_addr *daddr, unsigned int srcprefs,
@@ -187,13 +253,6 @@ nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
 }
 EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6);
 
-struct masq_dev_work {
-	struct work_struct work;
-	struct net *net;
-	struct in6_addr addr;
-	int ifindex;
-};
-
 static int inet6_cmp(struct nf_conn *ct, void *work)
 {
 	struct masq_dev_work *w = (struct masq_dev_work *)work;
@@ -204,21 +263,7 @@ static int inet6_cmp(struct nf_conn *ct, void *work)
 
 	tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
 
-	return ipv6_addr_equal(&w->addr, &tuple->dst.u3.in6);
-}
-
-static void iterate_cleanup_work(struct work_struct *work)
-{
-	struct masq_dev_work *w;
-
-	w = container_of(work, struct masq_dev_work, work);
-
-	nf_ct_iterate_cleanup_net(w->net, inet6_cmp, (void *)w, 0, 0);
-
-	put_net(w->net);
-	kfree(w);
-	atomic_dec(&v6_worker_count);
-	module_put(THIS_MODULE);
+	return nf_inet_addr_cmp(&w->addr, &tuple->dst.u3);
 }
 
 /* atomic notifier; can't call nf_ct_iterate_cleanup_net (it can sleep).
@@ -233,36 +278,19 @@ static int masq_inet6_event(struct notifier_block *this,
 {
 	struct inet6_ifaddr *ifa = ptr;
 	const struct net_device *dev;
-	struct masq_dev_work *w;
-	struct net *net;
+	union nf_inet_addr addr;
 
-	if (event != NETDEV_DOWN || atomic_read(&v6_worker_count) >= 16)
+	if (event != NETDEV_DOWN)
 		return NOTIFY_DONE;
 
 	dev = ifa->idev->dev;
-	net = maybe_get_net(dev_net(dev));
-	if (!net)
-		return NOTIFY_DONE;
 
-	if (!try_module_get(THIS_MODULE))
-		goto err_module;
+	memset(&addr, 0, sizeof(addr));
 
-	w = kmalloc(sizeof(*w), GFP_ATOMIC);
-	if (w) {
-		atomic_inc(&v6_worker_count);
+	addr.in6 = ifa->addr;
 
-		INIT_WORK(&w->work, iterate_cleanup_work);
-		w->ifindex = dev->ifindex;
-		w->net = net;
-		w->addr = ifa->addr;
-		schedule_work(&w->work);
-
-		return NOTIFY_DONE;
-	}
-
-	module_put(THIS_MODULE);
- err_module:
-	put_net(net);
+	nf_nat_masq_schedule(dev_net(dev), &addr, dev->ifindex, inet6_cmp,
+			     GFP_ATOMIC);
 	return NOTIFY_DONE;
 }
 
-- 
2.32.0



* [PATCH nf 2/2] netfilter: nf_nat_masquerade: defer conntrack walk to work queue
  2021-09-15 14:46 [PATCH nf 0/2] netfilter: nf_nat_masquerade: don't block rtnl lock Florian Westphal
  2021-09-15 14:46 ` [PATCH nf 1/2] netfilter: nf_nat_masquerade: make async masq_inet6_event handling generic Florian Westphal
@ 2021-09-15 14:46 ` Florian Westphal
  2021-09-21  1:48 ` [PATCH nf 0/2] netfilter: nf_nat_masquerade: don't block rtnl lock Pablo Neira Ayuso
  2 siblings, 0 replies; 4+ messages in thread
From: Florian Westphal @ 2021-09-15 14:46 UTC
  To: netfilter-devel; +Cc: Florian Westphal, Martin Zaharinov

The ipv4 and device notifiers are called with the RTNL mutex held.
The table walk can take some time, so it is better not to block other
RTNL users.

'ip a' has been reported to block for up to 20 seconds when the conntrack
table has many entries and device down events are frequent (e.g., PPP).

Reported-and-tested-by: Martin Zaharinov <micron10@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
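In essence, the patch swaps each direct walk for the scheduling helper
added in patch 1. A condensed before/after for the device notifier
(simplified from the diff below):

	/* before: synchronous conntrack walk while the RTNL mutex is held */
	nf_ct_iterate_cleanup_net(net, device_cmp,
				  (void *)(long)dev->ifindex, 0, 0);

	/* after: capture the ifindex and let the system workqueue do the
	 * walk; GFP_KERNEL is safe in this process-context notifier */
	nf_nat_masq_schedule(net, NULL, dev->ifindex,
			     device_cmp, GFP_KERNEL);
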
 net/netfilter/nf_nat_masquerade.c | 50 +++++++++++++++----------------
 1 file changed, 24 insertions(+), 26 deletions(-)

diff --git a/net/netfilter/nf_nat_masquerade.c b/net/netfilter/nf_nat_masquerade.c
index 415919a6ac1a..acd73f717a08 100644
--- a/net/netfilter/nf_nat_masquerade.c
+++ b/net/netfilter/nf_nat_masquerade.c
@@ -131,13 +131,14 @@ static void nf_nat_masq_schedule(struct net *net, union nf_inet_addr *addr,
 	put_net(net);
 }
 
-static int device_cmp(struct nf_conn *i, void *ifindex)
+static int device_cmp(struct nf_conn *i, void *arg)
 {
 	const struct nf_conn_nat *nat = nfct_nat(i);
+	const struct masq_dev_work *w = arg;
 
 	if (!nat)
 		return 0;
-	return nat->masq_index == (int)(long)ifindex;
+	return nat->masq_index == w->ifindex;
 }
 
 static int masq_device_event(struct notifier_block *this,
@@ -153,8 +154,8 @@ static int masq_device_event(struct notifier_block *this,
 		 * and forget them.
 		 */
 
-		nf_ct_iterate_cleanup_net(net, device_cmp,
-					  (void *)(long)dev->ifindex, 0, 0);
+		nf_nat_masq_schedule(net, NULL, dev->ifindex,
+				     device_cmp, GFP_KERNEL);
 	}
 
 	return NOTIFY_DONE;
@@ -162,35 +163,45 @@ static int masq_device_event(struct notifier_block *this,
 
 static int inet_cmp(struct nf_conn *ct, void *ptr)
 {
-	struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
-	struct net_device *dev = ifa->ifa_dev->dev;
 	struct nf_conntrack_tuple *tuple;
+	struct masq_dev_work *w = ptr;
 
-	if (!device_cmp(ct, (void *)(long)dev->ifindex))
+	if (!device_cmp(ct, ptr))
 		return 0;
 
 	tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
 
-	return ifa->ifa_address == tuple->dst.u3.ip;
+	return nf_inet_addr_cmp(&w->addr, &tuple->dst.u3);
 }
 
 static int masq_inet_event(struct notifier_block *this,
 			   unsigned long event,
 			   void *ptr)
 {
-	struct in_device *idev = ((struct in_ifaddr *)ptr)->ifa_dev;
-	struct net *net = dev_net(idev->dev);
+	const struct in_ifaddr *ifa = ptr;
+	const struct in_device *idev;
+	const struct net_device *dev;
+	union nf_inet_addr addr;
+
+	if (event != NETDEV_DOWN)
+		return NOTIFY_DONE;
 
 	/* The masq_dev_notifier will catch the case of the device going
 	 * down.  So if the inetdev is dead and being destroyed we have
 	 * no work to do.  Otherwise this is an individual address removal
 	 * and we have to perform the flush.
 	 */
+	idev = ifa->ifa_dev;
 	if (idev->dead)
 		return NOTIFY_DONE;
 
-	if (event == NETDEV_DOWN)
-		nf_ct_iterate_cleanup_net(net, inet_cmp, ptr, 0, 0);
+	memset(&addr, 0, sizeof(addr));
+
+	addr.ip = ifa->ifa_address;
+
+	dev = idev->dev;
+	nf_nat_masq_schedule(dev_net(idev->dev), &addr, dev->ifindex,
+			     inet_cmp, GFP_KERNEL);
 
 	return NOTIFY_DONE;
 }
@@ -253,19 +264,6 @@ nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
 }
 EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6);
 
-static int inet6_cmp(struct nf_conn *ct, void *work)
-{
-	struct masq_dev_work *w = (struct masq_dev_work *)work;
-	struct nf_conntrack_tuple *tuple;
-
-	if (!device_cmp(ct, (void *)(long)w->ifindex))
-		return 0;
-
-	tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
-
-	return nf_inet_addr_cmp(&w->addr, &tuple->dst.u3);
-}
-
 /* atomic notifier; can't call nf_ct_iterate_cleanup_net (it can sleep).
  *
  * Defer it to the system workqueue.
@@ -289,7 +287,7 @@ static int masq_inet6_event(struct notifier_block *this,
 
 	addr.in6 = ifa->addr;
 
-	nf_nat_masq_schedule(dev_net(dev), &addr, dev->ifindex, inet6_cmp,
+	nf_nat_masq_schedule(dev_net(dev), &addr, dev->ifindex, inet_cmp,
 			     GFP_ATOMIC);
 	return NOTIFY_DONE;
 }
-- 
2.32.0



* Re: [PATCH nf 0/2] netfilter: nf_nat_masquerade: don't block rtnl lock
  2021-09-15 14:46 [PATCH nf 0/2] netfilter: nf_nat_masquerade: don't block rtnl lock Florian Westphal
  2021-09-15 14:46 ` [PATCH nf 1/2] netfilter: nf_nat_masquerade: make async masq_inet6_event handling generic Florian Westphal
  2021-09-15 14:46 ` [PATCH nf 2/2] netfilter: nf_nat_masquerade: defer conntrack walk to work queue Florian Westphal
@ 2021-09-21  1:48 ` Pablo Neira Ayuso
  2 siblings, 0 replies; 4+ messages in thread
From: Pablo Neira Ayuso @ 2021-09-21  1:48 UTC
  To: Florian Westphal; +Cc: netfilter-devel

On Wed, Sep 15, 2021 at 04:46:37PM +0200, Florian Westphal wrote:
> nf_nat_masquerade registers conntrack notifiers to early-expire
> conntracks that have been using the downed device/removed address.
> 
> With a large number of disappearing devices (e.g., ppp), iterating the table
> for every notification can block the rtnl lock for multiple seconds.
> 
> This change unconditionally defers the walk to the system work queue
> so that the rtnl lock is not held longer than needed.
> 
> This is not a regression; the notifier and cleanup walk have existed
> since the functionality was added more than 20 years ago.

Series applied, thanks.

