All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] netfilter: nf_tables: coalesce multiple notifications into one skbuff
@ 2020-08-27 17:28 Pablo Neira Ayuso
  2020-08-31 16:36 ` Phil Sutter
  2020-09-02 14:16 ` Phil Sutter
  0 siblings, 2 replies; 4+ messages in thread
From: Pablo Neira Ayuso @ 2020-08-27 17:28 UTC (permalink / raw)
  To: netfilter-devel; +Cc: phil, fw, eric

On x86_64, each notification results in one skbuff allocation which
consumes at least 768 bytes due to the skbuff overhead.

This patch coalesces several notifications into one single skbuff, so
each notification consumes at least ~211 bytes, that ~3.5 times less
memory consumption. As a result, this is reducing the chances to exhaust
the netlink socket receive buffer.

Rule of thumb is that each notification batch only contains netlink
messages whose report flag is the same, nfnetlink_send() requires this
to do appropriately delivery to userspace, either via unicast (echo
mode) or multicast (monitor mode).

The skbuff control buffer is used to annotate the report flag for later
handling at the new coalescing routine.

The batch skbuff notification size is NLMSG_GOODSIZE, using a larger
skbuff would allow for more socket receiver buffer savings (to amortize
the cost of the skbuff even more), however, going over that size might
break userspace applications, so let's be conservative and stick to
NLMSG_GOODSIZE.

Reported-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
@Phil: I don't know yet where to target this patch to, if this helps
       firewalld's issue, it might be a good option to enqueue this
       for nf.git

As a side note, it should be possible to skip alloc_skb() from _notify()
then memcpy() and kfree_skb(), ie. start batching a bit earlier, but I
leave this for the future in favour of this simplistic approach.

 include/net/netns/nftables.h  |  1 +
 net/netfilter/nf_tables_api.c | 71 ++++++++++++++++++++++++++++-------
 2 files changed, 59 insertions(+), 13 deletions(-)

diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h
index a1a8d45adb42..6c0806bd8d1e 100644
--- a/include/net/netns/nftables.h
+++ b/include/net/netns/nftables.h
@@ -8,6 +8,7 @@ struct netns_nftables {
 	struct list_head	tables;
 	struct list_head	commit_list;
 	struct list_head	module_list;
+	struct list_head	notify_list;
 	struct mutex		commit_mutex;
 	unsigned int		base_seq;
 	u8			gencursor;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index b7dc1cbf40ea..c77f250ffe8a 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -684,6 +684,18 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net,
 	return -1;
 }
 
+struct nftnl_skb_parms {
+	bool report;
+};
+#define NFT_CB(skb)	(*(struct nftnl_skb_parms*)&((skb)->cb))
+
+static void nft_notify_enqueue(struct sk_buff *skb, bool report,
+			       struct list_head *notify_list)
+{
+	NFT_CB(skb).report = report;
+	list_add_tail(&skb->list, notify_list);
+}
+
 static void nf_tables_table_notify(const struct nft_ctx *ctx, int event)
 {
 	struct sk_buff *skb;
@@ -715,8 +727,7 @@ static void nf_tables_table_notify(const struct nft_ctx *ctx, int event)
 		goto err;
 	}
 
-	nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
-		       ctx->report, GFP_KERNEL);
+	nft_notify_enqueue(skb, ctx->report, &ctx->net->nft.notify_list);
 	return;
 err:
 	nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
@@ -1468,8 +1479,7 @@ static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event)
 		goto err;
 	}
 
-	nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
-		       ctx->report, GFP_KERNEL);
+	nft_notify_enqueue(skb, ctx->report, &ctx->net->nft.notify_list);
 	return;
 err:
 	nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
@@ -2807,8 +2817,7 @@ static void nf_tables_rule_notify(const struct nft_ctx *ctx,
 		goto err;
 	}
 
-	nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
-		       ctx->report, GFP_KERNEL);
+	nft_notify_enqueue(skb, ctx->report, &ctx->net->nft.notify_list);
 	return;
 err:
 	nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
@@ -3837,8 +3846,7 @@ static void nf_tables_set_notify(const struct nft_ctx *ctx,
 		goto err;
 	}
 
-	nfnetlink_send(skb, ctx->net, portid, NFNLGRP_NFTABLES, ctx->report,
-		       gfp_flags);
+	nft_notify_enqueue(skb, ctx->report, &ctx->net->nft.notify_list);
 	return;
 err:
 	nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, -ENOBUFS);
@@ -4959,8 +4967,7 @@ static void nf_tables_setelem_notify(const struct nft_ctx *ctx,
 		goto err;
 	}
 
-	nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, ctx->report,
-		       GFP_KERNEL);
+	nft_notify_enqueue(skb, ctx->report, &ctx->net->nft.notify_list);
 	return;
 err:
 	nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS);
@@ -6275,7 +6282,7 @@ void nft_obj_notify(struct net *net, const struct nft_table *table,
 		goto err;
 	}
 
-	nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report, gfp);
+	nft_notify_enqueue(skb, report, &net->nft.notify_list);
 	return;
 err:
 	nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS);
@@ -7085,8 +7092,7 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx,
 		goto err;
 	}
 
-	nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
-		       ctx->report, GFP_KERNEL);
+	nft_notify_enqueue(skb, ctx->report, &ctx->net->nft.notify_list);
 	return;
 err:
 	nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
@@ -7695,6 +7701,42 @@ static void nf_tables_commit_release(struct net *net)
 	mutex_unlock(&net->nft.commit_mutex);
 }
 
+static void nft_commit_notify(struct net *net, u32 portid)
+{
+	struct sk_buff *batch_skb = NULL, *nskb, *skb;
+	unsigned char *data;
+	int len;
+
+	list_for_each_entry_safe(skb, nskb, &net->nft.notify_list, list) {
+		if (!batch_skb) {
+new_batch:
+			batch_skb = skb;
+			NFT_CB(batch_skb).report = NFT_CB(skb).report;
+			len = NLMSG_GOODSIZE;
+			list_del(&skb->list);
+			continue;
+		}
+		len -= skb->len;
+		if (len > 0 && NFT_CB(skb).report == NFT_CB(batch_skb).report) {
+			data = skb_put(batch_skb, skb->len);
+			memcpy(data, skb->data, skb->len);
+			list_del(&skb->list);
+			kfree_skb(skb);
+			continue;
+		}
+		nfnetlink_send(batch_skb, net, portid, NFNLGRP_NFTABLES,
+			       NFT_CB(batch_skb).report, GFP_KERNEL);
+		goto new_batch;
+	}
+
+	if (batch_skb) {
+		nfnetlink_send(batch_skb, net, portid, NFNLGRP_NFTABLES,
+			       NFT_CB(batch_skb).report, GFP_KERNEL);
+	}
+
+	WARN_ON_ONCE(!list_empty(&net->nft.notify_list));
+}
+
 static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 {
 	struct nft_trans *trans, *next;
@@ -7897,6 +7939,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 		}
 	}
 
+	nft_commit_notify(net, NETLINK_CB(skb).portid);
 	nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN);
 	nf_tables_commit_release(net);
 
@@ -8721,6 +8764,7 @@ static int __net_init nf_tables_init_net(struct net *net)
 	INIT_LIST_HEAD(&net->nft.tables);
 	INIT_LIST_HEAD(&net->nft.commit_list);
 	INIT_LIST_HEAD(&net->nft.module_list);
+	INIT_LIST_HEAD(&net->nft.notify_list);
 	mutex_init(&net->nft.commit_mutex);
 	net->nft.base_seq = 1;
 	net->nft.validate_state = NFT_VALIDATE_SKIP;
@@ -8737,6 +8781,7 @@ static void __net_exit nf_tables_exit_net(struct net *net)
 	mutex_unlock(&net->nft.commit_mutex);
 	WARN_ON_ONCE(!list_empty(&net->nft.tables));
 	WARN_ON_ONCE(!list_empty(&net->nft.module_list));
+	WARN_ON_ONCE(!list_empty(&net->nft.notify_list));
 }
 
 static struct pernet_operations nf_tables_net_ops = {
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH] netfilter: nf_tables: coalesce multiple notifications into one skbuff
  2020-08-27 17:28 [PATCH] netfilter: nf_tables: coalesce multiple notifications into one skbuff Pablo Neira Ayuso
@ 2020-08-31 16:36 ` Phil Sutter
  2020-09-02 14:16 ` Phil Sutter
  1 sibling, 0 replies; 4+ messages in thread
From: Phil Sutter @ 2020-08-31 16:36 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: netfilter-devel, fw, eric

Hi Pablo,

On Thu, Aug 27, 2020 at 07:28:42PM +0200, Pablo Neira Ayuso wrote:
> On x86_64, each notification results in one skbuff allocation which
> consumes at least 768 bytes due to the skbuff overhead.
> 
> This patch coalesces several notifications into one single skbuff, so
> each notification consumes at least ~211 bytes, that ~3.5 times less
> memory consumption. As a result, this is reducing the chances to exhaust
> the netlink socket receive buffer.
> 
> Rule of thumb is that each notification batch only contains netlink
> messages whose report flag is the same, nfnetlink_send() requires this
> to do appropriately delivery to userspace, either via unicast (echo
> mode) or multicast (monitor mode).
> 
> The skbuff control buffer is used to annotate the report flag for later
> handling at the new coalescing routine.
> 
> The batch skbuff notification size is NLMSG_GOODSIZE, using a larger
> skbuff would allow for more socket receiver buffer savings (to amortize
> the cost of the skbuff even more), however, going over that size might
> break userspace applications, so let's be conservative and stick to
> NLMSG_GOODSIZE.

With this patch in place on top of your other one ("netfilter:
nfnetlink: nfnetlink_unicast() reports EAGAIN instead of ENOBUFS"),
firewalld (as well as nft with same input) now report:

| netlink: Error: Could not process rule: No space left on device

The JSON snippet causing the problem is indeed quite big though:

| % grep "\"\(add\|insert\)\"" fail.pp.json | wc -l
| 462

Eric told me he plans to split initial ruleset creation into several
chunks though, so this should at least not be a blocker for firewalld.

something seems to be fishy, though. Here's a reproducer:

| #!/bin/bash
| 
| numrules="$1"
| 
| nft flush ruleset
| (
| 	echo "add table t"
| 	echo "add chain t c"
| 	for ((i = 0; i < $numrules; i++)); do
| 		echo "add rule t c ip saddr 10.0.0.1 ip daddr 10.0.0.2 tcp dport 27374 mark 0x23 counter accept"
| 	done
| ) | nft -ef -

It starts failing at 13, which is not much. Interestingly, it fails outside of
the container, too. And it even echoes part of the commands:

| add table ip t
| add chain ip t c
| add rule ip t c ip saddr 10.0.0.1 ip daddr 10.0.0.2 tcp dport 27374 meta mark 0x00000023 counter packets 0 bytes 0 accept
| add rule ip t c ip saddr 10.0.0.1 ip daddr 10.0.0.2 tcp dport 27374 meta mark 0x00000023 counter packets 0 bytes 0 accept
| add rule ip t c ip saddr 10.0.0.1 ip daddr 10.0.0.2 tcp dport 27374 meta mark 0x00000023 counter packets 0 bytes 0 accept
| add rule ip t c ip saddr 10.0.0.1 ip daddr 10.0.0.2 tcp dport 27374 meta mark 0x00000023 counter packets 0 bytes 0 accept
| add rule ip t c ip saddr 10.0.0.1 ip daddr 10.0.0.2 tcp dport 27374 meta mark 0x00000023 counter packets 0 bytes 0 accept
| add rule ip t c ip saddr 10.0.0.1 ip daddr 10.0.0.2 tcp dport 27374 meta mark 0x00000023 counter packets 0 bytes 0 accept
| netlink: Error: Could not process rule: No space left on device

Is this a bug in your patch?

Cheers, Phil

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] netfilter: nf_tables: coalesce multiple notifications into one skbuff
  2020-08-27 17:28 [PATCH] netfilter: nf_tables: coalesce multiple notifications into one skbuff Pablo Neira Ayuso
  2020-08-31 16:36 ` Phil Sutter
@ 2020-09-02 14:16 ` Phil Sutter
  2020-09-02 14:25   ` Pablo Neira Ayuso
  1 sibling, 1 reply; 4+ messages in thread
From: Phil Sutter @ 2020-09-02 14:16 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: netfilter-devel, fw, eric

Hi Pablo,

On Thu, Aug 27, 2020 at 07:28:42PM +0200, Pablo Neira Ayuso wrote:
[...]
> +static void nft_commit_notify(struct net *net, u32 portid)
> +{
> +	struct sk_buff *batch_skb = NULL, *nskb, *skb;
> +	unsigned char *data;
> +	int len;
> +
> +	list_for_each_entry_safe(skb, nskb, &net->nft.notify_list, list) {
> +		if (!batch_skb) {
> +new_batch:
> +			batch_skb = skb;
> +			NFT_CB(batch_skb).report = NFT_CB(skb).report;
> +			len = NLMSG_GOODSIZE;

This doesn't account for the data in the first skb. After changing the
line into 'len = NLMSG_GOODSIZE - skb->len;', the reported problem
disappears and the patch works as expected.

Cheers, Phil

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] netfilter: nf_tables: coalesce multiple notifications into one skbuff
  2020-09-02 14:16 ` Phil Sutter
@ 2020-09-02 14:25   ` Pablo Neira Ayuso
  0 siblings, 0 replies; 4+ messages in thread
From: Pablo Neira Ayuso @ 2020-09-02 14:25 UTC (permalink / raw)
  To: Phil Sutter, netfilter-devel, fw, eric

On Wed, Sep 02, 2020 at 04:16:39PM +0200, Phil Sutter wrote:
> Hi Pablo,
> 
> On Thu, Aug 27, 2020 at 07:28:42PM +0200, Pablo Neira Ayuso wrote:
> [...]
> > +static void nft_commit_notify(struct net *net, u32 portid)
> > +{
> > +	struct sk_buff *batch_skb = NULL, *nskb, *skb;
> > +	unsigned char *data;
> > +	int len;
> > +
> > +	list_for_each_entry_safe(skb, nskb, &net->nft.notify_list, list) {
> > +		if (!batch_skb) {
> > +new_batch:
> > +			batch_skb = skb;
> > +			NFT_CB(batch_skb).report = NFT_CB(skb).report;
> > +			len = NLMSG_GOODSIZE;
> 
> This doesn't account for the data in the first skb. After changing the
> line into 'len = NLMSG_GOODSIZE - skb->len;', the reported problem
> disappears and the patch works as expected.

Thanks for narrowing down the problem.

I'll send a v2 including this update.

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2020-09-02 14:25 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-08-27 17:28 [PATCH] netfilter: nf_tables: coalesce multiple notifications into one skbuff Pablo Neira Ayuso
2020-08-31 16:36 ` Phil Sutter
2020-09-02 14:16 ` Phil Sutter
2020-09-02 14:25   ` Pablo Neira Ayuso

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.