All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2 nf] netfilter: nft_set_bitmap: keep a list of dummy elements
@ 2017-03-13 12:27 Pablo Neira Ayuso
  2017-03-13 12:27 ` [PATCH 2/2 nf] Revert "netfilter: nf_tables: add flush field to struct nft_set_iter" Pablo Neira Ayuso
  2017-03-13 14:23 ` [PATCH 1/2 nf] netfilter: nft_set_bitmap: keep a list of dummy elements Liping Zhang
  0 siblings, 2 replies; 9+ messages in thread
From: Pablo Neira Ayuso @ 2017-03-13 12:27 UTC (permalink / raw)
  To: netfilter-devel; +Cc: zlpnobody

Element comments may come without any prior set flag, so we have to keep
a list of dummy struct nft_set_ext to keep this information around. This
is only useful for set dumps to userspace. From the packet path, this
set type relies on the bitmap representation. This patch simplifies the
logic since we don't need to allocate the dummy nft_set_ext structure
anymore on the fly at the cost of increasing memory consumption because
of the list of dummy struct nft_set_ext.

Fixes: 665153ff5752 ("netfilter: nf_tables: add bitmap set type")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_set_bitmap.c | 146 +++++++++++++++++++----------------------
 1 file changed, 66 insertions(+), 80 deletions(-)

diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c
index 9b024e22717b..8ebbc2940f4c 100644
--- a/net/netfilter/nft_set_bitmap.c
+++ b/net/netfilter/nft_set_bitmap.c
@@ -15,6 +15,11 @@
 #include <linux/netfilter/nf_tables.h>
 #include <net/netfilter/nf_tables.h>
 
+struct nft_bitmap_elem {
+	struct list_head	head;
+	struct nft_set_ext	ext;
+};
+
 /* This bitmap uses two bits to represent one element. These two bits determine
  * the element state in the current and the future generation.
  *
@@ -41,8 +46,9 @@
  *      restore its previous state.
  */
 struct nft_bitmap {
-	u16	bitmap_size;
-	u8	bitmap[];
+	struct	list_head	list;
+	u16			bitmap_size;
+	u8			bitmap[];
 };
 
 static inline void nft_bitmap_location(const struct nft_set *set,
@@ -82,21 +88,43 @@ static bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set,
 	return nft_bitmap_active(priv->bitmap, idx, off, genmask);
 }
 
+static struct nft_bitmap_elem *
+nft_bitmap_elem_find(const struct nft_set *set, struct nft_bitmap_elem *this,
+		     u8 genmask)
+{
+	const struct nft_bitmap *priv = nft_set_priv(set);
+	struct nft_bitmap_elem *be;
+
+	list_for_each_entry_rcu(be, &priv->list, head) {
+		if (memcmp(nft_set_ext_key(&be->ext),
+			   nft_set_ext_key(&this->ext), set->klen) ||
+		    !nft_set_elem_active(&be->ext, genmask))
+			continue;
+
+		return be;
+	}
+	return NULL;
+}
+
 static int nft_bitmap_insert(const struct net *net, const struct nft_set *set,
 			     const struct nft_set_elem *elem,
-			     struct nft_set_ext **_ext)
+			     struct nft_set_ext **ext)
 {
 	struct nft_bitmap *priv = nft_set_priv(set);
-	struct nft_set_ext *ext = elem->priv;
+	struct nft_bitmap_elem *new = elem->priv, *be;
 	u8 genmask = nft_genmask_next(net);
 	u32 idx, off;
 
-	nft_bitmap_location(set, nft_set_ext_key(ext), &idx, &off);
-	if (nft_bitmap_active(priv->bitmap, idx, off, genmask))
+	be = nft_bitmap_elem_find(set, new, genmask);
+	if (be) {
+		*ext = &be->ext;
 		return -EEXIST;
+	}
 
+	nft_bitmap_location(set, nft_set_ext_key(&new->ext), &idx, &off);
 	/* Enter 01 state. */
 	priv->bitmap[idx] |= (genmask << off);
+	list_add_tail_rcu(&new->head, &priv->list);
 
 	return 0;
 }
@@ -106,13 +134,14 @@ static void nft_bitmap_remove(const struct net *net,
 			      const struct nft_set_elem *elem)
 {
 	struct nft_bitmap *priv = nft_set_priv(set);
-	struct nft_set_ext *ext = elem->priv;
+	struct nft_bitmap_elem *be = elem->priv;
 	u8 genmask = nft_genmask_next(net);
 	u32 idx, off;
 
-	nft_bitmap_location(set, nft_set_ext_key(ext), &idx, &off);
+	nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off);
 	/* Enter 00 state. */
 	priv->bitmap[idx] &= ~(genmask << off);
+	list_del_rcu(&be->head);
 }
 
 static void nft_bitmap_activate(const struct net *net,
@@ -120,73 +149,52 @@ static void nft_bitmap_activate(const struct net *net,
 				const struct nft_set_elem *elem)
 {
 	struct nft_bitmap *priv = nft_set_priv(set);
-	struct nft_set_ext *ext = elem->priv;
+	struct nft_bitmap_elem *be = elem->priv;
 	u8 genmask = nft_genmask_next(net);
 	u32 idx, off;
 
-	nft_bitmap_location(set, nft_set_ext_key(ext), &idx, &off);
+	nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off);
 	/* Enter 11 state. */
 	priv->bitmap[idx] |= (genmask << off);
+	nft_set_elem_change_active(net, set, &be->ext);
 }
 
 static bool nft_bitmap_flush(const struct net *net,
-			     const struct nft_set *set, void *ext)
+			     const struct nft_set *set, void *_be)
 {
 	struct nft_bitmap *priv = nft_set_priv(set);
 	u8 genmask = nft_genmask_next(net);
+	struct nft_bitmap_elem *be = _be;
 	u32 idx, off;
 
-	nft_bitmap_location(set, nft_set_ext_key(ext), &idx, &off);
+	nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off);
 	/* Enter 10 state, similar to deactivation. */
 	priv->bitmap[idx] &= ~(genmask << off);
+	nft_set_elem_change_active(net, set, &be->ext);
 
 	return true;
 }
 
-static struct nft_set_ext *nft_bitmap_ext_alloc(const struct nft_set *set,
-						const struct nft_set_elem *elem)
-{
-	struct nft_set_ext_tmpl tmpl;
-	struct nft_set_ext *ext;
-
-	nft_set_ext_prepare(&tmpl);
-	nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
-
-	ext = kzalloc(tmpl.len, GFP_KERNEL);
-	if (!ext)
-		return NULL;
-
-	nft_set_ext_init(ext, &tmpl);
-	memcpy(nft_set_ext_key(ext), elem->key.val.data, set->klen);
-
-	return ext;
-}
-
 static void *nft_bitmap_deactivate(const struct net *net,
 				   const struct nft_set *set,
 				   const struct nft_set_elem *elem)
 {
 	struct nft_bitmap *priv = nft_set_priv(set);
+	struct nft_bitmap_elem *this = elem->priv, *be;
 	u8 genmask = nft_genmask_next(net);
-	struct nft_set_ext *ext;
 	u32 idx, off;
 
 	nft_bitmap_location(set, elem->key.val.data, &idx, &off);
 
-	if (!nft_bitmap_active(priv->bitmap, idx, off, genmask))
-		return NULL;
-
-	/* We have no real set extension since this is a bitmap, allocate this
-	 * dummy object that is released from the commit/abort path.
-	 */
-	ext = nft_bitmap_ext_alloc(set, elem);
-	if (!ext)
+	be = nft_bitmap_elem_find(set, this, genmask);
+	if (!be)
 		return NULL;
 
 	/* Enter 10 state. */
 	priv->bitmap[idx] &= ~(genmask << off);
+	nft_set_elem_change_active(net, set, &be->ext);
 
-	return ext;
+	return be;
 }
 
 static void nft_bitmap_walk(const struct nft_ctx *ctx,
@@ -194,47 +202,23 @@ static void nft_bitmap_walk(const struct nft_ctx *ctx,
 			    struct nft_set_iter *iter)
 {
 	const struct nft_bitmap *priv = nft_set_priv(set);
-	struct nft_set_ext_tmpl tmpl;
+	struct nft_bitmap_elem *be;
 	struct nft_set_elem elem;
-	struct nft_set_ext *ext;
-	int idx, off;
-	u16 key;
-
-	nft_set_ext_prepare(&tmpl);
-	nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
-
-	for (idx = 0; idx < priv->bitmap_size; idx++) {
-		for (off = 0; off < BITS_PER_BYTE; off += 2) {
-			if (iter->count < iter->skip)
-				goto cont;
-
-			if (!nft_bitmap_active(priv->bitmap, idx, off,
-					       iter->genmask))
-				goto cont;
-
-			ext = kzalloc(tmpl.len, GFP_KERNEL);
-			if (!ext) {
-				iter->err = -ENOMEM;
-				return;
-			}
-			nft_set_ext_init(ext, &tmpl);
-			key = ((idx * BITS_PER_BYTE) + off) >> 1;
-			memcpy(nft_set_ext_key(ext), &key, set->klen);
-
-			elem.priv = ext;
-			iter->err = iter->fn(ctx, set, iter, &elem);
-
-			/* On set flush, this dummy extension object is released
-			 * from the commit/abort path.
-			 */
-			if (!iter->flush)
-				kfree(ext);
-
-			if (iter->err < 0)
-				return;
+
+	list_for_each_entry_rcu(be, &priv->list, head) {
+		if (iter->count < iter->skip)
+			goto cont;
+		if (!nft_set_elem_active(&be->ext, iter->genmask))
+			goto cont;
+
+		elem.priv = be;
+
+		iter->err = iter->fn(ctx, set, iter, &elem);
+
+		if (iter->err < 0)
+			return;
 cont:
-			iter->count++;
-		}
+		iter->count++;
 	}
 }
 
@@ -265,6 +249,7 @@ static int nft_bitmap_init(const struct nft_set *set,
 {
 	struct nft_bitmap *priv = nft_set_priv(set);
 
+	INIT_LIST_HEAD(&priv->list);
 	priv->bitmap_size = nft_bitmap_size(set->klen);
 
 	return 0;
@@ -290,6 +275,7 @@ static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features,
 
 static struct nft_set_ops nft_bitmap_ops __read_mostly = {
 	.privsize	= nft_bitmap_privsize,
+	.elemsize	= offsetof(struct nft_bitmap_elem, ext),
 	.estimate	= nft_bitmap_estimate,
 	.init		= nft_bitmap_init,
 	.destroy	= nft_bitmap_destroy,
-- 
2.1.4


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 2/2 nf] Revert "netfilter: nf_tables: add flush field to struct nft_set_iter"
  2017-03-13 12:27 [PATCH 1/2 nf] netfilter: nft_set_bitmap: keep a list of dummy elements Pablo Neira Ayuso
@ 2017-03-13 12:27 ` Pablo Neira Ayuso
  2017-03-13 14:23 ` [PATCH 1/2 nf] netfilter: nft_set_bitmap: keep a list of dummy elements Liping Zhang
  1 sibling, 0 replies; 9+ messages in thread
From: Pablo Neira Ayuso @ 2017-03-13 12:27 UTC (permalink / raw)
  To: netfilter-devel; +Cc: zlpnobody

This reverts commit 1f48ff6c5393aa7fe290faf5d633164f105b0aa7.

This patch is not required anymore now that we keep a dummy list of
set elements in the bitmap set implementation, so revert this before
we forget this code has not clients.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 1 -
 net/netfilter/nf_tables_api.c     | 4 ----
 2 files changed, 5 deletions(-)

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 2aa8a9d80fbe..07e5c9c4f316 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -203,7 +203,6 @@ struct nft_set_elem {
 struct nft_set;
 struct nft_set_iter {
 	u8		genmask;
-	bool		flush;
 	unsigned int	count;
 	unsigned int	skip;
 	int		err;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 5e0ccfd5bb37..434c739dfeca 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3145,7 +3145,6 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
 		iter.count	= 0;
 		iter.err	= 0;
 		iter.fn		= nf_tables_bind_check_setelem;
-		iter.flush	= false;
 
 		set->ops->walk(ctx, set, &iter);
 		if (iter.err < 0)
@@ -3399,7 +3398,6 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
 	args.iter.count		= 0;
 	args.iter.err		= 0;
 	args.iter.fn		= nf_tables_dump_setelem;
-	args.iter.flush		= false;
 	set->ops->walk(&ctx, set, &args.iter);
 
 	nla_nest_end(skb, nest);
@@ -3963,7 +3961,6 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk,
 		struct nft_set_iter iter = {
 			.genmask	= genmask,
 			.fn		= nft_flush_set,
-			.flush		= true,
 		};
 		set->ops->walk(&ctx, set, &iter);
 
@@ -5114,7 +5111,6 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
 			iter.count	= 0;
 			iter.err	= 0;
 			iter.fn		= nf_tables_loop_check_setelem;
-			iter.flush	= false;
 
 			set->ops->walk(ctx, set, &iter);
 			if (iter.err < 0)
-- 
2.1.4


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/2 nf] netfilter: nft_set_bitmap: keep a list of dummy elements
  2017-03-13 12:27 [PATCH 1/2 nf] netfilter: nft_set_bitmap: keep a list of dummy elements Pablo Neira Ayuso
  2017-03-13 12:27 ` [PATCH 2/2 nf] Revert "netfilter: nf_tables: add flush field to struct nft_set_iter" Pablo Neira Ayuso
@ 2017-03-13 14:23 ` Liping Zhang
  2017-03-13 17:23   ` Pablo Neira Ayuso
  1 sibling, 1 reply; 9+ messages in thread
From: Liping Zhang @ 2017-03-13 14:23 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: Netfilter Developer Mailing List

Hi Pablo,

2017-03-13 20:27 GMT+08:00 Pablo Neira Ayuso <pablo@netfilter.org>:
> Element comments may come without any prior set flag, so we have to keep
> a list of dummy struct nft_set_ext to keep this information around. This
> is only useful for set dumps to userspace. From the packet path, this
> set type relies on the bitmap representation. This patch simplifies the
> logic since we don't need to allocate the dummy nft_set_ext structure
> anymore on the fly at the cost of increasing memory consumption because
> of the list of dummy struct nft_set_ext.

If I didn't misunderstand it, I think after introducing the dummy nft_set_ext,
the nft_bitmap_estimate() should also be updated? i.e.:

1. est->size should be recalculated
2. est->space should be changed to NFT_SET_CLASS_O_N

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/2 nf] netfilter: nft_set_bitmap: keep a list of dummy elements
  2017-03-13 14:23 ` [PATCH 1/2 nf] netfilter: nft_set_bitmap: keep a list of dummy elements Liping Zhang
@ 2017-03-13 17:23   ` Pablo Neira Ayuso
  2017-03-14  9:04     ` Liping Zhang
  0 siblings, 1 reply; 9+ messages in thread
From: Pablo Neira Ayuso @ 2017-03-13 17:23 UTC (permalink / raw)
  To: Liping Zhang; +Cc: Netfilter Developer Mailing List

Hi Liping,

On Mon, Mar 13, 2017 at 10:23:53PM +0800, Liping Zhang wrote:
> Hi Pablo,
> 
> 2017-03-13 20:27 GMT+08:00 Pablo Neira Ayuso <pablo@netfilter.org>:
> > Element comments may come without any prior set flag, so we have to keep
> > a list of dummy struct nft_set_ext to keep this information around. This
> > is only useful for set dumps to userspace. From the packet path, this
> > set type relies on the bitmap representation. This patch simplifies the
> > logic since we don't need to allocate the dummy nft_set_ext structure
> > anymore on the fly at the cost of increasing memory consumption because
> > of the list of dummy struct nft_set_ext.
> 
> If I didn't misunderstand it, I think after introducing the dummy nft_set_ext,
> the nft_bitmap_estimate() should also be updated? i.e.:
> 
> 1. est->size should be recalculated
> 2. est->space should be changed to NFT_SET_CLASS_O_N

I would like this only describes the representation that is exposed to
the packet path, not the real memory consumption of it. I know I'm
kind of cheating, but with this I'm also giving this bitmap an
advantage over the hashtable representation, the performance numbers I
gathered here when evaluating it are better.

Anyway, I'll be fine if this triggers some discussions on the set
backend selection at some point, as well as more detailed performance
evaluation. Note this big O notation just tell about the scalability.
Size provides an accurate way to measure how much memory this consumes
but only if userspace tells us how many elements we're going to add
beforehand. On the CPU side, we have no such specific metric as the
memory size. Probably we can introduce some generic cycle metric that
represents the cost of the lookup path, this won't be simple as this
number is not deterministic since there are many things to consider,
so we have to agree on how we calculate this pseudo-metric.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/2 nf] netfilter: nft_set_bitmap: keep a list of dummy elements
  2017-03-13 17:23   ` Pablo Neira Ayuso
@ 2017-03-14  9:04     ` Liping Zhang
  2017-03-14 10:21       ` Pablo Neira Ayuso
  0 siblings, 1 reply; 9+ messages in thread
From: Liping Zhang @ 2017-03-14  9:04 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: Netfilter Developer Mailing List

Hi Pablo,

2017-03-14 1:23 GMT+08:00 Pablo Neira Ayuso <pablo@netfilter.org>:
[...]
> I would like this only describes the representation that is exposed to
> the packet path, not the real memory consumption of it. I know I'm
> kind of cheating, but with this I'm also giving this bitmap an
> advantage over the hashtable representation, the performance numbers I
> gathered here when evaluating it are better.

Yes, I agree, bitmap_set is faster than hash_set, so for POL_PERFORMANCE
policy, we should prefer to use bitmap_set.

> Anyway, I'll be fine if this triggers some discussions on the set
> backend selection at some point, as well as more detailed performance
> evaluation. Note this big O notation just tell about the scalability.
> Size provides an accurate way to measure how much memory this consumes
> but only if userspace tells us how many elements we're going to add
> beforehand. On the CPU side, we have no such specific metric as the
> memory size. Probably we can introduce some generic cycle metric that
> represents the cost of the lookup path, this won't be simple as this
> number is not deterministic since there are many things to consider,
> so we have to agree on how we calculate this pseudo-metric.

Hmm... I think a better selection algorithm is necessary now. I find an
obvious drawback for the time being, for example:

When set->klen is 2, the bitmap_set's memory consumption is much
higer than others. Only one single set with few elements will occupy at
least 16kB, so only 20 rules using sets will consume roughly 320kB, it will
become a high burden for these embedded systems which with low memory.

It's worse that we cannot ignore the bitmap_set, even if we select the the
NFT_SET_POL_MEMORY policy without the set size specified, we will still
choose the bitmap_set, since it claims that it's space complexity is
NFT_SET_CLASS_O_1.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/2 nf] netfilter: nft_set_bitmap: keep a list of dummy elements
  2017-03-14  9:04     ` Liping Zhang
@ 2017-03-14 10:21       ` Pablo Neira Ayuso
  2017-03-14 12:19         ` Pablo Neira Ayuso
  0 siblings, 1 reply; 9+ messages in thread
From: Pablo Neira Ayuso @ 2017-03-14 10:21 UTC (permalink / raw)
  To: Liping Zhang; +Cc: Netfilter Developer Mailing List

On Tue, Mar 14, 2017 at 05:04:17PM +0800, Liping Zhang wrote:
> 2017-03-14 1:23 GMT+08:00 Pablo Neira Ayuso <pablo@netfilter.org>:
> [...]
> > Anyway, I'll be fine if this triggers some discussions on the set
> > backend selection at some point, as well as more detailed performance
> > evaluation. Note this big O notation just tell about the scalability.
> > Size provides an accurate way to measure how much memory this consumes
> > but only if userspace tells us how many elements we're going to add
> > beforehand. On the CPU side, we have no such specific metric as the
> > memory size. Probably we can introduce some generic cycle metric that
> > represents the cost of the lookup path, this won't be simple as this
> > number is not deterministic since there are many things to consider,
> > so we have to agree on how we calculate this pseudo-metric.
> 
> Hmm... I think a better selection algorithm is necessary now. I find an
> obvious drawback for the time being, for example:
> 
> When set->klen is 2, the bitmap_set's memory consumption is much
> higer than others. Only one single set with few elements will occupy at
> least 16kB, so only 20 rules using sets will consume roughly 320kB, it will
> become a high burden for these embedded systems which with low memory.
>
> It's worse that we cannot ignore the bitmap_set, even if we select the the
> NFT_SET_POL_MEMORY policy without the set size specified, we will still
> choose the bitmap_set, since it claims that it's space complexity is
> NFT_SET_CLASS_O_1.

Make sense. Please, submit patches for nf-next to revisit POL_MEMORY
selection explaining the new criteria. I guess we will need more
iterations on this set selection as we get more set backends. I wanted
to dedicate some time on this during the Netfilter Workshop (to be
announce, by Q2 2017).

Note that an anonymous sets default to POL_PERFORMANCE, so 20 rules
with anonymous sets would still consumed those 320 kB. So probably we
need a per-table global policy switch that we can set when the table
is created. I think updating such knob would result in EOPNOTSUPP at
this stage, as we would need a way to transfer set representations
from one backend to another if the policy update results in a
different set backend configuration in order to support
performance/memory policy updates.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/2 nf] netfilter: nft_set_bitmap: keep a list of dummy elements
  2017-03-14 10:21       ` Pablo Neira Ayuso
@ 2017-03-14 12:19         ` Pablo Neira Ayuso
  2017-03-14 14:44           ` Liping Zhang
  0 siblings, 1 reply; 9+ messages in thread
From: Pablo Neira Ayuso @ 2017-03-14 12:19 UTC (permalink / raw)
  To: Liping Zhang; +Cc: Netfilter Developer Mailing List

On Tue, Mar 14, 2017 at 11:21:31AM +0100, Pablo Neira Ayuso wrote:
> On Tue, Mar 14, 2017 at 05:04:17PM +0800, Liping Zhang wrote:
> > 2017-03-14 1:23 GMT+08:00 Pablo Neira Ayuso <pablo@netfilter.org>:
> > [...]
> > > Anyway, I'll be fine if this triggers some discussions on the set
> > > backend selection at some point, as well as more detailed performance
> > > evaluation. Note this big O notation just tell about the scalability.
> > > Size provides an accurate way to measure how much memory this consumes
> > > but only if userspace tells us how many elements we're going to add
> > > beforehand. On the CPU side, we have no such specific metric as the
> > > memory size. Probably we can introduce some generic cycle metric that
> > > represents the cost of the lookup path, this won't be simple as this
> > > number is not deterministic since there are many things to consider,
> > > so we have to agree on how we calculate this pseudo-metric.
> > 
> > Hmm... I think a better selection algorithm is necessary now. I find an
> > obvious drawback for the time being, for example:
> > 
> > When set->klen is 2, the bitmap_set's memory consumption is much
> > higer than others. Only one single set with few elements will occupy at
> > least 16kB, so only 20 rules using sets will consume roughly 320kB, it will
> > become a high burden for these embedded systems which with low memory.
> >
> > It's worse that we cannot ignore the bitmap_set, even if we select the the
> > NFT_SET_POL_MEMORY policy without the set size specified, we will still
> > choose the bitmap_set, since it claims that it's space complexity is
> > NFT_SET_CLASS_O_1.
> 
> Make sense. Please, submit patches for nf-next to revisit POL_MEMORY
> selection explaining the new criteria. I guess we will need more
> iterations on this set selection as we get more set backends. I wanted
> to dedicate some time on this during the Netfilter Workshop (to be
> announce, by Q2 2017).
> 
> Note that an anonymous sets default to POL_PERFORMANCE, so 20 rules
> with anonymous sets would still consumed those 320 kB. So probably we
> need a per-table global policy switch that we can set when the table
> is created. I think updating such knob would result in EOPNOTSUPP at
> this stage, as we would need a way to transfer set representations
> from one backend to another if the policy update results in a
> different set backend configuration in order to support
> performance/memory policy updates.

Another possibility is to simply regard desc->size over the memory
scalability notation when provided. I think this just needs an update
from nft userspace. Look, bitmap and hashtable are both described as
O(1) in terms of performance. If the user provides the set size (this
is known in anonymous sets) we can select the one that takes less
memory. When no size is specified, we rely on the set policy that is
specified.

Still, for anonymous sets we will select hashtable instead, this is
going to be slower in systems that have plenty of memory. I think we
cannot escape the new per-table global knob to select
memory/performance for anononymous sets after all.

I'm curious, what kind of device are you thinking of with such memory
restrictions that cannot take 320 kB? I would expect such embedded
device that cannot afford such memory consumption will come also with
a smallish cpu.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/2 nf] netfilter: nft_set_bitmap: keep a list of dummy elements
  2017-03-14 12:19         ` Pablo Neira Ayuso
@ 2017-03-14 14:44           ` Liping Zhang
  2017-03-14 15:21             ` Pablo Neira Ayuso
  0 siblings, 1 reply; 9+ messages in thread
From: Liping Zhang @ 2017-03-14 14:44 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: Netfilter Developer Mailing List

Hi Pablo,
2017-03-14 20:19 GMT+08:00 Pablo Neira Ayuso <pablo@netfilter.org>:
[...]
> Another possibility is to simply regard desc->size over the memory
> scalability notation when provided. I think this just needs an update
> from nft userspace. Look, bitmap and hashtable are both described as
> O(1) in terms of performance. If the user provides the set size (this
> is known in anonymous sets) we can select the one that takes less
> memory. When no size is specified, we rely on the set policy that is
> specified.
>
> Still, for anonymous sets we will select hashtable instead, this is
> going to be slower in systems that have plenty of memory. I think we
> cannot escape the new per-table global knob to select
> memory/performance for anononymous sets after all.

After we implement more and more sets types, I think just based on
POL_PERFORMANCE or POL_MEMORY to select a suitable set will
become a more and more difficult task. So how about this method:
1. For compatibility, POL_PERFORMANCE means hash set, and
    POL_MEMORY means rbtree set.(I know this maybe incorrect when
    the set->size is 0)
2. When the user create the set, he(she) can specify a new settype to
    select the set type, such as hash, rbtree, bitmap... a little similar to
    ipset.

I know this method is not perfect, but this will provide big
flexibility to the user.

> I'm curious, what kind of device are you thinking of with such memory
> restrictions that cannot take 320 kB? I would expect such embedded
> device that cannot afford such memory consumption will come also with
> a smallish cpu.

We had a small router with 32MB memory in my previous company.
On such an embedded device, occupy 320KB is also no problem of
course.

But I guess the user  will not happy to know the fact, inputting such a
nft rule "nft add x y tcp dport {21, 22} drop" will consume more than
16KB memory :)

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/2 nf] netfilter: nft_set_bitmap: keep a list of dummy elements
  2017-03-14 14:44           ` Liping Zhang
@ 2017-03-14 15:21             ` Pablo Neira Ayuso
  0 siblings, 0 replies; 9+ messages in thread
From: Pablo Neira Ayuso @ 2017-03-14 15:21 UTC (permalink / raw)
  To: Liping Zhang; +Cc: Netfilter Developer Mailing List

On Tue, Mar 14, 2017 at 10:44:43PM +0800, Liping Zhang wrote:
> Hi Pablo,
> 2017-03-14 20:19 GMT+08:00 Pablo Neira Ayuso <pablo@netfilter.org>:
> [...]
> > Another possibility is to simply regard desc->size over the memory
> > scalability notation when provided. I think this just needs an update
> > from nft userspace. Look, bitmap and hashtable are both described as
> > O(1) in terms of performance. If the user provides the set size (this
> > is known in anonymous sets) we can select the one that takes less
> > memory. When no size is specified, we rely on the set policy that is
> > specified.
> >
> > Still, for anonymous sets we will select hashtable instead, this is
> > going to be slower in systems that have plenty of memory. I think we
> > cannot escape the new per-table global knob to select
> > memory/performance for anononymous sets after all.
> 
> After we implement more and more sets types, I think just based on
> POL_PERFORMANCE or POL_MEMORY to select a suitable set will
> become a more and more difficult task. So how about this method:
> 1. For compatibility, POL_PERFORMANCE means hash set, and
>     POL_MEMORY means rbtree set.(I know this maybe incorrect when
>     the set->size is 0)
> 2. When the user create the set, he(she) can specify a new settype to
>     select the set type, such as hash, rbtree, bitmap... a little similar to
>     ipset.
> 
> I know this method is not perfect, but this will provide big
> flexibility to the user.

Then, we cannot deprecate sets like the rbtree, that I'm very much in
favour to find a replacement, as it would be exposed to userspace and
anyone could be using it, and we cannot break existing user setups.

Moreover, if we, the developers, don't know exactly what is a good
choice, how can users just know what is best for them? I would prefer
developers come to us to tune the set backend selection so we get it
better. We can enhance this model incrementally.

Leaking details to userspace is easy, just a matter of exposing all
these knobs to userspace via netlink. If this turns out to be the way,
we'll do it at a given time, but I'm still willing to spend time on
this set backend selection routine.

> > I'm curious, what kind of device are you thinking of with such memory
> > restrictions that cannot take 320 kB? I would expect such embedded
> > device that cannot afford such memory consumption will come also with
> > a smallish cpu.
> 
> We had a small router with 32MB memory in my previous company.
> On such an embedded device, occupy 320KB is also no problem of
> course.
> 
> But I guess the user  will not happy to know the fact, inputting such a
> nft rule "nft add x y tcp dport {21, 22} drop" will consume more than
> 16KB memory :)

For such small usecase, we can expose something like:

table x {
        policy memory; <----

        chain y {
                type filter hook output priority 0; policy drop;

                tcp dport {22, 80} ct state established,related accept
        }
}

So the kernel knows memory is more important that performance, and
this policy exposes what the user needs. If not specified, the
performance representation is selected.

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2017-03-14 15:22 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-03-13 12:27 [PATCH 1/2 nf] netfilter: nft_set_bitmap: keep a list of dummy elements Pablo Neira Ayuso
2017-03-13 12:27 ` [PATCH 2/2 nf] Revert "netfilter: nf_tables: add flush field to struct nft_set_iter" Pablo Neira Ayuso
2017-03-13 14:23 ` [PATCH 1/2 nf] netfilter: nft_set_bitmap: keep a list of dummy elements Liping Zhang
2017-03-13 17:23   ` Pablo Neira Ayuso
2017-03-14  9:04     ` Liping Zhang
2017-03-14 10:21       ` Pablo Neira Ayuso
2017-03-14 12:19         ` Pablo Neira Ayuso
2017-03-14 14:44           ` Liping Zhang
2017-03-14 15:21             ` Pablo Neira Ayuso

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.