All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH nf-next 0/4] netfilter: nft_set_pipapo: speed up bulk element insertions
@ 2024-02-12 10:01 Florian Westphal
  2024-02-12 10:01 ` [PATCH nf-next 1/4] netfilter: nft_set_pipapo: constify lookup fn args where possible Florian Westphal
                   ` (3 more replies)
  0 siblings, 4 replies; 11+ messages in thread
From: Florian Westphal @ 2024-02-12 10:01 UTC (permalink / raw)
  To: netfilter-devel; +Cc: sbrivio, Florian Westphal

Bulk insertions into pipapo set type take a very long time, each new
element allocates space for elem+1 elements, then copies all existing
elements and appends the new element.

Alloc extra slack space to reduce the realloc overhead to speed this up.

While at it, shrink a few data structures, in many cases a much smaller
type can be used.

Florian Westphal (4):
  netfilter: nft_set_pipapo: constify lookup fn args where possible
  netfilter: nft_set_pipapo: do not rely on ZERO_SIZE_PTR
  netfilter: nft_set_pipapo: shrink data structures
  netfilter: nft_set_pipapo: speed up bulk element insertions

 net/netfilter/nft_set_pipapo.c      | 174 ++++++++++++++++++++--------
 net/netfilter/nft_set_pipapo.h      |  37 +++---
 net/netfilter/nft_set_pipapo_avx2.c |  26 ++---
 3 files changed, 153 insertions(+), 84 deletions(-)

-- 
2.43.0


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH nf-next 1/4] netfilter: nft_set_pipapo: constify lookup fn args where possible
  2024-02-12 10:01 [PATCH nf-next 0/4] netfilter: nft_set_pipapo: speed up bulk element insertions Florian Westphal
@ 2024-02-12 10:01 ` Florian Westphal
  2024-02-13  7:19   ` Stefano Brivio
  2024-02-12 10:01 ` [PATCH nf-next 2/4] netfilter: nft_set_pipapo: do not rely on ZERO_SIZE_PTR Florian Westphal
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 11+ messages in thread
From: Florian Westphal @ 2024-02-12 10:01 UTC (permalink / raw)
  To: netfilter-devel; +Cc: sbrivio, Florian Westphal

Those get called from packet path, content must not be modified.
No functional changes intended.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/netfilter/nft_set_pipapo.c      | 16 ++++++++--------
 net/netfilter/nft_set_pipapo.h      |  6 +++---
 net/netfilter/nft_set_pipapo_avx2.c | 26 +++++++++++++-------------
 3 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index ed243edaa6a0..395420fa71e5 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -360,7 +360,7 @@
  * Return: -1 on no match, bit position on 'match_only', 0 otherwise.
  */
 int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst,
-		  union nft_pipapo_map_bucket *mt, bool match_only)
+		  const union nft_pipapo_map_bucket *mt, bool match_only)
 {
 	unsigned long bitset;
 	int k, ret = -1;
@@ -412,9 +412,9 @@ bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set,
 	struct nft_pipapo_scratch *scratch;
 	unsigned long *res_map, *fill_map;
 	u8 genmask = nft_genmask_cur(net);
+	const struct nft_pipapo_match *m;
+	const struct nft_pipapo_field *f;
 	const u8 *rp = (const u8 *)key;
-	struct nft_pipapo_match *m;
-	struct nft_pipapo_field *f;
 	bool map_index;
 	int i;
 
@@ -521,9 +521,9 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net,
 {
 	struct nft_pipapo_elem *ret = ERR_PTR(-ENOENT);
 	struct nft_pipapo *priv = nft_set_priv(set);
-	struct nft_pipapo_match *m = priv->clone;
+	const struct nft_pipapo_match *m = priv->clone;
 	unsigned long *res_map, *fill_map = NULL;
-	struct nft_pipapo_field *f;
+	const struct nft_pipapo_field *f;
 	int i;
 
 	res_map = kmalloc_array(m->bsize_max, sizeof(*res_map), GFP_ATOMIC);
@@ -1599,7 +1599,7 @@ static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m)
 
 	while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) {
 		union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS];
-		struct nft_pipapo_field *f;
+		const struct nft_pipapo_field *f;
 		int i, start, rules_fx;
 
 		start = first_rule;
@@ -2041,8 +2041,8 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set,
 {
 	struct nft_pipapo *priv = nft_set_priv(set);
 	struct net *net = read_pnet(&set->net);
-	struct nft_pipapo_match *m;
-	struct nft_pipapo_field *f;
+	const struct nft_pipapo_match *m;
+	const struct nft_pipapo_field *f;
 	int i, r;
 
 	rcu_read_lock();
diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h
index f59a0cd81105..90d22d691afc 100644
--- a/net/netfilter/nft_set_pipapo.h
+++ b/net/netfilter/nft_set_pipapo.h
@@ -187,7 +187,7 @@ struct nft_pipapo_elem {
 };
 
 int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst,
-		  union nft_pipapo_map_bucket *mt, bool match_only);
+		  const union nft_pipapo_map_bucket *mt, bool match_only);
 
 /**
  * pipapo_and_field_buckets_4bit() - Intersect 4-bit buckets
@@ -195,7 +195,7 @@ int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst,
  * @dst:	Area to store result
  * @data:	Input data selecting table buckets
  */
-static inline void pipapo_and_field_buckets_4bit(struct nft_pipapo_field *f,
+static inline void pipapo_and_field_buckets_4bit(const struct nft_pipapo_field *f,
 						 unsigned long *dst,
 						 const u8 *data)
 {
@@ -223,7 +223,7 @@ static inline void pipapo_and_field_buckets_4bit(struct nft_pipapo_field *f,
  * @dst:	Area to store result
  * @data:	Input data selecting table buckets
  */
-static inline void pipapo_and_field_buckets_8bit(struct nft_pipapo_field *f,
+static inline void pipapo_and_field_buckets_8bit(const struct nft_pipapo_field *f,
 						 unsigned long *dst,
 						 const u8 *data)
 {
diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c
index 208e9d577347..d7bea311165f 100644
--- a/net/netfilter/nft_set_pipapo_avx2.c
+++ b/net/netfilter/nft_set_pipapo_avx2.c
@@ -212,7 +212,7 @@ static int nft_pipapo_avx2_refill(int offset, unsigned long *map,
  * word index to be checked next (i.e. first filled word).
  */
 static int nft_pipapo_avx2_lookup_4b_2(unsigned long *map, unsigned long *fill,
-				       struct nft_pipapo_field *f, int offset,
+				       const struct nft_pipapo_field *f, int offset,
 				       const u8 *pkt, bool first, bool last)
 {
 	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
@@ -274,7 +274,7 @@ static int nft_pipapo_avx2_lookup_4b_2(unsigned long *map, unsigned long *fill,
  * word index to be checked next (i.e. first filled word).
  */
 static int nft_pipapo_avx2_lookup_4b_4(unsigned long *map, unsigned long *fill,
-				       struct nft_pipapo_field *f, int offset,
+				       const struct nft_pipapo_field *f, int offset,
 				       const u8 *pkt, bool first, bool last)
 {
 	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
@@ -350,7 +350,7 @@ static int nft_pipapo_avx2_lookup_4b_4(unsigned long *map, unsigned long *fill,
  * word index to be checked next (i.e. first filled word).
  */
 static int nft_pipapo_avx2_lookup_4b_8(unsigned long *map, unsigned long *fill,
-				       struct nft_pipapo_field *f, int offset,
+				       const struct nft_pipapo_field *f, int offset,
 				       const u8 *pkt, bool first, bool last)
 {
 	u8 pg[8] = {  pkt[0] >> 4,  pkt[0] & 0xf,  pkt[1] >> 4,  pkt[1] & 0xf,
@@ -445,7 +445,7 @@ static int nft_pipapo_avx2_lookup_4b_8(unsigned long *map, unsigned long *fill,
  * word index to be checked next (i.e. first filled word).
  */
 static int nft_pipapo_avx2_lookup_4b_12(unsigned long *map, unsigned long *fill,
-				        struct nft_pipapo_field *f, int offset,
+					const struct nft_pipapo_field *f, int offset,
 				        const u8 *pkt, bool first, bool last)
 {
 	u8 pg[12] = {  pkt[0] >> 4,  pkt[0] & 0xf,  pkt[1] >> 4,  pkt[1] & 0xf,
@@ -534,7 +534,7 @@ static int nft_pipapo_avx2_lookup_4b_12(unsigned long *map, unsigned long *fill,
  * word index to be checked next (i.e. first filled word).
  */
 static int nft_pipapo_avx2_lookup_4b_32(unsigned long *map, unsigned long *fill,
-					struct nft_pipapo_field *f, int offset,
+					const struct nft_pipapo_field *f, int offset,
 					const u8 *pkt, bool first, bool last)
 {
 	u8 pg[32] = {  pkt[0] >> 4,  pkt[0] & 0xf,  pkt[1] >> 4,  pkt[1] & 0xf,
@@ -669,7 +669,7 @@ static int nft_pipapo_avx2_lookup_4b_32(unsigned long *map, unsigned long *fill,
  * word index to be checked next (i.e. first filled word).
  */
 static int nft_pipapo_avx2_lookup_8b_1(unsigned long *map, unsigned long *fill,
-				       struct nft_pipapo_field *f, int offset,
+				       const struct nft_pipapo_field *f, int offset,
 				       const u8 *pkt, bool first, bool last)
 {
 	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
@@ -726,7 +726,7 @@ static int nft_pipapo_avx2_lookup_8b_1(unsigned long *map, unsigned long *fill,
  * word index to be checked next (i.e. first filled word).
  */
 static int nft_pipapo_avx2_lookup_8b_2(unsigned long *map, unsigned long *fill,
-				       struct nft_pipapo_field *f, int offset,
+				       const struct nft_pipapo_field *f, int offset,
 				       const u8 *pkt, bool first, bool last)
 {
 	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
@@ -790,7 +790,7 @@ static int nft_pipapo_avx2_lookup_8b_2(unsigned long *map, unsigned long *fill,
  * word index to be checked next (i.e. first filled word).
  */
 static int nft_pipapo_avx2_lookup_8b_4(unsigned long *map, unsigned long *fill,
-				       struct nft_pipapo_field *f, int offset,
+				       const struct nft_pipapo_field *f, int offset,
 				       const u8 *pkt, bool first, bool last)
 {
 	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
@@ -865,7 +865,7 @@ static int nft_pipapo_avx2_lookup_8b_4(unsigned long *map, unsigned long *fill,
  * word index to be checked next (i.e. first filled word).
  */
 static int nft_pipapo_avx2_lookup_8b_6(unsigned long *map, unsigned long *fill,
-				       struct nft_pipapo_field *f, int offset,
+				       const struct nft_pipapo_field *f, int offset,
 				       const u8 *pkt, bool first, bool last)
 {
 	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
@@ -950,7 +950,7 @@ static int nft_pipapo_avx2_lookup_8b_6(unsigned long *map, unsigned long *fill,
  * word index to be checked next (i.e. first filled word).
  */
 static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill,
-					struct nft_pipapo_field *f, int offset,
+					const struct nft_pipapo_field *f, int offset,
 					const u8 *pkt, bool first, bool last)
 {
 	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
@@ -1042,7 +1042,7 @@ static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill,
  * word index to be checked next (i.e. first filled word).
  */
 static int nft_pipapo_avx2_lookup_slow(unsigned long *map, unsigned long *fill,
-					struct nft_pipapo_field *f, int offset,
+					const struct nft_pipapo_field *f, int offset,
 					const u8 *pkt, bool first, bool last)
 {
 	unsigned long bsize = f->bsize;
@@ -1119,9 +1119,9 @@ bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
 	struct nft_pipapo *priv = nft_set_priv(set);
 	struct nft_pipapo_scratch *scratch;
 	u8 genmask = nft_genmask_cur(net);
+	const struct nft_pipapo_match *m;
+	const struct nft_pipapo_field *f;
 	const u8 *rp = (const u8 *)key;
-	struct nft_pipapo_match *m;
-	struct nft_pipapo_field *f;
 	unsigned long *res, *fill;
 	bool map_index;
 	int i, ret = 0;
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH nf-next 2/4] netfilter: nft_set_pipapo: do not rely on ZERO_SIZE_PTR
  2024-02-12 10:01 [PATCH nf-next 0/4] netfilter: nft_set_pipapo: speed up bulk element insertions Florian Westphal
  2024-02-12 10:01 ` [PATCH nf-next 1/4] netfilter: nft_set_pipapo: constify lookup fn args where possible Florian Westphal
@ 2024-02-12 10:01 ` Florian Westphal
  2024-02-13  7:20   ` Stefano Brivio
  2024-02-12 10:01 ` [PATCH nf-next 3/4] netfilter: nft_set_pipapo: shrink data structures Florian Westphal
  2024-02-12 10:01 ` [PATCH nf-next 4/4] netfilter: nft_set_pipapo: speed up bulk element insertions Florian Westphal
  3 siblings, 1 reply; 11+ messages in thread
From: Florian Westphal @ 2024-02-12 10:01 UTC (permalink / raw)
  To: netfilter-devel; +Cc: sbrivio, Florian Westphal

pipapo relies on kmalloc(0) returning ZERO_SIZE_PTR (i.e., not NULL
but pointer is invalid).

Rework this to not call slab allocator when we'd request a 0-byte
allocation.

While at it, also use GFP_KERNEL allocations here, this is only called
from control plane.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/netfilter/nft_set_pipapo.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index 395420fa71e5..6a79ec98de86 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -526,13 +526,16 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net,
 	const struct nft_pipapo_field *f;
 	int i;
 
-	res_map = kmalloc_array(m->bsize_max, sizeof(*res_map), GFP_ATOMIC);
+	if (m->bsize_max == 0)
+		return ret;
+
+	res_map = kmalloc_array(m->bsize_max, sizeof(*res_map), GFP_KERNEL);
 	if (!res_map) {
 		ret = ERR_PTR(-ENOMEM);
 		goto out;
 	}
 
-	fill_map = kcalloc(m->bsize_max, sizeof(*res_map), GFP_ATOMIC);
+	fill_map = kcalloc(m->bsize_max, sizeof(*res_map), GFP_KERNEL);
 	if (!fill_map) {
 		ret = ERR_PTR(-ENOMEM);
 		goto out;
@@ -1367,11 +1370,16 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old)
 		       src->bsize * sizeof(*dst->lt) *
 		       src->groups * NFT_PIPAPO_BUCKETS(src->bb));
 
-		dst->mt = kvmalloc(src->rules * sizeof(*src->mt), GFP_KERNEL);
-		if (!dst->mt)
-			goto out_mt;
+		if (src->rules > 0) {
+			dst->mt = kvmalloc_array(src->rules, sizeof(*src->mt), GFP_KERNEL);
+			if (!dst->mt)
+				goto out_mt;
+
+			memcpy(dst->mt, src->mt, src->rules * sizeof(*src->mt));
+		} else {
+			dst->mt = NULL;
+		}
 
-		memcpy(dst->mt, src->mt, src->rules * sizeof(*src->mt));
 		src++;
 		dst++;
 	}
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH nf-next 3/4] netfilter: nft_set_pipapo: shrink data structures
  2024-02-12 10:01 [PATCH nf-next 0/4] netfilter: nft_set_pipapo: speed up bulk element insertions Florian Westphal
  2024-02-12 10:01 ` [PATCH nf-next 1/4] netfilter: nft_set_pipapo: constify lookup fn args where possible Florian Westphal
  2024-02-12 10:01 ` [PATCH nf-next 2/4] netfilter: nft_set_pipapo: do not rely on ZERO_SIZE_PTR Florian Westphal
@ 2024-02-12 10:01 ` Florian Westphal
  2024-02-13 13:17   ` Stefano Brivio
  2024-02-12 10:01 ` [PATCH nf-next 4/4] netfilter: nft_set_pipapo: speed up bulk element insertions Florian Westphal
  3 siblings, 1 reply; 11+ messages in thread
From: Florian Westphal @ 2024-02-12 10:01 UTC (permalink / raw)
  To: netfilter-devel; +Cc: sbrivio, Florian Westphal

The set uses a mix of 'int', 'unsigned int', and size_t.

The rule count limit is NFT_PIPAPO_RULE0_MAX, which cannot
exceed INT_MAX (a few helpers use 'int' as return type).

Add a compile-time assertion for this.

Replace size_t usage in structs with unsigned int or u8 where
the stored values are smaller.

Replace signed-int arguments for lengths with 'unsigned int'
where possible.

Last, remove the lt_aligned member: it's set but never read.

struct nft_pipapo_match 40 bytes -> 32 bytes
struct nft_pipapo_field 56 bytes -> 32 bytes

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/netfilter/nft_set_pipapo.c | 60 ++++++++++++++++++++++------------
 net/netfilter/nft_set_pipapo.h | 29 ++++++----------
 2 files changed, 49 insertions(+), 40 deletions(-)

diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index 6a79ec98de86..a0ddf24a8052 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -359,11 +359,13 @@
  *
  * Return: -1 on no match, bit position on 'match_only', 0 otherwise.
  */
-int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst,
+int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules,
+		  unsigned long *dst,
 		  const union nft_pipapo_map_bucket *mt, bool match_only)
 {
 	unsigned long bitset;
-	int k, ret = -1;
+	unsigned int k;
+	int ret = -1;
 
 	for (k = 0; k < len; k++) {
 		bitset = map[k];
@@ -632,13 +634,16 @@ nft_pipapo_get(const struct net *net, const struct nft_set *set,
  *
  * Return: 0 on success, -ENOMEM on allocation failure.
  */
-static int pipapo_resize(struct nft_pipapo_field *f, int old_rules, int rules)
+static int pipapo_resize(struct nft_pipapo_field *f, unsigned int old_rules, unsigned int rules)
 {
 	long *new_lt = NULL, *new_p, *old_lt = f->lt, *old_p;
 	union nft_pipapo_map_bucket *new_mt, *old_mt = f->mt;
-	size_t new_bucket_size, copy;
+	unsigned int new_bucket_size, copy;
 	int group, bucket;
 
+	if (rules >= NFT_PIPAPO_RULE0_MAX)
+		return -ENOSPC;
+
 	new_bucket_size = DIV_ROUND_UP(rules, BITS_PER_LONG);
 #ifdef NFT_PIPAPO_ALIGN
 	new_bucket_size = roundup(new_bucket_size,
@@ -691,7 +696,7 @@ static int pipapo_resize(struct nft_pipapo_field *f, int old_rules, int rules)
 
 	if (new_lt) {
 		f->bsize = new_bucket_size;
-		NFT_PIPAPO_LT_ASSIGN(f, new_lt);
+		f->lt = new_lt;
 		kvfree(old_lt);
 	}
 
@@ -848,8 +853,8 @@ static void pipapo_lt_8b_to_4b(int old_groups, int bsize,
  */
 static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f)
 {
+	unsigned int groups, bb;
 	unsigned long *new_lt;
-	int groups, bb;
 	size_t lt_size;
 
 	lt_size = f->groups * NFT_PIPAPO_BUCKETS(f->bb) * f->bsize *
@@ -899,7 +904,7 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f)
 	f->groups = groups;
 	f->bb = bb;
 	kvfree(f->lt);
-	NFT_PIPAPO_LT_ASSIGN(f, new_lt);
+	f->lt = new_lt;
 }
 
 /**
@@ -916,7 +921,7 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f)
 static int pipapo_insert(struct nft_pipapo_field *f, const uint8_t *k,
 			 int mask_bits)
 {
-	int rule = f->rules, group, ret, bit_offset = 0;
+	unsigned int rule = f->rules, group, ret, bit_offset = 0;
 
 	ret = pipapo_resize(f, f->rules, f->rules + 1);
 	if (ret)
@@ -1256,8 +1261,14 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
 	/* Validate */
 	start_p = start;
 	end_p = end;
+
+	/* some helpers return -1, or 0 >= for valid rule pos,
+	 * so we cannot support more than INT_MAX rules at this time.
+	 */
+	BUILD_BUG_ON(NFT_PIPAPO_RULE0_MAX > INT_MAX);
+
 	nft_pipapo_for_each_field(f, i, m) {
-		if (f->rules >= (unsigned long)NFT_PIPAPO_RULE0_MAX)
+		if (f->rules >= NFT_PIPAPO_RULE0_MAX)
 			return -ENOSPC;
 
 		if (memcmp(start_p, end_p,
@@ -1363,7 +1374,7 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old)
 		if (!new_lt)
 			goto out_lt;
 
-		NFT_PIPAPO_LT_ASSIGN(dst, new_lt);
+		dst->lt = new_lt;
 
 		memcpy(NFT_PIPAPO_LT_ALIGN(new_lt),
 		       NFT_PIPAPO_LT_ALIGN(src->lt),
@@ -1433,10 +1444,10 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old)
  *
  * Return: Number of rules that originated from the same entry as @first.
  */
-static int pipapo_rules_same_key(struct nft_pipapo_field *f, int first)
+static unsigned int pipapo_rules_same_key(struct nft_pipapo_field *f, unsigned int first)
 {
 	struct nft_pipapo_elem *e = NULL; /* Keep gcc happy */
-	int r;
+	unsigned int r;
 
 	for (r = first; r < f->rules; r++) {
 		if (r != first && e != f->mt[r].e)
@@ -1489,8 +1500,8 @@ static int pipapo_rules_same_key(struct nft_pipapo_field *f, int first)
  *                        0      1      2
  *  element pointers:  0x42   0x42   0x44
  */
-static void pipapo_unmap(union nft_pipapo_map_bucket *mt, int rules,
-			 int start, int n, int to_offset, bool is_last)
+static void pipapo_unmap(union nft_pipapo_map_bucket *mt, unsigned int rules,
+			 unsigned int start, unsigned int n, unsigned int to_offset, bool is_last)
 {
 	int i;
 
@@ -1596,8 +1607,8 @@ static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m)
 {
 	struct nft_pipapo *priv = nft_set_priv(set);
 	struct net *net = read_pnet(&set->net);
+	unsigned int rules_f0, first_rule = 0;
 	u64 tstamp = nft_net_tstamp(net);
-	int rules_f0, first_rule = 0;
 	struct nft_pipapo_elem *e;
 	struct nft_trans_gc *gc;
 
@@ -1608,7 +1619,7 @@ static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m)
 	while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) {
 		union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS];
 		const struct nft_pipapo_field *f;
-		int i, start, rules_fx;
+		unsigned int i, start, rules_fx;
 
 		start = first_rule;
 		rules_fx = rules_f0;
@@ -1986,7 +1997,7 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set,
 {
 	struct nft_pipapo *priv = nft_set_priv(set);
 	struct nft_pipapo_match *m = priv->clone;
-	int rules_f0, first_rule = 0;
+	unsigned int rules_f0, first_rule = 0;
 	struct nft_pipapo_elem *e;
 	const u8 *data;
 
@@ -2051,7 +2062,7 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set,
 	struct net *net = read_pnet(&set->net);
 	const struct nft_pipapo_match *m;
 	const struct nft_pipapo_field *f;
-	int i, r;
+	unsigned int i, r;
 
 	rcu_read_lock();
 	if (iter->genmask == nft_genmask_cur(net))
@@ -2155,6 +2166,9 @@ static int nft_pipapo_init(const struct nft_set *set,
 
 	field_count = desc->field_count ? : 1;
 
+	BUILD_BUG_ON(NFT_PIPAPO_MAX_FIELDS > 255);
+	BUILD_BUG_ON(NFT_PIPAPO_MAX_FIELDS != NFT_REG32_COUNT);
+
 	if (field_count > NFT_PIPAPO_MAX_FIELDS)
 		return -EINVAL;
 
@@ -2176,7 +2190,11 @@ static int nft_pipapo_init(const struct nft_set *set,
 	rcu_head_init(&m->rcu);
 
 	nft_pipapo_for_each_field(f, i, m) {
-		int len = desc->field_len[i] ? : set->klen;
+		unsigned int len = desc->field_len[i] ? : set->klen;
+
+		/* f->groups is u8 */
+		BUILD_BUG_ON((NFT_PIPAPO_MAX_BYTES *
+			      BITS_PER_BYTE / NFT_PIPAPO_GROUP_BITS_LARGE_SET) >= 256);
 
 		f->bb = NFT_PIPAPO_GROUP_BITS_INIT;
 		f->groups = len * NFT_PIPAPO_GROUPS_PER_BYTE(f);
@@ -2185,7 +2203,7 @@ static int nft_pipapo_init(const struct nft_set *set,
 
 		f->bsize = 0;
 		f->rules = 0;
-		NFT_PIPAPO_LT_ASSIGN(f, NULL);
+		f->lt = NULL;
 		f->mt = NULL;
 	}
 
@@ -2221,7 +2239,7 @@ static void nft_set_pipapo_match_destroy(const struct nft_ctx *ctx,
 					 struct nft_pipapo_match *m)
 {
 	struct nft_pipapo_field *f;
-	int i, r;
+	unsigned int i, r;
 
 	for (i = 0, f = m->f; i < m->field_count - 1; i++, f++)
 		;
diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h
index 90d22d691afc..8d9486ae0c01 100644
--- a/net/netfilter/nft_set_pipapo.h
+++ b/net/netfilter/nft_set_pipapo.h
@@ -70,15 +70,9 @@
 #define NFT_PIPAPO_ALIGN_HEADROOM					\
 	(NFT_PIPAPO_ALIGN - ARCH_KMALLOC_MINALIGN)
 #define NFT_PIPAPO_LT_ALIGN(lt)		(PTR_ALIGN((lt), NFT_PIPAPO_ALIGN))
-#define NFT_PIPAPO_LT_ASSIGN(field, x)					\
-	do {								\
-		(field)->lt_aligned = NFT_PIPAPO_LT_ALIGN(x);		\
-		(field)->lt = (x);					\
-	} while (0)
 #else
 #define NFT_PIPAPO_ALIGN_HEADROOM	0
 #define NFT_PIPAPO_LT_ALIGN(lt)		(lt)
-#define NFT_PIPAPO_LT_ASSIGN(field, x)	((field)->lt = (x))
 #endif /* NFT_PIPAPO_ALIGN */
 
 #define nft_pipapo_for_each_field(field, index, match)		\
@@ -110,22 +104,18 @@ union nft_pipapo_map_bucket {
 
 /**
  * struct nft_pipapo_field - Lookup, mapping tables and related data for a field
- * @groups:	Amount of bit groups
  * @rules:	Number of inserted rules
  * @bsize:	Size of each bucket in lookup table, in longs
+ * @groups:	Amount of bit groups
  * @bb:		Number of bits grouped together in lookup table buckets
  * @lt:		Lookup table: 'groups' rows of buckets
- * @lt_aligned:	Version of @lt aligned to NFT_PIPAPO_ALIGN bytes
  * @mt:		Mapping table: one bucket per rule
  */
 struct nft_pipapo_field {
-	int groups;
-	unsigned long rules;
-	size_t bsize;
-	int bb;
-#ifdef NFT_PIPAPO_ALIGN
-	unsigned long *lt_aligned;
-#endif
+	unsigned int rules;
+	unsigned int bsize;
+	u8 groups;
+	u8 bb;
 	unsigned long *lt;
 	union nft_pipapo_map_bucket *mt;
 };
@@ -145,15 +135,15 @@ struct nft_pipapo_scratch {
 /**
  * struct nft_pipapo_match - Data used for lookup and matching
  * @field_count		Amount of fields in set
- * @scratch:		Preallocated per-CPU maps for partial matching results
  * @bsize_max:		Maximum lookup table bucket size of all fields, in longs
+ * @scratch:		Preallocated per-CPU maps for partial matching results
  * @rcu			Matching data is swapped on commits
  * @f:			Fields, with lookup and mapping tables
  */
 struct nft_pipapo_match {
-	int field_count;
+	u8 field_count;
+	unsigned int bsize_max;
 	struct nft_pipapo_scratch * __percpu *scratch;
-	size_t bsize_max;
 	struct rcu_head rcu;
 	struct nft_pipapo_field f[] __counted_by(field_count);
 };
@@ -186,7 +176,8 @@ struct nft_pipapo_elem {
 	struct nft_set_ext	ext;
 };
 
-int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst,
+int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules,
+		  unsigned long *dst,
 		  const union nft_pipapo_map_bucket *mt, bool match_only);
 
 /**
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH nf-next 4/4] netfilter: nft_set_pipapo: speed up bulk element insertions
  2024-02-12 10:01 [PATCH nf-next 0/4] netfilter: nft_set_pipapo: speed up bulk element insertions Florian Westphal
                   ` (2 preceding siblings ...)
  2024-02-12 10:01 ` [PATCH nf-next 3/4] netfilter: nft_set_pipapo: shrink data structures Florian Westphal
@ 2024-02-12 10:01 ` Florian Westphal
  2024-02-13 13:17   ` Stefano Brivio
  3 siblings, 1 reply; 11+ messages in thread
From: Florian Westphal @ 2024-02-12 10:01 UTC (permalink / raw)
  To: netfilter-devel; +Cc: sbrivio, Florian Westphal

Insertions into the set are slow when we try to add many elements.
For 800k elements I get:

time nft -f pipapo_800k
real    19m34.849s
user    0m2.390s
sys     19m12.828s

perf stats:
 --95.39%--nft_pipapo_insert
     |--76.60%--pipapo_insert
     |           --76.37%--pipapo_resize
     |                     |--72.87%--memcpy_orig
     |                     |--1.88%--__free_pages_ok
     |                     |          --0.89%--free_tail_page_prepare
     |                      --1.38%--kvmalloc_node
     ..
     --18.56%--pipapo_get.isra.0
     |--13.91%--__bitmap_and
     |--3.01%--pipapo_refill
     |--0.81%--__kmalloc
     |           --0.74%--__kmalloc_large_node
     |                      --0.66%--__alloc_pages
     ..
     --0.52%--memset_orig

So lots of time is spent in copying existing elements to make space for
the next one.

Instead of allocating to the exact size of the new rule count, allocate
extra slack to reduce alloc/copy/free overhead.

After:
time nft -f pipapo_800k
real    1m54.110s
user    0m2.515s
sys     1m51.377s

 --80.46%--nft_pipapo_insert
     |--73.45%--pipapo_get.isra.0
     |--57.63%--__bitmap_and
     |          |--8.52%--pipapo_refill
     |--3.45%--__kmalloc
     |           --3.05%--__kmalloc_large_node
     |                      --2.58%--__alloc_pages
     --2.59%--memset_orig
     |--6.51%--pipapo_insert
            --5.96%--pipapo_resize
                     |--3.63%--memcpy_orig
                     --2.13%--kvmalloc_node

The new @rules_alloc fills a hole, so struct size doesn't go up.
Also make it so rule removal doesn't shrink unless the free/extra space
exceeds two pages.  This should be safe as well:

When a rule gets removed, the attempt to lower the allocated size is
already allowed to fail.

Exception: do exact allocations as long as set is very small (less
than one page needed).

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/netfilter/nft_set_pipapo.c | 80 +++++++++++++++++++++++++++-------
 net/netfilter/nft_set_pipapo.h |  2 +
 2 files changed, 67 insertions(+), 15 deletions(-)

diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index a0ddf24a8052..25cdf64a3139 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -622,6 +622,62 @@ nft_pipapo_get(const struct net *net, const struct nft_set *set,
 	return &e->priv;
 }
 
+static int pipapo_realloc_mt(struct nft_pipapo_field *f, unsigned int old_rules, unsigned int rules)
+{
+	union nft_pipapo_map_bucket *new_mt = NULL, *old_mt = f->mt;
+	unsigned int extra = 4096 / sizeof(*new_mt);
+	unsigned int rules_alloc = rules;
+
+	might_sleep();
+
+	BUILD_BUG_ON(extra < 32);
+
+	if (unlikely(rules == 0))
+		goto out_free;
+
+	/* growing and enough space left, no action needed */
+	if (rules > old_rules && f->rules_alloc > rules)
+		return 0;
+
+	/* downsize and extra slack has not grown too large */
+	if (rules < old_rules) {
+		unsigned int remove = f->rules_alloc - rules;
+
+		if (remove < (2u * extra))
+			return 0;
+	}
+
+	/* small sets get precise count, else add extra slack
+	 * to avoid frequent reallocations.  Extra slack is
+	 * currently one 4k page worth of rules.
+	 *
+	 * Use no slack if the set only has a small number
+	 * of rules.
+	 */
+	if (rules > extra &&
+	    check_add_overflow(rules, extra, &rules_alloc))
+		return -EOVERFLOW;
+
+	new_mt = kvmalloc_array(rules_alloc, sizeof(*new_mt), GFP_KERNEL);
+	if (!new_mt)
+		return -ENOMEM;
+
+	if (old_mt)
+		memcpy(new_mt, old_mt, min(old_rules, rules) * sizeof(*new_mt));
+
+	if (rules > old_rules)
+		memset(new_mt + old_rules, 0,
+		       (rules - old_rules) * sizeof(*new_mt));
+
+out_free:
+	f->rules_alloc = rules_alloc;
+	f->mt = new_mt;
+
+	kvfree(old_mt);
+
+	return 0;
+}
+
 /**
  * pipapo_resize() - Resize lookup or mapping table, or both
  * @f:		Field containing lookup and mapping tables
@@ -637,9 +693,8 @@ nft_pipapo_get(const struct net *net, const struct nft_set *set,
 static int pipapo_resize(struct nft_pipapo_field *f, unsigned int old_rules, unsigned int rules)
 {
 	long *new_lt = NULL, *new_p, *old_lt = f->lt, *old_p;
-	union nft_pipapo_map_bucket *new_mt, *old_mt = f->mt;
 	unsigned int new_bucket_size, copy;
-	int group, bucket;
+	int group, bucket, err;
 
 	if (rules >= NFT_PIPAPO_RULE0_MAX)
 		return -ENOSPC;
@@ -682,16 +737,10 @@ static int pipapo_resize(struct nft_pipapo_field *f, unsigned int old_rules, uns
 	}
 
 mt:
-	new_mt = kvmalloc(rules * sizeof(*new_mt), GFP_KERNEL);
-	if (!new_mt) {
+	err = pipapo_realloc_mt(f, old_rules, rules);
+	if (err) {
 		kvfree(new_lt);
-		return -ENOMEM;
-	}
-
-	memcpy(new_mt, f->mt, min(old_rules, rules) * sizeof(*new_mt));
-	if (rules > old_rules) {
-		memset(new_mt + old_rules, 0,
-		       (rules - old_rules) * sizeof(*new_mt));
+		return err;
 	}
 
 	if (new_lt) {
@@ -700,9 +749,6 @@ static int pipapo_resize(struct nft_pipapo_field *f, unsigned int old_rules, uns
 		kvfree(old_lt);
 	}
 
-	f->mt = new_mt;
-	kvfree(old_mt);
-
 	return 0;
 }
 
@@ -1382,13 +1428,16 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old)
 		       src->groups * NFT_PIPAPO_BUCKETS(src->bb));
 
 		if (src->rules > 0) {
-			dst->mt = kvmalloc_array(src->rules, sizeof(*src->mt), GFP_KERNEL);
+			dst->mt = kvmalloc_array(src->rules_alloc, sizeof(*src->mt), GFP_KERNEL);
 			if (!dst->mt)
 				goto out_mt;
 
 			memcpy(dst->mt, src->mt, src->rules * sizeof(*src->mt));
+			dst->rules_alloc = src->rules_alloc;
+			dst->rules = src->rules;
 		} else {
 			dst->mt = NULL;
+			dst->rules_alloc = 0;
 		}
 
 		src++;
@@ -2203,6 +2252,7 @@ static int nft_pipapo_init(const struct nft_set *set,
 
 		f->bsize = 0;
 		f->rules = 0;
+		f->rules_alloc = 0;
 		f->lt = NULL;
 		f->mt = NULL;
 	}
diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h
index 8d9486ae0c01..bbcac2b38167 100644
--- a/net/netfilter/nft_set_pipapo.h
+++ b/net/netfilter/nft_set_pipapo.h
@@ -106,6 +106,7 @@ union nft_pipapo_map_bucket {
  * struct nft_pipapo_field - Lookup, mapping tables and related data for a field
  * @rules:	Number of inserted rules
  * @bsize:	Size of each bucket in lookup table, in longs
+ * @rules_alloc: Number of allocated rules, always >= rules
  * @groups:	Amount of bit groups
  * @bb:		Number of bits grouped together in lookup table buckets
  * @lt:		Lookup table: 'groups' rows of buckets
@@ -114,6 +115,7 @@ union nft_pipapo_map_bucket {
 struct nft_pipapo_field {
 	unsigned int rules;
 	unsigned int bsize;
+	unsigned int rules_alloc;
 	u8 groups;
 	u8 bb;
 	unsigned long *lt;
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH nf-next 1/4] netfilter: nft_set_pipapo: constify lookup fn args where possible
  2024-02-12 10:01 ` [PATCH nf-next 1/4] netfilter: nft_set_pipapo: constify lookup fn args where possible Florian Westphal
@ 2024-02-13  7:19   ` Stefano Brivio
  0 siblings, 0 replies; 11+ messages in thread
From: Stefano Brivio @ 2024-02-13  7:19 UTC (permalink / raw)
  To: Florian Westphal; +Cc: netfilter-devel

Nits only:

On Mon, 12 Feb 2024 11:01:50 +0100
Florian Westphal <fw@strlen.de> wrote:

> Those get called from packet path, content must not be modified.
> No functional changes intended.
> 
> Signed-off-by: Florian Westphal <fw@strlen.de>
> ---
>  net/netfilter/nft_set_pipapo.c      | 16 ++++++++--------
>  net/netfilter/nft_set_pipapo.h      |  6 +++---
>  net/netfilter/nft_set_pipapo_avx2.c | 26 +++++++++++++-------------
>  3 files changed, 24 insertions(+), 24 deletions(-)
> 
> diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
> index ed243edaa6a0..395420fa71e5 100644
> --- a/net/netfilter/nft_set_pipapo.c
> +++ b/net/netfilter/nft_set_pipapo.c
> @@ -360,7 +360,7 @@
>   * Return: -1 on no match, bit position on 'match_only', 0 otherwise.
>   */
>  int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst,
> -		  union nft_pipapo_map_bucket *mt, bool match_only)
> +		  const union nft_pipapo_map_bucket *mt, bool match_only)
>  {
>  	unsigned long bitset;
>  	int k, ret = -1;
> @@ -412,9 +412,9 @@ bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set,
>  	struct nft_pipapo_scratch *scratch;
>  	unsigned long *res_map, *fill_map;
>  	u8 genmask = nft_genmask_cur(net);
> +	const struct nft_pipapo_match *m;
> +	const struct nft_pipapo_field *f;
>  	const u8 *rp = (const u8 *)key;
> -	struct nft_pipapo_match *m;
> -	struct nft_pipapo_field *f;
>  	bool map_index;
>  	int i;
>  
> @@ -521,9 +521,9 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net,
>  {
>  	struct nft_pipapo_elem *ret = ERR_PTR(-ENOENT);
>  	struct nft_pipapo *priv = nft_set_priv(set);
> -	struct nft_pipapo_match *m = priv->clone;
> +	const struct nft_pipapo_match *m = priv->clone;

This should go one line below now and have a separate initialiser
(reverse Christmas tree).

>  	unsigned long *res_map, *fill_map = NULL;
> -	struct nft_pipapo_field *f;
> +	const struct nft_pipapo_field *f;
>  	int i;
>  
>  	res_map = kmalloc_array(m->bsize_max, sizeof(*res_map), GFP_ATOMIC);
> @@ -1599,7 +1599,7 @@ static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m)
>  
>  	while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) {
>  		union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS];
> -		struct nft_pipapo_field *f;
> +		const struct nft_pipapo_field *f;
>  		int i, start, rules_fx;
>  
>  		start = first_rule;
> @@ -2041,8 +2041,8 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set,
>  {
>  	struct nft_pipapo *priv = nft_set_priv(set);
>  	struct net *net = read_pnet(&set->net);
> -	struct nft_pipapo_match *m;
> -	struct nft_pipapo_field *f;
> +	const struct nft_pipapo_match *m;
> +	const struct nft_pipapo_field *f;
>  	int i, r;
>  
>  	rcu_read_lock();
> diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h
> index f59a0cd81105..90d22d691afc 100644
> --- a/net/netfilter/nft_set_pipapo.h
> +++ b/net/netfilter/nft_set_pipapo.h
> @@ -187,7 +187,7 @@ struct nft_pipapo_elem {
>  };
>  
>  int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst,
> -		  union nft_pipapo_map_bucket *mt, bool match_only);
> +		  const union nft_pipapo_map_bucket *mt, bool match_only);
>  
>  /**
>   * pipapo_and_field_buckets_4bit() - Intersect 4-bit buckets
> @@ -195,7 +195,7 @@ int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst,
>   * @dst:	Area to store result
>   * @data:	Input data selecting table buckets
>   */
> -static inline void pipapo_and_field_buckets_4bit(struct nft_pipapo_field *f,
> +static inline void pipapo_and_field_buckets_4bit(const struct nft_pipapo_field *f,
>  						 unsigned long *dst,
>  						 const u8 *data)
>  {
> @@ -223,7 +223,7 @@ static inline void pipapo_and_field_buckets_4bit(struct nft_pipapo_field *f,
>   * @dst:	Area to store result
>   * @data:	Input data selecting table buckets
>   */
> -static inline void pipapo_and_field_buckets_8bit(struct nft_pipapo_field *f,
> +static inline void pipapo_and_field_buckets_8bit(const struct nft_pipapo_field *f,
>  						 unsigned long *dst,
>  						 const u8 *data)
>  {
> diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c
> index 208e9d577347..d7bea311165f 100644
> --- a/net/netfilter/nft_set_pipapo_avx2.c
> +++ b/net/netfilter/nft_set_pipapo_avx2.c
> @@ -212,7 +212,7 @@ static int nft_pipapo_avx2_refill(int offset, unsigned long *map,
>   * word index to be checked next (i.e. first filled word).
>   */
>  static int nft_pipapo_avx2_lookup_4b_2(unsigned long *map, unsigned long *fill,
> -				       struct nft_pipapo_field *f, int offset,
> +				       const struct nft_pipapo_field *f, int offset,

This and all the changed declarations below exceed 80 columns but,
unlike the ones above, we could simply turn them into:

static int nft_pipapo_avx2_lookup_4b_2(unsigned long *map, unsigned long *fill,
				       const struct nft_pipapo_field *f,
				       int offset, const u8 *pkt,
				       bool first, bool last)

and so on.

>  				       const u8 *pkt, bool first, bool last)
>  {
>  	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
> @@ -274,7 +274,7 @@ static int nft_pipapo_avx2_lookup_4b_2(unsigned long *map, unsigned long *fill,
>   * word index to be checked next (i.e. first filled word).
>   */
>  static int nft_pipapo_avx2_lookup_4b_4(unsigned long *map, unsigned long *fill,
> -				       struct nft_pipapo_field *f, int offset,
> +				       const struct nft_pipapo_field *f, int offset,
>  				       const u8 *pkt, bool first, bool last)
>  {
>  	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
> @@ -350,7 +350,7 @@ static int nft_pipapo_avx2_lookup_4b_4(unsigned long *map, unsigned long *fill,
>   * word index to be checked next (i.e. first filled word).
>   */
>  static int nft_pipapo_avx2_lookup_4b_8(unsigned long *map, unsigned long *fill,
> -				       struct nft_pipapo_field *f, int offset,
> +				       const struct nft_pipapo_field *f, int offset,
>  				       const u8 *pkt, bool first, bool last)
>  {
>  	u8 pg[8] = {  pkt[0] >> 4,  pkt[0] & 0xf,  pkt[1] >> 4,  pkt[1] & 0xf,
> @@ -445,7 +445,7 @@ static int nft_pipapo_avx2_lookup_4b_8(unsigned long *map, unsigned long *fill,
>   * word index to be checked next (i.e. first filled word).
>   */
>  static int nft_pipapo_avx2_lookup_4b_12(unsigned long *map, unsigned long *fill,
> -				        struct nft_pipapo_field *f, int offset,
> +					const struct nft_pipapo_field *f, int offset,
>  				        const u8 *pkt, bool first, bool last)
>  {
>  	u8 pg[12] = {  pkt[0] >> 4,  pkt[0] & 0xf,  pkt[1] >> 4,  pkt[1] & 0xf,
> @@ -534,7 +534,7 @@ static int nft_pipapo_avx2_lookup_4b_12(unsigned long *map, unsigned long *fill,
>   * word index to be checked next (i.e. first filled word).
>   */
>  static int nft_pipapo_avx2_lookup_4b_32(unsigned long *map, unsigned long *fill,
> -					struct nft_pipapo_field *f, int offset,
> +					const struct nft_pipapo_field *f, int offset,
>  					const u8 *pkt, bool first, bool last)
>  {
>  	u8 pg[32] = {  pkt[0] >> 4,  pkt[0] & 0xf,  pkt[1] >> 4,  pkt[1] & 0xf,
> @@ -669,7 +669,7 @@ static int nft_pipapo_avx2_lookup_4b_32(unsigned long *map, unsigned long *fill,
>   * word index to be checked next (i.e. first filled word).
>   */
>  static int nft_pipapo_avx2_lookup_8b_1(unsigned long *map, unsigned long *fill,
> -				       struct nft_pipapo_field *f, int offset,
> +				       const struct nft_pipapo_field *f, int offset,
>  				       const u8 *pkt, bool first, bool last)
>  {
>  	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
> @@ -726,7 +726,7 @@ static int nft_pipapo_avx2_lookup_8b_1(unsigned long *map, unsigned long *fill,
>   * word index to be checked next (i.e. first filled word).
>   */
>  static int nft_pipapo_avx2_lookup_8b_2(unsigned long *map, unsigned long *fill,
> -				       struct nft_pipapo_field *f, int offset,
> +				       const struct nft_pipapo_field *f, int offset,
>  				       const u8 *pkt, bool first, bool last)
>  {
>  	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
> @@ -790,7 +790,7 @@ static int nft_pipapo_avx2_lookup_8b_2(unsigned long *map, unsigned long *fill,
>   * word index to be checked next (i.e. first filled word).
>   */
>  static int nft_pipapo_avx2_lookup_8b_4(unsigned long *map, unsigned long *fill,
> -				       struct nft_pipapo_field *f, int offset,
> +				       const struct nft_pipapo_field *f, int offset,
>  				       const u8 *pkt, bool first, bool last)
>  {
>  	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
> @@ -865,7 +865,7 @@ static int nft_pipapo_avx2_lookup_8b_4(unsigned long *map, unsigned long *fill,
>   * word index to be checked next (i.e. first filled word).
>   */
>  static int nft_pipapo_avx2_lookup_8b_6(unsigned long *map, unsigned long *fill,
> -				       struct nft_pipapo_field *f, int offset,
> +				       const struct nft_pipapo_field *f, int offset,
>  				       const u8 *pkt, bool first, bool last)
>  {
>  	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
> @@ -950,7 +950,7 @@ static int nft_pipapo_avx2_lookup_8b_6(unsigned long *map, unsigned long *fill,
>   * word index to be checked next (i.e. first filled word).
>   */
>  static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill,
> -					struct nft_pipapo_field *f, int offset,
> +					const struct nft_pipapo_field *f, int offset,
>  					const u8 *pkt, bool first, bool last)
>  {
>  	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
> @@ -1042,7 +1042,7 @@ static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill,
>   * word index to be checked next (i.e. first filled word).
>   */
>  static int nft_pipapo_avx2_lookup_slow(unsigned long *map, unsigned long *fill,
> -					struct nft_pipapo_field *f, int offset,
> +					const struct nft_pipapo_field *f, int offset,
>  					const u8 *pkt, bool first, bool last)
>  {
>  	unsigned long bsize = f->bsize;
> @@ -1119,9 +1119,9 @@ bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
>  	struct nft_pipapo *priv = nft_set_priv(set);
>  	struct nft_pipapo_scratch *scratch;
>  	u8 genmask = nft_genmask_cur(net);
> +	const struct nft_pipapo_match *m;
> +	const struct nft_pipapo_field *f;
>  	const u8 *rp = (const u8 *)key;
> -	struct nft_pipapo_match *m;
> -	struct nft_pipapo_field *f;
>  	unsigned long *res, *fill;
>  	bool map_index;
>  	int i, ret = 0;

-- 
Stefano


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH nf-next 2/4] netfilter: nft_set_pipapo: do not rely on ZERO_SIZE_PTR
  2024-02-12 10:01 ` [PATCH nf-next 2/4] netfilter: nft_set_pipapo: do not rely on ZERO_SIZE_PTR Florian Westphal
@ 2024-02-13  7:20   ` Stefano Brivio
  2024-02-13 11:09     ` Florian Westphal
  0 siblings, 1 reply; 11+ messages in thread
From: Stefano Brivio @ 2024-02-13  7:20 UTC (permalink / raw)
  To: Florian Westphal; +Cc: netfilter-devel

On Mon, 12 Feb 2024 11:01:51 +0100
Florian Westphal <fw@strlen.de> wrote:

> pipapo relies on kmalloc(0) returning ZERO_SIZE_PTR (i.e., not NULL
> but pointer is invalid).
> 
> Rework this to not call slab allocator when we'd request a 0-byte
> allocation.
> 
> While at it, also use GFP_KERNEL allocations here, this is only called
> from control plane.
> 
> Signed-off-by: Florian Westphal <fw@strlen.de>
> ---
>  net/netfilter/nft_set_pipapo.c | 20 ++++++++++++++------
>  1 file changed, 14 insertions(+), 6 deletions(-)
> 
> diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
> index 395420fa71e5..6a79ec98de86 100644
> --- a/net/netfilter/nft_set_pipapo.c
> +++ b/net/netfilter/nft_set_pipapo.c
> @@ -526,13 +526,16 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net,
>  	const struct nft_pipapo_field *f;
>  	int i;
>  
> -	res_map = kmalloc_array(m->bsize_max, sizeof(*res_map), GFP_ATOMIC);
> +	if (m->bsize_max == 0)
> +		return ret;
> +
> +	res_map = kmalloc_array(m->bsize_max, sizeof(*res_map), GFP_KERNEL);
>  	if (!res_map) {
>  		ret = ERR_PTR(-ENOMEM);
>  		goto out;
>  	}
>  
> -	fill_map = kcalloc(m->bsize_max, sizeof(*res_map), GFP_ATOMIC);
> +	fill_map = kcalloc(m->bsize_max, sizeof(*res_map), GFP_KERNEL);

I haven't re-checked the whole logic, but can't nft_pipapo_deactivate()
(hence pipapo_deactivate() and pipapo_get()) be called from the data
path for some reason?

If I recall correctly that's why I used GFP_ATOMIC here, but I'm not
sure anymore and I guess you know better.

>  	if (!fill_map) {
>  		ret = ERR_PTR(-ENOMEM);
>  		goto out;
> @@ -1367,11 +1370,16 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old)
>  		       src->bsize * sizeof(*dst->lt) *
>  		       src->groups * NFT_PIPAPO_BUCKETS(src->bb));
>  
> -		dst->mt = kvmalloc(src->rules * sizeof(*src->mt), GFP_KERNEL);
> -		if (!dst->mt)
> -			goto out_mt;
> +		if (src->rules > 0) {
> +			dst->mt = kvmalloc_array(src->rules, sizeof(*src->mt), GFP_KERNEL);

Nit: equally readable within 80 columns:

			dst->mt = kvmalloc_array(src->rules, sizeof(*src->mt),
						 GFP_KERNEL);

> +			if (!dst->mt)
> +				goto out_mt;
> +
> +			memcpy(dst->mt, src->mt, src->rules * sizeof(*src->mt));
> +		} else {
> +			dst->mt = NULL;
> +		}
>  
> -		memcpy(dst->mt, src->mt, src->rules * sizeof(*src->mt));
>  		src++;
>  		dst++;
>  	}

-- 
Stefano


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH nf-next 2/4] netfilter: nft_set_pipapo: do not rely on ZERO_SIZE_PTR
  2024-02-13  7:20   ` Stefano Brivio
@ 2024-02-13 11:09     ` Florian Westphal
  0 siblings, 0 replies; 11+ messages in thread
From: Florian Westphal @ 2024-02-13 11:09 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: Florian Westphal, netfilter-devel

Stefano Brivio <sbrivio@redhat.com> wrote:
> > -	fill_map = kcalloc(m->bsize_max, sizeof(*res_map), GFP_ATOMIC);
> > +	fill_map = kcalloc(m->bsize_max, sizeof(*res_map), GFP_KERNEL);
> 
> I haven't re-checked the whole logic, but can't nft_pipapo_deactivate()
> (hence pipapo_deactivate() and pipapo_get()) be called from the data
> path for some reason?

Not that I know.  Deactivate turns off an element in the next generation
and that concept is tied to the transaction, and that needs the nft
mutex.

> If I recall correctly that's why I used GFP_ATOMIC here, but I'm not
> sure anymore and I guess you know better.

I'll have a look at original version to see if there was a reason
for this, if so I'll update commit message or move this to its own
change.

> > +		if (src->rules > 0) {
> > +			dst->mt = kvmalloc_array(src->rules, sizeof(*src->mt), GFP_KERNEL);
> 
> Nit: equally readable within 80 columns:
> 
> 			dst->mt = kvmalloc_array(src->rules, sizeof(*src->mt),
> 						 GFP_KERNEL);

OK. I'll reformat.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH nf-next 3/4] netfilter: nft_set_pipapo: shrink data structures
  2024-02-12 10:01 ` [PATCH nf-next 3/4] netfilter: nft_set_pipapo: shrink data structures Florian Westphal
@ 2024-02-13 13:17   ` Stefano Brivio
  0 siblings, 0 replies; 11+ messages in thread
From: Stefano Brivio @ 2024-02-13 13:17 UTC (permalink / raw)
  To: Florian Westphal; +Cc: netfilter-devel

On Mon, 12 Feb 2024 11:01:52 +0100
Florian Westphal <fw@strlen.de> wrote:

> The set uses a mix of 'int', 'unsigned int', and size_t.
> 
> The rule count limit is NFT_PIPAPO_RULE0_MAX, which cannot
> exceed INT_MAX (a few helpers use 'int' as return type).
> 
> Add a compile-time assertion for this.
> 
> Replace size_t usage in structs with unsigned int or u8 where
> the stored values are smaller.
> 
> Replace signed-int arguments for lengths with 'unsigned int'
> where possible.
> 
> Last, remove lt_aligned member: it's set but never read.
> 
> struct nft_pipapo_match 40 bytes -> 32 bytes
> struct nft_pipapo_field 56 bytes -> 32 bytes
> 
> Signed-off-by: Florian Westphal <fw@strlen.de>
> ---
>  net/netfilter/nft_set_pipapo.c | 60 ++++++++++++++++++++++------------
>  net/netfilter/nft_set_pipapo.h | 29 ++++++----------
>  2 files changed, 49 insertions(+), 40 deletions(-)
> 
> diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
> index 6a79ec98de86..a0ddf24a8052 100644
> --- a/net/netfilter/nft_set_pipapo.c
> +++ b/net/netfilter/nft_set_pipapo.c
> @@ -359,11 +359,13 @@
>   *
>   * Return: -1 on no match, bit position on 'match_only', 0 otherwise.
>   */
> -int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst,
> +int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules,
> +		  unsigned long *dst,
>  		  const union nft_pipapo_map_bucket *mt, bool match_only)
>  {
>  	unsigned long bitset;
> -	int k, ret = -1;
> +	unsigned int k;
> +	int ret = -1;
>  
>  	for (k = 0; k < len; k++) {
>  		bitset = map[k];
> @@ -632,13 +634,16 @@ nft_pipapo_get(const struct net *net, const struct nft_set *set,
>   *
>   * Return: 0 on success, -ENOMEM on allocation failure.
>   */
> -static int pipapo_resize(struct nft_pipapo_field *f, int old_rules, int rules)
> +static int pipapo_resize(struct nft_pipapo_field *f, unsigned int old_rules, unsigned int rules)

Nit:

static int pipapo_resize(struct nft_pipapo_field *f,
			 unsigned int old_rules, unsigned int rules)

without losing readability.

>  {
>  	long *new_lt = NULL, *new_p, *old_lt = f->lt, *old_p;
>  	union nft_pipapo_map_bucket *new_mt, *old_mt = f->mt;
> -	size_t new_bucket_size, copy;
> +	unsigned int new_bucket_size, copy;
>  	int group, bucket;
>  
> +	if (rules >= NFT_PIPAPO_RULE0_MAX)
> +		return -ENOSPC;
> +
>  	new_bucket_size = DIV_ROUND_UP(rules, BITS_PER_LONG);
>  #ifdef NFT_PIPAPO_ALIGN
>  	new_bucket_size = roundup(new_bucket_size,
> @@ -691,7 +696,7 @@ static int pipapo_resize(struct nft_pipapo_field *f, int old_rules, int rules)
>  
>  	if (new_lt) {
>  		f->bsize = new_bucket_size;
> -		NFT_PIPAPO_LT_ASSIGN(f, new_lt);
> +		f->lt = new_lt;
>  		kvfree(old_lt);
>  	}
>  
> @@ -848,8 +853,8 @@ static void pipapo_lt_8b_to_4b(int old_groups, int bsize,
>   */
>  static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f)
>  {
> +	unsigned int groups, bb;
>  	unsigned long *new_lt;
> -	int groups, bb;
>  	size_t lt_size;
>  
>  	lt_size = f->groups * NFT_PIPAPO_BUCKETS(f->bb) * f->bsize *
> @@ -899,7 +904,7 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f)
>  	f->groups = groups;
>  	f->bb = bb;
>  	kvfree(f->lt);
> -	NFT_PIPAPO_LT_ASSIGN(f, new_lt);
> +	f->lt = new_lt;
>  }
>  
>  /**
> @@ -916,7 +921,7 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f)
>  static int pipapo_insert(struct nft_pipapo_field *f, const uint8_t *k,
>  			 int mask_bits)
>  {
> -	int rule = f->rules, group, ret, bit_offset = 0;
> +	unsigned int rule = f->rules, group, ret, bit_offset = 0;
>  
>  	ret = pipapo_resize(f, f->rules, f->rules + 1);
>  	if (ret)
> @@ -1256,8 +1261,14 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
>  	/* Validate */
>  	start_p = start;
>  	end_p = end;
> +
> +	/* some helpers return -1, or >= 0 for valid rule pos,
> +	 * so we cannot support more than INT_MAX rules at this time.
> +	 */
> +	BUILD_BUG_ON(NFT_PIPAPO_RULE0_MAX > INT_MAX);
> +
>  	nft_pipapo_for_each_field(f, i, m) {
> -		if (f->rules >= (unsigned long)NFT_PIPAPO_RULE0_MAX)
> +		if (f->rules >= NFT_PIPAPO_RULE0_MAX)
>  			return -ENOSPC;
>  
>  		if (memcmp(start_p, end_p,
> @@ -1363,7 +1374,7 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old)
>  		if (!new_lt)
>  			goto out_lt;
>  
> -		NFT_PIPAPO_LT_ASSIGN(dst, new_lt);
> +		dst->lt = new_lt;
>  
>  		memcpy(NFT_PIPAPO_LT_ALIGN(new_lt),
>  		       NFT_PIPAPO_LT_ALIGN(src->lt),
> @@ -1433,10 +1444,10 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old)
>   *
>   * Return: Number of rules that originated from the same entry as @first.
>   */
> -static int pipapo_rules_same_key(struct nft_pipapo_field *f, int first)
> +static unsigned int pipapo_rules_same_key(struct nft_pipapo_field *f, unsigned int first)
>  {
>  	struct nft_pipapo_elem *e = NULL; /* Keep gcc happy */
> -	int r;
> +	unsigned int r;
>  
>  	for (r = first; r < f->rules; r++) {
>  		if (r != first && e != f->mt[r].e)
> @@ -1489,8 +1500,8 @@ static int pipapo_rules_same_key(struct nft_pipapo_field *f, int first)
>   *                        0      1      2
>   *  element pointers:  0x42   0x42   0x44
>   */
> -static void pipapo_unmap(union nft_pipapo_map_bucket *mt, int rules,
> -			 int start, int n, int to_offset, bool is_last)
> +static void pipapo_unmap(union nft_pipapo_map_bucket *mt, unsigned int rules,
> +			 unsigned int start, unsigned int n, unsigned int to_offset, bool is_last)

Same here,

static void pipapo_unmap(union nft_pipapo_map_bucket *mt, unsigned int rules,
			 unsigned int start, unsigned int n,
			 unsigned int to_offset, bool is_last)

?

>  {
>  	int i;
>  
> @@ -1596,8 +1607,8 @@ static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m)
>  {
>  	struct nft_pipapo *priv = nft_set_priv(set);
>  	struct net *net = read_pnet(&set->net);
> +	unsigned int rules_f0, first_rule = 0;
>  	u64 tstamp = nft_net_tstamp(net);
> -	int rules_f0, first_rule = 0;
>  	struct nft_pipapo_elem *e;
>  	struct nft_trans_gc *gc;
>  
> @@ -1608,7 +1619,7 @@ static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m)
>  	while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) {
>  		union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS];
>  		const struct nft_pipapo_field *f;
> -		int i, start, rules_fx;
> +		unsigned int i, start, rules_fx;
>  
>  		start = first_rule;
>  		rules_fx = rules_f0;
> @@ -1986,7 +1997,7 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set,
>  {
>  	struct nft_pipapo *priv = nft_set_priv(set);
>  	struct nft_pipapo_match *m = priv->clone;
> -	int rules_f0, first_rule = 0;
> +	unsigned int rules_f0, first_rule = 0;
>  	struct nft_pipapo_elem *e;
>  	const u8 *data;
>  
> @@ -2051,7 +2062,7 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set,
>  	struct net *net = read_pnet(&set->net);
>  	const struct nft_pipapo_match *m;
>  	const struct nft_pipapo_field *f;
> -	int i, r;
> +	unsigned int i, r;
>  
>  	rcu_read_lock();
>  	if (iter->genmask == nft_genmask_cur(net))
> @@ -2155,6 +2166,9 @@ static int nft_pipapo_init(const struct nft_set *set,
>  
>  	field_count = desc->field_count ? : 1;
>  
> +	BUILD_BUG_ON(NFT_PIPAPO_MAX_FIELDS > 255);
> +	BUILD_BUG_ON(NFT_PIPAPO_MAX_FIELDS != NFT_REG32_COUNT);
> +
>  	if (field_count > NFT_PIPAPO_MAX_FIELDS)
>  		return -EINVAL;
>  
> @@ -2176,7 +2190,11 @@ static int nft_pipapo_init(const struct nft_set *set,
>  	rcu_head_init(&m->rcu);
>  
>  	nft_pipapo_for_each_field(f, i, m) {
> -		int len = desc->field_len[i] ? : set->klen;
> +		unsigned int len = desc->field_len[i] ? : set->klen;
> +
> +		/* f->groups is u8 */
> +		BUILD_BUG_ON((NFT_PIPAPO_MAX_BYTES *
> +			      BITS_PER_BYTE / NFT_PIPAPO_GROUP_BITS_LARGE_SET) >= 256);
>  
>  		f->bb = NFT_PIPAPO_GROUP_BITS_INIT;
>  		f->groups = len * NFT_PIPAPO_GROUPS_PER_BYTE(f);
> @@ -2185,7 +2203,7 @@ static int nft_pipapo_init(const struct nft_set *set,
>  
>  		f->bsize = 0;
>  		f->rules = 0;
> -		NFT_PIPAPO_LT_ASSIGN(f, NULL);
> +		f->lt = NULL;
>  		f->mt = NULL;
>  	}
>  
> @@ -2221,7 +2239,7 @@ static void nft_set_pipapo_match_destroy(const struct nft_ctx *ctx,
>  					 struct nft_pipapo_match *m)
>  {
>  	struct nft_pipapo_field *f;
> -	int i, r;
> +	unsigned int i, r;
>  
>  	for (i = 0, f = m->f; i < m->field_count - 1; i++, f++)
>  		;
> diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h
> index 90d22d691afc..8d9486ae0c01 100644
> --- a/net/netfilter/nft_set_pipapo.h
> +++ b/net/netfilter/nft_set_pipapo.h
> @@ -70,15 +70,9 @@
>  #define NFT_PIPAPO_ALIGN_HEADROOM					\
>  	(NFT_PIPAPO_ALIGN - ARCH_KMALLOC_MINALIGN)
>  #define NFT_PIPAPO_LT_ALIGN(lt)		(PTR_ALIGN((lt), NFT_PIPAPO_ALIGN))
> -#define NFT_PIPAPO_LT_ASSIGN(field, x)					\
> -	do {								\
> -		(field)->lt_aligned = NFT_PIPAPO_LT_ALIGN(x);		\
> -		(field)->lt = (x);					\
> -	} while (0)
>  #else
>  #define NFT_PIPAPO_ALIGN_HEADROOM	0
>  #define NFT_PIPAPO_LT_ALIGN(lt)		(lt)
> -#define NFT_PIPAPO_LT_ASSIGN(field, x)	((field)->lt = (x))
>  #endif /* NFT_PIPAPO_ALIGN */
>  
>  #define nft_pipapo_for_each_field(field, index, match)		\
> @@ -110,22 +104,18 @@ union nft_pipapo_map_bucket {
>  
>  /**
>   * struct nft_pipapo_field - Lookup, mapping tables and related data for a field
> - * @groups:	Amount of bit groups
>   * @rules:	Number of inserted rules
>   * @bsize:	Size of each bucket in lookup table, in longs
> + * @groups:	Amount of bit groups
>   * @bb:		Number of bits grouped together in lookup table buckets
>   * @lt:		Lookup table: 'groups' rows of buckets
> - * @lt_aligned:	Version of @lt aligned to NFT_PIPAPO_ALIGN bytes
>   * @mt:		Mapping table: one bucket per rule
>   */
>  struct nft_pipapo_field {
> -	int groups;
> -	unsigned long rules;
> -	size_t bsize;
> -	int bb;
> -#ifdef NFT_PIPAPO_ALIGN
> -	unsigned long *lt_aligned;
> -#endif
> +	unsigned int rules;
> +	unsigned int bsize;
> +	u8 groups;
> +	u8 bb;
>  	unsigned long *lt;
>  	union nft_pipapo_map_bucket *mt;
>  };
> @@ -145,15 +135,15 @@ struct nft_pipapo_scratch {
>  /**
>   * struct nft_pipapo_match - Data used for lookup and matching
>   * @field_count		Amount of fields in set
> - * @scratch:		Preallocated per-CPU maps for partial matching results
>   * @bsize_max:		Maximum lookup table bucket size of all fields, in longs
> + * @scratch:		Preallocated per-CPU maps for partial matching results
>   * @rcu			Matching data is swapped on commits
>   * @f:			Fields, with lookup and mapping tables
>   */
>  struct nft_pipapo_match {
> -	int field_count;
> +	u8 field_count;
> +	unsigned int bsize_max;
>  	struct nft_pipapo_scratch * __percpu *scratch;
> -	size_t bsize_max;
>  	struct rcu_head rcu;
>  	struct nft_pipapo_field f[] __counted_by(field_count);
>  };
> @@ -186,7 +176,8 @@ struct nft_pipapo_elem {
>  	struct nft_set_ext	ext;
>  };
>  
> -int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst,
> +int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules,
> +		  unsigned long *dst,
>  		  const union nft_pipapo_map_bucket *mt, bool match_only);
>  
>  /**

-- 
Stefano


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH nf-next 4/4] netfilter: nft_set_pipapo: speed up bulk element insertions
  2024-02-12 10:01 ` [PATCH nf-next 4/4] netfilter: nft_set_pipapo: speed up bulk element insertions Florian Westphal
@ 2024-02-13 13:17   ` Stefano Brivio
  2024-02-13 13:29     ` Florian Westphal
  0 siblings, 1 reply; 11+ messages in thread
From: Stefano Brivio @ 2024-02-13 13:17 UTC (permalink / raw)
  To: Florian Westphal; +Cc: netfilter-devel

On Mon, 12 Feb 2024 11:01:53 +0100
Florian Westphal <fw@strlen.de> wrote:

> Insertions into the set are slow when we try to add many elements.
> For 800k elements I get:
> 
> time nft -f pipapo_800k
> real    19m34.849s
> user    0m2.390s
> sys     19m12.828s

Whoops.

> perf stats:
>  --95.39%--nft_pipapo_insert
>      |--76.60%--pipapo_insert
>      |           --76.37%--pipapo_resize
>      |                     |--72.87%--memcpy_orig
>      |                     |--1.88%--__free_pages_ok
>      |                     |          --0.89%--free_tail_page_prepare
>      |                      --1.38%--kvmalloc_node
>      ..
>      --18.56%--pipapo_get.isra.0
>      |--13.91%--__bitmap_and
>      |--3.01%--pipapo_refill
>      |--0.81%--__kmalloc
>      |           --0.74%--__kmalloc_large_node
>      |                      --0.66%--__alloc_pages
>      ..
>      --0.52%--memset_orig
> 
> So lots of time is spent in copying existing elements to make space for
> the next one.
> 
> Instead of allocating to the exact size of the new rule count, allocate
> extra slack to reduce alloc/copy/free overhead.
> 
> After:
> time nft -f pipapo_800k
> real    1m54.110s
> user    0m2.515s
> sys     1m51.377s

That's quite an improvement, thanks for fixing this!

> 
>  --80.46%--nft_pipapo_insert
>      |--73.45%--pipapo_get.isra.0
>      |--57.63%--__bitmap_and
>      |          |--8.52%--pipapo_refill
>      |--3.45%--__kmalloc
>      |           --3.05%--__kmalloc_large_node
>      |                      --2.58%--__alloc_pages
>      --2.59%--memset_orig
>      |--6.51%--pipapo_insert
>             --5.96%--pipapo_resize
>                      |--3.63%--memcpy_orig
>                      --2.13%--kvmalloc_node
> 
> The new @rules_alloc fills a hole, so struct size doesn't go up.
> Also make it so rule removal doesn't shrink unless the free/extra space
> exceeds two pages.  This should be safe as well:
> 
> When a rule gets removed, the attempt to lower the allocated size is
> already allowed to fail.
> 
> Exception: do exact allocations as long as set is very small (less
> than one page needed).
> 
> Signed-off-by: Florian Westphal <fw@strlen.de>
> ---
>  net/netfilter/nft_set_pipapo.c | 80 +++++++++++++++++++++++++++-------
>  net/netfilter/nft_set_pipapo.h |  2 +
>  2 files changed, 67 insertions(+), 15 deletions(-)
> 
> diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
> index a0ddf24a8052..25cdf64a3139 100644
> --- a/net/netfilter/nft_set_pipapo.c
> +++ b/net/netfilter/nft_set_pipapo.c
> @@ -622,6 +622,62 @@ nft_pipapo_get(const struct net *net, const struct nft_set *set,
>  	return &e->priv;
>  }
>  
> +static int pipapo_realloc_mt(struct nft_pipapo_field *f, unsigned int old_rules, unsigned int rules)

Nit:

/** pipapo_realloc_mt() - Reallocate mapping table if needed upon resize
 * @f:		Field containing mapping table
 * @old_rules:	Amount of existing mapped rules
 * @rules:	Amount of new rules to map
 *
 * Return: 0 on success, negative error code on failure.
 */
static int pipapo_realloc_mt(struct nft_pipapo_field *f,
			     unsigned int old_rules, unsigned int rules)

> +{
> +	union nft_pipapo_map_bucket *new_mt = NULL, *old_mt = f->mt;
> +	unsigned int extra = 4096 / sizeof(*new_mt);

Shouldn't we actually use PAGE_SIZE? I think the one-page limit is
somewhat arbitrary but makes sense, so it should be 64k on e.g.
CONFIG_PPC_64K_PAGES=y.

> +	unsigned int rules_alloc = rules;
> +
> +	might_sleep();
> +
> +	BUILD_BUG_ON(extra < 32);

I'm not entirely sure why this would be a problem. I mean, 'extra' at
this point is the number of extra rules, not the amount of extra
bytes, right?

> +
> +	if (unlikely(rules == 0))
> +		goto out_free;
> +
> +	/* growing and enough space left, no action needed */
> +	if (rules > old_rules && f->rules_alloc > rules)
> +		return 0;
> +
> +	/* downsize and extra slack has not grown too large */
> +	if (rules < old_rules) {
> +		unsigned int remove = f->rules_alloc - rules;
> +
> +		if (remove < (2u * extra))
> +			return 0;
> +	}
> +
> +	/* small sets get precise count, else add extra slack
> +	 * to avoid frequent reallocations.  Extra slack is
> +	 * currently one 4k page worth of rules.
> +	 *
> +	 * Use no slack if the set only has a small number
> +	 * of rules.

This isn't always true: if we slightly decrease the size of a small
mapping table, we might leave some slack, because we might hit the
(remove < (2u * extra)) condition above. Is that intended? It doesn't
look problematic to me, by the way.

> +	 */
> +	if (rules > extra &&
> +	    check_add_overflow(rules, extra, &rules_alloc))
> +		return -EOVERFLOW;
> +
> +	new_mt = kvmalloc_array(rules_alloc, sizeof(*new_mt), GFP_KERNEL);
> +	if (!new_mt)
> +		return -ENOMEM;
> +
> +	if (old_mt)
> +		memcpy(new_mt, old_mt, min(old_rules, rules) * sizeof(*new_mt));
> +
> +	if (rules > old_rules)

Nit: curly braces around multi-line block (for consistency).

> +		memset(new_mt + old_rules, 0,
> +		       (rules - old_rules) * sizeof(*new_mt));
> +
> +out_free:
> +	f->rules_alloc = rules_alloc;
> +	f->mt = new_mt;
> +
> +	kvfree(old_mt);
> +
> +	return 0;
> +}
> +
>  /**
>   * pipapo_resize() - Resize lookup or mapping table, or both
>   * @f:		Field containing lookup and mapping tables
> @@ -637,9 +693,8 @@ nft_pipapo_get(const struct net *net, const struct nft_set *set,
>  static int pipapo_resize(struct nft_pipapo_field *f, unsigned int old_rules, unsigned int rules)
>  {
>  	long *new_lt = NULL, *new_p, *old_lt = f->lt, *old_p;
> -	union nft_pipapo_map_bucket *new_mt, *old_mt = f->mt;
>  	unsigned int new_bucket_size, copy;
> -	int group, bucket;
> +	int group, bucket, err;
>  
>  	if (rules >= NFT_PIPAPO_RULE0_MAX)
>  		return -ENOSPC;
> @@ -682,16 +737,10 @@ static int pipapo_resize(struct nft_pipapo_field *f, unsigned int old_rules, uns
>  	}
>  
>  mt:
> -	new_mt = kvmalloc(rules * sizeof(*new_mt), GFP_KERNEL);
> -	if (!new_mt) {
> +	err = pipapo_realloc_mt(f, old_rules, rules);
> +	if (err) {
>  		kvfree(new_lt);
> -		return -ENOMEM;
> -	}
> -
> -	memcpy(new_mt, f->mt, min(old_rules, rules) * sizeof(*new_mt));
> -	if (rules > old_rules) {
> -		memset(new_mt + old_rules, 0,
> -		       (rules - old_rules) * sizeof(*new_mt));
> +		return err;
>  	}
>  
>  	if (new_lt) {
> @@ -700,9 +749,6 @@ static int pipapo_resize(struct nft_pipapo_field *f, unsigned int old_rules, uns
>  		kvfree(old_lt);
>  	}
>  
> -	f->mt = new_mt;
> -	kvfree(old_mt);
> -
>  	return 0;
>  }
>  
> @@ -1382,13 +1428,16 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old)
>  		       src->groups * NFT_PIPAPO_BUCKETS(src->bb));
>  
>  		if (src->rules > 0) {
> -			dst->mt = kvmalloc_array(src->rules, sizeof(*src->mt), GFP_KERNEL);
> +			dst->mt = kvmalloc_array(src->rules_alloc, sizeof(*src->mt), GFP_KERNEL);
>  			if (!dst->mt)
>  				goto out_mt;
>  
>  			memcpy(dst->mt, src->mt, src->rules * sizeof(*src->mt));
> +			dst->rules_alloc = src->rules_alloc;
> +			dst->rules = src->rules;

These two, and setting rules_alloc below, shouldn't be needed, because we
already copy everything in the source field before the lookup table, above.

>  		} else {
>  			dst->mt = NULL;
> +			dst->rules_alloc = 0;
>  		}
>  
>  		src++;
> @@ -2203,6 +2252,7 @@ static int nft_pipapo_init(const struct nft_set *set,
>  
>  		f->bsize = 0;
>  		f->rules = 0;
> +		f->rules_alloc = 0;
>  		f->lt = NULL;
>  		f->mt = NULL;
>  	}
> diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h
> index 8d9486ae0c01..bbcac2b38167 100644
> --- a/net/netfilter/nft_set_pipapo.h
> +++ b/net/netfilter/nft_set_pipapo.h
> @@ -106,6 +106,7 @@ union nft_pipapo_map_bucket {
>   * struct nft_pipapo_field - Lookup, mapping tables and related data for a field
>   * @rules:	Number of inserted rules
>   * @bsize:	Size of each bucket in lookup table, in longs
> > + * @rules_alloc: Number of allocated rules, always >= rules
>   * @groups:	Amount of bit groups
>   * @bb:		Number of bits grouped together in lookup table buckets
>   * @lt:		Lookup table: 'groups' rows of buckets
> @@ -114,6 +115,7 @@ union nft_pipapo_map_bucket {
>  struct nft_pipapo_field {
>  	unsigned int rules;
>  	unsigned int bsize;
> +	unsigned int rules_alloc;
>  	u8 groups;
>  	u8 bb;
>  	unsigned long *lt;

-- 
Stefano


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH nf-next 4/4] netfilter: nft_set_pipapo: speed up bulk element insertions
  2024-02-13 13:17   ` Stefano Brivio
@ 2024-02-13 13:29     ` Florian Westphal
  0 siblings, 0 replies; 11+ messages in thread
From: Florian Westphal @ 2024-02-13 13:29 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: Florian Westphal, netfilter-devel

Stefano Brivio <sbrivio@redhat.com> wrote:
> /* pipapo_realloc_mt() - Reallocate mapping table if needed upon resize
>  * @f:		Field containing mapping table
>  * @old_rules:	Amount of existing mapped rules
>  * @rules:	Amount of new rules to map
>  *
>  * Return: 0 on success, negative error code on failure.
>  */

Thanks, I'll steal this for v2.

> static int pipapo_realloc_mt(struct nft_pipapo_field *f,
> 			     unsigned int old_rules, unsigned int rules)
> 
> > +{
> > +	union nft_pipapo_map_bucket *new_mt = NULL, *old_mt = f->mt;
> > +	unsigned int extra = 4096 / sizeof(*new_mt);
> 
> Shouldn't we actually use PAGE_SIZE? I think the one-page limit is
> somewhat arbitrary but makes sense, so it should be 64k on e.g.
> CONFIG_PPC_64K_PAGES=y.

I wasn't sure if it would make sense to use 64k for this batching,
I guess kvmalloc will use vmalloc anyway for such huge sets so I'll
change it back to PAGE_SIZE.

> > +	BUILD_BUG_ON(extra < 32);
> 
> I'm not entirely sure why this would be a problem. I mean, 'extra' at
> this point is the number of extra rules, not the amount of extra
> bytes, right?

Its a leftover from a version where this was bytes. I'll remove it.

> > +	/* small sets get precise count, else add extra slack
> > +	 * to avoid frequent reallocations.  Extra slack is
> > +	 * currently one 4k page worth of rules.
> > +	 *
> > +	 * Use no slack if the set only has a small number
> > +	 * of rules.
> 
> This isn't always true: if we slightly decrease the size of a small
> mapping table, we might leave some slack, because we might hit the
> (remove < (2u * extra)) condition above. Is that intended? It doesn't
> look problematic to me, by the way.

Ok. I'll amend the comment then.

> > +	if (old_mt)
> > +		memcpy(new_mt, old_mt, min(old_rules, rules) * sizeof(*new_mt));
> > +
> > +	if (rules > old_rules)
> 
> Nit: curly braces around multi-line block (for consistency).

Oh?  Guess I'll see if checkpatch complains...

> >  		if (src->rules > 0) {
> > -			dst->mt = kvmalloc_array(src->rules, sizeof(*src->mt), GFP_KERNEL);
> > +			dst->mt = kvmalloc_array(src->rules_alloc, sizeof(*src->mt), GFP_KERNEL);
> >  			if (!dst->mt)
> >  				goto out_mt;
> >  
> >  			memcpy(dst->mt, src->mt, src->rules * sizeof(*src->mt));
> > +			dst->rules_alloc = src->rules_alloc;
> > +			dst->rules = src->rules;
> 
> These two, and setting rules_alloc below, shouldn't be needed, because we
> already copy everything in the source field before the lookup table, above.

Ah you mean:
    memcpy(dst, src, offsetof(struct nft_pipapo_field, lt));

.. above.  Ok, I'll remove those lines then.

Thanks for reviewing!

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2024-02-13 13:29 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-02-12 10:01 [PATCH nf-next 0/4] netfilter: nft_set_pipapo: speed up bulk element insertions Florian Westphal
2024-02-12 10:01 ` [PATCH nf-next 1/4] netfilter: nft_set_pipapo: constify lookup fn args where possible Florian Westphal
2024-02-13  7:19   ` Stefano Brivio
2024-02-12 10:01 ` [PATCH nf-next 2/4] netfilter: nft_set_pipapo: do not rely on ZERO_SIZE_PTR Florian Westphal
2024-02-13  7:20   ` Stefano Brivio
2024-02-13 11:09     ` Florian Westphal
2024-02-12 10:01 ` [PATCH nf-next 3/4] netfilter: nft_set_pipapo: shrink data structures Florian Westphal
2024-02-13 13:17   ` Stefano Brivio
2024-02-12 10:01 ` [PATCH nf-next 4/4] netfilter: nft_set_pipapo: speed up bulk element insertions Florian Westphal
2024-02-13 13:17   ` Stefano Brivio
2024-02-13 13:29     ` Florian Westphal

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.