From: Stefano Brivio <sbrivio@redhat.com>
To: Florian Westphal <fw@strlen.de>
Cc: <netfilter-devel@vger.kernel.org>
Subject: Re: [PATCH nf-next 4/4] netfilter: nft_set_pipapo: speed up bulk element insertions
Date: Tue, 13 Feb 2024 14:17:53 +0100
Message-ID: <20240213141753.17ef27a6@elisabeth>
In-Reply-To: <20240212100202.10116-5-fw@strlen.de>
On Mon, 12 Feb 2024 11:01:53 +0100
Florian Westphal <fw@strlen.de> wrote:
> Insertions into the set are slow when we try to add many elements.
> For 800k elements I get:
>
> time nft -f pipapo_800k
> real 19m34.849s
> user 0m2.390s
> sys 19m12.828s
Whoops.
> perf stats:
> --95.39%--nft_pipapo_insert
> |--76.60%--pipapo_insert
> | --76.37%--pipapo_resize
> | |--72.87%--memcpy_orig
> | |--1.88%--__free_pages_ok
> | | --0.89%--free_tail_page_prepare
> | --1.38%--kvmalloc_node
> ..
> --18.56%--pipapo_get.isra.0
> |--13.91%--__bitmap_and
> |--3.01%--pipapo_refill
> |--0.81%--__kmalloc
> | --0.74%--__kmalloc_large_node
> | --0.66%--__alloc_pages
> ..
> --0.52%--memset_orig
>
> So lots of time is spent in copying existing elements to make space for
> the next one.
>
> Instead of allocating to the exact size of the new rule count, allocate
> extra slack to reduce alloc/copy/free overhead.
>
> After:
> time nft -f pipapo_800k
> real 1m54.110s
> user 0m2.515s
> sys 1m51.377s
That's quite an improvement, thanks for fixing this!
>
> --80.46%--nft_pipapo_insert
> |--73.45%--pipapo_get.isra.0
> |--57.63%--__bitmap_and
> | |--8.52%--pipapo_refill
> |--3.45%--__kmalloc
> | --3.05%--__kmalloc_large_node
> | --2.58%--__alloc_pages
> --2.59%--memset_orig
> |--6.51%--pipapo_insert
> --5.96%--pipapo_resize
> |--3.63%--memcpy_orig
> --2.13%--kvmalloc_node
>
> The new @rules_alloc fills a hole, so struct size doesn't go up.
> Also make it so rule removal doesn't shrink the mapping table unless
> the free/extra space exceeds two pages. This should be safe as well:
>
> When a rule gets removed, the attempt to lower the allocated size is
> already allowed to fail.
>
> Exception: do exact allocations as long as set is very small (less
> than one page needed).
>
> Signed-off-by: Florian Westphal <fw@strlen.de>
> ---
> net/netfilter/nft_set_pipapo.c | 80 +++++++++++++++++++++++++++-------
> net/netfilter/nft_set_pipapo.h | 2 +
> 2 files changed, 67 insertions(+), 15 deletions(-)
>
> diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
> index a0ddf24a8052..25cdf64a3139 100644
> --- a/net/netfilter/nft_set_pipapo.c
> +++ b/net/netfilter/nft_set_pipapo.c
> @@ -622,6 +622,62 @@ nft_pipapo_get(const struct net *net, const struct nft_set *set,
> return &e->priv;
> }
>
> +static int pipapo_realloc_mt(struct nft_pipapo_field *f, unsigned int old_rules, unsigned int rules)
Nit: kerneldoc comment missing, and this line exceeds 80 columns.
What about:
/* pipapo_realloc_mt() - Reallocate mapping table if needed upon resize
* @f: Field containing mapping table
* @old_rules: Amount of existing mapped rules
* @rules: Amount of new rules to map
*
* Return: 0 on success, negative error code on failure.
*/
static int pipapo_realloc_mt(struct nft_pipapo_field *f,
unsigned int old_rules, unsigned int rules)
> +{
> + union nft_pipapo_map_bucket *new_mt = NULL, *old_mt = f->mt;
> + unsigned int extra = 4096 / sizeof(*new_mt);
Shouldn't we actually use PAGE_SIZE? I think the one-page limit is
somewhat arbitrary but makes sense, so it should be 64k on e.g.
CONFIG_PPC_64K_PAGES=y.
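
Something like this, then (just a sketch, untested):

	/* one page worth of extra slack, whatever the page size is */
	unsigned int extra = PAGE_SIZE / sizeof(*new_mt);

...adjusting the comment below so that it doesn't mention 4k
explicitly.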
> + unsigned int rules_alloc = rules;
> +
> + might_sleep();
> +
> + BUILD_BUG_ON(extra < 32);
I'm not entirely sure why this would be a problem. I mean, 'extra' at
this point is the number of extra rules, not the amount of extra
bytes, right?
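
If my arithmetic is right, with 8-byte mapping buckets (64-bit) this
is 4096 / 8 = 512 rules, so the assertion could only trigger if
union nft_pipapo_map_bucket ever grew beyond 128 bytes. A comment
stating the intent would help here.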
> +
> + if (unlikely(rules == 0))
> + goto out_free;
> +
> + /* growing and enough space left, no action needed */
> + if (rules > old_rules && f->rules_alloc > rules)
> + return 0;
> +
> + /* downsize and extra slack has not grown too large */
> + if (rules < old_rules) {
> + unsigned int remove = f->rules_alloc - rules;
> +
> + if (remove < (2u * extra))
> + return 0;
> + }
> +
> + /* small sets get precise count, else add extra slack
> + * to avoid frequent reallocations. Extra slack is
> + * currently one 4k page worth of rules.
> + *
> + * Use no slack if the set only has a small number
> + * of rules.
This isn't always true: if we slightly decrease the size of a small
mapping table, we might leave some slack, because we might hit the
(remove < (2u * extra)) condition above. Is that intended? It doesn't
look problematic to me, by the way.
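For example, assuming 8-byte buckets, hence extra = 512: if
rules_alloc is 600 and we go down to 500 rules, remove is 100, less
than 1024, so we return early and keep the slack, even though rules
is now below 'extra'.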
> + */
> + if (rules > extra &&
> + check_add_overflow(rules, extra, &rules_alloc))
> + return -EOVERFLOW;
> +
> + new_mt = kvmalloc_array(rules_alloc, sizeof(*new_mt), GFP_KERNEL);
> + if (!new_mt)
> + return -ENOMEM;
> +
> + if (old_mt)
> + memcpy(new_mt, old_mt, min(old_rules, rules) * sizeof(*new_mt));
> +
> + if (rules > old_rules)
Nit: curly braces around multi-line block (for consistency).
> + memset(new_mt + old_rules, 0,
> + (rules - old_rules) * sizeof(*new_mt));
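...that is:

	if (rules > old_rules) {
		memset(new_mt + old_rules, 0,
		       (rules - old_rules) * sizeof(*new_mt));
	}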
> +
> +out_free:
> + f->rules_alloc = rules_alloc;
> + f->mt = new_mt;
> +
> + kvfree(old_mt);
> +
> + return 0;
> +}
> +
> /**
> * pipapo_resize() - Resize lookup or mapping table, or both
> * @f: Field containing lookup and mapping tables
> @@ -637,9 +693,8 @@ nft_pipapo_get(const struct net *net, const struct nft_set *set,
> static int pipapo_resize(struct nft_pipapo_field *f, unsigned int old_rules, unsigned int rules)
> {
> long *new_lt = NULL, *new_p, *old_lt = f->lt, *old_p;
> - union nft_pipapo_map_bucket *new_mt, *old_mt = f->mt;
> unsigned int new_bucket_size, copy;
> - int group, bucket;
> + int group, bucket, err;
>
> if (rules >= NFT_PIPAPO_RULE0_MAX)
> return -ENOSPC;
> @@ -682,16 +737,10 @@ static int pipapo_resize(struct nft_pipapo_field *f, unsigned int old_rules, uns
> }
>
> mt:
> - new_mt = kvmalloc(rules * sizeof(*new_mt), GFP_KERNEL);
> - if (!new_mt) {
> + err = pipapo_realloc_mt(f, old_rules, rules);
> + if (err) {
> kvfree(new_lt);
> - return -ENOMEM;
> - }
> -
> - memcpy(new_mt, f->mt, min(old_rules, rules) * sizeof(*new_mt));
> - if (rules > old_rules) {
> - memset(new_mt + old_rules, 0,
> - (rules - old_rules) * sizeof(*new_mt));
> + return err;
> }
>
> if (new_lt) {
> @@ -700,9 +749,6 @@ static int pipapo_resize(struct nft_pipapo_field *f, unsigned int old_rules, uns
> kvfree(old_lt);
> }
>
> - f->mt = new_mt;
> - kvfree(old_mt);
> -
> return 0;
> }
>
> @@ -1382,13 +1428,16 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old)
> src->groups * NFT_PIPAPO_BUCKETS(src->bb));
>
> if (src->rules > 0) {
> - dst->mt = kvmalloc_array(src->rules, sizeof(*src->mt), GFP_KERNEL);
> + dst->mt = kvmalloc_array(src->rules_alloc, sizeof(*src->mt), GFP_KERNEL);
> if (!dst->mt)
> goto out_mt;
>
> memcpy(dst->mt, src->mt, src->rules * sizeof(*src->mt));
> + dst->rules_alloc = src->rules_alloc;
> + dst->rules = src->rules;
These two, and setting rules_alloc below, shouldn't be needed, because we
already copy everything in the source field before the lookup table, above.
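That is, unless I'm missing something, the
memcpy(dst, src, offsetof(struct nft_pipapo_field, lt)) at the
beginning of the loop already covers @rules and @rules_alloc.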
> } else {
> dst->mt = NULL;
> + dst->rules_alloc = 0;
> }
>
> src++;
> @@ -2203,6 +2252,7 @@ static int nft_pipapo_init(const struct nft_set *set,
>
> f->bsize = 0;
> f->rules = 0;
> + f->rules_alloc = 0;
> f->lt = NULL;
> f->mt = NULL;
> }
> diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h
> index 8d9486ae0c01..bbcac2b38167 100644
> --- a/net/netfilter/nft_set_pipapo.h
> +++ b/net/netfilter/nft_set_pipapo.h
> @@ -106,6 +106,7 @@ union nft_pipapo_map_bucket {
> * struct nft_pipapo_field - Lookup, mapping tables and related data for a field
> * @rules: Number of inserted rules
> * @bsize: Size of each bucket in lookup table, in longs
> + * @rules_alloc: Number of allocated rules, always >= rules
> * @groups: Amount of bit groups
> * @bb: Number of bits grouped together in lookup table buckets
> * @lt: Lookup table: 'groups' rows of buckets
> @@ -114,6 +115,7 @@ union nft_pipapo_map_bucket {
> struct nft_pipapo_field {
> unsigned int rules;
> unsigned int bsize;
> + unsigned int rules_alloc;
> u8 groups;
> u8 bb;
> unsigned long *lt;
--
Stefano