From: Anton Protopopov <aspsk@isovalent.com>
To: Hou Tao <houtao@huaweicloud.com>
Cc: Alexei Starovoitov <ast@kernel.org>,
Daniel Borkmann <daniel@iogearbox.net>,
John Fastabend <john.fastabend@gmail.com>,
Andrii Nakryiko <andrii@kernel.org>,
Martin KaFai Lau <martin.lau@linux.dev>,
Song Liu <song@kernel.org>, Yonghong Song <yhs@fb.com>,
KP Singh <kpsingh@kernel.org>,
Stanislav Fomichev <sdf@google.com>, Hao Luo <haoluo@google.com>,
Jiri Olsa <jolsa@kernel.org>,
bpf@vger.kernel.org
Subject: Re: [v3 PATCH bpf-next 3/6] bpf: populate the per-cpu insertions/deletions counters for hashmaps
Date: Tue, 4 Jul 2023 14:34:27 +0000
Message-ID: <ZKQt84Qz0A0ZkgN1@zh-lab-node-5>
In-Reply-To: <05a3c521-3c6f-79c2-a5a8-1f8ab35eb759@huaweicloud.com>
On Tue, Jul 04, 2023 at 09:56:36PM +0800, Hou Tao wrote:
> Hi,
>
> On 6/30/2023 4:25 PM, Anton Protopopov wrote:
> > Initialize and utilize the per-cpu insertions/deletions counters for hash-based
> > maps. Non-trivial changes only apply to the preallocated maps, for which the
> > {inc,dec}_elem_count functions are not called, as there is no need to count
> > elements to sustain proper map operations.
> >
> > To increase/decrease percpu counters for preallocated maps we add raw calls to
> > the bpf_map_{inc,dec}_elem_count functions so that the impact is minimal. For
> > dynamically allocated maps we add corresponding calls to the existing
> > {inc,dec}_elem_count functions.
> >
> > Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
> > ---
> > kernel/bpf/hashtab.c | 23 ++++++++++++++++++++---
> > 1 file changed, 20 insertions(+), 3 deletions(-)
> >
> > diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> > index 56d3da7d0bc6..faaef4fd3df0 100644
> > --- a/kernel/bpf/hashtab.c
> > +++ b/kernel/bpf/hashtab.c
> > @@ -581,8 +581,14 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
> > }
> > }
> >
> > + err = bpf_map_init_elem_count(&htab->map);
> > + if (err)
> > + goto free_extra_elements;
> Considering that the per-cpu counter is not always needed, would it be a
> good idea to make elem_count optional by introducing a new map flag?
A per-map flag or a static key? To me it looked like an unconditional
`inc` of a per-cpu variable is cheaper than doing a check followed by an
`inc`, or an unconditional jump.
> > +
> > return &htab->map;
> >
> > +free_extra_elements:
> > + free_percpu(htab->extra_elems);
> > free_prealloc:
> > prealloc_destroy(htab);
> Need to check prealloc before calling prealloc_destroy(htab); otherwise,
> for a non-preallocated percpu htab, prealloc_destroy() will trigger an
> invalid memory dereference.
Thanks!
> > free_map_locked:
> > @@ -804,6 +810,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
> > if (l == tgt_l) {
> > hlist_nulls_del_rcu(&l->hash_node);
> > check_and_free_fields(htab, l);
> > + bpf_map_dec_elem_count(&htab->map);
> > break;
> > }
> >
> > @@ -900,6 +907,8 @@ static bool is_map_full(struct bpf_htab *htab)
> >
> > static void inc_elem_count(struct bpf_htab *htab)
> > {
> > + bpf_map_inc_elem_count(&htab->map);
> > +
> > if (htab->use_percpu_counter)
> > percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH);
> > else
> > @@ -908,6 +917,8 @@ static void inc_elem_count(struct bpf_htab *htab)
> >
> > static void dec_elem_count(struct bpf_htab *htab)
> > {
> > + bpf_map_dec_elem_count(&htab->map);
> > +
> > if (htab->use_percpu_counter)
> > percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH);
> > else
> > @@ -920,6 +931,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
> > htab_put_fd_value(htab, l);
> >
> > if (htab_is_prealloc(htab)) {
> > + bpf_map_dec_elem_count(&htab->map);
> > check_and_free_fields(htab, l);
> > __pcpu_freelist_push(&htab->freelist, &l->fnode);
> > } else {
> > @@ -1000,6 +1012,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
> > if (!l)
> > return ERR_PTR(-E2BIG);
> > l_new = container_of(l, struct htab_elem, fnode);
> > + bpf_map_inc_elem_count(&htab->map);
> > }
> > } else {
> > if (is_map_full(htab))
> > @@ -1224,7 +1237,8 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value
> > if (l_old) {
> > bpf_lru_node_set_ref(&l_new->lru_node);
> > hlist_nulls_del_rcu(&l_old->hash_node);
> > - }
> > + } else
> > + bpf_map_inc_elem_count(&htab->map);
> > ret = 0;
> >
> > err:
> > @@ -1351,6 +1365,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
> > pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size),
> > value, onallcpus);
> > hlist_nulls_add_head_rcu(&l_new->hash_node, head);
> > + bpf_map_inc_elem_count(&htab->map);
> > l_new = NULL;
> > }
> > ret = 0;
> > @@ -1437,9 +1452,10 @@ static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
> >
> > l = lookup_elem_raw(head, hash, key, key_size);
> >
> > - if (l)
> > + if (l) {
> > + bpf_map_dec_elem_count(&htab->map);
> > hlist_nulls_del_rcu(&l->hash_node);
> > - else
> > + } else
> > ret = -ENOENT;
> Also need to decrease elem_count in __htab_map_lookup_and_delete_batch()
> and __htab_map_lookup_and_delete_elem() when is_lru_map is true. Maybe
> for LRU maps we could simply do bpf_map_dec_elem_count() in
> htab_lru_push_free() and bpf_map_inc_elem_count() in prealloc_lru_pop().
Thanks. I will fix the logic and extend the selftest to test the batch ops as well.
> >
> > htab_unlock_bucket(htab, b, hash, flags);
> > @@ -1523,6 +1539,7 @@ static void htab_map_free(struct bpf_map *map)
> > prealloc_destroy(htab);
> > }
> >
> > + bpf_map_free_elem_count(map);
> > free_percpu(htab->extra_elems);
> > bpf_map_area_free(htab->buckets);
> > bpf_mem_alloc_destroy(&htab->pcpu_ma);
>