From: Hou Tao <houtao@huaweicloud.com>
To: YiFei Zhu <zhuyifei@google.com>, bpf@vger.kernel.org
Cc: Alexei Starovoitov <ast@kernel.org>,
	Daniel Borkmann <daniel@iogearbox.net>,
	Stanislav Fomichev <sdf@google.com>,
	Martin KaFai Lau <martin.lau@linux.dev>,
	Andrii Nakryiko <andrii@kernel.org>
Subject: Re: [PATCH bpf 2/2] bpf/memalloc: Schedule highprio wq for non-atomic alloc when atomic fails
Date: Fri, 21 Jul 2023 10:24:23 +0800	[thread overview]
Message-ID: <87874222-1d01-b08b-87e5-a94d90167e94@huaweicloud.com> (raw)
In-Reply-To: <3516fa9cc4bdbaeb90f208f5c970e622ba76be3e.1689885610.git.zhuyifei@google.com>

Hi,

On 7/21/2023 4:44 AM, YiFei Zhu wrote:
> Atomic refill can fail, such as when all percpu chunks are full,
> and when that happens there's no guarantee when more space will be
> available for atomic allocations.
>
> Instead of having the caller wait for memory to be available by
> retrying until the related BPF API no longer gives -ENOMEM, we can
> kick off a non-atomic GFP_KERNEL allocation with highprio workqueue.
> This should make it much less likely for those APIs to return
> -ENOMEM.
>
> Because alloc_bulk can now be called from the workqueue, the
> non-atomic path now also calls local_irq_save/restore to reduce
> the chance of races.
>
> Fixes: 7c8199e24fa0 ("bpf: Introduce any context BPF specific memory allocator.")
> Signed-off-by: YiFei Zhu <zhuyifei@google.com>
> ---
>  kernel/bpf/memalloc.c | 47 ++++++++++++++++++++++++++++++-------------
>  1 file changed, 33 insertions(+), 14 deletions(-)
>
> diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
> index 016249672b43..2915639a5e16 100644
> --- a/kernel/bpf/memalloc.c
> +++ b/kernel/bpf/memalloc.c
> @@ -84,14 +84,15 @@ struct bpf_mem_cache {
>  	struct llist_head free_llist;
>  	local_t active;
>  
> -	/* Operations on the free_list from unit_alloc/unit_free/bpf_mem_refill
> +	/* Operations on the free_list from unit_alloc/unit_free/bpf_mem_refill_*
>  	 * are sequenced by per-cpu 'active' counter. But unit_free() cannot
>  	 * fail. When 'active' is busy the unit_free() will add an object to
>  	 * free_llist_extra.
>  	 */
>  	struct llist_head free_llist_extra;
>  
> -	struct irq_work refill_work;
> +	struct irq_work refill_work_irq;
> +	struct work_struct refill_work_wq;
>  	struct obj_cgroup *objcg;
>  	int unit_size;
>  	/* count of objects in free_llist */
> @@ -153,7 +154,7 @@ static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
>  #endif
>  }
>  
> -/* Mostly runs from irq_work except __init phase. */
> +/* Mostly runs from irq_work except workqueue and __init phase. */
>  static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic)
>  {
>  	struct mem_cgroup *memcg = NULL, *old_memcg;
> @@ -188,10 +189,18 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic)
>  			 * want here.
>  			 */
>  			obj = __alloc(c, node, gfp);
> -			if (!obj)
> +			if (!obj) {
> +				/* We might have exhausted the percpu chunks, schedule
> +				 * non-atomic allocation so hopefully caller can get
> +				 * a free unit upon next invocation.
> +				 */
> +				if (atomic)
> +					queue_work_on(smp_processor_id(),
> +						      system_highpri_wq, &c->refill_work_wq);

I am not an MM expert, but according to the code in
pcpu_balance_workfn(), it already tries pcpu_create_chunk() when
pcpu_atomic_alloc_failed is true. So is the reason for introducing
refill_work_wq that pcpu_balance_workfn() is too slow to fulfill the
allocation requests from the bpf memory allocator?
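
To make sure I am reading mm/percpu.c correctly, the flow I have in
mind is roughly the following (paraphrased from my reading, not
verbatim kernel code):

    pcpu_alloc()                     /* atomic allocation fails */
        pcpu_atomic_alloc_failed = true;
        pcpu_schedule_balance_work();

    pcpu_balance_workfn()            /* later, in workqueue context */
        pcpu_balance_populated()     /* sees pcpu_atomic_alloc_failed,
                                        populates free pages and calls
                                        pcpu_create_chunk() if needed */
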
>  				break;
> +			}
>  		}
> -		if (IS_ENABLED(CONFIG_PREEMPT_RT))
> +		if (IS_ENABLED(CONFIG_PREEMPT_RT) || !atomic)
>  			/* In RT irq_work runs in per-cpu kthread, so disable
>  			 * interrupts to avoid preemption and interrupts and
>  			 * reduce the chance of bpf prog executing on this cpu
> @@ -208,7 +217,7 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic)
>  		__llist_add(obj, &c->free_llist);
>  		c->free_cnt++;
>  		local_dec(&c->active);
> -		if (IS_ENABLED(CONFIG_PREEMPT_RT))
> +		if (IS_ENABLED(CONFIG_PREEMPT_RT) || !atomic)
>  			local_irq_restore(flags);
>  	}
>  	set_active_memcg(old_memcg);
> @@ -314,9 +323,9 @@ static void free_bulk(struct bpf_mem_cache *c)
>  	do_call_rcu(c);
>  }
>  
> -static void bpf_mem_refill(struct irq_work *work)
> +static void bpf_mem_refill_irq(struct irq_work *work)
>  {
> -	struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work);
> +	struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work_irq);
>  	int cnt;
>  
>  	/* Racy access to free_cnt. It doesn't need to be 100% accurate */
> @@ -332,7 +341,14 @@ static void bpf_mem_refill(struct irq_work *work)
>  
>  static void notrace irq_work_raise(struct bpf_mem_cache *c)
>  {
> -	irq_work_queue(&c->refill_work);
> +	irq_work_queue(&c->refill_work_irq);
> +}
> +
> +static void bpf_mem_refill_wq(struct work_struct *work)
> +{
> +	struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work_wq);
> +
> +	alloc_bulk(c, c->batch, NUMA_NO_NODE, false);

Because the kworker may be interrupted by irq work, there could be
concurrent __llist_del_first() operations on free_by_rcu, so I think
it is not safe to call alloc_bulk() directly here. Maybe we could just
skip __llist_del_first() for the !atomic context, e.g. something like
the sketch below.
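
Untested, against my reading of the alloc_bulk() loop, i.e. replace the
unconditional __llist_del_first() at the top of the loop with:

    /* Only atomic callers (irq work and the __init prefill) are
     * serialized with the irq work on this CPU, so only they may pop
     * from free_by_rcu; the workqueue path falls through to __alloc()
     * instead.
     */
    obj = atomic ? __llist_del_first(&c->free_by_rcu) : NULL;

and keep the existing if (!obj) fallback unchanged.
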

>  }
>  
>  /* For typical bpf map case that uses bpf_mem_cache_alloc and single bucket
> @@ -352,7 +368,8 @@ static void notrace irq_work_raise(struct bpf_mem_cache *c)
>  
>  static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
>  {
> -	init_irq_work(&c->refill_work, bpf_mem_refill);
> +	init_irq_work(&c->refill_work_irq, bpf_mem_refill_irq);
> +	INIT_WORK(&c->refill_work_wq, bpf_mem_refill_wq);
>  	if (c->unit_size <= 256) {
>  		c->low_watermark = 32;
>  		c->high_watermark = 96;
> @@ -529,7 +546,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
>  		for_each_possible_cpu(cpu) {
>  			c = per_cpu_ptr(ma->cache, cpu);
>  			/*
> -			 * refill_work may be unfinished for PREEMPT_RT kernel
> +			 * refill_work_irq may be unfinished for PREEMPT_RT kernel
>  			 * in which irq work is invoked in a per-CPU RT thread.
>  			 * It is also possible for kernel with
>  			 * arch_irq_work_has_interrupt() being false and irq
> @@ -537,7 +554,8 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
>  			 * the completion of irq work to ease the handling of
>  			 * concurrency.
>  			 */
> -			irq_work_sync(&c->refill_work);
> +			irq_work_sync(&c->refill_work_irq);
> +			cancel_work_sync(&c->refill_work_wq);

cancel_work_sync() may be time-consuming. We may need to move it into
free_mem_alloc_deferred() to avoid blocking the destruction of the bpf
memory allocator, e.g. through a helper like the one sketched below.
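
Untested sketch, just to show the placement; the helper name is made up
and refill_work_wq is the field added by this patch:

  /* Called from free_mem_alloc_deferred() (i.e. from workqueue context)
   * so that the potentially slow cancel_work_sync() calls do not run in
   * bpf_mem_alloc_destroy() itself.
   */
  static void bpf_mem_alloc_cancel_refill_wq(struct bpf_mem_alloc *ma)
  {
          struct bpf_mem_cache *c;
          int cpu, i;

          if (ma->cache) {
                  for_each_possible_cpu(cpu) {
                          c = per_cpu_ptr(ma->cache, cpu);
                          cancel_work_sync(&c->refill_work_wq);
                  }
          }
          if (ma->caches) {
                  for_each_possible_cpu(cpu) {
                          for (i = 0; i < NUM_CACHES; i++) {
                                  c = &per_cpu_ptr(ma->caches, cpu)->cache[i];
                                  cancel_work_sync(&c->refill_work_wq);
                          }
                  }
          }
  }
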
>  			drain_mem_cache(c);
>  			rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
>  		}
> @@ -552,7 +570,8 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
>  			cc = per_cpu_ptr(ma->caches, cpu);
>  			for (i = 0; i < NUM_CACHES; i++) {
>  				c = &cc->cache[i];
> -				irq_work_sync(&c->refill_work);
> +				irq_work_sync(&c->refill_work_irq);
> +				cancel_work_sync(&c->refill_work_wq);
>  				drain_mem_cache(c);
>  				rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
>  			}
> @@ -580,7 +599,7 @@ static void notrace *unit_alloc(struct bpf_mem_cache *c)
>  	 *
>  	 * but prog_B could be a perf_event NMI prog.
>  	 * Use per-cpu 'active' counter to order free_list access between
> -	 * unit_alloc/unit_free/bpf_mem_refill.
> +	 * unit_alloc/unit_free/bpf_mem_refill_*.
>  	 */
>  	local_irq_save(flags);
>  	if (local_inc_return(&c->active) == 1) {


Thread overview: 11+ messages
2023-07-20 20:44 [PATCH bpf 0/2] bpf/memalloc: Allow non-atomic alloc_bulk YiFei Zhu
2023-07-20 20:44 ` [PATCH bpf 1/2] bpf/memalloc: Non-atomically allocate freelist during prefill YiFei Zhu
2023-07-21  1:44   ` Hou Tao
2023-07-21  2:31     ` YiFei Zhu
2023-07-26 11:38       ` Hou Tao
2023-07-26 18:44         ` YiFei Zhu
2023-07-27  1:21           ` Hou Tao
2023-07-20 20:44 ` [PATCH bpf 2/2] bpf/memalloc: Schedule highprio wq for non-atomic alloc when atomic fails YiFei Zhu
2023-07-21  2:24   ` Hou Tao [this message]
2023-07-21  3:02     ` YiFei Zhu
2023-07-26 11:37       ` Hou Tao
