Re: [PATCH v20 5/9] mm: hugetlb: defer freeing of HugeTLB pages

From: Mike Kravetz <mike.kravetz@oracle.com>
To: Muchun Song <songmuchun@bytedance.com>,
	corbet@lwn.net, tglx@linutronix.de, mingo@redhat.com,
	bp@alien8.de, x86@kernel.org, hpa@zytor.com,
	dave.hansen@linux.intel.com, luto@kernel.org,
	peterz@infradead.org, viro@zeniv.linux.org.uk,
	akpm@linux-foundation.org, paulmck@kernel.org,
	pawan.kumar.gupta@linux.intel.com, rdunlap@infradead.org,
	oneukum@suse.com, anshuman.khandual@arm.com, jroedel@suse.de,
	almasrymina@google.com, rientjes@google.com, willy@infradead.org,
	osalvador@suse.de, mhocko@suse.com, song.bao.hua@hisilicon.com,
	david@redhat.com, naoya.horiguchi@nec.com,
	joao.m.martins@oracle.com
Cc: duanxiongchun@bytedance.com, fam.zheng@bytedance.com,
	linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-mm@kvack.org, linux-fsdevel@vger.kernel.org
Subject: Re: [PATCH v20 5/9] mm: hugetlb: defer freeing of HugeTLB pages
Date: Fri, 16 Apr 2021 16:55:56 -0700	[thread overview]
Message-ID: <33a4f2fe-72c2-a3a1-9205-461ddde9b162@oracle.com> (raw)
In-Reply-To: <20210415084005.25049-6-songmuchun@bytedance.com>

On 4/15/21 1:40 AM, Muchun Song wrote:
> In the subsequent patch, we should allocate the vmemmap pages when
> freeing a HugeTLB page. But update_and_free_page() can be called
> under any context, so we cannot use GFP_KERNEL to allocate vmemmap
> pages. However, we can defer the actual freeing in a kworker to
> prevent from using GFP_ATOMIC to allocate the vmemmap pages.

Thanks!  I knew we would need to introduce a kworker for this when I
removed the kworker previously used in free_huge_page.

> The __update_and_free_page() is where the call to allocate vmemmmap
> pages will be inserted.

This patch adds the functionality required for __update_and_free_page
to potentially sleep and fail.  More questions will come up in the
subsequent patch when code must deal with the failures.

> 
> Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> ---
>  mm/hugetlb.c         | 73 ++++++++++++++++++++++++++++++++++++++++++++++++----
>  mm/hugetlb_vmemmap.c | 12 ---------
>  mm/hugetlb_vmemmap.h | 17 ++++++++++++
>  3 files changed, 85 insertions(+), 17 deletions(-)
> 
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 923d05e2806b..eeb8f5480170 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1376,7 +1376,7 @@ static void remove_hugetlb_page(struct hstate *h, struct page *page,
>  	h->nr_huge_pages_node[nid]--;
>  }
>  
> -static void update_and_free_page(struct hstate *h, struct page *page)
> +static void __update_and_free_page(struct hstate *h, struct page *page)
>  {
>  	int i;
>  	struct page *subpage = page;
> @@ -1399,12 +1399,73 @@ static void update_and_free_page(struct hstate *h, struct page *page)
>  	}
>  }
>  
> +/*
> + * As update_and_free_page() can be called under any context, so we cannot
> + * use GFP_KERNEL to allocate vmemmap pages. However, we can defer the
> + * actual freeing in a workqueue to prevent from using GFP_ATOMIC to allocate
> + * the vmemmap pages.
> + *
> + * free_hpage_workfn() locklessly retrieves the linked list of pages to be
> + * freed and frees them one-by-one. As the page->mapping pointer is going
> + * to be cleared in free_hpage_workfn() anyway, it is reused as the llist_node
> + * structure of a lockless linked list of huge pages to be freed.
> + */
> +static LLIST_HEAD(hpage_freelist);
> +
> +static void free_hpage_workfn(struct work_struct *work)
> +{
> +	struct llist_node *node;
> +
> +	node = llist_del_all(&hpage_freelist);
> +
> +	while (node) {
> +		struct page *page;
> +		struct hstate *h;
> +
> +		page = container_of((struct address_space **)node,
> +				     struct page, mapping);
> +		node = node->next;
> +		page->mapping = NULL;
> +		h = page_hstate(page);

The VM_BUG_ON_PAGE(!PageHuge(page), page) in page_hstate is going to
trigger because a previous call to remove_hugetlb_page() will
set_compound_page_dtor(page, NULL_COMPOUND_DTOR)

Note how h(hstate) is grabbed before calling update_and_free_page in
existing code.

We could potentially drop the !PageHuge(page) in page_hstate.  Or,
perhaps just use 'size_to_hstate(page_size(page))' in free_hpage_workfn.
-- 
Mike Kravetz

> +
> +		__update_and_free_page(h, page);
> +
> +		cond_resched();
> +	}
> +}
> +static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
> +
> +static inline void flush_free_hpage_work(struct hstate *h)
> +{
> +	if (free_vmemmap_pages_per_hpage(h))
> +		flush_work(&free_hpage_work);
> +}
> +
> +static void update_and_free_page(struct hstate *h, struct page *page,
> +				 bool atomic)
> +{
> +	if (!free_vmemmap_pages_per_hpage(h) || !atomic) {
> +		__update_and_free_page(h, page);
> +		return;
> +	}
> +
> +	/*
> +	 * Defer freeing to avoid using GFP_ATOMIC to allocate vmemmap pages.
> +	 *
> +	 * Only call schedule_work() if hpage_freelist is previously
> +	 * empty. Otherwise, schedule_work() had been called but the workfn
> +	 * hasn't retrieved the list yet.
> +	 */
> +	if (llist_add((struct llist_node *)&page->mapping, &hpage_freelist))
> +		schedule_work(&free_hpage_work);
> +}
> +
>  static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
>  {
>  	struct page *page, *t_page;
>  
>  	list_for_each_entry_safe(page, t_page, list, lru) {
> -		update_and_free_page(h, page);
> +		update_and_free_page(h, page, false);
>  		cond_resched();
>  	}
>  }
> @@ -1471,12 +1532,12 @@ void free_huge_page(struct page *page)
>  	if (HPageTemporary(page)) {
>  		remove_hugetlb_page(h, page, false);
>  		spin_unlock_irqrestore(&hugetlb_lock, flags);
> -		update_and_free_page(h, page);
> +		update_and_free_page(h, page, true);
>  	} else if (h->surplus_huge_pages_node[nid]) {
>  		/* remove the page from active list */
>  		remove_hugetlb_page(h, page, true);
>  		spin_unlock_irqrestore(&hugetlb_lock, flags);
> -		update_and_free_page(h, page);
> +		update_and_free_page(h, page, true);
>  	} else {
>  		arch_clear_hugepage_flags(page);
>  		enqueue_huge_page(h, page);
> @@ -1785,7 +1846,7 @@ int dissolve_free_huge_page(struct page *page)
>  		remove_hugetlb_page(h, page, false);
>  		h->max_huge_pages--;
>  		spin_unlock_irq(&hugetlb_lock);
> -		update_and_free_page(h, head);
> +		update_and_free_page(h, head, false);
>  		return 0;
>  	}
>  out:
> @@ -2627,6 +2688,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
>  	 * pages in hstate via the proc/sysfs interfaces.
>  	 */
>  	mutex_lock(&h->resize_lock);
> +	flush_free_hpage_work(h);
>  	spin_lock_irq(&hugetlb_lock);
>  
>  	/*
> @@ -2736,6 +2798,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
>  	/* free the pages after dropping lock */
>  	spin_unlock_irq(&hugetlb_lock);
>  	update_and_free_pages_bulk(h, &page_list);
> +	flush_free_hpage_work(h);
>  	spin_lock_irq(&hugetlb_lock);
>  
>  	while (count < persistent_huge_pages(h)) {
> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> index e45a138a7f85..cb28c5b6c9ff 100644
> --- a/mm/hugetlb_vmemmap.c
> +++ b/mm/hugetlb_vmemmap.c
> @@ -180,18 +180,6 @@
>  #define RESERVE_VMEMMAP_NR		2U
>  #define RESERVE_VMEMMAP_SIZE		(RESERVE_VMEMMAP_NR << PAGE_SHIFT)
>  
> -/*
> - * How many vmemmap pages associated with a HugeTLB page that can be freed
> - * to the buddy allocator.
> - *
> - * Todo: Returns zero for now, which means the feature is disabled. We will
> - * enable it once all the infrastructure is there.
> - */
> -static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h)
> -{
> -	return 0;
> -}
> -
>  static inline unsigned long free_vmemmap_pages_size_per_hpage(struct hstate *h)
>  {
>  	return (unsigned long)free_vmemmap_pages_per_hpage(h) << PAGE_SHIFT;
> diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
> index 6923f03534d5..01f8637adbe0 100644
> --- a/mm/hugetlb_vmemmap.h
> +++ b/mm/hugetlb_vmemmap.h
> @@ -12,9 +12,26 @@
>  
>  #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
>  void free_huge_page_vmemmap(struct hstate *h, struct page *head);
> +
> +/*
> + * How many vmemmap pages associated with a HugeTLB page that can be freed
> + * to the buddy allocator.
> + *
> + * Todo: Returns zero for now, which means the feature is disabled. We will
> + * enable it once all the infrastructure is there.
> + */
> +static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h)
> +{
> +	return 0;
> +}
>  #else
>  static inline void free_huge_page_vmemmap(struct hstate *h, struct page *head)
>  {
>  }
> +
> +static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h)
> +{
> +	return 0;
> +}
>  #endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */
>  #endif /* _LINUX_HUGETLB_VMEMMAP_H */
>