From: Muchun Song <songmuchun@bytedance.com>
To: corbet@lwn.net, mike.kravetz@oracle.com, tglx@linutronix.de,
	mingo@redhat.com, bp@alien8.de, x86@kernel.org, hpa@zytor.com,
	dave.hansen@linux.intel.com, luto@kernel.org,
	peterz@infradead.org, viro@zeniv.linux.org.uk,
	akpm@linux-foundation.org, paulmck@kernel.org,
	pawan.kumar.gupta@linux.intel.com, rdunlap@infradead.org,
	oneukum@suse.com, anshuman.khandual@arm.com, jroedel@suse.de,
	almasrymina@google.com, rientjes@google.com, willy@infradead.org,
	osalvador@suse.de, mhocko@suse.com, song.bao.hua@hisilicon.com,
	david@redhat.com, naoya.horiguchi@nec.com,
	joao.m.martins@oracle.com
Cc: duanxiongchun@bytedance.com, fam.zheng@bytedance.com,
	zhengqi.arch@bytedance.com, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	linux-fsdevel@vger.kernel.org,
	Muchun Song <songmuchun@bytedance.com>
Subject: [PATCH v23 5/9] mm: hugetlb: defer freeing of HugeTLB pages
Date: Mon, 10 May 2021 11:00:23 +0800
Message-ID: <20210510030027.56044-6-songmuchun@bytedance.com>
In-Reply-To: <20210510030027.56044-1-songmuchun@bytedance.com>

In a subsequent patch, we will allocate vmemmap pages when freeing a
HugeTLB page. But update_and_free_page() can be called from any
context, so we cannot use GFP_KERNEL to allocate the vmemmap pages.
However, we can defer the actual freeing to a kworker so that we do
not have to use GFP_ATOMIC for the vmemmap allocation.

__update_and_free_page() is where the call to allocate vmemmap
pages will be inserted.
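
For context, here is a minimal, self-contained sketch of the deferral
pattern the patch follows (a lockless llist drained by a workqueue item).
The names (struct deferred_item, defer_free) are hypothetical and only
illustrate the scheme; the patch itself reuses page->mapping as the
llist_node rather than a dedicated structure:

#include <linux/llist.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct deferred_item {
	struct llist_node node;
	/* payload to be released in process context */
};

static LLIST_HEAD(deferred_list);

static void deferred_free_workfn(struct work_struct *work)
{
	struct llist_node *node = llist_del_all(&deferred_list);

	while (node) {
		struct deferred_item *item;

		item = llist_entry(node, struct deferred_item, node);
		node = node->next;
		/* Process context: GFP_KERNEL allocations are allowed here. */
		kfree(item);
		cond_resched();
	}
}
static DECLARE_WORK(deferred_free_work, deferred_free_workfn);

/* May be called from any context, including with IRQs disabled. */
static void defer_free(struct deferred_item *item)
{
	/*
	 * llist_add() returns true only when the list was previously
	 * empty, so the work is scheduled at most once per batch.
	 */
	if (llist_add(&item->node, &deferred_list))
		schedule_work(&deferred_free_work);
}

The "schedule only on the empty-to-non-empty transition" logic is the
same one update_and_free_page() relies on below.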

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
---
 mm/hugetlb.c         | 83 +++++++++++++++++++++++++++++++++++++++++++++++-----
 mm/hugetlb_vmemmap.c | 12 --------
 mm/hugetlb_vmemmap.h | 17 +++++++++++
 3 files changed, 93 insertions(+), 19 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index fd39fc94b23f..a3629c664f6a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1376,7 +1376,7 @@ static void remove_hugetlb_page(struct hstate *h, struct page *page,
 	h->nr_huge_pages_node[nid]--;
 }
 
-static void update_and_free_page(struct hstate *h, struct page *page)
+static void __update_and_free_page(struct hstate *h, struct page *page)
 {
 	int i;
 	struct page *subpage = page;
@@ -1399,12 +1399,79 @@ static void update_and_free_page(struct hstate *h, struct page *page)
 	}
 }
 
+/*
+ * update_and_free_page() can be called from any context, so we cannot
+ * use GFP_KERNEL to allocate vmemmap pages. However, we can defer the
+ * actual freeing to a workqueue so that we do not have to use GFP_ATOMIC
+ * to allocate the vmemmap pages.
+ *
+ * free_hpage_workfn() locklessly retrieves the linked list of pages to be
+ * freed and frees them one-by-one. As the page->mapping pointer is going
+ * to be cleared in free_hpage_workfn() anyway, it is reused as the llist_node
+ * structure of a lockless linked list of huge pages to be freed.
+ */
+static LLIST_HEAD(hpage_freelist);
+
+static void free_hpage_workfn(struct work_struct *work)
+{
+	struct llist_node *node;
+
+	node = llist_del_all(&hpage_freelist);
+
+	while (node) {
+		struct page *page;
+		struct hstate *h;
+
+		page = container_of((struct address_space **)node,
+				     struct page, mapping);
+		node = node->next;
+		page->mapping = NULL;
+		/*
+		 * The VM_BUG_ON_PAGE(!PageHuge(page), page) in page_hstate()
+		 * is going to trigger because a previous call to
+		 * remove_hugetlb_page() will set_compound_page_dtor(page,
+		 * NULL_COMPOUND_DTOR), so do not use page_hstate() directly.
+		 */
+		h = size_to_hstate(page_size(page));
+
+		__update_and_free_page(h, page);
+
+		cond_resched();
+	}
+}
+static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
+
+static inline void flush_free_hpage_work(struct hstate *h)
+{
+	if (free_vmemmap_pages_per_hpage(h))
+		flush_work(&free_hpage_work);
+}
+
+static void update_and_free_page(struct hstate *h, struct page *page,
+				 bool atomic)
+{
+	if (!free_vmemmap_pages_per_hpage(h) || !atomic) {
+		__update_and_free_page(h, page);
+		return;
+	}
+
+	/*
+	 * Defer freeing to avoid using GFP_ATOMIC to allocate vmemmap pages.
+	 *
+	 * Only call schedule_work() if hpage_freelist was previously
+	 * empty. Otherwise, schedule_work() has already been called but
+	 * the workfn has not retrieved the list yet.
+	 */
+	if (llist_add((struct llist_node *)&page->mapping, &hpage_freelist))
+		schedule_work(&free_hpage_work);
+}
+
 static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
 {
 	struct page *page, *t_page;
 
 	list_for_each_entry_safe(page, t_page, list, lru) {
-		update_and_free_page(h, page);
+		update_and_free_page(h, page, false);
 		cond_resched();
 	}
 }
@@ -1471,12 +1538,12 @@ void free_huge_page(struct page *page)
 	if (HPageTemporary(page)) {
 		remove_hugetlb_page(h, page, false);
 		spin_unlock_irqrestore(&hugetlb_lock, flags);
-		update_and_free_page(h, page);
+		update_and_free_page(h, page, true);
 	} else if (h->surplus_huge_pages_node[nid]) {
 		/* remove the page from active list */
 		remove_hugetlb_page(h, page, true);
 		spin_unlock_irqrestore(&hugetlb_lock, flags);
-		update_and_free_page(h, page);
+		update_and_free_page(h, page, true);
 	} else {
 		arch_clear_hugepage_flags(page);
 		enqueue_huge_page(h, page);
@@ -1798,7 +1865,7 @@ int dissolve_free_huge_page(struct page *page)
 		remove_hugetlb_page(h, page, false);
 		h->max_huge_pages--;
 		spin_unlock_irq(&hugetlb_lock);
-		update_and_free_page(h, head);
+		update_and_free_page(h, head, false);
 		return 0;
 	}
 out:
@@ -2343,14 +2410,14 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
 		 * Pages have been replaced, we can safely free the old one.
 		 */
 		spin_unlock_irq(&hugetlb_lock);
-		update_and_free_page(h, old_page);
+		update_and_free_page(h, old_page, false);
 	}
 
 	return ret;
 
 free_new:
 	spin_unlock_irq(&hugetlb_lock);
-	update_and_free_page(h, new_page);
+	update_and_free_page(h, new_page, false);
 
 	return ret;
 }
@@ -2764,6 +2831,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
 	 * pages in hstate via the proc/sysfs interfaces.
 	 */
 	mutex_lock(&h->resize_lock);
+	flush_free_hpage_work(h);
 	spin_lock_irq(&hugetlb_lock);
 
 	/*
@@ -2873,6 +2941,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
 	/* free the pages after dropping lock */
 	spin_unlock_irq(&hugetlb_lock);
 	update_and_free_pages_bulk(h, &page_list);
+	flush_free_hpage_work(h);
 	spin_lock_irq(&hugetlb_lock);
 
 	while (count < persistent_huge_pages(h)) {
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index e45a138a7f85..cb28c5b6c9ff 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -180,18 +180,6 @@
 #define RESERVE_VMEMMAP_NR		2U
 #define RESERVE_VMEMMAP_SIZE		(RESERVE_VMEMMAP_NR << PAGE_SHIFT)
 
-/*
- * How many vmemmap pages associated with a HugeTLB page that can be freed
- * to the buddy allocator.
- *
- * Todo: Returns zero for now, which means the feature is disabled. We will
- * enable it once all the infrastructure is there.
- */
-static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h)
-{
-	return 0;
-}
-
 static inline unsigned long free_vmemmap_pages_size_per_hpage(struct hstate *h)
 {
 	return (unsigned long)free_vmemmap_pages_per_hpage(h) << PAGE_SHIFT;
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
index 6923f03534d5..01f8637adbe0 100644
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -12,9 +12,26 @@
 
 #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
 void free_huge_page_vmemmap(struct hstate *h, struct page *head);
+
+/*
+ * How many vmemmap pages associated with a HugeTLB page can be freed
+ * to the buddy allocator.
+ *
+ * Todo: Returns zero for now, which means the feature is disabled. We will
+ * enable it once all the infrastructure is there.
+ */
+static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h)
+{
+	return 0;
+}
 #else
 static inline void free_huge_page_vmemmap(struct hstate *h, struct page *head)
 {
 }
+
+static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h)
+{
+	return 0;
+}
 #endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */
 #endif /* _LINUX_HUGETLB_VMEMMAP_H */
-- 
2.11.0


