linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Mike Kravetz <mike.kravetz@oracle.com>
To: linux-mm@kvack.org, linux-kernel@vger.kernel.org
Cc: Michal Hocko <mhocko@suse.com>,
	Shakeel Butt <shakeelb@google.com>,
	Oscar Salvador <osalvador@suse.de>,
	David Hildenbrand <david@redhat.com>,
	Muchun Song <songmuchun@bytedance.com>,
	David Rientjes <rientjes@google.com>,
	Miaohe Lin <linmiaohe@huawei.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Matthew Wilcox <willy@infradead.org>,
	HORIGUCHI NAOYA <naoya.horiguchi@nec.com>,
	"Aneesh Kumar K . V" <aneesh.kumar@linux.ibm.com>,
	Waiman Long <longman@redhat.com>, Peter Xu <peterx@redhat.com>,
	Mina Almasry <almasrymina@google.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Mike Kravetz <mike.kravetz@oracle.com>
Subject: [RFC PATCH 7/8] hugetlb: add update_and_free_page_no_sleep for irq context
Date: Fri, 19 Mar 2021 15:42:08 -0700	[thread overview]
Message-ID: <20210319224209.150047-8-mike.kravetz@oracle.com> (raw)
In-Reply-To: <20210319224209.150047-1-mike.kravetz@oracle.com>

The locks acquired in free_huge_page are irq safe.  However, in certain
circumstances the routine update_and_free_page could sleep.  Since
free_huge_page can be called from any context, it can not sleep.

Use a workqueue to defer freeing of pages if the operation may sleep.  A
new routine update_and_free_page_no_sleep provides this functionality
and is only called from free_huge_page.

Note that any 'pages' sent to the workqueue for deferred freeing have
already been removed from the hugetlb subsystem.  What is actually
deferred is returning those base pages to the low level allocator.

Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
---
 include/linux/hugetlb.h | 12 +++++-
 mm/hugetlb.c            | 86 +++++++++++++++++++++++++++++++++++++++--
 2 files changed, 94 insertions(+), 4 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index f42d44050548..a81ca39c06be 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -666,9 +666,14 @@ static inline unsigned huge_page_shift(struct hstate *h)
 	return h->order + PAGE_SHIFT;
 }
 
+static inline bool order_is_gigantic(unsigned int order)
+{
+	return order >= MAX_ORDER;	/* same test hstate_is_gigantic() applies */
+}
+
 static inline bool hstate_is_gigantic(struct hstate *h)
 {
-	return huge_page_order(h) >= MAX_ORDER;
+	return order_is_gigantic(huge_page_order(h));
 }
 
 static inline unsigned int pages_per_huge_page(struct hstate *h)
@@ -942,6 +947,11 @@ static inline unsigned int huge_page_shift(struct hstate *h)
 	return PAGE_SHIFT;
 }
 
+static inline bool order_is_gigantic(unsigned int order)
+{
+	return false;	/* stub variant: no order counts as gigantic here */
+}
+
 static inline bool hstate_is_gigantic(struct hstate *h)
 {
 	return false;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 82614bbe7bb9..b8304b290a73 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1351,7 +1351,60 @@ static void remove_hugetlb_page(struct hstate *h, struct page *page,
 	h->nr_huge_pages_node[nid]--;
 }
 
-static void update_and_free_page(struct hstate *h, struct page *page)
+/*
+ * free_huge_page() may run in any context, yet giving a hugetlb page back
+ * can sleep.  When it might, the free is handed to a workqueue so that
+ * nothing sleeps in a context where sleeping is not allowed.
+ *
+ * page->mapping doubles as the llist_node linking the pages that wait to
+ * be freed on a lockless list.  free_hpage_workfn() detaches that list and
+ * frees the pages one-by-one through __free_huge_page().
+ *
+ * The page handed to __free_huge_page() is technically no longer a hugetlb
+ * page, so interfaces such as page_hstate() can not be used; the order is
+ * read back with compound_order() instead.
+ */
+static void __free_huge_page(struct page *page)
+{
+	unsigned int pg_order = compound_order(page);
+
+	if (!order_is_gigantic(pg_order)) {
+		__free_pages(page, pg_order);
+		return;
+	}
+	destroy_compound_gigantic_page(page, pg_order);
+	free_gigantic_page(page, pg_order);
+}
+
+static LLIST_HEAD(hpage_freelist);	/* pages queued for deferred freeing */
+
+/* Work function: detach hpage_freelist and free every page found on it. */
+static void free_hpage_workfn(struct work_struct *work)
+{
+	struct llist_node *node = llist_del_all(&hpage_freelist);
+
+	while (node) {
+		struct page *page = container_of((struct address_space **)node,
+						 struct page, mapping);
+		/* the link lives in page->mapping: read it, then clear it */
+		node = node->next;
+		page->mapping = NULL;
+		__free_huge_page(page);
+	}
+}
+static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
+
+/*
+ * Tell whether freeing @page could sleep.  Only @h is examined; @page is
+ * unused.  Freeing gigantic pages in CMA may sleep.
+ */
+static bool free_page_may_sleep(struct hstate *h, struct page *page)
+{
+	return hstate_is_gigantic(h);
+}
+
+static void __update_and_free_page(struct hstate *h, struct page *page,
+								bool can_sleep)
 {
 	int i;
 	struct page *subpage = page;
@@ -1366,6 +1419,21 @@ static void update_and_free_page(struct hstate *h, struct page *page)
 				1 << PG_active | 1 << PG_private |
 				1 << PG_writeback);
 	}
+
+	if (!can_sleep && free_page_may_sleep(h, page)) {
+		/*
+		 * Send page freeing to workqueue
+		 *
+		 * Only call schedule_work() if hpage_freelist is previously
+		 * empty. Otherwise, schedule_work() had been called but the
+		 * workfn hasn't retrieved the list yet.
+		 */
+		if (llist_add((struct llist_node *)&page->mapping,
+					&hpage_freelist))
+			schedule_work(&free_hpage_work);
+		return;
+	}
+
 	if (hstate_is_gigantic(h)) {
 		destroy_compound_gigantic_page(page, huge_page_order(h));
 		free_gigantic_page(page, huge_page_order(h));
@@ -1374,6 +1442,18 @@ static void update_and_free_page(struct hstate *h, struct page *page)
 	}
 }
 
+static void update_and_free_page_no_sleep(struct hstate *h, struct page *page)
+{
+	/* can not sleep: a free that may sleep is deferred to the workqueue */
+	__update_and_free_page(h, page, false);
+}
+
+static void update_and_free_page(struct hstate *h, struct page *page)
+{
+	/* caller may sleep: always free right here, never defer */
+	__update_and_free_page(h, page, true);
+}
+
 struct hstate *size_to_hstate(unsigned long size)
 {
 	struct hstate *h;
@@ -1436,12 +1516,12 @@ void free_huge_page(struct page *page)
 	if (HPageTemporary(page)) {
 		remove_hugetlb_page(h, page, false);
 		spin_unlock_irqrestore(&hugetlb_lock, flags);
-		update_and_free_page(h, page);
+		update_and_free_page_no_sleep(h, page);
 	} else if (h->surplus_huge_pages_node[nid]) {
 		/* remove the page from active list */
 		remove_hugetlb_page(h, page, true);
 		spin_unlock_irqrestore(&hugetlb_lock, flags);
-		update_and_free_page(h, page);
+		update_and_free_page_no_sleep(h, page);
 	} else {
 		arch_clear_hugepage_flags(page);
 		enqueue_huge_page(h, page);
-- 
2.30.2


  parent reply	other threads:[~2021-03-19 22:44 UTC|newest]

Thread overview: 44+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-03-19 22:42 [RFC PATCH 0/8] make hugetlb put_page safe for all calling contexts Mike Kravetz
2021-03-19 22:42 ` [RFC PATCH 1/8] hugetlb: add per-hstate mutex to synchronize user adjustments Mike Kravetz
2021-03-22 13:59   ` Michal Hocko
2021-03-22 16:57     ` Mike Kravetz
2021-03-23  7:48       ` Michal Hocko
2021-03-19 22:42 ` [RFC PATCH 2/8] hugetlb: recompute min_count when dropping hugetlb_lock Mike Kravetz
2021-03-22 14:07   ` Michal Hocko
2021-03-22 23:07     ` Mike Kravetz
2021-03-23  7:50       ` Michal Hocko
2021-03-23  8:01         ` Peter Zijlstra
2021-03-23  8:14           ` Michal Hocko
2021-03-23 23:18             ` Mike Kravetz
2021-03-24  8:36               ` Michal Hocko
2021-03-24 16:43                 ` Mike Kravetz
2021-03-19 22:42 ` [RFC PATCH 3/8] hugetlb: create remove_hugetlb_page() to separate functionality Mike Kravetz
2021-03-22 14:15   ` Michal Hocko
2021-03-22 17:01     ` Mike Kravetz
2021-03-19 22:42 ` [RFC PATCH 4/8] hugetlb: call update_and_free_page without hugetlb_lock Mike Kravetz
2021-03-22 14:19   ` Michal Hocko
2021-03-19 22:42 ` [RFC PATCH 5/8] hugetlb: change free_pool_huge_page to remove_pool_huge_page Mike Kravetz
2021-03-22 14:31   ` Michal Hocko
2021-03-22 23:28     ` Mike Kravetz
2021-03-23  7:57       ` Michal Hocko
2021-03-24  1:03         ` Mike Kravetz
2021-03-24  8:40           ` Michal Hocko
2021-03-24 16:38             ` Mike Kravetz
2021-03-24 16:50               ` Michal Hocko
2021-03-19 22:42 ` [RFC PATCH 6/8] hugetlb: make free_huge_page irq safe Mike Kravetz
2021-03-21 19:55   ` Mike Kravetz
2021-03-22 13:36   ` [hugetlb] cd190f60f9: BUG:sleeping_function_called_from_invalid_context_at_mm/hugetlb.c kernel test robot
2021-03-22 14:35   ` [RFC PATCH 6/8] hugetlb: make free_huge_page irq safe Michal Hocko
2021-03-19 22:42 ` Mike Kravetz [this message]
2021-03-22  8:41   ` [RFC PATCH 7/8] hugetlb: add update_and_free_page_no_sleep for irq context Peter Zijlstra
2021-03-22 17:42     ` Mike Kravetz
2021-03-22 18:10       ` Roman Gushchin
2021-03-23 18:51         ` Mike Kravetz
2021-03-23 19:07           ` Roman Gushchin
2021-03-24  8:43           ` Michal Hocko
2021-03-24 16:53             ` Mike Kravetz
2021-03-22 20:43       ` Peter Zijlstra
2021-03-22 14:42   ` Michal Hocko
2021-03-22 14:46     ` Michal Hocko
2021-03-19 22:42 ` [RFC PATCH 8/8] hugetlb: track hugetlb pages allocated via cma_alloc Mike Kravetz
     [not found] ` <20210320011857.2004-1-hdanton@sina.com>
2021-03-25  0:26   ` [RFC PATCH 7/8] hugetlb: add update_and_free_page_no_sleep for irq context Mike Kravetz

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210319224209.150047-8-mike.kravetz@oracle.com \
    --to=mike.kravetz@oracle.com \
    --cc=akpm@linux-foundation.org \
    --cc=almasrymina@google.com \
    --cc=aneesh.kumar@linux.ibm.com \
    --cc=david@redhat.com \
    --cc=linmiaohe@huawei.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=longman@redhat.com \
    --cc=mhocko@suse.com \
    --cc=naoya.horiguchi@nec.com \
    --cc=osalvador@suse.de \
    --cc=peterx@redhat.com \
    --cc=peterz@infradead.org \
    --cc=rientjes@google.com \
    --cc=shakeelb@google.com \
    --cc=songmuchun@bytedance.com \
    --cc=willy@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).