From: Muchun Song <songmuchun@bytedance.com>
To: corbet@lwn.net, mike.kravetz@oracle.com, tglx@linutronix.de,
	mingo@redhat.com, bp@alien8.de, x86@kernel.org, hpa@zytor.com,
	dave.hansen@linux.intel.com, luto@kernel.org,
	peterz@infradead.org, viro@zeniv.linux.org.uk,
	akpm@linux-foundation.org, paulmck@kernel.org,
	mchehab+huawei@kernel.org, pawan.kumar.gupta@linux.intel.com,
	rdunlap@infradead.org, oneukum@suse.com,
	anshuman.khandual@arm.com, jroedel@suse.de,
	almasrymina@google.com, rientjes@google.com, willy@infradead.org,
	osalvador@suse.de, mhocko@suse.com, song.bao.hua@hisilicon.com,
	david@redhat.com
Cc: duanxiongchun@bytedance.com, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	linux-fsdevel@vger.kernel.org,
	Muchun Song <songmuchun@bytedance.com>
Subject: [PATCH v8 06/12] mm/hugetlb: Allocate the vmemmap pages associated with each HugeTLB page
Date: Thu, 10 Dec 2020 11:55:20 +0800
Message-ID: <20201210035526.38938-7-songmuchun@bytedance.com>
In-Reply-To: <20201210035526.38938-1-songmuchun@bytedance.com>

When we free a HugeTLB page to the buddy allocator, we have to allocate the
vmemmap pages associated with it first, so that the struct pages backing the
whole huge page are valid again. Do that in __free_hugepage() before the
page is released to the buddy allocator.
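
For reference, the ordering on the free path after this patch looks
roughly like the sketch below. This is a simplified illustration, not
the literal code: the existing per-subpage flag clearing is elided, and
the final buddy hand-off is assumed to match the current code.

	static void __free_hugepage(struct hstate *h, struct page *page)
	{
		/*
		 * Remap the vmemmap with freshly allocated pages before
		 * any struct page of the huge page is written to, and
		 * before the page is handed back to the buddy allocator.
		 */
		alloc_huge_page_vmemmap(h, page);

		/* ... existing loop clearing per-subpage flags ... */

		__free_pages(page, huge_page_order(h));
	}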

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 mm/hugetlb.c         |  2 ++
 mm/hugetlb_vmemmap.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++---
 mm/hugetlb_vmemmap.h |  5 +++
 3 files changed, 90 insertions(+), 5 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0ff9b90e524f..542e6cb81321 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1362,6 +1362,8 @@ static void __free_hugepage(struct hstate *h, struct page *page)
 {
 	int i;
 
+	alloc_huge_page_vmemmap(h, page);
+
 	for (i = 0; i < pages_per_huge_page(h); i++) {
 		page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
 				1 << PG_referenced | 1 << PG_dirty |
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index d080488cde16..4587a0062808 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -169,6 +169,7 @@
 #define pr_fmt(fmt)	"HugeTLB vmemmap: " fmt
 
 #include <linux/bootmem_info.h>
+#include <linux/delay.h>
 #include "hugetlb_vmemmap.h"
 
 /*
@@ -181,6 +182,8 @@
 #define RESERVE_VMEMMAP_NR		2U
 #define RESERVE_VMEMMAP_SIZE		(RESERVE_VMEMMAP_NR << PAGE_SHIFT)
 #define VMEMMAP_TAIL_PAGE_REUSE		-1
+#define GFP_VMEMMAP_PAGE		\
+	(GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_HIGH | __GFP_NOWARN)
 
 #ifndef VMEMMAP_HPAGE_SHIFT
 #define VMEMMAP_HPAGE_SHIFT		HPAGE_SHIFT
@@ -197,6 +200,10 @@
 	(__boundary - 1 < (end) - 1) ? __boundary : (end);		 \
 })
 
+typedef void (*vmemmap_remap_pte_func_t)(struct page *reuse, pte_t *pte,
+					 unsigned long start, unsigned long end,
+					 void *priv);
+
 static inline unsigned int vmemmap_pages_per_hpage(struct hstate *h)
 {
 	return free_vmemmap_pages_per_hpage(h) + RESERVE_VMEMMAP_NR;
@@ -236,9 +244,39 @@ static pmd_t *vmemmap_to_pmd(unsigned long addr)
 	return pmd;
 }
 
+static void vmemmap_restore_pte_range(struct page *reuse, pte_t *pte,
+				      unsigned long start, unsigned long end,
+				      void *priv)
+{
+	pgprot_t pgprot = PAGE_KERNEL;
+	void *from = page_to_virt(reuse);
+	unsigned long addr;
+	struct list_head *pages = priv;
+
+	for (addr = start; addr < end; addr += PAGE_SIZE) {
+		void *to;
+		struct page *page;
+
+		VM_BUG_ON(pte_none(*pte) || pte_page(*pte) != reuse);
+
+		page = list_first_entry(pages, struct page, lru);
+		list_del(&page->lru);
+		to = page_to_virt(page);
+		copy_page(to, from);
+
+		/*
+		 * Make sure that any data written via the @to mapping is
+		 * made visible to the underlying physical page.
+		 */
+		flush_kernel_vmap_range(to, PAGE_SIZE);
+
+		set_pte_at(&init_mm, addr, pte++, mk_pte(page, pgprot));
+	}
+}
+
 static void vmemmap_reuse_pte_range(struct page *reuse, pte_t *pte,
 				    unsigned long start, unsigned long end,
-				    struct list_head *vmemmap_pages)
+				    void *priv)
 {
 	/*
 	 * Make sure the tail pages are mapped read-only to catch
@@ -247,6 +285,7 @@ static void vmemmap_reuse_pte_range(struct page *reuse, pte_t *pte,
 	pgprot_t pgprot = PAGE_KERNEL_RO;
 	pte_t entry = mk_pte(reuse, pgprot);
 	unsigned long addr;
+	struct list_head *pages = priv;
 
 	for (addr = start; addr < end; addr += PAGE_SIZE, pte++) {
 		struct page *page;
@@ -254,14 +293,14 @@ static void vmemmap_reuse_pte_range(struct page *reuse, pte_t *pte,
 		VM_BUG_ON(pte_none(*pte));
 
 		page = pte_page(*pte);
-		list_add(&page->lru, vmemmap_pages);
+		list_add(&page->lru, pages);
 
 		set_pte_at(&init_mm, addr, pte, entry);
 	}
 }
 
 static void vmemmap_remap_range(unsigned long start, unsigned long end,
-				struct list_head *vmemmap_pages)
+				vmemmap_remap_pte_func_t func, void *priv)
 {
 	pmd_t *pmd;
 	unsigned long next, addr = start;
@@ -281,12 +320,52 @@ static void vmemmap_remap_range(unsigned long start, unsigned long end,
 			reuse = pte_page(pte[VMEMMAP_TAIL_PAGE_REUSE]);
 
 		next = vmemmap_hpage_addr_end(addr, end);
-		vmemmap_reuse_pte_range(reuse, pte, addr, next, vmemmap_pages);
+		func(reuse, pte, addr, next, priv);
 	} while (pmd++, addr = next, addr != end);
 
 	flush_tlb_kernel_range(start, end);
 }
 
+static inline void alloc_vmemmap_pages(struct hstate *h, struct list_head *list)
+{
+	unsigned int nr = free_vmemmap_pages_per_hpage(h);
+
+	while (nr--) {
+		struct page *page;
+
+retry:
+		page = alloc_page(GFP_VMEMMAP_PAGE);
+		if (unlikely(!page)) {
+			msleep(100);
+			/*
+			 * Retry indefinitely: the failure cannot be
+			 * handed back to the caller, and the HugeTLB
+			 * page cannot be freed to the buddy allocator
+			 * until its vmemmap pages have been allocated.
+			 */
+			goto retry;
+		}
+		list_add_tail(&page->lru, list);
+	}
+}
+
+void alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
+{
+	unsigned long start, end;
+	unsigned long vmemmap_addr = (unsigned long)head;
+	LIST_HEAD(vmemmap_pages);
+
+	if (!free_vmemmap_pages_per_hpage(h))
+		return;
+
+	alloc_vmemmap_pages(h, &vmemmap_pages);
+
+	start = vmemmap_addr + RESERVE_VMEMMAP_SIZE;
+	end = vmemmap_addr + vmemmap_pages_size_per_hpage(h);
+	vmemmap_remap_range(start, end, vmemmap_restore_pte_range,
+			    &vmemmap_pages);
+}
+
 /*
  * Free a vmemmap page. A vmemmap page can be allocated from the memblock
  * allocator or buddy allocator. If the PG_reserved flag is set, it means
@@ -322,7 +401,7 @@ void free_huge_page_vmemmap(struct hstate *h, struct page *head)
 
 	start = vmemmap_addr + RESERVE_VMEMMAP_SIZE;
 	end = vmemmap_addr + vmemmap_pages_size_per_hpage(h);
-	vmemmap_remap_range(start, end, &vmemmap_pages);
+	vmemmap_remap_range(start, end, vmemmap_reuse_pte_range, &vmemmap_pages);
 
 	free_vmemmap_page_list(&vmemmap_pages);
 }
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
index bf22cd003acb..8fd57c49e230 100644
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -11,6 +11,7 @@
 #include <linux/hugetlb.h>
 
 #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+void alloc_huge_page_vmemmap(struct hstate *h, struct page *head);
 void free_huge_page_vmemmap(struct hstate *h, struct page *head);
 
 /*
@@ -25,6 +26,10 @@ static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h)
 	return 0;
 }
 #else
+static inline void alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
+{
+}
+
 static inline void free_huge_page_vmemmap(struct hstate *h, struct page *head)
 {
 }
-- 
2.11.0

