All of lore.kernel.org
 help / color / mirror / Atom feed
From: Muchun Song <songmuchun@bytedance.com>
To: corbet@lwn.net, mike.kravetz@oracle.com, tglx@linutronix.de,
	mingo@redhat.com, bp@alien8.de, x86@kernel.org, hpa@zytor.com,
	dave.hansen@linux.intel.com, luto@kernel.org,
	peterz@infradead.org, viro@zeniv.linux.org.uk,
	akpm@linux-foundation.org, paulmck@kernel.org,
	pawan.kumar.gupta@linux.intel.com, rdunlap@infradead.org,
	oneukum@suse.com, anshuman.khandual@arm.com, jroedel@suse.de,
	almasrymina@google.com, rientjes@google.com, willy@infradead.org,
	osalvador@suse.de, mhocko@suse.com, song.bao.hua@hisilicon.com,
	david@redhat.com, naoya.horiguchi@nec.com,
	joao.m.martins@oracle.com
Cc: duanxiongchun@bytedance.com, fam.zheng@bytedance.com,
	linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-mm@kvack.org, linux-fsdevel@vger.kernel.org,
	Muchun Song <songmuchun@bytedance.com>,
	Miaohe Lin <linmiaohe@huawei.com>,
	Chen Huang <chenhuang5@huawei.com>,
	Bodeddula Balasubramaniam <bodeddub@amazon.com>
Subject: [PATCH v20 7/9] mm: hugetlb: add a kernel parameter hugetlb_free_vmemmap
Date: Thu, 15 Apr 2021 16:40:03 +0800	[thread overview]
Message-ID: <20210415084005.25049-8-songmuchun@bytedance.com> (raw)
In-Reply-To: <20210415084005.25049-1-songmuchun@bytedance.com>

Add a kernel parameter hugetlb_free_vmemmap to enable the feature of
freeing unused vmemmap pages associated with each hugetlb page on boot.

We disables PMD mapping of vmemmap pages for x86-64 arch when this
feature is enabled. Because vmemmap_remap_free() depends on vmemmap
being base page mapped.

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Barry Song <song.bao.hua@hisilicon.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Tested-by: Chen Huang <chenhuang5@huawei.com>
Tested-by: Bodeddula Balasubramaniam <bodeddub@amazon.com>
---
 Documentation/admin-guide/kernel-parameters.txt | 17 +++++++++++++++++
 Documentation/admin-guide/mm/hugetlbpage.rst    |  3 +++
 arch/x86/mm/init_64.c                           |  8 ++++++--
 include/linux/hugetlb.h                         | 19 +++++++++++++++++++
 mm/hugetlb_vmemmap.c                            | 24 ++++++++++++++++++++++++
 5 files changed, 69 insertions(+), 2 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 3bf052d14504..9e655f5206ac 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1611,6 +1611,23 @@
 			Documentation/admin-guide/mm/hugetlbpage.rst.
 			Format: size[KMG]
 
+	hugetlb_free_vmemmap=
+			[KNL] Reguires CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+			enabled.
+			Allows heavy hugetlb users to free up some more
+			memory (6 * PAGE_SIZE for each 2MB hugetlb page).
+			This feauture is not free though. Large page
+			tables are not used to back vmemmap pages which
+			can lead to a performance degradation for some
+			workloads. Also there will be memory allocation
+			required when hugetlb pages are freed from the
+			pool which can lead to corner cases under heavy
+			memory pressure.
+			Format: { on | off (default) }
+
+			on:  enable the feature
+			off: disable the feature
+
 	hung_task_panic=
 			[KNL] Should the hung task detector generate panics.
 			Format: 0 | 1
diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst
index 6988895d09a8..8abaeb144e44 100644
--- a/Documentation/admin-guide/mm/hugetlbpage.rst
+++ b/Documentation/admin-guide/mm/hugetlbpage.rst
@@ -153,6 +153,9 @@ default_hugepagesz
 
 	will all result in 256 2M huge pages being allocated.  Valid default
 	huge page size is architecture dependent.
+hugetlb_free_vmemmap
+	When CONFIG_HUGETLB_PAGE_FREE_VMEMMAP is set, this enables freeing
+	unused vmemmap pages associated with each HugeTLB page.
 
 When multiple huge page sizes are supported, ``/proc/sys/vm/nr_hugepages``
 indicates the current number of pre-allocated huge pages of the default size.
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 65ea58527176..9d9d18d0c2a1 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -34,6 +34,7 @@
 #include <linux/gfp.h>
 #include <linux/kcore.h>
 #include <linux/bootmem_info.h>
+#include <linux/hugetlb.h>
 
 #include <asm/processor.h>
 #include <asm/bios_ebda.h>
@@ -1609,7 +1610,8 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
 	VM_BUG_ON(!IS_ALIGNED(start, PAGE_SIZE));
 	VM_BUG_ON(!IS_ALIGNED(end, PAGE_SIZE));
 
-	if (end - start < PAGES_PER_SECTION * sizeof(struct page))
+	if ((is_hugetlb_free_vmemmap_enabled()  && !altmap) ||
+	    end - start < PAGES_PER_SECTION * sizeof(struct page))
 		err = vmemmap_populate_basepages(start, end, node, NULL);
 	else if (boot_cpu_has(X86_FEATURE_PSE))
 		err = vmemmap_populate_hugepages(start, end, node, altmap);
@@ -1637,6 +1639,8 @@ void register_page_bootmem_memmap(unsigned long section_nr,
 	pmd_t *pmd;
 	unsigned int nr_pmd_pages;
 	struct page *page;
+	bool base_mapping = !boot_cpu_has(X86_FEATURE_PSE) ||
+			    is_hugetlb_free_vmemmap_enabled();
 
 	for (; addr < end; addr = next) {
 		pte_t *pte = NULL;
@@ -1662,7 +1666,7 @@ void register_page_bootmem_memmap(unsigned long section_nr,
 		}
 		get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);
 
-		if (!boot_cpu_has(X86_FEATURE_PSE)) {
+		if (base_mapping) {
 			next = (addr + PAGE_SIZE) & PAGE_MASK;
 			pmd = pmd_offset(pud, addr);
 			if (pmd_none(*pmd))
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 6e970a7d3480..4015cedacf91 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -894,6 +894,20 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
 }
 #endif
 
+#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+extern bool hugetlb_free_vmemmap_enabled;
+
+static inline bool is_hugetlb_free_vmemmap_enabled(void)
+{
+	return hugetlb_free_vmemmap_enabled;
+}
+#else
+static inline bool is_hugetlb_free_vmemmap_enabled(void)
+{
+	return false;
+}
+#endif
+
 #else	/* CONFIG_HUGETLB_PAGE */
 struct hstate {};
 
@@ -1047,6 +1061,11 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr
 					pte_t *ptep, pte_t pte, unsigned long sz)
 {
 }
+
+static inline bool is_hugetlb_free_vmemmap_enabled(void)
+{
+	return false;
+}
 #endif	/* CONFIG_HUGETLB_PAGE */
 
 static inline spinlock_t *huge_pte_lock(struct hstate *h,
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index a897c7778246..3070e1465b1b 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -168,6 +168,8 @@
  * (last) level. So this type of HugeTLB page can be optimized only when its
  * size of the struct page structs is greater than 2 pages.
  */
+#define pr_fmt(fmt)	"HugeTLB: " fmt
+
 #include "hugetlb_vmemmap.h"
 
 /*
@@ -180,6 +182,28 @@
 #define RESERVE_VMEMMAP_NR		2U
 #define RESERVE_VMEMMAP_SIZE		(RESERVE_VMEMMAP_NR << PAGE_SHIFT)
 
+bool hugetlb_free_vmemmap_enabled;
+
+static int __init early_hugetlb_free_vmemmap_param(char *buf)
+{
+	/* We cannot optimize if a "struct page" crosses page boundaries. */
+	if ((!is_power_of_2(sizeof(struct page)))) {
+		pr_warn("cannot free vmemmap pages because \"struct page\" crosses page boundaries\n");
+		return 0;
+	}
+
+	if (!buf)
+		return -EINVAL;
+
+	if (!strcmp(buf, "on"))
+		hugetlb_free_vmemmap_enabled = true;
+	else if (strcmp(buf, "off"))
+		return -EINVAL;
+
+	return 0;
+}
+early_param("hugetlb_free_vmemmap", early_hugetlb_free_vmemmap_param);
+
 static inline unsigned long free_vmemmap_pages_size_per_hpage(struct hstate *h)
 {
 	return (unsigned long)free_vmemmap_pages_per_hpage(h) << PAGE_SHIFT;
-- 
2.11.0


  parent reply	other threads:[~2021-04-15  8:43 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-04-15  8:39 [PATCH v20 0/9] Free some vmemmap pages of HugeTLB page Muchun Song
2021-04-15  8:39 ` [PATCH v20 1/9] mm: memory_hotplug: factor out bootmem core functions to bootmem_info.c Muchun Song
2021-04-15  8:39 ` [PATCH v20 2/9] mm: hugetlb: introduce a new config HUGETLB_PAGE_FREE_VMEMMAP Muchun Song
2021-04-15  8:39 ` [PATCH v20 3/9] mm: hugetlb: gather discrete indexes of tail page Muchun Song
2021-04-16 18:54   ` Mike Kravetz
2021-04-15  8:40 ` [PATCH v20 4/9] mm: hugetlb: free the vmemmap pages associated with each HugeTLB page Muchun Song
2021-04-16 21:10   ` Mike Kravetz
2021-04-17  2:55     ` [External] " Muchun Song
2021-04-15  8:40 ` [PATCH v20 5/9] mm: hugetlb: defer freeing of HugeTLB pages Muchun Song
2021-04-16 23:55   ` Mike Kravetz
2021-04-17  4:13     ` [External] " Muchun Song
2021-04-15  8:40 ` [PATCH v20 6/9] mm: hugetlb: alloc the vmemmap pages associated with each HugeTLB page Muchun Song
2021-04-19 23:19   ` Mike Kravetz
2021-04-20  8:46     ` [External] " Muchun Song
2021-04-20 17:48       ` Mike Kravetz
2021-04-21  3:42         ` Muchun Song
2021-04-15  8:40 ` Muchun Song [this message]
2021-04-19 23:41   ` [PATCH v20 7/9] mm: hugetlb: add a kernel parameter hugetlb_free_vmemmap Mike Kravetz
2021-04-15  8:40 ` [PATCH v20 8/9] mm: memory_hotplug: disable memmap_on_memory when hugetlb_free_vmemmap enabled Muchun Song
2021-04-20 10:35   ` Oscar Salvador
2021-04-21  3:41     ` [External] " Muchun Song
2021-04-21  7:33       ` Oscar Salvador
2021-04-21  9:06         ` Muchun Song
2021-04-15  8:40 ` [PATCH v20 9/9] mm: hugetlb: introduce nr_free_vmemmap_pages in the struct hstate Muchun Song

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210415084005.25049-8-songmuchun@bytedance.com \
    --to=songmuchun@bytedance.com \
    --cc=akpm@linux-foundation.org \
    --cc=almasrymina@google.com \
    --cc=anshuman.khandual@arm.com \
    --cc=bodeddub@amazon.com \
    --cc=bp@alien8.de \
    --cc=chenhuang5@huawei.com \
    --cc=corbet@lwn.net \
    --cc=dave.hansen@linux.intel.com \
    --cc=david@redhat.com \
    --cc=duanxiongchun@bytedance.com \
    --cc=fam.zheng@bytedance.com \
    --cc=hpa@zytor.com \
    --cc=joao.m.martins@oracle.com \
    --cc=jroedel@suse.de \
    --cc=linmiaohe@huawei.com \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=luto@kernel.org \
    --cc=mhocko@suse.com \
    --cc=mike.kravetz@oracle.com \
    --cc=mingo@redhat.com \
    --cc=naoya.horiguchi@nec.com \
    --cc=oneukum@suse.com \
    --cc=osalvador@suse.de \
    --cc=paulmck@kernel.org \
    --cc=pawan.kumar.gupta@linux.intel.com \
    --cc=peterz@infradead.org \
    --cc=rdunlap@infradead.org \
    --cc=rientjes@google.com \
    --cc=song.bao.hua@hisilicon.com \
    --cc=tglx@linutronix.de \
    --cc=viro@zeniv.linux.org.uk \
    --cc=willy@infradead.org \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.