From: Jiaqi Yan <jiaqiyan@google.com>
To: mike.kravetz@oracle.com, peterx@redhat.com, naoya.horiguchi@nec.com
Cc: songmuchun@bytedance.com, duenwen@google.com,
	axelrasmussen@google.com, jthoughton@google.com,
	rientjes@google.com, linmiaohe@huawei.com, shy828301@gmail.com,
	baolin.wang@linux.alibaba.com, wangkefeng.wang@huawei.com,
	akpm@linux-foundation.org, linux-mm@kvack.org,
	linux-kernel@vger.kernel.org, Jiaqi Yan <jiaqiyan@google.com>
Subject: [RFC PATCH v1 1/7] hugetlb: add HugeTLB splitting functionality
Date: Fri, 28 Apr 2023 00:41:33 +0000
Message-ID: <20230428004139.2899856-2-jiaqiyan@google.com>
In-Reply-To: <20230428004139.2899856-1-jiaqiyan@google.com>

The new function, hugetlb_split_to_shift, optimally splits the page
table so that a particular address is mapped at a particular
granularity. This is useful for punching a hole in the mapping and for
mapping (and unmapping) small sections of a HugeTLB page.
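
For illustration only, a hypothetical caller might look like the sketch
below. The helper used to locate the leaf PTE (hugetlb_full_walk(), assumed
from the HGM series this is based on) and the surrounding names are
assumptions for the example, not something this patch introduces; the caller
must already hold the i_mmap write lock (see the locking note on
hugetlb_split_to_shift below).

	/*
	 * Sketch, not part of this patch: remap @addr at PAGE_SIZE
	 * granularity within its HugeTLB VMA. Assumes the i_mmap write
	 * lock is held and that hugetlb_full_walk() finds the existing
	 * leaf HugeTLB PTE mapping @addr.
	 */
	static int example_split_to_page(struct vm_area_struct *vma,
					 unsigned long addr)
	{
		struct hugetlb_pte hpte;
		int ret;

		if (hugetlb_full_walk(&hpte, vma, addr))
			return -ENOENT;	/* nothing mapped at @addr */

		ret = hugetlb_split_to_shift(vma->vm_mm, vma, &hpte,
					     addr, PAGE_SHIFT);
		if (ret)
			pr_warn("hugetlb split failed: %d\n", ret);
		return ret;
	}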

Splitting is supported only for present leaf HugeTLB PTEs. None PTEs and
the other non-present HugeTLB PTE types are not supported, as they are
better left untouched (a condensed sketch of how each case is reported
follows this list):
* None PTEs
* Migration PTEs
* HWPOISON PTEs
* UFFD writeprotect PTEs
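
For reference, the implementation below reports these unsupported cases
with distinct error codes, roughly as in the condensed sketch here (the
body of hugetlb_split_to_shift remains the authoritative logic):

	old_entry = huge_ptep_get_and_clear(mm, start, hpte->ptep);
	if (huge_pte_none_mostly(old_entry))	/* none, or UFFD WP marker */
		ret = -EAGAIN;
	else if (is_hugetlb_entry_migration(old_entry))
		ret = -EBUSY;
	else if (is_hugetlb_entry_hwpoisoned(old_entry))
		ret = -EHWPOISON;
	/* in each case old_entry is put back and the split is skipped */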

Signed-off-by: Jiaqi Yan <jiaqiyan@google.com>
---
 include/linux/hugetlb.h |   9 ++
 mm/hugetlb.c            | 249 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 258 insertions(+)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 742e7f2cb170..d44bf6a794e5 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -1266,6 +1266,9 @@ int hugetlb_alloc_largest_pte(struct hugetlb_pte *hpte, struct mm_struct *mm,
 			      unsigned long end);
 int hugetlb_collapse(struct mm_struct *mm, unsigned long start,
 		     unsigned long end);
+int hugetlb_split_to_shift(struct mm_struct *mm, struct vm_area_struct *vma,
+			   struct hugetlb_pte *hpte, unsigned long addr,
+			   unsigned int desired_shift);
 #else
 static inline bool hugetlb_hgm_enabled(struct vm_area_struct *vma)
 {
@@ -1292,6 +1295,12 @@ int hugetlb_collapse(struct mm_struct *mm, unsigned long start,
 {
 	return -EINVAL;
 }
+static inline int hugetlb_split_to_shift(struct mm_struct *mm, struct vm_area_struct *vma,
+			   struct hugetlb_pte *hpte, unsigned long addr,
+			   unsigned int desired_shift)
+{
+	return -EINVAL;
+}
 #endif
 
 static inline
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index df4c17164abb..d3f3f1c2d293 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -8203,6 +8203,255 @@ int hugetlb_collapse(struct mm_struct *mm, unsigned long start,
 	return ret;
 }
 
+/*
+ * Find the largest usable shift at @curr, at most @desired_shift if it covers @desired_addr.
+ */
+static int hugetlb_find_shift(struct vm_area_struct *vma,
+			      unsigned long curr,
+			      unsigned long end,
+			      unsigned long desired_addr,
+			      unsigned long desired_shift,
+			      unsigned int *shift_found)
+{
+	struct hstate *h = hstate_vma(vma);
+	struct hstate *tmp_h;
+	unsigned int shift;
+	unsigned long sz;
+
+	for_each_hgm_shift(h, tmp_h, shift) {
+		sz = 1UL << shift;
+		/* curr is not aligned to sz, or sz would overrun end. */
+		if (!IS_ALIGNED(curr, sz) || curr + sz > end)
+			continue;
+		/*
+		 * When desired_addr is in [curr, curr + sz),
+		 * we want shift to be as close to desired_shift
+		 * as possible.
+		 */
+		if (curr <= desired_addr && desired_addr < curr + sz
+		    && shift > desired_shift)
+			continue;
+
+		*shift_found = shift;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * Given a present leaf HugeTLB PTE that maps @addr, split it so that
+ * the PTE mapping @addr is at @desired_shift.
+ */
+static int hugetlb_split_to_shift_present_leaf(struct mm_struct *mm,
+					       struct vm_area_struct *vma,
+					       pte_t old_entry,
+					       unsigned long start,
+					       unsigned long end,
+					       unsigned long addr,
+					       unsigned int orig_shift,
+					       unsigned int desired_shift)
+{
+	bool old_entry_dirty;
+	bool old_entry_write;
+	bool old_entry_uffd_wp;
+	pte_t new_entry;
+	unsigned long curr;
+	unsigned long sz;
+	unsigned int shift;
+	int ret = 0;
+	struct hugetlb_pte new_hpte;
+	struct page *subpage = NULL;
+	struct folio *folio = page_folio(compound_head(pte_page(old_entry)));
+	struct hstate *h = hstate_vma(vma);
+	spinlock_t *ptl;
+
+	/* Unmap original unsplit hugepage per huge_ptep_get_and_clear. */
+	hugetlb_remove_rmap(folio_page(folio, 0), orig_shift, h, vma);
+
+	old_entry_dirty = huge_pte_dirty(old_entry);
+	old_entry_write = huge_pte_write(old_entry);
+	old_entry_uffd_wp = huge_pte_uffd_wp(old_entry);
+
+	for (curr = start; curr < end; curr += sz) {
+		ret = hugetlb_find_shift(vma, curr, end, addr,
+					 desired_shift, &shift);
+
+		/* Unable to find a shift that works */
+		if (WARN_ON(ret))
+			goto abort;
+
+		/*
+		 * Do an HGM full walk, allocating new page table structures
+		 * so we can walk down to the level we want.
+		 */
+		sz = 1UL << shift;
+		ret = hugetlb_full_walk_alloc(&new_hpte, vma, curr, sz);
+		if (WARN_ON(ret))
+			goto abort;
+
+		BUG_ON(hugetlb_pte_size(&new_hpte) > sz);
+		/*
+		 * When hugetlb_pte_size(new_hpte) is less than sz, increment
+		 * curr by hugetlb_pte_size(new_hpte) to avoid skipping over
+		 * some PTEs.
+		 */
+		if (hugetlb_pte_size(&new_hpte) < sz)
+			sz = hugetlb_pte_size(&new_hpte);
+
+		subpage = hugetlb_find_subpage(h, folio, curr);
+		/*
+		 * Create a new (finer granularity) PT entry and
+		 * populate it with old_entry's bits.
+		 */
+		new_entry = make_huge_pte(vma, subpage,
+					  huge_pte_write(old_entry), shift);
+		if (old_entry_dirty)
+			new_entry = huge_pte_mkdirty(new_entry);
+		if (old_entry_write)
+			new_entry = huge_pte_mkwrite(new_entry);
+		if (old_entry_uffd_wp)
+			new_entry = huge_pte_mkuffd_wp(new_entry);
+		ptl = hugetlb_pte_lock(&new_hpte);
+		set_huge_pte_at(mm, curr, new_hpte.ptep, new_entry);
+		spin_unlock(ptl);
+		/* Increment ref/mapcount per set_huge_pte_at(). */
+		hugetlb_add_file_rmap(subpage, shift, h, vma);
+		folio_get(folio);
+	}
+	/*
+	 * This refcount decrement pairs with the huge_ptep_get_and_clear
+	 * on the hpte BEFORE splitting, for the same reason as the
+	 * hugetlb_remove_rmap() above, but it could not be done then.
+	 * Now that splitting has succeeded, drop the refcount.
+	 */
+	folio_put(folio);
+	return 0;
+abort:
+	/*
+	 * Restore the mapcount on the unsplit hugepage. No need to restore
+	 * the refcount, as we do not folio_put() until splitting succeeds.
+	 */
+	hugetlb_add_file_rmap(folio_page(folio, 0), orig_shift, h, vma);
+	return ret;
+}
+
+/*
+ * Given a particular address @addr, split the HugeTLB PTE that currently
+ * maps it so that the PTE mapping @addr has shift @desired_shift. The split
+ * is optimal: PTEs that do not cover @addr remain as large as possible.
+ *
+ * Example: given a HugeTLB 1G page mapped from VA 0 to 1G, if the caller
+ * invokes this API with addr=0 and desired_shift=PAGE_SHIFT, the page table
+ * is changed as follows:
+ * 1. The original PUD is first split into 512 2M PMDs.
+ * 2. The 1st PMD is further split into 512 4K PTEs.
+ *
+ * Callers must hold the write lock on vma->vm_file's mapping.
+ */
+int hugetlb_split_to_shift(struct mm_struct *mm, struct vm_area_struct *vma,
+			   struct hugetlb_pte *hpte, unsigned long addr,
+			   unsigned int desired_shift)
+{
+	unsigned long start, end;
+	unsigned long desired_sz = 1UL << desired_shift;
+	int ret;
+	pte_t old_entry;
+	struct mmu_gather tlb;
+	struct mmu_notifier_range range;
+	spinlock_t *ptl;
+
+	BUG_ON(!hpte->ptep);
+
+	start = addr & hugetlb_pte_mask(hpte);
+	end = start + hugetlb_pte_size(hpte);
+	BUG_ON(!IS_ALIGNED(start, desired_sz));
+	BUG_ON(!IS_ALIGNED(end, desired_sz));
+	BUG_ON(addr < start || end <= addr);
+
+	if (hpte->shift == desired_shift)
+		return 0;
+
+	/*
+	 * A HugeTLB PTE that is not none-mostly must be a present leaf-level
+	 * PTE, i.e. one that has not been split before.
+	 */
+	ptl = hugetlb_pte_lock(hpte);
+	BUG_ON(!huge_pte_none_mostly(huge_ptep_get(hpte->ptep)) &&
+	       !hugetlb_pte_present_leaf(hpte, huge_ptep_get(hpte->ptep)));
+
+	i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, start, end);
+	mmu_notifier_invalidate_range_start(&range);
+
+	/*
+	 * Get and clear the PTE. We will allocate new page table structures
+	 * when walking the page table.
+	 */
+	old_entry = huge_ptep_get_and_clear(mm, start, hpte->ptep);
+	spin_unlock(ptl);
+
+	/*
+	 * From now on, any failure exit needs to go through "skip" to
+	 * put old_entry back. If any form of hugetlb_split_to_shift_xxx
+	 * is invoked, failures also need to go through "abort" to tear
+	 * down the PTEs allocated before splitting failed.
+	 */
+
+	if (unlikely(huge_pte_none_mostly(old_entry))) {
+		ret = -EAGAIN;
+		goto skip;
+	}
+	if (unlikely(!pte_present(old_entry))) {
+		if (is_hugetlb_entry_migration(old_entry))
+			ret = -EBUSY;
+		else if (is_hugetlb_entry_hwpoisoned(old_entry))
+			ret = -EHWPOISON;
+		else {
+			WARN_ONCE(1, "Unexpected case of non-present HugeTLB PTE\n");
+			ret = -EINVAL;
+		}
+		goto skip;
+	}
+
+	if (!hugetlb_pte_present_leaf(hpte, old_entry)) {
+		WARN_ONCE(1, "HugeTLB present PTE is not leaf\n");
+		ret = -EAGAIN;
+		goto skip;
+	}
+	/* From now on old_entry is present leaf entry. */
+	ret = hugetlb_split_to_shift_present_leaf(mm, vma, old_entry,
+						  start, end, addr,
+						  hpte->shift,
+						  desired_shift);
+	if (ret)
+		goto abort;
+
+	/* Splitting done, new page table entries successfully set up. */
+	mmu_notifier_invalidate_range_end(&range);
+	return 0;
+abort:
+	/* Splitting failed; restore the original page table state. */
+	tlb_gather_mmu(&tlb, mm);
+	/* Decrement mapcount for all the split PTEs. */
+	__unmap_hugepage_range(&tlb, vma, start, end, NULL, ZAP_FLAG_DROP_MARKER);
+	/*
+	 * Free any newly allocated page table entries.
+	 * Ok if no new entries allocated at all.
+	 */
+	hugetlb_free_pgd_range(&tlb, start, end, start, end);
+	/* Decrement refcount for all the split PTEs. */
+	tlb_finish_mmu(&tlb);
+skip:
+	/* Restore the old entry. */
+	ptl = hugetlb_pte_lock(hpte);
+	set_huge_pte_at(mm, start, hpte->ptep, old_entry);
+	spin_unlock(ptl);
+	mmu_notifier_invalidate_range_end(&range);
+	return ret;
+}
+
 #endif /* CONFIG_HUGETLB_HIGH_GRANULARITY_MAPPING */
 
 /*
-- 
2.40.1.495.gc816e09b53d-goog

