From: Mike Kravetz <mike.kravetz@oracle.com>
To: linux-mm@kvack.org, linux-kernel@vger.kernel.org
Cc: Michal Hocko <mhocko@suse.com>, Peter Xu <peterx@redhat.com>,
	Naoya Horiguchi <naoya.horiguchi@linux.dev>,
	David Hildenbrand <david@redhat.com>,
	"Aneesh Kumar K . V" <aneesh.kumar@linux.vnet.ibm.com>,
	Andrea Arcangeli <aarcange@redhat.com>,
	"Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>,
	Davidlohr Bueso <dave@stgolabs.net>,
	Prakash Sangappa <prakash.sangappa@oracle.com>,
	James Houghton <jthoughton@google.com>,
	Mina Almasry <almasrymina@google.com>,
	Ray Fucillo <Ray.Fucillo@intersystems.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Mike Kravetz <mike.kravetz@oracle.com>
Subject: [RFC PATCH v2 5/6] hugetlbfs: Do not use pmd locks if hugetlb sharing possible
Date: Wed, 20 Apr 2022 15:37:52 -0700
Message-ID: <20220420223753.386645-6-mike.kravetz@oracle.com>
In-Reply-To: <20220420223753.386645-1-mike.kravetz@oracle.com>

In hugetlbfs, split pmd page table locks are generally used if
huge_page_size is equal to PMD_SIZE.  These locks are located in the
struct page of the corresponding pmd page.  A pmd pointer is used to
locate the page.
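
(For reference, and not part of this patch: in a 5.18-era tree,
pmd_lockptr() locates that lock roughly as follows; the exact layout
depends on USE_SPLIT_PMD_PTLOCKS and ALLOC_SPLIT_PTLOCKS.)

	/* include/linux/mm.h, simplified */
	static inline struct page *pmd_to_page(pmd_t *pmd)
	{
		unsigned long mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);

		/* Mask the pmd pointer down to the start of the pmd table page */
		return virt_to_page((void *)((unsigned long) pmd & mask));
	}

	static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
	{
		/* The split lock lives in (or hangs off) that struct page */
		return ptlock_ptr(pmd_to_page(pmd));
	}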

In the case of pmd sharing, pmd pointers can become invalid unless one
holds the page table lock.  This creates a chicken/egg problem as we
need to use the pointer to locate the lock.  To address this issue, use
the page_table_lock in the mm_struct if the pmd pointer is associated
with a sharable vma.

The routines dealing with huge pte locks (huge_pte_lockptr and
huge_pte_lock) are modified to take a vma pointer instead of mm pointer.
The vma is then checked to determine if sharing is possible.  If it is,
then the page table lock in the mm_struct is used.  Otherwise, the
lock in the pmd page's struct page is used.

Note that the code uses the mm_struct lock if any part of the vma is
sharable.  This could be optimized by passing in the virtual address associated
with the pte pointer to determine if that specific address is sharable.
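
For clarity, the new huge_pte_lockptr() added below in mm/hugetlb.c is
reproduced here with explanatory comments.  vma_range_shareable() is the
helper also introduced by this patch; it always returns false when
CONFIG_ARCH_WANT_HUGE_PMD_SHARE is not set:

	spinlock_t *huge_pte_lockptr(struct hstate *h, struct vm_area_struct *vma,
				pte_t *pte)
	{
		struct mm_struct *mm = vma->vm_mm;

		/* Use the split pmd lock only if no part of the vma can share pmds */
		if (huge_page_size(h) == PMD_SIZE &&
				!vma_range_shareable(vma, vma->vm_start, vma->vm_end))
			return pmd_lockptr(mm, (pmd_t *) pte);
		VM_BUG_ON(huge_page_size(h) == PAGE_SIZE);
		/* Sharing possible (or non-PMD size): fall back to the per-mm lock */
		return &mm->page_table_lock;
	}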

Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
---
 arch/powerpc/mm/pgtable.c |  2 +-
 include/linux/hugetlb.h   | 27 ++++--------
 mm/damon/vaddr.c          |  4 +-
 mm/hmm.c                  |  2 +-
 mm/hugetlb.c              | 92 +++++++++++++++++++++++++++++----------
 mm/mempolicy.c            |  2 +-
 mm/migrate.c              |  2 +-
 mm/page_vma_mapped.c      |  2 +-
 8 files changed, 85 insertions(+), 48 deletions(-)

diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 6ec5a7dd7913..02f76e8b735a 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -261,7 +261,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
 
 		psize = hstate_get_psize(h);
 #ifdef CONFIG_DEBUG_VM
-		assert_spin_locked(huge_pte_lockptr(h, vma->vm_mm, ptep));
+		assert_spin_locked(huge_pte_lockptr(h, vma, ptep));
 #endif
 
 #else
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 75f4ff481538..c37611eb8571 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -864,15 +864,8 @@ static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask)
 	return modified_mask;
 }
 
-static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
-					   struct mm_struct *mm, pte_t *pte)
-{
-	if (huge_page_size(h) == PMD_SIZE)
-		return pmd_lockptr(mm, (pmd_t *) pte);
-	VM_BUG_ON(huge_page_size(h) == PAGE_SIZE);
-	return &mm->page_table_lock;
-}
-
+spinlock_t *huge_pte_lockptr(struct hstate *h, struct vm_area_struct *vma,
+					   pte_t *pte);
 #ifndef hugepages_supported
 /*
  * Some platform decide whether they support huge pages at boot
@@ -1073,8 +1066,11 @@ static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask)
 }
 
 static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
-					   struct mm_struct *mm, pte_t *pte)
+					   struct vm_area_struct *vma,
+					   pte_t *pte)
 {
+	struct mm_struct *mm = vma->vm_mm;
+
 	return &mm->page_table_lock;
 }
 
@@ -1096,15 +1092,8 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr
 }
 #endif	/* CONFIG_HUGETLB_PAGE */
 
-static inline spinlock_t *huge_pte_lock(struct hstate *h,
-					struct mm_struct *mm, pte_t *pte)
-{
-	spinlock_t *ptl;
-
-	ptl = huge_pte_lockptr(h, mm, pte);
-	spin_lock(ptl);
-	return ptl;
-}
+spinlock_t *huge_pte_lock(struct hstate *h, struct vm_area_struct *vma,
+					pte_t *pte);
 
 #if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
 extern void __init hugetlb_cma_reserve(int order);
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index b2ec0aa1ff45..125439fc88b6 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -432,7 +432,7 @@ static int damon_mkold_hugetlb_entry(pte_t *pte, unsigned long hmask,
 	spinlock_t *ptl;
 	pte_t entry;
 
-	ptl = huge_pte_lock(h, walk->mm, pte);
+	ptl = huge_pte_lock(h, walk->vma, pte);
 	entry = huge_ptep_get(pte);
 	if (!pte_present(entry))
 		goto out;
@@ -555,7 +555,7 @@ static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask,
 	spinlock_t *ptl;
 	pte_t entry;
 
-	ptl = huge_pte_lock(h, walk->mm, pte);
+	ptl = huge_pte_lock(h, walk->vma, pte);
 	entry = huge_ptep_get(pte);
 	if (!pte_present(entry))
 		goto out;
diff --git a/mm/hmm.c b/mm/hmm.c
index 3fd3242c5e50..95b443f2e48e 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -486,7 +486,7 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
 	spinlock_t *ptl;
 	pte_t entry;
 
-	ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
+	ptl = huge_pte_lock(hstate_vma(vma), vma, pte);
 	entry = huge_ptep_get(pte);
 
 	i = (start - range->start) >> PAGE_SHIFT;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e02df3527a9c..c1352ab7f941 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -94,8 +94,32 @@ DEFINE_SPINLOCK(hugetlb_lock);
 static int num_fault_mutexes;
 struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
 
-/* Forward declaration */
+/* Forward declarations */
 static int hugetlb_acct_memory(struct hstate *h, long delta);
+static bool vma_range_shareable(struct vm_area_struct *vma,
+				unsigned long start, unsigned long end);
+
+spinlock_t *huge_pte_lockptr(struct hstate *h, struct vm_area_struct *vma,
+				pte_t *pte)
+{
+	struct mm_struct *mm = vma->vm_mm;
+
+	if (huge_page_size(h) == PMD_SIZE &&
+			!vma_range_shareable(vma, vma->vm_start, vma->vm_end))
+		return pmd_lockptr(mm, (pmd_t *) pte);
+	VM_BUG_ON(huge_page_size(h) == PAGE_SIZE);
+	return &mm->page_table_lock;
+}
+
+spinlock_t *huge_pte_lock(struct hstate *h, struct vm_area_struct *vma,
+				pte_t *pte)
+{
+	spinlock_t *ptl;
+
+	ptl = huge_pte_lockptr(h, vma, pte);
+	spin_lock(ptl);
+	return ptl;
+}
 
 static inline bool subpool_is_free(struct hugepage_subpool *spool)
 {
@@ -4753,8 +4777,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		if ((dst_pte == src_pte) || !huge_pte_none(dst_entry))
 			continue;
 
-		dst_ptl = huge_pte_lock(h, dst, dst_pte);
-		src_ptl = huge_pte_lockptr(h, src, src_pte);
+		dst_ptl = huge_pte_lock(h, dst_vma, dst_pte);
+		src_ptl = huge_pte_lockptr(h, src_vma, src_pte);
 		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
 		entry = huge_ptep_get(src_pte);
 		dst_entry = huge_ptep_get(dst_pte);
@@ -4830,8 +4854,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 				put_page(ptepage);
 
 				/* Install the new huge page if src pte stable */
-				dst_ptl = huge_pte_lock(h, dst, dst_pte);
-				src_ptl = huge_pte_lockptr(h, src, src_pte);
+				dst_ptl = huge_pte_lock(h, dst_vma, dst_pte);
+				src_ptl = huge_pte_lockptr(h, src_vma, src_pte);
 				spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
 				entry = huge_ptep_get(src_pte);
 				if (!pte_same(src_pte_old, entry)) {
@@ -4882,8 +4906,8 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
 	spinlock_t *src_ptl, *dst_ptl;
 	pte_t pte;
 
-	dst_ptl = huge_pte_lock(h, mm, dst_pte);
-	src_ptl = huge_pte_lockptr(h, mm, src_pte);
+	dst_ptl = huge_pte_lock(h, vma, dst_pte);
+	src_ptl = huge_pte_lockptr(h, vma, src_pte);
 
 	/*
 	 * We don't have to worry about the ordering of src and dst ptlocks
@@ -4988,7 +5012,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
 		if (!ptep)
 			continue;
 
-		ptl = huge_pte_lock(h, mm, ptep);
+		ptl = huge_pte_lock(h, vma, ptep);
 		if (huge_pmd_unshare(mm, vma, &address, ptep)) {
 			spin_unlock(ptl);
 			tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
@@ -5485,7 +5509,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 			 * here.  Before returning error, get ptl and make
 			 * sure there really is no pte entry.
 			 */
-			ptl = huge_pte_lock(h, mm, ptep);
+			ptl = huge_pte_lock(h, vma, ptep);
 			ret = 0;
 			if (huge_pte_none(huge_ptep_get(ptep)))
 				ret = vmf_error(PTR_ERR(page));
@@ -5553,7 +5577,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 		vma_end_reservation(h, vma, haddr);
 	}
 
-	ptl = huge_pte_lock(h, mm, ptep);
+	ptl = huge_pte_lock(h, vma, ptep);
 	size = i_size_read(mapping->host) >> huge_page_shift(h);
 	if (idx >= size) {
 		beyond_i_size = true;
@@ -5733,7 +5757,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 								vma, haddr);
 	}
 
-	ptl = huge_pte_lock(h, mm, ptep);
+	ptl = huge_pte_lock(h, vma, ptep);
 
 	/* Check for a racing update before calling hugetlb_wp() */
 	if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
@@ -5935,7 +5959,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 		page_in_pagecache = true;
 	}
 
-	ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
+	ptl = huge_pte_lockptr(h, dst_vma, dst_pte);
 	spin_lock(ptl);
 
 	/*
@@ -6089,7 +6113,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
 				      huge_page_size(h));
 		if (pte)
-			ptl = huge_pte_lock(h, mm, pte);
+			ptl = huge_pte_lock(h, vma, pte);
 		absent = !pte || huge_pte_none(huge_ptep_get(pte));
 
 		/*
@@ -6267,7 +6291,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 		ptep = huge_pte_offset(mm, address, psize);
 		if (!ptep)
 			continue;
-		ptl = huge_pte_lock(h, mm, ptep);
+		ptl = huge_pte_lock(h, vma, ptep);
 		if (huge_pmd_unshare(mm, vma, &address, ptep)) {
 			/*
 			 * When uffd-wp is enabled on the vma, unshare
@@ -6583,26 +6607,44 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
 	return saddr;
 }
 
-static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
+static bool __vma_aligned_range_shareable(struct vm_area_struct *vma,
+				unsigned long start, unsigned long end)
 {
-	unsigned long base = addr & PUD_MASK;
-	unsigned long end = base + PUD_SIZE;
-
 	/*
 	 * check on proper vm_flags and page table alignment
 	 */
-	if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end))
+	if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, start, end))
 		return true;
 	return false;
 }
 
+static bool vma_range_shareable(struct vm_area_struct *vma,
+				unsigned long start, unsigned long end)
+{
+	unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE),
+		      v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
+
+	if (v_start >= v_end)
+		return false;
+
+	return __vma_aligned_range_shareable(vma, v_start, v_end);
+}
+
+static bool vma_addr_shareable(struct vm_area_struct *vma, unsigned long addr)
+{
+	unsigned long start = addr & PUD_MASK;
+	unsigned long end = start + PUD_SIZE;
+
+	return __vma_aligned_range_shareable(vma, start, end);
+}
+
 bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
 {
 #ifdef CONFIG_USERFAULTFD
 	if (uffd_disable_huge_pmd_share(vma))
 		return false;
 #endif
-	return vma_shareable(vma, addr);
+	return vma_addr_shareable(vma, addr);
 }
 
 /*
@@ -6672,7 +6714,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!spte)
 		goto out;
 
-	ptl = huge_pte_lock(hstate_vma(vma), mm, spte);
+	ptl = huge_pte_lock(hstate_vma(vma), vma, spte);
 	if (pud_none(*pud)) {
 		pud_populate(mm, pud,
 				(pmd_t *)((unsigned long)spte & PAGE_MASK));
@@ -6719,6 +6761,12 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
 }
 
 #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+static bool vma_range_shareable(struct vm_area_struct *vma,
+				unsigned long start, unsigned long end)
+{
+	return false;
+}
+
 pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 		      unsigned long addr, pud_t *pud)
 {
@@ -7034,7 +7082,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
 		ptep = huge_pte_offset(mm, address, sz);
 		if (!ptep)
 			continue;
-		ptl = huge_pte_lock(h, mm, ptep);
+		ptl = huge_pte_lock(h, vma, ptep);
 		/* We don't want 'address' to be changed */
 		huge_pmd_unshare(mm, vma, &tmp, ptep);
 		spin_unlock(ptl);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 58af432a39b2..4692640847eb 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -577,7 +577,7 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
 	spinlock_t *ptl;
 	pte_t entry;
 
-	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
+	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->vma, pte);
 	entry = huge_ptep_get(pte);
 	if (!pte_present(entry))
 		goto unlock;
diff --git a/mm/migrate.c b/mm/migrate.c
index b2678279eb43..3d765ee101ad 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -318,7 +318,7 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 void migration_entry_wait_huge(struct vm_area_struct *vma,
 		struct mm_struct *mm, pte_t *pte)
 {
-	spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte);
+	spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma, pte);
 	__migration_entry_wait(mm, pte, ptl);
 }
 
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index c10f839fc410..f09eaef2a828 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -174,7 +174,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
 		if (!pvmw->pte)
 			return false;
 
-		pvmw->ptl = huge_pte_lockptr(hstate, mm, pvmw->pte);
+		pvmw->ptl = huge_pte_lockptr(hstate, vma, pvmw->pte);
 		spin_lock(pvmw->ptl);
 		if (!check_pte(pvmw))
 			return not_found(pvmw);
-- 
2.35.1


Thread overview: 8+ messages
2022-04-20 22:37 [RFC PATCH v2 0/6] hugetlb: Change huge pmd sharing synchronization again Mike Kravetz
2022-04-20 22:37 ` [RFC PATCH v2 1/6] hugetlbfs: revert use i_mmap_rwsem to address page fault/truncate race Mike Kravetz
2022-04-20 22:37 ` [RFC PATCH v2 2/6] hugetlbfs: revert use i_mmap_rwsem for more pmd sharing synchronization Mike Kravetz
2022-04-20 22:37 ` [RFC PATCH v2 3/6] hugetlbfs: move routine remove_huge_page to hugetlb.c Mike Kravetz
2022-04-20 22:37 ` [RFC PATCH v2 4/6] hugetlbfs: catch and handle truncate racing with page faults Mike Kravetz
2022-04-20 22:37 ` Mike Kravetz [this message]
2022-04-20 22:37 ` [RFC PATCH v2 6/6] hugetlb: Check for pmd unshare and fault/lookup races Mike Kravetz
2022-04-22 16:38 ` [RFC PATCH v2 0/6] hugetlb: Change huge pmd sharing synchronization again Mike Kravetz
