From: Qi Zheng <zhengqi.arch@bytedance.com>
To: akpm@linux-foundation.org, tglx@linutronix.de,
	hannes@cmpxchg.org, mhocko@kernel.org, vdavydov.dev@gmail.com,
	kirill.shutemov@linux.intel.com, mika.penttila@nextfour.com,
	david@redhat.com
Cc: linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-mm@kvack.org, songmuchun@bytedance.com,
	Qi Zheng <zhengqi.arch@bytedance.com>
Subject: [PATCH v2 7/9] mm: add THP support for pte_ref
Date: Thu, 19 Aug 2021 11:18:56 +0800	[thread overview]
Message-ID: <20210819031858.98043-8-zhengqi.arch@bytedance.com> (raw)
In-Reply-To: <20210819031858.98043-1-zhengqi.arch@bytedance.com>

The situation becomes more complex with THP enabled. In
handle_pte_fault(), the pmd entry can be populated with either
a PTE page table or a THP page, so we can no longer always take
a ->pte_refcount successfully. Add the following functions to
handle this case:

	pte_alloc_try_get()
	pmd_install_try_get()
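
Callers are expected to handle the three possible return values
of pte_alloc_try_get() roughly as follows (an illustrative sketch
mirroring the do_anonymous_page() hunk below, not an extra change):

	ret = pte_alloc_try_get(vma->vm_mm, vmf->pmd);
	if (ret < 0)
		return VM_FAULT_OOM;	/* PTE page table allocation failed */
	if (ret == 0)
		return 0;	/* a THP now maps this range, nothing to do here */
	/* ret == 1: ->pte_refcount is held, the PTE page table is stable */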

When a THP is split into a PTE page table, initialize its
->pte_refcount to HPAGE_PMD_NR so that the PTE page table can
still be freed normally through pte_put().
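
For reference, the split side boils down to the pgtable-generic.c
hunk below: pgtable_trans_huge_withdraw() primes the refcount of
the deposited PTE page table before it is reinstalled, roughly
(single relevant line shown out of context):

	pte_ref_init(pgtable, pmdp, HPAGE_PMD_NR);

so that later pte_put() calls can bring the count back to zero and
free the page table via free_pte_table().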

Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
 fs/proc/task_mmu.c      | 14 +++-------
 include/linux/pte_ref.h | 46 +++++++++++++++++++++++++++++--
 mm/Kconfig              |  2 +-
 mm/filemap.c            | 11 ++++----
 mm/khugepaged.c         |  9 ++++++
 mm/madvise.c            |  9 ++----
 mm/memcontrol.c         |  7 ++---
 mm/memory.c             | 43 +++++++++++++++--------------
 mm/mempolicy.c          |  2 +-
 mm/migrate.c            |  7 +----
 mm/mincore.c            |  2 +-
 mm/pgtable-generic.c    |  2 ++
 mm/pte_ref.c            | 73 +++++++++++++++++++++++++++++++++++++------------
 mm/sparse-vmemmap.c     |  2 +-
 mm/swapfile.c           |  3 +-
 mm/userfaultfd.c        | 10 ++++---
 16 files changed, 159 insertions(+), 83 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 85ee730ff6ae..b31915696210 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -583,7 +583,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 		goto out;
 	}
 
-	if (pmd_trans_unstable(pmd) || !pte_try_get(vma->vm_mm, pmd))
+	if (pmd_trans_unstable_or_pte_try_get(vma->vm_mm, pmd))
 		goto out;
 	/*
 	 * The mmap_lock held all the way back in m_start() is what
@@ -1146,7 +1146,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 		return 0;
 	}
 
-	if (pmd_trans_unstable(pmd) || !pte_try_get(vma->vm_mm, pmd))
+	if (pmd_trans_unstable_or_pte_try_get(vma->vm_mm, pmd))
 		return 0;
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
@@ -1475,12 +1475,9 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
 		spin_unlock(ptl);
 		return err;
 	}
-
-	if (pmd_trans_unstable(pmdp))
-		return 0;
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
-	if (!pte_try_get(walk->mm, pmdp))
+	if (pmd_trans_unstable_or_pte_try_get(walk->mm, pmdp))
 		return 0;
 
 	/*
@@ -1818,11 +1815,8 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
 		spin_unlock(ptl);
 		return 0;
 	}
-
-	if (pmd_trans_unstable(pmd))
-		return 0;
 #endif
-	if (!pte_try_get(walk->mm, pmd))
+	if (pmd_trans_unstable_or_pte_try_get(walk->mm, pmd))
 		return 0;
 
 	orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
diff --git a/include/linux/pte_ref.h b/include/linux/pte_ref.h
index c2389d03bb59..04cca9427270 100644
--- a/include/linux/pte_ref.h
+++ b/include/linux/pte_ref.h
@@ -14,8 +14,9 @@
 #include <asm/pgalloc.h>
 
 int __pte_alloc(struct mm_struct *mm, pmd_t *pmd);
-void pmd_install_get(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
+bool pmd_install_try_get(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
 int __pte_alloc_get(struct mm_struct *mm, pmd_t *pmd);
+int __pte_alloc_try_get(struct mm_struct *mm, pmd_t *pmd);
 
 #define pte_alloc(mm, pmd) (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd))
 
@@ -95,7 +96,8 @@ static inline bool pte_try_get(struct mm_struct *mm, pmd_t *pmdp)
 	spinlock_t *ptl;
 
 	ptl = pmd_lock(mm, pmdp);
-	if (!pmd_present(*pmdp) || !pte_get_unless_zero(pmdp))
+	if (pmd_leaf(*pmdp) || !pmd_present(*pmdp) ||
+	    !pte_get_unless_zero(pmdp))
 		retval = false;
 	spin_unlock(ptl);
 
@@ -174,6 +176,30 @@ static inline int pte_alloc_get(struct mm_struct *mm, pmd_t *pmdp)
 #define pte_alloc_get_map_lock(mm, pmd, address, ptlp)		\
 	(pte_alloc_get(mm, pmd) ?				\
 		 NULL : pte_offset_map_lock(mm, pmd, address, ptlp))
+
+/*
+ * pte_alloc_try_get - allocate a PTE page table if the pmd entry is none, and
+ *		       try to hold a ->pte_refcount
+ * @mm: the mm_struct of the target address space.
+ * @pmdp: a pointer to the pmd entry corresponding to the PTE page table.
+ *
+ * Return 1 if the ->pte_refcount is successfully held, 0 if the pmd entry
+ * has been populated with a THP page by another thread, or -ENOMEM if
+ * allocating a new PTE page table fails.
+ */
+static inline int pte_alloc_try_get(struct mm_struct *mm, pmd_t *pmdp)
+{
+	if (!pte_try_get(mm, pmdp))
+		return __pte_alloc_try_get(mm, pmdp);
+	return 1;
+}
+
+static inline bool pmd_trans_unstable_or_pte_try_get(struct mm_struct *mm, pmd_t *pmdp)
+{
+	if (!pte_try_get(mm, pmdp))
+		return true;
+	return false;
+}
 #else
 static inline void pte_ref_init(pgtable_t pte, pmd_t *pmd, int count)
 {
@@ -229,6 +255,22 @@ static inline int pte_alloc_get(struct mm_struct *mm, pmd_t *pmdp)
 
 #define pte_alloc_get_map_lock(mm, pmd, address, ptlp)		\
 	pte_alloc_map_lock(mm, pmd, address, ptlp)
+
+static inline int pte_alloc_try_get(struct mm_struct *mm, pmd_t *pmdp)
+{
+	if (unlikely(pmd_none(*pmdp)))
+		return __pte_alloc_try_get(mm, pmdp);
+	if (unlikely(pmd_devmap_trans_unstable(pmdp)))
+		return 0;
+	return 1;
+}
+
+static inline bool pmd_trans_unstable_or_pte_try_get(struct mm_struct *mm, pmd_t *pmdp)
+{
+	if (pmd_trans_unstable(pmdp))
+		return true;
+	return false;
+}
 #endif /* CONFIG_FREE_USER_PTE */
 
 #endif
diff --git a/mm/Kconfig b/mm/Kconfig
index fb16ecebcb80..18aea3d0ce49 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -896,7 +896,7 @@ config SECRETMEM
 
 config FREE_USER_PTE
 	def_bool y
-	depends on X86_64 && !TRANSPARENT_HUGEPAGE
+	depends on X86_64
 
 source "mm/damon/Kconfig"
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 7f2eb64e5c76..8bcab6fc5d17 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3211,12 +3211,13 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page)
 		}
 	}
 
-	if (IS_ENABLED(CONFIG_FREE_USER_PTE) || pmd_none(*vmf->pmd))
-		pmd_install_get(mm, vmf->pmd, &vmf->prealloc_pte);
-
-	/* See comment in handle_pte_fault() */
-	if (pmd_devmap_trans_unstable(vmf->pmd))
+	if (IS_ENABLED(CONFIG_FREE_USER_PTE) || pmd_none(*vmf->pmd)) {
+		if (!pmd_install_try_get(mm, vmf->pmd, &vmf->prealloc_pte))
+			goto out;
+	} else if (pmd_devmap_trans_unstable(vmf->pmd)) {
+		/* See comment in handle_pte_fault() */
 		goto out;
+	}
 
 	return false;
 out:
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 6b9c98ddcd09..95d90c896580 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1240,6 +1240,10 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 	}
 
 	memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
+	if (!pte_try_get(mm, pmd)) {
+		result = SCAN_PMD_NULL;
+		goto out;
+	}
 	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
 	for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
 	     _pte++, _address += PAGE_SIZE) {
@@ -1361,6 +1365,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 	}
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
+	pte_put(mm, pmd, address);
 	if (ret) {
 		node = khugepaged_find_target_node();
 		/* collapse_huge_page will return with the mmap_lock released */
@@ -1463,6 +1468,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 	if (!pmd)
 		goto drop_hpage;
 
+	if (!pte_try_get(mm, pmd))
+		goto drop_hpage;
 	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
 
 	/* step 1: check all mapped PTEs are to the right huge page */
@@ -1501,6 +1508,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 	}
 
 	pte_unmap_unlock(start_pte, ptl);
+	pte_put(mm, pmd, haddr);
 
 	/* step 3: set proper refcount and mm_counters. */
 	if (count) {
@@ -1522,6 +1530,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 
 abort:
 	pte_unmap_unlock(start_pte, ptl);
+	pte_put(mm, pmd, haddr);
 	goto drop_hpage;
 }
 
diff --git a/mm/madvise.c b/mm/madvise.c
index 1494da73281c..1befb4e64f2b 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -191,8 +191,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
 	struct vm_area_struct *vma = walk->private;
 	unsigned long index;
 
-	if (pmd_none_or_trans_huge_or_clear_bad(pmd) ||
-	    !pte_try_get(vma->vm_mm, pmd))
+	if (pmd_trans_unstable_or_pte_try_get(vma->vm_mm, pmd))
 		return 0;
 
 	for (index = start; index != end; index += PAGE_SIZE) {
@@ -392,10 +391,8 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 	}
 
 regular_page:
-	if (pmd_trans_unstable(pmd))
-		return 0;
 #endif
-	if (!pte_try_get(vma->vm_mm, pmd))
+	if (pmd_trans_unstable_or_pte_try_get(vma->vm_mm, pmd))
 		return 0;
 
 	tlb_change_page_size(tlb, PAGE_SIZE);
@@ -595,7 +592,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
 			goto next;
 
-	if (pmd_trans_unstable(pmd) || !pte_try_get(mm, pmd))
+	if (pmd_trans_unstable_or_pte_try_get(mm, pmd))
 		return 0;
 	nr_put++;
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f7d203ce14af..56c580d37e94 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5810,7 +5810,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
 		return 0;
 	}
 
-	if (pmd_trans_unstable(pmd) || !pte_try_get(vma->vm_mm, pmd))
+	if (pmd_trans_unstable_or_pte_try_get(vma->vm_mm, pmd))
 		return 0;
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	for (; addr != end; pte++, addr += PAGE_SIZE)
@@ -6029,11 +6029,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
 		spin_unlock(ptl);
 		return 0;
 	}
-
-	if (pmd_trans_unstable(pmd))
-		return 0;
 retry:
-	if (!pte_try_get(vma->vm_mm, pmd))
+	if (pmd_trans_unstable_or_pte_try_get(vma->vm_mm, pmd))
 		return 0;
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	for (; addr != end; addr += PAGE_SIZE) {
diff --git a/mm/memory.c b/mm/memory.c
index 8fcef8b67971..99dde124755b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -453,7 +453,7 @@ int __pte_alloc_kernel(pmd_t *pmd)
 
 	spin_lock(&init_mm.page_table_lock);
 	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
-		smp_wmb(); /* See comment in pmd_install_get() */
+		smp_wmb(); /* See comment in __pmd_install() */
 		pmd_populate_kernel(&init_mm, pmd, new);
 		new = NULL;
 	}
@@ -1478,8 +1478,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
 		 * because MADV_DONTNEED holds the mmap_lock in read
 		 * mode.
 		 */
-		if (pmd_none_or_trans_huge_or_clear_bad(pmd) ||
-		    !pte_try_get(tlb->mm, pmd))
+		if (pmd_trans_unstable_or_pte_try_get(tlb->mm, pmd))
 			goto next;
 		next = zap_pte_range(tlb, vma, pmd, addr, next, details);
 		pte_put(tlb->mm, pmd, addr);
@@ -3760,7 +3759,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 		return VM_FAULT_SIGBUS;
 
 	/*
-	 * Use pte_alloc_get() instead of pte_alloc_map().  We can't run
+	 * Use pte_alloc_try_get() instead of pte_alloc_get_map().  We can't run
 	 * pte_offset_map() on pmds where a huge pmd might be created
 	 * from a different thread.
 	 *
@@ -3769,12 +3768,10 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 	 *
 	 * Here we only have mmap_read_lock(mm).
 	 */
-	if (pte_alloc_get(vma->vm_mm, vmf->pmd))
-		return VM_FAULT_OOM;
-
-	/* See comment in handle_pte_fault() */
-	if (unlikely(pmd_trans_unstable(vmf->pmd)))
-		return 0;
+	ret = pte_alloc_try_get(vma->vm_mm, vmf->pmd);
+	if (ret <= 0)
+		return ret < 0 ? VM_FAULT_OOM : 0;
+	ret = 0;
 
 	/* Use the zero-page for reads */
 	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
@@ -4068,18 +4065,22 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
 				return ret;
 		}
 
-		if (vmf->prealloc_pte)
-			pmd_install_get(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte);
-		else if (unlikely(pte_alloc_get(vma->vm_mm, vmf->pmd)))
-			return VM_FAULT_OOM;
+		if (vmf->prealloc_pte) {
+			if (!pmd_install_try_get(vma->vm_mm, vmf->pmd,
+			    &vmf->prealloc_pte))
+				return 0;
+		} else {
+			ret = pte_alloc_try_get(vma->vm_mm, vmf->pmd);
+			if (ret <= 0)
+				return ret < 0 ? VM_FAULT_OOM : 0;
+		}
+	} else if (pmd_devmap_trans_unstable(vmf->pmd)) {
+		/* See comment in handle_pte_fault() */
+		return 0;
 	} else if (!pte_try_get(vma->vm_mm, vmf->pmd)) {
 		goto retry;
 	}
 
-	/* See comment in handle_pte_fault() */
-	if (pmd_devmap_trans_unstable(vmf->pmd))
-		return 0;
-
 	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
 				      vmf->address, &vmf->ptl);
 	ret = 0;
@@ -4890,7 +4891,7 @@ int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
 	if (pgd_present(*pgd))		/* Another has populated it */
 		p4d_free(mm, new);
 	else {
-		smp_wmb(); /* See comment in pmd_install_get() */
+		smp_wmb(); /* See comment in __pmd_install() */
 		pgd_populate(mm, pgd, new);
 	}
 	spin_unlock(&mm->page_table_lock);
@@ -4912,7 +4913,7 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
 	spin_lock(&mm->page_table_lock);
 	if (!p4d_present(*p4d)) {
 		mm_inc_nr_puds(mm);
-		smp_wmb(); /* See comment in pmd_install_get() */
+		smp_wmb(); /* See comment in __pmd_install() */
 		p4d_populate(mm, p4d, new);
 	} else	/* Another has populated it */
 		pud_free(mm, new);
@@ -4936,7 +4937,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 	ptl = pud_lock(mm, pud);
 	if (!pud_present(*pud)) {
 		mm_inc_nr_pmds(mm);
-		smp_wmb(); /* See comment in pmd_install_get() */
+		smp_wmb(); /* See comment in __pmd_install() */
 		pud_populate(mm, pud, new);
 	} else	/* Another has populated it */
 		pmd_free(mm, new);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index acef7df762a9..9d0493f80a75 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -518,7 +518,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
 	}
 	/* THP was split, fall through to pte walk */
 
-	if (pmd_trans_unstable(pmd) || !pte_try_get(walk->mm, pmd))
+	if (pmd_trans_unstable_or_pte_try_get(walk->mm, pmd))
 		return 0;
 
 	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
diff --git a/mm/migrate.c b/mm/migrate.c
index a3bcef1430c9..af5b8900551b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2829,7 +2829,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
 		goto abort;
 
 	/*
-	 * Use pte_alloc_get() instead of pte_alloc_map().  We can't run
+	 * Use pte_alloc_try_get() instead of pte_alloc_get_map().  We can't run
 	 * pte_offset_map() on pmds where a huge pmd might be created
 	 * from a different thread.
 	 *
@@ -2841,12 +2841,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
 	if (pte_alloc_get(mm, pmdp))
 		goto abort;
 
-	/* See the comment in pte_alloc_one_map() */
-	if (unlikely(pmd_trans_unstable(pmdp)))
-		goto abort;
-
 	if (unlikely(anon_vma_prepare(vma)))
 		goto put;
 	if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL))
 		goto put;
 
diff --git a/mm/mincore.c b/mm/mincore.c
index 0401b526adee..a72ec90dd54f 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -114,7 +114,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 		goto out;
 	}
 
-	if (pmd_trans_unstable(pmd) || !pte_try_get(walk->mm, pmd)) {
+	if (pmd_trans_unstable_or_pte_try_get(walk->mm, pmd)) {
 		__mincore_unmapped_range(addr, end, vma, vec);
 		goto out;
 	}
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 4e640baf9794..88464c54619f 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -10,6 +10,7 @@
 #include <linux/pagemap.h>
 #include <linux/hugetlb.h>
 #include <linux/pgtable.h>
+#include <linux/pte_ref.h>
 #include <asm/tlb.h>
 
 /*
@@ -186,6 +187,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
 							  struct page, lru);
 	if (pmd_huge_pte(mm, pmdp))
 		list_del(&pgtable->lru);
+	pte_ref_init(pgtable, pmdp, HPAGE_PMD_NR);
 	return pgtable;
 }
 #endif
diff --git a/mm/pte_ref.c b/mm/pte_ref.c
index dfbf817b7367..dff32909c7c4 100644
--- a/mm/pte_ref.c
+++ b/mm/pte_ref.c
@@ -42,6 +42,28 @@ void free_pte_table(struct mm_struct *mm, pmd_t *pmdp, unsigned long addr)
 	pte_free(mm, pmd_pgtable(pmd));
 }
 
+static inline void __pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
+{
+	mm_inc_nr_ptes(mm);
+	/*
+	 * Ensure all pte setup (eg. pte page lock and page clearing) are
+	 * visible before the pte is made visible to other CPUs by being
+	 * put into page tables.
+	 *
+	 * The other side of the story is the pointer chasing in the page
+	 * table walking code (when walking the page table without locking;
+	 * ie. most of the time). Fortunately, these data accesses consist
+	 * of a chain of data-dependent loads, meaning most CPUs (alpha
+	 * being the notable exception) will already guarantee loads are
+	 * seen in-order. See the alpha page table accessors for the
+	 * smp_rmb() barriers in page table walking code.
+	 */
+	smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
+	pmd_populate(mm, pmd, *pte);
+	pte_ref_init(*pte, pmd, 1);
+	*pte = NULL;
+}
+
 void pmd_install_get(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
 {
 	spinlock_t *ptl;
@@ -49,24 +71,7 @@ void pmd_install_get(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
 retry:
 	ptl = pmd_lock(mm, pmd);
 	if (likely(pmd_none(*pmd))) {
-		mm_inc_nr_ptes(mm);
-		/*
-		 * Ensure all pte setup (eg. pte page lock and page clearing) are
-		 * visible before the pte is made visible to other CPUs by being
-		 * put into page tables.
-		 *
-		 * The other side of the story is the pointer chasing in the page
-		 * table walking code (when walking the page table without locking;
-		 * ie. most of the time). Fortunately, these data accesses consist
-		 * of a chain of data-dependent loads, meaning most CPUs (alpha
-		 * being the notable exception) will already guarantee loads are
-		 * seen in-order. See the alpha page table accessors for the
-		 * smp_rmb() barriers in page table walking code.
-		 */
-		smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
-		pmd_populate(mm, pmd, *pte);
-		pte_ref_init(*pte, pmd, 1);
-		*pte = NULL;
+		__pmd_install(mm, pmd, pte);
 	} else if (!pte_get_unless_zero(pmd)) {
 		spin_unlock(ptl);
 		goto retry;
@@ -74,6 +79,25 @@ void pmd_install_get(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
 	spin_unlock(ptl);
 }
 
+bool pmd_install_try_get(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
+{
+	spinlock_t *ptl;
+	bool retval = true;
+
+retry:
+	ptl = pmd_lock(mm, pmd);
+	if (likely(pmd_none(*pmd))) {
+		__pmd_install(mm, pmd, pte);
+	} else if (pmd_leaf(*pmd) || !pmd_present(*pmd)) {
+		retval = false;
+	} else if (!pte_get_unless_zero(pmd)) {
+		spin_unlock(ptl);
+		goto retry;
+	}
+	spin_unlock(ptl);
+	return retval;
+}
+
 int __pte_alloc_get(struct mm_struct *mm, pmd_t *pmd)
 {
 	pgtable_t new = pte_alloc_one(mm);
@@ -86,6 +110,19 @@ int __pte_alloc_get(struct mm_struct *mm, pmd_t *pmd)
 	return 0;
 }
 
+int __pte_alloc_try_get(struct mm_struct *mm, pmd_t *pmd)
+{
+	int retval;
+	pgtable_t new = pte_alloc_one(mm);
+	if (!new)
+		return -ENOMEM;
+
+	retval = pmd_install_try_get(mm, pmd, &new);
+	if (new)
+		pte_free(mm, new);
+	return retval;
+}
+
 int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
 {
 	return __pte_alloc_get(mm, pmd);
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 818c82ee3c6e..0ce288b979f1 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -76,7 +76,7 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start,
 		set_pte_at(&init_mm, addr, pte, entry);
 	}
 
-	/* Make pte visible before pmd. See comment in pmd_install_get(). */
+	/* Make pte visible before pmd. See comment in __pmd_install(). */
 	smp_wmb();
 	pmd_populate_kernel(&init_mm, pmd, pgtable);
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 10ebfc94208a..6db8381e1e19 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2021,8 +2021,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 	do {
 		cond_resched();
 		next = pmd_addr_end(addr, end);
-		if (pmd_none_or_trans_huge_or_clear_bad(pmd) ||
-		    !pte_try_get(vma->vm_mm, pmd))
+		if (pmd_trans_unstable_or_pte_try_get(vma->vm_mm, pmd))
 			continue;
 		ret = unuse_pte_range(vma, pmd, addr, next, type,
 				      frontswap, fs_pages_to_unuse);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 7661929b27bf..cb76c6167541 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -572,6 +572,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
 
 	while (src_addr < src_start + len) {
 		pmd_t dst_pmdval;
+		int ret = 1;
 
 		BUG_ON(dst_addr >= dst_start + len);
 
@@ -590,12 +591,13 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
 			err = -EEXIST;
 			break;
 		}
-		if (unlikely(pmd_none(dst_pmdval)) &&
-		    unlikely(__pte_alloc_get(dst_mm, dst_pmd))) {
+		if ((IS_ENABLED(CONFIG_FREE_USER_PTE) &&
+		     unlikely((ret = pte_alloc_try_get(dst_mm, dst_pmd)) < 0)) ||
+		    (unlikely(pmd_none(dst_pmdval)) &&
+		     unlikely((ret = __pte_alloc_try_get(dst_mm, dst_pmd)) < 0))) {
 			err = -ENOMEM;
 			break;
-		} else if (unlikely(pmd_trans_huge(*dst_pmd)) ||
-			   !pte_try_get(dst_mm, dst_pmd)) {
+		} else if (!ret || unlikely(pmd_trans_huge(*dst_pmd))) {
 			/*
 			 * If an huge pmd materialized from under us fail or the
 			 * pte populated in the dst_pmd was freed.
-- 
2.11.0



Thread overview: 39+ messages
2021-08-19  3:18 [PATCH v2 0/9] Free user PTE page table pages Qi Zheng
2021-08-19  3:18 ` [PATCH v2 1/9] mm: introduce pmd_install() helper Qi Zheng
2021-08-24 16:26   ` David Hildenbrand
2021-08-25 16:20     ` Qi Zheng
2021-08-25 16:32       ` David Hildenbrand
2021-08-26  3:04         ` Qi Zheng
2021-08-19  3:18 ` [PATCH v2 2/9] mm: remove redundant smp_wmb() Qi Zheng
2021-08-19  3:18 ` [PATCH v2 3/9] mm: rework the parameter of lock_page_or_retry() Qi Zheng
2021-08-19  3:18 ` [PATCH v2 4/9] mm: move pte_alloc{,_map,_map_lock}() to a separate file Qi Zheng
2021-08-19  3:18 ` [PATCH v2 5/9] mm: pte_refcount infrastructure Qi Zheng
2021-08-19  3:18 ` [PATCH v2 6/9] mm: free user PTE page table pages Qi Zheng
2021-08-19  7:01   ` David Hildenbrand
2021-08-19 10:18     ` [External] " Qi Zheng
2021-09-01 13:53   ` Jason Gunthorpe
2021-09-01 13:57     ` David Hildenbrand
2021-09-01 15:32       ` Jason Gunthorpe
2021-09-01 16:13         ` David Hildenbrand
2021-09-01 16:16           ` Jason Gunthorpe
2021-09-01 16:19             ` David Hildenbrand
2021-09-01 17:10               ` Jason Gunthorpe
2021-09-01 17:49                 ` David Hildenbrand
2021-09-01 17:55                   ` Jason Gunthorpe
2021-09-01 17:58                     ` David Hildenbrand
2021-09-01 18:09                       ` Jason Gunthorpe
2021-09-01 18:10                         ` David Hildenbrand
2021-09-02  7:04                     ` Qi Zheng
2021-09-02  6:53         ` Qi Zheng
2021-08-19  3:18 ` Qi Zheng [this message]
2021-08-19  3:18 ` [PATCH v2 8/9] mm: free PTE page table by using rcu mechanism Qi Zheng
2021-08-19  3:18 ` [PATCH v2 9/9] mm: use mmu_gather to free PTE page table Qi Zheng
2021-09-01 12:32 ` [PATCH v2 0/9] Free user PTE page table pages David Hildenbrand
2021-09-01 16:07   ` Jason Gunthorpe
2021-09-01 16:10     ` David Hildenbrand
2021-09-02  3:37   ` Qi Zheng
2021-09-15 14:52   ` Qi Zheng
2021-09-15 14:59     ` Jason Gunthorpe
2021-09-16  5:32       ` Qi Zheng
2021-09-16  8:30         ` David Hildenbrand
2021-09-16  8:41           ` Qi Zheng
