From: Zi Yan <zi.yan@sent.com>
To: linux-mm@kvack.org
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>,
	Roman Gushchin <guro@fb.com>, Rik van Riel <riel@surriel.com>,
	Matthew Wilcox <willy@infradead.org>,
	Shakeel Butt <shakeelb@google.com>,
	Yang Shi <shy828301@gmail.com>, Jason Gunthorpe <jgg@nvidia.com>,
	Mike Kravetz <mike.kravetz@oracle.com>,
	Michal Hocko <mhocko@suse.com>,
	David Hildenbrand <david@redhat.com>,
	William Kucharski <william.kucharski@oracle.com>,
	Andrea Arcangeli <aarcange@redhat.com>,
	John Hubbard <jhubbard@nvidia.com>,
	David Nellans <dnellans@nvidia.com>,
	linux-kernel@vger.kernel.org, Zi Yan <ziy@nvidia.com>
Subject: [RFC PATCH v2 02/30] mm: pagewalk: use READ_ONCE when reading the PMD entry unlocked
Date: Mon, 28 Sep 2020 13:54:00 -0400	[thread overview]
Message-ID: <20200928175428.4110504-3-zi.yan@sent.com> (raw)
In-Reply-To: <20200928175428.4110504-1-zi.yan@sent.com>

From: Zi Yan <ziy@nvidia.com>

The pagewalker runs while only holding the mmap_sem for read. The pmd can
be set asynchronously, while also holding the mmap_sem for read. Read the
entry once with READ_ONCE() and have each pmd_entry callback recheck it
under the PMD lock before acting on it.

This follows the same approach as the commit "mm/pagewalk: use READ_ONCE
when reading the PUD entry unlocked".
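
For illustration, a minimal sketch (not part of the patch itself) of the
pattern each pmd_entry callback below is converted to. The callback name
example_pmd_entry is made up; the helpers, the memcmp recheck, and the
ACTION_AGAIN retry are the ones the patch already uses:

static int example_pmd_entry(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	spinlock_t *ptl;

	/* 'pmd' is the READ_ONCE() snapshot taken by walk_pmd_range(). */
	ptl = pmd_trans_huge_lock(pmdp, walk->vma);
	if (ptl) {
		/*
		 * The entry may have changed between the unlocked read
		 * and taking the lock; if so, ask the walker to retry.
		 */
		if (memcmp(pmdp, &pmd, sizeof(pmd)) != 0) {
			walk->action = ACTION_AGAIN;
			spin_unlock(ptl);
			return 0;
		}
		/* ... operate on the now-stable huge 'pmd' value ... */
		spin_unlock(ptl);
		return 0;
	}

	/*
	 * Non-huge path: checks use the snapshot; the pte table is
	 * mapped and locked through pmdp.
	 */
	if (pmd_trans_unstable(&pmd))
		return 0;
	/* pte_offset_map_lock(walk->mm, pmdp, addr, &ptl); ... */
	return 0;
}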

Signed-off-by: Zi Yan <ziy@nvidia.com>
---
 fs/proc/task_mmu.c       | 69 ++++++++++++++++++++++++++--------------
 include/linux/pagewalk.h |  2 +-
 mm/madvise.c             | 59 ++++++++++++++++++----------------
 mm/memcontrol.c          | 30 +++++++++++------
 mm/mempolicy.c           | 15 ++++++---
 mm/mincore.c             | 10 +++---
 mm/pagewalk.c            | 21 ++++++------
 7 files changed, 124 insertions(+), 82 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 069978777423..a21484b1414d 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -570,28 +570,33 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
 }
 #endif
 
-static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
-			   struct mm_walk *walk)
+static int smaps_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+			unsigned long end, struct mm_walk *walk)
 {
 	struct vm_area_struct *vma = walk->vma;
 	pte_t *pte;
 	spinlock_t *ptl;
 
-	ptl = pmd_trans_huge_lock(pmd, vma);
+	ptl = pmd_trans_huge_lock(pmdp, vma);
 	if (ptl) {
-		smaps_pmd_entry(pmd, addr, walk);
+		if (memcmp(pmdp, &pmd, sizeof(pmd)) != 0) {
+			walk->action = ACTION_AGAIN;
+			spin_unlock(ptl);
+			return 0;
+		}
+		smaps_pmd_entry(pmdp, addr, walk);
 		spin_unlock(ptl);
 		goto out;
 	}
 
-	if (pmd_trans_unstable(pmd))
+	if (pmd_trans_unstable(&pmd))
 		goto out;
 	/*
 	 * The mmap_lock held all the way back in m_start() is what
 	 * keeps khugepaged out of here and from collapsing things
 	 * in here.
 	 */
-	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	pte = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl);
 	for (; addr != end; pte++, addr += PAGE_SIZE)
 		smaps_pte_entry(pte, addr, walk);
 	pte_unmap_unlock(pte - 1, ptl);
@@ -1091,7 +1096,7 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
 }
 #endif
 
-static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
+static int clear_refs_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
 				unsigned long end, struct mm_walk *walk)
 {
 	struct clear_refs_private *cp = walk->private;
@@ -1100,20 +1105,25 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 	spinlock_t *ptl;
 	struct page *page;
 
-	ptl = pmd_trans_huge_lock(pmd, vma);
+	ptl = pmd_trans_huge_lock(pmdp, vma);
 	if (ptl) {
+		if (memcmp(pmdp, &pmd, sizeof(pmd)) != 0) {
+			walk->action = ACTION_AGAIN;
+			spin_unlock(ptl);
+			return 0;
+		}
 		if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
-			clear_soft_dirty_pmd(vma, addr, pmd);
+			clear_soft_dirty_pmd(vma, addr, pmdp);
 			goto out;
 		}
 
-		if (!pmd_present(*pmd))
+		if (!pmd_present(pmd))
 			goto out;
 
-		page = pmd_page(*pmd);
+		page = pmd_page(pmd);
 
 		/* Clear accessed and referenced bits. */
-		pmdp_test_and_clear_young(vma, addr, pmd);
+		pmdp_test_and_clear_young(vma, addr, pmdp);
 		test_and_clear_page_young(page);
 		ClearPageReferenced(page);
 out:
@@ -1121,10 +1131,10 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 		return 0;
 	}
 
-	if (pmd_trans_unstable(pmd))
+	if (pmd_trans_unstable(&pmd))
 		return 0;
 
-	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	pte = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl);
 	for (; addr != end; pte++, addr += PAGE_SIZE) {
 		ptent = *pte;
 
@@ -1388,8 +1398,8 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
 	return make_pme(frame, flags);
 }
 
-static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
-			     struct mm_walk *walk)
+static int pagemap_pmd_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+			unsigned long end, struct mm_walk *walk)
 {
 	struct vm_area_struct *vma = walk->vma;
 	struct pagemapread *pm = walk->private;
@@ -1401,9 +1411,14 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
 	ptl = pmd_trans_huge_lock(pmdp, vma);
 	if (ptl) {
 		u64 flags = 0, frame = 0;
-		pmd_t pmd = *pmdp;
 		struct page *page = NULL;
 
+		if (memcmp(pmdp, &pmd, sizeof(pmd)) != 0) {
+			walk->action = ACTION_AGAIN;
+			spin_unlock(ptl);
+			return 0;
+		}
+
 		if (vma->vm_flags & VM_SOFTDIRTY)
 			flags |= PM_SOFT_DIRTY;
 
@@ -1456,7 +1471,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
 		return err;
 	}
 
-	if (pmd_trans_unstable(pmdp))
+	if (pmd_trans_unstable(&pmd))
 		return 0;
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
@@ -1768,7 +1783,7 @@ static struct page *can_gather_numa_stats_pmd(pmd_t pmd,
 }
 #endif
 
-static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
+static int gather_pte_stats(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
 		unsigned long end, struct mm_walk *walk)
 {
 	struct numa_maps *md = walk->private;
@@ -1778,22 +1793,28 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
 	pte_t *pte;
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	ptl = pmd_trans_huge_lock(pmd, vma);
+	ptl = pmd_trans_huge_lock(pmdp, vma);
 	if (ptl) {
 		struct page *page;
 
-		page = can_gather_numa_stats_pmd(*pmd, vma, addr);
+		if (memcmp(pmdp, &pmd, sizeof(pmd)) != 0) {
+			walk->action = ACTION_AGAIN;
+			spin_unlock(ptl);
+			return 0;
+		}
+
+		page = can_gather_numa_stats_pmd(pmd, vma, addr);
 		if (page)
-			gather_stats(page, md, pmd_dirty(*pmd),
+			gather_stats(page, md, pmd_dirty(pmd),
 				     HPAGE_PMD_SIZE/PAGE_SIZE);
 		spin_unlock(ptl);
 		return 0;
 	}
 
-	if (pmd_trans_unstable(pmd))
+	if (pmd_trans_unstable(&pmd))
 		return 0;
 #endif
-	orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+	orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl);
 	do {
 		struct page *page = can_gather_numa_stats(*pte, vma, addr);
 		if (!page)
diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index 6caf28aadafb..686b57e94a9f 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -41,7 +41,7 @@ struct mm_walk_ops {
 			 unsigned long next, struct mm_walk *walk);
 	int (*pud_entry)(pud_t pud, pud_t *pudp, unsigned long addr,
 			 unsigned long next, struct mm_walk *walk);
-	int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
+	int (*pmd_entry)(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
 			 unsigned long next, struct mm_walk *walk);
 	int (*pte_entry)(pte_t *pte, unsigned long addr,
 			 unsigned long next, struct mm_walk *walk);
diff --git a/mm/madvise.c b/mm/madvise.c
index ae266dfede8a..16e7b8eadb13 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -183,14 +183,14 @@ static long madvise_behavior(struct vm_area_struct *vma,
 }
 
 #ifdef CONFIG_SWAP
-static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
+static int swapin_walk_pmd_entry(pmd_t pmd, pmd_t *pmdp, unsigned long start,
 	unsigned long end, struct mm_walk *walk)
 {
 	pte_t *orig_pte;
 	struct vm_area_struct *vma = walk->private;
 	unsigned long index;
 
-	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
+	if (pmd_none_or_trans_huge_or_clear_bad(&pmd))
 		return 0;
 
 	for (index = start; index != end; index += PAGE_SIZE) {
@@ -199,7 +199,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
 		struct page *page;
 		spinlock_t *ptl;
 
-		orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
+		orig_pte = pte_offset_map_lock(vma->vm_mm, pmdp, start, &ptl);
 		pte = *(orig_pte + ((index - start) / PAGE_SIZE));
 		pte_unmap_unlock(orig_pte, ptl);
 
@@ -304,7 +304,7 @@ static long madvise_willneed(struct vm_area_struct *vma,
 	return 0;
 }
 
-static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
+static int madvise_cold_or_pageout_pte_range(pmd_t pmd, pmd_t *pmdp,
 				unsigned long addr, unsigned long end,
 				struct mm_walk *walk)
 {
@@ -322,26 +322,29 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 		return -EINTR;
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	if (pmd_trans_huge(*pmd)) {
-		pmd_t orig_pmd;
+	if (pmd_trans_huge(pmd)) {
 		unsigned long next = pmd_addr_end(addr, end);
 
 		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
-		ptl = pmd_trans_huge_lock(pmd, vma);
+		ptl = pmd_trans_huge_lock(pmdp, vma);
 		if (!ptl)
 			return 0;
 
-		orig_pmd = *pmd;
-		if (is_huge_zero_pmd(orig_pmd))
+		if (memcmp(pmdp, &pmd, sizeof(pmd)) != 0) {
+			walk->action = ACTION_AGAIN;
+			goto huge_unlock;
+		}
+
+		if (is_huge_zero_pmd(pmd))
 			goto huge_unlock;
 
-		if (unlikely(!pmd_present(orig_pmd))) {
+		if (unlikely(!pmd_present(pmd))) {
 			VM_BUG_ON(thp_migration_supported() &&
-					!is_pmd_migration_entry(orig_pmd));
+					!is_pmd_migration_entry(pmd));
 			goto huge_unlock;
 		}
 
-		page = pmd_page(orig_pmd);
+		page = pmd_page(pmd);
 
 		/* Do not interfere with other mappings of this page */
 		if (page_mapcount(page) != 1)
@@ -361,12 +364,12 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 			return 0;
 		}
 
-		if (pmd_young(orig_pmd)) {
-			pmdp_invalidate(vma, addr, pmd);
-			orig_pmd = pmd_mkold(orig_pmd);
+		if (pmd_young(pmd)) {
+			pmdp_invalidate(vma, addr, pmdp);
+			pmd = pmd_mkold(pmd);
 
-			set_pmd_at(mm, addr, pmd, orig_pmd);
-			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+			set_pmd_at(mm, addr, pmdp, pmd);
+			tlb_remove_pmd_tlb_entry(tlb, pmdp, addr);
 		}
 
 		ClearPageReferenced(page);
@@ -388,11 +391,11 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 	}
 
 regular_page:
-	if (pmd_trans_unstable(pmd))
+	if (pmd_trans_unstable(&pmd))
 		return 0;
 #endif
 	tlb_change_page_size(tlb, PAGE_SIZE);
-	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl);
 	flush_tlb_batched_pending(mm);
 	arch_enter_lazy_mmu_mode();
 	for (; addr < end; pte++, addr += PAGE_SIZE) {
@@ -424,12 +427,12 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 			if (split_huge_page(page)) {
 				unlock_page(page);
 				put_page(page);
-				pte_offset_map_lock(mm, pmd, addr, &ptl);
+				pte_offset_map_lock(mm, pmdp, addr, &ptl);
 				break;
 			}
 			unlock_page(page);
 			put_page(page);
-			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+			pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
 			pte--;
 			addr -= PAGE_SIZE;
 			continue;
@@ -566,7 +569,7 @@ static long madvise_pageout(struct vm_area_struct *vma,
 	return 0;
 }
 
-static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
+static int madvise_free_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
 				unsigned long end, struct mm_walk *walk)
 
 {
@@ -580,15 +583,15 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 	unsigned long next;
 
 	next = pmd_addr_end(addr, end);
-	if (pmd_trans_huge(*pmd))
-		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
+	if (pmd_trans_huge(pmd))
+		if (madvise_free_huge_pmd(tlb, vma, pmdp, addr, next))
 			goto next;
 
-	if (pmd_trans_unstable(pmd))
+	if (pmd_trans_unstable(&pmd))
 		return 0;
 
 	tlb_change_page_size(tlb, PAGE_SIZE);
-	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	orig_pte = pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
 	flush_tlb_batched_pending(mm);
 	arch_enter_lazy_mmu_mode();
 	for (; addr != end; pte++, addr += PAGE_SIZE) {
@@ -634,12 +637,12 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 			if (split_huge_page(page)) {
 				unlock_page(page);
 				put_page(page);
-				pte_offset_map_lock(mm, pmd, addr, &ptl);
+				pte_offset_map_lock(mm, pmdp, addr, &ptl);
 				goto out;
 			}
 			unlock_page(page);
 			put_page(page);
-			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+			pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
 			pte--;
 			addr -= PAGE_SIZE;
 			continue;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9c4a0851348f..b28f620c1c5b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5827,7 +5827,7 @@ static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
 }
 #endif
 
-static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
+static int mem_cgroup_count_precharge_pte_range(pmd_t pmd, pmd_t *pmdp,
 					unsigned long addr, unsigned long end,
 					struct mm_walk *walk)
 {
@@ -5835,22 +5835,27 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
 	pte_t *pte;
 	spinlock_t *ptl;
 
-	ptl = pmd_trans_huge_lock(pmd, vma);
+	ptl = pmd_trans_huge_lock(pmdp, vma);
 	if (ptl) {
+		if (memcmp(pmdp, &pmd, sizeof(pmd)) != 0) {
+			walk->action = ACTION_AGAIN;
+			spin_unlock(ptl);
+			return 0;
+		}
 		/*
 		 * Note their can not be MC_TARGET_DEVICE for now as we do not
 		 * support transparent huge page with MEMORY_DEVICE_PRIVATE but
 		 * this might change.
 		 */
-		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
+		if (get_mctgt_type_thp(vma, addr, pmd, NULL) == MC_TARGET_PAGE)
 			mc.precharge += HPAGE_PMD_NR;
 		spin_unlock(ptl);
 		return 0;
 	}
 
-	if (pmd_trans_unstable(pmd))
+	if (pmd_trans_unstable(&pmd))
 		return 0;
-	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	pte = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl);
 	for (; addr != end; pte++, addr += PAGE_SIZE)
 		if (get_mctgt_type(vma, addr, *pte, NULL))
 			mc.precharge++;	/* increment precharge temporarily */
@@ -6023,7 +6028,7 @@ static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
 		mem_cgroup_clear_mc();
 }
 
-static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
+static int mem_cgroup_move_charge_pte_range(pmd_t pmd, pmd_t *pmdp,
 				unsigned long addr, unsigned long end,
 				struct mm_walk *walk)
 {
@@ -6035,13 +6040,18 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
 	union mc_target target;
 	struct page *page;
 
-	ptl = pmd_trans_huge_lock(pmd, vma);
+	ptl = pmd_trans_huge_lock(pmdp, vma);
 	if (ptl) {
+		if (memcmp(pmdp, &pmd, sizeof(pmd)) != 0) {
+			walk->action = ACTION_AGAIN;
+			spin_unlock(ptl);
+			return 0;
+		}
 		if (mc.precharge < HPAGE_PMD_NR) {
 			spin_unlock(ptl);
 			return 0;
 		}
-		target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
+		target_type = get_mctgt_type_thp(vma, addr, pmd, &target);
 		if (target_type == MC_TARGET_PAGE) {
 			page = target.page;
 			if (!isolate_lru_page(page)) {
@@ -6066,10 +6076,10 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
 		return 0;
 	}
 
-	if (pmd_trans_unstable(pmd))
+	if (pmd_trans_unstable(&pmd))
 		return 0;
 retry:
-	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	pte = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl);
 	for (; addr != end; addr += PAGE_SIZE) {
 		pte_t ptent = *(pte++);
 		bool device = false;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index eddbe4e56c73..731a7710395f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -516,7 +516,7 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
  * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
  *        on a node that does not follow the policy.
  */
-static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
+static int queue_pages_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
 			unsigned long end, struct mm_walk *walk)
 {
 	struct vm_area_struct *vma = walk->vma;
@@ -528,18 +528,23 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
 	pte_t *pte;
 	spinlock_t *ptl;
 
-	ptl = pmd_trans_huge_lock(pmd, vma);
+	ptl = pmd_trans_huge_lock(pmdp, vma);
 	if (ptl) {
-		ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
+		if (memcmp(pmdp, &pmd, sizeof(pmd)) != 0) {
+			walk->action = ACTION_AGAIN;
+			spin_unlock(ptl);
+			return 0;
+		}
+		ret = queue_pages_pmd(pmdp, ptl, addr, end, walk);
 		if (ret != 2)
 			return ret;
 	}
 	/* THP was split, fall through to pte walk */
 
-	if (pmd_trans_unstable(pmd))
+	if (pmd_trans_unstable(&pmd))
 		return 0;
 
-	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+	pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl);
 	for (; addr != end; pte++, addr += PAGE_SIZE) {
 		if (!pte_present(*pte))
 			continue;
diff --git a/mm/mincore.c b/mm/mincore.c
index 02db1a834021..168661f32aaa 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -96,8 +96,8 @@ static int mincore_unmapped_range(unsigned long addr, unsigned long end,
 	return 0;
 }
 
-static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
-			struct mm_walk *walk)
+static int mincore_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+			unsigned long end, struct mm_walk *walk)
 {
 	spinlock_t *ptl;
 	struct vm_area_struct *vma = walk->vma;
@@ -105,19 +105,19 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	unsigned char *vec = walk->private;
 	int nr = (end - addr) >> PAGE_SHIFT;
 
-	ptl = pmd_trans_huge_lock(pmd, vma);
+	ptl = pmd_trans_huge_lock(pmdp, vma);
 	if (ptl) {
 		memset(vec, 1, nr);
 		spin_unlock(ptl);
 		goto out;
 	}
 
-	if (pmd_trans_unstable(pmd)) {
+	if (pmd_trans_unstable(&pmd)) {
 		__mincore_unmapped_range(addr, end, vma, vec);
 		goto out;
 	}
 
-	ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+	ptep = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl);
 	for (; addr != end; ptep++, addr += PAGE_SIZE) {
 		pte_t pte = *ptep;
 
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 15d1e423b4a3..a3752c82a7b2 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -61,17 +61,19 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 static int walk_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
 			  struct mm_walk *walk)
 {
-	pmd_t *pmd;
+	pmd_t *pmdp;
+	pmd_t pmd;
 	unsigned long next;
 	const struct mm_walk_ops *ops = walk->ops;
 	int err = 0;
 	int depth = real_depth(3);
 
-	pmd = pmd_offset(&pud, addr);
+	pmdp = pmd_offset(&pud, addr);
 	do {
 again:
+		pmd = READ_ONCE(*pmdp);
 		next = pmd_addr_end(addr, end);
-		if (pmd_none(*pmd) || (!walk->vma && !walk->no_vma)) {
+		if (pmd_none(pmd) || (!walk->vma && !walk->no_vma)) {
 			if (ops->pte_hole)
 				err = ops->pte_hole(addr, next, depth, walk);
 			if (err)
@@ -86,7 +88,7 @@ static int walk_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
 		 * needs to know about pmd_trans_huge() pmds
 		 */
 		if (ops->pmd_entry)
-			err = ops->pmd_entry(pmd, addr, next, walk);
+			err = ops->pmd_entry(pmd, pmdp, addr, next, walk);
 		if (err)
 			break;
 
@@ -97,21 +99,22 @@ static int walk_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
 		 * Check this here so we only break down trans_huge
 		 * pages when we _need_ to
 		 */
-		if ((!walk->vma && (pmd_leaf(*pmd) || !pmd_present(*pmd))) ||
+		if ((!walk->vma && (pmd_leaf(pmd) || !pmd_present(pmd))) ||
 		    walk->action == ACTION_CONTINUE ||
 		    !(ops->pte_entry))
 			continue;
 
 		if (walk->vma) {
-			split_huge_pmd(walk->vma, pmd, addr);
-			if (pmd_trans_unstable(pmd))
+			split_huge_pmd(walk->vma, pmdp, addr);
+			pmd = READ_ONCE(*pmdp);
+			if (pmd_trans_unstable(&pmd))
 				goto again;
 		}
 
-		err = walk_pte_range(pmd, addr, next, walk);
+		err = walk_pte_range(pmdp, addr, next, walk);
 		if (err)
 			break;
-	} while (pmd++, addr = next, addr != end);
+	} while (pmdp++, addr = next, addr != end);
 
 	return err;
 }
-- 
2.28.0



Thread overview: 56+ messages
2020-09-28 17:53 [RFC PATCH v2 00/30] 1GB PUD THP support on x86_64 Zi Yan
2020-09-28 17:53 ` [RFC PATCH v2 01/30] mm/pagewalk: use READ_ONCE when reading the PUD entry unlocked Zi Yan
2020-09-28 17:54 ` Zi Yan [this message]
2020-09-28 17:54 ` [RFC PATCH v2 03/30] mm: thp: use single linked list for THP page table page deposit Zi Yan
2020-09-28 19:34   ` Matthew Wilcox
2020-09-28 20:34     ` Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 04/30] mm: add new helper functions to allocate one PMD page with 512 PTE pages Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 05/30] mm: thp: add page table deposit/withdraw functions for PUD THP Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 06/30] mm: change thp_order and thp_nr as we will have not just PMD THPs Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 07/30] mm: thp: add anonymous PUD THP page fault support without enabling it Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 08/30] mm: thp: add PUD THP support for copy_huge_pud Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 09/30] mm: thp: add PUD THP support to zap_huge_pud Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 10/30] fs: proc: add PUD THP kpageflag Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 11/30] mm: thp: handling PUD THP reference bit Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 12/30] mm: rmap: add mappped/unmapped page order to anonymous page rmap functions Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 13/30] mm: rmap: add map_order to page_remove_anon_compound_rmap Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 14/30] mm: thp: add PUD THP split_huge_pud_page() function Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 15/30] mm: thp: add PUD THP to deferred split list when PUD mapping is gone Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 16/30] mm: debug: adapt dump_page to PUD THP Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 17/30] mm: thp: PUD THP COW splits PUD page and falls back to PMD page Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 18/30] mm: thp: PUD THP follow_p*d_page() support Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 19/30] mm: stats: make smap stats understand PUD THPs Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 20/30] mm: page_vma_walk: teach it about PMD-mapped PUD THP Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 21/30] mm: thp: PUD THP support in try_to_unmap() Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 22/30] mm: thp: split PUD THPs at page reclaim Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 23/30] mm: support PUD THP pagemap support Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 24/30] mm: madvise: add page size options to MADV_HUGEPAGE and MADV_NOHUGEPAGE Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 25/30] mm: vma: add VM_HUGEPAGE_PUD to vm_flags at bit 37 Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 26/30] mm: thp: add a global knob to enable/disable PUD THPs Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 27/30] mm: thp: make PUD THP size public Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 28/30] hugetlb: cma: move cma reserve function to cma.c Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 29/30] mm: thp: use cma reservation for pud thp allocation Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 30/30] mm: thp: enable anonymous PUD THP at page fault path Zi Yan
2020-09-30 11:55 ` [RFC PATCH v2 00/30] 1GB PUD THP support on x86_64 Michal Hocko
2020-10-01 15:14   ` Zi Yan
2020-10-02  7:32     ` Michal Hocko
2020-10-02  7:50       ` David Hildenbrand
2020-10-02  8:10         ` Michal Hocko
2020-10-02  8:30           ` David Hildenbrand
2020-10-05 15:03             ` Zi Yan
2020-10-05 15:55               ` Matthew Wilcox
2020-10-05 17:04                 ` Roman Gushchin
2020-10-05 19:12                 ` Zi Yan
2020-10-05 19:37                   ` Matthew Wilcox
2020-10-05 17:16               ` Roman Gushchin
2020-10-05 17:27                 ` David Hildenbrand
2020-10-05 18:25                   ` Roman Gushchin
2020-10-05 18:33                     ` David Hildenbrand
2020-10-05 19:11                       ` Roman Gushchin
2020-10-06  8:25                         ` David Hildenbrand
2020-10-05 17:39               ` David Hildenbrand
2020-10-05 18:05                 ` Zi Yan
2020-10-05 18:48                   ` David Hildenbrand
2020-10-06 11:59                   ` Michal Hocko
2020-10-05 15:34         ` Zi Yan
2020-10-05 17:30           ` David Hildenbrand
