* [PATCH] hugetlb: simplify hugetlb handling in follow_page_mask
@ 2022-08-29 23:40 ` Mike Kravetz
  0 siblings, 0 replies; 60+ messages in thread
From: Mike Kravetz @ 2022-08-29 23:40 UTC (permalink / raw)
  To: linux-mm, linux-kernel, linuxppc-dev, linux-ia64
  Cc: Baolin Wang, David Hildenbrand, Aneesh Kumar K . V,
	Naoya Horiguchi, Michael Ellerman, Muchun Song, Andrew Morton,
	Mike Kravetz

During discussions of this series [1], it was suggested that hugetlb
handling code in follow_page_mask could be simplified.  At the beginning
of follow_page_mask, there is currently a call to follow_huge_addr which
'may' handle hugetlb pages.  ia64 is the only architecture which provides
a follow_huge_addr routine that does not return an error.  Instead, at each
level of the page table a check is made for a hugetlb entry.  If a hugetlb
entry is found, a call to a routine associated with that entry is made.

Currently, there are two checks for hugetlb entries at each page table
level.  The first check is of the form:
	if (p?d_huge())
		page = follow_huge_p?d();
the second check is of the form:
	if (is_hugepd())
		page = follow_huge_pd().

We can replace these checks, as well as the special handling routines
such as follow_huge_p?d() and follow_huge_pd(), with a single routine to
handle hugetlb vmas.
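
In outline, the consolidated check at the top of follow_page_mask (shown
in full in the mm/gup.c hunk below) is:
	if (is_vm_hugetlb_page(vma)) {
		page = hugetlb_follow_page_mask(vma, address, flags);
		if (!page)
			page = no_page_table(vma, flags);
		return page;
	}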

A new routine hugetlb_follow_page_mask is called for hugetlb vmas at the
beginning of follow_page_mask.  hugetlb_follow_page_mask will use the
existing routine huge_pte_offset to walk page tables looking for hugetlb
entries.  huge_pte_offset can be overridden by architectures, and already
handles special cases such as hugepd entries.
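
Condensed from the mm/hugetlb.c hunk below, the core of the new walk is
roughly:
	pte = huge_pte_offset(mm, haddr, huge_page_size(h));
	if (!pte)
		return NULL;
	ptl = huge_pte_lock(h, mm, pte);
	entry = huge_ptep_get(pte);
	if (pte_present(entry))
		page = pte_page(entry) +
			((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
	/* ... take a page reference, or wait on a migration entry ... */
	spin_unlock(ptl);
	return page;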

[1] https://lore.kernel.org/linux-mm/cover.1661240170.git.baolin.wang@linux.alibaba.com/
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
---
 arch/ia64/mm/hugetlbpage.c    |  15 ---
 arch/powerpc/mm/hugetlbpage.c |  37 --------
 include/linux/hugetlb.h       |  51 ++--------
 mm/gup.c                      |  65 ++-----------
 mm/hugetlb.c                  | 173 +++++++++++-----------------------
 5 files changed, 74 insertions(+), 267 deletions(-)

diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index f993cb36c062..380d2f3966c9 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -91,21 +91,6 @@ int prepare_hugepage_range(struct file *file,
 	return 0;
 }
 
-struct page *follow_huge_addr(struct mm_struct *mm, unsigned long addr, int write)
-{
-	struct page *page;
-	pte_t *ptep;
-
-	if (REGION_NUMBER(addr) != RGN_HPAGE)
-		return ERR_PTR(-EINVAL);
-
-	ptep = huge_pte_offset(mm, addr, HPAGE_SIZE);
-	if (!ptep || pte_none(*ptep))
-		return NULL;
-	page = pte_page(*ptep);
-	page += ((addr & ~HPAGE_MASK) >> PAGE_SHIFT);
-	return page;
-}
 int pmd_huge(pmd_t pmd)
 {
 	return 0;
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index bc84a594ca62..b0e037c75c12 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -506,43 +506,6 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 	} while (addr = next, addr != end);
 }
 
-struct page *follow_huge_pd(struct vm_area_struct *vma,
-			    unsigned long address, hugepd_t hpd,
-			    int flags, int pdshift)
-{
-	pte_t *ptep;
-	spinlock_t *ptl;
-	struct page *page = NULL;
-	unsigned long mask;
-	int shift = hugepd_shift(hpd);
-	struct mm_struct *mm = vma->vm_mm;
-
-retry:
-	/*
-	 * hugepage directory entries are protected by mm->page_table_lock
-	 * Use this instead of huge_pte_lockptr
-	 */
-	ptl = &mm->page_table_lock;
-	spin_lock(ptl);
-
-	ptep = hugepte_offset(hpd, address, pdshift);
-	if (pte_present(*ptep)) {
-		mask = (1UL << shift) - 1;
-		page = pte_page(*ptep);
-		page += ((address & mask) >> PAGE_SHIFT);
-		if (flags & FOLL_GET)
-			get_page(page);
-	} else {
-		if (is_hugetlb_entry_migration(*ptep)) {
-			spin_unlock(ptl);
-			__migration_entry_wait(mm, ptep, ptl);
-			goto retry;
-		}
-	}
-	spin_unlock(ptl);
-	return page;
-}
-
 bool __init arch_hugetlb_valid_size(unsigned long size)
 {
 	int shift = __ffs(size);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 852f911d676e..8ea3e5e726e4 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -142,6 +142,8 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
 			     unsigned long len);
 int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *,
 			    struct vm_area_struct *, struct vm_area_struct *);
+struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
+				unsigned long address, unsigned int flags);
 long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
 			 struct page **, struct vm_area_struct **,
 			 unsigned long *, unsigned long *, long, unsigned int,
@@ -202,17 +204,6 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
 				unsigned long addr, pte_t *ptep);
 void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
 				unsigned long *start, unsigned long *end);
-struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
-			      int write);
-struct page *follow_huge_pd(struct vm_area_struct *vma,
-			    unsigned long address, hugepd_t hpd,
-			    int flags, int pdshift);
-struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-				pmd_t *pmd, int flags);
-struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
-				pud_t *pud, int flags);
-struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address,
-			     pgd_t *pgd, int flags);
 
 void hugetlb_vma_lock_read(struct vm_area_struct *vma);
 void hugetlb_vma_unlock_read(struct vm_area_struct *vma);
@@ -264,6 +255,13 @@ static inline void adjust_range_if_pmd_sharing_possible(
 {
 }
 
+static struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
+				unsigned long address, unsigned int flags)
+{
+	/* should never happen, but do not want to BUG */
+	return ERR_PTR(-EINVAL);
+}
+
 static inline long follow_hugetlb_page(struct mm_struct *mm,
 			struct vm_area_struct *vma, struct page **pages,
 			struct vm_area_struct **vmas, unsigned long *position,
@@ -274,12 +272,6 @@ static inline long follow_hugetlb_page(struct mm_struct *mm,
 	return 0;
 }
 
-static inline struct page *follow_huge_addr(struct mm_struct *mm,
-					unsigned long address, int write)
-{
-	return ERR_PTR(-EINVAL);
-}
-
 static inline int copy_hugetlb_page_range(struct mm_struct *dst,
 					  struct mm_struct *src,
 					  struct vm_area_struct *dst_vma,
@@ -312,31 +304,6 @@ static inline void hugetlb_show_meminfo_node(int nid)
 {
 }
 
-static inline struct page *follow_huge_pd(struct vm_area_struct *vma,
-				unsigned long address, hugepd_t hpd, int flags,
-				int pdshift)
-{
-	return NULL;
-}
-
-static inline struct page *follow_huge_pmd(struct mm_struct *mm,
-				unsigned long address, pmd_t *pmd, int flags)
-{
-	return NULL;
-}
-
-static inline struct page *follow_huge_pud(struct mm_struct *mm,
-				unsigned long address, pud_t *pud, int flags)
-{
-	return NULL;
-}
-
-static inline struct page *follow_huge_pgd(struct mm_struct *mm,
-				unsigned long address, pgd_t *pgd, int flags)
-{
-	return NULL;
-}
-
 static inline int prepare_hugepage_range(struct file *file,
 				unsigned long addr, unsigned long len)
 {
diff --git a/mm/gup.c b/mm/gup.c
index 66d8619e02ad..80ce04a5bae5 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -661,20 +661,6 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
 	pmdval = READ_ONCE(*pmd);
 	if (pmd_none(pmdval))
 		return no_page_table(vma, flags);
-	if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) {
-		page = follow_huge_pmd(mm, address, pmd, flags);
-		if (page)
-			return page;
-		return no_page_table(vma, flags);
-	}
-	if (is_hugepd(__hugepd(pmd_val(pmdval)))) {
-		page = follow_huge_pd(vma, address,
-				      __hugepd(pmd_val(pmdval)), flags,
-				      PMD_SHIFT);
-		if (page)
-			return page;
-		return no_page_table(vma, flags);
-	}
 retry:
 	if (!pmd_present(pmdval)) {
 		/*
@@ -764,20 +750,6 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
 	pud = pud_offset(p4dp, address);
 	if (pud_none(*pud))
 		return no_page_table(vma, flags);
-	if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) {
-		page = follow_huge_pud(mm, address, pud, flags);
-		if (page)
-			return page;
-		return no_page_table(vma, flags);
-	}
-	if (is_hugepd(__hugepd(pud_val(*pud)))) {
-		page = follow_huge_pd(vma, address,
-				      __hugepd(pud_val(*pud)), flags,
-				      PUD_SHIFT);
-		if (page)
-			return page;
-		return no_page_table(vma, flags);
-	}
 	if (pud_devmap(*pud)) {
 		ptl = pud_lock(mm, pud);
 		page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
@@ -797,7 +769,6 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
 				    struct follow_page_context *ctx)
 {
 	p4d_t *p4d;
-	struct page *page;
 
 	p4d = p4d_offset(pgdp, address);
 	if (p4d_none(*p4d))
@@ -806,14 +777,6 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
 	if (unlikely(p4d_bad(*p4d)))
 		return no_page_table(vma, flags);
 
-	if (is_hugepd(__hugepd(p4d_val(*p4d)))) {
-		page = follow_huge_pd(vma, address,
-				      __hugepd(p4d_val(*p4d)), flags,
-				      P4D_SHIFT);
-		if (page)
-			return page;
-		return no_page_table(vma, flags);
-	}
 	return follow_pud_mask(vma, address, p4d, flags, ctx);
 }
 
@@ -851,10 +814,15 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
 
 	ctx->page_mask = 0;
 
-	/* make this handle hugepd */
-	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
-	if (!IS_ERR(page)) {
-		WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN));
+	/*
+	 * Call hugetlb_follow_page_mask for hugetlb vmas as it will use
+	 * special hugetlb page table walking code.  This eliminates the
+	 * need to check for hugetlb entries in the general walking code.
+	 */
+	if (is_vm_hugetlb_page(vma)) {
+		page = hugetlb_follow_page_mask(vma, address, flags);
+		if (!page)
+			page = no_page_table(vma, flags);
 		return page;
 	}
 
@@ -863,21 +831,6 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
 	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
 		return no_page_table(vma, flags);
 
-	if (pgd_huge(*pgd)) {
-		page = follow_huge_pgd(mm, address, pgd, flags);
-		if (page)
-			return page;
-		return no_page_table(vma, flags);
-	}
-	if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
-		page = follow_huge_pd(vma, address,
-				      __hugepd(pgd_val(*pgd)), flags,
-				      PGDIR_SHIFT);
-		if (page)
-			return page;
-		return no_page_table(vma, flags);
-	}
-
 	return follow_p4d_mask(vma, address, pgd, flags, ctx);
 }
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d0617d64d718..b3da421ba5be 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6190,6 +6190,62 @@ static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte,
 	return false;
 }
 
+struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
+				unsigned long address, unsigned int flags)
+{
+	struct hstate *h = hstate_vma(vma);
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long haddr = address & huge_page_mask(h);
+	struct page *page = NULL;
+	spinlock_t *ptl;
+	pte_t *pte, entry;
+
+	/*
+	 * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
+	 * follow_hugetlb_page().
+	 */
+	if (WARN_ON_ONCE(flags & FOLL_PIN))
+		return NULL;
+
+	pte = huge_pte_offset(mm, haddr, huge_page_size(h));
+	if (!pte)
+		return NULL;
+
+retry:
+	ptl = huge_pte_lock(h, mm, pte);
+	entry = huge_ptep_get(pte);
+	if (pte_present(entry)) {
+		page = pte_page(entry) +
+				((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
+		/*
+		 * Note that page may be a sub-page, and with vmemmap
+		 * optimizations the page struct may be read only.
+		 * try_grab_page() will increase the ref count on the
+		 * head page, so this will be OK.
+		 *
+		 * try_grab_page() should always succeed here, because we hold
+		 * the ptl lock and have verified pte_present().
+		 */
+		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
+			page = NULL;
+			goto out;
+		}
+	} else {
+		if (is_hugetlb_entry_migration(entry)) {
+			spin_unlock(ptl);
+			__migration_entry_wait_huge(pte, ptl);
+			goto retry;
+		}
+		/*
+		 * hwpoisoned entry is treated as no_page_table in
+		 * follow_page_mask().
+		 */
+	}
+out:
+	spin_unlock(ptl);
+	return page;
+}
+
 long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			 struct page **pages, struct vm_area_struct **vmas,
 			 unsigned long *position, unsigned long *nr_pages,
@@ -7140,123 +7196,6 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h)
  * These functions are overwritable if your architecture needs its own
  * behavior.
  */
-struct page * __weak
-follow_huge_addr(struct mm_struct *mm, unsigned long address,
-			      int write)
-{
-	return ERR_PTR(-EINVAL);
-}
-
-struct page * __weak
-follow_huge_pd(struct vm_area_struct *vma,
-	       unsigned long address, hugepd_t hpd, int flags, int pdshift)
-{
-	WARN(1, "hugepd follow called with no support for hugepage directory format\n");
-	return NULL;
-}
-
-struct page * __weak
-follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-		pmd_t *pmd, int flags)
-{
-	struct page *page = NULL;
-	spinlock_t *ptl;
-	pte_t pte;
-
-	/*
-	 * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
-	 * follow_hugetlb_page().
-	 */
-	if (WARN_ON_ONCE(flags & FOLL_PIN))
-		return NULL;
-
-retry:
-	ptl = pmd_lockptr(mm, pmd);
-	spin_lock(ptl);
-	/*
-	 * make sure that the address range covered by this pmd is not
-	 * unmapped from other threads.
-	 */
-	if (!pmd_huge(*pmd))
-		goto out;
-	pte = huge_ptep_get((pte_t *)pmd);
-	if (pte_present(pte)) {
-		page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
-		/*
-		 * try_grab_page() should always succeed here, because: a) we
-		 * hold the pmd (ptl) lock, and b) we've just checked that the
-		 * huge pmd (head) page is present in the page tables. The ptl
-		 * prevents the head page and tail pages from being rearranged
-		 * in any way. So this page must be available at this point,
-		 * unless the page refcount overflowed:
-		 */
-		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
-			page = NULL;
-			goto out;
-		}
-	} else {
-		if (is_hugetlb_entry_migration(pte)) {
-			spin_unlock(ptl);
-			__migration_entry_wait_huge((pte_t *)pmd, ptl);
-			goto retry;
-		}
-		/*
-		 * hwpoisoned entry is treated as no_page_table in
-		 * follow_page_mask().
-		 */
-	}
-out:
-	spin_unlock(ptl);
-	return page;
-}
-
-struct page * __weak
-follow_huge_pud(struct mm_struct *mm, unsigned long address,
-		pud_t *pud, int flags)
-{
-	struct page *page = NULL;
-	spinlock_t *ptl;
-	pte_t pte;
-
-	if (WARN_ON_ONCE(flags & FOLL_PIN))
-		return NULL;
-
-retry:
-	ptl = huge_pte_lock(hstate_sizelog(PUD_SHIFT), mm, (pte_t *)pud);
-	if (!pud_huge(*pud))
-		goto out;
-	pte = huge_ptep_get((pte_t *)pud);
-	if (pte_present(pte)) {
-		page = pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
-		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
-			page = NULL;
-			goto out;
-		}
-	} else {
-		if (is_hugetlb_entry_migration(pte)) {
-			spin_unlock(ptl);
-			__migration_entry_wait(mm, (pte_t *)pud, ptl);
-			goto retry;
-		}
-		/*
-		 * hwpoisoned entry is treated as no_page_table in
-		 * follow_page_mask().
-		 */
-	}
-out:
-	spin_unlock(ptl);
-	return page;
-}
-
-struct page * __weak
-follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
-{
-	if (flags & (FOLL_GET | FOLL_PIN))
-		return NULL;
-
-	return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
-}
-
 int isolate_hugetlb(struct page *page, struct list_head *list)
 {
 	int ret = 0;
-- 
2.37.1


* Re: [PATCH] hugetlb: simplify hugetlb handling in follow_page_mask
  2022-08-29 23:40 ` Mike Kravetz
@ 2022-08-30  1:06   ` Baolin Wang
  -1 siblings, 0 replies; 60+ messages in thread
From: Baolin Wang @ 2022-08-30  1:06 UTC (permalink / raw)
  To: Mike Kravetz, linux-mm, linux-kernel, linuxppc-dev, linux-ia64
  Cc: David Hildenbrand, Aneesh Kumar K . V, Naoya Horiguchi,
	Michael Ellerman, Muchun Song, Andrew Morton

Hi Mike,

On 8/30/2022 7:40 AM, Mike Kravetz wrote:
> During discussions of this series [1], it was suggested that hugetlb
> handling code in follow_page_mask could be simplified.  At the beginning
> of follow_page_mask, there currently is a call to follow_huge_addr which
> 'may' handle hugetlb pages.  ia64 is the only architecture which provides
> a follow_huge_addr routine that does not return error.  Instead, at each
> level of the page table a check is made for a hugetlb entry.  If a hugetlb
> entry is found, a call to a routine associated with that entry is made.
> 
> Currently, there are two checks for hugetlb entries at each page table
> level.  The first check is of the form:
> 	if (p?d_huge())
> 		page = follow_huge_p?d();
> the second check is of the form:
> 	if (is_hugepd())
> 		page = follow_huge_pd().
> 
> We can replace these checks, as well as the special handling routines
> such as follow_huge_p?d() and follow_huge_pd() with a single routine to
> handle hugetlb vmas.
> 
> A new routine hugetlb_follow_page_mask is called for hugetlb vmas at the
> beginning of follow_page_mask.  hugetlb_follow_page_mask will use the
> existing routine huge_pte_offset to walk page tables looking for hugetlb
> entries.  huge_pte_offset can be overwritten by architectures, and already
> handles special cases such as hugepd entries.

Could you also mention that this patch will fix the locking issue for
CONT-PTE/PMD hugetlb by changing to use huge_pte_lock()? That will help
people understand the issue.
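
(For reference, the relevant lines from the diff: the removed
follow_huge_pmd() locked via
	ptl = pmd_lockptr(mm, pmd);
	spin_lock(ptl);
while the new hugetlb_follow_page_mask() uses
	ptl = huge_pte_lock(h, mm, pte);
which selects the lock based on the huge page size rather than the page
table level, so CONT-PTE/PMD entries take the same lock as the rest of
the hugetlb code.)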

Otherwise the changes look good to me.
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>

> 
> [1] https://lore.kernel.org/linux-mm/cover.1661240170.git.baolin.wang@linux.alibaba.com/
> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> ---
>   arch/ia64/mm/hugetlbpage.c    |  15 ---
>   arch/powerpc/mm/hugetlbpage.c |  37 --------
>   include/linux/hugetlb.h       |  51 ++--------
>   mm/gup.c                      |  65 ++-----------
>   mm/hugetlb.c                  | 173 +++++++++++-----------------------
>   5 files changed, 74 insertions(+), 267 deletions(-)
> 
> diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
> index f993cb36c062..380d2f3966c9 100644
> --- a/arch/ia64/mm/hugetlbpage.c
> +++ b/arch/ia64/mm/hugetlbpage.c
> @@ -91,21 +91,6 @@ int prepare_hugepage_range(struct file *file,
>   	return 0;
>   }
>   
> -struct page *follow_huge_addr(struct mm_struct *mm, unsigned long addr, int write)
> -{
> -	struct page *page;
> -	pte_t *ptep;
> -
> -	if (REGION_NUMBER(addr) != RGN_HPAGE)
> -		return ERR_PTR(-EINVAL);
> -
> -	ptep = huge_pte_offset(mm, addr, HPAGE_SIZE);
> -	if (!ptep || pte_none(*ptep))
> -		return NULL;
> -	page = pte_page(*ptep);
> -	page += ((addr & ~HPAGE_MASK) >> PAGE_SHIFT);
> -	return page;
> -}
>   int pmd_huge(pmd_t pmd)
>   {
>   	return 0;
> diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
> index bc84a594ca62..b0e037c75c12 100644
> --- a/arch/powerpc/mm/hugetlbpage.c
> +++ b/arch/powerpc/mm/hugetlbpage.c
> @@ -506,43 +506,6 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
>   	} while (addr = next, addr != end);
>   }
>   
> -struct page *follow_huge_pd(struct vm_area_struct *vma,
> -			    unsigned long address, hugepd_t hpd,
> -			    int flags, int pdshift)
> -{
> -	pte_t *ptep;
> -	spinlock_t *ptl;
> -	struct page *page = NULL;
> -	unsigned long mask;
> -	int shift = hugepd_shift(hpd);
> -	struct mm_struct *mm = vma->vm_mm;
> -
> -retry:
> -	/*
> -	 * hugepage directory entries are protected by mm->page_table_lock
> -	 * Use this instead of huge_pte_lockptr
> -	 */
> -	ptl = &mm->page_table_lock;
> -	spin_lock(ptl);
> -
> -	ptep = hugepte_offset(hpd, address, pdshift);
> -	if (pte_present(*ptep)) {
> -		mask = (1UL << shift) - 1;
> -		page = pte_page(*ptep);
> -		page += ((address & mask) >> PAGE_SHIFT);
> -		if (flags & FOLL_GET)
> -			get_page(page);
> -	} else {
> -		if (is_hugetlb_entry_migration(*ptep)) {
> -			spin_unlock(ptl);
> -			__migration_entry_wait(mm, ptep, ptl);
> -			goto retry;
> -		}
> -	}
> -	spin_unlock(ptl);
> -	return page;
> -}
> -
>   bool __init arch_hugetlb_valid_size(unsigned long size)
>   {
>   	int shift = __ffs(size);
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 852f911d676e..8ea3e5e726e4 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -142,6 +142,8 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
>   			     unsigned long len);
>   int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *,
>   			    struct vm_area_struct *, struct vm_area_struct *);
> +struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
> +				unsigned long address, unsigned int flags);
>   long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
>   			 struct page **, struct vm_area_struct **,
>   			 unsigned long *, unsigned long *, long, unsigned int,
> @@ -202,17 +204,6 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
>   				unsigned long addr, pte_t *ptep);
>   void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
>   				unsigned long *start, unsigned long *end);
> -struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
> -			      int write);
> -struct page *follow_huge_pd(struct vm_area_struct *vma,
> -			    unsigned long address, hugepd_t hpd,
> -			    int flags, int pdshift);
> -struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
> -				pmd_t *pmd, int flags);
> -struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
> -				pud_t *pud, int flags);
> -struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address,
> -			     pgd_t *pgd, int flags);
>   
>   void hugetlb_vma_lock_read(struct vm_area_struct *vma);
>   void hugetlb_vma_unlock_read(struct vm_area_struct *vma);
> @@ -264,6 +255,13 @@ static inline void adjust_range_if_pmd_sharing_possible(
>   {
>   }
>   
> +static struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
> +				unsigned long address, unsigned int flags)
> +{
> +	/* should never happen, but do not want to BUG */
> +	return ERR_PTR(-EINVAL);
> +}
> +
>   static inline long follow_hugetlb_page(struct mm_struct *mm,
>   			struct vm_area_struct *vma, struct page **pages,
>   			struct vm_area_struct **vmas, unsigned long *position,
> @@ -274,12 +272,6 @@ static inline long follow_hugetlb_page(struct mm_struct *mm,
>   	return 0;
>   }
>   
> -static inline struct page *follow_huge_addr(struct mm_struct *mm,
> -					unsigned long address, int write)
> -{
> -	return ERR_PTR(-EINVAL);
> -}
> -
>   static inline int copy_hugetlb_page_range(struct mm_struct *dst,
>   					  struct mm_struct *src,
>   					  struct vm_area_struct *dst_vma,
> @@ -312,31 +304,6 @@ static inline void hugetlb_show_meminfo_node(int nid)
>   {
>   }
>   
> -static inline struct page *follow_huge_pd(struct vm_area_struct *vma,
> -				unsigned long address, hugepd_t hpd, int flags,
> -				int pdshift)
> -{
> -	return NULL;
> -}
> -
> -static inline struct page *follow_huge_pmd(struct mm_struct *mm,
> -				unsigned long address, pmd_t *pmd, int flags)
> -{
> -	return NULL;
> -}
> -
> -static inline struct page *follow_huge_pud(struct mm_struct *mm,
> -				unsigned long address, pud_t *pud, int flags)
> -{
> -	return NULL;
> -}
> -
> -static inline struct page *follow_huge_pgd(struct mm_struct *mm,
> -				unsigned long address, pgd_t *pgd, int flags)
> -{
> -	return NULL;
> -}
> -
>   static inline int prepare_hugepage_range(struct file *file,
>   				unsigned long addr, unsigned long len)
>   {
> diff --git a/mm/gup.c b/mm/gup.c
> index 66d8619e02ad..80ce04a5bae5 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -661,20 +661,6 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
>   	pmdval = READ_ONCE(*pmd);
>   	if (pmd_none(pmdval))
>   		return no_page_table(vma, flags);
> -	if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) {
> -		page = follow_huge_pmd(mm, address, pmd, flags);
> -		if (page)
> -			return page;
> -		return no_page_table(vma, flags);
> -	}
> -	if (is_hugepd(__hugepd(pmd_val(pmdval)))) {
> -		page = follow_huge_pd(vma, address,
> -				      __hugepd(pmd_val(pmdval)), flags,
> -				      PMD_SHIFT);
> -		if (page)
> -			return page;
> -		return no_page_table(vma, flags);
> -	}
>   retry:
>   	if (!pmd_present(pmdval)) {
>   		/*
> @@ -764,20 +750,6 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
>   	pud = pud_offset(p4dp, address);
>   	if (pud_none(*pud))
>   		return no_page_table(vma, flags);
> -	if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) {
> -		page = follow_huge_pud(mm, address, pud, flags);
> -		if (page)
> -			return page;
> -		return no_page_table(vma, flags);
> -	}
> -	if (is_hugepd(__hugepd(pud_val(*pud)))) {
> -		page = follow_huge_pd(vma, address,
> -				      __hugepd(pud_val(*pud)), flags,
> -				      PUD_SHIFT);
> -		if (page)
> -			return page;
> -		return no_page_table(vma, flags);
> -	}
>   	if (pud_devmap(*pud)) {
>   		ptl = pud_lock(mm, pud);
>   		page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
> @@ -797,7 +769,6 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
>   				    struct follow_page_context *ctx)
>   {
>   	p4d_t *p4d;
> -	struct page *page;
>   
>   	p4d = p4d_offset(pgdp, address);
>   	if (p4d_none(*p4d))
> @@ -806,14 +777,6 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
>   	if (unlikely(p4d_bad(*p4d)))
>   		return no_page_table(vma, flags);
>   
> -	if (is_hugepd(__hugepd(p4d_val(*p4d)))) {
> -		page = follow_huge_pd(vma, address,
> -				      __hugepd(p4d_val(*p4d)), flags,
> -				      P4D_SHIFT);
> -		if (page)
> -			return page;
> -		return no_page_table(vma, flags);
> -	}
>   	return follow_pud_mask(vma, address, p4d, flags, ctx);
>   }
>   
> @@ -851,10 +814,15 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
>   
>   	ctx->page_mask = 0;
>   
> -	/* make this handle hugepd */
> -	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
> -	if (!IS_ERR(page)) {
> -		WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN));
> +	/*
> +	 * Call hugetlb_follow_page_mask for hugetlb vmas as it will use
> +	 * special hugetlb page table walking code.  This eliminates the
> +	 * need to check for hugetlb entries in the general walking code.
> +	 */
> +	if (is_vm_hugetlb_page(vma)) {
> +		page = hugetlb_follow_page_mask(vma, address, flags);
> +		if (!page)
> +			page = no_page_table(vma, flags);
>   		return page;
>   	}
>   
> @@ -863,21 +831,6 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
>   	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
>   		return no_page_table(vma, flags);
>   
> -	if (pgd_huge(*pgd)) {
> -		page = follow_huge_pgd(mm, address, pgd, flags);
> -		if (page)
> -			return page;
> -		return no_page_table(vma, flags);
> -	}
> -	if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
> -		page = follow_huge_pd(vma, address,
> -				      __hugepd(pgd_val(*pgd)), flags,
> -				      PGDIR_SHIFT);
> -		if (page)
> -			return page;
> -		return no_page_table(vma, flags);
> -	}
> -
>   	return follow_p4d_mask(vma, address, pgd, flags, ctx);
>   }
>   
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index d0617d64d718..b3da421ba5be 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -6190,6 +6190,62 @@ static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte,
>   	return false;
>   }
>   
> +struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
> +				unsigned long address, unsigned int flags)
> +{
> +	struct hstate *h = hstate_vma(vma);
> +	struct mm_struct *mm = vma->vm_mm;
> +	unsigned long haddr = address & huge_page_mask(h);
> +	struct page *page = NULL;
> +	spinlock_t *ptl;
> +	pte_t *pte, entry;
> +
> +	/*
> +	 * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
> +	 * follow_hugetlb_page().
> +	 */
> +	if (WARN_ON_ONCE(flags & FOLL_PIN))
> +		return NULL;
> +
> +	pte = huge_pte_offset(mm, haddr, huge_page_size(h));
> +	if (!pte)
> +		return NULL;
> +
> +retry:
> +	ptl = huge_pte_lock(h, mm, pte);
> +	entry = huge_ptep_get(pte);
> +	if (pte_present(entry)) {
> +		page = pte_page(entry) +
> +				((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
> +		/*
> +		 * Note that page may be a sub-page, and with vmemmap
> +		 * optimizations the page struct may be read only.
> +		 * try_grab_page() will increase the ref count on the
> +		 * head page, so this will be OK.
> +		 *
> +		 * try_grab_page() should always succeed here, because we hold
> +		 * the ptl lock and have verified pte_present().
> +		 */
> +		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
> +			page = NULL;
> +			goto out;
> +		}
> +	} else {
> +		if (is_hugetlb_entry_migration(entry)) {
> +			spin_unlock(ptl);
> +			__migration_entry_wait_huge(pte, ptl);
> +			goto retry;
> +		}
> +		/*
> +		 * hwpoisoned entry is treated as no_page_table in
> +		 * follow_page_mask().
> +		 */
> +	}
> +out:
> +	spin_unlock(ptl);
> +	return page;
> +}
> +
>   long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>   			 struct page **pages, struct vm_area_struct **vmas,
>   			 unsigned long *position, unsigned long *nr_pages,
> @@ -7140,123 +7196,6 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h)
>    * These functions are overwritable if your architecture needs its own
>    * behavior.
>    */
> -struct page * __weak
> -follow_huge_addr(struct mm_struct *mm, unsigned long address,
> -			      int write)
> -{
> -	return ERR_PTR(-EINVAL);
> -}
> -
> -struct page * __weak
> -follow_huge_pd(struct vm_area_struct *vma,
> -	       unsigned long address, hugepd_t hpd, int flags, int pdshift)
> -{
> -	WARN(1, "hugepd follow called with no support for hugepage directory format\n");
> -	return NULL;
> -}
> -
> -struct page * __weak
> -follow_huge_pmd(struct mm_struct *mm, unsigned long address,
> -		pmd_t *pmd, int flags)
> -{
> -	struct page *page = NULL;
> -	spinlock_t *ptl;
> -	pte_t pte;
> -
> -	/*
> -	 * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
> -	 * follow_hugetlb_page().
> -	 */
> -	if (WARN_ON_ONCE(flags & FOLL_PIN))
> -		return NULL;
> -
> -retry:
> -	ptl = pmd_lockptr(mm, pmd);
> -	spin_lock(ptl);
> -	/*
> -	 * make sure that the address range covered by this pmd is not
> -	 * unmapped from other threads.
> -	 */
> -	if (!pmd_huge(*pmd))
> -		goto out;
> -	pte = huge_ptep_get((pte_t *)pmd);
> -	if (pte_present(pte)) {
> -		page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
> -		/*
> -		 * try_grab_page() should always succeed here, because: a) we
> -		 * hold the pmd (ptl) lock, and b) we've just checked that the
> -		 * huge pmd (head) page is present in the page tables. The ptl
> -		 * prevents the head page and tail pages from being rearranged
> -		 * in any way. So this page must be available at this point,
> -		 * unless the page refcount overflowed:
> -		 */
> -		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
> -			page = NULL;
> -			goto out;
> -		}
> -	} else {
> -		if (is_hugetlb_entry_migration(pte)) {
> -			spin_unlock(ptl);
> -			__migration_entry_wait_huge((pte_t *)pmd, ptl);
> -			goto retry;
> -		}
> -		/*
> -		 * hwpoisoned entry is treated as no_page_table in
> -		 * follow_page_mask().
> -		 */
> -	}
> -out:
> -	spin_unlock(ptl);
> -	return page;
> -}
> -
> -struct page * __weak
> -follow_huge_pud(struct mm_struct *mm, unsigned long address,
> -		pud_t *pud, int flags)
> -{
> -	struct page *page = NULL;
> -	spinlock_t *ptl;
> -	pte_t pte;
> -
> -	if (WARN_ON_ONCE(flags & FOLL_PIN))
> -		return NULL;
> -
> -retry:
> -	ptl = huge_pte_lock(hstate_sizelog(PUD_SHIFT), mm, (pte_t *)pud);
> -	if (!pud_huge(*pud))
> -		goto out;
> -	pte = huge_ptep_get((pte_t *)pud);
> -	if (pte_present(pte)) {
> -		page = pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
> -		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
> -			page = NULL;
> -			goto out;
> -		}
> -	} else {
> -		if (is_hugetlb_entry_migration(pte)) {
> -			spin_unlock(ptl);
> -			__migration_entry_wait(mm, (pte_t *)pud, ptl);
> -			goto retry;
> -		}
> -		/*
> -		 * hwpoisoned entry is treated as no_page_table in
> -		 * follow_page_mask().
> -		 */
> -	}
> -out:
> -	spin_unlock(ptl);
> -	return page;
> -}
> -
> -struct page * __weak
> -follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
> -{
> -	if (flags & (FOLL_GET | FOLL_PIN))
> -		return NULL;
> -
> -	return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
> -}
> -
>   int isolate_hugetlb(struct page *page, struct list_head *list)
>   {
>   	int ret = 0;

> index d0617d64d718..b3da421ba5be 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -6190,6 +6190,62 @@ static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte,
>   	return false;
>   }
>   
> +struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
> +				unsigned long address, unsigned int flags)
> +{
> +	struct hstate *h = hstate_vma(vma);
> +	struct mm_struct *mm = vma->vm_mm;
> +	unsigned long haddr = address & huge_page_mask(h);
> +	struct page *page = NULL;
> +	spinlock_t *ptl;
> +	pte_t *pte, entry;
> +
> +	/*
> +	 * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
> +	 * follow_hugetlb_page().
> +	 */
> +	if (WARN_ON_ONCE(flags & FOLL_PIN))
> +		return NULL;
> +
> +	pte = huge_pte_offset(mm, haddr, huge_page_size(h));
> +	if (!pte)
> +		return NULL;
> +
> +retry:
> +	ptl = huge_pte_lock(h, mm, pte);
> +	entry = huge_ptep_get(pte);
> +	if (pte_present(entry)) {
> +		page = pte_page(entry) +
> +				((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
> +		/*
> +		 * Note that page may be a sub-page, and with vmemmap
> +		 * optimizations the page struct may be read only.
> +		 * try_grab_page() will increase the ref count on the
> +		 * head page, so this will be OK.
> +		 *
> +		 * try_grab_page() should always succeed here, because we hold
> +		 * the ptl lock and have verified pte_present().
> +		 */
> +		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
> +			page = NULL;
> +			goto out;
> +		}
> +	} else {
> +		if (is_hugetlb_entry_migration(entry)) {
> +			spin_unlock(ptl);
> +			__migration_entry_wait_huge(pte, ptl);
> +			goto retry;
> +		}
> +		/*
> +		 * hwpoisoned entry is treated as no_page_table in
> +		 * follow_page_mask().
> +		 */
> +	}
> +out:
> +	spin_unlock(ptl);
> +	return page;
> +}
> +
>   long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>   			 struct page **pages, struct vm_area_struct **vmas,
>   			 unsigned long *position, unsigned long *nr_pages,
> @@ -7140,123 +7196,6 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h)
>    * These functions are overwritable if your architecture needs its own
>    * behavior.
>    */
> -struct page * __weak
> -follow_huge_addr(struct mm_struct *mm, unsigned long address,
> -			      int write)
> -{
> -	return ERR_PTR(-EINVAL);
> -}
> -
> -struct page * __weak
> -follow_huge_pd(struct vm_area_struct *vma,
> -	       unsigned long address, hugepd_t hpd, int flags, int pdshift)
> -{
> -	WARN(1, "hugepd follow called with no support for hugepage directory format\n");
> -	return NULL;
> -}
> -
> -struct page * __weak
> -follow_huge_pmd(struct mm_struct *mm, unsigned long address,
> -		pmd_t *pmd, int flags)
> -{
> -	struct page *page = NULL;
> -	spinlock_t *ptl;
> -	pte_t pte;
> -
> -	/*
> -	 * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
> -	 * follow_hugetlb_page().
> -	 */
> -	if (WARN_ON_ONCE(flags & FOLL_PIN))
> -		return NULL;
> -
> -retry:
> -	ptl = pmd_lockptr(mm, pmd);
> -	spin_lock(ptl);
> -	/*
> -	 * make sure that the address range covered by this pmd is not
> -	 * unmapped from other threads.
> -	 */
> -	if (!pmd_huge(*pmd))
> -		goto out;
> -	pte = huge_ptep_get((pte_t *)pmd);
> -	if (pte_present(pte)) {
> -		page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
> -		/*
> -		 * try_grab_page() should always succeed here, because: a) we
> -		 * hold the pmd (ptl) lock, and b) we've just checked that the
> -		 * huge pmd (head) page is present in the page tables. The ptl
> -		 * prevents the head page and tail pages from being rearranged
> -		 * in any way. So this page must be available at this point,
> -		 * unless the page refcount overflowed:
> -		 */
> -		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
> -			page = NULL;
> -			goto out;
> -		}
> -	} else {
> -		if (is_hugetlb_entry_migration(pte)) {
> -			spin_unlock(ptl);
> -			__migration_entry_wait_huge((pte_t *)pmd, ptl);
> -			goto retry;
> -		}
> -		/*
> -		 * hwpoisoned entry is treated as no_page_table in
> -		 * follow_page_mask().
> -		 */
> -	}
> -out:
> -	spin_unlock(ptl);
> -	return page;
> -}
> -
> -struct page * __weak
> -follow_huge_pud(struct mm_struct *mm, unsigned long address,
> -		pud_t *pud, int flags)
> -{
> -	struct page *page = NULL;
> -	spinlock_t *ptl;
> -	pte_t pte;
> -
> -	if (WARN_ON_ONCE(flags & FOLL_PIN))
> -		return NULL;
> -
> -retry:
> -	ptl = huge_pte_lock(hstate_sizelog(PUD_SHIFT), mm, (pte_t *)pud);
> -	if (!pud_huge(*pud))
> -		goto out;
> -	pte = huge_ptep_get((pte_t *)pud);
> -	if (pte_present(pte)) {
> -		page = pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
> -		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
> -			page = NULL;
> -			goto out;
> -		}
> -	} else {
> -		if (is_hugetlb_entry_migration(pte)) {
> -			spin_unlock(ptl);
> -			__migration_entry_wait(mm, (pte_t *)pud, ptl);
> -			goto retry;
> -		}
> -		/*
> -		 * hwpoisoned entry is treated as no_page_table in
> -		 * follow_page_mask().
> -		 */
> -	}
> -out:
> -	spin_unlock(ptl);
> -	return page;
> -}
> -
> -struct page * __weak
> -follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
> -{
> -	if (flags & (FOLL_GET | FOLL_PIN))
> -		return NULL;
> -
> -	return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
> -}
> -
>   int isolate_hugetlb(struct page *page, struct list_head *list)
>   {
>   	int ret = 0;

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH] hugetlb: simplify hugetlb handling in follow_page_mask
  2022-08-29 23:40 ` Mike Kravetz
@ 2022-08-30  8:11   ` David Hildenbrand
  -1 siblings, 0 replies; 60+ messages in thread
From: David Hildenbrand @ 2022-08-30  8:11 UTC (permalink / raw)
  To: Mike Kravetz, linux-mm, linux-kernel, inuxppc-dev, linux-ia64
  Cc: Baolin Wang, Aneesh Kumar K . V, Naoya Horiguchi,
	Michael Ellerman, Muchun Song, Andrew Morton

On 30.08.22 01:40, Mike Kravetz wrote:
> During discussions of this series [1], it was suggested that hugetlb
> handling code in follow_page_mask could be simplified.  At the beginning

Feel free to use a Suggested-by if you consider it appropriate.

> of follow_page_mask, there currently is a call to follow_huge_addr which
> 'may' handle hugetlb pages.  ia64 is the only architecture which provides
> a follow_huge_addr routine that does not return error.  Instead, at each
> level of the page table a check is made for a hugetlb entry.  If a hugetlb
> entry is found, a call to a routine associated with that entry is made.
> 
> Currently, there are two checks for hugetlb entries at each page table
> level.  The first check is of the form:
> 	if (p?d_huge())
> 		page = follow_huge_p?d();
> the second check is of the form:
> 	if (is_hugepd())
> 		page = follow_huge_pd().

BTW, what about all this hugepd stuff in mm/pagewalk.c?

Isn't this all dead code as we're essentially routing all hugetlb VMAs
via walk_hugetlb_range? [yes, all that hugepd stuff in generic code that
overcomplicates stuff has been annoying me for a long time]

> 
> We can replace these checks, as well as the special handling routines
> such as follow_huge_p?d() and follow_huge_pd() with a single routine to
> handle hugetlb vmas.
> 
> A new routine hugetlb_follow_page_mask is called for hugetlb vmas at the
> beginning of follow_page_mask.  hugetlb_follow_page_mask will use the
> existing routine huge_pte_offset to walk page tables looking for hugetlb
> entries.  huge_pte_offset can be overwritten by architectures, and already
> handles special cases such as hugepd entries.
> 
> [1] https://lore.kernel.org/linux-mm/cover.1661240170.git.baolin.wang@linux.alibaba.com/
> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>

[...]

> +static struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
> +				unsigned long address, unsigned int flags)
> +{
> +	/* should never happen, but do not want to BUG */
> +	return ERR_PTR(-EINVAL);

Should there be a WARN_ON_ONCE() instead or could we use a BUILD_BUG_ON()?

> +}
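
For illustration only, a WARN_ON_ONCE() variant of that stub might look
roughly like this (hypothetical sketch, not what the posted patch does):

	static inline struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
					unsigned long address, unsigned int flags)
	{
		/* not reachable when CONFIG_HUGETLB_PAGE is disabled */
		WARN_ON_ONCE(1);
		return ERR_PTR(-EINVAL);
	}

A BUILD_BUG_ON() would presumably only build cleanly if the compiler can
prove every caller is eliminated, e.g. because is_vm_hugetlb_page() is
constant false without CONFIG_HUGETLB_PAGE.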


[...]

> @@ -851,10 +814,15 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
>  
>  	ctx->page_mask = 0;
>  
> -	/* make this handle hugepd */
> -	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
> -	if (!IS_ERR(page)) {
> -		WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN));
> +	/*
> +	 * Call hugetlb_follow_page_mask for hugetlb vmas as it will use
> +	 * special hugetlb page table walking code.  This eliminates the
> +	 * need to check for hugetlb entries in the general walking code.
> +	 */

Maybe also comment that ordinary GUP never ends up in here and instead
directly uses follow_hugetlb_page(). This is for follow_page() handling
only.

[my suggestion to rename follow_hugetlb_page() still stands ;) ]

> +	if (is_vm_hugetlb_page(vma)) {
> +		page = hugetlb_follow_page_mask(vma, address, flags);
> +		if (!page)
> +			page = no_page_table(vma, flags);
>  		return page;
>  	}
>  
> @@ -863,21 +831,6 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
>  	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
>  		return no_page_table(vma, flags);
>  
> -	if (pgd_huge(*pgd)) {
> -		page = follow_huge_pgd(mm, address, pgd, flags);
> -		if (page)
> -			return page;
> -		return no_page_table(vma, flags);
> -	}
> -	if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
> -		page = follow_huge_pd(vma, address,
> -				      __hugepd(pgd_val(*pgd)), flags,
> -				      PGDIR_SHIFT);
> -		if (page)
> -			return page;
> -		return no_page_table(vma, flags);
> -	}
> -
>  	return follow_p4d_mask(vma, address, pgd, flags, ctx);
>  }
>  
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index d0617d64d718..b3da421ba5be 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -6190,6 +6190,62 @@ static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte,
>  	return false;
>  }
>  
> +struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
> +				unsigned long address, unsigned int flags)
> +{
> +	struct hstate *h = hstate_vma(vma);
> +	struct mm_struct *mm = vma->vm_mm;
> +	unsigned long haddr = address & huge_page_mask(h);
> +	struct page *page = NULL;
> +	spinlock_t *ptl;
> +	pte_t *pte, entry;
> +
> +	/*
> +	 * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
> +	 * follow_hugetlb_page().
> +	 */
> +	if (WARN_ON_ONCE(flags & FOLL_PIN))
> +		return NULL;
> +
> +	pte = huge_pte_offset(mm, haddr, huge_page_size(h));
> +	if (!pte)
> +		return NULL;
> +
> +retry:
> +	ptl = huge_pte_lock(h, mm, pte);
> +	entry = huge_ptep_get(pte);
> +	if (pte_present(entry)) {
> +		page = pte_page(entry) +
> +				((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
> +		/*
> +		 * Note that page may be a sub-page, and with vmemmap
> +		 * optimizations the page struct may be read only.
> +		 * try_grab_page() will increase the ref count on the
> +		 * head page, so this will be OK.
> +		 *
> +		 * try_grab_page() should always succeed here, because we hold
> +		 * the ptl lock and have verified pte_present().
> +		 */
> +		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
> +			page = NULL;
> +			goto out;
> +		}
> +	} else {
> +		if (is_hugetlb_entry_migration(entry)) {
> +			spin_unlock(ptl);
> +			__migration_entry_wait_huge(pte, ptl);
> +			goto retry;
> +		}
> +		/*
> +		 * hwpoisoned entry is treated as no_page_table in
> +		 * follow_page_mask().
> +		 */
> +	}
> +out:
> +	spin_unlock(ptl);
> +	return page;
> +}
> +
>  long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>  			 struct page **pages, struct vm_area_struct **vmas,
>  			 unsigned long *position, unsigned long *nr_pages,
> @@ -7140,123 +7196,6 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h)
>   * These functions are overwritable if your architecture needs its own
>   * behavior.
>   */

[...]

Numbers speak for themselves.

Acked-by: David Hildenbrand <david@redhat.com>

-- 
Thanks,

David / dhildenb


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH] hugetlb: simplify hugetlb handling in follow_page_mask
  2022-08-30  1:06   ` Baolin Wang
@ 2022-08-30 16:44     ` Mike Kravetz
  -1 siblings, 0 replies; 60+ messages in thread
From: Mike Kravetz @ 2022-08-30 16:44 UTC (permalink / raw)
  To: Baolin Wang
  Cc: linux-mm, linux-kernel, inuxppc-dev, linux-ia64,
	David Hildenbrand, Aneesh Kumar K . V, Naoya Horiguchi,
	Michael Ellerman, Muchun Song, Andrew Morton

On 08/30/22 09:06, Baolin Wang wrote:
> Hi Mike,
> 
> On 8/30/2022 7:40 AM, Mike Kravetz wrote:
> > During discussions of this series [1], it was suggested that hugetlb
> > handling code in follow_page_mask could be simplified.  At the beginning
> > of follow_page_mask, there currently is a call to follow_huge_addr which
> > 'may' handle hugetlb pages.  ia64 is the only architecture which provides
> > a follow_huge_addr routine that does not return error.  Instead, at each
> > level of the page table a check is made for a hugetlb entry.  If a hugetlb
> > entry is found, a call to a routine associated with that entry is made.
> > 
> > Currently, there are two checks for hugetlb entries at each page table
> > level.  The first check is of the form:
> > 	if (p?d_huge())
> > 		page = follow_huge_p?d();
> > the second check is of the form:
> > 	if (is_hugepd())
> > 		page = follow_huge_pd().
> > 
> > We can replace these checks, as well as the special handling routines
> > such as follow_huge_p?d() and follow_huge_pd() with a single routine to
> > handle hugetlb vmas.
> > 
> > A new routine hugetlb_follow_page_mask is called for hugetlb vmas at the
> > beginning of follow_page_mask.  hugetlb_follow_page_mask will use the
> > existing routine huge_pte_offset to walk page tables looking for hugetlb
> > entries.  huge_pte_offset can be overwritten by architectures, and already
> > handles special cases such as hugepd entries.
> 
> Could you also mention that this patch will fix the lock issue for
> CONT-PTE/PMD hugetlb by changing to use huge_pte_lock()? That will help
> people understand the issue.

Will update message in v2.  Thanks for taking a look!
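
For context, the locking difference comes down to huge_pte_lock() picking
the lock from the hstate.  Roughly, as of this series (details may differ
by architecture and kernel version), the helpers look like:

	static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
						struct mm_struct *mm, pte_t *pte)
	{
		if (huge_page_size(h) == PMD_SIZE)
			return pmd_lockptr(mm, (pmd_t *) pte);
		VM_BUG_ON(huge_page_size(h) == PAGE_SIZE);
		return &mm->page_table_lock;
	}

	static inline spinlock_t *huge_pte_lock(struct hstate *h,
						struct mm_struct *mm, pte_t *pte)
	{
		spinlock_t *ptl = huge_pte_lockptr(h, mm, pte);

		spin_lock(ptl);
		return ptl;
	}

So for CONT-PTE/CONT-PMD sizes (anything other than PMD_SIZE) the walk
serializes on mm->page_table_lock, matching the fault path, whereas the
removed follow_huge_pmd() hard-coded pmd_lockptr().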

> 
> Otherwise the changes look good to me.
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
-- 
Mike Kravetz

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH] hugetlb: simplify hugetlb handling in follow_page_mask
  2022-08-30  8:11   ` David Hildenbrand
@ 2022-08-30 16:52     ` Mike Kravetz
  -1 siblings, 0 replies; 60+ messages in thread
From: Mike Kravetz @ 2022-08-30 16:52 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: linux-mm, linux-kernel, inuxppc-dev, linux-ia64, Baolin Wang,
	Aneesh Kumar K . V, Naoya Horiguchi, Michael Ellerman,
	Muchun Song, Andrew Morton

On 08/30/22 10:11, David Hildenbrand wrote:
> On 30.08.22 01:40, Mike Kravetz wrote:
> > During discussions of this series [1], it was suggested that hugetlb
> > handling code in follow_page_mask could be simplified.  At the beginning
> 
> Feel free to use a Suggested-by if you consider it appropriate.
> 
> > of follow_page_mask, there currently is a call to follow_huge_addr which
> > 'may' handle hugetlb pages.  ia64 is the only architecture which provides
> > a follow_huge_addr routine that does not return error.  Instead, at each
> > level of the page table a check is made for a hugetlb entry.  If a hugetlb
> > entry is found, a call to a routine associated with that entry is made.
> > 
> > Currently, there are two checks for hugetlb entries at each page table
> > level.  The first check is of the form:
> > 	if (p?d_huge())
> > 		page = follow_huge_p?d();
> > the second check is of the form:
> > 	if (is_hugepd())
> > 		page = follow_huge_pd().
> 
> BTW, what about all this hugepd stuff in mm/pagewalk.c?
> 
> Isn't this all dead code as we're essentially routing all hugetlb VMAs
> via walk_hugetlb_range? [yes, all that hugepd stuff in generic code that
> overcomplicates stuff has been annoying me for a long time]

I am 'happy' to look at cleaning up that code next.  Perhaps I will just
create a cleanup series.

I just wanted to focus on eliminating the two callouts in generic code mentioned
above: follow_huge_p?d() and follow_huge_pd().

Really looking for input from Aneesh and Naoya as they added much of the
code that is being removed here.

> > 
> > We can replace these checks, as well as the special handling routines
> > such as follow_huge_p?d() and follow_huge_pd() with a single routine to
> > handle hugetlb vmas.
> > 
> > A new routine hugetlb_follow_page_mask is called for hugetlb vmas at the
> > beginning of follow_page_mask.  hugetlb_follow_page_mask will use the
> > existing routine huge_pte_offset to walk page tables looking for hugetlb
> > entries.  huge_pte_offset can be overwritten by architectures, and already
> > handles special cases such as hugepd entries.
> > 
> > [1] https://lore.kernel.org/linux-mm/cover.1661240170.git.baolin.wang@linux.alibaba.com/
> > Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> 
> [...]
> 
> > +static struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
> > +				unsigned long address, unsigned int flags)
> > +{
> > +	/* should never happen, but do not want to BUG */
> > +	return ERR_PTR(-EINVAL);
> 
> Should there be a WARN_ON_ONCE() instead or could we use a BUILD_BUG_ON()?
> 

Ok, I will look into adding one of these.  Prefer a BUILD_BUG_ON().

> > +}
> 
> 
> [...]
> 
> > @@ -851,10 +814,15 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
> >  
> >  	ctx->page_mask = 0;
> >  
> > -	/* make this handle hugepd */
> > -	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
> > -	if (!IS_ERR(page)) {
> > -		WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN));
> > +	/*
> > +	 * Call hugetlb_follow_page_mask for hugetlb vmas as it will use
> > +	 * special hugetlb page table walking code.  This eliminates the
> > +	 * need to check for hugetlb entries in the general walking code.
> > +	 */
> 
> Maybe also comment that ordinary GUP never ends up in here and instead
> directly uses follow_hugetlb_page(). This is for follow_page() handling
> only.
> 
> [my suggestion to rename follow_hugetlb_page() still stands ;) ]

Will update the comment in v2.

I think renaming follow_hugetlb_page() would be in a separate patch, perhaps
included in a larger cleanup series.  I will not forget. :)

> 
> Numbers speak for themselves.
> 
> Acked-by: David Hildenbrand <david@redhat.com>
> 

Thanks,
-- 
Mike Kravetz

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH] hugetlb: simplify hugetlb handling in follow_page_mask
  2022-08-30 16:44     ` Mike Kravetz
@ 2022-08-30 18:39       ` Mike Kravetz
  -1 siblings, 0 replies; 60+ messages in thread
From: Mike Kravetz @ 2022-08-30 18:39 UTC (permalink / raw)
  To: Baolin Wang
  Cc: linux-mm, linux-kernel, inuxppc-dev, linux-ia64,
	David Hildenbrand, Aneesh Kumar K . V, Naoya Horiguchi,
	Michael Ellerman, Muchun Song, Andrew Morton

On 08/30/22 09:44, Mike Kravetz wrote:
> On 08/30/22 09:06, Baolin Wang wrote:
> > Hi Mike,
> > 
> > On 8/30/2022 7:40 AM, Mike Kravetz wrote:
> > > During discussions of this series [1], it was suggested that hugetlb
> > > handling code in follow_page_mask could be simplified.  At the beginning
> > > of follow_page_mask, there currently is a call to follow_huge_addr which
> > > 'may' handle hugetlb pages.  ia64 is the only architecture which provides
> > > a follow_huge_addr routine that does not return error.  Instead, at each
> > > level of the page table a check is made for a hugetlb entry.  If a hugetlb
> > > entry is found, a call to a routine associated with that entry is made.
> > > 
> > > Currently, there are two checks for hugetlb entries at each page table
> > > level.  The first check is of the form:
> > > 	if (p?d_huge())
> > > 		page = follow_huge_p?d();
> > > the second check is of the form:
> > > 	if (is_hugepd())
> > > 		page = follow_huge_pd().
> > > 
> > > We can replace these checks, as well as the special handling routines
> > > such as follow_huge_p?d() and follow_huge_pd() with a single routine to
> > > handle hugetlb vmas.
> > > 
> > > A new routine hugetlb_follow_page_mask is called for hugetlb vmas at the
> > > beginning of follow_page_mask.  hugetlb_follow_page_mask will use the
> > > existing routine huge_pte_offset to walk page tables looking for hugetlb
> > > entries.  huge_pte_offset can be overwritten by architectures, and already
> > > handles special cases such as hugepd entries.
> > 
> > Could you also mention that this patch will fix the lock issue for
> > CONT-PTE/PMD hugetlb by changing to use huge_pte_lock()? That will help
> > people understand the issue.
> 
> Will update message in v2.  Thanks for taking a look!
> 

One additional thought, we 'may' need a separate patch to fix the locking
issues that can be easily backported.  Not sure this 'simplification' is
a good backport candidate.
-- 
Mike Kravetz

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH] hugetlb: simplify hugetlb handling in follow_page_mask
  2022-08-30 16:52     ` Mike Kravetz
@ 2022-08-30 21:31       ` Mike Kravetz
  -1 siblings, 0 replies; 60+ messages in thread
From: Mike Kravetz @ 2022-08-30 21:31 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: linux-mm, linux-kernel, inuxppc-dev, linux-ia64, Baolin Wang,
	Aneesh Kumar K . V, Naoya Horiguchi, Michael Ellerman,
	Muchun Song, Andrew Morton, Christophe Leroy

On 08/30/22 09:52, Mike Kravetz wrote:
> On 08/30/22 10:11, David Hildenbrand wrote:
> > On 30.08.22 01:40, Mike Kravetz wrote:
> > > During discussions of this series [1], it was suggested that hugetlb
> > > handling code in follow_page_mask could be simplified.  At the beginning
> > 
> > Feel free to use a Suggested-by if you consider it appropriate.
> > 
> > > of follow_page_mask, there currently is a call to follow_huge_addr which
> > > 'may' handle hugetlb pages.  ia64 is the only architecture which provides
> > > a follow_huge_addr routine that does not return error.  Instead, at each
> > > level of the page table a check is made for a hugetlb entry.  If a hugetlb
> > > entry is found, a call to a routine associated with that entry is made.
> > > 
> > > Currently, there are two checks for hugetlb entries at each page table
> > > level.  The first check is of the form:
> > > 	if (p?d_huge())
> > > 		page = follow_huge_p?d();
> > > the second check is of the form:
> > > 	if (is_hugepd())
> > > 		page = follow_huge_pd().
> > 
> > BTW, what about all this hugepd stuff in mm/pagewalk.c?
> > 
> > Isn't this all dead code as we're essentially routing all hugetlb VMAs
> > via walk_hugetlb_range? [yes, all that hugepd stuff in generic code that
> > overcomplicates stuff has been annoying me for a long time]
> 
> I am 'happy' to look at cleaning up that code next.  Perhaps I will just
> create a cleanup series.
> 

Technically, that code is not dead IIUC.  The call to walk_hugetlb_range in
__walk_page_range is as follows:

	if (vma && is_vm_hugetlb_page(vma)) {
		if (ops->hugetlb_entry)
			err = walk_hugetlb_range(start, end, walk);
	} else
		err = walk_pgd_range(start, end, walk);

We also have the interface walk_page_range_novma() that will call
__walk_page_range without a value for vma.  So, in that case we would
end up calling walk_pgd_range, etc.  walk_pgd_range and related routines
do have those checks such as:

		if (is_hugepd(__hugepd(pmd_val(*pmd))))
			err = walk_hugepd_range((hugepd_t *)pmd, addr, next, walk, PMD_SHIFT);

So, it looks like in this case we would process 'hugepd' entries but not
'normal' hugetlb entries.  That does not seem right.

Christophe Leroy added this code with commit e17eae2b8399 "mm: pagewalk: fix
walk for hugepage tables".  This was part of the series "Convert powerpc to
GENERIC_PTDUMP".  And, the ptdump code uses the walk_page_range_novma
interface.  So, this code is certainly not dead.

Adding Christophe on Cc:

Christophe, do you know if is_hugepd() is true for all hugetlb entries, not
just hugepd entries?

On systems without hugepd entries, I guess ptdump skips all hugetlb entries.
Sigh!
-- 
Mike Kravetz

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH] hugetlb: simplify hugetlb handling in follow_page_mask
  2022-08-30 18:39       ` Mike Kravetz
@ 2022-08-31  1:07         ` Baolin Wang
  -1 siblings, 0 replies; 60+ messages in thread
From: Baolin Wang @ 2022-08-31  1:07 UTC (permalink / raw)
  To: Mike Kravetz
  Cc: linux-mm, linux-kernel, inuxppc-dev, linux-ia64,
	David Hildenbrand, Aneesh Kumar K . V, Naoya Horiguchi,
	Michael Ellerman, Muchun Song, Andrew Morton



On 8/31/2022 2:39 AM, Mike Kravetz wrote:
> On 08/30/22 09:44, Mike Kravetz wrote:
>> On 08/30/22 09:06, Baolin Wang wrote:
>>> Hi Mike,
>>>
>>> On 8/30/2022 7:40 AM, Mike Kravetz wrote:
>>>> During discussions of this series [1], it was suggested that hugetlb
>>>> handling code in follow_page_mask could be simplified.  At the beginning
>>>> of follow_page_mask, there currently is a call to follow_huge_addr which
>>>> 'may' handle hugetlb pages.  ia64 is the only architecture which provides
>>>> a follow_huge_addr routine that does not return error.  Instead, at each
>>>> level of the page table a check is made for a hugetlb entry.  If a hugetlb
>>>> entry is found, a call to a routine associated with that entry is made.
>>>>
>>>> Currently, there are two checks for hugetlb entries at each page table
>>>> level.  The first check is of the form:
>>>> 	if (p?d_huge())
>>>> 		page = follow_huge_p?d();
>>>> the second check is of the form:
>>>> 	if (is_hugepd())
>>>> 		page = follow_huge_pd().
>>>>
>>>> We can replace these checks, as well as the special handling routines
>>>> such as follow_huge_p?d() and follow_huge_pd() with a single routine to
>>>> handle hugetlb vmas.
>>>>
>>>> A new routine hugetlb_follow_page_mask is called for hugetlb vmas at the
>>>> beginning of follow_page_mask.  hugetlb_follow_page_mask will use the
>>>> existing routine huge_pte_offset to walk page tables looking for hugetlb
>>>> entries.  huge_pte_offset can be overwritten by architectures, and already
>>>> handles special cases such as hugepd entries.
>>>
>>> Could you also mention that this patch will fix the lock issue for
>>> CONT-PTE/PMD hugetlb by changing to use huge_pte_lock()? That will help
>>> people understand the issue.
>>
>> Will update message in v2.  Thanks for taking a look!
>>
> 
> One additional thought, we 'may' need a separate patch to fix the locking
> issues that can be easily backported.  Not sure this 'simplification' is
> a good backport candidate.

Yes, that was my thought before, but David did not like adding more 
make-legacy-cruft-happy code.

So how about creating a series that contains 3 patches: picking up patch
1 and patch 3 of my previous series [1], plus your current patch? That
means patch 1 and patch 2 in this series can fix the lock issue
explicitly and be suitable to backport, while patch 3 (which is your
current patch) will clean up the legacy code.

[1] https://lore.kernel.org/all/cover.1661240170.git.baolin.wang@linux.alibaba.com/

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH] hugetlb: simplify hugetlb handling in follow_page_mask
  2022-08-29 23:40 ` Mike Kravetz
@ 2022-08-31  5:08   ` kernel test robot
  -1 siblings, 0 replies; 60+ messages in thread
From: kernel test robot @ 2022-08-31  5:08 UTC (permalink / raw)
  To: Mike Kravetz, linux-mm, linux-kernel, inuxppc-dev, linux-ia64
  Cc: kbuild-all, Baolin Wang, David Hildenbrand, Aneesh Kumar K . V,
	Naoya Horiguchi, Michael Ellerman, Muchun Song, Andrew Morton,
	Linux Memory Management List, Mike Kravetz

Hi Mike,

I love your patch! Yet something to improve:

[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on linus/master v6.0-rc3 next-20220830]
[cannot apply to powerpc/next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Mike-Kravetz/hugetlb-simplify-hugetlb-handling-in-follow_page_mask/20220830-074147
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
config: powerpc-randconfig-r001-20220830 (https://download.01.org/0day-ci/archive/20220831/202208311341.ybNgt0Kz-lkp@intel.com/config)
compiler: powerpc-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/f7dc41c1552ecd1e483a100c8b0921df62980f38
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Mike-Kravetz/hugetlb-simplify-hugetlb-handling-in-follow_page_mask/20220830-074147
        git checkout f7dc41c1552ecd1e483a100c8b0921df62980f38
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=powerpc SHELL=/bin/bash arch/powerpc/kernel/

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   In file included from arch/powerpc/kernel/setup-common.c:35:
>> include/linux/hugetlb.h:258:21: error: 'hugetlb_follow_page_mask' defined but not used [-Werror=unused-function]
     258 | static struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
         |                     ^~~~~~~~~~~~~~~~~~~~~~~~
   cc1: all warnings being treated as errors


vim +/hugetlb_follow_page_mask +258 include/linux/hugetlb.h

   257	
 > 258	static struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
   259					unsigned long address, unsigned int flags)
   260	{
   261		/* should never happen, but do not want to BUG */
   262		return ERR_PTR(-EINVAL);
   263	}
   264	
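
The -Wunused-function report is because the !CONFIG_HUGETLB_PAGE stub above
is plain 'static' rather than 'static inline'.  Assuming nothing else needs
to change, the likely fix is simply:

	static inline struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
					unsigned long address, unsigned int flags)
	{
		/* should never happen, but do not want to BUG */
		return ERR_PTR(-EINVAL);
	}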

-- 
0-DAY CI Kernel Test Service
https://01.org/lkp

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH] hugetlb: simplify hugetlb handling in follow_page_mask
  2022-08-30 21:31       ` Mike Kravetz
@ 2022-08-31  8:07         ` David Hildenbrand
  -1 siblings, 0 replies; 60+ messages in thread
From: David Hildenbrand @ 2022-08-31  8:07 UTC (permalink / raw)
  To: Mike Kravetz
  Cc: linux-mm, linux-kernel, linuxppc-dev, linux-ia64, Baolin Wang,
	Aneesh Kumar K . V, Naoya Horiguchi, Michael Ellerman,
	Muchun Song, Andrew Morton, Christophe Leroy

On 30.08.22 23:31, Mike Kravetz wrote:
> On 08/30/22 09:52, Mike Kravetz wrote:
>> On 08/30/22 10:11, David Hildenbrand wrote:
>>> On 30.08.22 01:40, Mike Kravetz wrote:
>>>> During discussions of this series [1], it was suggested that hugetlb
>>>> handling code in follow_page_mask could be simplified.  At the beginning
>>>
>>> Feel free to use a Suggested-by if you consider it appropriate.
>>>
>>>> of follow_page_mask, there currently is a call to follow_huge_addr which
>>>> 'may' handle hugetlb pages.  ia64 is the only architecture which provides
>>>> a follow_huge_addr routine that does not return error.  Instead, at each
>>>> level of the page table a check is made for a hugetlb entry.  If a hugetlb
>>>> entry is found, a call to a routine associated with that entry is made.
>>>>
>>>> Currently, there are two checks for hugetlb entries at each page table
>>>> level.  The first check is of the form:
>>>> 	if (p?d_huge())
>>>> 		page = follow_huge_p?d();
>>>> the second check is of the form:
>>>> 	if (is_hugepd())
>>>> 		page = follow_huge_pd().
>>>
>>> BTW, what about all this hugepd stuff in mm/pagewalk.c?
>>>
>>> Isn't this all dead code as we're essentially routing all hugetlb VMAs
>>> via walk_hugetlb_range? [yes, all that hugepd stuff in generic code that
>>> overcomplicates stuff has been annoying me for a long time]
>>
>> I am 'happy' to look at cleaning up that code next.  Perhaps I will just
>> create a cleanup series.
>>
> 
> Technically, that code is not dead IIUC.  The call to walk_hugetlb_range in
> __walk_page_range is as follows:
> 
> 	if (vma && is_vm_hugetlb_page(vma)) {
> 		if (ops->hugetlb_entry)
> 			err = walk_hugetlb_range(start, end, walk);
> 	} else
> 		err = walk_pgd_range(start, end, walk);
> 
> We also have the interface walk_page_range_novma() that will call
> __walk_page_range without a value for vma.  So, in that case we would
> end up calling walk_pgd_range, etc.  walk_pgd_range and related routines
> do have those checks such as:
> 
> 		if (is_hugepd(__hugepd(pmd_val(*pmd))))
> 			err = walk_hugepd_range((hugepd_t *)pmd, addr, next, walk, PMD_SHIFT);
> 
> So, it looks like in this case we would process 'hugepd' entries but not
> 'normal' hugetlb entries.  That does not seem right.

:/ walking a hugetlb range without knowing whether it's a hugetlb range
is certainly questionable.


> 
> Christophe Leroy added this code with commit e17eae2b8399 "mm: pagewalk: fix
> walk for hugepage tables".  This was part of the series "Convert powerpc to
> GENERIC_PTDUMP".  And, the ptdump code uses the walk_page_range_novma
> interface.  So, this code is certainly not dead.

Hm, that commit doesn't actually mention how it can happen, what exactly
will happen ("crazy result"), or whether it ever happened.

> 
> Adding Christophe on Cc:
> 
> Christophe do you know if is_hugepd is true for all hugetlb entries, not
> just hugepd?
> 
> On systems without hugepd entries, I guess ptdump skips all hugetlb entries.
> Sigh!

IIUC, the idea of ptdump_walk_pgd() is to dump page tables even outside
VMAs (for debugging purposes?).

I cannot convince myself that that's a good idea when only holding the
mmap lock in read mode, because we can just see page tables getting
freed concurrently, e.g. during concurrent munmap() ... while holding
the mmap lock in read mode we may only walk inside VMA boundaries.

That then raises the question of whether we're only calling this on special
MMs (e.g., init_mm), where we cannot really see concurrent munmap() and
where we shouldn't have hugetlb mappings or hugepd entries.

-- 
Thanks,

David / dhildenb


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH] hugetlb: simplify hugetlb handling in follow_page_mask
  2022-08-31  5:08   ` kernel test robot
  (?)
@ 2022-08-31 20:42     ` Mike Kravetz
  -1 siblings, 0 replies; 60+ messages in thread
From: Mike Kravetz @ 2022-08-31 20:42 UTC (permalink / raw)
  To: kernel test robot
  Cc: linux-mm, linux-kernel, linuxppc-dev, linux-ia64, kbuild-all,
	Baolin Wang, David Hildenbrand, Aneesh Kumar K . V,
	Naoya Horiguchi, Michael Ellerman, Muchun Song, Andrew Morton

On 08/31/22 13:08, kernel test robot wrote:
> Hi Mike,
> 
> I love your patch! Yet something to improve:
> 
> [auto build test ERROR on akpm-mm/mm-everything]
> [also build test ERROR on linus/master v6.0-rc3 next-20220830]
> [cannot apply to powerpc/next]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch#_base_tree_information]
> 
> url:    https://github.com/intel-lab-lkp/linux/commits/Mike-Kravetz/hugetlb-simplify-hugetlb-handling-in-follow_page_mask/20220830-074147
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
> config: powerpc-randconfig-r001-20220830 (https://download.01.org/0day-ci/archive/20220831/202208311341.ybNgt0Kz-lkp@intel.com/config)
> compiler: powerpc-linux-gcc (GCC) 12.1.0
> reproduce (this is a W=1 build):
>         wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
>         chmod +x ~/bin/make.cross
>         # https://github.com/intel-lab-lkp/linux/commit/f7dc41c1552ecd1e483a100c8b0921df62980f38
>         git remote add linux-review https://github.com/intel-lab-lkp/linux
>         git fetch --no-tags linux-review Mike-Kravetz/hugetlb-simplify-hugetlb-handling-in-follow_page_mask/20220830-074147
>         git checkout f7dc41c1552ecd1e483a100c8b0921df62980f38
>         # save the config file
>         mkdir build_dir && cp config build_dir/.config
>         COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=powerpc SHELL=/bin/bash arch/powerpc/kernel/
> 
> If you fix the issue, kindly add following tag where applicable
> Reported-by: kernel test robot <lkp@intel.com>
> 
> All errors (new ones prefixed by >>):
> 
>    In file included from arch/powerpc/kernel/setup-common.c:35:
> >> include/linux/hugetlb.h:258:21: error: 'hugetlb_follow_page_mask' defined but not used [-Werror=unused-function]
>      258 | static struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
>          |                     ^~~~~~~~~~~~~~~~~~~~~~~~
>    cc1: all warnings being treated as errors
> 
> 
> vim +/hugetlb_follow_page_mask +258 include/linux/hugetlb.h
> 
>    257	
>  > 258	static struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,

Thanks! That should be,

		static inline struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
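
i.e. the whole !CONFIG_HUGETLB_PAGE stub becomes (sketch of the intended fix):

	static inline struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
					unsigned long address, unsigned int flags)
	{
		/* should never happen, but do not want to BUG */
		return ERR_PTR(-EINVAL);
	}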

-- 
Mike Kravetz


>    259					unsigned long address, unsigned int flags)
>    260	{
>    261		/* should never happen, but do not want to BUG */
>    262		return ERR_PTR(-EINVAL);
>    263	}
>    264	
> 
> -- 
> 0-DAY CI Kernel Test Service
> https://01.org/lkp

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH] hugetlb: simplify hugetlb handling in follow_page_mask
  2022-08-31  1:07         ` Baolin Wang
@ 2022-09-01  0:00           ` Mike Kravetz
  -1 siblings, 0 replies; 60+ messages in thread
From: Mike Kravetz @ 2022-08-31 23:56 UTC (permalink / raw)
  To: Baolin Wang
  Cc: linux-mm, linux-kernel, linuxppc-dev, linux-ia64,
	David Hildenbrand, Aneesh Kumar K . V, Naoya Horiguchi,
	Michael Ellerman, Muchun Song, Andrew Morton

On 08/31/22 09:07, Baolin Wang wrote:
> 
> 
> On 8/31/2022 2:39 AM, Mike Kravetz wrote:
> > On 08/30/22 09:44, Mike Kravetz wrote:
> > > On 08/30/22 09:06, Baolin Wang wrote:
> > > > Hi Mike,
> > > > 
> > > > On 8/30/2022 7:40 AM, Mike Kravetz wrote:
> > > > > During discussions of this series [1], it was suggested that hugetlb
> > > > > handling code in follow_page_mask could be simplified.  At the beginning
> > > > > of follow_page_mask, there currently is a call to follow_huge_addr which
> > > > > 'may' handle hugetlb pages.  ia64 is the only architecture which provides
> > > > > a follow_huge_addr routine that does not return error.  Instead, at each
> > > > > level of the page table a check is made for a hugetlb entry.  If a hugetlb
> > > > > entry is found, a call to a routine associated with that entry is made.
> > > > > 
> > > > > Currently, there are two checks for hugetlb entries at each page table
> > > > > level.  The first check is of the form:
> > > > > 	if (p?d_huge())
> > > > > 		page = follow_huge_p?d();
> > > > > the second check is of the form:
> > > > > 	if (is_hugepd())
> > > > > 		page = follow_huge_pd().
> > > > > 
> > > > > We can replace these checks, as well as the special handling routines
> > > > > such as follow_huge_p?d() and follow_huge_pd() with a single routine to
> > > > > handle hugetlb vmas.
> > > > > 
> > > > > A new routine hugetlb_follow_page_mask is called for hugetlb vmas at the
> > > > > beginning of follow_page_mask.  hugetlb_follow_page_mask will use the
> > > > > existing routine huge_pte_offset to walk page tables looking for hugetlb
> > > > > entries.  huge_pte_offset can be overwritten by architectures, and already
> > > > > handles special cases such as hugepd entries.
> > > > 
> > > > Could you also mention that this patch will fix the lock issue for
> > > > CONT-PTE/PMD hugetlb by changing to use huge_pte_lock()? which will help
> > > > people to understand the issue.
> > > 
> > > Will update message in v2.  Thanks for taking a look!
> > > 
> > 
> > One additional thought, we 'may' need a separate patch to fix the locking
> > issues that can be easily backported.  Not sure this 'simplification' is
> > a good backport candidate.
> 
> Yes, that was my thought before, but David did not like adding more
> make-legacy-cruft-happy code.
> 
> So how about creating a series that contains 3 patches: picking up patch 1
> and patch 3 of my previous series [1], and your current patch? That means
> patch 1 and patch 2 in this series can fix the lock issue explicitly and be
> suitable to backport, meanwhile patch 3 (which is your current patch) will
> cleanup the legacy code.
> 

When I looked at patch 3, I was thinking the updated follow_huge_pmd routine
would work for the PTE level with a few more modifications.  Perhaps this is
too ugly, but it is a smaller set of changes for backport.

Of course, this would be followed up with the simplification patch which
removes all this code.
-- 
Mike Kravetz


diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 852f911d676e..b2050d22d855 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -207,8 +207,8 @@ struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
 struct page *follow_huge_pd(struct vm_area_struct *vma,
 			    unsigned long address, hugepd_t hpd,
 			    int flags, int pdshift);
-struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-				pmd_t *pmd, int flags);
+struct page *follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address,
+				int flags);
 struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
 				pud_t *pud, int flags);
 struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address,
@@ -319,8 +319,8 @@ static inline struct page *follow_huge_pd(struct vm_area_struct *vma,
 	return NULL;
 }
 
-static inline struct page *follow_huge_pmd(struct mm_struct *mm,
-				unsigned long address, pmd_t *pmd, int flags)
+static inline struct page *follow_huge_pmd_pte(struct vm_area_struct *vma,
+				unsigned long address, int flags)
 {
 	return NULL;
 }
diff --git a/mm/gup.c b/mm/gup.c
index 66d8619e02ad..fda980b436ed 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -530,6 +530,13 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
 	if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
 			 (FOLL_PIN | FOLL_GET)))
 		return ERR_PTR(-EINVAL);
+
+	if (is_vm_hugetlb_page(vma)) {
+		page = follow_huge_pmd_pte(vma, address, flags);
+		if (page)
+			return page;
+		return no_page_table(vma, flags);
+	}
 retry:
 	if (unlikely(pmd_bad(*pmd)))
 		return no_page_table(vma, flags);
@@ -662,7 +669,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
 	if (pmd_none(pmdval))
 		return no_page_table(vma, flags);
 	if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) {
-		page = follow_huge_pmd(mm, address, pmd, flags);
+		page = follow_huge_pmd_pte(vma, address, flags);
 		if (page)
 			return page;
 		return no_page_table(vma, flags);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d0617d64d718..e2e54dc27b00 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7155,13 +7155,23 @@ follow_huge_pd(struct vm_area_struct *vma,
 	return NULL;
 }
 
+/*
+ * Temporarily handles both PMDs and PTEs.
+ * How can there be hugetlb entries at the PTE level?  One such example is
+ * CONT_PTE on arm64.
+ *
+ * The hack of handling both PMDs and PTEs is made for stable backports.
+ * A cleanup and removal of this code will follow upstream.
+ */
 struct page * __weak
-follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-		pmd_t *pmd, int flags)
+follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address,
+								int flags)
 {
+	struct hstate *h = hstate_vma(vma);
+	struct mm_struct *mm = vma->vm_mm;
 	struct page *page = NULL;
 	spinlock_t *ptl;
-	pte_t pte;
+	pte_t *ptep, pte;
 
 	/*
 	 * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
@@ -7171,17 +7181,15 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 		return NULL;
 
 retry:
-	ptl = pmd_lockptr(mm, pmd);
-	spin_lock(ptl);
-	/*
-	 * make sure that the address range covered by this pmd is not
-	 * unmapped from other threads.
-	 */
-	if (!pmd_huge(*pmd))
+	ptep = huge_pte_offset(mm, address, huge_page_size(h));
+	if (!ptep)
 		goto out;
-	pte = huge_ptep_get((pte_t *)pmd);
+
+	ptl = huge_pte_lock(h, mm, ptep);
+	pte = huge_ptep_get(ptep);
 	if (pte_present(pte)) {
-		page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
+		page = pte_page(pte) +
+			((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
 		/*
 		 * try_grab_page() should always succeed here, because: a) we
 		 * hold the pmd (ptl) lock, and b) we've just checked that the
@@ -7197,7 +7205,7 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 	} else {
 		if (is_hugetlb_entry_migration(pte)) {
 			spin_unlock(ptl);
-			__migration_entry_wait_huge((pte_t *)pmd, ptl);
+			__migration_entry_wait_huge(ptep, ptl);
 			goto retry;
 		}
 		/*

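For context on the CONT-PTE/PMD lock issue Baolin mentions above:
huge_pte_lock() picks the page table lock from the hstate instead of assuming
a PMD level entry.  From memory the helpers in include/linux/hugetlb.h look
roughly like this (a sketch, exact details may differ):

	static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
					struct mm_struct *mm, pte_t *pte)
	{
		if (huge_page_size(h) == PMD_SIZE)
			return pmd_lockptr(mm, (pmd_t *) pte);
		VM_BUG_ON(huge_page_size(h) == PAGE_SIZE);
		return &mm->page_table_lock;
	}

	static inline spinlock_t *huge_pte_lock(struct hstate *h,
					struct mm_struct *mm, pte_t *pte)
	{
		spinlock_t *ptl;

		ptl = huge_pte_lockptr(h, mm, pte);
		spin_lock(ptl);
		return ptl;
	}

so a CONT-PTE sized hstate is serialized on mm->page_table_lock, the same
lock the hugetlb fault path takes.
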
^ permalink raw reply related	[flat|nested] 60+ messages in thread

* Re: [PATCH] hugetlb: simplify hugetlb handling in follow_page_mask
  2022-09-01  0:00           ` Mike Kravetz
@ 2022-09-01  1:24             ` Baolin Wang
  -1 siblings, 0 replies; 60+ messages in thread
From: Baolin Wang @ 2022-09-01  1:24 UTC (permalink / raw)
  To: Mike Kravetz
  Cc: linux-mm, linux-kernel, linuxppc-dev, linux-ia64,
	David Hildenbrand, Aneesh Kumar K . V, Naoya Horiguchi,
	Michael Ellerman, Muchun Song, Andrew Morton



On 9/1/2022 8:00 AM, Mike Kravetz wrote:
> On 08/31/22 09:07, Baolin Wang wrote:
>>
>>
>> On 8/31/2022 2:39 AM, Mike Kravetz wrote:
>>> On 08/30/22 09:44, Mike Kravetz wrote:
>>>> On 08/30/22 09:06, Baolin Wang wrote:
>>>>> Hi Mike,
>>>>>
>>>>> On 8/30/2022 7:40 AM, Mike Kravetz wrote:
>>>>>> During discussions of this series [1], it was suggested that hugetlb
>>>>>> handling code in follow_page_mask could be simplified.  At the beginning
>>>>>> of follow_page_mask, there currently is a call to follow_huge_addr which
>>>>>> 'may' handle hugetlb pages.  ia64 is the only architecture which provides
>>>>>> a follow_huge_addr routine that does not return error.  Instead, at each
>>>>>> level of the page table a check is made for a hugetlb entry.  If a hugetlb
>>>>>> entry is found, a call to a routine associated with that entry is made.
>>>>>>
>>>>>> Currently, there are two checks for hugetlb entries at each page table
>>>>>> level.  The first check is of the form:
>>>>>> 	if (p?d_huge())
>>>>>> 		page = follow_huge_p?d();
>>>>>> the second check is of the form:
>>>>>> 	if (is_hugepd())
>>>>>> 		page = follow_huge_pd().
>>>>>>
>>>>>> We can replace these checks, as well as the special handling routines
>>>>>> such as follow_huge_p?d() and follow_huge_pd() with a single routine to
>>>>>> handle hugetlb vmas.
>>>>>>
>>>>>> A new routine hugetlb_follow_page_mask is called for hugetlb vmas at the
>>>>>> beginning of follow_page_mask.  hugetlb_follow_page_mask will use the
>>>>>> existing routine huge_pte_offset to walk page tables looking for hugetlb
>>>>>> entries.  huge_pte_offset can be overwritten by architectures, and already
>>>>>> handles special cases such as hugepd entries.
>>>>>
>>>>> Could you also mention that this patch will fix the lock issue for
>>>>> CONT-PTE/PMD hugetlb by changing to use huge_pte_lock()? which will help
>>>>> people to understand the issue.
>>>>
>>>> Will update message in v2.  Thanks for taking a look!
>>>>
>>>
>>> One additional thought, we 'may' need a separate patch to fix the locking
>>> issues that can be easily backported.  Not sure this 'simplification' is
>>> a good backport candidate.
>>
>> Yes, that was my thought before, but David did not like adding more
>> make-legacy-cruft-happy code.
>>
>> So how about creating a series that contains 3 patches: picking up patch 1
>> and patch 3 of my previous series [1], and your current patch? That means
>> patch 1 and patch 2 in this series can fix the lock issue explicitly and be
>> suitable to backport, meanwhile patch 3 (which is your current patch) will
>> cleanup the legacy code.
>>
> 
> When I looked at patch 3, I was thinking the updated follow_huge_pmd routine
> would work for the PTE level with a few more modifications.  Perhaps this is
> too ugly, but it is a smaller set of changes for backport.
> 
> Of course, this would be followed up with the simplification patch which
> removes all this code.

Yes, that looks simpler. I can send you a formal patch with your
suggestion, which can be added to your cleanup series. Thanks.

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH] hugetlb: simplify hugetlb handling in follow_page_mask
  2022-09-01  1:24             ` Baolin Wang
@ 2022-09-01  6:59               ` David Hildenbrand
  -1 siblings, 0 replies; 60+ messages in thread
From: David Hildenbrand @ 2022-09-01  6:59 UTC (permalink / raw)
  To: Baolin Wang, Mike Kravetz
  Cc: linux-mm, linux-kernel, linuxppc-dev, linux-ia64,
	Aneesh Kumar K . V, Naoya Horiguchi, Michael Ellerman,
	Muchun Song, Andrew Morton

On 01.09.22 03:24, Baolin Wang wrote:
> 
> 
> On 9/1/2022 8:00 AM, Mike Kravetz wrote:
>> On 08/31/22 09:07, Baolin Wang wrote:
>>>
>>>
>>> On 8/31/2022 2:39 AM, Mike Kravetz wrote:
>>>> On 08/30/22 09:44, Mike Kravetz wrote:
>>>>> On 08/30/22 09:06, Baolin Wang wrote:
>>>>>> Hi Mike,
>>>>>>
>>>>>> On 8/30/2022 7:40 AM, Mike Kravetz wrote:
>>>>>>> During discussions of this series [1], it was suggested that hugetlb
>>>>>>> handling code in follow_page_mask could be simplified.  At the beginning
>>>>>>> of follow_page_mask, there currently is a call to follow_huge_addr which
>>>>>>> 'may' handle hugetlb pages.  ia64 is the only architecture which provides
>>>>>>> a follow_huge_addr routine that does not return error.  Instead, at each
>>>>>>> level of the page table a check is made for a hugetlb entry.  If a hugetlb
>>>>>>> entry is found, a call to a routine associated with that entry is made.
>>>>>>>
>>>>>>> Currently, there are two checks for hugetlb entries at each page table
>>>>>>> level.  The first check is of the form:
>>>>>>> 	if (p?d_huge())
>>>>>>> 		page = follow_huge_p?d();
>>>>>>> the second check is of the form:
>>>>>>> 	if (is_hugepd())
>>>>>>> 		page = follow_huge_pd().
>>>>>>>
>>>>>>> We can replace these checks, as well as the special handling routines
>>>>>>> such as follow_huge_p?d() and follow_huge_pd() with a single routine to
>>>>>>> handle hugetlb vmas.
>>>>>>>
>>>>>>> A new routine hugetlb_follow_page_mask is called for hugetlb vmas at the
>>>>>>> beginning of follow_page_mask.  hugetlb_follow_page_mask will use the
>>>>>>> existing routine huge_pte_offset to walk page tables looking for hugetlb
>>>>>>> entries.  huge_pte_offset can be overwritten by architectures, and already
>>>>>>> handles special cases such as hugepd entries.
>>>>>>
>>>>>> Could you also mention that this patch will fix the lock issue for
>>>>>> CONT-PTE/PMD hugetlb by changing to use huge_pte_lock()? which will help
>>>>>> people to understand the issue.
>>>>>
>>>>> Will update message in v2.  Thanks for taking a look!
>>>>>
>>>>
>>>> One additional thought, we 'may' need a separate patch to fix the locking
>>>> issues that can be easily backported.  Not sure this 'simplification' is
>>>> a good backport candidate.
>>>
>>> Yes, that was my thought before, but David did not like adding more
>>> make-legacy-cruft-happy code.
>>>
>>> So how about creating a series that contains 3 patches: picking up patch 1
>>> and patch 3 of my previous series [1], and your current patch? That means
>>> patch 1 and patch 2 in this series can fix the lock issue explicitly and be
>>> suitable to backport, meanwhile patch 3 (which is your current patch) will
>>> cleanup the legacy code.
>>>
>>
>> When I looked at patch 3, I was thinking the updated follow_huge_pmd routine
>> would work for the PTE level with a few more modifications.  Perhaps this is
>> too ugly, but it is a smaller set of changes for backport.
>>
>> Of course, this would be followed up with the simplification patch which
>> removes all this code.
> 
> Yes, that looks simpler. I can send you a formal patch with your
> suggestion, which can be added to your cleanup series. Thanks.

As an alternative, we can have a stable-only version that does that.

-- 
Thanks,

David / dhildenb


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH] hugetlb: simplify hugetlb handling in follow_page_mask
  2022-09-01  6:59               ` David Hildenbrand
@ 2022-09-01 10:40                 ` Baolin Wang
  -1 siblings, 0 replies; 60+ messages in thread
From: Baolin Wang @ 2022-09-01 10:40 UTC (permalink / raw)
  To: David Hildenbrand, Mike Kravetz
  Cc: linux-mm, linux-kernel, linuxppc-dev, linux-ia64,
	Aneesh Kumar K . V, Naoya Horiguchi, Michael Ellerman,
	Muchun Song, Andrew Morton



On 9/1/2022 2:59 PM, David Hildenbrand wrote:
> On 01.09.22 03:24, Baolin Wang wrote:
>>
>>
>> On 9/1/2022 8:00 AM, Mike Kravetz wrote:
>>> On 08/31/22 09:07, Baolin Wang wrote:
>>>>
>>>>
>>>> On 8/31/2022 2:39 AM, Mike Kravetz wrote:
>>>>> On 08/30/22 09:44, Mike Kravetz wrote:
>>>>>> On 08/30/22 09:06, Baolin Wang wrote:
>>>>>>> Hi Mike,
>>>>>>>
>>>>>>> On 8/30/2022 7:40 AM, Mike Kravetz wrote:
>>>>>>>> During discussions of this series [1], it was suggested that hugetlb
>>>>>>>> handling code in follow_page_mask could be simplified.  At the beginning
>>>>>>>> of follow_page_mask, there currently is a call to follow_huge_addr which
>>>>>>>> 'may' handle hugetlb pages.  ia64 is the only architecture which provides
>>>>>>>> a follow_huge_addr routine that does not return error.  Instead, at each
>>>>>>>> level of the page table a check is made for a hugetlb entry.  If a hugetlb
>>>>>>>> entry is found, a call to a routine associated with that entry is made.
>>>>>>>>
>>>>>>>> Currently, there are two checks for hugetlb entries at each page table
>>>>>>>> level.  The first check is of the form:
>>>>>>>> 	if (p?d_huge())
>>>>>>>> 		page = follow_huge_p?d();
>>>>>>>> the second check is of the form:
>>>>>>>> 	if (is_hugepd())
>>>>>>>> 		page = follow_huge_pd().
>>>>>>>>
>>>>>>>> We can replace these checks, as well as the special handling routines
>>>>>>>> such as follow_huge_p?d() and follow_huge_pd() with a single routine to
>>>>>>>> handle hugetlb vmas.
>>>>>>>>
>>>>>>>> A new routine hugetlb_follow_page_mask is called for hugetlb vmas at the
>>>>>>>> beginning of follow_page_mask.  hugetlb_follow_page_mask will use the
>>>>>>>> existing routine huge_pte_offset to walk page tables looking for hugetlb
>>>>>>>> entries.  huge_pte_offset can be overwritten by architectures, and already
>>>>>>>> handles special cases such as hugepd entries.
>>>>>>>
>>>>>>> Could you also mention that this patch will fix the lock issue for
>>>>>>> CONT-PTE/PMD hugetlb by changing to use huge_pte_lock()? which will help
>>>>>>> people to understand the issue.
>>>>>>
>>>>>> Will update message in v2.  Thanks for taking a look!
>>>>>>
>>>>>
>>>>> One additional thought, we 'may' need a separate patch to fix the locking
>>>>> issues that can be easily backported.  Not sure this 'simplification' is
>>>>> a good backport candidate.
>>>>
>>>> Yes, that was my thought before, but David did not like adding more
>>>> make-legacy-cruft-happy code.
>>>>
>>>> So how about creating a series that contains 3 patches: picking up patch 1
>>>> and patch 3 of my previous series [1], and your current patch? That means
>>>> patch 1 and patch 2 in this series can fix the lock issue explicitly and be
>>>> suitable to backport, meanwhile patch 3 (which is your current patch) will
>>>> cleanup the legacy code.
>>>>
>>>
>>> When I looked at patch 3, I was thinking the updated follow_huge_pmd routine
>>> would work for the PTE level with a few more modifications.  Perhaps this is
>>> too ugly, but it is a smaller set of changes for backport.
>>>
>>> Of course, this would be followed up with the simplification patch which
>>> removes all this code.
>>
>> Yes, that looks simpler. I can send you a formal patch with your
>> suggestion, which can be added to your cleanup series. Thanks.
> 
> As an alternative, we can have a stable-only version that does that.

But according to stable-kernel-rules, we should follow "It or an equivalent fix
must already exist in Linus' tree (upstream)."

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH] hugetlb: simplify hugetlb handling in follow_page_mask
  2022-08-29 23:40 ` Mike Kravetz
@ 2022-09-01 16:19   ` Mike Kravetz
  -1 siblings, 0 replies; 60+ messages in thread
From: Mike Kravetz @ 2022-09-01 16:19 UTC (permalink / raw)
  To: linux-mm, linux-kernel, inuxppc-dev, linux-ia64
  Cc: Baolin Wang, David Hildenbrand, Aneesh Kumar K . V,
	Naoya Horiguchi, Michael Ellerman, Muchun Song, Andrew Morton

On 08/29/22 16:40, Mike Kravetz wrote:
> A new routine hugetlb_follow_page_mask is called for hugetlb vmas at the
> beginning of follow_page_mask.  hugetlb_follow_page_mask will use the
> existing routine huge_pte_offset to walk page tables looking for hugetlb
> entries.  huge_pte_offset can be overwritten by architectures, and already
> handles special cases such as hugepd entries.
> 
<snip>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index d0617d64d718..b3da421ba5be 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -6190,6 +6190,62 @@ static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte,
>  	return false;
>  }
>  
> +struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
> +				unsigned long address, unsigned int flags)
> +{
> +	struct hstate *h = hstate_vma(vma);
> +	struct mm_struct *mm = vma->vm_mm;
> +	unsigned long haddr = address & huge_page_mask(h);
> +	struct page *page = NULL;
> +	spinlock_t *ptl;
> +	pte_t *pte, entry;
> +
> +	/*
> +	 * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
> +	 * follow_hugetlb_page().
> +	 */
> +	if (WARN_ON_ONCE(flags & FOLL_PIN))
> +		return NULL;
> +
> +	pte = huge_pte_offset(mm, haddr, huge_page_size(h));
> +	if (!pte)
> +		return NULL;
> +
> +retry:
> +	ptl = huge_pte_lock(h, mm, pte);

I can't believe I forgot about huge pmd sharing as described here!!!
https://lore.kernel.org/linux-mm/20220824175757.20590-1-mike.kravetz@oracle.com/

The above series is in Andrew's tree, and we should add 'vma locking' calls
to this routine.

Do note that the existing page walking code can race with pmd unsharing.
I would NOT suggest trying to address this in stable releases.  To date,
I am unaware of any issues caused by races with pmd unsharing.  Trying
to take this into account in 'generic page walking code' could get ugly.
Since hugetlb_follow_page_mask will be a special callout for hugetlb page
table walking, we can easily add the required locking and address the
potential race issue.  This will be in v2.
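
Roughly, the vma lock would bracket the page table walk, something like the
sketch below (illustration only, assuming the hugetlb_vma_lock_read()/
hugetlb_vma_unlock_read() helpers from that series -- not the actual v2 diff):

 	if (WARN_ON_ONCE(flags & FOLL_PIN))
 		return NULL;
 
+	/* block concurrent pmd unsharing from freeing the page table page */
+	hugetlb_vma_lock_read(vma);
 	pte = huge_pte_offset(mm, haddr, huge_page_size(h));
-	if (!pte)
-		return NULL;
+	if (!pte) {
+		hugetlb_vma_unlock_read(vma);
+		return NULL;
+	}
 ...
 out:
 	spin_unlock(ptl);
+	hugetlb_vma_unlock_read(vma);
 	return page;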

Still hoping to get some feedback from Aneesh and Naoya about this approach.
-- 
Mike Kravetz

> +	entry = huge_ptep_get(pte);
> +	if (pte_present(entry)) {
> +		page = pte_page(entry) +
> +				((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
> +		/*
> +		 * Note that page may be a sub-page, and with vmemmap
> +		 * optimizations the page struct may be read only.
> +		 * try_grab_page() will increase the ref count on the
> +		 * head page, so this will be OK.
> +		 *
> +		 * try_grab_page() should always succeed here, because we hold
> +		 * the ptl lock and have verified pte_present().
> +		 */
> +		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
> +			page = NULL;
> +			goto out;
> +		}
> +	} else {
> +		if (is_hugetlb_entry_migration(entry)) {
> +			spin_unlock(ptl);
> +			__migration_entry_wait_huge(pte, ptl);
> +			goto retry;
> +		}
> +		/*
> +		 * hwpoisoned entry is treated as no_page_table in
> +		 * follow_page_mask().
> +		 */
> +	}
> +out:
> +	spin_unlock(ptl);
> +	return page;
> +}
> +
>  long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>  			 struct page **pages, struct vm_area_struct **vmas,
>  			 unsigned long *position, unsigned long *nr_pages,
> @@ -7140,123 +7196,6 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h)
>   * These functions are overwritable if your architecture needs its own
>   * behavior.
>   */
> -struct page * __weak
> -follow_huge_addr(struct mm_struct *mm, unsigned long address,
> -			      int write)
> -{
> -	return ERR_PTR(-EINVAL);
> -}
> -
> -struct page * __weak
> -follow_huge_pd(struct vm_area_struct *vma,
> -	       unsigned long address, hugepd_t hpd, int flags, int pdshift)
> -{
> -	WARN(1, "hugepd follow called with no support for hugepage directory format\n");
> -	return NULL;
> -}
> -
> -struct page * __weak
> -follow_huge_pmd(struct mm_struct *mm, unsigned long address,
> -		pmd_t *pmd, int flags)
> -{
> -	struct page *page = NULL;
> -	spinlock_t *ptl;
> -	pte_t pte;
> -
> -	/*
> -	 * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
> -	 * follow_hugetlb_page().
> -	 */
> -	if (WARN_ON_ONCE(flags & FOLL_PIN))
> -		return NULL;
> -
> -retry:
> -	ptl = pmd_lockptr(mm, pmd);
> -	spin_lock(ptl);
> -	/*
> -	 * make sure that the address range covered by this pmd is not
> -	 * unmapped from other threads.
> -	 */
> -	if (!pmd_huge(*pmd))
> -		goto out;
> -	pte = huge_ptep_get((pte_t *)pmd);
> -	if (pte_present(pte)) {
> -		page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
> -		/*
> -		 * try_grab_page() should always succeed here, because: a) we
> -		 * hold the pmd (ptl) lock, and b) we've just checked that the
> -		 * huge pmd (head) page is present in the page tables. The ptl
> -		 * prevents the head page and tail pages from being rearranged
> -		 * in any way. So this page must be available at this point,
> -		 * unless the page refcount overflowed:
> -		 */
> -		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
> -			page = NULL;
> -			goto out;
> -		}
> -	} else {
> -		if (is_hugetlb_entry_migration(pte)) {
> -			spin_unlock(ptl);
> -			__migration_entry_wait_huge((pte_t *)pmd, ptl);
> -			goto retry;
> -		}
> -		/*
> -		 * hwpoisoned entry is treated as no_page_table in
> -		 * follow_page_mask().
> -		 */
> -	}
> -out:
> -	spin_unlock(ptl);
> -	return page;
> -}
> -
> -struct page * __weak
> -follow_huge_pud(struct mm_struct *mm, unsigned long address,
> -		pud_t *pud, int flags)
> -{
> -	struct page *page = NULL;
> -	spinlock_t *ptl;
> -	pte_t pte;
> -
> -	if (WARN_ON_ONCE(flags & FOLL_PIN))
> -		return NULL;
> -
> -retry:
> -	ptl = huge_pte_lock(hstate_sizelog(PUD_SHIFT), mm, (pte_t *)pud);
> -	if (!pud_huge(*pud))
> -		goto out;
> -	pte = huge_ptep_get((pte_t *)pud);
> -	if (pte_present(pte)) {
> -		page = pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
> -		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
> -			page = NULL;
> -			goto out;
> -		}
> -	} else {
> -		if (is_hugetlb_entry_migration(pte)) {
> -			spin_unlock(ptl);
> -			__migration_entry_wait(mm, (pte_t *)pud, ptl);
> -			goto retry;
> -		}
> -		/*
> -		 * hwpoisoned entry is treated as no_page_table in
> -		 * follow_page_mask().
> -		 */
> -	}
> -out:
> -	spin_unlock(ptl);
> -	return page;
> -}
> -
> -struct page * __weak
> -follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
> -{
> -	if (flags & (FOLL_GET | FOLL_PIN))
> -		return NULL;
> -
> -	return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
> -}
> -
>  int isolate_hugetlb(struct page *page, struct list_head *list)
>  {
>  	int ret = 0;
> -- 
> 2.37.1
> 

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH] hugetlb: simplify hugetlb handling in follow_page_mask
  2022-08-31  8:07         ` David Hildenbrand
@ 2022-09-02 18:50           ` Mike Kravetz
  -1 siblings, 0 replies; 60+ messages in thread
From: Mike Kravetz @ 2022-09-02 18:50 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: linux-mm, linux-kernel, inuxppc-dev, linux-ia64, Baolin Wang,
	Aneesh Kumar K . V, Naoya Horiguchi, Michael Ellerman,
	Muchun Song, Andrew Morton, Christophe Leroy

On 08/31/22 10:07, David Hildenbrand wrote:
> On 30.08.22 23:31, Mike Kravetz wrote:
> > On 08/30/22 09:52, Mike Kravetz wrote:
> >> On 08/30/22 10:11, David Hildenbrand wrote:
> >>> On 30.08.22 01:40, Mike Kravetz wrote:
> >>>> During discussions of this series [1], it was suggested that hugetlb
> >>>> handling code in follow_page_mask could be simplified.  At the beginning
> >>>
> >>> Feel free to use a Suggested-by if you consider it appropriate.
> >>>
> >>>> of follow_page_mask, there currently is a call to follow_huge_addr which
> >>>> 'may' handle hugetlb pages.  ia64 is the only architecture which provides
> >>>> a follow_huge_addr routine that does not return error.  Instead, at each
> >>>> level of the page table a check is made for a hugetlb entry.  If a hugetlb
> >>>> entry is found, a call to a routine associated with that entry is made.
> >>>>
> >>>> Currently, there are two checks for hugetlb entries at each page table
> >>>> level.  The first check is of the form:
> >>>> 	if (p?d_huge())
> >>>> 		page = follow_huge_p?d();
> >>>> the second check is of the form:
> >>>> 	if (is_hugepd())
> >>>> 		page = follow_huge_pd().
> >>>
> >>> BTW, what about all this hugepd stuff in mm/pagewalk.c?
> >>>
> >>> Isn't this all dead code as we're essentially routing all hugetlb VMAs
> >>> via walk_hugetlb_range? [yes, all that hugepd stuff in generic code that
> >>> overcomplicates stuff has been annoying me for a long time]
> >>
> >> I am 'happy' to look at cleaning up that code next.  Perhaps I will just
> >> create a cleanup series.
> >>
> > 
> > Technically, that code is not dead IIUC.  The call to walk_hugetlb_range in
> > __walk_page_range is as follows:
> > 
> > 	if (vma && is_vm_hugetlb_page(vma)) {
> > 		if (ops->hugetlb_entry)
> > 			err = walk_hugetlb_range(start, end, walk);
> > 	} else
> > 		err = walk_pgd_range(start, end, walk);
> > 
> > We also have the interface walk_page_range_novma() that will call
> > __walk_page_range without a value for vma.  So, in that case we would
> > end up calling walk_pgd_range, etc.  walk_pgd_range and related routines
> > do have those checks such as:
> > 
> > 		if (is_hugepd(__hugepd(pmd_val(*pmd))))
> > 			err = walk_hugepd_range((hugepd_t *)pmd, addr, next, walk, PMD_SHIFT);
> > 
> > So, it looks like in this case we would process 'hugepd' entries but not
> > 'normal' hugetlb entries.  That does not seem right.
> 
> :/ walking a hugetlb range without knowing whether it's a hugetlb range
> is certainly questionable.
> 
> 
> > 
> > Christophe Leroy added this code with commit e17eae2b8399 "mm: pagewalk: fix
> > walk for hugepage tables".  This was part of the series "Convert powerpc to
> > GENERIC_PTDUMP".  And, the ptdump code uses the walk_page_range_novma
> > interface.  So, this code is certainly not dead.
> 
> Hm, that commit doesn't actually mention how it can happen, what exactly
> will happen ("crazy result") and if it ever happened.
> 
> > 
> > Adding Christophe on Cc:
> > 
> > Christophe do you know if is_hugepd is true for all hugetlb entries, not
> > just hugepd?
> > 
> > On systems without hugepd entries, I guess ptdump skips all hugetlb entries.
> > Sigh!
> 
> IIUC, the idea of ptdump_walk_pgd() is to dump page tables even outside
> VMAs (for debugging purposes?).
> 
> I cannot convince myself that that's a good idea when only holding the
> mmap lock in read mode, because we can just see page tables getting
> freed concurrently e.g., during concurrent munmap() ... while holding
> the mmap lock in read we may only walk inside VMA boundaries.
> 
> That then raises the questions if we're only calling this on special MMs
> (e.g., init_mm) whereby we cannot really see concurrent munmap() and
> where we shouldn't have hugetlb mappings or hugepd entries.
> 

This is going to require a little more thought.

Since Baolin's patch for stable releases is moving forward, I want to
get the cleanup provided by this patch in ASAP.  So, I am going to rebase
this patch on Baolin's with the other fixups.

Will come back to this cleanup later.
-- 
Mike Kravetz

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH] hugetlb: simplify hugetlb handling in follow_page_mask
  2022-09-02 18:50           ` Mike Kravetz
@ 2022-09-02 18:52             ` David Hildenbrand
  -1 siblings, 0 replies; 60+ messages in thread
From: David Hildenbrand @ 2022-09-02 18:52 UTC (permalink / raw)
  To: Mike Kravetz
  Cc: linux-mm, linux-kernel, inuxppc-dev, linux-ia64, Baolin Wang,
	Aneesh Kumar K . V, Naoya Horiguchi, Michael Ellerman,
	Muchun Song, Andrew Morton, Christophe Leroy

>>> Adding Christophe on Cc:
>>>
>>> Christophe do you know if is_hugepd is true for all hugetlb entries, not
>>> just hugepd?
>>>
>>> On systems without hugepd entries, I guess ptdump skips all hugetlb entries.
>>> Sigh!
>>
>> IIUC, the idea of ptdump_walk_pgd() is to dump page tables even outside
>> VMAs (for debugging purposes?).
>>
>> I cannot convince myself that that's a good idea when only holding the
>> mmap lock in read mode, because we can just see page tables getting
>> freed concurrently e.g., during concurrent munmap() ... while holding
>> the mmap lock in read we may only walk inside VMA boundaries.
>>
>> That then raises the questions if we're only calling this on special MMs
>> (e.g., init_mm) whereby we cannot really see concurrent munmap() and
>> where we shouldn't have hugetlb mappings or hugepd entries.
>>
> 
> This is going to require a little more thought.
> 
> Since Baolin's patch for stable releases is moving forward, I want to
> get the cleanup provided by this patch in ASAP.  So, I am going to rebase
> this patch on Baolin's with the other fixups.
> 
> Will come back to this cleanup later.

Sure, no need to do it all at once (I was just bringing it up while
thinking about it).

-- 
Thanks,

David / dhildenb


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH] hugetlb: simplify hugetlb handling in follow_page_mask
  2022-09-02 18:52             ` David Hildenbrand
@ 2022-09-03  6:59               ` Christophe Leroy
  -1 siblings, 0 replies; 60+ messages in thread
From: Christophe Leroy @ 2022-09-03  6:59 UTC (permalink / raw)
  To: David Hildenbrand, Mike Kravetz
  Cc: linux-mm, linux-kernel, inuxppc-dev, linux-ia64, Baolin Wang,
	Aneesh Kumar K . V, Naoya Horiguchi, Michael Ellerman,
	Muchun Song, Andrew Morton



Le 02/09/2022 à 20:52, David Hildenbrand a écrit :
>>>> Adding Christophe on Cc:
>>>>
>>>> Christophe do you know if is_hugepd is true for all hugetlb entries, not
>>>> just hugepd?

is_hugepd() is true if and only if the directory entry points to a huge 
page directory and not to a normal lower-level directory.

As far as I understand, if the directory entry is not pointing to any 
lower directory but is itself a huge page entry, pXd_leaf() is true.
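
So a page table walker has to distinguish three cases for a directory entry; 
roughly (the helper names below are placeholders, the real checks live in 
mm/pagewalk.c and the arch code):

	if (is_hugepd(__hugepd(pgd_val(pgd)))) {
		/* entry points to a huge page directory, walk it */
		walk_hugepd(...);
	} else if (pgd_leaf(pgd)) {
		/* entry itself maps a huge page, no lower directory */
		note_leaf_entry(...);
	} else {
		/* normal pointer to the lower level directory */
		walk_p4d_range(...);
	}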


>>>>
>>>> On systems without hugepd entries, I guess ptdump skips all hugetlb entries.
>>>> Sigh!

As far as I can see, ptdump_pXd_entry() handles the pXd_leaf() case.

>>>
>>> IIUC, the idea of ptdump_walk_pgd() is to dump page tables even outside
>>> VMAs (for debugging purposes?).
>>>
>>> I cannot convince myself that that's a good idea when only holding the
>>> mmap lock in read mode, because we can just see page tables getting
>>> freed concurrently e.g., during concurrent munmap() ... while holding
>>> the mmap lock in read we may only walk inside VMA boundaries.
>>>
>>> That then raises the questions if we're only calling this on special MMs
>>> (e.g., init_mm) whereby we cannot really see concurrent munmap() and
>>> where we shouldn't have hugetlb mappings or hugepd entries.

At least on powerpc, PTDUMP handles only init_mm.

Huge pages are used at least on powerpc 8xx for the linear memory mapping, see

commit 34536d780683 ("powerpc/8xx: Add a function to early map kernel 
via huge pages")
commit cf209951fa7f ("powerpc/8xx: Map linear memory with huge pages")

hugepds may also be used in the future for huge pages on vmap and 
vmalloc, see commit a6a8f7c4aa7e ("powerpc/8xx: add support for huge 
pages on VMAP and VMALLOC")

As far as I know, ppc64 also uses huge pages for VMAP and VMALLOC, see

commit d909f9109c30 ("powerpc/64s/radix: Enable HAVE_ARCH_HUGE_VMAP")
commit 8abddd968a30 ("powerpc/64s/radix: Enable huge vmalloc mappings")

Christophe

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH] hugetlb: simplify hugetlb handling in follow_page_mask
  2022-09-02 18:52             ` David Hildenbrand
@ 2022-09-03  7:07               ` Christophe Leroy
  -1 siblings, 0 replies; 60+ messages in thread
From: Christophe Leroy @ 2022-09-03  7:07 UTC (permalink / raw)
  To: David Hildenbrand, Mike Kravetz
  Cc: linux-mm, linux-kernel, linux-ia64, Baolin Wang,
	Aneesh Kumar K . V, Naoya Horiguchi, Michael Ellerman,
	Muchun Song, Andrew Morton, linuxppc-dev

+Resending with valid powerpc list address

Le 02/09/2022 à 20:52, David Hildenbrand a écrit :
>>>> Adding Christophe on Cc:
>>>>
>>>> Christophe do you know if is_hugepd is true for all hugetlb entries, not
>>>> just hugepd?

is_hugepd() is true if and only if the directory entry points to a huge 
page directory and not to a normal lower-level directory.

As far as I understand, if the directory entry is not pointing to any 
lower directory but is itself a huge page entry, pXd_leaf() is true.


>>>>
>>>> On systems without hugepd entries, I guess ptdump skips all hugetlb entries.
>>>> Sigh!

As far as I can see, ptdump_pXd_entry() handles the pXd_leaf() case.

>>>
>>> IIUC, the idea of ptdump_walk_pgd() is to dump page tables even outside
>>> VMAs (for debugging purposes?).
>>>
>>> I cannot convince myself that that's a good idea when only holding the
>>> mmap lock in read mode, because we can just see page tables getting
>>> freed concurrently e.g., during concurrent munmap() ... while holding
>>> the mmap lock in read we may only walk inside VMA boundaries.
>>>
>>> That then raises the questions if we're only calling this on special MMs
>>> (e.g., init_mm) whereby we cannot really see concurrent munmap() and
>>> where we shouldn't have hugetlb mappings or hugepd entries.

At least on powerpc, PTDUMP handles only init_mm.

Huge pages are used at least on powerpc 8xx for the linear memory mapping, see

commit 34536d780683 ("powerpc/8xx: Add a function to early map kernel 
via huge pages")
commit cf209951fa7f ("powerpc/8xx: Map linear memory with huge pages")

hugepds may also be used in the future for huge pages on vmap and 
vmalloc, see commit a6a8f7c4aa7e ("powerpc/8xx: add support for huge 
pages on VMAP and VMALLOC")

As far as I know, ppc64 also uses huge pages for VMAP and VMALLOC, see

commit d909f9109c30 ("powerpc/64s/radix: Enable HAVE_ARCH_HUGE_VMAP")
commit 8abddd968a30 ("powerpc/64s/radix: Enable huge vmalloc mappings")

Christophe

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH] hugetlb: simplify hugetlb handling in follow_page_mask
  2022-09-03  7:07               ` Christophe Leroy
@ 2022-09-04 11:49                 ` Michael Ellerman
  -1 siblings, 0 replies; 60+ messages in thread
From: Michael Ellerman @ 2022-09-04 11:49 UTC (permalink / raw)
  To: Christophe Leroy, David Hildenbrand, Mike Kravetz
  Cc: linux-mm, linux-kernel, linux-ia64, Baolin Wang,
	Aneesh Kumar K . V, Naoya Horiguchi, Muchun Song, Andrew Morton,
	linuxppc-dev

Christophe Leroy <christophe.leroy@csgroup.eu> writes:
> +Resending with valid powerpc list address
>
> Le 02/09/2022 à 20:52, David Hildenbrand a écrit :
>>>>> Adding Christophe on Cc:
>>>>>
>>>>> Christophe do you know if is_hugepd is true for all hugetlb entries, not
>>>>> just hugepd?
>
> is_hugepd() is true if and only if the directory entry points to a huge 
> page directory and not to the normal lower level directory.
>
> As far as I understand if the directory entry is not pointing to any 
> lower directory but is a huge page entry, pXd_leaf() is true.

Yes.

Though historically it's pXd_huge() that is used to test that, and it is
gated by CONFIG_HUGETLB_PAGE.

The leaf versions are newer and test whether the entry is a PTE
regardless of whether CONFIG_HUGETLB_PAGE is enabled. That is needed
for PTDUMP when the kernel mapping uses huge pages independently of
CONFIG_HUGETLB_PAGE, which is true on at least powerpc.
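
As a rough illustration of the difference (simplified, not the exact
powerpc definitions):

	#ifdef CONFIG_HUGETLB_PAGE
	int pmd_huge(pmd_t pmd);		/* real arch-specific test */
	#else
	static inline int pmd_huge(pmd_t pmd)
	{
		return 0;			/* compiled away */
	}
	#endif

	/*
	 * pmd_leaf() is not gated by CONFIG_HUGETLB_PAGE: it only asks
	 * whether the entry maps a page directly instead of pointing to a
	 * lower level page table, so PTDUMP can rely on it for kernel
	 * mappings that use huge pages without hugetlbfs.
	 */
	static inline bool pmd_leaf(pmd_t pmd)
	{
		return pmd_is_leaf(pmd);	/* stand-in for the arch bit check */
	}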

>>>>>
>>>>> On systems without hugepd entries, I guess ptdump skips all hugetlb entries.
>>>>> Sigh!
>
> As far as I can see, ptdump_pXd_entry() handles the pXd_leaf() case.
>
>>>>
>>>> IIUC, the idea of ptdump_walk_pgd() is to dump page tables even outside
>>>> VMAs (for debugging purposes?).
>>>>
>>>> I cannot convince myself that that's a good idea when only holding the
>>>> mmap lock in read mode, because we can just see page tables getting
>>>> freed concurrently e.g., during concurrent munmap() ... while holding
>>>> the mmap lock in read we may only walk inside VMA boundaries.
>>>>
>>>> That then raises the questions if we're only calling this on special MMs
>>>> (e.g., init_mm) whereby we cannot really see concurrent munmap() and
>>>> where we shouldn't have hugetlb mappings or hugepd entries.
>
> At least on powerpc, PTDUMP handles only init_mm.
>
> Hugepage are used at least on powerpc 8xx for linear memory mapping, see
>
> commit 34536d780683 ("powerpc/8xx: Add a function to early map kernel 
> via huge pages")
> commit cf209951fa7f ("powerpc/8xx: Map linear memory with huge pages")
>
> hugepds may also be used in the future to use huge pages for vmap and 
> vmalloc, see commit a6a8f7c4aa7e ("powerpc/8xx: add support for huge 
> pages on VMAP and VMALLOC")
>
> As far as I know, ppc64 also use huge pages for VMAP and VMALLOC, see
>
> commit d909f9109c30 ("powerpc/64s/radix: Enable HAVE_ARCH_HUGE_VMAP")
> commit 8abddd968a30 ("powerpc/64s/radix: Enable huge vmalloc mappings")

64-bit also uses huge pages for the kernel linear mapping (aka. direct
mapping), and on newer systems (>= Power9) those also appear in the
kernel page tables.

cheers

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH] hugetlb: simplify hugetlb handling in follow_page_mask
  2022-09-03  7:07               ` Christophe Leroy
@ 2022-09-05  8:37                 ` David Hildenbrand
  -1 siblings, 0 replies; 60+ messages in thread
From: David Hildenbrand @ 2022-09-05  8:37 UTC (permalink / raw)
  To: Christophe Leroy, Mike Kravetz
  Cc: linux-mm, linux-kernel, linux-ia64, Baolin Wang,
	Aneesh Kumar K . V, Naoya Horiguchi, Michael Ellerman,
	Muchun Song, Andrew Morton, linuxppc-dev

On 03.09.22 09:07, Christophe Leroy wrote:
> +Resending with valid powerpc list address
> 
> Le 02/09/2022 à 20:52, David Hildenbrand a écrit :
>>>>> Adding Christophe on Cc:
>>>>>
>>>>> Christophe do you know if is_hugepd is true for all hugetlb entries, not
>>>>> just hugepd?
> 
> is_hugepd() is true if and only if the directory entry points to a huge
> page directory and not to the normal lower level directory.
> 
> As far as I understand if the directory entry is not pointing to any
> lower directory but is a huge page entry, pXd_leaf() is true.
> 
> 
>>>>>
>>>>> On systems without hugepd entries, I guess ptdump skips all hugetlb entries.
>>>>> Sigh!
> 
> As far as I can see, ptdump_pXd_entry() handles the pXd_leaf() case.
> 
>>>>
>>>> IIUC, the idea of ptdump_walk_pgd() is to dump page tables even outside
>>>> VMAs (for debugging purposes?).
>>>>
>>>> I cannot convince myself that that's a good idea when only holding the
>>>> mmap lock in read mode, because we can just see page tables getting
>>>> freed concurrently e.g., during concurrent munmap() ... while holding
>>>> the mmap lock in read we may only walk inside VMA boundaries.
>>>>
>>>> That then raises the questions if we're only calling this on special MMs
>>>> (e.g., init_mm) whereby we cannot really see concurrent munmap() and
>>>> where we shouldn't have hugetlb mappings or hugepd entries.
> 
> At least on powerpc, PTDUMP handles only init_mm.
> 
> Hugepage are used at least on powerpc 8xx for linear memory mapping, see
> 
> commit 34536d780683 ("powerpc/8xx: Add a function to early map kernel
> via huge pages")
> commit cf209951fa7f ("powerpc/8xx: Map linear memory with huge pages")
> 
> hugepds may also be used in the future to use huge pages for vmap and
> vmalloc, see commit a6a8f7c4aa7e ("powerpc/8xx: add support for huge
> pages on VMAP and VMALLOC")
> 
> As far as I know, ppc64 also use huge pages for VMAP and VMALLOC, see
> 
> commit d909f9109c30 ("powerpc/64s/radix: Enable HAVE_ARCH_HUGE_VMAP")
> commit 8abddd968a30 ("powerpc/64s/radix: Enable huge vmalloc mappings")

There is a difference between an ordinary huge mapping (e.g., as used 
for THP) and a hugetlb mapping.

Our current understanding is that hugepd only applies to hugetlb. 
Wouldn't vmap/vmalloc use ordinary huge pmd entries instead of hugepd?

-- 
Thanks,

David / dhildenb


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH] hugetlb: simplify hugetlb handling in follow_page_mask
  2022-09-05  8:37                 ` David Hildenbrand
  (?)
@ 2022-09-05  9:33                   ` Christophe Leroy
  -1 siblings, 0 replies; 60+ messages in thread
From: Christophe Leroy @ 2022-09-05  9:33 UTC (permalink / raw)
  To: David Hildenbrand, Mike Kravetz
  Cc: linux-mm, linux-kernel, linux-ia64, Baolin Wang,
	Aneesh Kumar K . V, Naoya Horiguchi, Michael Ellerman,
	Muchun Song, Andrew Morton, linuxppc-dev



Le 05/09/2022 à 10:37, David Hildenbrand a écrit :
> On 03.09.22 09:07, Christophe Leroy wrote:
>> +Resending with valid powerpc list address
>>
>> Le 02/09/2022 à 20:52, David Hildenbrand a écrit :
>>>>>> Adding Christophe on Cc:
>>>>>>
>>>>>> Christophe do you know if is_hugepd is true for all hugetlb 
>>>>>> entries, not
>>>>>> just hugepd?
>>
>> is_hugepd() is true if and only if the directory entry points to a huge
>> page directory and not to the normal lower level directory.
>>
>> As far as I understand if the directory entry is not pointing to any
>> lower directory but is a huge page entry, pXd_leaf() is true.
>>
>>
>>>>>>
>>>>>> On systems without hugepd entries, I guess ptdump skips all 
>>>>>> hugetlb entries.
>>>>>> Sigh!
>>
>> As far as I can see, ptdump_pXd_entry() handles the pXd_leaf() case.
>>
>>>>>
>>>>> IIUC, the idea of ptdump_walk_pgd() is to dump page tables even 
>>>>> outside
>>>>> VMAs (for debugging purposes?).
>>>>>
>>>>> I cannot convince myself that that's a good idea when only holding the
>>>>> mmap lock in read mode, because we can just see page tables getting
>>>>> freed concurrently e.g., during concurrent munmap() ... while holding
>>>>> the mmap lock in read we may only walk inside VMA boundaries.
>>>>>
>>>>> That then raises the questions if we're only calling this on 
>>>>> special MMs
>>>>> (e.g., init_mm) whereby we cannot really see concurrent munmap() and
>>>>> where we shouldn't have hugetlb mappings or hugepd entries.
>>
>> At least on powerpc, PTDUMP handles only init_mm.
>>
>> Hugepage are used at least on powerpc 8xx for linear memory mapping, see
>>
>> commit 34536d780683 ("powerpc/8xx: Add a function to early map kernel
>> via huge pages")
>> commit cf209951fa7f ("powerpc/8xx: Map linear memory with huge pages")
>>
>> hugepds may also be used in the future to use huge pages for vmap and
>> vmalloc, see commit a6a8f7c4aa7e ("powerpc/8xx: add support for huge
>> pages on VMAP and VMALLOC")
>>
>> As far as I know, ppc64 also use huge pages for VMAP and VMALLOC, see
>>
>> commit d909f9109c30 ("powerpc/64s/radix: Enable HAVE_ARCH_HUGE_VMAP")
>> commit 8abddd968a30 ("powerpc/64s/radix: Enable huge vmalloc mappings")
> 
> There is a difference between an ordinary huge mapping (e.g., as used 
> for THP) and a a hugetlb mapping.
> 
> Our current understanding is that hugepd only applies to hugetlb. 
> Wouldn't vmap/vmalloc user ordinary huge pmd entries instead of hugepd?
> 

'hugepd' stands for huge page directory. It is independent of whether a
huge page is used for hugetlb or for anything else; it represents the
way pages are described in the page tables.

I don't know what you mean by _ordinary_ huge pmd entry.

Let's take the example of powerpc 8xx, which is the one I know best. This
is a powerpc32, so it has two levels: PGD and PTE. The PGD has 1024
entries and each entry covers a 4 Mbyte area. A normal PTE table has
1024 entries and each entry maps a 4k page. When you use 8 Mbyte pages,
you don't use PTEs as it would be a waste of memory. You use a huge page
directory that has a single entry, and you have two PGD entries pointing
to the huge page directory.
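
(Restating the layout above as a rough picture, assuming 4k base pages:)

	one PGD entry = 1024 PTEs x 4k = 4M of address space
	an 8M page therefore spans two PGD slots, say PGD[n] and PGD[n+1]:

	    PGD[n]   --> hugepd --+--> a single huge PTE describing
	    PGD[n+1] --> hugepd --+    the whole 8M page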

Some time ago, hugepd was also used for 512k pages and 16k pages:
- there were huge page directories with 8x 512k pages,
- there were huge page directories with 256x 16k pages,

And the PGD/PMD entry points to a huge page directory (HUGEPD) instead
of pointing to a page table directory (PTE).

Since commit b250c8c08c79 ("powerpc/8xx: Manage 512k huge pages as
standard pages."), the 8xx no longer uses hugepd for 512k huge pages,
but other platforms like powerpc book3e extensively use huge page
directories.
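
(Purely as an illustration of what a walker sees at such an entry --
the real helpers live in arch/powerpc and mm/gup.c, and
huge_pte_from_hugepd() is just a name I made up for this sketch:)

	static pte_t *huge_pte_from_hugepd(pgd_t *pgdp, unsigned long addr)
	{
		pgd_t pgd = READ_ONCE(*pgdp);
		hugepd_t hpd;

		if (!is_hugepd(__hugepd(pgd_val(pgd))))
			return NULL;	/* normal page table or leaf entry */

		hpd = __hugepd(pgd_val(pgd));
		/* hugepd_shift(hpd) gives the page shift, e.g. 23 for 8M */
		return hugepte_offset(hpd, addr, PGDIR_SHIFT);
	}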

I hope this clarifies the subject; otherwise I'm happy to provide
further details.

Christophe

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH] hugetlb: simplify hugetlb handling in follow_page_mask
  2022-09-05  9:33                   ` Christophe Leroy
  (?)
@ 2022-09-05  9:46                     ` David Hildenbrand
  -1 siblings, 0 replies; 60+ messages in thread
From: David Hildenbrand @ 2022-09-05  9:46 UTC (permalink / raw)
  To: Christophe Leroy, Mike Kravetz
  Cc: linux-mm, linux-kernel, linux-ia64, Baolin Wang,
	Aneesh Kumar K . V, Naoya Horiguchi, Michael Ellerman,
	Muchun Song, Andrew Morton, linuxppc-dev

On 05.09.22 11:33, Christophe Leroy wrote:
> 
> 
> Le 05/09/2022 à 10:37, David Hildenbrand a écrit :
>> On 03.09.22 09:07, Christophe Leroy wrote:
>>> +Resending with valid powerpc list address
>>>
>>> Le 02/09/2022 à 20:52, David Hildenbrand a écrit :
>>>>>>> Adding Christophe on Cc:
>>>>>>>
>>>>>>> Christophe do you know if is_hugepd is true for all hugetlb
>>>>>>> entries, not
>>>>>>> just hugepd?
>>>
>>> is_hugepd() is true if and only if the directory entry points to a huge
>>> page directory and not to the normal lower level directory.
>>>
>>> As far as I understand if the directory entry is not pointing to any
>>> lower directory but is a huge page entry, pXd_leaf() is true.
>>>
>>>
>>>>>>>
>>>>>>> On systems without hugepd entries, I guess ptdump skips all
>>>>>>> hugetlb entries.
>>>>>>> Sigh!
>>>
>>> As far as I can see, ptdump_pXd_entry() handles the pXd_leaf() case.
>>>
>>>>>>
>>>>>> IIUC, the idea of ptdump_walk_pgd() is to dump page tables even
>>>>>> outside
>>>>>> VMAs (for debugging purposes?).
>>>>>>
>>>>>> I cannot convince myself that that's a good idea when only holding the
>>>>>> mmap lock in read mode, because we can just see page tables getting
>>>>>> freed concurrently e.g., during concurrent munmap() ... while holding
>>>>>> the mmap lock in read we may only walk inside VMA boundaries.
>>>>>>
>>>>>> That then raises the questions if we're only calling this on
>>>>>> special MMs
>>>>>> (e.g., init_mm) whereby we cannot really see concurrent munmap() and
>>>>>> where we shouldn't have hugetlb mappings or hugepd entries.
>>>
>>> At least on powerpc, PTDUMP handles only init_mm.
>>>
>>> Hugepage are used at least on powerpc 8xx for linear memory mapping, see
>>>
>>> commit 34536d780683 ("powerpc/8xx: Add a function to early map kernel
>>> via huge pages")
>>> commit cf209951fa7f ("powerpc/8xx: Map linear memory with huge pages")
>>>
>>> hugepds may also be used in the future to use huge pages for vmap and
>>> vmalloc, see commit a6a8f7c4aa7e ("powerpc/8xx: add support for huge
>>> pages on VMAP and VMALLOC")
>>>
>>> As far as I know, ppc64 also use huge pages for VMAP and VMALLOC, see
>>>
>>> commit d909f9109c30 ("powerpc/64s/radix: Enable HAVE_ARCH_HUGE_VMAP")
>>> commit 8abddd968a30 ("powerpc/64s/radix: Enable huge vmalloc mappings")
>>
>> There is a difference between an ordinary huge mapping (e.g., as used
>> for THP) and a a hugetlb mapping.
>>
>> Our current understanding is that hugepd only applies to hugetlb.
>> Wouldn't vmap/vmalloc user ordinary huge pmd entries instead of hugepd?
>>
> 
> 'hugepd' stands for huge page directory. It is independant of whether a
> huge page is used for hugetlb or for anything else, it represents the
> way pages are described in the page tables.

This patch here makes the assumption that hugepd only applies to
hugetlb, because it removes any such handling from the !hugetlb path in
GUP. Is that incorrect, or are there valid cases where that could happen?
(init_mm is special in that regard; I don't think it interacts with GUP
at all.)
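
(In other words, after this patch the top of the GUP slow path is
roughly shaped like the sketch below -- illustrative only, not the
literal diff; the new hugetlb-only helper does the hugetlb walk itself:)

	if (is_vm_hugetlb_page(vma))
		/* every hugetlb entry, hugepd included, is handled here */
		return hugetlb_follow_page_mask(vma, address, flags);

	/* ... the generic pgd/p4d/pud/pmd walk continues below, now
	 * without the is_hugepd()/pXd_huge() special cases ... */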

> 
> I don't know what you mean by _ordinary_ huge pmd entry.
> 

Essentially, what we use for THP. Let me try to understand how hugepd
interacts with the rest of the system.

Do systems that support hugepd currently implement THP? Reading about
32-bit systems below, I assume not?

> Let's take the exemple of powerpc 8xx which is the one I know best. This
> is a powerpc32, so it has two levels : PGD and PTE. PGD has 1024 entries
> and each entry covers a 4Mbytes area. Normal PTE has 1024 entries and
> each entry is a 4k page. When you use 8Mbytes pages, you don't use PTEs
> as it would be a waste of memory. You use a huge page directory that has
> a single entry, and you have two PGD entries pointing to the huge page
> directory.

Thanks, I assume there are no 8MB THP, correct?

The 8MB example with 4MB PGD entries makes it sound a bit like the
cont-PTE/cont-PMD handling on aarch64: they don't use a hugepd but
would simply let two consecutive PGD entries point at the relevant
(sub) parts of the hugetlb page. No hugepd involved.
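
(Schematically, what I have in mind -- just a rough picture, no hugepd,
each PGD slot is itself a leaf:)

	PGD[n]   --> leaf entry mapping the first 4M of the 8M page
	PGD[n+1] --> leaf entry mapping the second 4M of the 8M page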

> 
> Some time ago, hupgepd was also used for 512kbytes pages and 16kbytes
> pages:
> - there was huge page directories with 8x 512kbytes pages,
> - there was huge page directories with 256x 16kbytes pages,
> 
> And the PGD/PMD entry points to a huge page directory (HUGEPD) instead
> of pointing to a page table directory (PTE).

Thanks for the example.

> 
> Since commit b250c8c08c79 ("powerpc/8xx: Manage 512k huge pages as
> standard pages."), the 8xx doesn't use anymore hugepd for 512k huge
> page, but other platforms like powerpc book3e extensively use huge page
> directories.
> 
> I hope this clarifies the subject, otherwise I'm happy to provide
> further details.

Thanks, it would be valuable to know if the assumption in this patch is 
correct: hugepd will only be found in hugetlb areas in ordinary MMs (not 
init_mm).

-- 
Thanks,

David / dhildenb


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH] hugetlb: simplify hugetlb handling in follow_page_mask
  2022-09-05  9:46                     ` David Hildenbrand
  (?)
@ 2022-09-05 16:05                       ` Christophe Leroy
  -1 siblings, 0 replies; 60+ messages in thread
From: Christophe Leroy @ 2022-09-05 16:05 UTC (permalink / raw)
  To: David Hildenbrand, Mike Kravetz
  Cc: linux-mm, linux-kernel, linux-ia64, Baolin Wang,
	Aneesh Kumar K . V, Naoya Horiguchi, Michael Ellerman,
	Muchun Song, Andrew Morton, linuxppc-dev



Le 05/09/2022 à 11:46, David Hildenbrand a écrit :
> On 05.09.22 11:33, Christophe Leroy wrote:
>>
>>
>> Le 05/09/2022 à 10:37, David Hildenbrand a écrit :
>>> On 03.09.22 09:07, Christophe Leroy wrote:
>>>> +Resending with valid powerpc list address
>>>>
>>>> Le 02/09/2022 à 20:52, David Hildenbrand a écrit :
>>>>>>>> Adding Christophe on Cc:
>>>>>>>>
>>>>>>>> Christophe do you know if is_hugepd is true for all hugetlb
>>>>>>>> entries, not
>>>>>>>> just hugepd?
>>>>
>>>> is_hugepd() is true if and only if the directory entry points to a huge
>>>> page directory and not to the normal lower level directory.
>>>>
>>>> As far as I understand if the directory entry is not pointing to any
>>>> lower directory but is a huge page entry, pXd_leaf() is true.
>>>>
>>>>
>>>>>>>>
>>>>>>>> On systems without hugepd entries, I guess ptdump skips all
>>>>>>>> hugetlb entries.
>>>>>>>> Sigh!
>>>>
>>>> As far as I can see, ptdump_pXd_entry() handles the pXd_leaf() case.
>>>>
>>>>>>>
>>>>>>> IIUC, the idea of ptdump_walk_pgd() is to dump page tables even
>>>>>>> outside
>>>>>>> VMAs (for debugging purposes?).
>>>>>>>
>>>>>>> I cannot convince myself that that's a good idea when only 
>>>>>>> holding the
>>>>>>> mmap lock in read mode, because we can just see page tables getting
>>>>>>> freed concurrently e.g., during concurrent munmap() ... while 
>>>>>>> holding
>>>>>>> the mmap lock in read we may only walk inside VMA boundaries.
>>>>>>>
>>>>>>> That then raises the questions if we're only calling this on
>>>>>>> special MMs
>>>>>>> (e.g., init_mm) whereby we cannot really see concurrent munmap() and
>>>>>>> where we shouldn't have hugetlb mappings or hugepd entries.
>>>>
>>>> At least on powerpc, PTDUMP handles only init_mm.
>>>>
>>>> Hugepage are used at least on powerpc 8xx for linear memory mapping, 
>>>> see
>>>>
>>>> commit 34536d780683 ("powerpc/8xx: Add a function to early map kernel
>>>> via huge pages")
>>>> commit cf209951fa7f ("powerpc/8xx: Map linear memory with huge pages")
>>>>
>>>> hugepds may also be used in the future to use huge pages for vmap and
>>>> vmalloc, see commit a6a8f7c4aa7e ("powerpc/8xx: add support for huge
>>>> pages on VMAP and VMALLOC")
>>>>
>>>> As far as I know, ppc64 also use huge pages for VMAP and VMALLOC, see
>>>>
>>>> commit d909f9109c30 ("powerpc/64s/radix: Enable HAVE_ARCH_HUGE_VMAP")
>>>> commit 8abddd968a30 ("powerpc/64s/radix: Enable huge vmalloc mappings")
>>>
>>> There is a difference between an ordinary huge mapping (e.g., as used
>>> for THP) and a a hugetlb mapping.
>>>
>>> Our current understanding is that hugepd only applies to hugetlb.
>>> Wouldn't vmap/vmalloc user ordinary huge pmd entries instead of hugepd?
>>>
>>
>> 'hugepd' stands for huge page directory. It is independant of whether a
>> huge page is used for hugetlb or for anything else, it represents the
>> way pages are described in the page tables.
> 
> This patch here makes the assumption that hugepd only applies to 
> hugetlb, because it removes any such handling from the !hugetlb path in 
> GUP. Is that incorrect or are there valid cases where that could happen? 
> (init_mm is special in that regard, i don't think it interacts with GUP 
> at all).

You are correct, I think: for user pages, hugepd only applies to hugetlb.

> 
>>
>> I don't know what you mean by _ordinary_ huge pmd entry.
>>
> 
> Essentially, what we use for THP. Let me try to understand how hugepd 
> interact with the rest of the system.
> 
> Do systems that support hugepd currently implement THP? Reading above 
> 32bit systems below, I assume not?

Right, as far as I understand, only leaf huge pages are handled by THP.

> 
>> Let's take the exemple of powerpc 8xx which is the one I know best. This
>> is a powerpc32, so it has two levels : PGD and PTE. PGD has 1024 entries
>> and each entry covers a 4Mbytes area. Normal PTE has 1024 entries and
>> each entry is a 4k page. When you use 8Mbytes pages, you don't use PTEs
>> as it would be a waste of memory. You use a huge page directory that has
>> a single entry, and you have two PGD entries pointing to the huge page
>> directory.
> 
> Thanks, I assume there are no 8MB THP, correct?

Correct.

> 
> The 8MB example with 4MB PGD entries makes it sound a bit like the 
> cont-PTE/cont-PMD handling on aarch64: they don't use a hugepd but would 
> simply let two consecutive PGD entries point at the the relevant (sub) 
> parts of the hugetlb page. No hugepd involved.

Yes, it is my feeling as well.

Although in the case of the powerpc 8xx we really need a PGD entry + a
page entry in order to use the hardware-assisted page table walk and
also to populate L1 and L2 TLB entries without too much processing in
the TLB-miss interrupt handler.

> 
>>
>> Some time ago, hupgepd was also used for 512kbytes pages and 16kbytes
>> pages:
>> - there was huge page directories with 8x 512kbytes pages,
>> - there was huge page directories with 256x 16kbytes pages,
>>
>> And the PGD/PMD entry points to a huge page directory (HUGEPD) instead
>> of pointing to a page table directory (PTE).
> 
> Thanks for the example.
> 
>>
>> Since commit b250c8c08c79 ("powerpc/8xx: Manage 512k huge pages as
>> standard pages."), the 8xx doesn't use anymore hugepd for 512k huge
>> page, but other platforms like powerpc book3e extensively use huge page
>> directories.
>>
>> I hope this clarifies the subject, otherwise I'm happy to provide
>> further details.
> 
> Thanks, it would be valuable to know if the assumption in this patch is 
> correct: hugepd will only be found in hugetlb areas in ordinary MMs (not 
> init_mm).
> 

Yes, I think the assumption is correct for user pages, hence for GUP.

By the way, the discussion started with PTDUMP. For PTDUMP we need huge
page directories to be taken into account, and also for anything that
involves kernel pages like VMAP or VMALLOC.
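
(For instance -- pseudocode only, in the pXd notation used above, not
the actual mm/ptdump.c code -- a kernel-side walker needs both checks
at each level:)

	if (pXd_leaf(entry))
		note_leaf_page(entry);		/* handled today */
	else if (is_hugepd(__hugepd(pXd_val(entry))))
		note_hugepd(entry);		/* still needed for kernel
						 * mappings on powerpc */
	else
		walk_lower_level(entry);

note_leaf_page(), note_hugepd() and walk_lower_level() are of course
made-up placeholders, not real functions.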

Christophe

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH] hugetlb: simplify hugetlb handling in follow_page_mask
  2022-09-05 16:05                       ` Christophe Leroy
  (?)
@ 2022-09-05 16:09                         ` David Hildenbrand
  -1 siblings, 0 replies; 60+ messages in thread
From: David Hildenbrand @ 2022-09-05 16:09 UTC (permalink / raw)
  To: Christophe Leroy, Mike Kravetz
  Cc: linux-mm, linux-kernel, linux-ia64, Baolin Wang,
	Aneesh Kumar K . V, Naoya Horiguchi, Michael Ellerman,
	Muchun Song, Andrew Morton, linuxppc-dev

> Yes I think the assumption is correct for user pages hence for GUP.
> 
> By the way the discussion started with PTDUMP. For PTDUMP we need huge
> page directories to be taken into account. And for anything that
> involves kernel pages like VMAP or VMALLOC.

Yes, makes perfect sense to me now that you explained how/where hugepd 
is actually used -- thanks!

-- 
Thanks,

David / dhildenb


^ permalink raw reply	[flat|nested] 60+ messages in thread

end of thread, other threads:[~2022-09-05 16:09 UTC | newest]

Thread overview: 60+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-08-29 23:40 [PATCH] hugetlb: simplify hugetlb handling in follow_page_mask Mike Kravetz
2022-08-29 23:40 ` Mike Kravetz
2022-08-30  1:06 ` Baolin Wang
2022-08-30  1:06   ` Baolin Wang
2022-08-30 16:44   ` Mike Kravetz
2022-08-30 16:44     ` Mike Kravetz
2022-08-30 18:39     ` Mike Kravetz
2022-08-30 18:39       ` Mike Kravetz
2022-08-31  1:07       ` Baolin Wang
2022-08-31  1:07         ` Baolin Wang
2022-08-31 23:56         ` Mike Kravetz
2022-09-01  0:00           ` Mike Kravetz
2022-09-01  1:24           ` Baolin Wang
2022-09-01  1:24             ` Baolin Wang
2022-09-01  6:59             ` David Hildenbrand
2022-09-01  6:59               ` David Hildenbrand
2022-09-01 10:40               ` Baolin Wang
2022-09-01 10:40                 ` Baolin Wang
2022-08-30  8:11 ` David Hildenbrand
2022-08-30  8:11   ` David Hildenbrand
2022-08-30 16:52   ` Mike Kravetz
2022-08-30 16:52     ` Mike Kravetz
2022-08-30 21:31     ` Mike Kravetz
2022-08-30 21:31       ` Mike Kravetz
2022-08-31  8:07       ` David Hildenbrand
2022-08-31  8:07         ` David Hildenbrand
2022-09-02 18:50         ` Mike Kravetz
2022-09-02 18:50           ` Mike Kravetz
2022-09-02 18:52           ` David Hildenbrand
2022-09-02 18:52             ` David Hildenbrand
2022-09-03  6:59             ` Christophe Leroy
2022-09-03  6:59               ` Christophe Leroy
2022-09-03  7:07             ` Christophe Leroy
2022-09-03  7:07               ` Christophe Leroy
2022-09-03  7:07               ` Christophe Leroy
2022-09-04 11:49               ` Michael Ellerman
2022-09-04 11:49                 ` Michael Ellerman
2022-09-04 11:49                 ` Michael Ellerman
2022-09-05  8:37               ` David Hildenbrand
2022-09-05  8:37                 ` David Hildenbrand
2022-09-05  8:37                 ` David Hildenbrand
2022-09-05  9:33                 ` Christophe Leroy
2022-09-05  9:33                   ` Christophe Leroy
2022-09-05  9:33                   ` Christophe Leroy
2022-09-05  9:46                   ` David Hildenbrand
2022-09-05  9:46                     ` David Hildenbrand
2022-09-05  9:46                     ` David Hildenbrand
2022-09-05 16:05                     ` Christophe Leroy
2022-09-05 16:05                       ` Christophe Leroy
2022-09-05 16:05                       ` Christophe Leroy
2022-09-05 16:09                       ` David Hildenbrand
2022-09-05 16:09                         ` David Hildenbrand
2022-09-05 16:09                         ` David Hildenbrand
2022-08-31  5:08 ` kernel test robot
2022-08-31  5:08   ` kernel test robot
2022-08-31 20:42   ` Mike Kravetz
2022-08-31 20:42     ` Mike Kravetz
2022-08-31 20:42     ` Mike Kravetz
2022-09-01 16:19 ` Mike Kravetz
2022-09-01 16:19   ` Mike Kravetz
