On 9 Sep 2020, at 10:09, Kirill A. Shutemov wrote:

> On Wed, Sep 02, 2020 at 02:06:17PM -0400, Zi Yan wrote:
>> From: Zi Yan
>>
>> Add PUD-level TLB flush ops and teach page_vma_mapped_walk about 1GB
>> THPs.
>>
>> Signed-off-by: Zi Yan
>> ---
>>  arch/x86/include/asm/pgtable.h |  3 +++
>>  arch/x86/mm/pgtable.c          | 13 +++++++++++++
>>  include/linux/mmu_notifier.h   | 13 +++++++++++++
>>  include/linux/pgtable.h        | 14 ++++++++++++++
>>  include/linux/rmap.h           |  1 +
>>  mm/page_vma_mapped.c           | 33 +++++++++++++++++++++++++++++----
>>  mm/rmap.c                      | 12 +++++++++---
>>  7 files changed, 82 insertions(+), 7 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
>> index 26255cac78c0..15334f5ba172 100644
>> --- a/arch/x86/include/asm/pgtable.h
>> +++ b/arch/x86/include/asm/pgtable.h
>> @@ -1127,6 +1127,9 @@ extern int pudp_test_and_clear_young(struct vm_area_struct *vma,
>>  extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
>>  				unsigned long address, pmd_t *pmdp);
>>
>> +#define __HAVE_ARCH_PUDP_CLEAR_YOUNG_FLUSH
>> +extern int pudp_clear_flush_young(struct vm_area_struct *vma,
>> +				unsigned long address, pud_t *pudp);
>>
>>  #define pmd_write pmd_write
>>  static inline int pmd_write(pmd_t pmd)
>> diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
>> index 7be73aee6183..e4a2dffcc418 100644
>> --- a/arch/x86/mm/pgtable.c
>> +++ b/arch/x86/mm/pgtable.c
>> @@ -633,6 +633,19 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
>>
>>  	return young;
>>  }
>> +int pudp_clear_flush_young(struct vm_area_struct *vma,
>> +			   unsigned long address, pud_t *pudp)
>> +{
>> +	int young;
>> +
>> +	VM_BUG_ON(address & ~HPAGE_PUD_MASK);
>> +
>> +	young = pudp_test_and_clear_young(vma, address, pudp);
>> +	if (young)
>> +		flush_tlb_range(vma, address, address + HPAGE_PUD_SIZE);
>> +
>> +	return young;
>> +}
>>  #endif
>>
>>  /**
>> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
>> index b8200782dede..4ffa179e654f 100644
>> --- a/include/linux/mmu_notifier.h
>> +++ b/include/linux/mmu_notifier.h
>> @@ -557,6 +557,19 @@ static inline void mmu_notifier_range_init_migrate(
>>  	__young; \
>>  })
>>
>> +#define pudp_clear_flush_young_notify(__vma, __address, __pudp) \
>> +({ \
>> +	int __young; \
>> +	struct vm_area_struct *___vma = __vma; \
>> +	unsigned long ___address = __address; \
>> +	__young = pudp_clear_flush_young(___vma, ___address, __pudp); \
>> +	__young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \
>> +						  ___address, \
>> +						  ___address + \
>> +						  PUD_SIZE); \
>> +	__young; \
>> +})
>> +
>>  #define ptep_clear_young_notify(__vma, __address, __ptep) \
>>  ({ \
>>  	int __young; \
>> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
>> index 255275d5b73e..8ef358c386af 100644
>> --- a/include/linux/pgtable.h
>> +++ b/include/linux/pgtable.h
>> @@ -240,6 +240,20 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma,
>>  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>>  #endif
>>
>> +#ifndef __HAVE_ARCH_PUDP_CLEAR_YOUNG_FLUSH
>> +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
>> +extern int pudp_clear_flush_young(struct vm_area_struct *vma,
>> +				  unsigned long address, pud_t *pudp);
>> +#else
>> +int pudp_clear_flush_young(struct vm_area_struct *vma,
>> +			   unsigned long address, pud_t *pudp)
>> +{
>> +	BUILD_BUG();
>> +	return 0;
>> +}
>> +#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
>> +#endif
>> +
>>  #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
>>  static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
>>  				       unsigned long address,
>> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
>> index 3a6adfa70fb0..0af61dd193d2 100644
>> --- a/include/linux/rmap.h
>> +++ b/include/linux/rmap.h
>> @@ -206,6 +206,7 @@ struct page_vma_mapped_walk {
>>  	struct page *page;
>>  	struct vm_area_struct *vma;
>>  	unsigned long address;
>> +	pud_t *pud;
>>  	pmd_t *pmd;
>>  	pte_t *pte;
>>  	spinlock_t *ptl;
>> diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
>> index 5e77b269c330..d9d39ec06e21 100644
>> --- a/mm/page_vma_mapped.c
>> +++ b/mm/page_vma_mapped.c
>> @@ -145,9 +145,12 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>>  	struct page *page = pvmw->page;
>>  	pgd_t *pgd;
>>  	p4d_t *p4d;
>> -	pud_t *pud;
>> +	pud_t pude;
>>  	pmd_t pmde;
>>
>> +	if (!pvmw->pte && !pvmw->pmd && pvmw->pud)
>> +		return not_found(pvmw);
>> +
>>  	/* The only possible pmd mapping has been handled on last iteration */
>>  	if (pvmw->pmd && !pvmw->pte)
>>  		return not_found(pvmw);
>> @@ -174,10 +177,31 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>>  	p4d = p4d_offset(pgd, pvmw->address);
>>  	if (!p4d_present(*p4d))
>>  		return false;
>> -	pud = pud_offset(p4d, pvmw->address);
>> -	if (!pud_present(*pud))
>> +	pvmw->pud = pud_offset(p4d, pvmw->address);
>> +
>> +	/*
>> +	 * Make sure the pud value isn't cached in a register by the
>> +	 * compiler and used as a stale value after we've observed a
>> +	 * subsequent update.
>> +	 */
>> +	pude = READ_ONCE(*pvmw->pud);
>> +	if (pud_trans_huge(pude)) {
>> +		pvmw->ptl = pud_lock(mm, pvmw->pud);
>> +		if (likely(pud_trans_huge(*pvmw->pud))) {
>> +			if (pvmw->flags & PVMW_MIGRATION)
>> +				return not_found(pvmw);
>> +			if (pud_page(*pvmw->pud) != page)
>> +				return not_found(pvmw);
>> +			return true;
>> +		} else {
>> +			/* THP pud was split under us: handle on pmd level */
>> +			spin_unlock(pvmw->ptl);
>> +			pvmw->ptl = NULL;
>
> Hm. What makes you sure the pmd table is established here?
>
> I have not looked at PUD THP handling of MADV_DONTNEED yet, but for PMD
> THP the pmd can become pmd_none() at any point (unless the ptl is locked).

You are right. I need to check pud_present() here and only go to the pmd
level when the pud is present; otherwise just return not_found. Thanks.
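Something along these lines is what I have in mind (a rough, untested sketch
that only rearranges the branch quoted above; no new helpers):

	pude = READ_ONCE(*pvmw->pud);
	if (pud_trans_huge(pude)) {
		pvmw->ptl = pud_lock(mm, pvmw->pud);
		if (likely(pud_trans_huge(*pvmw->pud))) {
			if (pvmw->flags & PVMW_MIGRATION)
				return not_found(pvmw);
			if (pud_page(*pvmw->pud) != page)
				return not_found(pvmw);
			return true;
		} else if (!pud_present(*pvmw->pud)) {
			/* The pud was zapped under us (e.g. MADV_DONTNEED) */
			return not_found(pvmw);
		} else {
			/* THP pud was split under us: handle on pmd level */
			spin_unlock(pvmw->ptl);
			pvmw->ptl = NULL;
		}
	} else if (!pud_present(pude))
		return false;

not_found() drops the pud ptl via page_vma_mapped_walk_done(), so the zapped
case bails out cleanly instead of walking a pmd table that may no longer exist.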
>
>> +		}
>> +	} else if (!pud_present(pude))
>>  		return false;
>> -	pvmw->pmd = pmd_offset(pud, pvmw->address);
>> +
>> +	pvmw->pmd = pmd_offset(pvmw->pud, pvmw->address);
>>  	/*
>>  	 * Make sure the pmd value isn't cached in a register by the
>>  	 * compiler and used as a stale value after we've observed a
>> @@ -213,6 +237,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>>  	} else if (!pmd_present(pmde)) {
>>  		return false;
>>  	}
>> +
>>  	if (!map_pte(pvmw))
>>  		goto next_pte;
>>  	while (1) {
>> diff --git a/mm/rmap.c b/mm/rmap.c
>
> Why?
>

The extra newline? Will remove it.

>> index 10195a2421cf..77cec0658b76 100644
>> --- a/mm/rmap.c
>> +++ b/mm/rmap.c
>> @@ -803,9 +803,15 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
>>  				referenced++;
>>  		}
>>  	} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
>> -		if (pmdp_clear_flush_young_notify(vma, address,
>> -					pvmw.pmd))
>> -			referenced++;
>> +		if (pvmw.pmd) {
>> +			if (pmdp_clear_flush_young_notify(vma, address,
>> +						pvmw.pmd))
>> +				referenced++;
>> +		} else if (pvmw.pud) {
>> +			if (pudp_clear_flush_young_notify(vma, address,
>> +						pvmw.pud))
>> +				referenced++;
>> +		}
>>  	} else {
>>  		/* unexpected pmd-mapped page? */
>>  		WARN_ON_ONCE(1);
>> --
>> 2.28.0
>>
>>
>
> --
>  Kirill A. Shutemov

--
Best Regards,
Yan Zi