Kirill A. Shutemov wrote: > On Mon, Mar 13, 2017 at 11:45:02AM -0400, Zi Yan wrote: >> From: Naoya Horiguchi >> >> If one of the callers of page migration starts to handle thp, >> memory management code starts to see pmd migration entries, so we need >> to prepare for it before enabling. This patch changes various code >> points which check the status of given pmds in order to prevent races >> between thp migration and the pmd-related works. >> >> ChangeLog v1 -> v2: >> - introduce pmd_related() (I know the naming is not good, but can't >> think of a better name. Any suggestion is welcomed.) >> >> Signed-off-by: Naoya Horiguchi >> >> ChangeLog v2 -> v3: >> - add is_swap_pmd() >> - a pmd entry should be pmd pointing to pte pages, is_swap_pmd(), >> pmd_trans_huge(), pmd_devmap(), or pmd_none() >> - use pmdp_huge_clear_flush() instead of pmdp_huge_get_and_clear() >> - flush_cache_range() while set_pmd_migration_entry() >> - pmd_none_or_trans_huge_or_clear_bad() and pmd_trans_unstable() return >> true on pmd_migration_entry, so that migration entries are not >> treated as pmd page table entries. 
>> >> Signed-off-by: Zi Yan >> --- >> arch/x86/mm/gup.c | 4 +-- >> fs/proc/task_mmu.c | 22 +++++++++------ >> include/asm-generic/pgtable.h | 3 +- >> include/linux/huge_mm.h | 14 +++++++-- >> mm/gup.c | 22 +++++++++++++-- >> mm/huge_memory.c | 66 ++++++++++++++++++++++++++++++++++++++----- >> mm/madvise.c | 2 ++ >> mm/memcontrol.c | 2 ++ >> mm/memory.c | 9 ++++-- >> mm/mprotect.c | 6 ++-- >> mm/mremap.c | 2 +- >> 11 files changed, 124 insertions(+), 28 deletions(-) >> >> diff --git a/mm/gup.c b/mm/gup.c >> index 94fab8fa432b..2b1effb16242 100644 >> --- a/mm/gup.c >> +++ b/mm/gup.c >> @@ -272,6 +272,15 @@ struct page *follow_page_mask(struct vm_area_struct *vma, >> return page; >> return no_page_table(vma, flags); >> } >> + if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) >> + return no_page_table(vma, flags); >> + if (!pmd_present(*pmd)) { >> +retry: >> + if (likely(!(flags & FOLL_MIGRATION))) >> + return no_page_table(vma, flags); >> + pmd_migration_entry_wait(mm, pmd); >> + goto retry; > > This looks a lot like an endless loop if flags contain FOLL_MIGRATION. Hm? > > I guess the retry label should be on the previous line. You are right. It should be: + if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) + return no_page_table(vma, flags); +retry: + if (!pmd_present(*pmd)) { + if (likely(!(flags & FOLL_MIGRATION))) + return no_page_table(vma, flags); + pmd_migration_entry_wait(mm, pmd); + goto retry; > >> + } >> if (pmd_devmap(*pmd)) { >> ptl = pmd_lock(mm, pmd); >> page = follow_devmap_pmd(vma, address, pmd, flags); >> @@ -286,6 +295,15 @@ struct page *follow_page_mask(struct vm_area_struct *vma, >> return no_page_table(vma, flags); >> >> ptl = pmd_lock(mm, pmd); >> + if (unlikely(!pmd_present(*pmd))) { >> +retry_locked: >> + if (likely(!(flags & FOLL_MIGRATION))) { >> + spin_unlock(ptl); >> + return no_page_table(vma, flags); >> + } >> + pmd_migration_entry_wait(mm, pmd); >> + goto retry_locked; > > Again. That doesn't look right.. 
It will be changed: ptl = pmd_lock(mm, pmd); +retry_locked: + if (unlikely(!pmd_present(*pmd))) { + if (likely(!(flags & FOLL_MIGRATION))) { + spin_unlock(ptl); + return no_page_table(vma, flags); + } + pmd_migration_entry_wait(mm, pmd); + goto retry_locked; > >> + } >> if (unlikely(!pmd_trans_huge(*pmd))) { >> spin_unlock(ptl); >> return follow_page_pte(vma, address, pmd, flags); >> @@ -341,7 +359,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, >> pud = pud_offset(pgd, address); >> BUG_ON(pud_none(*pud)); >> pmd = pmd_offset(pud, address); >> - if (pmd_none(*pmd)) >> + if (!pmd_present(*pmd)) >> return -EFAULT; >> VM_BUG_ON(pmd_trans_huge(*pmd)); >> pte = pte_offset_map(pmd, address); >> @@ -1369,7 +1387,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, >> pmd_t pmd = READ_ONCE(*pmdp); >> >> next = pmd_addr_end(addr, end); >> - if (pmd_none(pmd)) >> + if (!pmd_present(pmd)) >> return 0; >> >> if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) { >> diff --git a/mm/huge_memory.c b/mm/huge_memory.c >> index a9c2a0ef5b9b..3f18452f3eb1 100644 >> --- a/mm/huge_memory.c >> +++ b/mm/huge_memory.c >> @@ -898,6 +898,21 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, >> >> ret = -EAGAIN; >> pmd = *src_pmd; >> + >> + if (unlikely(is_pmd_migration_entry(pmd))) { > > Shouldn't you first check that the pmd is not present? is_pmd_migration_entry() checks !pmd_present(). 
in linux/swapops.h, is_pmd_migration_entry is defined as: static inline int is_pmd_migration_entry(pmd_t pmd) { return !pmd_present(pmd) && is_migration_entry(pmd_to_swp_entry(pmd)); } > >> + swp_entry_t entry = pmd_to_swp_entry(pmd); >> + >> + if (is_write_migration_entry(entry)) { >> + make_migration_entry_read(&entry); >> + pmd = swp_entry_to_pmd(entry); >> + set_pmd_at(src_mm, addr, src_pmd, pmd); >> + } >> + set_pmd_at(dst_mm, addr, dst_pmd, pmd); >> + ret = 0; >> + goto out_unlock; >> + } >> + WARN_ONCE(!pmd_present(pmd), "Uknown non-present format on pmd.\n"); > > Typo. Got it. > >> + >> if (unlikely(!pmd_trans_huge(pmd))) { >> pte_free(dst_mm, pgtable); >> goto out_unlock; >> @@ -1204,6 +1219,9 @@ int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) >> if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) >> goto out_unlock; >> >> + if (unlikely(!pmd_present(orig_pmd))) >> + goto out_unlock; >> + >> page = pmd_page(orig_pmd); >> VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page); >> /* >> @@ -1338,7 +1356,15 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, >> if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) >> goto out; >> >> - page = pmd_page(*pmd); >> + if (is_pmd_migration_entry(*pmd)) { > > Again, I don't think it's safe to check if pmd is migration entry > before checking if it's present. > >> + swp_entry_t entry; >> + >> + entry = pmd_to_swp_entry(*pmd); >> + page = pfn_to_page(swp_offset(entry)); >> + if (!is_migration_entry(entry)) >> + goto out; > > I don't understand how it is supposed to work. > You take swp_offset() of entry before checking if it's migration entry. > What's going on? This chunk of change inside follow_trans_huge_pmd() is not needed. Because two callers, smaps_pmd_entry() and follow_page_mask(), guarantee that the pmd points to a present entry. I will drop this chunk in the next version. 
> >> + } else >> + page = pmd_page(*pmd); >> VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page); >> if (flags & FOLL_TOUCH) >> touch_pmd(vma, addr, pmd); >> @@ -1534,6 +1560,9 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, >> if (is_huge_zero_pmd(orig_pmd)) >> goto out; >> >> + if (unlikely(!pmd_present(orig_pmd))) >> + goto out; >> + >> page = pmd_page(orig_pmd); >> /* >> * If other processes are mapping this page, we couldn't discard >> @@ -1766,6 +1795,20 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, >> if (prot_numa && pmd_protnone(*pmd)) >> goto unlock; >> >> + if (is_pmd_migration_entry(*pmd)) { >> + swp_entry_t entry = pmd_to_swp_entry(*pmd); >> + >> + if (is_write_migration_entry(entry)) { >> + pmd_t newpmd; >> + >> + make_migration_entry_read(&entry); >> + newpmd = swp_entry_to_pmd(entry); >> + set_pmd_at(mm, addr, pmd, newpmd); >> + } >> + goto unlock; >> + } else if (!pmd_present(*pmd)) >> + WARN_ONCE(1, "Uknown non-present format on pmd.\n"); > > Another typo. Got it. Thanks for all your comments. -- Best Regards, Yan Zi