diff --cc include/linux/mm.h index 49692a64d645,68e68d37a3d0..000000000000 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@@ -814,8 -829,15 +825,19 @@@ static inline int page_mapcount(struct return atomic_read(&page->_mapcount) + 1; } + int folio_mapcount(struct folio *folio); + #ifdef CONFIG_TRANSPARENT_HUGEPAGE ++<<<<<<< HEAD +int total_mapcount(struct page *page); ++======= + static inline int total_mapcount(struct page *page) + { + return folio_mapcount(page_folio(page)); + } + + int page_trans_huge_mapcount(struct page *page); ++>>>>>>> folio/for-next #else static inline int total_mapcount(struct page *page) { @@@ -1103,7 -1162,32 +1149,36 @@@ static inline bool put_devmap_managed_p { return false; } ++<<<<<<< HEAD +#endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ ++======= + + static inline void put_devmap_managed_page(struct page *page) + { + } + #endif /* CONFIG_DEV_PAGEMAP_OPS */ + + static inline bool is_device_private_page(const struct page *page) + { + return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && + IS_ENABLED(CONFIG_DEVICE_PRIVATE) && + is_zone_device_page(page) && + page->pgmap->type == MEMORY_DEVICE_PRIVATE; + } + + static inline bool folio_is_device_private(const struct folio *folio) + { + return is_device_private_page(&folio->page); + } + + static inline bool is_pci_p2pdma_page(const struct page *page) + { + return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && + IS_ENABLED(CONFIG_PCI_P2PDMA) && + is_zone_device_page(page) && + page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; + } ++>>>>>>> folio/for-next /* 127: arbitrary random number, small enough to assemble well */ #define folio_ref_zero_or_close_to_overflow(folio) \ diff --cc include/linux/rmap.h index 73cce292d32c,17230c458341..000000000000 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@@ -11,9 -11,8 +11,10 @@@ #include #include #include + #include +#include + /* * The anon_vma heads a list of private "related" vmas, to scan if * an anonymous page pointing to this anon_vma needs to be unmapped: @@@ -240,7 -261,7 +263,11 @@@ unsigned long page_address_in_vma(struc */ int folio_mkclean(struct folio *); ++<<<<<<< HEAD +void remove_migration_ptes(struct page *old, struct page *new, bool locked); ++======= + void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked); ++>>>>>>> folio/for-next /* * Called by memory-failure.c to kill processes. diff --cc mm/gup.c index 4ab43b4fc9bc,57bf69ac8ab4..000000000000 --- a/mm/gup.c +++ b/mm/gup.c @@@ -1844,84 -1783,50 +1786,128 @@@ static long check_and_migrate_movable_p struct page **pages, unsigned int gup_flags) { ++<<<<<<< HEAD + unsigned long isolation_error_count = 0, i; + struct page *prev_head = NULL; + LIST_HEAD(movable_page_list); + bool drain_allow = true; + int ret = 0; + + for (i = 0; i < nr_pages; i++) { + struct page *head = compound_head(pages[i]); + + if (head == prev_head) + continue; + prev_head = head; + + /* + * Device private pages will get faulted in during gup so it + * shouldn't be possible to see one here. 
+ */ + if (WARN_ON_ONCE(is_device_private_page(head))) { + ret = -EFAULT; + goto unpin_pages; ++======= + unsigned long i; + unsigned long isolation_error_count = 0; + bool drain_allow = true; + LIST_HEAD(movable_page_list); + long ret = 0; + struct folio *folio, *prev_folio = NULL; + struct migration_target_control mtc = { + .nid = NUMA_NO_NODE, + .gfp_mask = GFP_USER | __GFP_NOWARN, + }; + + for (i = 0; i < nr_pages; i++) { + folio = page_folio(pages[i]); + if (folio == prev_folio) + continue; + prev_folio = folio; + /* + * If we get a movable page, since we are going to be pinning + * these entries, try to move them out if possible. + */ + if (!is_pinnable_page(&folio->page)) { + if (folio_test_hugetlb(folio)) { + if (!isolate_huge_page(&folio->page, + &movable_page_list)) + isolation_error_count++; + } else { + if (!folio_test_lru(folio) && drain_allow) { + lru_add_drain_all(); + drain_allow = false; + } + + if (folio_isolate_lru(folio)) { + isolation_error_count++; + continue; + } + list_add_tail(&folio->lru, &movable_page_list); + node_stat_mod_folio(folio, + NR_ISOLATED_ANON + + folio_is_file_lru(folio), + folio_nr_pages(folio)); + } ++>>>>>>> folio/for-next + } + + /* + * Device coherent pages are managed by a driver and should not + * be pinned indefinitely as it prevents the driver moving the + * page. So when trying to pin with FOLL_LONGTERM instead try + * to migrate the page out of device memory. + */ + if (is_device_coherent_page(head)) { + WARN_ON_ONCE(PageCompound(head)); + + /* + * Migration will fail if the page is pinned, so convert + * the pin on the source page to a normal reference. + */ + if (gup_flags & FOLL_PIN) { + get_page(head); + unpin_user_page(head); + } + + pages[i] = migrate_device_page(head, gup_flags); + if (!pages[i]) { + ret = -EBUSY; + goto unpin_pages; + } + continue; } + + if (is_pinnable_page(head)) + continue; + + /* + * Try to move out any movable page before pinning the range. + */ + if (PageHuge(head)) { + if (!isolate_huge_page(head, &movable_page_list)) + isolation_error_count++; + continue; + } + + if (!PageLRU(head) && drain_allow) { + lru_add_drain_all(); + drain_allow = false; + } + + if (isolate_lru_page(head)) { + isolation_error_count++; + continue; + } + list_add_tail(&head->lru, &movable_page_list); + mod_node_page_state(page_pgdat(head), + NR_ISOLATED_ANON + page_is_file_lru(head), + thp_nr_pages(head)); } + if (!list_empty(&movable_page_list) || isolation_error_count) + goto unpin_pages; + /* * If list is empty, and no isolation errors, means that all pages are * in the correct zone. diff --cc mm/huge_memory.c index 09fb65a80e63,f85b04b31bd1..000000000000 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@@ -2137,6 -2114,8 +2134,11 @@@ void __split_huge_pmd(struct vm_area_st { spinlock_t *ptl; struct mmu_notifier_range range; ++<<<<<<< HEAD ++======= + bool do_unlock_folio = false; + pmd_t _pmd; ++>>>>>>> folio/for-next mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address & HPAGE_PMD_MASK, @@@ -2155,14 -2134,42 +2157,49 @@@ goto out; } -repeat: if (pmd_trans_huge(*pmd)) { ++<<<<<<< HEAD + if (!page) + page = pmd_page(*pmd); ++======= + if (!folio) { + folio = page_folio(pmd_page(*pmd)); + /* + * An anonymous page must be locked, to ensure that a + * concurrent reuse_swap_page() sees stable mapcount; + * but reuse_swap_page() is not used on shmem or file, + * and page lock must not be taken when zap_pmd_range() + * calls __split_huge_pmd() while i_mmap_lock is held. 
+ */ + if (folio_test_anon(folio)) { + if (unlikely(!folio_trylock(folio))) { + folio_get(folio); + _pmd = *pmd; + spin_unlock(ptl); + folio_lock(folio); + spin_lock(ptl); + if (unlikely(!pmd_same(*pmd, _pmd))) { + folio_unlock(folio); + folio_put(folio); + folio = NULL; + goto repeat; + } + folio_put(folio); + } + do_unlock_folio = true; + } + } ++>>>>>>> folio/for-next } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd))) goto out; __split_huge_pmd_locked(vma, pmd, range.start, freeze); out: spin_unlock(ptl); ++<<<<<<< HEAD ++======= + if (do_unlock_folio) + folio_unlock(folio); ++>>>>>>> folio/for-next /* * No need to double call mmu_notifier->invalidate_range() callback. * They are 3 cases to consider inside __split_huge_pmd_locked(): @@@ -2455,28 -2464,52 +2494,77 @@@ static void __split_huge_page(struct pa } } ++<<<<<<< HEAD +int total_mapcount(struct page *page) +{ + int i, compound, nr, ret; + + VM_BUG_ON_PAGE(PageTail(page), page); + + if (likely(!PageCompound(page))) + return atomic_read(&page->_mapcount) + 1; + + compound = compound_mapcount(page); + nr = compound_nr(page); + if (PageHuge(page)) + return compound; + ret = compound; + for (i = 0; i < nr; i++) + ret += atomic_read(&page[i]._mapcount) + 1; + /* File pages has compound_mapcount included in _mapcount */ + if (!PageAnon(page)) + return ret - compound * nr; + if (PageDoubleMap(page)) + ret -= nr; + return ret; ++======= + /* + * This calculates accurately how many mappings a transparent hugepage + * has (unlike page_mapcount() which isn't fully accurate). This full + * accuracy is primarily needed to know if copy-on-write faults can + * reuse the page and change the mapping to read-write instead of + * copying them. At the same time this returns the total_mapcount too. + * + * The function returns the highest mapcount any one of the subpages + * has. If the return value is one, even if different processes are + * mapping different subpages of the transparent hugepage, they can + * all reuse it, because each process is reusing a different subpage. + * + * The total_mapcount is instead counting all virtual mappings of the + * subpages. If the total_mapcount is equal to "one", it tells the + * caller all mappings belong to the same "mm" and in turn the + * anon_vma of the transparent hugepage can become the vma->anon_vma + * local one as no other process may be mapping any of the subpages. + * + * It would be more accurate to replace page_mapcount() with + * page_trans_huge_mapcount(), however we only use + * page_trans_huge_mapcount() in the copy-on-write faults where we + * need full accuracy to avoid breaking page pinning, because + * page_trans_huge_mapcount() is slower than page_mapcount(). 
+ */ + int page_trans_huge_mapcount(struct page *page) + { + int i, ret; + + /* hugetlbfs shouldn't call it */ + VM_BUG_ON_PAGE(PageHuge(page), page); + + if (likely(!PageTransCompound(page))) + return atomic_read(&page->_mapcount) + 1; + + page = compound_head(page); + + ret = 0; + for (i = 0; i < thp_nr_pages(page); i++) { + int mapcount = atomic_read(&page[i]._mapcount) + 1; + ret = max(ret, mapcount); + } + + if (PageDoubleMap(page)) + ret -= 1; + + return ret + compound_mapcount(page); ++>>>>>>> folio/for-next } /* Racy check whether the huge page can be split */ @@@ -3116,9 -3151,6 +3206,12 @@@ void remove_migration_pmd(struct page_v else page_add_file_rmap(new, vma, true); set_pmd_at(mm, mmun_start, pvmw->pmd, pmde); ++<<<<<<< HEAD + + /* No need to invalidate - it was non-present before */ ++======= ++>>>>>>> folio/for-next update_mmu_cache_pmd(vma, address, pvmw->pmd); + trace_remove_migration_pmd(address, pmd_val(pmde)); } #endif diff --cc mm/internal.h index 7ed98955c8f4,f0e4dfac0264..000000000000 --- a/mm/internal.h +++ b/mm/internal.h @@@ -409,15 -417,22 +417,33 @@@ extern int mlock_future_check(struct mm * pte mappings of THPs, which cannot be consistently counted: a pte * mapping of the THP head cannot be distinguished by the page alone. */ ++<<<<<<< HEAD +void mlock_page(struct page *page); +static inline void mlock_vma_page(struct page *page, ++======= + void mlock_folio(struct folio *folio); + static inline void mlock_vma_folio(struct folio *folio, ++>>>>>>> folio/for-next struct vm_area_struct *vma, bool compound) { /* VM_IO check prevents migration from double-counting during mlock */ if (unlikely((vma->vm_flags & (VM_LOCKED|VM_IO)) == VM_LOCKED) && ++<<<<<<< HEAD + (compound || !PageTransCompound(page))) + mlock_page(page); +} ++======= + (compound || !folio_test_large(folio))) + mlock_folio(folio); + } + + static inline void mlock_vma_page(struct page *page, + struct vm_area_struct *vma, bool compound) + { + mlock_vma_folio(page_folio(page), vma, compound); + } + ++>>>>>>> folio/for-next void munlock_page(struct page *page); static inline void munlock_vma_page(struct page *page, struct vm_area_struct *vma, bool compound) @@@ -717,9 -745,9 +745,16 @@@ void vunmap_range_noflush(unsigned lon int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, unsigned long addr, int page_nid, int *flags); ++<<<<<<< HEAD +DECLARE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); + +void free_zone_device_page(struct page *page); +struct page *migrate_device_page(struct page *page, unsigned int gup_flags); ++======= + /* + * mm/gup.c + */ + struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags); ++>>>>>>> folio/for-next #endif /* __MM_INTERNAL_H */ diff --cc mm/khugepaged.c index 7d45d463acf5,000825a6e086..000000000000 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@@ -1823,7 -1835,7 +1824,11 @@@ static void collapse_file(struct mm_str } if (page_mapped(page)) ++<<<<<<< HEAD + try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH); ++======= + try_to_unmap(folio, TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH); ++>>>>>>> folio/for-next xas_lock_irq(&xas); xas_set(&xas, index); diff --cc mm/madvise.c index ede6affa1350,ae35d72627ef..000000000000 --- a/mm/madvise.c +++ b/mm/madvise.c @@@ -554,14 -530,9 +554,20 @@@ static void madvise_cold_page_range(str tlb_end_vma(tlb, vma); } ++<<<<<<< HEAD +static inline bool can_madv_lru_non_huge_vma(struct vm_area_struct *vma) +{ + return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP)); +} + +static inline bool can_madv_lru_vma(struct 
vm_area_struct *vma) +{ + return can_madv_lru_non_huge_vma(vma) && !is_vm_hugetlb_page(vma); ++======= + static inline bool can_madv_lru_vma(struct vm_area_struct *vma) + { + return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)); ++>>>>>>> folio/for-next } static long madvise_cold(struct vm_area_struct *vma, diff --cc mm/memory-failure.c index 3e404b06efdc,aa8236848949..000000000000 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@@ -1405,22 -1413,26 +1407,45 @@@ static bool hwpoison_user_mappings(stru if (kill) collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); ++<<<<<<< HEAD + if (PageHuge(hpage) && !PageAnon(hpage)) { + /* + * For hugetlb pages in shared mappings, try_to_unmap + * could potentially call huge_pmd_unshare. Because of + * this, take semaphore in write mode here and set + * TTU_RMAP_LOCKED to indicate we have taken the lock + * at this higher level. + */ + mapping = hugetlb_page_mapping_lock_write(hpage); + if (mapping) { + try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED); + i_mmap_unlock_write(mapping); + } else + pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn); + } else { + try_to_unmap(hpage, ttu); ++======= + if (!PageHuge(hpage)) { + try_to_unmap(folio, ttu); + } else { + if (!PageAnon(hpage)) { + /* + * For hugetlb pages in shared mappings, try_to_unmap + * could potentially call huge_pmd_unshare. Because of + * this, take semaphore in write mode here and set + * TTU_RMAP_LOCKED to indicate we have taken the lock + * at this higher level. + */ + mapping = hugetlb_page_mapping_lock_write(hpage); + if (mapping) { + try_to_unmap(folio, ttu|TTU_RMAP_LOCKED); + i_mmap_unlock_write(mapping); + } else + pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn); + } else { + try_to_unmap(folio, ttu); + } ++>>>>>>> folio/for-next } unmap_success = !page_mapped(hpage); diff --cc mm/migrate.c index 88b59f9f8d29,8a2f0a64f703..000000000000 --- a/mm/migrate.c +++ b/mm/migrate.c @@@ -251,9 -248,6 +246,12 @@@ static bool remove_migration_pte(struc } if (vma->vm_flags & VM_LOCKED) mlock_page_drain(smp_processor_id()); ++<<<<<<< HEAD + + trace_remove_migration_pte(pvmw.address, pte_val(pte), + compound_order(new)); ++======= ++>>>>>>> folio/for-next /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, pvmw.address, pvmw.pte); @@@ -2157,6 -2135,768 +2160,771 @@@ out #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_NUMA */ ++<<<<<<< HEAD ++======= + #ifdef CONFIG_DEVICE_PRIVATE + static int migrate_vma_collect_skip(unsigned long start, + unsigned long end, + struct mm_walk *walk) + { + struct migrate_vma *migrate = walk->private; + unsigned long addr; + + for (addr = start; addr < end; addr += PAGE_SIZE) { + migrate->dst[migrate->npages] = 0; + migrate->src[migrate->npages++] = 0; + } + + return 0; + } + + static int migrate_vma_collect_hole(unsigned long start, + unsigned long end, + __always_unused int depth, + struct mm_walk *walk) + { + struct migrate_vma *migrate = walk->private; + unsigned long addr; + + /* Only allow populating anonymous memory. 
*/ + if (!vma_is_anonymous(walk->vma)) + return migrate_vma_collect_skip(start, end, walk); + + for (addr = start; addr < end; addr += PAGE_SIZE) { + migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE; + migrate->dst[migrate->npages] = 0; + migrate->npages++; + migrate->cpages++; + } + + return 0; + } + + static int migrate_vma_collect_pmd(pmd_t *pmdp, + unsigned long start, + unsigned long end, + struct mm_walk *walk) + { + struct migrate_vma *migrate = walk->private; + struct vm_area_struct *vma = walk->vma; + struct mm_struct *mm = vma->vm_mm; + unsigned long addr = start, unmapped = 0; + spinlock_t *ptl; + pte_t *ptep; + + again: + if (pmd_none(*pmdp)) + return migrate_vma_collect_hole(start, end, -1, walk); + + if (pmd_trans_huge(*pmdp)) { + struct page *page; + + ptl = pmd_lock(mm, pmdp); + if (unlikely(!pmd_trans_huge(*pmdp))) { + spin_unlock(ptl); + goto again; + } + + page = pmd_page(*pmdp); + if (is_huge_zero_page(page)) { + spin_unlock(ptl); + split_huge_pmd(vma, pmdp, addr); + if (pmd_trans_unstable(pmdp)) + return migrate_vma_collect_skip(start, end, + walk); + } else { + int ret; + + get_page(page); + spin_unlock(ptl); + if (unlikely(!trylock_page(page))) + return migrate_vma_collect_skip(start, end, + walk); + ret = split_huge_page(page); + unlock_page(page); + put_page(page); + if (ret) + return migrate_vma_collect_skip(start, end, + walk); + if (pmd_none(*pmdp)) + return migrate_vma_collect_hole(start, end, -1, + walk); + } + } + + if (unlikely(pmd_bad(*pmdp))) + return migrate_vma_collect_skip(start, end, walk); + + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + arch_enter_lazy_mmu_mode(); + + for (; addr < end; addr += PAGE_SIZE, ptep++) { + unsigned long mpfn = 0, pfn; + struct page *page; + swp_entry_t entry; + pte_t pte; + + pte = *ptep; + + if (pte_none(pte)) { + if (vma_is_anonymous(vma)) { + mpfn = MIGRATE_PFN_MIGRATE; + migrate->cpages++; + } + goto next; + } + + if (!pte_present(pte)) { + /* + * Only care about unaddressable device page special + * page table entry. Other special swap entries are not + * migratable, and we ignore regular swapped page. + */ + entry = pte_to_swp_entry(pte); + if (!is_device_private_entry(entry)) + goto next; + + page = pfn_swap_entry_to_page(entry); + if (!(migrate->flags & + MIGRATE_VMA_SELECT_DEVICE_PRIVATE) || + page->pgmap->owner != migrate->pgmap_owner) + goto next; + + mpfn = migrate_pfn(page_to_pfn(page)) | + MIGRATE_PFN_MIGRATE; + if (is_writable_device_private_entry(entry)) + mpfn |= MIGRATE_PFN_WRITE; + } else { + if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) + goto next; + pfn = pte_pfn(pte); + if (is_zero_pfn(pfn)) { + mpfn = MIGRATE_PFN_MIGRATE; + migrate->cpages++; + goto next; + } + page = vm_normal_page(migrate->vma, addr, pte); + mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; + mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; + } + + /* FIXME support THP */ + if (!page || !page->mapping || PageTransCompound(page)) { + mpfn = 0; + goto next; + } + + /* + * By getting a reference on the page we pin it and that blocks + * any kind of migration. Side effect is that it "freezes" the + * pte. + * + * We drop this reference after isolating the page from the lru + * for non device page (device page are not on the lru and thus + * can't be dropped from it). + */ + get_page(page); + + /* + * Optimize for the common case where page is only mapped once + * in one process. If we can lock the page, then we can safely + * set up a special migration page table entry now. 
+ */ + if (trylock_page(page)) { + pte_t swp_pte; + + migrate->cpages++; + ptep_get_and_clear(mm, addr, ptep); + + /* Setup special migration page table entry */ + if (mpfn & MIGRATE_PFN_WRITE) + entry = make_writable_migration_entry( + page_to_pfn(page)); + else + entry = make_readable_migration_entry( + page_to_pfn(page)); + swp_pte = swp_entry_to_pte(entry); + if (pte_present(pte)) { + if (pte_soft_dirty(pte)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_uffd_wp(pte)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); + } else { + if (pte_swp_soft_dirty(pte)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_swp_uffd_wp(pte)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); + } + set_pte_at(mm, addr, ptep, swp_pte); + + /* + * This is like regular unmap: we remove the rmap and + * drop page refcount. Page won't be freed, as we took + * a reference just above. + */ + page_remove_rmap(page, vma, false); + put_page(page); + + if (pte_present(pte)) + unmapped++; + } else { + put_page(page); + mpfn = 0; + } + + next: + migrate->dst[migrate->npages] = 0; + migrate->src[migrate->npages++] = mpfn; + } + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(ptep - 1, ptl); + + /* Only flush the TLB if we actually modified any entries */ + if (unmapped) + flush_tlb_range(walk->vma, start, end); + + return 0; + } + + static const struct mm_walk_ops migrate_vma_walk_ops = { + .pmd_entry = migrate_vma_collect_pmd, + .pte_hole = migrate_vma_collect_hole, + }; + + /* + * migrate_vma_collect() - collect pages over a range of virtual addresses + * @migrate: migrate struct containing all migration information + * + * This will walk the CPU page table. For each virtual address backed by a + * valid page, it updates the src array and takes a reference on the page, in + * order to pin the page until we lock it and unmap it. + */ + static void migrate_vma_collect(struct migrate_vma *migrate) + { + struct mmu_notifier_range range; + + /* + * Note that the pgmap_owner is passed to the mmu notifier callback so + * that the registered device driver can skip invalidating device + * private page mappings that won't be migrated. + */ + mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0, + migrate->vma, migrate->vma->vm_mm, migrate->start, migrate->end, + migrate->pgmap_owner); + mmu_notifier_invalidate_range_start(&range); + + walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end, + &migrate_vma_walk_ops, migrate); + + mmu_notifier_invalidate_range_end(&range); + migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT); + } + + /* + * migrate_vma_check_page() - check if page is pinned or not + * @page: struct page to check + * + * Pinned pages cannot be migrated. This is the same test as in + * folio_migrate_mapping(), except that here we allow migration of a + * ZONE_DEVICE page. + */ + static bool migrate_vma_check_page(struct page *page) + { + /* + * One extra ref because caller holds an extra reference, either from + * isolate_lru_page() for a regular page, or migrate_vma_collect() for + * a device page. + */ + int extra = 1; + + /* + * FIXME support THP (transparent huge page), it is bit more complex to + * check them than regular pages, because they can be mapped with a pmd + * or with a pte (split pte mapping). 
+ */ + if (PageCompound(page)) + return false; + + /* Page from ZONE_DEVICE have one extra reference */ + if (is_zone_device_page(page)) + extra++; + + /* For file back page */ + if (page_mapping(page)) + extra += 1 + page_has_private(page); + + if ((page_count(page) - extra) > page_mapcount(page)) + return false; + + return true; + } + + /* + * migrate_vma_unmap() - replace page mapping with special migration pte entry + * @migrate: migrate struct containing all migration information + * + * Isolate pages from the LRU and replace mappings (CPU page table pte) with a + * special migration pte entry and check if it has been pinned. Pinned pages are + * restored because we cannot migrate them. + * + * This is the last step before we call the device driver callback to allocate + * destination memory and copy contents of original page over to new page. + */ + static void migrate_vma_unmap(struct migrate_vma *migrate) + { + const unsigned long npages = migrate->npages; + unsigned long i, restore = 0; + bool allow_drain = true; + + lru_add_drain(); + + for (i = 0; i < npages; i++) { + struct page *page = migrate_pfn_to_page(migrate->src[i]); + struct folio *folio; + + if (!page) + continue; + + /* ZONE_DEVICE pages are not on LRU */ + if (!is_zone_device_page(page)) { + if (!PageLRU(page) && allow_drain) { + /* Drain CPU's pagevec */ + lru_add_drain_all(); + allow_drain = false; + } + + if (isolate_lru_page(page)) { + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + migrate->cpages--; + restore++; + continue; + } + + /* Drop the reference we took in collect */ + put_page(page); + } + + folio = page_folio(page); + if (folio_mapped(folio)) + try_to_migrate(folio, 0); + + if (page_mapped(page) || !migrate_vma_check_page(page)) { + if (!is_zone_device_page(page)) { + get_page(page); + putback_lru_page(page); + } + + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + migrate->cpages--; + restore++; + continue; + } + } + + for (i = 0; i < npages && restore; i++) { + struct page *page = migrate_pfn_to_page(migrate->src[i]); + struct folio *folio; + + if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE)) + continue; + + folio = page_folio(page); + remove_migration_ptes(folio, folio, false); + + migrate->src[i] = 0; + folio_unlock(folio); + folio_put(folio); + restore--; + } + } + + /** + * migrate_vma_setup() - prepare to migrate a range of memory + * @args: contains the vma, start, and pfns arrays for the migration + * + * Returns: negative errno on failures, 0 when 0 or more pages were migrated + * without an error. + * + * Prepare to migrate a range of memory virtual address range by collecting all + * the pages backing each virtual address in the range, saving them inside the + * src array. Then lock those pages and unmap them. Once the pages are locked + * and unmapped, check whether each page is pinned or not. Pages that aren't + * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the + * corresponding src array entry. Then restores any pages that are pinned, by + * remapping and unlocking those pages. + * + * The caller should then allocate destination memory and copy source memory to + * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE + * flag set). Once these are allocated and copied, the caller must update each + * corresponding entry in the dst array with the pfn value of the destination + * page and with MIGRATE_PFN_VALID. Destination pages must be locked via + * lock_page(). 
+ * + * Note that the caller does not have to migrate all the pages that are marked + * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from + * device memory to system memory. If the caller cannot migrate a device page + * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe + * consequences for the userspace process, so it must be avoided if at all + * possible. + * + * For empty entries inside CPU page table (pte_none() or pmd_none() is true) we + * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus + * allowing the caller to allocate device memory for those unbacked virtual + * addresses. For this the caller simply has to allocate device memory and + * properly set the destination entry like for regular migration. Note that + * this can still fail, and thus inside the device driver you must check if the + * migration was successful for those entries after calling migrate_vma_pages(), + * just like for regular migration. + * + * After that, the callers must call migrate_vma_pages() to go over each entry + * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag + * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set, + * then migrate_vma_pages() to migrate struct page information from the source + * struct page to the destination struct page. If it fails to migrate the + * struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the + * src array. + * + * At this point all successfully migrated pages have an entry in the src + * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst + * array entry with MIGRATE_PFN_VALID flag set. + * + * Once migrate_vma_pages() returns the caller may inspect which pages were + * successfully migrated, and which were not. Successfully migrated pages will + * have the MIGRATE_PFN_MIGRATE flag set for their src array entry. + * + * It is safe to update device page table after migrate_vma_pages() because + * both destination and source page are still locked, and the mmap_lock is held + * in read mode (hence no one can unmap the range being migrated). + * + * Once the caller is done cleaning up things and updating its page table (if it + * chose to do so, this is not an obligation) it finally calls + * migrate_vma_finalize() to update the CPU page table to point to new pages + * for successfully migrated pages or otherwise restore the CPU page table to + * point to the original source pages. + */ + int migrate_vma_setup(struct migrate_vma *args) + { + long nr_pages = (args->end - args->start) >> PAGE_SHIFT; + + args->start &= PAGE_MASK; + args->end &= PAGE_MASK; + if (!args->vma || is_vm_hugetlb_page(args->vma) || + (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma)) + return -EINVAL; + if (nr_pages <= 0) + return -EINVAL; + if (args->start < args->vma->vm_start || + args->start >= args->vma->vm_end) + return -EINVAL; + if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end) + return -EINVAL; + if (!args->src || !args->dst) + return -EINVAL; + + memset(args->src, 0, sizeof(*args->src) * nr_pages); + args->cpages = 0; + args->npages = 0; + + migrate_vma_collect(args); + + if (args->cpages) + migrate_vma_unmap(args); + + /* + * At this point pages are locked and unmapped, and thus they have + * stable content and can safely be copied to destination memory that + * is allocated by the drivers. 
+ */ + return 0; + + } + EXPORT_SYMBOL(migrate_vma_setup); + + /* + * This code closely matches the code in: + * __handle_mm_fault() + * handle_pte_fault() + * do_anonymous_page() + * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE + * private page. + */ + static void migrate_vma_insert_page(struct migrate_vma *migrate, + unsigned long addr, + struct page *page, + unsigned long *src) + { + struct vm_area_struct *vma = migrate->vma; + struct mm_struct *mm = vma->vm_mm; + bool flush = false; + spinlock_t *ptl; + pte_t entry; + pgd_t *pgdp; + p4d_t *p4dp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + /* Only allow populating anonymous memory */ + if (!vma_is_anonymous(vma)) + goto abort; + + pgdp = pgd_offset(mm, addr); + p4dp = p4d_alloc(mm, pgdp, addr); + if (!p4dp) + goto abort; + pudp = pud_alloc(mm, p4dp, addr); + if (!pudp) + goto abort; + pmdp = pmd_alloc(mm, pudp, addr); + if (!pmdp) + goto abort; + + if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp)) + goto abort; + + /* + * Use pte_alloc() instead of pte_alloc_map(). We can't run + * pte_offset_map() on pmds where a huge pmd might be created + * from a different thread. + * + * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when + * parallel threads are excluded by other means. + * + * Here we only have mmap_read_lock(mm). + */ + if (pte_alloc(mm, pmdp)) + goto abort; + + /* See the comment in pte_alloc_one_map() */ + if (unlikely(pmd_trans_unstable(pmdp))) + goto abort; + + if (unlikely(anon_vma_prepare(vma))) + goto abort; + if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL)) + goto abort; + + /* + * The memory barrier inside __SetPageUptodate makes sure that + * preceding stores to the page contents become visible before + * the set_pte_at() write. + */ + __SetPageUptodate(page); + + if (is_zone_device_page(page)) { + if (is_device_private_page(page)) { + swp_entry_t swp_entry; + + if (vma->vm_flags & VM_WRITE) + swp_entry = make_writable_device_private_entry( + page_to_pfn(page)); + else + swp_entry = make_readable_device_private_entry( + page_to_pfn(page)); + entry = swp_entry_to_pte(swp_entry); + } else { + /* + * For now we only support migrating to un-addressable + * device memory. + */ + pr_warn_once("Unsupported ZONE_DEVICE page type.\n"); + goto abort; + } + } else { + entry = mk_pte(page, vma->vm_page_prot); + if (vma->vm_flags & VM_WRITE) + entry = pte_mkwrite(pte_mkdirty(entry)); + } + + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + + if (check_stable_address_space(mm)) + goto unlock_abort; + + if (pte_present(*ptep)) { + unsigned long pfn = pte_pfn(*ptep); + + if (!is_zero_pfn(pfn)) + goto unlock_abort; + flush = true; + } else if (!pte_none(*ptep)) + goto unlock_abort; + + /* + * Check for userfaultfd but do not deliver the fault. Instead, + * just back off. 
+ */ + if (userfaultfd_missing(vma)) + goto unlock_abort; + + inc_mm_counter(mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, vma, addr, false); + if (!is_zone_device_page(page)) + lru_cache_add_inactive_or_unevictable(page, vma); + get_page(page); + + if (flush) { + flush_cache_page(vma, addr, pte_pfn(*ptep)); + ptep_clear_flush_notify(vma, addr, ptep); + set_pte_at_notify(mm, addr, ptep, entry); + update_mmu_cache(vma, addr, ptep); + } else { + /* No need to invalidate - it was non-present before */ + set_pte_at(mm, addr, ptep, entry); + update_mmu_cache(vma, addr, ptep); + } + + pte_unmap_unlock(ptep, ptl); + *src = MIGRATE_PFN_MIGRATE; + return; + + unlock_abort: + pte_unmap_unlock(ptep, ptl); + abort: + *src &= ~MIGRATE_PFN_MIGRATE; + } + + /** + * migrate_vma_pages() - migrate meta-data from src page to dst page + * @migrate: migrate struct containing all migration information + * + * This migrates struct page meta-data from source struct page to destination + * struct page. This effectively finishes the migration from source page to the + * destination page. + */ + void migrate_vma_pages(struct migrate_vma *migrate) + { + const unsigned long npages = migrate->npages; + const unsigned long start = migrate->start; + struct mmu_notifier_range range; + unsigned long addr, i; + bool notified = false; + + for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) { + struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); + struct page *page = migrate_pfn_to_page(migrate->src[i]); + struct address_space *mapping; + int r; + + if (!newpage) { + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + continue; + } + + if (!page) { + if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) + continue; + if (!notified) { + notified = true; + + mmu_notifier_range_init_owner(&range, + MMU_NOTIFY_MIGRATE, 0, migrate->vma, + migrate->vma->vm_mm, addr, migrate->end, + migrate->pgmap_owner); + mmu_notifier_invalidate_range_start(&range); + } + migrate_vma_insert_page(migrate, addr, newpage, + &migrate->src[i]); + continue; + } + + mapping = page_mapping(page); + + if (is_zone_device_page(newpage)) { + if (is_device_private_page(newpage)) { + /* + * For now only support private anonymous when + * migrating to un-addressable device memory. + */ + if (mapping) { + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + continue; + } + } else { + /* + * Other types of ZONE_DEVICE page are not + * supported. + */ + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + continue; + } + } + + r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY); + if (r != MIGRATEPAGE_SUCCESS) + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + } + + /* + * No need to double call mmu_notifier->invalidate_range() callback as + * the above ptep_clear_flush_notify() inside migrate_vma_insert_page() + * did already call it. + */ + if (notified) + mmu_notifier_invalidate_range_only_end(&range); + } + EXPORT_SYMBOL(migrate_vma_pages); + + /** + * migrate_vma_finalize() - restore CPU page table entry + * @migrate: migrate struct containing all migration information + * + * This replaces the special migration pte entry with either a mapping to the + * new page if migration was successful for that page, or to the original page + * otherwise. + * + * This also unlocks the pages and puts them back on the lru, or drops the extra + * refcount, for device pages. 
+ */ + void migrate_vma_finalize(struct migrate_vma *migrate) + { + const unsigned long npages = migrate->npages; + unsigned long i; + + for (i = 0; i < npages; i++) { + struct folio *dst, *src; + struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); + struct page *page = migrate_pfn_to_page(migrate->src[i]); + + if (!page) { + if (newpage) { + unlock_page(newpage); + put_page(newpage); + } + continue; + } + + if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) { + if (newpage) { + unlock_page(newpage); + put_page(newpage); + } + newpage = page; + } + + src = page_folio(page); + dst = page_folio(newpage); + remove_migration_ptes(src, dst, false); + folio_unlock(src); + + if (is_zone_device_page(page)) + put_page(page); + else + putback_lru_page(page); + + if (newpage != page) { + unlock_page(newpage); + if (is_zone_device_page(newpage)) + put_page(newpage); + else + putback_lru_page(newpage); + } + } + } + EXPORT_SYMBOL(migrate_vma_finalize); + #endif /* CONFIG_DEVICE_PRIVATE */ + ++>>>>>>> folio/for-next /* * node_demotion[] example: * diff --cc mm/mlock.c index d28e56529e5b,9858e733c29b..000000000000 --- a/mm/mlock.c +++ b/mm/mlock.c @@@ -75,183 -75,183 +75,374 @@@ static struct lruvec *__mlock_page(stru } goto out; } ++<<<<<<< HEAD + + if (PageUnevictable(page)) { + if (PageMlocked(page)) + page->mlock_count++; + goto out; + } + + del_page_from_lru_list(page, lruvec); + ClearPageActive(page); + SetPageUnevictable(page); + page->mlock_count = !!PageMlocked(page); + add_page_to_lru_list(page, lruvec); + __count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page)); +out: + SetPageLRU(page); + return lruvec; +} + +static struct lruvec *__mlock_new_page(struct page *page, struct lruvec *lruvec) +{ + VM_BUG_ON_PAGE(PageLRU(page), page); + + lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec); + + /* As above, this is a little surprising, but possible */ + if (unlikely(page_evictable(page))) + goto out; + + SetPageUnevictable(page); + page->mlock_count = !!PageMlocked(page); + __count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page)); +out: + add_page_to_lru_list(page, lruvec); + SetPageLRU(page); + return lruvec; +} + +static struct lruvec *__munlock_page(struct page *page, struct lruvec *lruvec) +{ + int nr_pages = thp_nr_pages(page); + bool isolated = false; + + if (!TestClearPageLRU(page)) + goto munlock; + + isolated = true; + lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec); + + if (PageUnevictable(page)) { + /* Then mlock_count is maintained, but might undercount */ + if (page->mlock_count) + page->mlock_count--; + if (page->mlock_count) + goto out; + } + /* else assume that was the last mlock: reclaim will fix it if not */ + +munlock: + if (TestClearPageMlocked(page)) { + __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); + if (isolated || !PageUnevictable(page)) + __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages); + else + __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); + } + + /* page_evictable() has to be checked *after* clearing Mlocked */ + if (isolated && PageUnevictable(page) && page_evictable(page)) { + del_page_from_lru_list(page, lruvec); + ClearPageUnevictable(page); + add_page_to_lru_list(page, lruvec); + __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); + } +out: + if (isolated) + SetPageLRU(page); + return lruvec; +} + +/* + * Flags held in the low bits of a struct page pointer on the mlock_pvec. 
+ */ +#define LRU_PAGE 0x1 +#define NEW_PAGE 0x2 +static inline struct page *mlock_lru(struct page *page) +{ + return (struct page *)((unsigned long)page + LRU_PAGE); +} + +static inline struct page *mlock_new(struct page *page) +{ + return (struct page *)((unsigned long)page + NEW_PAGE); +} + +/* + * mlock_pagevec() is derived from pagevec_lru_move_fn(): + * perhaps that can make use of such page pointer flags in future, + * but for now just keep it for mlock. We could use three separate + * pagevecs instead, but one feels better (munlocking a full pagevec + * does not need to drain mlocking pagevecs first). + */ +static void mlock_pagevec(struct pagevec *pvec) +{ + struct lruvec *lruvec = NULL; + unsigned long mlock; + struct page *page; + int i; + + for (i = 0; i < pagevec_count(pvec); i++) { + page = pvec->pages[i]; + mlock = (unsigned long)page & (LRU_PAGE | NEW_PAGE); + page = (struct page *)((unsigned long)page - mlock); + pvec->pages[i] = page; + + if (mlock & LRU_PAGE) + lruvec = __mlock_page(page, lruvec); + else if (mlock & NEW_PAGE) + lruvec = __mlock_new_page(page, lruvec); + else + lruvec = __munlock_page(page, lruvec); + } + + if (lruvec) + unlock_page_lruvec_irq(lruvec); + release_pages(pvec->pages, pvec->nr); + pagevec_reinit(pvec); +} + +void mlock_page_drain(int cpu) +{ + struct pagevec *pvec; + + pvec = &per_cpu(mlock_pvec, cpu); + if (pagevec_count(pvec)) + mlock_pagevec(pvec); +} + +bool need_mlock_page_drain(int cpu) +{ + return pagevec_count(&per_cpu(mlock_pvec, cpu)); +} + +/** + * mlock_page - mlock a page already on (or temporarily off) LRU + * @page: page to be mlocked, either a normal page or a THP head. + */ +void mlock_page(struct page *page) +{ + struct pagevec *pvec = &get_cpu_var(mlock_pvec); + + if (!TestSetPageMlocked(page)) { + int nr_pages = thp_nr_pages(page); + + mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); + __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); + } + + get_page(page); + if (!pagevec_add(pvec, mlock_lru(page)) || + PageHead(page) || lru_cache_disabled()) + mlock_pagevec(pvec); + put_cpu_var(mlock_pvec); +} + +/** + * mlock_new_page - mlock a newly allocated page not yet on LRU + * @page: page to be mlocked, either a normal page or a THP head. 
+ */ +void mlock_new_page(struct page *page) +{ + struct pagevec *pvec = &get_cpu_var(mlock_pvec); + int nr_pages = thp_nr_pages(page); + + SetPageMlocked(page); + mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); + __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); + ++ get_page(page); ++ if (!pagevec_add(pvec, mlock_new(page)) || ++ PageHead(page) || lru_cache_disabled()) ++======= + + if (PageUnevictable(page)) { + if (PageMlocked(page)) + page->mlock_count++; + goto out; + } + + del_page_from_lru_list(page, lruvec); + ClearPageActive(page); + SetPageUnevictable(page); + page->mlock_count = !!PageMlocked(page); + add_page_to_lru_list(page, lruvec); + __count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page)); + out: + SetPageLRU(page); + return lruvec; + } + + static struct lruvec *__mlock_new_page(struct page *page, struct lruvec *lruvec) + { + VM_BUG_ON_PAGE(PageLRU(page), page); + + lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec); + + /* As above, this is a little surprising, but possible */ + if (unlikely(page_evictable(page))) + goto out; + + SetPageUnevictable(page); + page->mlock_count = !!PageMlocked(page); + __count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page)); + out: + add_page_to_lru_list(page, lruvec); + SetPageLRU(page); + return lruvec; + } + + static struct lruvec *__munlock_page(struct page *page, struct lruvec *lruvec) + { + int nr_pages = thp_nr_pages(page); + bool isolated = false; + + if (!TestClearPageLRU(page)) + goto munlock; + + isolated = true; + lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec); + + if (PageUnevictable(page)) { + /* Then mlock_count is maintained, but might undercount */ + if (page->mlock_count) + page->mlock_count--; + if (page->mlock_count) + goto out; + } + /* else assume that was the last mlock: reclaim will fix it if not */ + + munlock: + if (TestClearPageMlocked(page)) { + __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); + if (isolated || !PageUnevictable(page)) + __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages); + else + __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); + } + + /* page_evictable() has to be checked *after* clearing Mlocked */ + if (isolated && PageUnevictable(page) && page_evictable(page)) { + del_page_from_lru_list(page, lruvec); + ClearPageUnevictable(page); + add_page_to_lru_list(page, lruvec); + __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); + } + out: + if (isolated) + SetPageLRU(page); + return lruvec; + } + + /* + * Flags held in the low bits of a struct page pointer on the mlock_pvec. + */ + #define LRU_PAGE 0x1 + #define NEW_PAGE 0x2 + static inline struct page *mlock_lru(struct page *page) + { + return (struct page *)((unsigned long)page + LRU_PAGE); + } + + static inline struct page *mlock_new(struct page *page) + { + return (struct page *)((unsigned long)page + NEW_PAGE); + } + + /* + * mlock_pagevec() is derived from pagevec_lru_move_fn(): + * perhaps that can make use of such page pointer flags in future, + * but for now just keep it for mlock. We could use three separate + * pagevecs instead, but one feels better (munlocking a full pagevec + * does not need to drain mlocking pagevecs first). 
+ */ + static void mlock_pagevec(struct pagevec *pvec) + { + struct lruvec *lruvec = NULL; + unsigned long mlock; + struct page *page; + int i; + + for (i = 0; i < pagevec_count(pvec); i++) { + page = pvec->pages[i]; + mlock = (unsigned long)page & (LRU_PAGE | NEW_PAGE); + page = (struct page *)((unsigned long)page - mlock); + pvec->pages[i] = page; + + if (mlock & LRU_PAGE) + lruvec = __mlock_page(page, lruvec); + else if (mlock & NEW_PAGE) + lruvec = __mlock_new_page(page, lruvec); + else + lruvec = __munlock_page(page, lruvec); + } + + if (lruvec) + unlock_page_lruvec_irq(lruvec); + release_pages(pvec->pages, pvec->nr); + pagevec_reinit(pvec); + } + + void mlock_page_drain(int cpu) + { + struct pagevec *pvec; + + pvec = &per_cpu(mlock_pvec, cpu); + if (pagevec_count(pvec)) + mlock_pagevec(pvec); + } + + bool need_mlock_page_drain(int cpu) + { + return pagevec_count(&per_cpu(mlock_pvec, cpu)); + } + + /** + * mlock_folio - mlock a folio already on (or temporarily off) LRU + * @page: folio to be mlocked. + */ + void mlock_folio(struct folio *folio) + { + struct pagevec *pvec = &get_cpu_var(mlock_pvec); + + if (!folio_test_set_mlocked(folio)) { + int nr_pages = folio_nr_pages(folio); + + zone_stat_mod_folio(folio, NR_MLOCK, nr_pages); + __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); + } + + folio_get(folio); + if (!pagevec_add(pvec, mlock_lru(&folio->page)) || + folio_test_large(folio) || lru_cache_disabled()) ++>>>>>>> folio/for-next + mlock_pagevec(pvec); + put_cpu_var(mlock_pvec); + } + + /** ++<<<<<<< HEAD ++ * munlock_page - munlock a page ++ * @page: page to be munlocked, either a normal page or a THP head. ++ */ ++void munlock_page(struct page *page) ++{ ++ struct pagevec *pvec = &get_cpu_var(mlock_pvec); ++======= + * mlock_new_page - mlock a newly allocated page not yet on LRU + * @page: page to be mlocked, either a normal page or a THP head. + */ + void mlock_new_page(struct page *page) + { + struct pagevec *pvec = &get_cpu_var(mlock_pvec); + int nr_pages = thp_nr_pages(page); + + SetPageMlocked(page); + mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); + __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); + get_page(page); if (!pagevec_add(pvec, mlock_new(page)) || PageHead(page) || lru_cache_disabled()) @@@ -266,6 -266,6 +457,7 @@@ void munlock_page(struct page *page) { struct pagevec *pvec = &get_cpu_var(mlock_pvec); ++>>>>>>> folio/for-next /* * TestClearPageMlocked(page) must be left to __munlock_page(), @@@ -296,7 -296,7 +488,11 @@@ static int mlock_pte_range(pmd_t *pmd, goto out; page = pmd_page(*pmd); if (vma->vm_flags & VM_LOCKED) ++<<<<<<< HEAD + mlock_page(page); ++======= + mlock_folio(page_folio(page)); ++>>>>>>> folio/for-next else munlock_page(page); goto out; @@@ -312,7 -312,7 +508,11 @@@ if (PageTransCompound(page)) continue; if (vma->vm_flags & VM_LOCKED) ++<<<<<<< HEAD + mlock_page(page); ++======= + mlock_folio(page_folio(page)); ++>>>>>>> folio/for-next else munlock_page(page); } diff --cc mm/rmap.c index a13487385820,5470c8de2ec0..000000000000 --- a/mm/rmap.c +++ b/mm/rmap.c @@@ -106,10 -104,10 +106,10 @@@ static inline struct anon_vma *anon_vma static inline void anon_vma_free(struct anon_vma *anon_vma) { - VM_BUG_ON(atomic_read(&anon_vma->refcount)); + VM_BUG_ON(refcount_read(&anon_vma->refcount)); /* - * Synchronize against page_lock_anon_vma_read() such that + * Synchronize against folio_lock_anon_vma_read() such that * we can safely hold the lock without the anon_vma getting * freed. 
* @@@ -815,9 -810,9 +812,15 @@@ static bool folio_referenced_one(struc address = pvmw.address; if ((vma->vm_flags & VM_LOCKED) && ++<<<<<<< HEAD + (!PageTransCompound(page) || !pvmw.pte)) { + /* Restore the mlock which got missed */ + mlock_vma_page(page, vma, !pvmw.pte); ++======= + (!folio_test_large(folio) || !pvmw.pte)) { + /* Restore the mlock which got missed */ + mlock_vma_folio(folio, vma, !pvmw.pte); ++>>>>>>> folio/for-next page_vma_mapped_walk_done(&pvmw); pra->vm_flags |= VM_LOCKED; return false; /* To break the loop */ @@@ -1469,24 -1454,25 +1462,40 @@@ static bool try_to_unmap_one(struct fol while (page_vma_mapped_walk(&pvmw)) { /* Unexpected PMD-mapped THP? */ ++<<<<<<< HEAD + VM_BUG_ON_PAGE(!pvmw.pte, page); + + /* + * If the page is in an mlock()d vma, we must not swap it out. ++======= + VM_BUG_ON_FOLIO(!pvmw.pte, folio); + + /* + * If the folio is in an mlock()d vma, we must not swap it out. ++>>>>>>> folio/for-next */ if (!(flags & TTU_IGNORE_MLOCK) && (vma->vm_flags & VM_LOCKED)) { /* Restore the mlock which got missed */ ++<<<<<<< HEAD + mlock_vma_page(page, vma, false); ++======= + mlock_vma_folio(folio, vma, false); ++>>>>>>> folio/for-next page_vma_mapped_walk_done(&pvmw); ret = false; break; } ++<<<<<<< HEAD + subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte); ++======= + subpage = folio_page(folio, + pte_pfn(*pvmw.pte) - folio_pfn(folio)); ++>>>>>>> folio/for-next address = pvmw.address; - if (PageHuge(page) && !PageAnon(page)) { + if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) { /* * To call huge_pmd_unshare, i_mmap_rwsem must be * held in write mode. Caller needs to explicitly @@@ -1590,31 -1577,8 +1600,36 @@@ } /* MADV_FREE page check */ ++<<<<<<< HEAD + if (!PageSwapBacked(page)) { + int ref_count, map_count; + + /* + * Synchronize with gup_pte_range(): + * - clear PTE; barrier; read refcount + * - inc refcount; barrier; read PTE + */ + smp_mb(); + + ref_count = page_count(page); + map_count = page_mapcount(page); + + /* + * Order reads for page refcount and dirty flag; + * see __remove_mapping(). + */ + smp_rmb(); + + /* + * The only page refs must be from the isolation + * plus one or more rmap's (dropped by discard:). + */ + if ((ref_count == 1 + map_count) && + !PageDirty(page)) { ++======= + if (!folio_test_swapbacked(folio)) { + if (!folio_test_dirty(folio)) { ++>>>>>>> folio/for-next /* Invalidate as we cleared the pte */ mmu_notifier_invalidate_range(mm, address, address + PAGE_SIZE); @@@ -1683,10 -1648,10 +1699,17 @@@ discard * * See Documentation/vm/mmu_notifier.rst */ ++<<<<<<< HEAD + page_remove_rmap(subpage, vma, PageHuge(page)); + if (vma->vm_flags & VM_LOCKED) + mlock_page_drain(smp_processor_id()); + put_page(page); ++======= + page_remove_rmap(subpage, vma, folio_test_hugetlb(folio)); + if (vma->vm_flags & VM_LOCKED) + mlock_page_drain(smp_processor_id()); + folio_put(folio); ++>>>>>>> folio/for-next } mmu_notifier_invalidate_range_end(&range); @@@ -1852,8 -1815,8 +1873,13 @@@ static bool try_to_migrate_one(struct f /* Update high watermark before we lower rss */ update_hiwater_rss(mm); ++<<<<<<< HEAD + if (is_device_private_page(page)) { + unsigned long pfn = page_to_pfn(page); ++======= + if (folio_is_zone_device(folio)) { + unsigned long pfn = folio_pfn(folio); ++>>>>>>> folio/for-next swp_entry_t entry; pte_t swp_pte; @@@ -1891,11 -1852,11 +1917,15 @@@ * changed when hugepage migrations to device private * memory are supported. 
*/ ++<<<<<<< HEAD + subpage = page; ++======= + subpage = &folio->page; ++>>>>>>> folio/for-next } else if (PageHWPoison(subpage)) { pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); - if (PageHuge(page)) { - hugetlb_count_sub(compound_nr(page), mm); + if (folio_test_hugetlb(folio)) { + hugetlb_count_sub(folio_nr_pages(folio), mm); set_huge_swap_pte_at(mm, address, pvmw.pte, pteval, vma_mmu_pagesize(vma)); @@@ -1963,10 -1922,10 +1993,17 @@@ * * See Documentation/vm/mmu_notifier.rst */ ++<<<<<<< HEAD + page_remove_rmap(subpage, vma, PageHuge(page)); + if (vma->vm_flags & VM_LOCKED) + mlock_page_drain(smp_processor_id()); + put_page(page); ++======= + page_remove_rmap(subpage, vma, folio_test_hugetlb(folio)); + if (vma->vm_flags & VM_LOCKED) + mlock_page_drain(smp_processor_id()); + folio_put(folio); ++>>>>>>> folio/for-next } mmu_notifier_invalidate_range_end(&range); @@@ -1999,8 -1958,7 +2036,12 @@@ void try_to_migrate(struct folio *folio TTU_SYNC))) return; ++<<<<<<< HEAD + if (is_zone_device_page(page) && + (!is_device_private_page(page) && !is_device_coherent_page(page))) ++======= + if (folio_is_zone_device(folio) && !folio_is_device_private(folio)) ++>>>>>>> folio/for-next return; /* @@@ -2015,9 -1973,9 +2056,13 @@@ rwc.invalid_vma = invalid_migration_vma; if (flags & TTU_RMAP_LOCKED) - rmap_walk_locked(page, &rwc); + rmap_walk_locked(folio, &rwc); else ++<<<<<<< HEAD + rmap_walk(page, &rwc); ++======= + rmap_walk(folio, &rwc); ++>>>>>>> folio/for-next } #ifdef CONFIG_DEVICE_PRIVATE diff --cc mm/vmscan.c index 5f471c1e279f,7db5d0237333..000000000000 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@@ -986,12 -985,23 +986,12 @@@ static inline int is_page_cache_freeabl * that isolated the page, the page cache and optional buffer * heads at page->private. */ - int page_cache_pins = thp_nr_pages(page); - return page_count(page) - page_has_private(page) == 1 + page_cache_pins; + return folio_ref_count(folio) - folio_test_private(folio) == + 1 + folio_nr_pages(folio); } -static int may_write_to_inode(struct inode *inode) -{ - if (current->flags & PF_SWAPWRITE) - return 1; - if (!inode_write_congested(inode)) - return 1; - if (inode_to_bdi(inode) == current->backing_dev_info) - return 1; - return 0; -} - /* - * We detected a synchronous write error writing a page out. Probably + * We detected a synchronous write error writing a folio out. Probably * -ENOSPC. We need to propagate that into the address_space for a subsequent * fsync(), msync() or close(). 
* @@@ -1191,8 -1201,10 +1191,8 @@@ static pageout_t pageout(struct folio * } if (mapping->a_ops->writepage == NULL) return PAGE_ACTIVATE; - if (!may_write_to_inode(mapping->host)) - return PAGE_KEEP; - if (clear_page_dirty_for_io(page)) { + if (folio_clear_dirty_for_io(folio)) { int res; struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, @@@ -1365,19 -1383,19 +1371,24 @@@ enum page_references PAGEREF_ACTIVATE, }; - static enum page_references page_check_references(struct page *page, + static enum page_references folio_check_references(struct folio *folio, struct scan_control *sc) { - int referenced_ptes, referenced_page; + int referenced_ptes, referenced_folio; unsigned long vm_flags; - referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup, - &vm_flags); - referenced_page = TestClearPageReferenced(page); + referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup, + &vm_flags); + referenced_folio = folio_test_clear_referenced(folio); /* ++<<<<<<< HEAD + * The supposedly reclaimable page was found to be in a VM_LOCKED vma. + * Let the page, now marked Mlocked, be moved to the unevictable list. ++======= + * The supposedly reclaimable folio was found to be in a VM_LOCKED vma. + * Let the folio, now marked Mlocked, be moved to the unevictable list. ++>>>>>>> folio/for-next */ if (vm_flags & VM_LOCKED) return PAGEREF_ACTIVATE; @@@ -1566,8 -1586,10 +1579,15 @@@ retry * end of the LRU a second time. */ mapping = page_mapping(page); ++<<<<<<< HEAD + if (writeback && PageReclaim(page)) + stat->nr_congested++; ++======= + if (((dirty || writeback) && mapping && + inode_write_congested(mapping->host)) || + (writeback && PageReclaim(page))) + stat->nr_congested += nr_pages; ++>>>>>>> folio/for-next /* * If a page at the tail of the LRU is under writeback, there @@@ -1716,9 -1738,9 +1736,15 @@@ /* Adding to swap updated mapping */ mapping = page_mapping(page); } ++<<<<<<< HEAD + } else if (unlikely(PageTransHuge(page))) { + /* Split file/lazyfree THP */ + if (split_huge_page_to_list(page, page_list)) ++======= + } else if (PageSwapBacked(page) && PageTransHuge(page)) { + /* Split shmem THP */ + if (split_folio_to_list(folio, page_list)) ++>>>>>>> folio/for-next goto keep_locked; }
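
Note on the include/linux/mm.h hunk above: the following is only an illustration of one way that conflict could be resolved, not the resolution actually committed to linux-next. Because folio_mapcount() already returns the total number of mappings of the whole compound page (which is exactly what the folio/for-next inline relies on), the inline wrapper can stand in for HEAD's out-of-line total_mapcount() declaration, provided the out-of-line definition that HEAD carries in the mm/huge_memory.c hunk is dropped as part of the same resolution. A minimal sketch of the merged region, under that assumption:

int folio_mapcount(struct folio *folio);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Resolution sketch only: keep the folio-based inline from folio/for-next
 * and drop HEAD's "int total_mapcount(struct page *page);" declaration,
 * since folio_mapcount() subsumes the open-coded loop that HEAD adds back
 * in mm/huge_memory.c.
 */
static inline int total_mapcount(struct page *page)
{
	return folio_mapcount(page_folio(page));
}

int page_trans_huge_mapcount(struct page *page);
#else
/* ... the !CONFIG_TRANSPARENT_HUGEPAGE branch is unchanged context ... */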
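
Note on the migrate_vma_setup()/migrate_vma_pages()/migrate_vma_finalize() kernel-doc brought in by the mm/migrate.c hunk: the documented caller sequence is collect-and-unmap, allocate and copy destination pages, migrate the struct page metadata, then finalize. The sketch below walks that sequence from a hypothetical device driver; my_drv_alloc_page() and my_drv_copy_page() are made-up placeholders for the driver's allocator and copy engine, the range is capped at 64 pages so the pfn arrays can live on the stack, and error handling is reduced to the bare minimum.

#include <linux/migrate.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

static int my_drv_migrate_range(struct vm_area_struct *vma,
				unsigned long start, unsigned long end,
				void *pgmap_owner)
{
	unsigned long src_pfns[64] = { 0 };
	unsigned long dst_pfns[64] = { 0 };
	struct migrate_vma args = {
		.vma		= vma,
		.start		= start,
		.end		= end,
		.src		= src_pfns,
		.dst		= dst_pfns,
		.pgmap_owner	= pgmap_owner,
		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
	};
	unsigned long i;
	int ret;

	if ((end - start) >> PAGE_SHIFT > ARRAY_SIZE(src_pfns))
		return -EINVAL;

	/* Collect, lock and unmap the source pages. */
	ret = migrate_vma_setup(&args);
	if (ret)
		return ret;

	for (i = 0; i < args.npages; i++) {
		struct page *spage = migrate_pfn_to_page(args.src[i]);
		struct page *dpage;

		/* Pages the core could not isolate are simply skipped. */
		if (!(args.src[i] & MIGRATE_PFN_MIGRATE))
			continue;

		/*
		 * Placeholder allocator; assumed to return a zeroed page so
		 * that an unpopulated source (spage == NULL) stays correct.
		 */
		dpage = my_drv_alloc_page();
		if (!dpage)
			continue;

		/* Destination pages must be locked, per the kernel-doc. */
		lock_page(dpage);
		if (spage)
			my_drv_copy_page(spage, dpage);	/* placeholder */

		args.dst[i] = migrate_pfn(page_to_pfn(dpage)) |
			      MIGRATE_PFN_VALID;
	}

	/* Move struct page metadata, then restore the CPU page tables. */
	migrate_vma_pages(&args);
	migrate_vma_finalize(&args);
	return 0;
}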