From: Ryan Roberts <ryan.roberts@arm.com>
To: Andrew Morton <akpm@linux-foundation.org>,
	"Matthew Wilcox (Oracle)" <willy@infradead.org>,
	Yu Zhao <yuzhao@google.com>,
	"Yin, Fengwei" <fengwei.yin@intel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>,
	linux-mm@kvack.org, linux-arm-kernel@lists.infradead.org
Subject: [RFC v2 PATCH 14/17] mm: Copy large folios for anonymous memory
Date: Fri, 14 Apr 2023 14:03:00 +0100
Message-ID: <20230414130303.2345383-15-ryan.roberts@arm.com>
In-Reply-To: <20230414130303.2345383-1-ryan.roberts@arm.com>

When taking a write fault on an anonymous page, if we are unable to
reuse the folio (due to it being mapped by others), do CoW for the
entire folio instead of just a single page.

We assume that the size of the anonymous folio chosen at allocation time
is still a good choice and therefore it is better to copy the entire
folio rather than a single page. It does not seem wise to do this for
file-backed folios, since the folio size chosen there is related to the
system-wide usage of the file. So we continue to CoW a single page for
file-backed mappings.

There are edge cases where the original mapping has been mremapped or
partially munmapped, in which case the source folio may not be naturally
aligned in the virtual address space. When that happens, we CoW an
aligned power-of-2 portion of the source folio. A similar effect occurs
when allocation of a high-order destination folio fails: we reduce the
order and retry until allocation succeeds, falling back to order-0 if
necessary.
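
The following is a minimal, self-contained sketch of that fallback (an
illustration only, not part of this patch; the names and values are
hypothetical userspace stand-ins for the real calculation done in
calc_anon_folio_order_copy() below):

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

/*
 * Pick the largest order <= max_order whose naturally aligned block
 * containing 'addr' lies entirely within [start, end).
 */
static int pick_order(unsigned long addr, unsigned long start,
		      unsigned long end, int max_order)
{
	int order;

	for (order = max_order; order > 1; order--) {
		unsigned long size = PAGE_SIZE << order;
		unsigned long base = addr & ~(size - 1);

		if (base >= start && base + size <= end)
			return order;
	}

	/* Order-1 is never used; fall back to a single page. */
	return 0;
}

int main(void)
{
	/* A 16-page folio whose start was left unaligned by mremap. */
	unsigned long start = 0x7f0000003000UL;
	unsigned long end = start + 16 * PAGE_SIZE;
	unsigned long fault = start + 9 * PAGE_SIZE;

	printf("chosen order: %d\n", pick_order(fault, start, end, 4));
	return 0;
}

With these example values the order-4 (16-page) block containing the
fault address starts before the folio, so the sketch settles on order-3
and only that aligned 8-page portion would be copied.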

Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
---
 mm/memory.c | 242 ++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 207 insertions(+), 35 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index f2b7cfb2efc0..61cec97a57f3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3086,6 +3086,30 @@ static inline int check_ptes_none(pte_t *pte, int nr)
 	return nr;
 }

+/*
+ * Returns index of first pte that is not mapped RO and physically contiguously
+ * starting at pfn, or nr if all are correct.
+ */
+static inline int check_ptes_contig_ro(pte_t *pte, int nr, unsigned long pfn)
+{
+	int i;
+	pte_t entry;
+
+	for (i = 0; i < nr; i++) {
+		entry = *pte++;
+
+		if (!pte_present(entry) ||
+		    pte_write(entry) ||
+		    pte_protnone(entry) ||
+		    pte_pfn(entry) != pfn)
+			return i;
+
+		pfn++;
+	}
+
+	return nr;
+}
+
 static int calc_anon_folio_order_alloc(struct vm_fault *vmf, int order)
 {
 	/*
@@ -3155,6 +3179,94 @@ static int calc_anon_folio_order_alloc(struct vm_fault *vmf, int order)
 	return order;
 }

+static int calc_anon_folio_order_copy(struct vm_fault *vmf,
+					struct folio *old_folio, int order)
+{
+	/*
+	 * The aim here is to determine what size of folio we should allocate as
+	 * the destination for this CoW fault. Factors include:
+	 * - Order must not be higher than `order` upon entry
+	 * - Folio must be naturally aligned within VA space
+	 * - Folio must not breach boundaries of vma
+	 * - Folio must be fully contained inside one pmd entry
+	 * - All covered ptes must be present, physically contiguous and RO
+	 * - All covered ptes must be mapped to old_folio
+	 *
+	 * Additionally, we do not allow order-1 since this breaks assumptions
+	 * elsewhere in the mm; THP pages must be at least order-2 (since they
+	 * store state up to the 3rd struct page subpage), and these pages must
+	 * be THP in order to correctly use pre-existing THP infrastructure such
+	 * as folio_split().
+	 *
+	 * As a consequence of relying on the THP infrastructure, if the system
+	 * does not support THP, we always fallback to order-0.
+	 *
+	 * Note that old_folio may not be naturally aligned in VA space due to
+	 * mremap. We deliberately force alignment of the new folio to simplify
+	 * fallback, so in this unaligned case we will end up only copying a
+	 * portion of old_folio.
+	 *
+	 * Note that the caller may or may not choose to lock the pte. If
+	 * unlocked, the calculation should be considered an estimate that will
+	 * need to be validated under the lock.
+	 */
+
+	struct vm_area_struct *vma = vmf->vma;
+	int nr;
+	unsigned long addr;
+	pte_t *pte;
+	pte_t *first_bad = NULL;
+	int ret;
+	unsigned long start, end;
+	unsigned long offset;
+	unsigned long pfn;
+
+	if (has_transparent_hugepage()) {
+		order = min(order, PMD_SHIFT - PAGE_SHIFT);
+
+		start = page_addr(&old_folio->page, vmf->page, vmf->address);
+		start = max(start, vma->vm_start);
+
+		end = page_addr(&old_folio->page + folio_nr_pages(old_folio),
+			vmf->page, vmf->address);
+		end = min(end, vma->vm_end);
+
+		for (; order > 1; order--) {
+			nr = 1 << order;
+			addr = ALIGN_DOWN(vmf->address, nr << PAGE_SHIFT);
+			offset = ((vmf->address - addr) >> PAGE_SHIFT);
+			pfn = page_to_pfn(vmf->page) - offset;
+			pte = vmf->pte - offset;
+
+			/* Check vma and folio bounds. */
+			if (addr < start ||
+			    addr + (nr << PAGE_SHIFT) > end)
+				continue;
+
+			/* Ptes covered by order already known to be good. */
+			if (pte + nr <= first_bad)
+				break;
+
+			/* Already found bad pte in range covered by order. */
+			if (pte <= first_bad)
+				continue;
+
+			/* Need to check if all the ptes are good. */
+			ret = check_ptes_contig_ro(pte, nr, pfn);
+			if (ret == nr)
+				break;
+
+			first_bad = pte + ret;
+		}
+
+		if (order == 1)
+			order = 0;
+	} else
+		order = 0;
+
+	return order;
+}
+
 static void calc_anon_folio_range_reuse(struct vm_fault *vmf,
 					struct folio *folio,
 					struct anon_folio_range *range_out)
@@ -3366,6 +3478,14 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 	int page_copied = 0;
 	struct mmu_notifier_range range;
 	int ret;
+	pte_t orig_pte;
+	unsigned long addr = vmf->address;
+	int order = 0;
+	int pgcount = BIT(order);
+	unsigned long offset = 0;
+	unsigned long pfn;
+	struct page *page;
+	int i;

 	delayacct_wpcopy_start();

@@ -3375,20 +3495,39 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 		goto oom;

 	if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
-		new_folio = vma_alloc_zeroed_movable_folio(vma, vmf->address,
-									0, 0);
+		new_folio = vma_alloc_movable_folio(vma, vmf->address, 0, true);
 		if (!new_folio)
 			goto oom;
 	} else {
-		new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
-				vmf->address, false);
+		if (old_folio && folio_test_anon(old_folio)) {
+			order = min_t(int, folio_order(old_folio),
+						max_anon_folio_order(vma));
+retry:
+			/*
+			 * Estimate the folio order to allocate. We are not
+			 * under the ptl here so this estimate needs to be
+			 * re-checked later once we have the lock.
+			 */
+			vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
+			order = calc_anon_folio_order_copy(vmf, old_folio, order);
+			pte_unmap(vmf->pte);
+		}
+
+		new_folio = try_vma_alloc_movable_folio(vma, vmf->address,
+							order, false);
 		if (!new_folio)
 			goto oom;

+		/* We may have been granted less than we asked for. */
+		order = folio_order(new_folio);
+		pgcount = BIT(order);
+		addr = ALIGN_DOWN(vmf->address, pgcount << PAGE_SHIFT);
+		offset = ((vmf->address - addr) >> PAGE_SHIFT);
+
 		if (likely(old_folio))
 			ret = __wp_page_copy_user_range(&new_folio->page,
-							vmf->page,
-							1, vmf->address, vma);
+							vmf->page - offset,
+							pgcount, addr, vma);
 		else
 			ret = __wp_page_copy_user_pfn(&new_folio->page, vmf);
 		if (ret) {
@@ -3410,39 +3549,31 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)

 	if (mem_cgroup_charge(new_folio, mm, GFP_KERNEL))
 		goto oom_free_new;
-	cgroup_throttle_swaprate(&new_folio->page, GFP_KERNEL);
+	folio_throttle_swaprate(new_folio, GFP_KERNEL);

 	__folio_mark_uptodate(new_folio);

 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
-				vmf->address & PAGE_MASK,
-				(vmf->address & PAGE_MASK) + PAGE_SIZE);
+				addr, addr + (pgcount << PAGE_SHIFT));
 	mmu_notifier_invalidate_range_start(&range);

 	/*
-	 * Re-check the pte - we dropped the lock
+	 * Re-check the pte(s) - we dropped the lock
 	 */
-	vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
-	if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
+	vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
+	pfn = pte_pfn(vmf->orig_pte) - offset;
+	if (likely(check_ptes_contig_ro(vmf->pte, pgcount, pfn) == pgcount)) {
 		if (old_folio) {
 			if (!folio_test_anon(old_folio)) {
+				VM_BUG_ON(order != 0);
 				dec_mm_counter(mm, mm_counter_file(&old_folio->page));
 				inc_mm_counter(mm, MM_ANONPAGES);
 			}
 		} else {
+			VM_BUG_ON(order != 0);
 			inc_mm_counter(mm, MM_ANONPAGES);
 		}
-		flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
-		entry = mk_pte(&new_folio->page, vma->vm_page_prot);
-		entry = pte_sw_mkyoung(entry);
-		if (unlikely(unshare)) {
-			if (pte_soft_dirty(vmf->orig_pte))
-				entry = pte_mksoft_dirty(entry);
-			if (pte_uffd_wp(vmf->orig_pte))
-				entry = pte_mkuffd_wp(entry);
-		} else {
-			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-		}
+		flush_cache_range(vma, addr, addr + (pgcount << PAGE_SHIFT));

 		/*
 		 * Clear the pte entry and flush it first, before updating the
@@ -3451,17 +3582,40 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 		 * that left a window where the new PTE could be loaded into
 		 * some TLBs while the old PTE remains in others.
 		 */
-		ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
-		folio_add_new_anon_rmap(new_folio, vma, vmf->address);
+		ptep_clear_flush_range_notify(vma, addr, vmf->pte, pgcount);
+		folio_ref_add(new_folio, pgcount - 1);
+		folio_add_new_anon_rmap_range(new_folio, &new_folio->page,
+							pgcount, vma, addr);
 		folio_add_lru_vma(new_folio, vma);
 		/*
 		 * We call the notify macro here because, when using secondary
 		 * mmu page tables (such as kvm shadow page tables), we want the
 		 * new page to be mapped directly into the secondary page table.
 		 */
-		BUG_ON(unshare && pte_write(entry));
-		set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
-		update_mmu_cache(vma, vmf->address, vmf->pte);
+		page = &new_folio->page;
+		for (i = 0; i < pgcount; i++, page++) {
+			entry = mk_pte(page, vma->vm_page_prot);
+			entry = pte_sw_mkyoung(entry);
+			if (unlikely(unshare)) {
+				orig_pte = vmf->pte[i];
+				if (pte_soft_dirty(orig_pte))
+					entry = pte_mksoft_dirty(entry);
+				if (pte_uffd_wp(orig_pte))
+					entry = pte_mkuffd_wp(entry);
+			} else {
+				entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+			}
+			/*
+			 * TODO: Batch for !unshare case. Could use set_ptes(),
+			 * but currently there is no arch-agnostic way to
+			 * increment pte values by pfn so can't do the notify
+			 * part. So currently stuck creating the pte from
+			 * scratch every iteration.
+			 */
+			set_pte_at_notify(mm, addr + (i << PAGE_SHIFT),
+						vmf->pte + i, entry);
+		}
+		update_mmu_cache_range(vma, addr, vmf->pte, pgcount);
 		if (old_folio) {
 			/*
 			 * Only after switching the pte to the new page may
@@ -3473,10 +3627,10 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 			 * threads.
 			 *
 			 * The critical issue is to order this
-			 * page_remove_rmap with the ptp_clear_flush above.
-			 * Those stores are ordered by (if nothing else,)
+			 * folio_remove_rmap_range with the ptp_clear_flush
+			 * above. Those stores are ordered by (if nothing else,)
 			 * the barrier present in the atomic_add_negative
-			 * in page_remove_rmap.
+			 * in folio_remove_rmap_range.
 			 *
 			 * Then the TLB flush in ptep_clear_flush ensures that
 			 * no process can access the old page before the
@@ -3485,14 +3639,30 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 			 * mapcount is visible. So transitively, TLBs to
 			 * old page will be flushed before it can be reused.
 			 */
-			page_remove_rmap(vmf->page, vma, false);
+			folio_remove_rmap_range(old_folio,
+						vmf->page - offset,
+						pgcount, vma);
 		}

 		/* Free the old page.. */
 		new_folio = old_folio;
 		page_copied = 1;
 	} else {
-		update_mmu_tlb(vma, vmf->address, vmf->pte);
+		pte_t *pte = vmf->pte + ((vmf->address - addr) >> PAGE_SHIFT);
+
+		/*
+		 * If faulting pte was serviced by another, exit early. Else try
+		 * again, with a lower order.
+		 */
+		if (order > 0 && pte_same(*pte, vmf->orig_pte)) {
+			pte_unmap_unlock(vmf->pte, vmf->ptl);
+			mmu_notifier_invalidate_range_only_end(&range);
+			folio_put(new_folio);
+			order--;
+			goto retry;
+		}
+
+		update_mmu_tlb(vma, vmf->address, pte);
 	}

 	if (new_folio)
@@ -3505,9 +3675,11 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 	 */
 	mmu_notifier_invalidate_range_only_end(&range);
 	if (old_folio) {
-		if (page_copied)
+		if (page_copied) {
 			free_swap_cache(&old_folio->page);
-		folio_put(old_folio);
+			folio_put_refs(old_folio, pgcount);
+		} else
+			folio_put(old_folio);
 	}

 	delayacct_wpcopy_end();
--
2.25.1


