From 0b41b082f06535f092e78bdd4f0501ddbb7de165 Mon Sep 17 00:00:00 2001
From: Peter Xu
Date: Fri, 18 Sep 2020 11:56:27 -0400
Subject: [PATCH] mm: Do early cow for FOLL_PIN pages during fork()

Co-developed-by: Jason Gunthorpe
Signed-off-by: Peter Xu
---
 include/linux/mm.h       |  2 +-
 include/linux/mm_types.h |  1 +
 kernel/fork.c            |  3 +-
 mm/gup.c                 | 15 +++++++
 mm/memory.c              | 85 +++++++++++++++++++++++++++++++++++-----
 5 files changed, 94 insertions(+), 12 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index ca6e6a81576b..bf1ac54be55e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1644,7 +1644,7 @@ struct mmu_notifier_range;
 void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
 		unsigned long end, unsigned long floor, unsigned long ceiling);
 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
-			struct vm_area_struct *vma);
+			struct vm_area_struct *vma, struct vm_area_struct *new);
 int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
 		   struct mmu_notifier_range *range, pte_t **ptepp, pmd_t **pmdpp,
 		   spinlock_t **ptlp);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 496c3ff97cce..b3812fa6383f 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -458,6 +458,7 @@ struct mm_struct {
 
 		unsigned long total_vm;	   /* Total pages mapped */
 		unsigned long locked_vm;   /* Pages that have PG_mlocked set */
+		unsigned long has_pinned;  /* Whether this mm has pinned any page */
 		atomic64_t    pinned_vm;   /* Refcount permanently increased */
 		unsigned long data_vm;	   /* VM_WRITE & ~VM_SHARED & ~VM_STACK */
 		unsigned long exec_vm;	   /* VM_EXEC & ~VM_WRITE & ~VM_STACK */
diff --git a/kernel/fork.c b/kernel/fork.c
index 49677d668de4..843807ade6dd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -589,7 +589,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 		mm->map_count++;
 		if (!(tmp->vm_flags & VM_WIPEONFORK))
-			retval = copy_page_range(mm, oldmm, mpnt);
+			retval = copy_page_range(mm, oldmm, mpnt, tmp);
 
 		if (tmp->vm_ops && tmp->vm_ops->open)
 			tmp->vm_ops->open(tmp);
@@ -1011,6 +1011,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	mm_pgtables_bytes_init(mm);
 	mm->map_count = 0;
 	mm->locked_vm = 0;
+	mm->has_pinned = 0;
 	atomic64_set(&mm->pinned_vm, 0);
 	memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
 	spin_lock_init(&mm->page_table_lock);
diff --git a/mm/gup.c b/mm/gup.c
index e5739a1974d5..cab10cefefe4 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1255,6 +1255,17 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
 		BUG_ON(*locked != 1);
 	}
 
+	/*
+	 * Mark the mm struct if there's any page pinning attempt. We're
+	 * aggressive on this bit since even if the pinned pages were unpinned
+	 * later on, we'll still keep this bit set for this address space just
+	 * to make everything easy.
+	 *
+	 * TODO: Ideally we could use mm->pinned_vm, but only once it's stable.
+	 */
+	if (flags & FOLL_PIN)
+		WRITE_ONCE(mm->has_pinned, 1);
+
 	/*
 	 * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
 	 * is to set FOLL_GET if the caller wants pages[] filled in (but has
@@ -2660,6 +2671,10 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
 				       FOLL_FAST_ONLY)))
 		return -EINVAL;
 
+	/* Please refer to comments in __get_user_pages_locked() */
+	if (gup_flags & FOLL_PIN)
+		WRITE_ONCE(current->mm->has_pinned, 1);
+
 	if (!(gup_flags & FOLL_FAST_ONLY))
 		might_lock_read(&current->mm->mmap_lock);
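
For context, the sketch below is not part of the patch; it shows one common way
userspace ends up with FOLL_PIN pages before a fork(): registering a fixed
buffer with io_uring, which long-term pins the backing page through
pin_user_pages() and therefore reaches the has_pinned marking above. It assumes
liburing is installed; the buffer size and error handling are illustrative only.

/*
 * Illustrative userspace sketch (not part of the patch). io_uring fixed
 * buffer registration pins pages with FOLL_PIN | FOLL_LONGTERM, so the mm
 * carries has_pinned == 1 into any later fork().
 */
#include <liburing.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	struct io_uring ring;
	struct iovec iov;
	void *buf;
	int ret;

	/* Anonymous private page; this is what will get long-term pinned. */
	buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(buf, 0x5a, 4096);

	ret = io_uring_queue_init(8, &ring, 0);
	if (ret < 0) {
		fprintf(stderr, "io_uring_queue_init: %s\n", strerror(-ret));
		return 1;
	}

	/* Pins the backing page; the kernel now sets mm->has_pinned. */
	iov.iov_base = buf;
	iov.iov_len  = 4096;
	ret = io_uring_register_buffers(&ring, &iov, 1);
	if (ret < 0) {
		fprintf(stderr, "io_uring_register_buffers: %s\n", strerror(-ret));
		return 1;
	}

	/*
	 * With this patch, copy_one_pte() notices the pin during this fork()
	 * and hands the child its own copy instead of write-protecting the
	 * page in both processes.
	 */
	if (fork() == 0)
		_exit(0);

	wait(NULL);
	io_uring_queue_exit(&ring);
	return 0;
}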
diff --git a/mm/memory.c b/mm/memory.c
index 469af373ae76..be9e2946c462 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -689,6 +689,38 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
 }
 #endif
 
+/*
+ * Do early cow for the page and the pte. Return true if the page was
+ * duplicated successfully, false otherwise.
+ */
+static bool duplicate_page(struct mm_struct *mm, struct vm_area_struct *vma,
+			   unsigned long address, struct page *page,
+			   pte_t *newpte)
+{
+	struct page *new_page;
+	pte_t entry;
+
+	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+	if (!new_page)
+		return false;
+
+	copy_user_highpage(new_page, page, address, vma);
+	if (mem_cgroup_charge(new_page, mm, GFP_KERNEL)) {
+		put_page(new_page);
+		return false;
+	}
+	cgroup_throttle_swaprate(new_page, GFP_KERNEL);
+	__SetPageUptodate(new_page);
+
+	entry = mk_pte(new_page, vma->vm_page_prot);
+	entry = pte_sw_mkyoung(entry);
+	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+	*newpte = entry;
+	page_add_new_anon_rmap(new_page, vma, address, false);
+
+	return true;
+}
+
 /*
  * copy one vm_area from one task to the other. Assumes the page tables
  * already present in the new task to be cleared in the whole range
@@ -698,11 +730,13 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
 static inline unsigned long
 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
+		struct vm_area_struct *new,
 		unsigned long addr, int *rss)
 {
 	unsigned long vm_flags = vma->vm_flags;
 	pte_t pte = *src_pte;
 	struct page *page;
+	bool wp;
 
 	/* pte contains position in swap or file, so copy. */
 	if (unlikely(!pte_present(pte))) {
@@ -779,10 +813,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	 * If it's a COW mapping, write protect it both
 	 * in the parent and the child
 	 */
-	if (is_cow_mapping(vm_flags) && pte_write(pte)) {
-		ptep_set_wrprotect(src_mm, addr, src_pte);
-		pte = pte_wrprotect(pte);
-	}
+	wp = is_cow_mapping(vm_flags) && pte_write(pte);
 
 	/*
 	 * If it's a shared mapping, mark it clean in
@@ -805,6 +836,36 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		get_page(page);
 		page_dup_rmap(page, false);
 		rss[mm_counter(page)]++;
+
+		/*
+		 * If the page is pinned in the source mm, do the early cow
+		 * right now so that the pinned page won't be replaced by
+		 * another random page without being noticed after the fork().
+		 *
+		 * Note: there can be some very rare cases where we do an
+		 * unnecessary cow here, because page_maybe_dma_pinned() is
+		 * sometimes bogus and the has_pinned flag is currently
+		 * aggressive too. However this should be good enough for us
+		 * for now, as long as we cover all the pinned pages.
+		 *
+		 * If the page copy succeeded, undo the write protection.
+		 * Otherwise we are probably short on memory; we could either
+		 * report that up so the fork() fails, or, for simplicity,
+		 * keep the old behavior of leaving the page shared.
+		 *
+		 * Note: we pass in src_mm here for accounting because
+		 * copy_namespaces() has not yet been done when copy_mm()
+		 * happens.
+		 */
+		if (wp && READ_ONCE(src_mm->has_pinned) &&
+		    page_maybe_dma_pinned(page) &&
+		    duplicate_page(src_mm, new, addr, page, &pte))
+			wp = false;
+	}
+
+	if (wp) {
+		ptep_set_wrprotect(src_mm, addr, src_pte);
+		pte = pte_wrprotect(pte);
 	}
 
 out_set_pte:
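
The comment above leans on page_maybe_dma_pinned() being a heuristic. As a
rough standalone model (plain userspace C, not kernel code; fake_page and the
helper names are made up, only the GUP_PIN_COUNTING_BIAS value mirrors the
kernel), the check boils down to comparing the page refcount against a large
pin bias, which is why roughly a thousand ordinary references can look like a
pin and why such a false positive costs nothing worse than one extra copy:

/*
 * Standalone model of the refcount-bias heuristic behind
 * page_maybe_dma_pinned() for order-0 pages: FOLL_PIN bumps the refcount
 * by GUP_PIN_COUNTING_BIAS, and the check asks whether the refcount has
 * reached that bias.
 */
#include <stdbool.h>
#include <stdio.h>

#define GUP_PIN_COUNTING_BIAS 1024	/* value used by the kernel */

struct fake_page {
	int refcount;			/* models page->_refcount */
};

static void fake_get_page(struct fake_page *p)   { p->refcount += 1; }
static void fake_pin_page(struct fake_page *p)   { p->refcount += GUP_PIN_COUNTING_BIAS; }
static void fake_unpin_page(struct fake_page *p) { p->refcount -= GUP_PIN_COUNTING_BIAS; }

static bool fake_page_maybe_dma_pinned(struct fake_page *p)
{
	return p->refcount >= GUP_PIN_COUNTING_BIAS;
}

int main(void)
{
	struct fake_page page = { .refcount = 1 };	/* mapped once */

	fake_pin_page(&page);
	printf("pinned:    maybe_dma_pinned=%d\n", fake_page_maybe_dma_pinned(&page));

	fake_unpin_page(&page);
	printf("unpinned:  maybe_dma_pinned=%d\n", fake_page_maybe_dma_pinned(&page));

	/* False positive: enough ordinary references also trip the check. */
	for (int i = 0; i < GUP_PIN_COUNTING_BIAS; i++)
		fake_get_page(&page);
	printf("many gets: maybe_dma_pinned=%d\n", fake_page_maybe_dma_pinned(&page));
	return 0;
}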
@@ -814,6 +875,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 
 static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		   pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
+		   struct vm_area_struct *new,
 		   unsigned long addr, unsigned long end)
 {
 	pte_t *orig_src_pte, *orig_dst_pte;
@@ -852,7 +914,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 			continue;
 		}
 		entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
-							vma, addr, rss);
+							vma, new, addr, rss);
 		if (entry.val)
 			break;
 		progress += 8;
@@ -877,6 +939,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 
 static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
+		struct vm_area_struct *new,
 		unsigned long addr, unsigned long end)
 {
 	pmd_t *src_pmd, *dst_pmd;
@@ -903,7 +966,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
 		if (pmd_none_or_clear_bad(src_pmd))
 			continue;
 		if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
-						vma, addr, next))
+						vma, new, addr, next))
 			return -ENOMEM;
 	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
 	return 0;
@@ -911,6 +974,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
 
 static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		p4d_t *dst_p4d, p4d_t *src_p4d, struct vm_area_struct *vma,
+		struct vm_area_struct *new,
 		unsigned long addr, unsigned long end)
 {
 	pud_t *src_pud, *dst_pud;
@@ -937,7 +1001,7 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src
 		if (pud_none_or_clear_bad(src_pud))
 			continue;
 		if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
-						vma, addr, next))
+						vma, new, addr, next))
 			return -ENOMEM;
 	} while (dst_pud++, src_pud++, addr = next, addr != end);
 	return 0;
@@ -945,6 +1009,7 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src
 
 static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
+		struct vm_area_struct *new,
 		unsigned long addr, unsigned long end)
 {
 	p4d_t *src_p4d, *dst_p4d;
@@ -959,14 +1024,14 @@ static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src
 		if (p4d_none_or_clear_bad(src_p4d))
 			continue;
 		if (copy_pud_range(dst_mm, src_mm, dst_p4d, src_p4d,
-						vma, addr, next))
+						vma, new, addr, next))
 			return -ENOMEM;
 	} while (dst_p4d++, src_p4d++, addr = next, addr != end);
 	return 0;
 }
 
 int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
-		struct vm_area_struct *vma)
+		struct vm_area_struct *vma, struct vm_area_struct *new)
 {
 	pgd_t *src_pgd, *dst_pgd;
 	unsigned long next;
@@ -1021,7 +1086,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		if (pgd_none_or_clear_bad(src_pgd))
 			continue;
 		if (unlikely(copy_p4d_range(dst_mm, src_mm, dst_pgd, src_pgd,
-						vma, addr, next))) {
+						vma, new, addr, next))) {
 			ret = -ENOMEM;
 			break;
 		}
-- 
2.26.2
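
To make the write-protect-then-COW behaviour that copy_one_pte() normally sets
up concrete, here is a minimal userspace illustration (again not part of the
patch): after fork(), the parent's write silently moves the parent onto a fresh
page while the child keeps the original, which is exactly the page movement
that must not happen underneath a DMA-pinned buffer.

/*
 * Minimal COW-on-fork demo in plain C.  Expected output: the child still
 * reads "original" after the parent has written "parent wrote".
 */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	char *buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	strcpy(buf, "original");

	pid_t pid = fork();
	if (pid == 0) {
		/* Child: give the parent time to write, then look again. */
		sleep(1);
		printf("child sees:  %s\n", buf);	/* still "original" */
		_exit(0);
	}

	/*
	 * Parent: this write faults, COWs, and moves the parent onto a new
	 * page while the child keeps the old one.  For a page the parent had
	 * pinned for DMA, that move is the problem the patch avoids.
	 */
	strcpy(buf, "parent wrote");
	printf("parent sees: %s\n", buf);
	waitpid(pid, NULL, 0);
	return 0;
}

With the patch applied and the parent holding a FOLL_PIN pin on the page, the
child instead receives its own copy via duplicate_page() at fork() time and the
parent keeps the pinned page mapped writable, so the parent's mapping and the
device's DMA target never diverge.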