From: Laurent Dufour <ldufour@linux.ibm.com>
To: akpm@linux-foundation.org, mhocko@kernel.org,
	peterz@infradead.org, kirill@shutemov.name, ak@linux.intel.com,
	dave@stgolabs.net, jack@suse.cz,
	Matthew Wilcox <willy@infradead.org>,
	aneesh.kumar@linux.ibm.com, benh@kernel.crashing.org,
	mpe@ellerman.id.au, paulus@samba.org,
	Thomas Gleixner <tglx@linutronix.de>,
	Ingo Molnar <mingo@redhat.com>,
	hpa@zytor.com, Will Deacon <will.deacon@arm.com>,
	Sergey Senozhatsky <sergey.senozhatsky@gmail.com>,
	sergey.senozhatsky.work@gmail.com,
	Andrea Arcangeli <aarcange@redhat.com>,
	Alexei Starovoitov <alexei.starovoitov@gmail.com>,
	kemi.wang@intel.com, Daniel Jordan <daniel.m.jordan@oracle.com>,
	David Rientjes <rientjes@google.com>,
	Jerome Glisse <jglisse@redhat.com>,
	Ganesh Mahendran <opensource.ganesh@gmail.com>,
	Minchan Kim <minchan@kernel.org>,
	Punit Agrawal <punitagrawal@gmail.com>,
	vinayak menon <vinayakm.list@gmail.com>,
	Yang Shi <yang.shi@linux.alibaba.com>,
	zhong jiang <zhongjiang@huawei.com>,
	Haiyan Song <haiyanx.song@intel.com>,
	Balbir Singh <bsingharora@gmail.com>,
	sj38.park@gmail.com, Michel Lespinasse <walken@google.com>,
	Mike Rapoport <rppt@linux.ibm.com>
Cc: linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	haren@linux.vnet.ibm.com, npiggin@gmail.com,
	paulmck@linux.vnet.ibm.com, Tim Chen <tim.c.chen@linux.intel.com>,
	linuxppc-dev@lists.ozlabs.org, x86@kernel.org
Subject: [PATCH v12 11/31] mm: protect mremap() against SPF handler
Date: Tue, 16 Apr 2019 15:45:02 +0200	[thread overview]
Message-ID: <20190416134522.17540-12-ldufour@linux.ibm.com> (raw)
In-Reply-To: <20190416134522.17540-1-ldufour@linux.ibm.com>

If a thread is remapping an area while another one is faulting on the
destination area, the SPF handler may fetch the vma from the RB tree before
the pte has been moved by the other thread. This means that the moved ptes
will overwrite those created by the page fault handler, leading to a page
leak.

	CPU 1				CPU 2
	enter mremap()
	unmap the dest area
	copy_vma()			Enter speculative page fault handler
	   >> at this time the dest area is present in the RB tree
					fetch the vma matching the dest area
					create a pte as the VMA matched
					Exit the SPF handler
					<data written in the new page>
	move_ptes()
	  > it is assumed that the dest area is empty,
	  > so the moved ptes overwrite the page mapped by CPU 2.
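
For context on the fix below: write-marking a VMA is what makes it
"unavailable" to the SPF handler. The speculative path samples the VMA's
vm_sequence seqcount (introduced earlier in this series) and falls back to
the regular, mmap_sem protected path while a writer is active. A minimal
sketch of that check, assuming the raw_read_seqcount() based validation
used by the speculative fault infrastructure patch:

	/*
	 * Sketch only, not part of this patch: the SPF handler is assumed
	 * to give up on a VMA whose sequence count is odd, i.e. taken
	 * between vm_raw_write_begin() and vm_raw_write_end().
	 */
	static bool spf_vma_is_stable(struct vm_area_struct *vma,
				      unsigned int *seq)
	{
		*seq = raw_read_seqcount(&vma->vm_sequence);

		/* Odd count: a writer such as move_vma() owns the VMA. */
		return !(*seq & 1);
	}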

To prevent that, when the VMA matching the dest area is extended or created
by copy_vma(), it should be marked as unavailable to the SPF handler.
The usual way to do so is to rely on vm_write_begin()/end().
This is already done in __vma_adjust(), which is called by copy_vma()
through vma_merge(), but __vma_adjust() calls vm_write_end() before
returning, which leaves a window open for another thread.
This patch adds a new 'keep_locked' parameter, passed from the new
__vma_merge() helper down to __vma_adjust(). With it set, copy_vma()
returns a vma that is still write-locked and that the caller must release
by calling vm_raw_write_end() once the ptes have been moved.
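
A condensed view of the resulting protocol on the mremap() path (success
case only, error handling omitted), matching the diff below:

	/* copy_vma() now returns new_vma with vm_raw_write_begin() applied */
	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
			   &need_rmap_locks);

	/* Also protect the source VMA while its ptes are being moved */
	if (vma != new_vma)
		vm_raw_write_begin(vma);

	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr,
				     old_len, need_rmap_locks);

	if (vma != new_vma)
		vm_raw_write_end(vma);
	/* The destination becomes visible to the SPF handler again */
	vm_raw_write_end(new_vma);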

Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
---
 include/linux/mm.h | 24 ++++++++++++++++-----
 mm/mmap.c          | 53 +++++++++++++++++++++++++++++++++++-----------
 mm/mremap.c        | 13 ++++++++++++
 3 files changed, 73 insertions(+), 17 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 906b9e06f18e..5d45b7d8718d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2343,18 +2343,32 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
 
 /* mmap.c */
 extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
+
 extern int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
 	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
-	struct vm_area_struct *expand);
+	struct vm_area_struct *expand, bool keep_locked);
+
 static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start,
 	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
 {
-	return __vma_adjust(vma, start, end, pgoff, insert, NULL);
+	return __vma_adjust(vma, start, end, pgoff, insert, NULL, false);
 }
-extern struct vm_area_struct *vma_merge(struct mm_struct *,
+
+extern struct vm_area_struct *__vma_merge(struct mm_struct *mm,
+	struct vm_area_struct *prev, unsigned long addr, unsigned long end,
+	unsigned long vm_flags, struct anon_vma *anon, struct file *file,
+	pgoff_t pgoff, struct mempolicy *mpol,
+	struct vm_userfaultfd_ctx uff, bool keep_locked);
+
+static inline struct vm_area_struct *vma_merge(struct mm_struct *mm,
 	struct vm_area_struct *prev, unsigned long addr, unsigned long end,
-	unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
-	struct mempolicy *, struct vm_userfaultfd_ctx);
+	unsigned long vm_flags, struct anon_vma *anon, struct file *file,
+	pgoff_t off, struct mempolicy *pol, struct vm_userfaultfd_ctx uff)
+{
+	return __vma_merge(mm, prev, addr, end, vm_flags, anon, file, off,
+			   pol, uff, false);
+}
+
 extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
 extern int __split_vma(struct mm_struct *, struct vm_area_struct *,
 	unsigned long addr, int new_below);
diff --git a/mm/mmap.c b/mm/mmap.c
index b77ec0149249..13460b38b0fb 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -714,7 +714,7 @@ static inline void __vma_unlink_prev(struct mm_struct *mm,
  */
 int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
 	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
-	struct vm_area_struct *expand)
+	struct vm_area_struct *expand, bool keep_locked)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct vm_area_struct *next = vma->vm_next, *orig_vma = vma;
@@ -830,8 +830,12 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
 
 			importer->anon_vma = exporter->anon_vma;
 			error = anon_vma_clone(importer, exporter);
-			if (error)
+			if (error) {
+				if (next && next != vma)
+					vm_raw_write_end(next);
+				vm_raw_write_end(vma);
 				return error;
+			}
 		}
 	}
 again:
@@ -1025,7 +1029,8 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
 
 	if (next && next != vma)
 		vm_raw_write_end(next);
-	vm_raw_write_end(vma);
+	if (!keep_locked)
+		vm_raw_write_end(vma);
 
 	validate_mm(mm);
 
@@ -1161,12 +1166,13 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
  * parameter) may establish ptes with the wrong permissions of NNNN
  * instead of the right permissions of XXXX.
  */
-struct vm_area_struct *vma_merge(struct mm_struct *mm,
+struct vm_area_struct *__vma_merge(struct mm_struct *mm,
 			struct vm_area_struct *prev, unsigned long addr,
 			unsigned long end, unsigned long vm_flags,
 			struct anon_vma *anon_vma, struct file *file,
 			pgoff_t pgoff, struct mempolicy *policy,
-			struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+			struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+			bool keep_locked)
 {
 	pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
 	struct vm_area_struct *area, *next;
@@ -1214,10 +1220,11 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 							/* cases 1, 6 */
 			err = __vma_adjust(prev, prev->vm_start,
 					 next->vm_end, prev->vm_pgoff, NULL,
-					 prev);
+					 prev, keep_locked);
 		} else					/* cases 2, 5, 7 */
 			err = __vma_adjust(prev, prev->vm_start,
-					 end, prev->vm_pgoff, NULL, prev);
+					   end, prev->vm_pgoff, NULL, prev,
+					   keep_locked);
 		if (err)
 			return NULL;
 		khugepaged_enter_vma_merge(prev, vm_flags);
@@ -1234,10 +1241,12 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 					     vm_userfaultfd_ctx)) {
 		if (prev && addr < prev->vm_end)	/* case 4 */
 			err = __vma_adjust(prev, prev->vm_start,
-					 addr, prev->vm_pgoff, NULL, next);
+					 addr, prev->vm_pgoff, NULL, next,
+					 keep_locked);
 		else {					/* cases 3, 8 */
 			err = __vma_adjust(area, addr, next->vm_end,
-					 next->vm_pgoff - pglen, NULL, next);
+					 next->vm_pgoff - pglen, NULL, next,
+					 keep_locked);
 			/*
 			 * In case 3 area is already equal to next and
 			 * this is a noop, but in case 8 "area" has
@@ -3259,9 +3268,20 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 
 	if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
 		return NULL;	/* should never get here */
-	new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
-			    vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
-			    vma->vm_userfaultfd_ctx);
+
+	/* There are 3 cases to manage here:
+	 *     AAAA            AAAA              AAAA              AAAA
+	 * PPPP....      PPPP......NNNN      PPPP....NNNN      PP........NN
+	 * PPPPPPPP(A)   PPPP..NNNNNNNN(B)   PPPPPPPPPPPP(1)       NULL
+	 *                                   PPPPPPPPNNNN(2)
+	 *                                   PPPPNNNNNNNN(3)
+	 *
+	 * new_vma == prev in case A,1,2
+	 * new_vma == next in case B,3
+	 */
+	new_vma = __vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
+			      vma->anon_vma, vma->vm_file, pgoff,
+			      vma_policy(vma), vma->vm_userfaultfd_ctx, true);
 	if (new_vma) {
 		/*
 		 * Source vma may have been merged into new_vma
@@ -3299,6 +3319,15 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 			get_file(new_vma->vm_file);
 		if (new_vma->vm_ops && new_vma->vm_ops->open)
 			new_vma->vm_ops->open(new_vma);
+		/*
+		 * As the VMA is linked right now, it may be hit by the
+		 * speculative page fault handler. But we don't want it
+		 * to start mapping pages in this area until the caller has
+		 * potentially moved the ptes from the source VMA. To prevent
+		 * that, we protect it right now, and let the caller unprotect
+		 * it once the move is done.
+		 */
+		vm_raw_write_begin(new_vma);
 		vma_link(mm, new_vma, prev, rb_link, rb_parent);
 		*need_rmap_locks = false;
 	}
diff --git a/mm/mremap.c b/mm/mremap.c
index fc241d23cd97..ae5c3379586e 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -357,6 +357,14 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 	if (!new_vma)
 		return -ENOMEM;
 
+	/* new_vma is returned protected by copy_vma, to prevent a speculative
+	 * page fault from being handled in the destination area before we
+	 * move the ptes. Now, we must also protect the source VMA since we
+	 * don't want pages to be mapped behind our back while copying the PTEs.
+	 */
+	if (vma != new_vma)
+		vm_raw_write_begin(vma);
+
 	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
 				     need_rmap_locks);
 	if (moved_len < old_len) {
@@ -373,6 +381,8 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 		 */
 		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
 				 true);
+		if (vma != new_vma)
+			vm_raw_write_end(vma);
 		vma = new_vma;
 		old_len = new_len;
 		old_addr = new_addr;
@@ -381,7 +391,10 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 		mremap_userfaultfd_prep(new_vma, uf);
 		arch_remap(mm, old_addr, old_addr + old_len,
 			   new_addr, new_addr + new_len);
+		if (vma != new_vma)
+			vm_raw_write_end(vma);
 	}
+	vm_raw_write_end(new_vma);
 
 	/* Conceal VM_ACCOUNT so old reservation is not undone */
 	if (vm_flags & VM_ACCOUNT) {
-- 
2.21.0


