linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2 0/3] per-vma locks in userfaultfd
@ 2024-01-29 19:35 Lokesh Gidra
  2024-01-29 19:35 ` [PATCH v2 1/3] userfaultfd: move userfaultfd_ctx struct to header file Lokesh Gidra
                   ` (3 more replies)
  0 siblings, 4 replies; 35+ messages in thread
From: Lokesh Gidra @ 2024-01-29 19:35 UTC (permalink / raw)
  To: akpm
  Cc: lokeshgidra, linux-fsdevel, linux-mm, linux-kernel, selinux,
	surenb, kernel-team, aarcange, peterx, david, axelrasmussen,
	bgeffon, willy, jannh, kaleshsingh, ngeoffray, timmurray, rppt

Performing userfaultfd operations (like copy/move etc.) in critical
section of mmap_lock (read-mode) causes significant contention on the
lock when operations requiring the lock in write-mode are taking place
concurrently. We can use per-vma locks instead to significantly reduce
the contention issue.

Changes since v1 [1]:
- rebase patches on 'mm-unstable' branch

[1] https://lore.kernel.org/all/20240126182647.2748949-1-lokeshgidra@google.com/

Lokesh Gidra (3):
  userfaultfd: move userfaultfd_ctx struct to header file
  userfaultfd: protect mmap_changing with rw_sem in userfaulfd_ctx
  userfaultfd: use per-vma locks in userfaultfd operations

 fs/userfaultfd.c              |  86 ++++---------
 include/linux/userfaultfd_k.h |  75 ++++++++---
 mm/userfaultfd.c              | 229 ++++++++++++++++++++++------------
 3 files changed, 229 insertions(+), 161 deletions(-)

-- 
2.43.0.429.g432eaa2c6b-goog


^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH v2 1/3] userfaultfd: move userfaultfd_ctx struct to header file
  2024-01-29 19:35 [PATCH v2 0/3] per-vma locks in userfaultfd Lokesh Gidra
@ 2024-01-29 19:35 ` Lokesh Gidra
  2024-01-30  7:12   ` Mike Rapoport
  2024-01-29 19:35 ` [PATCH v2 2/3] userfaultfd: protect mmap_changing with rw_sem in userfaulfd_ctx Lokesh Gidra
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 35+ messages in thread
From: Lokesh Gidra @ 2024-01-29 19:35 UTC (permalink / raw)
  To: akpm
  Cc: lokeshgidra, linux-fsdevel, linux-mm, linux-kernel, selinux,
	surenb, kernel-team, aarcange, peterx, david, axelrasmussen,
	bgeffon, willy, jannh, kaleshsingh, ngeoffray, timmurray, rppt

Moving the struct to userfaultfd_k.h to be accessible from
mm/userfaultfd.c. There are no other changes in the struct.

This is required to prepare for using per-vma locks in userfaultfd
operations.

Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
---
 fs/userfaultfd.c              | 39 -----------------------------------
 include/linux/userfaultfd_k.h | 39 +++++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 39 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 05c8e8a05427..58331b83d648 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -50,45 +50,6 @@ static struct ctl_table vm_userfaultfd_table[] = {
 
 static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init;
 
-/*
- * Start with fault_pending_wqh and fault_wqh so they're more likely
- * to be in the same cacheline.
- *
- * Locking order:
- *	fd_wqh.lock
- *		fault_pending_wqh.lock
- *			fault_wqh.lock
- *		event_wqh.lock
- *
- * To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
- * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
- * also taken in IRQ context.
- */
-struct userfaultfd_ctx {
-	/* waitqueue head for the pending (i.e. not read) userfaults */
-	wait_queue_head_t fault_pending_wqh;
-	/* waitqueue head for the userfaults */
-	wait_queue_head_t fault_wqh;
-	/* waitqueue head for the pseudo fd to wakeup poll/read */
-	wait_queue_head_t fd_wqh;
-	/* waitqueue head for events */
-	wait_queue_head_t event_wqh;
-	/* a refile sequence protected by fault_pending_wqh lock */
-	seqcount_spinlock_t refile_seq;
-	/* pseudo fd refcounting */
-	refcount_t refcount;
-	/* userfaultfd syscall flags */
-	unsigned int flags;
-	/* features requested from the userspace */
-	unsigned int features;
-	/* released */
-	bool released;
-	/* memory mappings are changing because of non-cooperative event */
-	atomic_t mmap_changing;
-	/* mm with one ore more vmas attached to this userfaultfd_ctx */
-	struct mm_struct *mm;
-};
-
 struct userfaultfd_fork_ctx {
 	struct userfaultfd_ctx *orig;
 	struct userfaultfd_ctx *new;
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index e4056547fbe6..691d928ee864 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -36,6 +36,45 @@
 #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
 #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS)
 
+/*
+ * Start with fault_pending_wqh and fault_wqh so they're more likely
+ * to be in the same cacheline.
+ *
+ * Locking order:
+ *	fd_wqh.lock
+ *		fault_pending_wqh.lock
+ *			fault_wqh.lock
+ *		event_wqh.lock
+ *
+ * To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
+ * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
+ * also taken in IRQ context.
+ */
+struct userfaultfd_ctx {
+	/* waitqueue head for the pending (i.e. not read) userfaults */
+	wait_queue_head_t fault_pending_wqh;
+	/* waitqueue head for the userfaults */
+	wait_queue_head_t fault_wqh;
+	/* waitqueue head for the pseudo fd to wakeup poll/read */
+	wait_queue_head_t fd_wqh;
+	/* waitqueue head for events */
+	wait_queue_head_t event_wqh;
+	/* a refile sequence protected by fault_pending_wqh lock */
+	seqcount_spinlock_t refile_seq;
+	/* pseudo fd refcounting */
+	refcount_t refcount;
+	/* userfaultfd syscall flags */
+	unsigned int flags;
+	/* features requested from the userspace */
+	unsigned int features;
+	/* released */
+	bool released;
+	/* memory mappings are changing because of non-cooperative event */
+	atomic_t mmap_changing;
+	/* mm with one ore more vmas attached to this userfaultfd_ctx */
+	struct mm_struct *mm;
+};
+
 extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
 
 /* A combined operation mode + behavior flags. */
-- 
2.43.0.429.g432eaa2c6b-goog


^ permalink raw reply related	[flat|nested] 35+ messages in thread

* [PATCH v2 2/3] userfaultfd: protect mmap_changing with rw_sem in userfaulfd_ctx
  2024-01-29 19:35 [PATCH v2 0/3] per-vma locks in userfaultfd Lokesh Gidra
  2024-01-29 19:35 ` [PATCH v2 1/3] userfaultfd: move userfaultfd_ctx struct to header file Lokesh Gidra
@ 2024-01-29 19:35 ` Lokesh Gidra
  2024-01-29 21:00   ` Liam R. Howlett
  2024-01-30  7:21   ` Mike Rapoport
  2024-01-29 19:35 ` [PATCH v2 3/3] userfaultfd: use per-vma locks in userfaultfd operations Lokesh Gidra
  2024-01-29 20:39 ` [PATCH v2 0/3] per-vma locks in userfaultfd Liam R. Howlett
  3 siblings, 2 replies; 35+ messages in thread
From: Lokesh Gidra @ 2024-01-29 19:35 UTC (permalink / raw)
  To: akpm
  Cc: lokeshgidra, linux-fsdevel, linux-mm, linux-kernel, selinux,
	surenb, kernel-team, aarcange, peterx, david, axelrasmussen,
	bgeffon, willy, jannh, kaleshsingh, ngeoffray, timmurray, rppt

Increments and loads to mmap_changing are always in mmap_lock
critical section. This ensures that if userspace requests event
notification for non-cooperative operations (e.g. mremap), userfaultfd
operations don't occur concurrently.

This can be achieved by using a separate read-write semaphore in
userfaultfd_ctx such that increments are done in write-mode and loads
in read-mode, thereby eliminating the dependency on mmap_lock for this
purpose.

This is a preparatory step before we replace mmap_lock usage with
per-vma locks in fill/move ioctls.

Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
---
 fs/userfaultfd.c              | 40 ++++++++++++----------
 include/linux/userfaultfd_k.h | 31 ++++++++++--------
 mm/userfaultfd.c              | 62 ++++++++++++++++++++---------------
 3 files changed, 75 insertions(+), 58 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 58331b83d648..c00a021bcce4 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -685,12 +685,15 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
 		ctx->flags = octx->flags;
 		ctx->features = octx->features;
 		ctx->released = false;
+		init_rwsem(&ctx->map_changing_lock);
 		atomic_set(&ctx->mmap_changing, 0);
 		ctx->mm = vma->vm_mm;
 		mmgrab(ctx->mm);
 
 		userfaultfd_ctx_get(octx);
+		down_write(&octx->map_changing_lock);
 		atomic_inc(&octx->mmap_changing);
+		up_write(&octx->map_changing_lock);
 		fctx->orig = octx;
 		fctx->new = ctx;
 		list_add_tail(&fctx->list, fcs);
@@ -737,7 +740,9 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
 	if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
 		vm_ctx->ctx = ctx;
 		userfaultfd_ctx_get(ctx);
+		down_write(&ctx->map_changing_lock);
 		atomic_inc(&ctx->mmap_changing);
+		up_write(&ctx->map_changing_lock);
 	} else {
 		/* Drop uffd context if remap feature not enabled */
 		vma_start_write(vma);
@@ -783,7 +788,9 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
 		return true;
 
 	userfaultfd_ctx_get(ctx);
+	down_write(&ctx->map_changing_lock);
 	atomic_inc(&ctx->mmap_changing);
+	up_write(&ctx->map_changing_lock);
 	mmap_read_unlock(mm);
 
 	msg_init(&ewq.msg);
@@ -825,7 +832,9 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start,
 		return -ENOMEM;
 
 	userfaultfd_ctx_get(ctx);
+	down_write(&ctx->map_changing_lock);
 	atomic_inc(&ctx->mmap_changing);
+	up_write(&ctx->map_changing_lock);
 	unmap_ctx->ctx = ctx;
 	unmap_ctx->start = start;
 	unmap_ctx->end = end;
@@ -1709,9 +1718,8 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
 	if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
 		flags |= MFILL_ATOMIC_WP;
 	if (mmget_not_zero(ctx->mm)) {
-		ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
-					uffdio_copy.len, &ctx->mmap_changing,
-					flags);
+		ret = mfill_atomic_copy(ctx, uffdio_copy.dst, uffdio_copy.src,
+					uffdio_copy.len, flags);
 		mmput(ctx->mm);
 	} else {
 		return -ESRCH;
@@ -1761,9 +1769,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
 		goto out;
 
 	if (mmget_not_zero(ctx->mm)) {
-		ret = mfill_atomic_zeropage(ctx->mm, uffdio_zeropage.range.start,
-					   uffdio_zeropage.range.len,
-					   &ctx->mmap_changing);
+		ret = mfill_atomic_zeropage(ctx, uffdio_zeropage.range.start,
+					   uffdio_zeropage.range.len);
 		mmput(ctx->mm);
 	} else {
 		return -ESRCH;
@@ -1818,9 +1825,8 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
 		return -EINVAL;
 
 	if (mmget_not_zero(ctx->mm)) {
-		ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
-					  uffdio_wp.range.len, mode_wp,
-					  &ctx->mmap_changing);
+		ret = mwriteprotect_range(ctx, uffdio_wp.range.start,
+					  uffdio_wp.range.len, mode_wp);
 		mmput(ctx->mm);
 	} else {
 		return -ESRCH;
@@ -1870,9 +1876,8 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
 		flags |= MFILL_ATOMIC_WP;
 
 	if (mmget_not_zero(ctx->mm)) {
-		ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start,
-					    uffdio_continue.range.len,
-					    &ctx->mmap_changing, flags);
+		ret = mfill_atomic_continue(ctx, uffdio_continue.range.start,
+					    uffdio_continue.range.len, flags);
 		mmput(ctx->mm);
 	} else {
 		return -ESRCH;
@@ -1925,9 +1930,8 @@ static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long
 		goto out;
 
 	if (mmget_not_zero(ctx->mm)) {
-		ret = mfill_atomic_poison(ctx->mm, uffdio_poison.range.start,
-					  uffdio_poison.range.len,
-					  &ctx->mmap_changing, 0);
+		ret = mfill_atomic_poison(ctx, uffdio_poison.range.start,
+					  uffdio_poison.range.len, 0);
 		mmput(ctx->mm);
 	} else {
 		return -ESRCH;
@@ -2003,13 +2007,14 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx,
 	if (mmget_not_zero(mm)) {
 		mmap_read_lock(mm);
 
-		/* Re-check after taking mmap_lock */
+		/* Re-check after taking map_changing_lock */
+		down_read(&ctx->map_changing_lock);
 		if (likely(!atomic_read(&ctx->mmap_changing)))
 			ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
 					 uffdio_move.len, uffdio_move.mode);
 		else
 			ret = -EAGAIN;
-
+		up_read(&ctx->map_changing_lock);
 		mmap_read_unlock(mm);
 		mmput(mm);
 	} else {
@@ -2216,6 +2221,7 @@ static int new_userfaultfd(int flags)
 	ctx->flags = flags;
 	ctx->features = 0;
 	ctx->released = false;
+	init_rwsem(&ctx->map_changing_lock);
 	atomic_set(&ctx->mmap_changing, 0);
 	ctx->mm = current->mm;
 	/* prevent the mm struct to be freed */
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 691d928ee864..3210c3552976 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -69,6 +69,13 @@ struct userfaultfd_ctx {
 	unsigned int features;
 	/* released */
 	bool released;
+	/*
+	 * Prevents userfaultfd operations (fill/move/wp) from happening while
+	 * some non-cooperative event(s) is taking place. Increments are done
+	 * in write-mode. Whereas, userfaultfd operations, which includes
+	 * reading mmap_changing, is done under read-mode.
+	 */
+	struct rw_semaphore map_changing_lock;
 	/* memory mappings are changing because of non-cooperative event */
 	atomic_t mmap_changing;
 	/* mm with one ore more vmas attached to this userfaultfd_ctx */
@@ -113,22 +120,18 @@ extern int mfill_atomic_install_pte(pmd_t *dst_pmd,
 				    unsigned long dst_addr, struct page *page,
 				    bool newly_allocated, uffd_flags_t flags);
 
-extern ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
+extern ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
 				 unsigned long src_start, unsigned long len,
-				 atomic_t *mmap_changing, uffd_flags_t flags);
-extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm,
+				 uffd_flags_t flags);
+extern ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
 				     unsigned long dst_start,
-				     unsigned long len,
-				     atomic_t *mmap_changing);
-extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start,
-				     unsigned long len, atomic_t *mmap_changing,
-				     uffd_flags_t flags);
-extern ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
-				   unsigned long len, atomic_t *mmap_changing,
-				   uffd_flags_t flags);
-extern int mwriteprotect_range(struct mm_struct *dst_mm,
-			       unsigned long start, unsigned long len,
-			       bool enable_wp, atomic_t *mmap_changing);
+				     unsigned long len);
+extern ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long dst_start,
+				     unsigned long len, uffd_flags_t flags);
+extern ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
+				   unsigned long len, uffd_flags_t flags);
+extern int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
+			       unsigned long len, bool enable_wp);
 extern long uffd_wp_range(struct vm_area_struct *vma,
 			  unsigned long start, unsigned long len, bool enable_wp);
 
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index e3a91871462a..6e2ca04ab04d 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -353,11 +353,11 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
  * called with mmap_lock held, it will release mmap_lock before returning.
  */
 static __always_inline ssize_t mfill_atomic_hugetlb(
+					      struct userfaultfd_ctx *ctx,
 					      struct vm_area_struct *dst_vma,
 					      unsigned long dst_start,
 					      unsigned long src_start,
 					      unsigned long len,
-					      atomic_t *mmap_changing,
 					      uffd_flags_t flags)
 {
 	struct mm_struct *dst_mm = dst_vma->vm_mm;
@@ -379,6 +379,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
 	 * feature is not supported.
 	 */
 	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
+		up_read(&ctx->map_changing_lock);
 		mmap_read_unlock(dst_mm);
 		return -EINVAL;
 	}
@@ -463,6 +464,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
 		cond_resched();
 
 		if (unlikely(err == -ENOENT)) {
+			up_read(&ctx->map_changing_lock);
 			mmap_read_unlock(dst_mm);
 			BUG_ON(!folio);
 
@@ -473,12 +475,13 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
 				goto out;
 			}
 			mmap_read_lock(dst_mm);
+			down_read(&ctx->map_changing_lock);
 			/*
 			 * If memory mappings are changing because of non-cooperative
 			 * operation (e.g. mremap) running in parallel, bail out and
 			 * request the user to retry later
 			 */
-			if (mmap_changing && atomic_read(mmap_changing)) {
+			if (atomic_read(ctx->mmap_changing)) {
 				err = -EAGAIN;
 				break;
 			}
@@ -501,6 +504,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
 	}
 
 out_unlock:
+	up_read(&ctx->map_changing_lock);
 	mmap_read_unlock(dst_mm);
 out:
 	if (folio)
@@ -512,11 +516,11 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
 }
 #else /* !CONFIG_HUGETLB_PAGE */
 /* fail at build time if gcc attempts to use this */
-extern ssize_t mfill_atomic_hugetlb(struct vm_area_struct *dst_vma,
+extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
+				    struct vm_area_struct *dst_vma,
 				    unsigned long dst_start,
 				    unsigned long src_start,
 				    unsigned long len,
-				    atomic_t *mmap_changing,
 				    uffd_flags_t flags);
 #endif /* CONFIG_HUGETLB_PAGE */
 
@@ -564,13 +568,13 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
 	return err;
 }
 
-static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
+static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 					    unsigned long dst_start,
 					    unsigned long src_start,
 					    unsigned long len,
-					    atomic_t *mmap_changing,
 					    uffd_flags_t flags)
 {
+	struct mm_struct *dst_mm = ctx->mm;
 	struct vm_area_struct *dst_vma;
 	ssize_t err;
 	pmd_t *dst_pmd;
@@ -600,8 +604,9 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
 	 * operation (e.g. mremap) running in parallel, bail out and
 	 * request the user to retry later
 	 */
+	down_read(&ctx->map_changing_lock);
 	err = -EAGAIN;
-	if (mmap_changing && atomic_read(mmap_changing))
+	if (atomic_read(&ctx->mmap_changing))
 		goto out_unlock;
 
 	/*
@@ -633,8 +638,8 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
 	 * If this is a HUGETLB vma, pass off to appropriate routine
 	 */
 	if (is_vm_hugetlb_page(dst_vma))
-		return  mfill_atomic_hugetlb(dst_vma, dst_start, src_start,
-					     len, mmap_changing, flags);
+		return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
+					     src_start, len, flags);
 
 	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
 		goto out_unlock;
@@ -693,6 +698,7 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
 		if (unlikely(err == -ENOENT)) {
 			void *kaddr;
 
+			up_read(&ctx->map_changing_lock);
 			mmap_read_unlock(dst_mm);
 			BUG_ON(!folio);
 
@@ -723,6 +729,7 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
 	}
 
 out_unlock:
+	up_read(&ctx->map_changing_lock);
 	mmap_read_unlock(dst_mm);
 out:
 	if (folio)
@@ -733,34 +740,33 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
 	return copied ? copied : err;
 }
 
-ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
+ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
 			  unsigned long src_start, unsigned long len,
-			  atomic_t *mmap_changing, uffd_flags_t flags)
+			  uffd_flags_t flags)
 {
-	return mfill_atomic(dst_mm, dst_start, src_start, len, mmap_changing,
+	return mfill_atomic(ctx, dst_start, src_start, len,
 			    uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY));
 }
 
-ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long start,
-			      unsigned long len, atomic_t *mmap_changing)
+ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
+			      unsigned long start,
+			      unsigned long len)
 {
-	return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
+	return mfill_atomic(ctx, start, 0, len,
 			    uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE));
 }
 
-ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start,
-			      unsigned long len, atomic_t *mmap_changing,
-			      uffd_flags_t flags)
+ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start,
+			      unsigned long len, uffd_flags_t flags)
 {
-	return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
+	return mfill_atomic(ctx, start, 0, len,
 			    uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
 }
 
-ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
-			    unsigned long len, atomic_t *mmap_changing,
-			    uffd_flags_t flags)
+ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
+			    unsigned long len, uffd_flags_t flags)
 {
-	return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
+	return mfill_atomic(ctx, start, 0, len,
 			    uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON));
 }
 
@@ -793,10 +799,10 @@ long uffd_wp_range(struct vm_area_struct *dst_vma,
 	return ret;
 }
 
-int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
-			unsigned long len, bool enable_wp,
-			atomic_t *mmap_changing)
+int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
+			unsigned long len, bool enable_wp)
 {
+	struct mm_struct *dst_mm = ctx->mm;
 	unsigned long end = start + len;
 	unsigned long _start, _end;
 	struct vm_area_struct *dst_vma;
@@ -820,8 +826,9 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
 	 * operation (e.g. mremap) running in parallel, bail out and
 	 * request the user to retry later
 	 */
+	down_read(&ctx->map_changing_lock);
 	err = -EAGAIN;
-	if (mmap_changing && atomic_read(mmap_changing))
+	if (atomic_read(&ctx->mmap_changing))
 		goto out_unlock;
 
 	err = -ENOENT;
@@ -850,6 +857,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
 		err = 0;
 	}
 out_unlock:
+	up_read(&ctx->map_changing_lock);
 	mmap_read_unlock(dst_mm);
 	return err;
 }
-- 
2.43.0.429.g432eaa2c6b-goog


^ permalink raw reply related	[flat|nested] 35+ messages in thread

* [PATCH v2 3/3] userfaultfd: use per-vma locks in userfaultfd operations
  2024-01-29 19:35 [PATCH v2 0/3] per-vma locks in userfaultfd Lokesh Gidra
  2024-01-29 19:35 ` [PATCH v2 1/3] userfaultfd: move userfaultfd_ctx struct to header file Lokesh Gidra
  2024-01-29 19:35 ` [PATCH v2 2/3] userfaultfd: protect mmap_changing with rw_sem in userfaulfd_ctx Lokesh Gidra
@ 2024-01-29 19:35 ` Lokesh Gidra
  2024-01-29 20:36   ` Liam R. Howlett
  2024-01-29 20:39 ` [PATCH v2 0/3] per-vma locks in userfaultfd Liam R. Howlett
  3 siblings, 1 reply; 35+ messages in thread
From: Lokesh Gidra @ 2024-01-29 19:35 UTC (permalink / raw)
  To: akpm
  Cc: lokeshgidra, linux-fsdevel, linux-mm, linux-kernel, selinux,
	surenb, kernel-team, aarcange, peterx, david, axelrasmussen,
	bgeffon, willy, jannh, kaleshsingh, ngeoffray, timmurray, rppt

All userfaultfd operations, except write-protect, opportunistically use
per-vma locks to lock vmas. If we fail then fall back to locking
mmap-lock in read-mode.

Write-protect operation requires mmap_lock as it iterates over multiple vmas.

Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
---
 fs/userfaultfd.c              |  13 +--
 include/linux/userfaultfd_k.h |   5 +-
 mm/userfaultfd.c              | 175 +++++++++++++++++++++++-----------
 3 files changed, 122 insertions(+), 71 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index c00a021bcce4..60dcfafdc11a 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -2005,17 +2005,8 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx,
 		return -EINVAL;
 
 	if (mmget_not_zero(mm)) {
-		mmap_read_lock(mm);
-
-		/* Re-check after taking map_changing_lock */
-		down_read(&ctx->map_changing_lock);
-		if (likely(!atomic_read(&ctx->mmap_changing)))
-			ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
-					 uffdio_move.len, uffdio_move.mode);
-		else
-			ret = -EAGAIN;
-		up_read(&ctx->map_changing_lock);
-		mmap_read_unlock(mm);
+		ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src,
+				 uffdio_move.len, uffdio_move.mode);
 		mmput(mm);
 	} else {
 		return -ESRCH;
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 3210c3552976..05d59f74fc88 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -138,9 +138,8 @@ extern long uffd_wp_range(struct vm_area_struct *vma,
 /* move_pages */
 void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2);
 void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2);
-ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
-		   unsigned long dst_start, unsigned long src_start,
-		   unsigned long len, __u64 flags);
+ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
+		   unsigned long src_start, unsigned long len, __u64 flags);
 int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
 			struct vm_area_struct *dst_vma,
 			struct vm_area_struct *src_vma,
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 6e2ca04ab04d..d55bf18b80db 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -19,20 +19,39 @@
 #include <asm/tlb.h>
 #include "internal.h"
 
-static __always_inline
-struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
-				    unsigned long dst_start,
-				    unsigned long len)
+void unpin_vma(struct mm_struct *mm, struct vm_area_struct *vma, bool *mmap_locked)
+{
+	BUG_ON(!vma && !*mmap_locked);
+
+	if (*mmap_locked) {
+		mmap_read_unlock(mm);
+		*mmap_locked = false;
+	} else
+		vma_end_read(vma);
+}
+
+/*
+ * Search for VMA and make sure it is stable either by locking it or taking
+ * mmap_lock.
+ */
+struct vm_area_struct *find_and_pin_dst_vma(struct mm_struct *dst_mm,
+					    unsigned long dst_start,
+					    unsigned long len,
+					    bool *mmap_locked)
 {
+	struct vm_area_struct *dst_vma = lock_vma_under_rcu(dst_mm, dst_start);
+	if (!dst_vma) {
+		mmap_read_lock(dst_mm);
+		*mmap_locked = true;
+		dst_vma = find_vma(dst_mm, dst_start);
+	}
+
 	/*
 	 * Make sure that the dst range is both valid and fully within a
 	 * single existing vma.
 	 */
-	struct vm_area_struct *dst_vma;
-
-	dst_vma = find_vma(dst_mm, dst_start);
 	if (!range_in_vma(dst_vma, dst_start, dst_start + len))
-		return NULL;
+		goto unpin;
 
 	/*
 	 * Check the vma is registered in uffd, this is required to
@@ -40,9 +59,13 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
 	 * time.
 	 */
 	if (!dst_vma->vm_userfaultfd_ctx.ctx)
-		return NULL;
+		goto unpin;
 
 	return dst_vma;
+
+unpin:
+	unpin_vma(dst_mm, dst_vma, mmap_locked);
+	return NULL;
 }
 
 /* Check if dst_addr is outside of file's size. Must be called with ptl held. */
@@ -350,7 +373,8 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
 #ifdef CONFIG_HUGETLB_PAGE
 /*
  * mfill_atomic processing for HUGETLB vmas.  Note that this routine is
- * called with mmap_lock held, it will release mmap_lock before returning.
+ * called with either vma-lock or mmap_lock held, it will release the lock
+ * before returning.
  */
 static __always_inline ssize_t mfill_atomic_hugetlb(
 					      struct userfaultfd_ctx *ctx,
@@ -358,7 +382,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
 					      unsigned long dst_start,
 					      unsigned long src_start,
 					      unsigned long len,
-					      uffd_flags_t flags)
+					      uffd_flags_t flags,
+					      bool *mmap_locked)
 {
 	struct mm_struct *dst_mm = dst_vma->vm_mm;
 	int vm_shared = dst_vma->vm_flags & VM_SHARED;
@@ -380,7 +405,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
 	 */
 	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
 		up_read(&ctx->map_changing_lock);
-		mmap_read_unlock(dst_mm);
+		unpin_vma(dst_mm, dst_vma, mmap_locked);
 		return -EINVAL;
 	}
 
@@ -404,12 +429,25 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
 	 */
 	if (!dst_vma) {
 		err = -ENOENT;
-		dst_vma = find_dst_vma(dst_mm, dst_start, len);
-		if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
-			goto out_unlock;
+		dst_vma = find_and_pin_dst_vma(dst_mm, dst_start,
+					       len, mmap_locked);
+		if (!dst_vma)
+			goto out;
+		if (!is_vm_hugetlb_page(dst_vma))
+			goto out_unlock_vma;
 
 		err = -EINVAL;
 		if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
+			goto out_unlock_vma;
+
+		/*
+		 * If memory mappings are changing because of non-cooperative
+		 * operation (e.g. mremap) running in parallel, bail out and
+		 * request the user to retry later
+		 */
+		down_read(&ctx->map_changing_lock);
+		err = -EAGAIN;
+		if (atomic_read(&ctx->mmap_changing))
 			goto out_unlock;
 
 		vm_shared = dst_vma->vm_flags & VM_SHARED;
@@ -465,7 +503,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
 
 		if (unlikely(err == -ENOENT)) {
 			up_read(&ctx->map_changing_lock);
-			mmap_read_unlock(dst_mm);
+			unpin_vma(dst_mm, dst_vma, mmap_locked);
 			BUG_ON(!folio);
 
 			err = copy_folio_from_user(folio,
@@ -474,17 +512,6 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
 				err = -EFAULT;
 				goto out;
 			}
-			mmap_read_lock(dst_mm);
-			down_read(&ctx->map_changing_lock);
-			/*
-			 * If memory mappings are changing because of non-cooperative
-			 * operation (e.g. mremap) running in parallel, bail out and
-			 * request the user to retry later
-			 */
-			if (atomic_read(ctx->mmap_changing)) {
-				err = -EAGAIN;
-				break;
-			}
 
 			dst_vma = NULL;
 			goto retry;
@@ -505,7 +532,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
 
 out_unlock:
 	up_read(&ctx->map_changing_lock);
-	mmap_read_unlock(dst_mm);
+out_unlock_vma:
+	unpin_vma(dst_mm, dst_vma, mmap_locked);
 out:
 	if (folio)
 		folio_put(folio);
@@ -521,7 +549,8 @@ extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
 				    unsigned long dst_start,
 				    unsigned long src_start,
 				    unsigned long len,
-				    uffd_flags_t flags);
+				    uffd_flags_t flags,
+				    bool *mmap_locked);
 #endif /* CONFIG_HUGETLB_PAGE */
 
 static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
@@ -581,6 +610,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 	unsigned long src_addr, dst_addr;
 	long copied;
 	struct folio *folio;
+	bool mmap_locked = false;
 
 	/*
 	 * Sanitize the command parameters:
@@ -597,7 +627,14 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 	copied = 0;
 	folio = NULL;
 retry:
-	mmap_read_lock(dst_mm);
+	/*
+	 * Make sure the vma is not shared, that the dst range is
+	 * both valid and fully within a single existing vma.
+	 */
+	err = -ENOENT;
+	dst_vma = find_and_pin_dst_vma(dst_mm, dst_start, len, &mmap_locked);
+	if (!dst_vma)
+		goto out;
 
 	/*
 	 * If memory mappings are changing because of non-cooperative
@@ -609,15 +646,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 	if (atomic_read(&ctx->mmap_changing))
 		goto out_unlock;
 
-	/*
-	 * Make sure the vma is not shared, that the dst range is
-	 * both valid and fully within a single existing vma.
-	 */
-	err = -ENOENT;
-	dst_vma = find_dst_vma(dst_mm, dst_start, len);
-	if (!dst_vma)
-		goto out_unlock;
-
 	err = -EINVAL;
 	/*
 	 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
@@ -638,8 +666,8 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 	 * If this is a HUGETLB vma, pass off to appropriate routine
 	 */
 	if (is_vm_hugetlb_page(dst_vma))
-		return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
-					     src_start, len, flags);
+		return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start, src_start
+					     len, flags, &mmap_locked);
 
 	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
 		goto out_unlock;
@@ -699,7 +727,8 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 			void *kaddr;
 
 			up_read(&ctx->map_changing_lock);
-			mmap_read_unlock(dst_mm);
+			unpin_vma(dst_mm, dst_vma, &mmap_locked);
+
 			BUG_ON(!folio);
 
 			kaddr = kmap_local_folio(folio, 0);
@@ -730,7 +759,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 
 out_unlock:
 	up_read(&ctx->map_changing_lock);
-	mmap_read_unlock(dst_mm);
+	unpin_vma(dst_mm, dst_vma, &mmap_locked);
 out:
 	if (folio)
 		folio_put(folio);
@@ -1285,8 +1314,6 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
  * @len: length of the virtual memory range
  * @mode: flags from uffdio_move.mode
  *
- * Must be called with mmap_lock held for read.
- *
  * move_pages() remaps arbitrary anonymous pages atomically in zero
  * copy. It only works on non shared anonymous pages because those can
  * be relocated without generating non linear anon_vmas in the rmap
@@ -1353,15 +1380,16 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
  * could be obtained. This is the only additional complexity added to
  * the rmap code to provide this anonymous page remapping functionality.
  */
-ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
-		   unsigned long dst_start, unsigned long src_start,
-		   unsigned long len, __u64 mode)
+ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
+		   unsigned long src_start, unsigned long len, __u64 mode)
 {
+	struct mm_struct *mm = ctx->mm;
 	struct vm_area_struct *src_vma, *dst_vma;
 	unsigned long src_addr, dst_addr;
 	pmd_t *src_pmd, *dst_pmd;
 	long err = -EINVAL;
 	ssize_t moved = 0;
+	bool mmap_locked = false;
 
 	/* Sanitize the command parameters. */
 	if (WARN_ON_ONCE(src_start & ~PAGE_MASK) ||
@@ -1374,28 +1402,52 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
 	    WARN_ON_ONCE(dst_start + len <= dst_start))
 		goto out;
 
+	dst_vma = NULL;
+	src_vma = lock_vma_under_rcu(mm, src_start);
+	if (src_vma) {
+		dst_vma = lock_vma_under_rcu(mm, dst_start);
+		if (!dst_vma)
+			vma_end_read(src_vma);
+	}
+
+	/* If we failed to lock both VMAs, fall back to mmap_lock */
+	if (!dst_vma) {
+		mmap_read_lock(mm);
+		mmap_locked = true;
+		src_vma = find_vma(mm, src_start);
+		if (!src_vma)
+			goto out_unlock_mmap;
+		dst_vma = find_vma(mm, dst_start);
+		if (!dst_vma)
+			goto out_unlock_mmap;
+	}
+
+	/* Re-check after taking map_changing_lock */
+	down_read(&ctx->map_changing_lock);
+	if (likely(atomic_read(&ctx->mmap_changing))) {
+		err = -EAGAIN;
+		goto out_unlock;
+	}
 	/*
 	 * Make sure the vma is not shared, that the src and dst remap
 	 * ranges are both valid and fully within a single existing
 	 * vma.
 	 */
-	src_vma = find_vma(mm, src_start);
-	if (!src_vma || (src_vma->vm_flags & VM_SHARED))
-		goto out;
+	if (src_vma->vm_flags & VM_SHARED)
+		goto out_unlock;
 	if (src_start < src_vma->vm_start ||
 	    src_start + len > src_vma->vm_end)
-		goto out;
+		goto out_unlock;
 
-	dst_vma = find_vma(mm, dst_start);
-	if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
-		goto out;
+	if (dst_vma->vm_flags & VM_SHARED)
+		goto out_unlock;
 	if (dst_start < dst_vma->vm_start ||
 	    dst_start + len > dst_vma->vm_end)
-		goto out;
+		goto out_unlock;
 
 	err = validate_move_areas(ctx, src_vma, dst_vma);
 	if (err)
-		goto out;
+		goto out_unlock;
 
 	for (src_addr = src_start, dst_addr = dst_start;
 	     src_addr < src_start + len;) {
@@ -1512,6 +1564,15 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
 		moved += step_size;
 	}
 
+out_unlock:
+	up_read(&ctx->map_changing_lock);
+out_unlock_mmap:
+	if (mmap_locked)
+		mmap_read_unlock(mm);
+	else {
+		vma_end_read(dst_vma);
+		vma_end_read(src_vma);
+	}
 out:
 	VM_WARN_ON(moved < 0);
 	VM_WARN_ON(err > 0);
-- 
2.43.0.429.g432eaa2c6b-goog


^ permalink raw reply related	[flat|nested] 35+ messages in thread

* Re: [PATCH v2 3/3] userfaultfd: use per-vma locks in userfaultfd operations
  2024-01-29 19:35 ` [PATCH v2 3/3] userfaultfd: use per-vma locks in userfaultfd operations Lokesh Gidra
@ 2024-01-29 20:36   ` Liam R. Howlett
  2024-01-29 20:52     ` Suren Baghdasaryan
  0 siblings, 1 reply; 35+ messages in thread
From: Liam R. Howlett @ 2024-01-29 20:36 UTC (permalink / raw)
  To: Lokesh Gidra
  Cc: akpm, linux-fsdevel, linux-mm, linux-kernel, selinux, surenb,
	kernel-team, aarcange, peterx, david, axelrasmussen, bgeffon,
	willy, jannh, kaleshsingh, ngeoffray, timmurray, rppt

* Lokesh Gidra <lokeshgidra@google.com> [240129 14:35]:
> All userfaultfd operations, except write-protect, opportunistically use
> per-vma locks to lock vmas. If we fail then fall back to locking
> mmap-lock in read-mode.
> 
> Write-protect operation requires mmap_lock as it iterates over multiple vmas.
> 
> Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
> ---
>  fs/userfaultfd.c              |  13 +--
>  include/linux/userfaultfd_k.h |   5 +-
>  mm/userfaultfd.c              | 175 +++++++++++++++++++++++-----------
>  3 files changed, 122 insertions(+), 71 deletions(-)
> 
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index c00a021bcce4..60dcfafdc11a 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -2005,17 +2005,8 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx,
>  		return -EINVAL;
>  
>  	if (mmget_not_zero(mm)) {
> -		mmap_read_lock(mm);
> -
> -		/* Re-check after taking map_changing_lock */
> -		down_read(&ctx->map_changing_lock);
> -		if (likely(!atomic_read(&ctx->mmap_changing)))
> -			ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
> -					 uffdio_move.len, uffdio_move.mode);
> -		else
> -			ret = -EAGAIN;
> -		up_read(&ctx->map_changing_lock);
> -		mmap_read_unlock(mm);
> +		ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src,
> +				 uffdio_move.len, uffdio_move.mode);
>  		mmput(mm);
>  	} else {
>  		return -ESRCH;
> diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> index 3210c3552976..05d59f74fc88 100644
> --- a/include/linux/userfaultfd_k.h
> +++ b/include/linux/userfaultfd_k.h
> @@ -138,9 +138,8 @@ extern long uffd_wp_range(struct vm_area_struct *vma,
>  /* move_pages */
>  void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2);
>  void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2);
> -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> -		   unsigned long dst_start, unsigned long src_start,
> -		   unsigned long len, __u64 flags);
> +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
> +		   unsigned long src_start, unsigned long len, __u64 flags);
>  int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
>  			struct vm_area_struct *dst_vma,
>  			struct vm_area_struct *src_vma,
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index 6e2ca04ab04d..d55bf18b80db 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -19,20 +19,39 @@
>  #include <asm/tlb.h>
>  #include "internal.h"
>  
> -static __always_inline
> -struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
> -				    unsigned long dst_start,
> -				    unsigned long len)
> +void unpin_vma(struct mm_struct *mm, struct vm_area_struct *vma, bool *mmap_locked)
> +{
> +	BUG_ON(!vma && !*mmap_locked);
> +
> +	if (*mmap_locked) {
> +		mmap_read_unlock(mm);
> +		*mmap_locked = false;
> +	} else
> +		vma_end_read(vma);

You are missing braces here.

This function is small so it could be inline, although I hope the
compiler would get that right for us.

I don't think this small helper is worth it, considering you are
altering a pointer in here, which makes things harder to follow (not to
mention the locking).  The only code that depends on this update is a
single place, which already assigns a custom variable after the function
return.

> +}
> +
> +/*
> + * Search for VMA and make sure it is stable either by locking it or taking
> + * mmap_lock.

This function returns something that isn't documented and also sets a
boolean which is passed in as a pointer which also is lacking from the
documentation.

> + */
> +struct vm_area_struct *find_and_pin_dst_vma(struct mm_struct *dst_mm,
> +					    unsigned long dst_start,
> +					    unsigned long len,
> +					    bool *mmap_locked)
>  {
> +	struct vm_area_struct *dst_vma = lock_vma_under_rcu(dst_mm, dst_start);

lock_vma_under_rcu() calls mas_walk(), which goes to dst_start for the
VMA.  It is not possible for dst_start to be outside the range.

> +	if (!dst_vma) {

BUG_ON(mmap_locked) ?

> +		mmap_read_lock(dst_mm);
> +		*mmap_locked = true;
> +		dst_vma = find_vma(dst_mm, dst_start);

find_vma() walks to dst_start and searches upwards from that address.
This is functionally different than what you have asked for above.  You
will not see an issue as you have coded it - but it may be suboptimal
since a start address lower than the VMA you are looking for can be
found... however, later you check the range falls between the dst_start
and dst_start + len.

If you expect the dst_start to always be within the VMA range and not
lower, then you should use vma_lookup().

If you want to search upwards from dst_start for a VMA then you should
move the range check below into this brace.

> +	}
> +
>  	/*
>  	 * Make sure that the dst range is both valid and fully within a
>  	 * single existing vma.
>  	 */
> -	struct vm_area_struct *dst_vma;
> -
> -	dst_vma = find_vma(dst_mm, dst_start);
>  	if (!range_in_vma(dst_vma, dst_start, dst_start + len))
> -		return NULL;
> +		goto unpin;
>  
>  	/*
>  	 * Check the vma is registered in uffd, this is required to
> @@ -40,9 +59,13 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
>  	 * time.
>  	 */
>  	if (!dst_vma->vm_userfaultfd_ctx.ctx)
> -		return NULL;
> +		goto unpin;
>  
>  	return dst_vma;
> +
> +unpin:
> +	unpin_vma(dst_mm, dst_vma, mmap_locked);
> +	return NULL;
>  }
>  
>  /* Check if dst_addr is outside of file's size. Must be called with ptl held. */
> @@ -350,7 +373,8 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
>  #ifdef CONFIG_HUGETLB_PAGE
>  /*
>   * mfill_atomic processing for HUGETLB vmas.  Note that this routine is
> - * called with mmap_lock held, it will release mmap_lock before returning.
> + * called with either vma-lock or mmap_lock held, it will release the lock
> + * before returning.
>   */
>  static __always_inline ssize_t mfill_atomic_hugetlb(
>  					      struct userfaultfd_ctx *ctx,
> @@ -358,7 +382,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
>  					      unsigned long dst_start,
>  					      unsigned long src_start,
>  					      unsigned long len,
> -					      uffd_flags_t flags)
> +					      uffd_flags_t flags,
> +					      bool *mmap_locked)
>  {
>  	struct mm_struct *dst_mm = dst_vma->vm_mm;
>  	int vm_shared = dst_vma->vm_flags & VM_SHARED;
> @@ -380,7 +405,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
>  	 */
>  	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
>  		up_read(&ctx->map_changing_lock);
> -		mmap_read_unlock(dst_mm);
> +		unpin_vma(dst_mm, dst_vma, mmap_locked);
>  		return -EINVAL;
>  	}
>  
> @@ -404,12 +429,25 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
>  	 */
>  	if (!dst_vma) {
>  		err = -ENOENT;
> -		dst_vma = find_dst_vma(dst_mm, dst_start, len);
> -		if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
> -			goto out_unlock;
> +		dst_vma = find_and_pin_dst_vma(dst_mm, dst_start,
> +					       len, mmap_locked);
> +		if (!dst_vma)
> +			goto out;
> +		if (!is_vm_hugetlb_page(dst_vma))
> +			goto out_unlock_vma;
>  
>  		err = -EINVAL;
>  		if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
> +			goto out_unlock_vma;
> +
> +		/*
> +		 * If memory mappings are changing because of non-cooperative
> +		 * operation (e.g. mremap) running in parallel, bail out and
> +		 * request the user to retry later
> +		 */
> +		down_read(&ctx->map_changing_lock);
> +		err = -EAGAIN;
> +		if (atomic_read(&ctx->mmap_changing))
>  			goto out_unlock;
>  
>  		vm_shared = dst_vma->vm_flags & VM_SHARED;
> @@ -465,7 +503,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
>  
>  		if (unlikely(err == -ENOENT)) {
>  			up_read(&ctx->map_changing_lock);
> -			mmap_read_unlock(dst_mm);
> +			unpin_vma(dst_mm, dst_vma, mmap_locked);
>  			BUG_ON(!folio);
>  
>  			err = copy_folio_from_user(folio,
> @@ -474,17 +512,6 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
>  				err = -EFAULT;
>  				goto out;
>  			}
> -			mmap_read_lock(dst_mm);
> -			down_read(&ctx->map_changing_lock);
> -			/*
> -			 * If memory mappings are changing because of non-cooperative
> -			 * operation (e.g. mremap) running in parallel, bail out and
> -			 * request the user to retry later
> -			 */
> -			if (atomic_read(ctx->mmap_changing)) {
> -				err = -EAGAIN;
> -				break;
> -			}

... Okay, this is where things get confusing.

How about this: Don't do this locking/boolean dance.

Instead, do something like this:
In mm/memory.c, below lock_vma_under_rcu(), but something like this

struct vm_area_struct *lock_vma(struct mm_struct *mm,
	unsigned long addr))	/* or some better name.. */
{
	struct vm_area_struct *vma;

	vma = lock_vma_under_rcu(mm, addr);

	if (vma)
		return vma;

	mmap_read_lock(mm);
	vma = lookup_vma(mm, addr);
	if (vma)
		vma_start_read(vma); /* Won't fail */

	mmap_read_unlock(mm);
	return vma;
}

Now, we know we have a vma that's vma locked if there is a vma.  The vma
won't go away - you have it locked.  The mmap lock is held for even
less time for your worse case, and the code gets easier to follow.

Once you are done with the vma do a vma_end_read(vma).  Don't forget to
do this!

Now the comment above such a function should state that the vma needs to
be vma_end_read(vma), or that could go undetected..  It might be worth
adding a unlock_vma() counterpart to vma_end_read(vma) even.


>  
>  			dst_vma = NULL;
>  			goto retry;
> @@ -505,7 +532,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
>  
>  out_unlock:
>  	up_read(&ctx->map_changing_lock);
> -	mmap_read_unlock(dst_mm);
> +out_unlock_vma:
> +	unpin_vma(dst_mm, dst_vma, mmap_locked);
>  out:
>  	if (folio)
>  		folio_put(folio);
> @@ -521,7 +549,8 @@ extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
>  				    unsigned long dst_start,
>  				    unsigned long src_start,
>  				    unsigned long len,
> -				    uffd_flags_t flags);
> +				    uffd_flags_t flags,
> +				    bool *mmap_locked);

Just a thought, tabbing in twice for each argument would make this more
compact.


>  #endif /* CONFIG_HUGETLB_PAGE */
>  
>  static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
> @@ -581,6 +610,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
>  	unsigned long src_addr, dst_addr;
>  	long copied;
>  	struct folio *folio;
> +	bool mmap_locked = false;
>  
>  	/*
>  	 * Sanitize the command parameters:
> @@ -597,7 +627,14 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
>  	copied = 0;
>  	folio = NULL;
>  retry:
> -	mmap_read_lock(dst_mm);
> +	/*
> +	 * Make sure the vma is not shared, that the dst range is
> +	 * both valid and fully within a single existing vma.
> +	 */
> +	err = -ENOENT;
> +	dst_vma = find_and_pin_dst_vma(dst_mm, dst_start, len, &mmap_locked);
> +	if (!dst_vma)
> +		goto out;
>  
>  	/*
>  	 * If memory mappings are changing because of non-cooperative
> @@ -609,15 +646,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
>  	if (atomic_read(&ctx->mmap_changing))
>  		goto out_unlock;
>  
> -	/*
> -	 * Make sure the vma is not shared, that the dst range is
> -	 * both valid and fully within a single existing vma.
> -	 */
> -	err = -ENOENT;
> -	dst_vma = find_dst_vma(dst_mm, dst_start, len);
> -	if (!dst_vma)
> -		goto out_unlock;
> -
>  	err = -EINVAL;
>  	/*
>  	 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
> @@ -638,8 +666,8 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
>  	 * If this is a HUGETLB vma, pass off to appropriate routine
>  	 */
>  	if (is_vm_hugetlb_page(dst_vma))
> -		return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
> -					     src_start, len, flags);
> +		return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start, src_start
> +					     len, flags, &mmap_locked);
>  
>  	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
>  		goto out_unlock;
> @@ -699,7 +727,8 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
>  			void *kaddr;
>  
>  			up_read(&ctx->map_changing_lock);
> -			mmap_read_unlock(dst_mm);
> +			unpin_vma(dst_mm, dst_vma, &mmap_locked);
> +
>  			BUG_ON(!folio);
>  
>  			kaddr = kmap_local_folio(folio, 0);
> @@ -730,7 +759,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
>  
>  out_unlock:
>  	up_read(&ctx->map_changing_lock);
> -	mmap_read_unlock(dst_mm);
> +	unpin_vma(dst_mm, dst_vma, &mmap_locked);
>  out:
>  	if (folio)
>  		folio_put(folio);
> @@ -1285,8 +1314,6 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
>   * @len: length of the virtual memory range
>   * @mode: flags from uffdio_move.mode
>   *
> - * Must be called with mmap_lock held for read.
> - *
>   * move_pages() remaps arbitrary anonymous pages atomically in zero
>   * copy. It only works on non shared anonymous pages because those can
>   * be relocated without generating non linear anon_vmas in the rmap
> @@ -1353,15 +1380,16 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
>   * could be obtained. This is the only additional complexity added to
>   * the rmap code to provide this anonymous page remapping functionality.
>   */
> -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> -		   unsigned long dst_start, unsigned long src_start,
> -		   unsigned long len, __u64 mode)
> +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
> +		   unsigned long src_start, unsigned long len, __u64 mode)
>  {
> +	struct mm_struct *mm = ctx->mm;
>  	struct vm_area_struct *src_vma, *dst_vma;
>  	unsigned long src_addr, dst_addr;
>  	pmd_t *src_pmd, *dst_pmd;
>  	long err = -EINVAL;
>  	ssize_t moved = 0;
> +	bool mmap_locked = false;
>  
>  	/* Sanitize the command parameters. */
>  	if (WARN_ON_ONCE(src_start & ~PAGE_MASK) ||
> @@ -1374,28 +1402,52 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
>  	    WARN_ON_ONCE(dst_start + len <= dst_start))
>  		goto out;

Ah, is this safe for rmap?  I think you need to leave this read lock.

>  
> +	dst_vma = NULL;
> +	src_vma = lock_vma_under_rcu(mm, src_start);
> +	if (src_vma) {
> +		dst_vma = lock_vma_under_rcu(mm, dst_start);
> +		if (!dst_vma)
> +			vma_end_read(src_vma);
> +	}
> +
> +	/* If we failed to lock both VMAs, fall back to mmap_lock */
> +	if (!dst_vma) {
> +		mmap_read_lock(mm);
> +		mmap_locked = true;
> +		src_vma = find_vma(mm, src_start);
> +		if (!src_vma)
> +			goto out_unlock_mmap;
> +		dst_vma = find_vma(mm, dst_start);

Again, there is a difference in how find_vma and lock_vam_under_rcu
works.

> +		if (!dst_vma)
> +			goto out_unlock_mmap;
> +	}
> +
> +	/* Re-check after taking map_changing_lock */
> +	down_read(&ctx->map_changing_lock);
> +	if (likely(atomic_read(&ctx->mmap_changing))) {
> +		err = -EAGAIN;
> +		goto out_unlock;
> +	}
>  	/*
>  	 * Make sure the vma is not shared, that the src and dst remap
>  	 * ranges are both valid and fully within a single existing
>  	 * vma.
>  	 */
> -	src_vma = find_vma(mm, src_start);
> -	if (!src_vma || (src_vma->vm_flags & VM_SHARED))
> -		goto out;
> +	if (src_vma->vm_flags & VM_SHARED)
> +		goto out_unlock;
>  	if (src_start < src_vma->vm_start ||
>  	    src_start + len > src_vma->vm_end)
> -		goto out;
> +		goto out_unlock;
>  
> -	dst_vma = find_vma(mm, dst_start);
> -	if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
> -		goto out;
> +	if (dst_vma->vm_flags & VM_SHARED)
> +		goto out_unlock;
>  	if (dst_start < dst_vma->vm_start ||
>  	    dst_start + len > dst_vma->vm_end)
> -		goto out;
> +		goto out_unlock;
>  
>  	err = validate_move_areas(ctx, src_vma, dst_vma);
>  	if (err)
> -		goto out;
> +		goto out_unlock;
>  
>  	for (src_addr = src_start, dst_addr = dst_start;
>  	     src_addr < src_start + len;) {
> @@ -1512,6 +1564,15 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
>  		moved += step_size;
>  	}
>  
> +out_unlock:
> +	up_read(&ctx->map_changing_lock);
> +out_unlock_mmap:
> +	if (mmap_locked)
> +		mmap_read_unlock(mm);
> +	else {
> +		vma_end_read(dst_vma);
> +		vma_end_read(src_vma);
> +	}
>  out:
>  	VM_WARN_ON(moved < 0);
>  	VM_WARN_ON(err > 0);
> -- 
> 2.43.0.429.g432eaa2c6b-goog
> 
> 

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v2 0/3] per-vma locks in userfaultfd
  2024-01-29 19:35 [PATCH v2 0/3] per-vma locks in userfaultfd Lokesh Gidra
                   ` (2 preceding siblings ...)
  2024-01-29 19:35 ` [PATCH v2 3/3] userfaultfd: use per-vma locks in userfaultfd operations Lokesh Gidra
@ 2024-01-29 20:39 ` Liam R. Howlett
  2024-01-29 21:58   ` Lokesh Gidra
  3 siblings, 1 reply; 35+ messages in thread
From: Liam R. Howlett @ 2024-01-29 20:39 UTC (permalink / raw)
  To: Lokesh Gidra
  Cc: akpm, linux-fsdevel, linux-mm, linux-kernel, selinux, surenb,
	kernel-team, aarcange, peterx, david, axelrasmussen, bgeffon,
	willy, jannh, kaleshsingh, ngeoffray, timmurray, rppt

* Lokesh Gidra <lokeshgidra@google.com> [240129 14:35]:
> Performing userfaultfd operations (like copy/move etc.) in critical
> section of mmap_lock (read-mode) causes significant contention on the
> lock when operations requiring the lock in write-mode are taking place
> concurrently. We can use per-vma locks instead to significantly reduce
> the contention issue.

Is this really an issue?  I'm surprised so much userfaultfd work is
happening to create contention.  Can you share some numbers and how your
patch set changes the performance?

> 
> Changes since v1 [1]:
> - rebase patches on 'mm-unstable' branch
> 
> [1] https://lore.kernel.org/all/20240126182647.2748949-1-lokeshgidra@google.com/
> 
> Lokesh Gidra (3):
>   userfaultfd: move userfaultfd_ctx struct to header file
>   userfaultfd: protect mmap_changing with rw_sem in userfaulfd_ctx
>   userfaultfd: use per-vma locks in userfaultfd operations
> 
>  fs/userfaultfd.c              |  86 ++++---------
>  include/linux/userfaultfd_k.h |  75 ++++++++---
>  mm/userfaultfd.c              | 229 ++++++++++++++++++++++------------
>  3 files changed, 229 insertions(+), 161 deletions(-)
> 
> -- 
> 2.43.0.429.g432eaa2c6b-goog
> 
> 

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v2 3/3] userfaultfd: use per-vma locks in userfaultfd operations
  2024-01-29 20:36   ` Liam R. Howlett
@ 2024-01-29 20:52     ` Suren Baghdasaryan
  2024-01-29 21:18       ` Liam R. Howlett
  2024-01-30  0:28       ` Lokesh Gidra
  0 siblings, 2 replies; 35+ messages in thread
From: Suren Baghdasaryan @ 2024-01-29 20:52 UTC (permalink / raw)
  To: Liam R. Howlett, Lokesh Gidra, akpm, linux-fsdevel, linux-mm,
	linux-kernel, selinux, surenb, kernel-team, aarcange, peterx,
	david, axelrasmussen, bgeffon, willy, jannh, kaleshsingh,
	ngeoffray, timmurray, rppt

On Mon, Jan 29, 2024 at 12:36 PM Liam R. Howlett
<Liam.Howlett@oracle.com> wrote:
>
> * Lokesh Gidra <lokeshgidra@google.com> [240129 14:35]:
> > All userfaultfd operations, except write-protect, opportunistically use
> > per-vma locks to lock vmas. If we fail then fall back to locking
> > mmap-lock in read-mode.
> >
> > Write-protect operation requires mmap_lock as it iterates over multiple vmas.
> >
> > Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
> > ---
> >  fs/userfaultfd.c              |  13 +--
> >  include/linux/userfaultfd_k.h |   5 +-
> >  mm/userfaultfd.c              | 175 +++++++++++++++++++++++-----------
> >  3 files changed, 122 insertions(+), 71 deletions(-)
> >
> > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > index c00a021bcce4..60dcfafdc11a 100644
> > --- a/fs/userfaultfd.c
> > +++ b/fs/userfaultfd.c
> > @@ -2005,17 +2005,8 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx,
> >               return -EINVAL;
> >
> >       if (mmget_not_zero(mm)) {
> > -             mmap_read_lock(mm);
> > -
> > -             /* Re-check after taking map_changing_lock */
> > -             down_read(&ctx->map_changing_lock);
> > -             if (likely(!atomic_read(&ctx->mmap_changing)))
> > -                     ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
> > -                                      uffdio_move.len, uffdio_move.mode);
> > -             else
> > -                     ret = -EAGAIN;
> > -             up_read(&ctx->map_changing_lock);
> > -             mmap_read_unlock(mm);
> > +             ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src,
> > +                              uffdio_move.len, uffdio_move.mode);
> >               mmput(mm);
> >       } else {
> >               return -ESRCH;
> > diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> > index 3210c3552976..05d59f74fc88 100644
> > --- a/include/linux/userfaultfd_k.h
> > +++ b/include/linux/userfaultfd_k.h
> > @@ -138,9 +138,8 @@ extern long uffd_wp_range(struct vm_area_struct *vma,
> >  /* move_pages */
> >  void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2);
> >  void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2);
> > -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > -                unsigned long dst_start, unsigned long src_start,
> > -                unsigned long len, __u64 flags);
> > +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
> > +                unsigned long src_start, unsigned long len, __u64 flags);
> >  int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
> >                       struct vm_area_struct *dst_vma,
> >                       struct vm_area_struct *src_vma,
> > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> > index 6e2ca04ab04d..d55bf18b80db 100644
> > --- a/mm/userfaultfd.c
> > +++ b/mm/userfaultfd.c
> > @@ -19,20 +19,39 @@
> >  #include <asm/tlb.h>
> >  #include "internal.h"
> >
> > -static __always_inline
> > -struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
> > -                                 unsigned long dst_start,
> > -                                 unsigned long len)
> > +void unpin_vma(struct mm_struct *mm, struct vm_area_struct *vma, bool *mmap_locked)
> > +{
> > +     BUG_ON(!vma && !*mmap_locked);
> > +
> > +     if (*mmap_locked) {
> > +             mmap_read_unlock(mm);
> > +             *mmap_locked = false;
> > +     } else
> > +             vma_end_read(vma);
>
> You are missing braces here.
>
> This function is small so it could be inline, although I hope the
> compiler would get that right for us.
>
> I don't think this small helper is worth it, considering you are
> altering a pointer in here, which makes things harder to follow (not to
> mention the locking).  The only code that depends on this update is a
> single place, which already assigns a custom variable after the function
> return.
>
> > +}
> > +
> > +/*
> > + * Search for VMA and make sure it is stable either by locking it or taking
> > + * mmap_lock.
>
> This function returns something that isn't documented and also sets a
> boolean which is passed in as a pointer which also is lacking from the
> documentation.
>
> > + */
> > +struct vm_area_struct *find_and_pin_dst_vma(struct mm_struct *dst_mm,
> > +                                         unsigned long dst_start,
> > +                                         unsigned long len,
> > +                                         bool *mmap_locked)
> >  {
> > +     struct vm_area_struct *dst_vma = lock_vma_under_rcu(dst_mm, dst_start);
>
> lock_vma_under_rcu() calls mas_walk(), which goes to dst_start for the
> VMA.  It is not possible for dst_start to be outside the range.
>
> > +     if (!dst_vma) {
>
> BUG_ON(mmap_locked) ?
>
> > +             mmap_read_lock(dst_mm);
> > +             *mmap_locked = true;
> > +             dst_vma = find_vma(dst_mm, dst_start);
>
> find_vma() walks to dst_start and searches upwards from that address.
> This is functionally different than what you have asked for above.  You
> will not see an issue as you have coded it - but it may be suboptimal
> since a start address lower than the VMA you are looking for can be
> found... however, later you check the range falls between the dst_start
> and dst_start + len.
>
> If you expect the dst_start to always be within the VMA range and not
> lower, then you should use vma_lookup().
>
> If you want to search upwards from dst_start for a VMA then you should
> move the range check below into this brace.
>
> > +     }
> > +
> >       /*
> >        * Make sure that the dst range is both valid and fully within a
> >        * single existing vma.
> >        */
> > -     struct vm_area_struct *dst_vma;
> > -
> > -     dst_vma = find_vma(dst_mm, dst_start);
> >       if (!range_in_vma(dst_vma, dst_start, dst_start + len))
> > -             return NULL;
> > +             goto unpin;
> >
> >       /*
> >        * Check the vma is registered in uffd, this is required to
> > @@ -40,9 +59,13 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
> >        * time.
> >        */
> >       if (!dst_vma->vm_userfaultfd_ctx.ctx)
> > -             return NULL;
> > +             goto unpin;
> >
> >       return dst_vma;
> > +
> > +unpin:
> > +     unpin_vma(dst_mm, dst_vma, mmap_locked);
> > +     return NULL;
> >  }
> >
> >  /* Check if dst_addr is outside of file's size. Must be called with ptl held. */
> > @@ -350,7 +373,8 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
> >  #ifdef CONFIG_HUGETLB_PAGE
> >  /*
> >   * mfill_atomic processing for HUGETLB vmas.  Note that this routine is
> > - * called with mmap_lock held, it will release mmap_lock before returning.
> > + * called with either vma-lock or mmap_lock held, it will release the lock
> > + * before returning.
> >   */
> >  static __always_inline ssize_t mfill_atomic_hugetlb(
> >                                             struct userfaultfd_ctx *ctx,
> > @@ -358,7 +382,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> >                                             unsigned long dst_start,
> >                                             unsigned long src_start,
> >                                             unsigned long len,
> > -                                           uffd_flags_t flags)
> > +                                           uffd_flags_t flags,
> > +                                           bool *mmap_locked)
> >  {
> >       struct mm_struct *dst_mm = dst_vma->vm_mm;
> >       int vm_shared = dst_vma->vm_flags & VM_SHARED;
> > @@ -380,7 +405,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> >        */
> >       if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
> >               up_read(&ctx->map_changing_lock);
> > -             mmap_read_unlock(dst_mm);
> > +             unpin_vma(dst_mm, dst_vma, mmap_locked);
> >               return -EINVAL;
> >       }
> >
> > @@ -404,12 +429,25 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> >        */
> >       if (!dst_vma) {
> >               err = -ENOENT;
> > -             dst_vma = find_dst_vma(dst_mm, dst_start, len);
> > -             if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
> > -                     goto out_unlock;
> > +             dst_vma = find_and_pin_dst_vma(dst_mm, dst_start,
> > +                                            len, mmap_locked);
> > +             if (!dst_vma)
> > +                     goto out;
> > +             if (!is_vm_hugetlb_page(dst_vma))
> > +                     goto out_unlock_vma;
> >
> >               err = -EINVAL;
> >               if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
> > +                     goto out_unlock_vma;
> > +
> > +             /*
> > +              * If memory mappings are changing because of non-cooperative
> > +              * operation (e.g. mremap) running in parallel, bail out and
> > +              * request the user to retry later
> > +              */
> > +             down_read(&ctx->map_changing_lock);
> > +             err = -EAGAIN;
> > +             if (atomic_read(&ctx->mmap_changing))
> >                       goto out_unlock;
> >
> >               vm_shared = dst_vma->vm_flags & VM_SHARED;
> > @@ -465,7 +503,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> >
> >               if (unlikely(err == -ENOENT)) {
> >                       up_read(&ctx->map_changing_lock);
> > -                     mmap_read_unlock(dst_mm);
> > +                     unpin_vma(dst_mm, dst_vma, mmap_locked);
> >                       BUG_ON(!folio);
> >
> >                       err = copy_folio_from_user(folio,
> > @@ -474,17 +512,6 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> >                               err = -EFAULT;
> >                               goto out;
> >                       }
> > -                     mmap_read_lock(dst_mm);
> > -                     down_read(&ctx->map_changing_lock);
> > -                     /*
> > -                      * If memory mappings are changing because of non-cooperative
> > -                      * operation (e.g. mremap) running in parallel, bail out and
> > -                      * request the user to retry later
> > -                      */
> > -                     if (atomic_read(ctx->mmap_changing)) {
> > -                             err = -EAGAIN;
> > -                             break;
> > -                     }
>
> ... Okay, this is where things get confusing.
>
> How about this: Don't do this locking/boolean dance.
>
> Instead, do something like this:
> In mm/memory.c, below lock_vma_under_rcu(), but something like this
>
> struct vm_area_struct *lock_vma(struct mm_struct *mm,
>         unsigned long addr))    /* or some better name.. */
> {
>         struct vm_area_struct *vma;
>
>         vma = lock_vma_under_rcu(mm, addr);
>
>         if (vma)
>                 return vma;
>
>         mmap_read_lock(mm);
>         vma = lookup_vma(mm, addr);
>         if (vma)
>                 vma_start_read(vma); /* Won't fail */

Please don't assume vma_start_read() won't fail even when you have
mmap_read_lock(). See the comment in vma_start_read() about the
possibility of an overflow producing false negatives.

>
>         mmap_read_unlock(mm);
>         return vma;
> }
>
> Now, we know we have a vma that's vma locked if there is a vma.  The vma
> won't go away - you have it locked.  The mmap lock is held for even
> less time for your worse case, and the code gets easier to follow.
>
> Once you are done with the vma do a vma_end_read(vma).  Don't forget to
> do this!
>
> Now the comment above such a function should state that the vma needs to
> be vma_end_read(vma), or that could go undetected..  It might be worth
> adding a unlock_vma() counterpart to vma_end_read(vma) even.

Locking VMA while holding mmap_read_lock is an interesting usage
pattern I haven't seen yet. I think this should work quite well!

>
>
> >
> >                       dst_vma = NULL;
> >                       goto retry;
> > @@ -505,7 +532,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> >
> >  out_unlock:
> >       up_read(&ctx->map_changing_lock);
> > -     mmap_read_unlock(dst_mm);
> > +out_unlock_vma:
> > +     unpin_vma(dst_mm, dst_vma, mmap_locked);
> >  out:
> >       if (folio)
> >               folio_put(folio);
> > @@ -521,7 +549,8 @@ extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
> >                                   unsigned long dst_start,
> >                                   unsigned long src_start,
> >                                   unsigned long len,
> > -                                 uffd_flags_t flags);
> > +                                 uffd_flags_t flags,
> > +                                 bool *mmap_locked);
>
> Just a thought, tabbing in twice for each argument would make this more
> compact.
>
>
> >  #endif /* CONFIG_HUGETLB_PAGE */
> >
> >  static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
> > @@ -581,6 +610,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> >       unsigned long src_addr, dst_addr;
> >       long copied;
> >       struct folio *folio;
> > +     bool mmap_locked = false;
> >
> >       /*
> >        * Sanitize the command parameters:
> > @@ -597,7 +627,14 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> >       copied = 0;
> >       folio = NULL;
> >  retry:
> > -     mmap_read_lock(dst_mm);
> > +     /*
> > +      * Make sure the vma is not shared, that the dst range is
> > +      * both valid and fully within a single existing vma.
> > +      */
> > +     err = -ENOENT;
> > +     dst_vma = find_and_pin_dst_vma(dst_mm, dst_start, len, &mmap_locked);
> > +     if (!dst_vma)
> > +             goto out;
> >
> >       /*
> >        * If memory mappings are changing because of non-cooperative
> > @@ -609,15 +646,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> >       if (atomic_read(&ctx->mmap_changing))
> >               goto out_unlock;
> >
> > -     /*
> > -      * Make sure the vma is not shared, that the dst range is
> > -      * both valid and fully within a single existing vma.
> > -      */
> > -     err = -ENOENT;
> > -     dst_vma = find_dst_vma(dst_mm, dst_start, len);
> > -     if (!dst_vma)
> > -             goto out_unlock;
> > -
> >       err = -EINVAL;
> >       /*
> >        * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
> > @@ -638,8 +666,8 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> >        * If this is a HUGETLB vma, pass off to appropriate routine
> >        */
> >       if (is_vm_hugetlb_page(dst_vma))
> > -             return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
> > -                                          src_start, len, flags);
> > +             return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start, src_start
> > +                                          len, flags, &mmap_locked);
> >
> >       if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
> >               goto out_unlock;
> > @@ -699,7 +727,8 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> >                       void *kaddr;
> >
> >                       up_read(&ctx->map_changing_lock);
> > -                     mmap_read_unlock(dst_mm);
> > +                     unpin_vma(dst_mm, dst_vma, &mmap_locked);
> > +
> >                       BUG_ON(!folio);
> >
> >                       kaddr = kmap_local_folio(folio, 0);
> > @@ -730,7 +759,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> >
> >  out_unlock:
> >       up_read(&ctx->map_changing_lock);
> > -     mmap_read_unlock(dst_mm);
> > +     unpin_vma(dst_mm, dst_vma, &mmap_locked);
> >  out:
> >       if (folio)
> >               folio_put(folio);
> > @@ -1285,8 +1314,6 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
> >   * @len: length of the virtual memory range
> >   * @mode: flags from uffdio_move.mode
> >   *
> > - * Must be called with mmap_lock held for read.
> > - *
> >   * move_pages() remaps arbitrary anonymous pages atomically in zero
> >   * copy. It only works on non shared anonymous pages because those can
> >   * be relocated without generating non linear anon_vmas in the rmap
> > @@ -1353,15 +1380,16 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
> >   * could be obtained. This is the only additional complexity added to
> >   * the rmap code to provide this anonymous page remapping functionality.
> >   */
> > -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > -                unsigned long dst_start, unsigned long src_start,
> > -                unsigned long len, __u64 mode)
> > +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
> > +                unsigned long src_start, unsigned long len, __u64 mode)
> >  {
> > +     struct mm_struct *mm = ctx->mm;
> >       struct vm_area_struct *src_vma, *dst_vma;
> >       unsigned long src_addr, dst_addr;
> >       pmd_t *src_pmd, *dst_pmd;
> >       long err = -EINVAL;
> >       ssize_t moved = 0;
> > +     bool mmap_locked = false;
> >
> >       /* Sanitize the command parameters. */
> >       if (WARN_ON_ONCE(src_start & ~PAGE_MASK) ||
> > @@ -1374,28 +1402,52 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> >           WARN_ON_ONCE(dst_start + len <= dst_start))
> >               goto out;
>
> Ah, is this safe for rmap?  I think you need to leave this read lock.
>
> >
> > +     dst_vma = NULL;
> > +     src_vma = lock_vma_under_rcu(mm, src_start);
> > +     if (src_vma) {
> > +             dst_vma = lock_vma_under_rcu(mm, dst_start);
> > +             if (!dst_vma)
> > +                     vma_end_read(src_vma);
> > +     }
> > +
> > +     /* If we failed to lock both VMAs, fall back to mmap_lock */
> > +     if (!dst_vma) {
> > +             mmap_read_lock(mm);
> > +             mmap_locked = true;
> > +             src_vma = find_vma(mm, src_start);
> > +             if (!src_vma)
> > +                     goto out_unlock_mmap;
> > +             dst_vma = find_vma(mm, dst_start);
>
> Again, there is a difference in how find_vma and lock_vam_under_rcu
> works.
>
> > +             if (!dst_vma)
> > +                     goto out_unlock_mmap;
> > +     }
> > +
> > +     /* Re-check after taking map_changing_lock */
> > +     down_read(&ctx->map_changing_lock);
> > +     if (likely(atomic_read(&ctx->mmap_changing))) {
> > +             err = -EAGAIN;
> > +             goto out_unlock;
> > +     }
> >       /*
> >        * Make sure the vma is not shared, that the src and dst remap
> >        * ranges are both valid and fully within a single existing
> >        * vma.
> >        */
> > -     src_vma = find_vma(mm, src_start);
> > -     if (!src_vma || (src_vma->vm_flags & VM_SHARED))
> > -             goto out;
> > +     if (src_vma->vm_flags & VM_SHARED)
> > +             goto out_unlock;
> >       if (src_start < src_vma->vm_start ||
> >           src_start + len > src_vma->vm_end)
> > -             goto out;
> > +             goto out_unlock;
> >
> > -     dst_vma = find_vma(mm, dst_start);
> > -     if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
> > -             goto out;
> > +     if (dst_vma->vm_flags & VM_SHARED)
> > +             goto out_unlock;
> >       if (dst_start < dst_vma->vm_start ||
> >           dst_start + len > dst_vma->vm_end)
> > -             goto out;
> > +             goto out_unlock;
> >
> >       err = validate_move_areas(ctx, src_vma, dst_vma);
> >       if (err)
> > -             goto out;
> > +             goto out_unlock;
> >
> >       for (src_addr = src_start, dst_addr = dst_start;
> >            src_addr < src_start + len;) {
> > @@ -1512,6 +1564,15 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> >               moved += step_size;
> >       }
> >
> > +out_unlock:
> > +     up_read(&ctx->map_changing_lock);
> > +out_unlock_mmap:
> > +     if (mmap_locked)
> > +             mmap_read_unlock(mm);
> > +     else {
> > +             vma_end_read(dst_vma);
> > +             vma_end_read(src_vma);
> > +     }
> >  out:
> >       VM_WARN_ON(moved < 0);
> >       VM_WARN_ON(err > 0);
> > --
> > 2.43.0.429.g432eaa2c6b-goog
> >
> >

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v2 2/3] userfaultfd: protect mmap_changing with rw_sem in userfaulfd_ctx
  2024-01-29 19:35 ` [PATCH v2 2/3] userfaultfd: protect mmap_changing with rw_sem in userfaulfd_ctx Lokesh Gidra
@ 2024-01-29 21:00   ` Liam R. Howlett
  2024-01-29 22:35     ` Lokesh Gidra
  2024-01-30  7:21   ` Mike Rapoport
  1 sibling, 1 reply; 35+ messages in thread
From: Liam R. Howlett @ 2024-01-29 21:00 UTC (permalink / raw)
  To: Lokesh Gidra
  Cc: akpm, linux-fsdevel, linux-mm, linux-kernel, selinux, surenb,
	kernel-team, aarcange, peterx, david, axelrasmussen, bgeffon,
	willy, jannh, kaleshsingh, ngeoffray, timmurray, rppt

* Lokesh Gidra <lokeshgidra@google.com> [240129 14:35]:
> Increments and loads to mmap_changing are always in mmap_lock
> critical section.

Read or write?


> This ensures that if userspace requests event
> notification for non-cooperative operations (e.g. mremap), userfaultfd
> operations don't occur concurrently.
> 
> This can be achieved by using a separate read-write semaphore in
> userfaultfd_ctx such that increments are done in write-mode and loads
> in read-mode, thereby eliminating the dependency on mmap_lock for this
> purpose.
> 
> This is a preparatory step before we replace mmap_lock usage with
> per-vma locks in fill/move ioctls.
> 
> Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
> ---
>  fs/userfaultfd.c              | 40 ++++++++++++----------
>  include/linux/userfaultfd_k.h | 31 ++++++++++--------
>  mm/userfaultfd.c              | 62 ++++++++++++++++++++---------------
>  3 files changed, 75 insertions(+), 58 deletions(-)
> 
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index 58331b83d648..c00a021bcce4 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -685,12 +685,15 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
>  		ctx->flags = octx->flags;
>  		ctx->features = octx->features;
>  		ctx->released = false;
> +		init_rwsem(&ctx->map_changing_lock);
>  		atomic_set(&ctx->mmap_changing, 0);
>  		ctx->mm = vma->vm_mm;
>  		mmgrab(ctx->mm);
>  
>  		userfaultfd_ctx_get(octx);
> +		down_write(&octx->map_changing_lock);
>  		atomic_inc(&octx->mmap_changing);
> +		up_write(&octx->map_changing_lock);

This can potentially hold up your writer as the readers execute.  I
think this will change your priority (ie: priority inversion)?

You could use the first bit of the atomic_inc as indication of a write.
So if the mmap_changing is even, then there are no writers.  If it
didn't change and it's even then you know no modification has happened
(or it overflowed and hit the same number which would be rare, but
maybe okay?).

>  		fctx->orig = octx;
>  		fctx->new = ctx;
>  		list_add_tail(&fctx->list, fcs);
> @@ -737,7 +740,9 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
>  	if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
>  		vm_ctx->ctx = ctx;
>  		userfaultfd_ctx_get(ctx);
> +		down_write(&ctx->map_changing_lock);
>  		atomic_inc(&ctx->mmap_changing);
> +		up_write(&ctx->map_changing_lock);
>  	} else {
>  		/* Drop uffd context if remap feature not enabled */
>  		vma_start_write(vma);
> @@ -783,7 +788,9 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
>  		return true;
>  
>  	userfaultfd_ctx_get(ctx);
> +	down_write(&ctx->map_changing_lock);
>  	atomic_inc(&ctx->mmap_changing);
> +	up_write(&ctx->map_changing_lock);
>  	mmap_read_unlock(mm);
>  
>  	msg_init(&ewq.msg);
> @@ -825,7 +832,9 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start,
>  		return -ENOMEM;
>  
>  	userfaultfd_ctx_get(ctx);
> +	down_write(&ctx->map_changing_lock);
>  	atomic_inc(&ctx->mmap_changing);
> +	up_write(&ctx->map_changing_lock);
>  	unmap_ctx->ctx = ctx;
>  	unmap_ctx->start = start;
>  	unmap_ctx->end = end;
> @@ -1709,9 +1718,8 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
>  	if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
>  		flags |= MFILL_ATOMIC_WP;
>  	if (mmget_not_zero(ctx->mm)) {
> -		ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
> -					uffdio_copy.len, &ctx->mmap_changing,
> -					flags);
> +		ret = mfill_atomic_copy(ctx, uffdio_copy.dst, uffdio_copy.src,
> +					uffdio_copy.len, flags);
>  		mmput(ctx->mm);
>  	} else {
>  		return -ESRCH;
> @@ -1761,9 +1769,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
>  		goto out;
>  
>  	if (mmget_not_zero(ctx->mm)) {
> -		ret = mfill_atomic_zeropage(ctx->mm, uffdio_zeropage.range.start,
> -					   uffdio_zeropage.range.len,
> -					   &ctx->mmap_changing);
> +		ret = mfill_atomic_zeropage(ctx, uffdio_zeropage.range.start,
> +					   uffdio_zeropage.range.len);
>  		mmput(ctx->mm);
>  	} else {
>  		return -ESRCH;
> @@ -1818,9 +1825,8 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
>  		return -EINVAL;
>  
>  	if (mmget_not_zero(ctx->mm)) {
> -		ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
> -					  uffdio_wp.range.len, mode_wp,
> -					  &ctx->mmap_changing);
> +		ret = mwriteprotect_range(ctx, uffdio_wp.range.start,
> +					  uffdio_wp.range.len, mode_wp);
>  		mmput(ctx->mm);
>  	} else {
>  		return -ESRCH;
> @@ -1870,9 +1876,8 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
>  		flags |= MFILL_ATOMIC_WP;
>  
>  	if (mmget_not_zero(ctx->mm)) {
> -		ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start,
> -					    uffdio_continue.range.len,
> -					    &ctx->mmap_changing, flags);
> +		ret = mfill_atomic_continue(ctx, uffdio_continue.range.start,
> +					    uffdio_continue.range.len, flags);
>  		mmput(ctx->mm);
>  	} else {
>  		return -ESRCH;
> @@ -1925,9 +1930,8 @@ static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long
>  		goto out;
>  
>  	if (mmget_not_zero(ctx->mm)) {
> -		ret = mfill_atomic_poison(ctx->mm, uffdio_poison.range.start,
> -					  uffdio_poison.range.len,
> -					  &ctx->mmap_changing, 0);
> +		ret = mfill_atomic_poison(ctx, uffdio_poison.range.start,
> +					  uffdio_poison.range.len, 0);
>  		mmput(ctx->mm);
>  	} else {
>  		return -ESRCH;
> @@ -2003,13 +2007,14 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx,
>  	if (mmget_not_zero(mm)) {
>  		mmap_read_lock(mm);
>  
> -		/* Re-check after taking mmap_lock */
> +		/* Re-check after taking map_changing_lock */
> +		down_read(&ctx->map_changing_lock);
>  		if (likely(!atomic_read(&ctx->mmap_changing)))
>  			ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
>  					 uffdio_move.len, uffdio_move.mode);
>  		else
>  			ret = -EAGAIN;
> -
> +		up_read(&ctx->map_changing_lock);
>  		mmap_read_unlock(mm);
>  		mmput(mm);
>  	} else {
> @@ -2216,6 +2221,7 @@ static int new_userfaultfd(int flags)
>  	ctx->flags = flags;
>  	ctx->features = 0;
>  	ctx->released = false;
> +	init_rwsem(&ctx->map_changing_lock);
>  	atomic_set(&ctx->mmap_changing, 0);
>  	ctx->mm = current->mm;
>  	/* prevent the mm struct to be freed */
> diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> index 691d928ee864..3210c3552976 100644
> --- a/include/linux/userfaultfd_k.h
> +++ b/include/linux/userfaultfd_k.h
> @@ -69,6 +69,13 @@ struct userfaultfd_ctx {
>  	unsigned int features;
>  	/* released */
>  	bool released;
> +	/*
> +	 * Prevents userfaultfd operations (fill/move/wp) from happening while
> +	 * some non-cooperative event(s) is taking place. Increments are done
> +	 * in write-mode. Whereas, userfaultfd operations, which includes
> +	 * reading mmap_changing, is done under read-mode.
> +	 */
> +	struct rw_semaphore map_changing_lock;
>  	/* memory mappings are changing because of non-cooperative event */
>  	atomic_t mmap_changing;
>  	/* mm with one ore more vmas attached to this userfaultfd_ctx */
> @@ -113,22 +120,18 @@ extern int mfill_atomic_install_pte(pmd_t *dst_pmd,
>  				    unsigned long dst_addr, struct page *page,
>  				    bool newly_allocated, uffd_flags_t flags);
>  
> -extern ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
> +extern ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
>  				 unsigned long src_start, unsigned long len,
> -				 atomic_t *mmap_changing, uffd_flags_t flags);
> -extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm,
> +				 uffd_flags_t flags);
> +extern ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
>  				     unsigned long dst_start,
> -				     unsigned long len,
> -				     atomic_t *mmap_changing);
> -extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start,
> -				     unsigned long len, atomic_t *mmap_changing,
> -				     uffd_flags_t flags);
> -extern ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
> -				   unsigned long len, atomic_t *mmap_changing,
> -				   uffd_flags_t flags);
> -extern int mwriteprotect_range(struct mm_struct *dst_mm,
> -			       unsigned long start, unsigned long len,
> -			       bool enable_wp, atomic_t *mmap_changing);
> +				     unsigned long len);
> +extern ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long dst_start,
> +				     unsigned long len, uffd_flags_t flags);
> +extern ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
> +				   unsigned long len, uffd_flags_t flags);
> +extern int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
> +			       unsigned long len, bool enable_wp);
>  extern long uffd_wp_range(struct vm_area_struct *vma,
>  			  unsigned long start, unsigned long len, bool enable_wp);
>  
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index e3a91871462a..6e2ca04ab04d 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -353,11 +353,11 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
>   * called with mmap_lock held, it will release mmap_lock before returning.
>   */
>  static __always_inline ssize_t mfill_atomic_hugetlb(
> +					      struct userfaultfd_ctx *ctx,
>  					      struct vm_area_struct *dst_vma,
>  					      unsigned long dst_start,
>  					      unsigned long src_start,
>  					      unsigned long len,
> -					      atomic_t *mmap_changing,
>  					      uffd_flags_t flags)
>  {
>  	struct mm_struct *dst_mm = dst_vma->vm_mm;
> @@ -379,6 +379,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
>  	 * feature is not supported.
>  	 */
>  	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
> +		up_read(&ctx->map_changing_lock);
>  		mmap_read_unlock(dst_mm);
>  		return -EINVAL;
>  	}
> @@ -463,6 +464,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
>  		cond_resched();
>  
>  		if (unlikely(err == -ENOENT)) {
> +			up_read(&ctx->map_changing_lock);
>  			mmap_read_unlock(dst_mm);
>  			BUG_ON(!folio);
>  
> @@ -473,12 +475,13 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
>  				goto out;
>  			}
>  			mmap_read_lock(dst_mm);
> +			down_read(&ctx->map_changing_lock);
>  			/*
>  			 * If memory mappings are changing because of non-cooperative
>  			 * operation (e.g. mremap) running in parallel, bail out and
>  			 * request the user to retry later
>  			 */
> -			if (mmap_changing && atomic_read(mmap_changing)) {
> +			if (atomic_read(ctx->mmap_changing)) {
>  				err = -EAGAIN;
>  				break;
>  			}
> @@ -501,6 +504,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
>  	}
>  
>  out_unlock:
> +	up_read(&ctx->map_changing_lock);
>  	mmap_read_unlock(dst_mm);
>  out:
>  	if (folio)
> @@ -512,11 +516,11 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
>  }
>  #else /* !CONFIG_HUGETLB_PAGE */
>  /* fail at build time if gcc attempts to use this */
> -extern ssize_t mfill_atomic_hugetlb(struct vm_area_struct *dst_vma,
> +extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
> +				    struct vm_area_struct *dst_vma,
>  				    unsigned long dst_start,
>  				    unsigned long src_start,
>  				    unsigned long len,
> -				    atomic_t *mmap_changing,
>  				    uffd_flags_t flags);
>  #endif /* CONFIG_HUGETLB_PAGE */
>  
> @@ -564,13 +568,13 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
>  	return err;
>  }
>  
> -static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
> +static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
>  					    unsigned long dst_start,
>  					    unsigned long src_start,
>  					    unsigned long len,
> -					    atomic_t *mmap_changing,
>  					    uffd_flags_t flags)
>  {
> +	struct mm_struct *dst_mm = ctx->mm;
>  	struct vm_area_struct *dst_vma;
>  	ssize_t err;
>  	pmd_t *dst_pmd;
> @@ -600,8 +604,9 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
>  	 * operation (e.g. mremap) running in parallel, bail out and
>  	 * request the user to retry later
>  	 */
> +	down_read(&ctx->map_changing_lock);
>  	err = -EAGAIN;
> -	if (mmap_changing && atomic_read(mmap_changing))
> +	if (atomic_read(&ctx->mmap_changing))
>  		goto out_unlock;
>  
>  	/*
> @@ -633,8 +638,8 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
>  	 * If this is a HUGETLB vma, pass off to appropriate routine
>  	 */
>  	if (is_vm_hugetlb_page(dst_vma))
> -		return  mfill_atomic_hugetlb(dst_vma, dst_start, src_start,
> -					     len, mmap_changing, flags);
> +		return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
> +					     src_start, len, flags);
>  
>  	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
>  		goto out_unlock;
> @@ -693,6 +698,7 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
>  		if (unlikely(err == -ENOENT)) {
>  			void *kaddr;
>  
> +			up_read(&ctx->map_changing_lock);
>  			mmap_read_unlock(dst_mm);
>  			BUG_ON(!folio);
>  
> @@ -723,6 +729,7 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
>  	}
>  
>  out_unlock:
> +	up_read(&ctx->map_changing_lock);
>  	mmap_read_unlock(dst_mm);
>  out:
>  	if (folio)
> @@ -733,34 +740,33 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
>  	return copied ? copied : err;
>  }
>  
> -ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
> +ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
>  			  unsigned long src_start, unsigned long len,
> -			  atomic_t *mmap_changing, uffd_flags_t flags)
> +			  uffd_flags_t flags)
>  {
> -	return mfill_atomic(dst_mm, dst_start, src_start, len, mmap_changing,
> +	return mfill_atomic(ctx, dst_start, src_start, len,
>  			    uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY));
>  }
>  
> -ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long start,
> -			      unsigned long len, atomic_t *mmap_changing)
> +ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
> +			      unsigned long start,
> +			      unsigned long len)
>  {
> -	return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
> +	return mfill_atomic(ctx, start, 0, len,
>  			    uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE));
>  }
>  
> -ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start,
> -			      unsigned long len, atomic_t *mmap_changing,
> -			      uffd_flags_t flags)
> +ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start,
> +			      unsigned long len, uffd_flags_t flags)
>  {
> -	return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
> +	return mfill_atomic(ctx, start, 0, len,
>  			    uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
>  }
>  
> -ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
> -			    unsigned long len, atomic_t *mmap_changing,
> -			    uffd_flags_t flags)
> +ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
> +			    unsigned long len, uffd_flags_t flags)
>  {
> -	return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
> +	return mfill_atomic(ctx, start, 0, len,
>  			    uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON));
>  }
>  
> @@ -793,10 +799,10 @@ long uffd_wp_range(struct vm_area_struct *dst_vma,
>  	return ret;
>  }
>  
> -int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
> -			unsigned long len, bool enable_wp,
> -			atomic_t *mmap_changing)
> +int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
> +			unsigned long len, bool enable_wp)
>  {
> +	struct mm_struct *dst_mm = ctx->mm;
>  	unsigned long end = start + len;
>  	unsigned long _start, _end;
>  	struct vm_area_struct *dst_vma;
> @@ -820,8 +826,9 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
>  	 * operation (e.g. mremap) running in parallel, bail out and
>  	 * request the user to retry later
>  	 */
> +	down_read(&ctx->map_changing_lock);
>  	err = -EAGAIN;
> -	if (mmap_changing && atomic_read(mmap_changing))
> +	if (atomic_read(&ctx->mmap_changing))
>  		goto out_unlock;
>  
>  	err = -ENOENT;
> @@ -850,6 +857,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
>  		err = 0;
>  	}
>  out_unlock:
> +	up_read(&ctx->map_changing_lock);
>  	mmap_read_unlock(dst_mm);
>  	return err;
>  }
> -- 
> 2.43.0.429.g432eaa2c6b-goog
> 
> 

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v2 3/3] userfaultfd: use per-vma locks in userfaultfd operations
  2024-01-29 20:52     ` Suren Baghdasaryan
@ 2024-01-29 21:18       ` Liam R. Howlett
  2024-01-30  0:28       ` Lokesh Gidra
  1 sibling, 0 replies; 35+ messages in thread
From: Liam R. Howlett @ 2024-01-29 21:18 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: Lokesh Gidra, akpm, linux-fsdevel, linux-mm, linux-kernel,
	selinux, kernel-team, aarcange, peterx, david, axelrasmussen,
	bgeffon, willy, jannh, kaleshsingh, ngeoffray, timmurray, rppt

* Suren Baghdasaryan <surenb@google.com> [240129 15:53]:
> On Mon, Jan 29, 2024 at 12:36 PM Liam R. Howlett

...

> > > @@ -465,7 +503,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > >
> > >               if (unlikely(err == -ENOENT)) {
> > >                       up_read(&ctx->map_changing_lock);
> > > -                     mmap_read_unlock(dst_mm);
> > > +                     unpin_vma(dst_mm, dst_vma, mmap_locked);
> > >                       BUG_ON(!folio);
> > >
> > >                       err = copy_folio_from_user(folio,
> > > @@ -474,17 +512,6 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > >                               err = -EFAULT;
> > >                               goto out;
> > >                       }
> > > -                     mmap_read_lock(dst_mm);
> > > -                     down_read(&ctx->map_changing_lock);
> > > -                     /*
> > > -                      * If memory mappings are changing because of non-cooperative
> > > -                      * operation (e.g. mremap) running in parallel, bail out and
> > > -                      * request the user to retry later
> > > -                      */
> > > -                     if (atomic_read(ctx->mmap_changing)) {
> > > -                             err = -EAGAIN;
> > > -                             break;
> > > -                     }
> >
> > ... Okay, this is where things get confusing.
> >
> > How about this: Don't do this locking/boolean dance.
> >
> > Instead, do something like this:
> > In mm/memory.c, below lock_vma_under_rcu(), but something like this
> >
> > struct vm_area_struct *lock_vma(struct mm_struct *mm,
> >         unsigned long addr))    /* or some better name.. */
> > {
> >         struct vm_area_struct *vma;
> >
> >         vma = lock_vma_under_rcu(mm, addr);
> >
> >         if (vma)
> >                 return vma;
> >
> >         mmap_read_lock(mm);
> >         vma = lookup_vma(mm, addr);
> >         if (vma)
> >                 vma_start_read(vma); /* Won't fail */
> 
> Please don't assume vma_start_read() won't fail even when you have
> mmap_read_lock(). See the comment in vma_start_read() about the
> possibility of an overflow producing false negatives.

I did say something *like* this...

Thanks for catching my mistake.

> 
> >
> >         mmap_read_unlock(mm);
> >         return vma;
> > }
> >
> > Now, we know we have a vma that's vma locked if there is a vma.  The vma
> > won't go away - you have it locked.  The mmap lock is held for even
> > less time for your worse case, and the code gets easier to follow.
> >
> > Once you are done with the vma do a vma_end_read(vma).  Don't forget to
> > do this!
> >
> > Now the comment above such a function should state that the vma needs to
> > be vma_end_read(vma), or that could go undetected..  It might be worth
> > adding a unlock_vma() counterpart to vma_end_read(vma) even.
> 
> Locking VMA while holding mmap_read_lock is an interesting usage
> pattern I haven't seen yet. I think this should work quite well!

What concerns me is this working too well - for instance someone *ahem*
binder *ahem* forever and always isolating their VMA, or someone
forgetting to unlock and never noticing.

vma->vm_lock->lock being locked should be caught by lockdep on exit
though.

...

Thanks,
Liam

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v2 0/3] per-vma locks in userfaultfd
  2024-01-29 20:39 ` [PATCH v2 0/3] per-vma locks in userfaultfd Liam R. Howlett
@ 2024-01-29 21:58   ` Lokesh Gidra
  0 siblings, 0 replies; 35+ messages in thread
From: Lokesh Gidra @ 2024-01-29 21:58 UTC (permalink / raw)
  To: Liam R. Howlett, Lokesh Gidra, akpm, linux-fsdevel, linux-mm,
	linux-kernel, selinux, surenb, kernel-team, aarcange, peterx,
	david, axelrasmussen, bgeffon, willy, jannh, kaleshsingh,
	ngeoffray, timmurray, rppt

On Mon, Jan 29, 2024 at 12:39 PM Liam R. Howlett
<Liam.Howlett@oracle.com> wrote:
>
> * Lokesh Gidra <lokeshgidra@google.com> [240129 14:35]:
> > Performing userfaultfd operations (like copy/move etc.) in critical
> > section of mmap_lock (read-mode) causes significant contention on the
> > lock when operations requiring the lock in write-mode are taking place
> > concurrently. We can use per-vma locks instead to significantly reduce
> > the contention issue.
>
> Is this really an issue?  I'm surprised so much userfaultfd work is
> happening to create contention.  Can you share some numbers and how your
> patch set changes the performance?
>

In Android we are using userfaultfd for Android Runtime's GC
compaction. mmap-lock (write-mode) operations like mmap/munmap/mlock
happening simultaneously elsewhere in the process caused significant
contention. Of course, this doesn't happen during every compaction,
but whenever it does it leads to a jittery experience for the user.
During one such reproducible scenario, we observed the following
improvements with this patch-set:

- Wall clock time of compaction phase came down from ~3s to less than 500ms
- Uninterruptible sleep time (across all threads in the process) was
~10ms (none was in mmap_lock) during compaction, instead of >20s

I will add these numbers in the cover letter in the next version of
this patchset.

> >
> > Changes since v1 [1]:
> > - rebase patches on 'mm-unstable' branch
> >
> > [1] https://lore.kernel.org/all/20240126182647.2748949-1-lokeshgidra@google.com/
> >
> > Lokesh Gidra (3):
> >   userfaultfd: move userfaultfd_ctx struct to header file
> >   userfaultfd: protect mmap_changing with rw_sem in userfaulfd_ctx
> >   userfaultfd: use per-vma locks in userfaultfd operations
> >
> >  fs/userfaultfd.c              |  86 ++++---------
> >  include/linux/userfaultfd_k.h |  75 ++++++++---
> >  mm/userfaultfd.c              | 229 ++++++++++++++++++++++------------
> >  3 files changed, 229 insertions(+), 161 deletions(-)
> >
> > --
> > 2.43.0.429.g432eaa2c6b-goog
> >
> >

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v2 2/3] userfaultfd: protect mmap_changing with rw_sem in userfaulfd_ctx
  2024-01-29 21:00   ` Liam R. Howlett
@ 2024-01-29 22:35     ` Lokesh Gidra
  2024-01-30  3:46       ` Liam R. Howlett
  0 siblings, 1 reply; 35+ messages in thread
From: Lokesh Gidra @ 2024-01-29 22:35 UTC (permalink / raw)
  To: Liam R. Howlett, Lokesh Gidra, akpm, linux-fsdevel, linux-mm,
	linux-kernel, selinux, surenb, kernel-team, aarcange, peterx,
	david, axelrasmussen, bgeffon, willy, jannh, kaleshsingh,
	ngeoffray, timmurray, rppt

On Mon, Jan 29, 2024 at 1:00 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
>
> * Lokesh Gidra <lokeshgidra@google.com> [240129 14:35]:
> > Increments and loads to mmap_changing are always in mmap_lock
> > critical section.
>
> Read or write?
>
It's write-mode when incrementing (except in case of
userfaultfd_remove() where it's done in read-mode) and loads are in
mmap_lock (read-mode). I'll clarify this in the next version.
>
> > This ensures that if userspace requests event
> > notification for non-cooperative operations (e.g. mremap), userfaultfd
> > operations don't occur concurrently.
> >
> > This can be achieved by using a separate read-write semaphore in
> > userfaultfd_ctx such that increments are done in write-mode and loads
> > in read-mode, thereby eliminating the dependency on mmap_lock for this
> > purpose.
> >
> > This is a preparatory step before we replace mmap_lock usage with
> > per-vma locks in fill/move ioctls.
> >
> > Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
> > ---
> >  fs/userfaultfd.c              | 40 ++++++++++++----------
> >  include/linux/userfaultfd_k.h | 31 ++++++++++--------
> >  mm/userfaultfd.c              | 62 ++++++++++++++++++++---------------
> >  3 files changed, 75 insertions(+), 58 deletions(-)
> >
> > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > index 58331b83d648..c00a021bcce4 100644
> > --- a/fs/userfaultfd.c
> > +++ b/fs/userfaultfd.c
> > @@ -685,12 +685,15 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
> >               ctx->flags = octx->flags;
> >               ctx->features = octx->features;
> >               ctx->released = false;
> > +             init_rwsem(&ctx->map_changing_lock);
> >               atomic_set(&ctx->mmap_changing, 0);
> >               ctx->mm = vma->vm_mm;
> >               mmgrab(ctx->mm);
> >
> >               userfaultfd_ctx_get(octx);
> > +             down_write(&octx->map_changing_lock);
> >               atomic_inc(&octx->mmap_changing);
> > +             up_write(&octx->map_changing_lock);
>
> This can potentially hold up your writer as the readers execute.  I
> think this will change your priority (ie: priority inversion)?

Priority inversion, if any, is already happening due to mmap_lock, no?
Also, I thought rw_semaphore implementation is fair, so the writer
will eventually get the lock right? Please correct me if I'm wrong.

At this patch: there can't be any readers as they need to acquire
mmap_lock in read-mode first. While writers, at the point of
incrementing mmap_changing, already hold mmap_lock in write-mode.

With per-vma locks, the same synchronization that mmap_lock achieved
around mmap_changing, will be achieved by ctx->map_changing_lock.
>
> You could use the first bit of the atomic_inc as indication of a write.
> So if the mmap_changing is even, then there are no writers.  If it
> didn't change and it's even then you know no modification has happened
> (or it overflowed and hit the same number which would be rare, but
> maybe okay?).

This is already achievable, right? If mmap_changing is >0 then we know
there are writers. The problem is that we want writers (like mremap
operations) to block as long as there is a userfaultfd operation (also
reader of mmap_changing) going on. Please note that I'm inferring this
from current implementation.

AFAIU, mmap_changing isn't required for correctness, because all
operations are happening under the right mode of mmap_lock. It's used
to ensure that while a non-cooperative operations is happening, if the
user has asked it to be notified, then no other userfaultfd operations
should take place until the user gets the event notification.
>
> >               fctx->orig = octx;
> >               fctx->new = ctx;
> >               list_add_tail(&fctx->list, fcs);
> > @@ -737,7 +740,9 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
> >       if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
> >               vm_ctx->ctx = ctx;
> >               userfaultfd_ctx_get(ctx);
> > +             down_write(&ctx->map_changing_lock);
> >               atomic_inc(&ctx->mmap_changing);
> > +             up_write(&ctx->map_changing_lock);
> >       } else {
> >               /* Drop uffd context if remap feature not enabled */
> >               vma_start_write(vma);
> > @@ -783,7 +788,9 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
> >               return true;
> >
> >       userfaultfd_ctx_get(ctx);
> > +     down_write(&ctx->map_changing_lock);
> >       atomic_inc(&ctx->mmap_changing);
> > +     up_write(&ctx->map_changing_lock);
> >       mmap_read_unlock(mm);
> >
> >       msg_init(&ewq.msg);
> > @@ -825,7 +832,9 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start,
> >               return -ENOMEM;
> >
> >       userfaultfd_ctx_get(ctx);
> > +     down_write(&ctx->map_changing_lock);
> >       atomic_inc(&ctx->mmap_changing);
> > +     up_write(&ctx->map_changing_lock);
> >       unmap_ctx->ctx = ctx;
> >       unmap_ctx->start = start;
> >       unmap_ctx->end = end;
> > @@ -1709,9 +1718,8 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
> >       if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
> >               flags |= MFILL_ATOMIC_WP;
> >       if (mmget_not_zero(ctx->mm)) {
> > -             ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
> > -                                     uffdio_copy.len, &ctx->mmap_changing,
> > -                                     flags);
> > +             ret = mfill_atomic_copy(ctx, uffdio_copy.dst, uffdio_copy.src,
> > +                                     uffdio_copy.len, flags);
> >               mmput(ctx->mm);
> >       } else {
> >               return -ESRCH;
> > @@ -1761,9 +1769,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
> >               goto out;
> >
> >       if (mmget_not_zero(ctx->mm)) {
> > -             ret = mfill_atomic_zeropage(ctx->mm, uffdio_zeropage.range.start,
> > -                                        uffdio_zeropage.range.len,
> > -                                        &ctx->mmap_changing);
> > +             ret = mfill_atomic_zeropage(ctx, uffdio_zeropage.range.start,
> > +                                        uffdio_zeropage.range.len);
> >               mmput(ctx->mm);
> >       } else {
> >               return -ESRCH;
> > @@ -1818,9 +1825,8 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
> >               return -EINVAL;
> >
> >       if (mmget_not_zero(ctx->mm)) {
> > -             ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
> > -                                       uffdio_wp.range.len, mode_wp,
> > -                                       &ctx->mmap_changing);
> > +             ret = mwriteprotect_range(ctx, uffdio_wp.range.start,
> > +                                       uffdio_wp.range.len, mode_wp);
> >               mmput(ctx->mm);
> >       } else {
> >               return -ESRCH;
> > @@ -1870,9 +1876,8 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
> >               flags |= MFILL_ATOMIC_WP;
> >
> >       if (mmget_not_zero(ctx->mm)) {
> > -             ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start,
> > -                                         uffdio_continue.range.len,
> > -                                         &ctx->mmap_changing, flags);
> > +             ret = mfill_atomic_continue(ctx, uffdio_continue.range.start,
> > +                                         uffdio_continue.range.len, flags);
> >               mmput(ctx->mm);
> >       } else {
> >               return -ESRCH;
> > @@ -1925,9 +1930,8 @@ static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long
> >               goto out;
> >
> >       if (mmget_not_zero(ctx->mm)) {
> > -             ret = mfill_atomic_poison(ctx->mm, uffdio_poison.range.start,
> > -                                       uffdio_poison.range.len,
> > -                                       &ctx->mmap_changing, 0);
> > +             ret = mfill_atomic_poison(ctx, uffdio_poison.range.start,
> > +                                       uffdio_poison.range.len, 0);
> >               mmput(ctx->mm);
> >       } else {
> >               return -ESRCH;
> > @@ -2003,13 +2007,14 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx,
> >       if (mmget_not_zero(mm)) {
> >               mmap_read_lock(mm);
> >
> > -             /* Re-check after taking mmap_lock */
> > +             /* Re-check after taking map_changing_lock */
> > +             down_read(&ctx->map_changing_lock);
> >               if (likely(!atomic_read(&ctx->mmap_changing)))
> >                       ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
> >                                        uffdio_move.len, uffdio_move.mode);
> >               else
> >                       ret = -EAGAIN;
> > -
> > +             up_read(&ctx->map_changing_lock);
> >               mmap_read_unlock(mm);
> >               mmput(mm);
> >       } else {
> > @@ -2216,6 +2221,7 @@ static int new_userfaultfd(int flags)
> >       ctx->flags = flags;
> >       ctx->features = 0;
> >       ctx->released = false;
> > +     init_rwsem(&ctx->map_changing_lock);
> >       atomic_set(&ctx->mmap_changing, 0);
> >       ctx->mm = current->mm;
> >       /* prevent the mm struct to be freed */
> > diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> > index 691d928ee864..3210c3552976 100644
> > --- a/include/linux/userfaultfd_k.h
> > +++ b/include/linux/userfaultfd_k.h
> > @@ -69,6 +69,13 @@ struct userfaultfd_ctx {
> >       unsigned int features;
> >       /* released */
> >       bool released;
> > +     /*
> > +      * Prevents userfaultfd operations (fill/move/wp) from happening while
> > +      * some non-cooperative event(s) is taking place. Increments are done
> > +      * in write-mode. Whereas, userfaultfd operations, which includes
> > +      * reading mmap_changing, is done under read-mode.
> > +      */
> > +     struct rw_semaphore map_changing_lock;
> >       /* memory mappings are changing because of non-cooperative event */
> >       atomic_t mmap_changing;
> >       /* mm with one ore more vmas attached to this userfaultfd_ctx */
> > @@ -113,22 +120,18 @@ extern int mfill_atomic_install_pte(pmd_t *dst_pmd,
> >                                   unsigned long dst_addr, struct page *page,
> >                                   bool newly_allocated, uffd_flags_t flags);
> >
> > -extern ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
> > +extern ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
> >                                unsigned long src_start, unsigned long len,
> > -                              atomic_t *mmap_changing, uffd_flags_t flags);
> > -extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm,
> > +                              uffd_flags_t flags);
> > +extern ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
> >                                    unsigned long dst_start,
> > -                                  unsigned long len,
> > -                                  atomic_t *mmap_changing);
> > -extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start,
> > -                                  unsigned long len, atomic_t *mmap_changing,
> > -                                  uffd_flags_t flags);
> > -extern ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
> > -                                unsigned long len, atomic_t *mmap_changing,
> > -                                uffd_flags_t flags);
> > -extern int mwriteprotect_range(struct mm_struct *dst_mm,
> > -                            unsigned long start, unsigned long len,
> > -                            bool enable_wp, atomic_t *mmap_changing);
> > +                                  unsigned long len);
> > +extern ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long dst_start,
> > +                                  unsigned long len, uffd_flags_t flags);
> > +extern ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
> > +                                unsigned long len, uffd_flags_t flags);
> > +extern int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
> > +                            unsigned long len, bool enable_wp);
> >  extern long uffd_wp_range(struct vm_area_struct *vma,
> >                         unsigned long start, unsigned long len, bool enable_wp);
> >
> > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> > index e3a91871462a..6e2ca04ab04d 100644
> > --- a/mm/userfaultfd.c
> > +++ b/mm/userfaultfd.c
> > @@ -353,11 +353,11 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
> >   * called with mmap_lock held, it will release mmap_lock before returning.
> >   */
> >  static __always_inline ssize_t mfill_atomic_hugetlb(
> > +                                           struct userfaultfd_ctx *ctx,
> >                                             struct vm_area_struct *dst_vma,
> >                                             unsigned long dst_start,
> >                                             unsigned long src_start,
> >                                             unsigned long len,
> > -                                           atomic_t *mmap_changing,
> >                                             uffd_flags_t flags)
> >  {
> >       struct mm_struct *dst_mm = dst_vma->vm_mm;
> > @@ -379,6 +379,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> >        * feature is not supported.
> >        */
> >       if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
> > +             up_read(&ctx->map_changing_lock);
> >               mmap_read_unlock(dst_mm);
> >               return -EINVAL;
> >       }
> > @@ -463,6 +464,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> >               cond_resched();
> >
> >               if (unlikely(err == -ENOENT)) {
> > +                     up_read(&ctx->map_changing_lock);
> >                       mmap_read_unlock(dst_mm);
> >                       BUG_ON(!folio);
> >
> > @@ -473,12 +475,13 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> >                               goto out;
> >                       }
> >                       mmap_read_lock(dst_mm);
> > +                     down_read(&ctx->map_changing_lock);
> >                       /*
> >                        * If memory mappings are changing because of non-cooperative
> >                        * operation (e.g. mremap) running in parallel, bail out and
> >                        * request the user to retry later
> >                        */
> > -                     if (mmap_changing && atomic_read(mmap_changing)) {
> > +                     if (atomic_read(ctx->mmap_changing)) {
> >                               err = -EAGAIN;
> >                               break;
> >                       }
> > @@ -501,6 +504,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> >       }
> >
> >  out_unlock:
> > +     up_read(&ctx->map_changing_lock);
> >       mmap_read_unlock(dst_mm);
> >  out:
> >       if (folio)
> > @@ -512,11 +516,11 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> >  }
> >  #else /* !CONFIG_HUGETLB_PAGE */
> >  /* fail at build time if gcc attempts to use this */
> > -extern ssize_t mfill_atomic_hugetlb(struct vm_area_struct *dst_vma,
> > +extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
> > +                                 struct vm_area_struct *dst_vma,
> >                                   unsigned long dst_start,
> >                                   unsigned long src_start,
> >                                   unsigned long len,
> > -                                 atomic_t *mmap_changing,
> >                                   uffd_flags_t flags);
> >  #endif /* CONFIG_HUGETLB_PAGE */
> >
> > @@ -564,13 +568,13 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
> >       return err;
> >  }
> >
> > -static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
> > +static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> >                                           unsigned long dst_start,
> >                                           unsigned long src_start,
> >                                           unsigned long len,
> > -                                         atomic_t *mmap_changing,
> >                                           uffd_flags_t flags)
> >  {
> > +     struct mm_struct *dst_mm = ctx->mm;
> >       struct vm_area_struct *dst_vma;
> >       ssize_t err;
> >       pmd_t *dst_pmd;
> > @@ -600,8 +604,9 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
> >        * operation (e.g. mremap) running in parallel, bail out and
> >        * request the user to retry later
> >        */
> > +     down_read(&ctx->map_changing_lock);
> >       err = -EAGAIN;
> > -     if (mmap_changing && atomic_read(mmap_changing))
> > +     if (atomic_read(&ctx->mmap_changing))
> >               goto out_unlock;
> >
> >       /*
> > @@ -633,8 +638,8 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
> >        * If this is a HUGETLB vma, pass off to appropriate routine
> >        */
> >       if (is_vm_hugetlb_page(dst_vma))
> > -             return  mfill_atomic_hugetlb(dst_vma, dst_start, src_start,
> > -                                          len, mmap_changing, flags);
> > +             return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
> > +                                          src_start, len, flags);
> >
> >       if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
> >               goto out_unlock;
> > @@ -693,6 +698,7 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
> >               if (unlikely(err == -ENOENT)) {
> >                       void *kaddr;
> >
> > +                     up_read(&ctx->map_changing_lock);
> >                       mmap_read_unlock(dst_mm);
> >                       BUG_ON(!folio);
> >
> > @@ -723,6 +729,7 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
> >       }
> >
> >  out_unlock:
> > +     up_read(&ctx->map_changing_lock);
> >       mmap_read_unlock(dst_mm);
> >  out:
> >       if (folio)
> > @@ -733,34 +740,33 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
> >       return copied ? copied : err;
> >  }
> >
> > -ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
> > +ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
> >                         unsigned long src_start, unsigned long len,
> > -                       atomic_t *mmap_changing, uffd_flags_t flags)
> > +                       uffd_flags_t flags)
> >  {
> > -     return mfill_atomic(dst_mm, dst_start, src_start, len, mmap_changing,
> > +     return mfill_atomic(ctx, dst_start, src_start, len,
> >                           uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY));
> >  }
> >
> > -ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long start,
> > -                           unsigned long len, atomic_t *mmap_changing)
> > +ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
> > +                           unsigned long start,
> > +                           unsigned long len)
> >  {
> > -     return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
> > +     return mfill_atomic(ctx, start, 0, len,
> >                           uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE));
> >  }
> >
> > -ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start,
> > -                           unsigned long len, atomic_t *mmap_changing,
> > -                           uffd_flags_t flags)
> > +ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start,
> > +                           unsigned long len, uffd_flags_t flags)
> >  {
> > -     return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
> > +     return mfill_atomic(ctx, start, 0, len,
> >                           uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
> >  }
> >
> > -ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
> > -                         unsigned long len, atomic_t *mmap_changing,
> > -                         uffd_flags_t flags)
> > +ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
> > +                         unsigned long len, uffd_flags_t flags)
> >  {
> > -     return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
> > +     return mfill_atomic(ctx, start, 0, len,
> >                           uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON));
> >  }
> >
> > @@ -793,10 +799,10 @@ long uffd_wp_range(struct vm_area_struct *dst_vma,
> >       return ret;
> >  }
> >
> > -int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
> > -                     unsigned long len, bool enable_wp,
> > -                     atomic_t *mmap_changing)
> > +int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
> > +                     unsigned long len, bool enable_wp)
> >  {
> > +     struct mm_struct *dst_mm = ctx->mm;
> >       unsigned long end = start + len;
> >       unsigned long _start, _end;
> >       struct vm_area_struct *dst_vma;
> > @@ -820,8 +826,9 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
> >        * operation (e.g. mremap) running in parallel, bail out and
> >        * request the user to retry later
> >        */
> > +     down_read(&ctx->map_changing_lock);
> >       err = -EAGAIN;
> > -     if (mmap_changing && atomic_read(mmap_changing))
> > +     if (atomic_read(&ctx->mmap_changing))
> >               goto out_unlock;
> >
> >       err = -ENOENT;
> > @@ -850,6 +857,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
> >               err = 0;
> >       }
> >  out_unlock:
> > +     up_read(&ctx->map_changing_lock);
> >       mmap_read_unlock(dst_mm);
> >       return err;
> >  }
> > --
> > 2.43.0.429.g432eaa2c6b-goog
> >
> >

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v2 3/3] userfaultfd: use per-vma locks in userfaultfd operations
  2024-01-29 20:52     ` Suren Baghdasaryan
  2024-01-29 21:18       ` Liam R. Howlett
@ 2024-01-30  0:28       ` Lokesh Gidra
  2024-01-30  2:58         ` Liam R. Howlett
  1 sibling, 1 reply; 35+ messages in thread
From: Lokesh Gidra @ 2024-01-30  0:28 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: Liam R. Howlett, akpm, linux-fsdevel, linux-mm, linux-kernel,
	selinux, kernel-team, aarcange, peterx, david, axelrasmussen,
	bgeffon, willy, jannh, kaleshsingh, ngeoffray, timmurray, rppt

On Mon, Jan 29, 2024 at 12:53 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Mon, Jan 29, 2024 at 12:36 PM Liam R. Howlett
> <Liam.Howlett@oracle.com> wrote:
> >
> > * Lokesh Gidra <lokeshgidra@google.com> [240129 14:35]:
> > > All userfaultfd operations, except write-protect, opportunistically use
> > > per-vma locks to lock vmas. If we fail then fall back to locking
> > > mmap-lock in read-mode.
> > >
> > > Write-protect operation requires mmap_lock as it iterates over multiple vmas.
> > >
> > > Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
> > > ---
> > >  fs/userfaultfd.c              |  13 +--
> > >  include/linux/userfaultfd_k.h |   5 +-
> > >  mm/userfaultfd.c              | 175 +++++++++++++++++++++++-----------
> > >  3 files changed, 122 insertions(+), 71 deletions(-)
> > >
> > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > > index c00a021bcce4..60dcfafdc11a 100644
> > > --- a/fs/userfaultfd.c
> > > +++ b/fs/userfaultfd.c
> > > @@ -2005,17 +2005,8 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx,
> > >               return -EINVAL;
> > >
> > >       if (mmget_not_zero(mm)) {
> > > -             mmap_read_lock(mm);
> > > -
> > > -             /* Re-check after taking map_changing_lock */
> > > -             down_read(&ctx->map_changing_lock);
> > > -             if (likely(!atomic_read(&ctx->mmap_changing)))
> > > -                     ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
> > > -                                      uffdio_move.len, uffdio_move.mode);
> > > -             else
> > > -                     ret = -EAGAIN;
> > > -             up_read(&ctx->map_changing_lock);
> > > -             mmap_read_unlock(mm);
> > > +             ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src,
> > > +                              uffdio_move.len, uffdio_move.mode);
> > >               mmput(mm);
> > >       } else {
> > >               return -ESRCH;
> > > diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> > > index 3210c3552976..05d59f74fc88 100644
> > > --- a/include/linux/userfaultfd_k.h
> > > +++ b/include/linux/userfaultfd_k.h
> > > @@ -138,9 +138,8 @@ extern long uffd_wp_range(struct vm_area_struct *vma,
> > >  /* move_pages */
> > >  void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2);
> > >  void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2);
> > > -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > > -                unsigned long dst_start, unsigned long src_start,
> > > -                unsigned long len, __u64 flags);
> > > +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
> > > +                unsigned long src_start, unsigned long len, __u64 flags);
> > >  int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
> > >                       struct vm_area_struct *dst_vma,
> > >                       struct vm_area_struct *src_vma,
> > > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> > > index 6e2ca04ab04d..d55bf18b80db 100644
> > > --- a/mm/userfaultfd.c
> > > +++ b/mm/userfaultfd.c
> > > @@ -19,20 +19,39 @@
> > >  #include <asm/tlb.h>
> > >  #include "internal.h"
> > >
> > > -static __always_inline
> > > -struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
> > > -                                 unsigned long dst_start,
> > > -                                 unsigned long len)
> > > +void unpin_vma(struct mm_struct *mm, struct vm_area_struct *vma, bool *mmap_locked)
> > > +{
> > > +     BUG_ON(!vma && !*mmap_locked);
> > > +
> > > +     if (*mmap_locked) {
> > > +             mmap_read_unlock(mm);
> > > +             *mmap_locked = false;
> > > +     } else
> > > +             vma_end_read(vma);
> >
> > You are missing braces here.
> >
> > This function is small so it could be inline, although I hope the
> > compiler would get that right for us.
> >
> > I don't think this small helper is worth it, considering you are
> > altering a pointer in here, which makes things harder to follow (not to
> > mention the locking).  The only code that depends on this update is a
> > single place, which already assigns a custom variable after the function
> > return.
> >
Sure. I'll replace unpin_vma() calls with inlined unlocking.
> > > +}
> > > +
> > > +/*
> > > + * Search for VMA and make sure it is stable either by locking it or taking
> > > + * mmap_lock.
> >
> > This function returns something that isn't documented and also sets a
> > boolean which is passed in as a pointer which also is lacking from the
> > documentation.
> >

I'll fix the comment in next version.
> > > + */
> > > +struct vm_area_struct *find_and_pin_dst_vma(struct mm_struct *dst_mm,
> > > +                                         unsigned long dst_start,
> > > +                                         unsigned long len,
> > > +                                         bool *mmap_locked)
> > >  {
> > > +     struct vm_area_struct *dst_vma = lock_vma_under_rcu(dst_mm, dst_start);
> >
> > lock_vma_under_rcu() calls mas_walk(), which goes to dst_start for the
> > VMA.  It is not possible for dst_start to be outside the range.
> >
> > > +     if (!dst_vma) {
> >
> > BUG_ON(mmap_locked) ?
> >
> > > +             mmap_read_lock(dst_mm);
> > > +             *mmap_locked = true;
> > > +             dst_vma = find_vma(dst_mm, dst_start);
> >
> > find_vma() walks to dst_start and searches upwards from that address.
> > This is functionally different than what you have asked for above.  You
> > will not see an issue as you have coded it - but it may be suboptimal
> > since a start address lower than the VMA you are looking for can be
> > found... however, later you check the range falls between the dst_start
> > and dst_start + len.
> >
> > If you expect the dst_start to always be within the VMA range and not
> > lower, then you should use vma_lookup().
> >

Thanks for informing. So vma_lookup() returns the vma for any address
within [vma->vm_start, vma->vm_end)?
> > If you want to search upwards from dst_start for a VMA then you should
> > move the range check below into this brace.
> >
> > > +     }
> > > +
> > >       /*
> > >        * Make sure that the dst range is both valid and fully within a
> > >        * single existing vma.
> > >        */
> > > -     struct vm_area_struct *dst_vma;
> > > -
> > > -     dst_vma = find_vma(dst_mm, dst_start);
> > >       if (!range_in_vma(dst_vma, dst_start, dst_start + len))
> > > -             return NULL;
> > > +             goto unpin;
> > >
> > >       /*
> > >        * Check the vma is registered in uffd, this is required to
> > > @@ -40,9 +59,13 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
> > >        * time.
> > >        */
> > >       if (!dst_vma->vm_userfaultfd_ctx.ctx)
> > > -             return NULL;
> > > +             goto unpin;
> > >
> > >       return dst_vma;
> > > +
> > > +unpin:
> > > +     unpin_vma(dst_mm, dst_vma, mmap_locked);
> > > +     return NULL;
> > >  }
> > >
> > >  /* Check if dst_addr is outside of file's size. Must be called with ptl held. */
> > > @@ -350,7 +373,8 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
> > >  #ifdef CONFIG_HUGETLB_PAGE
> > >  /*
> > >   * mfill_atomic processing for HUGETLB vmas.  Note that this routine is
> > > - * called with mmap_lock held, it will release mmap_lock before returning.
> > > + * called with either vma-lock or mmap_lock held, it will release the lock
> > > + * before returning.
> > >   */
> > >  static __always_inline ssize_t mfill_atomic_hugetlb(
> > >                                             struct userfaultfd_ctx *ctx,
> > > @@ -358,7 +382,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > >                                             unsigned long dst_start,
> > >                                             unsigned long src_start,
> > >                                             unsigned long len,
> > > -                                           uffd_flags_t flags)
> > > +                                           uffd_flags_t flags,
> > > +                                           bool *mmap_locked)
> > >  {
> > >       struct mm_struct *dst_mm = dst_vma->vm_mm;
> > >       int vm_shared = dst_vma->vm_flags & VM_SHARED;
> > > @@ -380,7 +405,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > >        */
> > >       if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
> > >               up_read(&ctx->map_changing_lock);
> > > -             mmap_read_unlock(dst_mm);
> > > +             unpin_vma(dst_mm, dst_vma, mmap_locked);
> > >               return -EINVAL;
> > >       }
> > >
> > > @@ -404,12 +429,25 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > >        */
> > >       if (!dst_vma) {
> > >               err = -ENOENT;
> > > -             dst_vma = find_dst_vma(dst_mm, dst_start, len);
> > > -             if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
> > > -                     goto out_unlock;
> > > +             dst_vma = find_and_pin_dst_vma(dst_mm, dst_start,
> > > +                                            len, mmap_locked);
> > > +             if (!dst_vma)
> > > +                     goto out;
> > > +             if (!is_vm_hugetlb_page(dst_vma))
> > > +                     goto out_unlock_vma;
> > >
> > >               err = -EINVAL;
> > >               if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
> > > +                     goto out_unlock_vma;
> > > +
> > > +             /*
> > > +              * If memory mappings are changing because of non-cooperative
> > > +              * operation (e.g. mremap) running in parallel, bail out and
> > > +              * request the user to retry later
> > > +              */
> > > +             down_read(&ctx->map_changing_lock);
> > > +             err = -EAGAIN;
> > > +             if (atomic_read(&ctx->mmap_changing))
> > >                       goto out_unlock;
> > >
> > >               vm_shared = dst_vma->vm_flags & VM_SHARED;
> > > @@ -465,7 +503,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > >
> > >               if (unlikely(err == -ENOENT)) {
> > >                       up_read(&ctx->map_changing_lock);
> > > -                     mmap_read_unlock(dst_mm);
> > > +                     unpin_vma(dst_mm, dst_vma, mmap_locked);
> > >                       BUG_ON(!folio);
> > >
> > >                       err = copy_folio_from_user(folio,
> > > @@ -474,17 +512,6 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > >                               err = -EFAULT;
> > >                               goto out;
> > >                       }
> > > -                     mmap_read_lock(dst_mm);
> > > -                     down_read(&ctx->map_changing_lock);
> > > -                     /*
> > > -                      * If memory mappings are changing because of non-cooperative
> > > -                      * operation (e.g. mremap) running in parallel, bail out and
> > > -                      * request the user to retry later
> > > -                      */
> > > -                     if (atomic_read(ctx->mmap_changing)) {
> > > -                             err = -EAGAIN;
> > > -                             break;
> > > -                     }
> >
> > ... Okay, this is where things get confusing.
> >
> > How about this: Don't do this locking/boolean dance.
> >
> > Instead, do something like this:
> > In mm/memory.c, below lock_vma_under_rcu(), but something like this
> >
> > struct vm_area_struct *lock_vma(struct mm_struct *mm,
> >         unsigned long addr))    /* or some better name.. */
> > {
> >         struct vm_area_struct *vma;
> >
> >         vma = lock_vma_under_rcu(mm, addr);
> >
> >         if (vma)
> >                 return vma;
> >
> >         mmap_read_lock(mm);
> >         vma = lookup_vma(mm, addr);
> >         if (vma)
> >                 vma_start_read(vma); /* Won't fail */
>
> Please don't assume vma_start_read() won't fail even when you have
> mmap_read_lock(). See the comment in vma_start_read() about the
> possibility of an overflow producing false negatives.
>
> >
> >         mmap_read_unlock(mm);
> >         return vma;
> > }
> >
> > Now, we know we have a vma that's vma locked if there is a vma.  The vma
> > won't go away - you have it locked.  The mmap lock is held for even
> > less time for your worse case, and the code gets easier to follow.

Your suggestion is definitely simpler and easier to follow, but due to
the overflow situation that Suren pointed out, I would still need to
keep the locking/boolean dance, no? IIUC, even if I were to return
EAGAIN to the userspace, there is no guarantee that subsequent ioctls
on the same vma will succeed due to the same overflow, until someone
acquires and releases mmap_lock in write-mode.
Also, sometimes it seems insufficient whether we managed to lock vma
or not. For instance, lock_vma_under_rcu() checks if anon_vma (for
anonymous vma) exists. If not then it bails out.
So it seems to me that we have to provide some fall back in
userfaultfd operations which executes with mmap_lock in read-mode.
> >
> > Once you are done with the vma do a vma_end_read(vma).  Don't forget to
> > do this!
> >
> > Now the comment above such a function should state that the vma needs to
> > be vma_end_read(vma), or that could go undetected..  It might be worth
> > adding a unlock_vma() counterpart to vma_end_read(vma) even.
>
> Locking VMA while holding mmap_read_lock is an interesting usage
> pattern I haven't seen yet. I think this should work quite well!
>
> >
> >
> > >
> > >                       dst_vma = NULL;
> > >                       goto retry;
> > > @@ -505,7 +532,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > >
> > >  out_unlock:
> > >       up_read(&ctx->map_changing_lock);
> > > -     mmap_read_unlock(dst_mm);
> > > +out_unlock_vma:
> > > +     unpin_vma(dst_mm, dst_vma, mmap_locked);
> > >  out:
> > >       if (folio)
> > >               folio_put(folio);
> > > @@ -521,7 +549,8 @@ extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
> > >                                   unsigned long dst_start,
> > >                                   unsigned long src_start,
> > >                                   unsigned long len,
> > > -                                 uffd_flags_t flags);
> > > +                                 uffd_flags_t flags,
> > > +                                 bool *mmap_locked);
> >
> > Just a thought, tabbing in twice for each argument would make this more
> > compact.
> >
> >
> > >  #endif /* CONFIG_HUGETLB_PAGE */
> > >
> > >  static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
> > > @@ -581,6 +610,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > >       unsigned long src_addr, dst_addr;
> > >       long copied;
> > >       struct folio *folio;
> > > +     bool mmap_locked = false;
> > >
> > >       /*
> > >        * Sanitize the command parameters:
> > > @@ -597,7 +627,14 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > >       copied = 0;
> > >       folio = NULL;
> > >  retry:
> > > -     mmap_read_lock(dst_mm);
> > > +     /*
> > > +      * Make sure the vma is not shared, that the dst range is
> > > +      * both valid and fully within a single existing vma.
> > > +      */
> > > +     err = -ENOENT;
> > > +     dst_vma = find_and_pin_dst_vma(dst_mm, dst_start, len, &mmap_locked);
> > > +     if (!dst_vma)
> > > +             goto out;
> > >
> > >       /*
> > >        * If memory mappings are changing because of non-cooperative
> > > @@ -609,15 +646,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > >       if (atomic_read(&ctx->mmap_changing))
> > >               goto out_unlock;
> > >
> > > -     /*
> > > -      * Make sure the vma is not shared, that the dst range is
> > > -      * both valid and fully within a single existing vma.
> > > -      */
> > > -     err = -ENOENT;
> > > -     dst_vma = find_dst_vma(dst_mm, dst_start, len);
> > > -     if (!dst_vma)
> > > -             goto out_unlock;
> > > -
> > >       err = -EINVAL;
> > >       /*
> > >        * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
> > > @@ -638,8 +666,8 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > >        * If this is a HUGETLB vma, pass off to appropriate routine
> > >        */
> > >       if (is_vm_hugetlb_page(dst_vma))
> > > -             return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
> > > -                                          src_start, len, flags);
> > > +             return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start, src_start
> > > +                                          len, flags, &mmap_locked);
> > >
> > >       if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
> > >               goto out_unlock;
> > > @@ -699,7 +727,8 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > >                       void *kaddr;
> > >
> > >                       up_read(&ctx->map_changing_lock);
> > > -                     mmap_read_unlock(dst_mm);
> > > +                     unpin_vma(dst_mm, dst_vma, &mmap_locked);
> > > +
> > >                       BUG_ON(!folio);
> > >
> > >                       kaddr = kmap_local_folio(folio, 0);
> > > @@ -730,7 +759,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > >
> > >  out_unlock:
> > >       up_read(&ctx->map_changing_lock);
> > > -     mmap_read_unlock(dst_mm);
> > > +     unpin_vma(dst_mm, dst_vma, &mmap_locked);
> > >  out:
> > >       if (folio)
> > >               folio_put(folio);
> > > @@ -1285,8 +1314,6 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
> > >   * @len: length of the virtual memory range
> > >   * @mode: flags from uffdio_move.mode
> > >   *
> > > - * Must be called with mmap_lock held for read.
> > > - *
> > >   * move_pages() remaps arbitrary anonymous pages atomically in zero
> > >   * copy. It only works on non shared anonymous pages because those can
> > >   * be relocated without generating non linear anon_vmas in the rmap
> > > @@ -1353,15 +1380,16 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
> > >   * could be obtained. This is the only additional complexity added to
> > >   * the rmap code to provide this anonymous page remapping functionality.
> > >   */
> > > -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > > -                unsigned long dst_start, unsigned long src_start,
> > > -                unsigned long len, __u64 mode)
> > > +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
> > > +                unsigned long src_start, unsigned long len, __u64 mode)
> > >  {
> > > +     struct mm_struct *mm = ctx->mm;
> > >       struct vm_area_struct *src_vma, *dst_vma;
> > >       unsigned long src_addr, dst_addr;
> > >       pmd_t *src_pmd, *dst_pmd;
> > >       long err = -EINVAL;
> > >       ssize_t moved = 0;
> > > +     bool mmap_locked = false;
> > >
> > >       /* Sanitize the command parameters. */
> > >       if (WARN_ON_ONCE(src_start & ~PAGE_MASK) ||
> > > @@ -1374,28 +1402,52 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > >           WARN_ON_ONCE(dst_start + len <= dst_start))
> > >               goto out;
> >
> > Ah, is this safe for rmap?  I think you need to leave this read lock.
> >
I didn't fully understand you here.
> > >
> > > +     dst_vma = NULL;
> > > +     src_vma = lock_vma_under_rcu(mm, src_start);
> > > +     if (src_vma) {
> > > +             dst_vma = lock_vma_under_rcu(mm, dst_start);
> > > +             if (!dst_vma)
> > > +                     vma_end_read(src_vma);
> > > +     }
> > > +
> > > +     /* If we failed to lock both VMAs, fall back to mmap_lock */
> > > +     if (!dst_vma) {
> > > +             mmap_read_lock(mm);
> > > +             mmap_locked = true;
> > > +             src_vma = find_vma(mm, src_start);
> > > +             if (!src_vma)
> > > +                     goto out_unlock_mmap;
> > > +             dst_vma = find_vma(mm, dst_start);
> >
> > Again, there is a difference in how find_vma and lock_vam_under_rcu
> > works.

Sure, I'll use vma_lookup() instead of find_vma().
> >
> > > +             if (!dst_vma)
> > > +                     goto out_unlock_mmap;
> > > +     }
> > > +
> > > +     /* Re-check after taking map_changing_lock */
> > > +     down_read(&ctx->map_changing_lock);
> > > +     if (likely(atomic_read(&ctx->mmap_changing))) {
> > > +             err = -EAGAIN;
> > > +             goto out_unlock;
> > > +     }
> > >       /*
> > >        * Make sure the vma is not shared, that the src and dst remap
> > >        * ranges are both valid and fully within a single existing
> > >        * vma.
> > >        */
> > > -     src_vma = find_vma(mm, src_start);
> > > -     if (!src_vma || (src_vma->vm_flags & VM_SHARED))
> > > -             goto out;
> > > +     if (src_vma->vm_flags & VM_SHARED)
> > > +             goto out_unlock;
> > >       if (src_start < src_vma->vm_start ||
> > >           src_start + len > src_vma->vm_end)
> > > -             goto out;
> > > +             goto out_unlock;
> > >
> > > -     dst_vma = find_vma(mm, dst_start);
> > > -     if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
> > > -             goto out;
> > > +     if (dst_vma->vm_flags & VM_SHARED)
> > > +             goto out_unlock;
> > >       if (dst_start < dst_vma->vm_start ||
> > >           dst_start + len > dst_vma->vm_end)
> > > -             goto out;
> > > +             goto out_unlock;
> > >
> > >       err = validate_move_areas(ctx, src_vma, dst_vma);
> > >       if (err)
> > > -             goto out;
> > > +             goto out_unlock;
> > >
> > >       for (src_addr = src_start, dst_addr = dst_start;
> > >            src_addr < src_start + len;) {
> > > @@ -1512,6 +1564,15 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > >               moved += step_size;
> > >       }
> > >
> > > +out_unlock:
> > > +     up_read(&ctx->map_changing_lock);
> > > +out_unlock_mmap:
> > > +     if (mmap_locked)
> > > +             mmap_read_unlock(mm);
> > > +     else {
> > > +             vma_end_read(dst_vma);
> > > +             vma_end_read(src_vma);
> > > +     }
> > >  out:
> > >       VM_WARN_ON(moved < 0);
> > >       VM_WARN_ON(err > 0);
> > > --
> > > 2.43.0.429.g432eaa2c6b-goog
> > >
> > >

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v2 3/3] userfaultfd: use per-vma locks in userfaultfd operations
  2024-01-30  0:28       ` Lokesh Gidra
@ 2024-01-30  2:58         ` Liam R. Howlett
  2024-01-31  2:49           ` Lokesh Gidra
  2024-01-31  3:03           ` Suren Baghdasaryan
  0 siblings, 2 replies; 35+ messages in thread
From: Liam R. Howlett @ 2024-01-30  2:58 UTC (permalink / raw)
  To: Lokesh Gidra
  Cc: Suren Baghdasaryan, akpm, linux-fsdevel, linux-mm, linux-kernel,
	selinux, kernel-team, aarcange, peterx, david, axelrasmussen,
	bgeffon, willy, jannh, kaleshsingh, ngeoffray, timmurray, rppt

* Lokesh Gidra <lokeshgidra@google.com> [240129 19:28]:
> On Mon, Jan 29, 2024 at 12:53 PM Suren Baghdasaryan <surenb@google.com> wrote:
> >

...

> 
> Thanks for informing. So vma_lookup() returns the vma for any address
> within [vma->vm_start, vma->vm_end)?

No.  It returns the vma that contains the address passed.  If there
isn't one, you will get NULL.  This is why the range check is not
needed.

find_vma() walks to the address passed and if it is NULL, it returns a
vma that has a higher start address than the one passed (or, rarely NULL
if it runs off the edge).

> > > If you want to search upwards from dst_start for a VMA then you should
> > > move the range check below into this brace.
> > >
> > > > +     }
> > > > +
> > > >       /*
> > > >        * Make sure that the dst range is both valid and fully within a
> > > >        * single existing vma.
> > > >        */
> > > > -     struct vm_area_struct *dst_vma;
> > > > -
> > > > -     dst_vma = find_vma(dst_mm, dst_start);
> > > >       if (!range_in_vma(dst_vma, dst_start, dst_start + len))
> > > > -             return NULL;
> > > > +             goto unpin;
> > > >
> > > >       /*
> > > >        * Check the vma is registered in uffd, this is required to
> > > > @@ -40,9 +59,13 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
> > > >        * time.
> > > >        */
> > > >       if (!dst_vma->vm_userfaultfd_ctx.ctx)
> > > > -             return NULL;
> > > > +             goto unpin;
> > > >
> > > >       return dst_vma;
> > > > +
> > > > +unpin:
> > > > +     unpin_vma(dst_mm, dst_vma, mmap_locked);
> > > > +     return NULL;
> > > >  }
> > > >
> > > >  /* Check if dst_addr is outside of file's size. Must be called with ptl held. */
> > > > @@ -350,7 +373,8 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
> > > >  #ifdef CONFIG_HUGETLB_PAGE
> > > >  /*
> > > >   * mfill_atomic processing for HUGETLB vmas.  Note that this routine is
> > > > - * called with mmap_lock held, it will release mmap_lock before returning.
> > > > + * called with either vma-lock or mmap_lock held, it will release the lock
> > > > + * before returning.
> > > >   */
> > > >  static __always_inline ssize_t mfill_atomic_hugetlb(
> > > >                                             struct userfaultfd_ctx *ctx,
> > > > @@ -358,7 +382,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > >                                             unsigned long dst_start,
> > > >                                             unsigned long src_start,
> > > >                                             unsigned long len,
> > > > -                                           uffd_flags_t flags)
> > > > +                                           uffd_flags_t flags,
> > > > +                                           bool *mmap_locked)
> > > >  {
> > > >       struct mm_struct *dst_mm = dst_vma->vm_mm;
> > > >       int vm_shared = dst_vma->vm_flags & VM_SHARED;
> > > > @@ -380,7 +405,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > >        */
> > > >       if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
> > > >               up_read(&ctx->map_changing_lock);
> > > > -             mmap_read_unlock(dst_mm);
> > > > +             unpin_vma(dst_mm, dst_vma, mmap_locked);
> > > >               return -EINVAL;
> > > >       }
> > > >
> > > > @@ -404,12 +429,25 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > >        */
> > > >       if (!dst_vma) {
> > > >               err = -ENOENT;
> > > > -             dst_vma = find_dst_vma(dst_mm, dst_start, len);
> > > > -             if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
> > > > -                     goto out_unlock;
> > > > +             dst_vma = find_and_pin_dst_vma(dst_mm, dst_start,
> > > > +                                            len, mmap_locked);
> > > > +             if (!dst_vma)
> > > > +                     goto out;
> > > > +             if (!is_vm_hugetlb_page(dst_vma))
> > > > +                     goto out_unlock_vma;
> > > >
> > > >               err = -EINVAL;
> > > >               if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
> > > > +                     goto out_unlock_vma;
> > > > +
> > > > +             /*
> > > > +              * If memory mappings are changing because of non-cooperative
> > > > +              * operation (e.g. mremap) running in parallel, bail out and
> > > > +              * request the user to retry later
> > > > +              */
> > > > +             down_read(&ctx->map_changing_lock);
> > > > +             err = -EAGAIN;
> > > > +             if (atomic_read(&ctx->mmap_changing))
> > > >                       goto out_unlock;
> > > >
> > > >               vm_shared = dst_vma->vm_flags & VM_SHARED;
> > > > @@ -465,7 +503,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > >
> > > >               if (unlikely(err == -ENOENT)) {
> > > >                       up_read(&ctx->map_changing_lock);
> > > > -                     mmap_read_unlock(dst_mm);
> > > > +                     unpin_vma(dst_mm, dst_vma, mmap_locked);
> > > >                       BUG_ON(!folio);
> > > >
> > > >                       err = copy_folio_from_user(folio,
> > > > @@ -474,17 +512,6 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > >                               err = -EFAULT;
> > > >                               goto out;
> > > >                       }
> > > > -                     mmap_read_lock(dst_mm);
> > > > -                     down_read(&ctx->map_changing_lock);
> > > > -                     /*
> > > > -                      * If memory mappings are changing because of non-cooperative
> > > > -                      * operation (e.g. mremap) running in parallel, bail out and
> > > > -                      * request the user to retry later
> > > > -                      */
> > > > -                     if (atomic_read(ctx->mmap_changing)) {
> > > > -                             err = -EAGAIN;
> > > > -                             break;
> > > > -                     }
> > >
> > > ... Okay, this is where things get confusing.
> > >
> > > How about this: Don't do this locking/boolean dance.
> > >
> > > Instead, do something like this:
> > > In mm/memory.c, below lock_vma_under_rcu(), but something like this
> > >
> > > struct vm_area_struct *lock_vma(struct mm_struct *mm,
> > >         unsigned long addr))    /* or some better name.. */
> > > {
> > >         struct vm_area_struct *vma;
> > >
> > >         vma = lock_vma_under_rcu(mm, addr);
> > >
> > >         if (vma)
> > >                 return vma;
> > >
> > >         mmap_read_lock(mm);
> > >         vma = lookup_vma(mm, addr);
> > >         if (vma)
> > >                 vma_start_read(vma); /* Won't fail */
> >
> > Please don't assume vma_start_read() won't fail even when you have
> > mmap_read_lock(). See the comment in vma_start_read() about the
> > possibility of an overflow producing false negatives.
> >
> > >
> > >         mmap_read_unlock(mm);
> > >         return vma;
> > > }
> > >
> > > Now, we know we have a vma that's vma locked if there is a vma.  The vma
> > > won't go away - you have it locked.  The mmap lock is held for even
> > > less time for your worse case, and the code gets easier to follow.
> 
> Your suggestion is definitely simpler and easier to follow, but due to
> the overflow situation that Suren pointed out, I would still need to
> keep the locking/boolean dance, no? IIUC, even if I were to return
> EAGAIN to the userspace, there is no guarantee that subsequent ioctls
> on the same vma will succeed due to the same overflow, until someone
> acquires and releases mmap_lock in write-mode.
> Also, sometimes it seems insufficient whether we managed to lock vma
> or not. For instance, lock_vma_under_rcu() checks if anon_vma (for
> anonymous vma) exists. If not then it bails out.
> So it seems to me that we have to provide some fall back in
> userfaultfd operations which executes with mmap_lock in read-mode.

Fair enough, what if we didn't use the sequence number and just locked
the vma directly?

/* This will wait on the vma lock, so once we return it's locked */
void vma_aquire_read_lock(struct vm_area_struct *vma)
{
	mmap_assert_locked(vma->vm_mm);
	down_read(&vma->vm_lock->lock);
}

struct vm_area_struct *lock_vma(struct mm_struct *mm,
        unsigned long addr))    /* or some better name.. */
{
        struct vm_area_struct *vma;

        vma = lock_vma_under_rcu(mm, addr);
        if (vma)
                return vma;

        mmap_read_lock(mm);
	/* mm sequence cannot change, no mm writers anyways.
	 * find_mergeable_anon_vma is only a concern in the page fault
	 * path
	 * start/end won't change under the mmap_lock
	 * vma won't become detached as we have the mmap_lock in read
	 * We are now sure no writes will change the VMA
	 * So let's make sure no other context is isolating the vma
	 */
        vma = lookup_vma(mm, addr);
        if (vma)
                vma_aquire_read_lock(vma);

        mmap_read_unlock(mm);
        return vma;
}

I'm betting that avoiding the mmap_lock most of the time is good, but
then holding it just to lock the vma will have extremely rare collisions
- and they will be short lived.

This would allow us to simplify your code.

> > >
> > > Once you are done with the vma do a vma_end_read(vma).  Don't forget to
> > > do this!
> > >
> > > Now the comment above such a function should state that the vma needs to
> > > be vma_end_read(vma), or that could go undetected..  It might be worth
> > > adding a unlock_vma() counterpart to vma_end_read(vma) even.
> >
> > Locking VMA while holding mmap_read_lock is an interesting usage
> > pattern I haven't seen yet. I think this should work quite well!
> >
> > >
> > >
> > > >
> > > >                       dst_vma = NULL;
> > > >                       goto retry;
> > > > @@ -505,7 +532,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > >
> > > >  out_unlock:
> > > >       up_read(&ctx->map_changing_lock);
> > > > -     mmap_read_unlock(dst_mm);
> > > > +out_unlock_vma:
> > > > +     unpin_vma(dst_mm, dst_vma, mmap_locked);
> > > >  out:
> > > >       if (folio)
> > > >               folio_put(folio);
> > > > @@ -521,7 +549,8 @@ extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
> > > >                                   unsigned long dst_start,
> > > >                                   unsigned long src_start,
> > > >                                   unsigned long len,
> > > > -                                 uffd_flags_t flags);
> > > > +                                 uffd_flags_t flags,
> > > > +                                 bool *mmap_locked);
> > >
> > > Just a thought, tabbing in twice for each argument would make this more
> > > compact.
> > >
> > >
> > > >  #endif /* CONFIG_HUGETLB_PAGE */
> > > >
> > > >  static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
> > > > @@ -581,6 +610,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > >       unsigned long src_addr, dst_addr;
> > > >       long copied;
> > > >       struct folio *folio;
> > > > +     bool mmap_locked = false;
> > > >
> > > >       /*
> > > >        * Sanitize the command parameters:
> > > > @@ -597,7 +627,14 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > >       copied = 0;
> > > >       folio = NULL;
> > > >  retry:
> > > > -     mmap_read_lock(dst_mm);
> > > > +     /*
> > > > +      * Make sure the vma is not shared, that the dst range is
> > > > +      * both valid and fully within a single existing vma.
> > > > +      */
> > > > +     err = -ENOENT;
> > > > +     dst_vma = find_and_pin_dst_vma(dst_mm, dst_start, len, &mmap_locked);
> > > > +     if (!dst_vma)
> > > > +             goto out;
> > > >
> > > >       /*
> > > >        * If memory mappings are changing because of non-cooperative
> > > > @@ -609,15 +646,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > >       if (atomic_read(&ctx->mmap_changing))
> > > >               goto out_unlock;
> > > >
> > > > -     /*
> > > > -      * Make sure the vma is not shared, that the dst range is
> > > > -      * both valid and fully within a single existing vma.
> > > > -      */
> > > > -     err = -ENOENT;
> > > > -     dst_vma = find_dst_vma(dst_mm, dst_start, len);
> > > > -     if (!dst_vma)
> > > > -             goto out_unlock;
> > > > -
> > > >       err = -EINVAL;
> > > >       /*
> > > >        * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
> > > > @@ -638,8 +666,8 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > >        * If this is a HUGETLB vma, pass off to appropriate routine
> > > >        */
> > > >       if (is_vm_hugetlb_page(dst_vma))
> > > > -             return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
> > > > -                                          src_start, len, flags);
> > > > +             return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start, src_start
> > > > +                                          len, flags, &mmap_locked);
> > > >
> > > >       if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
> > > >               goto out_unlock;
> > > > @@ -699,7 +727,8 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > >                       void *kaddr;
> > > >
> > > >                       up_read(&ctx->map_changing_lock);
> > > > -                     mmap_read_unlock(dst_mm);
> > > > +                     unpin_vma(dst_mm, dst_vma, &mmap_locked);
> > > > +
> > > >                       BUG_ON(!folio);
> > > >
> > > >                       kaddr = kmap_local_folio(folio, 0);
> > > > @@ -730,7 +759,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > >
> > > >  out_unlock:
> > > >       up_read(&ctx->map_changing_lock);
> > > > -     mmap_read_unlock(dst_mm);
> > > > +     unpin_vma(dst_mm, dst_vma, &mmap_locked);
> > > >  out:
> > > >       if (folio)
> > > >               folio_put(folio);
> > > > @@ -1285,8 +1314,6 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
> > > >   * @len: length of the virtual memory range
> > > >   * @mode: flags from uffdio_move.mode
> > > >   *
> > > > - * Must be called with mmap_lock held for read.
> > > > - *
> > > >   * move_pages() remaps arbitrary anonymous pages atomically in zero
> > > >   * copy. It only works on non shared anonymous pages because those can
> > > >   * be relocated without generating non linear anon_vmas in the rmap
> > > > @@ -1353,15 +1380,16 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
> > > >   * could be obtained. This is the only additional complexity added to
> > > >   * the rmap code to provide this anonymous page remapping functionality.
> > > >   */
> > > > -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > > > -                unsigned long dst_start, unsigned long src_start,
> > > > -                unsigned long len, __u64 mode)
> > > > +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
> > > > +                unsigned long src_start, unsigned long len, __u64 mode)
> > > >  {
> > > > +     struct mm_struct *mm = ctx->mm;
> > > >       struct vm_area_struct *src_vma, *dst_vma;
> > > >       unsigned long src_addr, dst_addr;
> > > >       pmd_t *src_pmd, *dst_pmd;
> > > >       long err = -EINVAL;
> > > >       ssize_t moved = 0;
> > > > +     bool mmap_locked = false;
> > > >
> > > >       /* Sanitize the command parameters. */
> > > >       if (WARN_ON_ONCE(src_start & ~PAGE_MASK) ||
> > > > @@ -1374,28 +1402,52 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > > >           WARN_ON_ONCE(dst_start + len <= dst_start))
> > > >               goto out;
> > >
> > > Ah, is this safe for rmap?  I think you need to leave this read lock.
> > >
> I didn't fully understand you here.

Sorry, I'm confused on how your locking scheme avoids rmap from trying
to use the VMA with the atomic increment part.

> > > >
> > > > +     dst_vma = NULL;
> > > > +     src_vma = lock_vma_under_rcu(mm, src_start);
> > > > +     if (src_vma) {
> > > > +             dst_vma = lock_vma_under_rcu(mm, dst_start);
> > > > +             if (!dst_vma)
> > > > +                     vma_end_read(src_vma);
> > > > +     }
> > > > +
> > > > +     /* If we failed to lock both VMAs, fall back to mmap_lock */
> > > > +     if (!dst_vma) {
> > > > +             mmap_read_lock(mm);
> > > > +             mmap_locked = true;
> > > > +             src_vma = find_vma(mm, src_start);
> > > > +             if (!src_vma)
> > > > +                     goto out_unlock_mmap;
> > > > +             dst_vma = find_vma(mm, dst_start);
> > >
> > > Again, there is a difference in how find_vma and lock_vam_under_rcu
> > > works.
> 
> Sure, I'll use vma_lookup() instead of find_vma().

Be sure it fits with what you are doing, I'm not entire sure it's right
to switch.  If it is not right then I don't think you can use
lock_vma_under_rcu() - but we can work around that too.

> > >
> > > > +             if (!dst_vma)
> > > > +                     goto out_unlock_mmap;
> > > > +     }
> > > > +
> > > > +     /* Re-check after taking map_changing_lock */
> > > > +     down_read(&ctx->map_changing_lock);
> > > > +     if (likely(atomic_read(&ctx->mmap_changing))) {
> > > > +             err = -EAGAIN;
> > > > +             goto out_unlock;
> > > > +     }
> > > >       /*
> > > >        * Make sure the vma is not shared, that the src and dst remap
> > > >        * ranges are both valid and fully within a single existing
> > > >        * vma.
> > > >        */
> > > > -     src_vma = find_vma(mm, src_start);
> > > > -     if (!src_vma || (src_vma->vm_flags & VM_SHARED))
> > > > -             goto out;
> > > > +     if (src_vma->vm_flags & VM_SHARED)
> > > > +             goto out_unlock;
> > > >       if (src_start < src_vma->vm_start ||
> > > >           src_start + len > src_vma->vm_end)
> > > > -             goto out;
> > > > +             goto out_unlock;
> > > >
> > > > -     dst_vma = find_vma(mm, dst_start);
> > > > -     if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
> > > > -             goto out;
> > > > +     if (dst_vma->vm_flags & VM_SHARED)
> > > > +             goto out_unlock;
> > > >       if (dst_start < dst_vma->vm_start ||
> > > >           dst_start + len > dst_vma->vm_end)
> > > > -             goto out;
> > > > +             goto out_unlock;
> > > >
> > > >       err = validate_move_areas(ctx, src_vma, dst_vma);
> > > >       if (err)
> > > > -             goto out;
> > > > +             goto out_unlock;
> > > >
> > > >       for (src_addr = src_start, dst_addr = dst_start;
> > > >            src_addr < src_start + len;) {
> > > > @@ -1512,6 +1564,15 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > > >               moved += step_size;
> > > >       }
> > > >
> > > > +out_unlock:
> > > > +     up_read(&ctx->map_changing_lock);
> > > > +out_unlock_mmap:
> > > > +     if (mmap_locked)
> > > > +             mmap_read_unlock(mm);
> > > > +     else {
> > > > +             vma_end_read(dst_vma);
> > > > +             vma_end_read(src_vma);
> > > > +     }
> > > >  out:
> > > >       VM_WARN_ON(moved < 0);
> > > >       VM_WARN_ON(err > 0);
> > > > --
> > > > 2.43.0.429.g432eaa2c6b-goog
> > > >
> > > >

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v2 2/3] userfaultfd: protect mmap_changing with rw_sem in userfaulfd_ctx
  2024-01-29 22:35     ` Lokesh Gidra
@ 2024-01-30  3:46       ` Liam R. Howlett
  2024-01-30  8:55         ` Mike Rapoport
  0 siblings, 1 reply; 35+ messages in thread
From: Liam R. Howlett @ 2024-01-30  3:46 UTC (permalink / raw)
  To: Lokesh Gidra
  Cc: akpm, linux-fsdevel, linux-mm, linux-kernel, selinux, surenb,
	kernel-team, aarcange, peterx, david, axelrasmussen, bgeffon,
	willy, jannh, kaleshsingh, ngeoffray, timmurray, rppt

* Lokesh Gidra <lokeshgidra@google.com> [240129 17:35]:
> On Mon, Jan 29, 2024 at 1:00 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> >
> > * Lokesh Gidra <lokeshgidra@google.com> [240129 14:35]:
> > > Increments and loads to mmap_changing are always in mmap_lock
> > > critical section.
> >
> > Read or write?
> >
> It's write-mode when incrementing (except in case of
> userfaultfd_remove() where it's done in read-mode) and loads are in
> mmap_lock (read-mode). I'll clarify this in the next version.
> >
> > > This ensures that if userspace requests event
> > > notification for non-cooperative operations (e.g. mremap), userfaultfd
> > > operations don't occur concurrently.
> > >
> > > This can be achieved by using a separate read-write semaphore in
> > > userfaultfd_ctx such that increments are done in write-mode and loads
> > > in read-mode, thereby eliminating the dependency on mmap_lock for this
> > > purpose.
> > >
> > > This is a preparatory step before we replace mmap_lock usage with
> > > per-vma locks in fill/move ioctls.
> > >
> > > Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
> > > ---
> > >  fs/userfaultfd.c              | 40 ++++++++++++----------
> > >  include/linux/userfaultfd_k.h | 31 ++++++++++--------
> > >  mm/userfaultfd.c              | 62 ++++++++++++++++++++---------------
> > >  3 files changed, 75 insertions(+), 58 deletions(-)
> > >
> > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > > index 58331b83d648..c00a021bcce4 100644
> > > --- a/fs/userfaultfd.c
> > > +++ b/fs/userfaultfd.c
> > > @@ -685,12 +685,15 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
> > >               ctx->flags = octx->flags;
> > >               ctx->features = octx->features;
> > >               ctx->released = false;
> > > +             init_rwsem(&ctx->map_changing_lock);
> > >               atomic_set(&ctx->mmap_changing, 0);
> > >               ctx->mm = vma->vm_mm;
> > >               mmgrab(ctx->mm);
> > >
> > >               userfaultfd_ctx_get(octx);
> > > +             down_write(&octx->map_changing_lock);
> > >               atomic_inc(&octx->mmap_changing);
> > > +             up_write(&octx->map_changing_lock);

On init, I don't think taking the lock is strictly necessary - unless
there is a way to access it before this increment?  Not that it would
cost much.

> >
> > This can potentially hold up your writer as the readers execute.  I
> > think this will change your priority (ie: priority inversion)?
> 
> Priority inversion, if any, is already happening due to mmap_lock, no?
> Also, I thought rw_semaphore implementation is fair, so the writer
> will eventually get the lock right? Please correct me if I'm wrong.

You are correct.  Any writer will stop any new readers, but readers
currently in the section must finish before the writer.

> 
> At this patch: there can't be any readers as they need to acquire
> mmap_lock in read-mode first. While writers, at the point of
> incrementing mmap_changing, already hold mmap_lock in write-mode.
> 
> With per-vma locks, the same synchronization that mmap_lock achieved
> around mmap_changing, will be achieved by ctx->map_changing_lock.

The inversion I was thinking was that the writer cannot complete the
write until the reader is done failing because the atomic_inc has
happened..?  I see the writer as a priority since readers cannot
complete within the write, but I read it wrong.  I think the readers are
fine if the happen before, during, or after a write.  The work is thrown
out if the reader happens during the transition between those states,
which is detected through the atomic.  This makes sense now.

> >
> > You could use the first bit of the atomic_inc as indication of a write.
> > So if the mmap_changing is even, then there are no writers.  If it
> > didn't change and it's even then you know no modification has happened
> > (or it overflowed and hit the same number which would be rare, but
> > maybe okay?).
> 
> This is already achievable, right? If mmap_changing is >0 then we know
> there are writers. The problem is that we want writers (like mremap
> operations) to block as long as there is a userfaultfd operation (also
> reader of mmap_changing) going on. Please note that I'm inferring this
> from current implementation.
> 
> AFAIU, mmap_changing isn't required for correctness, because all
> operations are happening under the right mode of mmap_lock. It's used
> to ensure that while a non-cooperative operations is happening, if the
> user has asked it to be notified, then no other userfaultfd operations
> should take place until the user gets the event notification.

I think it is needed, mmap_changing is read before the mmap_lock is
taken, then compared after the mmap_lock is taken (both read mode) to
ensure nothing has changed.

...

> > > @@ -783,7 +788,9 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
> > >               return true;
> > >
> > >       userfaultfd_ctx_get(ctx);
> > > +     down_write(&ctx->map_changing_lock);
> > >       atomic_inc(&ctx->mmap_changing);
> > > +     up_write(&ctx->map_changing_lock);
> > >       mmap_read_unlock(mm);
> > >
> > >       msg_init(&ewq.msg);

If this happens in read mode, then why are you waiting for the readers
to leave?  Can't you just increment the atomic?  It's fine happening in
read mode today, so it should be fine with this new rwsem.

Thanks,
Liam

...

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v2 1/3] userfaultfd: move userfaultfd_ctx struct to header file
  2024-01-29 19:35 ` [PATCH v2 1/3] userfaultfd: move userfaultfd_ctx struct to header file Lokesh Gidra
@ 2024-01-30  7:12   ` Mike Rapoport
  0 siblings, 0 replies; 35+ messages in thread
From: Mike Rapoport @ 2024-01-30  7:12 UTC (permalink / raw)
  To: Lokesh Gidra
  Cc: akpm, linux-fsdevel, linux-mm, linux-kernel, selinux, surenb,
	kernel-team, aarcange, peterx, david, axelrasmussen, bgeffon,
	willy, jannh, kaleshsingh, ngeoffray, timmurray

On Mon, Jan 29, 2024 at 11:35:10AM -0800, Lokesh Gidra wrote:
> Moving the struct to userfaultfd_k.h to be accessible from
> mm/userfaultfd.c. There are no other changes in the struct.

Just a thought, it maybe worth to move all of fs/userfaultfd.c to
mm/userfaultfd.c ...
 
> This is required to prepare for using per-vma locks in userfaultfd
> operations.
> 
> Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>

Reviewed-by: Mike Rapoport (IBM) <rppt@kernel.org>

> ---
>  fs/userfaultfd.c              | 39 -----------------------------------
>  include/linux/userfaultfd_k.h | 39 +++++++++++++++++++++++++++++++++++
>  2 files changed, 39 insertions(+), 39 deletions(-)
> 
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index 05c8e8a05427..58331b83d648 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -50,45 +50,6 @@ static struct ctl_table vm_userfaultfd_table[] = {
>  
>  static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init;
>  
> -/*
> - * Start with fault_pending_wqh and fault_wqh so they're more likely
> - * to be in the same cacheline.
> - *
> - * Locking order:
> - *	fd_wqh.lock
> - *		fault_pending_wqh.lock
> - *			fault_wqh.lock
> - *		event_wqh.lock
> - *
> - * To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
> - * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
> - * also taken in IRQ context.
> - */
> -struct userfaultfd_ctx {
> -	/* waitqueue head for the pending (i.e. not read) userfaults */
> -	wait_queue_head_t fault_pending_wqh;
> -	/* waitqueue head for the userfaults */
> -	wait_queue_head_t fault_wqh;
> -	/* waitqueue head for the pseudo fd to wakeup poll/read */
> -	wait_queue_head_t fd_wqh;
> -	/* waitqueue head for events */
> -	wait_queue_head_t event_wqh;
> -	/* a refile sequence protected by fault_pending_wqh lock */
> -	seqcount_spinlock_t refile_seq;
> -	/* pseudo fd refcounting */
> -	refcount_t refcount;
> -	/* userfaultfd syscall flags */
> -	unsigned int flags;
> -	/* features requested from the userspace */
> -	unsigned int features;
> -	/* released */
> -	bool released;
> -	/* memory mappings are changing because of non-cooperative event */
> -	atomic_t mmap_changing;
> -	/* mm with one ore more vmas attached to this userfaultfd_ctx */
> -	struct mm_struct *mm;
> -};
> -
>  struct userfaultfd_fork_ctx {
>  	struct userfaultfd_ctx *orig;
>  	struct userfaultfd_ctx *new;
> diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> index e4056547fbe6..691d928ee864 100644
> --- a/include/linux/userfaultfd_k.h
> +++ b/include/linux/userfaultfd_k.h
> @@ -36,6 +36,45 @@
>  #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
>  #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS)
>  
> +/*
> + * Start with fault_pending_wqh and fault_wqh so they're more likely
> + * to be in the same cacheline.
> + *
> + * Locking order:
> + *	fd_wqh.lock
> + *		fault_pending_wqh.lock
> + *			fault_wqh.lock
> + *		event_wqh.lock
> + *
> + * To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
> + * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
> + * also taken in IRQ context.
> + */
> +struct userfaultfd_ctx {
> +	/* waitqueue head for the pending (i.e. not read) userfaults */
> +	wait_queue_head_t fault_pending_wqh;
> +	/* waitqueue head for the userfaults */
> +	wait_queue_head_t fault_wqh;
> +	/* waitqueue head for the pseudo fd to wakeup poll/read */
> +	wait_queue_head_t fd_wqh;
> +	/* waitqueue head for events */
> +	wait_queue_head_t event_wqh;
> +	/* a refile sequence protected by fault_pending_wqh lock */
> +	seqcount_spinlock_t refile_seq;
> +	/* pseudo fd refcounting */
> +	refcount_t refcount;
> +	/* userfaultfd syscall flags */
> +	unsigned int flags;
> +	/* features requested from the userspace */
> +	unsigned int features;
> +	/* released */
> +	bool released;
> +	/* memory mappings are changing because of non-cooperative event */
> +	atomic_t mmap_changing;
> +	/* mm with one ore more vmas attached to this userfaultfd_ctx */
> +	struct mm_struct *mm;
> +};
> +
>  extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
>  
>  /* A combined operation mode + behavior flags. */
> -- 
> 2.43.0.429.g432eaa2c6b-goog
> 

-- 
Sincerely yours,
Mike.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v2 2/3] userfaultfd: protect mmap_changing with rw_sem in userfaulfd_ctx
  2024-01-29 19:35 ` [PATCH v2 2/3] userfaultfd: protect mmap_changing with rw_sem in userfaulfd_ctx Lokesh Gidra
  2024-01-29 21:00   ` Liam R. Howlett
@ 2024-01-30  7:21   ` Mike Rapoport
  1 sibling, 0 replies; 35+ messages in thread
From: Mike Rapoport @ 2024-01-30  7:21 UTC (permalink / raw)
  To: Lokesh Gidra
  Cc: akpm, linux-fsdevel, linux-mm, linux-kernel, selinux, surenb,
	kernel-team, aarcange, peterx, david, axelrasmussen, bgeffon,
	willy, jannh, kaleshsingh, ngeoffray, timmurray

On Mon, Jan 29, 2024 at 11:35:11AM -0800, Lokesh Gidra wrote:
> Increments and loads to mmap_changing are always in mmap_lock
> critical section. This ensures that if userspace requests event
> notification for non-cooperative operations (e.g. mremap), userfaultfd
> operations don't occur concurrently.
> 
> This can be achieved by using a separate read-write semaphore in
> userfaultfd_ctx such that increments are done in write-mode and loads
> in read-mode, thereby eliminating the dependency on mmap_lock for this
> purpose.
> 
> This is a preparatory step before we replace mmap_lock usage with
> per-vma locks in fill/move ioctls.
> 
> Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>

Reviewed-by: Mike Rapoport (IBM) <rppt@kernel.org>

> ---
>  fs/userfaultfd.c              | 40 ++++++++++++----------
>  include/linux/userfaultfd_k.h | 31 ++++++++++--------
>  mm/userfaultfd.c              | 62 ++++++++++++++++++++---------------
>  3 files changed, 75 insertions(+), 58 deletions(-)
> 
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index 58331b83d648..c00a021bcce4 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -685,12 +685,15 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
>  		ctx->flags = octx->flags;
>  		ctx->features = octx->features;
>  		ctx->released = false;
> +		init_rwsem(&ctx->map_changing_lock);
>  		atomic_set(&ctx->mmap_changing, 0);
>  		ctx->mm = vma->vm_mm;
>  		mmgrab(ctx->mm);
>  
>  		userfaultfd_ctx_get(octx);
> +		down_write(&octx->map_changing_lock);
>  		atomic_inc(&octx->mmap_changing);
> +		up_write(&octx->map_changing_lock);
>  		fctx->orig = octx;
>  		fctx->new = ctx;
>  		list_add_tail(&fctx->list, fcs);
> @@ -737,7 +740,9 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
>  	if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
>  		vm_ctx->ctx = ctx;
>  		userfaultfd_ctx_get(ctx);
> +		down_write(&ctx->map_changing_lock);
>  		atomic_inc(&ctx->mmap_changing);
> +		up_write(&ctx->map_changing_lock);
>  	} else {
>  		/* Drop uffd context if remap feature not enabled */
>  		vma_start_write(vma);
> @@ -783,7 +788,9 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
>  		return true;
>  
>  	userfaultfd_ctx_get(ctx);
> +	down_write(&ctx->map_changing_lock);
>  	atomic_inc(&ctx->mmap_changing);
> +	up_write(&ctx->map_changing_lock);
>  	mmap_read_unlock(mm);
>  
>  	msg_init(&ewq.msg);
> @@ -825,7 +832,9 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start,
>  		return -ENOMEM;
>  
>  	userfaultfd_ctx_get(ctx);
> +	down_write(&ctx->map_changing_lock);
>  	atomic_inc(&ctx->mmap_changing);
> +	up_write(&ctx->map_changing_lock);
>  	unmap_ctx->ctx = ctx;
>  	unmap_ctx->start = start;
>  	unmap_ctx->end = end;
> @@ -1709,9 +1718,8 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
>  	if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
>  		flags |= MFILL_ATOMIC_WP;
>  	if (mmget_not_zero(ctx->mm)) {
> -		ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
> -					uffdio_copy.len, &ctx->mmap_changing,
> -					flags);
> +		ret = mfill_atomic_copy(ctx, uffdio_copy.dst, uffdio_copy.src,
> +					uffdio_copy.len, flags);
>  		mmput(ctx->mm);
>  	} else {
>  		return -ESRCH;
> @@ -1761,9 +1769,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
>  		goto out;
>  
>  	if (mmget_not_zero(ctx->mm)) {
> -		ret = mfill_atomic_zeropage(ctx->mm, uffdio_zeropage.range.start,
> -					   uffdio_zeropage.range.len,
> -					   &ctx->mmap_changing);
> +		ret = mfill_atomic_zeropage(ctx, uffdio_zeropage.range.start,
> +					   uffdio_zeropage.range.len);
>  		mmput(ctx->mm);
>  	} else {
>  		return -ESRCH;
> @@ -1818,9 +1825,8 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
>  		return -EINVAL;
>  
>  	if (mmget_not_zero(ctx->mm)) {
> -		ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
> -					  uffdio_wp.range.len, mode_wp,
> -					  &ctx->mmap_changing);
> +		ret = mwriteprotect_range(ctx, uffdio_wp.range.start,
> +					  uffdio_wp.range.len, mode_wp);
>  		mmput(ctx->mm);
>  	} else {
>  		return -ESRCH;
> @@ -1870,9 +1876,8 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
>  		flags |= MFILL_ATOMIC_WP;
>  
>  	if (mmget_not_zero(ctx->mm)) {
> -		ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start,
> -					    uffdio_continue.range.len,
> -					    &ctx->mmap_changing, flags);
> +		ret = mfill_atomic_continue(ctx, uffdio_continue.range.start,
> +					    uffdio_continue.range.len, flags);
>  		mmput(ctx->mm);
>  	} else {
>  		return -ESRCH;
> @@ -1925,9 +1930,8 @@ static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long
>  		goto out;
>  
>  	if (mmget_not_zero(ctx->mm)) {
> -		ret = mfill_atomic_poison(ctx->mm, uffdio_poison.range.start,
> -					  uffdio_poison.range.len,
> -					  &ctx->mmap_changing, 0);
> +		ret = mfill_atomic_poison(ctx, uffdio_poison.range.start,
> +					  uffdio_poison.range.len, 0);
>  		mmput(ctx->mm);
>  	} else {
>  		return -ESRCH;
> @@ -2003,13 +2007,14 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx,
>  	if (mmget_not_zero(mm)) {
>  		mmap_read_lock(mm);
>  
> -		/* Re-check after taking mmap_lock */
> +		/* Re-check after taking map_changing_lock */
> +		down_read(&ctx->map_changing_lock);
>  		if (likely(!atomic_read(&ctx->mmap_changing)))
>  			ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
>  					 uffdio_move.len, uffdio_move.mode);
>  		else
>  			ret = -EAGAIN;
> -
> +		up_read(&ctx->map_changing_lock);
>  		mmap_read_unlock(mm);
>  		mmput(mm);
>  	} else {
> @@ -2216,6 +2221,7 @@ static int new_userfaultfd(int flags)
>  	ctx->flags = flags;
>  	ctx->features = 0;
>  	ctx->released = false;
> +	init_rwsem(&ctx->map_changing_lock);
>  	atomic_set(&ctx->mmap_changing, 0);
>  	ctx->mm = current->mm;
>  	/* prevent the mm struct to be freed */
> diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> index 691d928ee864..3210c3552976 100644
> --- a/include/linux/userfaultfd_k.h
> +++ b/include/linux/userfaultfd_k.h
> @@ -69,6 +69,13 @@ struct userfaultfd_ctx {
>  	unsigned int features;
>  	/* released */
>  	bool released;
> +	/*
> +	 * Prevents userfaultfd operations (fill/move/wp) from happening while
> +	 * some non-cooperative event(s) is taking place. Increments are done
> +	 * in write-mode. Whereas, userfaultfd operations, which includes
> +	 * reading mmap_changing, is done under read-mode.
> +	 */
> +	struct rw_semaphore map_changing_lock;
>  	/* memory mappings are changing because of non-cooperative event */
>  	atomic_t mmap_changing;
>  	/* mm with one ore more vmas attached to this userfaultfd_ctx */
> @@ -113,22 +120,18 @@ extern int mfill_atomic_install_pte(pmd_t *dst_pmd,
>  				    unsigned long dst_addr, struct page *page,
>  				    bool newly_allocated, uffd_flags_t flags);
>  
> -extern ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
> +extern ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
>  				 unsigned long src_start, unsigned long len,
> -				 atomic_t *mmap_changing, uffd_flags_t flags);
> -extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm,
> +				 uffd_flags_t flags);
> +extern ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
>  				     unsigned long dst_start,
> -				     unsigned long len,
> -				     atomic_t *mmap_changing);
> -extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start,
> -				     unsigned long len, atomic_t *mmap_changing,
> -				     uffd_flags_t flags);
> -extern ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
> -				   unsigned long len, atomic_t *mmap_changing,
> -				   uffd_flags_t flags);
> -extern int mwriteprotect_range(struct mm_struct *dst_mm,
> -			       unsigned long start, unsigned long len,
> -			       bool enable_wp, atomic_t *mmap_changing);
> +				     unsigned long len);
> +extern ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long dst_start,
> +				     unsigned long len, uffd_flags_t flags);
> +extern ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
> +				   unsigned long len, uffd_flags_t flags);
> +extern int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
> +			       unsigned long len, bool enable_wp);
>  extern long uffd_wp_range(struct vm_area_struct *vma,
>  			  unsigned long start, unsigned long len, bool enable_wp);
>  
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index e3a91871462a..6e2ca04ab04d 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -353,11 +353,11 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
>   * called with mmap_lock held, it will release mmap_lock before returning.
>   */
>  static __always_inline ssize_t mfill_atomic_hugetlb(
> +					      struct userfaultfd_ctx *ctx,
>  					      struct vm_area_struct *dst_vma,
>  					      unsigned long dst_start,
>  					      unsigned long src_start,
>  					      unsigned long len,
> -					      atomic_t *mmap_changing,
>  					      uffd_flags_t flags)
>  {
>  	struct mm_struct *dst_mm = dst_vma->vm_mm;
> @@ -379,6 +379,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
>  	 * feature is not supported.
>  	 */
>  	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
> +		up_read(&ctx->map_changing_lock);
>  		mmap_read_unlock(dst_mm);
>  		return -EINVAL;
>  	}
> @@ -463,6 +464,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
>  		cond_resched();
>  
>  		if (unlikely(err == -ENOENT)) {
> +			up_read(&ctx->map_changing_lock);
>  			mmap_read_unlock(dst_mm);
>  			BUG_ON(!folio);
>  
> @@ -473,12 +475,13 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
>  				goto out;
>  			}
>  			mmap_read_lock(dst_mm);
> +			down_read(&ctx->map_changing_lock);
>  			/*
>  			 * If memory mappings are changing because of non-cooperative
>  			 * operation (e.g. mremap) running in parallel, bail out and
>  			 * request the user to retry later
>  			 */
> -			if (mmap_changing && atomic_read(mmap_changing)) {
> +			if (atomic_read(ctx->mmap_changing)) {
>  				err = -EAGAIN;
>  				break;
>  			}
> @@ -501,6 +504,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
>  	}
>  
>  out_unlock:
> +	up_read(&ctx->map_changing_lock);
>  	mmap_read_unlock(dst_mm);
>  out:
>  	if (folio)
> @@ -512,11 +516,11 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
>  }
>  #else /* !CONFIG_HUGETLB_PAGE */
>  /* fail at build time if gcc attempts to use this */
> -extern ssize_t mfill_atomic_hugetlb(struct vm_area_struct *dst_vma,
> +extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
> +				    struct vm_area_struct *dst_vma,
>  				    unsigned long dst_start,
>  				    unsigned long src_start,
>  				    unsigned long len,
> -				    atomic_t *mmap_changing,
>  				    uffd_flags_t flags);
>  #endif /* CONFIG_HUGETLB_PAGE */
>  
> @@ -564,13 +568,13 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
>  	return err;
>  }
>  
> -static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
> +static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
>  					    unsigned long dst_start,
>  					    unsigned long src_start,
>  					    unsigned long len,
> -					    atomic_t *mmap_changing,
>  					    uffd_flags_t flags)
>  {
> +	struct mm_struct *dst_mm = ctx->mm;
>  	struct vm_area_struct *dst_vma;
>  	ssize_t err;
>  	pmd_t *dst_pmd;
> @@ -600,8 +604,9 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
>  	 * operation (e.g. mremap) running in parallel, bail out and
>  	 * request the user to retry later
>  	 */
> +	down_read(&ctx->map_changing_lock);
>  	err = -EAGAIN;
> -	if (mmap_changing && atomic_read(mmap_changing))
> +	if (atomic_read(&ctx->mmap_changing))
>  		goto out_unlock;
>  
>  	/*
> @@ -633,8 +638,8 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
>  	 * If this is a HUGETLB vma, pass off to appropriate routine
>  	 */
>  	if (is_vm_hugetlb_page(dst_vma))
> -		return  mfill_atomic_hugetlb(dst_vma, dst_start, src_start,
> -					     len, mmap_changing, flags);
> +		return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
> +					     src_start, len, flags);
>  
>  	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
>  		goto out_unlock;
> @@ -693,6 +698,7 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
>  		if (unlikely(err == -ENOENT)) {
>  			void *kaddr;
>  
> +			up_read(&ctx->map_changing_lock);
>  			mmap_read_unlock(dst_mm);
>  			BUG_ON(!folio);
>  
> @@ -723,6 +729,7 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
>  	}
>  
>  out_unlock:
> +	up_read(&ctx->map_changing_lock);
>  	mmap_read_unlock(dst_mm);
>  out:
>  	if (folio)
> @@ -733,34 +740,33 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
>  	return copied ? copied : err;
>  }
>  
> -ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
> +ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
>  			  unsigned long src_start, unsigned long len,
> -			  atomic_t *mmap_changing, uffd_flags_t flags)
> +			  uffd_flags_t flags)
>  {
> -	return mfill_atomic(dst_mm, dst_start, src_start, len, mmap_changing,
> +	return mfill_atomic(ctx, dst_start, src_start, len,
>  			    uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY));
>  }
>  
> -ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long start,
> -			      unsigned long len, atomic_t *mmap_changing)
> +ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
> +			      unsigned long start,
> +			      unsigned long len)
>  {
> -	return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
> +	return mfill_atomic(ctx, start, 0, len,
>  			    uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE));
>  }
>  
> -ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start,
> -			      unsigned long len, atomic_t *mmap_changing,
> -			      uffd_flags_t flags)
> +ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start,
> +			      unsigned long len, uffd_flags_t flags)
>  {
> -	return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
> +	return mfill_atomic(ctx, start, 0, len,
>  			    uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
>  }
>  
> -ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
> -			    unsigned long len, atomic_t *mmap_changing,
> -			    uffd_flags_t flags)
> +ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
> +			    unsigned long len, uffd_flags_t flags)
>  {
> -	return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
> +	return mfill_atomic(ctx, start, 0, len,
>  			    uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON));
>  }
>  
> @@ -793,10 +799,10 @@ long uffd_wp_range(struct vm_area_struct *dst_vma,
>  	return ret;
>  }
>  
> -int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
> -			unsigned long len, bool enable_wp,
> -			atomic_t *mmap_changing)
> +int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
> +			unsigned long len, bool enable_wp)
>  {
> +	struct mm_struct *dst_mm = ctx->mm;
>  	unsigned long end = start + len;
>  	unsigned long _start, _end;
>  	struct vm_area_struct *dst_vma;
> @@ -820,8 +826,9 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
>  	 * operation (e.g. mremap) running in parallel, bail out and
>  	 * request the user to retry later
>  	 */
> +	down_read(&ctx->map_changing_lock);
>  	err = -EAGAIN;
> -	if (mmap_changing && atomic_read(mmap_changing))
> +	if (atomic_read(&ctx->mmap_changing))
>  		goto out_unlock;
>  
>  	err = -ENOENT;
> @@ -850,6 +857,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
>  		err = 0;
>  	}
>  out_unlock:
> +	up_read(&ctx->map_changing_lock);
>  	mmap_read_unlock(dst_mm);
>  	return err;
>  }
> -- 
> 2.43.0.429.g432eaa2c6b-goog
> 

-- 
Sincerely yours,
Mike.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v2 2/3] userfaultfd: protect mmap_changing with rw_sem in userfaulfd_ctx
  2024-01-30  3:46       ` Liam R. Howlett
@ 2024-01-30  8:55         ` Mike Rapoport
  2024-01-30 17:28           ` Liam R. Howlett
  0 siblings, 1 reply; 35+ messages in thread
From: Mike Rapoport @ 2024-01-30  8:55 UTC (permalink / raw)
  To: Liam R. Howlett, Lokesh Gidra, akpm, linux-fsdevel, linux-mm,
	linux-kernel, selinux, surenb, kernel-team, aarcange, peterx,
	david, axelrasmussen, bgeffon, willy, jannh, kaleshsingh,
	ngeoffray, timmurray

On Mon, Jan 29, 2024 at 10:46:27PM -0500, Liam R. Howlett wrote:
> * Lokesh Gidra <lokeshgidra@google.com> [240129 17:35]:
> > On Mon, Jan 29, 2024 at 1:00 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> > >
> > > * Lokesh Gidra <lokeshgidra@google.com> [240129 14:35]:
> > > > Increments and loads to mmap_changing are always in mmap_lock
> > > > critical section.
> > >
> > > Read or write?
> > >
> > It's write-mode when incrementing (except in case of
> > userfaultfd_remove() where it's done in read-mode) and loads are in
> > mmap_lock (read-mode). I'll clarify this in the next version.
> > >
> > > > This ensures that if userspace requests event
> > > > notification for non-cooperative operations (e.g. mremap), userfaultfd
> > > > operations don't occur concurrently.
> > > >
> > > > This can be achieved by using a separate read-write semaphore in
> > > > userfaultfd_ctx such that increments are done in write-mode and loads
> > > > in read-mode, thereby eliminating the dependency on mmap_lock for this
> > > > purpose.
> > > >
> > > > This is a preparatory step before we replace mmap_lock usage with
> > > > per-vma locks in fill/move ioctls.
> > > >
> > > > Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
> > > > ---
> > > >  fs/userfaultfd.c              | 40 ++++++++++++----------
> > > >  include/linux/userfaultfd_k.h | 31 ++++++++++--------
> > > >  mm/userfaultfd.c              | 62 ++++++++++++++++++++---------------
> > > >  3 files changed, 75 insertions(+), 58 deletions(-)
> > > >
> > > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > > > index 58331b83d648..c00a021bcce4 100644
> > > > --- a/fs/userfaultfd.c
> > > > +++ b/fs/userfaultfd.c
> > > > @@ -685,12 +685,15 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
> > > >               ctx->flags = octx->flags;
> > > >               ctx->features = octx->features;
> > > >               ctx->released = false;
> > > > +             init_rwsem(&ctx->map_changing_lock);
> > > >               atomic_set(&ctx->mmap_changing, 0);
> > > >               ctx->mm = vma->vm_mm;
> > > >               mmgrab(ctx->mm);
> > > >
> > > >               userfaultfd_ctx_get(octx);
> > > > +             down_write(&octx->map_changing_lock);
> > > >               atomic_inc(&octx->mmap_changing);
> > > > +             up_write(&octx->map_changing_lock);
> 
> On init, I don't think taking the lock is strictly necessary - unless
> there is a way to access it before this increment?  Not that it would
> cost much.

It's fork, the lock is for the context of the parent process and there
could be uffdio ops running in parallel on its VM.
 
> > > You could use the first bit of the atomic_inc as indication of a write.
> > > So if the mmap_changing is even, then there are no writers.  If it
> > > didn't change and it's even then you know no modification has happened
> > > (or it overflowed and hit the same number which would be rare, but
> > > maybe okay?).
> > 
> > This is already achievable, right? If mmap_changing is >0 then we know
> > there are writers. The problem is that we want writers (like mremap
> > operations) to block as long as there is a userfaultfd operation (also
> > reader of mmap_changing) going on. Please note that I'm inferring this
> > from current implementation.
> > 
> > AFAIU, mmap_changing isn't required for correctness, because all
> > operations are happening under the right mode of mmap_lock. It's used
> > to ensure that while a non-cooperative operations is happening, if the
> > user has asked it to be notified, then no other userfaultfd operations
> > should take place until the user gets the event notification.
> 
> I think it is needed, mmap_changing is read before the mmap_lock is
> taken, then compared after the mmap_lock is taken (both read mode) to
> ensure nothing has changed.

mmap_changing is required to ensure that no uffdio operation runs in
parallel with operations that modify the memory map, like fork, mremap,
munmap and some of madvise calls. 
And we do need the writers to block if there is an uffdio operation going
on, so I think an rwsem is the right way to protect mmap_chaniging.

> > > > @@ -783,7 +788,9 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
> > > >               return true;
> > > >
> > > >       userfaultfd_ctx_get(ctx);
> > > > +     down_write(&ctx->map_changing_lock);
> > > >       atomic_inc(&ctx->mmap_changing);
> > > > +     up_write(&ctx->map_changing_lock);
> > > >       mmap_read_unlock(mm);
> > > >
> > > >       msg_init(&ewq.msg);
> 
> If this happens in read mode, then why are you waiting for the readers
> to leave?  Can't you just increment the atomic?  It's fine happening in
> read mode today, so it should be fine with this new rwsem.

It's been a while and the details are blurred now, but if I remember
correctly, having this in read mode forced non-cooperative uffd monitor to
be single threaded. If a monitor runs, say uffdio_copy, and in parallel a
thread in the monitored process does MADV_DONTNEED, the latter will wait
for userfaultfd_remove notification to be processed in the monitor and drop
the VMA contents only afterwards. If a non-cooperative monitor would
process notification in parallel with uffdio ops, MADV_DONTNEED could
continue and race with uffdio_copy, so read mode wouldn't be enough.

There was no much sense to make MADV_DONTNEED take mmap_lock in write mode
just for this, but now taking the rwsem in write mode here sounds
reasonable.
 
> Thanks,
> Liam
> 
> ...

-- 
Sincerely yours,
Mike.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v2 2/3] userfaultfd: protect mmap_changing with rw_sem in userfaulfd_ctx
  2024-01-30  8:55         ` Mike Rapoport
@ 2024-01-30 17:28           ` Liam R. Howlett
  2024-01-31  2:24             ` Lokesh Gidra
  0 siblings, 1 reply; 35+ messages in thread
From: Liam R. Howlett @ 2024-01-30 17:28 UTC (permalink / raw)
  To: Mike Rapoport
  Cc: Lokesh Gidra, akpm, linux-fsdevel, linux-mm, linux-kernel,
	selinux, surenb, kernel-team, aarcange, peterx, david,
	axelrasmussen, bgeffon, willy, jannh, kaleshsingh, ngeoffray,
	timmurray

* Mike Rapoport <rppt@kernel.org> [240130 03:55]:
> On Mon, Jan 29, 2024 at 10:46:27PM -0500, Liam R. Howlett wrote:
> > * Lokesh Gidra <lokeshgidra@google.com> [240129 17:35]:
> > > On Mon, Jan 29, 2024 at 1:00 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> > > >
> > > > * Lokesh Gidra <lokeshgidra@google.com> [240129 14:35]:
> > > > > Increments and loads to mmap_changing are always in mmap_lock
> > > > > critical section.
> > > >
> > > > Read or write?
> > > >
> > > It's write-mode when incrementing (except in case of
> > > userfaultfd_remove() where it's done in read-mode) and loads are in
> > > mmap_lock (read-mode). I'll clarify this in the next version.
> > > >
> > > > > This ensures that if userspace requests event
> > > > > notification for non-cooperative operations (e.g. mremap), userfaultfd
> > > > > operations don't occur concurrently.
> > > > >
> > > > > This can be achieved by using a separate read-write semaphore in
> > > > > userfaultfd_ctx such that increments are done in write-mode and loads
> > > > > in read-mode, thereby eliminating the dependency on mmap_lock for this
> > > > > purpose.
> > > > >
> > > > > This is a preparatory step before we replace mmap_lock usage with
> > > > > per-vma locks in fill/move ioctls.
> > > > >
> > > > > Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
> > > > > ---
> > > > >  fs/userfaultfd.c              | 40 ++++++++++++----------
> > > > >  include/linux/userfaultfd_k.h | 31 ++++++++++--------
> > > > >  mm/userfaultfd.c              | 62 ++++++++++++++++++++---------------
> > > > >  3 files changed, 75 insertions(+), 58 deletions(-)
> > > > >
> > > > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > > > > index 58331b83d648..c00a021bcce4 100644
> > > > > --- a/fs/userfaultfd.c
> > > > > +++ b/fs/userfaultfd.c
> > > > > @@ -685,12 +685,15 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
> > > > >               ctx->flags = octx->flags;
> > > > >               ctx->features = octx->features;
> > > > >               ctx->released = false;
> > > > > +             init_rwsem(&ctx->map_changing_lock);
> > > > >               atomic_set(&ctx->mmap_changing, 0);
> > > > >               ctx->mm = vma->vm_mm;
> > > > >               mmgrab(ctx->mm);
> > > > >
> > > > >               userfaultfd_ctx_get(octx);
> > > > > +             down_write(&octx->map_changing_lock);
> > > > >               atomic_inc(&octx->mmap_changing);
> > > > > +             up_write(&octx->map_changing_lock);
> > 
> > On init, I don't think taking the lock is strictly necessary - unless
> > there is a way to access it before this increment?  Not that it would
> > cost much.
> 
> It's fork, the lock is for the context of the parent process and there
> could be uffdio ops running in parallel on its VM.

Is this necessary then?  We are getting the octx from another mm but the
mm is locked for forking.  Why does it matter if there are readers of
the octx?

I assume, currently, there is no way the userfaultfd ctx can
be altered under mmap_lock held for writing. I would think it matters if
there are writers (which, I presume are blocked by the mmap_lock for
now?)  Shouldn't we hold the write lock for the entire dup process, I
mean, if we remove the userfaultfd from the mmap_lock, we cannot let the
structure being duplicated change half way through the dup process?

I must be missing something with where this is headed?

>  
> > > > You could use the first bit of the atomic_inc as indication of a write.
> > > > So if the mmap_changing is even, then there are no writers.  If it
> > > > didn't change and it's even then you know no modification has happened
> > > > (or it overflowed and hit the same number which would be rare, but
> > > > maybe okay?).
> > > 
> > > This is already achievable, right? If mmap_changing is >0 then we know
> > > there are writers. The problem is that we want writers (like mremap
> > > operations) to block as long as there is a userfaultfd operation (also
> > > reader of mmap_changing) going on. Please note that I'm inferring this
> > > from current implementation.
> > > 
> > > AFAIU, mmap_changing isn't required for correctness, because all
> > > operations are happening under the right mode of mmap_lock. It's used
> > > to ensure that while a non-cooperative operations is happening, if the
> > > user has asked it to be notified, then no other userfaultfd operations
> > > should take place until the user gets the event notification.
> > 
> > I think it is needed, mmap_changing is read before the mmap_lock is
> > taken, then compared after the mmap_lock is taken (both read mode) to
> > ensure nothing has changed.
> 
> mmap_changing is required to ensure that no uffdio operation runs in
> parallel with operations that modify the memory map, like fork, mremap,
> munmap and some of madvise calls. 
> And we do need the writers to block if there is an uffdio operation going
> on, so I think an rwsem is the right way to protect mmap_chaniging.
> 
> > > > > @@ -783,7 +788,9 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
> > > > >               return true;
> > > > >
> > > > >       userfaultfd_ctx_get(ctx);
> > > > > +     down_write(&ctx->map_changing_lock);
> > > > >       atomic_inc(&ctx->mmap_changing);
> > > > > +     up_write(&ctx->map_changing_lock);
> > > > >       mmap_read_unlock(mm);
> > > > >
> > > > >       msg_init(&ewq.msg);
> > 
> > If this happens in read mode, then why are you waiting for the readers
> > to leave?  Can't you just increment the atomic?  It's fine happening in
> > read mode today, so it should be fine with this new rwsem.
> 
> It's been a while and the details are blurred now, but if I remember
> correctly, having this in read mode forced non-cooperative uffd monitor to
> be single threaded. If a monitor runs, say uffdio_copy, and in parallel a
> thread in the monitored process does MADV_DONTNEED, the latter will wait
> for userfaultfd_remove notification to be processed in the monitor and drop
> the VMA contents only afterwards. If a non-cooperative monitor would
> process notification in parallel with uffdio ops, MADV_DONTNEED could
> continue and race with uffdio_copy, so read mode wouldn't be enough.
> 

Right now this function won't stop to wait for readers to exit the
critical section, but with this change there will be a pause (since the
down_write() will need to wait for the readers with the read lock).  So
this is adding a delay in this call path that isn't necessary (?) nor
existed before.  If you have non-cooperative uffd monitors, then you
will have to wait for them to finish to mark the uffd as being removed,
where as before it was a fire & forget, this is now a wait to tell.


> There was no much sense to make MADV_DONTNEED take mmap_lock in write mode
> just for this, but now taking the rwsem in write mode here sounds
> reasonable.
>  

I see why there was no need for a mmap_lock in write mode, but I think
taking the new rwsem in write mode is unnecessary.

Basically, I see this as a signal to new readers to abort, but we don't
need to wait for current readers to finish before this one increments
the atomic.  

Unless I missed something, I don't think you want to take the write lock
here.

Thanks,
Liam

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v2 2/3] userfaultfd: protect mmap_changing with rw_sem in userfaulfd_ctx
  2024-01-30 17:28           ` Liam R. Howlett
@ 2024-01-31  2:24             ` Lokesh Gidra
  2024-02-04 10:27               ` Mike Rapoport
  0 siblings, 1 reply; 35+ messages in thread
From: Lokesh Gidra @ 2024-01-31  2:24 UTC (permalink / raw)
  To: Liam R. Howlett, Mike Rapoport, Lokesh Gidra, akpm,
	linux-fsdevel, linux-mm, linux-kernel, selinux, surenb,
	kernel-team, aarcange, peterx, david, axelrasmussen, bgeffon,
	willy, jannh, kaleshsingh, ngeoffray, timmurray

On Tue, Jan 30, 2024 at 9:28 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
>
> * Mike Rapoport <rppt@kernel.org> [240130 03:55]:
> > On Mon, Jan 29, 2024 at 10:46:27PM -0500, Liam R. Howlett wrote:
> > > * Lokesh Gidra <lokeshgidra@google.com> [240129 17:35]:
> > > > On Mon, Jan 29, 2024 at 1:00 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> > > > >
> > > > > * Lokesh Gidra <lokeshgidra@google.com> [240129 14:35]:
> > > > > > Increments and loads to mmap_changing are always in mmap_lock
> > > > > > critical section.
> > > > >
> > > > > Read or write?
> > > > >
> > > > It's write-mode when incrementing (except in case of
> > > > userfaultfd_remove() where it's done in read-mode) and loads are in
> > > > mmap_lock (read-mode). I'll clarify this in the next version.
> > > > >
> > > > > > This ensures that if userspace requests event
> > > > > > notification for non-cooperative operations (e.g. mremap), userfaultfd
> > > > > > operations don't occur concurrently.
> > > > > >
> > > > > > This can be achieved by using a separate read-write semaphore in
> > > > > > userfaultfd_ctx such that increments are done in write-mode and loads
> > > > > > in read-mode, thereby eliminating the dependency on mmap_lock for this
> > > > > > purpose.
> > > > > >
> > > > > > This is a preparatory step before we replace mmap_lock usage with
> > > > > > per-vma locks in fill/move ioctls.
> > > > > >
> > > > > > Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
> > > > > > ---
> > > > > >  fs/userfaultfd.c              | 40 ++++++++++++----------
> > > > > >  include/linux/userfaultfd_k.h | 31 ++++++++++--------
> > > > > >  mm/userfaultfd.c              | 62 ++++++++++++++++++++---------------
> > > > > >  3 files changed, 75 insertions(+), 58 deletions(-)
> > > > > >
> > > > > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > > > > > index 58331b83d648..c00a021bcce4 100644
> > > > > > --- a/fs/userfaultfd.c
> > > > > > +++ b/fs/userfaultfd.c
> > > > > > @@ -685,12 +685,15 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
> > > > > >               ctx->flags = octx->flags;
> > > > > >               ctx->features = octx->features;
> > > > > >               ctx->released = false;
> > > > > > +             init_rwsem(&ctx->map_changing_lock);
> > > > > >               atomic_set(&ctx->mmap_changing, 0);
> > > > > >               ctx->mm = vma->vm_mm;
> > > > > >               mmgrab(ctx->mm);
> > > > > >
> > > > > >               userfaultfd_ctx_get(octx);
> > > > > > +             down_write(&octx->map_changing_lock);
> > > > > >               atomic_inc(&octx->mmap_changing);
> > > > > > +             up_write(&octx->map_changing_lock);
> > >
> > > On init, I don't think taking the lock is strictly necessary - unless
> > > there is a way to access it before this increment?  Not that it would
> > > cost much.
> >
> > It's fork, the lock is for the context of the parent process and there
> > could be uffdio ops running in parallel on its VM.
>
> Is this necessary then?  We are getting the octx from another mm but the
> mm is locked for forking.  Why does it matter if there are readers of
> the octx?
>
> I assume, currently, there is no way the userfaultfd ctx can
> be altered under mmap_lock held for writing. I would think it matters if
> there are writers (which, I presume are blocked by the mmap_lock for
> now?)  Shouldn't we hold the write lock for the entire dup process, I
> mean, if we remove the userfaultfd from the mmap_lock, we cannot let the
> structure being duplicated change half way through the dup process?
>
> I must be missing something with where this is headed?
>
AFAIU, the purpose of mmap_changing is to serialize uffdio operations
with non-cooperative events if and when such events are being
monitored by userspace (in case you missed, in all the cases of writes
to mmap_changing, we only do it if that non-cooperative event has been
requested by the user). As you pointed out there are no correctness
concerns as far as userfaultfd operations are concerned. But these
events are essential for the uffd monitor's functioning.

For example: say the uffd monitor wants to be notified for REMAP
operations while doing uffdio_copy operations. When COPY ioctls start
failing with -EAGAIN and uffdio_copy.copy == 0, then it knows it must
be due to mremap(), in which case it waits for the REMAP event
notification before attempting COPY again.

But there are few things that I didn't get after going through the
history of non-cooperative events. Hopefully Mike (or someone else
familiar) can clarify:

IIUC, the idea behind non-cooperative events was to block uffdio
operations from happening *before* the page tables are manipulated by
the event (like mremap), and that the uffdio ops are resumed after the
event notification is received by the monitor. If so then:

1) Why in the case of REMAP prep() is done after page-tables are
moved? Shouldn't it be done before? All other non-cooperative
operations do the prep() before.
2) UFFD_FEATURE_EVENT_REMOVE only notifies user space. It is not
consistently blocking uffdio operations (as both sides are acquiring
mmap_lock in read-mode) when remove operation is taking place. I can
understand this was intentionally left as is in the interest of not
acquiring mmap_lock in write-mode during madvise. But is only getting
the notification any useful? Can we say this patch fixes it? And in
that case shouldn't I split userfaultfd_remove() into two functions
(like other non-cooperative operations)?
3) Based on [1] I see how mmap_changing helps in eliminating duplicate
work (background copy) by uffd monitor, but didn't get if there is a
correctness aspect too that I'm missing? I concur with Amit's point in
[1] that getting -EEXIST when setting up the pte will avoid memory
corruption, no?

[1] https://lore.kernel.org/lkml/20201206093703.GY123287@linux.ibm.com/
> >
> > > > > You could use the first bit of the atomic_inc as indication of a write.
> > > > > So if the mmap_changing is even, then there are no writers.  If it
> > > > > didn't change and it's even then you know no modification has happened
> > > > > (or it overflowed and hit the same number which would be rare, but
> > > > > maybe okay?).
> > > >
> > > > This is already achievable, right? If mmap_changing is >0 then we know
> > > > there are writers. The problem is that we want writers (like mremap
> > > > operations) to block as long as there is a userfaultfd operation (also
> > > > reader of mmap_changing) going on. Please note that I'm inferring this
> > > > from current implementation.
> > > >
> > > > AFAIU, mmap_changing isn't required for correctness, because all
> > > > operations are happening under the right mode of mmap_lock. It's used
> > > > to ensure that while a non-cooperative operations is happening, if the
> > > > user has asked it to be notified, then no other userfaultfd operations
> > > > should take place until the user gets the event notification.
> > >
> > > I think it is needed, mmap_changing is read before the mmap_lock is
> > > taken, then compared after the mmap_lock is taken (both read mode) to
> > > ensure nothing has changed.
> >
> > mmap_changing is required to ensure that no uffdio operation runs in
> > parallel with operations that modify the memory map, like fork, mremap,
> > munmap and some of madvise calls.
> > And we do need the writers to block if there is an uffdio operation going
> > on, so I think an rwsem is the right way to protect mmap_chaniging.
> >
> > > > > > @@ -783,7 +788,9 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
> > > > > >               return true;
> > > > > >
> > > > > >       userfaultfd_ctx_get(ctx);
> > > > > > +     down_write(&ctx->map_changing_lock);
> > > > > >       atomic_inc(&ctx->mmap_changing);
> > > > > > +     up_write(&ctx->map_changing_lock);
> > > > > >       mmap_read_unlock(mm);
> > > > > >
> > > > > >       msg_init(&ewq.msg);
> > >
> > > If this happens in read mode, then why are you waiting for the readers
> > > to leave?  Can't you just increment the atomic?  It's fine happening in
> > > read mode today, so it should be fine with this new rwsem.
> >
> > It's been a while and the details are blurred now, but if I remember
> > correctly, having this in read mode forced non-cooperative uffd monitor to
> > be single threaded. If a monitor runs, say uffdio_copy, and in parallel a
> > thread in the monitored process does MADV_DONTNEED, the latter will wait
> > for userfaultfd_remove notification to be processed in the monitor and drop
> > the VMA contents only afterwards. If a non-cooperative monitor would
> > process notification in parallel with uffdio ops, MADV_DONTNEED could
> > continue and race with uffdio_copy, so read mode wouldn't be enough.
> >
>
> Right now this function won't stop to wait for readers to exit the
> critical section, but with this change there will be a pause (since the
> down_write() will need to wait for the readers with the read lock).  So
> this is adding a delay in this call path that isn't necessary (?) nor
> existed before.  If you have non-cooperative uffd monitors, then you
> will have to wait for them to finish to mark the uffd as being removed,
> where as before it was a fire & forget, this is now a wait to tell.
>
I think a lot will be clearer once we get a response to my questions
above. IMHO not only this write-lock is needed here, we need to fix
userfaultfd_remove() by splitting it into userfaultfd_remove_prep()
and userfaultfd_remove_complete() (like all other non-cooperative
operations) as well. This patch enables us to do that as we remove
mmap_changing's dependency on mmap_lock for synchronization.
>
> > There was no much sense to make MADV_DONTNEED take mmap_lock in write mode
> > just for this, but now taking the rwsem in write mode here sounds
> > reasonable.
> >
>
> I see why there was no need for a mmap_lock in write mode, but I think
> taking the new rwsem in write mode is unnecessary.
>
> Basically, I see this as a signal to new readers to abort, but we don't
> need to wait for current readers to finish before this one increments
> the atomic.
>
> Unless I missed something, I don't think you want to take the write lock
> here.
What I understood from the history of mmap_changing is that the
intention was to enable informing the uffd monitor about the correct
state of which pages are filled and which aren't. Going through this
thread was very helpful [2]

[2] https://lore.kernel.org/lkml/1527061324-19949-1-git-send-email-rppt@linux.vnet.ibm.com/
>
> Thanks,
> Liam

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v2 3/3] userfaultfd: use per-vma locks in userfaultfd operations
  2024-01-30  2:58         ` Liam R. Howlett
@ 2024-01-31  2:49           ` Lokesh Gidra
  2024-01-31 21:41             ` Liam R. Howlett
  2024-01-31  3:03           ` Suren Baghdasaryan
  1 sibling, 1 reply; 35+ messages in thread
From: Lokesh Gidra @ 2024-01-31  2:49 UTC (permalink / raw)
  To: Liam R. Howlett, Lokesh Gidra, Suren Baghdasaryan, akpm,
	linux-fsdevel, linux-mm, linux-kernel, selinux, kernel-team,
	aarcange, peterx, david, axelrasmussen, bgeffon, willy, jannh,
	kaleshsingh, ngeoffray, timmurray, rppt

On Mon, Jan 29, 2024 at 6:58 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
>
> * Lokesh Gidra <lokeshgidra@google.com> [240129 19:28]:
> > On Mon, Jan 29, 2024 at 12:53 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > >
>
> ...
>
> >
> > Thanks for informing. So vma_lookup() returns the vma for any address
> > within [vma->vm_start, vma->vm_end)?
>
> No.  It returns the vma that contains the address passed.  If there
> isn't one, you will get NULL.  This is why the range check is not
> needed.

This is what we need. IIUC, with vma_lookup() and lock_vma_under_rcu()
the only validation required is

if (vma && vma->vm_end >= dst_start + len)

Thanks for clarifying.
>
> find_vma() walks to the address passed and if it is NULL, it returns a
> vma that has a higher start address than the one passed (or, rarely NULL
> if it runs off the edge).
>
> > > > If you want to search upwards from dst_start for a VMA then you should
> > > > move the range check below into this brace.
> > > >
> > > > > +     }
> > > > > +
> > > > >       /*
> > > > >        * Make sure that the dst range is both valid and fully within a
> > > > >        * single existing vma.
> > > > >        */
> > > > > -     struct vm_area_struct *dst_vma;
> > > > > -
> > > > > -     dst_vma = find_vma(dst_mm, dst_start);
> > > > >       if (!range_in_vma(dst_vma, dst_start, dst_start + len))
> > > > > -             return NULL;
> > > > > +             goto unpin;
> > > > >
> > > > >       /*
> > > > >        * Check the vma is registered in uffd, this is required to
> > > > > @@ -40,9 +59,13 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
> > > > >        * time.
> > > > >        */
> > > > >       if (!dst_vma->vm_userfaultfd_ctx.ctx)
> > > > > -             return NULL;
> > > > > +             goto unpin;
> > > > >
> > > > >       return dst_vma;
> > > > > +
> > > > > +unpin:
> > > > > +     unpin_vma(dst_mm, dst_vma, mmap_locked);
> > > > > +     return NULL;
> > > > >  }
> > > > >
> > > > >  /* Check if dst_addr is outside of file's size. Must be called with ptl held. */
> > > > > @@ -350,7 +373,8 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
> > > > >  #ifdef CONFIG_HUGETLB_PAGE
> > > > >  /*
> > > > >   * mfill_atomic processing for HUGETLB vmas.  Note that this routine is
> > > > > - * called with mmap_lock held, it will release mmap_lock before returning.
> > > > > + * called with either vma-lock or mmap_lock held, it will release the lock
> > > > > + * before returning.
> > > > >   */
> > > > >  static __always_inline ssize_t mfill_atomic_hugetlb(
> > > > >                                             struct userfaultfd_ctx *ctx,
> > > > > @@ -358,7 +382,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > > >                                             unsigned long dst_start,
> > > > >                                             unsigned long src_start,
> > > > >                                             unsigned long len,
> > > > > -                                           uffd_flags_t flags)
> > > > > +                                           uffd_flags_t flags,
> > > > > +                                           bool *mmap_locked)
> > > > >  {
> > > > >       struct mm_struct *dst_mm = dst_vma->vm_mm;
> > > > >       int vm_shared = dst_vma->vm_flags & VM_SHARED;
> > > > > @@ -380,7 +405,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > > >        */
> > > > >       if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
> > > > >               up_read(&ctx->map_changing_lock);
> > > > > -             mmap_read_unlock(dst_mm);
> > > > > +             unpin_vma(dst_mm, dst_vma, mmap_locked);
> > > > >               return -EINVAL;
> > > > >       }
> > > > >
> > > > > @@ -404,12 +429,25 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > > >        */
> > > > >       if (!dst_vma) {
> > > > >               err = -ENOENT;
> > > > > -             dst_vma = find_dst_vma(dst_mm, dst_start, len);
> > > > > -             if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
> > > > > -                     goto out_unlock;
> > > > > +             dst_vma = find_and_pin_dst_vma(dst_mm, dst_start,
> > > > > +                                            len, mmap_locked);
> > > > > +             if (!dst_vma)
> > > > > +                     goto out;
> > > > > +             if (!is_vm_hugetlb_page(dst_vma))
> > > > > +                     goto out_unlock_vma;
> > > > >
> > > > >               err = -EINVAL;
> > > > >               if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
> > > > > +                     goto out_unlock_vma;
> > > > > +
> > > > > +             /*
> > > > > +              * If memory mappings are changing because of non-cooperative
> > > > > +              * operation (e.g. mremap) running in parallel, bail out and
> > > > > +              * request the user to retry later
> > > > > +              */
> > > > > +             down_read(&ctx->map_changing_lock);
> > > > > +             err = -EAGAIN;
> > > > > +             if (atomic_read(&ctx->mmap_changing))
> > > > >                       goto out_unlock;
> > > > >
> > > > >               vm_shared = dst_vma->vm_flags & VM_SHARED;
> > > > > @@ -465,7 +503,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > > >
> > > > >               if (unlikely(err == -ENOENT)) {
> > > > >                       up_read(&ctx->map_changing_lock);
> > > > > -                     mmap_read_unlock(dst_mm);
> > > > > +                     unpin_vma(dst_mm, dst_vma, mmap_locked);
> > > > >                       BUG_ON(!folio);
> > > > >
> > > > >                       err = copy_folio_from_user(folio,
> > > > > @@ -474,17 +512,6 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > > >                               err = -EFAULT;
> > > > >                               goto out;
> > > > >                       }
> > > > > -                     mmap_read_lock(dst_mm);
> > > > > -                     down_read(&ctx->map_changing_lock);
> > > > > -                     /*
> > > > > -                      * If memory mappings are changing because of non-cooperative
> > > > > -                      * operation (e.g. mremap) running in parallel, bail out and
> > > > > -                      * request the user to retry later
> > > > > -                      */
> > > > > -                     if (atomic_read(ctx->mmap_changing)) {
> > > > > -                             err = -EAGAIN;
> > > > > -                             break;
> > > > > -                     }
> > > >
> > > > ... Okay, this is where things get confusing.
> > > >
> > > > How about this: Don't do this locking/boolean dance.
> > > >
> > > > Instead, do something like this:
> > > > In mm/memory.c, below lock_vma_under_rcu(), but something like this
> > > >
> > > > struct vm_area_struct *lock_vma(struct mm_struct *mm,
> > > >         unsigned long addr))    /* or some better name.. */
> > > > {
> > > >         struct vm_area_struct *vma;
> > > >
> > > >         vma = lock_vma_under_rcu(mm, addr);
> > > >
> > > >         if (vma)
> > > >                 return vma;
> > > >
> > > >         mmap_read_lock(mm);
> > > >         vma = lookup_vma(mm, addr);
> > > >         if (vma)
> > > >                 vma_start_read(vma); /* Won't fail */
> > >
> > > Please don't assume vma_start_read() won't fail even when you have
> > > mmap_read_lock(). See the comment in vma_start_read() about the
> > > possibility of an overflow producing false negatives.
> > >
> > > >
> > > >         mmap_read_unlock(mm);
> > > >         return vma;
> > > > }
> > > >
> > > > Now, we know we have a vma that's vma locked if there is a vma.  The vma
> > > > won't go away - you have it locked.  The mmap lock is held for even
> > > > less time for your worse case, and the code gets easier to follow.
> >
> > Your suggestion is definitely simpler and easier to follow, but due to
> > the overflow situation that Suren pointed out, I would still need to
> > keep the locking/boolean dance, no? IIUC, even if I were to return
> > EAGAIN to the userspace, there is no guarantee that subsequent ioctls
> > on the same vma will succeed due to the same overflow, until someone
> > acquires and releases mmap_lock in write-mode.
> > Also, sometimes it seems insufficient whether we managed to lock vma
> > or not. For instance, lock_vma_under_rcu() checks if anon_vma (for
> > anonymous vma) exists. If not then it bails out.
> > So it seems to me that we have to provide some fall back in
> > userfaultfd operations which executes with mmap_lock in read-mode.
>
> Fair enough, what if we didn't use the sequence number and just locked
> the vma directly?

Looks good to me, unless someone else has any objections.
>
> /* This will wait on the vma lock, so once we return it's locked */
> void vma_aquire_read_lock(struct vm_area_struct *vma)
> {
>         mmap_assert_locked(vma->vm_mm);
>         down_read(&vma->vm_lock->lock);
> }
>
> struct vm_area_struct *lock_vma(struct mm_struct *mm,
>         unsigned long addr))    /* or some better name.. */
> {
>         struct vm_area_struct *vma;
>
>         vma = lock_vma_under_rcu(mm, addr);
>         if (vma)
>                 return vma;
>
>         mmap_read_lock(mm);
>         /* mm sequence cannot change, no mm writers anyways.
>          * find_mergeable_anon_vma is only a concern in the page fault
>          * path
>          * start/end won't change under the mmap_lock
>          * vma won't become detached as we have the mmap_lock in read
>          * We are now sure no writes will change the VMA
>          * So let's make sure no other context is isolating the vma
>          */
>         vma = lookup_vma(mm, addr);
>         if (vma)
We can take care of anon_vma as well here right? I can take a bool
parameter ('prepare_anon' or something) and then:

           if (vma) {
                    if (prepare_anon && vma_is_anonymous(vma)) &&
!anon_vma_prepare(vma)) {
                                      vma = ERR_PTR(-ENOMEM);
                                      goto out_unlock;
                   }
>                 vma_aquire_read_lock(vma);
           }
out_unlock:
>         mmap_read_unlock(mm);
>         return vma;
> }
>
> I'm betting that avoiding the mmap_lock most of the time is good, but
> then holding it just to lock the vma will have extremely rare collisions
> - and they will be short lived.
>
> This would allow us to simplify your code.

Agreed! Thanks for the suggestion.
>
> > > >
> > > > Once you are done with the vma do a vma_end_read(vma).  Don't forget to
> > > > do this!
> > > >
> > > > Now the comment above such a function should state that the vma needs to
> > > > be vma_end_read(vma), or that could go undetected..  It might be worth
> > > > adding a unlock_vma() counterpart to vma_end_read(vma) even.
> > >
> > > Locking VMA while holding mmap_read_lock is an interesting usage
> > > pattern I haven't seen yet. I think this should work quite well!
> > >
> > > >
> > > >
> > > > >
> > > > >                       dst_vma = NULL;
> > > > >                       goto retry;
> > > > > @@ -505,7 +532,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > > >
> > > > >  out_unlock:
> > > > >       up_read(&ctx->map_changing_lock);
> > > > > -     mmap_read_unlock(dst_mm);
> > > > > +out_unlock_vma:
> > > > > +     unpin_vma(dst_mm, dst_vma, mmap_locked);
> > > > >  out:
> > > > >       if (folio)
> > > > >               folio_put(folio);
> > > > > @@ -521,7 +549,8 @@ extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
> > > > >                                   unsigned long dst_start,
> > > > >                                   unsigned long src_start,
> > > > >                                   unsigned long len,
> > > > > -                                 uffd_flags_t flags);
> > > > > +                                 uffd_flags_t flags,
> > > > > +                                 bool *mmap_locked);
> > > >
> > > > Just a thought, tabbing in twice for each argument would make this more
> > > > compact.
> > > >
> > > >
> > > > >  #endif /* CONFIG_HUGETLB_PAGE */
> > > > >
> > > > >  static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
> > > > > @@ -581,6 +610,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > > >       unsigned long src_addr, dst_addr;
> > > > >       long copied;
> > > > >       struct folio *folio;
> > > > > +     bool mmap_locked = false;
> > > > >
> > > > >       /*
> > > > >        * Sanitize the command parameters:
> > > > > @@ -597,7 +627,14 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > > >       copied = 0;
> > > > >       folio = NULL;
> > > > >  retry:
> > > > > -     mmap_read_lock(dst_mm);
> > > > > +     /*
> > > > > +      * Make sure the vma is not shared, that the dst range is
> > > > > +      * both valid and fully within a single existing vma.
> > > > > +      */
> > > > > +     err = -ENOENT;
> > > > > +     dst_vma = find_and_pin_dst_vma(dst_mm, dst_start, len, &mmap_locked);
> > > > > +     if (!dst_vma)
> > > > > +             goto out;
> > > > >
> > > > >       /*
> > > > >        * If memory mappings are changing because of non-cooperative
> > > > > @@ -609,15 +646,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > > >       if (atomic_read(&ctx->mmap_changing))
> > > > >               goto out_unlock;
> > > > >
> > > > > -     /*
> > > > > -      * Make sure the vma is not shared, that the dst range is
> > > > > -      * both valid and fully within a single existing vma.
> > > > > -      */
> > > > > -     err = -ENOENT;
> > > > > -     dst_vma = find_dst_vma(dst_mm, dst_start, len);
> > > > > -     if (!dst_vma)
> > > > > -             goto out_unlock;
> > > > > -
> > > > >       err = -EINVAL;
> > > > >       /*
> > > > >        * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
> > > > > @@ -638,8 +666,8 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > > >        * If this is a HUGETLB vma, pass off to appropriate routine
> > > > >        */
> > > > >       if (is_vm_hugetlb_page(dst_vma))
> > > > > -             return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
> > > > > -                                          src_start, len, flags);
> > > > > +             return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start, src_start
> > > > > +                                          len, flags, &mmap_locked);
> > > > >
> > > > >       if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
> > > > >               goto out_unlock;
> > > > > @@ -699,7 +727,8 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > > >                       void *kaddr;
> > > > >
> > > > >                       up_read(&ctx->map_changing_lock);
> > > > > -                     mmap_read_unlock(dst_mm);
> > > > > +                     unpin_vma(dst_mm, dst_vma, &mmap_locked);
> > > > > +
> > > > >                       BUG_ON(!folio);
> > > > >
> > > > >                       kaddr = kmap_local_folio(folio, 0);
> > > > > @@ -730,7 +759,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > > >
> > > > >  out_unlock:
> > > > >       up_read(&ctx->map_changing_lock);
> > > > > -     mmap_read_unlock(dst_mm);
> > > > > +     unpin_vma(dst_mm, dst_vma, &mmap_locked);
> > > > >  out:
> > > > >       if (folio)
> > > > >               folio_put(folio);
> > > > > @@ -1285,8 +1314,6 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
> > > > >   * @len: length of the virtual memory range
> > > > >   * @mode: flags from uffdio_move.mode
> > > > >   *
> > > > > - * Must be called with mmap_lock held for read.
> > > > > - *
> > > > >   * move_pages() remaps arbitrary anonymous pages atomically in zero
> > > > >   * copy. It only works on non shared anonymous pages because those can
> > > > >   * be relocated without generating non linear anon_vmas in the rmap
> > > > > @@ -1353,15 +1380,16 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
> > > > >   * could be obtained. This is the only additional complexity added to
> > > > >   * the rmap code to provide this anonymous page remapping functionality.
> > > > >   */
> > > > > -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > > > > -                unsigned long dst_start, unsigned long src_start,
> > > > > -                unsigned long len, __u64 mode)
> > > > > +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
> > > > > +                unsigned long src_start, unsigned long len, __u64 mode)
> > > > >  {
> > > > > +     struct mm_struct *mm = ctx->mm;
> > > > >       struct vm_area_struct *src_vma, *dst_vma;
> > > > >       unsigned long src_addr, dst_addr;
> > > > >       pmd_t *src_pmd, *dst_pmd;
> > > > >       long err = -EINVAL;
> > > > >       ssize_t moved = 0;
> > > > > +     bool mmap_locked = false;
> > > > >
> > > > >       /* Sanitize the command parameters. */
> > > > >       if (WARN_ON_ONCE(src_start & ~PAGE_MASK) ||
> > > > > @@ -1374,28 +1402,52 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > > > >           WARN_ON_ONCE(dst_start + len <= dst_start))
> > > > >               goto out;
> > > >
> > > > Ah, is this safe for rmap?  I think you need to leave this read lock.
> > > >
> > I didn't fully understand you here.
>
> Sorry, I'm confused on how your locking scheme avoids rmap from trying
> to use the VMA with the atomic increment part.
>
> > > > >
> > > > > +     dst_vma = NULL;
> > > > > +     src_vma = lock_vma_under_rcu(mm, src_start);
> > > > > +     if (src_vma) {
> > > > > +             dst_vma = lock_vma_under_rcu(mm, dst_start);
> > > > > +             if (!dst_vma)
> > > > > +                     vma_end_read(src_vma);
> > > > > +     }
> > > > > +
> > > > > +     /* If we failed to lock both VMAs, fall back to mmap_lock */
> > > > > +     if (!dst_vma) {
> > > > > +             mmap_read_lock(mm);
> > > > > +             mmap_locked = true;
> > > > > +             src_vma = find_vma(mm, src_start);
> > > > > +             if (!src_vma)
> > > > > +                     goto out_unlock_mmap;
> > > > > +             dst_vma = find_vma(mm, dst_start);
> > > >
> > > > Again, there is a difference in how find_vma and lock_vam_under_rcu
> > > > works.
> >
> > Sure, I'll use vma_lookup() instead of find_vma().
>
> Be sure it fits with what you are doing, I'm not entire sure it's right
> to switch.  If it is not right then I don't think you can use
> lock_vma_under_rcu() - but we can work around that too.
>
> > > >
> > > > > +             if (!dst_vma)
> > > > > +                     goto out_unlock_mmap;
> > > > > +     }
> > > > > +
> > > > > +     /* Re-check after taking map_changing_lock */
> > > > > +     down_read(&ctx->map_changing_lock);
> > > > > +     if (likely(atomic_read(&ctx->mmap_changing))) {
> > > > > +             err = -EAGAIN;
> > > > > +             goto out_unlock;
> > > > > +     }
> > > > >       /*
> > > > >        * Make sure the vma is not shared, that the src and dst remap
> > > > >        * ranges are both valid and fully within a single existing
> > > > >        * vma.
> > > > >        */
> > > > > -     src_vma = find_vma(mm, src_start);
> > > > > -     if (!src_vma || (src_vma->vm_flags & VM_SHARED))
> > > > > -             goto out;
> > > > > +     if (src_vma->vm_flags & VM_SHARED)
> > > > > +             goto out_unlock;
> > > > >       if (src_start < src_vma->vm_start ||
> > > > >           src_start + len > src_vma->vm_end)
> > > > > -             goto out;
> > > > > +             goto out_unlock;
> > > > >
> > > > > -     dst_vma = find_vma(mm, dst_start);
> > > > > -     if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
> > > > > -             goto out;
> > > > > +     if (dst_vma->vm_flags & VM_SHARED)
> > > > > +             goto out_unlock;
> > > > >       if (dst_start < dst_vma->vm_start ||
> > > > >           dst_start + len > dst_vma->vm_end)
> > > > > -             goto out;
> > > > > +             goto out_unlock;
> > > > >
> > > > >       err = validate_move_areas(ctx, src_vma, dst_vma);
> > > > >       if (err)
> > > > > -             goto out;
> > > > > +             goto out_unlock;
> > > > >
> > > > >       for (src_addr = src_start, dst_addr = dst_start;
> > > > >            src_addr < src_start + len;) {
> > > > > @@ -1512,6 +1564,15 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > > > >               moved += step_size;
> > > > >       }
> > > > >
> > > > > +out_unlock:
> > > > > +     up_read(&ctx->map_changing_lock);
> > > > > +out_unlock_mmap:
> > > > > +     if (mmap_locked)
> > > > > +             mmap_read_unlock(mm);
> > > > > +     else {
> > > > > +             vma_end_read(dst_vma);
> > > > > +             vma_end_read(src_vma);
> > > > > +     }
> > > > >  out:
> > > > >       VM_WARN_ON(moved < 0);
> > > > >       VM_WARN_ON(err > 0);
> > > > > --
> > > > > 2.43.0.429.g432eaa2c6b-goog
> > > > >
> > > > >

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v2 3/3] userfaultfd: use per-vma locks in userfaultfd operations
  2024-01-30  2:58         ` Liam R. Howlett
  2024-01-31  2:49           ` Lokesh Gidra
@ 2024-01-31  3:03           ` Suren Baghdasaryan
  2024-01-31 21:43             ` Liam R. Howlett
  1 sibling, 1 reply; 35+ messages in thread
From: Suren Baghdasaryan @ 2024-01-31  3:03 UTC (permalink / raw)
  To: Liam R. Howlett, Lokesh Gidra, Suren Baghdasaryan, akpm,
	linux-fsdevel, linux-mm, linux-kernel, selinux, kernel-team,
	aarcange, peterx, david, axelrasmussen, bgeffon, willy, jannh,
	kaleshsingh, ngeoffray, timmurray, rppt

On Mon, Jan 29, 2024 at 6:58 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
>
> * Lokesh Gidra <lokeshgidra@google.com> [240129 19:28]:
> > On Mon, Jan 29, 2024 at 12:53 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > >
>
> ...
>
> >
> > Thanks for informing. So vma_lookup() returns the vma for any address
> > within [vma->vm_start, vma->vm_end)?
>
> No.  It returns the vma that contains the address passed.  If there
> isn't one, you will get NULL.  This is why the range check is not
> needed.
>
> find_vma() walks to the address passed and if it is NULL, it returns a
> vma that has a higher start address than the one passed (or, rarely NULL
> if it runs off the edge).
>
> > > > If you want to search upwards from dst_start for a VMA then you should
> > > > move the range check below into this brace.
> > > >
> > > > > +     }
> > > > > +
> > > > >       /*
> > > > >        * Make sure that the dst range is both valid and fully within a
> > > > >        * single existing vma.
> > > > >        */
> > > > > -     struct vm_area_struct *dst_vma;
> > > > > -
> > > > > -     dst_vma = find_vma(dst_mm, dst_start);
> > > > >       if (!range_in_vma(dst_vma, dst_start, dst_start + len))
> > > > > -             return NULL;
> > > > > +             goto unpin;
> > > > >
> > > > >       /*
> > > > >        * Check the vma is registered in uffd, this is required to
> > > > > @@ -40,9 +59,13 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
> > > > >        * time.
> > > > >        */
> > > > >       if (!dst_vma->vm_userfaultfd_ctx.ctx)
> > > > > -             return NULL;
> > > > > +             goto unpin;
> > > > >
> > > > >       return dst_vma;
> > > > > +
> > > > > +unpin:
> > > > > +     unpin_vma(dst_mm, dst_vma, mmap_locked);
> > > > > +     return NULL;
> > > > >  }
> > > > >
> > > > >  /* Check if dst_addr is outside of file's size. Must be called with ptl held. */
> > > > > @@ -350,7 +373,8 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
> > > > >  #ifdef CONFIG_HUGETLB_PAGE
> > > > >  /*
> > > > >   * mfill_atomic processing for HUGETLB vmas.  Note that this routine is
> > > > > - * called with mmap_lock held, it will release mmap_lock before returning.
> > > > > + * called with either vma-lock or mmap_lock held, it will release the lock
> > > > > + * before returning.
> > > > >   */
> > > > >  static __always_inline ssize_t mfill_atomic_hugetlb(
> > > > >                                             struct userfaultfd_ctx *ctx,
> > > > > @@ -358,7 +382,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > > >                                             unsigned long dst_start,
> > > > >                                             unsigned long src_start,
> > > > >                                             unsigned long len,
> > > > > -                                           uffd_flags_t flags)
> > > > > +                                           uffd_flags_t flags,
> > > > > +                                           bool *mmap_locked)
> > > > >  {
> > > > >       struct mm_struct *dst_mm = dst_vma->vm_mm;
> > > > >       int vm_shared = dst_vma->vm_flags & VM_SHARED;
> > > > > @@ -380,7 +405,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > > >        */
> > > > >       if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
> > > > >               up_read(&ctx->map_changing_lock);
> > > > > -             mmap_read_unlock(dst_mm);
> > > > > +             unpin_vma(dst_mm, dst_vma, mmap_locked);
> > > > >               return -EINVAL;
> > > > >       }
> > > > >
> > > > > @@ -404,12 +429,25 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > > >        */
> > > > >       if (!dst_vma) {
> > > > >               err = -ENOENT;
> > > > > -             dst_vma = find_dst_vma(dst_mm, dst_start, len);
> > > > > -             if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
> > > > > -                     goto out_unlock;
> > > > > +             dst_vma = find_and_pin_dst_vma(dst_mm, dst_start,
> > > > > +                                            len, mmap_locked);
> > > > > +             if (!dst_vma)
> > > > > +                     goto out;
> > > > > +             if (!is_vm_hugetlb_page(dst_vma))
> > > > > +                     goto out_unlock_vma;
> > > > >
> > > > >               err = -EINVAL;
> > > > >               if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
> > > > > +                     goto out_unlock_vma;
> > > > > +
> > > > > +             /*
> > > > > +              * If memory mappings are changing because of non-cooperative
> > > > > +              * operation (e.g. mremap) running in parallel, bail out and
> > > > > +              * request the user to retry later
> > > > > +              */
> > > > > +             down_read(&ctx->map_changing_lock);
> > > > > +             err = -EAGAIN;
> > > > > +             if (atomic_read(&ctx->mmap_changing))
> > > > >                       goto out_unlock;
> > > > >
> > > > >               vm_shared = dst_vma->vm_flags & VM_SHARED;
> > > > > @@ -465,7 +503,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > > >
> > > > >               if (unlikely(err == -ENOENT)) {
> > > > >                       up_read(&ctx->map_changing_lock);
> > > > > -                     mmap_read_unlock(dst_mm);
> > > > > +                     unpin_vma(dst_mm, dst_vma, mmap_locked);
> > > > >                       BUG_ON(!folio);
> > > > >
> > > > >                       err = copy_folio_from_user(folio,
> > > > > @@ -474,17 +512,6 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > > >                               err = -EFAULT;
> > > > >                               goto out;
> > > > >                       }
> > > > > -                     mmap_read_lock(dst_mm);
> > > > > -                     down_read(&ctx->map_changing_lock);
> > > > > -                     /*
> > > > > -                      * If memory mappings are changing because of non-cooperative
> > > > > -                      * operation (e.g. mremap) running in parallel, bail out and
> > > > > -                      * request the user to retry later
> > > > > -                      */
> > > > > -                     if (atomic_read(ctx->mmap_changing)) {
> > > > > -                             err = -EAGAIN;
> > > > > -                             break;
> > > > > -                     }
> > > >
> > > > ... Okay, this is where things get confusing.
> > > >
> > > > How about this: Don't do this locking/boolean dance.
> > > >
> > > > Instead, do something like this:
> > > > In mm/memory.c, below lock_vma_under_rcu(), but something like this
> > > >
> > > > struct vm_area_struct *lock_vma(struct mm_struct *mm,
> > > >         unsigned long addr))    /* or some better name.. */
> > > > {
> > > >         struct vm_area_struct *vma;
> > > >
> > > >         vma = lock_vma_under_rcu(mm, addr);
> > > >
> > > >         if (vma)
> > > >                 return vma;
> > > >
> > > >         mmap_read_lock(mm);
> > > >         vma = lookup_vma(mm, addr);
> > > >         if (vma)
> > > >                 vma_start_read(vma); /* Won't fail */
> > >
> > > Please don't assume vma_start_read() won't fail even when you have
> > > mmap_read_lock(). See the comment in vma_start_read() about the
> > > possibility of an overflow producing false negatives.
> > >
> > > >
> > > >         mmap_read_unlock(mm);
> > > >         return vma;
> > > > }
> > > >
> > > > Now, we know we have a vma that's vma locked if there is a vma.  The vma
> > > > won't go away - you have it locked.  The mmap lock is held for even
> > > > less time for your worse case, and the code gets easier to follow.
> >
> > Your suggestion is definitely simpler and easier to follow, but due to
> > the overflow situation that Suren pointed out, I would still need to
> > keep the locking/boolean dance, no? IIUC, even if I were to return
> > EAGAIN to the userspace, there is no guarantee that subsequent ioctls
> > on the same vma will succeed due to the same overflow, until someone
> > acquires and releases mmap_lock in write-mode.
> > Also, sometimes it seems insufficient whether we managed to lock vma
> > or not. For instance, lock_vma_under_rcu() checks if anon_vma (for
> > anonymous vma) exists. If not then it bails out.
> > So it seems to me that we have to provide some fall back in
> > userfaultfd operations which executes with mmap_lock in read-mode.
>
> Fair enough, what if we didn't use the sequence number and just locked
> the vma directly?
>
> /* This will wait on the vma lock, so once we return it's locked */
> void vma_aquire_read_lock(struct vm_area_struct *vma)
> {
>         mmap_assert_locked(vma->vm_mm);
>         down_read(&vma->vm_lock->lock);
> }
>
> struct vm_area_struct *lock_vma(struct mm_struct *mm,
>         unsigned long addr))    /* or some better name.. */
> {
>         struct vm_area_struct *vma;
>
>         vma = lock_vma_under_rcu(mm, addr);
>         if (vma)
>                 return vma;
>
>         mmap_read_lock(mm);
>         /* mm sequence cannot change, no mm writers anyways.
>          * find_mergeable_anon_vma is only a concern in the page fault
>          * path
>          * start/end won't change under the mmap_lock
>          * vma won't become detached as we have the mmap_lock in read
>          * We are now sure no writes will change the VMA
>          * So let's make sure no other context is isolating the vma
>          */
>         vma = lookup_vma(mm, addr);
>         if (vma)
>                 vma_aquire_read_lock(vma);
>
>         mmap_read_unlock(mm);
>         return vma;
> }
>
> I'm betting that avoiding the mmap_lock most of the time is good, but
> then holding it just to lock the vma will have extremely rare collisions
> - and they will be short lived.
>
> This would allow us to simplify your code.
>
> > > >
> > > > Once you are done with the vma do a vma_end_read(vma).  Don't forget to
> > > > do this!
> > > >
> > > > Now the comment above such a function should state that the vma needs to
> > > > be vma_end_read(vma), or that could go undetected..  It might be worth
> > > > adding a unlock_vma() counterpart to vma_end_read(vma) even.
> > >
> > > Locking VMA while holding mmap_read_lock is an interesting usage
> > > pattern I haven't seen yet. I think this should work quite well!
> > >
> > > >
> > > >
> > > > >
> > > > >                       dst_vma = NULL;
> > > > >                       goto retry;
> > > > > @@ -505,7 +532,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > > >
> > > > >  out_unlock:
> > > > >       up_read(&ctx->map_changing_lock);
> > > > > -     mmap_read_unlock(dst_mm);
> > > > > +out_unlock_vma:
> > > > > +     unpin_vma(dst_mm, dst_vma, mmap_locked);
> > > > >  out:
> > > > >       if (folio)
> > > > >               folio_put(folio);
> > > > > @@ -521,7 +549,8 @@ extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
> > > > >                                   unsigned long dst_start,
> > > > >                                   unsigned long src_start,
> > > > >                                   unsigned long len,
> > > > > -                                 uffd_flags_t flags);
> > > > > +                                 uffd_flags_t flags,
> > > > > +                                 bool *mmap_locked);
> > > >
> > > > Just a thought, tabbing in twice for each argument would make this more
> > > > compact.
> > > >
> > > >
> > > > >  #endif /* CONFIG_HUGETLB_PAGE */
> > > > >
> > > > >  static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
> > > > > @@ -581,6 +610,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > > >       unsigned long src_addr, dst_addr;
> > > > >       long copied;
> > > > >       struct folio *folio;
> > > > > +     bool mmap_locked = false;
> > > > >
> > > > >       /*
> > > > >        * Sanitize the command parameters:
> > > > > @@ -597,7 +627,14 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > > >       copied = 0;
> > > > >       folio = NULL;
> > > > >  retry:
> > > > > -     mmap_read_lock(dst_mm);
> > > > > +     /*
> > > > > +      * Make sure the vma is not shared, that the dst range is
> > > > > +      * both valid and fully within a single existing vma.
> > > > > +      */
> > > > > +     err = -ENOENT;
> > > > > +     dst_vma = find_and_pin_dst_vma(dst_mm, dst_start, len, &mmap_locked);
> > > > > +     if (!dst_vma)
> > > > > +             goto out;
> > > > >
> > > > >       /*
> > > > >        * If memory mappings are changing because of non-cooperative
> > > > > @@ -609,15 +646,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > > >       if (atomic_read(&ctx->mmap_changing))
> > > > >               goto out_unlock;
> > > > >
> > > > > -     /*
> > > > > -      * Make sure the vma is not shared, that the dst range is
> > > > > -      * both valid and fully within a single existing vma.
> > > > > -      */
> > > > > -     err = -ENOENT;
> > > > > -     dst_vma = find_dst_vma(dst_mm, dst_start, len);
> > > > > -     if (!dst_vma)
> > > > > -             goto out_unlock;
> > > > > -
> > > > >       err = -EINVAL;
> > > > >       /*
> > > > >        * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
> > > > > @@ -638,8 +666,8 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > > >        * If this is a HUGETLB vma, pass off to appropriate routine
> > > > >        */
> > > > >       if (is_vm_hugetlb_page(dst_vma))
> > > > > -             return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
> > > > > -                                          src_start, len, flags);
> > > > > +             return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start, src_start
> > > > > +                                          len, flags, &mmap_locked);
> > > > >
> > > > >       if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
> > > > >               goto out_unlock;
> > > > > @@ -699,7 +727,8 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > > >                       void *kaddr;
> > > > >
> > > > >                       up_read(&ctx->map_changing_lock);
> > > > > -                     mmap_read_unlock(dst_mm);
> > > > > +                     unpin_vma(dst_mm, dst_vma, &mmap_locked);
> > > > > +
> > > > >                       BUG_ON(!folio);
> > > > >
> > > > >                       kaddr = kmap_local_folio(folio, 0);
> > > > > @@ -730,7 +759,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > > >
> > > > >  out_unlock:
> > > > >       up_read(&ctx->map_changing_lock);
> > > > > -     mmap_read_unlock(dst_mm);
> > > > > +     unpin_vma(dst_mm, dst_vma, &mmap_locked);
> > > > >  out:
> > > > >       if (folio)
> > > > >               folio_put(folio);
> > > > > @@ -1285,8 +1314,6 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
> > > > >   * @len: length of the virtual memory range
> > > > >   * @mode: flags from uffdio_move.mode
> > > > >   *
> > > > > - * Must be called with mmap_lock held for read.
> > > > > - *
> > > > >   * move_pages() remaps arbitrary anonymous pages atomically in zero
> > > > >   * copy. It only works on non shared anonymous pages because those can
> > > > >   * be relocated without generating non linear anon_vmas in the rmap
> > > > > @@ -1353,15 +1380,16 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
> > > > >   * could be obtained. This is the only additional complexity added to
> > > > >   * the rmap code to provide this anonymous page remapping functionality.
> > > > >   */
> > > > > -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > > > > -                unsigned long dst_start, unsigned long src_start,
> > > > > -                unsigned long len, __u64 mode)
> > > > > +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
> > > > > +                unsigned long src_start, unsigned long len, __u64 mode)
> > > > >  {
> > > > > +     struct mm_struct *mm = ctx->mm;
> > > > >       struct vm_area_struct *src_vma, *dst_vma;
> > > > >       unsigned long src_addr, dst_addr;
> > > > >       pmd_t *src_pmd, *dst_pmd;
> > > > >       long err = -EINVAL;
> > > > >       ssize_t moved = 0;
> > > > > +     bool mmap_locked = false;
> > > > >
> > > > >       /* Sanitize the command parameters. */
> > > > >       if (WARN_ON_ONCE(src_start & ~PAGE_MASK) ||
> > > > > @@ -1374,28 +1402,52 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > > > >           WARN_ON_ONCE(dst_start + len <= dst_start))
> > > > >               goto out;
> > > >
> > > > Ah, is this safe for rmap?  I think you need to leave this read lock.
> > > >
> > I didn't fully understand you here.
>
> Sorry, I'm confused on how your locking scheme avoids rmap from trying
> to use the VMA with the atomic increment part.

I'm also a bit confused. Which atomic increment are you referring to?
AFAIU move_pages() will lock both src_vma and dst_vma, so even if rmap
finds them it can't modify them, no?

>
> > > > >
> > > > > +     dst_vma = NULL;
> > > > > +     src_vma = lock_vma_under_rcu(mm, src_start);
> > > > > +     if (src_vma) {
> > > > > +             dst_vma = lock_vma_under_rcu(mm, dst_start);
> > > > > +             if (!dst_vma)
> > > > > +                     vma_end_read(src_vma);
> > > > > +     }
> > > > > +
> > > > > +     /* If we failed to lock both VMAs, fall back to mmap_lock */
> > > > > +     if (!dst_vma) {
> > > > > +             mmap_read_lock(mm);
> > > > > +             mmap_locked = true;
> > > > > +             src_vma = find_vma(mm, src_start);
> > > > > +             if (!src_vma)
> > > > > +                     goto out_unlock_mmap;
> > > > > +             dst_vma = find_vma(mm, dst_start);
> > > >
> > > > Again, there is a difference in how find_vma and lock_vam_under_rcu
> > > > works.
> >
> > Sure, I'll use vma_lookup() instead of find_vma().
>
> Be sure it fits with what you are doing, I'm not entire sure it's right
> to switch.  If it is not right then I don't think you can use
> lock_vma_under_rcu() - but we can work around that too.
>
> > > >
> > > > > +             if (!dst_vma)
> > > > > +                     goto out_unlock_mmap;
> > > > > +     }
> > > > > +
> > > > > +     /* Re-check after taking map_changing_lock */
> > > > > +     down_read(&ctx->map_changing_lock);
> > > > > +     if (likely(atomic_read(&ctx->mmap_changing))) {
> > > > > +             err = -EAGAIN;
> > > > > +             goto out_unlock;
> > > > > +     }
> > > > >       /*
> > > > >        * Make sure the vma is not shared, that the src and dst remap
> > > > >        * ranges are both valid and fully within a single existing
> > > > >        * vma.
> > > > >        */
> > > > > -     src_vma = find_vma(mm, src_start);
> > > > > -     if (!src_vma || (src_vma->vm_flags & VM_SHARED))
> > > > > -             goto out;
> > > > > +     if (src_vma->vm_flags & VM_SHARED)
> > > > > +             goto out_unlock;
> > > > >       if (src_start < src_vma->vm_start ||
> > > > >           src_start + len > src_vma->vm_end)
> > > > > -             goto out;
> > > > > +             goto out_unlock;
> > > > >
> > > > > -     dst_vma = find_vma(mm, dst_start);
> > > > > -     if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
> > > > > -             goto out;
> > > > > +     if (dst_vma->vm_flags & VM_SHARED)
> > > > > +             goto out_unlock;
> > > > >       if (dst_start < dst_vma->vm_start ||
> > > > >           dst_start + len > dst_vma->vm_end)
> > > > > -             goto out;
> > > > > +             goto out_unlock;
> > > > >
> > > > >       err = validate_move_areas(ctx, src_vma, dst_vma);
> > > > >       if (err)
> > > > > -             goto out;
> > > > > +             goto out_unlock;
> > > > >
> > > > >       for (src_addr = src_start, dst_addr = dst_start;
> > > > >            src_addr < src_start + len;) {
> > > > > @@ -1512,6 +1564,15 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > > > >               moved += step_size;
> > > > >       }
> > > > >
> > > > > +out_unlock:
> > > > > +     up_read(&ctx->map_changing_lock);
> > > > > +out_unlock_mmap:
> > > > > +     if (mmap_locked)
> > > > > +             mmap_read_unlock(mm);
> > > > > +     else {
> > > > > +             vma_end_read(dst_vma);
> > > > > +             vma_end_read(src_vma);
> > > > > +     }
> > > > >  out:
> > > > >       VM_WARN_ON(moved < 0);
> > > > >       VM_WARN_ON(err > 0);
> > > > > --
> > > > > 2.43.0.429.g432eaa2c6b-goog
> > > > >
> > > > >
>
> --
> To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com.
>

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v2 3/3] userfaultfd: use per-vma locks in userfaultfd operations
  2024-01-31  2:49           ` Lokesh Gidra
@ 2024-01-31 21:41             ` Liam R. Howlett
  2024-02-05 21:46               ` Suren Baghdasaryan
  0 siblings, 1 reply; 35+ messages in thread
From: Liam R. Howlett @ 2024-01-31 21:41 UTC (permalink / raw)
  To: Lokesh Gidra
  Cc: Suren Baghdasaryan, akpm, linux-fsdevel, linux-mm, linux-kernel,
	selinux, kernel-team, aarcange, peterx, david, axelrasmussen,
	bgeffon, willy, jannh, kaleshsingh, ngeoffray, timmurray, rppt

* Lokesh Gidra <lokeshgidra@google.com> [240130 21:49]:
> On Mon, Jan 29, 2024 at 6:58 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> >
> > * Lokesh Gidra <lokeshgidra@google.com> [240129 19:28]:
> > > On Mon, Jan 29, 2024 at 12:53 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > > >
> >

...

> >
> > > Your suggestion is definitely simpler and easier to follow, but due to
> > > the overflow situation that Suren pointed out, I would still need to
> > > keep the locking/boolean dance, no? IIUC, even if I were to return
> > > EAGAIN to the userspace, there is no guarantee that subsequent ioctls
> > > on the same vma will succeed due to the same overflow, until someone
> > > acquires and releases mmap_lock in write-mode.
> > > Also, sometimes it seems insufficient whether we managed to lock vma
> > > or not. For instance, lock_vma_under_rcu() checks if anon_vma (for
> > > anonymous vma) exists. If not then it bails out.
> > > So it seems to me that we have to provide some fall back in
> > > userfaultfd operations which executes with mmap_lock in read-mode.
> >
> > Fair enough, what if we didn't use the sequence number and just locked
> > the vma directly?
> 
> Looks good to me, unless someone else has any objections.
> >
> > /* This will wait on the vma lock, so once we return it's locked */
> > void vma_aquire_read_lock(struct vm_area_struct *vma)
> > {
> >         mmap_assert_locked(vma->vm_mm);
> >         down_read(&vma->vm_lock->lock);
> > }
> >
> > struct vm_area_struct *lock_vma(struct mm_struct *mm,
> >         unsigned long addr))    /* or some better name.. */
> > {
> >         struct vm_area_struct *vma;
> >
> >         vma = lock_vma_under_rcu(mm, addr);
> >         if (vma)
> >                 return vma;
> >
> >         mmap_read_lock(mm);
> >         /* mm sequence cannot change, no mm writers anyways.
> >          * find_mergeable_anon_vma is only a concern in the page fault
> >          * path
> >          * start/end won't change under the mmap_lock
> >          * vma won't become detached as we have the mmap_lock in read
> >          * We are now sure no writes will change the VMA
> >          * So let's make sure no other context is isolating the vma
> >          */
> >         vma = lookup_vma(mm, addr);
> >         if (vma)
> We can take care of anon_vma as well here right? I can take a bool
> parameter ('prepare_anon' or something) and then:
> 
>            if (vma) {
>                     if (prepare_anon && vma_is_anonymous(vma)) &&
> !anon_vma_prepare(vma)) {
>                                       vma = ERR_PTR(-ENOMEM);
>                                       goto out_unlock;
>                    }
> >                 vma_aquire_read_lock(vma);
>            }
> out_unlock:
> >         mmap_read_unlock(mm);
> >         return vma;
> > }

Do you need this?  I didn't think this was happening in the code as
written?  If you need it I would suggest making it happen always and
ditch the flag until a user needs this variant, but document what's
going on in here or even have a better name.

Thanks,
Liam

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v2 3/3] userfaultfd: use per-vma locks in userfaultfd operations
  2024-01-31  3:03           ` Suren Baghdasaryan
@ 2024-01-31 21:43             ` Liam R. Howlett
  0 siblings, 0 replies; 35+ messages in thread
From: Liam R. Howlett @ 2024-01-31 21:43 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: Lokesh Gidra, akpm, linux-fsdevel, linux-mm, linux-kernel,
	selinux, kernel-team, aarcange, peterx, david, axelrasmussen,
	bgeffon, willy, jannh, kaleshsingh, ngeoffray, timmurray, rppt

* Suren Baghdasaryan <surenb@google.com> [240130 22:03]:
> On Mon, Jan 29, 2024 at 6:58 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:

...

> > > > > > @@ -730,7 +759,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > > > >
> > > > > >  out_unlock:
> > > > > >       up_read(&ctx->map_changing_lock);
> > > > > > -     mmap_read_unlock(dst_mm);
> > > > > > +     unpin_vma(dst_mm, dst_vma, &mmap_locked);
> > > > > >  out:
> > > > > >       if (folio)
> > > > > >               folio_put(folio);
> > > > > > @@ -1285,8 +1314,6 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
> > > > > >   * @len: length of the virtual memory range
> > > > > >   * @mode: flags from uffdio_move.mode
> > > > > >   *
> > > > > > - * Must be called with mmap_lock held for read.
> > > > > > - *
> > > > > >   * move_pages() remaps arbitrary anonymous pages atomically in zero
> > > > > >   * copy. It only works on non shared anonymous pages because those can
> > > > > >   * be relocated without generating non linear anon_vmas in the rmap
> > > > > > @@ -1353,15 +1380,16 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
> > > > > >   * could be obtained. This is the only additional complexity added to
> > > > > >   * the rmap code to provide this anonymous page remapping functionality.
> > > > > >   */
> > > > > > -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > > > > > -                unsigned long dst_start, unsigned long src_start,
> > > > > > -                unsigned long len, __u64 mode)
> > > > > > +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
> > > > > > +                unsigned long src_start, unsigned long len, __u64 mode)
> > > > > >  {
> > > > > > +     struct mm_struct *mm = ctx->mm;
> > > > > >       struct vm_area_struct *src_vma, *dst_vma;
> > > > > >       unsigned long src_addr, dst_addr;
> > > > > >       pmd_t *src_pmd, *dst_pmd;
> > > > > >       long err = -EINVAL;
> > > > > >       ssize_t moved = 0;
> > > > > > +     bool mmap_locked = false;
> > > > > >
> > > > > >       /* Sanitize the command parameters. */
> > > > > >       if (WARN_ON_ONCE(src_start & ~PAGE_MASK) ||
> > > > > > @@ -1374,28 +1402,52 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > > > > >           WARN_ON_ONCE(dst_start + len <= dst_start))
> > > > > >               goto out;
> > > > >
> > > > > Ah, is this safe for rmap?  I think you need to leave this read lock.
> > > > >
> > > I didn't fully understand you here.
> >
> > Sorry, I'm confused on how your locking scheme avoids rmap from trying
> > to use the VMA with the atomic increment part.
> 
> I'm also a bit confused. Which atomic increment are you referring to?
> AFAIU move_pages() will lock both src_vma and dst_vma, so even if rmap
> finds them it can't modify them, no?

The uffd atomic, mmap_changing.

...

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v2 2/3] userfaultfd: protect mmap_changing with rw_sem in userfaulfd_ctx
  2024-01-31  2:24             ` Lokesh Gidra
@ 2024-02-04 10:27               ` Mike Rapoport
  2024-02-05 20:53                 ` Lokesh Gidra
  0 siblings, 1 reply; 35+ messages in thread
From: Mike Rapoport @ 2024-02-04 10:27 UTC (permalink / raw)
  To: Lokesh Gidra
  Cc: Liam R. Howlett, akpm, linux-fsdevel, linux-mm, linux-kernel,
	selinux, surenb, kernel-team, aarcange, peterx, david,
	axelrasmussen, bgeffon, willy, jannh, kaleshsingh, ngeoffray,
	timmurray

On Tue, Jan 30, 2024 at 06:24:24PM -0800, Lokesh Gidra wrote:
> On Tue, Jan 30, 2024 at 9:28 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> >
> > * Mike Rapoport <rppt@kernel.org> [240130 03:55]:
> > > On Mon, Jan 29, 2024 at 10:46:27PM -0500, Liam R. Howlett wrote:
> > > > * Lokesh Gidra <lokeshgidra@google.com> [240129 17:35]:
> > > >
> > > > > > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > > > > > > index 58331b83d648..c00a021bcce4 100644
> > > > > > > --- a/fs/userfaultfd.c
> > > > > > > +++ b/fs/userfaultfd.c
> > > > > > > @@ -685,12 +685,15 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
> > > > > > >               ctx->flags = octx->flags;
> > > > > > >               ctx->features = octx->features;
> > > > > > >               ctx->released = false;
> > > > > > > +             init_rwsem(&ctx->map_changing_lock);
> > > > > > >               atomic_set(&ctx->mmap_changing, 0);
> > > > > > >               ctx->mm = vma->vm_mm;
> > > > > > >               mmgrab(ctx->mm);
> > > > > > >
> > > > > > >               userfaultfd_ctx_get(octx);
> > > > > > > +             down_write(&octx->map_changing_lock);
> > > > > > >               atomic_inc(&octx->mmap_changing);
> > > > > > > +             up_write(&octx->map_changing_lock);
> > > >
> > > > On init, I don't think taking the lock is strictly necessary - unless
> > > > there is a way to access it before this increment?  Not that it would
> > > > cost much.
> > >
> > > It's fork, the lock is for the context of the parent process and there
> > > could be uffdio ops running in parallel on its VM.
> >
> > Is this necessary then?  We are getting the octx from another mm but the
> > mm is locked for forking.  Why does it matter if there are readers of
> > the octx?
> >
> > I assume, currently, there is no way the userfaultfd ctx can
> > be altered under mmap_lock held for writing. I would think it matters if
> > there are writers (which, I presume are blocked by the mmap_lock for
> > now?)  Shouldn't we hold the write lock for the entire dup process, I
> > mean, if we remove the userfaultfd from the mmap_lock, we cannot let the
> > structure being duplicated change half way through the dup process?
> >
> > I must be missing something with where this is headed?
> >
> AFAIU, the purpose of mmap_changing is to serialize uffdio operations
> with non-cooperative events if and when such events are being
> monitored by userspace (in case you missed, in all the cases of writes
> to mmap_changing, we only do it if that non-cooperative event has been
> requested by the user). As you pointed out there are no correctness
> concerns as far as userfaultfd operations are concerned. But these
> events are essential for the uffd monitor's functioning.
> 
> For example: say the uffd monitor wants to be notified for REMAP
> operations while doing uffdio_copy operations. When COPY ioctls start
> failing with -EAGAIN and uffdio_copy.copy == 0, then it knows it must
> be due to mremap(), in which case it waits for the REMAP event
> notification before attempting COPY again.
> 
> But there are few things that I didn't get after going through the
> history of non-cooperative events. Hopefully Mike (or someone else
> familiar) can clarify:
> 
> IIUC, the idea behind non-cooperative events was to block uffdio
> operations from happening *before* the page tables are manipulated by
> the event (like mremap), and that the uffdio ops are resumed after the
> event notification is received by the monitor.

The idea was to give userspace some way to serialize processing of
non-cooperative event notifications and uffdio operations running in
parallel. It's not necessary to block uffdio operations from happening
before changes to the memory map, but with the mmap_lock synchronization
that already was there adding mmap_chaning that will prevent uffdio
operations when mmap_lock is taken for write was the simplest thing to do.

When CRIU does post-copy restore of a process, its uffd monitor reacts to
page fault and non-cooperative notifications and also performs a background
copy of the memory contents from the saved state to the address space of
the process being restored.

Since non-cooperative events may happen completely independent from the
uffd monitor, there are cases when the uffd monitor couldn't identify the
order of events, like  e.g. what won the race on mmap_lock, the process
thread doing fork or the uffd monitor's uffdio_copy.

In the fork vs uffdio_copy example, without mmap_changing, if the
uffdio_copy takes the mmap_lock first, the new page will be present in the
parent by the time copy_page_range() is called and the page will appear in
the child's memory mappings by the time uffd monitor gets notification
about the fork event. However, if the fork() is the first to take the
mmap_lock, the new page will appear in the parent address space after
copy_page_range() and it won't be mapped in the child's address space.

With mmap_changing and current locking with mmap_lock, we have a guarantee
that uffdio_copy will bail out if fork already took mmap_lock and the
monitor can act appropriately.
 
> 1) Why in the case of REMAP prep() is done after page-tables are
> moved? Shouldn't it be done before? All other non-cooperative
> operations do the prep() before.

mremap_userfaultfd_prep() is done after page tables are moved because it
initializes uffd context on the new_vma and if the actual remap fails,
there's no point of doing it.
Since mrpemap holds mmap_lock for write it does not matter if mmap_changed
is updated before or after page tables are moved. In the time between
mmap_lock is released and the UFFD_EVENT_REMAP is delivered to the uffd
monitor, mmap_chaging will remain >0 and uffdio operations will bail out.

> 2) UFFD_FEATURE_EVENT_REMOVE only notifies user space. It is not
> consistently blocking uffdio operations (as both sides are acquiring
> mmap_lock in read-mode) when remove operation is taking place. I can
> understand this was intentionally left as is in the interest of not
> acquiring mmap_lock in write-mode during madvise. But is only getting
> the notification any useful? Can we say this patch fixes it? And in
> that case shouldn't I split userfaultfd_remove() into two functions
> (like other non-cooperative operations)?

The notifications are useful because uffd monitor knows what memory should
not be filled with uffdio_copy. Indeed there was no interest in taking
mmap_lock for write in madvise, so there could be race between madvise and
uffdio operations. This race essentially prevents uffd monitor from running
the background copy in a separate thread, and with your change this should
be possible.

> 3) Based on [1] I see how mmap_changing helps in eliminating duplicate
> work (background copy) by uffd monitor, but didn't get if there is a
> correctness aspect too that I'm missing? I concur with Amit's point in
> [1] that getting -EEXIST when setting up the pte will avoid memory
> corruption, no?

In the fork case without mmap_changing the child process may be get data or
zeroes depending on the race for mmap_lock between the fork and
uffdio_copy and -EEXIST is not enough for monitor to detect what was the
ordering between fork and uffdio_copy.
 
> > > > > > > @@ -783,7 +788,9 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
> > > > > > >               return true;
> > > > > > >
> > > > > > >       userfaultfd_ctx_get(ctx);
> > > > > > > +     down_write(&ctx->map_changing_lock);
> > > > > > >       atomic_inc(&ctx->mmap_changing);
> > > > > > > +     up_write(&ctx->map_changing_lock);
> > > > > > >       mmap_read_unlock(mm);
> > > > > > >
> > > > > > >       msg_init(&ewq.msg);
> > > >
> > > > If this happens in read mode, then why are you waiting for the readers
> > > > to leave?  Can't you just increment the atomic?  It's fine happening in
> > > > read mode today, so it should be fine with this new rwsem.
> > >
> > > It's been a while and the details are blurred now, but if I remember
> > > correctly, having this in read mode forced non-cooperative uffd monitor to
> > > be single threaded. If a monitor runs, say uffdio_copy, and in parallel a
> > > thread in the monitored process does MADV_DONTNEED, the latter will wait
> > > for userfaultfd_remove notification to be processed in the monitor and drop
> > > the VMA contents only afterwards. If a non-cooperative monitor would
> > > process notification in parallel with uffdio ops, MADV_DONTNEED could
> > > continue and race with uffdio_copy, so read mode wouldn't be enough.
> > >
> >
> > Right now this function won't stop to wait for readers to exit the
> > critical section, but with this change there will be a pause (since the
> > down_write() will need to wait for the readers with the read lock).  So
> > this is adding a delay in this call path that isn't necessary (?) nor
> > existed before.  If you have non-cooperative uffd monitors, then you
> > will have to wait for them to finish to mark the uffd as being removed,
> > where as before it was a fire & forget, this is now a wait to tell.
> >
> I think a lot will be clearer once we get a response to my questions
> above. IMHO not only this write-lock is needed here, we need to fix
> userfaultfd_remove() by splitting it into userfaultfd_remove_prep()
> and userfaultfd_remove_complete() (like all other non-cooperative
> operations) as well. This patch enables us to do that as we remove
> mmap_changing's dependency on mmap_lock for synchronization.

The write-lock is not a requirement here for correctness and I don't see
why we would need userfaultfd_remove_prep().

As I've said earlier, having a write-lock here will let CRIU to run
background copy in parallel with processing of uffd events, but I don't
feel strongly about doing it.

> > > There was no much sense to make MADV_DONTNEED take mmap_lock in write mode
> > > just for this, but now taking the rwsem in write mode here sounds
> > > reasonable.
> > >
> >
> > I see why there was no need for a mmap_lock in write mode, but I think
> > taking the new rwsem in write mode is unnecessary.
> >
> > Basically, I see this as a signal to new readers to abort, but we don't
> > need to wait for current readers to finish before this one increments
> > the atomic.
> >
> > Unless I missed something, I don't think you want to take the write lock
> > here.
> What I understood from the history of mmap_changing is that the
> intention was to enable informing the uffd monitor about the correct
> state of which pages are filled and which aren't. Going through this
> thread was very helpful [2]
> 
> [2] https://lore.kernel.org/lkml/1527061324-19949-1-git-send-email-rppt@linux.vnet.ibm.com/

-- 
Sincerely yours,
Mike.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v2 2/3] userfaultfd: protect mmap_changing with rw_sem in userfaulfd_ctx
  2024-02-04 10:27               ` Mike Rapoport
@ 2024-02-05 20:53                 ` Lokesh Gidra
  2024-02-07 15:27                   ` Mike Rapoport
  0 siblings, 1 reply; 35+ messages in thread
From: Lokesh Gidra @ 2024-02-05 20:53 UTC (permalink / raw)
  To: Mike Rapoport
  Cc: Liam R. Howlett, akpm, linux-fsdevel, linux-mm, linux-kernel,
	selinux, surenb, kernel-team, aarcange, peterx, david,
	axelrasmussen, bgeffon, willy, jannh, kaleshsingh, ngeoffray,
	timmurray

On Sun, Feb 4, 2024 at 2:27 AM Mike Rapoport <rppt@kernel.org> wrote:
>
> On Tue, Jan 30, 2024 at 06:24:24PM -0800, Lokesh Gidra wrote:
> > On Tue, Jan 30, 2024 at 9:28 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> > >
> > > * Mike Rapoport <rppt@kernel.org> [240130 03:55]:
> > > > On Mon, Jan 29, 2024 at 10:46:27PM -0500, Liam R. Howlett wrote:
> > > > > * Lokesh Gidra <lokeshgidra@google.com> [240129 17:35]:
> > > > >
> > > > > > > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > > > > > > > index 58331b83d648..c00a021bcce4 100644
> > > > > > > > --- a/fs/userfaultfd.c
> > > > > > > > +++ b/fs/userfaultfd.c
> > > > > > > > @@ -685,12 +685,15 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
> > > > > > > >               ctx->flags = octx->flags;
> > > > > > > >               ctx->features = octx->features;
> > > > > > > >               ctx->released = false;
> > > > > > > > +             init_rwsem(&ctx->map_changing_lock);
> > > > > > > >               atomic_set(&ctx->mmap_changing, 0);
> > > > > > > >               ctx->mm = vma->vm_mm;
> > > > > > > >               mmgrab(ctx->mm);
> > > > > > > >
> > > > > > > >               userfaultfd_ctx_get(octx);
> > > > > > > > +             down_write(&octx->map_changing_lock);
> > > > > > > >               atomic_inc(&octx->mmap_changing);
> > > > > > > > +             up_write(&octx->map_changing_lock);
> > > > >
> > > > > On init, I don't think taking the lock is strictly necessary - unless
> > > > > there is a way to access it before this increment?  Not that it would
> > > > > cost much.
> > > >
> > > > It's fork, the lock is for the context of the parent process and there
> > > > could be uffdio ops running in parallel on its VM.
> > >
> > > Is this necessary then?  We are getting the octx from another mm but the
> > > mm is locked for forking.  Why does it matter if there are readers of
> > > the octx?
> > >
> > > I assume, currently, there is no way the userfaultfd ctx can
> > > be altered under mmap_lock held for writing. I would think it matters if
> > > there are writers (which, I presume are blocked by the mmap_lock for
> > > now?)  Shouldn't we hold the write lock for the entire dup process, I
> > > mean, if we remove the userfaultfd from the mmap_lock, we cannot let the
> > > structure being duplicated change half way through the dup process?
> > >
> > > I must be missing something with where this is headed?
> > >
> > AFAIU, the purpose of mmap_changing is to serialize uffdio operations
> > with non-cooperative events if and when such events are being
> > monitored by userspace (in case you missed, in all the cases of writes
> > to mmap_changing, we only do it if that non-cooperative event has been
> > requested by the user). As you pointed out there are no correctness
> > concerns as far as userfaultfd operations are concerned. But these
> > events are essential for the uffd monitor's functioning.
> >
> > For example: say the uffd monitor wants to be notified for REMAP
> > operations while doing uffdio_copy operations. When COPY ioctls start
> > failing with -EAGAIN and uffdio_copy.copy == 0, then it knows it must
> > be due to mremap(), in which case it waits for the REMAP event
> > notification before attempting COPY again.
> >
> > But there are few things that I didn't get after going through the
> > history of non-cooperative events. Hopefully Mike (or someone else
> > familiar) can clarify:
> >
> > IIUC, the idea behind non-cooperative events was to block uffdio
> > operations from happening *before* the page tables are manipulated by
> > the event (like mremap), and that the uffdio ops are resumed after the
> > event notification is received by the monitor.
>
> The idea was to give userspace some way to serialize processing of
> non-cooperative event notifications and uffdio operations running in
> parallel. It's not necessary to block uffdio operations from happening
> before changes to the memory map, but with the mmap_lock synchronization
> that already was there adding mmap_chaning that will prevent uffdio
> operations when mmap_lock is taken for write was the simplest thing to do.
>
> When CRIU does post-copy restore of a process, its uffd monitor reacts to
> page fault and non-cooperative notifications and also performs a background
> copy of the memory contents from the saved state to the address space of
> the process being restored.
>
> Since non-cooperative events may happen completely independent from the
> uffd monitor, there are cases when the uffd monitor couldn't identify the
> order of events, like  e.g. what won the race on mmap_lock, the process
> thread doing fork or the uffd monitor's uffdio_copy.
>
> In the fork vs uffdio_copy example, without mmap_changing, if the
> uffdio_copy takes the mmap_lock first, the new page will be present in the
> parent by the time copy_page_range() is called and the page will appear in
> the child's memory mappings by the time uffd monitor gets notification
> about the fork event. However, if the fork() is the first to take the
> mmap_lock, the new page will appear in the parent address space after
> copy_page_range() and it won't be mapped in the child's address space.
>
> With mmap_changing and current locking with mmap_lock, we have a guarantee
> that uffdio_copy will bail out if fork already took mmap_lock and the
> monitor can act appropriately.
>
Thanks for the explanation. Really helpful!

> > 1) Why in the case of REMAP prep() is done after page-tables are
> > moved? Shouldn't it be done before? All other non-cooperative
> > operations do the prep() before.
>
> mremap_userfaultfd_prep() is done after page tables are moved because it
> initializes uffd context on the new_vma and if the actual remap fails,
> there's no point of doing it.
> Since mrpemap holds mmap_lock for write it does not matter if mmap_changed
> is updated before or after page tables are moved. In the time between
> mmap_lock is released and the UFFD_EVENT_REMAP is delivered to the uffd
> monitor, mmap_chaging will remain >0 and uffdio operations will bail out.
>
Yes this makes sense. Even with per-vma locks, I see that the new_vma
is write-locked (vma_start_write()) in vma_link() guaranteeing the
same.

> > 2) UFFD_FEATURE_EVENT_REMOVE only notifies user space. It is not
> > consistently blocking uffdio operations (as both sides are acquiring
> > mmap_lock in read-mode) when remove operation is taking place. I can
> > understand this was intentionally left as is in the interest of not
> > acquiring mmap_lock in write-mode during madvise. But is only getting
> > the notification any useful? Can we say this patch fixes it? And in
> > that case shouldn't I split userfaultfd_remove() into two functions
> > (like other non-cooperative operations)?
>
> The notifications are useful because uffd monitor knows what memory should
> not be filled with uffdio_copy. Indeed there was no interest in taking
> mmap_lock for write in madvise, so there could be race between madvise and
> uffdio operations. This race essentially prevents uffd monitor from running
> the background copy in a separate thread, and with your change this should
> be possible.
>
Makes sense. Thanks!

> > 3) Based on [1] I see how mmap_changing helps in eliminating duplicate
> > work (background copy) by uffd monitor, but didn't get if there is a
> > correctness aspect too that I'm missing? I concur with Amit's point in
> > [1] that getting -EEXIST when setting up the pte will avoid memory
> > corruption, no?
>
> In the fork case without mmap_changing the child process may be get data or
> zeroes depending on the race for mmap_lock between the fork and
> uffdio_copy and -EEXIST is not enough for monitor to detect what was the
> ordering between fork and uffdio_copy.

This is extremely helpful. IIUC, there is a window after mmap_lock
(write-mode) is released and before the uffd monitor thread is
notified of fork. In that window, the monitor doesn't know that fork
has already happened. So, without mmap_changing it would have done
background copy only in the parent, thereby causing data inconsistency
between parent and child processes.

It seems to me that the correctness argument for mmap_changing is
there in case of FORK event and REMAP when mremap is called with
MREMAP_DONTUNMAP. In all other cases its only benefit is by avoiding
unnecessary background copies, right?

>
> > > > > > > > @@ -783,7 +788,9 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
> > > > > > > >               return true;
> > > > > > > >
> > > > > > > >       userfaultfd_ctx_get(ctx);
> > > > > > > > +     down_write(&ctx->map_changing_lock);
> > > > > > > >       atomic_inc(&ctx->mmap_changing);
> > > > > > > > +     up_write(&ctx->map_changing_lock);
> > > > > > > >       mmap_read_unlock(mm);
> > > > > > > >
> > > > > > > >       msg_init(&ewq.msg);
> > > > >
> > > > > If this happens in read mode, then why are you waiting for the readers
> > > > > to leave?  Can't you just increment the atomic?  It's fine happening in
> > > > > read mode today, so it should be fine with this new rwsem.
> > > >
> > > > It's been a while and the details are blurred now, but if I remember
> > > > correctly, having this in read mode forced non-cooperative uffd monitor to
> > > > be single threaded. If a monitor runs, say uffdio_copy, and in parallel a
> > > > thread in the monitored process does MADV_DONTNEED, the latter will wait
> > > > for userfaultfd_remove notification to be processed in the monitor and drop
> > > > the VMA contents only afterwards. If a non-cooperative monitor would
> > > > process notification in parallel with uffdio ops, MADV_DONTNEED could
> > > > continue and race with uffdio_copy, so read mode wouldn't be enough.
> > > >
> > >
> > > Right now this function won't stop to wait for readers to exit the
> > > critical section, but with this change there will be a pause (since the
> > > down_write() will need to wait for the readers with the read lock).  So
> > > this is adding a delay in this call path that isn't necessary (?) nor
> > > existed before.  If you have non-cooperative uffd monitors, then you
> > > will have to wait for them to finish to mark the uffd as being removed,
> > > where as before it was a fire & forget, this is now a wait to tell.
> > >
> > I think a lot will be clearer once we get a response to my questions
> > above. IMHO not only this write-lock is needed here, we need to fix
> > userfaultfd_remove() by splitting it into userfaultfd_remove_prep()
> > and userfaultfd_remove_complete() (like all other non-cooperative
> > operations) as well. This patch enables us to do that as we remove
> > mmap_changing's dependency on mmap_lock for synchronization.
>
> The write-lock is not a requirement here for correctness and I don't see
> why we would need userfaultfd_remove_prep().
>
> As I've said earlier, having a write-lock here will let CRIU to run
> background copy in parallel with processing of uffd events, but I don't
> feel strongly about doing it.
>
Got it. Anyways, such a change needn't be part of this patch, so I'm
going to keep it unchanged.

> > > > There was no much sense to make MADV_DONTNEED take mmap_lock in write mode
> > > > just for this, but now taking the rwsem in write mode here sounds
> > > > reasonable.
> > > >
> > >
> > > I see why there was no need for a mmap_lock in write mode, but I think
> > > taking the new rwsem in write mode is unnecessary.
> > >
> > > Basically, I see this as a signal to new readers to abort, but we don't
> > > need to wait for current readers to finish before this one increments
> > > the atomic.
> > >
> > > Unless I missed something, I don't think you want to take the write lock
> > > here.
> > What I understood from the history of mmap_changing is that the
> > intention was to enable informing the uffd monitor about the correct
> > state of which pages are filled and which aren't. Going through this
> > thread was very helpful [2]
> >
> > [2] https://lore.kernel.org/lkml/1527061324-19949-1-git-send-email-rppt@linux.vnet.ibm.com/
>
> --
> Sincerely yours,
> Mike.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v2 3/3] userfaultfd: use per-vma locks in userfaultfd operations
  2024-01-31 21:41             ` Liam R. Howlett
@ 2024-02-05 21:46               ` Suren Baghdasaryan
  2024-02-05 21:54                 ` Lokesh Gidra
  0 siblings, 1 reply; 35+ messages in thread
From: Suren Baghdasaryan @ 2024-02-05 21:46 UTC (permalink / raw)
  To: Liam R. Howlett, Lokesh Gidra, Suren Baghdasaryan, akpm,
	linux-fsdevel, linux-mm, linux-kernel, selinux, kernel-team,
	aarcange, peterx, david, axelrasmussen, bgeffon, willy, jannh,
	kaleshsingh, ngeoffray, timmurray, rppt

On Wed, Jan 31, 2024 at 1:41 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
>
> * Lokesh Gidra <lokeshgidra@google.com> [240130 21:49]:
> > On Mon, Jan 29, 2024 at 6:58 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> > >
> > > * Lokesh Gidra <lokeshgidra@google.com> [240129 19:28]:
> > > > On Mon, Jan 29, 2024 at 12:53 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > > > >
> > >
>
> ...
>
> > >
> > > > Your suggestion is definitely simpler and easier to follow, but due to
> > > > the overflow situation that Suren pointed out, I would still need to
> > > > keep the locking/boolean dance, no? IIUC, even if I were to return
> > > > EAGAIN to the userspace, there is no guarantee that subsequent ioctls
> > > > on the same vma will succeed due to the same overflow, until someone
> > > > acquires and releases mmap_lock in write-mode.
> > > > Also, sometimes it seems insufficient whether we managed to lock vma
> > > > or not. For instance, lock_vma_under_rcu() checks if anon_vma (for
> > > > anonymous vma) exists. If not then it bails out.
> > > > So it seems to me that we have to provide some fall back in
> > > > userfaultfd operations which executes with mmap_lock in read-mode.
> > >
> > > Fair enough, what if we didn't use the sequence number and just locked
> > > the vma directly?
> >
> > Looks good to me, unless someone else has any objections.
> > >
> > > /* This will wait on the vma lock, so once we return it's locked */
> > > void vma_aquire_read_lock(struct vm_area_struct *vma)
> > > {
> > >         mmap_assert_locked(vma->vm_mm);
> > >         down_read(&vma->vm_lock->lock);
> > > }
> > >
> > > struct vm_area_struct *lock_vma(struct mm_struct *mm,
> > >         unsigned long addr))    /* or some better name.. */
> > > {
> > >         struct vm_area_struct *vma;
> > >
> > >         vma = lock_vma_under_rcu(mm, addr);
> > >         if (vma)
> > >                 return vma;
> > >
> > >         mmap_read_lock(mm);
> > >         /* mm sequence cannot change, no mm writers anyways.
> > >          * find_mergeable_anon_vma is only a concern in the page fault
> > >          * path
> > >          * start/end won't change under the mmap_lock
> > >          * vma won't become detached as we have the mmap_lock in read
> > >          * We are now sure no writes will change the VMA
> > >          * So let's make sure no other context is isolating the vma
> > >          */
> > >         vma = lookup_vma(mm, addr);
> > >         if (vma)
> > We can take care of anon_vma as well here right? I can take a bool
> > parameter ('prepare_anon' or something) and then:
> >
> >            if (vma) {
> >                     if (prepare_anon && vma_is_anonymous(vma)) &&
> > !anon_vma_prepare(vma)) {
> >                                       vma = ERR_PTR(-ENOMEM);
> >                                       goto out_unlock;
> >                    }
> > >                 vma_aquire_read_lock(vma);
> >            }
> > out_unlock:
> > >         mmap_read_unlock(mm);
> > >         return vma;
> > > }
>
> Do you need this?  I didn't think this was happening in the code as
> written?  If you need it I would suggest making it happen always and
> ditch the flag until a user needs this variant, but document what's
> going on in here or even have a better name.

I think yes, you do need this. I can see calls to anon_vma_prepare()
under mmap_read_lock() protection in both mfill_atomic_hugetlb() and
in mfill_atomic(). This means, just like in the pagefault path, we
modify vma->anon_vma under mmap_read_lock protection which guarantees
that adjacent VMAs won't change. This is important because
__anon_vma_prepare() uses find_mergeable_anon_vma() that needs the
neighboring VMAs to be stable. Per-VMA lock guarantees stability of
the VMA we locked but not of its neighbors, therefore holding per-VMA
lock while calling anon_vma_prepare() is not enough. The solution
Lokesh suggests would call anon_vma_prepare() under mmap_read_lock and
therefore would avoid the issue.


>
> Thanks,
> Liam

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v2 3/3] userfaultfd: use per-vma locks in userfaultfd operations
  2024-02-05 21:46               ` Suren Baghdasaryan
@ 2024-02-05 21:54                 ` Lokesh Gidra
  2024-02-05 22:00                   ` Liam R. Howlett
  0 siblings, 1 reply; 35+ messages in thread
From: Lokesh Gidra @ 2024-02-05 21:54 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: Liam R. Howlett, akpm, linux-fsdevel, linux-mm, linux-kernel,
	selinux, kernel-team, aarcange, peterx, david, axelrasmussen,
	bgeffon, willy, jannh, kaleshsingh, ngeoffray, timmurray, rppt

On Mon, Feb 5, 2024 at 1:47 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Wed, Jan 31, 2024 at 1:41 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> >
> > * Lokesh Gidra <lokeshgidra@google.com> [240130 21:49]:
> > > On Mon, Jan 29, 2024 at 6:58 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> > > >
> > > > * Lokesh Gidra <lokeshgidra@google.com> [240129 19:28]:
> > > > > On Mon, Jan 29, 2024 at 12:53 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > > > > >
> > > >
> >
> > ...
> >
> > > >
> > > > > Your suggestion is definitely simpler and easier to follow, but due to
> > > > > the overflow situation that Suren pointed out, I would still need to
> > > > > keep the locking/boolean dance, no? IIUC, even if I were to return
> > > > > EAGAIN to the userspace, there is no guarantee that subsequent ioctls
> > > > > on the same vma will succeed due to the same overflow, until someone
> > > > > acquires and releases mmap_lock in write-mode.
> > > > > Also, sometimes it seems insufficient whether we managed to lock vma
> > > > > or not. For instance, lock_vma_under_rcu() checks if anon_vma (for
> > > > > anonymous vma) exists. If not then it bails out.
> > > > > So it seems to me that we have to provide some fall back in
> > > > > userfaultfd operations which executes with mmap_lock in read-mode.
> > > >
> > > > Fair enough, what if we didn't use the sequence number and just locked
> > > > the vma directly?
> > >
> > > Looks good to me, unless someone else has any objections.
> > > >
> > > > /* This will wait on the vma lock, so once we return it's locked */
> > > > void vma_aquire_read_lock(struct vm_area_struct *vma)
> > > > {
> > > >         mmap_assert_locked(vma->vm_mm);
> > > >         down_read(&vma->vm_lock->lock);
> > > > }
> > > >
> > > > struct vm_area_struct *lock_vma(struct mm_struct *mm,
> > > >         unsigned long addr))    /* or some better name.. */
> > > > {
> > > >         struct vm_area_struct *vma;
> > > >
> > > >         vma = lock_vma_under_rcu(mm, addr);
> > > >         if (vma)
> > > >                 return vma;
> > > >
> > > >         mmap_read_lock(mm);
> > > >         /* mm sequence cannot change, no mm writers anyways.
> > > >          * find_mergeable_anon_vma is only a concern in the page fault
> > > >          * path
> > > >          * start/end won't change under the mmap_lock
> > > >          * vma won't become detached as we have the mmap_lock in read
> > > >          * We are now sure no writes will change the VMA
> > > >          * So let's make sure no other context is isolating the vma
> > > >          */
> > > >         vma = lookup_vma(mm, addr);
> > > >         if (vma)
> > > We can take care of anon_vma as well here right? I can take a bool
> > > parameter ('prepare_anon' or something) and then:
> > >
> > >            if (vma) {
> > >                     if (prepare_anon && vma_is_anonymous(vma)) &&
> > > !anon_vma_prepare(vma)) {
> > >                                       vma = ERR_PTR(-ENOMEM);
> > >                                       goto out_unlock;
> > >                    }
> > > >                 vma_aquire_read_lock(vma);
> > >            }
> > > out_unlock:
> > > >         mmap_read_unlock(mm);
> > > >         return vma;
> > > > }
> >
> > Do you need this?  I didn't think this was happening in the code as
> > written?  If you need it I would suggest making it happen always and
> > ditch the flag until a user needs this variant, but document what's
> > going on in here or even have a better name.
>
> I think yes, you do need this. I can see calls to anon_vma_prepare()
> under mmap_read_lock() protection in both mfill_atomic_hugetlb() and
> in mfill_atomic(). This means, just like in the pagefault path, we
> modify vma->anon_vma under mmap_read_lock protection which guarantees
> that adjacent VMAs won't change. This is important because
> __anon_vma_prepare() uses find_mergeable_anon_vma() that needs the
> neighboring VMAs to be stable. Per-VMA lock guarantees stability of
> the VMA we locked but not of its neighbors, therefore holding per-VMA
> lock while calling anon_vma_prepare() is not enough. The solution
> Lokesh suggests would call anon_vma_prepare() under mmap_read_lock and
> therefore would avoid the issue.
>
Thanks, Suren.
anon_vma_prepare() is also called in validate_move_areas() via move_pages().
>
> >
> > Thanks,
> > Liam

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v2 3/3] userfaultfd: use per-vma locks in userfaultfd operations
  2024-02-05 21:54                 ` Lokesh Gidra
@ 2024-02-05 22:00                   ` Liam R. Howlett
  2024-02-05 22:24                     ` Lokesh Gidra
  0 siblings, 1 reply; 35+ messages in thread
From: Liam R. Howlett @ 2024-02-05 22:00 UTC (permalink / raw)
  To: Lokesh Gidra
  Cc: Suren Baghdasaryan, akpm, linux-fsdevel, linux-mm, linux-kernel,
	selinux, kernel-team, aarcange, peterx, david, axelrasmussen,
	bgeffon, willy, jannh, kaleshsingh, ngeoffray, timmurray, rppt

* Lokesh Gidra <lokeshgidra@google.com> [240205 16:55]:
...

> > > > We can take care of anon_vma as well here right? I can take a bool
> > > > parameter ('prepare_anon' or something) and then:
> > > >
> > > >            if (vma) {
> > > >                     if (prepare_anon && vma_is_anonymous(vma)) &&
> > > > !anon_vma_prepare(vma)) {
> > > >                                       vma = ERR_PTR(-ENOMEM);
> > > >                                       goto out_unlock;
> > > >                    }
> > > > >                 vma_aquire_read_lock(vma);
> > > >            }
> > > > out_unlock:
> > > > >         mmap_read_unlock(mm);
> > > > >         return vma;
> > > > > }
> > >
> > > Do you need this?  I didn't think this was happening in the code as
> > > written?  If you need it I would suggest making it happen always and
> > > ditch the flag until a user needs this variant, but document what's
> > > going on in here or even have a better name.
> >
> > I think yes, you do need this. I can see calls to anon_vma_prepare()
> > under mmap_read_lock() protection in both mfill_atomic_hugetlb() and
> > in mfill_atomic(). This means, just like in the pagefault path, we
> > modify vma->anon_vma under mmap_read_lock protection which guarantees
> > that adjacent VMAs won't change. This is important because
> > __anon_vma_prepare() uses find_mergeable_anon_vma() that needs the
> > neighboring VMAs to be stable. Per-VMA lock guarantees stability of
> > the VMA we locked but not of its neighbors, therefore holding per-VMA
> > lock while calling anon_vma_prepare() is not enough. The solution
> > Lokesh suggests would call anon_vma_prepare() under mmap_read_lock and
> > therefore would avoid the issue.
> >

...

> anon_vma_prepare() is also called in validate_move_areas() via move_pages().

Probably worth doing it unconditionally and have a comment as to why it
is necessary.

Does this avoid your locking workaround?

Thanks,
Liam

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v2 3/3] userfaultfd: use per-vma locks in userfaultfd operations
  2024-02-05 22:00                   ` Liam R. Howlett
@ 2024-02-05 22:24                     ` Lokesh Gidra
  2024-02-06 14:35                       ` Liam R. Howlett
  0 siblings, 1 reply; 35+ messages in thread
From: Lokesh Gidra @ 2024-02-05 22:24 UTC (permalink / raw)
  To: Liam R. Howlett, Lokesh Gidra, Suren Baghdasaryan, akpm,
	linux-fsdevel, linux-mm, linux-kernel, selinux, kernel-team,
	aarcange, peterx, david, axelrasmussen, bgeffon, willy, jannh,
	kaleshsingh, ngeoffray, timmurray, rppt

On Mon, Feb 5, 2024 at 2:00 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
>
> * Lokesh Gidra <lokeshgidra@google.com> [240205 16:55]:
> ...
>
> > > > > We can take care of anon_vma as well here right? I can take a bool
> > > > > parameter ('prepare_anon' or something) and then:
> > > > >
> > > > >            if (vma) {
> > > > >                     if (prepare_anon && vma_is_anonymous(vma)) &&
> > > > > !anon_vma_prepare(vma)) {
> > > > >                                       vma = ERR_PTR(-ENOMEM);
> > > > >                                       goto out_unlock;
> > > > >                    }
> > > > > >                 vma_aquire_read_lock(vma);
> > > > >            }
> > > > > out_unlock:
> > > > > >         mmap_read_unlock(mm);
> > > > > >         return vma;
> > > > > > }
> > > >
> > > > Do you need this?  I didn't think this was happening in the code as
> > > > written?  If you need it I would suggest making it happen always and
> > > > ditch the flag until a user needs this variant, but document what's
> > > > going on in here or even have a better name.
> > >
> > > I think yes, you do need this. I can see calls to anon_vma_prepare()
> > > under mmap_read_lock() protection in both mfill_atomic_hugetlb() and
> > > in mfill_atomic(). This means, just like in the pagefault path, we
> > > modify vma->anon_vma under mmap_read_lock protection which guarantees
> > > that adjacent VMAs won't change. This is important because
> > > __anon_vma_prepare() uses find_mergeable_anon_vma() that needs the
> > > neighboring VMAs to be stable. Per-VMA lock guarantees stability of
> > > the VMA we locked but not of its neighbors, therefore holding per-VMA
> > > lock while calling anon_vma_prepare() is not enough. The solution
> > > Lokesh suggests would call anon_vma_prepare() under mmap_read_lock and
> > > therefore would avoid the issue.
> > >
>
> ...
>
> > anon_vma_prepare() is also called in validate_move_areas() via move_pages().
>
> Probably worth doing it unconditionally and have a comment as to why it
> is necessary.
>
The src_vma (in case of move_pages()) doesn't need to have it.

The only reason I'm not inclined to make it unconditional is what if
some future user of lock_vma() doesn't need it for their purpose? Why
allocate anon_vma in that case.

> Does this avoid your locking workaround?

Not sure which workaround you are referring to. I am almost done
implementing your suggestion. Very soon will share the next version of
the patch-set.
>
> Thanks,
> Liam

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v2 3/3] userfaultfd: use per-vma locks in userfaultfd operations
  2024-02-05 22:24                     ` Lokesh Gidra
@ 2024-02-06 14:35                       ` Liam R. Howlett
  2024-02-06 16:26                         ` Lokesh Gidra
  0 siblings, 1 reply; 35+ messages in thread
From: Liam R. Howlett @ 2024-02-06 14:35 UTC (permalink / raw)
  To: Lokesh Gidra
  Cc: Suren Baghdasaryan, akpm, linux-fsdevel, linux-mm, linux-kernel,
	selinux, kernel-team, aarcange, peterx, david, axelrasmussen,
	bgeffon, willy, jannh, kaleshsingh, ngeoffray, timmurray, rppt

* Lokesh Gidra <lokeshgidra@google.com> [240205 17:24]:
> On Mon, Feb 5, 2024 at 2:00 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> >
> > * Lokesh Gidra <lokeshgidra@google.com> [240205 16:55]:
> > ...
> >
> > > > > > We can take care of anon_vma as well here right? I can take a bool
> > > > > > parameter ('prepare_anon' or something) and then:
> > > > > >
> > > > > >            if (vma) {
> > > > > >                     if (prepare_anon && vma_is_anonymous(vma)) &&
> > > > > > !anon_vma_prepare(vma)) {
> > > > > >                                       vma = ERR_PTR(-ENOMEM);
> > > > > >                                       goto out_unlock;
> > > > > >                    }
> > > > > > >                 vma_aquire_read_lock(vma);
> > > > > >            }
> > > > > > out_unlock:
> > > > > > >         mmap_read_unlock(mm);
> > > > > > >         return vma;
> > > > > > > }
> > > > >
> > > > > Do you need this?  I didn't think this was happening in the code as
> > > > > written?  If you need it I would suggest making it happen always and
> > > > > ditch the flag until a user needs this variant, but document what's
> > > > > going on in here or even have a better name.
> > > >
> > > > I think yes, you do need this. I can see calls to anon_vma_prepare()
> > > > under mmap_read_lock() protection in both mfill_atomic_hugetlb() and
> > > > in mfill_atomic(). This means, just like in the pagefault path, we
> > > > modify vma->anon_vma under mmap_read_lock protection which guarantees
> > > > that adjacent VMAs won't change. This is important because
> > > > __anon_vma_prepare() uses find_mergeable_anon_vma() that needs the
> > > > neighboring VMAs to be stable. Per-VMA lock guarantees stability of
> > > > the VMA we locked but not of its neighbors, therefore holding per-VMA
> > > > lock while calling anon_vma_prepare() is not enough. The solution
> > > > Lokesh suggests would call anon_vma_prepare() under mmap_read_lock and
> > > > therefore would avoid the issue.
> > > >
> >
> > ...
> >
> > > anon_vma_prepare() is also called in validate_move_areas() via move_pages().
> >
> > Probably worth doing it unconditionally and have a comment as to why it
> > is necessary.
> >
> The src_vma (in case of move_pages()) doesn't need to have it.
> 
> The only reason I'm not inclined to make it unconditional is what if
> some future user of lock_vma() doesn't need it for their purpose? Why
> allocate anon_vma in that case.

Because there isn't a user and it'll add a flag that's a constant.  If
there is a need for the flag later then it can be added at that time.
Maybe there will never be a user and we've just complicated the code for
no reason.  Don't implement features that aren't necessary, especially
if there is no intent to use them.

> 
> > Does this avoid your locking workaround?
> 
> Not sure which workaround you are referring to. I am almost done
> implementing your suggestion. Very soon will share the next version of
> the patch-set.

The locking dance with the flags indicating if it's per-vma lock or
mmap_lock.


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v2 3/3] userfaultfd: use per-vma locks in userfaultfd operations
  2024-02-06 14:35                       ` Liam R. Howlett
@ 2024-02-06 16:26                         ` Lokesh Gidra
  2024-02-06 17:07                           ` Liam R. Howlett
  0 siblings, 1 reply; 35+ messages in thread
From: Lokesh Gidra @ 2024-02-06 16:26 UTC (permalink / raw)
  To: Liam R. Howlett, Lokesh Gidra, Suren Baghdasaryan, akpm,
	linux-fsdevel, linux-mm, linux-kernel, selinux, kernel-team,
	aarcange, peterx, david, axelrasmussen, bgeffon, willy, jannh,
	kaleshsingh, ngeoffray, timmurray, rppt

On Tue, Feb 6, 2024 at 6:35 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
>
> * Lokesh Gidra <lokeshgidra@google.com> [240205 17:24]:
> > On Mon, Feb 5, 2024 at 2:00 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> > >
> > > * Lokesh Gidra <lokeshgidra@google.com> [240205 16:55]:
> > > ...
> > >
> > > > > > > We can take care of anon_vma as well here right? I can take a bool
> > > > > > > parameter ('prepare_anon' or something) and then:
> > > > > > >
> > > > > > >            if (vma) {
> > > > > > >                     if (prepare_anon && vma_is_anonymous(vma)) &&
> > > > > > > !anon_vma_prepare(vma)) {
> > > > > > >                                       vma = ERR_PTR(-ENOMEM);
> > > > > > >                                       goto out_unlock;
> > > > > > >                    }
> > > > > > > >                 vma_aquire_read_lock(vma);
> > > > > > >            }
> > > > > > > out_unlock:
> > > > > > > >         mmap_read_unlock(mm);
> > > > > > > >         return vma;
> > > > > > > > }
> > > > > >
> > > > > > Do you need this?  I didn't think this was happening in the code as
> > > > > > written?  If you need it I would suggest making it happen always and
> > > > > > ditch the flag until a user needs this variant, but document what's
> > > > > > going on in here or even have a better name.
> > > > >
> > > > > I think yes, you do need this. I can see calls to anon_vma_prepare()
> > > > > under mmap_read_lock() protection in both mfill_atomic_hugetlb() and
> > > > > in mfill_atomic(). This means, just like in the pagefault path, we
> > > > > modify vma->anon_vma under mmap_read_lock protection which guarantees
> > > > > that adjacent VMAs won't change. This is important because
> > > > > __anon_vma_prepare() uses find_mergeable_anon_vma() that needs the
> > > > > neighboring VMAs to be stable. Per-VMA lock guarantees stability of
> > > > > the VMA we locked but not of its neighbors, therefore holding per-VMA
> > > > > lock while calling anon_vma_prepare() is not enough. The solution
> > > > > Lokesh suggests would call anon_vma_prepare() under mmap_read_lock and
> > > > > therefore would avoid the issue.
> > > > >
> > >
> > > ...
> > >
> > > > anon_vma_prepare() is also called in validate_move_areas() via move_pages().
> > >
> > > Probably worth doing it unconditionally and have a comment as to why it
> > > is necessary.
> > >
> > The src_vma (in case of move_pages()) doesn't need to have it.
> >
> > The only reason I'm not inclined to make it unconditional is what if
> > some future user of lock_vma() doesn't need it for their purpose? Why
> > allocate anon_vma in that case.
>
> Because there isn't a user and it'll add a flag that's a constant.  If
> there is a need for the flag later then it can be added at that time.
> Maybe there will never be a user and we've just complicated the code for
> no reason.  Don't implement features that aren't necessary, especially
> if there is no intent to use them.
>

I'm not too attached to the idea of keeping it conditional. But I have
already sent v3 which currently does it conditionally. Please take a
look at it. Along with any other comments/changes that I get, I'll
also make it unconditional in v4, if you say so.
> >
> > > Does this avoid your locking workaround?
> >
> > Not sure which workaround you are referring to. I am almost done
> > implementing your suggestion. Very soon will share the next version of
> > the patch-set.
>
> The locking dance with the flags indicating if it's per-vma lock or
> mmap_lock.
>
That dance was not because of anon_vma. It's just that I hadn't
realized that we can do it the way you suggested :) I really liked
your suggestion and is implemented in v3. PTAL.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v2 3/3] userfaultfd: use per-vma locks in userfaultfd operations
  2024-02-06 16:26                         ` Lokesh Gidra
@ 2024-02-06 17:07                           ` Liam R. Howlett
  0 siblings, 0 replies; 35+ messages in thread
From: Liam R. Howlett @ 2024-02-06 17:07 UTC (permalink / raw)
  To: Lokesh Gidra
  Cc: Suren Baghdasaryan, akpm, linux-fsdevel, linux-mm, linux-kernel,
	selinux, kernel-team, aarcange, peterx, david, axelrasmussen,
	bgeffon, willy, jannh, kaleshsingh, ngeoffray, timmurray, rppt

* Lokesh Gidra <lokeshgidra@google.com> [240206 11:26]:
> On Tue, Feb 6, 2024 at 6:35 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> >
> > * Lokesh Gidra <lokeshgidra@google.com> [240205 17:24]:
> > > On Mon, Feb 5, 2024 at 2:00 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> > > >
> > > > * Lokesh Gidra <lokeshgidra@google.com> [240205 16:55]:
> > > > ...
> > > >
> > > > > > > > We can take care of anon_vma as well here right? I can take a bool
> > > > > > > > parameter ('prepare_anon' or something) and then:
> > > > > > > >
> > > > > > > >            if (vma) {
> > > > > > > >                     if (prepare_anon && vma_is_anonymous(vma)) &&
> > > > > > > > !anon_vma_prepare(vma)) {
> > > > > > > >                                       vma = ERR_PTR(-ENOMEM);
> > > > > > > >                                       goto out_unlock;
> > > > > > > >                    }
> > > > > > > > >                 vma_aquire_read_lock(vma);
> > > > > > > >            }
> > > > > > > > out_unlock:
> > > > > > > > >         mmap_read_unlock(mm);
> > > > > > > > >         return vma;
> > > > > > > > > }
> > > > > > >
> > > > > > > Do you need this?  I didn't think this was happening in the code as
> > > > > > > written?  If you need it I would suggest making it happen always and
> > > > > > > ditch the flag until a user needs this variant, but document what's
> > > > > > > going on in here or even have a better name.
> > > > > >
> > > > > > I think yes, you do need this. I can see calls to anon_vma_prepare()
> > > > > > under mmap_read_lock() protection in both mfill_atomic_hugetlb() and
> > > > > > in mfill_atomic(). This means, just like in the pagefault path, we
> > > > > > modify vma->anon_vma under mmap_read_lock protection which guarantees
> > > > > > that adjacent VMAs won't change. This is important because
> > > > > > __anon_vma_prepare() uses find_mergeable_anon_vma() that needs the
> > > > > > neighboring VMAs to be stable. Per-VMA lock guarantees stability of
> > > > > > the VMA we locked but not of its neighbors, therefore holding per-VMA
> > > > > > lock while calling anon_vma_prepare() is not enough. The solution
> > > > > > Lokesh suggests would call anon_vma_prepare() under mmap_read_lock and
> > > > > > therefore would avoid the issue.
> > > > > >
> > > >
> > > > ...
> > > >
> > > > > anon_vma_prepare() is also called in validate_move_areas() via move_pages().
> > > >
> > > > Probably worth doing it unconditionally and have a comment as to why it
> > > > is necessary.
> > > >
> > > The src_vma (in case of move_pages()) doesn't need to have it.
> > >
> > > The only reason I'm not inclined to make it unconditional is what if
> > > some future user of lock_vma() doesn't need it for their purpose? Why
> > > allocate anon_vma in that case.
> >
> > Because there isn't a user and it'll add a flag that's a constant.  If
> > there is a need for the flag later then it can be added at that time.
> > Maybe there will never be a user and we've just complicated the code for
> > no reason.  Don't implement features that aren't necessary, especially
> > if there is no intent to use them.
> >
> 
> I'm not too attached to the idea of keeping it conditional. But I have
> already sent v3 which currently does it conditionally. Please take a
> look at it. Along with any other comments/changes that I get, I'll
> also make it unconditional in v4, if you say so.

well, you use it conditionally, so it does have use.  It was not clear
in your comment above that you were going to use it.  I am not sure
about the dst/src needing/not needing it.  If you have a user, then
leave it in.

Thanks,
Liam

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v2 2/3] userfaultfd: protect mmap_changing with rw_sem in userfaulfd_ctx
  2024-02-05 20:53                 ` Lokesh Gidra
@ 2024-02-07 15:27                   ` Mike Rapoport
  2024-02-07 20:24                     ` Lokesh Gidra
  0 siblings, 1 reply; 35+ messages in thread
From: Mike Rapoport @ 2024-02-07 15:27 UTC (permalink / raw)
  To: Lokesh Gidra
  Cc: Liam R. Howlett, akpm, linux-fsdevel, linux-mm, linux-kernel,
	selinux, surenb, kernel-team, aarcange, peterx, david,
	axelrasmussen, bgeffon, willy, jannh, kaleshsingh, ngeoffray,
	timmurray

On Mon, Feb 05, 2024 at 12:53:33PM -0800, Lokesh Gidra wrote:
> On Sun, Feb 4, 2024 at 2:27 AM Mike Rapoport <rppt@kernel.org> wrote:
> >
> > > 3) Based on [1] I see how mmap_changing helps in eliminating duplicate
> > > work (background copy) by uffd monitor, but didn't get if there is a
> > > correctness aspect too that I'm missing? I concur with Amit's point in
> > > [1] that getting -EEXIST when setting up the pte will avoid memory
> > > corruption, no?
> >
> > In the fork case without mmap_changing the child process may be get data or
> > zeroes depending on the race for mmap_lock between the fork and
> > uffdio_copy and -EEXIST is not enough for monitor to detect what was the
> > ordering between fork and uffdio_copy.
> 
> This is extremely helpful. IIUC, there is a window after mmap_lock
> (write-mode) is released and before the uffd monitor thread is
> notified of fork. In that window, the monitor doesn't know that fork
> has already happened. So, without mmap_changing it would have done
> background copy only in the parent, thereby causing data inconsistency
> between parent and child processes.

Yes.
 
> It seems to me that the correctness argument for mmap_changing is
> there in case of FORK event and REMAP when mremap is called with
> MREMAP_DONTUNMAP. In all other cases its only benefit is by avoiding
> unnecessary background copies, right?

Yes, I think you are right, but it's possible I've forgot some nasty race
that will need mmap_changing for other events.

> > > > > > > > > @@ -783,7 +788,9 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
> > > > > > > > >               return true;
> > > > > > > > >
> > > > > > > > >       userfaultfd_ctx_get(ctx);
> > > > > > > > > +     down_write(&ctx->map_changing_lock);
> > > > > > > > >       atomic_inc(&ctx->mmap_changing);
> > > > > > > > > +     up_write(&ctx->map_changing_lock);
> > > > > > > > >       mmap_read_unlock(mm);
> > > > > > > > >
> > > > > > > > >       msg_init(&ewq.msg);
> > > > > >
> > > > > > If this happens in read mode, then why are you waiting for the readers
> > > > > > to leave?  Can't you just increment the atomic?  It's fine happening in
> > > > > > read mode today, so it should be fine with this new rwsem.
> > > > >
> > > > > It's been a while and the details are blurred now, but if I remember
> > > > > correctly, having this in read mode forced non-cooperative uffd monitor to
> > > > > be single threaded. If a monitor runs, say uffdio_copy, and in parallel a
> > > > > thread in the monitored process does MADV_DONTNEED, the latter will wait
> > > > > for userfaultfd_remove notification to be processed in the monitor and drop
> > > > > the VMA contents only afterwards. If a non-cooperative monitor would
> > > > > process notification in parallel with uffdio ops, MADV_DONTNEED could
> > > > > continue and race with uffdio_copy, so read mode wouldn't be enough.
> > > > >
> > > >
> > > > Right now this function won't stop to wait for readers to exit the
> > > > critical section, but with this change there will be a pause (since the
> > > > down_write() will need to wait for the readers with the read lock).  So
> > > > this is adding a delay in this call path that isn't necessary (?) nor
> > > > existed before.  If you have non-cooperative uffd monitors, then you
> > > > will have to wait for them to finish to mark the uffd as being removed,
> > > > where as before it was a fire & forget, this is now a wait to tell.
> > > >
> > > I think a lot will be clearer once we get a response to my questions
> > > above. IMHO not only this write-lock is needed here, we need to fix
> > > userfaultfd_remove() by splitting it into userfaultfd_remove_prep()
> > > and userfaultfd_remove_complete() (like all other non-cooperative
> > > operations) as well. This patch enables us to do that as we remove
> > > mmap_changing's dependency on mmap_lock for synchronization.
> >
> > The write-lock is not a requirement here for correctness and I don't see
> > why we would need userfaultfd_remove_prep().
> >
> > As I've said earlier, having a write-lock here will let CRIU to run
> > background copy in parallel with processing of uffd events, but I don't
> > feel strongly about doing it.
> >
> Got it. Anyways, such a change needn't be part of this patch, so I'm
> going to keep it unchanged.

You mean with a read lock?

-- 
Sincerely yours,
Mike.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v2 2/3] userfaultfd: protect mmap_changing with rw_sem in userfaulfd_ctx
  2024-02-07 15:27                   ` Mike Rapoport
@ 2024-02-07 20:24                     ` Lokesh Gidra
  2024-02-12  8:14                       ` Mike Rapoport
  0 siblings, 1 reply; 35+ messages in thread
From: Lokesh Gidra @ 2024-02-07 20:24 UTC (permalink / raw)
  To: Mike Rapoport
  Cc: Liam R. Howlett, akpm, linux-fsdevel, linux-mm, linux-kernel,
	selinux, surenb, kernel-team, aarcange, peterx, david,
	axelrasmussen, bgeffon, willy, jannh, kaleshsingh, ngeoffray,
	timmurray

On Wed, Feb 7, 2024 at 7:27 AM Mike Rapoport <rppt@kernel.org> wrote:
>
> On Mon, Feb 05, 2024 at 12:53:33PM -0800, Lokesh Gidra wrote:
> > On Sun, Feb 4, 2024 at 2:27 AM Mike Rapoport <rppt@kernel.org> wrote:
> > >
> > > > 3) Based on [1] I see how mmap_changing helps in eliminating duplicate
> > > > work (background copy) by uffd monitor, but didn't get if there is a
> > > > correctness aspect too that I'm missing? I concur with Amit's point in
> > > > [1] that getting -EEXIST when setting up the pte will avoid memory
> > > > corruption, no?
> > >
> > > In the fork case without mmap_changing the child process may be get data or
> > > zeroes depending on the race for mmap_lock between the fork and
> > > uffdio_copy and -EEXIST is not enough for monitor to detect what was the
> > > ordering between fork and uffdio_copy.
> >
> > This is extremely helpful. IIUC, there is a window after mmap_lock
> > (write-mode) is released and before the uffd monitor thread is
> > notified of fork. In that window, the monitor doesn't know that fork
> > has already happened. So, without mmap_changing it would have done
> > background copy only in the parent, thereby causing data inconsistency
> > between parent and child processes.
>
> Yes.
>
> > It seems to me that the correctness argument for mmap_changing is
> > there in case of FORK event and REMAP when mremap is called with
> > MREMAP_DONTUNMAP. In all other cases its only benefit is by avoiding
> > unnecessary background copies, right?
>
> Yes, I think you are right, but it's possible I've forgot some nasty race
> that will need mmap_changing for other events.
>
> > > > > > > > > > @@ -783,7 +788,9 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
> > > > > > > > > >               return true;
> > > > > > > > > >
> > > > > > > > > >       userfaultfd_ctx_get(ctx);
> > > > > > > > > > +     down_write(&ctx->map_changing_lock);
> > > > > > > > > >       atomic_inc(&ctx->mmap_changing);
> > > > > > > > > > +     up_write(&ctx->map_changing_lock);
> > > > > > > > > >       mmap_read_unlock(mm);
> > > > > > > > > >
> > > > > > > > > >       msg_init(&ewq.msg);
> > > > > > >
> > > > > > > If this happens in read mode, then why are you waiting for the readers
> > > > > > > to leave?  Can't you just increment the atomic?  It's fine happening in
> > > > > > > read mode today, so it should be fine with this new rwsem.
> > > > > >
> > > > > > It's been a while and the details are blurred now, but if I remember
> > > > > > correctly, having this in read mode forced non-cooperative uffd monitor to
> > > > > > be single threaded. If a monitor runs, say uffdio_copy, and in parallel a
> > > > > > thread in the monitored process does MADV_DONTNEED, the latter will wait
> > > > > > for userfaultfd_remove notification to be processed in the monitor and drop
> > > > > > the VMA contents only afterwards. If a non-cooperative monitor would
> > > > > > process notification in parallel with uffdio ops, MADV_DONTNEED could
> > > > > > continue and race with uffdio_copy, so read mode wouldn't be enough.
> > > > > >
> > > > >
> > > > > Right now this function won't stop to wait for readers to exit the
> > > > > critical section, but with this change there will be a pause (since the
> > > > > down_write() will need to wait for the readers with the read lock).  So
> > > > > this is adding a delay in this call path that isn't necessary (?) nor
> > > > > existed before.  If you have non-cooperative uffd monitors, then you
> > > > > will have to wait for them to finish to mark the uffd as being removed,
> > > > > where as before it was a fire & forget, this is now a wait to tell.
> > > > >
> > > > I think a lot will be clearer once we get a response to my questions
> > > > above. IMHO not only this write-lock is needed here, we need to fix
> > > > userfaultfd_remove() by splitting it into userfaultfd_remove_prep()
> > > > and userfaultfd_remove_complete() (like all other non-cooperative
> > > > operations) as well. This patch enables us to do that as we remove
> > > > mmap_changing's dependency on mmap_lock for synchronization.
> > >
> > > The write-lock is not a requirement here for correctness and I don't see
> > > why we would need userfaultfd_remove_prep().
> > >
> > > As I've said earlier, having a write-lock here will let CRIU to run
> > > background copy in parallel with processing of uffd events, but I don't
> > > feel strongly about doing it.
> > >
> > Got it. Anyways, such a change needn't be part of this patch, so I'm
> > going to keep it unchanged.
>
> You mean with a read lock?

No, I think write lock is good as it enables parallel background copy.
Also because it brings consistency in blocking userfaultfd operations.

I meant encapsulating remove operations within
userfaultfd_remove_prep() and userfaultfd_remove_complete(). I
couldn't figure out any need for that.


>
> --
> Sincerely yours,
> Mike.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v2 2/3] userfaultfd: protect mmap_changing with rw_sem in userfaulfd_ctx
  2024-02-07 20:24                     ` Lokesh Gidra
@ 2024-02-12  8:14                       ` Mike Rapoport
  0 siblings, 0 replies; 35+ messages in thread
From: Mike Rapoport @ 2024-02-12  8:14 UTC (permalink / raw)
  To: Lokesh Gidra
  Cc: Liam R. Howlett, akpm, linux-fsdevel, linux-mm, linux-kernel,
	selinux, surenb, kernel-team, aarcange, peterx, david,
	axelrasmussen, bgeffon, willy, jannh, kaleshsingh, ngeoffray,
	timmurray

On Wed, Feb 07, 2024 at 12:24:52PM -0800, Lokesh Gidra wrote:
> On Wed, Feb 7, 2024 at 7:27 AM Mike Rapoport <rppt@kernel.org> wrote:
> >
> > > > The write-lock is not a requirement here for correctness and I don't see
> > > > why we would need userfaultfd_remove_prep().
> > > >
> > > > As I've said earlier, having a write-lock here will let CRIU to run
> > > > background copy in parallel with processing of uffd events, but I don't
> > > > feel strongly about doing it.
> > > >
> > > Got it. Anyways, such a change needn't be part of this patch, so I'm
> > > going to keep it unchanged.
> >
> > You mean with a read lock?
> 
> No, I think write lock is good as it enables parallel background copy.
> Also because it brings consistency in blocking userfaultfd operations.
> 
> I meant encapsulating remove operations within
> userfaultfd_remove_prep() and userfaultfd_remove_complete(). I
> couldn't figure out any need for that.

I don't think there is a need for that. With fork/mremap prep is required
to ensure there's uffd context for new vmas.
 
-- 
Sincerely yours,
Mike.

^ permalink raw reply	[flat|nested] 35+ messages in thread

end of thread, other threads:[~2024-02-12  8:14 UTC | newest]

Thread overview: 35+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-01-29 19:35 [PATCH v2 0/3] per-vma locks in userfaultfd Lokesh Gidra
2024-01-29 19:35 ` [PATCH v2 1/3] userfaultfd: move userfaultfd_ctx struct to header file Lokesh Gidra
2024-01-30  7:12   ` Mike Rapoport
2024-01-29 19:35 ` [PATCH v2 2/3] userfaultfd: protect mmap_changing with rw_sem in userfaulfd_ctx Lokesh Gidra
2024-01-29 21:00   ` Liam R. Howlett
2024-01-29 22:35     ` Lokesh Gidra
2024-01-30  3:46       ` Liam R. Howlett
2024-01-30  8:55         ` Mike Rapoport
2024-01-30 17:28           ` Liam R. Howlett
2024-01-31  2:24             ` Lokesh Gidra
2024-02-04 10:27               ` Mike Rapoport
2024-02-05 20:53                 ` Lokesh Gidra
2024-02-07 15:27                   ` Mike Rapoport
2024-02-07 20:24                     ` Lokesh Gidra
2024-02-12  8:14                       ` Mike Rapoport
2024-01-30  7:21   ` Mike Rapoport
2024-01-29 19:35 ` [PATCH v2 3/3] userfaultfd: use per-vma locks in userfaultfd operations Lokesh Gidra
2024-01-29 20:36   ` Liam R. Howlett
2024-01-29 20:52     ` Suren Baghdasaryan
2024-01-29 21:18       ` Liam R. Howlett
2024-01-30  0:28       ` Lokesh Gidra
2024-01-30  2:58         ` Liam R. Howlett
2024-01-31  2:49           ` Lokesh Gidra
2024-01-31 21:41             ` Liam R. Howlett
2024-02-05 21:46               ` Suren Baghdasaryan
2024-02-05 21:54                 ` Lokesh Gidra
2024-02-05 22:00                   ` Liam R. Howlett
2024-02-05 22:24                     ` Lokesh Gidra
2024-02-06 14:35                       ` Liam R. Howlett
2024-02-06 16:26                         ` Lokesh Gidra
2024-02-06 17:07                           ` Liam R. Howlett
2024-01-31  3:03           ` Suren Baghdasaryan
2024-01-31 21:43             ` Liam R. Howlett
2024-01-29 20:39 ` [PATCH v2 0/3] per-vma locks in userfaultfd Liam R. Howlett
2024-01-29 21:58   ` Lokesh Gidra

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).