linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Nadav Amit <nadav.amit@gmail.com>
To: linux-fsdevel@vger.kernel.org
Cc: Nadav Amit <namit@vmware.com>, Jens Axboe <axboe@kernel.dk>,
	Andrea Arcangeli <aarcange@redhat.com>,
	Peter Xu <peterx@redhat.com>,
	Alexander Viro <viro@zeniv.linux.org.uk>,
	io-uring@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-mm@kvack.org
Subject: [RFC PATCH 10/13] fs/userfaultfd: add write_iter() interface
Date: Sat, 28 Nov 2020 16:45:45 -0800	[thread overview]
Message-ID: <20201129004548.1619714-11-namit@vmware.com> (raw)
In-Reply-To: <20201129004548.1619714-1-namit@vmware.com>

From: Nadav Amit <namit@vmware.com>

In order to use userfaultfd with io-uring, there are two options for
extensions: support userfaultfd ioctls or provide similar functionality
through the "write" interface. The latter approach seems more compelling
as it does not require io-uring changes, and keeps all the logic of
userfaultfd where it should be. In addition it allows to provide
asynchronous completions by performing the copying/zeroing in the
faulting thread (which will be done in a later patch).

This patch enhances the userfaultfd API to provide write interface to
perform similar operations for copy/zero. The lower bits of the position
(smaller than PAGE_SHIFT) are being used to encode the required
operation: zero/copy/wake/write-protect. In the case of zeroing, the
source data is ignored and only the length is being used to determine
the size of the data that needs to be zeroed.

Cc: Jens Axboe <axboe@kernel.dk>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: io-uring@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-mm@kvack.org
Signed-off-by: Nadav Amit <namit@vmware.com>
---
 fs/userfaultfd.c                 | 96 +++++++++++++++++++++++++++++++-
 include/uapi/linux/userfaultfd.h | 14 ++++-
 2 files changed, 107 insertions(+), 3 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 7bbee2a00d37..eae6ac303951 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1140,6 +1140,34 @@ static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
 
 static const struct file_operations userfaultfd_fops;
 
+/* Open-coded version of anon_inode_getfd() to setup FMODE_PWRITE */
+static int userfaultfd_getfd(const char *name, const struct file_operations *fops,
+		     void *priv, int flags)
+{
+	int error, fd;
+	struct file *file;
+
+	error = get_unused_fd_flags(flags);
+	if (error < 0)
+		return error;
+	fd = error;
+
+	file = anon_inode_getfile(name, fops, priv, flags);
+
+	if (IS_ERR(file)) {
+		error = PTR_ERR(file);
+		goto err_put_unused_fd;
+	}
+	file->f_mode |= FMODE_PWRITE;
+	fd_install(fd, file);
+
+	return fd;
+
+err_put_unused_fd:
+	put_unused_fd(fd);
+	return error;
+}
+
 static int resolve_userfault_fork(struct userfaultfd_ctx *ctx,
 				  struct userfaultfd_ctx *new,
 				  struct uffd_msg *msg)
@@ -1161,7 +1189,7 @@ static int resolve_userfault_fork(struct userfaultfd_ctx *ctx,
 		task_unlock(current);
 	}
 
-	fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, new,
+	fd = userfaultfd_getfd("[userfaultfd]", &userfaultfd_fops, new,
 			      O_RDWR | (new->flags & UFFD_SHARED_FCNTL_FLAGS));
 
 	if (files != NULL) {
@@ -1496,6 +1524,69 @@ static __always_inline int validate_range(struct mm_struct *mm,
 	return 0;
 }
 
+ssize_t userfaultfd_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	struct userfaultfd_wake_range range;
+	struct userfaultfd_ctx *ctx = file->private_data;
+	size_t len = iov_iter_count(from);
+	__u64 dst = iocb->ki_pos & PAGE_MASK;
+	unsigned long mode = iocb->ki_pos & ~PAGE_MASK;
+	bool zeropage;
+	__s64 ret;
+
+	BUG_ON(len == 0);
+
+	zeropage = mode & UFFDIO_WRITE_MODE_ZEROPAGE;
+
+	ret = -EINVAL;
+	if (mode & ~(UFFDIO_WRITE_MODE_DONTWAKE | UFFDIO_WRITE_MODE_WP |
+		     UFFDIO_WRITE_MODE_ZEROPAGE))
+		goto out;
+
+	mode = mode & (UFFDIO_WRITE_MODE_DONTWAKE | UFFDIO_WRITE_MODE_WP);
+
+	/*
+	 * Keep compatibility with zeropage ioctl, which does not allow
+	 * write-protect and dontwake.
+	 */
+	if (zeropage &&
+	    (mode & (UFFDIO_WRITE_MODE_DONTWAKE | UFFDIO_WRITE_MODE_WP)) ==
+	     (UFFDIO_WRITE_MODE_DONTWAKE | UFFDIO_WRITE_MODE_WP))
+		goto out;
+
+	ret = -EAGAIN;
+	if (READ_ONCE(ctx->mmap_changing))
+		goto out;
+
+	ret = validate_range(ctx->mm, &dst, len);
+	if (ret)
+		goto out;
+
+	if (mmget_not_zero(ctx->mm)) {
+		if (zeropage)
+			ret = mfill_zeropage(ctx->mm, dst, from,
+					     &ctx->mmap_changing);
+		else
+			ret = mcopy_atomic(ctx->mm, dst, from,
+					   &ctx->mmap_changing, mode);
+		mmput(ctx->mm);
+	} else {
+		return -ESRCH;
+	}
+	if (ret < 0)
+		goto out;
+
+	/* len == 0 would wake all */
+	range.len = ret;
+	if (!(mode & UFFDIO_COPY_MODE_DONTWAKE)) {
+		range.start = dst;
+		wake_userfault(ctx, &range);
+	}
+out:
+	return ret;
+}
+
 static inline bool vma_can_userfault(struct vm_area_struct *vma,
 				     unsigned long vm_flags)
 {
@@ -2197,6 +2288,7 @@ static const struct file_operations userfaultfd_fops = {
 	.release	= userfaultfd_release,
 	.poll		= userfaultfd_poll,
 	.read_iter	= userfaultfd_read_iter,
+	.write_iter	= userfaultfd_write_iter,
 	.unlocked_ioctl = userfaultfd_ioctl,
 	.compat_ioctl	= compat_ptr_ioctl,
 	.llseek		= noop_llseek,
@@ -2248,7 +2340,7 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
 
 	ctx->files = get_files_struct(current);
 
-	fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, ctx,
+	fd = userfaultfd_getfd("[userfaultfd]", &userfaultfd_fops, ctx,
 			      O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
 	if (fd < 0) {
 		mmdrop(ctx->mm);
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 4eeba4235afe..943e50b41742 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -28,7 +28,8 @@
 			   UFFD_FEATURE_MISSING_SHMEM |		\
 			   UFFD_FEATURE_SIGBUS |		\
 			   UFFD_FEATURE_THREAD_ID |		\
-			   UFFD_FEATURE_POLL)
+			   UFFD_FEATURE_POLL |			\
+			   UFFD_FEATURE_WRITE)
 
 #define UFFD_API_IOCTLS				\
 	((__u64)1 << _UFFDIO_REGISTER |		\
@@ -177,6 +178,9 @@ struct uffdio_api {
 	 * UFFD_FEATURE_POLL polls upon page-fault if the feature is requested
 	 * instead of descheduling. This feature should only be enabled for
 	 * low-latency handlers and when CPUs are not overcomitted.
+	 *
+	 * UFFD_FEATURE_WRITE allows to use the write interface for copy and
+	 * zeroing of pages in addition to the ioctl interface.
 	 */
 #define UFFD_FEATURE_PAGEFAULT_FLAG_WP		(1<<0)
 #define UFFD_FEATURE_EVENT_FORK			(1<<1)
@@ -188,6 +192,7 @@ struct uffdio_api {
 #define UFFD_FEATURE_SIGBUS			(1<<7)
 #define UFFD_FEATURE_THREAD_ID			(1<<8)
 #define UFFD_FEATURE_POLL			(1<<9)
+#define UFFD_FEATURE_WRITE			(1<<10)
 	__u64 features;
 
 	__u64 ioctls;
@@ -264,4 +269,11 @@ struct uffdio_writeprotect {
 	__u64 mode;
 };
 
+/*
+ * Write modes to be use with UFFDIO_SET_WRITE_MODE ioctl.
+ */
+#define UFFDIO_WRITE_MODE_DONTWAKE		UFFDIO_COPY_MODE_DONTWAKE
+#define UFFDIO_WRITE_MODE_WP			UFFDIO_COPY_MODE_WP
+#define UFFDIO_WRITE_MODE_ZEROPAGE		((__u64)1<<2)
+
 #endif /* _LINUX_USERFAULTFD_H */
-- 
2.25.1


  parent reply	other threads:[~2020-11-29  0:51 UTC|newest]

Thread overview: 25+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-11-29  0:45 [RFC PATCH 00/13] fs/userfaultfd: support iouring and polling Nadav Amit
2020-11-29  0:45 ` [RFC PATCH 01/13] fs/userfaultfd: fix wrong error code on WP & !VM_MAYWRITE Nadav Amit
2020-12-01 21:22   ` Mike Kravetz
2020-12-21 19:01     ` Peter Xu
2020-11-29  0:45 ` [RFC PATCH 02/13] fs/userfaultfd: fix wrong file usage with iouring Nadav Amit
2020-11-29  0:45 ` [RFC PATCH 03/13] selftests/vm/userfaultfd: wake after copy failure Nadav Amit
2020-12-21 19:28   ` Peter Xu
2020-12-21 19:51     ` Nadav Amit
2020-12-21 20:52       ` Peter Xu
2020-12-21 20:54         ` Nadav Amit
2020-11-29  0:45 ` [RFC PATCH 04/13] fs/userfaultfd: simplify locks in userfaultfd_ctx_read Nadav Amit
2020-11-29  0:45 ` [RFC PATCH 05/13] fs/userfaultfd: introduce UFFD_FEATURE_POLL Nadav Amit
2020-11-29  0:45 ` [RFC PATCH 06/13] iov_iter: support atomic copy_page_from_iter_iovec() Nadav Amit
2020-11-29  0:45 ` [RFC PATCH 07/13] fs/userfaultfd: support read_iter to use io_uring Nadav Amit
2020-11-30 18:20   ` Jens Axboe
2020-11-30 19:23     ` Nadav Amit
2020-11-29  0:45 ` [RFC PATCH 08/13] fs/userfaultfd: complete reads asynchronously Nadav Amit
2020-11-29  0:45 ` [RFC PATCH 09/13] fs/userfaultfd: use iov_iter for copy/zero Nadav Amit
2020-11-29  0:45 ` Nadav Amit [this message]
2020-11-29  0:45 ` [RFC PATCH 11/13] fs/userfaultfd: complete write asynchronously Nadav Amit
2020-12-02  7:12   ` Nadav Amit
2020-11-29  0:45 ` [RFC PATCH 12/13] fs/userfaultfd: kmem-cache for wait-queue objects Nadav Amit
2020-11-30 19:51   ` Nadav Amit
2020-12-03  5:19   ` [fs/userfaultfd] fec9227821: will-it-scale.per_process_ops -5.5% regression kernel test robot
2020-11-29  0:45 ` [RFC PATCH 13/13] selftests/vm/userfaultfd: iouring and polling tests Nadav Amit

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20201129004548.1619714-11-namit@vmware.com \
    --to=nadav.amit@gmail.com \
    --cc=aarcange@redhat.com \
    --cc=axboe@kernel.dk \
    --cc=io-uring@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=namit@vmware.com \
    --cc=peterx@redhat.com \
    --cc=viro@zeniv.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).