io-uring.vger.kernel.org archive mirror
* [PATCH] io_uring: Add vmsplice support
@ 2021-01-05 23:00 arni
  2021-01-05 23:00 ` [PATCH 1/2] splice: Make vmsplice public arni
  2021-01-05 23:00 ` [PATCH 2/2] io_uring: Add vmsplice support arni
  0 siblings, 2 replies; 4+ messages in thread
From: arni @ 2021-01-05 23:00 UTC (permalink / raw)
  To: io-uring; +Cc: axboe

This patchset is a follow-up to my last email, which can be found at
https://lore.kernel.org/io-uring/20210103222117.905850-1-arni@dagur.eu/

Thanks for your feedback, Jens. I have modified the test app on my end as
well; it now looks like the following:
https://gist.githubusercontent.com/ArniDagur/3392a787e89e78ba8ff739ff0f8476d5/raw/d01d19bb6fdc3defea59ae7c2a2c3d29682d8520/main.c

As you suggested, I now always return -EAGAIN when force_nonblock is set.
In addition, req_set_fail_links() is now called when less than the full
buffer is spliced, and io_req_complete() is called at the end.
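
For reviewers who don't want to open the gist, here is a minimal
(untested) sketch of the userspace side. The SQE fields match what
io_vmsplice_prep() in patch 2/2 reads (addr = iovec array, len = number
of iovecs, splice_flags = SPLICE_F_* flags); io_uring_prep_rw() is used
because liburing has no prep helper for this opcode:

#include <liburing.h>
#include <string.h>
#include <sys/uio.h>

/* vmsplice one iovec into the write end of a pipe; error handling
 * omitted. IORING_OP_VMSPLICE is the opcode added by patch 2/2. */
int vmsplice_example(int pipe_wr_fd)
{
	static char buf[] = "hello from io_uring vmsplice";
	struct iovec iov = { .iov_base = buf, .iov_len = strlen(buf) };
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int ret;

	io_uring_queue_init(4, &ring, 0);
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_rw(IORING_OP_VMSPLICE, sqe, pipe_wr_fd, &iov, 1, 0);
	sqe->splice_flags = 0;

	io_uring_submit(&ring);
	io_uring_wait_cqe(&ring, &cqe);
	ret = cqe->res;			/* bytes spliced, or -errno */
	io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	return ret;
}

The iovec and buffer are kept alive until the CQE is reaped, since the
kernel may read them after submission.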




* [PATCH 1/2] splice: Make vmsplice public
  2021-01-05 23:00 [PATCH] io_uring: Add vmsplice support arni
@ 2021-01-05 23:00 ` arni
  2021-01-05 23:00 ` [PATCH 2/2] io_uring: Add vmsplice support arni
  1 sibling, 0 replies; 4+ messages in thread
From: arni @ 2021-01-05 23:00 UTC (permalink / raw)
  To: io-uring; +Cc: axboe, Árni Dagur

From: Árni Dagur <arni@dagur.eu>

Create a public function do_vmsplice() so that other parts of the
kernel can use it.

Signed-off-by: Árni Dagur <arni@dagur.eu>
---
 fs/splice.c            | 21 +++++++++++++++------
 include/linux/splice.h |  2 +-
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/fs/splice.c b/fs/splice.c
index 866d5c2367b2..2d653a20cced 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1270,6 +1270,20 @@ static int vmsplice_type(struct fd f, int *type)
 	return 0;
 }
 
+long do_vmsplice(struct file *file, struct iov_iter *iter, unsigned int flags)
+{
+	long error;
+
+	if (!iov_iter_count(iter))
+		error = 0;
+	else if (iov_iter_rw(iter) == WRITE)
+		error = vmsplice_to_pipe(file, iter, flags);
+	else
+		error = vmsplice_to_user(file, iter, flags);
+
+	return error;
+}
+
 /*
  * Note that vmsplice only really supports true splicing _from_ user memory
  * to a pipe, not the other way around. Splicing from user memory is a simple
@@ -1309,12 +1323,7 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
 	if (error < 0)
 		goto out_fdput;
 
-	if (!iov_iter_count(&iter))
-		error = 0;
-	else if (iov_iter_rw(&iter) == WRITE)
-		error = vmsplice_to_pipe(f.file, &iter, flags);
-	else
-		error = vmsplice_to_user(f.file, &iter, flags);
+	error = do_vmsplice(f.file, &iter, flags);
 
 	kfree(iov);
 out_fdput:
diff --git a/include/linux/splice.h b/include/linux/splice.h
index a55179fd60fc..44c0e612f652 100644
--- a/include/linux/splice.h
+++ b/include/linux/splice.h
@@ -81,9 +81,9 @@ extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *,
 extern long do_splice(struct file *in, loff_t *off_in,
 		      struct file *out, loff_t *off_out,
 		      size_t len, unsigned int flags);
-
 extern long do_tee(struct file *in, struct file *out, size_t len,
 		   unsigned int flags);
+extern long do_vmsplice(struct file *file, struct iov_iter *iter, unsigned int flags);
 
 /*
  * for dynamic pipe sizing
-- 
2.30.0



* [PATCH 2/2] io_uring: Add vmsplice support
  2021-01-05 23:00 [PATCH] io_uring: Add vmsplice support arni
  2021-01-05 23:00 ` [PATCH 1/2] splice: Make vmsplice public arni
@ 2021-01-05 23:00 ` arni
  2021-01-05 23:43   ` Pavel Begunkov
  1 sibling, 1 reply; 4+ messages in thread
From: arni @ 2021-01-05 23:00 UTC (permalink / raw)
  To: io-uring; +Cc: axboe, Árni Dagur

From: Árni Dagur <arni@dagur.eu>

* The `sqe->splice_flags` field is used to hold flags.
* We return -EAGAIN if force_nonblock is set.

Signed-off-by: Árni Dagur <arni@dagur.eu>
---
 fs/io_uring.c                 | 76 +++++++++++++++++++++++++++++++++++
 include/uapi/linux/io_uring.h |  1 +
 2 files changed, 77 insertions(+)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ca46f314640b..a99a89798386 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -531,6 +531,13 @@ struct io_splice {
 	unsigned int			flags;
 };
 
+struct io_vmsplice {
+	struct file			*file;
+	u64				addr;
+	u64				len;
+	unsigned int			flags;
+};
+
 struct io_provide_buf {
 	struct file			*file;
 	__u64				addr;
@@ -692,6 +699,7 @@ struct io_kiocb {
 		struct io_madvise	madvise;
 		struct io_epoll		epoll;
 		struct io_splice	splice;
+		struct io_vmsplice	vmsplice;
 		struct io_provide_buf	pbuf;
 		struct io_statx		statx;
 		struct io_shutdown	shutdown;
@@ -967,6 +975,12 @@ static const struct io_op_def io_op_defs[] = {
 		.unbound_nonreg_file	= 1,
 		.work_flags		= IO_WQ_WORK_BLKCG,
 	},
+	[IORING_OP_VMSPLICE] = {
+		.needs_file = 1,
+		.hash_reg_file		= 1,
+		.unbound_nonreg_file	= 1,
+		.work_flags		= IO_WQ_WORK_MM,
+	},
 	[IORING_OP_PROVIDE_BUFFERS] = {},
 	[IORING_OP_REMOVE_BUFFERS] = {},
 	[IORING_OP_TEE] = {
@@ -3884,6 +3898,63 @@ static int io_splice(struct io_kiocb *req, bool force_nonblock)
 	return 0;
 }
 
+static int io_vmsplice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_vmsplice *sp = &req->vmsplice;
+
+	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
+	if (unlikely(READ_ONCE(sqe->off)))
+		return -EINVAL;
+
+	sp->addr = READ_ONCE(sqe->addr);
+	sp->len = READ_ONCE(sqe->len);
+	sp->flags = READ_ONCE(sqe->splice_flags);
+
+	if (sp->flags & ~SPLICE_F_ALL)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int io_vmsplice(struct io_kiocb *req, bool force_nonblock)
+{
+	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+	struct io_vmsplice *sp = &req->vmsplice;
+	void __user *buf = u64_to_user_ptr(sp->addr);
+	struct iov_iter __iter, *iter = &__iter;
+	struct file *file = sp->file;
+	ssize_t io_size;
+	int type, ret;
+
+	if (force_nonblock)
+		return -EAGAIN;
+
+	if (file->f_mode & FMODE_WRITE)
+		type = WRITE;
+	else if (file->f_mode & FMODE_READ)
+		type = READ;
+	else {
+		ret = -EBADF;
+		goto err;
+	}
+
+	ret = __import_iovec(type, buf, sp->len, UIO_FASTIOV, &iovec, iter,
+				req->ctx->compat);
+	if (ret < 0)
+		goto err;
+	io_size = iov_iter_count(iter);
+
+	ret = do_vmsplice(file, iter, sp->flags);
+	if (ret != io_size) {
+err:
+		req_set_fail_links(req);
+	}
+	io_req_complete(req, ret);
+	kfree(iovec);
+	return 0;
+}
+
 /*
  * IORING_OP_NOP just posts a completion event, nothing else.
  */
@@ -6009,6 +6080,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		return io_epoll_ctl_prep(req, sqe);
 	case IORING_OP_SPLICE:
 		return io_splice_prep(req, sqe);
+	case IORING_OP_VMSPLICE:
+		return io_vmsplice_prep(req, sqe);
 	case IORING_OP_PROVIDE_BUFFERS:
 		return io_provide_buffers_prep(req, sqe);
 	case IORING_OP_REMOVE_BUFFERS:
@@ -6262,6 +6335,9 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock,
 	case IORING_OP_SPLICE:
 		ret = io_splice(req, force_nonblock);
 		break;
+	case IORING_OP_VMSPLICE:
+		ret = io_vmsplice(req, force_nonblock);
+		break;
 	case IORING_OP_PROVIDE_BUFFERS:
 		ret = io_provide_buffers(req, force_nonblock, cs);
 		break;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index d31a2a1e8ef9..6bc79f9bb123 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -137,6 +137,7 @@ enum {
 	IORING_OP_SHUTDOWN,
 	IORING_OP_RENAMEAT,
 	IORING_OP_UNLINKAT,
+	IORING_OP_VMSPLICE,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
-- 
2.30.0



* Re: [PATCH 2/2] io_uring: Add vmsplice support
  2021-01-05 23:00 ` [PATCH 2/2] io_uring: Add vmsplice support arni
@ 2021-01-05 23:43   ` Pavel Begunkov
  0 siblings, 0 replies; 4+ messages in thread
From: Pavel Begunkov @ 2021-01-05 23:43 UTC (permalink / raw)
  To: arni, io-uring; +Cc: axboe

On 05/01/2021 23:00, arni@dagur.eu wrote:
> From: Árni Dagur <arni@dagur.eu>
> 
> * The `sqe->splice_flags` field is used to hold flags.
> * We return -EAGAIN if force_nonblock is set.
> 
> Signed-off-by: Árni Dagur <arni@dagur.eu>
> ---
>  fs/io_uring.c                 | 76 +++++++++++++++++++++++++++++++++++
>  include/uapi/linux/io_uring.h |  1 +
>  2 files changed, 77 insertions(+)
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index ca46f314640b..a99a89798386 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -531,6 +531,13 @@ struct io_splice {
>  	unsigned int			flags;
>  };
>  
> +struct io_vmsplice {
> +	struct file			*file;
> +	u64				addr;
> +	u64				len;
> +	unsigned int			flags;
> +};
> +
>  struct io_provide_buf {
>  	struct file			*file;
>  	__u64				addr;
> @@ -692,6 +699,7 @@ struct io_kiocb {
>  		struct io_madvise	madvise;
>  		struct io_epoll		epoll;
>  		struct io_splice	splice;
> +		struct io_vmsplice	vmsplice;
>  		struct io_provide_buf	pbuf;
>  		struct io_statx		statx;
>  		struct io_shutdown	shutdown;
> @@ -967,6 +975,12 @@ static const struct io_op_def io_op_defs[] = {
>  		.unbound_nonreg_file	= 1,
>  		.work_flags		= IO_WQ_WORK_BLKCG,
>  	},
> +	[IORING_OP_VMSPLICE] = {
> +		.needs_file = 1,
> +		.hash_reg_file		= 1,
> +		.unbound_nonreg_file	= 1,
> +		.work_flags		= IO_WQ_WORK_MM,
> +	},
>  	[IORING_OP_PROVIDE_BUFFERS] = {},
>  	[IORING_OP_REMOVE_BUFFERS] = {},
>  	[IORING_OP_TEE] = {
> @@ -3884,6 +3898,63 @@ static int io_splice(struct io_kiocb *req, bool force_nonblock)
>  	return 0;
>  }
>  
> +static int io_vmsplice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> +{
> +	struct io_vmsplice *sp = &req->vmsplice;
> +
> +	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
> +		return -EINVAL;
> +	if (unlikely(READ_ONCE(sqe->off)))
> +		return -EINVAL;
> +
> +	sp->addr = READ_ONCE(sqe->addr);
> +	sp->len = READ_ONCE(sqe->len);
> +	sp->flags = READ_ONCE(sqe->splice_flags);
> +
> +	if (sp->flags & ~SPLICE_F_ALL)
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
> +static int io_vmsplice(struct io_kiocb *req, bool force_nonblock)
> +{
> +	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
> +	struct io_vmsplice *sp = &req->vmsplice;
> +	void __user *buf = u64_to_user_ptr(sp->addr);

const struct iovec __user *uiov

> +	struct iov_iter __iter, *iter = &__iter;

read/write either use ((struct io_async_rw *)req->async_data)->iter or,
to avoid an allocation, an on-stack iter. This only has the on-stack
__iter, so why do you need *iter?
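
IOW, just declare it on the stack and pass it directly (sketch):

	struct iov_iter iter;

	ret = __import_iovec(type, buf, sp->len, UIO_FASTIOV, &iovec, &iter,
				req->ctx->compat);

and then use &iter for iov_iter_count() / do_vmsplice() below.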

> +	struct file *file = sp->file;
> +	ssize_t io_size;
> +	int type, ret;
> +
> +	if (force_nonblock)
> +		return -EAGAIN;
> +
> +	if (file->f_mode & FMODE_WRITE)
> +		type = WRITE;
> +	else if (file->f_mode & FMODE_READ)
> +		type = READ;
> +	else {
> +		ret = -EBADF;
> +		goto err;

it jumps to kfree(iovec) while iovec = inline_vecs, i.e. it kfree()s an
on-stack array

> +	}
> +
> +	ret = __import_iovec(type, buf, sp->len, UIO_FASTIOV, &iovec, iter,
> +				req->ctx->compat);

This may happen asynchronously, long after io_uring_enter(submit) has
returned, e.g. if a user keeps uiov on the stack it will fail or read
garbage.

So either make it part of the ABI -- users must not free uiov until the
request completes -- or copy it while we're not yet async. For
consistency with read/write I'd prefer the second.
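
To illustrate the second option, a rough sketch only -- struct
io_async_vmsplice and the hook below don't exist, they'd mirror
io_async_rw / io_rw_prep_async() and need async_data wired up in
io_op_defs:

struct io_async_vmsplice {
	struct iovec		fast_iov[UIO_FASTIOV];
	struct iovec		*free_iovec;	/* NULL if fast_iov was enough */
	struct iov_iter		iter;
};

/* runs before the request is punted, i.e. still in the submitter's context */
static int io_vmsplice_prep_async(struct io_kiocb *req, int rw)
{
	struct io_async_vmsplice *av = req->async_data;
	struct io_vmsplice *sp = &req->vmsplice;
	struct iovec *iov = av->fast_iov;
	ssize_t ret;

	ret = __import_iovec(rw, u64_to_user_ptr(sp->addr), sp->len,
			     UIO_FASTIOV, &iov, &av->iter, req->ctx->compat);
	if (ret < 0)
		return ret;
	av->free_iovec = iov;
	return 0;
}

io_vmsplice() would then only touch av->iter and kfree() av->free_iovec
on completion, so uiov is never read after submission returns.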

> +	if (ret < 0)
> +		goto err;
> +	io_size = iov_iter_count(iter);
> +
> +	ret = do_vmsplice(file, iter, sp->flags);
> +	if (ret != io_size) {
> +err:
> +		req_set_fail_links(req);
> +	}
> +	io_req_complete(req, ret);
> +	kfree(iovec);
> +	return 0;
> +}
> +
>  /*
>   * IORING_OP_NOP just posts a completion event, nothing else.
>   */
> @@ -6009,6 +6080,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>  		return io_epoll_ctl_prep(req, sqe);
>  	case IORING_OP_SPLICE:
>  		return io_splice_prep(req, sqe);
> +	case IORING_OP_VMSPLICE:
> +		return io_vmsplice_prep(req, sqe);
>  	case IORING_OP_PROVIDE_BUFFERS:
>  		return io_provide_buffers_prep(req, sqe);
>  	case IORING_OP_REMOVE_BUFFERS:
> @@ -6262,6 +6335,9 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock,
>  	case IORING_OP_SPLICE:
>  		ret = io_splice(req, force_nonblock);
>  		break;
> +	case IORING_OP_VMSPLICE:
> +		ret = io_vmsplice(req, force_nonblock);
> +		break;
>  	case IORING_OP_PROVIDE_BUFFERS:
>  		ret = io_provide_buffers(req, force_nonblock, cs);
>  		break;
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index d31a2a1e8ef9..6bc79f9bb123 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -137,6 +137,7 @@ enum {
>  	IORING_OP_SHUTDOWN,
>  	IORING_OP_RENAMEAT,
>  	IORING_OP_UNLINKAT,
> +	IORING_OP_VMSPLICE,
>  
>  	/* this goes last, obviously */
>  	IORING_OP_LAST,
> 

-- 
Pavel Begunkov


