All of lore.kernel.org
 help / color / mirror / Atom feed
* Questions regarding implementation of vmsplice in io_uring
@ 2021-01-03 22:22 arni
  2021-01-04 15:21 ` Jens Axboe
  0 siblings, 1 reply; 3+ messages in thread
From: arni @ 2021-01-03 22:22 UTC (permalink / raw)
  To: io-uring; +Cc: Árni Dagur

From: Árni Dagur <arnidg@protonmail.ch>

Hello,

For my first stab at kernel development, I wanted to try implementing
vmsplice for io_uring. I've attached the code I've written so far. I have two
questions to ask, sorry if this is not the right place.

1. Currently I use __import_iovec directly, instead of using
io_import_iovec. That's because I've created a new "kiocb" struct
called io_vmsplice, rather than using io_rw as io_import_iovec expects.
The reason I created a new struct is so that it can hold an unsigned int
for the flags argument -- which is not present in io_rw. Im guessing that I
should find a way to use io_import_iovec instead?

One way I can think of is giving the io_vmsplice struct the same initial
fields as io_rw, and letting io_import_iovec access the union as io_rw rather
than io_vmsplice. Coming from a Rust background however, this solution
sounds like a bug waiting to happen (if one of the structs is changed
but the other is not).

2. Whenever I run the test program at
https://gist.githubusercontent.com/ArniDagur/07d87aefae93868ca1bf10766194599d/raw/dc14a63649d530e5e29f0d1288f41ed54bc6b810/main.c
I get a BADF result value. The debugger tells me that this occurs
because `file->f_op != &pipefifo_fops` in get_pipe_info() in fs/pipe.c
(neither pointer is NULL).

I give the program the file descriptor "1". Shouldn't that always be a pipe?
Is there something obvious that I'm missing?

Thanks a lot!
-- Árni

---
 fs/io_uring.c                 | 66 +++++++++++++++++++++++++++++++++++
 fs/splice.c                   | 18 ++++++----
 include/linux/splice.h        |  2 +-
 include/uapi/linux/io_uring.h |  1 +
 4 files changed, 80 insertions(+), 7 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ca46f314640b..55dbbd4704c6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -531,6 +531,14 @@ struct io_splice {
 	unsigned int			flags;
 };
 
+struct io_vmsplice {
+	struct file			*file;
+	u64				addr;
+	u64				len;
+	unsigned int	flags;
+};
+
+
 struct io_provide_buf {
 	struct file			*file;
 	__u64				addr;
@@ -692,6 +700,7 @@ struct io_kiocb {
 		struct io_madvise	madvise;
 		struct io_epoll		epoll;
 		struct io_splice	splice;
+		struct io_vmsplice	vmsplice;
 		struct io_provide_buf	pbuf;
 		struct io_statx		statx;
 		struct io_shutdown	shutdown;
@@ -967,6 +976,11 @@ static const struct io_op_def io_op_defs[] = {
 		.unbound_nonreg_file	= 1,
 		.work_flags		= IO_WQ_WORK_BLKCG,
 	},
+	[IORING_OP_VMSPLICE] = {
+		.needs_file = 1,
+		.hash_reg_file		= 1,
+		.unbound_nonreg_file	= 1,

I couldn't find any information regarding what the work flags do, so
I've left them empty for now.

+	},
 	[IORING_OP_PROVIDE_BUFFERS] = {},
 	[IORING_OP_REMOVE_BUFFERS] = {},
 	[IORING_OP_TEE] = {
@@ -3884,6 +3898,53 @@ static int io_splice(struct io_kiocb *req, bool force_nonblock)
 	return 0;
 }
 
+
+static int io_vmsplice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) {
+	struct io_vmsplice* sp = &req->vmsplice;
+
+	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
+
+	if (unlikely(READ_ONCE(sqe->off)))
+		return -EINVAL;
+
+	sp->addr = READ_ONCE(sqe->addr);
+	sp->len = READ_ONCE(sqe->len);
+	sp->flags = READ_ONCE(sqe->splice_flags);
+
+	if (sp->flags & ~SPLICE_F_ALL)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int io_vmsplice(struct io_kiocb *req, bool force_nonblock) {
+	struct io_vmsplice* sp = &req->vmsplice;
+	struct file *file = sp->file;
+	int type;
+	int ret;
+
+	void __user *buf = u64_to_user_ptr(sp->addr);
+	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+	struct iov_iter __iter, *iter = &__iter;
+
+	if (file->f_mode & FMODE_WRITE) {
+		type = WRITE;
+	} else if (file->f_mode & FMODE_READ) {
+		type = READ;
+	} else {
+		return -EBADF;
+	}
+
+	ret = __import_iovec(type, buf, sp->len, UIO_FASTIOV, &iovec, iter, req->ctx->compat);
+	if (ret < 0)
+		return ret;
+
+	ret = do_vmsplice(file, iter, sp->flags);
+	kfree(iovec);
+	return ret;
+}
+
 /*
  * IORING_OP_NOP just posts a completion event, nothing else.
  */
@@ -6009,6 +6070,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		return io_epoll_ctl_prep(req, sqe);
 	case IORING_OP_SPLICE:
 		return io_splice_prep(req, sqe);
+	case IORING_OP_VMSPLICE:
+		return io_vmsplice_prep(req, sqe);
 	case IORING_OP_PROVIDE_BUFFERS:
 		return io_provide_buffers_prep(req, sqe);
 	case IORING_OP_REMOVE_BUFFERS:
@@ -6262,6 +6325,9 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock,
 	case IORING_OP_SPLICE:
 		ret = io_splice(req, force_nonblock);
 		break;
+	case IORING_OP_VMSPLICE:
+		ret = io_vmsplice(req, force_nonblock);
+		break;
 	case IORING_OP_PROVIDE_BUFFERS:
 		ret = io_provide_buffers(req, force_nonblock, cs);
 		break;
diff --git a/fs/splice.c b/fs/splice.c
index 866d5c2367b2..e9f1f27460a1 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1270,6 +1270,17 @@ static int vmsplice_type(struct fd f, int *type)
 	return 0;
 }
 
+long do_vmsplice(struct file *file, struct iov_iter *iter, unsigned int flags) {
+	long error;
+	if (!iov_iter_count(iter))
+		error = 0;
+	else if (iov_iter_rw(iter) == WRITE)
+		error = vmsplice_to_pipe(file, iter, flags);
+	else
+		error = vmsplice_to_user(file, iter, flags);
+	return error;
+}
+
 /*
  * Note that vmsplice only really supports true splicing _from_ user memory
  * to a pipe, not the other way around. Splicing from user memory is a simple
@@ -1309,12 +1320,7 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
 	if (error < 0)
 		goto out_fdput;
 
-	if (!iov_iter_count(&iter))
-		error = 0;
-	else if (iov_iter_rw(&iter) == WRITE)
-		error = vmsplice_to_pipe(f.file, &iter, flags);
-	else
-		error = vmsplice_to_user(f.file, &iter, flags);
+	error = do_vmsplice(f.file, &iter, flags);
 
 	kfree(iov);
 out_fdput:
diff --git a/include/linux/splice.h b/include/linux/splice.h
index a55179fd60fc..44c0e612f652 100644
--- a/include/linux/splice.h
+++ b/include/linux/splice.h
@@ -81,9 +81,9 @@ extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *,
 extern long do_splice(struct file *in, loff_t *off_in,
 		      struct file *out, loff_t *off_out,
 		      size_t len, unsigned int flags);
-
 extern long do_tee(struct file *in, struct file *out, size_t len,
 		   unsigned int flags);
+extern long do_vmsplice(struct file *file, struct iov_iter *iter, unsigned int flags);
 
 /*
  * for dynamic pipe sizing
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index d31a2a1e8ef9..6bc79f9bb123 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -137,6 +137,7 @@ enum {
 	IORING_OP_SHUTDOWN,
 	IORING_OP_RENAMEAT,
 	IORING_OP_UNLINKAT,
+	IORING_OP_VMSPLICE,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
-- 
2.30.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: Questions regarding implementation of vmsplice in io_uring
  2021-01-03 22:22 Questions regarding implementation of vmsplice in io_uring arni
@ 2021-01-04 15:21 ` Jens Axboe
  2021-01-04 15:37   ` Jens Axboe
  0 siblings, 1 reply; 3+ messages in thread
From: Jens Axboe @ 2021-01-04 15:21 UTC (permalink / raw)
  To: arni, io-uring; +Cc: Árni Dagur

On 1/3/21 3:22 PM, arni@dagur.eu wrote:
> From: Árni Dagur <arnidg@protonmail.ch>
> 
> Hello,
> 
> For my first stab at kernel development, I wanted to try implementing
> vmsplice for io_uring. I've attached the code I've written so far. I have two
> questions to ask, sorry if this is not the right place.
> 
> 1. Currently I use __import_iovec directly, instead of using
> io_import_iovec. That's because I've created a new "kiocb" struct
> called io_vmsplice, rather than using io_rw as io_import_iovec expects.
> The reason I created a new struct is so that it can hold an unsigned int
> for the flags argument -- which is not present in io_rw. Im guessing that I
> should find a way to use io_import_iovec instead?
> 
> One way I can think of is giving the io_vmsplice struct the same initial
> fields as io_rw, and letting io_import_iovec access the union as io_rw rather
> than io_vmsplice. Coming from a Rust background however, this solution
> sounds like a bug waiting to happen (if one of the structs is changed
> but the other is not).
> 
> 2. Whenever I run the test program at
> https://gist.githubusercontent.com/ArniDagur/07d87aefae93868ca1bf10766194599d/raw/dc14a63649d530e5e29f0d1288f41ed54bc6b810/main.c
> I get a BADF result value. The debugger tells me that this occurs
> because `file->f_op != &pipefifo_fops` in get_pipe_info() in fs/pipe.c
> (neither pointer is NULL).
> 
> I give the program the file descriptor "1". Shouldn't that always be a pipe?
> Is there something obvious that I'm missing?

The change looks reasonable, some changes needed around not blocking.
But you can't use the splice ops with a tty, you need to use an end of a
pipe. That's why you get -EBADF in your test program. I'm assuming you
didn't run the one you sent, because you're overwriting ->addr in that
one by setting splice_off_in after having assigned ->addr using the prep
function?

> @@ -967,6 +976,11 @@ static const struct io_op_def io_op_defs[] = {
>  		.unbound_nonreg_file	= 1,
>  		.work_flags		= IO_WQ_WORK_BLKCG,
>  	},
> +	[IORING_OP_VMSPLICE] = {
> +		.needs_file = 1,
> +		.hash_reg_file		= 1,
> +		.unbound_nonreg_file	= 1,
> 
> I couldn't find any information regarding what the work flags do, so
> I've left them empty for now.

As a minimum, you'd need IO_WQ_WORK_MM I think for the async part of it,
if we need to block.

Various style issues in here too, like lines that are too long and
function braces need to go on a new line (and no braces for single
lines). If you want to move further with this, also split it into two
patches. The first should do the abstraction needed for splice.[ch] and
the second should be the io_uring change.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: Questions regarding implementation of vmsplice in io_uring
  2021-01-04 15:21 ` Jens Axboe
@ 2021-01-04 15:37   ` Jens Axboe
  0 siblings, 0 replies; 3+ messages in thread
From: Jens Axboe @ 2021-01-04 15:37 UTC (permalink / raw)
  To: arni, io-uring; +Cc: Árni Dagur

On 1/4/21 8:21 AM, Jens Axboe wrote:
> On 1/3/21 3:22 PM, arni@dagur.eu wrote:
>> From: Árni Dagur <arnidg@protonmail.ch>
>>
>> Hello,
>>
>> For my first stab at kernel development, I wanted to try implementing
>> vmsplice for io_uring. I've attached the code I've written so far. I have two
>> questions to ask, sorry if this is not the right place.
>>
>> 1. Currently I use __import_iovec directly, instead of using
>> io_import_iovec. That's because I've created a new "kiocb" struct
>> called io_vmsplice, rather than using io_rw as io_import_iovec expects.
>> The reason I created a new struct is so that it can hold an unsigned int
>> for the flags argument -- which is not present in io_rw. Im guessing that I
>> should find a way to use io_import_iovec instead?
>>
>> One way I can think of is giving the io_vmsplice struct the same initial
>> fields as io_rw, and letting io_import_iovec access the union as io_rw rather
>> than io_vmsplice. Coming from a Rust background however, this solution
>> sounds like a bug waiting to happen (if one of the structs is changed
>> but the other is not).
>>
>> 2. Whenever I run the test program at
>> https://gist.githubusercontent.com/ArniDagur/07d87aefae93868ca1bf10766194599d/raw/dc14a63649d530e5e29f0d1288f41ed54bc6b810/main.c
>> I get a BADF result value. The debugger tells me that this occurs
>> because `file->f_op != &pipefifo_fops` in get_pipe_info() in fs/pipe.c
>> (neither pointer is NULL).
>>
>> I give the program the file descriptor "1". Shouldn't that always be a pipe?
>> Is there something obvious that I'm missing?
> 
> The change looks reasonable, some changes needed around not blocking.
> But you can't use the splice ops with a tty, you need to use an end of a
> pipe. That's why you get -EBADF in your test program. I'm assuming you
> didn't run the one you sent, because you're overwriting ->addr in that
> one by setting splice_off_in after having assigned ->addr using the prep
> function?
> 
>> @@ -967,6 +976,11 @@ static const struct io_op_def io_op_defs[] = {
>>  		.unbound_nonreg_file	= 1,
>>  		.work_flags		= IO_WQ_WORK_BLKCG,
>>  	},
>> +	[IORING_OP_VMSPLICE] = {
>> +		.needs_file = 1,
>> +		.hash_reg_file		= 1,
>> +		.unbound_nonreg_file	= 1,
>>
>> I couldn't find any information regarding what the work flags do, so
>> I've left them empty for now.
> 
> As a minimum, you'd need IO_WQ_WORK_MM I think for the async part of it,
> if we need to block.
> 
> Various style issues in here too, like lines that are too long and
> function braces need to go on a new line (and no braces for single
> lines). If you want to move further with this, also split it into two
> patches. The first should do the abstraction needed for splice.[ch] and
> the second should be the io_uring change.

With your test app fixed up, this one works for me and has the style
issues cleaned up too. Might need more work than this, but it's a
good starting point. One thing to note is that the pipe_lock()
means we probably have to always return -EAGAIN if force_nonblock
is set in io_vmsplice() for now.


diff --git a/fs/io_uring.c b/fs/io_uring.c
index ed13642b56bc..349449202c6a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -531,6 +531,13 @@ struct io_splice {
 	unsigned int			flags;
 };
 
+struct io_vmsplice {
+	struct file			*file;
+	u64				addr;
+	u64				len;
+	unsigned int			flags;
+};
+
 struct io_provide_buf {
 	struct file			*file;
 	__u64				addr;
@@ -692,6 +699,7 @@ struct io_kiocb {
 		struct io_madvise	madvise;
 		struct io_epoll		epoll;
 		struct io_splice	splice;
+		struct io_vmsplice	vmsplice;
 		struct io_provide_buf	pbuf;
 		struct io_statx		statx;
 		struct io_shutdown	shutdown;
@@ -967,6 +975,12 @@ static const struct io_op_def io_op_defs[] = {
 		.unbound_nonreg_file	= 1,
 		.work_flags		= IO_WQ_WORK_BLKCG,
 	},
+	[IORING_OP_VMSPLICE] = {
+		.needs_file = 1,
+		.hash_reg_file		= 1,
+		.unbound_nonreg_file	= 1,
+		.work_flags		= IO_WQ_WORK_MM,
+	},
 	[IORING_OP_PROVIDE_BUFFERS] = {},
 	[IORING_OP_REMOVE_BUFFERS] = {},
 	[IORING_OP_TEE] = {
@@ -3902,6 +3916,54 @@ static int io_splice(struct io_kiocb *req, bool force_nonblock)
 	return 0;
 }
 
+static int io_vmsplice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_vmsplice *sp = &req->vmsplice;
+
+	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
+	if (unlikely(READ_ONCE(sqe->off)))
+		return -EINVAL;
+
+	sp->addr = READ_ONCE(sqe->addr);
+	sp->len = READ_ONCE(sqe->len);
+	sp->flags = READ_ONCE(sqe->splice_flags);
+
+	if (sp->flags & ~SPLICE_F_ALL)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int io_vmsplice(struct io_kiocb *req, bool force_nonblock)
+{
+	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+	void __user *buf = u64_to_user_ptr(sp->addr);
+	struct io_vmsplice *sp = &req->vmsplice;
+	struct iov_iter __iter, *iter = &__iter;
+	struct file *file = sp->file;
+	int type, ret, flags;
+
+	if (file->f_mode & FMODE_WRITE)
+		type = WRITE;
+	else if (file->f_mode & FMODE_READ)
+		type = READ;
+	else
+		return -EBADF;
+
+	ret = __import_iovec(type, buf, sp->len, UIO_FASTIOV, &iovec, iter,
+				req->ctx->compat);
+	if (ret < 0)
+		return ret;
+
+	flags = sp->flags;
+	if (force_nonblock)
+		flags |= SPLICE_F_NONBLOCK;
+	ret = do_vmsplice(file, iter, flags);
+	kfree(iovec);
+	return ret;
+}
+
 /*
  * IORING_OP_NOP just posts a completion event, nothing else.
  */
@@ -6027,6 +6089,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		return io_epoll_ctl_prep(req, sqe);
 	case IORING_OP_SPLICE:
 		return io_splice_prep(req, sqe);
+	case IORING_OP_VMSPLICE:
+		return io_vmsplice_prep(req, sqe);
 	case IORING_OP_PROVIDE_BUFFERS:
 		return io_provide_buffers_prep(req, sqe);
 	case IORING_OP_REMOVE_BUFFERS:
@@ -6280,6 +6344,9 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock,
 	case IORING_OP_SPLICE:
 		ret = io_splice(req, force_nonblock);
 		break;
+	case IORING_OP_VMSPLICE:
+		ret = io_vmsplice(req, force_nonblock);
+		break;
 	case IORING_OP_PROVIDE_BUFFERS:
 		ret = io_provide_buffers(req, force_nonblock, cs);
 		break;
diff --git a/fs/splice.c b/fs/splice.c
index 866d5c2367b2..2d653a20cced 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1270,6 +1270,20 @@ static int vmsplice_type(struct fd f, int *type)
 	return 0;
 }
 
+long do_vmsplice(struct file *file, struct iov_iter *iter, unsigned int flags)
+{
+	long error;
+
+	if (!iov_iter_count(iter))
+		error = 0;
+	else if (iov_iter_rw(iter) == WRITE)
+		error = vmsplice_to_pipe(file, iter, flags);
+	else
+		error = vmsplice_to_user(file, iter, flags);
+
+	return error;
+}
+
 /*
  * Note that vmsplice only really supports true splicing _from_ user memory
  * to a pipe, not the other way around. Splicing from user memory is a simple
@@ -1309,12 +1323,7 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
 	if (error < 0)
 		goto out_fdput;
 
-	if (!iov_iter_count(&iter))
-		error = 0;
-	else if (iov_iter_rw(&iter) == WRITE)
-		error = vmsplice_to_pipe(f.file, &iter, flags);
-	else
-		error = vmsplice_to_user(f.file, &iter, flags);
+	error = do_vmsplice(f.file, &iter, flags);
 
 	kfree(iov);
 out_fdput:
diff --git a/include/linux/splice.h b/include/linux/splice.h
index a55179fd60fc..44c0e612f652 100644
--- a/include/linux/splice.h
+++ b/include/linux/splice.h
@@ -81,9 +81,9 @@ extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *,
 extern long do_splice(struct file *in, loff_t *off_in,
 		      struct file *out, loff_t *off_out,
 		      size_t len, unsigned int flags);
-
 extern long do_tee(struct file *in, struct file *out, size_t len,
 		   unsigned int flags);
+extern long do_vmsplice(struct file *file, struct iov_iter *iter, unsigned int flags);
 
 /*
  * for dynamic pipe sizing
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index d31a2a1e8ef9..6bc79f9bb123 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -137,6 +137,7 @@ enum {
 	IORING_OP_SHUTDOWN,
 	IORING_OP_RENAMEAT,
 	IORING_OP_UNLINKAT,
+	IORING_OP_VMSPLICE,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,

-- 
Jens Axboe


^ permalink raw reply related	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2021-01-04 15:39 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-01-03 22:22 Questions regarding implementation of vmsplice in io_uring arni
2021-01-04 15:21 ` Jens Axboe
2021-01-04 15:37   ` Jens Axboe

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.