* [PATCHSET RFC 0/4] Add support for IOSQE2_POLL_FIRST
@ 2022-04-26 18:33 Jens Axboe
  2022-04-26 18:33 ` [PATCH 1/4] io_uring: check IOPOLL/ioprio support upfront Jens Axboe
                   ` (3 more replies)
  0 siblings, 4 replies; 5+ messages in thread
From: Jens Axboe @ 2022-04-26 18:33 UTC (permalink / raw)
  To: io-uring

Hi,

For some workloads, it's not at all uncommon for every request to end up
relying on the internal poll machinery to complete successfully. This is
especially common for network receive, where the application doesn't expect
any data to be immediately available. Yet we still attempt the receive
first, get -EAGAIN, arm poll, and only then trigger the retry once poll
fires.

This can be quite wasteful, and particularly so for cases where we
expect to arm poll basically 100% of the time.

This series builds up to letting the application ask io_uring to arm poll
first rather than attempt the IO right away, and finally wires this feature
up for send/sendmsg and recv/recvmsg (with the latter two being the most
useful, imho).

Given that most requests don't support IO priorities, a new flags2 field is
overlaid on that same space. The last free bit in sqe->flags is used to say
that "ioprio is really flags2". This does mean that any IOSQE2_ flags added
cannot be combined with IO priorities.
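
As a rough usage sketch, a receive that should always arm poll first would
be prepped like the below. This assumes liburing rebuilt against the uapi
changes in this series (there is no flags2 helper, so the fields are set by
hand), and sockfd is just a placeholder for an already-connected socket:

	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	char buf[4096];

	io_uring_queue_init(8, &ring, 0);

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_recv(sqe, sockfd, buf, sizeof(buf), 0);
	/* recv doesn't use ioprio, so that space carries flags2 instead */
	sqe->flags |= IOSQE_FLAGS2;
	sqe->flags2 = IOSQE2_POLL_FIRST;

	io_uring_submit(&ring);
	io_uring_wait_cqe(&ring, &cqe);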

-- 
Jens Axboe




* [PATCH 1/4] io_uring: check IOPOLL/ioprio support upfront
  2022-04-26 18:33 [PATCHSET RFC 0/4] Add support for IOSQE2_POLL_FIRST Jens Axboe
@ 2022-04-26 18:33 ` Jens Axboe
  2022-04-26 18:33 ` [PATCH 2/4] io_uring: add flags2 variable Jens Axboe
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 5+ messages in thread
From: Jens Axboe @ 2022-04-26 18:33 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

Don't punt these checks to the op prep handlers; add the information to
io_op_defs so they can be checked while setting up the request.

This reduces the text size by 500 bytes on aarch64, and makes this less
fragile by having the checks in one spot and requiring opcodes to opt in
to IOPOLL or ioprio support.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 150 +++++++++++++++++---------------------------------
 1 file changed, 52 insertions(+), 98 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 1e7466079af7..29153958ea78 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1028,6 +1028,10 @@ struct io_op_def {
 	unsigned		not_supported : 1;
 	/* skip auditing */
 	unsigned		audit_skip : 1;
+	/* supports ioprio */
+	unsigned		ioprio : 1;
+	/* supports iopoll */
+	unsigned		iopoll : 1;
 	/* size of async data needed, if any */
 	unsigned short		async_size;
 };
@@ -1042,6 +1046,8 @@ static const struct io_op_def io_op_defs[] = {
 		.needs_async_setup	= 1,
 		.plug			= 1,
 		.audit_skip		= 1,
+		.ioprio			= 1,
+		.iopoll			= 1,
 		.async_size		= sizeof(struct io_async_rw),
 	},
 	[IORING_OP_WRITEV] = {
@@ -1052,6 +1058,8 @@ static const struct io_op_def io_op_defs[] = {
 		.needs_async_setup	= 1,
 		.plug			= 1,
 		.audit_skip		= 1,
+		.ioprio			= 1,
+		.iopoll			= 1,
 		.async_size		= sizeof(struct io_async_rw),
 	},
 	[IORING_OP_FSYNC] = {
@@ -1064,6 +1072,8 @@ static const struct io_op_def io_op_defs[] = {
 		.pollin			= 1,
 		.plug			= 1,
 		.audit_skip		= 1,
+		.ioprio			= 1,
+		.iopoll			= 1,
 		.async_size		= sizeof(struct io_async_rw),
 	},
 	[IORING_OP_WRITE_FIXED] = {
@@ -1073,6 +1083,8 @@ static const struct io_op_def io_op_defs[] = {
 		.pollout		= 1,
 		.plug			= 1,
 		.audit_skip		= 1,
+		.ioprio			= 1,
+		.iopoll			= 1,
 		.async_size		= sizeof(struct io_async_rw),
 	},
 	[IORING_OP_POLL_ADD] = {
@@ -1137,6 +1149,7 @@ static const struct io_op_def io_op_defs[] = {
 	[IORING_OP_CLOSE] = {},
 	[IORING_OP_FILES_UPDATE] = {
 		.audit_skip		= 1,
+		.iopoll			= 1,
 	},
 	[IORING_OP_STATX] = {
 		.audit_skip		= 1,
@@ -1148,6 +1161,8 @@ static const struct io_op_def io_op_defs[] = {
 		.buffer_select		= 1,
 		.plug			= 1,
 		.audit_skip		= 1,
+		.ioprio			= 1,
+		.iopoll			= 1,
 		.async_size		= sizeof(struct io_async_rw),
 	},
 	[IORING_OP_WRITE] = {
@@ -1157,6 +1172,8 @@ static const struct io_op_def io_op_defs[] = {
 		.pollout		= 1,
 		.plug			= 1,
 		.audit_skip		= 1,
+		.ioprio			= 1,
+		.iopoll			= 1,
 		.async_size		= sizeof(struct io_async_rw),
 	},
 	[IORING_OP_FADVISE] = {
@@ -1191,9 +1208,11 @@ static const struct io_op_def io_op_defs[] = {
 	},
 	[IORING_OP_PROVIDE_BUFFERS] = {
 		.audit_skip		= 1,
+		.iopoll			= 1,
 	},
 	[IORING_OP_REMOVE_BUFFERS] = {
 		.audit_skip		= 1,
+		.iopoll			= 1,
 	},
 	[IORING_OP_TEE] = {
 		.needs_file		= 1,
@@ -1211,6 +1230,7 @@ static const struct io_op_def io_op_defs[] = {
 	[IORING_OP_LINKAT] = {},
 	[IORING_OP_MSG_RING] = {
 		.needs_file		= 1,
+		.iopoll			= 1,
 	},
 };
 
@@ -4139,9 +4159,7 @@ static int io_renameat_prep(struct io_kiocb *req,
 	struct io_rename *ren = &req->rename;
 	const char __user *oldf, *newf;
 
-	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
-		return -EINVAL;
-	if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
+	if (sqe->buf_index || sqe->splice_fd_in)
 		return -EINVAL;
 	if (unlikely(req->flags & REQ_F_FIXED_FILE))
 		return -EBADF;
@@ -4190,10 +4208,7 @@ static int io_unlinkat_prep(struct io_kiocb *req,
 	struct io_unlink *un = &req->unlink;
 	const char __user *fname;
 
-	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
-		return -EINVAL;
-	if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
-	    sqe->splice_fd_in)
+	if (sqe->off || sqe->len || sqe->buf_index || sqe->splice_fd_in)
 		return -EINVAL;
 	if (unlikely(req->flags & REQ_F_FIXED_FILE))
 		return -EBADF;
@@ -4239,10 +4254,7 @@ static int io_mkdirat_prep(struct io_kiocb *req,
 	struct io_mkdir *mkd = &req->mkdir;
 	const char __user *fname;
 
-	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
-		return -EINVAL;
-	if (sqe->ioprio || sqe->off || sqe->rw_flags || sqe->buf_index ||
-	    sqe->splice_fd_in)
+	if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
 		return -EINVAL;
 	if (unlikely(req->flags & REQ_F_FIXED_FILE))
 		return -EBADF;
@@ -4282,10 +4294,7 @@ static int io_symlinkat_prep(struct io_kiocb *req,
 	struct io_symlink *sl = &req->symlink;
 	const char __user *oldpath, *newpath;
 
-	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
-		return -EINVAL;
-	if (sqe->ioprio || sqe->len || sqe->rw_flags || sqe->buf_index ||
-	    sqe->splice_fd_in)
+	if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
 		return -EINVAL;
 	if (unlikely(req->flags & REQ_F_FIXED_FILE))
 		return -EBADF;
@@ -4331,9 +4340,7 @@ static int io_linkat_prep(struct io_kiocb *req,
 	struct io_hardlink *lnk = &req->hardlink;
 	const char __user *oldf, *newf;
 
-	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
-		return -EINVAL;
-	if (sqe->ioprio || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
+	if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
 		return -EINVAL;
 	if (unlikely(req->flags & REQ_F_FIXED_FILE))
 		return -EBADF;
@@ -4380,9 +4387,7 @@ static int io_shutdown_prep(struct io_kiocb *req,
 			    const struct io_uring_sqe *sqe)
 {
 #if defined(CONFIG_NET)
-	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
-		return -EINVAL;
-	if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
+	if (unlikely(sqe->off || sqe->addr || sqe->rw_flags ||
 		     sqe->buf_index || sqe->splice_fd_in))
 		return -EINVAL;
 
@@ -4422,9 +4427,6 @@ static int __io_splice_prep(struct io_kiocb *req,
 	struct io_splice *sp = &req->splice;
 	unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
 
-	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
-		return -EINVAL;
-
 	sp->len = READ_ONCE(sqe->len);
 	sp->flags = READ_ONCE(sqe->splice_flags);
 	if (unlikely(sp->flags & ~valid_flags))
@@ -4523,11 +4525,6 @@ static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
  */
 static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_ring_ctx *ctx = req->ctx;
-
-	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
-		return -EINVAL;
-
 	__io_req_complete(req, issue_flags, 0, 0);
 	return 0;
 }
@@ -4535,8 +4532,8 @@ static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
 static int io_msg_ring_prep(struct io_kiocb *req,
 			    const struct io_uring_sqe *sqe)
 {
-	if (unlikely(sqe->addr || sqe->ioprio || sqe->rw_flags ||
-		     sqe->splice_fd_in || sqe->buf_index || sqe->personality))
+	if (unlikely(sqe->addr || sqe->rw_flags || sqe->splice_fd_in ||
+		     sqe->buf_index || sqe->personality))
 		return -EINVAL;
 
 	req->msg.user_data = READ_ONCE(sqe->off);
@@ -4577,12 +4574,7 @@ static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
 
 static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_ring_ctx *ctx = req->ctx;
-
-	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
-		return -EINVAL;
-	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
-		     sqe->splice_fd_in))
+	if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in))
 		return -EINVAL;
 
 	req->sync.flags = READ_ONCE(sqe->fsync_flags);
@@ -4615,10 +4607,7 @@ static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
 static int io_fallocate_prep(struct io_kiocb *req,
 			     const struct io_uring_sqe *sqe)
 {
-	if (sqe->ioprio || sqe->buf_index || sqe->rw_flags ||
-	    sqe->splice_fd_in)
-		return -EINVAL;
-	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+	if (sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
 		return -EINVAL;
 
 	req->sync.off = READ_ONCE(sqe->off);
@@ -4649,9 +4638,7 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
 	const char __user *fname;
 	int ret;
 
-	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
-		return -EINVAL;
-	if (unlikely(sqe->ioprio || sqe->buf_index))
+	if (unlikely(sqe->buf_index))
 		return -EINVAL;
 	if (unlikely(req->flags & REQ_F_FIXED_FILE))
 		return -EBADF;
@@ -4783,7 +4770,7 @@ static int io_remove_buffers_prep(struct io_kiocb *req,
 	struct io_provide_buf *p = &req->pbuf;
 	u64 tmp;
 
-	if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
+	if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
 	    sqe->splice_fd_in)
 		return -EINVAL;
 
@@ -4850,7 +4837,7 @@ static int io_provide_buffers_prep(struct io_kiocb *req,
 	struct io_provide_buf *p = &req->pbuf;
 	u64 tmp;
 
-	if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
+	if (sqe->rw_flags || sqe->splice_fd_in)
 		return -EINVAL;
 
 	tmp = READ_ONCE(sqe->fd);
@@ -4980,9 +4967,7 @@ static int io_epoll_ctl_prep(struct io_kiocb *req,
 			     const struct io_uring_sqe *sqe)
 {
 #if defined(CONFIG_EPOLL)
-	if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
-		return -EINVAL;
-	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+	if (sqe->buf_index || sqe->splice_fd_in)
 		return -EINVAL;
 
 	req->epoll.epfd = READ_ONCE(sqe->fd);
@@ -5026,9 +5011,7 @@ static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
 static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
-	if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in)
-		return -EINVAL;
-	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+	if (sqe->buf_index || sqe->off || sqe->splice_fd_in)
 		return -EINVAL;
 
 	req->madvise.addr = READ_ONCE(sqe->addr);
@@ -5061,9 +5044,7 @@ static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
 
 static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in)
-		return -EINVAL;
-	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+	if (sqe->buf_index || sqe->addr || sqe->splice_fd_in)
 		return -EINVAL;
 
 	req->fadvise.offset = READ_ONCE(sqe->off);
@@ -5099,9 +5080,7 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	const char __user *path;
 
-	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
-		return -EINVAL;
-	if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
+	if (sqe->buf_index || sqe->splice_fd_in)
 		return -EINVAL;
 	if (req->flags & REQ_F_FIXED_FILE)
 		return -EBADF;
@@ -5146,10 +5125,7 @@ static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
 
 static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
-		return -EINVAL;
-	if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
-	    sqe->rw_flags || sqe->buf_index)
+	if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index)
 		return -EINVAL;
 	if (req->flags & REQ_F_FIXED_FILE)
 		return -EBADF;
@@ -5215,12 +5191,7 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags)
 
 static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_ring_ctx *ctx = req->ctx;
-
-	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
-		return -EINVAL;
-	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
-		     sqe->splice_fd_in))
+	if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in))
 		return -EINVAL;
 
 	req->sync.off = READ_ONCE(sqe->off);
@@ -5298,9 +5269,6 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_sr_msg *sr = &req->sr_msg;
 
-	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
-		return -EINVAL;
-
 	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
 	sr->len = READ_ONCE(sqe->len);
 	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
@@ -5531,9 +5499,6 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_sr_msg *sr = &req->sr_msg;
 
-	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
-		return -EINVAL;
-
 	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
 	sr->len = READ_ONCE(sqe->len);
 	sr->bgid = READ_ONCE(sqe->buf_group);
@@ -5688,9 +5653,7 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_accept *accept = &req->accept;
 
-	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
-		return -EINVAL;
-	if (sqe->ioprio || sqe->len || sqe->buf_index)
+	if (sqe->len || sqe->buf_index)
 		return -EINVAL;
 
 	accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
@@ -5756,10 +5719,7 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_connect *conn = &req->connect;
 
-	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
-		return -EINVAL;
-	if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags ||
-	    sqe->splice_fd_in)
+	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
 		return -EINVAL;
 
 	conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
@@ -6442,9 +6402,7 @@ static int io_poll_update_prep(struct io_kiocb *req,
 	struct io_poll_update *upd = &req->poll_update;
 	u32 flags;
 
-	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
-		return -EINVAL;
-	if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
+	if (sqe->buf_index || sqe->splice_fd_in)
 		return -EINVAL;
 	flags = READ_ONCE(sqe->len);
 	if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
@@ -6474,9 +6432,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
 	struct io_poll_iocb *poll = &req->poll;
 	u32 flags;
 
-	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
-		return -EINVAL;
-	if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr)
+	if (sqe->buf_index || sqe->off || sqe->addr)
 		return -EINVAL;
 	flags = READ_ONCE(sqe->len);
 	if (flags & ~IORING_POLL_ADD_MULTI)
@@ -6683,11 +6639,9 @@ static int io_timeout_remove_prep(struct io_kiocb *req,
 {
 	struct io_timeout_rem *tr = &req->timeout_rem;
 
-	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
-		return -EINVAL;
 	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
 		return -EINVAL;
-	if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in)
+	if (sqe->buf_index || sqe->len || sqe->splice_fd_in)
 		return -EINVAL;
 
 	tr->ltimeout = false;
@@ -6757,10 +6711,7 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	unsigned flags;
 	u32 off = READ_ONCE(sqe->off);
 
-	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
-		return -EINVAL;
-	if (sqe->ioprio || sqe->buf_index || sqe->len != 1 ||
-	    sqe->splice_fd_in)
+	if (sqe->buf_index || sqe->len != 1 || sqe->splice_fd_in)
 		return -EINVAL;
 	if (off && is_timeout_link)
 		return -EINVAL;
@@ -6942,11 +6893,9 @@ static int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd)
 static int io_async_cancel_prep(struct io_kiocb *req,
 				const struct io_uring_sqe *sqe)
 {
-	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
-		return -EINVAL;
 	if (unlikely(req->flags & REQ_F_BUFFER_SELECT))
 		return -EINVAL;
-	if (sqe->ioprio || sqe->off || sqe->len || sqe->splice_fd_in)
+	if (sqe->off || sqe->len || sqe->splice_fd_in)
 		return -EINVAL;
 
 	req->cancel.addr = READ_ONCE(sqe->addr);
@@ -7032,7 +6981,7 @@ static int io_rsrc_update_prep(struct io_kiocb *req,
 {
 	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
 		return -EINVAL;
-	if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
+	if (sqe->rw_flags || sqe->splice_fd_in)
 		return -EINVAL;
 
 	req->rsrc_update.offset = READ_ONCE(sqe->off);
@@ -7845,6 +7794,11 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 		}
 	}
 
+	if (!io_op_defs[opcode].ioprio && sqe->ioprio)
+		return -EINVAL;
+	if (!io_op_defs[opcode].iopoll && (ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
+
 	if (io_op_defs[opcode].needs_file) {
 		struct io_submit_state *state = &ctx->submit_state;
 
-- 
2.35.1



* [PATCH 2/4] io_uring: add flags2 variable
  2022-04-26 18:33 [PATCHSET RFC 0/4] Add support for IOSQE2_POLL_FIRST Jens Axboe
  2022-04-26 18:33 ` [PATCH 1/4] io_uring: check IOPOLL/ioprio support upfront Jens Axboe
@ 2022-04-26 18:33 ` Jens Axboe
  2022-04-26 18:33 ` [PATCH 3/4] io_uring: add IOSQE2_POLL_FIRST flag Jens Axboe
  2022-04-26 18:33 ` [PATCH 4/4] io_uring: wire up IOSQE2_POLL_FIRST for send/sendmsg and recv/recvmsg Jens Axboe
  3 siblings, 0 replies; 5+ messages in thread
From: Jens Axboe @ 2022-04-26 18:33 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

Most opcodes don't support the ioprio field; it's really only useful for
reads and writes to storage media. Overlay a flags2 variable with ioprio so
we can grow our flags space.
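
Spelled out, the intended layout after this change is roughly the following
(illustration only, not a hunk from the patch):

	/*
	 *   req->flags bits 0..7 <- sqe->flags  (IOSQE_*; bit 7 is IOSQE_FLAGS2)
	 *   req->flags bit  8    <- sqe->flags2 bit 0, via (sqe_flags2 << 8)
	 *   req->flags bits 9..  <- kernel internal REQ_F_* (REQ_F_FAIL_BIT is now 9)
	 */

Growing flags2 beyond a single bit would mean shifting the internal REQ_F_
bits up again, the same way REQ_F_FAIL_BIT moves from 8 to 9 here.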

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 40 ++++++++++++++++++++++++-----------
 include/uapi/linux/io_uring.h |  8 ++++++-
 2 files changed, 35 insertions(+), 13 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 29153958ea78..06afe4db5a9a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -108,7 +108,10 @@
 			  IOSQE_IO_HARDLINK | IOSQE_ASYNC)
 
 #define SQE_VALID_FLAGS	(SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \
-			IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)
+			IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS | \
+			IOSQE_FLAGS2)
+
+#define SQE_VALID_FLAGS2	0
 
 #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
 				REQ_F_POLLED | REQ_F_CREDS | REQ_F_ASYNC_DATA)
@@ -788,9 +791,10 @@ enum {
 	REQ_F_FORCE_ASYNC_BIT	= IOSQE_ASYNC_BIT,
 	REQ_F_BUFFER_SELECT_BIT	= IOSQE_BUFFER_SELECT_BIT,
 	REQ_F_CQE_SKIP_BIT	= IOSQE_CQE_SKIP_SUCCESS_BIT,
+	REQ_F_FLAGS2_BIT	= IOSQE_FLAGS2_BIT,
 
-	/* first byte is taken by user flags, shift it to not overlap */
-	REQ_F_FAIL_BIT		= 8,
+	/* first bits are taken by user flags, shift it to not overlap */
+	REQ_F_FAIL_BIT		= 9,
 	REQ_F_INFLIGHT_BIT,
 	REQ_F_CUR_POS_BIT,
 	REQ_F_NOWAIT_BIT,
@@ -831,6 +835,8 @@ enum {
 	REQ_F_BUFFER_SELECT	= BIT(REQ_F_BUFFER_SELECT_BIT),
 	/* IOSQE_CQE_SKIP_SUCCESS */
 	REQ_F_CQE_SKIP		= BIT(REQ_F_CQE_SKIP_BIT),
+	/* IOSQE_FLAGS2 */
+	REQ_F_FLAGS2		= BIT(REQ_F_FLAGS2_BIT),
 
 	/* fail rest of links */
 	REQ_F_FAIL		= BIT(REQ_F_FAIL_BIT),
@@ -3280,15 +3286,16 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 	kiocb->ki_pos = READ_ONCE(sqe->off);
 
-	ioprio = READ_ONCE(sqe->ioprio);
-	if (ioprio) {
-		ret = ioprio_check_cap(ioprio);
-		if (ret)
-			return ret;
+	kiocb->ki_ioprio = get_current_ioprio();
+	if (!(req->flags & REQ_F_FLAGS2)) {
+		ioprio = READ_ONCE(sqe->ioprio);
+		if (ioprio) {
+			ret = ioprio_check_cap(ioprio);
+			if (ret)
+				return ret;
 
-		kiocb->ki_ioprio = ioprio;
-	} else {
-		kiocb->ki_ioprio = get_current_ioprio();
+			kiocb->ki_ioprio = ioprio;
+		}
 	}
 
 	req->imu = NULL;
@@ -7779,6 +7786,14 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 				return -EOPNOTSUPP;
 			io_init_req_drain(req);
 		}
+		if (sqe_flags & IOSQE_FLAGS2) {
+			unsigned int sqe_flags2;
+
+			sqe_flags2 = READ_ONCE(sqe->flags2);
+			if (sqe_flags2 & ~SQE_VALID_FLAGS2)
+				return -EINVAL;
+			req->flags |= sqe_flags | (sqe_flags2 << 8U);
+		}
 	}
 	if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
 		if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
@@ -7794,7 +7809,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 		}
 	}
 
-	if (!io_op_defs[opcode].ioprio && sqe->ioprio)
+	if (!io_op_defs[opcode].ioprio && sqe->ioprio &&
+	    !(req->flags & REQ_F_FLAGS2))
 		return -EINVAL;
 	if (!io_op_defs[opcode].iopoll && (ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index fad63564678a..622f6e27a444 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -17,7 +17,10 @@
 struct io_uring_sqe {
 	__u8	opcode;		/* type of operation for this sqe */
 	__u8	flags;		/* IOSQE_ flags */
-	__u16	ioprio;		/* ioprio for the request */
+	union {
+		__u16	ioprio;	/* ioprio for the request */
+		__u16	flags2;	/* extra flags */
+	};
 	__s32	fd;		/* file descriptor to do IO on */
 	union {
 		__u64	off;	/* offset into file */
@@ -71,6 +74,7 @@ enum {
 	IOSQE_ASYNC_BIT,
 	IOSQE_BUFFER_SELECT_BIT,
 	IOSQE_CQE_SKIP_SUCCESS_BIT,
+	IOSQE_FLAGS2_BIT,
 };
 
 /*
@@ -90,6 +94,8 @@ enum {
 #define IOSQE_BUFFER_SELECT	(1U << IOSQE_BUFFER_SELECT_BIT)
 /* don't post CQE if request succeeded */
 #define IOSQE_CQE_SKIP_SUCCESS	(1U << IOSQE_CQE_SKIP_SUCCESS_BIT)
+/* flags2 is valid and set */
+#define IOSQE_FLAGS2		(1U << IOSQE_FLAGS2_BIT)
 
 /*
  * io_uring_setup() flags
-- 
2.35.1



* [PATCH 3/4] io_uring: add IOSQE2_POLL_FIRST flag
  2022-04-26 18:33 [PATCHSET RFC 0/4] Add support for IOSQE2_POLL_FIRST Jens Axboe
  2022-04-26 18:33 ` [PATCH 1/4] io_uring: check IOPOLL/ioprio support upfront Jens Axboe
  2022-04-26 18:33 ` [PATCH 2/4] io_uring: add flags2 variable Jens Axboe
@ 2022-04-26 18:33 ` Jens Axboe
  2022-04-26 18:33 ` [PATCH 4/4] io_uring: wire up IOSQE2_POLL_FIRST for send/sendmsg and recv/recvmsg Jens Axboe
  3 siblings, 0 replies; 5+ messages in thread
From: Jens Axboe @ 2022-04-26 18:33 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

For some operations, the application fully expects that no data or space is
available, and that the request will be punted to our poll handler after the
initial transfer attempt. For that case, first attempting the operation only
to get -EAGAIN and then having to arm poll is a potentially big waste of
time.

Add IOSQE2_POLL_FIRST for this case, allowing the application to request
that the handler check poll readiness first before attempting the operation.
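
On the kernel side, an opcode that honors the flag is expected to treat it
as a forced -EAGAIN on the first pass, roughly like the sketch below (the
actual wiring for send/recv follows in the next patch):

	if ((req->flags & (REQ_F_POLLED | REQ_F_POLL_FIRST)) == REQ_F_POLL_FIRST)
		return -EAGAIN;	/* arm poll now, retry once it triggers */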

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 |  5 ++++-
 include/uapi/linux/io_uring.h | 11 +++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 06afe4db5a9a..eb5f77bde98d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -111,7 +111,7 @@
 			IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS | \
 			IOSQE_FLAGS2)
 
-#define SQE_VALID_FLAGS2	0
+#define SQE_VALID_FLAGS2	(IOSQE2_POLL_FIRST)
 
 #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
 				REQ_F_POLLED | REQ_F_CREDS | REQ_F_ASYNC_DATA)
@@ -792,6 +792,7 @@ enum {
 	REQ_F_BUFFER_SELECT_BIT	= IOSQE_BUFFER_SELECT_BIT,
 	REQ_F_CQE_SKIP_BIT	= IOSQE_CQE_SKIP_SUCCESS_BIT,
 	REQ_F_FLAGS2_BIT	= IOSQE_FLAGS2_BIT,
+	REQ_F_POLL_FIRST_BIT	= IOSQE_FLAGS2_BIT + 1,
 
 	/* first bits are taken by user flags, shift it to not overlap */
 	REQ_F_FAIL_BIT		= 9,
@@ -837,6 +838,8 @@ enum {
 	REQ_F_CQE_SKIP		= BIT(REQ_F_CQE_SKIP_BIT),
 	/* IOSQE_FLAGS2 */
 	REQ_F_FLAGS2		= BIT(REQ_F_FLAGS2_BIT),
+	/* IOSQE2_POLL_FIRST */
+	REQ_F_POLL_FIRST	= BIT(REQ_F_POLL_FIRST_BIT),
 
 	/* fail rest of links */
 	REQ_F_FAIL		= BIT(REQ_F_FAIL_BIT),
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 622f6e27a444..fe5d8cae7e7d 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -66,6 +66,7 @@ struct io_uring_sqe {
 	__u64	__pad2[2];
 };
 
+/* sqe->flags */
 enum {
 	IOSQE_FIXED_FILE_BIT,
 	IOSQE_IO_DRAIN_BIT,
@@ -77,6 +78,11 @@ enum {
 	IOSQE_FLAGS2_BIT,
 };
 
+/* sqe->flags2, if IOSQE_FLAGS2 is set in sqe->flags */
+enum {
+	IOSQE2_POLL_FIRST_BIT,
+};
+
 /*
  * sqe->flags
  */
@@ -97,6 +103,11 @@ enum {
 /* flags2 is valid and set */
 #define IOSQE_FLAGS2		(1U << IOSQE_FLAGS2_BIT)
 
+/*
+ * sqe->flags2
+ */
+#define IOSQE2_POLL_FIRST	(1U << IOSQE2_POLL_FIRST_BIT)
+
 /*
  * io_uring_setup() flags
  */
-- 
2.35.1



* [PATCH 4/4] io_uring: wire up IOSQE2_POLL_FIRST for send/sendmsg and recv/recvmsg
  2022-04-26 18:33 [PATCHSET RFC 0/4] Add support for IOSQE2_POLL_FIRST Jens Axboe
                   ` (2 preceding siblings ...)
  2022-04-26 18:33 ` [PATCH 3/4] io_uring: add IOSQE2_POLL_FIRST flag Jens Axboe
@ 2022-04-26 18:33 ` Jens Axboe
  3 siblings, 0 replies; 5+ messages in thread
From: Jens Axboe @ 2022-04-26 18:33 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

If REQ_F_POLL_FIRST is set and we haven't polled for this request before,
go straight to checking poll status before attempting a data transfer.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index eb5f77bde98d..3ae18604ed59 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5315,6 +5315,9 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
 		kmsg = &iomsg;
 	}
 
+	if ((req->flags & (REQ_F_POLLED | REQ_F_POLL_FIRST)) == REQ_F_POLL_FIRST)
+		return io_setup_async_msg(req, kmsg);
+
 	flags = req->sr_msg.msg_flags;
 	if (issue_flags & IO_URING_F_NONBLOCK)
 		flags |= MSG_DONTWAIT;
@@ -5357,6 +5360,9 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags)
 	int min_ret = 0;
 	int ret;
 
+	if ((req->flags & (REQ_F_POLLED | REQ_F_POLL_FIRST)) == REQ_F_POLL_FIRST)
+		return -EAGAIN;
+
 	sock = sock_from_file(req->file);
 	if (unlikely(!sock))
 		return -ENOTSOCK;
@@ -5547,6 +5553,9 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 		kmsg = &iomsg;
 	}
 
+	if ((req->flags & (REQ_F_POLLED | REQ_F_POLL_FIRST)) == REQ_F_POLL_FIRST)
+		return io_setup_async_msg(req, kmsg);
+
 	if (req->flags & REQ_F_BUFFER_SELECT) {
 		kbuf = io_recv_buffer_select(req, issue_flags);
 		if (IS_ERR(kbuf))
@@ -5604,6 +5613,9 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
 	int ret, min_ret = 0;
 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 
+	if ((req->flags & (REQ_F_POLLED | REQ_F_POLL_FIRST)) == REQ_F_POLL_FIRST)
+		return -EAGAIN;
+
 	sock = sock_from_file(req->file);
 	if (unlikely(!sock))
 		return -ENOTSOCK;
-- 
2.35.1


