* [PATCH v2 0/4] allow to skip CQE posting
From: Pavel Begunkov @ 2021-11-10 15:49 UTC
  To: io-uring; +Cc: Jens Axboe, asml.silence

It's expensive enough to post a CQE, and there are other
reasons to want to ignore them, e.g. for link handling, or
simply because it may be more convenient for the userspace.

Try to cover most of the use cases with one flag. The overhead
is one "if (cqe->flags & IOSQE_CQE_SKIP_SUCCESS)" check per
request and a slightly bloated req_set_fail(), which should be bearable.

See 2/4 for the full description of the flag.
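
As a rough userspace illustration (a sketch only, not part of this
series; it assumes liburing and uapi headers that already define
IOSQE_CQE_SKIP_SUCCESS and IORING_FEAT_CQE_SKIP), the flag is set
per-SQE and support is detected via the feature bit:

#include <liburing.h>
#include <stdio.h>

int main(void)
{
        struct io_uring_params p = { 0 };
        struct io_uring ring;
        struct io_uring_sqe *sqe;

        if (io_uring_queue_init_params(8, &ring, &p) < 0)
                return 1;
        /* the kernel advertises support via a feature bit */
        if (!(p.features & IORING_FEAT_CQE_SKIP)) {
                fprintf(stderr, "IOSQE_CQE_SKIP_SUCCESS not supported\n");
                io_uring_queue_exit(&ring);
                return 1;
        }

        sqe = io_uring_get_sqe(&ring);
        io_uring_prep_nop(sqe);
        /* no CQE will be posted if this request succeeds */
        sqe->flags |= IOSQE_CQE_SKIP_SUCCESS;
        io_uring_submit(&ring);

        io_uring_queue_exit(&ring);
        return 0;
}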

v2: don't allow drain with the new flag (see 4/4)
    add IORING_FEAT_CQE_SKIP

Pavel Begunkov (4):
  io_uring: clean cqe filling functions
  io_uring: add option to skip CQE posting
  io_uring: don't spinlock when not posting CQEs
  io_uring: disable drain with cqe skip

 fs/io_uring.c                 | 114 +++++++++++++++++++++++-----------
 include/uapi/linux/io_uring.h |   4 ++
 2 files changed, 81 insertions(+), 37 deletions(-)

-- 
2.33.1


* [PATCH v2 1/4] io_uring: clean cqe filling functions
From: Pavel Begunkov @ 2021-11-10 15:49 UTC
  To: io-uring; +Cc: Jens Axboe, asml.silence

Split io_cqring_fill_event() into a couple of more targeted functions.
The first one is io_fill_cqe_aux(), for completions that are not
associated with request completions; it also does the ->cq_extra
accounting. Examples are additional CQEs from multishot poll and rsrc
notifications.

The second is io_fill_cqe_req(), which should be called for a normal
request completion. There is nothing more to it at the moment; it will
be used in later patches.

The last one is the inlined __io_fill_cqe(), for finer-grained control;
it should be used with caution and only in the hottest places.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 58 ++++++++++++++++++++++++++-------------------------
 1 file changed, 30 insertions(+), 28 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index b07196b4511c..e624f3f850bd 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1108,8 +1108,8 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 					 bool cancel_all);
 static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
 
-static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
-				 s32 res, u32 cflags);
+static void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags);
+
 static void io_put_req(struct io_kiocb *req);
 static void io_put_req_deferred(struct io_kiocb *req);
 static void io_dismantle_req(struct io_kiocb *req);
@@ -1560,7 +1560,7 @@ static void io_kill_timeout(struct io_kiocb *req, int status)
 		atomic_set(&req->ctx->cq_timeouts,
 			atomic_read(&req->ctx->cq_timeouts) + 1);
 		list_del_init(&req->timeout.list);
-		io_cqring_fill_event(req->ctx, req->user_data, status, 0);
+		io_fill_cqe_req(req, status, 0);
 		io_put_req_deferred(req);
 	}
 }
@@ -1819,8 +1819,8 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
 	return true;
 }
 
-static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
-					  s32 res, u32 cflags)
+static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
+				 s32 res, u32 cflags)
 {
 	struct io_uring_cqe *cqe;
 
@@ -1841,11 +1841,16 @@ static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data
 	return io_cqring_event_overflow(ctx, user_data, res, cflags);
 }
 
-/* not as hot to bloat with inlining */
-static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
-					  s32 res, u32 cflags)
+static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
+{
+	__io_fill_cqe(req->ctx, req->user_data, res, cflags);
+}
+
+static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
+				     s32 res, u32 cflags)
 {
-	return __io_cqring_fill_event(ctx, user_data, res, cflags);
+	ctx->cq_extra++;
+	return __io_fill_cqe(ctx, user_data, res, cflags);
 }
 
 static void io_req_complete_post(struct io_kiocb *req, s32 res,
@@ -1854,7 +1859,7 @@ static void io_req_complete_post(struct io_kiocb *req, s32 res,
 	struct io_ring_ctx *ctx = req->ctx;
 
 	spin_lock(&ctx->completion_lock);
-	__io_cqring_fill_event(ctx, req->user_data, res, cflags);
+	__io_fill_cqe(ctx, req->user_data, res, cflags);
 	/*
 	 * If we're the last reference to this request, add to our locked
 	 * free_list cache.
@@ -2062,8 +2067,7 @@ static bool io_kill_linked_timeout(struct io_kiocb *req)
 		link->timeout.head = NULL;
 		if (hrtimer_try_to_cancel(&io->timer) != -1) {
 			list_del(&link->timeout.list);
-			io_cqring_fill_event(link->ctx, link->user_data,
-					     -ECANCELED, 0);
+			io_fill_cqe_req(link, -ECANCELED, 0);
 			io_put_req_deferred(link);
 			return true;
 		}
@@ -2087,7 +2091,7 @@ static void io_fail_links(struct io_kiocb *req)
 		link->link = NULL;
 
 		trace_io_uring_fail_link(req, link);
-		io_cqring_fill_event(link->ctx, link->user_data, res, 0);
+		io_fill_cqe_req(link, res, 0);
 		io_put_req_deferred(link);
 		link = nxt;
 	}
@@ -2104,8 +2108,7 @@ static bool io_disarm_next(struct io_kiocb *req)
 		req->flags &= ~REQ_F_ARM_LTIMEOUT;
 		if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
 			io_remove_next_linked(req);
-			io_cqring_fill_event(link->ctx, link->user_data,
-					     -ECANCELED, 0);
+			io_fill_cqe_req(link, -ECANCELED, 0);
 			io_put_req_deferred(link);
 			posted = true;
 		}
@@ -2369,8 +2372,8 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
 		struct io_kiocb *req = container_of(node, struct io_kiocb,
 						    comp_list);
 
-		__io_cqring_fill_event(ctx, req->user_data, req->result,
-					req->cflags);
+		__io_fill_cqe(ctx, req->user_data, req->result,
+			      req->cflags);
 	}
 	io_commit_cqring(ctx);
 	spin_unlock(&ctx->completion_lock);
@@ -2504,8 +2507,8 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 		/* order with io_complete_rw_iopoll(), e.g. ->result updates */
 		if (!smp_load_acquire(&req->iopoll_completed))
 			break;
-		__io_cqring_fill_event(ctx, req->user_data, req->result,
-					io_put_rw_kbuf(req));
+		__io_fill_cqe(ctx, req->user_data, req->result,
+			      io_put_rw_kbuf(req));
 		nr_events++;
 	}
 
@@ -5356,13 +5359,13 @@ static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask)
 	}
 	if (req->poll.events & EPOLLONESHOT)
 		flags = 0;
-	if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) {
+
+	if (!(flags & IORING_CQE_F_MORE)) {
+		io_fill_cqe_req(req, error, flags);
+	} else if (!io_fill_cqe_aux(ctx, req->user_data, error, flags)) {
 		req->poll.events |= EPOLLONESHOT;
 		flags = 0;
 	}
-	if (flags & IORING_CQE_F_MORE)
-		ctx->cq_extra++;
-
 	return !(flags & IORING_CQE_F_MORE);
 }
 
@@ -5680,9 +5683,9 @@ static bool io_poll_remove_one(struct io_kiocb *req)
 	do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true);
 
 	if (do_complete) {
-		io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0);
-		io_commit_cqring(req->ctx);
 		req_set_fail(req);
+		io_fill_cqe_req(req, -ECANCELED, 0);
+		io_commit_cqring(req->ctx);
 		io_put_req_deferred(req);
 	}
 	return do_complete;
@@ -5982,7 +5985,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
 		return PTR_ERR(req);
 
 	req_set_fail(req);
-	io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0);
+	io_fill_cqe_req(req, -ECANCELED, 0);
 	io_put_req_deferred(req);
 	return 0;
 }
@@ -8215,8 +8218,7 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
 
 			io_ring_submit_lock(ctx, lock_ring);
 			spin_lock(&ctx->completion_lock);
-			io_cqring_fill_event(ctx, prsrc->tag, 0, 0);
-			ctx->cq_extra++;
+			io_fill_cqe_aux(ctx, prsrc->tag, 0, 0);
 			io_commit_cqring(ctx);
 			spin_unlock(&ctx->completion_lock);
 			io_cqring_ev_posted(ctx);
-- 
2.33.1


* [PATCH v2 2/4] io_uring: add option to skip CQE posting
From: Pavel Begunkov @ 2021-11-10 15:49 UTC
  To: io-uring; +Cc: Jens Axboe, asml.silence

Emitting a CQE is expensive from the kernel perspective. Often, it's
also not convenient for the userspace, which spends cycles on
processing them, and it just complicates the logic. A similar problem
goes for linked requests, where we post a CQE for each request in the
link.

Introduce a new flag, IOSQE_CQE_SKIP_SUCCESS, to help with that. When
set and a request completes successfully, it won't generate a CQE. When
it fails, it produces a CQE, but all following linked requests will be
CQE-less, regardless of whether they have IOSQE_CQE_SKIP_SUCCESS set.
The notion of "fail" is the same as for link failing-cancellation, where
it's opcode dependent, and _usually_ result >= 0 is a success, but not
always.

Linked timeouts are a bit special. When the request a linked timeout is
attached to was never attempted for execution, e.g. when failing linked
requests, it follows the description above. Otherwise, whether a linked
timeout posts a completion or not solely depends on
IOSQE_CQE_SKIP_SUCCESS of that linked timeout request. Linked timeouts
never "fail" during execution, so for them it's unconditional. Users are
expected not to care about the result of a linked timeout but to rely
solely on the result of the master request. Another reason for such
treatment is that it's racy: the timeout callback may be running while
the master request posts its completion.

Use case 1:
If one doesn't care about the results of some requests, e.g. normal
timeouts, just set IOSQE_CQE_SKIP_SUCCESS. An error result will still
be posted and needs to be handled.

Use case 2:
Set IOSQE_CQE_SKIP_SUCCESS for all requests of a link but the last one,
and a completion will be posted only for the last one if everything goes
right; otherwise there will be only one CQE, for the first failed
request.
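
As a hedged illustration of use case 2 (a sketch, not part of the
patch; fd, buf and len are placeholders), a write linked to an fsync
where only the tail of the link posts a CQE on success:

/* On success one CQE is posted, for the fsync. If the write fails,
 * one CQE is posted for the write and the fsync is cancelled CQE-less.
 */
static int write_then_fsync(struct io_uring *ring, int fd,
                            const void *buf, unsigned len)
{
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        int ret;

        sqe = io_uring_get_sqe(ring);
        io_uring_prep_write(sqe, fd, buf, len, 0);
        /* skip the success CQE, keep the link to the fsync */
        sqe->flags |= IOSQE_IO_LINK | IOSQE_CQE_SKIP_SUCCESS;

        sqe = io_uring_get_sqe(ring);
        io_uring_prep_fsync(sqe, fd, 0);
        /* the tail of the link reports for the whole chain */

        io_uring_submit(ring);
        ret = io_uring_wait_cqe(ring, &cqe);
        if (ret < 0)
                return ret;
        ret = cqe->res;
        io_uring_cqe_seen(ring, cqe);
        return ret;
}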

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c                 | 42 +++++++++++++++++++++++++++--------
 include/uapi/linux/io_uring.h |  4 ++++
 2 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index e624f3f850bd..22572cfd6864 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -106,7 +106,8 @@
 #define IORING_MAX_REG_BUFFERS	(1U << 14)
 
 #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
-			  IOSQE_IO_HARDLINK | IOSQE_ASYNC)
+			  IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
+			  IOSQE_CQE_SKIP_SUCCESS)
 
 #define SQE_VALID_FLAGS	(SQE_COMMON_FLAGS|IOSQE_BUFFER_SELECT|IOSQE_IO_DRAIN)
 
@@ -721,6 +722,7 @@ enum {
 	REQ_F_HARDLINK_BIT	= IOSQE_IO_HARDLINK_BIT,
 	REQ_F_FORCE_ASYNC_BIT	= IOSQE_ASYNC_BIT,
 	REQ_F_BUFFER_SELECT_BIT	= IOSQE_BUFFER_SELECT_BIT,
+	REQ_F_CQE_SKIP_BIT	= IOSQE_CQE_SKIP_SUCCESS_BIT,
 
 	/* first byte is taken by user flags, shift it to not overlap */
 	REQ_F_FAIL_BIT		= 8,
@@ -737,6 +739,7 @@ enum {
 	REQ_F_REFCOUNT_BIT,
 	REQ_F_ARM_LTIMEOUT_BIT,
 	REQ_F_ASYNC_DATA_BIT,
+	REQ_F_SKIP_LINK_CQES_BIT,
 	/* keep async read/write and isreg together and in order */
 	REQ_F_SUPPORT_NOWAIT_BIT,
 	REQ_F_ISREG_BIT,
@@ -758,6 +761,8 @@ enum {
 	REQ_F_FORCE_ASYNC	= BIT(REQ_F_FORCE_ASYNC_BIT),
 	/* IOSQE_BUFFER_SELECT */
 	REQ_F_BUFFER_SELECT	= BIT(REQ_F_BUFFER_SELECT_BIT),
+	/* IOSQE_CQE_SKIP_SUCCESS */
+	REQ_F_CQE_SKIP		= BIT(REQ_F_CQE_SKIP_BIT),
 
 	/* fail rest of links */
 	REQ_F_FAIL		= BIT(REQ_F_FAIL_BIT),
@@ -791,6 +796,8 @@ enum {
 	REQ_F_ARM_LTIMEOUT	= BIT(REQ_F_ARM_LTIMEOUT_BIT),
 	/* ->async_data allocated */
 	REQ_F_ASYNC_DATA	= BIT(REQ_F_ASYNC_DATA_BIT),
+	/* don't post CQEs while failing linked requests */
+	REQ_F_SKIP_LINK_CQES	= BIT(REQ_F_SKIP_LINK_CQES_BIT),
 };
 
 struct async_poll {
@@ -1301,6 +1308,10 @@ static inline bool req_has_async_data(struct io_kiocb *req)
 static inline void req_set_fail(struct io_kiocb *req)
 {
 	req->flags |= REQ_F_FAIL;
+	if (req->flags & REQ_F_CQE_SKIP) {
+		req->flags &= ~REQ_F_CQE_SKIP;
+		req->flags |= REQ_F_SKIP_LINK_CQES;
+	}
 }
 
 static inline void req_fail_link_node(struct io_kiocb *req, int res)
@@ -1843,7 +1854,8 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
 
 static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
 {
-	__io_fill_cqe(req->ctx, req->user_data, res, cflags);
+	if (!(req->flags & REQ_F_CQE_SKIP))
+		__io_fill_cqe(req->ctx, req->user_data, res, cflags);
 }
 
 static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
@@ -1859,7 +1871,8 @@ static void io_req_complete_post(struct io_kiocb *req, s32 res,
 	struct io_ring_ctx *ctx = req->ctx;
 
 	spin_lock(&ctx->completion_lock);
-	__io_fill_cqe(ctx, req->user_data, res, cflags);
+	if (!(req->flags & REQ_F_CQE_SKIP))
+		__io_fill_cqe(ctx, req->user_data, res, cflags);
 	/*
 	 * If we're the last reference to this request, add to our locked
 	 * free_list cache.
@@ -2067,6 +2080,7 @@ static bool io_kill_linked_timeout(struct io_kiocb *req)
 		link->timeout.head = NULL;
 		if (hrtimer_try_to_cancel(&io->timer) != -1) {
 			list_del(&link->timeout.list);
+			/* leave REQ_F_CQE_SKIP to io_fill_cqe_req */
 			io_fill_cqe_req(link, -ECANCELED, 0);
 			io_put_req_deferred(link);
 			return true;
@@ -2079,6 +2093,7 @@ static void io_fail_links(struct io_kiocb *req)
 	__must_hold(&req->ctx->completion_lock)
 {
 	struct io_kiocb *nxt, *link = req->link;
+	bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES;
 
 	req->link = NULL;
 	while (link) {
@@ -2091,7 +2106,10 @@ static void io_fail_links(struct io_kiocb *req)
 		link->link = NULL;
 
 		trace_io_uring_fail_link(req, link);
-		io_fill_cqe_req(link, res, 0);
+		if (!ignore_cqes) {
+			link->flags &= ~REQ_F_CQE_SKIP;
+			io_fill_cqe_req(link, res, 0);
+		}
 		io_put_req_deferred(link);
 		link = nxt;
 	}
@@ -2108,6 +2126,7 @@ static bool io_disarm_next(struct io_kiocb *req)
 		req->flags &= ~REQ_F_ARM_LTIMEOUT;
 		if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
 			io_remove_next_linked(req);
+			/* leave REQ_F_CQE_SKIP to io_fill_cqe_req */
 			io_fill_cqe_req(link, -ECANCELED, 0);
 			io_put_req_deferred(link);
 			posted = true;
@@ -2372,8 +2391,9 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
 		struct io_kiocb *req = container_of(node, struct io_kiocb,
 						    comp_list);
 
-		__io_fill_cqe(ctx, req->user_data, req->result,
-			      req->cflags);
+		if (!(req->flags & REQ_F_CQE_SKIP))
+			__io_fill_cqe(ctx, req->user_data, req->result,
+				      req->cflags);
 	}
 	io_commit_cqring(ctx);
 	spin_unlock(&ctx->completion_lock);
@@ -2503,12 +2523,14 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 	prev = start;
 	wq_list_for_each_resume(pos, prev) {
 		struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
+		u32 cflags;
 
 		/* order with io_complete_rw_iopoll(), e.g. ->result updates */
 		if (!smp_load_acquire(&req->iopoll_completed))
 			break;
-		__io_fill_cqe(ctx, req->user_data, req->result,
-			      io_put_rw_kbuf(req));
+		cflags = io_put_rw_kbuf(req);
+		if (!(req->flags & REQ_F_CQE_SKIP))
+			__io_fill_cqe(ctx, req->user_data, req->result, cflags);
 		nr_events++;
 	}
 
@@ -5828,6 +5850,8 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
 	flags = READ_ONCE(sqe->len);
 	if (flags & ~IORING_POLL_ADD_MULTI)
 		return -EINVAL;
+	if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP))
+		return -EINVAL;
 
 	io_req_set_refcount(req);
 	poll->events = io_poll_parse_events(sqe, flags);
@@ -10438,7 +10462,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
 			IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
 			IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
 			IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
-			IORING_FEAT_RSRC_TAGS;
+			IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP;
 
 	if (copy_to_user(params, p, sizeof(*p))) {
 		ret = -EFAULT;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index c45b5e9a9387..787f491f0d2a 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -70,6 +70,7 @@ enum {
 	IOSQE_IO_HARDLINK_BIT,
 	IOSQE_ASYNC_BIT,
 	IOSQE_BUFFER_SELECT_BIT,
+	IOSQE_CQE_SKIP_SUCCESS_BIT,
 };
 
 /*
@@ -87,6 +88,8 @@ enum {
 #define IOSQE_ASYNC		(1U << IOSQE_ASYNC_BIT)
 /* select buffer from sqe->buf_group */
 #define IOSQE_BUFFER_SELECT	(1U << IOSQE_BUFFER_SELECT_BIT)
+/* don't post CQE if request succeeded */
+#define IOSQE_CQE_SKIP_SUCCESS	(1U << IOSQE_CQE_SKIP_SUCCESS_BIT)
 
 /*
  * io_uring_setup() flags
@@ -289,6 +292,7 @@ struct io_uring_params {
 #define IORING_FEAT_EXT_ARG		(1U << 8)
 #define IORING_FEAT_NATIVE_WORKERS	(1U << 9)
 #define IORING_FEAT_RSRC_TAGS		(1U << 10)
+#define IORING_FEAT_CQE_SKIP		(1U << 11)
 
 /*
  * io_uring_register(2) opcodes and arguments
-- 
2.33.1


* [PATCH v2 3/4] io_uring: don't spinlock when not posting CQEs
From: Pavel Begunkov @ 2021-11-10 15:49 UTC
  To: io-uring; +Cc: Jens Axboe, asml.silence

When none of the requests queued for batch completion need to post a
CQE, see IOSQE_CQE_SKIP_SUCCESS, avoid grabbing ->completion_lock and
doing the commit/post work.
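
For illustration, a userspace-side sketch of the case this optimises
(assumptions: an initialized ring and a kernel with this series; nops
stand in for real requests): a batch in which every request carries the
skip flag posts no CQEs on success, which is exactly when the batched
completion path can now avoid ->completion_lock:

static void submit_quiet_batch(struct io_uring *ring, unsigned nr)
{
        unsigned i;

        for (i = 0; i < nr; i++) {
                struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

                if (!sqe)
                        break;  /* SQ ring full, submit what we have */
                io_uring_prep_nop(sqe);
                sqe->flags |= IOSQE_CQE_SKIP_SUCCESS;
        }
        io_uring_submit(ring);
}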

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 22572cfd6864..0c0ea3bbb50a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -321,6 +321,7 @@ struct io_submit_state {
 
 	bool			plug_started;
 	bool			need_plug;
+	bool			flush_cqes;
 	unsigned short		submit_nr;
 	struct blk_plug		plug;
 };
@@ -1525,8 +1526,11 @@ static void io_prep_async_link(struct io_kiocb *req)
 
 static inline void io_req_add_compl_list(struct io_kiocb *req)
 {
+	struct io_ring_ctx *ctx = req->ctx;
 	struct io_submit_state *state = &req->ctx->submit_state;
 
+	if (!(req->flags & REQ_F_CQE_SKIP))
+		ctx->submit_state.flush_cqes = true;
 	wq_list_add_tail(&req->comp_list, &state->compl_reqs);
 }
 
@@ -2386,18 +2390,22 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
 	struct io_wq_work_node *node, *prev;
 	struct io_submit_state *state = &ctx->submit_state;
 
-	spin_lock(&ctx->completion_lock);
-	wq_list_for_each(node, prev, &state->compl_reqs) {
-		struct io_kiocb *req = container_of(node, struct io_kiocb,
+	if (state->flush_cqes) {
+		spin_lock(&ctx->completion_lock);
+		wq_list_for_each(node, prev, &state->compl_reqs) {
+			struct io_kiocb *req = container_of(node, struct io_kiocb,
 						    comp_list);
 
-		if (!(req->flags & REQ_F_CQE_SKIP))
-			__io_fill_cqe(ctx, req->user_data, req->result,
-				      req->cflags);
+			if (!(req->flags & REQ_F_CQE_SKIP))
+				__io_fill_cqe(ctx, req->user_data, req->result,
+					      req->cflags);
+		}
+
+		io_commit_cqring(ctx);
+		spin_unlock(&ctx->completion_lock);
+		io_cqring_ev_posted(ctx);
+		state->flush_cqes = false;
 	}
-	io_commit_cqring(ctx);
-	spin_unlock(&ctx->completion_lock);
-	io_cqring_ev_posted(ctx);
 
 	io_free_batch_list(ctx, state->compl_reqs.first);
 	INIT_WQ_LIST(&state->compl_reqs);
-- 
2.33.1


* [PATCH v2 4/4] io_uring: disable drain with cqe skip
From: Pavel Begunkov @ 2021-11-10 15:49 UTC
  To: io-uring; +Cc: Jens Axboe, asml.silence

The current IOSQE_IO_DRAIN implementation doesn't work well with CQE
skipping, so the combination is not allowed; otherwise some requests
might not be executed until the ring is destroyed and the userspace
would hang.

Let's fail all drain requests after seeing IOSQE_CQE_SKIP_SUCCESS at
least once. All drain requests prior to that will run normally, so
there should be no stalls. However, even though such mixing wouldn't
lead to issues at the moment, it's still not allowed, as the behaviour
may change.
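
To illustrate the resulting behaviour, a hypothetical test sketch
(assuming an initialized ring on a kernel carrying this series): once
the skip flag has been seen, a later drain request is failed rather
than executed:

static int demo_drain_rejected(struct io_uring *ring)
{
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        int res;

        sqe = io_uring_get_sqe(ring);
        io_uring_prep_nop(sqe);
        sqe->flags |= IOSQE_CQE_SKIP_SUCCESS;   /* disables drain from now on */

        sqe = io_uring_get_sqe(ring);
        io_uring_prep_nop(sqe);
        sqe->flags |= IOSQE_IO_DRAIN;           /* will be failed, not run */

        io_uring_submit(ring);
        if (io_uring_wait_cqe(ring, &cqe))
                return -1;
        res = cqe->res;         /* expected: -EOPNOTSUPP for the drain nop */
        io_uring_cqe_seen(ring, cqe);
        return res;
}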

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0c0ea3bbb50a..f2c64f0f68d9 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -106,10 +106,10 @@
 #define IORING_MAX_REG_BUFFERS	(1U << 14)
 
 #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
-			  IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
-			  IOSQE_CQE_SKIP_SUCCESS)
+			  IOSQE_IO_HARDLINK | IOSQE_ASYNC)
 
-#define SQE_VALID_FLAGS	(SQE_COMMON_FLAGS|IOSQE_BUFFER_SELECT|IOSQE_IO_DRAIN)
+#define SQE_VALID_FLAGS	(SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \
+			IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)
 
 #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
 				REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
@@ -339,6 +339,7 @@ struct io_ring_ctx {
 		unsigned int		restricted: 1;
 		unsigned int		off_timeout_used: 1;
 		unsigned int		drain_active: 1;
+		unsigned int		drain_disabled: 1;
 	} ____cacheline_aligned_in_smp;
 
 	/* submission data */
@@ -7123,8 +7124,13 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 		if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
 		    !io_op_defs[opcode].buffer_select)
 			return -EOPNOTSUPP;
-		if (sqe_flags & IOSQE_IO_DRAIN)
+		if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS)
+			ctx->drain_disabled = true;
+		if (sqe_flags & IOSQE_IO_DRAIN) {
+			if (ctx->drain_disabled)
+				return -EOPNOTSUPP;
 			io_init_req_drain(req);
+		}
 	}
 	if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
 		if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
-- 
2.33.1


* Re: [PATCH v2 0/4] allow to skip CQE posting
From: Jens Axboe @ 2021-11-10 16:14 UTC
  To: Pavel Begunkov, io-uring

On 11/10/21 8:49 AM, Pavel Begunkov wrote:
> It's expensive enough to post a CQE, and there are other
> reasons to want to ignore them, e.g. for link handling, or
> simply because it may be more convenient for the userspace.
> 
> Try to cover most of the use cases with one flag. The overhead
> is one "if (cqe->flags & IOSQE_CQE_SKIP_SUCCESS)" check per
> request and a slightly bloated req_set_fail(), which should be bearable.

I like the idea. One thing I'm struggling with is that I think a normal
use case of this would be fast IO where we still need to know that a
completion event has happened; we just don't need to know the details of
it, since we already know what those details would be if it ends up in
success.

How about having a skip counter? That would supposedly also allow drain
to work, and it could be mapped with the other cq parts to allow the app
to see it as well.
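
(A purely hypothetical sketch of the skip-counter idea, for
illustration only; neither the struct nor the accounting exists in the
uapi, and the field name is invented:)

#include <linux/types.h>

/* A counter of skipped completions, imagined as mapped alongside the
 * CQ ring and bumped by the kernel instead of posting a CQE.
 */
struct cq_skip_counter {
        __u32 skipped;
};

static inline __u32 cq_skipped(const struct cq_skip_counter *c)
{
        /* userspace would read it like the CQ tail: an acquire load */
        return __atomic_load_n(&c->skipped, __ATOMIC_ACQUIRE);
}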

-- 
Jens Axboe


* Re: [PATCH v2 0/4] allow to skip CQE posting
From: Pavel Begunkov @ 2021-11-10 16:42 UTC
  To: Jens Axboe, io-uring

On 11/10/21 16:14, Jens Axboe wrote:
> On 11/10/21 8:49 AM, Pavel Begunkov wrote:
>> It's expensive enough to post a CQE, and there are other
>> reasons to want to ignore them, e.g. for link handling, or
>> simply because it may be more convenient for the userspace.
>>
>> Try to cover most of the use cases with one flag. The overhead
>> is one "if (cqe->flags & IOSQE_CQE_SKIP_SUCCESS)" check per
>> request and a slightly bloated req_set_fail(), which should be bearable.
> 
> I like the idea. One thing I'm struggling with is that I think a normal
> use case of this would be fast IO where we still need to know that a
> completion event has happened; we just don't need to know the details of
> it, since we already know what those details would be if it ends up in
> success.
> 
> How about having a skip counter? That would supposedly also allow drain
> to work, and it could be mapped with the other cq parts to allow the app
> to see it as well.

It doesn't go through expensive io_cqring_ev_posted(), so the userspace
can't really wait on it. It can do some linking tricks to alleviate that,
but I don't see any new capabilities from the current approach.

Also the locking is a problem. I was thinking about it, mainly hoping
that I could adjust cq_extra and leave draining, but it didn't appear
great to me. AFAIK, it's either an atomic, beating the purpose of the
thing, or the split described below.

Another option is to split it in two: one counter kept under
->uring_lock and another under ->completion_lock. But it'll be messy,
shifting the flushing part of draining to a workqueue for mutex locking,
adding yet another bunch of counters that are hard to maintain, and so on.

And __io_submit_flush_completions() would also need to go through
the request list one extra time to do the accounting; we wouldn't
want to massively grow the inlined io_req_complete_state().

-- 
Pavel Begunkov

* Re: [PATCH v2 0/4] allow to skip CQE posting
From: Jens Axboe @ 2021-11-10 16:47 UTC
  To: Pavel Begunkov, io-uring

On 11/10/21 9:42 AM, Pavel Begunkov wrote:
> On 11/10/21 16:14, Jens Axboe wrote:
>> On 11/10/21 8:49 AM, Pavel Begunkov wrote:
>>> It's expensive enough to post a CQE, and there are other
>>> reasons to want to ignore them, e.g. for link handling, or
>>> simply because it may be more convenient for the userspace.
>>>
>>> Try to cover most of the use cases with one flag. The overhead
>>> is one "if (cqe->flags & IOSQE_CQE_SKIP_SUCCESS)" check per
>>> request and a slightly bloated req_set_fail(), which should be bearable.
>>
>> I like the idea. One thing I'm struggling with is that I think a normal
>> use case of this would be fast IO where we still need to know that a
>> completion event has happened; we just don't need to know the details of
>> it, since we already know what those details would be if it ends up in
>> success.
>>
>> How about having a skip counter? That would supposedly also allow drain
>> to work, and it could be mapped with the other cq parts to allow the app
>> to see it as well.
> 
> It doesn't go through expensive io_cqring_ev_posted(), so the
> userspace can't really wait on it. It can do some linking tricks to
> alleviate that, but I don't see any new capabilities from the current
> approach.

I'm not talking about waiting, just reading the cqring entry to see how
many were skipped. If you ask for no cqe, by definition there would be
nothing to wait on for you. Though it'd probably be better as an sqring
entry, since we'd be accounting at that time. Only caveat there is then
if the sqe errors and we do end up posting a cqe..

> Also the locking is a problem. I was thinking about it, mainly hoping
> that I could adjust cq_extra and leave draining, but it didn't appear
> great to me. AFAIK, it's either an atomic, beating the purpose of the
> thing, or the split described below.

If we do submission side, then the ring mutex would cover it. No need
for any extra locking

> Another option is to split it in two: one counter kept under
> ->uring_lock and another under ->completion_lock. But it'll be messy,
> shifting the flushing part of draining to a workqueue for mutex locking,
> adding yet another bunch of counters that are hard to maintain, and so on.

You'd only need the cqring counter for the unlikely event that the
request is failed and does post a cqe, though.

> And __io_submit_flush_completions() would also need to go through
> the request list one extra time to do the accounting; we wouldn't
> want to massively grow the inlined io_req_complete_state().

-- 
Jens Axboe


* Re: [PATCH v2 0/4] allow to skip CQE posting
From: Pavel Begunkov @ 2021-11-24 17:55 UTC
  To: Jens Axboe, io-uring

On 11/10/21 16:47, Jens Axboe wrote:
> On 11/10/21 9:42 AM, Pavel Begunkov wrote:
>> On 11/10/21 16:14, Jens Axboe wrote:
>>> On 11/10/21 8:49 AM, Pavel Begunkov wrote:
>>>> It's expensive enough to post a CQE, and there are other
>>>> reasons to want to ignore them, e.g. for link handling, or
>>>> simply because it may be more convenient for the userspace.
>>>>
>>>> Try to cover most of the use cases with one flag. The overhead
>>>> is one "if (cqe->flags & IOSQE_CQE_SKIP_SUCCESS)" check per
>>>> request and a slightly bloated req_set_fail(), which should be bearable.
>>>
>>> I like the idea. One thing I'm struggling with is that I think a normal
>>> use case of this would be fast IO where we still need to know that a
>>> completion event has happened; we just don't need to know the details of
>>> it, since we already know what those details would be if it ends up in
>>> success.
>>>
>>> How about having a skip counter? That would supposedly also allow drain
>>> to work, and it could be mapped with the other cq parts to allow the app
>>> to see it as well.
>>
>> It doesn't go through expensive io_cqring_ev_posted(), so the
>> userspace can't really wait on it. It can do some linking tricks to
>> alleviate that, but I don't see any new capabilities from the current
>> approach.
> 
> I'm not talking about waiting, just reading the cqring entry to see how
> many were skipped. If you ask for no cqe, by definition there would be
> nothing to wait on for you. Though it'd probably be better as an sqring
> entry, since we'd be accounting at that time. Only caveat there is then
> if the sqe errors and we do end up posting a cqe..
> 
>> Also the locking is a problem. I was thinking about it, mainly hoping
>> that I could adjust cq_extra and leave draining, but it didn't appear
>> great to me. AFAIK, it's either an atomic, beating the purpose of the
>> thing, or the split described below.
> 
> If we do submission side, then the ring mutex would cover it. No need
> for any extra locking

Jens, let's decide what we're going to do with this feature

> 
>> Another option is to split it in two: one counter kept under
>> ->uring_lock and another under ->completion_lock. But it'll be messy,
>> shifting the flushing part of draining to a workqueue for mutex locking,
>> adding yet another bunch of counters that are hard to maintain, and so on.
> 
> You'd only need the cqring counter for the unlikely event that the
> request is failed and does post a cqe, though.
> 
>> And __io_submit_flush_completions() would also need to go through
>> the request list one extra time to do the accounting; we wouldn't
>> want to massively grow the inlined io_req_complete_state().
> 

-- 
Pavel Begunkov

* Re: [PATCH v2 0/4] allow to skip CQE posting
From: Jens Axboe @ 2021-11-24 17:57 UTC
  To: Pavel Begunkov, io-uring

On 11/24/21 10:55 AM, Pavel Begunkov wrote:
> On 11/10/21 16:47, Jens Axboe wrote:
>> On 11/10/21 9:42 AM, Pavel Begunkov wrote:
>>> On 11/10/21 16:14, Jens Axboe wrote:
>>>> On 11/10/21 8:49 AM, Pavel Begunkov wrote:
>>>>> It's expensive enough to post a CQE, and there are other
>>>>> reasons to want to ignore them, e.g. for link handling, or
>>>>> simply because it may be more convenient for the userspace.
>>>>>
>>>>> Try to cover most of the use cases with one flag. The overhead
>>>>> is one "if (cqe->flags & IOSQE_CQE_SKIP_SUCCESS)" check per
>>>>> request and a slightly bloated req_set_fail(), which should be bearable.
>>>>
>>>> I like the idea. One thing I'm struggling with is that I think a normal
>>>> use case of this would be fast IO where we still need to know that a
>>>> completion event has happened; we just don't need to know the details of
>>>> it, since we already know what those details would be if it ends up in
>>>> success.
>>>>
>>>> How about having a skip counter? That would supposedly also allow drain
>>>> to work, and it could be mapped with the other cq parts to allow the app
>>>> to see it as well.
>>>
>>> It doesn't go through expensive io_cqring_ev_posted(), so the
>>> userspace can't really wait on it. It can do some linking tricks to
>>> alleviate that, but I don't see any new capabilities from the current
>>> approach.
>>
>> I'm not talking about waiting, just reading the cqring entry to see how
>> many were skipped. If you ask for no cqe, by definition there would be
>> nothing to wait on for you. Though it'd probably be better as an sqring
>> entry, since we'd be accounting at that time. Only caveat there is then
>> if the sqe errors and we do end up posting a cqe..
>>
>>> Also the locking is a problem. I was thinking about it, mainly hoping
>>> that I could adjust cq_extra and leave draining, but it didn't appear
>>> great to me. AFAIK, it's either an atomic, beating the purpose of the
>>> thing, or the split described below.
>>
>> If we do submission side, then the ring mutex would cover it. No need
>> for any extra locking
> 
> Jens, let's decide what we're going to do with this feature

Only weird bit is the drain, but apart from that I think it looks sane.
Are you going to send a documentation update to liburing as well? Should
be detailed in terms of what it does and the usability of it.

-- 
Jens Axboe


* Re: [PATCH v2 0/4] allow to skip CQE posting
From: Pavel Begunkov @ 2021-11-24 18:02 UTC
  To: Jens Axboe, io-uring

On 11/24/21 17:57, Jens Axboe wrote:
> On 11/24/21 10:55 AM, Pavel Begunkov wrote:
>> On 11/10/21 16:47, Jens Axboe wrote:
>>> On 11/10/21 9:42 AM, Pavel Begunkov wrote:
>>>> On 11/10/21 16:14, Jens Axboe wrote:
>>>>> On 11/10/21 8:49 AM, Pavel Begunkov wrote:
>>>>>> It's expensive enough to post a CQE, and there are other
>>>>>> reasons to want to ignore them, e.g. for link handling, or
>>>>>> simply because it may be more convenient for the userspace.
>>>>>>
>>>>>> Try to cover most of the use cases with one flag. The overhead
>>>>>> is one "if (cqe->flags & IOSQE_CQE_SKIP_SUCCESS)" check per
>>>>>> request and a slightly bloated req_set_fail(), which should be bearable.
>>>>>
>>>>> I like the idea. One thing I'm struggling with is that I think a normal
>>>>> use case of this would be fast IO where we still need to know that a
>>>>> completion event has happened; we just don't need to know the details of
>>>>> it, since we already know what those details would be if it ends up in
>>>>> success.
>>>>>
>>>>> How about having a skip counter? That would supposedly also allow drain
>>>>> to work, and it could be mapped with the other cq parts to allow the app
>>>>> to see it as well.
>>>>
>>>> It doesn't go through expensive io_cqring_ev_posted(), so the
>>>> userspace can't really wait on it. It can do some linking tricks to
>>>> alleviate that, but I don't see any new capabilities from the current
>>>> approach.
>>>
>>> I'm not talking about waiting, just reading the cqring entry to see how
>>> many were skipped. If you ask for no cqe, by definition there would be
>>> nothing to wait on for you. Though it'd probably be better as an sqring
>>> entry, since we'd be accounting at that time. Only caveat there is then
>>> if the sqe errors and we do end up posting a cqe..
>>>
>>>> Also the locking is a problem. I was thinking about it, mainly hoping
>>>> that I could adjust cq_extra and leave draining, but it didn't appear
>>>> great to me. AFAIK, it's either an atomic, beating the purpose of the
>>>> thing, or the split described below.
>>>
>>> If we do submission side, then the ring mutex would cover it. No need
>>> for any extra locking
>>
>> Jens, let's decide what we're going to do with this feature
> 
> Only weird bit is the drain, but apart from that I think it looks sane.

agree, but I can't find a fix without penalising performance

> Are you going to send a documentation update to liburing as well? Should
> be detailed in terms of what it does and the usability of it.

yeah, and also need to rebase and resend tests

-- 
Pavel Begunkov

* Re: [PATCH v2 0/4] allow to skip CQE posting
From: Jens Axboe @ 2021-11-24 18:17 UTC
  To: Pavel Begunkov, io-uring

On 11/24/21 11:02 AM, Pavel Begunkov wrote:
> On 11/24/21 17:57, Jens Axboe wrote:
>> On 11/24/21 10:55 AM, Pavel Begunkov wrote:
>>> On 11/10/21 16:47, Jens Axboe wrote:
>>>> On 11/10/21 9:42 AM, Pavel Begunkov wrote:
>>>>> On 11/10/21 16:14, Jens Axboe wrote:
>>>>>> On 11/10/21 8:49 AM, Pavel Begunkov wrote:
>>>>>>> It's expensive enough to post a CQE, and there are other
>>>>>>> reasons to want to ignore them, e.g. for link handling, or
>>>>>>> simply because it may be more convenient for the userspace.
>>>>>>>
>>>>>>> Try to cover most of the use cases with one flag. The overhead
>>>>>>> is one "if (cqe->flags & IOSQE_CQE_SKIP_SUCCESS)" check per
>>>>>>> request and a slightly bloated req_set_fail(), which should be bearable.
>>>>>>
>>>>>> I like the idea. One thing I'm struggling with is that I think a normal
>>>>>> use case of this would be fast IO where we still need to know that a
>>>>>> completion event has happened; we just don't need to know the details of
>>>>>> it, since we already know what those details would be if it ends up in
>>>>>> success.
>>>>>>
>>>>>> How about having a skip counter? That would supposedly also allow drain
>>>>>> to work, and it could be mapped with the other cq parts to allow the app
>>>>>> to see it as well.
>>>>>
>>>>> It doesn't go through expensive io_cqring_ev_posted(), so the
>>>>> userspace can't really wait on it. It can do some linking tricks to
>>>>> alleviate that, but I don't see any new capabilities from the current
>>>>> approach.
>>>>
>>>> I'm not talking about waiting, just reading the cqring entry to see how
>>>> many were skipped. If you ask for no cqe, by definition there would be
>>>> nothing to wait on for you. Though it'd probably be better as an sqring
>>>> entry, since we'd be accounting at that time. Only caveat there is then
>>>> if the sqe errors and we do end up posting a cqe..
>>>>
>>>>> Also the locking is a problem. I was thinking about it, mainly hoping
>>>>> that I could adjust cq_extra and leave draining, but it didn't appear
>>>>> great to me. AFAIK, it's either an atomic, beating the purpose of the
>>>>> thing, or the split described below.
>>>>
>>>> If we do submission side, then the ring mutex would cover it. No need
>>>> for any extra locking
>>>
>>> Jens, let's decide what we're going to do with this feature
>>
>> Only weird bit is the drain, but apart from that I think it looks sane.
> 
> agree, but I can't find a fix without penalising performance

I think we're OK, as I don't think DRAIN is used very much, and as long
as it's adequately documented in terms of them not co-existing and what
the error code is. If we do find a way to make them work together, we
can relax the restriction in the future.

>> Are you going to send a documentation update to liburing as well? Should
>> be detailed in terms of what it does and the usability of it.
> 
> yeah, and also need to rebase and resend tests

Great, thanks.

-- 
Jens Axboe


* Re: [PATCH v2 0/4] allow to skip CQE posting
From: Jens Axboe @ 2021-11-24 18:18 UTC
  To: io-uring, Pavel Begunkov

On Wed, 10 Nov 2021 15:49:30 +0000, Pavel Begunkov wrote:
> It's expensive enough to post a CQE, and there are other
> reasons to want to ignore them, e.g. for link handling, or
> simply because it may be more convenient for the userspace.
> 
> Try to cover most of the use cases with one flag. The overhead
> is one "if (cqe->flags & IOSQE_CQE_SKIP_SUCCESS)" check per
> request and a slightly bloated req_set_fail(), which should be bearable.
> 
> [...]

Applied, thanks!

[1/4] io_uring: clean cqe filling functions
      commit: 913a571affedd17239c4d4ea90c8874b32fc2191
[2/4] io_uring: add option to skip CQE posting
      commit: 04c76b41ca974b508522831441dd7e5b1b59cbb0
[3/4] io_uring: don't spinlock when not posting CQEs
      commit: 3d4aeb9f98058c3bdfef5286e240cf18c50fee89
[4/4] io_uring: disable drain with cqe skip
      commit: 5562a8d71aa32ea27133d8b10406b3dcd57c01a5

Best regards,
-- 
Jens Axboe



* Re: [PATCH v2 3/4] io_uring: don't spinlock when not posting CQEs
From: Hao Xu @ 2021-11-25  3:48 UTC
  To: Pavel Begunkov, io-uring; +Cc: Jens Axboe

On 2021/11/10 23:49, Pavel Begunkov wrote:
> When none of the requests queued for batch completion need to post a
> CQE, see IOSQE_CQE_SKIP_SUCCESS, avoid grabbing ->completion_lock and
> doing the commit/post work.
> 
> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
> ---
>   fs/io_uring.c | 26 +++++++++++++++++---------
>   1 file changed, 17 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 22572cfd6864..0c0ea3bbb50a 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -321,6 +321,7 @@ struct io_submit_state {
>   
>   	bool			plug_started;
>   	bool			need_plug;
> +	bool			flush_cqes;
>   	unsigned short		submit_nr;
>   	struct blk_plug		plug;
>   };
> @@ -1525,8 +1526,11 @@ static void io_prep_async_link(struct io_kiocb *req)
>   
>   static inline void io_req_add_compl_list(struct io_kiocb *req)
>   {
> +	struct io_ring_ctx *ctx = req->ctx;
>   	struct io_submit_state *state = &req->ctx->submit_state;
>   
> +	if (!(req->flags & REQ_F_CQE_SKIP))
> +		ctx->submit_state.flush_cqes = true;
Should we init it to false in submit_state_start()?
>   	wq_list_add_tail(&req->comp_list, &state->compl_reqs);
>   }
>   
> @@ -2386,18 +2390,22 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
>   	struct io_wq_work_node *node, *prev;
>   	struct io_submit_state *state = &ctx->submit_state;
>   
> -	spin_lock(&ctx->completion_lock);
> -	wq_list_for_each(node, prev, &state->compl_reqs) {
> -		struct io_kiocb *req = container_of(node, struct io_kiocb,
> +	if (state->flush_cqes) {
> +		spin_lock(&ctx->completion_lock);
> +		wq_list_for_each(node, prev, &state->compl_reqs) {
> +			struct io_kiocb *req = container_of(node, struct io_kiocb,
>   						    comp_list);
>   
> -		if (!(req->flags & REQ_F_CQE_SKIP))
> -			__io_fill_cqe(ctx, req->user_data, req->result,
> -				      req->cflags);
> +			if (!(req->flags & REQ_F_CQE_SKIP))
> +				__io_fill_cqe(ctx, req->user_data, req->result,
> +					      req->cflags);
> +		}
> +
> +		io_commit_cqring(ctx);
> +		spin_unlock(&ctx->completion_lock);
> +		io_cqring_ev_posted(ctx);
> +		state->flush_cqes = false;
>   	}
> -	io_commit_cqring(ctx);
> -	spin_unlock(&ctx->completion_lock);
> -	io_cqring_ev_posted(ctx);
>   
>   	io_free_batch_list(ctx, state->compl_reqs.first);
>   	INIT_WQ_LIST(&state->compl_reqs);
> 


* Re: [PATCH v2 3/4] io_uring: don't spinlock when not posting CQEs
From: Hao Xu @ 2021-11-25  7:35 UTC
  To: Pavel Begunkov, io-uring; +Cc: Jens Axboe

On 2021/11/25 11:48, Hao Xu wrote:
> On 2021/11/10 23:49, Pavel Begunkov wrote:
>> When none of the requests queued for batch completion need to post a
>> CQE, see IOSQE_CQE_SKIP_SUCCESS, avoid grabbing ->completion_lock and
>> doing the commit/post work.
>>
>> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
>> ---
>>   fs/io_uring.c | 26 +++++++++++++++++---------
>>   1 file changed, 17 insertions(+), 9 deletions(-)
>>
>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>> index 22572cfd6864..0c0ea3bbb50a 100644
>> --- a/fs/io_uring.c
>> +++ b/fs/io_uring.c
>> @@ -321,6 +321,7 @@ struct io_submit_state {
>>       bool            plug_started;
>>       bool            need_plug;
>> +    bool            flush_cqes;
>>       unsigned short        submit_nr;
>>       struct blk_plug        plug;
>>   };
>> @@ -1525,8 +1526,11 @@ static void io_prep_async_link(struct io_kiocb 
>> *req)
>>   static inline void io_req_add_compl_list(struct io_kiocb *req)
>>   {
>> +    struct io_ring_ctx *ctx = req->ctx;
>>       struct io_submit_state *state = &req->ctx->submit_state;
>> +    if (!(req->flags & REQ_F_CQE_SKIP))
>> +        ctx->submit_state.flush_cqes = true;
> Should we init it to false in submit_state_start()?
Never mind, I saw it in __io_submit_flush_completions(), so there is no
problem.

* Re: [PATCH v2 0/4] allow to skip CQE posting
From: Hao Xu @ 2021-11-25  9:35 UTC
  To: Pavel Begunkov, Jens Axboe, io-uring

On 2021/11/11 00:42, Pavel Begunkov wrote:
> On 11/10/21 16:14, Jens Axboe wrote:
>> On 11/10/21 8:49 AM, Pavel Begunkov wrote:
>>> It's expensive enough to post a CQE, and there are other
>>> reasons to want to ignore them, e.g. for link handling, or
>>> simply because it may be more convenient for the userspace.
>>>
>>> Try to cover most of the use cases with one flag. The overhead
>>> is one "if (cqe->flags & IOSQE_CQE_SKIP_SUCCESS)" check per
>>> request and a slightly bloated req_set_fail(), which should be bearable.
>>
>> I like the idea. One thing I'm struggling with is that I think a normal
>> use case of this would be fast IO where we still need to know that a
>> completion event has happened; we just don't need to know the details of
>> it, since we already know what those details would be if it ends up in
>> success.
>>
>> How about having a skip counter? That would supposedly also allow drain
>> to work, and it could be mapped with the other cq parts to allow the app
>> to see it as well.
> 
> It doesn't go through expensive io_cqring_ev_posted(), so the userspace
> can't really wait on it. It can do some linking tricks to alleviate that,
> but I don't see any new capabilities from the current approach.
> 
> Also the locking is a problem. I was thinking about it, mainly hoping
> that I could adjust cq_extra and leave draining, but it didn't appear
> great to me. AFAIK, it's either an atomic, beating the purpose of the
> thing, or the split described below.
For drain requests, we just need to adjust cq_extra:
if (!skip) fill_cqe;
else       cq_extra--;
cq_extra is already protected by the completion_lock.
Am I missing something?
> 
> Another option is to split it in two: one counter kept under
> ->uring_lock and another under ->completion_lock. But it'll be messy,
> shifting the flushing part of draining to a workqueue for mutex locking,
> adding yet another bunch of counters that are hard to maintain, and so on.
> 
> And __io_submit_flush_completions() would also need to go through
> the request list one extra time to do the accounting; we wouldn't
> want to massively grow the inlined io_req_complete_state().
> 


* Re: [PATCH v2 0/4] allow to skip CQE posting
From: Pavel Begunkov @ 2021-11-25 14:22 UTC
  To: Hao Xu, Jens Axboe, io-uring

On 11/25/21 09:35, Hao Xu wrote:
> On 2021/11/11 00:42, Pavel Begunkov wrote:
>> On 11/10/21 16:14, Jens Axboe wrote:
>>> On 11/10/21 8:49 AM, Pavel Begunkov wrote:
>>>> It's expensive enough to post a CQE, and there are other
>>>> reasons to want to ignore them, e.g. for link handling, or
>>>> simply because it may be more convenient for the userspace.
>>>>
>>>> Try to cover most of the use cases with one flag. The overhead
>>>> is one "if (cqe->flags & IOSQE_CQE_SKIP_SUCCESS)" check per
>>>> request and a slightly bloated req_set_fail(), which should be bearable.
>>>
>>> I like the idea. One thing I'm struggling with is that I think a normal
>>> use case of this would be fast IO where we still need to know that a
>>> completion event has happened; we just don't need to know the details of
>>> it, since we already know what those details would be if it ends up in
>>> success.
>>>
>>> How about having a skip counter? That would supposedly also allow drain
>>> to work, and it could be mapped with the other cq parts to allow the app
>>> to see it as well.
>>
>> It doesn't go through expensive io_cqring_ev_posted(), so the userspace
>> can't really wait on it. It can do some linking tricks to alleviate that,
>> but I don't see any new capabilities from the current approach.
>>
>> Also the locking is a problem. I was thinking about it, mainly hoping
>> that I could adjust cq_extra and leave draining, but it didn't appear
>> great to me. AFAIK, it's either an atomic, beating the purpose of the
>> thing, or the split described below.
> For drain requests, we just need to adjust cq_extra:
> if (!skip) fill_cqe;
> else       cq_extra--;
> cq_extra is already protected by the completion_lock.

Yes, and we don't take the lock in __io_submit_flush_completions()
when not posting.


>> Another option is to split it in two: one counter kept under
>> ->uring_lock and another under ->completion_lock. But it'll be messy,
>> shifting the flushing part of draining to a workqueue for mutex locking,
>> adding yet another bunch of counters that are hard to maintain, and so on.
>>
>> And __io_submit_flush_completions() would also need to go through
>> the request list one extra time to do the accounting; we wouldn't
>> want to massively grow the inlined io_req_complete_state().

-- 
Pavel Begunkov

* Re: [PATCH v2 0/4] allow to skip CQE posting
From: Olivier Langlois @ 2021-12-06 19:49 UTC
  To: Jens Axboe, io-uring, Pavel Begunkov

On Wed, 2021-11-24 at 11:18 -0700, Jens Axboe wrote:
> On Wed, 10 Nov 2021 15:49:30 +0000, Pavel Begunkov wrote:
> > It's expensive enough to post a CQE, and there are other
> > reasons to want to ignore them, e.g. for link handling, or
> > simply because it may be more convenient for the userspace.
> > 
> > Try to cover most of the use cases with one flag. The overhead
> > is one "if (cqe->flags & IOSQE_CQE_SKIP_SUCCESS)" check per
> > request and a slightly bloated req_set_fail(), which should be bearable.
> > 
> > [...]
> 
> Applied, thanks!
> 
> [1/4] io_uring: clean cqe filling functions
>       commit: 913a571affedd17239c4d4ea90c8874b32fc2191
> [2/4] io_uring: add option to skip CQE posting
>       commit: 04c76b41ca974b508522831441dd7e5b1b59cbb0
> [3/4] io_uring: don't spinlock when not posting CQEs
>       commit: 3d4aeb9f98058c3bdfef5286e240cf18c50fee89
> [4/4] io_uring: disable drain with cqe skip
>       commit: 5562a8d71aa32ea27133d8b10406b3dcd57c01a5
> 
> Best regards,

Awesome!

That set of patches was on my radar, and I am very interested in it.

If 5.15 or the soon-to-be-released 5.16 is patchable with it, I'll give
it a try in my app and report back the benefits it got from it...

Greetings,

