* [PATCH next 0/9] for-next clean ups and micro optimisation
@ 2022-04-12 14:09 Pavel Begunkov
  2022-04-12 14:09 ` [PATCH 1/9] io_uring: explicitly keep a CQE in io_kiocb Pavel Begunkov
                   ` (10 more replies)
  0 siblings, 11 replies; 16+ messages in thread
From: Pavel Begunkov @ 2022-04-12 14:09 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe, asml.silence

nops benchmark: 40.3 -> 41.1 MIOPS, or +2%

Pavel Begunkov (9):
  io_uring: explicitly keep a CQE in io_kiocb
  io_uring: memcpy CQE from req
  io_uring: shrink final link flush
  io_uring: inline io_flush_cached_reqs
  io_uring: helper for empty req cache checks
  io_uring: add helper to return req to cache list
  io_uring: optimise submission loop invariant
  io_uring: optimise submission left counting
  io_uring: optimise io_get_cqe()

 fs/io_uring.c | 288 +++++++++++++++++++++++++++++---------------------
 1 file changed, 165 insertions(+), 123 deletions(-)
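
The nop numbers above come from a no-op request benchmark. As a rough,
purely illustrative sketch of what such a benchmark does, here is a
minimal liburing loop (not the actual tool used to produce the figures
above):

#include <liburing.h>
#include <stdio.h>

int main(void)
{
        struct io_uring ring;
        unsigned long long done = 0;

        if (io_uring_queue_init(256, &ring, 0) < 0)
                return 1;

        /* submit batches of NOPs and reap completions in a tight loop */
        for (int iter = 0; iter < 1000000; iter++) {
                struct io_uring_cqe *cqe;
                unsigned head, seen = 0;
                int i;

                for (i = 0; i < 32; i++) {
                        struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

                        if (!sqe)
                                break;
                        io_uring_prep_nop(sqe);
                }
                if (io_uring_submit_and_wait(&ring, i) < 0)
                        break;

                io_uring_for_each_cqe(&ring, head, cqe)
                        seen++;
                io_uring_cq_advance(&ring, seen);
                done += seen;
        }
        printf("completed %llu nops\n", done);
        io_uring_queue_exit(&ring);
        return 0;
}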

-- 
2.35.1


* [PATCH 1/9] io_uring: explicitly keep a CQE in io_kiocb
  2022-04-12 14:09 [PATCH next 0/9] for-next clean ups and micro optimisation Pavel Begunkov
@ 2022-04-12 14:09 ` Pavel Begunkov
  2022-04-12 14:09 ` [PATCH 2/9] io_uring: memcpy CQE from req Pavel Begunkov
                   ` (9 subsequent siblings)
  10 siblings, 0 replies; 16+ messages in thread
From: Pavel Begunkov @ 2022-04-12 14:09 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe, asml.silence

We already have req->{result,user_data,cflags}, which mimic struct
io_uring_cqe and are intended to store CQE data. Combine them into a
struct io_uring_cqe field.
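
For context, this is the userspace-visible CQE layout being embedded,
as defined in include/uapi/linux/io_uring.h around the time of this
series (reproduced here only for reference):

struct io_uring_cqe {
        __u64   user_data;      /* sqe->user_data value, passed back */
        __s32   res;            /* result code for this event */
        __u32   flags;
};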

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 123 ++++++++++++++++++++++++--------------------------
 1 file changed, 60 insertions(+), 63 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index a28eb7aec84d..ce5d7ebc34aa 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -909,10 +909,7 @@ struct io_kiocb {
 	u16				buf_index;
 	unsigned int			flags;
 
-	u64				user_data;
-	u32				result;
-	u32				cflags;
-
+	struct io_uring_cqe		cqe;
 	struct io_ring_ctx		*ctx;
 	struct task_struct		*task;
 
@@ -1493,7 +1490,7 @@ static inline void req_set_fail(struct io_kiocb *req)
 static inline void req_fail_link_node(struct io_kiocb *req, int res)
 {
 	req_set_fail(req);
-	req->result = res;
+	req->cqe.res = res;
 }
 
 static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
@@ -1725,7 +1722,7 @@ static void io_queue_async_work(struct io_kiocb *req, bool *dont_use)
 	if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
 		req->work.flags |= IO_WQ_WORK_CANCEL;
 
-	trace_io_uring_queue_async_work(ctx, req, req->user_data, req->opcode, req->flags,
+	trace_io_uring_queue_async_work(ctx, req, req->cqe.user_data, req->opcode, req->flags,
 					&req->work, io_wq_is_hashed(&req->work));
 	io_wq_enqueue(tctx->io_wq, &req->work);
 	if (link)
@@ -2067,8 +2064,8 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
 
 static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
 {
-	trace_io_uring_complete(req->ctx, req, req->user_data, res, cflags);
-	return __io_fill_cqe(req->ctx, req->user_data, res, cflags);
+	trace_io_uring_complete(req->ctx, req, req->cqe.user_data, res, cflags);
+	return __io_fill_cqe(req->ctx, req->cqe.user_data, res, cflags);
 }
 
 static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
@@ -2134,8 +2131,8 @@ static void io_req_complete_post(struct io_kiocb *req, s32 res,
 static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
 					 u32 cflags)
 {
-	req->result = res;
-	req->cflags = cflags;
+	req->cqe.res = res;
+	req->cqe.flags = cflags;
 	req->flags |= REQ_F_COMPLETE_INLINE;
 }
 
@@ -2167,7 +2164,7 @@ static void io_req_complete_fail_submit(struct io_kiocb *req)
 	 */
 	req->flags &= ~REQ_F_HARDLINK;
 	req->flags |= REQ_F_LINK;
-	io_req_complete_failed(req, req->result);
+	io_req_complete_failed(req, req->cqe.res);
 }
 
 /*
@@ -2180,7 +2177,7 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
 	req->link = NULL;
 	req->async_data = NULL;
 	/* not necessary, but safer to zero */
-	req->result = 0;
+	req->cqe.res = 0;
 }
 
 static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
@@ -2334,12 +2331,12 @@ static void io_fail_links(struct io_kiocb *req)
 		long res = -ECANCELED;
 
 		if (link->flags & REQ_F_FAIL)
-			res = link->result;
+			res = link->cqe.res;
 
 		nxt = link->link;
 		link->link = NULL;
 
-		trace_io_uring_fail_link(req->ctx, req, req->user_data,
+		trace_io_uring_fail_link(req->ctx, req, req->cqe.user_data,
 					req->opcode, link);
 
 		if (!ignore_cqes) {
@@ -2459,7 +2456,7 @@ static void handle_prev_tw_list(struct io_wq_work_node *node,
 		if (likely(*uring_locked))
 			req->io_task_work.func(req, uring_locked);
 		else
-			__io_req_complete_post(req, req->result,
+			__io_req_complete_post(req, req->cqe.res,
 						io_put_kbuf_comp(req));
 		node = next;
 	} while (node);
@@ -2589,7 +2586,7 @@ static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
 
 	/* not needed for normal modes, but SQPOLL depends on it */
 	io_tw_lock(ctx, locked);
-	io_req_complete_failed(req, req->result);
+	io_req_complete_failed(req, req->cqe.res);
 }
 
 static void io_req_task_submit(struct io_kiocb *req, bool *locked)
@@ -2606,7 +2603,7 @@ static void io_req_task_submit(struct io_kiocb *req, bool *locked)
 
 static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
 {
-	req->result = ret;
+	req->cqe.res = ret;
 	req->io_task_work.func = io_req_task_cancel;
 	io_req_task_work_add(req, false);
 }
@@ -2706,7 +2703,7 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
 						    comp_list);
 
 			if (!(req->flags & REQ_F_CQE_SKIP))
-				__io_fill_cqe_req(req, req->result, req->cflags);
+				__io_fill_cqe_req(req, req->cqe.res, req->cqe.flags);
 		}
 
 		io_commit_cqring(ctx);
@@ -2831,7 +2828,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 		if (unlikely(req->flags & REQ_F_CQE_SKIP))
 			continue;
 
-		__io_fill_cqe_req(req, req->result, io_put_kbuf(req, 0));
+		__io_fill_cqe_req(req, req->cqe.res, io_put_kbuf(req, 0));
 		nr_events++;
 	}
 
@@ -2990,21 +2987,21 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res)
 	} else {
 		fsnotify_access(req->file);
 	}
-	if (unlikely(res != req->result)) {
+	if (unlikely(res != req->cqe.res)) {
 		if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
 		    io_rw_should_reissue(req)) {
 			req->flags |= REQ_F_REISSUE;
 			return true;
 		}
 		req_set_fail(req);
-		req->result = res;
+		req->cqe.res = res;
 	}
 	return false;
 }
 
 static inline void io_req_task_complete(struct io_kiocb *req, bool *locked)
 {
-	int res = req->result;
+	int res = req->cqe.res;
 
 	if (*locked) {
 		io_req_complete_state(req, res, io_put_kbuf(req, 0));
@@ -3020,7 +3017,7 @@ static void __io_complete_rw(struct io_kiocb *req, long res,
 {
 	if (__io_complete_rw_common(req, res))
 		return;
-	__io_req_complete(req, issue_flags, req->result,
+	__io_req_complete(req, issue_flags, req->cqe.res,
 				io_put_kbuf(req, issue_flags));
 }
 
@@ -3030,7 +3027,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res)
 
 	if (__io_complete_rw_common(req, res))
 		return;
-	req->result = res;
+	req->cqe.res = res;
 	req->io_task_work.func = io_req_task_complete;
 	io_req_task_work_add(req, !!(req->ctx->flags & IORING_SETUP_SQPOLL));
 }
@@ -3041,12 +3038,12 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
 
 	if (kiocb->ki_flags & IOCB_WRITE)
 		kiocb_end_write(req);
-	if (unlikely(res != req->result)) {
+	if (unlikely(res != req->cqe.res)) {
 		if (res == -EAGAIN && io_rw_should_reissue(req)) {
 			req->flags |= REQ_F_REISSUE;
 			return;
 		}
-		req->result = res;
+		req->cqe.res = res;
 	}
 
 	/* order with io_iopoll_complete() checking ->iopoll_completed */
@@ -3838,7 +3835,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
 	ret = io_rw_init_file(req, FMODE_READ);
 	if (unlikely(ret))
 		return ret;
-	req->result = iov_iter_count(&s->iter);
+	req->cqe.res = iov_iter_count(&s->iter);
 
 	if (force_nonblock) {
 		/* If the file doesn't support async, just async punt */
@@ -3854,7 +3851,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
 
 	ppos = io_kiocb_update_pos(req);
 
-	ret = rw_verify_area(READ, req->file, ppos, req->result);
+	ret = rw_verify_area(READ, req->file, ppos, req->cqe.res);
 	if (unlikely(ret)) {
 		kfree(iovec);
 		return ret;
@@ -3876,7 +3873,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
 		ret = 0;
 	} else if (ret == -EIOCBQUEUED) {
 		goto out_free;
-	} else if (ret == req->result || ret <= 0 || !force_nonblock ||
+	} else if (ret == req->cqe.res || ret <= 0 || !force_nonblock ||
 		   (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
 		/* read all, failed, already did sync or don't want to retry */
 		goto done;
@@ -3964,7 +3961,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
 	ret = io_rw_init_file(req, FMODE_WRITE);
 	if (unlikely(ret))
 		return ret;
-	req->result = iov_iter_count(&s->iter);
+	req->cqe.res = iov_iter_count(&s->iter);
 
 	if (force_nonblock) {
 		/* If the file doesn't support async, just async punt */
@@ -3984,7 +3981,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
 
 	ppos = io_kiocb_update_pos(req);
 
-	ret = rw_verify_area(WRITE, req->file, ppos, req->result);
+	ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res);
 	if (unlikely(ret))
 		goto out_free;
 
@@ -5769,7 +5766,7 @@ static void io_poll_req_insert(struct io_kiocb *req)
 	struct io_ring_ctx *ctx = req->ctx;
 	struct hlist_head *list;
 
-	list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
+	list = &ctx->cancel_hash[hash_long(req->cqe.user_data, ctx->cancel_hash_bits)];
 	hlist_add_head(&req->hash_node, list);
 }
 
@@ -5834,7 +5831,7 @@ static void io_poll_remove_entries(struct io_kiocb *req)
  *
  * Returns a negative error on failure. >0 when no action require, which is
  * either spurious wakeup or multishot CQE is served. 0 when it's done with
- * the request, then the mask is stored in req->result.
+ * the request, then the mask is stored in req->cqe.res.
  */
 static int io_poll_check_events(struct io_kiocb *req, bool locked)
 {
@@ -5855,29 +5852,29 @@ static int io_poll_check_events(struct io_kiocb *req, bool locked)
 		if (v & IO_POLL_CANCEL_FLAG)
 			return -ECANCELED;
 
-		if (!req->result) {
-			struct poll_table_struct pt = { ._key = req->cflags };
+		if (!req->cqe.res) {
+			struct poll_table_struct pt = { ._key = req->cqe.flags };
 
 			if (unlikely(!io_assign_file(req, IO_URING_F_UNLOCKED)))
-				req->result = -EBADF;
+				req->cqe.res = -EBADF;
 			else
-				req->result = vfs_poll(req->file, &pt) & req->cflags;
+				req->cqe.res = vfs_poll(req->file, &pt) & req->cqe.flags;
 		}
 
 		/* multishot, just fill an CQE and proceed */
-		if (req->result && !(req->cflags & EPOLLONESHOT)) {
-			__poll_t mask = mangle_poll(req->result & poll->events);
+		if (req->cqe.res && !(req->cqe.flags & EPOLLONESHOT)) {
+			__poll_t mask = mangle_poll(req->cqe.res & poll->events);
 			bool filled;
 
 			spin_lock(&ctx->completion_lock);
-			filled = io_fill_cqe_aux(ctx, req->user_data, mask,
+			filled = io_fill_cqe_aux(ctx, req->cqe.user_data, mask,
 						 IORING_CQE_F_MORE);
 			io_commit_cqring(ctx);
 			spin_unlock(&ctx->completion_lock);
 			if (unlikely(!filled))
 				return -ECANCELED;
 			io_cqring_ev_posted(ctx);
-		} else if (req->result) {
+		} else if (req->cqe.res) {
 			return 0;
 		}
 
@@ -5900,16 +5897,16 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
 		return;
 
 	if (!ret) {
-		req->result = mangle_poll(req->result & req->poll.events);
+		req->cqe.res = mangle_poll(req->cqe.res & req->poll.events);
 	} else {
-		req->result = ret;
+		req->cqe.res = ret;
 		req_set_fail(req);
 	}
 
 	io_poll_remove_entries(req);
 	spin_lock(&ctx->completion_lock);
 	hash_del(&req->hash_node);
-	__io_req_complete_post(req, req->result, 0);
+	__io_req_complete_post(req, req->cqe.res, 0);
 	io_commit_cqring(ctx);
 	spin_unlock(&ctx->completion_lock);
 	io_cqring_ev_posted(ctx);
@@ -5937,20 +5934,20 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
 
 static void __io_poll_execute(struct io_kiocb *req, int mask, int events)
 {
-	req->result = mask;
+	req->cqe.res = mask;
 	/*
 	 * This is useful for poll that is armed on behalf of another
 	 * request, and where the wakeup path could be on a different
 	 * CPU. We want to avoid pulling in req->apoll->events for that
 	 * case.
 	 */
-	req->cflags = events;
+	req->cqe.flags = events;
 	if (req->opcode == IORING_OP_POLL_ADD)
 		req->io_task_work.func = io_poll_task_func;
 	else
 		req->io_task_work.func = io_apoll_task_func;
 
-	trace_io_uring_task_add(req->ctx, req, req->user_data, req->opcode, mask);
+	trace_io_uring_task_add(req->ctx, req, req->cqe.user_data, req->opcode, mask);
 	io_req_task_work_add(req, false);
 }
 
@@ -6200,7 +6197,7 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
 	if (ret || ipt.error)
 		return ret ? IO_APOLL_READY : IO_APOLL_ABORTED;
 
-	trace_io_uring_poll_arm(ctx, req, req->user_data, req->opcode,
+	trace_io_uring_poll_arm(ctx, req, req->cqe.user_data, req->opcode,
 				mask, apoll->poll.events);
 	return IO_APOLL_OK;
 }
@@ -6242,7 +6239,7 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr,
 
 	list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
 	hlist_for_each_entry(req, list, hash_node) {
-		if (sqe_addr != req->user_data)
+		if (sqe_addr != req->cqe.user_data)
 			continue;
 		if (poll_only && req->opcode != IORING_OP_POLL_ADD)
 			continue;
@@ -6336,7 +6333,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
 		return -EINVAL;
 
 	io_req_set_refcount(req);
-	req->cflags = poll->events = io_poll_parse_events(sqe, flags);
+	req->cqe.flags = poll->events = io_poll_parse_events(sqe, flags);
 	return 0;
 }
 
@@ -6379,7 +6376,7 @@ static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
 			preq->poll.events |= IO_POLL_UNMASK;
 		}
 		if (req->poll_update.update_user_data)
-			preq->user_data = req->poll_update.new_user_data;
+			preq->cqe.user_data = req->poll_update.new_user_data;
 
 		ret2 = io_poll_add(preq, issue_flags);
 		/* successfully updated, don't complete poll request */
@@ -6388,7 +6385,7 @@ static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
 	}
 
 	req_set_fail(preq);
-	preq->result = -ECANCELED;
+	preq->cqe.res = -ECANCELED;
 	locked = !(issue_flags & IO_URING_F_UNLOCKED);
 	io_req_task_complete(preq, &locked);
 out:
@@ -6416,7 +6413,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
 	if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS))
 		req_set_fail(req);
 
-	req->result = -ETIME;
+	req->cqe.res = -ETIME;
 	req->io_task_work.func = io_req_task_complete;
 	io_req_task_work_add(req, false);
 	return HRTIMER_NORESTART;
@@ -6431,7 +6428,7 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
 	bool found = false;
 
 	list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
-		found = user_data == req->user_data;
+		found = user_data == req->cqe.user_data;
 		if (found)
 			break;
 	}
@@ -6482,7 +6479,7 @@ static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
 	bool found = false;
 
 	list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) {
-		found = user_data == req->user_data;
+		found = user_data == req->cqe.user_data;
 		if (found)
 			break;
 	}
@@ -6707,7 +6704,7 @@ static bool io_cancel_cb(struct io_wq_work *work, void *data)
 	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 	struct io_cancel_data *cd = data;
 
-	return req->ctx == cd->ctx && req->user_data == cd->user_data;
+	return req->ctx == cd->ctx && req->cqe.user_data == cd->user_data;
 }
 
 static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data,
@@ -7007,7 +7004,7 @@ static __cold void io_drain_req(struct io_kiocb *req)
 		goto queue;
 	}
 
-	trace_io_uring_defer(ctx, req, req->user_data, req->opcode);
+	trace_io_uring_defer(ctx, req, req->cqe.user_data, req->opcode);
 	de->req = req;
 	de->seq = seq;
 	list_add_tail(&de->list, &ctx->defer_list);
@@ -7098,7 +7095,7 @@ static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags)
 		return true;
 
 	req_set_fail(req);
-	req->result = -EBADF;
+	req->cqe.res = -EBADF;
 	return false;
 }
 
@@ -7384,7 +7381,7 @@ static struct file *io_file_get_normal(struct io_kiocb *req, int fd)
 {
 	struct file *file = fget(fd);
 
-	trace_io_uring_file_get(req->ctx, req, req->user_data, fd);
+	trace_io_uring_file_get(req->ctx, req, req->cqe.user_data, fd);
 
 	/* we don't allow fixed io_uring files */
 	if (file && file->f_op == &io_uring_fops)
@@ -7399,7 +7396,7 @@ static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
 
 	if (prev) {
 		if (!(req->task->flags & PF_EXITING))
-			ret = io_try_cancel_userdata(req, prev->user_data);
+			ret = io_try_cancel_userdata(req, prev->cqe.user_data);
 		io_req_complete_post(req, ret ?: -ETIME, 0);
 		io_put_req(prev);
 	} else {
@@ -7590,7 +7587,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	req->opcode = opcode = READ_ONCE(sqe->opcode);
 	/* same numerical values with corresponding REQ_F_*, safe to copy */
 	req->flags = sqe_flags = READ_ONCE(sqe->flags);
-	req->user_data = READ_ONCE(sqe->user_data);
+	req->cqe.user_data = READ_ONCE(sqe->user_data);
 	req->file = NULL;
 	req->fixed_rsrc_refs = NULL;
 	req->task = current;
@@ -7680,7 +7677,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 			 * we can judge a link req is failed or cancelled by if
 			 * REQ_F_FAIL is set, but the head is an exception since
 			 * it may be set REQ_F_FAIL because of other req's failure
-			 * so let's leverage req->result to distinguish if a head
+			 * so let's leverage req->cqe.res to distinguish if a head
 			 * is set REQ_F_FAIL because of its failure or other req's
 			 * failure so that we can set the correct ret code for it.
 			 * init result here to avoid affecting the normal path.
@@ -7699,7 +7696,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	}
 
 	/* don't need @sqe from now on */
-	trace_io_uring_submit_sqe(ctx, req, req->user_data, req->opcode,
+	trace_io_uring_submit_sqe(ctx, req, req->cqe.user_data, req->opcode,
 				  req->flags, true,
 				  ctx->flags & IORING_SETUP_SQPOLL);
 
-- 
2.35.1


* [PATCH 2/9] io_uring: memcpy CQE from req
  2022-04-12 14:09 [PATCH next 0/9] for-next clean ups and micro optimisation Pavel Begunkov
  2022-04-12 14:09 ` [PATCH 1/9] io_uring: explicitly keep a CQE in io_kiocb Pavel Begunkov
@ 2022-04-12 14:09 ` Pavel Begunkov
  2022-04-12 14:09 ` [PATCH 3/9] io_uring: shrink final link flush Pavel Begunkov
                   ` (8 subsequent siblings)
  10 siblings, 0 replies; 16+ messages in thread
From: Pavel Begunkov @ 2022-04-12 14:09 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe, asml.silence

We can do CQE filling a bit more efficiently when req->cqe is fully
filled: memcpy() it into the userspace-visible ring instead of copying
it field by field. It's easier on register spilling, removes a couple
of extra loads/stores, and combines the two u32 memory writes (res and
flags) into a single wider store.
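
As a minimal standalone illustration of the idea (a hypothetical
snippet, not the kernel code): with the request-side copy laid out
exactly like the ring CQE, the whole 16-byte entry can be copied with a
single memcpy(), which compilers can typically lower to two 8-byte
stores:

#include <string.h>

struct cqe {
        unsigned long long      user_data;
        int                     res;
        unsigned int            flags;
};

/* field by field: each value has to be kept live separately */
void fill_fields(struct cqe *dst, const struct cqe *src)
{
        dst->user_data = src->user_data;
        dst->res = src->res;
        dst->flags = src->flags;
}

/* one 16-byte copy: often just two 8-byte moves, res and flags merged */
void fill_memcpy(struct cqe *dst, const struct cqe *src)
{
        memcpy(dst, src, sizeof(*dst));
}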

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ce5d7ebc34aa..66dbd25bd3ae 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2062,6 +2062,28 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
 	return io_cqring_event_overflow(ctx, user_data, res, cflags);
 }
 
+static inline bool __io_fill_cqe_req_filled(struct io_ring_ctx *ctx,
+					    struct io_kiocb *req)
+{
+	struct io_uring_cqe *cqe;
+
+	trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
+				req->cqe.res, req->cqe.flags);
+
+	/*
+	 * If we can't get a cq entry, userspace overflowed the
+	 * submission (by quite a lot). Increment the overflow count in
+	 * the ring.
+	 */
+	cqe = io_get_cqe(ctx);
+	if (likely(cqe)) {
+		memcpy(cqe, &req->cqe, sizeof(*cqe));
+		return true;
+	}
+	return io_cqring_event_overflow(ctx, req->cqe.user_data,
+					req->cqe.res, req->cqe.flags);
+}
+
 static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
 {
 	trace_io_uring_complete(req->ctx, req, req->cqe.user_data, res, cflags);
@@ -2703,7 +2725,7 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
 						    comp_list);
 
 			if (!(req->flags & REQ_F_CQE_SKIP))
-				__io_fill_cqe_req(req, req->cqe.res, req->cqe.flags);
+				__io_fill_cqe_req_filled(ctx, req);
 		}
 
 		io_commit_cqring(ctx);
-- 
2.35.1


* [PATCH 3/9] io_uring: shrink final link flush
  2022-04-12 14:09 [PATCH next 0/9] for-next clean ups and micro optimisation Pavel Begunkov
  2022-04-12 14:09 ` [PATCH 1/9] io_uring: explicitly keep a CQE in io_kiocb Pavel Begunkov
  2022-04-12 14:09 ` [PATCH 2/9] io_uring: memcpy CQE from req Pavel Begunkov
@ 2022-04-12 14:09 ` Pavel Begunkov
  2022-04-12 14:09 ` [PATCH 4/9] io_uring: inline io_flush_cached_reqs Pavel Begunkov
                   ` (7 subsequent siblings)
  10 siblings, 0 replies; 16+ messages in thread
From: Pavel Begunkov @ 2022-04-12 14:09 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe, asml.silence

Well-behaved users should not set IOSQE_IO_*LINK flags on the last
request of a link. io_uring still flushes any collected link at the end
of submission, but it's not the optimal way and so we don't care too
much about optimising it. Replace the io_queue_sqe() call with
io_queue_sqe_fallback(), as the former is inlined and would generate a
bunch of extra code. This will also help the compiler with inlining of
the submission path.

> size ./fs/io_uring.o
   text    data     bss     dec     hex filename
  87265   13734       8  101007   18a8f ./fs/io_uring.o
> size ./fs/io_uring.o
   text    data     bss     dec     hex filename
  87073   13734       8  100815   189cf ./fs/io_uring.o

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 66dbd25bd3ae..d996d7f82d5d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7766,8 +7766,8 @@ static void io_submit_state_end(struct io_ring_ctx *ctx)
 {
 	struct io_submit_state *state = &ctx->submit_state;
 
-	if (state->link.head)
-		io_queue_sqe(state->link.head);
+	if (unlikely(state->link.head))
+		io_queue_sqe_fallback(state->link.head);
 	/* flush only after queuing links as they can generate completions */
 	io_submit_flush_completions(ctx);
 	if (state->plug_started)
-- 
2.35.1


* [PATCH 4/9] io_uring: inline io_flush_cached_reqs
  2022-04-12 14:09 [PATCH next 0/9] for-next clean ups and micro optimisation Pavel Begunkov
                   ` (2 preceding siblings ...)
  2022-04-12 14:09 ` [PATCH 3/9] io_uring: shrink final link flush Pavel Begunkov
@ 2022-04-12 14:09 ` Pavel Begunkov
  2022-04-12 14:09 ` [PATCH 5/9] io_uring: helper for empty req cache checks Pavel Begunkov
                   ` (6 subsequent siblings)
  10 siblings, 0 replies; 16+ messages in thread
From: Pavel Begunkov @ 2022-04-12 14:09 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe, asml.silence

io_flush_cached_reqs() isn't descriptive and has only one caller;
inline it into __io_alloc_req_refill().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 30 +++++++++++-------------------
 1 file changed, 11 insertions(+), 19 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index d996d7f82d5d..73422af2dd79 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2211,21 +2211,6 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
 	spin_unlock(&ctx->completion_lock);
 }
 
-/* Returns true IFF there are requests in the cache */
-static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
-{
-	struct io_submit_state *state = &ctx->submit_state;
-
-	/*
-	 * If we have more than a batch's worth of requests in our IRQ side
-	 * locked cache, grab the lock and move them over to our submission
-	 * side cache.
-	 */
-	if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH)
-		io_flush_cached_locked_reqs(ctx, state);
-	return !!state->free_list.next;
-}
-
 /*
  * A request might get retired back into the request caches even before opcode
  * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
@@ -2238,11 +2223,18 @@ static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
 	struct io_submit_state *state = &ctx->submit_state;
 	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
 	void *reqs[IO_REQ_ALLOC_BATCH];
-	struct io_kiocb *req;
 	int ret, i;
 
-	if (likely(state->free_list.next || io_flush_cached_reqs(ctx)))
-		return true;
+	/*
+	 * If we have more than a batch's worth of requests in our IRQ side
+	 * locked cache, grab the lock and move them over to our submission
+	 * side cache.
+	 */
+	if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH) {
+		io_flush_cached_locked_reqs(ctx, &ctx->submit_state);
+		if (state->free_list.next)
+			return true;
+	}
 
 	ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);
 
@@ -2259,7 +2251,7 @@ static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
 
 	percpu_ref_get_many(&ctx->refs, ret);
 	for (i = 0; i < ret; i++) {
-		req = reqs[i];
+		struct io_kiocb *req = reqs[i];
 
 		io_preinit_req(req, ctx);
 		wq_stack_add_head(&req->comp_list, &state->free_list);
-- 
2.35.1


* [PATCH 5/9] io_uring: helper for empty req cache checks
  2022-04-12 14:09 [PATCH next 0/9] for-next clean ups and micro optimisation Pavel Begunkov
                   ` (3 preceding siblings ...)
  2022-04-12 14:09 ` [PATCH 4/9] io_uring: inline io_flush_cached_reqs Pavel Begunkov
@ 2022-04-12 14:09 ` Pavel Begunkov
  2022-04-12 14:09 ` [PATCH 6/9] io_uring: add helper to return req to cache list Pavel Begunkov
                   ` (5 subsequent siblings)
  10 siblings, 0 replies; 16+ messages in thread
From: Pavel Begunkov @ 2022-04-12 14:09 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe, asml.silence

Add io_req_cache_empty(), which checks whether there are any requests
in the inline req cache. It'll be needed in the future, but it also
nicely cleans up a few spots that poke into ->free_list directly.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 73422af2dd79..3ccc13acb498 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2211,6 +2211,11 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
 	spin_unlock(&ctx->completion_lock);
 }
 
+static inline bool io_req_cache_empty(struct io_ring_ctx *ctx)
+{
+	return !ctx->submit_state.free_list.next;
+}
+
 /*
  * A request might get retired back into the request caches even before opcode
  * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
@@ -2232,7 +2237,7 @@ static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
 	 */
 	if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH) {
 		io_flush_cached_locked_reqs(ctx, &ctx->submit_state);
-		if (state->free_list.next)
+		if (!io_req_cache_empty(ctx))
 			return true;
 	}
 
@@ -2261,7 +2266,7 @@ static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
 
 static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
 {
-	if (unlikely(!ctx->submit_state.free_list.next))
+	if (unlikely(io_req_cache_empty(ctx)))
 		return __io_alloc_req_refill(ctx);
 	return true;
 }
@@ -9790,7 +9795,7 @@ static void io_req_caches_free(struct io_ring_ctx *ctx)
 	mutex_lock(&ctx->uring_lock);
 	io_flush_cached_locked_reqs(ctx, state);
 
-	while (state->free_list.next) {
+	while (!io_req_cache_empty(ctx)) {
 		struct io_wq_work_node *node;
 		struct io_kiocb *req;
 
-- 
2.35.1


* [PATCH 6/9] io_uring: add helper to return req to cache list
  2022-04-12 14:09 [PATCH next 0/9] for-next clean ups and micro optimisation Pavel Begunkov
                   ` (4 preceding siblings ...)
  2022-04-12 14:09 ` [PATCH 5/9] io_uring: helper for empty req cache checks Pavel Begunkov
@ 2022-04-12 14:09 ` Pavel Begunkov
  2022-04-12 14:09 ` [PATCH 7/9] io_uring: optimise submission loop invariant Pavel Begunkov
                   ` (4 subsequent siblings)
  10 siblings, 0 replies; 16+ messages in thread
From: Pavel Begunkov @ 2022-04-12 14:09 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe, asml.silence

Don't hand-code wq_stack_add_head() onto ->free_list, which is used
for recycling io_kiocb; add a helper that does it for us.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 3ccc13acb498..a751ca167d21 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1493,6 +1493,11 @@ static inline void req_fail_link_node(struct io_kiocb *req, int res)
 	req->cqe.res = res;
 }
 
+static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx)
+{
+	wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
+}
+
 static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
 {
 	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
@@ -2225,7 +2230,6 @@ static inline bool io_req_cache_empty(struct io_ring_ctx *ctx)
 static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
 	__must_hold(&ctx->uring_lock)
 {
-	struct io_submit_state *state = &ctx->submit_state;
 	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
 	void *reqs[IO_REQ_ALLOC_BATCH];
 	int ret, i;
@@ -2259,7 +2263,7 @@ static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
 		struct io_kiocb *req = reqs[i];
 
 		io_preinit_req(req, ctx);
-		wq_stack_add_head(&req->comp_list, &state->free_list);
+		io_req_add_to_cache(req, ctx);
 	}
 	return true;
 }
@@ -2702,7 +2706,7 @@ static void io_free_batch_list(struct io_ring_ctx *ctx,
 		}
 		task_refs++;
 		node = req->comp_list.next;
-		wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
+		io_req_add_to_cache(req, ctx);
 	} while (node);
 
 	if (task)
@@ -7853,7 +7857,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 		req = io_alloc_req(ctx);
 		sqe = io_get_sqe(ctx);
 		if (unlikely(!sqe)) {
-			wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
+			io_req_add_to_cache(req, ctx);
 			break;
 		}
 		/* will complete beyond this point, count as submitted */
-- 
2.35.1


* [PATCH 7/9] io_uring: optimise submission loop invariant
  2022-04-12 14:09 [PATCH next 0/9] for-next clean ups and micro optimisation Pavel Begunkov
                   ` (5 preceding siblings ...)
  2022-04-12 14:09 ` [PATCH 6/9] io_uring: add helper to return req to cache list Pavel Begunkov
@ 2022-04-12 14:09 ` Pavel Begunkov
  2022-04-12 14:09 ` [PATCH 8/9] io_uring: optimise submission left counting Pavel Begunkov
                   ` (3 subsequent siblings)
  10 siblings, 0 replies; 16+ messages in thread
From: Pavel Begunkov @ 2022-04-12 14:09 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe, asml.silence

Instead of keeping @submitted in io_submit_sqes(), which on each
iteration requires a comparison against the initial number of SQEs,
store the number of SQEs left to submit. We only need nr once we're
done with SQE handling.

Note: if we can't allocate a req for the first SQE, we have always
returned -EAGAIN to userspace; preserve this behaviour by checking the
request cache in the slow path.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 32 +++++++++++++++-----------------
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index a751ca167d21..20eb73d9ae42 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7836,24 +7836,22 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 	__must_hold(&ctx->uring_lock)
 {
 	unsigned int entries = io_sqring_entries(ctx);
-	int submitted = 0;
+	unsigned int left;
+	int ret;
 
 	if (unlikely(!entries))
 		return 0;
 	/* make sure SQ entry isn't read before tail */
-	nr = min3(nr, ctx->sq_entries, entries);
-	io_get_task_refs(nr);
+	ret = left = min3(nr, ctx->sq_entries, entries);
+	io_get_task_refs(left);
+	io_submit_state_start(&ctx->submit_state, left);
 
-	io_submit_state_start(&ctx->submit_state, nr);
 	do {
 		const struct io_uring_sqe *sqe;
 		struct io_kiocb *req;
 
-		if (unlikely(!io_alloc_req_refill(ctx))) {
-			if (!submitted)
-				submitted = -EAGAIN;
+		if (unlikely(!io_alloc_req_refill(ctx)))
 			break;
-		}
 		req = io_alloc_req(ctx);
 		sqe = io_get_sqe(ctx);
 		if (unlikely(!sqe)) {
@@ -7861,7 +7859,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 			break;
 		}
 		/* will complete beyond this point, count as submitted */
-		submitted++;
+		left--;
 		if (io_submit_sqe(ctx, req, sqe)) {
 			/*
 			 * Continue submitting even for sqe failure if the
@@ -7870,20 +7868,20 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 			if (!(ctx->flags & IORING_SETUP_SUBMIT_ALL))
 				break;
 		}
-	} while (submitted < nr);
+	} while (left);
 
-	if (unlikely(submitted != nr)) {
-		int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
-		int unused = nr - ref_used;
-
-		current->io_uring->cached_refs += unused;
+	if (unlikely(left)) {
+		ret -= left;
+		/* try again if it submitted nothing and can't allocate a req */
+		if (!ret && io_req_cache_empty(ctx))
+			ret = -EAGAIN;
+		current->io_uring->cached_refs += left;
 	}
 
 	io_submit_state_end(ctx);
 	 /* Commit SQ ring head once we've consumed and submitted all SQEs */
 	io_commit_sqring(ctx);
-
-	return submitted;
+	return ret;
 }
 
 static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
-- 
2.35.1


* [PATCH 8/9] io_uring: optimise submission left counting
  2022-04-12 14:09 [PATCH next 0/9] for-next clean ups and micro optimisation Pavel Begunkov
                   ` (6 preceding siblings ...)
  2022-04-12 14:09 ` [PATCH 7/9] io_uring: optimise submission loop invariant Pavel Begunkov
@ 2022-04-12 14:09 ` Pavel Begunkov
  2022-04-12 14:09 ` [PATCH 9/9] io_uring: optimise io_get_cqe() Pavel Begunkov
                   ` (2 subsequent siblings)
  10 siblings, 0 replies; 16+ messages in thread
From: Pavel Begunkov @ 2022-04-12 14:09 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe, asml.silence

With all the inlining, io_submit_sqe() is huge and usually ends up
calling other functions.

We decrement @left in io_submit_sqes() just before calling
io_submit_sqe() and use it again after the call. Considering how huge
io_submit_sqe() is, there is not much hope that compilers will keep
@left in a register across the call.

Decrement it after the call instead: not only is that easier on
register spilling and probably saves a stack write/read, but, at least
on x86-64, it lets the compiler use the CPU flags set by the dec
instead of doing a separate load/store and test.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 20eb73d9ae42..b349a3c52354 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7858,17 +7858,17 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 			io_req_add_to_cache(req, ctx);
 			break;
 		}
-		/* will complete beyond this point, count as submitted */
-		left--;
-		if (io_submit_sqe(ctx, req, sqe)) {
-			/*
-			 * Continue submitting even for sqe failure if the
-			 * ring was setup with IORING_SETUP_SUBMIT_ALL
-			 */
-			if (!(ctx->flags & IORING_SETUP_SUBMIT_ALL))
-				break;
+
+		/*
+		 * Continue submitting even for sqe failure if the
+		 * ring was setup with IORING_SETUP_SUBMIT_ALL
+		 */
+		if (unlikely(io_submit_sqe(ctx, req, sqe)) &&
+		    !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) {
+			left--;
+			break;
 		}
-	} while (left);
+	} while (--left);
 
 	if (unlikely(left)) {
 		ret -= left;
-- 
2.35.1


* [PATCH 9/9] io_uring: optimise io_get_cqe()
  2022-04-12 14:09 [PATCH next 0/9] for-next clean ups and micro optimisation Pavel Begunkov
                   ` (7 preceding siblings ...)
  2022-04-12 14:09 ` [PATCH 8/9] io_uring: optimise submission left counting Pavel Begunkov
@ 2022-04-12 14:09 ` Pavel Begunkov
  2022-04-12 16:06   ` Florian Schmaus
  2022-04-12 15:05 ` [PATCH next 0/9] for-next clean ups and micro optimisation Jens Axboe
  2022-04-12 16:47 ` Jens Axboe
  10 siblings, 1 reply; 16+ messages in thread
From: Pavel Begunkov @ 2022-04-12 14:09 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe, asml.silence

io_get_cqe() is expensive because of a bunch of loads, masking, etc.
However, most of the time we should have enough entries in the CQ, so
we can cache two pointers representing a range of contiguous CQE
memory we can use. When the range is exhausted we'll go through a
slower path to set up a new range. When there are no CQEs available,
the pointers will naturally point to the same address.
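
A worked example with made-up numbers: with cq_entries = 8,
cached_cq_tail = 6 and two entries still unread by userspace, there
are 6 free CQEs but only cq_entries - off = 2 of them are contiguous,
so the slow path caches the range cqes[6..7]; once those two are
handed out, the next io_get_cqe() falls back to the slow path again
and starts a new range at cqes[0].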

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 46 +++++++++++++++++++++++++++++++++++-----------
 1 file changed, 35 insertions(+), 11 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index b349a3c52354..f2269ffe09eb 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -416,6 +416,13 @@ struct io_ring_ctx {
 	unsigned long		check_cq_overflow;
 
 	struct {
+		/*
+		 * We cache a range of free CQEs we can use, once exhausted it
+		 * should go through a slower range setup, see __io_get_cqe()
+		 */
+		struct io_uring_cqe	*cqe_cached;
+		struct io_uring_cqe	*cqe_santinel;
+
 		unsigned		cached_cq_tail;
 		unsigned		cq_entries;
 		struct io_ev_fd	__rcu	*io_ev_fd;
@@ -1831,21 +1838,38 @@ static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
 	return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
 }
 
-static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
+/*
+ * writes to the cq entry need to come after reading head; the
+ * control dependency is enough as we're using WRITE_ONCE to
+ * fill the cq entry
+ */
+static noinline struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx)
 {
 	struct io_rings *rings = ctx->rings;
-	unsigned tail, mask = ctx->cq_entries - 1;
-
-	/*
-	 * writes to the cq entry need to come after reading head; the
-	 * control dependency is enough as we're using WRITE_ONCE to
-	 * fill the cq entry
-	 */
-	if (__io_cqring_events(ctx) == ctx->cq_entries)
+	unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
+	unsigned int free, queued, len;
+
+	/* userspace may cheat modifying the tail, be safe and do min */
+	queued = min(__io_cqring_events(ctx), ctx->cq_entries);
+	free = ctx->cq_entries - queued;
+	/* we need a contiguous range, limit based on the current array offset */
+	len = min(free, ctx->cq_entries - off);
+	if (!len)
 		return NULL;
 
-	tail = ctx->cached_cq_tail++;
-	return &rings->cqes[tail & mask];
+	ctx->cached_cq_tail++;
+	ctx->cqe_cached = &rings->cqes[off];
+	ctx->cqe_santinel = ctx->cqe_cached + len;
+	return ctx->cqe_cached++;
+}
+
+static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
+{
+	if (likely(ctx->cqe_cached < ctx->cqe_santinel)) {
+		ctx->cached_cq_tail++;
+		return ctx->cqe_cached++;
+	}
+	return __io_get_cqe(ctx);
 }
 
 static void io_eventfd_signal(struct io_ring_ctx *ctx)
-- 
2.35.1


* Re: [PATCH next 0/9] for-next clean ups and micro optimisation
  2022-04-12 14:09 [PATCH next 0/9] for-next clean ups and micro optimisation Pavel Begunkov
                   ` (8 preceding siblings ...)
  2022-04-12 14:09 ` [PATCH 9/9] io_uring: optimise io_get_cqe() Pavel Begunkov
@ 2022-04-12 15:05 ` Jens Axboe
  2022-04-12 15:12   ` Jens Axboe
  2022-04-12 16:47 ` Jens Axboe
  10 siblings, 1 reply; 16+ messages in thread
From: Jens Axboe @ 2022-04-12 15:05 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring

On 4/12/22 8:09 AM, Pavel Begunkov wrote:
> nops benchmark: 40.3 -> 41.1 MIOPS, or +2%
> 
> Pavel Begunkov (9):
>   io_uring: explicitly keep a CQE in io_kiocb
>   io_uring: memcpy CQE from req
>   io_uring: shrink final link flush
>   io_uring: inline io_flush_cached_reqs
>   io_uring: helper for empty req cache checks
>   io_uring: add helper to return req to cache list
>   io_uring: optimise submission loop invariant
>   io_uring: optimise submission left counting
>   io_uring: optimise io_get_cqe()
> 
>  fs/io_uring.c | 288 +++++++++++++++++++++++++++++---------------------
>  1 file changed, 165 insertions(+), 123 deletions(-)

Get about 4% on aarch64. I like both main changes, memcpy of cqe and
the improvements to io_get_cqe().

-- 
Jens Axboe


* Re: [PATCH next 0/9] for-next clean ups and micro optimisation
  2022-04-12 15:05 ` [PATCH next 0/9] for-next clean ups and micro optimisation Jens Axboe
@ 2022-04-12 15:12   ` Jens Axboe
  0 siblings, 0 replies; 16+ messages in thread
From: Jens Axboe @ 2022-04-12 15:12 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring

On 4/12/22 9:05 AM, Jens Axboe wrote:
> On 4/12/22 8:09 AM, Pavel Begunkov wrote:
>> nops benchmark: 40.3 -> 41.1 MIOPS, or +2%
>>
>> Pavel Begunkov (9):
>>   io_uring: explicitly keep a CQE in io_kiocb
>>   io_uring: memcpy CQE from req
>>   io_uring: shrink final link flush
>>   io_uring: inline io_flush_cached_reqs
>>   io_uring: helper for empty req cache checks
>>   io_uring: add helper to return req to cache list
>>   io_uring: optimise submission loop invariant
>>   io_uring: optimise submission left counting
>>   io_uring: optimise io_get_cqe()
>>
>>  fs/io_uring.c | 288 +++++++++++++++++++++++++++++---------------------
>>  1 file changed, 165 insertions(+), 123 deletions(-)
> 
> Get about ~4% on aarch64. I like both main changes, memcpy of cqe and
> the improvements to io_get_cqe().

Ran the nop tests on the 12900K, and I see about an 8% improvement
there, going from ~88M to 95M. I didn't split and check which part
made the most improvement.

-- 
Jens Axboe


* Re: [PATCH 9/9] io_uring: optimise io_get_cqe()
  2022-04-12 14:09 ` [PATCH 9/9] io_uring: optimise io_get_cqe() Pavel Begunkov
@ 2022-04-12 16:06   ` Florian Schmaus
  2022-04-12 16:15     ` Pavel Begunkov
  0 siblings, 1 reply; 16+ messages in thread
From: Florian Schmaus @ 2022-04-12 16:06 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring; +Cc: Jens Axboe


On 12/04/2022 16.09, Pavel Begunkov wrote:
> io_get_cqe() is expensive because of a bunch of loads, masking, etc.
> However, most of the time we should have enough of entries in the CQ,
> so we can cache two pointers representing a range of contiguous CQE
> memory we can use. When the range is exhausted we'll go through a slower
> path to set up a new range. When there are no CQEs avaliable, pointers
> will naturally point to the same address.
> 
> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
> ---
>   fs/io_uring.c | 46 +++++++++++++++++++++++++++++++++++-----------
>   1 file changed, 35 insertions(+), 11 deletions(-)
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index b349a3c52354..f2269ffe09eb 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -416,6 +416,13 @@ struct io_ring_ctx {
>   	unsigned long		check_cq_overflow;
>   
>   	struct {
> +		/*
> +		 * We cache a range of free CQEs we can use, once exhausted it
> +		 * should go through a slower range setup, see __io_get_cqe()
> +		 */
> +		struct io_uring_cqe	*cqe_cached;
> +		struct io_uring_cqe	*cqe_santinel;

I think this should s/santinel/sentinel.

- Flow


* Re: [PATCH 9/9] io_uring: optimise io_get_cqe()
  2022-04-12 16:06   ` Florian Schmaus
@ 2022-04-12 16:15     ` Pavel Begunkov
  2022-04-12 16:25       ` Jens Axboe
  0 siblings, 1 reply; 16+ messages in thread
From: Pavel Begunkov @ 2022-04-12 16:15 UTC (permalink / raw)
  To: Florian Schmaus, io-uring; +Cc: Jens Axboe

On 4/12/22 17:06, Florian Schmaus wrote:
> On 12/04/2022 16.09, Pavel Begunkov wrote:
>> io_get_cqe() is expensive because of a bunch of loads, masking, etc.
>> However, most of the time we should have enough of entries in the CQ,
>> so we can cache two pointers representing a range of contiguous CQE
>> memory we can use. When the range is exhausted we'll go through a slower
>> path to set up a new range. When there are no CQEs avaliable, pointers
>> will naturally point to the same address.
>>
>> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
>> ---
>>   fs/io_uring.c | 46 +++++++++++++++++++++++++++++++++++-----------
>>   1 file changed, 35 insertions(+), 11 deletions(-)
>>
>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>> index b349a3c52354..f2269ffe09eb 100644
>> --- a/fs/io_uring.c
>> +++ b/fs/io_uring.c
>> @@ -416,6 +416,13 @@ struct io_ring_ctx {
>>       unsigned long        check_cq_overflow;
>>       struct {
>> +        /*
>> +         * We cache a range of free CQEs we can use, once exhausted it
>> +         * should go through a slower range setup, see __io_get_cqe()
>> +         */
>> +        struct io_uring_cqe    *cqe_cached;
>> +        struct io_uring_cqe    *cqe_santinel;
> 
> I think this should s/santinel/sentinel.

Indeed, thanks

-- 
Pavel Begunkov


* Re: [PATCH 9/9] io_uring: optimise io_get_cqe()
  2022-04-12 16:15     ` Pavel Begunkov
@ 2022-04-12 16:25       ` Jens Axboe
  0 siblings, 0 replies; 16+ messages in thread
From: Jens Axboe @ 2022-04-12 16:25 UTC (permalink / raw)
  To: Pavel Begunkov, Florian Schmaus, io-uring

On 4/12/22 10:15 AM, Pavel Begunkov wrote:
> On 4/12/22 17:06, Florian Schmaus wrote:
>> On 12/04/2022 16.09, Pavel Begunkov wrote:
>>> io_get_cqe() is expensive because of a bunch of loads, masking, etc.
>>> However, most of the time we should have enough of entries in the CQ,
>>> so we can cache two pointers representing a range of contiguous CQE
>>> memory we can use. When the range is exhausted we'll go through a slower
>>> path to set up a new range. When there are no CQEs avaliable, pointers
>>> will naturally point to the same address.
>>>
>>> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
>>> ---
>>>   fs/io_uring.c | 46 +++++++++++++++++++++++++++++++++++-----------
>>>   1 file changed, 35 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>> index b349a3c52354..f2269ffe09eb 100644
>>> --- a/fs/io_uring.c
>>> +++ b/fs/io_uring.c
>>> @@ -416,6 +416,13 @@ struct io_ring_ctx {
>>>       unsigned long        check_cq_overflow;
>>>       struct {
>>> +        /*
>>> +         * We cache a range of free CQEs we can use, once exhausted it
>>> +         * should go through a slower range setup, see __io_get_cqe()
>>> +         */
>>> +        struct io_uring_cqe    *cqe_cached;
>>> +        struct io_uring_cqe    *cqe_santinel;
>>
>> I think this should s/santinel/sentinel.

I fixed it up.

-- 
Jens Axboe


* Re: [PATCH next 0/9] for-next clean ups and micro optimisation
  2022-04-12 14:09 [PATCH next 0/9] for-next clean ups and micro optimisation Pavel Begunkov
                   ` (9 preceding siblings ...)
  2022-04-12 15:05 ` [PATCH next 0/9] for-next clean ups and micro optimisation Jens Axboe
@ 2022-04-12 16:47 ` Jens Axboe
  10 siblings, 0 replies; 16+ messages in thread
From: Jens Axboe @ 2022-04-12 16:47 UTC (permalink / raw)
  To: asml.silence, io-uring

On Tue, 12 Apr 2022 15:09:42 +0100, Pavel Begunkov wrote:
> nops benchmark: 40.3 -> 41.1 MIOPS, or +2%
> 
> Pavel Begunkov (9):
>   io_uring: explicitly keep a CQE in io_kiocb
>   io_uring: memcpy CQE from req
>   io_uring: shrink final link flush
>   io_uring: inline io_flush_cached_reqs
>   io_uring: helper for empty req cache checks
>   io_uring: add helper to return req to cache list
>   io_uring: optimise submission loop invariant
>   io_uring: optimise submission left counting
>   io_uring: optimise io_get_cqe()
> 
> [...]

Applied, thanks!

[1/9] io_uring: explicitly keep a CQE in io_kiocb
      (no commit info)
[2/9] io_uring: memcpy CQE from req
      (no commit info)
[3/9] io_uring: shrink final link flush
      (no commit info)
[4/9] io_uring: inline io_flush_cached_reqs
      (no commit info)
[5/9] io_uring: helper for empty req cache checks
      (no commit info)
[6/9] io_uring: add helper to return req to cache list
      (no commit info)
[7/9] io_uring: optimise submission loop invariant
      (no commit info)
[8/9] io_uring: optimise submission left counting
      (no commit info)
[9/9] io_uring: optimise io_get_cqe()
      (no commit info)

Best regards,
-- 
Jens Axboe


