* [RFC 0/3] improvements for poll requests
@ 2021-09-22 12:34 Xiaoguang Wang
  2021-09-22 12:34 ` [RFC 1/3] io_uring: reduce frequent add_wait_queue() overhead for multi-shot poll request Xiaoguang Wang
                   ` (3 more replies)
  0 siblings, 4 replies; 11+ messages in thread
From: Xiaoguang Wang @ 2021-09-22 12:34 UTC (permalink / raw)
  To: io-uring; +Cc: axboe, asml.silence, Xiaoguang Wang

This patchset tries to improve the echo_server model based on io_uring's
IORING_POLL_ADD_MULTI feature.
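
For reference, the userspace side of that model looks roughly like the
sketch below. It is only an illustration of IORING_POLL_ADD_MULTI usage,
not the echo_server actually used for the numbers in this series; the
helper names are made up, and it assumes a liburing recent enough to
provide io_uring_prep_poll_multishot():

  #include <liburing.h>
  #include <poll.h>
  #include <sys/socket.h>

  /* Arm one multishot poll per connection; each POLLIN cqe triggers a
   * recv()/send() pair done directly in userspace. */
  static void arm_multishot_poll(struct io_uring *ring, int fd)
  {
          struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

          io_uring_prep_poll_multishot(sqe, fd, POLLIN);
          io_uring_sqe_set_data(sqe, (void *)(long)fd);
          io_uring_submit(ring);
  }

  static void handle_cqe(struct io_uring *ring, struct io_uring_cqe *cqe)
  {
          int fd = (int)(long)io_uring_cqe_get_data(cqe);
          char buf[16];
          ssize_t n;

          if (cqe->res > 0 && (cqe->res & POLLIN)) {
                  n = recv(fd, buf, sizeof(buf), 0);
                  if (n > 0)
                          send(fd, buf, n, 0);
          }
          /* A multishot poll keeps posting cqes while IORING_CQE_F_MORE
           * is set; once it is clear, the poll has terminated and must
           * be re-armed. */
          if (!(cqe->flags & IORING_CQE_F_MORE))
                  arm_multishot_poll(ring, fd);
  }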

Xiaoguang Wang (3):
  io_uring: reduce frequent add_wait_queue() overhead for multi-shot
    poll request
  io_uring: don't get completion_lock in io_poll_rewait()
  io_uring: try to batch poll request completion

 fs/io_uring.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 92 insertions(+), 24 deletions(-)

-- 
2.14.4.44.g2045bb6


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [RFC 1/3] io_uring: reduce frequent add_wait_queue() overhead for multi-shot poll request
  2021-09-22 12:34 [RFC 0/3] improvements for poll requests Xiaoguang Wang
@ 2021-09-22 12:34 ` Xiaoguang Wang
  2021-09-22 17:52   ` Hao Xu
  2021-09-22 12:34 ` [RFC 2/3] io_uring: don't get completion_lock in io_poll_rewait() Xiaoguang Wang
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 11+ messages in thread
From: Xiaoguang Wang @ 2021-09-22 12:34 UTC (permalink / raw)
  To: io-uring; +Cc: axboe, asml.silence, Xiaoguang Wang

When running echo_server to evaluate io_uring's multi-shot poll
performance, perf shows that add_wait_queue() has obvious overhead.
Introduce a new state 'active' in io_poll_iocb to indicate whether
io_poll_wake() should queue a task_work. This new state is set to true
initially, set to false when starting to queue a task work, and set back
to true once a poll cqe has been committed. One concern is that this
method may lose wake-up events, but it seems to be ok.

  io_poll_wake                io_poll_task_func
t1                       |
t2                       |    WRITE_ONCE(req->poll.active, true);
t3                       |
t4                       |    io_commit_cqring(ctx);
t5                       |
t6                       |

If a wake-up event happens before or at t4, it's ok: the user app will
always see a cqe. If a wake-up event happens after t4 then, IIUC,
io_poll_wake() will see the new req->poll.active value by using READ_ONCE().
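
Condensed from the hunks below, the handshake between the two sides looks
roughly like this (a sketch for readability, not a replacement for the
actual diff):

  /* wake side: only the first wake-up after (re)arming queues task_work */
  static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode,
                          int sync, void *key)
  {
          struct io_kiocb *req = wait->private;
          struct io_poll_iocb *poll = &req->poll;

          if (!READ_ONCE(poll->active))
                  return 0;       /* a task_work is already queued */

          WRITE_ONCE(poll->active, false);
          return __io_async_wake(req, poll, key_to_poll(key),
                                 io_poll_task_func);
  }

  /* completion side, multi-shot poll not done: re-arm, then commit */
          req->result = 0;
          WRITE_ONCE(req->poll.active, true);     /* t2 in the diagram */
          ...
          io_commit_cqring(ctx);                  /* t4 in the diagram */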

With this patch, a pure echo_server (1000 connections, 16-byte packets)
shows about a 1.6% improvement in request throughput.

Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
---
 fs/io_uring.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 1294b1ef4acb..ca4464a75c7b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -487,6 +487,7 @@ struct io_poll_iocb {
 	__poll_t			events;
 	bool				done;
 	bool				canceled;
+	bool				active;
 	struct wait_queue_entry		wait;
 };
 
@@ -5025,8 +5026,6 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
 
 	trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
 
-	list_del_init(&poll->wait.entry);
-
 	req->result = mask;
 	req->io_task_work.func = func;
 
@@ -5057,7 +5056,10 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
 
 	spin_lock(&ctx->completion_lock);
 	if (!req->result && !READ_ONCE(poll->canceled)) {
-		add_wait_queue(poll->head, &poll->wait);
+		if (req->opcode == IORING_OP_POLL_ADD)
+			WRITE_ONCE(req->poll.active, true);
+		else
+			add_wait_queue(poll->head, &poll->wait);
 		return true;
 	}
 
@@ -5133,6 +5135,9 @@ static inline bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
 	return done;
 }
 
+static bool __io_poll_remove_one(struct io_kiocb *req,
+				 struct io_poll_iocb *poll, bool do_cancel);
+
 static void io_poll_task_func(struct io_kiocb *req, bool *locked)
 {
 	struct io_ring_ctx *ctx = req->ctx;
@@ -5146,10 +5151,11 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
 		done = __io_poll_complete(req, req->result);
 		if (done) {
 			io_poll_remove_double(req);
+			__io_poll_remove_one(req, io_poll_get_single(req), true);
 			hash_del(&req->hash_node);
 		} else {
 			req->result = 0;
-			add_wait_queue(req->poll.head, &req->poll.wait);
+			WRITE_ONCE(req->poll.active, true);
 		}
 		io_commit_cqring(ctx);
 		spin_unlock(&ctx->completion_lock);
@@ -5204,6 +5210,7 @@ static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
 	poll->head = NULL;
 	poll->done = false;
 	poll->canceled = false;
+	poll->active = true;
 #define IO_POLL_UNMASK	(EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
 	/* mask in events that we always want/need */
 	poll->events = events | IO_POLL_UNMASK;
@@ -5301,6 +5308,7 @@ static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 	trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
 					key_to_poll(key));
 
+	list_del_init(&poll->wait.entry);
 	return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
 }
 
@@ -5569,6 +5577,10 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 	struct io_kiocb *req = wait->private;
 	struct io_poll_iocb *poll = &req->poll;
 
+	if (!READ_ONCE(poll->active))
+		return 0;
+
+	WRITE_ONCE(poll->active, false);
 	return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
 }
 
-- 
2.14.4.44.g2045bb6


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [RFC 2/3] io_uring: don't get completion_lock in io_poll_rewait()
  2021-09-22 12:34 [RFC 0/3] improvements for poll requests Xiaoguang Wang
  2021-09-22 12:34 ` [RFC 1/3] io_uring: reduce frequent add_wait_queue() overhead for multi-shot poll request Xiaoguang Wang
@ 2021-09-22 12:34 ` Xiaoguang Wang
  2021-09-22 12:34 ` [RFC 3/3] io_uring: try to batch poll request completion Xiaoguang Wang
  2021-09-22 13:00 ` [RFC 0/3] improvements for poll requests Jens Axboe
  3 siblings, 0 replies; 11+ messages in thread
From: Xiaoguang Wang @ 2021-09-22 12:34 UTC (permalink / raw)
  To: io-uring; +Cc: axboe, asml.silence, Xiaoguang Wang

In the current implementation, if there are no available events,
io_poll_rewait() still takes completion_lock, and io_poll_task_func() or
io_async_task_func() then immediately unlocks it, which isn't necessary.

Change this logic so that io_poll_task_func() or io_async_task_func()
takes completion_lock itself. This is also a preparation for a later
patch, which will batch poll request completion.
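
The resulting caller pattern, condensed from the hunks below (shown only
to make the locking change explicit):

  if (io_poll_rewait(req, &req->poll))
          return;                         /* no events yet: no lock taken */

  spin_lock(&ctx->completion_lock);       /* taken only when completing */
  done = __io_poll_complete(req, req->result);
  ...
  io_commit_cqring(ctx);
  spin_unlock(&ctx->completion_lock);
  io_cqring_ev_posted(ctx);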

Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
---
 fs/io_uring.c | 50 ++++++++++++++++++++++----------------------------
 1 file changed, 22 insertions(+), 28 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ca4464a75c7b..6fdfb688cf91 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5040,10 +5040,7 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
 }
 
 static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
-	__acquires(&req->ctx->completion_lock)
 {
-	struct io_ring_ctx *ctx = req->ctx;
-
 	/* req->task == current here, checking PF_EXITING is safe */
 	if (unlikely(req->task->flags & PF_EXITING))
 		WRITE_ONCE(poll->canceled, true);
@@ -5054,7 +5051,6 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
 		req->result = vfs_poll(req->file, &pt) & poll->events;
 	}
 
-	spin_lock(&ctx->completion_lock);
 	if (!req->result && !READ_ONCE(poll->canceled)) {
 		if (req->opcode == IORING_OP_POLL_ADD)
 			WRITE_ONCE(req->poll.active, true);
@@ -5142,30 +5138,29 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_kiocb *nxt;
+	bool done;
 
-	if (io_poll_rewait(req, &req->poll)) {
-		spin_unlock(&ctx->completion_lock);
-	} else {
-		bool done;
+	if (io_poll_rewait(req, &req->poll))
+		return;
 
-		done = __io_poll_complete(req, req->result);
-		if (done) {
-			io_poll_remove_double(req);
-			__io_poll_remove_one(req, io_poll_get_single(req), true);
-			hash_del(&req->hash_node);
-		} else {
-			req->result = 0;
-			WRITE_ONCE(req->poll.active, true);
-		}
-		io_commit_cqring(ctx);
-		spin_unlock(&ctx->completion_lock);
-		io_cqring_ev_posted(ctx);
+	spin_lock(&ctx->completion_lock);
+	done = __io_poll_complete(req, req->result);
+	if (done) {
+		io_poll_remove_double(req);
+		__io_poll_remove_one(req, io_poll_get_single(req), true);
+		hash_del(&req->hash_node);
+	} else {
+		req->result = 0;
+		WRITE_ONCE(req->poll.active, true);
+	}
+	io_commit_cqring(ctx);
+	spin_unlock(&ctx->completion_lock);
+	io_cqring_ev_posted(ctx);
 
-		if (done) {
-			nxt = io_put_req_find_next(req);
-			if (nxt)
-				io_req_task_submit(nxt, locked);
-		}
+	if (done) {
+		nxt = io_put_req_find_next(req);
+		if (nxt)
+			io_req_task_submit(nxt, locked);
 	}
 }
 
@@ -5284,11 +5279,10 @@ static void io_async_task_func(struct io_kiocb *req, bool *locked)
 
 	trace_io_uring_task_run(req->ctx, req, req->opcode, req->user_data);
 
-	if (io_poll_rewait(req, &apoll->poll)) {
-		spin_unlock(&ctx->completion_lock);
+	if (io_poll_rewait(req, &apoll->poll))
 		return;
-	}
 
+	spin_lock(&ctx->completion_lock);
 	hash_del(&req->hash_node);
 	io_poll_remove_double(req);
 	spin_unlock(&ctx->completion_lock);
-- 
2.14.4.44.g2045bb6


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [RFC 3/3] io_uring: try to batch poll request completion
  2021-09-22 12:34 [RFC 0/3] improvements for poll requests Xiaoguang Wang
  2021-09-22 12:34 ` [RFC 1/3] io_uring: reduce frequent add_wait_queue() overhead for multi-shot poll request Xiaoguang Wang
  2021-09-22 12:34 ` [RFC 2/3] io_uring: don't get completion_lock in io_poll_rewait() Xiaoguang Wang
@ 2021-09-22 12:34 ` Xiaoguang Wang
  2021-09-22 16:24   ` Pavel Begunkov
  2021-09-22 17:00   ` Hao Xu
  2021-09-22 13:00 ` [RFC 0/3] improvements for poll requests Jens Axboe
  3 siblings, 2 replies; 11+ messages in thread
From: Xiaoguang Wang @ 2021-09-22 12:34 UTC (permalink / raw)
  To: io-uring; +Cc: axboe, asml.silence, Xiaoguang Wang

For an echo-server based on io_uring's IORING_POLL_ADD_MULTI feature,
only poll requests are completed in task work; normal read/write
requests are issued when the user app sees cqes on the corresponding
poll requests, and they will mostly read/write data successfully, which
doesn't need task work. So at least for the echo-server model, batching
poll request completion properly will give benefits.

Currently I haven't found an appropriate place to store batched poll
requests, so put them in struct io_submit_state temporarily, which I
think will need rework in the future.
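
Condensed from the hunks below, the flow is: io_poll_task_func() only
stashes the request when the task_work runner already holds uring_lock,
and the stash is drained under a single completion_lock either when it
fills up or from tctx_task_work():

  /* in io_poll_task_func(), after io_poll_rewait() returned false */
  if (*locked) {
          struct io_submit_state *state = &ctx->submit_state;

          state->poll_compl_reqs[state->poll_compl_nr++] = req;
          if (state->poll_compl_nr == ARRAY_SIZE(state->poll_compl_reqs))
                  io_poll_flush_completions(ctx, locked);
          return;
  }
  /* otherwise fall back to completing this single request as before */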

Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
---
 fs/io_uring.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 63 insertions(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6fdfb688cf91..14118388bfc6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -321,6 +321,11 @@ struct io_submit_state {
 	 */
 	struct io_kiocb		*compl_reqs[IO_COMPL_BATCH];
 	unsigned int		compl_nr;
+
+	struct io_kiocb		*poll_compl_reqs[IO_COMPL_BATCH];
+	bool			poll_req_status[IO_COMPL_BATCH];
+	unsigned int		poll_compl_nr;
+
 	/* inline/task_work completion list, under ->uring_lock */
 	struct list_head	free_list;
 
@@ -2093,6 +2098,8 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
 	percpu_ref_put(&ctx->refs);
 }
 
+static void io_poll_flush_completions(struct io_ring_ctx *ctx, bool *locked);
+
 static void tctx_task_work(struct callback_head *cb)
 {
 	bool locked = false;
@@ -2103,8 +2110,11 @@ static void tctx_task_work(struct callback_head *cb)
 	while (1) {
 		struct io_wq_work_node *node;
 
-		if (!tctx->task_list.first && locked && ctx->submit_state.compl_nr)
+		if (!tctx->task_list.first && locked && (ctx->submit_state.compl_nr ||
+		    ctx->submit_state.poll_compl_nr)) {
 			io_submit_flush_completions(ctx);
+			io_poll_flush_completions(ctx, &locked);
+		}
 
 		spin_lock_irq(&tctx->task_lock);
 		node = tctx->task_list.first;
@@ -5134,6 +5144,49 @@ static inline bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
 static bool __io_poll_remove_one(struct io_kiocb *req,
 				 struct io_poll_iocb *poll, bool do_cancel);
 
+static void io_poll_flush_completions(struct io_ring_ctx *ctx, bool *locked)
+	__must_hold(&ctx->uring_lock)
+{
+	struct io_submit_state *state = &ctx->submit_state;
+	struct io_kiocb *req, *nxt;
+	int i, nr = state->poll_compl_nr;
+	bool done, skip_done = true;
+
+	spin_lock(&ctx->completion_lock);
+	for (i = 0; i < nr; i++) {
+		req = state->poll_compl_reqs[i];
+		done = __io_poll_complete(req, req->result);
+		if (done) {
+			io_poll_remove_double(req);
+			__io_poll_remove_one(req, io_poll_get_single(req), true);
+			hash_del(&req->hash_node);
+			state->poll_req_status[i] = true;
+			if (skip_done)
+				skip_done = false;
+		} else {
+			req->result = 0;
+			state->poll_req_status[i] = false;
+			WRITE_ONCE(req->poll.active, true);
+		}
+	}
+	io_commit_cqring(ctx);
+	spin_unlock(&ctx->completion_lock);
+	io_cqring_ev_posted(ctx);
+	state->poll_compl_nr = 0;
+
+	if (skip_done)
+		return;
+
+	for (i = 0; i < nr; i++) {
+		if (state->poll_req_status[i]) {
+			req = state->poll_compl_reqs[i];
+			nxt = io_put_req_find_next(req);
+			if (nxt)
+				io_req_task_submit(nxt, locked);
+		}
+	}
+}
+
 static void io_poll_task_func(struct io_kiocb *req, bool *locked)
 {
 	struct io_ring_ctx *ctx = req->ctx;
@@ -5143,6 +5196,15 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
 	if (io_poll_rewait(req, &req->poll))
 		return;
 
+	if (*locked) {
+		struct io_submit_state *state = &ctx->submit_state;
+
+		state->poll_compl_reqs[state->poll_compl_nr++] = req;
+		if (state->poll_compl_nr == ARRAY_SIZE(state->poll_compl_reqs))
+			io_poll_flush_completions(ctx, locked);
+		return;
+	}
+
 	spin_lock(&ctx->completion_lock);
 	done = __io_poll_complete(req, req->result);
 	if (done) {
-- 
2.14.4.44.g2045bb6


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [RFC 0/3] improvements for poll requests
  2021-09-22 12:34 [RFC 0/3] improvements for poll requests Xiaoguang Wang
                   ` (2 preceding siblings ...)
  2021-09-22 12:34 ` [RFC 3/3] io_uring: try to batch poll request completion Xiaoguang Wang
@ 2021-09-22 13:00 ` Jens Axboe
  3 siblings, 0 replies; 11+ messages in thread
From: Jens Axboe @ 2021-09-22 13:00 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: asml.silence

On 9/22/21 6:34 AM, Xiaoguang Wang wrote:
> This patchset tries to improve echo_server model based on io_uring's
> IORING_POLL_ADD_MULTI feature.

Nifty, I'll take a look. Can you put the echo server using multishot
somewhere so others can test it too? Maybe it already is, but would be
nice to detail exactly what was run.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC 3/3] io_uring: try to batch poll request completion
  2021-09-22 12:34 ` [RFC 3/3] io_uring: try to batch poll request completion Xiaoguang Wang
@ 2021-09-22 16:24   ` Pavel Begunkov
  2021-09-24  4:28     ` Xiaoguang Wang
  2021-09-22 17:00   ` Hao Xu
  1 sibling, 1 reply; 11+ messages in thread
From: Pavel Begunkov @ 2021-09-22 16:24 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: axboe

On 9/22/21 1:34 PM, Xiaoguang Wang wrote:
> For an echo-server based on io_uring's IORING_POLL_ADD_MULTI feature,
> only poll requests are completed in task work; normal read/write
> requests are issued when the user app sees cqes on the corresponding
> poll requests, and they will mostly read/write data successfully, which
> doesn't need task work. So at least for the echo-server model, batching
> poll request completion properly will give benefits.
> 
> Currently I haven't found an appropriate place to store batched poll
> requests, so put them in struct io_submit_state temporarily, which I
> think will need rework in the future.
> 
> Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
> ---
>  fs/io_uring.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 63 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 6fdfb688cf91..14118388bfc6 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -321,6 +321,11 @@ struct io_submit_state {
>  	 */
>  	struct io_kiocb		*compl_reqs[IO_COMPL_BATCH];
>  	unsigned int		compl_nr;
> +
> +	struct io_kiocb		*poll_compl_reqs[IO_COMPL_BATCH];
> +	bool			poll_req_status[IO_COMPL_BATCH];
> +	unsigned int		poll_compl_nr;
> +
>  	/* inline/task_work completion list, under ->uring_lock */
>  	struct list_head	free_list;
>  
> @@ -2093,6 +2098,8 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
>  	percpu_ref_put(&ctx->refs);
>  }
>  
> +static void io_poll_flush_completions(struct io_ring_ctx *ctx, bool *locked);
> +
>  static void tctx_task_work(struct callback_head *cb)
>  {
>  	bool locked = false;
> @@ -2103,8 +2110,11 @@ static void tctx_task_work(struct callback_head *cb)
>  	while (1) {
>  		struct io_wq_work_node *node;
>  
> -		if (!tctx->task_list.first && locked && ctx->submit_state.compl_nr)
> +		if (!tctx->task_list.first && locked && (ctx->submit_state.compl_nr ||
> +		    ctx->submit_state.poll_compl_nr)) {

io_submit_flush_completions() shouldn't be called if there are no requests... And the
check is already inside it in for-next, so this will be:

if (... && locked) {
	io_submit_flush_completions();
	if (poll_compl_nr)
		io_poll_flush_completions();
}

>  			io_submit_flush_completions(ctx);
> +			io_poll_flush_completions(ctx, &locked);
> +		}
>  
>  		spin_lock_irq(&tctx->task_lock);
>  		node = tctx->task_list.first;
> @@ -5134,6 +5144,49 @@ static inline bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
>  static bool __io_poll_remove_one(struct io_kiocb *req,
>  				 struct io_poll_iocb *poll, bool do_cancel);
>  
> +static void io_poll_flush_completions(struct io_ring_ctx *ctx, bool *locked)
> +	__must_hold(&ctx->uring_lock)
> +{
> +	struct io_submit_state *state = &ctx->submit_state;
> +	struct io_kiocb *req, *nxt;
> +	int i, nr = state->poll_compl_nr;
> +	bool done, skip_done = true;
> +
> +	spin_lock(&ctx->completion_lock);
> +	for (i = 0; i < nr; i++) {
> +		req = state->poll_compl_reqs[i];
> +		done = __io_poll_complete(req, req->result);

I believe we first need to fix all the poll problems and lay out something more intuitive
than the current implementation, or it'd be pure hell to do afterwards.

Can be a nice addition, curious about numbers as well.


-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC 3/3] io_uring: try to batch poll request completion
  2021-09-22 12:34 ` [RFC 3/3] io_uring: try to batch poll request completion Xiaoguang Wang
  2021-09-22 16:24   ` Pavel Begunkov
@ 2021-09-22 17:00   ` Hao Xu
  2021-09-22 17:01     ` Hao Xu
  1 sibling, 1 reply; 11+ messages in thread
From: Hao Xu @ 2021-09-22 17:00 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: axboe, asml.silence

On 2021/9/22 8:34 PM, Xiaoguang Wang wrote:
> For an echo-server based on io_uring's IORING_POLL_ADD_MULTI feature,
> only poll requests are completed in task work; normal read/write
> requests are issued when the user app sees cqes on the corresponding
> poll requests, and they will mostly read/write data successfully, which
> doesn't need task work. So at least for the echo-server model, batching
> poll request completion properly will give benefits.
> 
> Currently I haven't found an appropriate place to store batched poll
> requests, so put them in struct io_submit_state temporarily, which I
> think will need rework in the future.
> 
> Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
We may need to flush completions at io_submit_end as well; there may be
situations where pure poll reqs are sparse. For instance, users may
just use pure poll to do accept, and fast poll for read/write and
send/recv; latency for pure poll reqs may be amplified.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC 3/3] io_uring: try to batch poll request completion
  2021-09-22 17:00   ` Hao Xu
@ 2021-09-22 17:01     ` Hao Xu
  2021-09-22 17:09       ` Hao Xu
  0 siblings, 1 reply; 11+ messages in thread
From: Hao Xu @ 2021-09-22 17:01 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: axboe, asml.silence

On 2021/9/23 1:00 AM, Hao Xu wrote:
> On 2021/9/22 8:34 PM, Xiaoguang Wang wrote:
>> For an echo-server based on io_uring's IORING_POLL_ADD_MULTI feature,
>> only poll requests are completed in task work; normal read/write
>> requests are issued when the user app sees cqes on the corresponding
>> poll requests, and they will mostly read/write data successfully, which
>> doesn't need task work. So at least for the echo-server model, batching
>> poll request completion properly will give benefits.
>>
>> Currently I haven't found an appropriate place to store batched poll
>> requests, so put them in struct io_submit_state temporarily, which I
>> think will need rework in the future.
>>
>> Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
> We may need to flush completions at io_submit_end as well; there may be
> situations where pure poll reqs are sparse. For instance, users may
> just use pure poll to do accept, and fast poll for read/write and
I mean in the persistent connection model.
> send/recv; latency for pure poll reqs may be amplified.


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC 3/3] io_uring: try to batch poll request completion
  2021-09-22 17:01     ` Hao Xu
@ 2021-09-22 17:09       ` Hao Xu
  0 siblings, 0 replies; 11+ messages in thread
From: Hao Xu @ 2021-09-22 17:09 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: axboe, asml.silence

On 2021/9/23 1:01 AM, Hao Xu wrote:
> On 2021/9/23 1:00 AM, Hao Xu wrote:
>> On 2021/9/22 8:34 PM, Xiaoguang Wang wrote:
>>> For an echo-server based on io_uring's IORING_POLL_ADD_MULTI feature,
>>> only poll requests are completed in task work; normal read/write
>>> requests are issued when the user app sees cqes on the corresponding
>>> poll requests, and they will mostly read/write data successfully, which
>>> doesn't need task work. So at least for the echo-server model, batching
>>> poll request completion properly will give benefits.
>>>
>>> Currently I haven't found an appropriate place to store batched poll
>>> requests, so put them in struct io_submit_state temporarily, which I
>>> think will need rework in the future.
>>>
>>> Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
>> We may need to flush completions at io_submit_end as well; there may be
>> situations where pure poll reqs are sparse. For instance, users may
>> just use pure poll to do accept, and fast poll for read/write and
> I mean in the persistent connection model.
>> send/recv; latency for pure poll reqs may be amplified.
I actually think flushing them in io_submit_state_end doesn't work either,
since it is a multishot req.


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC 1/3] io_uring: reduce frequent add_wait_queue() overhead for multi-shot poll request
  2021-09-22 12:34 ` [RFC 1/3] io_uring: reduce frequent add_wait_queue() overhead for multi-shot poll request Xiaoguang Wang
@ 2021-09-22 17:52   ` Hao Xu
  0 siblings, 0 replies; 11+ messages in thread
From: Hao Xu @ 2021-09-22 17:52 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: axboe, asml.silence, Joseph Qi

On 2021/9/22 8:34 PM, Xiaoguang Wang wrote:
> When running echo_server to evaluate io_uring's multi-shot poll
> performance, perf shows that add_wait_queue() has obvious overhead.
> Introduce a new state 'active' in io_poll_iocb to indicate whether
> io_poll_wake() should queue a task_work. This new state is set to true
> initially, set to false when starting to queue a task work, and set back
> to true once a poll cqe has been committed. One concern is that this
> method may lose wake-up events, but it seems to be ok.
> 
>    io_poll_wake                io_poll_task_func
> t1                       |
> t2                       |    WRITE_ONCE(req->poll.active, true);
> t3                       |
> t4                       |    io_commit_cqring(ctx);
> t5                       |
> t6                       |
> 
> If a wake-up event happens before or at t4, it's ok: the user app will
> always see a cqe. If a wake-up event happens after t4 then, IIUC,
> io_poll_wake() will see the new req->poll.active value by using READ_ONCE().
> 
> With this patch, a pure echo_server (1000 connections, 16-byte packets)
> shows about a 1.6% improvement in request throughput.
> 
> Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
> ---
>   fs/io_uring.c | 20 ++++++++++++++++----
>   1 file changed, 16 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 1294b1ef4acb..ca4464a75c7b 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -487,6 +487,7 @@ struct io_poll_iocb {
>   	__poll_t			events;
>   	bool				done;
>   	bool				canceled;
> +	bool				active;
>   	struct wait_queue_entry		wait;
>   };
>   
> @@ -5025,8 +5026,6 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
>   
>   	trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
>   
> -	list_del_init(&poll->wait.entry);
> -
>   	req->result = mask;
>   	req->io_task_work.func = func;
>   
> @@ -5057,7 +5056,10 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
>   
>   	spin_lock(&ctx->completion_lock);
>   	if (!req->result && !READ_ONCE(poll->canceled)) {
> -		add_wait_queue(poll->head, &poll->wait);
> +		if (req->opcode == IORING_OP_POLL_ADD)
> +			WRITE_ONCE(req->poll.active, true);
> +		else
> +			add_wait_queue(poll->head, &poll->wait);
>   		return true;
>   	}
>   
> @@ -5133,6 +5135,9 @@ static inline bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
>   	return done;
>   }
>   
> +static bool __io_poll_remove_one(struct io_kiocb *req,
> +				 struct io_poll_iocb *poll, bool do_cancel);
> +
>   static void io_poll_task_func(struct io_kiocb *req, bool *locked)
>   {
>   	struct io_ring_ctx *ctx = req->ctx;
> @@ -5146,10 +5151,11 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
>   		done = __io_poll_complete(req, req->result);
>   		if (done) {
>   			io_poll_remove_double(req);
> +			__io_poll_remove_one(req, io_poll_get_single(req), true);
This may cause race problems, e.g. there may be multiple cancelled cqes
considering io_poll_add() running in parallel. hash_del is also redundant.
__io_poll_remove_one may not be the best choice here, and since we now
don't del the wait entry in between, code in _arm_poll should probably be
tweaked as well (not very sure, will dive into it tomorrow).

Regards,
Hao
>   			hash_del(&req->hash_node);
>   		} else {
>   			req->result = 0;
> -			add_wait_queue(req->poll.head, &req->poll.wait);
> +			WRITE_ONCE(req->poll.active, true);
>   		}
>   		io_commit_cqring(ctx);
>   		spin_unlock(&ctx->completion_lock);
> @@ -5204,6 +5210,7 @@ static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
>   	poll->head = NULL;
>   	poll->done = false;
>   	poll->canceled = false;
> +	poll->active = true;
>   #define IO_POLL_UNMASK	(EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
>   	/* mask in events that we always want/need */
>   	poll->events = events | IO_POLL_UNMASK;
> @@ -5301,6 +5308,7 @@ static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
>   	trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
>   					key_to_poll(key));
>   
> +	list_del_init(&poll->wait.entry);
>   	return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
>   }
>   
> @@ -5569,6 +5577,10 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
>   	struct io_kiocb *req = wait->private;
>   	struct io_poll_iocb *poll = &req->poll;
>   
> +	if (!READ_ONCE(poll->active))
> +		return 0;
> +
> +	WRITE_ONCE(poll->active, false);
>   	return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
>   }
>   
> 


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC 3/3] io_uring: try to batch poll request completion
  2021-09-22 16:24   ` Pavel Begunkov
@ 2021-09-24  4:28     ` Xiaoguang Wang
  0 siblings, 0 replies; 11+ messages in thread
From: Xiaoguang Wang @ 2021-09-24  4:28 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring; +Cc: axboe

hi,


> On 9/22/21 1:34 PM, Xiaoguang Wang wrote:
>> For an echo-server based on io_uring's IORING_POLL_ADD_MULTI feature,
>> only poll requests are completed in task work; normal read/write
>> requests are issued when the user app sees cqes on the corresponding
>> poll requests, and they will mostly read/write data successfully, which
>> doesn't need task work. So at least for the echo-server model, batching
>> poll request completion properly will give benefits.
>>
>> Currently I haven't found an appropriate place to store batched poll
>> requests, so put them in struct io_submit_state temporarily, which I
>> think will need rework in the future.
>>
>> Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
>> ---
>>   fs/io_uring.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>   1 file changed, 63 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>> index 6fdfb688cf91..14118388bfc6 100644
>> --- a/fs/io_uring.c
>> +++ b/fs/io_uring.c
>> @@ -321,6 +321,11 @@ struct io_submit_state {
>>   	 */
>>   	struct io_kiocb		*compl_reqs[IO_COMPL_BATCH];
>>   	unsigned int		compl_nr;
>> +
>> +	struct io_kiocb		*poll_compl_reqs[IO_COMPL_BATCH];
>> +	bool			poll_req_status[IO_COMPL_BATCH];
>> +	unsigned int		poll_compl_nr;
>> +
>>   	/* inline/task_work completion list, under ->uring_lock */
>>   	struct list_head	free_list;
>>   
>> @@ -2093,6 +2098,8 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
>>   	percpu_ref_put(&ctx->refs);
>>   }
>>   
>> +static void io_poll_flush_completions(struct io_ring_ctx *ctx, bool *locked);
>> +
>>   static void tctx_task_work(struct callback_head *cb)
>>   {
>>   	bool locked = false;
>> @@ -2103,8 +2110,11 @@ static void tctx_task_work(struct callback_head *cb)
>>   	while (1) {
>>   		struct io_wq_work_node *node;
>>   
>> -		if (!tctx->task_list.first && locked && ctx->submit_state.compl_nr)
>> +		if (!tctx->task_list.first && locked && (ctx->submit_state.compl_nr ||
>> +		    ctx->submit_state.poll_compl_nr)) {
> io_submit_flush_completions() shouldn't be called if there are no requests... And the
> check is already inside it in for-next, so this will be:
>
> if (... && locked) {
> 	io_submit_flush_completions();
> 	if (poll_compl_nr)
> 		io_poll_flush_completions();

OK, thanks for pointing this out. I have dropped the poll request
completion batching patch, since it shows performance fluctuations and
it's hard to say whether it's useful.


Regards,

Xiaoguang Wang


> }
>
>>   			io_submit_flush_completions(ctx);
>> +			io_poll_flush_completions(ctx, &locked);
>> +		}
>>   
>>   		spin_lock_irq(&tctx->task_lock);
>>   		node = tctx->task_list.first;
>> @@ -5134,6 +5144,49 @@ static inline bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
>>   static bool __io_poll_remove_one(struct io_kiocb *req,
>>   				 struct io_poll_iocb *poll, bool do_cancel);
>>   
>> +static void io_poll_flush_completions(struct io_ring_ctx *ctx, bool *locked)
>> +	__must_hold(&ctx->uring_lock)
>> +{
>> +	struct io_submit_state *state = &ctx->submit_state;
>> +	struct io_kiocb *req, *nxt;
>> +	int i, nr = state->poll_compl_nr;
>> +	bool done, skip_done = true;
>> +
>> +	spin_lock(&ctx->completion_lock);
>> +	for (i = 0; i < nr; i++) {
>> +		req = state->poll_compl_reqs[i];
>> +		done = __io_poll_complete(req, req->result);
> I believe we first need to fix all the poll problems and lay out something more intuitive
> than the current implementation, or it'd be pure hell to do afterwards.
>
> Can be a nice addition, curious about numbers as well.
>
>

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread (newest: 2021-09-24  4:28 UTC)

Thread overview: 11+ messages
2021-09-22 12:34 [RFC 0/3] improvements for poll requests Xiaoguang Wang
2021-09-22 12:34 ` [RFC 1/3] io_uring: reduce frequent add_wait_queue() overhead for multi-shot poll request Xiaoguang Wang
2021-09-22 17:52   ` Hao Xu
2021-09-22 12:34 ` [RFC 2/3] io_uring: don't get completion_lock in io_poll_rewait() Xiaoguang Wang
2021-09-22 12:34 ` [RFC 3/3] io_uring: try to batch poll request completion Xiaoguang Wang
2021-09-22 16:24   ` Pavel Begunkov
2021-09-24  4:28     ` Xiaoguang Wang
2021-09-22 17:00   ` Hao Xu
2021-09-22 17:01     ` Hao Xu
2021-09-22 17:09       ` Hao Xu
2021-09-22 13:00 ` [RFC 0/3] improvements for poll requests Jens Axboe
