io-uring.vger.kernel.org archive mirror
* [PATCH] io_uring: use task_work for links if possible
@ 2020-06-25 18:27 Jens Axboe
  2020-06-25 20:28 ` Pavel Begunkov
  2020-06-26 20:27 ` Pavel Begunkov
  0 siblings, 2 replies; 9+ messages in thread
From: Jens Axboe @ 2020-06-25 18:27 UTC (permalink / raw)
  To: io-uring

Currently links are always done in an async fashion, unless we
catch them inline after we successfully complete a request without
having to resort to blocking. This isn't necessarily the most efficient
approach; it'd be better if we could just use the task_work handling
for this.

Outside of saving an async jump, we can also do less prep work for
these kinds of requests.

Running dependent links from the task_work handler yields some nice
performance benefits. As an example, examples/link-cp from the liburing
repository uses read+write links to implement a copy operation. Without
this patch, a cache cold 4G file read from a VM runs in about
3 seconds:

$ time examples/link-cp /data/file /dev/null

real	0m2.986s
user	0m0.051s
sys	0m2.843s

and a subsequent cache hot run looks like this:

$ time examples/link-cp /data/file /dev/null

real	0m0.898s
user	0m0.069s
sys	0m0.797s

With this patch in place, the cold case takes about 2.4 seconds:

$ time examples/link-cp /data/file /dev/null

real	0m2.400s
user	0m0.020s
sys	0m2.366s

and the cache hot case looks like this:

$ time examples/link-cp /data/file /dev/null

real	0m0.676s
user	0m0.010s
sys	0m0.665s

As expected, the (mostly) cache hot case yields the biggest improvement,
running about 25% faster with this change, while the cache cold case
yields about a 20% increase in performance. Outside of the performance
increase, we're using less CPU as well, as we're not using the async
offload threads at all for this anymore.
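
For reference, link-cp queues each copy chunk as a linked read+write
pair, roughly like the sketch below (illustrative only, using liburing's
non-vectored prep helpers; the actual example differs in details such as
buffer management, short-read handling, and queue-full handling):

	#include <liburing.h>

	static int queue_rw_pair(struct io_uring *ring, int infd, int outfd,
				 void *buf, unsigned len, off_t offset)
	{
		struct io_uring_sqe *sqe;

		/* read a chunk from the source file ... */
		sqe = io_uring_get_sqe(ring);
		io_uring_prep_read(sqe, infd, buf, len, offset);
		/* ... and only start the write once the read has completed */
		sqe->flags |= IOSQE_IO_LINK;

		sqe = io_uring_get_sqe(ring);
		io_uring_prep_write(sqe, outfd, buf, len, offset);

		return io_uring_submit(ring);
	}

With this patch, the write side of each such pair is driven from
task_work in the submitting task instead of being punted to the async
offload threads.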

Signed-off-by: Jens Axboe <axboe@kernel.dk>

---

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0bba12e4e559..389274a078c8 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -898,6 +898,7 @@ enum io_mem_account {
 static void io_wq_submit_work(struct io_wq_work **workptr);
 static void io_cqring_fill_event(struct io_kiocb *req, long res);
 static void io_put_req(struct io_kiocb *req);
+static void io_double_put_req(struct io_kiocb *req);
 static void __io_double_put_req(struct io_kiocb *req);
 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
 static void io_queue_linked_timeout(struct io_kiocb *req);
@@ -951,6 +952,34 @@ static void __io_put_req_task(struct io_kiocb *req)
 		put_task_struct(req->task);
 }
 
+static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
+{
+	struct mm_struct *mm = current->mm;
+
+	if (mm) {
+		kthread_unuse_mm(mm);
+		mmput(mm);
+	}
+}
+
+static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
+				   struct io_kiocb *req)
+{
+	if (io_op_defs[req->opcode].needs_mm && !current->mm) {
+		if (unlikely(!mmget_not_zero(ctx->sqo_mm)))
+			return -EFAULT;
+		kthread_use_mm(ctx->sqo_mm);
+	}
+
+	return 0;
+}
+
+static inline void req_set_fail_links(struct io_kiocb *req)
+{
+	if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
+		req->flags |= REQ_F_FAIL_LINK;
+}
+
 static void io_file_put_work(struct work_struct *work);
 
 /*
@@ -1664,6 +1693,64 @@ static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
 	}
 }
 
+static void __io_req_task_cancel(struct io_kiocb *req, int error)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+
+	spin_lock_irq(&ctx->completion_lock);
+	io_cqring_fill_event(req, error);
+	io_commit_cqring(ctx);
+	spin_unlock_irq(&ctx->completion_lock);
+
+	io_cqring_ev_posted(ctx);
+	req_set_fail_links(req);
+	io_double_put_req(req);
+}
+
+static void io_req_task_cancel(struct callback_head *cb)
+{
+	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+
+	__io_req_task_cancel(req, -ECANCELED);
+}
+
+static void __io_req_task_submit(struct io_kiocb *req)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+
+	__set_current_state(TASK_RUNNING);
+	if (!io_sq_thread_acquire_mm(ctx, req)) {
+		mutex_lock(&ctx->uring_lock);
+		__io_queue_sqe(req, NULL, NULL);
+		mutex_unlock(&ctx->uring_lock);
+	} else {
+		__io_req_task_cancel(req, -EFAULT);
+	}
+}
+
+static void io_req_task_submit(struct callback_head *cb)
+{
+	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+
+	__io_req_task_submit(req);
+}
+
+static void io_req_task_queue(struct io_kiocb *req)
+{
+	struct task_struct *tsk = req->task;
+	int ret;
+
+	init_task_work(&req->task_work, io_req_task_submit);
+
+	ret = task_work_add(tsk, &req->task_work, true);
+	if (unlikely(ret)) {
+		init_task_work(&req->task_work, io_req_task_cancel);
+		tsk = io_wq_get_task(req->ctx->io_wq);
+		task_work_add(tsk, &req->task_work, true);
+	}
+	wake_up_process(tsk);
+}
+
 static void io_free_req(struct io_kiocb *req)
 {
 	struct io_kiocb *nxt = NULL;
@@ -1671,8 +1758,12 @@ static void io_free_req(struct io_kiocb *req)
 	io_req_find_next(req, &nxt);
 	__io_free_req(req);
 
-	if (nxt)
-		io_queue_async_work(nxt);
+	if (nxt) {
+		if (nxt->flags & REQ_F_WORK_INITIALIZED)
+			io_queue_async_work(nxt);
+		else
+			io_req_task_queue(nxt);
+	}
 }
 
 static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
@@ -2013,12 +2104,6 @@ static void kiocb_end_write(struct io_kiocb *req)
 	file_end_write(req->file);
 }
 
-static inline void req_set_fail_links(struct io_kiocb *req)
-{
-	if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
-		req->flags |= REQ_F_FAIL_LINK;
-}
-
 static void io_complete_rw_common(struct kiocb *kiocb, long res,
 				  struct io_comp_state *cs)
 {
@@ -2035,35 +2120,6 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res,
 	__io_req_complete(req, res, cflags, cs);
 }
 
-static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
-{
-	struct mm_struct *mm = current->mm;
-
-	if (mm) {
-		kthread_unuse_mm(mm);
-		mmput(mm);
-	}
-}
-
-static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
-{
-	if (!current->mm) {
-		if (unlikely(!mmget_not_zero(ctx->sqo_mm)))
-			return -EFAULT;
-		kthread_use_mm(ctx->sqo_mm);
-	}
-
-	return 0;
-}
-
-static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
-				   struct io_kiocb *req)
-{
-	if (!io_op_defs[req->opcode].needs_mm)
-		return 0;
-	return __io_sq_thread_acquire_mm(ctx);
-}
-
 #ifdef CONFIG_BLOCK
 static bool io_resubmit_prep(struct io_kiocb *req, int error)
 {
@@ -2811,20 +2867,6 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	return 0;
 }
 
-static void __io_async_buf_error(struct io_kiocb *req, int error)
-{
-	struct io_ring_ctx *ctx = req->ctx;
-
-	spin_lock_irq(&ctx->completion_lock);
-	io_cqring_fill_event(req, error);
-	io_commit_cqring(ctx);
-	spin_unlock_irq(&ctx->completion_lock);
-
-	io_cqring_ev_posted(ctx);
-	req_set_fail_links(req);
-	io_double_put_req(req);
-}
-
 static void io_async_buf_cancel(struct callback_head *cb)
 {
 	struct io_async_rw *rw;
@@ -2832,27 +2874,18 @@ static void io_async_buf_cancel(struct callback_head *cb)
 
 	rw = container_of(cb, struct io_async_rw, task_work);
 	req = rw->wpq.wait.private;
-	__io_async_buf_error(req, -ECANCELED);
+	__io_req_task_cancel(req, -ECANCELED);
 }
 
 static void io_async_buf_retry(struct callback_head *cb)
 {
 	struct io_async_rw *rw;
-	struct io_ring_ctx *ctx;
 	struct io_kiocb *req;
 
 	rw = container_of(cb, struct io_async_rw, task_work);
 	req = rw->wpq.wait.private;
-	ctx = req->ctx;
 
-	__set_current_state(TASK_RUNNING);
-	if (!__io_sq_thread_acquire_mm(ctx)) {
-		mutex_lock(&ctx->uring_lock);
-		__io_queue_sqe(req, NULL, NULL);
-		mutex_unlock(&ctx->uring_lock);
-	} else {
-		__io_async_buf_error(req, -EFAULT);
-	}
+	__io_req_task_submit(req);
 }
 
 static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
@@ -5218,22 +5251,24 @@ static int io_files_update(struct io_kiocb *req, bool force_nonblock,
 }
 
 static int io_req_defer_prep(struct io_kiocb *req,
-			     const struct io_uring_sqe *sqe)
+			     const struct io_uring_sqe *sqe, bool for_async)
 {
 	ssize_t ret = 0;
 
 	if (!sqe)
 		return 0;
 
-	io_req_init_async(req);
+	if (for_async) {
+		io_req_init_async(req);
 
-	if (io_op_defs[req->opcode].file_table) {
-		ret = io_grab_files(req);
-		if (unlikely(ret))
-			return ret;
-	}
+		if (io_op_defs[req->opcode].file_table) {
+			ret = io_grab_files(req);
+			if (unlikely(ret))
+				return ret;
+		}
 
-	io_req_work_grab_env(req, &io_op_defs[req->opcode]);
+		io_req_work_grab_env(req, &io_op_defs[req->opcode]);
+	}
 
 	switch (req->opcode) {
 	case IORING_OP_NOP:
@@ -5347,7 +5382,7 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (!req->io) {
 		if (io_alloc_async_ctx(req))
 			return -EAGAIN;
-		ret = io_req_defer_prep(req, sqe);
+		ret = io_req_defer_prep(req, sqe, true);
 		if (ret < 0)
 			return ret;
 	}
@@ -5966,7 +6001,7 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 			ret = -EAGAIN;
 			if (io_alloc_async_ctx(req))
 				goto fail_req;
-			ret = io_req_defer_prep(req, sqe);
+			ret = io_req_defer_prep(req, sqe, true);
 			if (unlikely(ret < 0))
 				goto fail_req;
 		}
@@ -6022,13 +6057,14 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		if (io_alloc_async_ctx(req))
 			return -EAGAIN;
 
-		ret = io_req_defer_prep(req, sqe);
+		ret = io_req_defer_prep(req, sqe, false);
 		if (ret) {
 			/* fail even hard links since we don't submit */
 			head->flags |= REQ_F_FAIL_LINK;
 			return ret;
 		}
 		trace_io_uring_link(ctx, req, head);
+		io_get_req_task(req);
 		list_add_tail(&req->link_list, &head->link_list);
 
 		/* last request of a link, enqueue the link */
@@ -6048,7 +6084,7 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 			if (io_alloc_async_ctx(req))
 				return -EAGAIN;
 
-			ret = io_req_defer_prep(req, sqe);
+			ret = io_req_defer_prep(req, sqe, true);
 			if (ret)
 				req->flags |= REQ_F_FAIL_LINK;
 			*link = req;

-- 
Jens Axboe



* Re: [PATCH] io_uring: use task_work for links if possible
  2020-06-25 18:27 [PATCH] io_uring: use task_work for links if possible Jens Axboe
@ 2020-06-25 20:28 ` Pavel Begunkov
  2020-06-25 21:37   ` Jens Axboe
  2020-06-26 20:27 ` Pavel Begunkov
  1 sibling, 1 reply; 9+ messages in thread
From: Pavel Begunkov @ 2020-06-25 20:28 UTC (permalink / raw)
  To: Jens Axboe, io-uring

On 25/06/2020 21:27, Jens Axboe wrote:
> Currently links are always done in an async fashion, unless we
> catch them inline after we successfully complete a request without
> having to resort to blocking. This isn't necessarily the most efficient
> approach; it'd be better if we could just use the task_work handling
> for this.

Well, you beat me to this. As mentioned, I was going to rebase it after
landing the iopoll fixes. Nice numbers! A small comment below, but LGTM.
I'll review more formally with a fresh head.

Could you push it to a branch? My other patches would conflict.

> 
> Outside of saving an async jump, we can also do less prep work for
> these kinds of requests.
> 
> Running dependent links from the task_work handler yields some nice
> performance benefits. As an example, examples/link-cp from the liburing
> repository uses read+write links to implement a copy operation. Without
> this patch, a cache cold 4G file read from a VM runs in about
> 3 seconds:
> 
> $ time examples/link-cp /data/file /dev/null
> 
> real	0m2.986s
> user	0m0.051s
> sys	0m2.843s
> 
> and a subsequent cache hot run looks like this:
> 
> $ time examples/link-cp /data/file /dev/null
> 
> real	0m0.898s
> user	0m0.069s
> sys	0m0.797s
> 
> With this patch in place, the cold case takes about 2.4 seconds:
> 
> $ time examples/link-cp /data/file /dev/null
> 
> real	0m2.400s
> user	0m0.020s
> sys	0m2.366s
> 
> and the cache hot case looks like this:
> 
> $ time examples/link-cp /data/file /dev/null
> 
> real	0m0.676s
> user	0m0.010s
> sys	0m0.665s
> 
> As expected, the (mostly) cache hot case yields the biggest improvement,
> running about 25% faster with this change, while the cache cold case
> yields about a 20% increase in performance. Outside of the performance
> increase, we're using less CPU as well, as we're not using the async
> offload threads at all for this anymore.
> 
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> 
> ---
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 0bba12e4e559..389274a078c8 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
...
>  
> +static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
> +{
> +	struct mm_struct *mm = current->mm;
> +
> +	if (mm) {
> +		kthread_unuse_mm(mm);
> +		mmput(mm);
> +	}
> +}
> +
> +static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
> +				   struct io_kiocb *req)
> +{
> +	if (io_op_defs[req->opcode].needs_mm && !current->mm) {
> +		if (unlikely(!mmget_not_zero(ctx->sqo_mm)))
> +			return -EFAULT;
> +		kthread_use_mm(ctx->sqo_mm);
> +	}
> +
> +	return 0;
> +}
...
> +static void __io_req_task_submit(struct io_kiocb *req)
> +{
> +	struct io_ring_ctx *ctx = req->ctx;
> +
> +	__set_current_state(TASK_RUNNING);
> +	if (!io_sq_thread_acquire_mm(ctx, req)) {

My last patch replaced it with "__" version. Is it merge problems
or intended as this?

> +		mutex_lock(&ctx->uring_lock);
> +		__io_queue_sqe(req, NULL, NULL);
> +		mutex_unlock(&ctx->uring_lock);
> +	} else {
> +		__io_req_task_cancel(req, -EFAULT);
> +	}
> +}
> +

-- 
Pavel Begunkov


* Re: [PATCH] io_uring: use task_work for links if possible
  2020-06-25 20:28 ` Pavel Begunkov
@ 2020-06-25 21:37   ` Jens Axboe
  2020-06-26  9:41     ` Pavel Begunkov
  0 siblings, 1 reply; 9+ messages in thread
From: Jens Axboe @ 2020-06-25 21:37 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring

On 6/25/20 2:28 PM, Pavel Begunkov wrote:
> On 25/06/2020 21:27, Jens Axboe wrote:
>> Currently links are always done in an async fashion, unless we
>> catch them inline after we successfully complete a request without
>> having to resort to blocking. This isn't necessarily the most efficient
>> approach; it'd be better if we could just use the task_work handling
>> for this.
> 
> Well, you beat me to this. As mentioned, I was going to rebase it after
> landing the iopoll fixes. Nice numbers! A small comment below, but LGTM.
> I'll review more formally with a fresh head.

I thought you were doing this for the retry -EAGAIN based stuff, and didn't
know you had plans for links! If so, I would have left it alone. This was
just a quick idea and execution this morning.

> Could you push it to a branch? My other patches would conflict.

Yep, I'll push it out now.

>> +static void __io_req_task_submit(struct io_kiocb *req)
>> +{
>> +	struct io_ring_ctx *ctx = req->ctx;
>> +
>> +	__set_current_state(TASK_RUNNING);
>> +	if (!io_sq_thread_acquire_mm(ctx, req)) {
> 
> My last patch replaced it with "__" version. Is it merge problems
> or intended as this?

I'll make sure it applies on for-5.9/io_uring, and then I'll sort out
any merge issues by pulling in io_uring-5.8 to there, if we need to.

-- 
Jens Axboe



* Re: [PATCH] io_uring: use task_work for links if possible
  2020-06-25 21:37   ` Jens Axboe
@ 2020-06-26  9:41     ` Pavel Begunkov
  0 siblings, 0 replies; 9+ messages in thread
From: Pavel Begunkov @ 2020-06-26  9:41 UTC (permalink / raw)
  To: Jens Axboe, io-uring

On 26/06/2020 00:37, Jens Axboe wrote:
> On 6/25/20 2:28 PM, Pavel Begunkov wrote:
>> On 25/06/2020 21:27, Jens Axboe wrote:
>>> Currently links are always done in an async fashion, unless we
>>> catch them inline after we successfully complete a request without
>>> having to resort to blocking. This isn't necessarily the most efficient
>>> approach; it'd be better if we could just use the task_work handling
>>> for this.
>>
>> Well, you beat me to this. As mentioned, I was going to rebase it after
>> landing the iopoll fixes. Nice numbers! A small comment below, but LGTM.
>> I'll review more formally with a fresh head.
> 
> I thought you were doing this for the retry -EAGAIN based stuff, and didn't
> know you had plans for links! If so, I would have left it alone. This was
> just a quick idea and execution this morning.

I don't mind, it's just that we did double the work and that looks kind of wasteful.

> 
>> Could you push it to a branch? My other patches would conflict.
> 
> Yep, I'll push it out now.

Thanks

> 
>>> +static void __io_req_task_submit(struct io_kiocb *req)
>>> +{
>>> +	struct io_ring_ctx *ctx = req->ctx;
>>> +
>>> +	__set_current_state(TASK_RUNNING);
>>> +	if (!io_sq_thread_acquire_mm(ctx, req)) {
>>
>> My last patch replaced it with "__" version. Is it merge problems
>> or intended as this?
> 
> I'll make sure it applies on for-5.9/io_uring, and then I'll sort out
> any merge issues by pulling in io_uring-5.8 to there, if we need to.
> 

-- 
Pavel Begunkov


* Re: [PATCH] io_uring: use task_work for links if possible
  2020-06-25 18:27 [PATCH] io_uring: use task_work for links if possible Jens Axboe
  2020-06-25 20:28 ` Pavel Begunkov
@ 2020-06-26 20:27 ` Pavel Begunkov
  2020-06-26 20:43   ` Jens Axboe
  1 sibling, 1 reply; 9+ messages in thread
From: Pavel Begunkov @ 2020-06-26 20:27 UTC (permalink / raw)
  To: Jens Axboe, io-uring

On 25/06/2020 21:27, Jens Axboe wrote:
> Currently links are always done in an async fashion, unless we
> catch them inline after we successfully complete a request without
> having to resort to blocking. This isn't necessarily the most efficient
> approach; it'd be better if we could just use the task_work handling
> for this.
> 
> Outside of saving an async jump, we can also do less prep work for
> these kinds of requests.
> 
> Running dependent links from the task_work handler yields some nice
> performance benefits. As an example, examples/link-cp from the liburing
> repository uses read+write links to implement a copy operation. Without
> this patch, a cache cold 4G file read from a VM runs in about
> 3 seconds:

A few comments below


> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 0bba12e4e559..389274a078c8 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -898,6 +898,7 @@ enum io_mem_account {
...
> +static void __io_req_task_submit(struct io_kiocb *req)
> +{
> +	struct io_ring_ctx *ctx = req->ctx;
> +
> +	__set_current_state(TASK_RUNNING);
> +	if (!io_sq_thread_acquire_mm(ctx, req)) {
> +		mutex_lock(&ctx->uring_lock);
> +		__io_queue_sqe(req, NULL, NULL);
> +		mutex_unlock(&ctx->uring_lock);
> +	} else {
> +		__io_req_task_cancel(req, -EFAULT);
> +	}
> +}
> +
> +static void io_req_task_submit(struct callback_head *cb)
> +{
> +	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
> +
> +	__io_req_task_submit(req);
> +}
> +
> +static void io_req_task_queue(struct io_kiocb *req)
> +{
> +	struct task_struct *tsk = req->task;
> +	int ret;
> +
> +	init_task_work(&req->task_work, io_req_task_submit);
> +
> +	ret = task_work_add(tsk, &req->task_work, true);
> +	if (unlikely(ret)) {
> +		init_task_work(&req->task_work, io_req_task_cancel);

Why not kill it off here? It was just nxt, so it shouldn't need anything like
NOCANCEL

> +		tsk = io_wq_get_task(req->ctx->io_wq);
> +		task_work_add(tsk, &req->task_work, true);
> +	}
> +	wake_up_process(tsk);
> +}
> +
>  static void io_free_req(struct io_kiocb *req)
>  {
>  	struct io_kiocb *nxt = NULL;
> @@ -1671,8 +1758,12 @@ static void io_free_req(struct io_kiocb *req)
>  	io_req_find_next(req, &nxt);
>  	__io_free_req(req);
>  
> -	if (nxt)
> -		io_queue_async_work(nxt);
> +	if (nxt) {
> +		if (nxt->flags & REQ_F_WORK_INITIALIZED)
> +			io_queue_async_work(nxt);

Don't think it will work. E.g. io_close_prep() may have set
REQ_F_WORK_INITIALIZED but without io_req_work_grab_env().

> +		else
> +			io_req_task_queue(nxt);
> +	}
>  }
>  
>  static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
> @@ -2013,12 +2104,6 @@ static void kiocb_end_write(struct io_kiocb *req)
>  	file_end_write(req->file);
>  }
>  
> -static inline void req_set_fail_links(struct io_kiocb *req)
> -{
> -	if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
> -		req->flags |= REQ_F_FAIL_LINK;
> -}
> -

I think it'd be nicer as 2 patches: the first moving io_sq_thread_drop_mm, etc. up,
and the second one doing the actual work.

>  static void io_complete_rw_common(struct kiocb *kiocb, long res,
>  				  struct io_comp_state *cs)
>  {
> @@ -2035,35 +2120,6 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res,
>  	__io_req_complete(req, res, cflags, cs);
>  }
>  
...
>  	switch (req->opcode) {
>  	case IORING_OP_NOP:
> @@ -5347,7 +5382,7 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>  	if (!req->io) {
>  		if (io_alloc_async_ctx(req))
>  			return -EAGAIN;
> -		ret = io_req_defer_prep(req, sqe);
> +		ret = io_req_defer_prep(req, sqe, true);

Why is the head of a link for_async?

>  		if (ret < 0)
>  			return ret;
>  	}
> @@ -5966,7 +6001,7 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
>  			ret = -EAGAIN;
>  			if (io_alloc_async_ctx(req))
>  				goto fail_req;
> -			ret = io_req_defer_prep(req, sqe);
> +			ret = io_req_defer_prep(req, sqe, true);
>  			if (unlikely(ret < 0))
>  				goto fail_req;
>  		}
> @@ -6022,13 +6057,14 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
>  		if (io_alloc_async_ctx(req))
>  			return -EAGAIN;
>  
> -		ret = io_req_defer_prep(req, sqe);
> +		ret = io_req_defer_prep(req, sqe, false);
>  		if (ret) {
>  			/* fail even hard links since we don't submit */
>  			head->flags |= REQ_F_FAIL_LINK;
>  			return ret;
>  		}
>  		trace_io_uring_link(ctx, req, head);
> +		io_get_req_task(req);
>  		list_add_tail(&req->link_list, &head->link_list);
>  
>  		/* last request of a link, enqueue the link */
> @@ -6048,7 +6084,7 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
>  			if (io_alloc_async_ctx(req))
>  				return -EAGAIN;
>  
> -			ret = io_req_defer_prep(req, sqe);
> +			ret = io_req_defer_prep(req, sqe, true);
>  			if (ret)
>  				req->flags |= REQ_F_FAIL_LINK;
>  			*link = req;
> 

-- 
Pavel Begunkov


* Re: [PATCH] io_uring: use task_work for links if possible
  2020-06-26 20:27 ` Pavel Begunkov
@ 2020-06-26 20:43   ` Jens Axboe
  2020-06-26 21:20     ` Pavel Begunkov
  0 siblings, 1 reply; 9+ messages in thread
From: Jens Axboe @ 2020-06-26 20:43 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring

On 6/26/20 2:27 PM, Pavel Begunkov wrote:
> On 25/06/2020 21:27, Jens Axboe wrote:
>> Currently links are always done in an async fashion, unless we
>> catch them inline after we successfully complete a request without
>> having to resort to blocking. This isn't necessarily the most efficient
>> approach; it'd be better if we could just use the task_work handling
>> for this.
>>
>> Outside of saving an async jump, we can also do less prep work for
>> these kinds of requests.
>>
>> Running dependent links from the task_work handler yields some nice
>> performance benefits. As an example, examples/link-cp from the liburing
>> repository uses read+write links to implement a copy operation. Without
>> this patch, a cache cold 4G file read from a VM runs in about
>> 3 seconds:
> 
> A few comments below
> 
> 
>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>> index 0bba12e4e559..389274a078c8 100644
>> --- a/fs/io_uring.c
>> +++ b/fs/io_uring.c
>> @@ -898,6 +898,7 @@ enum io_mem_account {
> ...
>> +static void __io_req_task_submit(struct io_kiocb *req)
>> +{
>> +	struct io_ring_ctx *ctx = req->ctx;
>> +
>> +	__set_current_state(TASK_RUNNING);
>> +	if (!io_sq_thread_acquire_mm(ctx, req)) {
>> +		mutex_lock(&ctx->uring_lock);
>> +		__io_queue_sqe(req, NULL, NULL);
>> +		mutex_unlock(&ctx->uring_lock);
>> +	} else {
>> +		__io_req_task_cancel(req, -EFAULT);
>> +	}
>> +}
>> +
>> +static void io_req_task_submit(struct callback_head *cb)
>> +{
>> +	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
>> +
>> +	__io_req_task_submit(req);
>> +}
>> +
>> +static void io_req_task_queue(struct io_kiocb *req)
>> +{
>> +	struct task_struct *tsk = req->task;
>> +	int ret;
>> +
>> +	init_task_work(&req->task_work, io_req_task_submit);
>> +
>> +	ret = task_work_add(tsk, &req->task_work, true);
>> +	if (unlikely(ret)) {
>> +		init_task_work(&req->task_work, io_req_task_cancel);
> 
> Why not kill it off here? It was just nxt, so it shouldn't need anything like
> NOCANCEL

We don't necessarily know the context, and we'd at least need to check
REQ_F_COMP_LOCKED and propagate. I think it gets ugly really quickly;
better to just punt it to a clean context.

>> +		tsk = io_wq_get_task(req->ctx->io_wq);
>> +		task_work_add(tsk, &req->task_work, true);
>> +	}
>> +	wake_up_process(tsk);
>> +}
>> +
>>  static void io_free_req(struct io_kiocb *req)
>>  {
>>  	struct io_kiocb *nxt = NULL;
>> @@ -1671,8 +1758,12 @@ static void io_free_req(struct io_kiocb *req)
>>  	io_req_find_next(req, &nxt);
>>  	__io_free_req(req);
>>  
>> -	if (nxt)
>> -		io_queue_async_work(nxt);
>> +	if (nxt) {
>> +		if (nxt->flags & REQ_F_WORK_INITIALIZED)
>> +			io_queue_async_work(nxt);
> 
> Don't think it will work. E.g. io_close_prep() may have set
> REQ_F_WORK_INITIALIZED but without io_req_work_grab_env().

This really doesn't change the existing path, it just makes sure we
don't do io_req_task_queue() on something that has already modified
->work (and hence, ->task_work). This might miss cases where we have
only cleared it and done nothing else, but that just means we'll have
cases that we could potentially improve the efficiency of down the line.

>> -static inline void req_set_fail_links(struct io_kiocb *req)
>> -{
>> -	if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
>> -		req->flags |= REQ_F_FAIL_LINK;
>> -}
>> -
> 
> I think it'd be nicer as 2 patches: the first moving io_sq_thread_drop_mm, etc. up,
> and the second one doing the actual work.

Yeah I agree...

>>  static void io_complete_rw_common(struct kiocb *kiocb, long res,
>>  				  struct io_comp_state *cs)
>>  {
>> @@ -2035,35 +2120,6 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res,
>>  	__io_req_complete(req, res, cflags, cs);
>>  }
>>  
> ...
>>  	switch (req->opcode) {
>>  	case IORING_OP_NOP:
>> @@ -5347,7 +5382,7 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>  	if (!req->io) {
>>  		if (io_alloc_async_ctx(req))
>>  			return -EAGAIN;
>> -		ret = io_req_defer_prep(req, sqe);
>> +		ret = io_req_defer_prep(req, sqe, true);
> 
> Why is the head of a link for_async?

True, that could be false instead.

Since these are just minor things, we can do a fix on top. I don't want
to reshuffle this unless I have to.

-- 
Jens Axboe



* Re: [PATCH] io_uring: use task_work for links if possible
  2020-06-26 20:43   ` Jens Axboe
@ 2020-06-26 21:20     ` Pavel Begunkov
  2020-06-27  1:45       ` Jens Axboe
  0 siblings, 1 reply; 9+ messages in thread
From: Pavel Begunkov @ 2020-06-26 21:20 UTC (permalink / raw)
  To: Jens Axboe, io-uring

On 26/06/2020 23:43, Jens Axboe wrote:
> On 6/26/20 2:27 PM, Pavel Begunkov wrote:
>> On 25/06/2020 21:27, Jens Axboe wrote:
>>> Currently links are always done in an async fashion, unless we
>>> catch them inline after we successfully complete a request without
>>> having to resort to blocking. This isn't necessarily the most efficient
>>> approach; it'd be better if we could just use the task_work handling
>>> for this.
>>>
>>> Outside of saving an async jump, we can also do less prep work for
>>> these kinds of requests.
>>>
>>> Running dependent links from the task_work handler yields some nice
>>> performance benefits. As an example, examples/link-cp from the liburing
>>> repository uses read+write links to implement a copy operation. Without
>>> this patch, a cache cold 4G file read from a VM runs in about
>>> 3 seconds:
>>
>> A few comments below
>>
>>
>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>> index 0bba12e4e559..389274a078c8 100644
>>> --- a/fs/io_uring.c
>>> +++ b/fs/io_uring.c
>>> @@ -898,6 +898,7 @@ enum io_mem_account {
>> ...
>>> +static void __io_req_task_submit(struct io_kiocb *req)
>>> +{
>>> +	struct io_ring_ctx *ctx = req->ctx;
>>> +
>>> +	__set_current_state(TASK_RUNNING);
>>> +	if (!io_sq_thread_acquire_mm(ctx, req)) {
>>> +		mutex_lock(&ctx->uring_lock);
>>> +		__io_queue_sqe(req, NULL, NULL);
>>> +		mutex_unlock(&ctx->uring_lock);
>>> +	} else {
>>> +		__io_req_task_cancel(req, -EFAULT);
>>> +	}
>>> +}
>>> +
>>> +static void io_req_task_submit(struct callback_head *cb)
>>> +{
>>> +	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
>>> +
>>> +	__io_req_task_submit(req);
>>> +}
>>> +
>>> +static void io_req_task_queue(struct io_kiocb *req)
>>> +{
>>> +	struct task_struct *tsk = req->task;
>>> +	int ret;
>>> +
>>> +	init_task_work(&req->task_work, io_req_task_submit);
>>> +
>>> +	ret = task_work_add(tsk, &req->task_work, true);
>>> +	if (unlikely(ret)) {
>>> +		init_task_work(&req->task_work, io_req_task_cancel);
>>
>> Why not kill it off here? It was just nxt, so it shouldn't need anything like
>> NOCANCEL
> 
> We don't necessarily know the context, and we'd at least need to check
> REQ_F_COMP_LOCKED and propagate. I think it gets ugly really quickly;
> better to just punt it to a clean context.

Makes sense

> 
>>> +		tsk = io_wq_get_task(req->ctx->io_wq);
>>> +		task_work_add(tsk, &req->task_work, true);
>>> +	}
>>> +	wake_up_process(tsk);
>>> +}
>>> +
>>>  static void io_free_req(struct io_kiocb *req)
>>>  {
>>>  	struct io_kiocb *nxt = NULL;
>>> @@ -1671,8 +1758,12 @@ static void io_free_req(struct io_kiocb *req)
>>>  	io_req_find_next(req, &nxt);
>>>  	__io_free_req(req);
>>>  
>>> -	if (nxt)
>>> -		io_queue_async_work(nxt);
>>> +	if (nxt) {
>>> +		if (nxt->flags & REQ_F_WORK_INITIALIZED)
>>> +			io_queue_async_work(nxt);
>>
>> Don't think it will work. E.g. io_close_prep() may have set
>> REQ_F_WORK_INITIALIZED but without io_req_work_grab_env().
> 
> This really doesn't change the existing path, it just makes sure we
> don't do io_req_task_queue() on something that has already modified
> ->work (and hence, ->task_work). This might miss cases where we have
> only cleared it and done nothing else, but that just means we'll have
> cases that we could potentially improve the effiency of down the line.

Before the patch it was always initialising linked reqs, and that would
work ok, if not for this lazy grab_env().

E.g. req1 -> close_req

It calls io_req_defer_prep(__close_req__, sqe, __false__)
which doesn't do grab_env() because of for_async=false,
but calls io_close_prep() which sets REQ_F_WORK_INITIALIZED.

Then, after completion of req1 it will follow added lines

if (nxt)
	if (nxt->flags & REQ_F_WORK_INITIALIZED)
		io_queue_async_work(nxt);

Ending up in

io_queue_async_work()
	-> grab_env()

And that's called from who knows which context.
E.g. req1 was an rw request completed in an irq.


Not sure it's related, but fallocate shows the log below, and some other tests
hang the kernel as well.

[   42.445719] BUG: kernel NULL pointer dereference, address: 0000000000000000
[   42.445723] #PF: supervisor write access in kernel mode
[   42.445725] #PF: error_code(0x0002) - not-present page
[   42.445726] PGD 0 P4D 0 
[   42.445729] Oops: 0002 [#1] PREEMPT SMP PTI
[   42.445733] CPU: 1 PID: 1511 Comm: io_wqe_worker-0 Tainted: G          I       5.8.0-rc2-00358-g9d2391dd5359 #489
[   42.445740] RIP: 0010:override_creds+0x19/0x30
...
[   42.445754] Call Trace:
[   42.445758]  io_worker_handle_work+0x25c/0x430
[   42.445760]  io_wqe_worker+0x2a0/0x350
[   42.445764]  ? _raw_spin_unlock_irqrestore+0x24/0x40
[   42.445766]  ? io_worker_handle_work+0x430/0x430
[   42.445769]  kthread+0x136/0x180
[   42.445771]  ? kthread_park+0x90/0x90
[   42.445774]  ret_from_fork+0x22/0x30


> 
>>> -static inline void req_set_fail_links(struct io_kiocb *req)
>>> -{
>>> -	if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
>>> -		req->flags |= REQ_F_FAIL_LINK;
>>> -}
>>> -
>>
>> I think it'd be nicer as 2 patches: the first moving io_sq_thread_drop_mm, etc. up,
>> and the second one doing the actual work.
> 
> Yeah I agree...
> 
>>>  static void io_complete_rw_common(struct kiocb *kiocb, long res,
>>>  				  struct io_comp_state *cs)
>>>  {
>>> @@ -2035,35 +2120,6 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res,
>>>  	__io_req_complete(req, res, cflags, cs);
>>>  }
>>>  
>> ...
>>>  	switch (req->opcode) {
>>>  	case IORING_OP_NOP:
>>> @@ -5347,7 +5382,7 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>>  	if (!req->io) {
>>>  		if (io_alloc_async_ctx(req))
>>>  			return -EAGAIN;
>>> -		ret = io_req_defer_prep(req, sqe);
>>> +		ret = io_req_defer_prep(req, sqe, true);
>>
>> Why is the head of a link for_async?
> 
> True, that could be false instead.
> 
> Since these are just minor things, we can do a fix on top. I don't want
> to reshuffle this unless I have to.

Agree, I have a pile on top myself.

-- 
Pavel Begunkov


* Re: [PATCH] io_uring: use task_work for links if possible
  2020-06-26 21:20     ` Pavel Begunkov
@ 2020-06-27  1:45       ` Jens Axboe
  2020-06-27 10:57         ` Pavel Begunkov
  0 siblings, 1 reply; 9+ messages in thread
From: Jens Axboe @ 2020-06-27  1:45 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring

On 6/26/20 3:20 PM, Pavel Begunkov wrote:
>>>> +		tsk = io_wq_get_task(req->ctx->io_wq);
>>>> +		task_work_add(tsk, &req->task_work, true);
>>>> +	}
>>>> +	wake_up_process(tsk);
>>>> +}
>>>> +
>>>>  static void io_free_req(struct io_kiocb *req)
>>>>  {
>>>>  	struct io_kiocb *nxt = NULL;
>>>> @@ -1671,8 +1758,12 @@ static void io_free_req(struct io_kiocb *req)
>>>>  	io_req_find_next(req, &nxt);
>>>>  	__io_free_req(req);
>>>>  
>>>> -	if (nxt)
>>>> -		io_queue_async_work(nxt);
>>>> +	if (nxt) {
>>>> +		if (nxt->flags & REQ_F_WORK_INITIALIZED)
>>>> +			io_queue_async_work(nxt);
>>>
>>> Don't think it will work. E.g. io_close_prep() may have set
>>> REQ_F_WORK_INITIALIZED but without io_req_work_grab_env().
>>
>> This really doesn't change the existing path, it just makes sure we
>> don't do io_req_task_queue() on something that has already modified
>> ->work (and hence, ->task_work). This might miss cases where we have
>> only cleared it and done nothing else, but that just means we'll have
>> cases that we could potentially improve the efficiency of down the line.
> 
> Before the patch it was always initialising linked reqs, and that would
> work ok, if not for this lazy grab_env().
> 
> E.g. req1 -> close_req
> 
> It calls io_req_defer_prep(__close_req__, sqe, __false__)
> which doesn't do grab_env() because of for_async=false,
> but calls io_close_prep() which sets REQ_F_WORK_INITIALIZED.
> 
> Then, after completion of req1 it will follow added lines
> 
> if (nxt)
> 	if (nxt->flags & REQ_F_WORK_INITIALIZED)
> 		io_queue_async_work(nxt);
> 
> Ending up in
> 
> io_queue_async_work()
> 	-> grab_env()
> 
> And that's called from who knows which context.
> E.g. req1 was an rw request completed in an irq.

Hmm yes, good point, that is a problem. I don't have a good immediate
solution for this. Do you have any suggestions on how best to handle
this?

> Not sure it's related, but fallocate shows the log below, and some
> other tests hang the kernel as well.

Yeah, that's indeed that very thing.

>> True, that could be false instead.
>>
>> Since these are just minor things, we can do a fix on top. I don't want
>> to reshuffle this unless I have to.
> 
> Agree, I have a pile on top myself.

Fire away :-)

-- 
Jens Axboe



* Re: [PATCH] io_uring: use task_work for links if possible
  2020-06-27  1:45       ` Jens Axboe
@ 2020-06-27 10:57         ` Pavel Begunkov
  0 siblings, 0 replies; 9+ messages in thread
From: Pavel Begunkov @ 2020-06-27 10:57 UTC (permalink / raw)
  To: Jens Axboe, io-uring

On 27/06/2020 04:45, Jens Axboe wrote:
> On 6/26/20 3:20 PM, Pavel Begunkov wrote:
>>>>> +		tsk = io_wq_get_task(req->ctx->io_wq);
>>>>> +		task_work_add(tsk, &req->task_work, true);
>>>>> +	}
>>>>> +	wake_up_process(tsk);
>>>>> +}
>>>>> +
>>>>>  static void io_free_req(struct io_kiocb *req)
>>>>>  {
>>>>>  	struct io_kiocb *nxt = NULL;
>>>>> @@ -1671,8 +1758,12 @@ static void io_free_req(struct io_kiocb *req)
>>>>>  	io_req_find_next(req, &nxt);
>>>>>  	__io_free_req(req);
>>>>>  
>>>>> -	if (nxt)
>>>>> -		io_queue_async_work(nxt);
>>>>> +	if (nxt) {
>>>>> +		if (nxt->flags & REQ_F_WORK_INITIALIZED)
>>>>> +			io_queue_async_work(nxt);
>>>>
>>>> Don't think it will work. E.g. io_close_prep() may have set
>>>> REQ_F_WORK_INITIALIZED but without io_req_work_grab_env().
>>>
>>> This really doesn't change the existing path, it just makes sure we
>>> don't do io_req_task_queue() on something that has already modified
>>> ->work (and hence, ->task_work). This might miss cases where we have
>>> only cleared it and done nothing else, but that just means we'll have
>>> cases that we could potentially improve the efficiency of down the line.
>>
>> Before the patch it was always initialising linked reqs, and that would
>> work ok, if not for this lazy grab_env().
>>
>> E.g. req1 -> close_req
>>
>> It calls io_req_defer_prep(__close_req__, sqe, __false__)
>> which doesn't do grab_env() because of for_async=false,
>> but calls io_close_prep() which sets REQ_F_WORK_INITIALIZED.
>>
>> Then, after completion of req1 it will follow added lines
>>
>> if (nxt)
>> 	if (nxt->flags & REQ_F_WORK_INITIALIZED)
>> 		io_queue_async_work(nxt);
>>
>> Ending up in
>>
>> io_queue_async_work()
>> 	-> grab_env()
>>
>> And that's called from who knows which context.
>> E.g. req1 was an rw request completed in an irq.
> 
> Hmm yes, good point, that is a problem. I don't have a good immediate
> solution for this. Do you have any suggestions on how best to handle
> this?

I certainly don't want another REQ_F_GRABBED_ENV flag :)

From the start I was planning to move all grab_env() calls to
io_queue_async_work(), just before punting. Something like:

io_queue_async_work(req) {
	// simplified
	for_each_in_link(req)
		grab_env();
	...
}

If done right, this can solve a lot of problems and simplify
lifetime management. There are many more problems; I'll send
a patchset with quick fixes, and then we can do it right
without hurrying.
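
Spelling that out a bit more, it could look like the sketch below, where
io_prep_async_link() is just a name I'm making up for illustration,
called from io_queue_async_work() before the request is punted:

static void io_prep_async_link(struct io_kiocb *req)
{
	struct io_kiocb *cur;

	/* set up ->work for the request and everything linked to it
	 * in one place, right before it's handed to io-wq
	 */
	io_req_init_async(req);
	io_req_work_grab_env(req, &io_op_defs[req->opcode]);

	list_for_each_entry(cur, &req->link_list, link_list) {
		io_req_init_async(cur);
		io_req_work_grab_env(cur, &io_op_defs[cur->opcode]);
	}
}

That would keep all of the ->work setup in one spot instead of relying
on lazy init scattered across the prep handlers.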

> 
>> Not sure it's related, but fallocate shows the log below, and some
>> other tests hang the kernel as well.
> 
> Yeah, that's indeed that very thing.

Turns out it's not.

> 
>>> True, that could be false instead.
>>>
>>> Since these are just minor things, we can do a fix on top. I don't want
>>> to reshuffle this unless I have to.
>>
>> Agree, I have a pile on top myself.
> 
> Fire away :-)

I prefer to have a working branch first.


-- 
Pavel Begunkov


end of thread, other threads:[~2020-06-27 10:58 UTC | newest]

Thread overview: 9+ messages
2020-06-25 18:27 [PATCH] io_uring: use task_work for links if possible Jens Axboe
2020-06-25 20:28 ` Pavel Begunkov
2020-06-25 21:37   ` Jens Axboe
2020-06-26  9:41     ` Pavel Begunkov
2020-06-26 20:27 ` Pavel Begunkov
2020-06-26 20:43   ` Jens Axboe
2020-06-26 21:20     ` Pavel Begunkov
2020-06-27  1:45       ` Jens Axboe
2020-06-27 10:57         ` Pavel Begunkov
