io-uring.vger.kernel.org archive mirror
* [PATCH 0/8] a second pack of 5.12 cleanups
From: Pavel Begunkov @ 2021-02-02  0:21 UTC
  To: Jens Axboe, io-uring

These are a bit harder to look through.

Patches 4-8 try to simplify io_read(). 7/8 looks like a real problem, but
I'm not sure it can even happen with IOCB_WAITQ set.

Pavel Begunkov (8):
  io_uring: deduplicate core cancellations sequence
  io_uring: refactor scheduling in io_cqring_wait
  io_uring: refactor io_cqring_wait
  io_uring: refactor io_read for unsupported nowait
  io_uring: further simplify do_read error parsing
  io_uring: let io_setup_async_rw take care of iovec
  io_uring: don't forget to adjust io_size
  io_uring: inline io_read()'s iovec freeing

 fs/io_uring.c | 215 +++++++++++++++++++++++---------------------------
 1 file changed, 97 insertions(+), 118 deletions(-)

-- 
2.24.0



* [PATCH 1/8] io_uring: deduplicate core cancellations sequence
From: Pavel Begunkov @ 2021-02-02  0:21 UTC
  To: Jens Axboe, io-uring

Files and task cancellations go over the same steps trying to cancel
requests in io-wq, poll, etc. Deduplicate them with a helper.

Note: the new io_uring_try_cancel_requests() is the former
__io_uring_cancel_task_requests() with files passed as an argument and
with flushing of overflowed requests added.
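
After this patch all three cancellation paths funnel through the same
helper; the call sites, taken from the diff below, become:

	io_uring_try_cancel_requests(ctx, NULL, NULL);   /* io_ring_exit_work()        */
	io_uring_try_cancel_requests(ctx, task, files);  /* io_uring_cancel_files()    */
	io_uring_try_cancel_requests(ctx, task, NULL);   /* task cancellation (!files) */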

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 85 ++++++++++++++++++++++++---------------------------
 1 file changed, 40 insertions(+), 45 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 24ad36d71289..a750c504366d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1003,9 +1003,9 @@ enum io_mem_account {
 	ACCT_PINNED,
 };
 
-static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
-					    struct task_struct *task);
-
+static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
+					 struct task_struct *task,
+					 struct files_struct *files);
 static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node);
 static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
 			struct io_ring_ctx *ctx);
@@ -8817,7 +8817,7 @@ static void io_ring_exit_work(struct work_struct *work)
 	 * as nobody else will be looking for them.
 	 */
 	do {
-		__io_uring_cancel_task_requests(ctx, NULL);
+		io_uring_try_cancel_requests(ctx, NULL, NULL);
 	} while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
 	io_ring_ctx_free(ctx);
 }
@@ -8931,6 +8931,40 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx,
 	}
 }
 
+static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
+					 struct task_struct *task,
+					 struct files_struct *files)
+{
+	struct io_task_cancel cancel = { .task = task, .files = files, };
+
+	while (1) {
+		enum io_wq_cancel cret;
+		bool ret = false;
+
+		if (ctx->io_wq) {
+			cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb,
+					       &cancel, true);
+			ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
+		}
+
+		/* SQPOLL thread does its own polling */
+		if (!(ctx->flags & IORING_SETUP_SQPOLL) && !files) {
+			while (!list_empty_careful(&ctx->iopoll_list)) {
+				io_iopoll_try_reap_events(ctx);
+				ret = true;
+			}
+		}
+
+		ret |= io_poll_remove_all(ctx, task, files);
+		ret |= io_kill_timeouts(ctx, task, files);
+		ret |= io_run_task_work();
+		io_cqring_overflow_flush(ctx, true, task, files);
+		if (!ret)
+			break;
+		cond_resched();
+	}
+}
+
 static int io_uring_count_inflight(struct io_ring_ctx *ctx,
 				   struct task_struct *task,
 				   struct files_struct *files)
@@ -8950,7 +8984,6 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
 				  struct files_struct *files)
 {
 	while (!list_empty_careful(&ctx->inflight_list)) {
-		struct io_task_cancel cancel = { .task = task, .files = files };
 		DEFINE_WAIT(wait);
 		int inflight;
 
@@ -8958,13 +8991,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
 		if (!inflight)
 			break;
 
-		io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, &cancel, true);
-		io_poll_remove_all(ctx, task, files);
-		io_kill_timeouts(ctx, task, files);
-		io_cqring_overflow_flush(ctx, true, task, files);
-		/* cancellations _may_ trigger task work */
-		io_run_task_work();
-
+		io_uring_try_cancel_requests(ctx, task, files);
 		prepare_to_wait(&task->io_uring->wait, &wait,
 				TASK_UNINTERRUPTIBLE);
 		if (inflight == io_uring_count_inflight(ctx, task, files))
@@ -8973,37 +9000,6 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
 	}
 }
 
-static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
-					    struct task_struct *task)
-{
-	while (1) {
-		struct io_task_cancel cancel = { .task = task, .files = NULL, };
-		enum io_wq_cancel cret;
-		bool ret = false;
-
-		if (ctx->io_wq) {
-			cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb,
-					       &cancel, true);
-			ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
-		}
-
-		/* SQPOLL thread does its own polling */
-		if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
-			while (!list_empty_careful(&ctx->iopoll_list)) {
-				io_iopoll_try_reap_events(ctx);
-				ret = true;
-			}
-		}
-
-		ret |= io_poll_remove_all(ctx, task, NULL);
-		ret |= io_kill_timeouts(ctx, task, NULL);
-		ret |= io_run_task_work();
-		if (!ret)
-			break;
-		cond_resched();
-	}
-}
-
 static void io_disable_sqo_submit(struct io_ring_ctx *ctx)
 {
 	mutex_lock(&ctx->uring_lock);
@@ -9033,11 +9029,10 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
 	}
 
 	io_cancel_defer_files(ctx, task, files);
-	io_cqring_overflow_flush(ctx, true, task, files);
 
 	io_uring_cancel_files(ctx, task, files);
 	if (!files)
-		__io_uring_cancel_task_requests(ctx, task);
+		io_uring_try_cancel_requests(ctx, task, NULL);
 
 	if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
 		atomic_dec(&task->io_uring->in_idle);
-- 
2.24.0



* [PATCH 2/8] io_uring: refactor scheduling in io_cqring_wait
From: Pavel Begunkov @ 2021-02-02  0:21 UTC
  To: Jens Axboe, io-uring

schedule_timeout() with timeout=MAX_SCHEDULE_TIMEOUT is guaranteed to
behave just like schedule(), so instead of choosing between the two
based on the arguments, always use the timeout version and simplify
the code.
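
For reference, schedule_timeout() special-cases MAX_SCHEDULE_TIMEOUT
along these lines (a sketch of kernel/time/timer.c with the
timer-arming details elided):

	signed long __sched schedule_timeout(signed long timeout)
	{
		switch (timeout) {
		case MAX_SCHEDULE_TIMEOUT:
			/* no timer armed: behaves exactly like schedule() */
			schedule();
			goto out;
		default:
			/* arm a timer, schedule(), compute the remaining
			 * jiffies (elided) */
			break;
		}
	out:
		return timeout < 0 ? 0 : timeout;
	}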

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index a750c504366d..5b735635b8f0 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7213,9 +7213,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 		.to_wait	= min_events,
 	};
 	struct io_rings *rings = ctx->rings;
-	struct timespec64 ts;
-	signed long timeout = 0;
-	int ret = 0;
+	signed long timeout = MAX_SCHEDULE_TIMEOUT;
+	int ret;
 
 	do {
 		io_cqring_overflow_flush(ctx, false, NULL, NULL);
@@ -7239,6 +7238,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 	}
 
 	if (uts) {
+		struct timespec64 ts;
+
 		if (get_timespec64(&ts, uts))
 			return -EFAULT;
 		timeout = timespec64_to_jiffies(&ts);
@@ -7264,14 +7265,10 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 			finish_wait(&ctx->wait, &iowq.wq);
 			continue;
 		}
-		if (uts) {
-			timeout = schedule_timeout(timeout);
-			if (timeout == 0) {
-				ret = -ETIME;
-				break;
-			}
-		} else {
-			schedule();
+		timeout = schedule_timeout(timeout);
+		if (timeout == 0) {
+			ret = -ETIME;
+			break;
 		}
 	} while (1);
 	finish_wait(&ctx->wait, &iowq.wq);
-- 
2.24.0



* [PATCH 3/8] io_uring: refactor io_cqring_wait
From: Pavel Begunkov @ 2021-02-02  0:21 UTC
  To: Jens Axboe, io-uring

It's easy to make a mistake in io_cqring_wait() because every
break/continue clause needs prepare_to_wait()/finish_wait() to be
paired correctly. Extract all of that into a new helper,
io_cqring_wait_schedule(), transforming the loop into a simple series
of calls: prepare(); check_and_schedule(); finish();
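
The resulting loop, lifted from the hunk below, reads straight through:

	do {
		io_cqring_overflow_flush(ctx, false, NULL, NULL);
		prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
						TASK_INTERRUPTIBLE);
		ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
		finish_wait(&ctx->wait, &iowq.wq);
	} while (ret > 0);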

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 43 ++++++++++++++++++++++---------------------
 1 file changed, 22 insertions(+), 21 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5b735635b8f0..dcb9e937daa3 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7195,6 +7195,25 @@ static int io_run_task_work_sig(void)
 	return -EINTR;
 }
 
+/* when returns >0, the caller should retry */
+static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
+					  struct io_wait_queue *iowq,
+					  signed long *timeout)
+{
+	int ret;
+
+	/* make sure we run task_work before checking for signals */
+	ret = io_run_task_work_sig();
+	if (ret || io_should_wake(iowq))
+		return ret;
+	/* let the caller flush overflows, retry */
+	if (test_bit(0, &ctx->cq_check_overflow))
+		return 1;
+
+	*timeout = schedule_timeout(*timeout);
+	return !*timeout ? -ETIME : 1;
+}
+
 /*
  * Wait until events become available, if we don't already have some. The
  * application must reap them itself, as they reside on the shared cq ring.
@@ -7251,27 +7270,9 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 		io_cqring_overflow_flush(ctx, false, NULL, NULL);
 		prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
 						TASK_INTERRUPTIBLE);
-		/* make sure we run task_work before checking for signals */
-		ret = io_run_task_work_sig();
-		if (ret > 0) {
-			finish_wait(&ctx->wait, &iowq.wq);
-			continue;
-		}
-		else if (ret < 0)
-			break;
-		if (io_should_wake(&iowq))
-			break;
-		if (test_bit(0, &ctx->cq_check_overflow)) {
-			finish_wait(&ctx->wait, &iowq.wq);
-			continue;
-		}
-		timeout = schedule_timeout(timeout);
-		if (timeout == 0) {
-			ret = -ETIME;
-			break;
-		}
-	} while (1);
-	finish_wait(&ctx->wait, &iowq.wq);
+		ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
+		finish_wait(&ctx->wait, &iowq.wq);
+	} while (ret > 0);
 
 	restore_saved_sigmask_unless(ret == -EINTR);
 
-- 
2.24.0



* [PATCH 4/8] io_uring: refactor io_read for unsupported nowait
From: Pavel Begunkov @ 2021-02-02  0:21 UTC
  To: Jens Axboe, io-uring

The !io_file_supports_async() case of io_read() is hard to read: it
jumps to the middle of the function just to do the async setup and
then fail on a similar check. Call io_setup_async_rw() directly for
this case instead; it's much easier to follow.
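
For contrast, the old flow scattered the punt across the function;
pieced together from the lines removed here and in the following
patches, it looked roughly like:

	no_async = force_nonblock && !io_file_supports_async(req->file, READ);
	if (no_async)
		goto copy_iov;		/* jump deep into the function...    */
	...
copy_iov:				/* label shared with other paths     */
	ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
	...
	if (no_async)
		return -EAGAIN;		/* ...only to bail out down here     */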

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index dcb9e937daa3..866e0ea83dbe 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3506,7 +3506,6 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	struct iov_iter __iter, *iter = &__iter;
 	struct io_async_rw *rw = req->async_data;
 	ssize_t io_size, ret, ret2;
-	bool no_async;
 
 	if (rw) {
 		iter = &rw->iter;
@@ -3527,9 +3526,12 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 		kiocb->ki_flags |= IOCB_NOWAIT;
 
 	/* If the file doesn't support async, just async punt */
-	no_async = force_nonblock && !io_file_supports_async(req->file, READ);
-	if (no_async)
-		goto copy_iov;
+	if (force_nonblock && !io_file_supports_async(req->file, READ)) {
+		ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
+		if (!ret)
+			return -EAGAIN;
+		goto out_free;
+	}
 
 	ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
 	if (unlikely(ret))
@@ -3568,8 +3570,6 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 		ret = ret2;
 		goto out_free;
 	}
-	if (no_async)
-		return -EAGAIN;
 	rw = req->async_data;
 	/* it's copied and will be cleaned with ->io */
 	iovec = NULL;
-- 
2.24.0



* [PATCH 5/8] io_uring: further simplify do_read error parsing
From: Pavel Begunkov @ 2021-02-02  0:21 UTC
  To: Jens Axboe, io-uring

First, instead of checking iov_iter_count(iter) for 0 to find out
whether all requested bytes were read, just compare the return code
against io_size. It's more reliable and arguably cleaner.

Also, place the half-read case into an else branch and delete an extra
label.
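
Condensed from the hunk below, the error parsing becomes a single
if/else ladder:

	if (ret == -EAGAIN) {
		/* nowait punt: revert the iter, fall through to retry setup */
	} else if (ret <= 0 || ret == io_size) {
		goto done;	/* error, or read it all */
	} else {
		/* half-read; a blocking attempt means no retry */
		if (!force_nonblock || (req->file->f_flags & O_NONBLOCK) ||
		    !(req->flags & REQ_F_ISREG))
			goto done;
		io_size -= ret;	/* fall through to retry setup */
	}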

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 866e0ea83dbe..1d1fa1f77332 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3552,19 +3552,18 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 		/* some cases will consume bytes even on error returns */
 		iov_iter_revert(iter, io_size - iov_iter_count(iter));
 		ret = 0;
-		goto copy_iov;
-	} else if (ret <= 0) {
+	} else if (ret <= 0 || ret == io_size) {
 		/* make sure -ERESTARTSYS -> -EINTR is done */
 		goto done;
-	}
+	} else {
+		/* we did blocking attempt. no retry. */
+		if (!force_nonblock || (req->file->f_flags & O_NONBLOCK) ||
+		    !(req->flags & REQ_F_ISREG))
+			goto done;
 
-	/* read it all, or we did blocking attempt. no retry. */
-	if (!iov_iter_count(iter) || !force_nonblock ||
-	    (req->file->f_flags & O_NONBLOCK) || !(req->flags & REQ_F_ISREG))
-		goto done;
+		io_size -= ret;
+	}
 
-	io_size -= ret;
-copy_iov:
 	ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
 	if (ret2) {
 		ret = ret2;
-- 
2.24.0



* [PATCH 6/8] io_uring: let io_setup_async_rw take care of iovec
From: Pavel Begunkov @ 2021-02-02  0:21 UTC
  To: Jens Axboe, io-uring

Hand ownership of the iovec over to io_setup_async_rw(), so that it
either sets up the request's async context correctly or frees the
iovec itself on error. That makes life a bit easier at the call sites.
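
The resulting call-site pattern, taken from the hunks below:

	/* io_setup_async_rw() now owns @iovec: it either stashes it in
	 * the request's async data or kfree()s it on failure, so the
	 * caller just propagates the error code */
	ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
	return ret ?: -EAGAIN;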

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 1d1fa1f77332..f8492d62b6a1 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2721,11 +2721,7 @@ static bool io_resubmit_prep(struct io_kiocb *req)
 	ret = io_import_iovec(rw, req, &iovec, &iter, false);
 	if (ret < 0)
 		return false;
-	ret = io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
-	if (!ret)
-		return true;
-	kfree(iovec);
-	return false;
+	return !io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
 }
 #endif
 
@@ -3366,8 +3362,10 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
 	if (!force && !io_op_defs[req->opcode].needs_async_data)
 		return 0;
 	if (!req->async_data) {
-		if (__io_alloc_async_data(req))
+		if (__io_alloc_async_data(req)) {
+			kfree(iovec);
 			return -ENOMEM;
+		}
 
 		io_req_map_rw(req, iovec, fast_iov, iter);
 	}
@@ -3528,9 +3526,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	/* If the file doesn't support async, just async punt */
 	if (force_nonblock && !io_file_supports_async(req->file, READ)) {
 		ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
-		if (!ret)
-			return -EAGAIN;
-		goto out_free;
+		return ret ?: -EAGAIN;
 	}
 
 	ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
@@ -3565,10 +3561,9 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	}
 
 	ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
-	if (ret2) {
-		ret = ret2;
-		goto out_free;
-	}
+	if (ret2)
+		return ret2;
+
 	rw = req->async_data;
 	/* it's copied and will be cleaned with ->io */
 	iovec = NULL;
@@ -3703,8 +3698,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
 		/* some cases will consume bytes even on error returns */
 		iov_iter_revert(iter, io_size - iov_iter_count(iter));
 		ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
-		if (!ret)
-			return -EAGAIN;
+		return ret ?: -EAGAIN;
 	}
 out_free:
 	/* it's reportedly faster than delegating the null check to kfree() */
-- 
2.24.0



* [PATCH 7/8] io_uring: don't forget to adjust io_size
From: Pavel Begunkov @ 2021-02-02  0:21 UTC
  To: Jens Axboe, io-uring

io_read() keeps an invariant: how much we are still trying to read is
mirrored in both the iter and the io_size variable, and the latter
drives the decision making about whether to do read retries. However,
io_size is adjusted only after the first read attempt, so if we happen
to go for a third retry in a single call to io_read(), io_size ends up
greater than what the iterator holds, which may lead to various side
effects up to live-locking.

Adjust io_size on each retry instead.
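
A walkthrough of the stale-io_size problem, with invented numbers for
illustration only:

	/* 12KB request; suppose every attempt returns 4KB */
	ret = io_iter_do_read(req, iter);	/* ret = 4KB               */
	io_size -= ret;				/* io_size = 8KB, correct  */
retry:
	ret = io_iter_do_read(req, iter);	/* ret = 4KB, 4KB left     */
	if (ret > 0 && ret < io_size)		/* 4KB < 8KB: io_size is   */
		goto retry;			/* stale, spurious retries */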

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index f8492d62b6a1..3e648c0e6b8d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3551,13 +3551,10 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	} else if (ret <= 0 || ret == io_size) {
 		/* make sure -ERESTARTSYS -> -EINTR is done */
 		goto done;
-	} else {
+	} else if (!force_nonblock || (req->file->f_flags & O_NONBLOCK) ||
+		   !(req->flags & REQ_F_ISREG)) {
 		/* we did blocking attempt. no retry. */
-		if (!force_nonblock || (req->file->f_flags & O_NONBLOCK) ||
-		    !(req->flags & REQ_F_ISREG))
-			goto done;
-
-		io_size -= ret;
+		goto done;
 	}
 
 	ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
@@ -3570,6 +3567,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	/* now use our persistent iterator, if we aren't already */
 	iter = &rw->iter;
 retry:
+	io_size -= ret;
 	rw->bytes_done += ret;
 	/* if we can retry, do so with the callbacks armed */
 	if (!io_rw_should_retry(req)) {
-- 
2.24.0



* [PATCH 8/8] io_uring: inline io_read()'s iovec freeing
From: Pavel Begunkov @ 2021-02-02  0:21 UTC
  To: Jens Axboe, io-uring

io_read() does not have the simplest control flow, with a lot of jumps,
and it's hard to read. One of those jumps is to the out_free: label,
which frees the iovec. However, from the middle of io_read() onwards
iovec is NULL'ed, so kfree(iovec) becomes a no-op; that leaves us with
two places where we can inline it and further clean up the code.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 31 +++++++++++++------------------
 1 file changed, 13 insertions(+), 18 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 3e648c0e6b8d..17a3736661c6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3530,14 +3530,18 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	}
 
 	ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
-	if (unlikely(ret))
-		goto out_free;
+	if (unlikely(ret)) {
+		kfree(iovec);
+		return ret;
+	}
 
 	ret = io_iter_do_read(req, iter);
 
 	if (ret == -EIOCBQUEUED) {
-		ret = 0;
-		goto out_free;
+		/* it's faster to check here than delegate to kfree */
+		if (iovec)
+			kfree(iovec);
+		return 0;
 	} else if (ret == -EAGAIN) {
 		/* IOPOLL retry should happen for io-wq threads */
 		if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
@@ -3562,8 +3566,6 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 		return ret2;
 
 	rw = req->async_data;
-	/* it's copied and will be cleaned with ->io */
-	iovec = NULL;
 	/* now use our persistent iterator, if we aren't already */
 	iter = &rw->iter;
 retry:
@@ -3582,21 +3584,14 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	 * do, then just retry at the new offset.
 	 */
 	ret = io_iter_do_read(req, iter);
-	if (ret == -EIOCBQUEUED) {
-		ret = 0;
-		goto out_free;
-	} else if (ret > 0 && ret < io_size) {
-		/* we got some bytes, but not all. retry. */
+	if (ret == -EIOCBQUEUED)
+		return 0;
+	/* we got some bytes, but not all. retry. */
+	if (ret > 0 && ret < io_size)
 		goto retry;
-	}
 done:
 	kiocb_done(kiocb, ret, cs);
-	ret = 0;
-out_free:
-	/* it's reportedly faster than delegating the null check to kfree() */
-	if (iovec)
-		kfree(iovec);
-	return ret;
+	return 0;
 }
 
 static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-- 
2.24.0



* Re: [PATCH 0/8] a second pack of 5.12 cleanups
From: Pavel Begunkov @ 2021-02-04 13:52 UTC
  To: Jens Axboe, io-uring

On 02/02/2021 00:21, Pavel Begunkov wrote:
> These are a bit harder to look through.
> 
> Patches 4-8 try to simplify io_read(). 7/8 looks like a real problem, but
> I'm not sure it can even happen with IOCB_WAITQ set.

An extended v2 has been sent.


-- 
Pavel Begunkov
