* [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups
@ 2021-02-04 13:51 Pavel Begunkov
  2021-02-04 13:51 ` [PATCH v2 01/13] io_uring: deduplicate core cancellations sequence Pavel Begunkov
                   ` (13 more replies)
  0 siblings, 14 replies; 18+ messages in thread
From: Pavel Begunkov @ 2021-02-04 13:51 UTC (permalink / raw)
  To: Jens Axboe, io-uring

A bunch of assorted cleanups, a noticeable part of which (4-9/13)
refactors io_read(), which is currently not in the best shape and is
hard to understand.

7/13 may actually fix a problem.
10/13 should make NOWAIT and NONBLOCK work correctly.

v2: add 9-13/13

Pavel Begunkov (13):
  io_uring: deduplicate core cancellations sequence
  io_uring: refactor scheduling in io_cqring_wait
  io_uring: refactor io_cqring_wait
  io_uring: refactor io_read for unsupported nowait
  io_uring: further simplify do_read error parsing
  io_uring: let io_setup_async_rw take care of iovec
  io_uring: don't forget to adjust io_size
  io_uring: inline io_read()'s iovec freeing
  io_uring: highlight read-retry loop
  io_uring: treat NONBLOCK and RWF_NOWAIT similarly
  io_uring: io_import_iovec return type cleanup
  io_uring: deduplicate file table slot calculation
  io_uring/io-wq: return 2-step work swap scheme

 fs/io-wq.c    |  16 +--
 fs/io-wq.h    |   4 +-
 fs/io_uring.c | 366 ++++++++++++++++++++++----------------------------
 3 files changed, 166 insertions(+), 220 deletions(-)

-- 
2.24.0


* [PATCH v2 01/13] io_uring: deduplicate core cancellations sequence
  2021-02-04 13:51 [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups Pavel Begunkov
@ 2021-02-04 13:51 ` Pavel Begunkov
  2021-02-04 13:51 ` [PATCH v2 02/13] io_uring: refactor scheduling in io_cqring_wait Pavel Begunkov
                   ` (12 subsequent siblings)
  13 siblings, 0 replies; 18+ messages in thread
From: Pavel Begunkov @ 2021-02-04 13:51 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Files and task cancellations go over the same steps trying to cancel
requests in io-wq, poll, etc. Deduplicate them with a helper.

note: the new io_uring_try_cancel_requests() is the former
__io_uring_cancel_task_requests() with files passed as an argument and
with flushing of overflowed requests added.
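
For readability, a condensed sketch of the helper's shape (lifted from
the hunk below; cancel_io_wq_requests() is only shorthand here for the
io_wq_cancel_cb() call): keep making cancellation passes until a pass
finds nothing left to cancel.

	while (1) {
		bool ret = false;

		ret |= cancel_io_wq_requests(ctx, task, files);
		ret |= io_poll_remove_all(ctx, task, files);
		ret |= io_kill_timeouts(ctx, task, files);
		ret |= io_run_task_work();
		io_cqring_overflow_flush(ctx, true, task, files);
		if (!ret)		/* nothing cancelled this pass */
			break;
		cond_resched();
	}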

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 85 ++++++++++++++++++++++++---------------------------
 1 file changed, 40 insertions(+), 45 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 24ad36d71289..a750c504366d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1003,9 +1003,9 @@ enum io_mem_account {
 	ACCT_PINNED,
 };
 
-static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
-					    struct task_struct *task);
-
+static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
+					 struct task_struct *task,
+					 struct files_struct *files);
 static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node);
 static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
 			struct io_ring_ctx *ctx);
@@ -8817,7 +8817,7 @@ static void io_ring_exit_work(struct work_struct *work)
 	 * as nobody else will be looking for them.
 	 */
 	do {
-		__io_uring_cancel_task_requests(ctx, NULL);
+		io_uring_try_cancel_requests(ctx, NULL, NULL);
 	} while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
 	io_ring_ctx_free(ctx);
 }
@@ -8931,6 +8931,40 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx,
 	}
 }
 
+static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
+					 struct task_struct *task,
+					 struct files_struct *files)
+{
+	struct io_task_cancel cancel = { .task = task, .files = files, };
+
+	while (1) {
+		enum io_wq_cancel cret;
+		bool ret = false;
+
+		if (ctx->io_wq) {
+			cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb,
+					       &cancel, true);
+			ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
+		}
+
+		/* SQPOLL thread does its own polling */
+		if (!(ctx->flags & IORING_SETUP_SQPOLL) && !files) {
+			while (!list_empty_careful(&ctx->iopoll_list)) {
+				io_iopoll_try_reap_events(ctx);
+				ret = true;
+			}
+		}
+
+		ret |= io_poll_remove_all(ctx, task, files);
+		ret |= io_kill_timeouts(ctx, task, files);
+		ret |= io_run_task_work();
+		io_cqring_overflow_flush(ctx, true, task, files);
+		if (!ret)
+			break;
+		cond_resched();
+	}
+}
+
 static int io_uring_count_inflight(struct io_ring_ctx *ctx,
 				   struct task_struct *task,
 				   struct files_struct *files)
@@ -8950,7 +8984,6 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
 				  struct files_struct *files)
 {
 	while (!list_empty_careful(&ctx->inflight_list)) {
-		struct io_task_cancel cancel = { .task = task, .files = files };
 		DEFINE_WAIT(wait);
 		int inflight;
 
@@ -8958,13 +8991,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
 		if (!inflight)
 			break;
 
-		io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, &cancel, true);
-		io_poll_remove_all(ctx, task, files);
-		io_kill_timeouts(ctx, task, files);
-		io_cqring_overflow_flush(ctx, true, task, files);
-		/* cancellations _may_ trigger task work */
-		io_run_task_work();
-
+		io_uring_try_cancel_requests(ctx, task, files);
 		prepare_to_wait(&task->io_uring->wait, &wait,
 				TASK_UNINTERRUPTIBLE);
 		if (inflight == io_uring_count_inflight(ctx, task, files))
@@ -8973,37 +9000,6 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
 	}
 }
 
-static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
-					    struct task_struct *task)
-{
-	while (1) {
-		struct io_task_cancel cancel = { .task = task, .files = NULL, };
-		enum io_wq_cancel cret;
-		bool ret = false;
-
-		if (ctx->io_wq) {
-			cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb,
-					       &cancel, true);
-			ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
-		}
-
-		/* SQPOLL thread does its own polling */
-		if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
-			while (!list_empty_careful(&ctx->iopoll_list)) {
-				io_iopoll_try_reap_events(ctx);
-				ret = true;
-			}
-		}
-
-		ret |= io_poll_remove_all(ctx, task, NULL);
-		ret |= io_kill_timeouts(ctx, task, NULL);
-		ret |= io_run_task_work();
-		if (!ret)
-			break;
-		cond_resched();
-	}
-}
-
 static void io_disable_sqo_submit(struct io_ring_ctx *ctx)
 {
 	mutex_lock(&ctx->uring_lock);
@@ -9033,11 +9029,10 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
 	}
 
 	io_cancel_defer_files(ctx, task, files);
-	io_cqring_overflow_flush(ctx, true, task, files);
 
 	io_uring_cancel_files(ctx, task, files);
 	if (!files)
-		__io_uring_cancel_task_requests(ctx, task);
+		io_uring_try_cancel_requests(ctx, task, NULL);
 
 	if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
 		atomic_dec(&task->io_uring->in_idle);
-- 
2.24.0


* [PATCH v2 02/13] io_uring: refactor scheduling in io_cqring_wait
  2021-02-04 13:51 [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups Pavel Begunkov
  2021-02-04 13:51 ` [PATCH v2 01/13] io_uring: deduplicate core cancellations sequence Pavel Begunkov
@ 2021-02-04 13:51 ` Pavel Begunkov
  2021-02-04 13:51 ` [PATCH v2 03/13] io_uring: refactor io_cqring_wait Pavel Begunkov
                   ` (11 subsequent siblings)
  13 siblings, 0 replies; 18+ messages in thread
From: Pavel Begunkov @ 2021-02-04 13:51 UTC (permalink / raw)
  To: Jens Axboe, io-uring

schedule_timeout() with timeout=MAX_SCHEDULE_TIMEOUT is guaranteed to
behave just like schedule(), so instead of hand-picking one or the
other based on the arguments, always use the timeout version and
simplify the code.
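
For context, a paraphrased sketch (not part of the patch) of why the
sentinel is safe: schedule_timeout() in kernel/time/timer.c
special-cases MAX_SCHEDULE_TIMEOUT, so it arms no timer and can never
return 0 in that case; sleep_with_timer() below is a hypothetical
stand-in for the real timer path.

	signed long schedule_timeout(signed long timeout)
	{
		if (timeout == MAX_SCHEDULE_TIMEOUT) {
			schedule();	/* plain schedule(), no timer armed */
			return timeout;	/* never 0, so -ETIME can't fire */
		}
		/* arm a timer for 'timeout' jiffies, schedule(), and return
		 * the jiffies remaining (0 if the timer expired) */
		return sleep_with_timer(timeout);
	}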

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index a750c504366d..5b735635b8f0 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7213,9 +7213,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 		.to_wait	= min_events,
 	};
 	struct io_rings *rings = ctx->rings;
-	struct timespec64 ts;
-	signed long timeout = 0;
-	int ret = 0;
+	signed long timeout = MAX_SCHEDULE_TIMEOUT;
+	int ret;
 
 	do {
 		io_cqring_overflow_flush(ctx, false, NULL, NULL);
@@ -7239,6 +7238,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 	}
 
 	if (uts) {
+		struct timespec64 ts;
+
 		if (get_timespec64(&ts, uts))
 			return -EFAULT;
 		timeout = timespec64_to_jiffies(&ts);
@@ -7264,14 +7265,10 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 			finish_wait(&ctx->wait, &iowq.wq);
 			continue;
 		}
-		if (uts) {
-			timeout = schedule_timeout(timeout);
-			if (timeout == 0) {
-				ret = -ETIME;
-				break;
-			}
-		} else {
-			schedule();
+		timeout = schedule_timeout(timeout);
+		if (timeout == 0) {
+			ret = -ETIME;
+			break;
 		}
 	} while (1);
 	finish_wait(&ctx->wait, &iowq.wq);
-- 
2.24.0


* [PATCH v2 03/13] io_uring: refactor io_cqring_wait
  2021-02-04 13:51 [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups Pavel Begunkov
  2021-02-04 13:51 ` [PATCH v2 01/13] io_uring: deduplicate core cancellations sequence Pavel Begunkov
  2021-02-04 13:51 ` [PATCH v2 02/13] io_uring: refactor scheduling in io_cqring_wait Pavel Begunkov
@ 2021-02-04 13:51 ` Pavel Begunkov
  2021-02-04 13:51 ` [PATCH v2 04/13] io_uring: refactor io_read for unsupported nowait Pavel Begunkov
                   ` (10 subsequent siblings)
  13 siblings, 0 replies; 18+ messages in thread
From: Pavel Begunkov @ 2021-02-04 13:51 UTC (permalink / raw)
  To: Jens Axboe, io-uring

It's easy to make a mistake in io_cqring_wait() because every
break/continue clause has to keep prepare_to_wait()/finish_wait()
correctly paired. Extract all of that into a new helper,
io_cqring_wait_schedule(), transforming the loop into a simple series
of function calls: prepare(); check_and_schedule(); finish();
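
A condensed view of the resulting loop (taken from the hunk below), to
show the helper's return convention: <0 is an error, 0 means we should
wake up, >0 asks the caller to loop again.

	do {
		io_cqring_overflow_flush(ctx, false, NULL, NULL);
		prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
					  TASK_INTERRUPTIBLE);
		ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
		finish_wait(&ctx->wait, &iowq.wq);	/* paired every pass */
	} while (ret > 0);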

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 43 ++++++++++++++++++++++---------------------
 1 file changed, 22 insertions(+), 21 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5b735635b8f0..dcb9e937daa3 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7195,6 +7195,25 @@ static int io_run_task_work_sig(void)
 	return -EINTR;
 }
 
+/* when returns >0, the caller should retry */
+static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
+					  struct io_wait_queue *iowq,
+					  signed long *timeout)
+{
+	int ret;
+
+	/* make sure we run task_work before checking for signals */
+	ret = io_run_task_work_sig();
+	if (ret || io_should_wake(iowq))
+		return ret;
+	/* let the caller flush overflows, retry */
+	if (test_bit(0, &ctx->cq_check_overflow))
+		return 1;
+
+	*timeout = schedule_timeout(*timeout);
+	return !*timeout ? -ETIME : 1;
+}
+
 /*
  * Wait until events become available, if we don't already have some. The
  * application must reap them itself, as they reside on the shared cq ring.
@@ -7251,27 +7270,9 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 		io_cqring_overflow_flush(ctx, false, NULL, NULL);
 		prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
 						TASK_INTERRUPTIBLE);
-		/* make sure we run task_work before checking for signals */
-		ret = io_run_task_work_sig();
-		if (ret > 0) {
-			finish_wait(&ctx->wait, &iowq.wq);
-			continue;
-		}
-		else if (ret < 0)
-			break;
-		if (io_should_wake(&iowq))
-			break;
-		if (test_bit(0, &ctx->cq_check_overflow)) {
-			finish_wait(&ctx->wait, &iowq.wq);
-			continue;
-		}
-		timeout = schedule_timeout(timeout);
-		if (timeout == 0) {
-			ret = -ETIME;
-			break;
-		}
-	} while (1);
-	finish_wait(&ctx->wait, &iowq.wq);
+		ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
+		finish_wait(&ctx->wait, &iowq.wq);
+	} while (ret > 0);
 
 	restore_saved_sigmask_unless(ret == -EINTR);
 
-- 
2.24.0


* [PATCH v2 04/13] io_uring: refactor io_read for unsupported nowait
  2021-02-04 13:51 [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups Pavel Begunkov
                   ` (2 preceding siblings ...)
  2021-02-04 13:51 ` [PATCH v2 03/13] io_uring: refactor io_cqring_wait Pavel Begunkov
@ 2021-02-04 13:51 ` Pavel Begunkov
  2021-02-04 13:52 ` [PATCH v2 05/13] io_uring: further simplify do_read error parsing Pavel Begunkov
                   ` (9 subsequent siblings)
  13 siblings, 0 replies; 18+ messages in thread
From: Pavel Begunkov @ 2021-02-04 13:51 UTC (permalink / raw)
  To: Jens Axboe, io-uring

The !io_file_supports_async() case of io_read() is hard to read: it
jumps somewhere into the middle of the function just to do the async
setup and then fail on a similar check. Call io_setup_async_rw()
directly for this case, which is much easier to follow.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index dcb9e937daa3..866e0ea83dbe 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3506,7 +3506,6 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	struct iov_iter __iter, *iter = &__iter;
 	struct io_async_rw *rw = req->async_data;
 	ssize_t io_size, ret, ret2;
-	bool no_async;
 
 	if (rw) {
 		iter = &rw->iter;
@@ -3527,9 +3526,12 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 		kiocb->ki_flags |= IOCB_NOWAIT;
 
 	/* If the file doesn't support async, just async punt */
-	no_async = force_nonblock && !io_file_supports_async(req->file, READ);
-	if (no_async)
-		goto copy_iov;
+	if (force_nonblock && !io_file_supports_async(req->file, READ)) {
+		ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
+		if (!ret)
+			return -EAGAIN;
+		goto out_free;
+	}
 
 	ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
 	if (unlikely(ret))
@@ -3568,8 +3570,6 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 		ret = ret2;
 		goto out_free;
 	}
-	if (no_async)
-		return -EAGAIN;
 	rw = req->async_data;
 	/* it's copied and will be cleaned with ->io */
 	iovec = NULL;
-- 
2.24.0


* [PATCH v2 05/13] io_uring: further simplify do_read error parsing
  2021-02-04 13:51 [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups Pavel Begunkov
                   ` (3 preceding siblings ...)
  2021-02-04 13:51 ` [PATCH v2 04/13] io_uring: refactor io_read for unsupported nowait Pavel Begunkov
@ 2021-02-04 13:52 ` Pavel Begunkov
  2021-02-04 13:52 ` [PATCH v2 06/13] io_uring: let io_setup_async_rw take care of iovec Pavel Begunkov
                   ` (8 subsequent siblings)
  13 siblings, 0 replies; 18+ messages in thread
From: Pavel Begunkov @ 2021-02-04 13:52 UTC (permalink / raw)
  To: Jens Axboe, io-uring

First, instead of checking iov_iter_count(iter) for 0 to find out
whether all needed bytes were read, just compare the return code
against io_size. It's more reliable and arguably cleaner.

Also, place the half-read case into an else branch and delete an extra
label.
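
A simplified sketch of the resulting three-way classification (details
such as the IOPOLL and NOWAIT checks are omitted, see the hunk below):

	ret = io_iter_do_read(req, iter);	/* bytes read or -errno */

	if (ret == -EAGAIN) {
		/* would block: revert consumed bytes, set up the retry */
	} else if (ret <= 0 || ret == io_size) {
		/* an error, or everything was read: complete the request */
	} else {
		/* half-read: retry only nonblocking attempts on regular files */
		io_size -= ret;
	}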

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 866e0ea83dbe..1d1fa1f77332 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3552,19 +3552,18 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 		/* some cases will consume bytes even on error returns */
 		iov_iter_revert(iter, io_size - iov_iter_count(iter));
 		ret = 0;
-		goto copy_iov;
-	} else if (ret <= 0) {
+	} else if (ret <= 0 || ret == io_size) {
 		/* make sure -ERESTARTSYS -> -EINTR is done */
 		goto done;
-	}
+	} else {
+		/* we did blocking attempt. no retry. */
+		if (!force_nonblock || (req->file->f_flags & O_NONBLOCK) ||
+		    !(req->flags & REQ_F_ISREG))
+			goto done;
 
-	/* read it all, or we did blocking attempt. no retry. */
-	if (!iov_iter_count(iter) || !force_nonblock ||
-	    (req->file->f_flags & O_NONBLOCK) || !(req->flags & REQ_F_ISREG))
-		goto done;
+		io_size -= ret;
+	}
 
-	io_size -= ret;
-copy_iov:
 	ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
 	if (ret2) {
 		ret = ret2;
-- 
2.24.0


* [PATCH v2 06/13] io_uring: let io_setup_async_rw take care of iovec
  2021-02-04 13:51 [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups Pavel Begunkov
                   ` (4 preceding siblings ...)
  2021-02-04 13:52 ` [PATCH v2 05/13] io_uring: further simplify do_read error parsing Pavel Begunkov
@ 2021-02-04 13:52 ` Pavel Begunkov
  2021-02-04 13:52 ` [PATCH v2 07/13] io_uring: don't forget to adjust io_size Pavel Begunkov
                   ` (7 subsequent siblings)
  13 siblings, 0 replies; 18+ messages in thread
From: Pavel Begunkov @ 2021-02-04 13:52 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Now we hand ownership of the iovec over to io_setup_async_rw(), so it
either sets up the request's async context properly or frees the iovec
itself on error. That makes life a bit easier at the call sites.
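
A minimal sketch (not part of the patch) of the call-site contract this
enables; "ret ?: -EAGAIN" is the GNU conditional with the middle
operand omitted, i.e. ret if it's non-zero, otherwise -EAGAIN.

	/* the helper now owns 'iovec': on failure it frees it itself, so
	 * the caller can return directly without an out_free label */
	ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
	return ret ?: -EAGAIN;	/* 0 means successfully punted: -EAGAIN */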

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 1d1fa1f77332..f8492d62b6a1 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2721,11 +2721,7 @@ static bool io_resubmit_prep(struct io_kiocb *req)
 	ret = io_import_iovec(rw, req, &iovec, &iter, false);
 	if (ret < 0)
 		return false;
-	ret = io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
-	if (!ret)
-		return true;
-	kfree(iovec);
-	return false;
+	return !io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
 }
 #endif
 
@@ -3366,8 +3362,10 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
 	if (!force && !io_op_defs[req->opcode].needs_async_data)
 		return 0;
 	if (!req->async_data) {
-		if (__io_alloc_async_data(req))
+		if (__io_alloc_async_data(req)) {
+			kfree(iovec);
 			return -ENOMEM;
+		}
 
 		io_req_map_rw(req, iovec, fast_iov, iter);
 	}
@@ -3528,9 +3526,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	/* If the file doesn't support async, just async punt */
 	if (force_nonblock && !io_file_supports_async(req->file, READ)) {
 		ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
-		if (!ret)
-			return -EAGAIN;
-		goto out_free;
+		return ret ?: -EAGAIN;
 	}
 
 	ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
@@ -3565,10 +3561,9 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	}
 
 	ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
-	if (ret2) {
-		ret = ret2;
-		goto out_free;
-	}
+	if (ret2)
+		return ret2;
+
 	rw = req->async_data;
 	/* it's copied and will be cleaned with ->io */
 	iovec = NULL;
@@ -3703,8 +3698,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
 		/* some cases will consume bytes even on error returns */
 		iov_iter_revert(iter, io_size - iov_iter_count(iter));
 		ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
-		if (!ret)
-			return -EAGAIN;
+		return ret ?: -EAGAIN;
 	}
 out_free:
 	/* it's reportedly faster than delegating the null check to kfree() */
-- 
2.24.0


* [PATCH v2 07/13] io_uring: don't forget to adjust io_size
  2021-02-04 13:51 [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups Pavel Begunkov
                   ` (5 preceding siblings ...)
  2021-02-04 13:52 ` [PATCH v2 06/13] io_uring: let io_setup_async_rw take care of iovec Pavel Begunkov
@ 2021-02-04 13:52 ` Pavel Begunkov
  2021-02-04 13:52 ` [PATCH v2 08/13] io_uring: inline io_read()'s iovec freeing Pavel Begunkov
                   ` (6 subsequent siblings)
  13 siblings, 0 replies; 18+ messages in thread
From: Pavel Begunkov @ 2021-02-04 13:52 UTC (permalink / raw)
  To: Jens Axboe, io-uring

io_read() has an invariant: the amount we're still trying to read is
tracked both in the iter and in the io_size variable, and the latter
drives the decision whether to do read-retries. However, io_size is
adjusted only after the first read attempt, so if we happen to go for a
third attempt within a single call to io_read(), io_size ends up
greater than what's left in the iterator, which may lead to various
side effects up to live-locking.

Modify io_size each time.
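
A small self-contained illustration of the invariant (plain userspace C
with made-up numbers, not io_uring code): after every partial read of
'ret' bytes, the byte counter must shrink together with the iterator,
which is exactly what moving the adjustment into the loop guarantees.

	#include <stdio.h>

	/* toy "file" that returns at most 4 bytes per attempt */
	static long toy_read(long want)
	{
		return want < 4 ? want : 4;
	}

	int main(void)
	{
		long io_size = 10;		/* bytes the request asked for */
		long ret = toy_read(io_size);	/* first attempt */

		/* read-retry loop shaped like io_read() */
		do {
			io_size -= ret;		/* adjust on *every* pass */
			printf("read %ld, %ld to go\n", ret, io_size);
			ret = toy_read(io_size);
		} while (ret > 0 && ret < io_size);

		printf("last attempt read %ld, done\n", ret);
		return 0;
	}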

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index f8492d62b6a1..25fffff27c76 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3548,16 +3548,11 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 		/* some cases will consume bytes even on error returns */
 		iov_iter_revert(iter, io_size - iov_iter_count(iter));
 		ret = 0;
-	} else if (ret <= 0 || ret == io_size) {
-		/* make sure -ERESTARTSYS -> -EINTR is done */
+	} else if (ret <= 0 || ret == io_size || !force_nonblock ||
+		   (req->file->f_flags & O_NONBLOCK) ||
+		   !(req->flags & REQ_F_ISREG)) {
+		/* read all, failed, already did sync or don't want to retry */
 		goto done;
-	} else {
-		/* we did blocking attempt. no retry. */
-		if (!force_nonblock || (req->file->f_flags & O_NONBLOCK) ||
-		    !(req->flags & REQ_F_ISREG))
-			goto done;
-
-		io_size -= ret;
 	}
 
 	ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
@@ -3570,6 +3565,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	/* now use our persistent iterator, if we aren't already */
 	iter = &rw->iter;
 retry:
+	io_size -= ret;
 	rw->bytes_done += ret;
 	/* if we can retry, do so with the callbacks armed */
 	if (!io_rw_should_retry(req)) {
-- 
2.24.0


* [PATCH v2 08/13] io_uring: inline io_read()'s iovec freeing
  2021-02-04 13:51 [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups Pavel Begunkov
                   ` (6 preceding siblings ...)
  2021-02-04 13:52 ` [PATCH v2 07/13] io_uring: don't forget to adjust io_size Pavel Begunkov
@ 2021-02-04 13:52 ` Pavel Begunkov
  2021-02-04 13:52 ` [PATCH v2 09/13] io_uring: highlight read-retry loop Pavel Begunkov
                   ` (5 subsequent siblings)
  13 siblings, 0 replies; 18+ messages in thread
From: Pavel Begunkov @ 2021-02-04 13:52 UTC (permalink / raw)
  To: Jens Axboe, io-uring

io_read() doesn't have the simplest control flow, with a lot of jumps,
and it's hard to read. One of those jumps is to the out_free: label,
which frees the iovec. However, from the middle of io_read() onwards
the iovec is NULL'ed, so kfree(iovec) becomes a no-op there; that
leaves us with two places where we can inline the freeing and further
clean up the code.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 31 +++++++++++++------------------
 1 file changed, 13 insertions(+), 18 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 25fffff27c76..35ad889afaec 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3530,14 +3530,18 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	}
 
 	ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
-	if (unlikely(ret))
-		goto out_free;
+	if (unlikely(ret)) {
+		kfree(iovec);
+		return ret;
+	}
 
 	ret = io_iter_do_read(req, iter);
 
 	if (ret == -EIOCBQUEUED) {
-		ret = 0;
-		goto out_free;
+		/* it's faster to check here then delegate to kfree */
+		if (iovec)
+			kfree(iovec);
+		return 0;
 	} else if (ret == -EAGAIN) {
 		/* IOPOLL retry should happen for io-wq threads */
 		if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
@@ -3560,8 +3564,6 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 		return ret2;
 
 	rw = req->async_data;
-	/* it's copied and will be cleaned with ->io */
-	iovec = NULL;
 	/* now use our persistent iterator, if we aren't already */
 	iter = &rw->iter;
 retry:
@@ -3580,21 +3582,14 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	 * do, then just retry at the new offset.
 	 */
 	ret = io_iter_do_read(req, iter);
-	if (ret == -EIOCBQUEUED) {
-		ret = 0;
-		goto out_free;
-	} else if (ret > 0 && ret < io_size) {
-		/* we got some bytes, but not all. retry. */
+	if (ret == -EIOCBQUEUED)
+		return 0;
+	/* we got some bytes, but not all. retry. */
+	if (ret > 0 && ret < io_size)
 		goto retry;
-	}
 done:
 	kiocb_done(kiocb, ret, cs);
-	ret = 0;
-out_free:
-	/* it's reportedly faster than delegating the null check to kfree() */
-	if (iovec)
-		kfree(iovec);
-	return ret;
+	return 0;
 }
 
 static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-- 
2.24.0


* [PATCH v2 09/13] io_uring: highlight read-retry loop
  2021-02-04 13:51 [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups Pavel Begunkov
                   ` (7 preceding siblings ...)
  2021-02-04 13:52 ` [PATCH v2 08/13] io_uring: inline io_read()'s iovec freeing Pavel Begunkov
@ 2021-02-04 13:52 ` Pavel Begunkov
  2021-02-04 13:52 ` [PATCH v2 10/13] io_uring: treat NONBLOCK and RWF_NOWAIT similarly Pavel Begunkov
                   ` (4 subsequent siblings)
  13 siblings, 0 replies; 18+ messages in thread
From: Pavel Begunkov @ 2021-02-04 13:52 UTC (permalink / raw)
  To: Jens Axboe, io-uring

We already have an implicit do-while loop for read-retries, built with
a goto at the end. Convert it to an actual do-while: that highlights
the loop, makes it a bit easier to understand, and is cleaner in
general.
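
The transformation in the abstract (a generic sketch, not io_uring
code):

	/* before: the loop is implied by a backwards goto */
	retry:
		ret = attempt();
		if (ret > 0 && ret < want)
			goto retry;

	/* after: the same control flow, with the loop made explicit */
		do {
			ret = attempt();
		} while (ret > 0 && ret < want);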

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 35ad889afaec..bbf8ea8370d6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3566,27 +3566,27 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	rw = req->async_data;
 	/* now use our persistent iterator, if we aren't already */
 	iter = &rw->iter;
-retry:
-	io_size -= ret;
-	rw->bytes_done += ret;
-	/* if we can retry, do so with the callbacks armed */
-	if (!io_rw_should_retry(req)) {
-		kiocb->ki_flags &= ~IOCB_WAITQ;
-		return -EAGAIN;
-	}
 
-	/*
-	 * Now retry read with the IOCB_WAITQ parts set in the iocb. If we
-	 * get -EIOCBQUEUED, then we'll get a notification when the desired
-	 * page gets unlocked. We can also get a partial read here, and if we
-	 * do, then just retry at the new offset.
-	 */
-	ret = io_iter_do_read(req, iter);
-	if (ret == -EIOCBQUEUED)
-		return 0;
-	/* we got some bytes, but not all. retry. */
-	if (ret > 0 && ret < io_size)
-		goto retry;
+	do {
+		io_size -= ret;
+		rw->bytes_done += ret;
+		/* if we can retry, do so with the callbacks armed */
+		if (!io_rw_should_retry(req)) {
+			kiocb->ki_flags &= ~IOCB_WAITQ;
+			return -EAGAIN;
+		}
+
+		/*
+		 * Now retry read with the IOCB_WAITQ parts set in the iocb. If
+		 * we get -EIOCBQUEUED, then we'll get a notification when the
+		 * desired page gets unlocked. We can also get a partial read
+		 * here, and if we do, then just retry at the new offset.
+		 */
+		ret = io_iter_do_read(req, iter);
+		if (ret == -EIOCBQUEUED)
+			return 0;
+		/* we got some bytes, but not all. retry. */
+	} while (ret > 0 && ret < io_size);
 done:
 	kiocb_done(kiocb, ret, cs);
 	return 0;
-- 
2.24.0


* [PATCH v2 10/13] io_uring: treat NONBLOCK and RWF_NOWAIT similarly
  2021-02-04 13:51 [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups Pavel Begunkov
                   ` (8 preceding siblings ...)
  2021-02-04 13:52 ` [PATCH v2 09/13] io_uring: highlight read-retry loop Pavel Begunkov
@ 2021-02-04 13:52 ` Pavel Begunkov
  2021-02-04 13:52 ` [PATCH v2 11/13] io_uring: io_import_iovec return type cleanup Pavel Begunkov
                   ` (3 subsequent siblings)
  13 siblings, 0 replies; 18+ messages in thread
From: Pavel Begunkov @ 2021-02-04 13:52 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Make the decision on whether we need to retry a read/write the same for
O_NONBLOCK and RWF_NOWAIT: set REQ_F_NOWAIT when either is specified
and use it for all relevant checks. Also fix resubmitting NOWAIT
requests via io_rw_reissue().
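
For reference, a self-contained userspace refresher (not io_uring code;
"testfile" is just a placeholder) on the two ways a read can ask not to
block, which this patch makes io_uring treat through the single
REQ_F_NOWAIT flag:

	#define _GNU_SOURCE
	#include <errno.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/uio.h>
	#include <unistd.h>

	int main(void)
	{
		char buf[4096];
		struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };

		/* per-fd: every operation on this descriptor is nonblocking */
		int fd = open("testfile", O_RDONLY | O_NONBLOCK);
		if (fd < 0)
			return 1;

		/* per-call: this read alone must not wait, e.g. it fails with
		 * EAGAIN if the data isn't already in the page cache */
		ssize_t ret = preadv2(fd, &iov, 1, 0, RWF_NOWAIT);
		if (ret < 0)
			printf("preadv2: %s\n", strerror(errno));
		else
			printf("read %zd bytes without blocking\n", ret);

		close(fd);
		return 0;
	}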

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index bbf8ea8370d6..ce2ea3f55f65 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2734,7 +2734,9 @@ static bool io_rw_reissue(struct io_kiocb *req, long res)
 	if (res != -EAGAIN && res != -EOPNOTSUPP)
 		return false;
 	mode = file_inode(req->file)->i_mode;
-	if ((!S_ISBLK(mode) && !S_ISREG(mode)) || io_wq_current_is_worker())
+	if (!S_ISBLK(mode) && !S_ISREG(mode))
+		return false;
+	if ((req->flags & REQ_F_NOWAIT) || io_wq_current_is_worker())
 		return false;
 
 	lockdep_assert_held(&req->ctx->uring_lock);
@@ -2907,16 +2909,17 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	struct kiocb *kiocb = &req->rw.kiocb;
+	struct file *file = req->file;
 	unsigned ioprio;
 	int ret;
 
-	if (S_ISREG(file_inode(req->file)->i_mode))
+	if (S_ISREG(file_inode(file)->i_mode))
 		req->flags |= REQ_F_ISREG;
 
 	kiocb->ki_pos = READ_ONCE(sqe->off);
-	if (kiocb->ki_pos == -1 && !(req->file->f_mode & FMODE_STREAM)) {
+	if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) {
 		req->flags |= REQ_F_CUR_POS;
-		kiocb->ki_pos = req->file->f_pos;
+		kiocb->ki_pos = file->f_pos;
 	}
 	kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
 	kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
@@ -2924,6 +2927,10 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (unlikely(ret))
 		return ret;
 
+	/* don't allow async punt for O_NONBLOCK or RWF_NOWAIT */
+	if ((kiocb->ki_flags & IOCB_NOWAIT) || (file->f_flags & O_NONBLOCK))
+		req->flags |= REQ_F_NOWAIT;
+
 	ioprio = READ_ONCE(sqe->ioprio);
 	if (ioprio) {
 		ret = ioprio_check_cap(ioprio);
@@ -2934,10 +2941,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	} else
 		kiocb->ki_ioprio = get_current_ioprio();
 
-	/* don't allow async punt if RWF_NOWAIT was requested */
-	if (kiocb->ki_flags & IOCB_NOWAIT)
-		req->flags |= REQ_F_NOWAIT;
-
 	if (ctx->flags & IORING_SETUP_IOPOLL) {
 		if (!(kiocb->ki_flags & IOCB_DIRECT) ||
 		    !kiocb->ki_filp->f_op->iopoll)
@@ -3546,15 +3549,14 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 		/* IOPOLL retry should happen for io-wq threads */
 		if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
 			goto done;
-		/* no retry on NONBLOCK marked file */
-		if (req->file->f_flags & O_NONBLOCK)
+		/* no retry on NONBLOCK nor RWF_NOWAIT */
+		if (req->flags & REQ_F_NOWAIT)
 			goto done;
 		/* some cases will consume bytes even on error returns */
 		iov_iter_revert(iter, io_size - iov_iter_count(iter));
 		ret = 0;
 	} else if (ret <= 0 || ret == io_size || !force_nonblock ||
-		   (req->file->f_flags & O_NONBLOCK) ||
-		   !(req->flags & REQ_F_ISREG)) {
+		   (req->flags & REQ_F_NOWAIT) || !(req->flags & REQ_F_ISREG)) {
 		/* read all, failed, already did sync or don't want to retry */
 		goto done;
 	}
@@ -3675,8 +3677,8 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
 	 */
 	if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
 		ret2 = -EAGAIN;
-	/* no retry on NONBLOCK marked file */
-	if (ret2 == -EAGAIN && (req->file->f_flags & O_NONBLOCK))
+	/* no retry on NONBLOCK nor RWF_NOWAIT */
+	if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
 		goto done;
 	if (!force_nonblock || ret2 != -EAGAIN) {
 		/* IOPOLL retry should happen for io-wq threads */
-- 
2.24.0


* [PATCH v2 11/13] io_uring: io_import_iovec return type cleanup
  2021-02-04 13:51 [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups Pavel Begunkov
                   ` (9 preceding siblings ...)
  2021-02-04 13:52 ` [PATCH v2 10/13] io_uring: treat NONBLOCK and RWF_NOWAIT similarly Pavel Begunkov
@ 2021-02-04 13:52 ` Pavel Begunkov
  2021-02-04 13:52 ` [PATCH v2 12/13] io_uring: deduplicate file table slot calculation Pavel Begunkov
                   ` (2 subsequent siblings)
  13 siblings, 0 replies; 18+ messages in thread
From: Pavel Begunkov @ 2021-02-04 13:52 UTC (permalink / raw)
  To: Jens Axboe, io-uring

io_import_iovec() doesn't return the IO size anymore, only an error
code. Make that more apparent by returning int instead of ssize_t, and
clean up the leftovers.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 30 +++++++++++-------------------
 1 file changed, 11 insertions(+), 19 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ce2ea3f55f65..24cc00ff7155 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1030,9 +1030,8 @@ static struct file *io_file_get(struct io_submit_state *state,
 static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs);
 static void io_rsrc_put_work(struct work_struct *work);
 
-static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
-			       struct iovec **iovec, struct iov_iter *iter,
-			       bool needs_lock);
+static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
+			   struct iov_iter *iter, bool needs_lock);
 static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
 			     const struct iovec *fast_iov,
 			     struct iov_iter *iter, bool force);
@@ -2693,9 +2692,8 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res,
 static bool io_resubmit_prep(struct io_kiocb *req)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
-	ssize_t ret = -ECANCELED;
+	int rw, ret = -ECANCELED;
 	struct iov_iter iter;
-	int rw;
 
 	/* already prepared */
 	if (req->async_data)
@@ -3004,8 +3002,7 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
 		io_rw_done(kiocb, ret);
 }
 
-static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
-			       struct iov_iter *iter)
+static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	size_t len = req->rw.len;
@@ -3069,7 +3066,7 @@ static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
 		}
 	}
 
-	return len;
+	return 0;
 }
 
 static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
@@ -3210,16 +3207,14 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
 	return __io_iov_buffer_select(req, iov, needs_lock);
 }
 
-static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
-				 struct iovec **iovec, struct iov_iter *iter,
-				 bool needs_lock)
+static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
+			   struct iov_iter *iter, bool needs_lock)
 {
 	void __user *buf = u64_to_user_ptr(req->rw.addr);
 	size_t sqe_len = req->rw.len;
+	u8 opcode = req->opcode;
 	ssize_t ret;
-	u8 opcode;
 
-	opcode = req->opcode;
 	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
 		*iovec = NULL;
 		return io_import_fixed(req, rw, iter);
@@ -3244,10 +3239,8 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
 
 	if (req->flags & REQ_F_BUFFER_SELECT) {
 		ret = io_iov_buffer_select(req, *iovec, needs_lock);
-		if (!ret) {
-			ret = (*iovec)->iov_len;
-			iov_iter_init(iter, rw, *iovec, 1, ret);
-		}
+		if (!ret)
+			iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len);
 		*iovec = NULL;
 		return ret;
 	}
@@ -3379,7 +3372,7 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
 {
 	struct io_async_rw *iorw = req->async_data;
 	struct iovec *iov = iorw->fast_iov;
-	ssize_t ret;
+	int ret;
 
 	ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
 	if (unlikely(ret < 0))
@@ -3518,7 +3511,6 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	}
 	io_size = iov_iter_count(iter);
 	req->result = io_size;
-	ret = 0;
 
 	/* Ensure we clear previously set non-block flag */
 	if (!force_nonblock)
-- 
2.24.0


* [PATCH v2 12/13] io_uring: deduplicate file table slot calculation
  2021-02-04 13:51 [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups Pavel Begunkov
                   ` (10 preceding siblings ...)
  2021-02-04 13:52 ` [PATCH v2 11/13] io_uring: io_import_iovec return type cleanup Pavel Begunkov
@ 2021-02-04 13:52 ` Pavel Begunkov
  2021-02-04 13:52 ` [PATCH v2 13/13] io_uring/io-wq: return 2-step work swap scheme Pavel Begunkov
  2021-02-04 15:07 ` [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups Jens Axboe
  13 siblings, 0 replies; 18+ messages in thread
From: Pavel Begunkov @ 2021-02-04 13:52 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Extract a helper, io_fixed_file_slot(), that returns a slot in our
fixed file table, so we don't hand-code the same calculation three
times in the code.
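
A self-contained illustration (plain userspace C) of the two-level
lookup the helper wraps: the high bits of the index pick a table, the
low bits pick the slot inside it. The shift/mask below are example
values local to this snippet; the kernel uses
IORING_FILE_TABLE_SHIFT/MASK.

	#include <stdio.h>

	#define TABLE_SHIFT	9			/* 512 slots per table */
	#define TABLE_MASK	((1U << TABLE_SHIFT) - 1)

	int main(void)
	{
		unsigned int i = 1234;	/* some fixed-file index */

		printf("index %u -> table %u, slot %u\n",
		       i, i >> TABLE_SHIFT, i & TABLE_MASK);
		return 0;
	}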

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 41 +++++++++++++++++++----------------------
 1 file changed, 19 insertions(+), 22 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 24cc00ff7155..5ee6a9273fca 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7740,6 +7740,15 @@ static void io_rsrc_put_work(struct work_struct *work)
 	}
 }
 
+static struct file **io_fixed_file_slot(struct fixed_rsrc_data *file_data,
+					unsigned i)
+{
+	struct fixed_rsrc_table *table;
+
+	table = &file_data->table[i >> IORING_FILE_TABLE_SHIFT];
+	return &table->files[i & IORING_FILE_TABLE_MASK];
+}
+
 static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
 {
 	struct fixed_rsrc_ref_node *ref_node;
@@ -7808,6 +7817,7 @@ static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node)
 	kfree(ref_node);
 }
 
+
 static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 				 unsigned nr_args)
 {
@@ -7840,9 +7850,6 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 		goto out_free;
 
 	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
-		struct fixed_rsrc_table *table;
-		unsigned index;
-
 		if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
 			ret = -EFAULT;
 			goto out_fput;
@@ -7867,9 +7874,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 			fput(file);
 			goto out_fput;
 		}
-		table = &file_data->table[i >> IORING_FILE_TABLE_SHIFT];
-		index = i & IORING_FILE_TABLE_MASK;
-		table->files[index] = file;
+		*io_fixed_file_slot(file_data, i) = file;
 	}
 
 	ret = io_sqe_files_scm(ctx);
@@ -7972,7 +7977,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 {
 	struct fixed_rsrc_data *data = ctx->file_data;
 	struct fixed_rsrc_ref_node *ref_node;
-	struct file *file;
+	struct file *file, **file_slot;
 	__s32 __user *fds;
 	int fd, i, err;
 	__u32 done;
@@ -7990,9 +7995,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 
 	fds = u64_to_user_ptr(up->data);
 	for (done = 0; done < nr_args; done++) {
-		struct fixed_rsrc_table *table;
-		unsigned index;
-
 		err = 0;
 		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
 			err = -EFAULT;
@@ -8002,14 +8004,13 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 			continue;
 
 		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
-		table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
-		index = i & IORING_FILE_TABLE_MASK;
-		if (table->files[index]) {
-			file = table->files[index];
-			err = io_queue_file_removal(data, file);
+		file_slot = io_fixed_file_slot(ctx->file_data, i);
+
+		if (*file_slot) {
+			err = io_queue_file_removal(data, *file_slot);
 			if (err)
 				break;
-			table->files[index] = NULL;
+			*file_slot = NULL;
 			needs_switch = true;
 		}
 		if (fd != -1) {
@@ -8031,13 +8032,12 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 				err = -EBADF;
 				break;
 			}
-			table->files[index] = file;
 			err = io_sqe_file_register(ctx, file, i);
 			if (err) {
-				table->files[index] = NULL;
 				fput(file);
 				break;
 			}
+			*file_slot = file;
 		}
 	}
 
@@ -9488,11 +9488,8 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 	seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
 	seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
 	for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
-		struct fixed_rsrc_table *table;
-		struct file *f;
+		struct file *f = *io_fixed_file_slot(ctx->file_data, i);
 
-		table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
-		f = table->files[i & IORING_FILE_TABLE_MASK];
 		if (f)
 			seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
 		else
-- 
2.24.0


* [PATCH v2 13/13] io_uring/io-wq: return 2-step work swap scheme
  2021-02-04 13:51 [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups Pavel Begunkov
                   ` (11 preceding siblings ...)
  2021-02-04 13:52 ` [PATCH v2 12/13] io_uring: deduplicate file table slot calculation Pavel Begunkov
@ 2021-02-04 13:52 ` Pavel Begunkov
  2021-02-04 14:52   ` Jens Axboe
  2021-02-04 15:07 ` [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups Jens Axboe
  13 siblings, 1 reply; 18+ messages in thread
From: Pavel Begunkov @ 2021-02-04 13:52 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Saving one lock/unlock pair for io-wq is not super important, but it
adds some ugliness to the code. More importantly, an atomic dec that
doesn't bring the refcount to zero doesn't provide the right
ordering/barriers on some archs, so io_steal_work() may pretty easily
get subtly and completely broken.

Bring back the 2-step io-wq work exchange and clean it up.
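
A condensed sketch of the resulting worker-side contract (taken from
io_worker_handle_work() in the hunk below, with the hashed-work
handling omitted): ->do_work() only executes the request, ->free_work()
drops the reference and hands back the next linked work, which then
runs on the same worker without another io-wq punt.

	wq->do_work(work);			/* just execute the request */
	io_assign_current_work(worker, NULL);
	linked = wq->free_work(work);		/* put the ref, get the link */
	if (linked)
		work = linked;			/* keep going on this worker */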

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io-wq.c    | 16 ++++++----------
 fs/io-wq.h    |  4 ++--
 fs/io_uring.c | 26 ++++----------------------
 3 files changed, 12 insertions(+), 34 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 2e2f14f42bf2..63ef195b1acb 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -555,23 +555,21 @@ static void io_worker_handle_work(struct io_worker *worker)
 
 		/* handle a whole dependent link */
 		do {
-			struct io_wq_work *old_work, *next_hashed, *linked;
+			struct io_wq_work *next_hashed, *linked;
 			unsigned int hash = io_get_work_hash(work);
 
 			next_hashed = wq_next_work(work);
 			io_impersonate_work(worker, work);
+			wq->do_work(work);
+			io_assign_current_work(worker, NULL);
 
-			old_work = work;
-			linked = wq->do_work(work);
-
+			linked = wq->free_work(work);
 			work = next_hashed;
 			if (!work && linked && !io_wq_is_hashed(linked)) {
 				work = linked;
 				linked = NULL;
 			}
 			io_assign_current_work(worker, work);
-			wq->free_work(old_work);
-
 			if (linked)
 				io_wqe_enqueue(wqe, linked);
 
@@ -850,11 +848,9 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
 	struct io_wq *wq = wqe->wq;
 
 	do {
-		struct io_wq_work *old_work = work;
-
 		work->flags |= IO_WQ_WORK_CANCEL;
-		work = wq->do_work(work);
-		wq->free_work(old_work);
+		wq->do_work(work);
+		work = wq->free_work(work);
 	} while (work);
 }
 
diff --git a/fs/io-wq.h b/fs/io-wq.h
index e1ffb80a4a1d..e37a0f217cc8 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -106,8 +106,8 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
 	return container_of(work->list.next, struct io_wq_work, list);
 }
 
-typedef void (free_work_fn)(struct io_wq_work *);
-typedef struct io_wq_work *(io_wq_work_fn)(struct io_wq_work *);
+typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *);
+typedef void (io_wq_work_fn)(struct io_wq_work *);
 
 struct io_wq_data {
 	struct user_struct *user;
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5ee6a9273fca..b740a39110d6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2379,22 +2379,6 @@ static inline void io_put_req_deferred(struct io_kiocb *req, int refs)
 		io_free_req_deferred(req);
 }
 
-static struct io_wq_work *io_steal_work(struct io_kiocb *req)
-{
-	struct io_kiocb *nxt;
-
-	/*
-	 * A ref is owned by io-wq in which context we're. So, if that's the
-	 * last one, it's safe to steal next work. False negatives are Ok,
-	 * it just will be re-punted async in io_put_work()
-	 */
-	if (refcount_read(&req->refs) != 1)
-		return NULL;
-
-	nxt = io_req_find_next(req);
-	return nxt ? &nxt->work : NULL;
-}
-
 static void io_double_put_req(struct io_kiocb *req)
 {
 	/* drop both submit and complete references */
@@ -6343,7 +6327,7 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock,
 	return 0;
 }
 
-static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work)
+static void io_wq_submit_work(struct io_wq_work *work)
 {
 	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 	struct io_kiocb *timeout;
@@ -6394,8 +6378,6 @@ static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work)
 		if (lock_ctx)
 			mutex_unlock(&lock_ctx->uring_lock);
 	}
-
-	return io_steal_work(req);
 }
 
 static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
@@ -8067,12 +8049,12 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
 	return __io_sqe_files_update(ctx, &up, nr_args);
 }
 
-static void io_free_work(struct io_wq_work *work)
+static struct io_wq_work *io_free_work(struct io_wq_work *work)
 {
 	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 
-	/* Consider that io_steal_work() relies on this ref */
-	io_put_req(req);
+	req = io_put_req_find_next(req);
+	return req ? &req->work : NULL;
 }
 
 static int io_init_wq_offload(struct io_ring_ctx *ctx,
-- 
2.24.0


* Re: [PATCH v2 13/13] io_uring/io-wq: return 2-step work swap scheme
  2021-02-04 13:52 ` [PATCH v2 13/13] io_uring/io-wq: return 2-step work swap scheme Pavel Begunkov
@ 2021-02-04 14:52   ` Jens Axboe
  2021-02-04 14:56     ` Pavel Begunkov
  0 siblings, 1 reply; 18+ messages in thread
From: Jens Axboe @ 2021-02-04 14:52 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring

On 2/4/21 6:52 AM, Pavel Begunkov wrote:
> Saving one lock/unlock for io-wq is not super important, but adds some
> ugliness in the code. More important, atomic decs not turning it to zero
> for some archs won't give the right ordering/barriers so the
> io_steal_work() may pretty easily get subtly and completely broken.
> 
> Return back 2-step io-wq work exchange and clean it up.

IIRC, this wasn't done to skip the lock/unlock exchange, which I agree
doesn't matter, but to ensure that a link would not need another io-wq
punt. And that is a big deal, it's much faster to run it from that
same thread, rather than needing a new async queue and new thread grab
to get there.

Just want to make sure that's on your mind... Maybe it's still fine
as-is, didn't look too closely yet or test it.

-- 
Jens Axboe


* Re: [PATCH v2 13/13] io_uring/io-wq: return 2-step work swap scheme
  2021-02-04 14:52   ` Jens Axboe
@ 2021-02-04 14:56     ` Pavel Begunkov
  2021-02-04 15:05       ` Jens Axboe
  0 siblings, 1 reply; 18+ messages in thread
From: Pavel Begunkov @ 2021-02-04 14:56 UTC (permalink / raw)
  To: Jens Axboe, io-uring

On 04/02/2021 14:52, Jens Axboe wrote:
> On 2/4/21 6:52 AM, Pavel Begunkov wrote:
>> Saving one lock/unlock for io-wq is not super important, but adds some
>> ugliness in the code. More important, atomic decs not turning it to zero
>> for some archs won't give the right ordering/barriers so the
>> io_steal_work() may pretty easily get subtly and completely broken.
>>
>> Return back 2-step io-wq work exchange and clean it up.
> 
> IIRC, this wasn't done to skip the lock/unlock exchange, which I agree
> doesn't matter, but to ensure that a link would not need another io-wq
> punt. And that is a big deal, it's much faster to run it from that
> same thread, rather than needing a new async queue and new thread grab
> to get there.

Right, we're just referring to different patches and different points
in time. This one is fine in that regard; it just moves returning the
link from ->do_work() to ->free_work().

> 
> Just want to make sure that's on your mind... Maybe it's still fine
> as-is, didn't look too closely yet or test it.
> 

-- 
Pavel Begunkov

* Re: [PATCH v2 13/13] io_uring/io-wq: return 2-step work swap scheme
  2021-02-04 14:56     ` Pavel Begunkov
@ 2021-02-04 15:05       ` Jens Axboe
  0 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2021-02-04 15:05 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring

On 2/4/21 7:56 AM, Pavel Begunkov wrote:
> On 04/02/2021 14:52, Jens Axboe wrote:
>> On 2/4/21 6:52 AM, Pavel Begunkov wrote:
>>> Saving one lock/unlock for io-wq is not super important, but adds some
>>> ugliness in the code. More important, atomic decs not turning it to zero
>>> for some archs won't give the right ordering/barriers so the
>>> io_steal_work() may pretty easily get subtly and completely broken.
>>>
>>> Return back 2-step io-wq work exchange and clean it up.
>>
>> IIRC, this wasn't done to skip the lock/unlock exchange, which I agree
>> doesn't matter, but to ensure that a link would not need another io-wq
>> punt. And that is a big deal, it's much faster to run it from that
>> same thread, rather than needing a new async queue and new thread grab
>> to get there.
> 
> Right, we just refer to different patches and moments. This one is fine
> in that regard, it just moves returning link from ->do_work() to
> ->free_work().

Got it, looks good then.

-- 
Jens Axboe


* Re: [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups
  2021-02-04 13:51 [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups Pavel Begunkov
                   ` (12 preceding siblings ...)
  2021-02-04 13:52 ` [PATCH v2 13/13] io_uring/io-wq: return 2-step work swap scheme Pavel Begunkov
@ 2021-02-04 15:07 ` Jens Axboe
  13 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2021-02-04 15:07 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring

On 2/4/21 6:51 AM, Pavel Begunkov wrote:
> Bunch of random cleanups, noticeable part of which (4-9/13) refactor
> io_read(), which is currently not in the best shape and is hard to
> understand.
> 
> 7/13 may actually fix a problem.
> 10/13 should make NOWAIT and NONBLOCK to work right
> 
> v2: add 9-13/13
> 
> Pavel Begunkov (13):
>   io_uring: deduplicate core cancellations sequence
>   io_uring: refactor scheduling in io_cqring_wait
>   io_uring: refactor io_cqring_wait
>   io_uring: refactor io_read for unsupported nowait
>   io_uring: further simplify do_read error parsing
>   io_uring: let io_setup_async_rw take care of iovec
>   io_uring: don't forget to adjust io_size
>   io_uring: inline io_read()'s iovec freeing
>   io_uring: highlight read-retry loop
>   io_uring: treat NONBLOCK and RWF_NOWAIT similarly
>   io_uring: io_import_iovec return type cleanup
>   io_uring: deduplicate file table slot calculation
>   io_uring/io-wq: return 2-step work swap scheme
> 
>  fs/io-wq.c    |  16 +--
>  fs/io-wq.h    |   4 +-
>  fs/io_uring.c | 366 ++++++++++++++++++++++----------------------------
>  3 files changed, 166 insertions(+), 220 deletions(-)

Applied, thanks Pavel!

-- 
Jens Axboe

