* [PATCH for-next 00/13] 5.14 batch 2
@ 2021-05-24 23:50 Pavel Begunkov
  2021-05-24 23:51 ` [PATCH 01/13] io-wq: embed wqe ptr array into struct io_wq Pavel Begunkov
                   ` (12 more replies)
  0 siblings, 13 replies; 21+ messages in thread
From: Pavel Begunkov @ 2021-05-24 23:50 UTC (permalink / raw)
  To: Jens Axboe, io-uring

On top of "io_uring/io-wq: close io-wq full-stop gap" sent for 5.13.

io-wq cleanups and some rsrc-related technical debt. 12/13 is an
optimisation that hasn't been measured; it's already a nice win on its
own, but it mainly targets future use cases.

Pavel Begunkov (13):
  io-wq: embed wqe ptr array into struct io_wq
  io-wq: remove unused io-wq refcounting
  io_uring: refactor io_iopoll_req_issued
  io_uring: rename function *task_file
  io-wq: replace goto while
  io-wq: don't repeat IO_WQ_BIT_EXIT check by worker
  io-wq: simplify worker exiting
  io_uring: hide rsrc tag copy into generic helpers
  io_uring: remove rsrc put work irq save/restore
  io_uring: add helpers for 2 level table alloc
  io_uring: don't vmalloc rsrc tags
  io_uring: cache task struct refs
  io_uring: unify SQPOLL and user task cancellations

 fs/io-wq.c    |  38 ++----
 fs/io_uring.c | 347 ++++++++++++++++++++++++++------------------------
 2 files changed, 195 insertions(+), 190 deletions(-)

-- 
2.31.1


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH 01/13] io-wq: embed wqe ptr array into struct io_wq
  2021-05-24 23:50 [PATCH for-next 00/13] 5.14 batch 2 Pavel Begunkov
@ 2021-05-24 23:51 ` Pavel Begunkov
  2021-05-24 23:51 ` [PATCH 02/13] io-wq: remove unused io-wq refcounting Pavel Begunkov
                   ` (11 subsequent siblings)
  12 siblings, 0 replies; 21+ messages in thread
From: Pavel Begunkov @ 2021-05-24 23:51 UTC (permalink / raw)
  To: Jens Axboe, io-uring

io-wq keeps an array of pointers to struct io_wqe. Allocate this array
as part of struct io_wq: it's easier to code and saves an extra
indirection on nearly every io-wq call.
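
For reference, a minimal standalone sketch of the flexible-array-member
plus struct_size() pattern the patch switches to (foo/bar/elems are
made-up names, not the actual io-wq definitions; needs <linux/slab.h>
and <linux/overflow.h> in kernel context):

struct foo {
        unsigned long state;
        /* ... other fields ... */
        struct bar *elems[];    /* flexible array member, must be last */
};

static struct foo *foo_create(unsigned int n)
{
        struct foo *f;

        /* one allocation covers the struct plus n trailing elems[] slots */
        f = kzalloc(struct_size(f, elems, n), GFP_KERNEL);
        return f;
}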

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io-wq.c | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index de9b7ba3ba01..961c4dbf1220 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -102,7 +102,6 @@ struct io_wqe {
  * Per io_wq state
   */
 struct io_wq {
-	struct io_wqe **wqes;
 	unsigned long state;
 
 	free_work_fn *free_work;
@@ -118,6 +117,8 @@ struct io_wq {
 	struct hlist_node cpuhp_node;
 
 	struct task_struct *task;
+
+	struct io_wqe *wqes[];
 };
 
 static enum cpuhp_state io_wq_online;
@@ -907,17 +908,12 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 	if (WARN_ON_ONCE(!data->free_work || !data->do_work))
 		return ERR_PTR(-EINVAL);
 
-	wq = kzalloc(sizeof(*wq), GFP_KERNEL);
+	wq = kzalloc(struct_size(wq, wqes, nr_node_ids), GFP_KERNEL);
 	if (!wq)
 		return ERR_PTR(-ENOMEM);
-
-	wq->wqes = kcalloc(nr_node_ids, sizeof(struct io_wqe *), GFP_KERNEL);
-	if (!wq->wqes)
-		goto err_wq;
-
 	ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node);
 	if (ret)
-		goto err_wqes;
+		goto err_wq;
 
 	refcount_inc(&data->hash->refs);
 	wq->hash = data->hash;
@@ -962,8 +958,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 	cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
 	for_each_node(node)
 		kfree(wq->wqes[node]);
-err_wqes:
-	kfree(wq->wqes);
 err_wq:
 	kfree(wq);
 	return ERR_PTR(ret);
@@ -1033,7 +1027,6 @@ static void io_wq_destroy(struct io_wq *wq)
 		kfree(wqe);
 	}
 	io_wq_put_hash(wq->hash);
-	kfree(wq->wqes);
 	kfree(wq);
 }
 
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 02/13] io-wq: remove unused io-wq refcounting
  2021-05-24 23:50 [PATCH for-next 00/13] 5.14 batch 2 Pavel Begunkov
  2021-05-24 23:51 ` [PATCH 01/13] io-wq: embed wqe ptr array into struct io_wq Pavel Begunkov
@ 2021-05-24 23:51 ` Pavel Begunkov
  2021-05-24 23:51 ` [PATCH 03/13] io_uring: refactor io_iopoll_req_issued Pavel Begunkov
                   ` (10 subsequent siblings)
  12 siblings, 0 replies; 21+ messages in thread
From: Pavel Begunkov @ 2021-05-24 23:51 UTC (permalink / raw)
  To: Jens Axboe, io-uring

iowq->refs is initialised to one and only dropped on exit, so it's
effectively unused and we can kill it.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io-wq.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 961c4dbf1220..a0e43d1b94af 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -109,8 +109,6 @@ struct io_wq {
 
 	struct io_wq_hash *hash;
 
-	refcount_t refs;
-
 	atomic_t worker_refs;
 	struct completion worker_done;
 
@@ -949,7 +947,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 	}
 
 	wq->task = get_task_struct(data->task);
-	refcount_set(&wq->refs, 1);
 	atomic_set(&wq->worker_refs, 1);
 	init_completion(&wq->worker_done);
 	return wq;
@@ -1035,8 +1032,7 @@ void io_wq_put_and_exit(struct io_wq *wq)
 	WARN_ON_ONCE(!test_bit(IO_WQ_BIT_EXIT, &wq->state));
 
 	io_wq_exit_workers(wq);
-	if (refcount_dec_and_test(&wq->refs))
-		io_wq_destroy(wq);
+	io_wq_destroy(wq);
 }
 
 static bool io_wq_worker_affinity(struct io_worker *worker, void *data)
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 03/13] io_uring: refactor io_iopoll_req_issued
  2021-05-24 23:50 [PATCH for-next 00/13] 5.14 batch 2 Pavel Begunkov
  2021-05-24 23:51 ` [PATCH 01/13] io-wq: embed wqe ptr array into struct io_wq Pavel Begunkov
  2021-05-24 23:51 ` [PATCH 02/13] io-wq: remove unused io-wq refcounting Pavel Begunkov
@ 2021-05-24 23:51 ` Pavel Begunkov
  2021-05-24 23:51 ` [PATCH 04/13] io_uring: rename function *task_file Pavel Begunkov
                   ` (9 subsequent siblings)
  12 siblings, 0 replies; 21+ messages in thread
From: Pavel Begunkov @ 2021-05-24 23:51 UTC (permalink / raw)
  To: Jens Axboe, io-uring

A simple refactoring of io_iopoll_req_issued(): move the in_async
handling inside so we don't pass it around and don't check it twice.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 44 +++++++++++++++++++++-----------------------
 1 file changed, 21 insertions(+), 23 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 057fbc8a190c..df1510adaaf7 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2520,9 +2520,14 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
  * find it from a io_do_iopoll() thread before the issuer is done
  * accessing the kiocb cookie.
  */
-static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async)
+static void io_iopoll_req_issued(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
+	const bool in_async = io_wq_current_is_worker();
+
+	/* workqueue context doesn't hold uring_lock, grab it now */
+	if (unlikely(in_async))
+		mutex_lock(&ctx->uring_lock);
 
 	/*
 	 * Track whether we have multiple files in our lists. This will impact
@@ -2549,14 +2554,19 @@ static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async)
 	else
 		list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
 
-	/*
-	 * If IORING_SETUP_SQPOLL is enabled, sqes are either handled in sq thread
-	 * task context or in io worker task context. If current task context is
-	 * sq thread, we don't need to check whether should wake up sq thread.
-	 */
-	if (in_async && (ctx->flags & IORING_SETUP_SQPOLL) &&
-	    wq_has_sleeper(&ctx->sq_data->wait))
-		wake_up(&ctx->sq_data->wait);
+	if (unlikely(in_async)) {
+		/*
+		 * If IORING_SETUP_SQPOLL is enabled, sqes are either handle
+		 * in sq thread task context or in io worker task context. If
+		 * current task context is sq thread, we don't need to check
+		 * whether should wake up sq thread.
+		 */
+		if ((ctx->flags & IORING_SETUP_SQPOLL) &&
+		    wq_has_sleeper(&ctx->sq_data->wait))
+			wake_up(&ctx->sq_data->wait);
+
+		mutex_unlock(&ctx->uring_lock);
+	}
 }
 
 static inline void io_state_file_put(struct io_submit_state *state)
@@ -6210,23 +6220,11 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
 
 	if (creds)
 		revert_creds(creds);
-
 	if (ret)
 		return ret;
-
 	/* If the op doesn't have a file, we're not polling for it */
-	if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) {
-		const bool in_async = io_wq_current_is_worker();
-
-		/* workqueue context doesn't hold uring_lock, grab it now */
-		if (in_async)
-			mutex_lock(&ctx->uring_lock);
-
-		io_iopoll_req_issued(req, in_async);
-
-		if (in_async)
-			mutex_unlock(&ctx->uring_lock);
-	}
+	if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file)
+		io_iopoll_req_issued(req);
 
 	return 0;
 }
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 04/13] io_uring: rename function *task_file
  2021-05-24 23:50 [PATCH for-next 00/13] 5.14 batch 2 Pavel Begunkov
                   ` (2 preceding siblings ...)
  2021-05-24 23:51 ` [PATCH 03/13] io_uring: refactor io_iopoll_req_issued Pavel Begunkov
@ 2021-05-24 23:51 ` Pavel Begunkov
  2021-05-24 23:51 ` [PATCH 05/13] io-wq: replace goto while Pavel Begunkov
                   ` (8 subsequent siblings)
  12 siblings, 0 replies; 21+ messages in thread
From: Pavel Begunkov @ 2021-05-24 23:51 UTC (permalink / raw)
  To: Jens Axboe, io-uring

What was once struct file references used to control task/ctx lifetimes
is now just internal tctx structures/nodes, so rename the outdated
*task_file() routines to something more sensible.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index df1510adaaf7..c34df3e01151 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1024,7 +1024,7 @@ static const struct io_op_def io_op_defs[] = {
 };
 
 static bool io_disarm_next(struct io_kiocb *req);
-static void io_uring_del_task_file(unsigned long index);
+static void io_uring_del_tctx_node(unsigned long index);
 static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 					 struct task_struct *task,
 					 bool cancel_all);
@@ -8706,7 +8706,7 @@ static void io_tctx_exit_cb(struct callback_head *cb)
 	 * node. It'll be removed by the end of cancellation, just ignore it.
 	 */
 	if (!atomic_read(&tctx->in_idle))
-		io_uring_del_task_file((unsigned long)work->ctx);
+		io_uring_del_tctx_node((unsigned long)work->ctx);
 	complete(&work->completion);
 }
 
@@ -8959,7 +8959,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 	}
 }
 
-static int __io_uring_add_task_file(struct io_ring_ctx *ctx)
+static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
 {
 	struct io_uring_task *tctx = current->io_uring;
 	struct io_tctx_node *node;
@@ -8996,19 +8996,19 @@ static int __io_uring_add_task_file(struct io_ring_ctx *ctx)
 /*
  * Note that this task has used io_uring. We use it for cancelation purposes.
  */
-static inline int io_uring_add_task_file(struct io_ring_ctx *ctx)
+static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
 {
 	struct io_uring_task *tctx = current->io_uring;
 
 	if (likely(tctx && tctx->last == ctx))
 		return 0;
-	return __io_uring_add_task_file(ctx);
+	return __io_uring_add_tctx_node(ctx);
 }
 
 /*
  * Remove this io_uring_file -> task mapping.
  */
-static void io_uring_del_task_file(unsigned long index)
+static void io_uring_del_tctx_node(unsigned long index)
 {
 	struct io_uring_task *tctx = current->io_uring;
 	struct io_tctx_node *node;
@@ -9039,7 +9039,7 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx)
 
 	tctx->io_wq = NULL;
 	xa_for_each(&tctx->xa, index, node)
-		io_uring_del_task_file(index);
+		io_uring_del_tctx_node(index);
 	if (wq)
 		io_wq_put_and_exit(wq);
 }
@@ -9317,7 +9317,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 		}
 		submitted = to_submit;
 	} else if (to_submit) {
-		ret = io_uring_add_task_file(ctx);
+		ret = io_uring_add_tctx_node(ctx);
 		if (unlikely(ret))
 			goto out;
 		mutex_lock(&ctx->uring_lock);
@@ -9527,7 +9527,7 @@ static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
 	if (fd < 0)
 		return fd;
 
-	ret = io_uring_add_task_file(ctx);
+	ret = io_uring_add_tctx_node(ctx);
 	if (ret) {
 		put_unused_fd(fd);
 		return ret;
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 05/13] io-wq: replace goto while
  2021-05-24 23:50 [PATCH for-next 00/13] 5.14 batch 2 Pavel Begunkov
                   ` (3 preceding siblings ...)
  2021-05-24 23:51 ` [PATCH 04/13] io_uring: rename function *task_file Pavel Begunkov
@ 2021-05-24 23:51 ` Pavel Begunkov
  2021-05-27 21:48   ` Noah Goldstein
  2021-05-24 23:51 ` [PATCH 06/13] io-wq: don't repeat IO_WQ_BIT_EXIT check by worker Pavel Begunkov
                   ` (7 subsequent siblings)
  12 siblings, 1 reply; 21+ messages in thread
From: Pavel Begunkov @ 2021-05-24 23:51 UTC (permalink / raw)
  To: Jens Axboe, io-uring

If a for/while loop is simple enough, it's preferred over goto with
labels, as it's more explicit and easier to read. So, replace a trivial
goto-based loop in io_wqe_worker() with a while loop.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io-wq.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index a0e43d1b94af..712eb062f822 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -538,12 +538,13 @@ static int io_wqe_worker(void *data)
 		long ret;
 
 		set_current_state(TASK_INTERRUPTIBLE);
-loop:
-		raw_spin_lock_irq(&wqe->lock);
-		if (io_wqe_run_queue(wqe)) {
+		while (1) {
+			raw_spin_lock_irq(&wqe->lock);
+			if (!io_wqe_run_queue(wqe))
+				break;
 			io_worker_handle_work(worker);
-			goto loop;
 		}
+
 		__io_worker_idle(wqe, worker);
 		raw_spin_unlock_irq(&wqe->lock);
 		if (io_flush_signals())
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 06/13] io-wq: don't repeat IO_WQ_BIT_EXIT check by worker
  2021-05-24 23:50 [PATCH for-next 00/13] 5.14 batch 2 Pavel Begunkov
                   ` (4 preceding siblings ...)
  2021-05-24 23:51 ` [PATCH 05/13] io-wq: replace goto while Pavel Begunkov
@ 2021-05-24 23:51 ` Pavel Begunkov
  2021-05-24 23:51 ` [PATCH 07/13] io-wq: simplify worker exiting Pavel Begunkov
                   ` (6 subsequent siblings)
  12 siblings, 0 replies; 21+ messages in thread
From: Pavel Begunkov @ 2021-05-24 23:51 UTC (permalink / raw)
  To: Jens Axboe, io-uring

io_wqe_worker()'s main loop already checks the IO_WQ_BIT_EXIT flag, so
there is no need for a second test_bit() at the end: the loop will
immediately jump back to the first check anyway.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io-wq.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 712eb062f822..27a9ebbbf68e 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -560,8 +560,7 @@ static int io_wqe_worker(void *data)
 		if (ret)
 			continue;
 		/* timed out, exit unless we're the fixed worker */
-		if (test_bit(IO_WQ_BIT_EXIT, &wq->state) ||
-		    !(worker->flags & IO_WORKER_F_FIXED))
+		if (!(worker->flags & IO_WORKER_F_FIXED))
 			break;
 	}
 
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 07/13] io-wq: simplify worker exiting
  2021-05-24 23:50 [PATCH for-next 00/13] 5.14 batch 2 Pavel Begunkov
                   ` (5 preceding siblings ...)
  2021-05-24 23:51 ` [PATCH 06/13] io-wq: don't repeat IO_WQ_BIT_EXIT check by worker Pavel Begunkov
@ 2021-05-24 23:51 ` Pavel Begunkov
  2021-05-24 23:51 ` [PATCH 08/13] io_uring: hide rsrc tag copy into generic helpers Pavel Begunkov
                   ` (5 subsequent siblings)
  12 siblings, 0 replies; 21+ messages in thread
From: Pavel Begunkov @ 2021-05-24 23:51 UTC (permalink / raw)
  To: Jens Axboe, io-uring

io_worker_handle_work() already takes care of the empty-list case and
releases the spinlock, so get rid of the ugly conditional unlocking and
call handle_work() unconditionally.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io-wq.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 27a9ebbbf68e..c57dd50d24d9 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -566,10 +566,7 @@ static int io_wqe_worker(void *data)
 
 	if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
 		raw_spin_lock_irq(&wqe->lock);
-		if (!wq_list_empty(&wqe->work_list))
-			io_worker_handle_work(worker);
-		else
-			raw_spin_unlock_irq(&wqe->lock);
+		io_worker_handle_work(worker);
 	}
 
 	io_worker_exit(worker);
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 08/13] io_uring: hide rsrc tag copy into generic helpers
  2021-05-24 23:50 [PATCH for-next 00/13] 5.14 batch 2 Pavel Begunkov
                   ` (6 preceding siblings ...)
  2021-05-24 23:51 ` [PATCH 07/13] io-wq: simplify worker exiting Pavel Begunkov
@ 2021-05-24 23:51 ` Pavel Begunkov
  2021-05-24 23:51 ` [PATCH 09/13] io_uring: remove rsrc put work irq save/restore Pavel Begunkov
                   ` (4 subsequent siblings)
  12 siblings, 0 replies; 21+ messages in thread
From: Pavel Begunkov @ 2021-05-24 23:51 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Make io_rsrc_data_alloc() take care of loading rsrc tags on
registration, so we don't need to repeat it for each new rsrc type.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 55 +++++++++++++++++++++++++--------------------------
 1 file changed, 27 insertions(+), 28 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index c34df3e01151..24178ca660c4 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7156,27 +7156,38 @@ static void io_rsrc_data_free(struct io_rsrc_data *data)
 	kfree(data);
 }
 
-static struct io_rsrc_data *io_rsrc_data_alloc(struct io_ring_ctx *ctx,
-					       rsrc_put_fn *do_put,
-					       unsigned nr)
+static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
+			      u64 __user *utags, unsigned nr,
+			      struct io_rsrc_data **pdata)
 {
 	struct io_rsrc_data *data;
+	unsigned i;
 
 	data = kzalloc(sizeof(*data), GFP_KERNEL);
 	if (!data)
-		return NULL;
+		return -ENOMEM;
 
 	data->tags = kvcalloc(nr, sizeof(*data->tags), GFP_KERNEL);
 	if (!data->tags) {
 		kfree(data);
-		return NULL;
+		return -ENOMEM;
+	}
+	if (utags) {
+		for (i = 0; i < nr; i++) {
+			if (copy_from_user(&data->tags[i], &utags[i],
+					   sizeof(data->tags[i]))) {
+				io_rsrc_data_free(data);
+				return -EFAULT;
+			}
+		}
 	}
 
 	atomic_set(&data->refs, 1);
 	data->ctx = ctx;
 	data->do_put = do_put;
 	init_completion(&data->done);
-	return data;
+	*pdata = data;
+	return 0;
 }
 
 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
@@ -7628,7 +7639,6 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 	struct file *file;
 	int fd, ret;
 	unsigned i;
-	struct io_rsrc_data *file_data;
 
 	if (ctx->file_data)
 		return -EBUSY;
@@ -7639,27 +7649,24 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 	ret = io_rsrc_node_switch_start(ctx);
 	if (ret)
 		return ret;
+	ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
+				 &ctx->file_data);
+	if (ret)
+		return ret;
 
-	file_data = io_rsrc_data_alloc(ctx, io_rsrc_file_put, nr_args);
-	if (!file_data)
-		return -ENOMEM;
-	ctx->file_data = file_data;
 	ret = -ENOMEM;
 	if (!io_alloc_file_tables(&ctx->file_table, nr_args))
 		goto out_free;
 
 	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
-		u64 tag = 0;
-
-		if ((tags && copy_from_user(&tag, &tags[i], sizeof(tag))) ||
-		    copy_from_user(&fd, &fds[i], sizeof(fd))) {
+		if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
 			ret = -EFAULT;
 			goto out_fput;
 		}
 		/* allow sparse sets */
 		if (fd == -1) {
 			ret = -EINVAL;
-			if (unlikely(tag))
+			if (unlikely(ctx->file_data->tags[i]))
 				goto out_fput;
 			continue;
 		}
@@ -7680,7 +7687,6 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 			fput(file);
 			goto out_fput;
 		}
-		ctx->file_data->tags[i] = tag;
 		io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file);
 	}
 
@@ -8395,9 +8401,9 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 	ret = io_rsrc_node_switch_start(ctx);
 	if (ret)
 		return ret;
-	data = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, nr_args);
-	if (!data)
-		return -ENOMEM;
+	ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
+	if (ret)
+		return ret;
 	ret = io_buffers_map_alloc(ctx, nr_args);
 	if (ret) {
 		io_rsrc_data_free(data);
@@ -8405,19 +8411,13 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 	}
 
 	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
-		u64 tag = 0;
-
-		if (tags && copy_from_user(&tag, &tags[i], sizeof(tag))) {
-			ret = -EFAULT;
-			break;
-		}
 		ret = io_copy_iov(ctx, &iov, arg, i);
 		if (ret)
 			break;
 		ret = io_buffer_validate(&iov);
 		if (ret)
 			break;
-		if (!iov.iov_base && tag) {
+		if (!iov.iov_base && data->tags[i]) {
 			ret = -EINVAL;
 			break;
 		}
@@ -8426,7 +8426,6 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 					     &last_hpage);
 		if (ret)
 			break;
-		data->tags[i] = tag;
 	}
 
 	WARN_ON_ONCE(ctx->buf_data);
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 09/13] io_uring: remove rsrc put work irq save/restore
  2021-05-24 23:50 [PATCH for-next 00/13] 5.14 batch 2 Pavel Begunkov
                   ` (7 preceding siblings ...)
  2021-05-24 23:51 ` [PATCH 08/13] io_uring: hide rsrc tag copy into generic helpers Pavel Begunkov
@ 2021-05-24 23:51 ` Pavel Begunkov
  2021-05-24 23:51 ` [PATCH 10/13] io_uring: add helpers for 2 level table alloc Pavel Begunkov
                   ` (3 subsequent siblings)
  12 siblings, 0 replies; 21+ messages in thread
From: Pavel Begunkov @ 2021-05-24 23:51 UTC (permalink / raw)
  To: Jens Axboe, io-uring

io_rsrc_put_work() is executed by a workqueue in non-irq context, so
there is no need for the irqsave/restore variants of spinlocking.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 24178ca660c4..40b70c34c1b2 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7550,14 +7550,13 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
 
 		if (prsrc->tag) {
 			bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL;
-			unsigned long flags;
 
 			io_ring_submit_lock(ctx, lock_ring);
-			spin_lock_irqsave(&ctx->completion_lock, flags);
+			spin_lock_irq(&ctx->completion_lock);
 			io_cqring_fill_event(ctx, prsrc->tag, 0, 0);
 			ctx->cq_extra++;
 			io_commit_cqring(ctx);
-			spin_unlock_irqrestore(&ctx->completion_lock, flags);
+			spin_unlock_irq(&ctx->completion_lock);
 			io_cqring_ev_posted(ctx);
 			io_ring_submit_unlock(ctx, lock_ring);
 		}
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 10/13] io_uring: add helpers for 2 level table alloc
  2021-05-24 23:50 [PATCH for-next 00/13] 5.14 batch 2 Pavel Begunkov
                   ` (8 preceding siblings ...)
  2021-05-24 23:51 ` [PATCH 09/13] io_uring: remove rsrc put work irq save/restore Pavel Begunkov
@ 2021-05-24 23:51 ` Pavel Begunkov
  2021-05-27 21:43   ` Noah Goldstein
  2021-05-24 23:51 ` [PATCH 11/13] io_uring: don't vmalloc rsrc tags Pavel Begunkov
                   ` (2 subsequent siblings)
  12 siblings, 1 reply; 21+ messages in thread
From: Pavel Begunkov @ 2021-05-24 23:51 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Some parts, like the fixed file table, use 2 level tables. Factor out
helpers for allocating/deallocating them, as more users are to come.
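
As a rough illustration of the 2 level layout (a hypothetical standalone
helper, not code from this patch): entries are packed into PAGE_SIZE
chunks, the first level is an array of pointers to those chunks, and a
lookup is just two dereferences:

/* sketch: find entry i in a two-level table of fixed-size entries,
 * assuming entry_size divides PAGE_SIZE evenly
 */
static inline void *two_level_lookup(void **table, size_t entry_size,
                                     unsigned int i)
{
        unsigned int per_page = PAGE_SIZE / entry_size;

        return (char *)table[i / per_page] + (i % per_page) * entry_size;
}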

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 73 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 43 insertions(+), 30 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 40b70c34c1b2..1cc2d16637ff 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7054,14 +7054,36 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
 }
 
-static void io_free_file_tables(struct io_file_table *table, unsigned nr_files)
+static void io_free_page_table(void **table, size_t size)
 {
-	unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE);
+	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
 
 	for (i = 0; i < nr_tables; i++)
-		kfree(table->files[i]);
-	kfree(table->files);
-	table->files = NULL;
+		kfree(table[i]);
+	kfree(table);
+}
+
+static void **io_alloc_page_table(size_t size)
+{
+	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
+	size_t init_size = size;
+	void **table;
+
+	table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL);
+	if (!table)
+		return NULL;
+
+	for (i = 0; i < nr_tables; i++) {
+		unsigned int this_size = min(size, PAGE_SIZE);
+
+		table[i] = kzalloc(this_size, GFP_KERNEL);
+		if (!table[i]) {
+			io_free_page_table(table, init_size);
+			return NULL;
+		}
+		size -= this_size;
+	}
+	return table;
 }
 
 static inline void io_rsrc_ref_lock(struct io_ring_ctx *ctx)
@@ -7190,6 +7212,22 @@ static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
 	return 0;
 }
 
+static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
+{
+	size_t size = nr_files * sizeof(struct io_fixed_file);
+
+	table->files = (struct io_fixed_file **)io_alloc_page_table(size);
+	return !!table->files;
+}
+
+static void io_free_file_tables(struct io_file_table *table, unsigned nr_files)
+{
+	size_t size = nr_files * sizeof(struct io_fixed_file);
+
+	io_free_page_table((void **)table->files, size);
+	table->files = NULL;
+}
+
 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
 {
 #if defined(CONFIG_UNIX)
@@ -7451,31 +7489,6 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx)
 }
 #endif
 
-static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
-{
-	unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE);
-
-	table->files = kcalloc(nr_tables, sizeof(*table->files), GFP_KERNEL);
-	if (!table->files)
-		return false;
-
-	for (i = 0; i < nr_tables; i++) {
-		unsigned int this_files = min(nr_files, IORING_MAX_FILES_TABLE);
-
-		table->files[i] = kcalloc(this_files, sizeof(*table->files[i]),
-					GFP_KERNEL);
-		if (!table->files[i])
-			break;
-		nr_files -= this_files;
-	}
-
-	if (i == nr_tables)
-		return true;
-
-	io_free_file_tables(table, nr_tables * IORING_MAX_FILES_TABLE);
-	return false;
-}
-
 static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
 {
 	struct file *file = prsrc->file;
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 11/13] io_uring: don't vmalloc rsrc tags
  2021-05-24 23:50 [PATCH for-next 00/13] 5.14 batch 2 Pavel Begunkov
                   ` (9 preceding siblings ...)
  2021-05-24 23:51 ` [PATCH 10/13] io_uring: add helpers for 2 level table alloc Pavel Begunkov
@ 2021-05-24 23:51 ` Pavel Begunkov
  2021-05-24 23:51 ` [PATCH 12/13] io_uring: cache task struct refs Pavel Begunkov
  2021-05-24 23:51 ` [PATCH 13/13] io_uring: unify SQPOLL and user task cancellations Pavel Begunkov
  12 siblings, 0 replies; 21+ messages in thread
From: Pavel Begunkov @ 2021-05-24 23:51 UTC (permalink / raw)
  To: Jens Axboe, io-uring

We don't really need vmalloc for keeping tags; it's not a hot path and
vmalloc was only there out of convenience. Replace it with two level
tables to avoid littering kernel virtual memory mappings.
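
As a concrete example of the indexing done by io_get_tag_slot() below,
assuming the common 4K PAGE_SIZE: with IO_RSRC_TAG_TABLE_SHIFT = 9 each
second-level page holds 512 u64 tags (512 * 8 = 4096 bytes), so e.g.
for tag index 1000:

        table_idx = 1000 >> IO_RSRC_TAG_TABLE_SHIFT;  /* 1000 / 512 = 1   */
        off       = 1000 &  IO_RSRC_TAG_TABLE_MASK;   /* 1000 % 512 = 488 */
        tag       = data->tags[table_idx][off];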

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 52 +++++++++++++++++++++++++++++++++++----------------
 1 file changed, 36 insertions(+), 16 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 1cc2d16637ff..2b2d70a58a87 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -100,6 +100,10 @@
 #define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
 				 IORING_REGISTER_LAST + IORING_OP_LAST)
 
+#define IO_RSRC_TAG_TABLE_SHIFT	9
+#define IO_RSRC_TAG_TABLE_MAX	(1U << IO_RSRC_TAG_TABLE_SHIFT)
+#define IO_RSRC_TAG_TABLE_MASK	(IO_RSRC_TAG_TABLE_MAX - 1)
+
 #define IORING_MAX_REG_BUFFERS	(1U << 14)
 
 #define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK|	\
@@ -243,7 +247,8 @@ typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
 struct io_rsrc_data {
 	struct io_ring_ctx		*ctx;
 
-	u64				*tags;
+	u64				**tags;
+	unsigned int			nr;
 	rsrc_put_fn			*do_put;
 	atomic_t			refs;
 	struct completion		done;
@@ -7172,9 +7177,20 @@ static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ct
 	return ret;
 }
 
+static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
+{
+	unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK;
+	unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT;
+
+	return &data->tags[table_idx][off];
+}
+
 static void io_rsrc_data_free(struct io_rsrc_data *data)
 {
-	kvfree(data->tags);
+	size_t size = data->nr * sizeof(data->tags[0][0]);
+
+	if (data->tags)
+		io_free_page_table((void **)data->tags, size);
 	kfree(data);
 }
 
@@ -7183,33 +7199,37 @@ static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
 			      struct io_rsrc_data **pdata)
 {
 	struct io_rsrc_data *data;
+	int ret = -ENOMEM;
 	unsigned i;
 
 	data = kzalloc(sizeof(*data), GFP_KERNEL);
 	if (!data)
 		return -ENOMEM;
-
-	data->tags = kvcalloc(nr, sizeof(*data->tags), GFP_KERNEL);
+	data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
 	if (!data->tags) {
 		kfree(data);
 		return -ENOMEM;
 	}
+
+	data->nr = nr;
+	data->ctx = ctx;
+	data->do_put = do_put;
 	if (utags) {
+		ret = -EFAULT;
 		for (i = 0; i < nr; i++) {
-			if (copy_from_user(&data->tags[i], &utags[i],
-					   sizeof(data->tags[i]))) {
-				io_rsrc_data_free(data);
-				return -EFAULT;
-			}
+			if (copy_from_user(io_get_tag_slot(data, i), &utags[i],
+					   sizeof(data->tags[i])))
+				goto fail;
 		}
 	}
 
 	atomic_set(&data->refs, 1);
-	data->ctx = ctx;
-	data->do_put = do_put;
 	init_completion(&data->done);
 	*pdata = data;
 	return 0;
+fail:
+	io_rsrc_data_free(data);
+	return ret;
 }
 
 static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
@@ -7678,7 +7698,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 		/* allow sparse sets */
 		if (fd == -1) {
 			ret = -EINVAL;
-			if (unlikely(ctx->file_data->tags[i]))
+			if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
 				goto out_fput;
 			continue;
 		}
@@ -7776,7 +7796,7 @@ static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
 	if (!prsrc)
 		return -ENOMEM;
 
-	prsrc->tag = data->tags[idx];
+	prsrc->tag = *io_get_tag_slot(data, idx);
 	prsrc->rsrc = rsrc;
 	list_add(&prsrc->list, &node->rsrc_list);
 	return 0;
@@ -7846,7 +7866,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 				err = -EBADF;
 				break;
 			}
-			data->tags[up->offset + done] = tag;
+			*io_get_tag_slot(data, up->offset + done) = tag;
 			io_fixed_file_set(file_slot, file);
 			err = io_sqe_file_register(ctx, file, i);
 			if (err) {
@@ -8429,7 +8449,7 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 		ret = io_buffer_validate(&iov);
 		if (ret)
 			break;
-		if (!iov.iov_base && data->tags[i]) {
+		if (!iov.iov_base && *io_get_tag_slot(data, i)) {
 			ret = -EINVAL;
 			break;
 		}
@@ -8502,7 +8522,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
 		}
 
 		ctx->user_bufs[i] = imu;
-		ctx->buf_data->tags[offset] = tag;
+		*io_get_tag_slot(ctx->buf_data, offset) = tag;
 	}
 
 	if (needs_switch)
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 12/13] io_uring: cache task struct refs
  2021-05-24 23:50 [PATCH for-next 00/13] 5.14 batch 2 Pavel Begunkov
                   ` (10 preceding siblings ...)
  2021-05-24 23:51 ` [PATCH 11/13] io_uring: don't vmalloc rsrc tags Pavel Begunkov
@ 2021-05-24 23:51 ` Pavel Begunkov
  2021-05-27 21:51   ` Noah Goldstein
  2021-05-24 23:51 ` [PATCH 13/13] io_uring: unify SQPOLL and user task cancellations Pavel Begunkov
  12 siblings, 1 reply; 21+ messages in thread
From: Pavel Begunkov @ 2021-05-24 23:51 UTC (permalink / raw)
  To: Jens Axboe, io-uring

tctx in the submission path is always synchronised because it is
executed from the task's context, so we can batch allocate tctx/task
references and store them across syscall boundaries. That saves a good
few operations, including an atomic for grabbing a task ref and a
percpu_counter_add() call, which still falls back to a spinlock for
larger batches (around >=32). Should be good for SQPOLL submitting in
small portions and for bpf submissions coming at some point.
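
As a worked example of the refill logic below: if cached_refs is 5 and a
syscall submits nr = 8 requests, the cache drops to -3, so we take
refill = 3 + IO_TCTX_REFS_CACHE_NR = 1027 task/inflight references in
one batch and cached_refs ends up back at IO_TCTX_REFS_CACHE_NR (1024).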

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 37 ++++++++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2b2d70a58a87..a95d55a0f9be 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -110,6 +110,8 @@
 				IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
 				IOSQE_BUFFER_SELECT)
 
+#define IO_TCTX_REFS_CACHE_NR	(1U << 10)
+
 struct io_uring {
 	u32 head ____cacheline_aligned_in_smp;
 	u32 tail ____cacheline_aligned_in_smp;
@@ -472,6 +474,7 @@ struct io_ring_ctx {
 
 struct io_uring_task {
 	/* submission side */
+	int			cached_refs;
 	struct xarray		xa;
 	struct wait_queue_head	wait;
 	const struct io_ring_ctx *last;
@@ -6702,16 +6705,23 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
 
 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 {
+	struct io_uring_task *tctx;
 	int submitted = 0;
 
 	/* make sure SQ entry isn't read before tail */
 	nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
-
 	if (!percpu_ref_tryget_many(&ctx->refs, nr))
 		return -EAGAIN;
 
-	percpu_counter_add(&current->io_uring->inflight, nr);
-	refcount_add(nr, &current->usage);
+	tctx = current->io_uring;
+	tctx->cached_refs -= nr;
+	if (unlikely(tctx->cached_refs < 0)) {
+		unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
+
+		percpu_counter_add(&tctx->inflight, refill);
+		refcount_add(refill, &current->usage);
+		tctx->cached_refs += refill;
+	}
 	io_submit_state_start(&ctx->submit_state, nr);
 
 	while (submitted < nr) {
@@ -6737,12 +6747,10 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 
 	if (unlikely(submitted != nr)) {
 		int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
-		struct io_uring_task *tctx = current->io_uring;
 		int unused = nr - ref_used;
 
+		current->io_uring->cached_refs += unused;
 		percpu_ref_put_many(&ctx->refs, unused);
-		percpu_counter_sub(&tctx->inflight, unused);
-		put_task_struct_many(current, unused);
 	}
 
 	io_submit_state_end(&ctx->submit_state, ctx);
@@ -7924,7 +7932,7 @@ static int io_uring_alloc_task_context(struct task_struct *task,
 	struct io_uring_task *tctx;
 	int ret;
 
-	tctx = kmalloc(sizeof(*tctx), GFP_KERNEL);
+	tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
 	if (unlikely(!tctx))
 		return -ENOMEM;
 
@@ -7944,13 +7952,11 @@ static int io_uring_alloc_task_context(struct task_struct *task,
 
 	xa_init(&tctx->xa);
 	init_waitqueue_head(&tctx->wait);
-	tctx->last = NULL;
 	atomic_set(&tctx->in_idle, 0);
 	atomic_set(&tctx->inflight_tracked, 0);
 	task->io_uring = tctx;
 	spin_lock_init(&tctx->task_lock);
 	INIT_WQ_LIST(&tctx->task_list);
-	tctx->task_state = 0;
 	init_task_work(&tctx->task_work, tctx_task_work);
 	return 0;
 }
@@ -7961,6 +7967,7 @@ void __io_uring_free(struct task_struct *tsk)
 
 	WARN_ON_ONCE(!xa_empty(&tctx->xa));
 	WARN_ON_ONCE(tctx->io_wq);
+	WARN_ON_ONCE(tctx->cached_refs);
 
 	percpu_counter_destroy(&tctx->inflight);
 	kfree(tctx);
@@ -9097,6 +9104,16 @@ static void io_uring_try_cancel(bool cancel_all)
 	}
 }
 
+static void io_uring_drop_tctx_refs(struct task_struct *task)
+{
+	struct io_uring_task *tctx = task->io_uring;
+	unsigned int refs = tctx->cached_refs;
+
+	tctx->cached_refs = 0;
+	percpu_counter_sub(&tctx->inflight, refs);
+	put_task_struct_many(task, refs);
+}
+
 /* should only be called by SQPOLL task */
 static void io_uring_cancel_sqpoll(struct io_sq_data *sqd)
 {
@@ -9112,6 +9129,7 @@ static void io_uring_cancel_sqpoll(struct io_sq_data *sqd)
 
 	WARN_ON_ONCE(!sqd || sqd->thread != current);
 
+	io_uring_drop_tctx_refs(current);
 	atomic_inc(&tctx->in_idle);
 	do {
 		/* read completions before cancelations */
@@ -9149,6 +9167,7 @@ void __io_uring_cancel(struct files_struct *files)
 		io_wq_exit_start(tctx->io_wq);
 
 	/* make sure overflow events are dropped */
+	io_uring_drop_tctx_refs(current);
 	atomic_inc(&tctx->in_idle);
 	do {
 		/* read completions before cancelations */
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 13/13] io_uring: unify SQPOLL and user task cancellations
  2021-05-24 23:50 [PATCH for-next 00/13] 5.14 batch 2 Pavel Begunkov
                   ` (11 preceding siblings ...)
  2021-05-24 23:51 ` [PATCH 12/13] io_uring: cache task struct refs Pavel Begunkov
@ 2021-05-24 23:51 ` Pavel Begunkov
  12 siblings, 0 replies; 21+ messages in thread
From: Pavel Begunkov @ 2021-05-24 23:51 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Merge io_uring_cancel_sqpoll() and __io_uring_cancel(), as it's easier
to have a conditional ctx traversal inside than to keep them in sync.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 89 +++++++++++++++++----------------------------------
 1 file changed, 30 insertions(+), 59 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index a95d55a0f9be..7db6aaf31080 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1036,7 +1036,7 @@ static void io_uring_del_tctx_node(unsigned long index);
 static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 					 struct task_struct *task,
 					 bool cancel_all);
-static void io_uring_cancel_sqpoll(struct io_sq_data *sqd);
+static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
 static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);
 
 static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
@@ -6921,7 +6921,7 @@ static int io_sq_thread(void *data)
 		timeout = jiffies + sqd->sq_thread_idle;
 	}
 
-	io_uring_cancel_sqpoll(sqd);
+	io_uring_cancel_generic(true, sqd);
 	sqd->thread = NULL;
 	list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
 		io_ring_set_wakeup_flag(ctx);
@@ -9089,21 +9089,6 @@ static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
 	return percpu_counter_sum(&tctx->inflight);
 }
 
-static void io_uring_try_cancel(bool cancel_all)
-{
-	struct io_uring_task *tctx = current->io_uring;
-	struct io_tctx_node *node;
-	unsigned long index;
-
-	xa_for_each(&tctx->xa, index, node) {
-		struct io_ring_ctx *ctx = node->ctx;
-
-		/* sqpoll task will cancel all its requests */
-		if (!ctx->sq_data)
-			io_uring_try_cancel_requests(ctx, current, cancel_all);
-	}
-}
-
 static void io_uring_drop_tctx_refs(struct task_struct *task)
 {
 	struct io_uring_task *tctx = task->io_uring;
@@ -9114,69 +9099,50 @@ static void io_uring_drop_tctx_refs(struct task_struct *task)
 	put_task_struct_many(task, refs);
 }
 
-/* should only be called by SQPOLL task */
-static void io_uring_cancel_sqpoll(struct io_sq_data *sqd)
+/*
+ * Find any io_uring ctx that this task has registered or done IO on, and cancel
+ * requests. @sqd should be not-null IIF it's an SQPOLL thread cancellation.
+ */
+static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
 {
 	struct io_uring_task *tctx = current->io_uring;
 	struct io_ring_ctx *ctx;
 	s64 inflight;
 	DEFINE_WAIT(wait);
 
+	WARN_ON_ONCE(sqd && sqd->thread != current);
+
 	if (!current->io_uring)
 		return;
 	if (tctx->io_wq)
 		io_wq_exit_start(tctx->io_wq);
 
-	WARN_ON_ONCE(!sqd || sqd->thread != current);
-
 	io_uring_drop_tctx_refs(current);
 	atomic_inc(&tctx->in_idle);
 	do {
 		/* read completions before cancelations */
-		inflight = tctx_inflight(tctx, false);
+		inflight = tctx_inflight(tctx, !cancel_all);
 		if (!inflight)
 			break;
-		list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
-			io_uring_try_cancel_requests(ctx, current, true);
 
-		prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
-		/*
-		 * If we've seen completions, retry without waiting. This
-		 * avoids a race where a completion comes in before we did
-		 * prepare_to_wait().
-		 */
-		if (inflight == tctx_inflight(tctx, false))
-			schedule();
-		finish_wait(&tctx->wait, &wait);
-	} while (1);
-	atomic_dec(&tctx->in_idle);
-}
+		if (!sqd) {
+			struct io_tctx_node *node;
+			unsigned long index;
 
-/*
- * Find any io_uring fd that this task has registered or done IO on, and cancel
- * requests.
- */
-void __io_uring_cancel(struct files_struct *files)
-{
-	struct io_uring_task *tctx = current->io_uring;
-	DEFINE_WAIT(wait);
-	s64 inflight;
-	bool cancel_all = !files;
-
-	if (tctx->io_wq)
-		io_wq_exit_start(tctx->io_wq);
+			xa_for_each(&tctx->xa, index, node) {
+				/* sqpoll task will cancel all its requests */
+				if (node->ctx->sq_data)
+					continue;
+				io_uring_try_cancel_requests(node->ctx, current,
+							     cancel_all);
+			}
+		} else {
+			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
+				io_uring_try_cancel_requests(ctx, current,
+							     cancel_all);
+		}
 
-	/* make sure overflow events are dropped */
-	io_uring_drop_tctx_refs(current);
-	atomic_inc(&tctx->in_idle);
-	do {
-		/* read completions before cancelations */
-		inflight = tctx_inflight(tctx, !cancel_all);
-		if (!inflight)
-			break;
-		io_uring_try_cancel(cancel_all);
 		prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
-
 		/*
 		 * If we've seen completions, retry without waiting. This
 		 * avoids a race where a completion comes in before we did
@@ -9195,6 +9161,11 @@ void __io_uring_cancel(struct files_struct *files)
 	}
 }
 
+void __io_uring_cancel(struct files_struct *files)
+{
+	io_uring_cancel_generic(!files, NULL);
+}
+
 static void *io_uring_validate_mmap_request(struct file *file,
 					    loff_t pgoff, size_t sz)
 {
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH 10/13] io_uring: add helpers for 2 level table alloc
  2021-05-24 23:51 ` [PATCH 10/13] io_uring: add helpers for 2 level table alloc Pavel Begunkov
@ 2021-05-27 21:43   ` Noah Goldstein
  2021-05-27 22:14     ` Pavel Begunkov
  0 siblings, 1 reply; 21+ messages in thread
From: Noah Goldstein @ 2021-05-27 21:43 UTC (permalink / raw)
  To: Pavel Begunkov; +Cc: Jens Axboe, open list:IO_URING

On Mon, May 24, 2021 at 7:51 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
>
> Some parts, like the fixed file table, use 2 level tables. Factor out
> helpers for allocating/deallocating them, as more users are to come.
>
> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
> ---
>  fs/io_uring.c | 73 ++++++++++++++++++++++++++++++---------------------
>  1 file changed, 43 insertions(+), 30 deletions(-)
>
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 40b70c34c1b2..1cc2d16637ff 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -7054,14 +7054,36 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
>         return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
>  }
>
> -static void io_free_file_tables(struct io_file_table *table, unsigned nr_files)
> +static void io_free_page_table(void **table, size_t size)
>  {
> -       unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE);
> +       unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
>
>         for (i = 0; i < nr_tables; i++)
> -               kfree(table->files[i]);
> -       kfree(table->files);
> -       table->files = NULL;
> +               kfree(table[i]);
> +       kfree(table);
> +}
> +
> +static void **io_alloc_page_table(size_t size)
> +{
> +       unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
> +       size_t init_size = size;
> +       void **table;
> +
> +       table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL);
> +       if (!table)
> +               return NULL;
> +
> +       for (i = 0; i < nr_tables; i++) {
> +               unsigned int this_size = min(size, PAGE_SIZE);
> +
> +               table[i] = kzalloc(this_size, GFP_KERNEL);
> +               if (!table[i]) {
> +                       io_free_page_table(table, init_size);
> +                       return NULL;
Unless zalloc returns non-NULL for size == 0, you are guaranteed to do
this for size <= PAGE_SIZE * (nr_tables - 1).  Possibly worth calculating early?

If you calculate early you could then make the loop:

for (i = 0; i < nr_tables - 1; i++) {
    table[i] = kzalloc(PAGE_SIZE, GFP_KERNEL);
    if (!table[i]) {
        io_free_page_table(table, init_size);
        return NULL;
    }
}

table[i] = kzalloc(size - (nr_tables - 1) * PAGE_SIZE, GFP_KERNEL);
    if (!table[i]) {
        io_free_page_table(table, init_size);
        return NULL;
    }

Which is almost certainly faster.

> +               }
> +               size -= this_size;
> +       }
> +       return table;
>  }
>
>  static inline void io_rsrc_ref_lock(struct io_ring_ctx *ctx)
> @@ -7190,6 +7212,22 @@ static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
>         return 0;
>  }
>
> +static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
> +{
> +       size_t size = nr_files * sizeof(struct io_fixed_file);
> +
> +       table->files = (struct io_fixed_file **)io_alloc_page_table(size);
> +       return !!table->files;
> +}
> +
> +static void io_free_file_tables(struct io_file_table *table, unsigned nr_files)
> +{
> +       size_t size = nr_files * sizeof(struct io_fixed_file);
> +
> +       io_free_page_table((void **)table->files, size);
> +       table->files = NULL;
> +}
> +
>  static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
>  {
>  #if defined(CONFIG_UNIX)
> @@ -7451,31 +7489,6 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx)
>  }
>  #endif
>
> -static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
> -{
> -       unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE);
> -
> -       table->files = kcalloc(nr_tables, sizeof(*table->files), GFP_KERNEL);
> -       if (!table->files)
> -               return false;
> -
> -       for (i = 0; i < nr_tables; i++) {
> -               unsigned int this_files = min(nr_files, IORING_MAX_FILES_TABLE);
> -
> -               table->files[i] = kcalloc(this_files, sizeof(*table->files[i]),
> -                                       GFP_KERNEL);
> -               if (!table->files[i])
> -                       break;
> -               nr_files -= this_files;
> -       }
> -
> -       if (i == nr_tables)
> -               return true;
> -
> -       io_free_file_tables(table, nr_tables * IORING_MAX_FILES_TABLE);
> -       return false;
> -}
> -
>  static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
>  {
>         struct file *file = prsrc->file;
> --
> 2.31.1
>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 05/13] io-wq: replace goto while
  2021-05-24 23:51 ` [PATCH 05/13] io-wq: replace goto while Pavel Begunkov
@ 2021-05-27 21:48   ` Noah Goldstein
  2021-05-27 22:18     ` Pavel Begunkov
  0 siblings, 1 reply; 21+ messages in thread
From: Noah Goldstein @ 2021-05-27 21:48 UTC (permalink / raw)
  To: Pavel Begunkov; +Cc: Jens Axboe, open list:IO_URING

On Mon, May 24, 2021 at 7:51 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
>
> If a for/while loop is simple enough, it's preferred over goto with
> labels, as it's more explicit and easier to read. So, replace a trivial
> goto-based loop in io_wqe_worker() with a while loop.
>
> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
> ---
>  fs/io-wq.c | 9 +++++----
>  1 file changed, 5 insertions(+), 4 deletions(-)
>
> diff --git a/fs/io-wq.c b/fs/io-wq.c
> index a0e43d1b94af..712eb062f822 100644
> --- a/fs/io-wq.c
> +++ b/fs/io-wq.c
> @@ -538,12 +538,13 @@ static int io_wqe_worker(void *data)
>                 long ret;
>
>                 set_current_state(TASK_INTERRUPTIBLE);
> -loop:
> -               raw_spin_lock_irq(&wqe->lock);
> -               if (io_wqe_run_queue(wqe)) {
> +               while (1) {
> +                       raw_spin_lock_irq(&wqe->lock);
Can acquiring the spinlock be hoisted from the loop?
> +                       if (!io_wqe_run_queue(wqe))
> +                               break;
>                         io_worker_handle_work(worker);
> -                       goto loop;
>                 }
> +
>                 __io_worker_idle(wqe, worker);
>                 raw_spin_unlock_irq(&wqe->lock);
>                 if (io_flush_signals())
> --
> 2.31.1
>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 12/13] io_uring: cache task struct refs
  2021-05-24 23:51 ` [PATCH 12/13] io_uring: cache task struct refs Pavel Begunkov
@ 2021-05-27 21:51   ` Noah Goldstein
  2021-05-27 22:13     ` Pavel Begunkov
  0 siblings, 1 reply; 21+ messages in thread
From: Noah Goldstein @ 2021-05-27 21:51 UTC (permalink / raw)
  To: Pavel Begunkov; +Cc: Jens Axboe, open list:IO_URING

On Mon, May 24, 2021 at 7:51 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
>
> tctx in the submission path is always synchronised because it is
> executed from the task's context, so we can batch allocate tctx/task
> references and store them across syscall boundaries. That saves a good
> few operations, including an atomic for grabbing a task ref and a
> percpu_counter_add() call, which still falls back to a spinlock for
> larger batches (around >=32). Should be good for SQPOLL submitting in
> small portions and for bpf submissions coming at some point.
>
> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
> ---
>  fs/io_uring.c | 37 ++++++++++++++++++++++++++++---------
>  1 file changed, 28 insertions(+), 9 deletions(-)
>
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 2b2d70a58a87..a95d55a0f9be 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -110,6 +110,8 @@
>                                 IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
>                                 IOSQE_BUFFER_SELECT)
>
> +#define IO_TCTX_REFS_CACHE_NR  (1U << 10)
> +
>  struct io_uring {
>         u32 head ____cacheline_aligned_in_smp;
>         u32 tail ____cacheline_aligned_in_smp;
> @@ -472,6 +474,7 @@ struct io_ring_ctx {
>
>  struct io_uring_task {
>         /* submission side */
> +       int                     cached_refs;
>         struct xarray           xa;
>         struct wait_queue_head  wait;
>         const struct io_ring_ctx *last;
> @@ -6702,16 +6705,23 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
>
>  static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
>  {
> +       struct io_uring_task *tctx;
>         int submitted = 0;
>
>         /* make sure SQ entry isn't read before tail */
>         nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
> -
>         if (!percpu_ref_tryget_many(&ctx->refs, nr))
>                 return -EAGAIN;
>
> -       percpu_counter_add(&current->io_uring->inflight, nr);
> -       refcount_add(nr, &current->usage);
> +       tctx = current->io_uring;
> +       tctx->cached_refs -= nr;
> +       if (unlikely(tctx->cached_refs < 0)) {
> +               unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;

Might be cleaner to use:

unsigned int refill =  IO_TCTX_REFS_CACHE_NR - tctx->cached_refs;
> +
> +               percpu_counter_add(&tctx->inflight, refill);
> +               refcount_add(refill, &current->usage);
> +               tctx->cached_refs += refill;
> +       }
>         io_submit_state_start(&ctx->submit_state, nr);
>
>         while (submitted < nr) {
> @@ -6737,12 +6747,10 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
>
>         if (unlikely(submitted != nr)) {
>                 int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
> -               struct io_uring_task *tctx = current->io_uring;
>                 int unused = nr - ref_used;
>
> +               current->io_uring->cached_refs += unused;
>                 percpu_ref_put_many(&ctx->refs, unused);
> -               percpu_counter_sub(&tctx->inflight, unused);
> -               put_task_struct_many(current, unused);
>         }
>
>         io_submit_state_end(&ctx->submit_state, ctx);
> @@ -7924,7 +7932,7 @@ static int io_uring_alloc_task_context(struct task_struct *task,
>         struct io_uring_task *tctx;
>         int ret;
>
> -       tctx = kmalloc(sizeof(*tctx), GFP_KERNEL);
> +       tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
>         if (unlikely(!tctx))
>                 return -ENOMEM;
>
> @@ -7944,13 +7952,11 @@ static int io_uring_alloc_task_context(struct task_struct *task,
>
>         xa_init(&tctx->xa);
>         init_waitqueue_head(&tctx->wait);
> -       tctx->last = NULL;
>         atomic_set(&tctx->in_idle, 0);
>         atomic_set(&tctx->inflight_tracked, 0);
>         task->io_uring = tctx;
>         spin_lock_init(&tctx->task_lock);
>         INIT_WQ_LIST(&tctx->task_list);
> -       tctx->task_state = 0;
>         init_task_work(&tctx->task_work, tctx_task_work);
>         return 0;
>  }
> @@ -7961,6 +7967,7 @@ void __io_uring_free(struct task_struct *tsk)
>
>         WARN_ON_ONCE(!xa_empty(&tctx->xa));
>         WARN_ON_ONCE(tctx->io_wq);
> +       WARN_ON_ONCE(tctx->cached_refs);
>
>         percpu_counter_destroy(&tctx->inflight);
>         kfree(tctx);
> @@ -9097,6 +9104,16 @@ static void io_uring_try_cancel(bool cancel_all)
>         }
>  }
>
> +static void io_uring_drop_tctx_refs(struct task_struct *task)
> +{
> +       struct io_uring_task *tctx = task->io_uring;
> +       unsigned int refs = tctx->cached_refs;
> +
> +       tctx->cached_refs = 0;
> +       percpu_counter_sub(&tctx->inflight, refs);
> +       put_task_struct_many(task, refs);
> +}
> +
>  /* should only be called by SQPOLL task */
>  static void io_uring_cancel_sqpoll(struct io_sq_data *sqd)
>  {
> @@ -9112,6 +9129,7 @@ static void io_uring_cancel_sqpoll(struct io_sq_data *sqd)
>
>         WARN_ON_ONCE(!sqd || sqd->thread != current);
>
> +       io_uring_drop_tctx_refs(current);
>         atomic_inc(&tctx->in_idle);
>         do {
>                 /* read completions before cancelations */
> @@ -9149,6 +9167,7 @@ void __io_uring_cancel(struct files_struct *files)
>                 io_wq_exit_start(tctx->io_wq);
>
>         /* make sure overflow events are dropped */
> +       io_uring_drop_tctx_refs(current);
>         atomic_inc(&tctx->in_idle);
>         do {
>                 /* read completions before cancelations */
> --
> 2.31.1
>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 12/13] io_uring: cache task struct refs
  2021-05-27 21:51   ` Noah Goldstein
@ 2021-05-27 22:13     ` Pavel Begunkov
  0 siblings, 0 replies; 21+ messages in thread
From: Pavel Begunkov @ 2021-05-27 22:13 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: Jens Axboe, open list:IO_URING

On 5/27/21 10:51 PM, Noah Goldstein wrote:
> On Mon, May 24, 2021 at 7:51 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
[...]
>>
>> -       percpu_counter_add(&current->io_uring->inflight, nr);
>> -       refcount_add(nr, &current->usage);
>> +       tctx = current->io_uring;
>> +       tctx->cached_refs -= nr;
>> +       if (unlikely(tctx->cached_refs < 0)) {
>> +               unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
> 
> Might be cleaner to use:
> 
> unsigned int refill =  IO_TCTX_REFS_CACHE_NR - tctx->cached_refs;

Did think about it, but left it as is to emphasize that it's negative
and we convert it to positive, so all operations are on positive (and
unsigned, due to C rules) values. Code generation will be the same.


>> +
>> +               percpu_counter_add(&tctx->inflight, refill);
>> +               refcount_add(refill, &current->usage);
>> +               tctx->cached_refs += refill;
>> +       }
>>         io_submit_state_start(&ctx->submit_state, nr);
>>

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 10/13] io_uring: add helpers for 2 level table alloc
  2021-05-27 21:43   ` Noah Goldstein
@ 2021-05-27 22:14     ` Pavel Begunkov
  0 siblings, 0 replies; 21+ messages in thread
From: Pavel Begunkov @ 2021-05-27 22:14 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: Jens Axboe, open list:IO_URING

On 5/27/21 10:43 PM, Noah Goldstein wrote:
> On Mon, May 24, 2021 at 7:51 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
>> -static void io_free_file_tables(struct io_file_table *table, unsigned nr_files)
>> +static void io_free_page_table(void **table, size_t size)
>>  {
>> -       unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE);
>> +       unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
>>
>>         for (i = 0; i < nr_tables; i++)
>> -               kfree(table->files[i]);
>> -       kfree(table->files);
>> -       table->files = NULL;
>> +               kfree(table[i]);
>> +       kfree(table);
>> +}
>> +
>> +static void **io_alloc_page_table(size_t size)
>> +{
>> +       unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
>> +       size_t init_size = size;
>> +       void **table;
>> +
>> +       table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL);
>> +       if (!table)
>> +               return NULL;
>> +
>> +       for (i = 0; i < nr_tables; i++) {
>> +               unsigned int this_size = min(size, PAGE_SIZE);
>> +
>> +               table[i] = kzalloc(this_size, GFP_KERNEL);
>> +               if (!table[i]) {
>> +                       io_free_page_table(table, init_size);
>> +                       return NULL;
> Unless zalloc returns non-NULL for size == 0, you are guaranteed to do
> this for size <= PAGE_SIZE * (nr_tables - 1).  Possibly worth calculating early?

It's far from being a fast path, so I'd rather keep it simpler
and cleaner.

> 
> If you calculate early you could then make the loop:
> 
> for (i = 0; i < nr_tables - 1; i++) {
>     table[i] = kzalloc(PAGE_SIZE, GFP_KERNEL);
>     if (!table[i]) {
>         io_free_page_table(table, init_size);
>         return NULL;
>     }
> }
> 
> table[i] = kzalloc(size - (nr_tables - 1) * PAGE_SIZE, GFP_KERNEL);
>     if (!table[i]) {
>         io_free_page_table(table, init_size);
>         return NULL;
>     }
> 
> Which is almost certainly faster.
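
As a side note, a small user-space sketch comparing the chunk sizes
the two loop shapes produce; PAGE_SIZE and DIV_ROUND_UP are stand-ins
here, and the per-iteration size decrement is assumed from the full
function, which the quote above cuts off:

#include <stdio.h>

#define PAGE_SIZE	4096u			/* stand-in value */
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	size_t size = 3 * PAGE_SIZE + 100;	/* arbitrary example */
	unsigned int nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
	size_t remaining = size;

	/* current shape: min(remaining, PAGE_SIZE) each iteration */
	for (unsigned int i = 0; i < nr_tables; i++) {
		size_t this_size = remaining < PAGE_SIZE ? remaining : PAGE_SIZE;

		printf("chunk %u: %zu bytes\n", i, this_size);
		remaining -= this_size;
	}

	/* proposed shape: nr_tables - 1 full pages, then one tail chunk of
	 * size - (nr_tables - 1) * PAGE_SIZE bytes (100 here); the sizes are
	 * identical, just computed without tracking 'remaining' */
	printf("tail chunk: %zu bytes\n", size - (nr_tables - 1) * PAGE_SIZE);
	return 0;
}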

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 05/13] io-wq: replace goto while
  2021-05-27 21:48   ` Noah Goldstein
@ 2021-05-27 22:18     ` Pavel Begunkov
  0 siblings, 0 replies; 21+ messages in thread
From: Pavel Begunkov @ 2021-05-27 22:18 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: Jens Axboe, open list:IO_URING

On 5/27/21 10:48 PM, Noah Goldstein wrote:
> On Mon, May 24, 2021 at 7:51 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
>>
>> If a for/while loop is simple enough, it's preferred over goto with labels
>> as the former one is more explicit and easier to read. So, replace a
>> trivial goto-based loop in io_wqe_worker() with a while.
>>
>> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
>> ---
>>  fs/io-wq.c | 9 +++++----
>>  1 file changed, 5 insertions(+), 4 deletions(-)
>>
>> diff --git a/fs/io-wq.c b/fs/io-wq.c
>> index a0e43d1b94af..712eb062f822 100644
>> --- a/fs/io-wq.c
>> +++ b/fs/io-wq.c
>> @@ -538,12 +538,13 @@ static int io_wqe_worker(void *data)
>>                 long ret;
>>
>>                 set_current_state(TASK_INTERRUPTIBLE);
>> -loop:
>> -               raw_spin_lock_irq(&wqe->lock);
>> -               if (io_wqe_run_queue(wqe)) {
>> +               while (1) {
>> +                       raw_spin_lock_irq(&wqe->lock);
> Can acquiring the spinlock be hoisted out of the loop?

lock();
while(1) { ... }
unlock();

If you mean this, then no. If the lock were taken inside
io_worker_handle_work() instead, maybe, but not with this commit.
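
To spell the locking out, a user-space analogy of the loop shape, with
pthread stand-ins for wqe->lock; that io_worker_handle_work() drops the
lock while running work is inferred from the surrounding code, not
shown in the quote:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int pending = 3;

static bool run_queue(void) { return pending > 0; }

static void handle_work(void)
{
	/* entered with the lock held, drops it to do the work, and
	 * returns with the lock released, as io_worker_handle_work()
	 * is assumed to do */
	pthread_mutex_unlock(&lock);
	printf("handled one item, %d left\n", --pending);
}

int main(void)
{
	while (1) {
		pthread_mutex_lock(&lock);	/* re-taken every iteration */
		if (!run_queue())
			break;			/* leave with the lock held */
		handle_work();
	}
	/* idle bookkeeping happens here, still under the lock, then unlock */
	pthread_mutex_unlock(&lock);
	return 0;
}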

>> +                       if (!io_wqe_run_queue(wqe))
>> +                               break;
>>                         io_worker_handle_work(worker);
>> -                       goto loop;
>>                 }
>> +
>>                 __io_worker_idle(wqe, worker);
>>                 raw_spin_unlock_irq(&wqe->lock);
>>                 if (io_flush_signals())
>> --
>> 2.31.1
>>

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH 01/13] io-wq: embed wqe ptr array into struct io_wq
  2021-06-14  1:36 [PATCH v2 for-next 00/13] resend of for-next cleanups Pavel Begunkov
@ 2021-06-14  1:36 ` Pavel Begunkov
  0 siblings, 0 replies; 21+ messages in thread
From: Pavel Begunkov @ 2021-06-14  1:36 UTC (permalink / raw)
  To: Jens Axboe, io-uring

io-wq keeps an array of pointers to struct io_wqe, allocate this array
as a part of struct io-wq, it's easier to code and saves an extra
indirection for nearly each io-wq call.
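
For readers unfamiliar with the pattern, a tiny user-space sketch of a
flexible array member allocated together with its parent struct;
struct_size() is open-coded here and the names are illustrative, not
the kernel's:

#include <stdlib.h>

struct node { int id; };

struct table {
	unsigned long state;
	struct node *entries[];		/* tail array, sized at allocation time */
};

int main(void)
{
	unsigned int n = 8;
	/* one allocation covers the struct plus its n trailing pointers,
	 * mirroring struct_size(wq, wqes, nr_node_ids) below */
	struct table *t = calloc(1, sizeof(*t) + n * sizeof(t->entries[0]));

	if (!t)
		return 1;
	t->entries[0] = NULL;		/* then used like a normal array */
	free(t);
	return 0;
}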

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io-wq.c | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index b3e8624a37d0..1ca98fc7d52b 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -102,7 +102,6 @@ struct io_wqe {
  * Per io_wq state
   */
 struct io_wq {
-	struct io_wqe **wqes;
 	unsigned long state;
 
 	free_work_fn *free_work;
@@ -118,6 +117,8 @@ struct io_wq {
 	struct hlist_node cpuhp_node;
 
 	struct task_struct *task;
+
+	struct io_wqe *wqes[];
 };
 
 static enum cpuhp_state io_wq_online;
@@ -907,17 +908,12 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 	if (WARN_ON_ONCE(!data->free_work || !data->do_work))
 		return ERR_PTR(-EINVAL);
 
-	wq = kzalloc(sizeof(*wq), GFP_KERNEL);
+	wq = kzalloc(struct_size(wq, wqes, nr_node_ids), GFP_KERNEL);
 	if (!wq)
 		return ERR_PTR(-ENOMEM);
-
-	wq->wqes = kcalloc(nr_node_ids, sizeof(struct io_wqe *), GFP_KERNEL);
-	if (!wq->wqes)
-		goto err_wq;
-
 	ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node);
 	if (ret)
-		goto err_wqes;
+		goto err_wq;
 
 	refcount_inc(&data->hash->refs);
 	wq->hash = data->hash;
@@ -962,8 +958,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 	cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
 	for_each_node(node)
 		kfree(wq->wqes[node]);
-err_wqes:
-	kfree(wq->wqes);
 err_wq:
 	kfree(wq);
 	return ERR_PTR(ret);
@@ -1036,7 +1030,6 @@ static void io_wq_destroy(struct io_wq *wq)
 		kfree(wqe);
 	}
 	io_wq_put_hash(wq->hash);
-	kfree(wq->wqes);
 	kfree(wq);
 }
 
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2021-06-14  1:36 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-05-24 23:50 [PATCH for-next 00/13] 5.14 batch 2 Pavel Begunkov
2021-05-24 23:51 ` [PATCH 01/13] io-wq: embed wqe ptr array into struct io_wq Pavel Begunkov
2021-05-24 23:51 ` [PATCH 02/13] io-wq: remove unused io-wq refcounting Pavel Begunkov
2021-05-24 23:51 ` [PATCH 03/13] io_uring: refactor io_iopoll_req_issued Pavel Begunkov
2021-05-24 23:51 ` [PATCH 04/13] io_uring: rename function *task_file Pavel Begunkov
2021-05-24 23:51 ` [PATCH 05/13] io-wq: replace goto while Pavel Begunkov
2021-05-27 21:48   ` Noah Goldstein
2021-05-27 22:18     ` Pavel Begunkov
2021-05-24 23:51 ` [PATCH 06/13] io-wq: don't repeat IO_WQ_BIT_EXIT check by worker Pavel Begunkov
2021-05-24 23:51 ` [PATCH 07/13] io-wq: simplify worker exiting Pavel Begunkov
2021-05-24 23:51 ` [PATCH 08/13] io_uring: hide rsrc tag copy into generic helpers Pavel Begunkov
2021-05-24 23:51 ` [PATCH 09/13] io_uring: remove rsrc put work irq save/restore Pavel Begunkov
2021-05-24 23:51 ` [PATCH 10/13] io_uring: add helpers for 2 level table alloc Pavel Begunkov
2021-05-27 21:43   ` Noah Goldstein
2021-05-27 22:14     ` Pavel Begunkov
2021-05-24 23:51 ` [PATCH 11/13] io_uring: don't vmalloc rsrc tags Pavel Begunkov
2021-05-24 23:51 ` [PATCH 12/13] io_uring: cache task struct refs Pavel Begunkov
2021-05-27 21:51   ` Noah Goldstein
2021-05-27 22:13     ` Pavel Begunkov
2021-05-24 23:51 ` [PATCH 13/13] io_uring: unify SQPOLL and user task cancellations Pavel Begunkov
2021-06-14  1:36 [PATCH v2 for-next 00/13] resend of for-next cleanups Pavel Begunkov
2021-06-14  1:36 ` [PATCH 01/13] io-wq: embed wqe ptr array into struct io_wq Pavel Begunkov
