* [PATCH v2 for-next 00/13] resend of for-next cleanups
@ 2021-06-14  1:36 Pavel Begunkov
  2021-06-14  1:36 ` [PATCH 01/13] io-wq: embed wqe ptr array into struct io_wq Pavel Begunkov
                   ` (13 more replies)
  0 siblings, 14 replies; 15+ messages in thread
From: Pavel Begunkov @ 2021-06-14  1:36 UTC (permalink / raw)
  To: Jens Axboe, io-uring

The series is based on the 5.14 branch, with fixes from 5.13 that are
missing there applied on top:

216e5835966a io_uring: fix misaccounting fix buf pinned pages
b16ef427adf3 io_uring: fix data race to avoid potential NULL-deref
3743c1723bfc io-wq: Fix UAF when wakeup wqe in hash waitqueue
17a91051fe63 io_uring/io-wq: close io-wq full-stop gap

v2: rebase
    dropped one unimportant patch

Pavel Begunkov (13):
  io-wq: embed wqe ptr array into struct io_wq
  io-wq: remove unused io-wq refcounting
  io_uring: refactor io_iopoll_req_issued
  io_uring: rename function *task_file
  io-wq: don't repeat IO_WQ_BIT_EXIT check by worker
  io-wq: simplify worker exiting
  io_uring: hide rsrc tag copy into generic helpers
  io_uring: remove rsrc put work irq save/restore
  io_uring: add helpers for 2 level table alloc
  io_uring: don't vmalloc rsrc tags
  io_uring: cache task struct refs
  io_uring: unify SQPOLL and user task cancellations
  io_uring: inline io_iter_do_read()

 fs/io-wq.c    |  29 +----
 fs/io_uring.c | 349 ++++++++++++++++++++++++++------------------------
 2 files changed, 191 insertions(+), 187 deletions(-)

-- 
2.31.1


* [PATCH 01/13] io-wq: embed wqe ptr array into struct io_wq
  2021-06-14  1:36 [PATCH v2 for-next 00/13] resend of for-next cleanups Pavel Begunkov
@ 2021-06-14  1:36 ` Pavel Begunkov
  2021-06-14  1:36 ` [PATCH 02/13] io-wq: remove unused io-wq refcounting Pavel Begunkov
                   ` (12 subsequent siblings)
  13 siblings, 0 replies; 15+ messages in thread
From: Pavel Begunkov @ 2021-06-14  1:36 UTC (permalink / raw)
  To: Jens Axboe, io-uring

io-wq keeps an array of pointers to struct io_wqe. Allocate this array
as a part of struct io_wq itself: it's easier to code and saves an
extra indirection on nearly every io-wq call.
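
As an aside, a minimal userspace sketch of the flexible-array pattern
the patch switches to (generic names, not the kernel's; in the kernel,
the struct_size() helper used below additionally guards the size
computation against overflow):

  #include <stdlib.h>

  struct entry;                          /* opaque per-node state */

  struct table {
          unsigned long state;
          struct entry *slots[];         /* flexible array member, last */
  };

  static struct table *table_alloc(unsigned int n)
  {
          /* one allocation covers the header and all n slot pointers */
          return calloc(1, sizeof(struct table) + n * sizeof(struct entry *));
  }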

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io-wq.c | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index b3e8624a37d0..1ca98fc7d52b 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -102,7 +102,6 @@ struct io_wqe {
  * Per io_wq state
  */
 struct io_wq {
-	struct io_wqe **wqes;
 	unsigned long state;
 
 	free_work_fn *free_work;
@@ -118,6 +117,8 @@ struct io_wq {
 	struct hlist_node cpuhp_node;
 
 	struct task_struct *task;
+
+	struct io_wqe *wqes[];
 };
 
 static enum cpuhp_state io_wq_online;
@@ -907,17 +908,12 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 	if (WARN_ON_ONCE(!data->free_work || !data->do_work))
 		return ERR_PTR(-EINVAL);
 
-	wq = kzalloc(sizeof(*wq), GFP_KERNEL);
+	wq = kzalloc(struct_size(wq, wqes, nr_node_ids), GFP_KERNEL);
 	if (!wq)
 		return ERR_PTR(-ENOMEM);
-
-	wq->wqes = kcalloc(nr_node_ids, sizeof(struct io_wqe *), GFP_KERNEL);
-	if (!wq->wqes)
-		goto err_wq;
-
 	ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node);
 	if (ret)
-		goto err_wqes;
+		goto err_wq;
 
 	refcount_inc(&data->hash->refs);
 	wq->hash = data->hash;
@@ -962,8 +958,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 	cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
 	for_each_node(node)
 		kfree(wq->wqes[node]);
-err_wqes:
-	kfree(wq->wqes);
 err_wq:
 	kfree(wq);
 	return ERR_PTR(ret);
@@ -1036,7 +1030,6 @@ static void io_wq_destroy(struct io_wq *wq)
 		kfree(wqe);
 	}
 	io_wq_put_hash(wq->hash);
-	kfree(wq->wqes);
 	kfree(wq);
 }
 
-- 
2.31.1


* [PATCH 02/13] io-wq: remove unused io-wq refcounting
  2021-06-14  1:36 [PATCH v2 for-next 00/13] resend of for-next cleanups Pavel Begunkov
  2021-06-14  1:36 ` [PATCH 01/13] io-wq: embed wqe ptr array into struct io_wq Pavel Begunkov
@ 2021-06-14  1:36 ` Pavel Begunkov
  2021-06-14  1:36 ` [PATCH 03/13] io_uring: refactor io_iopoll_req_issued Pavel Begunkov
                   ` (11 subsequent siblings)
  13 siblings, 0 replies; 15+ messages in thread
From: Pavel Begunkov @ 2021-06-14  1:36 UTC (permalink / raw)
  To: Jens Axboe, io-uring

wq->refs is initialised to one and only dropped on exit, so it's
effectively unused and we can kill it.
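
Concretely, the refcount is set to one at creation and decremented
exactly once, in io_wq_put_and_exit(), so refcount_dec_and_test() there
always returns true and io_wq_destroy() can simply be called directly.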

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io-wq.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 1ca98fc7d52b..f058ea0bcae8 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -109,8 +109,6 @@ struct io_wq {
 
 	struct io_wq_hash *hash;
 
-	refcount_t refs;
-
 	atomic_t worker_refs;
 	struct completion worker_done;
 
@@ -949,7 +947,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 	}
 
 	wq->task = get_task_struct(data->task);
-	refcount_set(&wq->refs, 1);
 	atomic_set(&wq->worker_refs, 1);
 	init_completion(&wq->worker_done);
 	return wq;
@@ -1038,8 +1035,7 @@ void io_wq_put_and_exit(struct io_wq *wq)
 	WARN_ON_ONCE(!test_bit(IO_WQ_BIT_EXIT, &wq->state));
 
 	io_wq_exit_workers(wq);
-	if (refcount_dec_and_test(&wq->refs))
-		io_wq_destroy(wq);
+	io_wq_destroy(wq);
 }
 
 static bool io_wq_worker_affinity(struct io_worker *worker, void *data)
-- 
2.31.1


* [PATCH 03/13] io_uring: refactor io_iopoll_req_issued
  2021-06-14  1:36 [PATCH v2 for-next 00/13] resend of for-next cleanups Pavel Begunkov
  2021-06-14  1:36 ` [PATCH 01/13] io-wq: embed wqe ptr array into struct io_wq Pavel Begunkov
  2021-06-14  1:36 ` [PATCH 02/13] io-wq: remove unused io-wq refcounting Pavel Begunkov
@ 2021-06-14  1:36 ` Pavel Begunkov
  2021-06-14  1:36 ` [PATCH 04/13] io_uring: rename function *task_file Pavel Begunkov
                   ` (10 subsequent siblings)
  13 siblings, 0 replies; 15+ messages in thread
From: Pavel Begunkov @ 2021-06-14  1:36 UTC (permalink / raw)
  To: Jens Axboe, io-uring

A simple refactoring of io_iopoll_req_issued(): move the in_async
handling inside, so we don't pass it around and save on checking it
twice.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 44 +++++++++++++++++++++-----------------------
 1 file changed, 21 insertions(+), 23 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 485f1f055db3..8fbb48c1ac7a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2520,9 +2520,14 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
  * find it from a io_do_iopoll() thread before the issuer is done
  * accessing the kiocb cookie.
  */
-static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async)
+static void io_iopoll_req_issued(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
+	const bool in_async = io_wq_current_is_worker();
+
+	/* workqueue context doesn't hold uring_lock, grab it now */
+	if (unlikely(in_async))
+		mutex_lock(&ctx->uring_lock);
 
 	/*
 	 * Track whether we have multiple files in our lists. This will impact
@@ -2549,14 +2554,19 @@ static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async)
 	else
 		list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
 
-	/*
-	 * If IORING_SETUP_SQPOLL is enabled, sqes are either handled in sq thread
-	 * task context or in io worker task context. If current task context is
-	 * sq thread, we don't need to check whether should wake up sq thread.
-	 */
-	if (in_async && (ctx->flags & IORING_SETUP_SQPOLL) &&
-	    wq_has_sleeper(&ctx->sq_data->wait))
-		wake_up(&ctx->sq_data->wait);
+	if (unlikely(in_async)) {
+		/*
+		 * If IORING_SETUP_SQPOLL is enabled, sqes are either handled
+		 * in sq thread task context or in io worker task context. If
+		 * current task context is sq thread, we don't need to check
+		 * whether should wake up sq thread.
+		 */
+		if ((ctx->flags & IORING_SETUP_SQPOLL) &&
+		    wq_has_sleeper(&ctx->sq_data->wait))
+			wake_up(&ctx->sq_data->wait);
+
+		mutex_unlock(&ctx->uring_lock);
+	}
 }
 
 static inline void io_state_file_put(struct io_submit_state *state)
@@ -6210,23 +6220,11 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
 
 	if (creds)
 		revert_creds(creds);
-
 	if (ret)
 		return ret;
-
 	/* If the op doesn't have a file, we're not polling for it */
-	if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) {
-		const bool in_async = io_wq_current_is_worker();
-
-		/* workqueue context doesn't hold uring_lock, grab it now */
-		if (in_async)
-			mutex_lock(&ctx->uring_lock);
-
-		io_iopoll_req_issued(req, in_async);
-
-		if (in_async)
-			mutex_unlock(&ctx->uring_lock);
-	}
+	if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file)
+		io_iopoll_req_issued(req);
 
 	return 0;
 }
-- 
2.31.1


* [PATCH 04/13] io_uring: rename function *task_file
  2021-06-14  1:36 [PATCH v2 for-next 00/13] resend of for-next cleanups Pavel Begunkov
                   ` (2 preceding siblings ...)
  2021-06-14  1:36 ` [PATCH 03/13] io_uring: refactor io_iopoll_req_issued Pavel Begunkov
@ 2021-06-14  1:36 ` Pavel Begunkov
  2021-06-14  1:36 ` [PATCH 05/13] io-wq: don't repeat IO_WQ_BIT_EXIT check by worker Pavel Begunkov
                   ` (9 subsequent siblings)
  13 siblings, 0 replies; 15+ messages in thread
From: Pavel Begunkov @ 2021-06-14  1:36 UTC (permalink / raw)
  To: Jens Axboe, io-uring

What were once references to struct file, used to control task/ctx
lifetimes, are now just internal tctx structures/nodes, so rename the
outdated *task_file() routines to something more sensible.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 8fbb48c1ac7a..cea3d0f5dad5 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1024,7 +1024,7 @@ static const struct io_op_def io_op_defs[] = {
 };
 
 static bool io_disarm_next(struct io_kiocb *req);
-static void io_uring_del_task_file(unsigned long index);
+static void io_uring_del_tctx_node(unsigned long index);
 static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 					 struct task_struct *task,
 					 bool cancel_all);
@@ -8709,7 +8709,7 @@ static void io_tctx_exit_cb(struct callback_head *cb)
 	 * node. It'll be removed by the end of cancellation, just ignore it.
 	 */
 	if (!atomic_read(&tctx->in_idle))
-		io_uring_del_task_file((unsigned long)work->ctx);
+		io_uring_del_tctx_node((unsigned long)work->ctx);
 	complete(&work->completion);
 }
 
@@ -8962,7 +8962,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 	}
 }
 
-static int __io_uring_add_task_file(struct io_ring_ctx *ctx)
+static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
 {
 	struct io_uring_task *tctx = current->io_uring;
 	struct io_tctx_node *node;
@@ -8999,19 +8999,19 @@ static int __io_uring_add_task_file(struct io_ring_ctx *ctx)
 /*
  * Note that this task has used io_uring. We use it for cancelation purposes.
  */
-static inline int io_uring_add_task_file(struct io_ring_ctx *ctx)
+static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
 {
 	struct io_uring_task *tctx = current->io_uring;
 
 	if (likely(tctx && tctx->last == ctx))
 		return 0;
-	return __io_uring_add_task_file(ctx);
+	return __io_uring_add_tctx_node(ctx);
 }
 
 /*
  * Remove this io_uring_file -> task mapping.
  */
-static void io_uring_del_task_file(unsigned long index)
+static void io_uring_del_tctx_node(unsigned long index)
 {
 	struct io_uring_task *tctx = current->io_uring;
 	struct io_tctx_node *node;
@@ -9041,7 +9041,7 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx)
 	unsigned long index;
 
 	xa_for_each(&tctx->xa, index, node)
-		io_uring_del_task_file(index);
+		io_uring_del_tctx_node(index);
 	if (wq) {
 		/*
 		 * Must be after io_uring_del_task_file() (removes nodes under
@@ -9325,7 +9325,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 		}
 		submitted = to_submit;
 	} else if (to_submit) {
-		ret = io_uring_add_task_file(ctx);
+		ret = io_uring_add_tctx_node(ctx);
 		if (unlikely(ret))
 			goto out;
 		mutex_lock(&ctx->uring_lock);
@@ -9535,7 +9535,7 @@ static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
 	if (fd < 0)
 		return fd;
 
-	ret = io_uring_add_task_file(ctx);
+	ret = io_uring_add_tctx_node(ctx);
 	if (ret) {
 		put_unused_fd(fd);
 		return ret;
-- 
2.31.1


* [PATCH 05/13] io-wq: don't repeat IO_WQ_BIT_EXIT check by worker
  2021-06-14  1:36 [PATCH v2 for-next 00/13] resend of for-next cleanups Pavel Begunkov
                   ` (3 preceding siblings ...)
  2021-06-14  1:36 ` [PATCH 04/13] io_uring: rename function *task_file Pavel Begunkov
@ 2021-06-14  1:36 ` Pavel Begunkov
  2021-06-14  1:36 ` [PATCH 06/13] io-wq: simplify worker exiting Pavel Begunkov
                   ` (8 subsequent siblings)
  13 siblings, 0 replies; 15+ messages in thread
From: Pavel Begunkov @ 2021-06-14  1:36 UTC (permalink / raw)
  To: Jens Axboe, io-uring

io_wqe_worker()'s main loop already checks the IO_WQ_BIT_EXIT flag, so
there is no need for a second test_bit() at the end: control jumps
straight back to the first check afterwards.
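
Abridged, the loop has the following shape (trimmed down from the
function the diff below touches); when the fixed worker doesn't break
out of the timeout branch, control goes straight back to the test_bit()
at the top, which is what makes the removed check redundant:

  while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
          /* ... wait for and process work ... */
          ret = schedule_timeout(WORKER_IDLE_TIMEOUT);
          if (ret)
                  continue;
          /* timed out, exit unless we're the fixed worker */
          if (!(worker->flags & IO_WORKER_F_FIXED))
                  break;
  }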

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io-wq.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index f058ea0bcae8..8c13e23d4a8a 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -559,8 +559,7 @@ static int io_wqe_worker(void *data)
 		if (ret)
 			continue;
 		/* timed out, exit unless we're the fixed worker */
-		if (test_bit(IO_WQ_BIT_EXIT, &wq->state) ||
-		    !(worker->flags & IO_WORKER_F_FIXED))
+		if (!(worker->flags & IO_WORKER_F_FIXED))
 			break;
 	}
 
-- 
2.31.1


* [PATCH 06/13] io-wq: simplify worker exiting
  2021-06-14  1:36 [PATCH v2 for-next 00/13] resend of for-next cleanups Pavel Begunkov
                   ` (4 preceding siblings ...)
  2021-06-14  1:36 ` [PATCH 05/13] io-wq: don't repeat IO_WQ_BIT_EXIT check by worker Pavel Begunkov
@ 2021-06-14  1:36 ` Pavel Begunkov
  2021-06-14  1:36 ` [PATCH 07/13] io_uring: hide rsrc tag copy into generic helpers Pavel Begunkov
                   ` (7 subsequent siblings)
  13 siblings, 0 replies; 15+ messages in thread
From: Pavel Begunkov @ 2021-06-14  1:36 UTC (permalink / raw)
  To: Jens Axboe, io-uring

io_worker_handle_work() already takes care of the empty-list case and
releases the spinlock, so get rid of the ugly conditional unlocking and
call it unconditionally.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io-wq.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 8c13e23d4a8a..2c37776c0280 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -565,10 +565,7 @@ static int io_wqe_worker(void *data)
 
 	if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
 		raw_spin_lock_irq(&wqe->lock);
-		if (!wq_list_empty(&wqe->work_list))
-			io_worker_handle_work(worker);
-		else
-			raw_spin_unlock_irq(&wqe->lock);
+		io_worker_handle_work(worker);
 	}
 
 	io_worker_exit(worker);
-- 
2.31.1


* [PATCH 07/13] io_uring: hide rsrc tag copy into generic helpers
  2021-06-14  1:36 [PATCH v2 for-next 00/13] resend of for-next cleanups Pavel Begunkov
                   ` (5 preceding siblings ...)
  2021-06-14  1:36 ` [PATCH 06/13] io-wq: simplify worker exiting Pavel Begunkov
@ 2021-06-14  1:36 ` Pavel Begunkov
  2021-06-14  1:36 ` [PATCH 08/13] io_uring: remove rsrc put work irq save/restore Pavel Begunkov
                   ` (6 subsequent siblings)
  13 siblings, 0 replies; 15+ messages in thread
From: Pavel Begunkov @ 2021-06-14  1:36 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Make io_rsrc_data_alloc() take care of loading rsrc tags on
registration, so we don't need to repeat it for each new rsrc type.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 55 +++++++++++++++++++++++++--------------------------
 1 file changed, 27 insertions(+), 28 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index cea3d0f5dad5..18ed6ecb1d76 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7156,27 +7156,38 @@ static void io_rsrc_data_free(struct io_rsrc_data *data)
 	kfree(data);
 }
 
-static struct io_rsrc_data *io_rsrc_data_alloc(struct io_ring_ctx *ctx,
-					       rsrc_put_fn *do_put,
-					       unsigned nr)
+static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
+			      u64 __user *utags, unsigned nr,
+			      struct io_rsrc_data **pdata)
 {
 	struct io_rsrc_data *data;
+	unsigned i;
 
 	data = kzalloc(sizeof(*data), GFP_KERNEL);
 	if (!data)
-		return NULL;
+		return -ENOMEM;
 
 	data->tags = kvcalloc(nr, sizeof(*data->tags), GFP_KERNEL);
 	if (!data->tags) {
 		kfree(data);
-		return NULL;
+		return -ENOMEM;
+	}
+	if (utags) {
+		for (i = 0; i < nr; i++) {
+			if (copy_from_user(&data->tags[i], &utags[i],
+					   sizeof(data->tags[i]))) {
+				io_rsrc_data_free(data);
+				return -EFAULT;
+			}
+		}
 	}
 
 	atomic_set(&data->refs, 1);
 	data->ctx = ctx;
 	data->do_put = do_put;
 	init_completion(&data->done);
-	return data;
+	*pdata = data;
+	return 0;
 }
 
 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
@@ -7628,7 +7639,6 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 	struct file *file;
 	int fd, ret;
 	unsigned i;
-	struct io_rsrc_data *file_data;
 
 	if (ctx->file_data)
 		return -EBUSY;
@@ -7639,27 +7649,24 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 	ret = io_rsrc_node_switch_start(ctx);
 	if (ret)
 		return ret;
+	ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
+				 &ctx->file_data);
+	if (ret)
+		return ret;
 
-	file_data = io_rsrc_data_alloc(ctx, io_rsrc_file_put, nr_args);
-	if (!file_data)
-		return -ENOMEM;
-	ctx->file_data = file_data;
 	ret = -ENOMEM;
 	if (!io_alloc_file_tables(&ctx->file_table, nr_args))
 		goto out_free;
 
 	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
-		u64 tag = 0;
-
-		if ((tags && copy_from_user(&tag, &tags[i], sizeof(tag))) ||
-		    copy_from_user(&fd, &fds[i], sizeof(fd))) {
+		if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
 			ret = -EFAULT;
 			goto out_fput;
 		}
 		/* allow sparse sets */
 		if (fd == -1) {
 			ret = -EINVAL;
-			if (unlikely(tag))
+			if (unlikely(ctx->file_data->tags[i]))
 				goto out_fput;
 			continue;
 		}
@@ -7680,7 +7687,6 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 			fput(file);
 			goto out_fput;
 		}
-		ctx->file_data->tags[i] = tag;
 		io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file);
 	}
 
@@ -8398,9 +8404,9 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 	ret = io_rsrc_node_switch_start(ctx);
 	if (ret)
 		return ret;
-	data = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, nr_args);
-	if (!data)
-		return -ENOMEM;
+	ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
+	if (ret)
+		return ret;
 	ret = io_buffers_map_alloc(ctx, nr_args);
 	if (ret) {
 		io_rsrc_data_free(data);
@@ -8408,19 +8414,13 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 	}
 
 	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
-		u64 tag = 0;
-
-		if (tags && copy_from_user(&tag, &tags[i], sizeof(tag))) {
-			ret = -EFAULT;
-			break;
-		}
 		ret = io_copy_iov(ctx, &iov, arg, i);
 		if (ret)
 			break;
 		ret = io_buffer_validate(&iov);
 		if (ret)
 			break;
-		if (!iov.iov_base && tag) {
+		if (!iov.iov_base && data->tags[i]) {
 			ret = -EINVAL;
 			break;
 		}
@@ -8429,7 +8429,6 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 					     &last_hpage);
 		if (ret)
 			break;
-		data->tags[i] = tag;
 	}
 
 	WARN_ON_ONCE(ctx->buf_data);
-- 
2.31.1


* [PATCH 08/13] io_uring: remove rsrc put work irq save/restore
  2021-06-14  1:36 [PATCH v2 for-next 00/13] resend of for-next cleanups Pavel Begunkov
                   ` (6 preceding siblings ...)
  2021-06-14  1:36 ` [PATCH 07/13] io_uring: hide rsrc tag copy into generic helpers Pavel Begunkov
@ 2021-06-14  1:36 ` Pavel Begunkov
  2021-06-14  1:36 ` [PATCH 09/13] io_uring: add helpers for 2 level table alloc Pavel Begunkov
                   ` (5 subsequent siblings)
  13 siblings, 0 replies; 15+ messages in thread
From: Pavel Begunkov @ 2021-06-14  1:36 UTC (permalink / raw)
  To: Jens Axboe, io-uring

io_rsrc_put_work() is executed by a workqueue in non-irq context, so
there is no need for the irqsave/restore variants of spinlocking.
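
For context, the two locking variants differ as follows (standard
kernel API, shown generically): the _irq form unconditionally disables
local interrupts and the unlock unconditionally re-enables them, which
is only correct when the caller is known to run with interrupts
enabled, as a workqueue worker is; the _irqsave form saves the current
interrupt state and restores it on unlock, at a small extra cost:

  unsigned long flags;

  spin_lock_irq(&ctx->completion_lock);       /* irqs known enabled */
  /* ... critical section ... */
  spin_unlock_irq(&ctx->completion_lock);     /* re-enables irqs */

  spin_lock_irqsave(&ctx->completion_lock, flags);   /* any context */
  /* ... critical section ... */
  spin_unlock_irqrestore(&ctx->completion_lock, flags);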

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 18ed6ecb1d76..59cd9dc6164c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7550,14 +7550,13 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
 
 		if (prsrc->tag) {
 			bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL;
-			unsigned long flags;
 
 			io_ring_submit_lock(ctx, lock_ring);
-			spin_lock_irqsave(&ctx->completion_lock, flags);
+			spin_lock_irq(&ctx->completion_lock);
 			io_cqring_fill_event(ctx, prsrc->tag, 0, 0);
 			ctx->cq_extra++;
 			io_commit_cqring(ctx);
-			spin_unlock_irqrestore(&ctx->completion_lock, flags);
+			spin_unlock_irq(&ctx->completion_lock);
 			io_cqring_ev_posted(ctx);
 			io_ring_submit_unlock(ctx, lock_ring);
 		}
-- 
2.31.1


* [PATCH 09/13] io_uring: add helpers for 2 level table alloc
  2021-06-14  1:36 [PATCH v2 for-next 00/13] resend of for-next cleanups Pavel Begunkov
                   ` (7 preceding siblings ...)
  2021-06-14  1:36 ` [PATCH 08/13] io_uring: remove rsrc put work irq save/restore Pavel Begunkov
@ 2021-06-14  1:36 ` Pavel Begunkov
  2021-06-14  1:36 ` [PATCH 10/13] io_uring: don't vmalloc rsrc tags Pavel Begunkov
                   ` (4 subsequent siblings)
  13 siblings, 0 replies; 15+ messages in thread
From: Pavel Begunkov @ 2021-06-14  1:36 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Some parts, like the fixed file table, use two-level tables. Factor
out helpers for allocating/deallocating them, as more users are to
come.
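
Schematically, a two-level table replaces one large allocation with a
small first-level array of pointers to page-sized second-level chunks,
so no single allocation has to be bigger than a page. A sketch of the
lookup with made-up names, where k entries fit into one chunk:

  /* element i lives in chunk i / k, at offset i % k within it */
  static inline unsigned long *table_slot(unsigned long **table,
                                          unsigned int i, unsigned int k)
  {
          return &table[i / k][i % k];
  }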

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 73 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 43 insertions(+), 30 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 59cd9dc6164c..d6c1322119d6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7054,14 +7054,36 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
 }
 
-static void io_free_file_tables(struct io_file_table *table, unsigned nr_files)
+static void io_free_page_table(void **table, size_t size)
 {
-	unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE);
+	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
 
 	for (i = 0; i < nr_tables; i++)
-		kfree(table->files[i]);
-	kfree(table->files);
-	table->files = NULL;
+		kfree(table[i]);
+	kfree(table);
+}
+
+static void **io_alloc_page_table(size_t size)
+{
+	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
+	size_t init_size = size;
+	void **table;
+
+	table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL);
+	if (!table)
+		return NULL;
+
+	for (i = 0; i < nr_tables; i++) {
+		unsigned int this_size = min(size, PAGE_SIZE);
+
+		table[i] = kzalloc(this_size, GFP_KERNEL);
+		if (!table[i]) {
+			io_free_page_table(table, init_size);
+			return NULL;
+		}
+		size -= this_size;
+	}
+	return table;
 }
 
 static inline void io_rsrc_ref_lock(struct io_ring_ctx *ctx)
@@ -7190,6 +7212,22 @@ static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
 	return 0;
 }
 
+static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
+{
+	size_t size = nr_files * sizeof(struct io_fixed_file);
+
+	table->files = (struct io_fixed_file **)io_alloc_page_table(size);
+	return !!table->files;
+}
+
+static void io_free_file_tables(struct io_file_table *table, unsigned nr_files)
+{
+	size_t size = nr_files * sizeof(struct io_fixed_file);
+
+	io_free_page_table((void **)table->files, size);
+	table->files = NULL;
+}
+
 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
 {
 #if defined(CONFIG_UNIX)
@@ -7451,31 +7489,6 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx)
 }
 #endif
 
-static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
-{
-	unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE);
-
-	table->files = kcalloc(nr_tables, sizeof(*table->files), GFP_KERNEL);
-	if (!table->files)
-		return false;
-
-	for (i = 0; i < nr_tables; i++) {
-		unsigned int this_files = min(nr_files, IORING_MAX_FILES_TABLE);
-
-		table->files[i] = kcalloc(this_files, sizeof(*table->files[i]),
-					GFP_KERNEL);
-		if (!table->files[i])
-			break;
-		nr_files -= this_files;
-	}
-
-	if (i == nr_tables)
-		return true;
-
-	io_free_file_tables(table, nr_tables * IORING_MAX_FILES_TABLE);
-	return false;
-}
-
 static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
 {
 	struct file *file = prsrc->file;
-- 
2.31.1


* [PATCH 10/13] io_uring: don't vmalloc rsrc tags
  2021-06-14  1:36 [PATCH v2 for-next 00/13] resend of for-next cleanups Pavel Begunkov
                   ` (8 preceding siblings ...)
  2021-06-14  1:36 ` [PATCH 09/13] io_uring: add helpers for 2 level table alloc Pavel Begunkov
@ 2021-06-14  1:36 ` Pavel Begunkov
  2021-06-14  1:36 ` [PATCH 11/13] io_uring: cache task struct refs Pavel Begunkov
                   ` (3 subsequent siblings)
  13 siblings, 0 replies; 15+ messages in thread
From: Pavel Begunkov @ 2021-06-14  1:36 UTC (permalink / raw)
  To: Jens Axboe, io-uring

We don't really need vmalloc for keeping tags: it's not a hot path and
was only used out of convenience. Replace it with two-level tables so
as not to litter kernel virtual memory mappings.
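
As a worked example of the indexing this sets up: with
IO_RSRC_TAG_TABLE_SHIFT = 9, each second-level table holds 512 tags,
so tag index 1000 resolves to

  table_idx = 1000 >> 9  = 1
  off       = 1000 & 511 = 488

i.e. the tag lives at data->tags[1][488].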

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 52 +++++++++++++++++++++++++++++++++++----------------
 1 file changed, 36 insertions(+), 16 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index d6c1322119d6..692c91d03054 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -100,6 +100,10 @@
 #define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
 				 IORING_REGISTER_LAST + IORING_OP_LAST)
 
+#define IO_RSRC_TAG_TABLE_SHIFT	9
+#define IO_RSRC_TAG_TABLE_MAX	(1U << IO_RSRC_TAG_TABLE_SHIFT)
+#define IO_RSRC_TAG_TABLE_MASK	(IO_RSRC_TAG_TABLE_MAX - 1)
+
 #define IORING_MAX_REG_BUFFERS	(1U << 14)
 
 #define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK|	\
@@ -243,7 +247,8 @@ typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
 struct io_rsrc_data {
 	struct io_ring_ctx		*ctx;
 
-	u64				*tags;
+	u64				**tags;
+	unsigned int			nr;
 	rsrc_put_fn			*do_put;
 	atomic_t			refs;
 	struct completion		done;
@@ -7172,9 +7177,20 @@ static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ct
 	return ret;
 }
 
+static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
+{
+	unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK;
+	unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT;
+
+	return &data->tags[table_idx][off];
+}
+
 static void io_rsrc_data_free(struct io_rsrc_data *data)
 {
-	kvfree(data->tags);
+	size_t size = data->nr * sizeof(data->tags[0][0]);
+
+	if (data->tags)
+		io_free_page_table((void **)data->tags, size);
 	kfree(data);
 }
 
@@ -7183,33 +7199,37 @@ static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
 			      struct io_rsrc_data **pdata)
 {
 	struct io_rsrc_data *data;
+	int ret = -ENOMEM;
 	unsigned i;
 
 	data = kzalloc(sizeof(*data), GFP_KERNEL);
 	if (!data)
 		return -ENOMEM;
-
-	data->tags = kvcalloc(nr, sizeof(*data->tags), GFP_KERNEL);
+	data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
 	if (!data->tags) {
 		kfree(data);
 		return -ENOMEM;
 	}
+
+	data->nr = nr;
+	data->ctx = ctx;
+	data->do_put = do_put;
 	if (utags) {
+		ret = -EFAULT;
 		for (i = 0; i < nr; i++) {
-			if (copy_from_user(&data->tags[i], &utags[i],
-					   sizeof(data->tags[i]))) {
-				io_rsrc_data_free(data);
-				return -EFAULT;
-			}
+			if (copy_from_user(io_get_tag_slot(data, i), &utags[i],
+					   sizeof(data->tags[i])))
+				goto fail;
 		}
 	}
 
 	atomic_set(&data->refs, 1);
-	data->ctx = ctx;
-	data->do_put = do_put;
 	init_completion(&data->done);
 	*pdata = data;
 	return 0;
+fail:
+	io_rsrc_data_free(data);
+	return ret;
 }
 
 static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
@@ -7678,7 +7698,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 		/* allow sparse sets */
 		if (fd == -1) {
 			ret = -EINVAL;
-			if (unlikely(ctx->file_data->tags[i]))
+			if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
 				goto out_fput;
 			continue;
 		}
@@ -7776,7 +7796,7 @@ static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
 	if (!prsrc)
 		return -ENOMEM;
 
-	prsrc->tag = data->tags[idx];
+	prsrc->tag = *io_get_tag_slot(data, idx);
 	prsrc->rsrc = rsrc;
 	list_add(&prsrc->list, &node->rsrc_list);
 	return 0;
@@ -7846,7 +7866,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 				err = -EBADF;
 				break;
 			}
-			data->tags[up->offset + done] = tag;
+			*io_get_tag_slot(data, up->offset + done) = tag;
 			io_fixed_file_set(file_slot, file);
 			err = io_sqe_file_register(ctx, file, i);
 			if (err) {
@@ -8432,7 +8452,7 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 		ret = io_buffer_validate(&iov);
 		if (ret)
 			break;
-		if (!iov.iov_base && data->tags[i]) {
+		if (!iov.iov_base && *io_get_tag_slot(data, i)) {
 			ret = -EINVAL;
 			break;
 		}
@@ -8505,7 +8525,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
 		}
 
 		ctx->user_bufs[i] = imu;
-		ctx->buf_data->tags[offset] = tag;
+		*io_get_tag_slot(ctx->buf_data, offset) = tag;
 	}
 
 	if (needs_switch)
-- 
2.31.1


* [PATCH 11/13] io_uring: cache task struct refs
  2021-06-14  1:36 [PATCH v2 for-next 00/13] resend of for-next cleanups Pavel Begunkov
                   ` (9 preceding siblings ...)
  2021-06-14  1:36 ` [PATCH 10/13] io_uring: don't vmalloc rsrc tags Pavel Begunkov
@ 2021-06-14  1:36 ` Pavel Begunkov
  2021-06-14  1:36 ` [PATCH 12/13] io_uring: unify SQPOLL and user task cancellations Pavel Begunkov
                   ` (2 subsequent siblings)
  13 siblings, 0 replies; 15+ messages in thread
From: Pavel Begunkov @ 2021-06-14  1:36 UTC (permalink / raw)
  To: Jens Axboe, io-uring

tctx in the submission path is always synchronised because it is
executed from the task's context, so we can batch-allocate tctx/task
references and store them across syscall boundaries. That avoids a fair
number of operations, including an atomic for getting a task ref and a
percpu_counter_add() call, the latter of which still falls back to a
spinlock for large batches (around >=32). Should be good for SQPOLL
submitting in small portions, and for bpf submissions coming at some
point.
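
A rough userspace sketch of the batching idea (hypothetical names here;
shared_refs stands in for the expensive percpu/atomic counters): take
references from the shared side in big chunks and hand them out from a
task-local cache, so the common case touches no shared state:

  #define CACHE_NR 1024         /* plays the role of IO_TCTX_REFS_CACHE_NR */

  static long shared_refs;      /* the costly, shared counter */
  static int cached_refs;       /* cheap, task-local cache */

  static void take_refs(int nr)
  {
          cached_refs -= nr;
          if (cached_refs < 0) {
                  /* refill so the cache lands back at CACHE_NR */
                  int refill = CACHE_NR - cached_refs;

                  shared_refs += refill;    /* the one expensive step */
                  cached_refs += refill;
          }
  }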

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 37 ++++++++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 692c91d03054..23b15ed98815 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -110,6 +110,8 @@
 				IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
 				IOSQE_BUFFER_SELECT)
 
+#define IO_TCTX_REFS_CACHE_NR	(1U << 10)
+
 struct io_uring {
 	u32 head ____cacheline_aligned_in_smp;
 	u32 tail ____cacheline_aligned_in_smp;
@@ -472,6 +474,7 @@ struct io_ring_ctx {
 
 struct io_uring_task {
 	/* submission side */
+	int			cached_refs;
 	struct xarray		xa;
 	struct wait_queue_head	wait;
 	const struct io_ring_ctx *last;
@@ -6702,16 +6705,23 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
 
 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 {
+	struct io_uring_task *tctx;
 	int submitted = 0;
 
 	/* make sure SQ entry isn't read before tail */
 	nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
-
 	if (!percpu_ref_tryget_many(&ctx->refs, nr))
 		return -EAGAIN;
 
-	percpu_counter_add(&current->io_uring->inflight, nr);
-	refcount_add(nr, &current->usage);
+	tctx = current->io_uring;
+	tctx->cached_refs -= nr;
+	if (unlikely(tctx->cached_refs < 0)) {
+		unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
+
+		percpu_counter_add(&tctx->inflight, refill);
+		refcount_add(refill, &current->usage);
+		tctx->cached_refs += refill;
+	}
 	io_submit_state_start(&ctx->submit_state, nr);
 
 	while (submitted < nr) {
@@ -6737,12 +6747,10 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 
 	if (unlikely(submitted != nr)) {
 		int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
-		struct io_uring_task *tctx = current->io_uring;
 		int unused = nr - ref_used;
 
+		current->io_uring->cached_refs += unused;
 		percpu_ref_put_many(&ctx->refs, unused);
-		percpu_counter_sub(&tctx->inflight, unused);
-		put_task_struct_many(current, unused);
 	}
 
 	io_submit_state_end(&ctx->submit_state, ctx);
@@ -7924,7 +7932,7 @@ static int io_uring_alloc_task_context(struct task_struct *task,
 	struct io_uring_task *tctx;
 	int ret;
 
-	tctx = kmalloc(sizeof(*tctx), GFP_KERNEL);
+	tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
 	if (unlikely(!tctx))
 		return -ENOMEM;
 
@@ -7944,13 +7952,11 @@ static int io_uring_alloc_task_context(struct task_struct *task,
 
 	xa_init(&tctx->xa);
 	init_waitqueue_head(&tctx->wait);
-	tctx->last = NULL;
 	atomic_set(&tctx->in_idle, 0);
 	atomic_set(&tctx->inflight_tracked, 0);
 	task->io_uring = tctx;
 	spin_lock_init(&tctx->task_lock);
 	INIT_WQ_LIST(&tctx->task_list);
-	tctx->task_state = 0;
 	init_task_work(&tctx->task_work, tctx_task_work);
 	return 0;
 }
@@ -7961,6 +7967,7 @@ void __io_uring_free(struct task_struct *tsk)
 
 	WARN_ON_ONCE(!xa_empty(&tctx->xa));
 	WARN_ON_ONCE(tctx->io_wq);
+	WARN_ON_ONCE(tctx->cached_refs);
 
 	percpu_counter_destroy(&tctx->inflight);
 	kfree(tctx);
@@ -9105,6 +9112,16 @@ static void io_uring_try_cancel(bool cancel_all)
 	}
 }
 
+static void io_uring_drop_tctx_refs(struct task_struct *task)
+{
+	struct io_uring_task *tctx = task->io_uring;
+	unsigned int refs = tctx->cached_refs;
+
+	tctx->cached_refs = 0;
+	percpu_counter_sub(&tctx->inflight, refs);
+	put_task_struct_many(task, refs);
+}
+
 /* should only be called by SQPOLL task */
 static void io_uring_cancel_sqpoll(struct io_sq_data *sqd)
 {
@@ -9120,6 +9137,7 @@ static void io_uring_cancel_sqpoll(struct io_sq_data *sqd)
 
 	WARN_ON_ONCE(!sqd || sqd->thread != current);
 
+	io_uring_drop_tctx_refs(current);
 	atomic_inc(&tctx->in_idle);
 	do {
 		/* read completions before cancelations */
@@ -9157,6 +9175,7 @@ void __io_uring_cancel(struct files_struct *files)
 		io_wq_exit_start(tctx->io_wq);
 
 	/* make sure overflow events are dropped */
+	io_uring_drop_tctx_refs(current);
 	atomic_inc(&tctx->in_idle);
 	do {
 		/* read completions before cancelations */
-- 
2.31.1


* [PATCH 12/13] io_uring: unify SQPOLL and user task cancellations
  2021-06-14  1:36 [PATCH v2 for-next 00/13] resend of for-next cleanups Pavel Begunkov
                   ` (10 preceding siblings ...)
  2021-06-14  1:36 ` [PATCH 11/13] io_uring: cache task struct refs Pavel Begunkov
@ 2021-06-14  1:36 ` Pavel Begunkov
  2021-06-14  1:36 ` [PATCH 13/13] io_uring: inline io_iter_do_read() Pavel Begunkov
  2021-06-14 14:25 ` [PATCH v2 for-next 00/13] resend of for-next cleanups Jens Axboe
  13 siblings, 0 replies; 15+ messages in thread
From: Pavel Begunkov @ 2021-06-14  1:36 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Merge io_uring_cancel_sqpoll() and __io_uring_cancel(), as it's easier
to have a conditional ctx traversal inside than to keep them in sync.
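
For orientation, the entry points after the merge, as set up by the
diff below:

  io_sq_thread()           -> io_uring_cancel_generic(true, sqd)
  __io_uring_cancel(files) -> io_uring_cancel_generic(!files, NULL)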

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 89 +++++++++++++++++----------------------------------
 1 file changed, 30 insertions(+), 59 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 23b15ed98815..23644179edd4 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1036,7 +1036,7 @@ static void io_uring_del_tctx_node(unsigned long index);
 static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 					 struct task_struct *task,
 					 bool cancel_all);
-static void io_uring_cancel_sqpoll(struct io_sq_data *sqd);
+static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
 static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);
 
 static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
@@ -6921,7 +6921,7 @@ static int io_sq_thread(void *data)
 		timeout = jiffies + sqd->sq_thread_idle;
 	}
 
-	io_uring_cancel_sqpoll(sqd);
+	io_uring_cancel_generic(true, sqd);
 	sqd->thread = NULL;
 	list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
 		io_ring_set_wakeup_flag(ctx);
@@ -9097,21 +9097,6 @@ static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
 	return percpu_counter_sum(&tctx->inflight);
 }
 
-static void io_uring_try_cancel(bool cancel_all)
-{
-	struct io_uring_task *tctx = current->io_uring;
-	struct io_tctx_node *node;
-	unsigned long index;
-
-	xa_for_each(&tctx->xa, index, node) {
-		struct io_ring_ctx *ctx = node->ctx;
-
-		/* sqpoll task will cancel all its requests */
-		if (!ctx->sq_data)
-			io_uring_try_cancel_requests(ctx, current, cancel_all);
-	}
-}
-
 static void io_uring_drop_tctx_refs(struct task_struct *task)
 {
 	struct io_uring_task *tctx = task->io_uring;
@@ -9122,69 +9107,50 @@ static void io_uring_drop_tctx_refs(struct task_struct *task)
 	put_task_struct_many(task, refs);
 }
 
-/* should only be called by SQPOLL task */
-static void io_uring_cancel_sqpoll(struct io_sq_data *sqd)
+/*
+ * Find any io_uring ctx that this task has registered or done IO on, and cancel
+ * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation.
+ */
+static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
 {
 	struct io_uring_task *tctx = current->io_uring;
 	struct io_ring_ctx *ctx;
 	s64 inflight;
 	DEFINE_WAIT(wait);
 
+	WARN_ON_ONCE(sqd && sqd->thread != current);
+
 	if (!current->io_uring)
 		return;
 	if (tctx->io_wq)
 		io_wq_exit_start(tctx->io_wq);
 
-	WARN_ON_ONCE(!sqd || sqd->thread != current);
-
 	io_uring_drop_tctx_refs(current);
 	atomic_inc(&tctx->in_idle);
 	do {
 		/* read completions before cancelations */
-		inflight = tctx_inflight(tctx, false);
+		inflight = tctx_inflight(tctx, !cancel_all);
 		if (!inflight)
 			break;
-		list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
-			io_uring_try_cancel_requests(ctx, current, true);
 
-		prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
-		/*
-		 * If we've seen completions, retry without waiting. This
-		 * avoids a race where a completion comes in before we did
-		 * prepare_to_wait().
-		 */
-		if (inflight == tctx_inflight(tctx, false))
-			schedule();
-		finish_wait(&tctx->wait, &wait);
-	} while (1);
-	atomic_dec(&tctx->in_idle);
-}
+		if (!sqd) {
+			struct io_tctx_node *node;
+			unsigned long index;
 
-/*
- * Find any io_uring fd that this task has registered or done IO on, and cancel
- * requests.
- */
-void __io_uring_cancel(struct files_struct *files)
-{
-	struct io_uring_task *tctx = current->io_uring;
-	DEFINE_WAIT(wait);
-	s64 inflight;
-	bool cancel_all = !files;
-
-	if (tctx->io_wq)
-		io_wq_exit_start(tctx->io_wq);
+			xa_for_each(&tctx->xa, index, node) {
+				/* sqpoll task will cancel all its requests */
+				if (node->ctx->sq_data)
+					continue;
+				io_uring_try_cancel_requests(node->ctx, current,
+							     cancel_all);
+			}
+		} else {
+			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
+				io_uring_try_cancel_requests(ctx, current,
+							     cancel_all);
+		}
 
-	/* make sure overflow events are dropped */
-	io_uring_drop_tctx_refs(current);
-	atomic_inc(&tctx->in_idle);
-	do {
-		/* read completions before cancelations */
-		inflight = tctx_inflight(tctx, !cancel_all);
-		if (!inflight)
-			break;
-		io_uring_try_cancel(cancel_all);
 		prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
-
 		/*
 		 * If we've seen completions, retry without waiting. This
 		 * avoids a race where a completion comes in before we did
@@ -9203,6 +9169,11 @@ void __io_uring_cancel(struct files_struct *files)
 	}
 }
 
+void __io_uring_cancel(struct files_struct *files)
+{
+	io_uring_cancel_generic(!files, NULL);
+}
+
 static void *io_uring_validate_mmap_request(struct file *file,
 					    loff_t pgoff, size_t sz)
 {
-- 
2.31.1


* [PATCH 13/13] io_uring: inline io_iter_do_read()
  2021-06-14  1:36 [PATCH v2 for-next 00/13] resend of for-next cleanups Pavel Begunkov
                   ` (11 preceding siblings ...)
  2021-06-14  1:36 ` [PATCH 12/13] io_uring: unify SQPOLL and user task cancellations Pavel Begunkov
@ 2021-06-14  1:36 ` Pavel Begunkov
  2021-06-14 14:25 ` [PATCH v2 for-next 00/13] resend of for-next cleanups Jens Axboe
  13 siblings, 0 replies; 15+ messages in thread
From: Pavel Begunkov @ 2021-06-14  1:36 UTC (permalink / raw)
  To: Jens Axboe, io-uring

There are only two callers of io_iter_do_read() in the source; the
function is small and pretty hot, yet fails to get inlined. Mark it as
inline.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 23644179edd4..9c6e1d7dd5b1 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3248,7 +3248,7 @@ static bool io_rw_should_retry(struct io_kiocb *req)
 	return true;
 }
 
-static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
+static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
 {
 	if (req->file->f_op->read_iter)
 		return call_read_iter(req->file, &req->rw.kiocb, iter);
-- 
2.31.1


* Re: [PATCH v2 for-next 00/13] resend of for-next cleanups
  2021-06-14  1:36 [PATCH v2 for-next 00/13] resend of for-next cleanups Pavel Begunkov
                   ` (12 preceding siblings ...)
  2021-06-14  1:36 ` [PATCH 13/13] io_uring: inline io_iter_do_read() Pavel Begunkov
@ 2021-06-14 14:25 ` Jens Axboe
  13 siblings, 0 replies; 15+ messages in thread
From: Jens Axboe @ 2021-06-14 14:25 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring

On 6/13/21 7:36 PM, Pavel Begunkov wrote:
> The series is based on the 5.14 branch, with fixes from 5.13 that are
> missing there applied on top:
> 
> 216e5835966a io_uring: fix misaccounting fix buf pinned pages
> b16ef427adf3 io_uring: fix data race to avoid potential NULL-deref
> 3743c1723bfc io-wq: Fix UAF when wakeup wqe in hash waitqueue
> 17a91051fe63 io_uring/io-wq: close io-wq full-stop gap

With the API change, I rebased the 5.14 branch and applied this on
top.

-- 
Jens Axboe

