IO-Uring Archive on lore.kernel.org
 help / color / Atom feed
* [PATCHSET] Pending io_uring items not yet queued up for 5.5
@ 2019-11-16  1:53 Jens Axboe
  2019-11-16  1:53 ` [PATCH 1/8] io-wq: remove now redundant struct io_wq_nulls_list Jens Axboe
                   ` (7 more replies)
  0 siblings, 8 replies; 14+ messages in thread
From: Jens Axboe @ 2019-11-16  1:53 UTC (permalink / raw)
  To: io-uring; +Cc: asml.silence

A bit of a mix, some of them posted before. But this contains:

- Removal of io_wq_nulls_list
- rbtree for poll, making them scale better
- A few trivial cleanups; some are cleanups for their own sake, others
  are prep for later changes
- Hopefully fix all the linked commands sequencing faults

Can also be found in my for-5.5/io_uring-post repo.

 fs/io-wq.c    |  29 ++---
 fs/io_uring.c | 333 +++++++++++++++++++++++++++++++-------------------
 2 files changed, 215 insertions(+), 147 deletions(-)

-- 
Jens Axboe



^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 1/8] io-wq: remove now redundant struct io_wq_nulls_list
  2019-11-16  1:53 [PATCHSET] Pending io_uring items not yet queued up for 5.5 Jens Axboe
@ 2019-11-16  1:53 ` Jens Axboe
  2019-11-16  1:53 ` [PATCH 2/8] io_uring: make POLL_ADD/POLL_REMOVE scale better Jens Axboe
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 14+ messages in thread
From: Jens Axboe @ 2019-11-16  1:53 UTC (permalink / raw)
  To: io-uring; +Cc: asml.silence, Jens Axboe

Since we don't iterate these lists anymore after commit:

e61df66c69b1 ("io-wq: ensure free/busy list browsing see all items")

we don't need to retain the nulls value we use for them. That means it's
pretty pointless to wrap the hlist_nulls_head in a structure, so get rid
of it.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c | 29 ++++++++++-------------------
 1 file changed, 10 insertions(+), 19 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index fcb6c74209da..9174007ce107 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -59,11 +59,6 @@ struct io_worker {
 	struct files_struct *restore_files;
 };
 
-struct io_wq_nulls_list {
-	struct hlist_nulls_head head;
-	unsigned long nulls;
-};
-
 #if BITS_PER_LONG == 64
 #define IO_WQ_HASH_ORDER	6
 #else
@@ -95,8 +90,8 @@ struct io_wqe {
 	int node;
 	struct io_wqe_acct acct[2];
 
-	struct io_wq_nulls_list free_list;
-	struct io_wq_nulls_list busy_list;
+	struct hlist_nulls_head free_list;
+	struct hlist_nulls_head busy_list;
 	struct list_head all_list;
 
 	struct io_wq *wq;
@@ -249,7 +244,7 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
 	struct hlist_nulls_node *n;
 	struct io_worker *worker;
 
-	n = rcu_dereference(hlist_nulls_first_rcu(&wqe->free_list.head));
+	n = rcu_dereference(hlist_nulls_first_rcu(&wqe->free_list));
 	if (is_a_nulls(n))
 		return false;
 
@@ -325,8 +320,7 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
 	if (worker->flags & IO_WORKER_F_FREE) {
 		worker->flags &= ~IO_WORKER_F_FREE;
 		hlist_nulls_del_init_rcu(&worker->nulls_node);
-		hlist_nulls_add_head_rcu(&worker->nulls_node,
-						&wqe->busy_list.head);
+		hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->busy_list);
 	}
 
 	/*
@@ -365,8 +359,7 @@ static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker)
 	if (!(worker->flags & IO_WORKER_F_FREE)) {
 		worker->flags |= IO_WORKER_F_FREE;
 		hlist_nulls_del_init_rcu(&worker->nulls_node);
-		hlist_nulls_add_head_rcu(&worker->nulls_node,
-						&wqe->free_list.head);
+		hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
 	}
 
 	return __io_worker_unuse(wqe, worker);
@@ -592,7 +585,7 @@ static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
 	}
 
 	spin_lock_irq(&wqe->lock);
-	hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list.head);
+	hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
 	list_add_tail_rcu(&worker->all_list, &wqe->all_list);
 	worker->flags |= IO_WORKER_F_FREE;
 	if (index == IO_WQ_ACCT_BOUND)
@@ -617,7 +610,7 @@ static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index)
 	if (index == IO_WQ_ACCT_BOUND && !acct->nr_workers)
 		return true;
 	/* if we have available workers or no work, no need */
-	if (!hlist_nulls_empty(&wqe->free_list.head) || !io_wqe_run_queue(wqe))
+	if (!hlist_nulls_empty(&wqe->free_list) || !io_wqe_run_queue(wqe))
 		return false;
 	return acct->nr_workers < acct->max_workers;
 }
@@ -665,7 +658,7 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
 		return true;
 
 	rcu_read_lock();
-	free_worker = !hlist_nulls_empty(&wqe->free_list.head);
+	free_worker = !hlist_nulls_empty(&wqe->free_list);
 	rcu_read_unlock();
 	if (free_worker)
 		return true;
@@ -1009,10 +1002,8 @@ struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm,
 		wqe->wq = wq;
 		spin_lock_init(&wqe->lock);
 		INIT_LIST_HEAD(&wqe->work_list);
-		INIT_HLIST_NULLS_HEAD(&wqe->free_list.head, 0);
-		wqe->free_list.nulls = 0;
-		INIT_HLIST_NULLS_HEAD(&wqe->busy_list.head, 1);
-		wqe->busy_list.nulls = 1;
+		INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0);
+		INIT_HLIST_NULLS_HEAD(&wqe->busy_list, 1);
 		INIT_LIST_HEAD(&wqe->all_list);
 
 		i++;
-- 
2.24.0


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 2/8] io_uring: make POLL_ADD/POLL_REMOVE scale better
  2019-11-16  1:53 [PATCHSET] Pending io_uring items not yet queued up for 5.5 Jens Axboe
  2019-11-16  1:53 ` [PATCH 1/8] io-wq: remove now redundant struct io_wq_nulls_list Jens Axboe
@ 2019-11-16  1:53 ` Jens Axboe
  2019-11-16  1:53 ` [PATCH 3/8] io_uring: io_async_cancel() should pass in 'nxt' request pointer Jens Axboe
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 14+ messages in thread
From: Jens Axboe @ 2019-11-16  1:53 UTC (permalink / raw)
  To: io-uring; +Cc: asml.silence, Jens Axboe

One of the obvious use cases for these commands is networking, where
it's not uncommon to have tons of sockets open and polled for. The
current implementation uses a list for insertion and lookup. That works
fine for file based use cases where the count is usually low, but it
breaks down somewhat for higher numbers of files / sockets. A test case
with 30k sockets being polled for and cancelled takes:

real    0m6.968s
user    0m0.002s
sys     0m6.936s

with the patch it takes:

real    0m0.233s
user    0m0.010s
sys     0m0.176s

If you go to 50k sockets, it gets even more abysmal with the current
code:

real    0m40.602s
user    0m0.010s
sys     0m40.555s

with the patch it takes:

real    0m0.398s
user    0m0.000s
sys     0m0.341s

The change is pretty straightforward: just replace the cancel_list with
a red/black tree.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 69 ++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 54 insertions(+), 15 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 55f8b1d378df..5ad652fa24b8 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -271,7 +271,7 @@ struct io_ring_ctx {
 		 * manipulate the list, hence no extra locking is needed there.
 		 */
 		struct list_head	poll_list;
-		struct list_head	cancel_list;
+		struct rb_root		cancel_tree;
 
 		spinlock_t		inflight_lock;
 		struct list_head	inflight_list;
@@ -323,7 +323,10 @@ struct io_kiocb {
 	struct sqe_submit	submit;
 
 	struct io_ring_ctx	*ctx;
-	struct list_head	list;
+	union {
+		struct list_head	list;
+		struct rb_node		rb_node;
+	};
 	struct list_head	link_list;
 	unsigned int		flags;
 	refcount_t		refs;
@@ -433,7 +436,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	init_waitqueue_head(&ctx->wait);
 	spin_lock_init(&ctx->completion_lock);
 	INIT_LIST_HEAD(&ctx->poll_list);
-	INIT_LIST_HEAD(&ctx->cancel_list);
+	ctx->cancel_tree = RB_ROOT;
 	INIT_LIST_HEAD(&ctx->defer_list);
 	INIT_LIST_HEAD(&ctx->timeout_list);
 	init_waitqueue_head(&ctx->inflight_wait);
@@ -1934,6 +1937,14 @@ static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 #endif
 }
 
+static inline void io_poll_remove_req(struct io_kiocb *req)
+{
+	if (!RB_EMPTY_NODE(&req->rb_node)) {
+		rb_erase(&req->rb_node, &req->ctx->cancel_tree);
+		RB_CLEAR_NODE(&req->rb_node);
+	}
+}
+
 static void io_poll_remove_one(struct io_kiocb *req)
 {
 	struct io_poll_iocb *poll = &req->poll;
@@ -1945,17 +1956,17 @@ static void io_poll_remove_one(struct io_kiocb *req)
 		io_queue_async_work(req);
 	}
 	spin_unlock(&poll->head->lock);
-
-	list_del_init(&req->list);
+	io_poll_remove_req(req);
 }
 
 static void io_poll_remove_all(struct io_ring_ctx *ctx)
 {
+	struct rb_node *node;
 	struct io_kiocb *req;
 
 	spin_lock_irq(&ctx->completion_lock);
-	while (!list_empty(&ctx->cancel_list)) {
-		req = list_first_entry(&ctx->cancel_list, struct io_kiocb,list);
+	while ((node = rb_first(&ctx->cancel_tree)) != NULL) {
+		req = rb_entry(node, struct io_kiocb, rb_node);
 		io_poll_remove_one(req);
 	}
 	spin_unlock_irq(&ctx->completion_lock);
@@ -1963,13 +1974,21 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx)
 
 static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
 {
+	struct rb_node *p, *parent = NULL;
 	struct io_kiocb *req;
 
-	list_for_each_entry(req, &ctx->cancel_list, list) {
-		if (req->user_data != sqe_addr)
-			continue;
-		io_poll_remove_one(req);
-		return 0;
+	p = ctx->cancel_tree.rb_node;
+	while (p) {
+		parent = p;
+		req = rb_entry(parent, struct io_kiocb, rb_node);
+		if (sqe_addr < req->user_data) {
+			p = p->rb_left;
+		} else if (sqe_addr > req->user_data) {
+			p = p->rb_right;
+		} else {
+			io_poll_remove_one(req);
+			return 0;
+		}
 	}
 
 	return -ENOENT;
@@ -2039,7 +2058,7 @@ static void io_poll_complete_work(struct io_wq_work **workptr)
 		spin_unlock_irq(&ctx->completion_lock);
 		return;
 	}
-	list_del_init(&req->list);
+	io_poll_remove_req(req);
 	io_poll_complete(req, mask);
 	spin_unlock_irq(&ctx->completion_lock);
 
@@ -2073,7 +2092,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 	 * for finalizing the request, mark us as having grabbed that already.
 	 */
 	if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) {
-		list_del(&req->list);
+		io_poll_remove_req(req);
 		io_poll_complete(req, mask);
 		req->flags |= REQ_F_COMP_LOCKED;
 		io_put_req(req);
@@ -2108,6 +2127,25 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
 	add_wait_queue(head, &pt->req->poll.wait);
 }
 
+static void io_poll_req_insert(struct io_kiocb *req)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	struct rb_node **p = &ctx->cancel_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct io_kiocb *tmp;
+
+	while (*p) {
+		parent = *p;
+		tmp = rb_entry(parent, struct io_kiocb, rb_node);
+		if (req->user_data < tmp->user_data)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+	rb_link_node(&req->rb_node, parent, p);
+	rb_insert_color(&req->rb_node, &ctx->cancel_tree);
+}
+
 static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		       struct io_kiocb **nxt)
 {
@@ -2129,6 +2167,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	INIT_IO_WORK(&req->work, io_poll_complete_work);
 	events = READ_ONCE(sqe->poll_events);
 	poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
+	RB_CLEAR_NODE(&req->rb_node);
 
 	poll->head = NULL;
 	poll->done = false;
@@ -2161,7 +2200,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		else if (cancel)
 			WRITE_ONCE(poll->canceled, true);
 		else if (!poll->done) /* actually waiting for an event */
-			list_add_tail(&req->list, &ctx->cancel_list);
+			io_poll_req_insert(req);
 		spin_unlock(&poll->head->lock);
 	}
 	if (mask) { /* no async, we'd stolen it */
-- 
2.24.0


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 3/8] io_uring: io_async_cancel() should pass in 'nxt' request pointer
  2019-11-16  1:53 [PATCHSET] Pending io_uring items not yet queued up for 5.5 Jens Axboe
  2019-11-16  1:53 ` [PATCH 1/8] io-wq: remove now redundant struct io_wq_nulls_list Jens Axboe
  2019-11-16  1:53 ` [PATCH 2/8] io_uring: make POLL_ADD/POLL_REMOVE scale better Jens Axboe
@ 2019-11-16  1:53 ` Jens Axboe
  2019-11-16  1:53 ` [PATCH 4/8] io_uring: cleanup return values from the queueing functions Jens Axboe
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 14+ messages in thread
From: Jens Axboe @ 2019-11-16  1:53 UTC (permalink / raw)
  To: io-uring; +Cc: asml.silence, Jens Axboe

If we have a linked request, this enables us to pass it back directly
without having to go through async context.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5ad652fa24b8..c60e0fa96d9f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2465,7 +2465,7 @@ static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	    sqe->cancel_flags)
 		return -EINVAL;
 
-	io_async_find_and_cancel(ctx, req, READ_ONCE(sqe->addr), NULL);
+	io_async_find_and_cancel(ctx, req, READ_ONCE(sqe->addr), nxt);
 	return 0;
 }
 
-- 
2.24.0


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 4/8] io_uring: cleanup return values from the queueing functions
  2019-11-16  1:53 [PATCHSET] Pending io_uring items not yet queued up for 5.5 Jens Axboe
                   ` (2 preceding siblings ...)
  2019-11-16  1:53 ` [PATCH 3/8] io_uring: io_async_cancel() should pass in 'nxt' request pointer Jens Axboe
@ 2019-11-16  1:53 ` Jens Axboe
  2019-11-16  1:53 ` [PATCH 5/8] io_uring: make io_double_put_req() use normal completion path Jens Axboe
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 14+ messages in thread
From: Jens Axboe @ 2019-11-16  1:53 UTC (permalink / raw)
  To: io-uring; +Cc: asml.silence, Jens Axboe

__io_queue_sqe(), io_queue_sqe(), io_queue_link_head() all return 0/err,
but the caller doesn't care since the errors are handled inline. Clean
these up and just make them void.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index c60e0fa96d9f..56a9321a4232 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2825,7 +2825,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req,
 	return nxt;
 }
 
-static int __io_queue_sqe(struct io_kiocb *req)
+static void __io_queue_sqe(struct io_kiocb *req)
 {
 	enum hrtimer_mode mode;
 	struct io_kiocb *nxt;
@@ -2870,7 +2870,7 @@ static int __io_queue_sqe(struct io_kiocb *req)
 			if (nxt)
 				io_queue_linked_timeout(nxt, &ts, &mode);
 
-			return 0;
+			return;
 		}
 	}
 
@@ -2892,11 +2892,9 @@ static int __io_queue_sqe(struct io_kiocb *req)
 			req->flags |= REQ_F_FAIL_LINK;
 		io_put_req(req);
 	}
-
-	return ret;
 }
 
-static int io_queue_sqe(struct io_kiocb *req)
+static void io_queue_sqe(struct io_kiocb *req)
 {
 	int ret;
 
@@ -2906,20 +2904,20 @@ static int io_queue_sqe(struct io_kiocb *req)
 			io_cqring_add_event(req, ret);
 			io_double_put_req(req);
 		}
-		return 0;
-	}
-
-	return __io_queue_sqe(req);
+	} else
+		__io_queue_sqe(req);
 }
 
-static int io_queue_link_head(struct io_kiocb *req, struct io_kiocb *shadow)
+static void io_queue_link_head(struct io_kiocb *req, struct io_kiocb *shadow)
 {
 	int ret;
 	int need_submit = false;
 	struct io_ring_ctx *ctx = req->ctx;
 
-	if (!shadow)
-		return io_queue_sqe(req);
+	if (!shadow) {
+		io_queue_sqe(req);
+		return;
+	}
 
 	/*
 	 * Mark the first IO in link list as DRAIN, let all the following
@@ -2933,7 +2931,7 @@ static int io_queue_link_head(struct io_kiocb *req, struct io_kiocb *shadow)
 			io_cqring_add_event(req, ret);
 			io_double_put_req(req);
 			__io_free_req(shadow);
-			return 0;
+			return;
 		}
 	} else {
 		/*
@@ -2950,9 +2948,7 @@ static int io_queue_link_head(struct io_kiocb *req, struct io_kiocb *shadow)
 	spin_unlock_irq(&ctx->completion_lock);
 
 	if (need_submit)
-		return __io_queue_sqe(req);
-
-	return 0;
+		__io_queue_sqe(req);
 }
 
 #define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)
-- 
2.24.0


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 5/8] io_uring: make io_double_put_req() use normal completion path
  2019-11-16  1:53 [PATCHSET] Pending io_uring items not yet queued up for 5.5 Jens Axboe
                   ` (3 preceding siblings ...)
  2019-11-16  1:53 ` [PATCH 4/8] io_uring: cleanup return values from the queueing functions Jens Axboe
@ 2019-11-16  1:53 ` Jens Axboe
  2019-11-16  1:53 ` [PATCH 6/8] io_uring: make req->timeout be dynamically allocated Jens Axboe
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 14+ messages in thread
From: Jens Axboe @ 2019-11-16  1:53 UTC (permalink / raw)
  To: io-uring; +Cc: asml.silence, Jens Axboe

If we don't use the normal completion path, we may skip killing links
that should be errored and freed. Add __io_double_put_req() for use
within the completion path itself; all other callers should just use
io_double_put_req().

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 56a9321a4232..3f4d6641bea2 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -382,6 +382,7 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res);
 static void __io_free_req(struct io_kiocb *req);
 static void io_put_req(struct io_kiocb *req);
 static void io_double_put_req(struct io_kiocb *req);
+static void __io_double_put_req(struct io_kiocb *req);
 
 static struct kmem_cache *req_cachep;
 
@@ -911,7 +912,7 @@ static void io_fail_links(struct io_kiocb *req)
 			io_link_cancel_timeout(link);
 		} else {
 			io_cqring_fill_event(link, -ECANCELED);
-			io_double_put_req(link);
+			__io_double_put_req(link);
 		}
 	}
 
@@ -985,13 +986,24 @@ static void io_put_req(struct io_kiocb *req)
 		io_free_req(req);
 }
 
-static void io_double_put_req(struct io_kiocb *req)
+/*
+ * Must only be used if we don't need to care about links, usually from
+ * within the completion handling itself.
+ */
+static void __io_double_put_req(struct io_kiocb *req)
 {
 	/* drop both submit and complete references */
 	if (refcount_sub_and_test(2, &req->refs))
 		__io_free_req(req);
 }
 
+static void io_double_put_req(struct io_kiocb *req)
+{
+	/* drop both submit and complete references */
+	if (refcount_sub_and_test(2, &req->refs))
+		io_free_req(req);
+}
+
 static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
 {
 	struct io_rings *rings = ctx->rings;
-- 
2.24.0


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 6/8] io_uring: make req->timeout be dynamically allocated
  2019-11-16  1:53 [PATCHSET] Pending io_uring items not yet queued up for 5.5 Jens Axboe
                   ` (4 preceding siblings ...)
  2019-11-16  1:53 ` [PATCH 5/8] io_uring: make io_double_put_req() use normal completion path Jens Axboe
@ 2019-11-16  1:53 ` Jens Axboe
  2019-11-16  1:53 ` [PATCH 7/8] io_uring: fix sequencing issues with linked timeouts Jens Axboe
  2019-11-16  1:53 ` [PATCH 8/8] io_uring: remove dead REQ_F_SEQ_PREV flag Jens Axboe
  7 siblings, 0 replies; 14+ messages in thread
From: Jens Axboe @ 2019-11-16  1:53 UTC (permalink / raw)
  To: io-uring; +Cc: asml.silence, Jens Axboe

There are a few reasons for this:

- As a prep to improving the linked timeout logic
- io_timeout is the biggest member in the io_kiocb opcode union

This also enables a few cleanups, like unifying the timer setup between
IORING_OP_TIMEOUT and IORING_OP_LINK_TIMEOUT, and not needing multiple
arguments to the link/prep helpers.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 129 +++++++++++++++++++++++++++-----------------------
 1 file changed, 70 insertions(+), 59 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 3f4d6641bea2..883ac9b01083 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -301,9 +301,16 @@ struct io_poll_iocb {
 	struct wait_queue_entry		wait;
 };
 
+struct io_timeout_data {
+	struct io_kiocb			*req;
+	struct hrtimer			timer;
+	struct timespec64		ts;
+	enum hrtimer_mode		mode;
+};
+
 struct io_timeout {
 	struct file			*file;
-	struct hrtimer			timer;
+	struct io_timeout_data		*data;
 };
 
 /*
@@ -568,7 +575,7 @@ static void io_kill_timeout(struct io_kiocb *req)
 {
 	int ret;
 
-	ret = hrtimer_try_to_cancel(&req->timeout.timer);
+	ret = hrtimer_try_to_cancel(&req->timeout.data->timer);
 	if (ret != -1) {
 		atomic_inc(&req->ctx->cq_timeouts);
 		list_del_init(&req->list);
@@ -823,6 +830,8 @@ static void __io_free_req(struct io_kiocb *req)
 			wake_up(&ctx->inflight_wait);
 		spin_unlock_irqrestore(&ctx->inflight_lock, flags);
 	}
+	if (req->flags & REQ_F_TIMEOUT)
+		kfree(req->timeout.data);
 	percpu_ref_put(&ctx->refs);
 	if (likely(!io_is_fallback_req(req)))
 		kmem_cache_free(req_cachep, req);
@@ -835,7 +844,7 @@ static bool io_link_cancel_timeout(struct io_kiocb *req)
 	struct io_ring_ctx *ctx = req->ctx;
 	int ret;
 
-	ret = hrtimer_try_to_cancel(&req->timeout.timer);
+	ret = hrtimer_try_to_cancel(&req->timeout.data->timer);
 	if (ret != -1) {
 		io_cqring_fill_event(req, -ECANCELED);
 		io_commit_cqring(ctx);
@@ -2230,12 +2239,12 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 
 static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
 {
-	struct io_ring_ctx *ctx;
-	struct io_kiocb *req;
+	struct io_timeout_data *data = container_of(timer,
+						struct io_timeout_data, timer);
+	struct io_kiocb *req = data->req;
+	struct io_ring_ctx *ctx = req->ctx;
 	unsigned long flags;
 
-	req = container_of(timer, struct io_kiocb, timeout.timer);
-	ctx = req->ctx;
 	atomic_inc(&ctx->cq_timeouts);
 
 	spin_lock_irqsave(&ctx->completion_lock, flags);
@@ -2285,7 +2294,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
 	if (ret == -ENOENT)
 		return ret;
 
-	ret = hrtimer_try_to_cancel(&req->timeout.timer);
+	ret = hrtimer_try_to_cancel(&req->timeout.data->timer);
 	if (ret == -1)
 		return -EALREADY;
 
@@ -2325,33 +2334,54 @@ static int io_timeout_remove(struct io_kiocb *req,
 	return 0;
 }
 
-static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int io_timeout_setup(struct io_kiocb *req)
 {
-	unsigned count;
-	struct io_ring_ctx *ctx = req->ctx;
-	struct list_head *entry;
-	enum hrtimer_mode mode;
-	struct timespec64 ts;
-	unsigned span = 0;
+	const struct io_uring_sqe *sqe = req->submit.sqe;
+	struct io_timeout_data *data;
 	unsigned flags;
 
-	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
+	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
-	if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len != 1)
+	if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
 		return -EINVAL;
 	flags = READ_ONCE(sqe->timeout_flags);
 	if (flags & ~IORING_TIMEOUT_ABS)
 		return -EINVAL;
 
-	if (get_timespec64(&ts, u64_to_user_ptr(sqe->addr)))
+	data = kzalloc(sizeof(struct io_timeout_data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+	data->req = req;
+	req->timeout.data = data;
+	req->flags |= REQ_F_TIMEOUT;
+
+	if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
 		return -EFAULT;
 
 	if (flags & IORING_TIMEOUT_ABS)
-		mode = HRTIMER_MODE_ABS;
+		data->mode = HRTIMER_MODE_ABS;
 	else
-		mode = HRTIMER_MODE_REL;
+		data->mode = HRTIMER_MODE_REL;
+
+	hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
+	return 0;
+}
+
+static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	unsigned count;
+	struct io_ring_ctx *ctx = req->ctx;
+	struct io_timeout_data *data;
+	struct list_head *entry;
+	unsigned span = 0;
+	int ret;
 
-	hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, mode);
+	ret = io_timeout_setup(req);
+	/* common setup allows flags (like links) set, we don't */
+	if (!ret && sqe->flags)
+		ret = -EINVAL;
+	if (ret)
+		return ret;
 
 	/*
 	 * sqe->off holds how many events that need to occur for this
@@ -2364,7 +2394,6 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	req->sequence = ctx->cached_sq_head + count - 1;
 	/* reuse it to store the count */
 	req->submit.sequence = count;
-	req->flags |= REQ_F_TIMEOUT;
 
 	/*
 	 * Insertion sort, ensuring the first entry in the list is always
@@ -2403,8 +2432,9 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	}
 	req->sequence -= span;
 	list_add(&req->list, entry);
-	req->timeout.timer.function = io_timeout_fn;
-	hrtimer_start(&req->timeout.timer, timespec64_to_ktime(ts), mode);
+	data = req->timeout.data;
+	data->timer.function = io_timeout_fn;
+	hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
 	spin_unlock_irq(&ctx->completion_lock);
 	return 0;
 }
@@ -2739,8 +2769,9 @@ static int io_grab_files(struct io_kiocb *req)
 
 static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 {
-	struct io_kiocb *req = container_of(timer, struct io_kiocb,
-						timeout.timer);
+	struct io_timeout_data *data = container_of(timer,
+						struct io_timeout_data, timer);
+	struct io_kiocb *req = data->req;
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_kiocb *prev = NULL;
 	unsigned long flags;
@@ -2771,9 +2802,9 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 	return HRTIMER_NORESTART;
 }
 
-static void io_queue_linked_timeout(struct io_kiocb *req, struct timespec64 *ts,
-				    enum hrtimer_mode *mode)
+static void io_queue_linked_timeout(struct io_kiocb *req)
 {
+	struct io_timeout_data *data = req->timeout.data;
 	struct io_ring_ctx *ctx = req->ctx;
 
 	/*
@@ -2782,9 +2813,9 @@ static void io_queue_linked_timeout(struct io_kiocb *req, struct timespec64 *ts,
 	 */
 	spin_lock_irq(&ctx->completion_lock);
 	if (!list_empty(&req->list)) {
-		req->timeout.timer.function = io_link_timeout_fn;
-		hrtimer_start(&req->timeout.timer, timespec64_to_ktime(*ts),
-				*mode);
+		data->timer.function = io_link_timeout_fn;
+		hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
+				data->mode);
 	}
 	spin_unlock_irq(&ctx->completion_lock);
 
@@ -2792,22 +2823,7 @@ static void io_queue_linked_timeout(struct io_kiocb *req, struct timespec64 *ts,
 	io_put_req(req);
 }
 
-static int io_validate_link_timeout(const struct io_uring_sqe *sqe,
-				    struct timespec64 *ts)
-{
-	if (sqe->ioprio || sqe->buf_index || sqe->len != 1 || sqe->off)
-		return -EINVAL;
-	if (sqe->timeout_flags & ~IORING_TIMEOUT_ABS)
-		return -EINVAL;
-	if (get_timespec64(ts, u64_to_user_ptr(sqe->addr)))
-		return -EFAULT;
-
-	return 0;
-}
-
-static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req,
-					       struct timespec64 *ts,
-					       enum hrtimer_mode *mode)
+static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
 {
 	struct io_kiocb *nxt;
 	int ret;
@@ -2819,7 +2835,10 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req,
 	if (!nxt || nxt->submit.sqe->opcode != IORING_OP_LINK_TIMEOUT)
 		return NULL;
 
-	ret = io_validate_link_timeout(nxt->submit.sqe, ts);
+	ret = io_timeout_setup(nxt);
+	/* common setup allows offset being set, we don't */
+	if (!ret && nxt->submit.sqe->off)
+		ret = -EINVAL;
 	if (ret) {
 		list_del_init(&nxt->list);
 		io_cqring_add_event(nxt, ret);
@@ -2827,24 +2846,16 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req,
 		return ERR_PTR(-ECANCELED);
 	}
 
-	if (nxt->submit.sqe->timeout_flags & IORING_TIMEOUT_ABS)
-		*mode = HRTIMER_MODE_ABS;
-	else
-		*mode = HRTIMER_MODE_REL;
-
 	req->flags |= REQ_F_LINK_TIMEOUT;
-	hrtimer_init(&nxt->timeout.timer, CLOCK_MONOTONIC, *mode);
 	return nxt;
 }
 
 static void __io_queue_sqe(struct io_kiocb *req)
 {
-	enum hrtimer_mode mode;
 	struct io_kiocb *nxt;
-	struct timespec64 ts;
 	int ret;
 
-	nxt = io_prep_linked_timeout(req, &ts, &mode);
+	nxt = io_prep_linked_timeout(req);
 	if (IS_ERR(nxt)) {
 		ret = PTR_ERR(nxt);
 		nxt = NULL;
@@ -2880,7 +2891,7 @@ static void __io_queue_sqe(struct io_kiocb *req)
 			io_queue_async_work(req);
 
 			if (nxt)
-				io_queue_linked_timeout(nxt, &ts, &mode);
+				io_queue_linked_timeout(nxt);
 
 			return;
 		}
@@ -2892,7 +2903,7 @@ static void __io_queue_sqe(struct io_kiocb *req)
 
 	if (nxt) {
 		if (!ret)
-			io_queue_linked_timeout(nxt, &ts, &mode);
+			io_queue_linked_timeout(nxt);
 		else
 			io_put_req(nxt);
 	}
-- 
2.24.0


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 7/8] io_uring: fix sequencing issues with linked timeouts
  2019-11-16  1:53 [PATCHSET] Pending io_uring items not yet queued up for 5.5 Jens Axboe
                   ` (5 preceding siblings ...)
  2019-11-16  1:53 ` [PATCH 6/8] io_uring: make req->timeout be dynamically allocated Jens Axboe
@ 2019-11-16  1:53 ` Jens Axboe
  2019-11-19 20:51   ` Pavel Begunkov
  2019-11-16  1:53 ` [PATCH 8/8] io_uring: remove dead REQ_F_SEQ_PREV flag Jens Axboe
  7 siblings, 1 reply; 14+ messages in thread
From: Jens Axboe @ 2019-11-16  1:53 UTC (permalink / raw)
  To: io-uring; +Cc: asml.silence, Jens Axboe

We have an issue with timeout links that are deeper in the submit chain,
because we only handle it upfront, not from later submissions. Move the
prep + issue of the timeout link to the async work prep handler, and do
it normally for non-async queue. If we validate and prepare the timeout
links upfront when we first see them, there's nothing stopping us from
supporting any sort of nesting.

Fixes: 2665abfd757f ("io_uring: add support for linked SQE timeouts")
Reported-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 102 ++++++++++++++++++++++++++++++--------------------
 1 file changed, 61 insertions(+), 41 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 883ac9b01083..b88bd65c9848 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -352,6 +352,7 @@ struct io_kiocb {
 #define REQ_F_MUST_PUNT		4096	/* must be punted even for NONBLOCK */
 #define REQ_F_INFLIGHT		8192	/* on inflight list */
 #define REQ_F_COMP_LOCKED	16384	/* completion under lock */
+#define REQ_F_FREE_SQE		32768	/* free sqe if not async queued */
 	u64			user_data;
 	u32			result;
 	u32			sequence;
@@ -390,6 +391,8 @@ static void __io_free_req(struct io_kiocb *req);
 static void io_put_req(struct io_kiocb *req);
 static void io_double_put_req(struct io_kiocb *req);
 static void __io_double_put_req(struct io_kiocb *req);
+static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
+static void io_queue_linked_timeout(struct io_kiocb *req);
 
 static struct kmem_cache *req_cachep;
 
@@ -524,7 +527,8 @@ static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe)
 		 opcode == IORING_OP_WRITE_FIXED);
 }
 
-static inline bool io_prep_async_work(struct io_kiocb *req)
+static inline bool io_prep_async_work(struct io_kiocb *req,
+				      struct io_kiocb **link)
 {
 	bool do_hashed = false;
 
@@ -553,13 +557,17 @@ static inline bool io_prep_async_work(struct io_kiocb *req)
 			req->work.flags |= IO_WQ_WORK_NEEDS_USER;
 	}
 
+	*link = io_prep_linked_timeout(req);
 	return do_hashed;
 }
 
 static inline void io_queue_async_work(struct io_kiocb *req)
 {
-	bool do_hashed = io_prep_async_work(req);
 	struct io_ring_ctx *ctx = req->ctx;
+	struct io_kiocb *link;
+	bool do_hashed;
+
+	do_hashed = io_prep_async_work(req, &link);
 
 	trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work,
 					req->flags);
@@ -569,6 +577,9 @@ static inline void io_queue_async_work(struct io_kiocb *req)
 		io_wq_enqueue_hashed(ctx->io_wq, &req->work,
 					file_inode(req->file));
 	}
+
+	if (link)
+		io_queue_linked_timeout(link);
 }
 
 static void io_kill_timeout(struct io_kiocb *req)
@@ -870,6 +881,15 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 	nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list);
 	while (nxt) {
 		list_del_init(&nxt->list);
+
+		if ((req->flags & REQ_F_LINK_TIMEOUT) &&
+		    (nxt->flags & REQ_F_TIMEOUT)) {
+			wake_ev |= io_link_cancel_timeout(nxt);
+			nxt = list_first_entry_or_null(&req->link_list,
+							struct io_kiocb, list);
+			req->flags &= ~REQ_F_LINK_TIMEOUT;
+			continue;
+		}
 		if (!list_empty(&req->link_list)) {
 			INIT_LIST_HEAD(&nxt->link_list);
 			list_splice(&req->link_list, &nxt->link_list);
@@ -880,19 +900,13 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 		 * If we're in async work, we can continue processing the chain
 		 * in this context instead of having to queue up new async work.
 		 */
-		if (req->flags & REQ_F_LINK_TIMEOUT) {
-			wake_ev = io_link_cancel_timeout(nxt);
-
-			/* we dropped this link, get next */
-			nxt = list_first_entry_or_null(&req->link_list,
-							struct io_kiocb, list);
-		} else if (nxtptr && io_wq_current_is_worker()) {
-			*nxtptr = nxt;
-			break;
-		} else {
-			io_queue_async_work(nxt);
-			break;
+		if (nxt) {
+			if (nxtptr && io_wq_current_is_worker())
+				*nxtptr = nxt;
+			else
+				io_queue_async_work(nxt);
 		}
+		break;
 	}
 
 	if (wake_ev)
@@ -911,11 +925,16 @@ static void io_fail_links(struct io_kiocb *req)
 	spin_lock_irqsave(&ctx->completion_lock, flags);
 
 	while (!list_empty(&req->link_list)) {
+		const struct io_uring_sqe *sqe_to_free = NULL;
+
 		link = list_first_entry(&req->link_list, struct io_kiocb, list);
 		list_del_init(&link->list);
 
 		trace_io_uring_fail_link(req, link);
 
+		if (link->flags & REQ_F_FREE_SQE)
+			sqe_to_free = link->submit.sqe;
+
 		if ((req->flags & REQ_F_LINK_TIMEOUT) &&
 		    link->submit.sqe->opcode == IORING_OP_LINK_TIMEOUT) {
 			io_link_cancel_timeout(link);
@@ -923,6 +942,7 @@ static void io_fail_links(struct io_kiocb *req)
 			io_cqring_fill_event(link, -ECANCELED);
 			__io_double_put_req(link);
 		}
+		kfree(sqe_to_free);
 	}
 
 	io_commit_cqring(ctx);
@@ -2668,8 +2688,12 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
 
 	/* if a dependent link is ready, pass it back */
 	if (!ret && nxt) {
-		io_prep_async_work(nxt);
+		struct io_kiocb *link;
+
+		io_prep_async_work(nxt, &link);
 		*workptr = &nxt->work;
+		if (link)
+			io_queue_linked_timeout(link);
 	}
 }
 
@@ -2804,7 +2828,6 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 
 static void io_queue_linked_timeout(struct io_kiocb *req)
 {
-	struct io_timeout_data *data = req->timeout.data;
 	struct io_ring_ctx *ctx = req->ctx;
 
 	/*
@@ -2813,6 +2836,8 @@ static void io_queue_linked_timeout(struct io_kiocb *req)
 	 */
 	spin_lock_irq(&ctx->completion_lock);
 	if (!list_empty(&req->list)) {
+		struct io_timeout_data *data = req->timeout.data;
+
 		data->timer.function = io_link_timeout_fn;
 		hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
 				data->mode);
@@ -2826,7 +2851,6 @@ static void io_queue_linked_timeout(struct io_kiocb *req)
 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
 {
 	struct io_kiocb *nxt;
-	int ret;
 
 	if (!(req->flags & REQ_F_LINK))
 		return NULL;
@@ -2835,33 +2859,15 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
 	if (!nxt || nxt->submit.sqe->opcode != IORING_OP_LINK_TIMEOUT)
 		return NULL;
 
-	ret = io_timeout_setup(nxt);
-	/* common setup allows offset being set, we don't */
-	if (!ret && nxt->submit.sqe->off)
-		ret = -EINVAL;
-	if (ret) {
-		list_del_init(&nxt->list);
-		io_cqring_add_event(nxt, ret);
-		io_double_put_req(nxt);
-		return ERR_PTR(-ECANCELED);
-	}
-
 	req->flags |= REQ_F_LINK_TIMEOUT;
 	return nxt;
 }
 
 static void __io_queue_sqe(struct io_kiocb *req)
 {
-	struct io_kiocb *nxt;
+	struct io_kiocb *nxt = io_prep_linked_timeout(req);
 	int ret;
 
-	nxt = io_prep_linked_timeout(req);
-	if (IS_ERR(nxt)) {
-		ret = PTR_ERR(nxt);
-		nxt = NULL;
-		goto err;
-	}
-
 	ret = __io_submit_sqe(req, NULL, true);
 
 	/*
@@ -2889,10 +2895,6 @@ static void __io_queue_sqe(struct io_kiocb *req)
 			 * submit reference when the iocb is actually submitted.
 			 */
 			io_queue_async_work(req);
-
-			if (nxt)
-				io_queue_linked_timeout(nxt);
-
 			return;
 		}
 	}
@@ -2937,6 +2939,10 @@ static void io_queue_link_head(struct io_kiocb *req, struct io_kiocb *shadow)
 	int need_submit = false;
 	struct io_ring_ctx *ctx = req->ctx;
 
+	if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
+		ret = -ECANCELED;
+		goto err;
+	}
 	if (!shadow) {
 		io_queue_sqe(req);
 		return;
@@ -2951,9 +2957,11 @@ static void io_queue_link_head(struct io_kiocb *req, struct io_kiocb *shadow)
 	ret = io_req_defer(req);
 	if (ret) {
 		if (ret != -EIOCBQUEUED) {
+err:
 			io_cqring_add_event(req, ret);
 			io_double_put_req(req);
-			__io_free_req(shadow);
+			if (shadow)
+				__io_free_req(shadow);
 			return;
 		}
 	} else {
@@ -3010,6 +3018,17 @@ static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
 	if (*link) {
 		struct io_kiocb *prev = *link;
 
+		if (READ_ONCE(s->sqe->opcode) == IORING_OP_LINK_TIMEOUT) {
+			ret = io_timeout_setup(req);
+			/* common setup allows offset being set, we don't */
+			if (!ret && s->sqe->off)
+				ret = -EINVAL;
+			if (ret) {
+				prev->flags |= REQ_F_FAIL_LINK;
+				goto err_req;
+			}
+		}
+
 		sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
 		if (!sqe_copy) {
 			ret = -EAGAIN;
@@ -3017,6 +3036,7 @@ static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
 		}
 
 		s->sqe = sqe_copy;
+		req->flags |= REQ_F_FREE_SQE;
 		trace_io_uring_link(ctx, req, prev);
 		list_add_tail(&req->list, &prev->link_list);
 	} else if (s->sqe->flags & IOSQE_IO_LINK) {
-- 
2.24.0


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 8/8] io_uring: remove dead REQ_F_SEQ_PREV flag
  2019-11-16  1:53 [PATCHSET] Pending io_uring items not yet queued up for 5.5 Jens Axboe
                   ` (6 preceding siblings ...)
  2019-11-16  1:53 ` [PATCH 7/8] io_uring: fix sequencing issues with linked timeouts Jens Axboe
@ 2019-11-16  1:53 ` Jens Axboe
  7 siblings, 0 replies; 14+ messages in thread
From: Jens Axboe @ 2019-11-16  1:53 UTC (permalink / raw)
  To: io-uring; +Cc: asml.silence, Jens Axboe

With the conversion to io-wq, we no longer use that flag. Kill it.

Fixes: 561fb04a6a22 ("io_uring: replace workqueue usage with io-wq")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index b88bd65c9848..824ddd1fd3f0 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -340,7 +340,6 @@ struct io_kiocb {
 #define REQ_F_NOWAIT		1	/* must not punt to workers */
 #define REQ_F_IOPOLL_COMPLETED	2	/* polled IO has completed */
 #define REQ_F_FIXED_FILE	4	/* ctx owns file */
-#define REQ_F_SEQ_PREV		8	/* sequential with previous */
 #define REQ_F_IO_DRAIN		16	/* drain existing IO first */
 #define REQ_F_IO_DRAINED	32	/* drain done */
 #define REQ_F_LINK		64	/* linked sqes */
-- 
2.24.0


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 7/8] io_uring: fix sequencing issues with linked timeouts
  2019-11-16  1:53 ` [PATCH 7/8] io_uring: fix sequencing issues with linked timeouts Jens Axboe
@ 2019-11-19 20:51   ` Pavel Begunkov
  2019-11-19 22:13     ` Jens Axboe
  0 siblings, 1 reply; 14+ messages in thread
From: Pavel Begunkov @ 2019-11-19 20:51 UTC (permalink / raw)
  To: Jens Axboe, io-uring

[-- Attachment #1.1: Type: text/plain, Size: 1386 bytes --]

On 16/11/2019 04:53, Jens Axboe wrote:
> We have an issue with timeout links that are deeper in the submit chain,
> because we only handle it upfront, not from later submissions. Move the
> prep + issue of the timeout link to the async work prep handler, and do
> it normally for non-async queue. If we validate and prepare the timeout
> links upfront when we first see them, there's nothing stopping us from
> supporting any sort of nesting.
> 
> Fixes: 2665abfd757f ("io_uring: add support for linked SQE timeouts")
> Reported-by: Pavel Begunkov <asml.silence@gmail.com>
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---

> @@ -923,6 +942,7 @@ static void io_fail_links(struct io_kiocb *req)
>  			io_cqring_fill_event(link, -ECANCELED);
>  			__io_double_put_req(link);
>  		}
> +		kfree(sqe_to_free);
>  	}
>  
>  	io_commit_cqring(ctx);
> @@ -2668,8 +2688,12 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
>  
>  	/* if a dependent link is ready, pass it back */
>  	if (!ret && nxt) {
> -		io_prep_async_work(nxt);
> +		struct io_kiocb *link;
> +
> +		io_prep_async_work(nxt, &link);
>  		*workptr = &nxt->work;
Are we safe here without synchronisation?
io_link_timeout_fn() might miss the new value of *workptr
(when it does the io-wq cancel).


> +		if (link)
> +			io_queue_linked_timeout(link);
>  	}
>  }
>  

-- 
Pavel Begunkov


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 7/8] io_uring: fix sequencing issues with linked timeouts
  2019-11-19 20:51   ` Pavel Begunkov
@ 2019-11-19 22:13     ` Jens Axboe
  2019-11-20 12:42       ` Pavel Begunkov
  0 siblings, 1 reply; 14+ messages in thread
From: Jens Axboe @ 2019-11-19 22:13 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring

On 11/19/19 1:51 PM, Pavel Begunkov wrote:
> On 16/11/2019 04:53, Jens Axboe wrote:
>> We have an issue with timeout links that are deeper in the submit chain,
>> because we only handle it upfront, not from later submissions. Move the
>> prep + issue of the timeout link to the async work prep handler, and do
>> it normally for non-async queue. If we validate and prepare the timeout
>> links upfront when we first see them, there's nothing stopping us from
>> supporting any sort of nesting.
>>
>> Fixes: 2665abfd757f ("io_uring: add support for linked SQE timeouts")
>> Reported-by: Pavel Begunkov <asml.silence@gmail.com>
>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>> ---
> 
>> @@ -923,6 +942,7 @@ static void io_fail_links(struct io_kiocb *req)
>>   			io_cqring_fill_event(link, -ECANCELED);
>>   			__io_double_put_req(link);
>>   		}
>> +		kfree(sqe_to_free);
>>   	}
>>   
>>   	io_commit_cqring(ctx);
>> @@ -2668,8 +2688,12 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
>>   
>>   	/* if a dependent link is ready, pass it back */
>>   	if (!ret && nxt) {
>> -		io_prep_async_work(nxt);
>> +		struct io_kiocb *link;
>> +
>> +		io_prep_async_work(nxt, &link);
>>   		*workptr = &nxt->work;
> Are we safe here without synchronisation?
> Probably io_link_timeout_fn() may miss the new value
> (doing io-wq cancel).

Miss what new value? Don't follow that part.

This should be safe, by the time the request is findable, we have
made the necessary setup in io_prep_async_work().

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 7/8] io_uring: fix sequencing issues with linked timeouts
  2019-11-19 22:13     ` Jens Axboe
@ 2019-11-20 12:42       ` Pavel Begunkov
  2019-11-20 17:19         ` Jens Axboe
  0 siblings, 1 reply; 14+ messages in thread
From: Pavel Begunkov @ 2019-11-20 12:42 UTC (permalink / raw)
  To: Jens Axboe, io-uring

On 11/20/2019 1:13 AM, Jens Axboe wrote:
> On 11/19/19 1:51 PM, Pavel Begunkov wrote:
>> On 16/11/2019 04:53, Jens Axboe wrote:
>>> We have an issue with timeout links that are deeper in the submit chain,
>>> because we only handle it upfront, not from later submissions. Move the
>>> prep + issue of the timeout link to the async work prep handler, and do
>>> it normally for non-async queue. If we validate and prepare the timeout
>>> links upfront when we first see them, there's nothing stopping us from
>>> supporting any sort of nesting.
>>>
>>> Fixes: 2665abfd757f ("io_uring: add support for linked SQE timeouts")
>>> Reported-by: Pavel Begunkov <asml.silence@gmail.com>
>>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>>> ---
>>
>>> @@ -923,6 +942,7 @@ static void io_fail_links(struct io_kiocb *req)
>>>   			io_cqring_fill_event(link, -ECANCELED);
>>>   			__io_double_put_req(link);
>>>   		}
>>> +		kfree(sqe_to_free);
>>>   	}
>>>   
>>>   	io_commit_cqring(ctx);
>>> @@ -2668,8 +2688,12 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
>>>   
>>>   	/* if a dependent link is ready, pass it back */
>>>   	if (!ret && nxt) {
>>> -		io_prep_async_work(nxt);
>>> +		struct io_kiocb *link;
>>> +
>>> +		io_prep_async_work(nxt, &link);
>>>   		*workptr = &nxt->work;
>> Are we safe here without synchronisation?
>> Probably io_link_timeout_fn() may miss the new value
>> (doing io-wq cancel).
> 
> Miss what new value? Don't follow that part.
> 

As I understand the idea of postponing:
by the time io_queue_linked_timeout() is called, the request should be
either in io-wq or completed. So, after the assignment above, @nxt->work
should be visible to an asynchronously called io_wq_cancel_work().

>>>  *workptr = &nxt->work;
However, there is no synchronisation for this assignment, and it might
not be visible to a parallel thread. Is that somehow handled in io-wq?

The pseudo code is below (th1, th2 - parallel threads)
th1: *workptr = &req->work;
// non-atomic assignment, the new value of workptr (i.e. &req->work)
// isn't yet propagated to th2

th1: io_queue_linked_timeout()
th2: io_link_timeout_fn(), calls io_wq_cancel_work(), @req not found
th2: // memory model finally propagated *workptr = &req->work to @th2


Please let me know if that's still not clear.

> This should be safe, by the time the request is findable, we have
> made the necessary setup in io_prep_async_work().
> 

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 7/8] io_uring: fix sequencing issues with linked timeouts
  2019-11-20 12:42       ` Pavel Begunkov
@ 2019-11-20 17:19         ` Jens Axboe
  2019-11-20 18:15           ` Jens Axboe
  0 siblings, 1 reply; 14+ messages in thread
From: Jens Axboe @ 2019-11-20 17:19 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring

On 11/20/19 5:42 AM, Pavel Begunkov wrote:
> On 11/20/2019 1:13 AM, Jens Axboe wrote:
>> On 11/19/19 1:51 PM, Pavel Begunkov wrote:
>>> On 16/11/2019 04:53, Jens Axboe wrote:
>>>> We have an issue with timeout links that are deeper in the submit chain,
>>>> because we only handle it upfront, not from later submissions. Move the
>>>> prep + issue of the timeout link to the async work prep handler, and do
>>>> it normally for non-async queue. If we validate and prepare the timeout
>>>> links upfront when we first see them, there's nothing stopping us from
>>>> supporting any sort of nesting.
>>>>
>>>> Fixes: 2665abfd757f ("io_uring: add support for linked SQE timeouts")
>>>> Reported-by: Pavel Begunkov <asml.silence@gmail.com>
>>>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>>>> ---
>>>
>>>> @@ -923,6 +942,7 @@ static void io_fail_links(struct io_kiocb *req)
>>>>    			io_cqring_fill_event(link, -ECANCELED);
>>>>    			__io_double_put_req(link);
>>>>    		}
>>>> +		kfree(sqe_to_free);
>>>>    	}
>>>>    
>>>>    	io_commit_cqring(ctx);
>>>> @@ -2668,8 +2688,12 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
>>>>    
>>>>    	/* if a dependent link is ready, pass it back */
>>>>    	if (!ret && nxt) {
>>>> -		io_prep_async_work(nxt);
>>>> +		struct io_kiocb *link;
>>>> +
>>>> +		io_prep_async_work(nxt, &link);
>>>>    		*workptr = &nxt->work;
>>> Are we safe here without synchronisation?
>>> Probably io_link_timeout_fn() may miss the new value
>>> (doing io-wq cancel).
>>
>> Miss what new value? Don't follow that part.
>>
> 
> As I've got the idea of postponing:
> at the moment of io_queue_linked_timeout(), a request should be either
> in io-wq or completed. So, @nxt->work after the assignment above should
> be visible to asynchronously called io_wq_cancel_work().
> 
>>>>   *workptr = &nxt->work;
> However, there is no synchronisation for this assignment, and it could
> be not visible from a parallel thread. Is it somehow handled in io-wq?
> 
> The pseudo code is below (th1, th2 - parallel threads)
> th1: *workptr = &req->work;
> // non-atomic assignment, the new value of workptr (i.e. &req->work)
> // isn't yet propagated to th2
> 
> th1: io_queue_linked_timeout()
> th2: io_linked_timeout_fn(), calls io_wq_cancel_work(), @req not found
> th2: // memory model finally propagated *workptr = &req->work to @th2
> 
> 
> Please, let me know if that's also not clear.

OK, so I see what you're saying, but I don't think it's missing locking.
There is, however, a gap where we won't be able to find the request.
What we need is a way to assign the io-wq current work before we call
io_queue_linked_timeout(). Something ala:

	io_prep_async_work(nxt, &link);
	*workptr = &nxt->work;
+	io_wq_assign_cur();
	if (link)
		io_queue_linked_timeout(link);

where io_wq_assign_cur() ensures that worker->cur_work is set to the new
work, so we know it's discoverable before calling
io_queue_linked_timeout(). Probably also needs to include the
->get_work() call as part of that, so moving the logic around a bit in
io_worker_handle_work().

If we do that, then by the time we arm the linked timer, we know we'll
be able to find the new work item. The old work is done at this point
anyway, so doing this a bit earlier is fine.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 7/8] io_uring: fix sequencing issues with linked timeouts
  2019-11-20 17:19         ` Jens Axboe
@ 2019-11-20 18:15           ` Jens Axboe
  0 siblings, 0 replies; 14+ messages in thread
From: Jens Axboe @ 2019-11-20 18:15 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring

On 11/20/19 10:19 AM, Jens Axboe wrote:
> On 11/20/19 5:42 AM, Pavel Begunkov wrote:
>> On 11/20/2019 1:13 AM, Jens Axboe wrote:
>>> On 11/19/19 1:51 PM, Pavel Begunkov wrote:
>>>> On 16/11/2019 04:53, Jens Axboe wrote:
>>>>> We have an issue with timeout links that are deeper in the submit chain,
>>>>> because we only handle it upfront, not from later submissions. Move the
>>>>> prep + issue of the timeout link to the async work prep handler, and do
>>>>> it normally for non-async queue. If we validate and prepare the timeout
>>>>> links upfront when we first see them, there's nothing stopping us from
>>>>> supporting any sort of nesting.
>>>>>
>>>>> Fixes: 2665abfd757f ("io_uring: add support for linked SQE timeouts")
>>>>> Reported-by: Pavel Begunkov <asml.silence@gmail.com>
>>>>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>>>>> ---
>>>>
>>>>> @@ -923,6 +942,7 @@ static void io_fail_links(struct io_kiocb *req)
>>>>>     			io_cqring_fill_event(link, -ECANCELED);
>>>>>     			__io_double_put_req(link);
>>>>>     		}
>>>>> +		kfree(sqe_to_free);
>>>>>     	}
>>>>>     
>>>>>     	io_commit_cqring(ctx);
>>>>> @@ -2668,8 +2688,12 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
>>>>>     
>>>>>     	/* if a dependent link is ready, pass it back */
>>>>>     	if (!ret && nxt) {
>>>>> -		io_prep_async_work(nxt);
>>>>> +		struct io_kiocb *link;
>>>>> +
>>>>> +		io_prep_async_work(nxt, &link);
>>>>>     		*workptr = &nxt->work;
>>>> Are we safe here without synchronisation?
>>>> Probably io_link_timeout_fn() may miss the new value
>>>> (doing io-wq cancel).
>>>
>>> Miss what new value? Don't follow that part.
>>>
>>
>> As I've got the idea of postponing:
>> at the moment of io_queue_linked_timeout(), a request should be either
>> in io-wq or completed. So, @nxt->work after the assignment above should
>> be visible to asynchronously called io_wq_cancel_work().
>>
>>>>>    *workptr = &nxt->work;
>> However, there is no synchronisation for this assignment, and it could
>> be not visible from a parallel thread. Is it somehow handled in io-wq?
>>
>> The pseudo code is below (th1, th2 - parallel threads)
>> th1: *workptr = &req->work;
>> // non-atomic assignment, the new value of workptr (i.e. &req->work)
>> // isn't yet propagated to th2
>>
>> th1: io_queue_linked_timeout()
>> th2: io_linked_timeout_fn(), calls io_wq_cancel_work(), @req not found
>> th2: // memory model finally propagated *workptr = &req->work to @th2
>>
>>
>> Please, let me know if that's also not clear.
> 
> OK, so I see what you're saying, but I don't think it's missing locking.
> There is, however, a gap where we won't be able to find the request.
> What we need is a way to assign the io-wq current work before we call
> io_queue_linked_timeout(). Something ala:
> 
> 	io_prep_async_work(nxt, &link);
> 	*workptr = &nxt->work;
> +	io_wq_assign_cur();
> 	if (link)
> 		io_queue_linked_timeout(link);
> 
> where io_wq_assign_cur() ensures that worker->cur_work is set to the new
> work, so we know it's discoverable before calling
> io_queue_linked_timeout(). Probably also needs to include the
> ->get_work() call as part of that, so moving the logic around a bit in
> io_worker_handle_work().
> 
> If we do that, then by the time we arm the linked timer, we know we'll
> be able to find the new work item. The old work is done at this point
> anyway, so doing this a bit earlier is fine.

Something like this, totally untested.


diff --git a/fs/io-wq.c b/fs/io-wq.c
index b4bc377dda61..2666384aaf44 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -427,6 +427,9 @@ static void io_worker_handle_work(struct io_worker *worker)
 		worker->cur_work = work;
 		spin_unlock_irq(&worker->lock);
 
+		if (work->flags & IO_WQ_WORK_CB)
+			work->cb.fn(work->cb.data);
+
 		if ((work->flags & IO_WQ_WORK_NEEDS_FILES) &&
 		    current->files != work->files) {
 			task_lock(current);
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 4b29f922f80c..892989f3e41e 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -11,6 +11,7 @@ enum {
 	IO_WQ_WORK_NEEDS_FILES	= 16,
 	IO_WQ_WORK_UNBOUND	= 32,
 	IO_WQ_WORK_INTERNAL	= 64,
+	IO_WQ_WORK_CB		= 128,
 
 	IO_WQ_HASH_SHIFT	= 24,	/* upper 8 bits are used for hash key */
 };
@@ -21,8 +22,17 @@ enum io_wq_cancel {
 	IO_WQ_CANCEL_NOTFOUND,	/* work not found */
 };
 
+struct io_wq_work;
+struct io_wq_work_cb {
+	void (*fn)(void *data);
+	void *data;
+};
+
 struct io_wq_work {
-	struct list_head list;
+	union {
+		struct list_head list;
+		struct io_wq_work_cb cb;
+	};
 	void (*func)(struct io_wq_work **);
 	unsigned flags;
 	struct files_struct *files;
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 066b59ffb54e..6f5745342eb2 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2654,6 +2654,13 @@ static int __io_submit_sqe(struct io_kiocb *req, struct io_kiocb **nxt,
 	return 0;
 }
 
+static void io_link_work_cb(void *data)
+{
+	struct io_kiocb *link = data;
+
+	io_queue_linked_timeout(link);
+}
+
 static void io_wq_submit_work(struct io_wq_work **workptr)
 {
 	struct io_wq_work *work = *workptr;
@@ -2700,8 +2707,11 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
 
 		io_prep_async_work(nxt, &link);
 		*workptr = &nxt->work;
-		if (link)
-			io_queue_linked_timeout(link);
+		if (link) {
+			nxt->work.flags |= IO_WQ_WORK_CB;
+			nxt->work.cb.fn = io_link_work_cb;
+			nxt->work.cb.data = link;
+		}
 	}
 }
 

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, back to index

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-11-16  1:53 [PATCHSET] Pending io_uring items not yet queued up for 5.5 Jens Axboe
2019-11-16  1:53 ` [PATCH 1/8] io-wq: remove now redundant struct io_wq_nulls_list Jens Axboe
2019-11-16  1:53 ` [PATCH 2/8] io_uring: make POLL_ADD/POLL_REMOVE scale better Jens Axboe
2019-11-16  1:53 ` [PATCH 3/8] io_uring: io_async_cancel() should pass in 'nxt' request pointer Jens Axboe
2019-11-16  1:53 ` [PATCH 4/8] io_uring: cleanup return values from the queueing functions Jens Axboe
2019-11-16  1:53 ` [PATCH 5/8] io_uring: make io_double_put_req() use normal completion path Jens Axboe
2019-11-16  1:53 ` [PATCH 6/8] io_uring: make req->timeout be dynamically allocated Jens Axboe
2019-11-16  1:53 ` [PATCH 7/8] io_uring: fix sequencing issues with linked timeouts Jens Axboe
2019-11-19 20:51   ` Pavel Begunkov
2019-11-19 22:13     ` Jens Axboe
2019-11-20 12:42       ` Pavel Begunkov
2019-11-20 17:19         ` Jens Axboe
2019-11-20 18:15           ` Jens Axboe
2019-11-16  1:53 ` [PATCH 8/8] io_uring: remove dead REQ_F_SEQ_PREV flag Jens Axboe

IO-Uring Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/io-uring/0 io-uring/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 io-uring io-uring/ https://lore.kernel.org/io-uring \
		io-uring@vger.kernel.org
	public-inbox-index io-uring

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.io-uring


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git