IO-Uring Archive on lore.kernel.org
 help / color / Atom feed
* [PATCHSET 0/4] io_uring patches for 5.5-rc
@ 2019-12-09 23:18 Jens Axboe
  2019-12-09 23:18 ` [PATCH 1/4] io_uring: allow unbreakable links Jens Axboe
                   ` (3 more replies)
  0 siblings, 4 replies; 7+ messages in thread
From: Jens Axboe @ 2019-12-09 23:18 UTC (permalink / raw)
  To: io-uring

A few patches that I have queued up for 5.5. #1 is marked for stable
so we retain the same interface, earlier kernels don't have timeouts.
2+3 fix a performance regression with 5.4, and 4 fixes a potential
issue now that we have fileset updates.

-- 
Jens Axboe



^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 1/4] io_uring: allow unbreakable links
  2019-12-09 23:18 [PATCHSET 0/4] io_uring patches for 5.5-rc Jens Axboe
@ 2019-12-09 23:18 ` Jens Axboe
  2019-12-10 10:13   ` Pavel Begunkov
  2019-12-09 23:18 ` [PATCH 2/4] io-wq: remove worker->wait waitqueue Jens Axboe
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 7+ messages in thread
From: Jens Axboe @ 2019-12-09 23:18 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe, stable, 李通洲

Some commands will invariably end in a failure in the sense that the
completion result will be less than zero. One such example is timeouts
that don't have a completion count set, they will always complete with
-ETIME unless cancelled.

For linked commands, we sever links and fail the rest of the chain if
the result is less than zero. Since we have commands where we know that
will happen, add IOSQE_IO_HARDLINK as a stronger link that doesn't sever
regardless of the completion result. Note that the link will still sever
if we fail submitting the parent request, hard links are only resilient
in the presence of completion results for requests that did submit
correctly.

Cc: stable@vger.kernel.org # v5.4
Reported-by: 李通洲 <carter.li@eoitek.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 54 +++++++++++++++++++++--------------
 include/uapi/linux/io_uring.h |  1 +
 2 files changed, 33 insertions(+), 22 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 405be10da73d..662404854571 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -377,6 +377,7 @@ struct io_kiocb {
 #define REQ_F_TIMEOUT_NOSEQ	8192	/* no timeout sequence */
 #define REQ_F_INFLIGHT		16384	/* on inflight list */
 #define REQ_F_COMP_LOCKED	32768	/* completion under lock */
+#define REQ_F_HARDLINK		65536	/* doesn't sever on completion < 0 */
 	u64			user_data;
 	u32			result;
 	u32			sequence;
@@ -941,7 +942,7 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 
 		list_del_init(&req->link_list);
 		if (!list_empty(&nxt->link_list))
-			nxt->flags |= REQ_F_LINK;
+			nxt->flags |= req->flags & (REQ_F_LINK|REQ_F_HARDLINK);
 		*nxtptr = nxt;
 		break;
 	}
@@ -1292,6 +1293,11 @@ static void kiocb_end_write(struct io_kiocb *req)
 	file_end_write(req->file);
 }
 
+static inline bool req_fail_links(struct io_kiocb *req)
+{
+	return (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK;
+}
+
 static void io_complete_rw_common(struct kiocb *kiocb, long res)
 {
 	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
@@ -1299,7 +1305,7 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res)
 	if (kiocb->ki_flags & IOCB_WRITE)
 		kiocb_end_write(req);
 
-	if ((req->flags & REQ_F_LINK) && res != req->result)
+	if (res != req->result && req_fail_links(req))
 		req->flags |= REQ_F_FAIL_LINK;
 	io_cqring_add_event(req, res);
 }
@@ -1330,7 +1336,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
 	if (kiocb->ki_flags & IOCB_WRITE)
 		kiocb_end_write(req);
 
-	if ((req->flags & REQ_F_LINK) && res != req->result)
+	if (res != req->result && req_fail_links(req))
 		req->flags |= REQ_F_FAIL_LINK;
 	req->result = res;
 	if (res != -EAGAIN)
@@ -1956,7 +1962,7 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 				end > 0 ? end : LLONG_MAX,
 				fsync_flags & IORING_FSYNC_DATASYNC);
 
-	if (ret < 0 && (req->flags & REQ_F_LINK))
+	if (ret < 0 && req_fail_links(req))
 		req->flags |= REQ_F_FAIL_LINK;
 	io_cqring_add_event(req, ret);
 	io_put_req_find_next(req, nxt);
@@ -2003,7 +2009,7 @@ static int io_sync_file_range(struct io_kiocb *req,
 
 	ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags);
 
-	if (ret < 0 && (req->flags & REQ_F_LINK))
+	if (ret < 0 && req_fail_links(req))
 		req->flags |= REQ_F_FAIL_LINK;
 	io_cqring_add_event(req, ret);
 	io_put_req_find_next(req, nxt);
@@ -2079,7 +2085,7 @@ static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 
 out:
 	io_cqring_add_event(req, ret);
-	if (ret < 0 && (req->flags & REQ_F_LINK))
+	if (ret < 0 && req_fail_links(req))
 		req->flags |= REQ_F_FAIL_LINK;
 	io_put_req_find_next(req, nxt);
 	return 0;
@@ -2161,7 +2167,7 @@ static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 
 out:
 	io_cqring_add_event(req, ret);
-	if (ret < 0 && (req->flags & REQ_F_LINK))
+	if (ret < 0 && req_fail_links(req))
 		req->flags |= REQ_F_FAIL_LINK;
 	io_put_req_find_next(req, nxt);
 	return 0;
@@ -2196,7 +2202,7 @@ static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	}
 	if (ret == -ERESTARTSYS)
 		ret = -EINTR;
-	if (ret < 0 && (req->flags & REQ_F_LINK))
+	if (ret < 0 && req_fail_links(req))
 		req->flags |= REQ_F_FAIL_LINK;
 	io_cqring_add_event(req, ret);
 	io_put_req_find_next(req, nxt);
@@ -2263,7 +2269,7 @@ static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	if (ret == -ERESTARTSYS)
 		ret = -EINTR;
 out:
-	if (ret < 0 && (req->flags & REQ_F_LINK))
+	if (ret < 0 && req_fail_links(req))
 		req->flags |= REQ_F_FAIL_LINK;
 	io_cqring_add_event(req, ret);
 	io_put_req_find_next(req, nxt);
@@ -2340,7 +2346,7 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	spin_unlock_irq(&ctx->completion_lock);
 
 	io_cqring_add_event(req, ret);
-	if (ret < 0 && (req->flags & REQ_F_LINK))
+	if (ret < 0 && req_fail_links(req))
 		req->flags |= REQ_F_FAIL_LINK;
 	io_put_req(req);
 	return 0;
@@ -2399,7 +2405,7 @@ static void io_poll_complete_work(struct io_wq_work **workptr)
 
 	io_cqring_ev_posted(ctx);
 
-	if (ret < 0 && req->flags & REQ_F_LINK)
+	if (ret < 0 && req_fail_links(req))
 		req->flags |= REQ_F_FAIL_LINK;
 	io_put_req_find_next(req, &nxt);
 	if (nxt)
@@ -2582,7 +2588,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
 	io_cqring_ev_posted(ctx);
-	if (req->flags & REQ_F_LINK)
+	if (req_fail_links(req))
 		req->flags |= REQ_F_FAIL_LINK;
 	io_put_req(req);
 	return HRTIMER_NORESTART;
@@ -2608,7 +2614,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
 	if (ret == -1)
 		return -EALREADY;
 
-	if (req->flags & REQ_F_LINK)
+	if (req_fail_links(req))
 		req->flags |= REQ_F_FAIL_LINK;
 	io_cqring_fill_event(req, -ECANCELED);
 	io_put_req(req);
@@ -2640,7 +2646,7 @@ static int io_timeout_remove(struct io_kiocb *req,
 	io_commit_cqring(ctx);
 	spin_unlock_irq(&ctx->completion_lock);
 	io_cqring_ev_posted(ctx);
-	if (ret < 0 && req->flags & REQ_F_LINK)
+	if (ret < 0 && req_fail_links(req))
 		req->flags |= REQ_F_FAIL_LINK;
 	io_put_req(req);
 	return 0;
@@ -2822,7 +2828,7 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 	io_cqring_ev_posted(ctx);
 
-	if (ret < 0 && (req->flags & REQ_F_LINK))
+	if (ret < 0 && req_fail_links(req))
 		req->flags |= REQ_F_FAIL_LINK;
 	io_put_req_find_next(req, nxt);
 }
@@ -3044,7 +3050,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
 	io_put_req(req);
 
 	if (ret) {
-		if (req->flags & REQ_F_LINK)
+		if (req_fail_links(req))
 			req->flags |= REQ_F_FAIL_LINK;
 		io_cqring_add_event(req, ret);
 		io_put_req(req);
@@ -3179,7 +3185,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
 	if (prev) {
-		if (prev->flags & REQ_F_LINK)
+		if (req_fail_links(prev))
 			prev->flags |= REQ_F_FAIL_LINK;
 		io_async_find_and_cancel(ctx, req, prev->user_data, NULL,
 						-ETIME);
@@ -3273,7 +3279,7 @@ static void __io_queue_sqe(struct io_kiocb *req)
 	/* and drop final reference, if we failed */
 	if (ret) {
 		io_cqring_add_event(req, ret);
-		if (req->flags & REQ_F_LINK)
+		if (req_fail_links(req))
 			req->flags |= REQ_F_FAIL_LINK;
 		io_put_req(req);
 	}
@@ -3293,7 +3299,7 @@ static void io_queue_sqe(struct io_kiocb *req)
 	if (ret) {
 		if (ret != -EIOCBQUEUED) {
 			io_cqring_add_event(req, ret);
-			if (req->flags & REQ_F_LINK)
+			if (req_fail_links(req))
 				req->flags |= REQ_F_FAIL_LINK;
 			io_double_put_req(req);
 		}
@@ -3311,7 +3317,8 @@ static inline void io_queue_link_head(struct io_kiocb *req)
 }
 
 
-#define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)
+#define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK|	\
+				IOSQE_IO_HARDLINK)
 
 static bool io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
 			  struct io_kiocb **link)
@@ -3358,13 +3365,16 @@ static bool io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
 		ret = io_req_defer_prep(req, io);
 		if (ret) {
 			kfree(io);
+			/* fail even hard links since we don't submit */
 			prev->flags |= REQ_F_FAIL_LINK;
 			goto err_req;
 		}
 		trace_io_uring_link(ctx, req, prev);
 		list_add_tail(&req->link_list, &prev->link_list);
-	} else if (req->sqe->flags & IOSQE_IO_LINK) {
+	} else if (req->sqe->flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) {
 		req->flags |= REQ_F_LINK;
+		if (req->sqe->flags & IOSQE_IO_HARDLINK)
+			req->flags |= REQ_F_HARDLINK;
 
 		INIT_LIST_HEAD(&req->link_list);
 		*link = req;
@@ -3518,7 +3528,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 		 * If previous wasn't linked and we have a linked command,
 		 * that's the end of the chain. Submit the previous link.
 		 */
-		if (!(sqe_flags & IOSQE_IO_LINK) && link) {
+		if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) && link) {
 			io_queue_link_head(link);
 			link = NULL;
 		}
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index eabccb46edd1..f296a5e77661 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -48,6 +48,7 @@ struct io_uring_sqe {
 #define IOSQE_FIXED_FILE	(1U << 0)	/* use fixed fileset */
 #define IOSQE_IO_DRAIN		(1U << 1)	/* issue after inflight IO */
 #define IOSQE_IO_LINK		(1U << 2)	/* links next sqe */
+#define IOSQE_IO_HARDLINK	(1U << 3)	/* link LINK, but stronger */
 
 /*
  * io_uring_setup() flags
-- 
2.24.0


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 2/4] io-wq: remove worker->wait waitqueue
  2019-12-09 23:18 [PATCHSET 0/4] io_uring patches for 5.5-rc Jens Axboe
  2019-12-09 23:18 ` [PATCH 1/4] io_uring: allow unbreakable links Jens Axboe
@ 2019-12-09 23:18 ` Jens Axboe
  2019-12-09 23:18 ` [PATCH 3/4] io-wq: briefly spin for new work after finishing work Jens Axboe
  2019-12-09 23:18 ` [PATCH 4/4] io_uring: sqthread should grab ctx->uring_lock for submissions Jens Axboe
  3 siblings, 0 replies; 7+ messages in thread
From: Jens Axboe @ 2019-12-09 23:18 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

We only have one cases of using the waitqueue to wake the worker, the
rest are using wake_up_process(). Since we can save some cycles not
fiddling with the waitqueue io_wqe_worker(), switch the work activation
to task wakeup and get rid of the now unused wait_queue_head_t in
struct io_worker.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 74b40506c5d9..6b663ddceb42 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -49,7 +49,6 @@ struct io_worker {
 	struct hlist_nulls_node nulls_node;
 	struct list_head all_list;
 	struct task_struct *task;
-	wait_queue_head_t wait;
 	struct io_wqe *wqe;
 
 	struct io_wq_work *cur_work;
@@ -258,7 +257,7 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
 
 	worker = hlist_nulls_entry(n, struct io_worker, nulls_node);
 	if (io_worker_get(worker)) {
-		wake_up(&worker->wait);
+		wake_up_process(worker->task);
 		io_worker_release(worker);
 		return true;
 	}
@@ -497,13 +496,11 @@ static int io_wqe_worker(void *data)
 	struct io_worker *worker = data;
 	struct io_wqe *wqe = worker->wqe;
 	struct io_wq *wq = wqe->wq;
-	DEFINE_WAIT(wait);
 
 	io_worker_start(wqe, worker);
 
 	while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
-		prepare_to_wait(&worker->wait, &wait, TASK_INTERRUPTIBLE);
-
+		set_current_state(TASK_INTERRUPTIBLE);
 		spin_lock_irq(&wqe->lock);
 		if (io_wqe_run_queue(wqe)) {
 			__set_current_state(TASK_RUNNING);
@@ -526,8 +523,6 @@ static int io_wqe_worker(void *data)
 			break;
 	}
 
-	finish_wait(&worker->wait, &wait);
-
 	if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
 		spin_lock_irq(&wqe->lock);
 		if (!wq_list_empty(&wqe->work_list))
@@ -589,7 +584,6 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
 
 	refcount_set(&worker->ref, 1);
 	worker->nulls_node.pprev = NULL;
-	init_waitqueue_head(&worker->wait);
 	worker->wqe = wqe;
 	spin_lock_init(&worker->lock);
 
-- 
2.24.0


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 3/4] io-wq: briefly spin for new work after finishing work
  2019-12-09 23:18 [PATCHSET 0/4] io_uring patches for 5.5-rc Jens Axboe
  2019-12-09 23:18 ` [PATCH 1/4] io_uring: allow unbreakable links Jens Axboe
  2019-12-09 23:18 ` [PATCH 2/4] io-wq: remove worker->wait waitqueue Jens Axboe
@ 2019-12-09 23:18 ` Jens Axboe
  2019-12-09 23:18 ` [PATCH 4/4] io_uring: sqthread should grab ctx->uring_lock for submissions Jens Axboe
  3 siblings, 0 replies; 7+ messages in thread
From: Jens Axboe @ 2019-12-09 23:18 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

To avoid going to sleep only to get woken shortly thereafter, spin
briefly for new work upon completion of work.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c | 24 ++++++++++++++++++++++--
 fs/io-wq.h |  7 ++++---
 2 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 6b663ddceb42..90c4978781fb 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -491,26 +491,46 @@ static void io_worker_handle_work(struct io_worker *worker)
 	} while (1);
 }
 
+static inline void io_worker_spin_for_work(struct io_wqe *wqe)
+{
+	int i = 0;
+
+	while (++i < 1000) {
+		if (io_wqe_run_queue(wqe))
+			break;
+		if (need_resched())
+			break;
+		cpu_relax();
+	}
+}
+
 static int io_wqe_worker(void *data)
 {
 	struct io_worker *worker = data;
 	struct io_wqe *wqe = worker->wqe;
 	struct io_wq *wq = wqe->wq;
+	bool did_work;
 
 	io_worker_start(wqe, worker);
 
+	did_work = false;
 	while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
 		set_current_state(TASK_INTERRUPTIBLE);
+loop:
+		if (did_work)
+			io_worker_spin_for_work(wqe);
 		spin_lock_irq(&wqe->lock);
 		if (io_wqe_run_queue(wqe)) {
 			__set_current_state(TASK_RUNNING);
 			io_worker_handle_work(worker);
-			continue;
+			did_work = true;
+			goto loop;
 		}
+		did_work = false;
 		/* drops the lock on success, retry */
 		if (__io_worker_idle(wqe, worker)) {
 			__release(&wqe->lock);
-			continue;
+			goto loop;
 		}
 		spin_unlock_irq(&wqe->lock);
 		if (signal_pending(current))
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 7c333a28e2a7..fb993b2bd0ef 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -35,7 +35,8 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
 				    struct io_wq_work_list *list)
 {
 	if (!list->first) {
-		list->first = list->last = node;
+		list->last = node;
+		WRITE_ONCE(list->first, node);
 	} else {
 		list->last->next = node;
 		list->last = node;
@@ -47,7 +48,7 @@ static inline void wq_node_del(struct io_wq_work_list *list,
 			       struct io_wq_work_node *prev)
 {
 	if (node == list->first)
-		list->first = node->next;
+		WRITE_ONCE(list->first, node->next);
 	if (node == list->last)
 		list->last = prev;
 	if (prev)
@@ -58,7 +59,7 @@ static inline void wq_node_del(struct io_wq_work_list *list,
 #define wq_list_for_each(pos, prv, head)			\
 	for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
 
-#define wq_list_empty(list)	((list)->first == NULL)
+#define wq_list_empty(list)	(READ_ONCE((list)->first) == NULL)
 #define INIT_WQ_LIST(list)	do {				\
 	(list)->first = NULL;					\
 	(list)->last = NULL;					\
-- 
2.24.0


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 4/4] io_uring: sqthread should grab ctx->uring_lock for submissions
  2019-12-09 23:18 [PATCHSET 0/4] io_uring patches for 5.5-rc Jens Axboe
                   ` (2 preceding siblings ...)
  2019-12-09 23:18 ` [PATCH 3/4] io-wq: briefly spin for new work after finishing work Jens Axboe
@ 2019-12-09 23:18 ` Jens Axboe
  3 siblings, 0 replies; 7+ messages in thread
From: Jens Axboe @ 2019-12-09 23:18 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

We use the mutex to guard against registered file updates, for instance.
Ensure we're safe in accessing that state against concurrent updates.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 662404854571..33e15e925609 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2997,12 +2997,7 @@ static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt,
 		if (req->result == -EAGAIN)
 			return -EAGAIN;
 
-		/* workqueue context doesn't hold uring_lock, grab it now */
-		if (req->in_async)
-			mutex_lock(&ctx->uring_lock);
 		io_iopoll_req_issued(req);
-		if (req->in_async)
-			mutex_unlock(&ctx->uring_lock);
 	}
 
 	return 0;
@@ -3657,7 +3652,9 @@ static int io_sq_thread(void *data)
 		}
 
 		to_submit = min(to_submit, ctx->sq_entries);
+		mutex_lock(&ctx->uring_lock);
 		ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
+		mutex_unlock(&ctx->uring_lock);
 		if (ret > 0)
 			inflight += ret;
 	}
-- 
2.24.0


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 1/4] io_uring: allow unbreakable links
  2019-12-09 23:18 ` [PATCH 1/4] io_uring: allow unbreakable links Jens Axboe
@ 2019-12-10 10:13   ` Pavel Begunkov
  2019-12-10 15:13     ` Jens Axboe
  0 siblings, 1 reply; 7+ messages in thread
From: Pavel Begunkov @ 2019-12-10 10:13 UTC (permalink / raw)
  To: Jens Axboe, io-uring; +Cc: stable, 李通洲

On 12/10/2019 2:18 AM, Jens Axboe wrote:
> Some commands will invariably end in a failure in the sense that the
> completion result will be less than zero. One such example is timeouts
> that don't have a completion count set, they will always complete with
> -ETIME unless cancelled.
> 
> For linked commands, we sever links and fail the rest of the chain if
> the result is less than zero. Since we have commands where we know that
> will happen, add IOSQE_IO_HARDLINK as a stronger link that doesn't sever
> regardless of the completion result. Note that the link will still sever
> if we fail submitting the parent request, hard links are only resilient
> in the presence of completion results for requests that did submit
> correctly.
> 
> Cc: stable@vger.kernel.org # v5.4
> Reported-by: 李通洲 <carter.li@eoitek.com>
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  fs/io_uring.c                 | 54 +++++++++++++++++++++--------------
>  include/uapi/linux/io_uring.h |  1 +
>  2 files changed, 33 insertions(+), 22 deletions(-)
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 405be10da73d..662404854571 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -377,6 +377,7 @@ struct io_kiocb {
>  #define REQ_F_TIMEOUT_NOSEQ	8192	/* no timeout sequence */
>  #define REQ_F_INFLIGHT		16384	/* on inflight list */
>  #define REQ_F_COMP_LOCKED	32768	/* completion under lock */
> +#define REQ_F_HARDLINK		65536	/* doesn't sever on completion < 0 */
>  	u64			user_data;
>  	u32			result;
>  	u32			sequence;
> @@ -941,7 +942,7 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
>  
>  		list_del_init(&req->link_list);
>  		if (!list_empty(&nxt->link_list))
> -			nxt->flags |= REQ_F_LINK;
> +			nxt->flags |= req->flags & (REQ_F_LINK|REQ_F_HARDLINK);

I'm not sure we want to unconditionally propagate REQ_F_HARDLINK further.

E.g. timeout|REQ_F_HARDLINK -> read -> write
REQ_F_HARDLINK will be set to the following read and its fail won't
cancel the write, that seems strange. If users want such behaviour, they
can set REQ_F_HARDLINK when needed by hand.


>  		*nxtptr = nxt;
>  		break;
>  	}
> @@ -1292,6 +1293,11 @@ static void kiocb_end_write(struct io_kiocb *req)
>  	file_end_write(req->file);
>  }
>  
> +static inline bool req_fail_links(struct io_kiocb *req)
> +{
> +	return (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK;
> +}
> +

req_fail_links() sounds like it not only do checking, but actually fails
links. How about as follows?

+static inline void req_set_fail_links(struct io_kiocb *req)
+{
+	if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
+		req->flags |= REQ_F_FAIL_LINK;
+}

And it would be less code below


>  static void io_complete_rw_common(struct kiocb *kiocb, long res)
>  {
>  	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
> @@ -1299,7 +1305,7 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res)
>  	if (kiocb->ki_flags & IOCB_WRITE)
>  		kiocb_end_write(req);
>  
> -	if ((req->flags & REQ_F_LINK) && res != req->result)
> +	if (res != req->result && req_fail_links(req))
>  		req->flags |= REQ_F_FAIL_LINK;
>  	io_cqring_add_event(req, res);
>  }
> @@ -1330,7 +1336,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
>  	if (kiocb->ki_flags & IOCB_WRITE)
>  		kiocb_end_write(req);
>  
> -	if ((req->flags & REQ_F_LINK) && res != req->result)
> +	if (res != req->result && req_fail_links(req))
>  		req->flags |= REQ_F_FAIL_LINK;
>  	req->result = res;
>  	if (res != -EAGAIN)
> @@ -1956,7 +1962,7 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
>  				end > 0 ? end : LLONG_MAX,
>  				fsync_flags & IORING_FSYNC_DATASYNC);
>  
> -	if (ret < 0 && (req->flags & REQ_F_LINK))
> +	if (ret < 0 && req_fail_links(req))
>  		req->flags |= REQ_F_FAIL_LINK;
>  	io_cqring_add_event(req, ret);
>  	io_put_req_find_next(req, nxt);
> @@ -2003,7 +2009,7 @@ static int io_sync_file_range(struct io_kiocb *req,
>  
>  	ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags);
>  
> -	if (ret < 0 && (req->flags & REQ_F_LINK))
> +	if (ret < 0 && req_fail_links(req))
>  		req->flags |= REQ_F_FAIL_LINK;
>  	io_cqring_add_event(req, ret);
>  	io_put_req_find_next(req, nxt);
> @@ -2079,7 +2085,7 @@ static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
>  
>  out:
>  	io_cqring_add_event(req, ret);
> -	if (ret < 0 && (req->flags & REQ_F_LINK))
> +	if (ret < 0 && req_fail_links(req))
>  		req->flags |= REQ_F_FAIL_LINK;
>  	io_put_req_find_next(req, nxt);
>  	return 0;
> @@ -2161,7 +2167,7 @@ static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
>  
>  out:
>  	io_cqring_add_event(req, ret);
> -	if (ret < 0 && (req->flags & REQ_F_LINK))
> +	if (ret < 0 && req_fail_links(req))
>  		req->flags |= REQ_F_FAIL_LINK;
>  	io_put_req_find_next(req, nxt);
>  	return 0;
> @@ -2196,7 +2202,7 @@ static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe,
>  	}
>  	if (ret == -ERESTARTSYS)
>  		ret = -EINTR;
> -	if (ret < 0 && (req->flags & REQ_F_LINK))
> +	if (ret < 0 && req_fail_links(req))
>  		req->flags |= REQ_F_FAIL_LINK;
>  	io_cqring_add_event(req, ret);
>  	io_put_req_find_next(req, nxt);
> @@ -2263,7 +2269,7 @@ static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe,
>  	if (ret == -ERESTARTSYS)
>  		ret = -EINTR;
>  out:
> -	if (ret < 0 && (req->flags & REQ_F_LINK))
> +	if (ret < 0 && req_fail_links(req))
>  		req->flags |= REQ_F_FAIL_LINK;
>  	io_cqring_add_event(req, ret);
>  	io_put_req_find_next(req, nxt);
> @@ -2340,7 +2346,7 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>  	spin_unlock_irq(&ctx->completion_lock);
>  
>  	io_cqring_add_event(req, ret);
> -	if (ret < 0 && (req->flags & REQ_F_LINK))
> +	if (ret < 0 && req_fail_links(req))
>  		req->flags |= REQ_F_FAIL_LINK;
>  	io_put_req(req);
>  	return 0;
> @@ -2399,7 +2405,7 @@ static void io_poll_complete_work(struct io_wq_work **workptr)
>  
>  	io_cqring_ev_posted(ctx);
>  
> -	if (ret < 0 && req->flags & REQ_F_LINK)
> +	if (ret < 0 && req_fail_links(req))
>  		req->flags |= REQ_F_FAIL_LINK;
>  	io_put_req_find_next(req, &nxt);
>  	if (nxt)
> @@ -2582,7 +2588,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
>  	spin_unlock_irqrestore(&ctx->completion_lock, flags);
>  
>  	io_cqring_ev_posted(ctx);
> -	if (req->flags & REQ_F_LINK)
> +	if (req_fail_links(req))
>  		req->flags |= REQ_F_FAIL_LINK;>  	io_put_req(req);
>  	return HRTIMER_NORESTART;
> @@ -2608,7 +2614,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
>  	if (ret == -1)
>  		return -EALREADY;
>  
> -	if (req->flags & REQ_F_LINK)
> +	if (req_fail_links(req))
>  		req->flags |= REQ_F_FAIL_LINK;
>  	io_cqring_fill_event(req, -ECANCELED);
>  	io_put_req(req);
> @@ -2640,7 +2646,7 @@ static int io_timeout_remove(struct io_kiocb *req,
>  	io_commit_cqring(ctx);
>  	spin_unlock_irq(&ctx->completion_lock);
>  	io_cqring_ev_posted(ctx);
> -	if (ret < 0 && req->flags & REQ_F_LINK)
> +	if (ret < 0 && req_fail_links(req))
>  		req->flags |= REQ_F_FAIL_LINK;
>  	io_put_req(req);
>  	return 0;
> @@ -2822,7 +2828,7 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
>  	spin_unlock_irqrestore(&ctx->completion_lock, flags);
>  	io_cqring_ev_posted(ctx);
>  
> -	if (ret < 0 && (req->flags & REQ_F_LINK))
> +	if (ret < 0 && req_fail_links(req))
>  		req->flags |= REQ_F_FAIL_LINK;
>  	io_put_req_find_next(req, nxt);
>  }
> @@ -3044,7 +3050,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
>  	io_put_req(req);
>  
>  	if (ret) {
> -		if (req->flags & REQ_F_LINK)
> +		if (req_fail_links(req))
>  			req->flags |= REQ_F_FAIL_LINK;
>  		io_cqring_add_event(req, ret);
>  		io_put_req(req);
> @@ -3179,7 +3185,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
>  	spin_unlock_irqrestore(&ctx->completion_lock, flags);
>  
>  	if (prev) {
> -		if (prev->flags & REQ_F_LINK)
> +		if (req_fail_links(prev))
>  			prev->flags |= REQ_F_FAIL_LINK;
>  		io_async_find_and_cancel(ctx, req, prev->user_data, NULL,
>  						-ETIME);
> @@ -3273,7 +3279,7 @@ static void __io_queue_sqe(struct io_kiocb *req)
>  	/* and drop final reference, if we failed */
>  	if (ret) {
>  		io_cqring_add_event(req, ret);
> -		if (req->flags & REQ_F_LINK)
> +		if (req_fail_links(req))
>  			req->flags |= REQ_F_FAIL_LINK;
>  		io_put_req(req);
>  	}
> @@ -3293,7 +3299,7 @@ static void io_queue_sqe(struct io_kiocb *req)
>  	if (ret) {
>  		if (ret != -EIOCBQUEUED) {
>  			io_cqring_add_event(req, ret);
> -			if (req->flags & REQ_F_LINK)
> +			if (req_fail_links(req))
>  				req->flags |= REQ_F_FAIL_LINK;
>  			io_double_put_req(req);
>  		}
> @@ -3311,7 +3317,8 @@ static inline void io_queue_link_head(struct io_kiocb *req)
>  }
>  
>  
> -#define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)
> +#define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK|	\
> +				IOSQE_IO_HARDLINK)
>  
>  static bool io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
>  			  struct io_kiocb **link)
> @@ -3358,13 +3365,16 @@ static bool io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
>  		ret = io_req_defer_prep(req, io);
>  		if (ret) {
>  			kfree(io);
> +			/* fail even hard links since we don't submit */
>  			prev->flags |= REQ_F_FAIL_LINK;
>  			goto err_req;
>  		}
>  		trace_io_uring_link(ctx, req, prev);
>  		list_add_tail(&req->link_list, &prev->link_list);
> -	} else if (req->sqe->flags & IOSQE_IO_LINK) {
> +	} else if (req->sqe->flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) {
>  		req->flags |= REQ_F_LINK;
> +		if (req->sqe->flags & IOSQE_IO_HARDLINK)
> +			req->flags |= REQ_F_HARDLINK;
>  
>  		INIT_LIST_HEAD(&req->link_list);
>  		*link = req;
> @@ -3518,7 +3528,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
>  		 * If previous wasn't linked and we have a linked command,
>  		 * that's the end of the chain. Submit the previous link.
>  		 */
> -		if (!(sqe_flags & IOSQE_IO_LINK) && link) {
> +		if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) && link) {

IMHO, requests shouldn't have IOSQE_IO_HARDLINK without IOSQE_IO_LINK,
the same is in the code. Then, it's sufficient check only IOSQE_IO_LINK.

>  			io_queue_link_head(link);
>  			link = NULL;
>  		}
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index eabccb46edd1..f296a5e77661 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -48,6 +48,7 @@ struct io_uring_sqe {
>  #define IOSQE_FIXED_FILE	(1U << 0)	/* use fixed fileset */
>  #define IOSQE_IO_DRAIN		(1U << 1)	/* issue after inflight IO */
>  #define IOSQE_IO_LINK		(1U << 2)	/* links next sqe */
> +#define IOSQE_IO_HARDLINK	(1U << 3)	/* link LINK, but stronger */
>  
>  /*
>   * io_uring_setup() flags
> 

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 1/4] io_uring: allow unbreakable links
  2019-12-10 10:13   ` Pavel Begunkov
@ 2019-12-10 15:13     ` Jens Axboe
  0 siblings, 0 replies; 7+ messages in thread
From: Jens Axboe @ 2019-12-10 15:13 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring; +Cc: stable, 李通洲

On 12/10/19 3:13 AM, Pavel Begunkov wrote:
> On 12/10/2019 2:18 AM, Jens Axboe wrote:
>> Some commands will invariably end in a failure in the sense that the
>> completion result will be less than zero. One such example is timeouts
>> that don't have a completion count set, they will always complete with
>> -ETIME unless cancelled.
>>
>> For linked commands, we sever links and fail the rest of the chain if
>> the result is less than zero. Since we have commands where we know that
>> will happen, add IOSQE_IO_HARDLINK as a stronger link that doesn't sever
>> regardless of the completion result. Note that the link will still sever
>> if we fail submitting the parent request, hard links are only resilient
>> in the presence of completion results for requests that did submit
>> correctly.
>>
>> Cc: stable@vger.kernel.org # v5.4
>> Reported-by: 李通洲 <carter.li@eoitek.com>
>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>> ---
>>  fs/io_uring.c                 | 54 +++++++++++++++++++++--------------
>>  include/uapi/linux/io_uring.h |  1 +
>>  2 files changed, 33 insertions(+), 22 deletions(-)
>>
>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>> index 405be10da73d..662404854571 100644
>> --- a/fs/io_uring.c
>> +++ b/fs/io_uring.c
>> @@ -377,6 +377,7 @@ struct io_kiocb {
>>  #define REQ_F_TIMEOUT_NOSEQ	8192	/* no timeout sequence */
>>  #define REQ_F_INFLIGHT		16384	/* on inflight list */
>>  #define REQ_F_COMP_LOCKED	32768	/* completion under lock */
>> +#define REQ_F_HARDLINK		65536	/* doesn't sever on completion < 0 */
>>  	u64			user_data;
>>  	u32			result;
>>  	u32			sequence;
>> @@ -941,7 +942,7 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
>>  
>>  		list_del_init(&req->link_list);
>>  		if (!list_empty(&nxt->link_list))
>> -			nxt->flags |= REQ_F_LINK;
>> +			nxt->flags |= req->flags & (REQ_F_LINK|REQ_F_HARDLINK);
> 
> I'm not sure we want to unconditionally propagate REQ_F_HARDLINK further.
> 
> E.g. timeout|REQ_F_HARDLINK -> read -> write
> REQ_F_HARDLINK will be set to the following read and its fail won't
> cancel the write, that seems strange. If users want such behaviour, they
> can set REQ_F_HARDLINK when needed by hand.

Yeah you're right, how about if we set REQ_F_HARDLINK in io_submit_sqe()
if IOSQE_IO_HARDLINK is set, and just keep the above as it is. We need
to ensure we're correctly keeping hard link if it's set in the sqe, not
carry over like the above if it wasn't set.

>>  		*nxtptr = nxt;
>>  		break;
>>  	}
>> @@ -1292,6 +1293,11 @@ static void kiocb_end_write(struct io_kiocb *req)
>>  	file_end_write(req->file);
>>  }
>>  
>> +static inline bool req_fail_links(struct io_kiocb *req)
>> +{
>> +	return (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK;
>> +}
>> +
> 
> req_fail_links() sounds like it not only do checking, but actually fails
> links. How about as follows?
> 
> +static inline void req_set_fail_links(struct io_kiocb *req)
> +{
> +	if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
> +		req->flags |= REQ_F_FAIL_LINK;
> +}
> 
> And it would be less code below

That's cleaner, I'll make the change.

>> @@ -3518,7 +3528,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
>>  		 * If previous wasn't linked and we have a linked command,
>>  		 * that's the end of the chain. Submit the previous link.
>>  		 */
>> -		if (!(sqe_flags & IOSQE_IO_LINK) && link) {
>> +		if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) && link) {
> 
> IMHO, requests shouldn't have IOSQE_IO_HARDLINK without IOSQE_IO_LINK,
> the same is in the code. Then, it's sufficient check only IOSQE_IO_LINK.

IOSQE_IO_HARDLINK implies link behavior, it's really just a modifier. So
yes, I agree, and that change should then also be removable.

I'll post a v2, thanks for your review.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, back to index

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-12-09 23:18 [PATCHSET 0/4] io_uring patches for 5.5-rc Jens Axboe
2019-12-09 23:18 ` [PATCH 1/4] io_uring: allow unbreakable links Jens Axboe
2019-12-10 10:13   ` Pavel Begunkov
2019-12-10 15:13     ` Jens Axboe
2019-12-09 23:18 ` [PATCH 2/4] io-wq: remove worker->wait waitqueue Jens Axboe
2019-12-09 23:18 ` [PATCH 3/4] io-wq: briefly spin for new work after finishing work Jens Axboe
2019-12-09 23:18 ` [PATCH 4/4] io_uring: sqthread should grab ctx->uring_lock for submissions Jens Axboe

IO-Uring Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/io-uring/0 io-uring/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 io-uring io-uring/ https://lore.kernel.org/io-uring \
		io-uring@vger.kernel.org
	public-inbox-index io-uring

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.io-uring


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git