All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/4] timeout and sequence fixes
@ 2020-04-14 21:39 Pavel Begunkov
  2020-04-14 21:39 ` [PATCH 1/4] io_uring: fix cached_sq_head in io_timeout() Pavel Begunkov
                   ` (4 more replies)
  0 siblings, 5 replies; 6+ messages in thread
From: Pavel Begunkov @ 2020-04-14 21:39 UTC (permalink / raw)
  To: Jens Axboe, io-uring, linux-kernel

[4/4] is dirty, but fixes the issue. And there is still "SQ vs CQ"
problem, solving which can effectively revert it, so I suggest to
postpone the last patch for a while. I'll rebase if it'd be necessary.

Pavel Begunkov (4):
  io_uring: fix cached_sq_head in io_timeout()
  io_uring: kill already cached timeout.seq_offset
  io_uring: don't count rqs failed after current one
  io_uring: fix timeout's seq catching old requests

 fs/io_uring.c | 43 ++++++++++++++++++++++++-------------------
 1 file changed, 24 insertions(+), 19 deletions(-)

-- 
2.24.0


^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH 1/4] io_uring: fix cached_sq_head in io_timeout()
  2020-04-14 21:39 [PATCH 0/4] timeout and sequence fixes Pavel Begunkov
@ 2020-04-14 21:39 ` Pavel Begunkov
  2020-04-14 21:39 ` [PATCH 2/4] io_uring: kill already cached timeout.seq_offset Pavel Begunkov
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: Pavel Begunkov @ 2020-04-14 21:39 UTC (permalink / raw)
  To: Jens Axboe, io-uring, linux-kernel

io_timeout() can be executed asynchronously by a worker and without
holding ctx->uring_lock

1. using ctx->cached_sq_head there is racy there
2. it should count events from a moment of timeout's submission, but
not execution

Use req->sequence.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index c0cf57764329..fcc320d67606 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4673,6 +4673,7 @@ static int io_timeout(struct io_kiocb *req)
 	struct io_timeout_data *data;
 	struct list_head *entry;
 	unsigned span = 0;
+	u32 seq = req->sequence;
 
 	data = &req->io->timeout;
 
@@ -4689,7 +4690,7 @@ static int io_timeout(struct io_kiocb *req)
 		goto add;
 	}
 
-	req->sequence = ctx->cached_sq_head + count - 1;
+	req->sequence = seq + count;
 	data->seq_offset = count;
 
 	/*
@@ -4699,7 +4700,7 @@ static int io_timeout(struct io_kiocb *req)
 	spin_lock_irq(&ctx->completion_lock);
 	list_for_each_prev(entry, &ctx->timeout_list) {
 		struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
-		unsigned nxt_sq_head;
+		unsigned nxt_seq;
 		long long tmp, tmp_nxt;
 		u32 nxt_offset = nxt->io->timeout.seq_offset;
 
@@ -4707,18 +4708,18 @@ static int io_timeout(struct io_kiocb *req)
 			continue;
 
 		/*
-		 * Since cached_sq_head + count - 1 can overflow, use type long
+		 * Since seq + count can overflow, use type long
 		 * long to store it.
 		 */
-		tmp = (long long)ctx->cached_sq_head + count - 1;
-		nxt_sq_head = nxt->sequence - nxt_offset + 1;
-		tmp_nxt = (long long)nxt_sq_head + nxt_offset - 1;
+		tmp = (long long)seq + count;
+		nxt_seq = nxt->sequence - nxt_offset;
+		tmp_nxt = (long long)nxt_seq + nxt_offset;
 
 		/*
 		 * cached_sq_head may overflow, and it will never overflow twice
 		 * once there is some timeout req still be valid.
 		 */
-		if (ctx->cached_sq_head < nxt_sq_head)
+		if (seq < nxt_seq)
 			tmp += UINT_MAX;
 
 		if (tmp > tmp_nxt)
-- 
2.24.0


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 2/4] io_uring: kill already cached timeout.seq_offset
  2020-04-14 21:39 [PATCH 0/4] timeout and sequence fixes Pavel Begunkov
  2020-04-14 21:39 ` [PATCH 1/4] io_uring: fix cached_sq_head in io_timeout() Pavel Begunkov
@ 2020-04-14 21:39 ` Pavel Begunkov
  2020-04-14 21:39 ` [PATCH 3/4] io_uring: don't count rqs failed after current one Pavel Begunkov
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: Pavel Begunkov @ 2020-04-14 21:39 UTC (permalink / raw)
  To: Jens Axboe, io-uring, linux-kernel

req->timeout.count and req->io->timeout.seq_offset store the same value,
which is sqe->off. Kill the second one

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index fcc320d67606..e69cb70c11d4 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -357,7 +357,6 @@ struct io_timeout_data {
 	struct hrtimer			timer;
 	struct timespec64		ts;
 	enum hrtimer_mode		mode;
-	u32				seq_offset;
 };
 
 struct io_accept {
@@ -385,7 +384,7 @@ struct io_timeout {
 	struct file			*file;
 	u64				addr;
 	int				flags;
-	unsigned			count;
+	u32				count;
 };
 
 struct io_rw {
@@ -4668,11 +4667,11 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 
 static int io_timeout(struct io_kiocb *req)
 {
-	unsigned count;
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_timeout_data *data;
 	struct list_head *entry;
 	unsigned span = 0;
+	u32 count = req->timeout.count;
 	u32 seq = req->sequence;
 
 	data = &req->io->timeout;
@@ -4682,7 +4681,6 @@ static int io_timeout(struct io_kiocb *req)
 	 * timeout event to be satisfied. If it isn't set, then this is
 	 * a pure timeout request, sequence isn't used.
 	 */
-	count = req->timeout.count;
 	if (!count) {
 		req->flags |= REQ_F_TIMEOUT_NOSEQ;
 		spin_lock_irq(&ctx->completion_lock);
@@ -4691,7 +4689,6 @@ static int io_timeout(struct io_kiocb *req)
 	}
 
 	req->sequence = seq + count;
-	data->seq_offset = count;
 
 	/*
 	 * Insertion sort, ensuring the first entry in the list is always
@@ -4702,7 +4699,7 @@ static int io_timeout(struct io_kiocb *req)
 		struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
 		unsigned nxt_seq;
 		long long tmp, tmp_nxt;
-		u32 nxt_offset = nxt->io->timeout.seq_offset;
+		u32 nxt_offset = nxt->timeout.count;
 
 		if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
 			continue;
-- 
2.24.0


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 3/4] io_uring: don't count rqs failed after current one
  2020-04-14 21:39 [PATCH 0/4] timeout and sequence fixes Pavel Begunkov
  2020-04-14 21:39 ` [PATCH 1/4] io_uring: fix cached_sq_head in io_timeout() Pavel Begunkov
  2020-04-14 21:39 ` [PATCH 2/4] io_uring: kill already cached timeout.seq_offset Pavel Begunkov
@ 2020-04-14 21:39 ` Pavel Begunkov
  2020-04-14 21:39 ` [PATCH 4/4] io_uring: fix timeout's seq catching old requests Pavel Begunkov
  2020-04-15  1:29 ` [PATCH 0/4] timeout and sequence fixes Jens Axboe
  4 siblings, 0 replies; 6+ messages in thread
From: Pavel Begunkov @ 2020-04-14 21:39 UTC (permalink / raw)
  To: Jens Axboe, io-uring, linux-kernel

When checking for draining with __req_need_defer(), it tries to match
how many requests were sent before a current one with number of already
completed. Dropped SQEs are included in req->sequence, and they won't
ever appear in CQ. To compensate for that, __req_need_defer() substracts
ctx->cached_sq_dropped.
However, what it should really use is number of SQEs dropped __before__
the current one. In other words, any submitted request shouldn't
shouldn't affect dequeueing from the drain queue of previously submitted
ones.

Instead of saving proper ctx->cached_sq_dropped in each request,
substract from req->sequence it at initialisation, so it includes number
of properly submitted requests.

note: it also changes behaviour of timeouts, but
1. it's already diverge from the description because of using SQ
2. the description is ambiguous regarding dropped SQEs

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index e69cb70c11d4..8ee7b4f72b8f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -957,8 +957,8 @@ static inline bool __req_need_defer(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 
-	return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped
-					+ atomic_read(&ctx->cached_cq_overflow);
+	return req->sequence != ctx->cached_cq_tail
+				+ atomic_read(&ctx->cached_cq_overflow);
 }
 
 static inline bool req_need_defer(struct io_kiocb *req)
@@ -5760,7 +5760,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	 * it can be used to mark the position of the first IO in the
 	 * link list.
 	 */
-	req->sequence = ctx->cached_sq_head;
+	req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
 	req->opcode = READ_ONCE(sqe->opcode);
 	req->user_data = READ_ONCE(sqe->user_data);
 	req->io = NULL;
-- 
2.24.0


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 4/4] io_uring: fix timeout's seq catching old requests
  2020-04-14 21:39 [PATCH 0/4] timeout and sequence fixes Pavel Begunkov
                   ` (2 preceding siblings ...)
  2020-04-14 21:39 ` [PATCH 3/4] io_uring: don't count rqs failed after current one Pavel Begunkov
@ 2020-04-14 21:39 ` Pavel Begunkov
  2020-04-15  1:29 ` [PATCH 0/4] timeout and sequence fixes Jens Axboe
  4 siblings, 0 replies; 6+ messages in thread
From: Pavel Begunkov @ 2020-04-14 21:39 UTC (permalink / raw)
  To: Jens Axboe, io-uring, linux-kernel; +Cc: Hrvoje Zeba

As sequence logic use u32 for both req->sequence and sqe->off,
req->sequence + sqe->off can alias to a previously submtitted and
still inflight request, so triggering a timeout on it instead of waiting
for completion of requests submitted after the timeout.

Use u64 for sequences leaving sqe->off to be u32, so the issue will
happen only when there are more than 2^64 - 2^32 inflight requests, that
just won't fit in memory.

note 1: can't modify __req_need_defer() as well, because it's used
without synchronisation for draining, and reading ctx->cached_cq_tail
in 32bit arch won't be atomic.

note 2: io_timeout() overflow magic is left in u32.

Reported-by: Hrvoje Zeba <zeba.hrvoje@gmail.com>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 8ee7b4f72b8f..1961562edf77 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -251,6 +251,7 @@ struct io_ring_ctx {
 
 		wait_queue_head_t	inflight_wait;
 		struct io_uring_sqe	*sq_sqes;
+		u64			sq_submitted;
 	} ____cacheline_aligned_in_smp;
 
 	struct io_rings	*rings;
@@ -302,6 +303,7 @@ struct io_ring_ctx {
 		struct wait_queue_head	cq_wait;
 		struct fasync_struct	*cq_fasync;
 		struct eventfd_ctx	*cq_ev_fd;
+		u64			cq_total;
 	} ____cacheline_aligned_in_smp;
 
 	struct {
@@ -624,7 +626,7 @@ struct io_kiocb {
 	unsigned long		fsize;
 	u64			user_data;
 	u32			result;
-	u32			sequence;
+	u64			sequence;
 
 	struct list_head	link_list;
 
@@ -957,8 +959,8 @@ static inline bool __req_need_defer(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 
-	return req->sequence != ctx->cached_cq_tail
-				+ atomic_read(&ctx->cached_cq_overflow);
+	return (u32)req->sequence != ctx->cached_cq_tail
+			+ atomic_read(&ctx->cached_cq_overflow);
 }
 
 static inline bool req_need_defer(struct io_kiocb *req)
@@ -990,7 +992,7 @@ static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx)
 	if (req) {
 		if (req->flags & REQ_F_TIMEOUT_NOSEQ)
 			return NULL;
-		if (!__req_need_defer(req)) {
+		if (req->sequence == ctx->cq_total) {
 			list_del_init(&req->list);
 			return req;
 		}
@@ -1141,6 +1143,7 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
 	if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)
 		return NULL;
 
+	ctx->cq_total++;
 	ctx->cached_cq_tail++;
 	return &rings->cqes[tail & ctx->cq_mask];
 }
@@ -1204,6 +1207,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 		} else {
 			WRITE_ONCE(ctx->rings->cq_overflow,
 				atomic_inc_return(&ctx->cached_cq_overflow));
+			ctx->cq_total++;
 		}
 	}
 
@@ -1244,6 +1248,7 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
 	} else if (ctx->cq_overflow_flushed) {
 		WRITE_ONCE(ctx->rings->cq_overflow,
 				atomic_inc_return(&ctx->cached_cq_overflow));
+		ctx->cq_total++;
 	} else {
 		if (list_empty(&ctx->cq_overflow_list)) {
 			set_bit(0, &ctx->sq_check_overflow);
@@ -4672,7 +4677,7 @@ static int io_timeout(struct io_kiocb *req)
 	struct list_head *entry;
 	unsigned span = 0;
 	u32 count = req->timeout.count;
-	u32 seq = req->sequence;
+	u32 seq = (u32)req->sequence;
 
 	data = &req->io->timeout;
 
@@ -5730,8 +5735,10 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
 	 *    though the application is the one updating it.
 	 */
 	head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]);
-	if (likely(head < ctx->sq_entries))
+	if (likely(head < ctx->sq_entries)) {
+		ctx->sq_submitted++;
 		return &ctx->sq_sqes[head];
+	}
 
 	/* drop invalid entries */
 	ctx->cached_sq_dropped++;
@@ -5760,7 +5767,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	 * it can be used to mark the position of the first IO in the
 	 * link list.
 	 */
-	req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
+	req->sequence = ctx->sq_submitted - 1;
 	req->opcode = READ_ONCE(sqe->opcode);
 	req->user_data = READ_ONCE(sqe->user_data);
 	req->io = NULL;
-- 
2.24.0


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH 0/4] timeout and sequence fixes
  2020-04-14 21:39 [PATCH 0/4] timeout and sequence fixes Pavel Begunkov
                   ` (3 preceding siblings ...)
  2020-04-14 21:39 ` [PATCH 4/4] io_uring: fix timeout's seq catching old requests Pavel Begunkov
@ 2020-04-15  1:29 ` Jens Axboe
  4 siblings, 0 replies; 6+ messages in thread
From: Jens Axboe @ 2020-04-15  1:29 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring, linux-kernel

On 4/14/20 3:39 PM, Pavel Begunkov wrote:
> [4/4] is dirty, but fixes the issue. And there is still "SQ vs CQ"
> problem, solving which can effectively revert it, so I suggest to
> postpone the last patch for a while. I'll rebase if it'd be necessary.
> 

Thanks, I've applied 1-3 for now. Looks good to me, and also tests out
fine.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2020-04-15  1:30 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-04-14 21:39 [PATCH 0/4] timeout and sequence fixes Pavel Begunkov
2020-04-14 21:39 ` [PATCH 1/4] io_uring: fix cached_sq_head in io_timeout() Pavel Begunkov
2020-04-14 21:39 ` [PATCH 2/4] io_uring: kill already cached timeout.seq_offset Pavel Begunkov
2020-04-14 21:39 ` [PATCH 3/4] io_uring: don't count rqs failed after current one Pavel Begunkov
2020-04-14 21:39 ` [PATCH 4/4] io_uring: fix timeout's seq catching old requests Pavel Begunkov
2020-04-15  1:29 ` [PATCH 0/4] timeout and sequence fixes Jens Axboe

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.