* [PATCH v2 0/2] io_uring: fix skipping of old timeout events
@ 2020-12-19 19:15 Marcelo Diop-Gonzalez
  2020-12-19 19:15 ` [PATCH v2 1/2] io_uring: only increment ->cq_timeouts along with ->cached_cq_tail Marcelo Diop-Gonzalez
  2020-12-19 19:15 ` [PATCH v2 2/2] io_uring: flush timeouts that should already have expired Marcelo Diop-Gonzalez
  0 siblings, 2 replies; 16+ messages in thread
From: Marcelo Diop-Gonzalez @ 2020-12-19 19:15 UTC (permalink / raw)
  To: axboe, asml.silence; +Cc: io-uring, Marcelo Diop-Gonzalez

These two patches try to fix a problem with IORING_OP_TIMEOUT events
not being flushed if they should already have expired. The test below
hangs before this change (unless you run with $ ./a.out ~/somefile 1):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <liburing.h>

int main(int argc, char **argv) {
	if (argc < 2)
		return 1;

	int fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	struct io_uring ring;
	if (io_uring_queue_init(4, &ring, 0) < 0) {
		fprintf(stderr, "queue_init failed\n");
		return 1;
	}

	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

	/* timeout that should fire after 1 completion event (or after ~115 days) */
	struct __kernel_timespec ts = { .tv_sec = 9999999 };
	io_uring_prep_timeout(sqe, &ts, 1, 0);
	sqe->user_data = 123;
	int ret = io_uring_submit(&ring);
	if (ret < 0) {
		fprintf(stderr, "submit(timeout_sqe): %d\n", ret);
		return 1;
	}

	int n = 2;
	if (argc > 2)
		n = atoi(argv[2]);

	char buf;
	/* queue n reads, each of which will post a completion event */
	for (int i = 0; i < n; i++) {
		sqe = io_uring_get_sqe(&ring);
		if (!sqe) {
			fprintf(stderr, "too many\n");
			exit(1);
		}
		io_uring_prep_read(sqe, fd, &buf, 1, 0);
	}
	ret = io_uring_submit(&ring);
	if (ret < 0) {
		fprintf(stderr, "submit(read_sqe): %d\n", ret);
		exit(1);
	}

	struct io_uring_cqe *cqe;
	for (int i = 0; i < n+1; i++) {
		ret = io_uring_wait_cqe(&ring, &cqe);
		if (ret < 0) {
			fprintf(stderr, "wait_cqe(): %d\n", ret);
			return 1;
		}
		if (cqe->user_data == 123)
			printf("timeout found\n");
		io_uring_cqe_seen(&ring, cqe);
	}
}
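
For reference, something along these lines should build and run the
reproducer, assuming liburing and its headers are installed and the program
is saved as test.c (both names here are just placeholders):

$ gcc -O2 -o a.out test.c -luring
$ ./a.out ~/somefile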

v2: Properly handle u32 overflow issues

Marcelo Diop-Gonzalez (2):
  io_uring: only increment ->cq_timeouts along with ->cached_cq_tail
  io_uring: flush timeouts that should already have expired

 fs/io_uring.c | 44 ++++++++++++++++++++++++++++++--------------
 1 file changed, 30 insertions(+), 14 deletions(-)

-- 
2.20.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v2 1/2] io_uring: only increment ->cq_timeouts along with ->cached_cq_tail
  2020-12-19 19:15 [PATCH v2 0/2] io_uring: fix skipping of old timeout events Marcelo Diop-Gonzalez
@ 2020-12-19 19:15 ` Marcelo Diop-Gonzalez
  2021-01-02 20:03   ` Pavel Begunkov
  2020-12-19 19:15 ` [PATCH v2 2/2] io_uring: flush timeouts that should already have expired Marcelo Diop-Gonzalez
  1 sibling, 1 reply; 16+ messages in thread
From: Marcelo Diop-Gonzalez @ 2020-12-19 19:15 UTC (permalink / raw)
  To: axboe, asml.silence; +Cc: io-uring, Marcelo Diop-Gonzalez

The quantity ->cached_cq_tail - ->cq_timeouts is used to tell how many
non-timeout events have happened, but this subtraction could overflow
if ->cq_timeouts is incremented more times than ->cached_cq_tail.
This may be unlikely, but it can currently happen if a timeout event
overflows the cqring, since in that case io_get_cqring() doesn't
increment ->cached_cq_tail while ->cq_timeouts is still incremented by
the caller. Fix it by incrementing ->cq_timeouts inside io_get_cqring().

Signed-off-by: Marcelo Diop-Gonzalez <marcelo827@gmail.com>
---
 fs/io_uring.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index f3690dfdd564..f394bf358022 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1582,8 +1582,6 @@ static void io_kill_timeout(struct io_kiocb *req)
 
 	ret = hrtimer_try_to_cancel(&io->timer);
 	if (ret != -1) {
-		atomic_set(&req->ctx->cq_timeouts,
-			atomic_read(&req->ctx->cq_timeouts) + 1);
 		list_del_init(&req->timeout.list);
 		io_cqring_fill_event(req, 0);
 		io_put_req_deferred(req, 1);
@@ -1664,7 +1662,7 @@ static inline bool io_sqring_full(struct io_ring_ctx *ctx)
 	return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == r->sq_ring_entries;
 }
 
-static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
+static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx, u8 opcode)
 {
 	struct io_rings *rings = ctx->rings;
 	unsigned tail;
@@ -1679,6 +1677,10 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
 		return NULL;
 
 	ctx->cached_cq_tail++;
+	if (opcode == IORING_OP_TIMEOUT)
+		atomic_set(&ctx->cq_timeouts,
+			   atomic_read(&ctx->cq_timeouts) + 1);
+
 	return &rings->cqes[tail & ctx->cq_mask];
 }
 
@@ -1728,7 +1730,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
 		if (!io_match_task(req, tsk, files))
 			continue;
 
-		cqe = io_get_cqring(ctx);
+		cqe = io_get_cqring(ctx, req->opcode);
 		if (!cqe && !force)
 			break;
 
@@ -1776,7 +1778,7 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
 	 * submission (by quite a lot). Increment the overflow count in
 	 * the ring.
 	 */
-	cqe = io_get_cqring(ctx);
+	cqe = io_get_cqring(ctx, req->opcode);
 	if (likely(cqe)) {
 		WRITE_ONCE(cqe->user_data, req->user_data);
 		WRITE_ONCE(cqe->res, res);
@@ -5618,8 +5620,6 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
 
 	spin_lock_irqsave(&ctx->completion_lock, flags);
 	list_del_init(&req->timeout.list);
-	atomic_set(&req->ctx->cq_timeouts,
-		atomic_read(&req->ctx->cq_timeouts) + 1);
 
 	io_cqring_fill_event(req, -ETIME);
 	io_commit_cqring(ctx);
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v2 2/2] io_uring: flush timeouts that should already have expired
  2020-12-19 19:15 [PATCH v2 0/2] io_uring: fix skipping of old timeout events Marcelo Diop-Gonzalez
  2020-12-19 19:15 ` [PATCH v2 1/2] io_uring: only increment ->cq_timeouts along with ->cached_cq_tail Marcelo Diop-Gonzalez
@ 2020-12-19 19:15 ` Marcelo Diop-Gonzalez
  2021-01-02 19:54   ` Pavel Begunkov
  1 sibling, 1 reply; 16+ messages in thread
From: Marcelo Diop-Gonzalez @ 2020-12-19 19:15 UTC (permalink / raw)
  To: axboe, asml.silence; +Cc: io-uring, Marcelo Diop-Gonzalez

Right now io_flush_timeouts() checks if the current number of events
is equal to ->timeout.target_seq, but this will miss some timeouts if
more than one event has been added since the last time they were
flushed (possible in io_submit_flush_completions(), for example). Fix
it by recording the starting value of ->cached_cq_tail -
->cq_timeouts instead of the target value, so that we can safely
(without overflow problems) compare the number of events that have
happened with the number of events needed to trigger the timeout.

Signed-off-by: Marcelo Diop-Gonzalez <marcelo827@gmail.com>
---
 fs/io_uring.c | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index f394bf358022..f62de0cb5fc4 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -444,7 +444,7 @@ struct io_cancel {
 struct io_timeout {
 	struct file			*file;
 	u32				off;
-	u32				target_seq;
+	u32				start_seq;
 	struct list_head		list;
 	/* head of the link, used by linked timeouts only */
 	struct io_kiocb			*head;
@@ -1629,6 +1629,24 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx)
 	} while (!list_empty(&ctx->defer_list));
 }
 
+static inline u32 io_timeout_events_left(struct io_kiocb *req)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	u32 events;
+
+	/*
+	 * events -= req->timeout.start_seq and the comparison between
+	 * ->timeout.off and events will not overflow because each time
+	 * ->cq_timeouts is incremented, ->cached_cq_tail is incremented too.
+	 */
+
+	events = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
+	events -= req->timeout.start_seq;
+	if (req->timeout.off > events)
+		return req->timeout.off - events;
+	return 0;
+}
+
 static void io_flush_timeouts(struct io_ring_ctx *ctx)
 {
 	while (!list_empty(&ctx->timeout_list)) {
@@ -1637,8 +1655,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx)
 
 		if (io_is_timeout_noseq(req))
 			break;
-		if (req->timeout.target_seq != ctx->cached_cq_tail
-					- atomic_read(&ctx->cq_timeouts))
+		if (io_timeout_events_left(req) > 0)
 			break;
 
 		list_del_init(&req->timeout.list);
@@ -5785,7 +5802,6 @@ static int io_timeout(struct io_kiocb *req)
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_timeout_data *data = req->async_data;
 	struct list_head *entry;
-	u32 tail, off = req->timeout.off;
 
 	spin_lock_irq(&ctx->completion_lock);
 
@@ -5799,8 +5815,8 @@ static int io_timeout(struct io_kiocb *req)
 		goto add;
 	}
 
-	tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
-	req->timeout.target_seq = tail + off;
+	req->timeout.start_seq = ctx->cached_cq_tail -
+		atomic_read(&ctx->cq_timeouts);
 
 	/*
 	 * Insertion sort, ensuring the first entry in the list is always
@@ -5813,7 +5829,7 @@ static int io_timeout(struct io_kiocb *req)
 		if (io_is_timeout_noseq(nxt))
 			continue;
 		/* nxt.seq is behind @tail, otherwise would've been completed */
-		if (off >= nxt->timeout.target_seq - tail)
+		if (req->timeout.off >= io_timeout_events_left(nxt))
 			break;
 	}
 add:
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 2/2] io_uring: flush timeouts that should already have expired
  2020-12-19 19:15 ` [PATCH v2 2/2] io_uring: flush timeouts that should already have expired Marcelo Diop-Gonzalez
@ 2021-01-02 19:54   ` Pavel Begunkov
  2021-01-02 20:26     ` Pavel Begunkov
  2021-01-04 17:56     ` Marcelo Diop-Gonzalez
  0 siblings, 2 replies; 16+ messages in thread
From: Pavel Begunkov @ 2021-01-02 19:54 UTC (permalink / raw)
  To: Marcelo Diop-Gonzalez, axboe; +Cc: io-uring

On 19/12/2020 19:15, Marcelo Diop-Gonzalez wrote:
> Right now io_flush_timeouts() checks if the current number of events
> is equal to ->timeout.target_seq, but this will miss some timeouts if
> there have been more than 1 event added since the last time they were
> flushed (possible in io_submit_flush_completions(), for example). Fix
> it by recording the starting value of ->cached_cq_overflow -
> ->cq_timeouts instead of the target value, so that we can safely
> (without overflow problems) compare the number of events that have
> happened with the number of events needed to trigger the timeout.
> 
> Signed-off-by: Marcelo Diop-Gonzalez <marcelo827@gmail.com>
> ---
>  fs/io_uring.c | 30 +++++++++++++++++++++++-------
>  1 file changed, 23 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index f394bf358022..f62de0cb5fc4 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -444,7 +444,7 @@ struct io_cancel {
>  struct io_timeout {
>  	struct file			*file;
>  	u32				off;
> -	u32				target_seq;
> +	u32				start_seq;
>  	struct list_head		list;
>  	/* head of the link, used by linked timeouts only */
>  	struct io_kiocb			*head;
> @@ -1629,6 +1629,24 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx)
>  	} while (!list_empty(&ctx->defer_list));
>  }
>  
> +static inline u32 io_timeout_events_left(struct io_kiocb *req)
> +{
> +	struct io_ring_ctx *ctx = req->ctx;
> +	u32 events;
> +
> +	/*
> +	 * events -= req->timeout.start_seq and the comparison between
> +	 * ->timeout.off and events will not overflow because each time
> +	 * ->cq_timeouts is incremented, ->cached_cq_tail is incremented too.
> +	 */
> +
> +	events = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
> +	events -= req->timeout.start_seq;

It looks to me that @events, before the start_seq subtraction, could already
have wrapped around past start_seq.

e.g.
1) you submit a timeout with off=0xff...ff (start_seq=0 for convenience)

2) some time has passed, let @events = 0xff..ff - 1
so the timeout still waits

3) we commit 5 requests at once and call io_commit_cqring() only once for
them, so we get @events == 0xff..ff - 1 + 5, i.e. 3

@events == 3 < off == 0xff...ff,
so we didn't trigger our timeout even though we should have
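
For illustration, a minimal userspace sketch of the same u32 arithmetic, with
made-up values mirroring the steps above (this is not kernel code, just the
wraparound math that io_timeout_events_left() would end up doing):

#include <stdint.h>
#include <stdio.h>

int main(void) {
	uint32_t start_seq = 0, off = 0xffffffffu;	/* step 1 */
	uint32_t events = 0xffffffffu - 1;		/* step 2: still waiting */

	events += 5;		/* step 3: five completions committed at once */
	events -= start_seq;	/* mirrors the subtraction in the patch */

	printf("events=%u off=%u\n", events, off);
	/* prints events=3, so off > events and the timeout is never flushed */
	return 0;
}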

> +	if (req->timeout.off > events)
> +		return req->timeout.off - events;
> +	return 0;
> +}
> +
>  static void io_flush_timeouts(struct io_ring_ctx *ctx)
>  {
>  	while (!list_empty(&ctx->timeout_list)) {
> @@ -1637,8 +1655,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx)
>  
>  		if (io_is_timeout_noseq(req))
>  			break;
> -		if (req->timeout.target_seq != ctx->cached_cq_tail
> -					- atomic_read(&ctx->cq_timeouts))
> +		if (io_timeout_events_left(req) > 0)
>  			break;
>  
>  		list_del_init(&req->timeout.list);
> @@ -5785,7 +5802,6 @@ static int io_timeout(struct io_kiocb *req)
>  	struct io_ring_ctx *ctx = req->ctx;
>  	struct io_timeout_data *data = req->async_data;
>  	struct list_head *entry;
> -	u32 tail, off = req->timeout.off;
>  
>  	spin_lock_irq(&ctx->completion_lock);
>  
> @@ -5799,8 +5815,8 @@ static int io_timeout(struct io_kiocb *req)
>  		goto add;
>  	}
>  
> -	tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
> -	req->timeout.target_seq = tail + off;
> +	req->timeout.start_seq = ctx->cached_cq_tail -
> +		atomic_read(&ctx->cq_timeouts);
>  
>  	/*
>  	 * Insertion sort, ensuring the first entry in the list is always
> @@ -5813,7 +5829,7 @@ static int io_timeout(struct io_kiocb *req)
>  		if (io_is_timeout_noseq(nxt))
>  			continue;
>  		/* nxt.seq is behind @tail, otherwise would've been completed */
> -		if (off >= nxt->timeout.target_seq - tail)
> +		if (req->timeout.off >= io_timeout_events_left(nxt))
>  			break;
>  	}
>  add:
> 

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 1/2] io_uring: only increment ->cq_timeouts along with ->cached_cq_tail
  2020-12-19 19:15 ` [PATCH v2 1/2] io_uring: only increment ->cq_timeouts along with ->cached_cq_tail Marcelo Diop-Gonzalez
@ 2021-01-02 20:03   ` Pavel Begunkov
  2021-01-04 16:49     ` Marcelo Diop-Gonzalez
  0 siblings, 1 reply; 16+ messages in thread
From: Pavel Begunkov @ 2021-01-02 20:03 UTC (permalink / raw)
  To: Marcelo Diop-Gonzalez, axboe; +Cc: io-uring

On 19/12/2020 19:15, Marcelo Diop-Gonzalez wrote:
> The quantity ->cached_cq_tail - ->cq_timeouts is used to tell how many
> non-timeout events have happened, but this subtraction could overflow
> if ->cq_timeouts is incremented more times than ->cached_cq_tail.
> It's maybe unlikely, but currently this can happen if a timeout event
> overflows the cqring, since in that case io_get_cqring() doesn't
> increment ->cached_cq_tail, but ->cq_timeouts is incremented by the
> caller. Fix it by incrementing ->cq_timeouts inside io_get_cqring().
> 
> Signed-off-by: Marcelo Diop-Gonzalez <marcelo827@gmail.com>
> ---
>  fs/io_uring.c | 14 +++++++-------
>  1 file changed, 7 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index f3690dfdd564..f394bf358022 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -1582,8 +1582,6 @@ static void io_kill_timeout(struct io_kiocb *req)
>  
>  	ret = hrtimer_try_to_cancel(&io->timer);
>  	if (ret != -1) {
> -		atomic_set(&req->ctx->cq_timeouts,
> -			atomic_read(&req->ctx->cq_timeouts) + 1);
>  		list_del_init(&req->timeout.list);
>  		io_cqring_fill_event(req, 0);
>  		io_put_req_deferred(req, 1);
> @@ -1664,7 +1662,7 @@ static inline bool io_sqring_full(struct io_ring_ctx *ctx)
>  	return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == r->sq_ring_entries;
>  }
>  
> -static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
> +static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx, u8 opcode)
>  {
>  	struct io_rings *rings = ctx->rings;
>  	unsigned tail;
> @@ -1679,6 +1677,10 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
>  		return NULL;
>  
>  	ctx->cached_cq_tail++;
> +	if (opcode == IORING_OP_TIMEOUT)
> +		atomic_set(&ctx->cq_timeouts,
> +			   atomic_read(&ctx->cq_timeouts) + 1);
> +

I don't think I like it. The function is pretty hot, so I wouldn't want that
extra burden just for timeouts, which should be cold enough, especially with
the new timeout CQ waits. Also, passing the opcode here is awkward and not
great abstraction-wise.

>  	return &rings->cqes[tail & ctx->cq_mask];
>  }
>  
> @@ -1728,7 +1730,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
>  		if (!io_match_task(req, tsk, files))
>  			continue;
>  
> -		cqe = io_get_cqring(ctx);
> +		cqe = io_get_cqring(ctx, req->opcode);
>  		if (!cqe && !force)
>  			break;
>  
> @@ -1776,7 +1778,7 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
>  	 * submission (by quite a lot). Increment the overflow count in
>  	 * the ring.
>  	 */
> -	cqe = io_get_cqring(ctx);
> +	cqe = io_get_cqring(ctx, req->opcode);
>  	if (likely(cqe)) {
>  		WRITE_ONCE(cqe->user_data, req->user_data);
>  		WRITE_ONCE(cqe->res, res);
> @@ -5618,8 +5620,6 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
>  
>  	spin_lock_irqsave(&ctx->completion_lock, flags);
>  	list_del_init(&req->timeout.list);
> -	atomic_set(&req->ctx->cq_timeouts,
> -		atomic_read(&req->ctx->cq_timeouts) + 1);
>  
>  	io_cqring_fill_event(req, -ETIME);
>  	io_commit_cqring(ctx);
> 

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 2/2] io_uring: flush timeouts that should already have expired
  2021-01-02 19:54   ` Pavel Begunkov
@ 2021-01-02 20:26     ` Pavel Begunkov
  2021-01-08 15:57       ` Marcelo Diop-Gonzalez
  2021-01-04 17:56     ` Marcelo Diop-Gonzalez
  1 sibling, 1 reply; 16+ messages in thread
From: Pavel Begunkov @ 2021-01-02 20:26 UTC (permalink / raw)
  To: Marcelo Diop-Gonzalez, axboe; +Cc: io-uring

On 02/01/2021 19:54, Pavel Begunkov wrote:
> On 19/12/2020 19:15, Marcelo Diop-Gonzalez wrote:
>> Right now io_flush_timeouts() checks if the current number of events
>> is equal to ->timeout.target_seq, but this will miss some timeouts if
>> there have been more than 1 event added since the last time they were
>> flushed (possible in io_submit_flush_completions(), for example). Fix
>> it by recording the starting value of ->cached_cq_overflow -
>> ->cq_timeouts instead of the target value, so that we can safely
>> (without overflow problems) compare the number of events that have
>> happened with the number of events needed to trigger the timeout.

https://www.spinics.net/lists/kernel/msg3475160.html

The idea was to replace u32 cached_cq_tail with u64 while keeping
timeout offsets u32. Assuming that we won't ever hit ~2^62 inflight
requests, complete all requests falling into some large enough window
behind that u64 cached_cq_tail.

simplifying:

i64 d = target_off - ctx->u64_cq_tail
if (d <= 0 && d > -2^32)
	complete_it()

Not fond of it, but it at least worked at that time. You can try out
this approach if you want, but it would be perfect if you found
something more elegant :)
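
A rough userspace sketch of that window check, with hypothetical names (the
kernel struct does not currently have a u64 tail, so this is only a sketch of
the idea, not an implementation):

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

/* target_off computed at submission as u64_cq_tail + off */
static bool timeout_expired(uint64_t target_off, uint64_t u64_cq_tail)
{
	int64_t d = (int64_t)(target_off - u64_cq_tail);

	/* complete anything within a 2^32-wide window behind the tail */
	return d <= 0 && d > -(int64_t)(1ull << 32);
}

int main(void)
{
	/* armed at tail 0 with off 0xffffffff; 2^32 + 3 events later */
	printf("%d\n", timeout_expired(0xffffffffull, 0x100000003ull)); /* 1 */
	return 0;
}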

>>
>> Signed-off-by: Marcelo Diop-Gonzalez <marcelo827@gmail.com>
>> ---
>>  fs/io_uring.c | 30 +++++++++++++++++++++++-------
>>  1 file changed, 23 insertions(+), 7 deletions(-)
>>
>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>> index f394bf358022..f62de0cb5fc4 100644
>> --- a/fs/io_uring.c
>> +++ b/fs/io_uring.c
>> @@ -444,7 +444,7 @@ struct io_cancel {
>>  struct io_timeout {
>>  	struct file			*file;
>>  	u32				off;
>> -	u32				target_seq;
>> +	u32				start_seq;
>>  	struct list_head		list;
>>  	/* head of the link, used by linked timeouts only */
>>  	struct io_kiocb			*head;
>> @@ -1629,6 +1629,24 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx)
>>  	} while (!list_empty(&ctx->defer_list));
>>  }
>>  
>> +static inline u32 io_timeout_events_left(struct io_kiocb *req)
>> +{
>> +	struct io_ring_ctx *ctx = req->ctx;
>> +	u32 events;
>> +
>> +	/*
>> +	 * events -= req->timeout.start_seq and the comparison between
>> +	 * ->timeout.off and events will not overflow because each time
>> +	 * ->cq_timeouts is incremented, ->cached_cq_tail is incremented too.
>> +	 */
>> +
>> +	events = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
>> +	events -= req->timeout.start_seq;
> 
> It looks to me that events before the start_seq subtraction can have got wrapped
> around start_seq.
> 
> e.g.
> 1) you submit a timeout with off=0xff...ff (start_seq=0 for convenience)
> 
> 2) some time has passed, let @events = 0xff..ff - 1
> so the timeout still waits
> 
> 3) we commit 5 requests at once and call io_commit_cqring() only once for
> them, so we get @events == 0xff..ff - 1 + 5, i.e. 4
> 
> @events == 4 < off == 0xff...ff,
> so we didn't trigger out timeout even though should have
> 
>> +	if (req->timeout.off > events)
>> +		return req->timeout.off - events;
>> +	return 0;
>> +}
>> +
>>  static void io_flush_timeouts(struct io_ring_ctx *ctx)
>>  {
>>  	while (!list_empty(&ctx->timeout_list)) {
>> @@ -1637,8 +1655,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx)
>>  
>>  		if (io_is_timeout_noseq(req))
>>  			break;
>> -		if (req->timeout.target_seq != ctx->cached_cq_tail
>> -					- atomic_read(&ctx->cq_timeouts))
>> +		if (io_timeout_events_left(req) > 0)
>>  			break;
>>  
>>  		list_del_init(&req->timeout.list);
>> @@ -5785,7 +5802,6 @@ static int io_timeout(struct io_kiocb *req)
>>  	struct io_ring_ctx *ctx = req->ctx;
>>  	struct io_timeout_data *data = req->async_data;
>>  	struct list_head *entry;
>> -	u32 tail, off = req->timeout.off;
>>  
>>  	spin_lock_irq(&ctx->completion_lock);
>>  
>> @@ -5799,8 +5815,8 @@ static int io_timeout(struct io_kiocb *req)
>>  		goto add;
>>  	}
>>  
>> -	tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
>> -	req->timeout.target_seq = tail + off;
>> +	req->timeout.start_seq = ctx->cached_cq_tail -
>> +		atomic_read(&ctx->cq_timeouts);
>>  
>>  	/*
>>  	 * Insertion sort, ensuring the first entry in the list is always
>> @@ -5813,7 +5829,7 @@ static int io_timeout(struct io_kiocb *req)
>>  		if (io_is_timeout_noseq(nxt))
>>  			continue;
>>  		/* nxt.seq is behind @tail, otherwise would've been completed */
>> -		if (off >= nxt->timeout.target_seq - tail)
>> +		if (req->timeout.off >= io_timeout_events_left(nxt))
>>  			break;
>>  	}
>>  add:
>>
> 

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 1/2] io_uring: only increment ->cq_timeouts along with ->cached_cq_tail
  2021-01-02 20:03   ` Pavel Begunkov
@ 2021-01-04 16:49     ` Marcelo Diop-Gonzalez
  0 siblings, 0 replies; 16+ messages in thread
From: Marcelo Diop-Gonzalez @ 2021-01-04 16:49 UTC (permalink / raw)
  To: Pavel Begunkov; +Cc: axboe, io-uring

Yeah, I agree this one is kind of ugly... I'll try to think of a different way

-Marcelo

On Sat, Jan 2, 2021 at 3:07 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
>
> On 19/12/2020 19:15, Marcelo Diop-Gonzalez wrote:
> > The quantity ->cached_cq_tail - ->cq_timeouts is used to tell how many
> > non-timeout events have happened, but this subtraction could overflow
> > if ->cq_timeouts is incremented more times than ->cached_cq_tail.
> > It's maybe unlikely, but currently this can happen if a timeout event
> > overflows the cqring, since in that case io_get_cqring() doesn't
> > increment ->cached_cq_tail, but ->cq_timeouts is incremented by the
> > caller. Fix it by incrementing ->cq_timeouts inside io_get_cqring().
> >
> > Signed-off-by: Marcelo Diop-Gonzalez <marcelo827@gmail.com>
> > ---
> >  fs/io_uring.c | 14 +++++++-------
> >  1 file changed, 7 insertions(+), 7 deletions(-)
> >
> > diff --git a/fs/io_uring.c b/fs/io_uring.c
> > index f3690dfdd564..f394bf358022 100644
> > --- a/fs/io_uring.c
> > +++ b/fs/io_uring.c
> > @@ -1582,8 +1582,6 @@ static void io_kill_timeout(struct io_kiocb *req)
> >
> >       ret = hrtimer_try_to_cancel(&io->timer);
> >       if (ret != -1) {
> > -             atomic_set(&req->ctx->cq_timeouts,
> > -                     atomic_read(&req->ctx->cq_timeouts) + 1);
> >               list_del_init(&req->timeout.list);
> >               io_cqring_fill_event(req, 0);
> >               io_put_req_deferred(req, 1);
> > @@ -1664,7 +1662,7 @@ static inline bool io_sqring_full(struct io_ring_ctx *ctx)
> >       return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == r->sq_ring_entries;
> >  }
> >
> > -static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
> > +static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx, u8 opcode)
> >  {
> >       struct io_rings *rings = ctx->rings;
> >       unsigned tail;
> > @@ -1679,6 +1677,10 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
> >               return NULL;
> >
> >       ctx->cached_cq_tail++;
> > +     if (opcode == IORING_OP_TIMEOUT)
> > +             atomic_set(&ctx->cq_timeouts,
> > +                        atomic_read(&ctx->cq_timeouts) + 1);
> > +
>
> Don't think I like it. The function is pretty hot, so wouldn't want that extra
> burden just for timeouts, which should be cold enough especially with the new
> timeout CQ waits. Also passing opcode here is awkward and not very great
> abstraction wise.
>
> >       return &rings->cqes[tail & ctx->cq_mask];
> >  }
> >
> > @@ -1728,7 +1730,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
> >               if (!io_match_task(req, tsk, files))
> >                       continue;
> >
> > -             cqe = io_get_cqring(ctx);
> > +             cqe = io_get_cqring(ctx, req->opcode);
> >               if (!cqe && !force)
> >                       break;
> >
> > @@ -1776,7 +1778,7 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
> >        * submission (by quite a lot). Increment the overflow count in
> >        * the ring.
> >        */
> > -     cqe = io_get_cqring(ctx);
> > +     cqe = io_get_cqring(ctx, req->opcode);
> >       if (likely(cqe)) {
> >               WRITE_ONCE(cqe->user_data, req->user_data);
> >               WRITE_ONCE(cqe->res, res);
> > @@ -5618,8 +5620,6 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
> >
> >       spin_lock_irqsave(&ctx->completion_lock, flags);
> >       list_del_init(&req->timeout.list);
> > -     atomic_set(&req->ctx->cq_timeouts,
> > -             atomic_read(&req->ctx->cq_timeouts) + 1);
> >
> >       io_cqring_fill_event(req, -ETIME);
> >       io_commit_cqring(ctx);
> >
>
> --
> Pavel Begunkov

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 2/2] io_uring: flush timeouts that should already have expired
  2021-01-02 19:54   ` Pavel Begunkov
  2021-01-02 20:26     ` Pavel Begunkov
@ 2021-01-04 17:56     ` Marcelo Diop-Gonzalez
  1 sibling, 0 replies; 16+ messages in thread
From: Marcelo Diop-Gonzalez @ 2021-01-04 17:56 UTC (permalink / raw)
  To: Pavel Begunkov; +Cc: axboe, io-uring

On Sat, Jan 2, 2021 at 2:57 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
>
> On 19/12/2020 19:15, Marcelo Diop-Gonzalez wrote:
> > Right now io_flush_timeouts() checks if the current number of events
> > is equal to ->timeout.target_seq, but this will miss some timeouts if
> > there have been more than 1 event added since the last time they were
> > flushed (possible in io_submit_flush_completions(), for example). Fix
> > it by recording the starting value of ->cached_cq_overflow -
> > ->cq_timeouts instead of the target value, so that we can safely
> > (without overflow problems) compare the number of events that have
> > happened with the number of events needed to trigger the timeout.
> >
> > Signed-off-by: Marcelo Diop-Gonzalez <marcelo827@gmail.com>
> > ---
> >  fs/io_uring.c | 30 +++++++++++++++++++++++-------
> >  1 file changed, 23 insertions(+), 7 deletions(-)
> >
> > diff --git a/fs/io_uring.c b/fs/io_uring.c
> > index f394bf358022..f62de0cb5fc4 100644
> > --- a/fs/io_uring.c
> > +++ b/fs/io_uring.c
> > @@ -444,7 +444,7 @@ struct io_cancel {
> >  struct io_timeout {
> >       struct file                     *file;
> >       u32                             off;
> > -     u32                             target_seq;
> > +     u32                             start_seq;
> >       struct list_head                list;
> >       /* head of the link, used by linked timeouts only */
> >       struct io_kiocb                 *head;
> > @@ -1629,6 +1629,24 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx)
> >       } while (!list_empty(&ctx->defer_list));
> >  }
> >
> > +static inline u32 io_timeout_events_left(struct io_kiocb *req)
> > +{
> > +     struct io_ring_ctx *ctx = req->ctx;
> > +     u32 events;
> > +
> > +     /*
> > +      * events -= req->timeout.start_seq and the comparison between
> > +      * ->timeout.off and events will not overflow because each time
> > +      * ->cq_timeouts is incremented, ->cached_cq_tail is incremented too.
> > +      */
> > +
> > +     events = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
> > +     events -= req->timeout.start_seq;
>
> It looks to me that events before the start_seq subtraction can have got wrapped
> around start_seq.
>
> e.g.
> 1) you submit a timeout with off=0xff...ff (start_seq=0 for convenience)
>
> 2) some time has passed, let @events = 0xff..ff - 1
> so the timeout still waits
>
> 3) we commit 5 requests at once and call io_commit_cqring() only once for
> them, so we get @events == 0xff..ff - 1 + 5, i.e. 4
>
> @events == 4 < off == 0xff...ff,

Oof, good catch... I'll try to think about it some more. Feels like there
ought to be a nice way to do it but maybe it's quite tricky :/

-Marcelo

> so we didn't trigger out timeout even though should have
>
> > +     if (req->timeout.off > events)
> > +             return req->timeout.off - events;
> > +     return 0;
> > +}
> > +
> >  static void io_flush_timeouts(struct io_ring_ctx *ctx)
> >  {
> >       while (!list_empty(&ctx->timeout_list)) {
> > @@ -1637,8 +1655,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx)
> >
> >               if (io_is_timeout_noseq(req))
> >                       break;
> > -             if (req->timeout.target_seq != ctx->cached_cq_tail
> > -                                     - atomic_read(&ctx->cq_timeouts))
> > +             if (io_timeout_events_left(req) > 0)
> >                       break;
> >
> >               list_del_init(&req->timeout.list);
> > @@ -5785,7 +5802,6 @@ static int io_timeout(struct io_kiocb *req)
> >       struct io_ring_ctx *ctx = req->ctx;
> >       struct io_timeout_data *data = req->async_data;
> >       struct list_head *entry;
> > -     u32 tail, off = req->timeout.off;
> >
> >       spin_lock_irq(&ctx->completion_lock);
> >
> > @@ -5799,8 +5815,8 @@ static int io_timeout(struct io_kiocb *req)
> >               goto add;
> >       }
> >
> > -     tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
> > -     req->timeout.target_seq = tail + off;
> > +     req->timeout.start_seq = ctx->cached_cq_tail -
> > +             atomic_read(&ctx->cq_timeouts);
> >
> >       /*
> >        * Insertion sort, ensuring the first entry in the list is always
> > @@ -5813,7 +5829,7 @@ static int io_timeout(struct io_kiocb *req)
> >               if (io_is_timeout_noseq(nxt))
> >                       continue;
> >               /* nxt.seq is behind @tail, otherwise would've been completed */
> > -             if (off >= nxt->timeout.target_seq - tail)
> > +             if (req->timeout.off >= io_timeout_events_left(nxt))
> >                       break;
> >       }
> >  add:
> >
>
> --
> Pavel Begunkov

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 2/2] io_uring: flush timeouts that should already have expired
  2021-01-02 20:26     ` Pavel Begunkov
@ 2021-01-08 15:57       ` Marcelo Diop-Gonzalez
  2021-01-11  4:57         ` Pavel Begunkov
  2021-01-12 20:47         ` Pavel Begunkov
  0 siblings, 2 replies; 16+ messages in thread
From: Marcelo Diop-Gonzalez @ 2021-01-08 15:57 UTC (permalink / raw)
  To: Pavel Begunkov; +Cc: axboe, io-uring

On Sat, Jan 02, 2021 at 08:26:26PM +0000, Pavel Begunkov wrote:
> On 02/01/2021 19:54, Pavel Begunkov wrote:
> > On 19/12/2020 19:15, Marcelo Diop-Gonzalez wrote:
> >> Right now io_flush_timeouts() checks if the current number of events
> >> is equal to ->timeout.target_seq, but this will miss some timeouts if
> >> there have been more than 1 event added since the last time they were
> >> flushed (possible in io_submit_flush_completions(), for example). Fix
> >> it by recording the starting value of ->cached_cq_overflow -
> >> ->cq_timeouts instead of the target value, so that we can safely
> >> (without overflow problems) compare the number of events that have
> >> happened with the number of events needed to trigger the timeout.
> 
> https://www.spinics.net/lists/kernel/msg3475160.html
> 
> The idea was to replace u32 cached_cq_tail with u64 while keeping
> timeout offsets u32. Assuming that we won't ever hit ~2^62 inflight
> requests, complete all requests falling into some large enough window
> behind that u64 cached_cq_tail.
> 
> simplifying:
> 
> i64 d = target_off - ctx->u64_cq_tail
> if (d <= 0 && d > -2^32)
> 	complete_it()
> 
> Not fond  of it, but at least worked at that time. You can try out
> this approach if you want, but would be perfect if you would find
> something more elegant :)
>

What do you think about something like this? I think it's not totally
correct, because it relies on ->completion_lock being held in io_timeout() so
that ->cq_last_tm_flush is updated consistently. In the IORING_SETUP_IOPOLL
case io_iopoll_complete() doesn't take that lock, and ->uring_lock will not
be held if io_timeout() is called from io_wq_submit_work(). But maybe it's
still worth it, since that was already possibly a problem?

diff --git a/fs/io_uring.c b/fs/io_uring.c
index cb57e0360fcb..50984709879c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -353,6 +353,7 @@ struct io_ring_ctx {
 		unsigned		cq_entries;
 		unsigned		cq_mask;
 		atomic_t		cq_timeouts;
+		unsigned		cq_last_tm_flush;
 		unsigned long		cq_check_overflow;
 		struct wait_queue_head	cq_wait;
 		struct fasync_struct	*cq_fasync;
@@ -1633,19 +1634,26 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx)
 
 static void io_flush_timeouts(struct io_ring_ctx *ctx)
 {
+	u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
+
 	while (!list_empty(&ctx->timeout_list)) {
+		u32 events_needed, events_got;
 		struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
 						struct io_kiocb, timeout.list);
 
 		if (io_is_timeout_noseq(req))
 			break;
-		if (req->timeout.target_seq != ctx->cached_cq_tail
-					- atomic_read(&ctx->cq_timeouts))
+
+		events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
+		events_got = seq - ctx->cq_last_tm_flush;
+		if (events_got < events_needed)
 			break;
 
 		list_del_init(&req->timeout.list);
 		io_kill_timeout(req);
 	}
+
+	ctx->cq_last_tm_flush = seq;
 }
 
 static void io_commit_cqring(struct io_ring_ctx *ctx)
-- 
2.20.1
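
For what it's worth, here is the earlier wraparound example run through this
check as a userspace sketch (made-up numbers, purely illustrative, not kernel
code):

#include <stdint.h>
#include <stdio.h>

int main(void) {
	/* same scenario as before: start_seq = 0, off = 0xffffffff */
	uint32_t target_seq = 0 + 0xffffffffu;
	uint32_t last_flush = 0xffffffffu - 1;	/* seq at the previous flush */
	uint32_t seq = last_flush + 5;		/* 5 events at once; wraps to 3 */

	uint32_t events_needed = target_seq - last_flush;	/* 1 */
	uint32_t events_got = seq - last_flush;			/* 5 */

	printf("needed=%u got=%u -> %s\n", events_needed, events_got,
	       events_got < events_needed ? "keep waiting" : "flush timeout");
	return 0;
}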

> >>
> >> Signed-off-by: Marcelo Diop-Gonzalez <marcelo827@gmail.com>
> >> ---
> >>  fs/io_uring.c | 30 +++++++++++++++++++++++-------
> >>  1 file changed, 23 insertions(+), 7 deletions(-)
> >>
> >> diff --git a/fs/io_uring.c b/fs/io_uring.c
> >> index f394bf358022..f62de0cb5fc4 100644
> >> --- a/fs/io_uring.c
> >> +++ b/fs/io_uring.c
> >> @@ -444,7 +444,7 @@ struct io_cancel {
> >>  struct io_timeout {
> >>  	struct file			*file;
> >>  	u32				off;
> >> -	u32				target_seq;
> >> +	u32				start_seq;
> >>  	struct list_head		list;
> >>  	/* head of the link, used by linked timeouts only */
> >>  	struct io_kiocb			*head;
> >> @@ -1629,6 +1629,24 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx)
> >>  	} while (!list_empty(&ctx->defer_list));
> >>  }
> >>  
> >> +static inline u32 io_timeout_events_left(struct io_kiocb *req)
> >> +{
> >> +	struct io_ring_ctx *ctx = req->ctx;
> >> +	u32 events;
> >> +
> >> +	/*
> >> +	 * events -= req->timeout.start_seq and the comparison between
> >> +	 * ->timeout.off and events will not overflow because each time
> >> +	 * ->cq_timeouts is incremented, ->cached_cq_tail is incremented too.
> >> +	 */
> >> +
> >> +	events = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
> >> +	events -= req->timeout.start_seq;
> > 
> > It looks to me that events before the start_seq subtraction can have got wrapped
> > around start_seq.
> > 
> > e.g.
> > 1) you submit a timeout with off=0xff...ff (start_seq=0 for convenience)
> > 
> > 2) some time has passed, let @events = 0xff..ff - 1
> > so the timeout still waits
> > 
> > 3) we commit 5 requests at once and call io_commit_cqring() only once for
> > them, so we get @events == 0xff..ff - 1 + 5, i.e. 4
> > 
> > @events == 4 < off == 0xff...ff,
> > so we didn't trigger out timeout even though should have
> > 
> >> +	if (req->timeout.off > events)
> >> +		return req->timeout.off - events;
> >> +	return 0;
> >> +}
> >> +
> >>  static void io_flush_timeouts(struct io_ring_ctx *ctx)
> >>  {
> >>  	while (!list_empty(&ctx->timeout_list)) {
> >> @@ -1637,8 +1655,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx)
> >>  
> >>  		if (io_is_timeout_noseq(req))
> >>  			break;
> >> -		if (req->timeout.target_seq != ctx->cached_cq_tail
> >> -					- atomic_read(&ctx->cq_timeouts))
> >> +		if (io_timeout_events_left(req) > 0)
> >>  			break;
> >>  
> >>  		list_del_init(&req->timeout.list);
> >> @@ -5785,7 +5802,6 @@ static int io_timeout(struct io_kiocb *req)
> >>  	struct io_ring_ctx *ctx = req->ctx;
> >>  	struct io_timeout_data *data = req->async_data;
> >>  	struct list_head *entry;
> >> -	u32 tail, off = req->timeout.off;
> >>  
> >>  	spin_lock_irq(&ctx->completion_lock);
> >>  
> >> @@ -5799,8 +5815,8 @@ static int io_timeout(struct io_kiocb *req)
> >>  		goto add;
> >>  	}
> >>  
> >> -	tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
> >> -	req->timeout.target_seq = tail + off;
> >> +	req->timeout.start_seq = ctx->cached_cq_tail -
> >> +		atomic_read(&ctx->cq_timeouts);
> >>  
> >>  	/*
> >>  	 * Insertion sort, ensuring the first entry in the list is always
> >> @@ -5813,7 +5829,7 @@ static int io_timeout(struct io_kiocb *req)
> >>  		if (io_is_timeout_noseq(nxt))
> >>  			continue;
> >>  		/* nxt.seq is behind @tail, otherwise would've been completed */
> >> -		if (off >= nxt->timeout.target_seq - tail)
> >> +		if (req->timeout.off >= io_timeout_events_left(nxt))
> >>  			break;
> >>  	}
> >>  add:
> >>
> > 
> 
> -- 
> Pavel Begunkov

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 2/2] io_uring: flush timeouts that should already have expired
  2021-01-08 15:57       ` Marcelo Diop-Gonzalez
@ 2021-01-11  4:57         ` Pavel Begunkov
  2021-01-11 15:28           ` Marcelo Diop-Gonzalez
  2021-01-12 20:47         ` Pavel Begunkov
  1 sibling, 1 reply; 16+ messages in thread
From: Pavel Begunkov @ 2021-01-11  4:57 UTC (permalink / raw)
  To: Marcelo Diop-Gonzalez; +Cc: axboe, io-uring

On 08/01/2021 15:57, Marcelo Diop-Gonzalez wrote:
> On Sat, Jan 02, 2021 at 08:26:26PM +0000, Pavel Begunkov wrote:
>> On 02/01/2021 19:54, Pavel Begunkov wrote:
>>> On 19/12/2020 19:15, Marcelo Diop-Gonzalez wrote:
>>>> Right now io_flush_timeouts() checks if the current number of events
>>>> is equal to ->timeout.target_seq, but this will miss some timeouts if
>>>> there have been more than 1 event added since the last time they were
>>>> flushed (possible in io_submit_flush_completions(), for example). Fix
>>>> it by recording the starting value of ->cached_cq_overflow -
>>>> ->cq_timeouts instead of the target value, so that we can safely
>>>> (without overflow problems) compare the number of events that have
>>>> happened with the number of events needed to trigger the timeout.
>>
>> https://www.spinics.net/lists/kernel/msg3475160.html
>>
>> The idea was to replace u32 cached_cq_tail with u64 while keeping
>> timeout offsets u32. Assuming that we won't ever hit ~2^62 inflight
>> requests, complete all requests falling into some large enough window
>> behind that u64 cached_cq_tail.
>>
>> simplifying:
>>
>> i64 d = target_off - ctx->u64_cq_tail
>> if (d <= 0 && d > -2^32)
>> 	complete_it()
>>
>> Not fond  of it, but at least worked at that time. You can try out
>> this approach if you want, but would be perfect if you would find
>> something more elegant :)
>>
> 
> What do you think about something like this? I think it's not totally
> correct because it relies on having ->completion_lock in io_timeout() so
> that ->cq_last_tm_flushed is updated, but in case of IORING_SETUP_IOPOLL,
> io_iopoll_complete() doesn't take that lock, and ->uring_lock will not
> be held if io_timeout() is called from io_wq_submit_work(), but maybe
> could still be worth it since that was already possibly a problem?

I'll take a look later, but IOPOLL doesn't support timeouts, see
the first if in io_timeout_prep(), so that's not a problem, but it would
be better to leave a comment.

> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index cb57e0360fcb..50984709879c 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -353,6 +353,7 @@ struct io_ring_ctx {
>  		unsigned		cq_entries;
>  		unsigned		cq_mask;
>  		atomic_t		cq_timeouts;
> +		unsigned		cq_last_tm_flush;
>  		unsigned long		cq_check_overflow;
>  		struct wait_queue_head	cq_wait;
>  		struct fasync_struct	*cq_fasync;
> @@ -1633,19 +1634,26 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx)
>  
>  static void io_flush_timeouts(struct io_ring_ctx *ctx)
>  {
> +	u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
> +
>  	while (!list_empty(&ctx->timeout_list)) {
> +		u32 events_needed, events_got;
>  		struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
>  						struct io_kiocb, timeout.list);
>  
>  		if (io_is_timeout_noseq(req))
>  			break;
> -		if (req->timeout.target_seq != ctx->cached_cq_tail
> -					- atomic_read(&ctx->cq_timeouts))
> +
> +		events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
> +		events_got = seq - ctx->cq_last_tm_flush;
> +		if (events_got < events_needed)
>  			break;
>  
>  		list_del_init(&req->timeout.list);
>  		io_kill_timeout(req);
>  	}
> +
> +	ctx->cq_last_tm_flush = seq;
>  }
>  
>  static void io_commit_cqring(struct io_ring_ctx *ctx)
> 

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 2/2] io_uring: flush timeouts that should already have expired
  2021-01-11  4:57         ` Pavel Begunkov
@ 2021-01-11 15:28           ` Marcelo Diop-Gonzalez
  0 siblings, 0 replies; 16+ messages in thread
From: Marcelo Diop-Gonzalez @ 2021-01-11 15:28 UTC (permalink / raw)
  To: Pavel Begunkov; +Cc: axboe, io-uring

On Mon, Jan 11, 2021 at 04:57:21AM +0000, Pavel Begunkov wrote:
> On 08/01/2021 15:57, Marcelo Diop-Gonzalez wrote:
> > On Sat, Jan 02, 2021 at 08:26:26PM +0000, Pavel Begunkov wrote:
> >> On 02/01/2021 19:54, Pavel Begunkov wrote:
> >>> On 19/12/2020 19:15, Marcelo Diop-Gonzalez wrote:
> >>>> Right now io_flush_timeouts() checks if the current number of events
> >>>> is equal to ->timeout.target_seq, but this will miss some timeouts if
> >>>> there have been more than 1 event added since the last time they were
> >>>> flushed (possible in io_submit_flush_completions(), for example). Fix
> >>>> it by recording the starting value of ->cached_cq_overflow -
> >>>> ->cq_timeouts instead of the target value, so that we can safely
> >>>> (without overflow problems) compare the number of events that have
> >>>> happened with the number of events needed to trigger the timeout.
> >>
> >> https://www.spinics.net/lists/kernel/msg3475160.html
> >>
> >> The idea was to replace u32 cached_cq_tail with u64 while keeping
> >> timeout offsets u32. Assuming that we won't ever hit ~2^62 inflight
> >> requests, complete all requests falling into some large enough window
> >> behind that u64 cached_cq_tail.
> >>
> >> simplifying:
> >>
> >> i64 d = target_off - ctx->u64_cq_tail
> >> if (d <= 0 && d > -2^32)
> >> 	complete_it()
> >>
> >> Not fond  of it, but at least worked at that time. You can try out
> >> this approach if you want, but would be perfect if you would find
> >> something more elegant :)
> >>
> > 
> > What do you think about something like this? I think it's not totally
> > correct because it relies on having ->completion_lock in io_timeout() so
> > that ->cq_last_tm_flushed is updated, but in case of IORING_SETUP_IOPOLL,
> > io_iopoll_complete() doesn't take that lock, and ->uring_lock will not
> > be held if io_timeout() is called from io_wq_submit_work(), but maybe
> > could still be worth it since that was already possibly a problem?
> 
> I'll take a look later, but IOPOLL doesn't support timeouts, see
> the first if in io_timeout_prep(), so that's not a problem, but would
> better to leave a comment.
>

Ah right! Never mind about that then.

> > 
> > diff --git a/fs/io_uring.c b/fs/io_uring.c
> > index cb57e0360fcb..50984709879c 100644
> > --- a/fs/io_uring.c
> > +++ b/fs/io_uring.c
> > @@ -353,6 +353,7 @@ struct io_ring_ctx {
> >  		unsigned		cq_entries;
> >  		unsigned		cq_mask;
> >  		atomic_t		cq_timeouts;
> > +		unsigned		cq_last_tm_flush;
> >  		unsigned long		cq_check_overflow;
> >  		struct wait_queue_head	cq_wait;
> >  		struct fasync_struct	*cq_fasync;
> > @@ -1633,19 +1634,26 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx)
> >  
> >  static void io_flush_timeouts(struct io_ring_ctx *ctx)
> >  {
> > +	u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
> > +
> >  	while (!list_empty(&ctx->timeout_list)) {
> > +		u32 events_needed, events_got;
> >  		struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
> >  						struct io_kiocb, timeout.list);
> >  
> >  		if (io_is_timeout_noseq(req))
> >  			break;
> > -		if (req->timeout.target_seq != ctx->cached_cq_tail
> > -					- atomic_read(&ctx->cq_timeouts))
> > +
> > +		events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
> > +		events_got = seq - ctx->cq_last_tm_flush;
> > +		if (events_got < events_needed)
> >  			break;
> >  
> >  		list_del_init(&req->timeout.list);
> >  		io_kill_timeout(req);
> >  	}
> > +
> > +	ctx->cq_last_tm_flush = seq;
> >  }
> >  
> >  static void io_commit_cqring(struct io_ring_ctx *ctx)
> > 
> 
> -- 
> Pavel Begunkov

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 2/2] io_uring: flush timeouts that should already have expired
  2021-01-08 15:57       ` Marcelo Diop-Gonzalez
  2021-01-11  4:57         ` Pavel Begunkov
@ 2021-01-12 20:47         ` Pavel Begunkov
  2021-01-13 14:41           ` Marcelo Diop-Gonzalez
  2021-01-14  0:46           ` Marcelo Diop-Gonzalez
  1 sibling, 2 replies; 16+ messages in thread
From: Pavel Begunkov @ 2021-01-12 20:47 UTC (permalink / raw)
  To: Marcelo Diop-Gonzalez; +Cc: Jens Axboe, io-uring

On 08/01/2021 15:57, Marcelo Diop-Gonzalez wrote:
> On Sat, Jan 02, 2021 at 08:26:26PM +0000, Pavel Begunkov wrote:
>> On 02/01/2021 19:54, Pavel Begunkov wrote:
>>> On 19/12/2020 19:15, Marcelo Diop-Gonzalez wrote:
>>>> Right now io_flush_timeouts() checks if the current number of events
>>>> is equal to ->timeout.target_seq, but this will miss some timeouts if
>>>> there have been more than 1 event added since the last time they were
>>>> flushed (possible in io_submit_flush_completions(), for example). Fix
>>>> it by recording the starting value of ->cached_cq_overflow -
>>>> ->cq_timeouts instead of the target value, so that we can safely
>>>> (without overflow problems) compare the number of events that have
>>>> happened with the number of events needed to trigger the timeout.
>>
>> https://www.spinics.net/lists/kernel/msg3475160.html
>>
>> The idea was to replace u32 cached_cq_tail with u64 while keeping
>> timeout offsets u32. Assuming that we won't ever hit ~2^62 inflight
>> requests, complete all requests falling into some large enough window
>> behind that u64 cached_cq_tail.
>>
>> simplifying:
>>
>> i64 d = target_off - ctx->u64_cq_tail
>> if (d <= 0 && d > -2^32)
>> 	complete_it()
>>
>> Not fond  of it, but at least worked at that time. You can try out
>> this approach if you want, but would be perfect if you would find
>> something more elegant :)
>>
> 
> What do you think about something like this? I think it's not totally
> correct because it relies on having ->completion_lock in io_timeout() so
> that ->cq_last_tm_flushed is updated, but in case of IORING_SETUP_IOPOLL,
> io_iopoll_complete() doesn't take that lock, and ->uring_lock will not
> be held if io_timeout() is called from io_wq_submit_work(), but maybe
> could still be worth it since that was already possibly a problem?
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index cb57e0360fcb..50984709879c 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -353,6 +353,7 @@ struct io_ring_ctx {
>  		unsigned		cq_entries;
>  		unsigned		cq_mask;
>  		atomic_t		cq_timeouts;
> +		unsigned		cq_last_tm_flush;

It looks like the "last flush" approach is a good direction.
I think there can be problems at extremes, like completing 2^32
requests at once, but it should be ok in practice. Anyway, better
than it is now.

What about the first patch about overflows and cq_timeouts? I
assume that problem is still there, isn't it?

See comments below, but if it passes liburing tests, please send
a patch.

>  		unsigned long		cq_check_overflow;
>  		struct wait_queue_head	cq_wait;
>  		struct fasync_struct	*cq_fasync;
> @@ -1633,19 +1634,26 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx)
>  
>  static void io_flush_timeouts(struct io_ring_ctx *ctx)
>  {
> +	u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
> +

a nit,

if (list_empty()) return; + do {} while();

timeouts can be rare enough
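
i.e. roughly this shape (just a sketch of the suggested restructuring, not a
tested patch; the body keeps the same logic as the diff below):

	if (list_empty(&ctx->timeout_list))
		return;

	seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);

	do {
		/* existing per-timeout checks and io_kill_timeout() go here */
	} while (!list_empty(&ctx->timeout_list));

	ctx->cq_last_tm_flush = seq;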

>  	while (!list_empty(&ctx->timeout_list)) {
> +		u32 events_needed, events_got;
>  		struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
>  						struct io_kiocb, timeout.list);
>  
>  		if (io_is_timeout_noseq(req))
>  			break;
> -		if (req->timeout.target_seq != ctx->cached_cq_tail
> -					- atomic_read(&ctx->cq_timeouts))
> +

extra new line

> +		events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
> +		events_got = seq - ctx->cq_last_tm_flush;
> +		if (events_got < events_needed) 

probably <=

>  			break;

basically it checks that @target is in [last_flush, cur_seq];
it could use such a comment, plus a note about underflows and the
use of modular arithmetic, as with algebraic rings

>  
>  		list_del_init(&req->timeout.list);
>  		io_kill_timeout(req);
>  	}
> +
> +	ctx->cq_last_tm_flush = seq;
>  }
>  
>  static void io_commit_cqring(struct io_ring_ctx *ctx)
> 

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 2/2] io_uring: flush timeouts that should already have expired
  2021-01-12 20:47         ` Pavel Begunkov
@ 2021-01-13 14:41           ` Marcelo Diop-Gonzalez
  2021-01-13 15:20             ` Pavel Begunkov
  2021-01-14  0:46           ` Marcelo Diop-Gonzalez
  1 sibling, 1 reply; 16+ messages in thread
From: Marcelo Diop-Gonzalez @ 2021-01-13 14:41 UTC (permalink / raw)
  To: Pavel Begunkov; +Cc: Jens Axboe, io-uring

On Tue, Jan 12, 2021 at 08:47:11PM +0000, Pavel Begunkov wrote:
> On 08/01/2021 15:57, Marcelo Diop-Gonzalez wrote:
> > On Sat, Jan 02, 2021 at 08:26:26PM +0000, Pavel Begunkov wrote:
> >> On 02/01/2021 19:54, Pavel Begunkov wrote:
> >>> On 19/12/2020 19:15, Marcelo Diop-Gonzalez wrote:
> >>>> Right now io_flush_timeouts() checks if the current number of events
> >>>> is equal to ->timeout.target_seq, but this will miss some timeouts if
> >>>> there have been more than 1 event added since the last time they were
> >>>> flushed (possible in io_submit_flush_completions(), for example). Fix
> >>>> it by recording the starting value of ->cached_cq_overflow -
> >>>> ->cq_timeouts instead of the target value, so that we can safely
> >>>> (without overflow problems) compare the number of events that have
> >>>> happened with the number of events needed to trigger the timeout.
> >>
> >> https://www.spinics.net/lists/kernel/msg3475160.html
> >>
> >> The idea was to replace u32 cached_cq_tail with u64 while keeping
> >> timeout offsets u32. Assuming that we won't ever hit ~2^62 inflight
> >> requests, complete all requests falling into some large enough window
> >> behind that u64 cached_cq_tail.
> >>
> >> simplifying:
> >>
> >> i64 d = target_off - ctx->u64_cq_tail
> >> if (d <= 0 && d > -2^32)
> >> 	complete_it()
> >>
> >> Not fond  of it, but at least worked at that time. You can try out
> >> this approach if you want, but would be perfect if you would find
> >> something more elegant :)
> >>
> > 
> > What do you think about something like this? I think it's not totally
> > correct because it relies on having ->completion_lock in io_timeout() so
> > that ->cq_last_tm_flushed is updated, but in case of IORING_SETUP_IOPOLL,
> > io_iopoll_complete() doesn't take that lock, and ->uring_lock will not
> > be held if io_timeout() is called from io_wq_submit_work(), but maybe
> > could still be worth it since that was already possibly a problem?
> > 
> > diff --git a/fs/io_uring.c b/fs/io_uring.c
> > index cb57e0360fcb..50984709879c 100644
> > --- a/fs/io_uring.c
> > +++ b/fs/io_uring.c
> > @@ -353,6 +353,7 @@ struct io_ring_ctx {
> >  		unsigned		cq_entries;
> >  		unsigned		cq_mask;
> >  		atomic_t		cq_timeouts;
> > +		unsigned		cq_last_tm_flush;
> 
> It looks like that "last flush" is a good direction.
> I think there can be problems at extremes like completing 2^32
> requests at once, but should be ok in practice. Anyway better
> than it's now.
> 
> What about the first patch about overflows and cq_timeouts? I
> assume that problem is still there, isn't it?

Yeah, it's still there I think, I just couldn't think of a good way
to fix it. So I figured I would just send this one since at least
it doesn't make that problem worse. Maybe I could send a fix for that
one later if I think of something.

> 
> See comments below, but if it passes liburing tests, please send
> a patch.

will do!

> 
> >  		unsigned long		cq_check_overflow;
> >  		struct wait_queue_head	cq_wait;
> >  		struct fasync_struct	*cq_fasync;
> > @@ -1633,19 +1634,26 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx)
> >  
> >  static void io_flush_timeouts(struct io_ring_ctx *ctx)
> >  {
> > +	u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
> > +
> 
> a nit,
> 
> if (list_empty()) return; + do {} while();
> 
> timeouts can be rare enough
> 
> >  	while (!list_empty(&ctx->timeout_list)) {
> > +		u32 events_needed, events_got;
> >  		struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
> >  						struct io_kiocb, timeout.list);
> >  
> >  		if (io_is_timeout_noseq(req))
> >  			break;
> > -		if (req->timeout.target_seq != ctx->cached_cq_tail
> > -					- atomic_read(&ctx->cq_timeouts))
> > +
> 
> extra new line
> 
> > +		events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
> > +		events_got = seq - ctx->cq_last_tm_flush;
> > +		if (events_got < events_needed) 
> 
> probably <=

Won't that make it break too early though? If you submit a timeout
with off = 1 when {seq == 0, last_flush == 0}, then target_seq ==
1. Then let's say there's 1 cqe added, so the timeout should trigger.
Then events_needed == 1 and events_got == 1, right?

> 
> >  			break;
> 
> basically it checks that @target is in [last_flush, cur_seq],
> it can use such a comment + a note about underflows and using
> the modulus arithmetic, like with algebraic rings
> 
> >  
> >  		list_del_init(&req->timeout.list);
> >  		io_kill_timeout(req);
> >  	}
> > +
> > +	ctx->cq_last_tm_flush = seq;
> >  }
> >  
> >  static void io_commit_cqring(struct io_ring_ctx *ctx)
> > 
> 
> -- 
> Pavel Begunkov

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 2/2] io_uring: flush timeouts that should already have expired
  2021-01-13 14:41           ` Marcelo Diop-Gonzalez
@ 2021-01-13 15:20             ` Pavel Begunkov
  0 siblings, 0 replies; 16+ messages in thread
From: Pavel Begunkov @ 2021-01-13 15:20 UTC (permalink / raw)
  To: Marcelo Diop-Gonzalez; +Cc: Jens Axboe, io-uring

On 13/01/2021 14:41, Marcelo Diop-Gonzalez wrote:
> On Tue, Jan 12, 2021 at 08:47:11PM +0000, Pavel Begunkov wrote:
>> What about the first patch about overflows and cq_timeouts? I
>> assume that problem is still there, isn't it?
> 
> Yeah it's still there I think, I just couldn't think of a good way
> to fix it. So I figured I would just send this one since at least
> it doesn't make that problem worse. Maybe could send a fix for that
> one later if I think of something

sounds good to me

>>> +		events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
>>> +		events_got = seq - ctx->cq_last_tm_flush;
>>> +		if (events_got < events_needed) 
>>
>> probably <=
> 
> Won't that make it break too early though? If you submit a timeout
> with off = 1 when {seq == 0, last_flush == 0}, then target_seq ==
> 1. Then let's say there's 1 cqe added, so the timeout should trigger.
> Then events_needed == 1 and events_got == 1, right?

Yep, you're right. I had in mind

@target in [last_flush, cur_flush], and so

(!(target - last_flush <= cur_flush - last_flush))
	break;

but that's the same thing, just reshuffled. Thanks for double checking.
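
The two forms can be spot-checked against each other, including across a
wrap (a standalone sketch; the helper names here are made up purely for
illustration):

#include <assert.h>
#include <stdint.h>

/* break-out condition as written in the patch */
static int break_v1(uint32_t target, uint32_t last_flush, uint32_t seq)
{
	return (seq - last_flush) < (target - last_flush);
}

/* the reshuffled form: break unless target is in [last_flush, seq] */
static int break_v2(uint32_t target, uint32_t last_flush, uint32_t seq)
{
	return !(target - last_flush <= seq - last_flush);
}

int main(void)
{
	/* spot checks, including values that wrap past UINT32_MAX */
	uint32_t c[][3] = {
		{ 1, 0, 1 },		/* target reached exactly: flush */
		{ 5, 0, 2 },		/* not reached yet: break */
		{ 3, 0xfffffffeu, 2 },	/* target wrapped past 0, not reached yet: break */
	};
	for (int i = 0; i < 3; i++)
		assert(break_v1(c[i][0], c[i][1], c[i][2]) ==
		       break_v2(c[i][0], c[i][1], c[i][2]));
	return 0;
}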

>> basically it checks that @target is in [last_flush, cur_seq],
>> it can use such a comment + a note about underflows and using
>> the modulus arithmetic, like with algebraic rings
-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 2/2] io_uring: flush timeouts that should already have expired
  2021-01-12 20:47         ` Pavel Begunkov
  2021-01-13 14:41           ` Marcelo Diop-Gonzalez
@ 2021-01-14  0:46           ` Marcelo Diop-Gonzalez
  2021-01-14 21:04             ` Pavel Begunkov
  1 sibling, 1 reply; 16+ messages in thread
From: Marcelo Diop-Gonzalez @ 2021-01-14  0:46 UTC (permalink / raw)
  To: Pavel Begunkov; +Cc: Jens Axboe, io-uring

On Tue, Jan 12, 2021 at 08:47:11PM +0000, Pavel Begunkov wrote:
> On 08/01/2021 15:57, Marcelo Diop-Gonzalez wrote:
> > On Sat, Jan 02, 2021 at 08:26:26PM +0000, Pavel Begunkov wrote:
> >> On 02/01/2021 19:54, Pavel Begunkov wrote:
> >>> On 19/12/2020 19:15, Marcelo Diop-Gonzalez wrote:
> >>>> Right now io_flush_timeouts() checks if the current number of events
> >>>> is equal to ->timeout.target_seq, but this will miss some timeouts if
> >>>> there have been more than 1 event added since the last time they were
> >>>> flushed (possible in io_submit_flush_completions(), for example). Fix
> >>>> it by recording the starting value of ->cached_cq_overflow -
> >>>> ->cq_timeouts instead of the target value, so that we can safely
> >>>> (without overflow problems) compare the number of events that have
> >>>> happened with the number of events needed to trigger the timeout.
> >>
> >> https://www.spinics.net/lists/kernel/msg3475160.html
> >>
> >> The idea was to replace u32 cached_cq_tail with u64 while keeping
> >> timeout offsets u32. Assuming that we won't ever hit ~2^62 inflight
> >> requests, complete all requests falling into some large enough window
> >> behind that u64 cached_cq_tail.
> >>
> >> simplifying:
> >>
> >> i64 d = target_off - ctx->u64_cq_tail
> >> if (d <= 0 && d > -2^32)
> >> 	complete_it()
> >>
> >> Not fond  of it, but at least worked at that time. You can try out
> >> this approach if you want, but would be perfect if you would find
> >> something more elegant :)
> >>
> > 
> > What do you think about something like this? I think it's not totally
> > correct because it relies on having ->completion_lock in io_timeout() so
> > that ->cq_last_tm_flushed is updated, but in case of IORING_SETUP_IOPOLL,
> > io_iopoll_complete() doesn't take that lock, and ->uring_lock will not
> > be held if io_timeout() is called from io_wq_submit_work(), but maybe
> > could still be worth it since that was already possibly a problem?
> > 
> > diff --git a/fs/io_uring.c b/fs/io_uring.c
> > index cb57e0360fcb..50984709879c 100644
> > --- a/fs/io_uring.c
> > +++ b/fs/io_uring.c
> > @@ -353,6 +353,7 @@ struct io_ring_ctx {
> >  		unsigned		cq_entries;
> >  		unsigned		cq_mask;
> >  		atomic_t		cq_timeouts;
> > +		unsigned		cq_last_tm_flush;
> 
> It looks like that "last flush" is a good direction.
> I think there can be problems at extremes like completing 2^32
> requests at once, but should be ok in practice. Anyway better
> than it's now.
> 
> What about the first patch about overflows and cq_timeouts? I
> assume that problem is still there, isn't it?
> 
> See comments below, but if it passes liburing tests, please send
> a patch.
> 
> >  		unsigned long		cq_check_overflow;
> >  		struct wait_queue_head	cq_wait;
> >  		struct fasync_struct	*cq_fasync;
> > @@ -1633,19 +1634,26 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx)
> >  
> >  static void io_flush_timeouts(struct io_ring_ctx *ctx)
> >  {
> > +	u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
> > +
> 
> a nit,
> 
> if (list_empty()) return; + do {} while();

Ah btw, so then we would have to add ->last_flush = seq in
io_timeout() too? I think that should be correct but just wanna make
sure that's what you meant.  Because otherwise if list_empty() is true
for a while without updating ->last_flush then there could be
problems. Like for example if there are no timeouts for a while, and
seq == 2^32-2, then we add a timeout with off == 4. If last_flush is
still 0 then target-last_flush == 2, but seq - last_flush == 2^32-2
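
In concrete numbers (again only a standalone sketch of the unsigned
arithmetic, not kernel code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t last_flush = 0;	/* stale: never updated while the list was empty */
	uint32_t seq = UINT32_MAX - 1;	/* 2^32 - 2 events so far */
	uint32_t target = seq + 4;	/* new timeout with off = 4; wraps around to 2 */

	uint32_t events_needed = target - last_flush;	/* 2 */
	uint32_t events_got = seq - last_flush;		/* 2^32 - 2 */

	/* events_got >= events_needed, so this brand new timeout would be
	 * flushed immediately even though none of its 4 events have happened */
	printf("needed=%u got=%u\n", events_needed, events_got);
	return 0;
}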

> 
> timeouts can be rare enough
> 
> >  	while (!list_empty(&ctx->timeout_list)) {
> > +		u32 events_needed, events_got;
> >  		struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
> >  						struct io_kiocb, timeout.list);
> >  
> >  		if (io_is_timeout_noseq(req))
> >  			break;
> > -		if (req->timeout.target_seq != ctx->cached_cq_tail
> > -					- atomic_read(&ctx->cq_timeouts))
> > +
> 
> extra new line
> 
> > +		events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
> > +		events_got = seq - ctx->cq_last_tm_flush;
> > +		if (events_got < events_needed) 
> 
> probably <=
> 
> >  			break;
> 
> basically it checks that @target is in [last_flush, cur_seq],
> it can use such a comment + a note about underflows and using
> the modulus arithmetic, like with algebraic rings
> 
> >  
> >  		list_del_init(&req->timeout.list);
> >  		io_kill_timeout(req);
> >  	}
> > +
> > +	ctx->cq_last_tm_flush = seq;
> >  }
> >  
> >  static void io_commit_cqring(struct io_ring_ctx *ctx)
> > 
> 
> -- 
> Pavel Begunkov

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 2/2] io_uring: flush timeouts that should already have expired
  2021-01-14  0:46           ` Marcelo Diop-Gonzalez
@ 2021-01-14 21:04             ` Pavel Begunkov
  0 siblings, 0 replies; 16+ messages in thread
From: Pavel Begunkov @ 2021-01-14 21:04 UTC (permalink / raw)
  To: Marcelo Diop-Gonzalez; +Cc: Jens Axboe, io-uring

On 14/01/2021 00:46, Marcelo Diop-Gonzalez wrote:
> On Tue, Jan 12, 2021 at 08:47:11PM +0000, Pavel Begunkov wrote:
>> On 08/01/2021 15:57, Marcelo Diop-Gonzalez wrote:
>>> On Sat, Jan 02, 2021 at 08:26:26PM +0000, Pavel Begunkov wrote:
>>>> On 02/01/2021 19:54, Pavel Begunkov wrote:
>>>>> On 19/12/2020 19:15, Marcelo Diop-Gonzalez wrote:
>>>>>> Right now io_flush_timeouts() checks if the current number of events
>>>>>> is equal to ->timeout.target_seq, but this will miss some timeouts if
>>>>>> there have been more than 1 event added since the last time they were
>>>>>> flushed (possible in io_submit_flush_completions(), for example). Fix
>>>>>> it by recording the starting value of ->cached_cq_overflow -
>>>>>> ->cq_timeouts instead of the target value, so that we can safely
>>>>>> (without overflow problems) compare the number of events that have
>>>>>> happened with the number of events needed to trigger the timeout.
>>>>
>>>> https://www.spinics.net/lists/kernel/msg3475160.html
>>>>
>>>> The idea was to replace u32 cached_cq_tail with u64 while keeping
>>>> timeout offsets u32. Assuming that we won't ever hit ~2^62 inflight
>>>> requests, complete all requests falling into some large enough window
>>>> behind that u64 cached_cq_tail.
>>>>
>>>> simplifying:
>>>>
>>>> i64 d = target_off - ctx->u64_cq_tail
>>>> if (d <= 0 && d > -2^32)
>>>> 	complete_it()
>>>>
>>>> Not fond  of it, but at least worked at that time. You can try out
>>>> this approach if you want, but would be perfect if you would find
>>>> something more elegant :)
>>>>
>>>
>>> What do you think about something like this? I think it's not totally
>>> correct because it relies on having ->completion_lock in io_timeout() so
>>> that ->cq_last_tm_flushed is updated, but in case of IORING_SETUP_IOPOLL,
>>> io_iopoll_complete() doesn't take that lock, and ->uring_lock will not
>>> be held if io_timeout() is called from io_wq_submit_work(), but maybe
>>> could still be worth it since that was already possibly a problem?
>>>
>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>> index cb57e0360fcb..50984709879c 100644
>>> --- a/fs/io_uring.c
>>> +++ b/fs/io_uring.c
>>> @@ -353,6 +353,7 @@ struct io_ring_ctx {
>>>  		unsigned		cq_entries;
>>>  		unsigned		cq_mask;
>>>  		atomic_t		cq_timeouts;
>>> +		unsigned		cq_last_tm_flush;
>>
>> It looks like that "last flush" is a good direction.
>> I think there can be problems at extremes like completing 2^32
>> requests at once, but should be ok in practice. Anyway better
>> than it's now.
>>
>> What about the first patch about overflows and cq_timeouts? I
>> assume that problem is still there, isn't it?
>>
>> See comments below, but if it passes liburing tests, please send
>> a patch.
>>
>>>  		unsigned long		cq_check_overflow;
>>>  		struct wait_queue_head	cq_wait;
>>>  		struct fasync_struct	*cq_fasync;
>>> @@ -1633,19 +1634,26 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx)
>>>  
>>>  static void io_flush_timeouts(struct io_ring_ctx *ctx)
>>>  {
>>> +	u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
>>> +
>>
>> a nit, 
>>
>> if (list_empty()) return; + do {} while();
> 
> Ah btw, so then we would have to add ->last_flush = seq in
> io_timeout() too? I think that should be correct but just wanna make
> sure that's what you meant.  Because otherwise if list_empty() is true
> for a while without updating ->last_flush then there could be
> problems. Like for example if there are no timeouts for a while, and
> seq == 2^32-2, then we add a timeout with off == 4. If last_flush is
> still 0 then target-last_flush == 2, but seq - last_flush == 2^32-2

You've just answered your own question :) You need to update it somehow,
either unconditionally on commit, or in io_timeout(), or somewhere else.

btw, I like your idea of doing it in io_timeout(), because it adds a
timeout anyway, so list_empty() stops being true and that kind of
automatically keeps the tracking up to date.
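
As a rough sketch of how that could fit together, a toy userspace model of
the scheme — a single pending timeout instead of a list, illustrative names
only, not the actual io_uring code:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Toy model: one pending timeout instead of ctx->timeout_list. */
struct toy_ctx {
	uint32_t seq;		/* cached_cq_tail - cq_timeouts */
	uint32_t last_flush;	/* cq_last_tm_flush */
	bool	 have_timeout;
	uint32_t target_seq;
};

/* io_timeout(): record the target and refresh last_flush, since the
 * flush below skips the update while no timeout is queued. */
static void add_timeout(struct toy_ctx *ctx, uint32_t off)
{
	ctx->last_flush = ctx->seq;
	ctx->target_seq = ctx->seq + off;
	ctx->have_timeout = true;
}

/* io_flush_timeouts(): fire if target lies in [last_flush, seq]. */
static void flush_timeouts(struct toy_ctx *ctx)
{
	if (!ctx->have_timeout)
		return;
	if (ctx->seq - ctx->last_flush >= ctx->target_seq - ctx->last_flush) {
		printf("timeout fired at seq=%u\n", ctx->seq);
		ctx->have_timeout = false;
	}
	ctx->last_flush = ctx->seq;
}

int main(void)
{
	struct toy_ctx ctx = { .seq = UINT32_MAX - 1 };	/* right before wrap */

	add_timeout(&ctx, 4);
	for (int i = 0; i < 6; i++) {
		ctx.seq++;		/* one more CQE posted */
		flush_timeouts(&ctx);	/* fires on the 4th event, despite the wrap */
	}
	return 0;
}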

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread

Thread overview: 16+ messages
2020-12-19 19:15 [PATCH v2 0/2] io_uring: fix skipping of old timeout events Marcelo Diop-Gonzalez
2020-12-19 19:15 ` [PATCH v2 1/2] io_uring: only increment ->cq_timeouts along with ->cached_cq_tail Marcelo Diop-Gonzalez
2021-01-02 20:03   ` Pavel Begunkov
2021-01-04 16:49     ` Marcelo Diop-Gonzalez
2020-12-19 19:15 ` [PATCH v2 2/2] io_uring: flush timeouts that should already have expired Marcelo Diop-Gonzalez
2021-01-02 19:54   ` Pavel Begunkov
2021-01-02 20:26     ` Pavel Begunkov
2021-01-08 15:57       ` Marcelo Diop-Gonzalez
2021-01-11  4:57         ` Pavel Begunkov
2021-01-11 15:28           ` Marcelo Diop-Gonzalez
2021-01-12 20:47         ` Pavel Begunkov
2021-01-13 14:41           ` Marcelo Diop-Gonzalez
2021-01-13 15:20             ` Pavel Begunkov
2021-01-14  0:46           ` Marcelo Diop-Gonzalez
2021-01-14 21:04             ` Pavel Begunkov
2021-01-04 17:56     ` Marcelo Diop-Gonzalez
