* [PATCH] io_uring: export cq overflow status to userspace
@ 2020-07-07 13:24 Xiaoguang Wang
  2020-07-07 14:28 ` Jens Axboe
  0 siblings, 1 reply; 18+ messages in thread
From: Xiaoguang Wang @ 2020-07-07 13:24 UTC (permalink / raw)
  To: io-uring; +Cc: axboe, joseph.qi, Xiaoguang Wang

Applications that are not willing to use io_uring_enter() to reap
and handle cqes may rely completely on liburing's io_uring_peek_cqe(),
but if the cq ring has overflowed, io_uring_peek_cqe() is currently
not aware of the overflow, so it won't enter the kernel to flush
cqes; the test program below can reveal this bug:

static void test_cq_overflow(struct io_uring *ring)
{
        struct io_uring_cqe *cqe;
        struct io_uring_sqe *sqe;
        int issued = 0;
        int ret = 0;

        do {
                sqe = io_uring_get_sqe(ring);
                if (!sqe) {
                        fprintf(stderr, "get sqe failed\n");
                        break;
                }
                ret = io_uring_submit(ring);
                if (ret <= 0) {
                        if (ret != -EBUSY)
                                fprintf(stderr, "sqe submit failed: %d\n", ret);
                        break;
                }
                issued++;
        } while (ret > 0);
        assert(ret == -EBUSY);

        printf("issued requests: %d\n", issued);

        while (issued) {
                ret = io_uring_peek_cqe(ring, &cqe);
                if (ret) {
                        if (ret != -EAGAIN) {
                                fprintf(stderr, "peek completion failed: %s\n",
                                        strerror(ret));
                                break;
                        }
                        printf("left requests: %d\n", issued);
                        continue;
                }
                io_uring_cqe_seen(ring, cqe);
                issued--;
                printf("left requests: %d\n", issued);
        }
}

int main(int argc, char *argv[])
{
        int ret;
        struct io_uring ring;

        ret = io_uring_queue_init(16, &ring, 0);
        if (ret) {
                fprintf(stderr, "ring setup failed: %d\n", ret);
                return 1;
        }

        test_cq_overflow(&ring);
        return 0;
}
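
For reference, the reproducer above can be built against liburing with
something like the following (assuming the program is saved as
cq_overflow.c):

    gcc -O2 -o cq_overflow cq_overflow.c -luring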

To fix this issue, export the cq overflow status to userspace; then
helper functions in liburing, such as io_uring_peek_cqe(), can be
aware of the cq overflow and flush accordingly.
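
As a rough sketch (not part of this patch) of how the liburing side
could use the exported status: assume liburing mmaps the new field via
cq_off.check_overflow and stores the pointer as cq->kcheck_overflow;
the field and helper names below are hypothetical.

static inline bool cq_ring_needs_flush(struct io_uring *ring)
{
	/* hypothetical pointer, mmap'd from cq_off.check_overflow */
	return IO_URING_READ_ONCE(*ring->cq.kcheck_overflow) != 0;
}

static int io_uring_peek_cqe_flush(struct io_uring *ring,
				   struct io_uring_cqe **cqe_ptr)
{
	int err;

	err = __io_uring_peek_cqe(ring, cqe_ptr);
	if (err || *cqe_ptr)
		return err;
	/* CQ looks empty, but overflowed cqes are held in the kernel */
	if (cq_ring_needs_flush(ring))
		return __sys_io_uring_enter(ring->ring_fd, 0, 0,
					    IORING_ENTER_GETEVENTS, NULL);
	return -EAGAIN;
}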

Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
---
 fs/io_uring.c                 | 12 ++++++++++++
 include/uapi/linux/io_uring.h |  2 +-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index d37d7ea5ebe5..30f50d72b6d5 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -157,6 +157,14 @@ struct io_rings {
 	 * kernel.
 	 */
 	u32                     cq_flags;
+	/*
+	 * Runtime CQ overflow flag
+	 *
+	 * Written by the kernel, shouldn't be modified by the
+	 * application. Set when the CQ ring has overflowed and
+	 * completion events are pending in the overflow backlog.
+	 */
+	u32                     cq_check_overflow;
 	/*
 	 * Number of completion events lost because the queue was full;
 	 * this should be avoided by the application by making sure
@@ -1274,6 +1282,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 	if (cqe) {
 		clear_bit(0, &ctx->sq_check_overflow);
 		clear_bit(0, &ctx->cq_check_overflow);
+		WRITE_ONCE(ctx->rings->cq_check_overflow, 0);
 	}
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 	io_cqring_ev_posted(ctx);
@@ -1311,6 +1320,7 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
 		if (list_empty(&ctx->cq_overflow_list)) {
 			set_bit(0, &ctx->sq_check_overflow);
 			set_bit(0, &ctx->cq_check_overflow);
+			WRITE_ONCE(ctx->rings->cq_check_overflow, 1);
 		}
 		req->flags |= REQ_F_OVERFLOW;
 		refcount_inc(&req->refs);
@@ -7488,6 +7498,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
 			if (list_empty(&ctx->cq_overflow_list)) {
 				clear_bit(0, &ctx->sq_check_overflow);
 				clear_bit(0, &ctx->cq_check_overflow);
+				WRITE_ONCE(ctx->rings->cq_check_overflow, 0);
 			}
 			spin_unlock_irq(&ctx->completion_lock);
 
@@ -7960,6 +7971,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
 	p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
 	p->cq_off.cqes = offsetof(struct io_rings, cqes);
 	p->cq_off.flags = offsetof(struct io_rings, cq_flags);
+	p->cq_off.check_overflow = offsetof(struct io_rings, cq_check_overflow);
 
 	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
 			IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 92c22699a5a7..2ae6adc6d22d 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -206,7 +206,7 @@ struct io_cqring_offsets {
 	__u32 overflow;
 	__u32 cqes;
 	__u32 flags;
-	__u32 resv1;
+	__u32 check_overflow;
 	__u64 resv2;
 };
 
-- 
2.17.2



* Re: [PATCH] io_uring: export cq overflow status to userspace
  2020-07-07 13:24 [PATCH] io_uring: export cq overflow status to userspace Xiaoguang Wang
@ 2020-07-07 14:28 ` Jens Axboe
  2020-07-07 16:21   ` Jens Axboe
  2020-07-07 16:29   ` Xiaoguang Wang
  0 siblings, 2 replies; 18+ messages in thread
From: Jens Axboe @ 2020-07-07 14:28 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: joseph.qi

On 7/7/20 7:24 AM, Xiaoguang Wang wrote:
> Applications that are not willing to use io_uring_enter() to reap
> and handle cqes may rely completely on liburing's io_uring_peek_cqe(),
> but if the cq ring has overflowed, io_uring_peek_cqe() is currently
> not aware of the overflow, so it won't enter the kernel to flush
> cqes; the test program below can reveal this bug:
> 
> static void test_cq_overflow(struct io_uring *ring)
> {
>         struct io_uring_cqe *cqe;
>         struct io_uring_sqe *sqe;
>         int issued = 0;
>         int ret = 0;
> 
>         do {
>                 sqe = io_uring_get_sqe(ring);
>                 if (!sqe) {
>                         fprintf(stderr, "get sqe failed\n");
>                         break;
>                 }
>                 ret = io_uring_submit(ring);
>                 if (ret <= 0) {
>                         if (ret != -EBUSY)
>                                 fprintf(stderr, "sqe submit failed: %d\n", ret);
>                         break;
>                 }
>                 issued++;
>         } while (ret > 0);
>         assert(ret == -EBUSY);
> 
>         printf("issued requests: %d\n", issued);
> 
>         while (issued) {
>                 ret = io_uring_peek_cqe(ring, &cqe);
>                 if (ret) {
>                         if (ret != -EAGAIN) {
>                                 fprintf(stderr, "peek completion failed: %s\n",
>                                         strerror(ret));
>                                 break;
>                         }
>                         printf("left requests: %d\n", issued);
>                         continue;
>                 }
>                 io_uring_cqe_seen(ring, cqe);
>                 issued--;
>                 printf("left requests: %d\n", issued);
>         }
> }
> 
> int main(int argc, char *argv[])
> {
>         int ret;
>         struct io_uring ring;
> 
>         ret = io_uring_queue_init(16, &ring, 0);
>         if (ret) {
>                 fprintf(stderr, "ring setup failed: %d\n", ret);
>                 return 1;
>         }
> 
>         test_cq_overflow(&ring);
>         return 0;
> }
> 
> To fix this issue, export the cq overflow status to userspace; then
> helper functions in liburing, such as io_uring_peek_cqe(), can be
> aware of the cq overflow and flush accordingly.

Is there any way we can accomplish the same without exporting
another set of flags? Would it be enough for the SQPOLL thread to set
IORING_SQ_NEED_WAKEUP if we're in overflow condition? That should
result in the app entering the kernel when it's flushed the user CQ
side, and then the sqthread could attempt to flush the pending
events as well.

Something like this, totally untested...

diff --git a/fs/io_uring.c b/fs/io_uring.c
index d37d7ea5ebe5..d409bd68553f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -6110,8 +6110,18 @@ static int io_sq_thread(void *data)
 		}
 
 		mutex_lock(&ctx->uring_lock);
-		if (likely(!percpu_ref_is_dying(&ctx->refs)))
+		if (likely(!percpu_ref_is_dying(&ctx->refs))) {
+retry:
 			ret = io_submit_sqes(ctx, to_submit, NULL, -1);
+			if (unlikely(ret == -EBUSY)) {
+				ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
+				smp_mb();
+				if (io_cqring_overflow_flush(ctx, false)) {
+					ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
+					goto retry;
+				}
+			}
+		}
 		mutex_unlock(&ctx->uring_lock);
 		timeout = jiffies + ctx->sq_thread_idle;
 	}

-- 
Jens Axboe



* Re: [PATCH] io_uring: export cq overflow status to userspace
  2020-07-07 14:28 ` Jens Axboe
@ 2020-07-07 16:21   ` Jens Axboe
  2020-07-07 16:25     ` Pavel Begunkov
                       ` (2 more replies)
  2020-07-07 16:29   ` Xiaoguang Wang
  1 sibling, 3 replies; 18+ messages in thread
From: Jens Axboe @ 2020-07-07 16:21 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: joseph.qi

On 7/7/20 8:28 AM, Jens Axboe wrote:
> On 7/7/20 7:24 AM, Xiaoguang Wang wrote:
>> Applications that are not willing to use io_uring_enter() to reap
>> and handle cqes may rely completely on liburing's io_uring_peek_cqe(),
>> but if the cq ring has overflowed, io_uring_peek_cqe() is currently
>> not aware of the overflow, so it won't enter the kernel to flush
>> cqes; the test program below can reveal this bug:
>>
>> static void test_cq_overflow(struct io_uring *ring)
>> {
>>         struct io_uring_cqe *cqe;
>>         struct io_uring_sqe *sqe;
>>         int issued = 0;
>>         int ret = 0;
>>
>>         do {
>>                 sqe = io_uring_get_sqe(ring);
>>                 if (!sqe) {
>>                         fprintf(stderr, "get sqe failed\n");
>>                         break;
>>                 }
>>                 ret = io_uring_submit(ring);
>>                 if (ret <= 0) {
>>                         if (ret != -EBUSY)
>>                                 fprintf(stderr, "sqe submit failed: %d\n", ret);
>>                         break;
>>                 }
>>                 issued++;
>>         } while (ret > 0);
>>         assert(ret == -EBUSY);
>>
>>         printf("issued requests: %d\n", issued);
>>
>>         while (issued) {
>>                 ret = io_uring_peek_cqe(ring, &cqe);
>>                 if (ret) {
>>                         if (ret != -EAGAIN) {
>>                                 fprintf(stderr, "peek completion failed: %s\n",
>>                                         strerror(ret));
>>                                 break;
>>                         }
>>                         printf("left requests: %d\n", issued);
>>                         continue;
>>                 }
>>                 io_uring_cqe_seen(ring, cqe);
>>                 issued--;
>>                 printf("left requests: %d\n", issued);
>>         }
>> }
>>
>> int main(int argc, char *argv[])
>> {
>>         int ret;
>>         struct io_uring ring;
>>
>>         ret = io_uring_queue_init(16, &ring, 0);
>>         if (ret) {
>>                 fprintf(stderr, "ring setup failed: %d\n", ret);
>>                 return 1;
>>         }
>>
>>         test_cq_overflow(&ring);
>>         return 0;
>> }
>>
>> To fix this issue, export the cq overflow status to userspace; then
>> helper functions in liburing, such as io_uring_peek_cqe(), can be
>> aware of the cq overflow and flush accordingly.
> 
> Is there any way we can accomplish the same without exporting
> another set of flags? Would it be enough for the SQPOLL thread to set
> IORING_SQ_NEED_WAKEUP if we're in overflow condition? That should
> result in the app entering the kernel when it's flushed the user CQ
> side, and then the sqthread could attempt to flush the pending
> events as well.
> 
> Something like this, totally untested...

OK, took a closer look at this, it's a generic thing, not just
SQPOLL related. My bad!

Anyway, my suggestion would be to add IORING_SQ_CQ_OVERFLOW to the
existing flags, and then make a liburing change almost identical to
what you had.

Hence kernel side:


diff --git a/fs/io_uring.c b/fs/io_uring.c
index d37d7ea5ebe5..af9fd5cefc51 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1234,11 +1234,12 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 	struct io_uring_cqe *cqe;
 	struct io_kiocb *req;
 	unsigned long flags;
+	bool ret = true;
 	LIST_HEAD(list);
 
 	if (!force) {
 		if (list_empty_careful(&ctx->cq_overflow_list))
-			return true;
+			goto done;
 		if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
 		    rings->cq_ring_entries))
 			return false;
@@ -1284,7 +1285,11 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 		io_put_req(req);
 	}
 
-	return cqe != NULL;
+	ret = cqe != NULL;
+done:
+	if (ret)
+		ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
+	return ret;
 }
 
 static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
@@ -5933,10 +5938,13 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 	int i, submitted = 0;
 
 	/* if we have a backlog and couldn't flush it all, return BUSY */
-	if (test_bit(0, &ctx->sq_check_overflow)) {
+	if (unlikely(test_bit(0, &ctx->sq_check_overflow))) {
 		if (!list_empty(&ctx->cq_overflow_list) &&
-		    !io_cqring_overflow_flush(ctx, false))
+		    !io_cqring_overflow_flush(ctx, false)) {
+			ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
+			smp_mb();
 			return -EBUSY;
+		}
 	}
 
 	/* make sure SQ entry isn't read before tail */
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 92c22699a5a7..9c7e028beda5 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -197,6 +197,7 @@ struct io_sqring_offsets {
  * sq_ring->flags
  */
 #define IORING_SQ_NEED_WAKEUP	(1U << 0) /* needs io_uring_enter wakeup */
+#define IORING_SQ_CQ_OVERFLOW	(1U << 1) /* app needs to enter kernel */
 
 struct io_cqring_offsets {
 	__u32 head;

and then this for the liburing side:


diff --git a/src/include/liburing/io_uring.h b/src/include/liburing/io_uring.h
index 6a73522..e4314ed 100644
--- a/src/include/liburing/io_uring.h
+++ b/src/include/liburing/io_uring.h
@@ -202,6 +202,7 @@ struct io_sqring_offsets {
  * sq_ring->flags
  */
 #define IORING_SQ_NEED_WAKEUP	(1U << 0) /* needs io_uring_enter wakeup */
+#define IORING_SQ_CQ_OVERFLOW	(1U << 1)
 
 struct io_cqring_offsets {
 	__u32 head;
diff --git a/src/queue.c b/src/queue.c
index 88e0294..1f00251 100644
--- a/src/queue.c
+++ b/src/queue.c
@@ -32,6 +32,11 @@ static inline bool sq_ring_needs_enter(struct io_uring *ring,
 	return false;
 }
 
+static inline bool cq_ring_needs_flush(struct io_uring *ring)
+{
+	return IO_URING_READ_ONCE(*ring->sq.kflags) & IORING_SQ_CQ_OVERFLOW;
+}
+
 static int __io_uring_peek_cqe(struct io_uring *ring,
 			       struct io_uring_cqe **cqe_ptr)
 {
@@ -67,22 +72,26 @@ int __io_uring_get_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr,
 	int ret = 0, err;
 
 	do {
+		bool cq_overflow_flush = false;
 		unsigned flags = 0;
 
 		err = __io_uring_peek_cqe(ring, &cqe);
 		if (err)
 			break;
 		if (!cqe && !to_wait && !submit) {
-			err = -EAGAIN;
-			break;
+			if (!cq_ring_needs_flush(ring)) {
+				err = -EAGAIN;
+				break;
+			}
+			cq_overflow_flush = true;
 		}
 		if (wait_nr && cqe)
 			wait_nr--;
-		if (wait_nr)
+		if (wait_nr || cq_overflow_flush)
 			flags = IORING_ENTER_GETEVENTS;
 		if (submit)
 			sq_ring_needs_enter(ring, submit, &flags);
-		if (wait_nr || submit)
+		if (wait_nr || submit || cq_overflow_flush)
 			ret = __sys_io_uring_enter(ring->ring_fd, submit,
 						   wait_nr, flags, sigmask);
 		if (ret < 0) {

If you agree with this approach, could you test this and resubmit the
two patches?

-- 
Jens Axboe



* Re: [PATCH] io_uring: export cq overflow status to userspace
  2020-07-07 16:21   ` Jens Axboe
@ 2020-07-07 16:25     ` Pavel Begunkov
  2020-07-07 16:30       ` Jens Axboe
  2020-07-07 16:36     ` Xiaoguang Wang
  2020-07-08  3:25     ` Xiaoguang Wang
  2 siblings, 1 reply; 18+ messages in thread
From: Pavel Begunkov @ 2020-07-07 16:25 UTC (permalink / raw)
  To: Jens Axboe, Xiaoguang Wang, io-uring; +Cc: joseph.qi

...
>>>
>>> To fix this issue, export the cq overflow status to userspace; then
>>> helper functions in liburing, such as io_uring_peek_cqe(), can be
>>> aware of the cq overflow and flush accordingly.
>>
>> Is there any way we can accomplish the same without exporting
>> another set of flags? Would it be enough for the SQPOLL thread to set
>> IORING_SQ_NEED_WAKEUP if we're in overflow condition? That should
>> result in the app entering the kernel when it's flushed the user CQ
>> side, and then the sqthread could attempt to flush the pending
>> events as well.
>>
>> Something like this, totally untested...
> 
> OK, took a closer look at this, it's a generic thing, not just
> SQPOLL related. My bad!
> 
> Anyway, my suggestion would be to add IORING_SQ_CQ_OVERFLOW to the
> existing flags, and then make a liburing change almost identical to
> what you had.

How about CQ being full as an indicator that flush is required?
With a properly set CQ size there shouldn't be many false positives.
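
For illustration, such a check could be done in userspace with the ring
fields liburing already maps; a minimal sketch (whether "full" is a
reliable flush indicator is exactly the open question here):

static inline bool cq_ring_is_full(struct io_uring *ring)
{
	/* every CQE slot in use: tail is a full ring ahead of head */
	return IO_URING_READ_ONCE(*ring->cq.ktail) -
	       IO_URING_READ_ONCE(*ring->cq.khead) == *ring->cq.kring_entries;
}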


-- 
Pavel Begunkov


* Re: [PATCH] io_uring: export cq overflow status to userspace
  2020-07-07 14:28 ` Jens Axboe
  2020-07-07 16:21   ` Jens Axboe
@ 2020-07-07 16:29   ` Xiaoguang Wang
  2020-07-07 16:30     ` Jens Axboe
  1 sibling, 1 reply; 18+ messages in thread
From: Xiaoguang Wang @ 2020-07-07 16:29 UTC (permalink / raw)
  To: Jens Axboe, io-uring; +Cc: joseph.qi

hi,

> On 7/7/20 7:24 AM, Xiaoguang Wang wrote:
>> Applications that are not willing to use io_uring_enter() to reap
>> and handle cqes may rely completely on liburing's io_uring_peek_cqe(),
>> but if the cq ring has overflowed, io_uring_peek_cqe() is currently
>> not aware of the overflow, so it won't enter the kernel to flush
>> cqes; the test program below can reveal this bug:
>>
>> static void test_cq_overflow(struct io_uring *ring)
>> {
>>          struct io_uring_cqe *cqe;
>>          struct io_uring_sqe *sqe;
>>          int issued = 0;
>>          int ret = 0;
>>
>>          do {
>>                  sqe = io_uring_get_sqe(ring);
>>                  if (!sqe) {
>>                          fprintf(stderr, "get sqe failed\n");
>>                  break;
>>                  }
>>                  ret = io_uring_submit(ring);
>>                  if (ret <= 0) {
>>                          if (ret != -EBUSY)
>>                                  fprintf(stderr, "sqe submit failed: %d\n", ret);
>>                          break;
>>                  }
>>                  issued++;
>>          } while (ret > 0);
>>          assert(ret == -EBUSY);
>>
>>          printf("issued requests: %d\n", issued);
>>
>>          while (issued) {
>>                  ret = io_uring_peek_cqe(ring, &cqe);
>>                  if (ret) {
>>                          if (ret != -EAGAIN) {
>>                                  fprintf(stderr, "peek completion failed: %s\n",
>>                                          strerror(ret));
>>                                  break;
>>                          }
>>                          printf("left requests: %d\n", issued);
>>                          continue;
>>                  }
>>                  io_uring_cqe_seen(ring, cqe);
>>                  issued--;
>>                  printf("left requests: %d\n", issued);
>>          }
>> }
>>
>> int main(int argc, char *argv[])
>> {
>>          int ret;
>>          struct io_uring ring;
>>
>>          ret = io_uring_queue_init(16, &ring, 0);
>>          if (ret) {
>>                  fprintf(stderr, "ring setup failed: %d\n", ret);
>>                  return 1;
>>          }
>>
>>          test_cq_overflow(&ring);
>>          return 0;
>> }
>>
>> To fix this issue, export the cq overflow status to userspace; then
>> helper functions in liburing, such as io_uring_peek_cqe(), can be
>> aware of the cq overflow and flush accordingly.
> 
> Is there any way we can accomplish the same without exporting
> another set of flags? 
I understand your concerns and will try to find a better method later,
but I'm not sure there is one :)

> Would it be enough for the SQPOLL thread to set
> IORING_SQ_NEED_WAKEUP if we're in overflow condition? That should
> result in the app entering the kernel when it's flushed the user CQ
> side, and then the sqthread could attempt to flush the pending
> events as well.
> 
> Something like this, totally untested...
I haven't tested your patch, but I think it doesn't work for the
non-sqpoll case; see my test program above, it doesn't have SQPOLL enabled.

Regards,
Xiaoguang Wang
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index d37d7ea5ebe5..d409bd68553f 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -6110,8 +6110,18 @@ static int io_sq_thread(void *data)
>   		}
>   
>   		mutex_lock(&ctx->uring_lock);
> -		if (likely(!percpu_ref_is_dying(&ctx->refs)))
> +		if (likely(!percpu_ref_is_dying(&ctx->refs))) {
> +retry:
>   			ret = io_submit_sqes(ctx, to_submit, NULL, -1);
> +			if (unlikely(ret == -EBUSY)) {
> +				ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
> +				smp_mb();
> +				if (io_cqring_overflow_flush(ctx, false)) {
> +					ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
> +					goto retry;
> +				}
> +			}
> +		}
>   		mutex_unlock(&ctx->uring_lock);
>   		timeout = jiffies + ctx->sq_thread_idle;
>   	}
> 


* Re: [PATCH] io_uring: export cq overflow status to userspace
  2020-07-07 16:25     ` Pavel Begunkov
@ 2020-07-07 16:30       ` Jens Axboe
  0 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2020-07-07 16:30 UTC (permalink / raw)
  To: Pavel Begunkov, Xiaoguang Wang, io-uring; +Cc: joseph.qi

On 7/7/20 10:25 AM, Pavel Begunkov wrote:
> ...
>>>>
>>>> To fix this issue, export the cq overflow status to userspace; then
>>>> helper functions in liburing, such as io_uring_peek_cqe(), can be
>>>> aware of the cq overflow and flush accordingly.
>>>
>>> Is there any way we can accomplish the same without exporting
>>> another set of flags? Would it be enough for the SQPOLL thread to set
>>> IORING_SQ_NEED_WAKEUP if we're in overflow condition? That should
>>> result in the app entering the kernel when it's flushed the user CQ
>>> side, and then the sqthread could attempt to flush the pending
>>> events as well.
>>>
>>> Something like this, totally untested...
>>
>> OK, took a closer look at this, it's a generic thing, not just
>> SQPOLL related. My bad!
>>
>> Anyway, my suggestion would be to add IORING_SQ_CQ_OVERFLOW to the
>> existing flags, and then make a liburing change almost identical to
>> what you had.
> 
> How about CQ being full as an indicator that flush is required?
> With a properly set CQ size there shouldn't be many false positives.

I guess my worry there would be that it makes the state more tricky.
When you do the first peek/get, the ring is full but you can't flush
at that point. You'd really want to flush when the ring is now empty,
AND it used to be full, and you get -EAGAIN.
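
In other words, the userspace loop would have to carry extra state,
roughly like this sketch (cq_ring_is_full() as sketched earlier,
handle_cqe() is an app-specific placeholder):

	bool was_full = false;

	for (;;) {
		struct io_uring_cqe *cqe;
		int ret;

		/* remember that the ring was full at some point */
		was_full |= cq_ring_is_full(ring);
		ret = io_uring_peek_cqe(ring, &cqe);
		if (ret == -EAGAIN && was_full) {
			/* drained after being full: enter to flush overflow */
			__sys_io_uring_enter(ring->ring_fd, 0, 0,
					     IORING_ENTER_GETEVENTS, NULL);
			was_full = false;
			continue;
		}
		if (ret)
			break;
		handle_cqe(cqe);
		io_uring_cqe_seen(ring, cqe);
	}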

-- 
Jens Axboe



* Re: [PATCH] io_uring: export cq overflow status to userspace
  2020-07-07 16:29   ` Xiaoguang Wang
@ 2020-07-07 16:30     ` Jens Axboe
  0 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2020-07-07 16:30 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: joseph.qi

On 7/7/20 10:29 AM, Xiaoguang Wang wrote:
> hi,
> 
>> On 7/7/20 7:24 AM, Xiaoguang Wang wrote:
>>> Applications that are not willing to use io_uring_enter() to reap
>>> and handle cqes may rely completely on liburing's io_uring_peek_cqe(),
>>> but if the cq ring has overflowed, io_uring_peek_cqe() is currently
>>> not aware of the overflow, so it won't enter the kernel to flush
>>> cqes; the test program below can reveal this bug:
>>>
>>> static void test_cq_overflow(struct io_uring *ring)
>>> {
>>>          struct io_uring_cqe *cqe;
>>>          struct io_uring_sqe *sqe;
>>>          int issued = 0;
>>>          int ret = 0;
>>>
>>>          do {
>>>                  sqe = io_uring_get_sqe(ring);
>>>                  if (!sqe) {
>>>                          fprintf(stderr, "get sqe failed\n");
>>>                          break;
>>>                  }
>>>                  ret = io_uring_submit(ring);
>>>                  if (ret <= 0) {
>>>                          if (ret != -EBUSY)
>>>                                  fprintf(stderr, "sqe submit failed: %d\n", ret);
>>>                          break;
>>>                  }
>>>                  issued++;
>>>          } while (ret > 0);
>>>          assert(ret == -EBUSY);
>>>
>>>          printf("issued requests: %d\n", issued);
>>>
>>>          while (issued) {
>>>                  ret = io_uring_peek_cqe(ring, &cqe);
>>>                  if (ret) {
>>>                          if (ret != -EAGAIN) {
>>>                                  fprintf(stderr, "peek completion failed: %s\n",
>>>                                          strerror(ret));
>>>                                  break;
>>>                          }
>>>                          printf("left requests: %d\n", issued);
>>>                          continue;
>>>                  }
>>>                  io_uring_cqe_seen(ring, cqe);
>>>                  issued--;
>>>                  printf("left requests: %d\n", issued);
>>>          }
>>> }
>>>
>>> int main(int argc, char *argv[])
>>> {
>>>          int ret;
>>>          struct io_uring ring;
>>>
>>>          ret = io_uring_queue_init(16, &ring, 0);
>>>          if (ret) {
>>>                  fprintf(stderr, "ring setup failed: %d\n", ret);
>>>                  return 1;
>>>          }
>>>
>>>          test_cq_overflow(&ring);
>>>          return 0;
>>> }
>>>
>>> To fix this issue, export the cq overflow status to userspace; then
>>> helper functions in liburing, such as io_uring_peek_cqe(), can be
>>> aware of the cq overflow and flush accordingly.
>>
>> Is there any way we can accomplish the same without exporting
>> another set of flags? 

> I understand your concerns and will try to find a better method
> later, but I'm not sure there is one :)

Ignore this one and look at the one I just sent, hopefully that'll be
better for you.

-- 
Jens Axboe



* Re: [PATCH] io_uring: export cq overflow status to userspace
  2020-07-07 16:21   ` Jens Axboe
  2020-07-07 16:25     ` Pavel Begunkov
@ 2020-07-07 16:36     ` Xiaoguang Wang
  2020-07-07 17:23       ` Jens Axboe
  2020-07-08  3:25     ` Xiaoguang Wang
  2 siblings, 1 reply; 18+ messages in thread
From: Xiaoguang Wang @ 2020-07-07 16:36 UTC (permalink / raw)
  To: Jens Axboe, io-uring; +Cc: joseph.qi

hi,

> On 7/7/20 8:28 AM, Jens Axboe wrote:
>> On 7/7/20 7:24 AM, Xiaoguang Wang wrote:
>>> Applications that are not willing to use io_uring_enter() to reap
>>> and handle cqes may rely completely on liburing's io_uring_peek_cqe(),
>>> but if the cq ring has overflowed, io_uring_peek_cqe() is currently
>>> not aware of the overflow, so it won't enter the kernel to flush
>>> cqes; the test program below can reveal this bug:
>>>
>>> static void test_cq_overflow(struct io_uring *ring)
>>> {
>>>          struct io_uring_cqe *cqe;
>>>          struct io_uring_sqe *sqe;
>>>          int issued = 0;
>>>          int ret = 0;
>>>
>>>          do {
>>>                  sqe = io_uring_get_sqe(ring);
>>>                  if (!sqe) {
>>>                          fprintf(stderr, "get sqe failed\n");
>>>                          break;
>>>                  }
>>>                  ret = io_uring_submit(ring);
>>>                  if (ret <= 0) {
>>>                          if (ret != -EBUSY)
>>>                                  fprintf(stderr, "sqe submit failed: %d\n", ret);
>>>                          break;
>>>                  }
>>>                  issued++;
>>>          } while (ret > 0);
>>>          assert(ret == -EBUSY);
>>>
>>>          printf("issued requests: %d\n", issued);
>>>
>>>          while (issued) {
>>>                  ret = io_uring_peek_cqe(ring, &cqe);
>>>                  if (ret) {
>>>                          if (ret != -EAGAIN) {
>>>                                  fprintf(stderr, "peek completion failed: %s\n",
>>>                                          strerror(ret));
>>>                                  break;
>>>                          }
>>>                          printf("left requests: %d\n", issued);
>>>                          continue;
>>>                  }
>>>                  io_uring_cqe_seen(ring, cqe);
>>>                  issued--;
>>>                  printf("left requests: %d\n", issued);
>>>          }
>>> }
>>>
>>> int main(int argc, char *argv[])
>>> {
>>>          int ret;
>>>          struct io_uring ring;
>>>
>>>          ret = io_uring_queue_init(16, &ring, 0);
>>>          if (ret) {
>>>                  fprintf(stderr, "ring setup failed: %d\n", ret);
>>>                  return 1;
>>>          }
>>>
>>>          test_cq_overflow(&ring);
>>>          return 0;
>>> }
>>>
>>> To fix this issue, export the cq overflow status to userspace; then
>>> helper functions in liburing, such as io_uring_peek_cqe(), can be
>>> aware of the cq overflow and flush accordingly.
>>
>> Is there any way we can accomplish the same without exporting
>> another set of flags? Would it be enough for the SQPOLL thread to set
>> IORING_SQ_NEED_WAKEUP if we're in overflow condition? That should
>> result in the app entering the kernel when it's flushed the user CQ
>> side, and then the sqthread could attempt to flush the pending
>> events as well.
>>
>> Something like this, totally untested...
> 
> OK, took a closer look at this, it's a generic thing, not just
> SQPOLL related. My bad!
> 
> Anyway, my suggestion would be to add IORING_SQ_CQ_OVERFLOW to the
> existing flags, and then make a liburing change almost identical to
> what you had.
Thanks.
It's somewhat late today, I'll test and send these two patches tomorrow.

Regards,
Xiaoguang Wang

> 
> Hence kernel side:
> 
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index d37d7ea5ebe5..af9fd5cefc51 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -1234,11 +1234,12 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
>   	struct io_uring_cqe *cqe;
>   	struct io_kiocb *req;
>   	unsigned long flags;
> +	bool ret = true;
>   	LIST_HEAD(list);
>   
>   	if (!force) {
>   		if (list_empty_careful(&ctx->cq_overflow_list))
> -			return true;
> +			goto done;
>   		if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
>   		    rings->cq_ring_entries))
>   			return false;
> @@ -1284,7 +1285,11 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
>   		io_put_req(req);
>   	}
>   
> -	return cqe != NULL;
> +	ret = cqe != NULL;
> +done:
> +	if (ret)
> +		ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
> +	return ret;
>   }
>   
>   static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
> @@ -5933,10 +5938,13 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
>   	int i, submitted = 0;
>   
>   	/* if we have a backlog and couldn't flush it all, return BUSY */
> -	if (test_bit(0, &ctx->sq_check_overflow)) {
> +	if (unlikely(test_bit(0, &ctx->sq_check_overflow))) {
>   		if (!list_empty(&ctx->cq_overflow_list) &&
> -		    !io_cqring_overflow_flush(ctx, false))
> +		    !io_cqring_overflow_flush(ctx, false)) {
> +			ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
> +			smp_mb();
>   			return -EBUSY;
> +		}
>   	}
>   
>   	/* make sure SQ entry isn't read before tail */
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 92c22699a5a7..9c7e028beda5 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -197,6 +197,7 @@ struct io_sqring_offsets {
>    * sq_ring->flags
>    */
>   #define IORING_SQ_NEED_WAKEUP	(1U << 0) /* needs io_uring_enter wakeup */
> +#define IORING_SQ_CQ_OVERFLOW	(1U << 1) /* app needs to enter kernel */
>   
>   struct io_cqring_offsets {
>   	__u32 head;
> 
> and then this for the liburing side:
> 
> 
> diff --git a/src/include/liburing/io_uring.h b/src/include/liburing/io_uring.h
> index 6a73522..e4314ed 100644
> --- a/src/include/liburing/io_uring.h
> +++ b/src/include/liburing/io_uring.h
> @@ -202,6 +202,7 @@ struct io_sqring_offsets {
>    * sq_ring->flags
>    */
>   #define IORING_SQ_NEED_WAKEUP	(1U << 0) /* needs io_uring_enter wakeup */
> +#define IORING_SQ_CQ_OVERFLOW	(1U << 1)
>   
>   struct io_cqring_offsets {
>   	__u32 head;
> diff --git a/src/queue.c b/src/queue.c
> index 88e0294..1f00251 100644
> --- a/src/queue.c
> +++ b/src/queue.c
> @@ -32,6 +32,11 @@ static inline bool sq_ring_needs_enter(struct io_uring *ring,
>   	return false;
>   }
>   
> +static inline bool cq_ring_needs_flush(struct io_uring *ring)
> +{
> +	return IO_URING_READ_ONCE(*ring->sq.kflags) & IORING_SQ_CQ_OVERFLOW;
> +}
> +
>   static int __io_uring_peek_cqe(struct io_uring *ring,
>   			       struct io_uring_cqe **cqe_ptr)
>   {
> @@ -67,22 +72,26 @@ int __io_uring_get_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr,
>   	int ret = 0, err;
>   
>   	do {
> +		bool cq_overflow_flush = false;
>   		unsigned flags = 0;
>   
>   		err = __io_uring_peek_cqe(ring, &cqe);
>   		if (err)
>   			break;
>   		if (!cqe && !to_wait && !submit) {
> -			err = -EAGAIN;
> -			break;
> +			if (!cq_ring_needs_flush(ring)) {
> +				err = -EAGAIN;
> +				break;
> +			}
> +			cq_overflow_flush = true;
>   		}
>   		if (wait_nr && cqe)
>   			wait_nr--;
> -		if (wait_nr)
> +		if (wait_nr || cq_overflow_flush)
>   			flags = IORING_ENTER_GETEVENTS;
>   		if (submit)
>   			sq_ring_needs_enter(ring, submit, &flags);
> -		if (wait_nr || submit)
> +		if (wait_nr || submit || cq_overflow_flush)
>   			ret = __sys_io_uring_enter(ring->ring_fd, submit,
>   						   wait_nr, flags, sigmask);
>   		if (ret < 0) {
> 
> If you agree with this approach, could you test this and resubmit the
> two patches?
> 


* Re: [PATCH] io_uring: export cq overflow status to userspace
  2020-07-07 16:36     ` Xiaoguang Wang
@ 2020-07-07 17:23       ` Jens Axboe
  0 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2020-07-07 17:23 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: joseph.qi

On 7/7/20 10:36 AM, Xiaoguang Wang wrote:
> hi,
> 
>> On 7/7/20 8:28 AM, Jens Axboe wrote:
>>> On 7/7/20 7:24 AM, Xiaoguang Wang wrote:
>>>> Applications that are not willing to use io_uring_enter() to reap
>>>> and handle cqes may rely completely on liburing's io_uring_peek_cqe(),
>>>> but if the cq ring has overflowed, io_uring_peek_cqe() is currently
>>>> not aware of the overflow, so it won't enter the kernel to flush
>>>> cqes; the test program below can reveal this bug:
>>>>
>>>> static void test_cq_overflow(struct io_uring *ring)
>>>> {
>>>>          struct io_uring_cqe *cqe;
>>>>          struct io_uring_sqe *sqe;
>>>>          int issued = 0;
>>>>          int ret = 0;
>>>>
>>>>          do {
>>>>                  sqe = io_uring_get_sqe(ring);
>>>>                  if (!sqe) {
>>>>                          fprintf(stderr, "get sqe failed\n");
>>>>                          break;
>>>>                  }
>>>>                  ret = io_uring_submit(ring);
>>>>                  if (ret <= 0) {
>>>>                          if (ret != -EBUSY)
>>>>                                  fprintf(stderr, "sqe submit failed: %d\n", ret);
>>>>                          break;
>>>>                  }
>>>>                  issued++;
>>>>          } while (ret > 0);
>>>>          assert(ret == -EBUSY);
>>>>
>>>>          printf("issued requests: %d\n", issued);
>>>>
>>>>          while (issued) {
>>>>                  ret = io_uring_peek_cqe(ring, &cqe);
>>>>                  if (ret) {
>>>>                          if (ret != -EAGAIN) {
>>>>                                  fprintf(stderr, "peek completion failed: %s\n",
>>>>                                          strerror(ret));
>>>>                                  break;
>>>>                          }
>>>>                          printf("left requests: %d\n", issued);
>>>>                          continue;
>>>>                  }
>>>>                  io_uring_cqe_seen(ring, cqe);
>>>>                  issued--;
>>>>                  printf("left requests: %d\n", issued);
>>>>          }
>>>> }
>>>>
>>>> int main(int argc, char *argv[])
>>>> {
>>>>          int ret;
>>>>          struct io_uring ring;
>>>>
>>>>          ret = io_uring_queue_init(16, &ring, 0);
>>>>          if (ret) {
>>>>                  fprintf(stderr, "ring setup failed: %d\n", ret);
>>>>                  return 1;
>>>>          }
>>>>
>>>>          test_cq_overflow(&ring);
>>>>          return 0;
>>>> }
>>>>
>>>> To fix this issue, export the cq overflow status to userspace; then
>>>> helper functions in liburing, such as io_uring_peek_cqe(), can be
>>>> aware of the cq overflow and flush accordingly.
>>>
>>> Is there any way we can accomplish the same without exporting
>>> another set of flags? Would it be enough for the SQPOLL thread to set
>>> IORING_SQ_NEED_WAKEUP if we're in overflow condition? That should
>>> result in the app entering the kernel when it's flushed the user CQ
>>> side, and then the sqthread could attempt to flush the pending
>>> events as well.
>>>
>>> Something like this, totally untested...
>>
>> OK, took a closer look at this, it's a generic thing, not just
>> SQPOLL related. My bad!
>>
>> Anyway, my suggestion would be to add IORING_SQ_CQ_OVERFLOW to the
>> existing flags, and then make a liburing change almost identical to
>> what you had.
> Thanks.
> It's somewhat late today, I'll test and send these two patches tomorrow.

Sounds good, thanks.

-- 
Jens Axboe



* Re: [PATCH] io_uring: export cq overflow status to userspace
  2020-07-07 16:21   ` Jens Axboe
  2020-07-07 16:25     ` Pavel Begunkov
  2020-07-07 16:36     ` Xiaoguang Wang
@ 2020-07-08  3:25     ` Xiaoguang Wang
  2020-07-08  3:46       ` Jens Axboe
  2 siblings, 1 reply; 18+ messages in thread
From: Xiaoguang Wang @ 2020-07-08  3:25 UTC (permalink / raw)
  To: Jens Axboe, io-uring; +Cc: joseph.qi

hi,

> On 7/7/20 8:28 AM, Jens Axboe wrote:
>> On 7/7/20 7:24 AM, Xiaoguang Wang wrote:
>>> Applications that are not willing to use io_uring_enter() to reap
>>> and handle cqes may rely completely on liburing's io_uring_peek_cqe(),
>>> but if the cq ring has overflowed, io_uring_peek_cqe() is currently
>>> not aware of the overflow, so it won't enter the kernel to flush
>>> cqes; the test program below can reveal this bug:
>>>
>>> static void test_cq_overflow(struct io_uring *ring)
>>> {
>>>          struct io_uring_cqe *cqe;
>>>          struct io_uring_sqe *sqe;
>>>          int issued = 0;
>>>          int ret = 0;
>>>
>>>          do {
>>>                  sqe = io_uring_get_sqe(ring);
>>>                  if (!sqe) {
>>>                          fprintf(stderr, "get sqe failed\n");
>>>                          break;
>>>                  }
>>>                  ret = io_uring_submit(ring);
>>>                  if (ret <= 0) {
>>>                          if (ret != -EBUSY)
>>>                                  fprintf(stderr, "sqe submit failed: %d\n", ret);
>>>                          break;
>>>                  }
>>>                  issued++;
>>>          } while (ret > 0);
>>>          assert(ret == -EBUSY);
>>>
>>>          printf("issued requests: %d\n", issued);
>>>
>>>          while (issued) {
>>>                  ret = io_uring_peek_cqe(ring, &cqe);
>>>                  if (ret) {
>>>                          if (ret != -EAGAIN) {
>>>                                  fprintf(stderr, "peek completion failed: %s\n",
>>>                                          strerror(ret));
>>>                                  break;
>>>                          }
>>>                          printf("left requests: %d\n", issued);
>>>                          continue;
>>>                  }
>>>                  io_uring_cqe_seen(ring, cqe);
>>>                  issued--;
>>>                  printf("left requests: %d\n", issued);
>>>          }
>>> }
>>>
>>> int main(int argc, char *argv[])
>>> {
>>>          int ret;
>>>          struct io_uring ring;
>>>
>>>          ret = io_uring_queue_init(16, &ring, 0);
>>>          if (ret) {
>>>                  fprintf(stderr, "ring setup failed: %d\n", ret);
>>>                  return 1;
>>>          }
>>>
>>>          test_cq_overflow(&ring);
>>>          return 0;
>>> }
>>>
>>> To fix this issue, export the cq overflow status to userspace; then
>>> helper functions in liburing, such as io_uring_peek_cqe(), can be
>>> aware of the cq overflow and flush accordingly.
>>
>> Is there any way we can accomplish the same without exporting
>> another set of flags? Would it be enough for the SQPOLL thread to set
>> IORING_SQ_NEED_WAKEUP if we're in overflow condition? That should
>> result in the app entering the kernel when it's flushed the user CQ
>> side, and then the sqthread could attempt to flush the pending
>> events as well.
>>
>> Something like this, totally untested...
> 
> OK, took a closer look at this, it's a generic thing, not just
> SQPOLL related. My bad!
> 
> Anyway, my suggestion would be to add IORING_SQ_CQ_OVERFLOW to the
> existing flags, and then make a liburing change almost identical to
> what you had.
> 
> Hence kernel side:
> 
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index d37d7ea5ebe5..af9fd5cefc51 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -1234,11 +1234,12 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
>   	struct io_uring_cqe *cqe;
>   	struct io_kiocb *req;
>   	unsigned long flags;
> +	bool ret = true;
>   	LIST_HEAD(list);
>   
>   	if (!force) {
>   		if (list_empty_careful(&ctx->cq_overflow_list))
> -			return true;
> +			goto done;
>   		if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
>   		    rings->cq_ring_entries))
>   			return false;
> @@ -1284,7 +1285,11 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
>   		io_put_req(req);
>   	}
>   
> -	return cqe != NULL;
> +	ret = cqe != NULL;
> +done:
> +	if (ret)
> +		ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
> +	return ret;
>   }
>   
>   static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
> @@ -5933,10 +5938,13 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
>   	int i, submitted = 0;
>   
>   	/* if we have a backlog and couldn't flush it all, return BUSY */
> -	if (test_bit(0, &ctx->sq_check_overflow)) {
> +	if (unlikely(test_bit(0, &ctx->sq_check_overflow))) {
>   		if (!list_empty(&ctx->cq_overflow_list) &&
> -		    !io_cqring_overflow_flush(ctx, false))
> +		    !io_cqring_overflow_flush(ctx, false)) {
> +			ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
> +			smp_mb();
>   			return -EBUSY;
> +		}
>   	}
>   
>   	/* make sure SQ entry isn't read before tail */
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 92c22699a5a7..9c7e028beda5 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -197,6 +197,7 @@ struct io_sqring_offsets {
>    * sq_ring->flags
>    */
>   #define IORING_SQ_NEED_WAKEUP	(1U << 0) /* needs io_uring_enter wakeup */
> +#define IORING_SQ_CQ_OVERFLOW	(1U << 1) /* app needs to enter kernel */
>   
>   struct io_cqring_offsets {
>   	__u32 head;
> 
I think the above code is still not correct: IORING_SQ_CQ_OVERFLOW is only set
in io_submit_sqes(), but if the cq ring has overflowed and the application no
longer submits io, just calls io_uring_peek_cqe() continuously, then it still
won't be aware of the cq ring overflow.

We can set IORING_SQ_CQ_OVERFLOW in __io_cqring_fill_event() when setting
cq_check_overflow. In non-sqpoll mode this will be safe, but in sqpoll mode
there may be concurrent modifications to sq_flags, which is a race condition
and may need an extra lock to protect IORING_SQ_NEED_WAKEUP and
IORING_SQ_CQ_OVERFLOW.
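
To make the race concrete, here is an illustrative sketch (not actual
kernel code) of the lost-update hazard on a plain read-modify-write of
sq_flags:

	/*
	 * sqpoll thread                     completion path
	 * -------------                     ---------------
	 * flags = rings->sq_flags;
	 *                                   flags = rings->sq_flags;
	 * flags |= IORING_SQ_NEED_WAKEUP;
	 *                                   flags |= IORING_SQ_CQ_OVERFLOW;
	 * rings->sq_flags = flags;
	 *                                   rings->sq_flags = flags;
	 *
	 * The later store overwrites the earlier one, losing a flag bit.
	 * Avoiding this needs a lock around both updaters or an atomic
	 * RMW on sq_flags.
	 */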

Regards,
Xiaoguang Wang



> and then this for the liburing side:
> 
> 
> diff --git a/src/include/liburing/io_uring.h b/src/include/liburing/io_uring.h
> index 6a73522..e4314ed 100644
> --- a/src/include/liburing/io_uring.h
> +++ b/src/include/liburing/io_uring.h
> @@ -202,6 +202,7 @@ struct io_sqring_offsets {
>    * sq_ring->flags
>    */
>   #define IORING_SQ_NEED_WAKEUP	(1U << 0) /* needs io_uring_enter wakeup */
> +#define IORING_SQ_CQ_OVERFLOW	(1U << 1)
>   
>   struct io_cqring_offsets {
>   	__u32 head;
> diff --git a/src/queue.c b/src/queue.c
> index 88e0294..1f00251 100644
> --- a/src/queue.c
> +++ b/src/queue.c
> @@ -32,6 +32,11 @@ static inline bool sq_ring_needs_enter(struct io_uring *ring,
>   	return false;
>   }
>   
> +static inline bool cq_ring_needs_flush(struct io_uring *ring)
> +{
> +	return IO_URING_READ_ONCE(*ring->sq.kflags) & IORING_SQ_CQ_OVERFLOW;
> +}
> +
>   static int __io_uring_peek_cqe(struct io_uring *ring,
>   			       struct io_uring_cqe **cqe_ptr)
>   {
> @@ -67,22 +72,26 @@ int __io_uring_get_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr,
>   	int ret = 0, err;
>   
>   	do {
> +		bool cq_overflow_flush = false;
>   		unsigned flags = 0;
>   
>   		err = __io_uring_peek_cqe(ring, &cqe);
>   		if (err)
>   			break;
>   		if (!cqe && !to_wait && !submit) {
> -			err = -EAGAIN;
> -			break;
> +			if (!cq_ring_needs_flush(ring)) {
> +				err = -EAGAIN;
> +				break;
> +			}
> +			cq_overflow_flush = true;
>   		}
>   		if (wait_nr && cqe)
>   			wait_nr--;
> -		if (wait_nr)
> +		if (wait_nr || cq_overflow_flush)
>   			flags = IORING_ENTER_GETEVENTS;
>   		if (submit)
>   			sq_ring_needs_enter(ring, submit, &flags);
> -		if (wait_nr || submit)
> +		if (wait_nr || submit || cq_overflow_flush)
>   			ret = __sys_io_uring_enter(ring->ring_fd, submit,
>   						   wait_nr, flags, sigmask);
>   		if (ret < 0) {
> 
> If you agree with this approach, could you test this and resubmit the
> two patches?
> 


* Re: [PATCH] io_uring: export cq overflow status to userspace
  2020-07-08  3:25     ` Xiaoguang Wang
@ 2020-07-08  3:46       ` Jens Axboe
  2020-07-08  5:29         ` Xiaoguang Wang
  0 siblings, 1 reply; 18+ messages in thread
From: Jens Axboe @ 2020-07-08  3:46 UTC (permalink / raw)
  To: Xiaoguang Wang; +Cc: io-uring, joseph.qi


> On Jul 7, 2020, at 9:25 PM, Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com> wrote:
> 
> hi,
> 
>>> On 7/7/20 8:28 AM, Jens Axboe wrote:
>>> On 7/7/20 7:24 AM, Xiaoguang Wang wrote:
>>>> Applications that are not willing to use io_uring_enter() to reap
>>>> and handle cqes may rely completely on liburing's io_uring_peek_cqe(),
>>>> but if the cq ring has overflowed, io_uring_peek_cqe() is currently
>>>> not aware of the overflow, so it won't enter the kernel to flush
>>>> cqes; the test program below can reveal this bug:
>>>> 
>>>> static void test_cq_overflow(struct io_uring *ring)
>>>> {
>>>>         struct io_uring_cqe *cqe;
>>>>         struct io_uring_sqe *sqe;
>>>>         int issued = 0;
>>>>         int ret = 0;
>>>> 
>>>>         do {
>>>>                 sqe = io_uring_get_sqe(ring);
>>>>                 if (!sqe) {
>>>>                         fprintf(stderr, "get sqe failed\n");
>>>>                         break;
>>>>                 }
>>>>                 ret = io_uring_submit(ring);
>>>>                 if (ret <= 0) {
>>>>                         if (ret != -EBUSY)
>>>>                                 fprintf(stderr, "sqe submit failed: %d\n", ret);
>>>>                         break;
>>>>                 }
>>>>                 issued++;
>>>>         } while (ret > 0);
>>>>         assert(ret == -EBUSY);
>>>> 
>>>>         printf("issued requests: %d\n", issued);
>>>> 
>>>>         while (issued) {
>>>>                 ret = io_uring_peek_cqe(ring, &cqe);
>>>>                 if (ret) {
>>>>                         if (ret != -EAGAIN) {
>>>>                                 fprintf(stderr, "peek completion failed: %s\n",
>>>>                                         strerror(ret));
>>>>                                 break;
>>>>                         }
>>>>                         printf("left requests: %d\n", issued);
>>>>                         continue;
>>>>                 }
>>>>                 io_uring_cqe_seen(ring, cqe);
>>>>                 issued--;
>>>>                 printf("left requests: %d\n", issued);
>>>>         }
>>>> }
>>>> 
>>>> int main(int argc, char *argv[])
>>>> {
>>>>         int ret;
>>>>         struct io_uring ring;
>>>> 
>>>>         ret = io_uring_queue_init(16, &ring, 0);
>>>>         if (ret) {
>>>>                 fprintf(stderr, "ring setup failed: %d\n", ret);
>>>>                 return 1;
>>>>         }
>>>> 
>>>>         test_cq_overflow(&ring);
>>>>         return 0;
>>>> }
>>>> 
>>>> To fix this issue, export the cq overflow status to userspace; then
>>>> helper functions in liburing, such as io_uring_peek_cqe(), can be
>>>> aware of the cq overflow and flush accordingly.
>>> 
>>> Is there any way we can accomplish the same without exporting
>>> another set of flags? Would it be enough for the SQPOLL thread to set
>>> IORING_SQ_NEED_WAKEUP if we're in overflow condition? That should
>>> result in the app entering the kernel when it's flushed the user CQ
>>> side, and then the sqthread could attempt to flush the pending
>>> events as well.
>>> 
>>> Something like this, totally untested...
>> OK, took a closer look at this, it's a generic thing, not just
>> SQPOLL related. My bad!
>> Anyway, my suggestion would be to add IORING_SQ_CQ_OVERFLOW to the
>> existing flags, and then make a liburing change almost identical to
>> what you had.
>> Hence kernel side:
>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>> index d37d7ea5ebe5..af9fd5cefc51 100644
>> --- a/fs/io_uring.c
>> +++ b/fs/io_uring.c
>> @@ -1234,11 +1234,12 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
>>      struct io_uring_cqe *cqe;
>>      struct io_kiocb *req;
>>      unsigned long flags;
>> +    bool ret = true;
>>      LIST_HEAD(list);
>>        if (!force) {
>>          if (list_empty_careful(&ctx->cq_overflow_list))
>> -            return true;
>> +            goto done;
>>          if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
>>              rings->cq_ring_entries))
>>              return false;
>> @@ -1284,7 +1285,11 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
>>          io_put_req(req);
>>      }
>>  -    return cqe != NULL;
>> +    ret = cqe != NULL;
>> +done:
>> +    if (ret)
>> +        ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
>> +    return ret;
>>  }
>>    static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
>> @@ -5933,10 +5938,13 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
>>      int i, submitted = 0;
>>        /* if we have a backlog and couldn't flush it all, return BUSY */
>> -    if (test_bit(0, &ctx->sq_check_overflow)) {
>> +    if (unlikely(test_bit(0, &ctx->sq_check_overflow))) {
>>          if (!list_empty(&ctx->cq_overflow_list) &&
>> -            !io_cqring_overflow_flush(ctx, false))
>> +            !io_cqring_overflow_flush(ctx, false)) {
>> +            ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
>> +            smp_mb();
>>              return -EBUSY;
>> +        }
>>      }
>>        /* make sure SQ entry isn't read before tail */
>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
>> index 92c22699a5a7..9c7e028beda5 100644
>> --- a/include/uapi/linux/io_uring.h
>> +++ b/include/uapi/linux/io_uring.h
>> @@ -197,6 +197,7 @@ struct io_sqring_offsets {
>>   * sq_ring->flags
>>   */
>>  #define IORING_SQ_NEED_WAKEUP    (1U << 0) /* needs io_uring_enter wakeup */
>> +#define IORING_SQ_CQ_OVERFLOW    (1U << 1) /* app needs to enter kernel */
>>    struct io_cqring_offsets {
>>      __u32 head;
> I think the above code is still not correct: IORING_SQ_CQ_OVERFLOW is only set
> in io_submit_sqes(), but if the cq ring has overflowed and the application no
> longer submits io, just calls io_uring_peek_cqe() continuously, then it still
> won't be aware of the cq ring overflow.

With the associated liburing change in that same email, the peek will enter the kernel just to flush the overflow. So not sure I see your concern?

> We can set IORING_SQ_CQ_OVERFLOW in __io_cqring_fill_event() when setting
> cq_check_overflow. In non-sqpoll mode this will be safe, but in sqpoll mode
> there may be concurrent modifications to sq_flags, which is a race condition
> and may need an extra lock to protect IORING_SQ_NEED_WAKEUP and
> IORING_SQ_CQ_OVERFLOW.

That’s why I wanted to keep it in the shared submission path, as that’s properly synchronized. 




* Re: [PATCH] io_uring: export cq overflow status to userspace
  2020-07-08  3:46       ` Jens Axboe
@ 2020-07-08  5:29         ` Xiaoguang Wang
  2020-07-08 15:29           ` Jens Axboe
  0 siblings, 1 reply; 18+ messages in thread
From: Xiaoguang Wang @ 2020-07-08  5:29 UTC (permalink / raw)
  To: Jens Axboe; +Cc: io-uring, joseph.qi

hi,

> 
>> On Jul 7, 2020, at 9:25 PM, Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com> wrote:
>>
>> hi,
>>
>>>> On 7/7/20 8:28 AM, Jens Axboe wrote:
>>>> On 7/7/20 7:24 AM, Xiaoguang Wang wrote:
>>>>> Applications that are not willing to use io_uring_enter() to reap
>>>>> and handle cqes may rely completely on liburing's io_uring_peek_cqe(),
>>>>> but if the cq ring has overflowed, io_uring_peek_cqe() is currently
>>>>> not aware of the overflow, so it won't enter the kernel to flush
>>>>> cqes; the test program below can reveal this bug:
>>>>>
>>>>> static void test_cq_overflow(struct io_uring *ring)
>>>>> {
>>>>>          struct io_uring_cqe *cqe;
>>>>>          struct io_uring_sqe *sqe;
>>>>>          int issued = 0;
>>>>>          int ret = 0;
>>>>>
>>>>>          do {
>>>>>                  sqe = io_uring_get_sqe(ring);
>>>>>                  if (!sqe) {
>>>>>                          fprintf(stderr, "get sqe failed\n");
>>>>>                          break;
>>>>>                  }
>>>>>                  ret = io_uring_submit(ring);
>>>>>                  if (ret <= 0) {
>>>>>                          if (ret != -EBUSY)
>>>>>                                  fprintf(stderr, "sqe submit failed: %d\n", ret);
>>>>>                          break;
>>>>>                  }
>>>>>                  issued++;
>>>>>          } while (ret > 0);
>>>>>          assert(ret == -EBUSY);
>>>>>
>>>>>          printf("issued requests: %d\n", issued);
>>>>>
>>>>>          while (issued) {
>>>>>                  ret = io_uring_peek_cqe(ring, &cqe);
>>>>>                  if (ret) {
>>>>>                          if (ret != -EAGAIN) {
>>>>>                                  fprintf(stderr, "peek completion failed: %s\n",
>>>>>                                          strerror(ret));
>>>>>                                  break;
>>>>>                          }
>>>>>                          printf("left requests: %d\n", issued);
>>>>>                          continue;
>>>>>                  }
>>>>>                  io_uring_cqe_seen(ring, cqe);
>>>>>                  issued--;
>>>>>                  printf("left requests: %d\n", issued);
>>>>>          }
>>>>> }
>>>>>
>>>>> int main(int argc, char *argv[])
>>>>> {
>>>>>          int ret;
>>>>>          struct io_uring ring;
>>>>>
>>>>>          ret = io_uring_queue_init(16, &ring, 0);
>>>>>          if (ret) {
>>>>>                  fprintf(stderr, "ring setup failed: %d\n", ret);
>>>>>                  return 1;
>>>>>          }
>>>>>
>>>>>          test_cq_overflow(&ring);
>>>>>          return 0;
>>>>> }
>>>>>
>>>>> To fix this issue, export cq overflow status to userspace, then
>>>>> helper functions in liburing, such as io_uring_peek_cqe, can be
>>>>> aware of this cq overflow and do flush accordingly.
>>>>
>>>> Is there any way we can accomplish the same without exporting
>>>> another set of flags? Would it be enough for the SQPOLl thread to set
>>>> IORING_SQ_NEED_WAKEUP if we're in overflow condition? That should
>>>> result in the app entering the kernel when it's flushed the user CQ
>>>> side, and then the sqthread could attempt to flush the pending
>>>> events as well.
>>>>
>>>> Something like this, totally untested...
>>> OK, took a closer look at this, it's a generic thing, not just
>>> SQPOLL related. My bad!
>>> Anyway, my suggestion would be to add IORING_SQ_CQ_OVERFLOW to the
>>> existing flags, and then make a liburing change almost identical to
>>> what you had.
>>> Hence kernel side:
>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>> index d37d7ea5ebe5..af9fd5cefc51 100644
>>> --- a/fs/io_uring.c
>>> +++ b/fs/io_uring.c
>>> @@ -1234,11 +1234,12 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
>>>       struct io_uring_cqe *cqe;
>>>       struct io_kiocb *req;
>>>       unsigned long flags;
>>> +    bool ret = true;
>>>       LIST_HEAD(list);
>>>         if (!force) {
>>>           if (list_empty_careful(&ctx->cq_overflow_list))
>>> -            return true;
>>> +            goto done;
>>>           if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
>>>               rings->cq_ring_entries))
>>>               return false;
>>> @@ -1284,7 +1285,11 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
>>>           io_put_req(req);
>>>       }
>>>   -    return cqe != NULL;
>>> +    ret = cqe != NULL;
>>> +done:
>>> +    if (ret)
>>> +        ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
>>> +    return ret;
>>>   }
>>>     static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
>>> @@ -5933,10 +5938,13 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
>>>       int i, submitted = 0;
>>>         /* if we have a backlog and couldn't flush it all, return BUSY */
>>> -    if (test_bit(0, &ctx->sq_check_overflow)) {
>>> +    if (unlikely(test_bit(0, &ctx->sq_check_overflow))) {
>>>           if (!list_empty(&ctx->cq_overflow_list) &&
>>> -            !io_cqring_overflow_flush(ctx, false))
>>> +            !io_cqring_overflow_flush(ctx, false)) {
>>> +            ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
>>> +            smp_mb();
>>>               return -EBUSY;
>>> +        }
>>>       }
>>>         /* make sure SQ entry isn't read before tail */
>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
>>> index 92c22699a5a7..9c7e028beda5 100644
>>> --- a/include/uapi/linux/io_uring.h
>>> +++ b/include/uapi/linux/io_uring.h
>>> @@ -197,6 +197,7 @@ struct io_sqring_offsets {
>>>    * sq_ring->flags
>>>    */
>>>   #define IORING_SQ_NEED_WAKEUP    (1U << 0) /* needs io_uring_enter wakeup */
>>> +#define IORING_SQ_CQ_OVERFLOW    (1U << 1) /* app needs to enter kernel */
>>>     struct io_cqring_offsets {
>>>       __u32 head;
>> I think the above code is still not correct: you only set IORING_SQ_CQ_OVERFLOW in
>> io_submit_sqes, but if the cq ring has overflowed and applications don't submit
>> any more io, just calling io_uring_peek_cqe continuously, then applications
>> still won't be aware of the cq ring overflow.
> 
> With the associated liburing change in that same email, the peek will enter the kernel just to flush the overflow. So not sure I see your concern?
I modified the above test program a bit:
#include <errno.h>
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <assert.h>

#include "liburing.h"

static void test_cq_overflow(struct io_uring *ring)
{
         struct io_uring_cqe *cqe;
         struct io_uring_sqe *sqe;
         int issued = 0;
         int ret = 0;
         int i;

         for (i = 0; i < 33; i++) {
                 sqe = io_uring_get_sqe(ring);
                 if (!sqe) {
                         fprintf(stderr, "get sqe failed\n");
                         break;
                 }
                 ret = io_uring_submit(ring);
                 if (ret <= 0) {
                         if (ret != -EBUSY)
                                 fprintf(stderr, "sqe submit failed: %d\n", ret);
                         break;
                 }
                 issued++;
         }

         printf("issued requests: %d\n", issued);

         while (issued) {
                 ret = io_uring_peek_cqe(ring, &cqe);
                 if (ret) {
                         if (ret != -EAGAIN) {
                                 fprintf(stderr, "peek completion failed: %s\n",
                                         strerror(ret));
                                 break;
                         }
                         printf("left requets: %d %d\n", issued, IO_URING_READ_ONCE(*ring->sq.kflags));
                         continue;
                 }
                 io_uring_cqe_seen(ring, cqe);
                 issued--;
                 printf("left requets: %d\n", issued);
         }
}

int main(int argc, char *argv[])
{
         int ret;
         struct io_uring ring;

         ret = io_uring_queue_init(16, &ring, 0);
         if (ret) {
                 fprintf(stderr, "ring setup failed: %d\n", ret);
                 return 1;
         }

         test_cq_overflow(&ring);
         return 0;
}

Though with your patches applied, we still cannot peek the last cqe.
This test program only issues 33 sqes, so it never gets an EBUSY error: with
io_uring_queue_init(16) the cq ring holds 32 entries, so exactly one cqe sits
on the overflow list, and since no further submit happens, the flag your patch
sets at submission time is never raised.

Regards,
Xiaoguang Wang


> 
>> We can put the IORING_SQ_CQ_OVERFLOW set in __io_cqring_fill_event() when setting
>> cq_check_overflow. In non-sqpoll mode this will be safe, but in sqpoll mode there may be
>> concurrent modifications to sq_flags, which is a race condition and may need an extra
>> lock to protect IORING_SQ_NEED_WAKEUP and IORING_SQ_CQ_OVERFLOW.
> 
> That’s why I wanted to keep it in the shared submission path, as that’s properly synchronized.
> 

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] io_uring: export cq overflow status to userspace
  2020-07-08  5:29         ` Xiaoguang Wang
@ 2020-07-08 15:29           ` Jens Axboe
  2020-07-08 15:39             ` Xiaoguang Wang
  0 siblings, 1 reply; 18+ messages in thread
From: Jens Axboe @ 2020-07-08 15:29 UTC (permalink / raw)
  To: Xiaoguang Wang; +Cc: io-uring, joseph.qi

On 7/7/20 11:29 PM, Xiaoguang Wang wrote:
> I modified the above test program a bit:
> #include <errno.h>
> #include <stdio.h>
> #include <unistd.h>
> #include <stdlib.h>
> #include <string.h>
> #include <fcntl.h>
> #include <assert.h>
> 
> #include "liburing.h"
> 
> static void test_cq_overflow(struct io_uring *ring)
> {
>          struct io_uring_cqe *cqe;
>          struct io_uring_sqe *sqe;
>          int issued = 0;
>          int ret = 0;
>          int i;
> 
>          for (i = 0; i < 33; i++) {
>                  sqe = io_uring_get_sqe(ring);
>                  if (!sqe) {
>                          fprintf(stderr, "get sqe failed\n");
>                          break;
>                  }
>                  ret = io_uring_submit(ring);
>                  if (ret <= 0) {
>                          if (ret != -EBUSY)
>                                  fprintf(stderr, "sqe submit failed: %d\n", ret);
>                          break;
>                  }
>                  issued++;
>          }
> 
>          printf("issued requests: %d\n", issued);
> 
>          while (issued) {
>                  ret = io_uring_peek_cqe(ring, &cqe);
>                  if (ret) {
>                          if (ret != -EAGAIN) {
>                                  fprintf(stderr, "peek completion failed: %s\n",
>                                          strerror(ret));
>                                  break;
>                          }
>                          printf("left requests: %d %d\n", issued, IO_URING_READ_ONCE(*ring->sq.kflags));
>                          continue;
>                  }
>                  io_uring_cqe_seen(ring, cqe);
>                  issued--;
>                  printf("left requests: %d\n", issued);
>          }
> }
> 
> int main(int argc, char *argv[])
> {
>          int ret;
>          struct io_uring ring;
> 
>          ret = io_uring_queue_init(16, &ring, 0);
>          if (ret) {
>                  fprintf(stderr, "ring setup failed: %d\n", ret);
>                  return 1;
>          }
> 
>          test_cq_overflow(&ring);
>          return 0;
> }
> 
> Though with your patches applied, we still can not peek the last cqe.
> This test program will only issue 33 sqes, so it won't get EBUSY error.

How about we make this even simpler, then - make the
IORING_SQ_CQ_OVERFLOW actually track the state, rather than when we fail
on submission. The liburing change would be the same, the kernel side
would then look like the below.


diff --git a/fs/io_uring.c b/fs/io_uring.c
index 4c9a494c9f9f..01981926cdf4 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1342,6 +1342,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 	if (cqe) {
 		clear_bit(0, &ctx->sq_check_overflow);
 		clear_bit(0, &ctx->cq_check_overflow);
+		ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
 	}
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 	io_cqring_ev_posted(ctx);
@@ -1379,6 +1380,7 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
 		if (list_empty(&ctx->cq_overflow_list)) {
 			set_bit(0, &ctx->sq_check_overflow);
 			set_bit(0, &ctx->cq_check_overflow);
+			ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
 		}
 		req->flags |= REQ_F_OVERFLOW;
 		refcount_inc(&req->refs);
@@ -6375,9 +6377,9 @@ static int io_sq_thread(void *data)
 			}
 
 			/* Tell userspace we may need a wakeup call */
+			spin_lock_irq(&ctx->completion_lock);
 			ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
-			/* make sure to read SQ tail after writing flags */
-			smp_mb();
+			spin_unlock_irq(&ctx->completion_lock);
 
 			to_submit = io_sqring_entries(ctx);
 			if (!to_submit || ret == -EBUSY) {
@@ -6400,7 +6402,9 @@ static int io_sq_thread(void *data)
 			}
 			finish_wait(&ctx->sqo_wait, &wait);
 
+			spin_lock_irq(&ctx->completion_lock);
 			ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
+			spin_unlock_irq(&ctx->completion_lock);
 		}
 
 		mutex_lock(&ctx->uring_lock);
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 8d033961cb78..91953b543125 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -198,6 +198,7 @@ struct io_sqring_offsets {
  * sq_ring->flags
  */
 #define IORING_SQ_NEED_WAKEUP	(1U << 0) /* needs io_uring_enter wakeup */
+#define IORING_SQ_CQ_OVERFLOW	(1U << 1) /* app needs to enter kernel */
 
 struct io_cqring_offsets {
 	__u32 head;

-- 
Jens Axboe


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH] io_uring: export cq overflow status to userspace
  2020-07-08 15:29           ` Jens Axboe
@ 2020-07-08 15:39             ` Xiaoguang Wang
  2020-07-08 15:41               ` Jens Axboe
  0 siblings, 1 reply; 18+ messages in thread
From: Xiaoguang Wang @ 2020-07-08 15:39 UTC (permalink / raw)
  To: Jens Axboe; +Cc: io-uring, joseph.qi

hi,

> On 7/7/20 11:29 PM, Xiaoguang Wang wrote:
>> I modified the above test program a bit:
>> #include <errno.h>
>> #include <stdio.h>
>> #include <unistd.h>
>> #include <stdlib.h>
>> #include <string.h>
>> #include <fcntl.h>
>> #include <assert.h>
>>
>> #include "liburing.h"
>>
>> static void test_cq_overflow(struct io_uring *ring)
>> {
>>           struct io_uring_cqe *cqe;
>>           struct io_uring_sqe *sqe;
>>           int issued = 0;
>>           int ret = 0;
>>           int i;
>>
>>           for (i = 0; i < 33; i++) {
>>                   sqe = io_uring_get_sqe(ring);
>>                   if (!sqe) {
>>                           fprintf(stderr, "get sqe failed\n");
>>                           break;
>>                   }
>>                   ret = io_uring_submit(ring);
>>                   if (ret <= 0) {
>>                           if (ret != -EBUSY)
>>                                   fprintf(stderr, "sqe submit failed: %d\n", ret);
>>                           break;
>>                   }
>>                   issued++;
>>           }
>>
>>           printf("issued requests: %d\n", issued);
>>
>>           while (issued) {
>>                   ret = io_uring_peek_cqe(ring, &cqe);
>>                   if (ret) {
>>                           if (ret != -EAGAIN) {
>>                                   fprintf(stderr, "peek completion failed: %s\n",
>>                                           strerror(ret));
>>                                   break;
>>                           }
>>                           printf("left requests: %d %d\n", issued, IO_URING_READ_ONCE(*ring->sq.kflags));
>>                           continue;
>>                   }
>>                   io_uring_cqe_seen(ring, cqe);
>>                   issued--;
>>                   printf("left requests: %d\n", issued);
>>           }
>> }
>>
>> int main(int argc, char *argv[])
>> {
>>           int ret;
>>           struct io_uring ring;
>>
>>           ret = io_uring_queue_init(16, &ring, 0);
>>           if (ret) {
>>                   fprintf(stderr, "ring setup failed: %d\n", ret);
>>                   return 1;
>>           }
>>
>>           test_cq_overflow(&ring);
>>           return 0;
>> }
>>
>> Though with your patches applied, we still can not peek the last cqe.
>> This test program will only issue 33 sqes, so it won't get EBUSY error.
> 
> How about we make this even simpler, then - make the
> IORING_SQ_CQ_OVERFLOW actually track the state, rather than when we fail
> on submission. The liburing change would be the same, the kernel side
> would then look like the below.
> 
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 4c9a494c9f9f..01981926cdf4 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -1342,6 +1342,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
>   	if (cqe) {
>   		clear_bit(0, &ctx->sq_check_overflow);
>   		clear_bit(0, &ctx->cq_check_overflow);
> +		ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
>   	}
>   	spin_unlock_irqrestore(&ctx->completion_lock, flags);
>   	io_cqring_ev_posted(ctx);
> @@ -1379,6 +1380,7 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
>   		if (list_empty(&ctx->cq_overflow_list)) {
>   			set_bit(0, &ctx->sq_check_overflow);
>   			set_bit(0, &ctx->cq_check_overflow);
> +			ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
>   		}
>   		req->flags |= REQ_F_OVERFLOW;
>   		refcount_inc(&req->refs);
> @@ -6375,9 +6377,9 @@ static int io_sq_thread(void *data)
>   			}
>   
>   			/* Tell userspace we may need a wakeup call */
> +			spin_lock_irq(&ctx->completion_lock);
>   			ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
> -			/* make sure to read SQ tail after writing flags */
> -			smp_mb();
> +			spin_unlock_irq(&ctx->completion_lock);
>   
>   			to_submit = io_sqring_entries(ctx);
>   			if (!to_submit || ret == -EBUSY) {
> @@ -6400,7 +6402,9 @@ static int io_sq_thread(void *data)
>   			}
>   			finish_wait(&ctx->sqo_wait, &wait);
>   
> +			spin_lock_irq(&ctx->completion_lock);
>   			ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
> +			spin_unlock_irq(&ctx->completion_lock);
>   		}
>   
>   		mutex_lock(&ctx->uring_lock);
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 8d033961cb78..91953b543125 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -198,6 +198,7 @@ struct io_sqring_offsets {
>    * sq_ring->flags
>    */
>   #define IORING_SQ_NEED_WAKEUP	(1U << 0) /* needs io_uring_enter wakeup */
> +#define IORING_SQ_CQ_OVERFLOW	(1U << 1) /* app needs to enter kernel */
>   
>   struct io_cqring_offsets {
>   	__u32 head;
> 
Looks good, I think it'll work now.
I'll test and send patches soon, thanks.

Regards,
Xiaoguang Wang


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] io_uring: export cq overflow status to userspace
  2020-07-08 15:39             ` Xiaoguang Wang
@ 2020-07-08 15:41               ` Jens Axboe
  2020-07-08 16:51                 ` Xiaoguang Wang
  0 siblings, 1 reply; 18+ messages in thread
From: Jens Axboe @ 2020-07-08 15:41 UTC (permalink / raw)
  To: Xiaoguang Wang; +Cc: io-uring, joseph.qi

On 7/8/20 9:39 AM, Xiaoguang Wang wrote:
> hi,
> 
>> On 7/7/20 11:29 PM, Xiaoguang Wang wrote:
>>> I modified the above test program a bit:
>>> #include <errno.h>
>>> #include <stdio.h>
>>> #include <unistd.h>
>>> #include <stdlib.h>
>>> #include <string.h>
>>> #include <fcntl.h>
>>> #include <assert.h>
>>>
>>> #include "liburing.h"
>>>
>>> static void test_cq_overflow(struct io_uring *ring)
>>> {
>>>           struct io_uring_cqe *cqe;
>>>           struct io_uring_sqe *sqe;
>>>           int issued = 0;
>>>           int ret = 0;
>>>           int i;
>>>
>>>           for (i = 0; i < 33; i++) {
>>>                   sqe = io_uring_get_sqe(ring);
>>>                   if (!sqe) {
>>>                           fprintf(stderr, "get sqe failed\n");
>>>                           break;
>>>                   }
>>>                   ret = io_uring_submit(ring);
>>>                   if (ret <= 0) {
>>>                           if (ret != -EBUSY)
>>>                                   fprintf(stderr, "sqe submit failed: %d\n", ret);
>>>                           break;
>>>                   }
>>>                   issued++;
>>>           }
>>>
>>>           printf("issued requests: %d\n", issued);
>>>
>>>           while (issued) {
>>>                   ret = io_uring_peek_cqe(ring, &cqe);
>>>                   if (ret) {
>>>                           if (ret != -EAGAIN) {
>>>                                   fprintf(stderr, "peek completion failed: %s\n",
>>>                                           strerror(ret));
>>>                                   break;
>>>                           }
>>>                           printf("left requests: %d %d\n", issued, IO_URING_READ_ONCE(*ring->sq.kflags));
>>>                           continue;
>>>                   }
>>>                   io_uring_cqe_seen(ring, cqe);
>>>                   issued--;
>>>                   printf("left requests: %d\n", issued);
>>>           }
>>> }
>>>
>>> int main(int argc, char *argv[])
>>> {
>>>           int ret;
>>>           struct io_uring ring;
>>>
>>>           ret = io_uring_queue_init(16, &ring, 0);
>>>           if (ret) {
>>>                   fprintf(stderr, "ring setup failed: %d\n", ret);
>>>                   return 1;
>>>           }
>>>
>>>           test_cq_overflow(&ring);
>>>           return 0;
>>> }
>>>
>>> Though with your patches applied, we still can not peek the last cqe.
>>> This test program will only issue 33 sqes, so it won't get EBUSY error.
>>
>> How about we make this even simpler, then - make the
>> IORING_SQ_CQ_OVERFLOW actually track the state, rather than when we fail
>> on submission. The liburing change would be the same, the kernel side
>> would then look like the below.
>>
>>
>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>> index 4c9a494c9f9f..01981926cdf4 100644
>> --- a/fs/io_uring.c
>> +++ b/fs/io_uring.c
>> @@ -1342,6 +1342,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
>>   	if (cqe) {
>>   		clear_bit(0, &ctx->sq_check_overflow);
>>   		clear_bit(0, &ctx->cq_check_overflow);
>> +		ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
>>   	}
>>   	spin_unlock_irqrestore(&ctx->completion_lock, flags);
>>   	io_cqring_ev_posted(ctx);
>> @@ -1379,6 +1380,7 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
>>   		if (list_empty(&ctx->cq_overflow_list)) {
>>   			set_bit(0, &ctx->sq_check_overflow);
>>   			set_bit(0, &ctx->cq_check_overflow);
>> +			ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
>>   		}
>>   		req->flags |= REQ_F_OVERFLOW;
>>   		refcount_inc(&req->refs);
>> @@ -6375,9 +6377,9 @@ static int io_sq_thread(void *data)
>>   			}
>>   
>>   			/* Tell userspace we may need a wakeup call */
>> +			spin_lock_irq(&ctx->completion_lock);
>>   			ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
>> -			/* make sure to read SQ tail after writing flags */
>> -			smp_mb();
>> +			spin_unlock_irq(&ctx->completion_lock);
>>   
>>   			to_submit = io_sqring_entries(ctx);
>>   			if (!to_submit || ret == -EBUSY) {
>> @@ -6400,7 +6402,9 @@ static int io_sq_thread(void *data)
>>   			}
>>   			finish_wait(&ctx->sqo_wait, &wait);
>>   
>> +			spin_lock_irq(&ctx->completion_lock);
>>   			ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
>> +			spin_unlock_irq(&ctx->completion_lock);
>>   		}
>>   
>>   		mutex_lock(&ctx->uring_lock);
>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
>> index 8d033961cb78..91953b543125 100644
>> --- a/include/uapi/linux/io_uring.h
>> +++ b/include/uapi/linux/io_uring.h
>> @@ -198,6 +198,7 @@ struct io_sqring_offsets {
>>    * sq_ring->flags
>>    */
>>   #define IORING_SQ_NEED_WAKEUP	(1U << 0) /* needs io_uring_enter wakeup */
>> +#define IORING_SQ_CQ_OVERFLOW	(1U << 1) /* app needs to enter kernel */
>>   
>>   struct io_cqring_offsets {
>>   	__u32 head;
>>
> Looks good, I think it'll work now.
> I'll test and send patches soon, thanks.

There was one missing bit clear, and I corrected that last comment. Just base on
this one, thanks!


diff --git a/fs/io_uring.c b/fs/io_uring.c
index 4c9a494c9f9f..0b6a4b2d7e76 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1342,6 +1342,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 	if (cqe) {
 		clear_bit(0, &ctx->sq_check_overflow);
 		clear_bit(0, &ctx->cq_check_overflow);
+		ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
 	}
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 	io_cqring_ev_posted(ctx);
@@ -1379,6 +1380,7 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
 		if (list_empty(&ctx->cq_overflow_list)) {
 			set_bit(0, &ctx->sq_check_overflow);
 			set_bit(0, &ctx->cq_check_overflow);
+			ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
 		}
 		req->flags |= REQ_F_OVERFLOW;
 		refcount_inc(&req->refs);
@@ -6375,9 +6377,9 @@ static int io_sq_thread(void *data)
 			}
 
 			/* Tell userspace we may need a wakeup call */
+			spin_lock_irq(&ctx->completion_lock);
 			ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
-			/* make sure to read SQ tail after writing flags */
-			smp_mb();
+			spin_unlock_irq(&ctx->completion_lock);
 
 			to_submit = io_sqring_entries(ctx);
 			if (!to_submit || ret == -EBUSY) {
@@ -6400,7 +6402,9 @@ static int io_sq_thread(void *data)
 			}
 			finish_wait(&ctx->sqo_wait, &wait);
 
+			spin_lock_irq(&ctx->completion_lock);
 			ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
+			spin_unlock_irq(&ctx->completion_lock);
 		}
 
 		mutex_lock(&ctx->uring_lock);
@@ -7810,6 +7814,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
 			if (list_empty(&ctx->cq_overflow_list)) {
 				clear_bit(0, &ctx->sq_check_overflow);
 				clear_bit(0, &ctx->cq_check_overflow);
+				ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
 			}
 			spin_unlock_irq(&ctx->completion_lock);
 
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 8d033961cb78..d65fde732518 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -198,6 +198,7 @@ struct io_sqring_offsets {
  * sq_ring->flags
  */
 #define IORING_SQ_NEED_WAKEUP	(1U << 0) /* needs io_uring_enter wakeup */
+#define IORING_SQ_CQ_OVERFLOW	(1U << 1) /* CQ ring is overflown */
 
 struct io_cqring_offsets {
 	__u32 head;

-- 
Jens Axboe


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH] io_uring: export cq overflow status to userspace
  2020-07-08 15:41               ` Jens Axboe
@ 2020-07-08 16:51                 ` Xiaoguang Wang
  2020-07-08 21:33                   ` Jens Axboe
  0 siblings, 1 reply; 18+ messages in thread
From: Xiaoguang Wang @ 2020-07-08 16:51 UTC (permalink / raw)
  To: Jens Axboe; +Cc: io-uring, joseph.qi

hi,

> On 7/8/20 9:39 AM, Xiaoguang Wang wrote:
>> hi,
>>
>>> On 7/7/20 11:29 PM, Xiaoguang Wang wrote:
>>>> I modified the above test program a bit:
>>>> #include <errno.h>
>>>> #include <stdio.h>
>>>> #include <unistd.h>
>>>> #include <stdlib.h>
>>>> #include <string.h>
>>>> #include <fcntl.h>
>>>> #include <assert.h>
>>>>
>>>> #include "liburing.h"
>>>>
>>>> static void test_cq_overflow(struct io_uring *ring)
>>>> {
>>>>            struct io_uring_cqe *cqe;
>>>>            struct io_uring_sqe *sqe;
>>>>            int issued = 0;
>>>>            int ret = 0;
>>>>            int i;
>>>>
>>>>            for (i = 0; i < 33; i++) {
>>>>                    sqe = io_uring_get_sqe(ring);
>>>>                    if (!sqe) {
>>>>                            fprintf(stderr, "get sqe failed\n");
>>>>                            break;
>>>>                    }
>>>>                    ret = io_uring_submit(ring);
>>>>                    if (ret <= 0) {
>>>>                            if (ret != -EBUSY)
>>>>                                    fprintf(stderr, "sqe submit failed: %d\n", ret);
>>>>                            break;
>>>>                    }
>>>>                    issued++;
>>>>            }
>>>>
>>>>            printf("issued requests: %d\n", issued);
>>>>
>>>>            while (issued) {
>>>>                    ret = io_uring_peek_cqe(ring, &cqe);
>>>>                    if (ret) {
>>>>                            if (ret != -EAGAIN) {
>>>>                                    fprintf(stderr, "peek completion failed: %s\n",
>>>>                                            strerror(ret));
>>>>                                    break;
>>>>                            }
>>>>                            printf("left requests: %d %d\n", issued, IO_URING_READ_ONCE(*ring->sq.kflags));
>>>>                            continue;
>>>>                    }
>>>>                    io_uring_cqe_seen(ring, cqe);
>>>>                    issued--;
>>>>                    printf("left requests: %d\n", issued);
>>>>            }
>>>> }
>>>>
>>>> int main(int argc, char *argv[])
>>>> {
>>>>            int ret;
>>>>            struct io_uring ring;
>>>>
>>>>            ret = io_uring_queue_init(16, &ring, 0);
>>>>            if (ret) {
>>>>                    fprintf(stderr, "ring setup failed: %d\n", ret);
>>>>                    return 1;
>>>>            }
>>>>
>>>>            test_cq_overflow(&ring);
>>>>            return 0;
>>>> }
>>>>
>>>> Though with your patches applied, we still can not peek the last cqe.
>>>> This test program will only issue 33 sqes, so it won't get EBUSY error.
>>>
>>> How about we make this even simpler, then - make the
>>> IORING_SQ_CQ_OVERFLOW actually track the state, rather than when we fail
>>> on submission. The liburing change would be the same, the kernel side
>>> would then look like the below.
>>>
>>>
>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>> index 4c9a494c9f9f..01981926cdf4 100644
>>> --- a/fs/io_uring.c
>>> +++ b/fs/io_uring.c
>>> @@ -1342,6 +1342,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
>>>    	if (cqe) {
>>>    		clear_bit(0, &ctx->sq_check_overflow);
>>>    		clear_bit(0, &ctx->cq_check_overflow);
>>> +		ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
>>>    	}
>>>    	spin_unlock_irqrestore(&ctx->completion_lock, flags);
>>>    	io_cqring_ev_posted(ctx);
>>> @@ -1379,6 +1380,7 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
>>>    		if (list_empty(&ctx->cq_overflow_list)) {
>>>    			set_bit(0, &ctx->sq_check_overflow);
>>>    			set_bit(0, &ctx->cq_check_overflow);
>>> +			ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
Some callers of __io_cqring_fill_event() don't hold completion_lock, for example:
==> io_iopoll_complete
====> __io_cqring_fill_event()
So this patch may still not be safe when SQPOLL is enabled.
Do you prefer adding a new lock, or taking completion_lock here only when the cq ring has overflowed?

Regards,
Xiaoguang Wang

>>>    		}
>>>    		req->flags |= REQ_F_OVERFLOW;
>>>    		refcount_inc(&req->refs);
>>> @@ -6375,9 +6377,9 @@ static int io_sq_thread(void *data)
>>>    			}
>>>    
>>>    			/* Tell userspace we may need a wakeup call */
>>> +			spin_lock_irq(&ctx->completion_lock);
>>>    			ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
>>> -			/* make sure to read SQ tail after writing flags */
>>> -			smp_mb();
>>> +			spin_unlock_irq(&ctx->completion_lock);
>>>    
>>>    			to_submit = io_sqring_entries(ctx);
>>>    			if (!to_submit || ret == -EBUSY) {
>>> @@ -6400,7 +6402,9 @@ static int io_sq_thread(void *data)
>>>    			}
>>>    			finish_wait(&ctx->sqo_wait, &wait);
>>>    
>>> +			spin_lock_irq(&ctx->completion_lock);
>>>    			ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
>>> +			spin_unlock_irq(&ctx->completion_lock);
>>>    		}
>>>    
>>>    		mutex_lock(&ctx->uring_lock);
>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
>>> index 8d033961cb78..91953b543125 100644
>>> --- a/include/uapi/linux/io_uring.h
>>> +++ b/include/uapi/linux/io_uring.h
>>> @@ -198,6 +198,7 @@ struct io_sqring_offsets {
>>>     * sq_ring->flags
>>>     */
>>>    #define IORING_SQ_NEED_WAKEUP	(1U << 0) /* needs io_uring_enter wakeup */
>>> +#define IORING_SQ_CQ_OVERFLOW	(1U << 1) /* app needs to enter kernel */
>>>    
>>>    struct io_cqring_offsets {
>>>    	__u32 head;
>>>
>> Looks good, I think it'll work now.
>> I'll test and send patches soon, thanks.
> 
> There was one missing bit clear, and I corrected that last comment. Just base on
> this one, thanks!
> 
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 4c9a494c9f9f..0b6a4b2d7e76 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -1342,6 +1342,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
>   	if (cqe) {
>   		clear_bit(0, &ctx->sq_check_overflow);
>   		clear_bit(0, &ctx->cq_check_overflow);
> +		ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
>   	}
>   	spin_unlock_irqrestore(&ctx->completion_lock, flags);
>   	io_cqring_ev_posted(ctx);
> @@ -1379,6 +1380,7 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
>   		if (list_empty(&ctx->cq_overflow_list)) {
>   			set_bit(0, &ctx->sq_check_overflow);
>   			set_bit(0, &ctx->cq_check_overflow);
> +			ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
>   		}
>   		req->flags |= REQ_F_OVERFLOW;
>   		refcount_inc(&req->refs);
> @@ -6375,9 +6377,9 @@ static int io_sq_thread(void *data)
>   			}
>   
>   			/* Tell userspace we may need a wakeup call */
> +			spin_lock_irq(&ctx->completion_lock);
>   			ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
> -			/* make sure to read SQ tail after writing flags */
> -			smp_mb();
> +			spin_unlock_irq(&ctx->completion_lock);
>   
>   			to_submit = io_sqring_entries(ctx);
>   			if (!to_submit || ret == -EBUSY) {
> @@ -6400,7 +6402,9 @@ static int io_sq_thread(void *data)
>   			}
>   			finish_wait(&ctx->sqo_wait, &wait);
>   
> +			spin_lock_irq(&ctx->completion_lock);
>   			ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
> +			spin_unlock_irq(&ctx->completion_lock);
>   		}
>   
>   		mutex_lock(&ctx->uring_lock);
> @@ -7810,6 +7814,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
>   			if (list_empty(&ctx->cq_overflow_list)) {
>   				clear_bit(0, &ctx->sq_check_overflow);
>   				clear_bit(0, &ctx->cq_check_overflow);
> +				ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
>   			}
>   			spin_unlock_irq(&ctx->completion_lock);
>   
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 8d033961cb78..d65fde732518 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -198,6 +198,7 @@ struct io_sqring_offsets {
>    * sq_ring->flags
>    */
>   #define IORING_SQ_NEED_WAKEUP	(1U << 0) /* needs io_uring_enter wakeup */
> +#define IORING_SQ_CQ_OVERFLOW	(1U << 1) /* CQ ring is overflown */
>   
>   struct io_cqring_offsets {
>   	__u32 head;
> 

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] io_uring: export cq overflow status to userspace
  2020-07-08 16:51                 ` Xiaoguang Wang
@ 2020-07-08 21:33                   ` Jens Axboe
  2020-07-09  0:52                     ` Xiaoguang Wang
  0 siblings, 1 reply; 18+ messages in thread
From: Jens Axboe @ 2020-07-08 21:33 UTC (permalink / raw)
  To: Xiaoguang Wang; +Cc: io-uring, joseph.qi

On 7/8/20 10:51 AM, Xiaoguang Wang wrote:
> hi,
> 
>> On 7/8/20 9:39 AM, Xiaoguang Wang wrote:
>>> hi,
>>>
>>>> On 7/7/20 11:29 PM, Xiaoguang Wang wrote:
>>>>> I modified the above test program a bit:
>>>>> #include <errno.h>
>>>>> #include <stdio.h>
>>>>> #include <unistd.h>
>>>>> #include <stdlib.h>
>>>>> #include <string.h>
>>>>> #include <fcntl.h>
>>>>> #include <assert.h>
>>>>>
>>>>> #include "liburing.h"
>>>>>
>>>>> static void test_cq_overflow(struct io_uring *ring)
>>>>> {
>>>>>            struct io_uring_cqe *cqe;
>>>>>            struct io_uring_sqe *sqe;
>>>>>            int issued = 0;
>>>>>            int ret = 0;
>>>>>            int i;
>>>>>
>>>>>            for (i = 0; i < 33; i++) {
>>>>>                    sqe = io_uring_get_sqe(ring);
>>>>>                    if (!sqe) {
>>>>>                            fprintf(stderr, "get sqe failed\n");
>>>>>                            break;
>>>>>                    }
>>>>>                    ret = io_uring_submit(ring);
>>>>>                    if (ret <= 0) {
>>>>>                            if (ret != -EBUSY)
>>>>>                                    fprintf(stderr, "sqe submit failed: %d\n", ret);
>>>>>                            break;
>>>>>                    }
>>>>>                    issued++;
>>>>>            }
>>>>>
>>>>>            printf("issued requests: %d\n", issued);
>>>>>
>>>>>            while (issued) {
>>>>>                    ret = io_uring_peek_cqe(ring, &cqe);
>>>>>                    if (ret) {
>>>>>                            if (ret != -EAGAIN) {
>>>>>                                    fprintf(stderr, "peek completion failed: %s\n",
>>>>>                                            strerror(ret));
>>>>>                                    break;
>>>>>                            }
>>>>>                            printf("left requests: %d %d\n", issued, IO_URING_READ_ONCE(*ring->sq.kflags));
>>>>>                            continue;
>>>>>                    }
>>>>>                    io_uring_cqe_seen(ring, cqe);
>>>>>                    issued--;
>>>>>                    printf("left requests: %d\n", issued);
>>>>>            }
>>>>> }
>>>>>
>>>>> int main(int argc, char *argv[])
>>>>> {
>>>>>            int ret;
>>>>>            struct io_uring ring;
>>>>>
>>>>>            ret = io_uring_queue_init(16, &ring, 0);
>>>>>            if (ret) {
>>>>>                    fprintf(stderr, "ring setup failed: %d\n", ret);
>>>>>                    return 1;
>>>>>            }
>>>>>
>>>>>            test_cq_overflow(&ring);
>>>>>            return 0;
>>>>> }
>>>>>
>>>>> Though with your patches applied, we still can not peek the last cqe.
>>>>> This test program will only issue 33 sqes, so it won't get EBUSY error.
>>>>
>>>> How about we make this even simpler, then - make the
>>>> IORING_SQ_CQ_OVERFLOW actually track the state, rather than when we fail
>>>> on submission. The liburing change would be the same, the kernel side
>>>> would then look like the below.
>>>>
>>>>
>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>>> index 4c9a494c9f9f..01981926cdf4 100644
>>>> --- a/fs/io_uring.c
>>>> +++ b/fs/io_uring.c
>>>> @@ -1342,6 +1342,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
>>>>    	if (cqe) {
>>>>    		clear_bit(0, &ctx->sq_check_overflow);
>>>>    		clear_bit(0, &ctx->cq_check_overflow);
>>>> +		ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
>>>>    	}
>>>>    	spin_unlock_irqrestore(&ctx->completion_lock, flags);
>>>>    	io_cqring_ev_posted(ctx);
>>>> @@ -1379,6 +1380,7 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
>>>>    		if (list_empty(&ctx->cq_overflow_list)) {
>>>>    			set_bit(0, &ctx->sq_check_overflow);
>>>>    			set_bit(0, &ctx->cq_check_overflow);
>>>> +			ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
> Some callers of __io_cqring_fill_event() don't hold completion_lock, for example:
> ==> io_iopoll_complete
> ====> __io_cqring_fill_event()
> So this patch may still not be safe when SQPOLL is enabled.
> Do you prefer adding a new lock, or taking completion_lock here only when the cq ring has overflowed?

The polled side isn't IRQ driven, so it should be serialized separately. This works
because we don't allow non-polled IO on a polled context, and vice versa. If we did,
we'd have bigger issues than just the flags modification.

So it should be fine as-is.
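
As a sketch of that pattern (io_some_op_prep is a placeholder name, not one
specific function), prep handlers for irq-driven opcodes bail out early on a
polled ring, which is what keeps a given ring on exactly one of the two
completion paths:

	static int io_some_op_prep(struct io_kiocb *req,
				   const struct io_uring_sqe *sqe)
	{
		/* irq-driven opcodes aren't allowed on IOPOLL rings */
		if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
			return -EINVAL;
		/* ... normal prep work ... */
		return 0;
	}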

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] io_uring: export cq overflow status to userspace
  2020-07-08 21:33                   ` Jens Axboe
@ 2020-07-09  0:52                     ` Xiaoguang Wang
  0 siblings, 0 replies; 18+ messages in thread
From: Xiaoguang Wang @ 2020-07-09  0:52 UTC (permalink / raw)
  To: Jens Axboe; +Cc: io-uring, joseph.qi

hi,

> On 7/8/20 10:51 AM, Xiaoguang Wang wrote:
>> hi,
>>
>>> On 7/8/20 9:39 AM, Xiaoguang Wang wrote:
>>>> hi,
>>>>
>>>>> On 7/7/20 11:29 PM, Xiaoguang Wang wrote:
>>>>>> I modified the above test program a bit:
>>>>>> #include <errno.h>
>>>>>> #include <stdio.h>
>>>>>> #include <unistd.h>
>>>>>> #include <stdlib.h>
>>>>>> #include <string.h>
>>>>>> #include <fcntl.h>
>>>>>> #include <assert.h>
>>>>>>
>>>>>> #include "liburing.h"
>>>>>>
>>>>>> static void test_cq_overflow(struct io_uring *ring)
>>>>>> {
>>>>>>             struct io_uring_cqe *cqe;
>>>>>>             struct io_uring_sqe *sqe;
>>>>>>             int issued = 0;
>>>>>>             int ret = 0;
>>>>>>             int i;
>>>>>>
>>>>>>             for (i = 0; i < 33; i++) {
>>>>>>                     sqe = io_uring_get_sqe(ring);
>>>>>>                     if (!sqe) {
>>>>>>                             fprintf(stderr, "get sqe failed\n");
>>>>>>                             break;
>>>>>>                     }
>>>>>>                     ret = io_uring_submit(ring);
>>>>>>                     if (ret <= 0) {
>>>>>>                             if (ret != -EBUSY)
>>>>>>                                     fprintf(stderr, "sqe submit failed: %d\n", ret);
>>>>>>                             break;
>>>>>>                     }
>>>>>>                     issued++;
>>>>>>             }
>>>>>>
>>>>>>             printf("issued requests: %d\n", issued);
>>>>>>
>>>>>>             while (issued) {
>>>>>>                     ret = io_uring_peek_cqe(ring, &cqe);
>>>>>>                     if (ret) {
>>>>>>                             if (ret != -EAGAIN) {
>>>>>>                                     fprintf(stderr, "peek completion failed: %s\n",
>>>>>>                                             strerror(ret));
>>>>>>                                     break;
>>>>>>                             }
>>>>>>                             printf("left requests: %d %d\n", issued, IO_URING_READ_ONCE(*ring->sq.kflags));
>>>>>>                             continue;
>>>>>>                     }
>>>>>>                     io_uring_cqe_seen(ring, cqe);
>>>>>>                     issued--;
>>>>>>                     printf("left requests: %d\n", issued);
>>>>>>             }
>>>>>> }
>>>>>>
>>>>>> int main(int argc, char *argv[])
>>>>>> {
>>>>>>             int ret;
>>>>>>             struct io_uring ring;
>>>>>>
>>>>>>             ret = io_uring_queue_init(16, &ring, 0);
>>>>>>             if (ret) {
>>>>>>                     fprintf(stderr, "ring setup failed: %d\n", ret);
>>>>>>                     return 1;
>>>>>>             }
>>>>>>
>>>>>>             test_cq_overflow(&ring);
>>>>>>             return 0;
>>>>>> }
>>>>>>
>>>>>> Though with your patches applied, we still can not peek the last cqe.
>>>>>> This test program will only issue 33 sqes, so it won't get EBUSY error.
>>>>>
>>>>> How about we make this even simpler, then - make the
>>>>> IORING_SQ_CQ_OVERFLOW actually track the state, rather than when we fail
>>>>> on submission. The liburing change would be the same, the kernel side
>>>>> would then look like the below.
>>>>>
>>>>>
>>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>>>> index 4c9a494c9f9f..01981926cdf4 100644
>>>>> --- a/fs/io_uring.c
>>>>> +++ b/fs/io_uring.c
>>>>> @@ -1342,6 +1342,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
>>>>>     	if (cqe) {
>>>>>     		clear_bit(0, &ctx->sq_check_overflow);
>>>>>     		clear_bit(0, &ctx->cq_check_overflow);
>>>>> +		ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
>>>>>     	}
>>>>>     	spin_unlock_irqrestore(&ctx->completion_lock, flags);
>>>>>     	io_cqring_ev_posted(ctx);
>>>>> @@ -1379,6 +1380,7 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
>>>>>     		if (list_empty(&ctx->cq_overflow_list)) {
>>>>>     			set_bit(0, &ctx->sq_check_overflow);
>>>>>     			set_bit(0, &ctx->cq_check_overflow);
>>>>> +			ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
>> Some callers of __io_cqring_fill_event() don't hold completion_lock, for example:
>> ==> io_iopoll_complete
>> ====> __io_cqring_fill_event()
>> So this patch may still not be safe when SQPOLL is enabled.
>> Do you prefer adding a new lock, or taking completion_lock here only when the cq ring has overflowed?
> 
> The polled side isn't IRQ driven, so it should be serialized separately. This works
> because we don't allow non-polled IO on a polled context, and vice versa. If we did,
> we'd have bigger issues than just the flags modification.
> 
> So it should be fine as-is.
Thanks for the explanation. Previously I was worried about the below race:
==> io_uring_enter
==== > io_iopoll_check
======> io_iopoll_getevents
========> io_do_iopoll
==========> io_iopoll_complete
============> __io_cqring_fill_event, which will modify sq_flags

and

==> io_sq_thread
====> goes to sleep, and so also modifies sq_flags.

But indeed the above race won't happen, because if SQPOLL and IOPOLL are both
enabled, io_uring_enter will rely on io_sq_thread to do the iopoll job.
		if (ctx->flags & IORING_SETUP_IOPOLL &&
		    !(ctx->flags & IORING_SETUP_SQPOLL)) {
			ret = io_iopoll_check(ctx, &nr_events, min_complete);
		} else {
			ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
		}

Regards,
Xiaoguang Wang


> 

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2020-07-09  0:52 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-07-07 13:24 [PATCH] io_uring: export cq overflow status to userspace Xiaoguang Wang
2020-07-07 14:28 ` Jens Axboe
2020-07-07 16:21   ` Jens Axboe
2020-07-07 16:25     ` Pavel Begunkov
2020-07-07 16:30       ` Jens Axboe
2020-07-07 16:36     ` Xiaoguang Wang
2020-07-07 17:23       ` Jens Axboe
2020-07-08  3:25     ` Xiaoguang Wang
2020-07-08  3:46       ` Jens Axboe
2020-07-08  5:29         ` Xiaoguang Wang
2020-07-08 15:29           ` Jens Axboe
2020-07-08 15:39             ` Xiaoguang Wang
2020-07-08 15:41               ` Jens Axboe
2020-07-08 16:51                 ` Xiaoguang Wang
2020-07-08 21:33                   ` Jens Axboe
2020-07-09  0:52                     ` Xiaoguang Wang
2020-07-07 16:29   ` Xiaoguang Wang
2020-07-07 16:30     ` Jens Axboe
