* [RFC] io_uring CQ ring backpressure
@ 2019-11-06 16:21 Jens Axboe
  2019-11-06 19:12 ` Pavel Begunkov
  2019-11-06 19:51 ` Jann Horn
  0 siblings, 2 replies; 9+ messages in thread
From: Jens Axboe @ 2019-11-06 16:21 UTC (permalink / raw)
  To: io-uring, linux-block

Currently we drop completion events, if the CQ ring is full. That's fine
for requests with bounded completion times, but it may make it harder to
use io_uring with networked IO where request completion times are
generally unbounded. Or with POLL, for example, which is also unbounded.

This patch adds IORING_SETUP_CQ_NODROP, which changes the behavior a bit
for CQ ring overflows. First of all, it doesn't overflow the ring, it
simply stores a backlog of completions that we weren't able to put into
the CQ ring. To prevent the backlog from growing indefinitely, if the
backlog is non-empty, we apply back pressure on IO submissions. Any
attempt to submit new IO with a non-empty backlog will get an -EBUSY
return from the kernel.

I think that makes for a pretty sane API in terms of how the application
can handle it. With CQ_NODROP enabled, we'll never drop a completion
event (well unless we're totally out of memory...), but we'll also not
allow submissions with a completion backlog.
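
For illustration only (not part of the patch), here's a rough sketch of what
the submit path could look like on the application side. It's written against
liburing-style helpers (io_uring_queue_init(), io_uring_submit(),
io_uring_wait_cqe()) and assumes the IORING_SETUP_CQ_NODROP flag proposed
below; error handling is trimmed to the part that matters here.

#include <errno.h>
#include <liburing.h>

/* ring setup done elsewhere, e.g.:
 *	io_uring_queue_init(queue_depth, ring, IORING_SETUP_CQ_NODROP);
 */
static int submit_and_reap(struct io_uring *ring)
{
	struct io_uring_cqe *cqe;
	int ret;

	for (;;) {
		ret = io_uring_submit(ring);
		if (ret != -EBUSY)
			return ret;

		/*
		 * The kernel has a completion backlog. Reap a CQE, which
		 * also lets the kernel flush backlogged entries into the
		 * CQ ring, then retry the submit.
		 */
		ret = io_uring_wait_cqe(ring, &cqe);
		if (ret < 0)
			return ret;
		/* ... process cqe->user_data / cqe->res ... */
		io_uring_cqe_seen(ring, cqe);
	}
}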

---

diff --git a/fs/io_uring.c b/fs/io_uring.c
index b647cdf0312c..12e9fe2479f4 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -207,6 +207,7 @@ struct io_ring_ctx {
 
 		struct list_head	defer_list;
 		struct list_head	timeout_list;
+		struct list_head	cq_overflow_list;
 
 		wait_queue_head_t	inflight_wait;
 	} ____cacheline_aligned_in_smp;
@@ -414,6 +415,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 
 	ctx->flags = p->flags;
 	init_waitqueue_head(&ctx->cq_wait);
+	INIT_LIST_HEAD(&ctx->cq_overflow_list);
 	init_completion(&ctx->ctx_done);
 	init_completion(&ctx->sqo_thread_started);
 	mutex_init(&ctx->uring_lock);
@@ -588,6 +590,77 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
 	return &rings->cqes[tail & ctx->cq_mask];
 }
 
+static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+{
+	if (waitqueue_active(&ctx->wait))
+		wake_up(&ctx->wait);
+	if (waitqueue_active(&ctx->sqo_wait))
+		wake_up(&ctx->sqo_wait);
+	if (ctx->cq_ev_fd)
+		eventfd_signal(ctx->cq_ev_fd, 1);
+}
+
+struct cqe_drop {
+	struct list_head list;
+	u64 user_data;
+	s32 res;
+};
+
+static void io_cqring_overflow_flush(struct io_ring_ctx *ctx)
+{
+	struct io_rings *rings = ctx->rings;
+	struct io_uring_cqe *cqe;
+	struct cqe_drop *drop;
+	unsigned long flags;
+
+	if (list_empty_careful(&ctx->cq_overflow_list))
+		return;
+	if (ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
+	    rings->cq_ring_entries)
+		return;
+
+	spin_lock_irqsave(&ctx->completion_lock, flags);
+
+	while (!list_empty(&ctx->cq_overflow_list)) {
+		drop = list_first_entry(&ctx->cq_overflow_list, struct cqe_drop,
+						list);
+		cqe = io_get_cqring(ctx);
+		if (!cqe)
+			break;
+		list_del(&drop->list);
+		WRITE_ONCE(cqe->user_data, drop->user_data);
+		WRITE_ONCE(cqe->res, drop->res);
+		WRITE_ONCE(cqe->flags, 0);
+		kfree(drop);
+	}
+
+	io_commit_cqring(ctx);
+	spin_unlock_irqrestore(&ctx->completion_lock, flags);
+	io_cqring_ev_posted(ctx);
+}
+
+static void io_cqring_overflow(struct io_ring_ctx *ctx, u64 ki_user_data,
+			       long res)
+	__must_hold(&ctx->completion_lock)
+{
+	struct cqe_drop *drop;
+
+	if (!(ctx->flags & IORING_SETUP_CQ_NODROP)) {
+log_overflow:
+		WRITE_ONCE(ctx->rings->cq_overflow,
+				atomic_inc_return(&ctx->cached_cq_overflow));
+		return;
+	}
+
+	drop = kmalloc(sizeof(*drop), GFP_ATOMIC);
+	if (!drop)
+		goto log_overflow;
+
+	drop->user_data = ki_user_data;
+	drop->res = res;
+	list_add_tail(&drop->list, &ctx->cq_overflow_list);
+}
+
 static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
 				 long res)
 {
@@ -601,26 +674,15 @@ static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
 	 * the ring.
 	 */
 	cqe = io_get_cqring(ctx);
-	if (cqe) {
+	if (likely(cqe)) {
 		WRITE_ONCE(cqe->user_data, ki_user_data);
 		WRITE_ONCE(cqe->res, res);
 		WRITE_ONCE(cqe->flags, 0);
 	} else {
-		WRITE_ONCE(ctx->rings->cq_overflow,
-				atomic_inc_return(&ctx->cached_cq_overflow));
+		io_cqring_overflow(ctx, ki_user_data, res);
 	}
 }
 
-static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
-{
-	if (waitqueue_active(&ctx->wait))
-		wake_up(&ctx->wait);
-	if (waitqueue_active(&ctx->sqo_wait))
-		wake_up(&ctx->sqo_wait);
-	if (ctx->cq_ev_fd)
-		eventfd_signal(ctx->cq_ev_fd, 1);
-}
-
 static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data,
 				long res)
 {
@@ -859,8 +921,13 @@ static void io_put_req(struct io_kiocb *req, struct io_kiocb **nxtptr)
 	}
 }
 
-static unsigned io_cqring_events(struct io_rings *rings)
+static unsigned io_cqring_events(struct io_ring_ctx *ctx)
 {
+	struct io_rings *rings = ctx->rings;
+
+	if (ctx->flags & IORING_SETUP_CQ_NODROP)
+		io_cqring_overflow_flush(ctx);
+
 	/* See comment at the top of this file */
 	smp_rmb();
 	return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
@@ -1016,7 +1083,7 @@ static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
 		 * If we do, we can potentially be spinning for commands that
 		 * already triggered a CQE (eg in error).
 		 */
-		if (io_cqring_events(ctx->rings))
+		if (io_cqring_events(ctx))
 			break;
 
 		/*
@@ -2873,6 +2940,10 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 	int i, submitted = 0;
 	bool mm_fault = false;
 
+	if ((ctx->flags & IORING_SETUP_CQ_NODROP) &&
+	    !list_empty(&ctx->cq_overflow_list))
+		return -EBUSY;
+
 	if (nr > IO_PLUG_THRESHOLD) {
 		io_submit_state_start(&state, ctx, nr);
 		statep = &state;
@@ -2952,6 +3023,7 @@ static int io_sq_thread(void *data)
 	timeout = inflight = 0;
 	while (!kthread_should_park()) {
 		unsigned int to_submit;
+		int ret;
 
 		if (inflight) {
 			unsigned nr_events = 0;
@@ -3036,8 +3108,10 @@ static int io_sq_thread(void *data)
 		}
 
 		to_submit = min(to_submit, ctx->sq_entries);
-		inflight += io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm,
-					   true);
+		ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
+		if (ret < 0)
+			continue;
+		inflight += ret;
 
 		/* Commit SQ ring head once we've consumed all SQEs */
 		io_commit_sqring(ctx);
@@ -3070,7 +3144,7 @@ static inline bool io_should_wake(struct io_wait_queue *iowq)
 	 * started waiting. For timeouts, we always want to return to userspace,
 	 * regardless of event count.
 	 */
-	return io_cqring_events(ctx->rings) >= iowq->to_wait ||
+	return io_cqring_events(ctx) >= iowq->to_wait ||
 			atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
 }
 
@@ -3105,7 +3179,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 	struct io_rings *rings = ctx->rings;
 	int ret = 0;
 
-	if (io_cqring_events(rings) >= min_events)
+	if (io_cqring_events(ctx) >= min_events)
 		return 0;
 
 	if (sig) {
@@ -4406,7 +4480,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
 	}
 
 	if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
-			IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE))
+			IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
+			IORING_SETUP_CQ_NODROP))
 		return -EINVAL;
 
 	ret = io_uring_create(entries, &p);
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index f1a118b01d18..3d8517eb376e 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -56,6 +56,7 @@ struct io_uring_sqe {
 #define IORING_SETUP_SQPOLL	(1U << 1)	/* SQ poll thread */
 #define IORING_SETUP_SQ_AFF	(1U << 2)	/* sq_thread_cpu is valid */
 #define IORING_SETUP_CQSIZE	(1U << 3)	/* app defines CQ size */
+#define IORING_SETUP_CQ_NODROP	(1U << 4)	/* no CQ drops */
 
 #define IORING_OP_NOP		0
 #define IORING_OP_READV		1

-- 
Jens Axboe



* Re: [RFC] io_uring CQ ring backpressure
  2019-11-06 16:21 [RFC] io_uring CQ ring backpressure Jens Axboe
@ 2019-11-06 19:12 ` Pavel Begunkov
  2019-11-06 19:43   ` Jens Axboe
  2019-11-06 19:51 ` Jann Horn
  1 sibling, 1 reply; 9+ messages in thread
From: Pavel Begunkov @ 2019-11-06 19:12 UTC (permalink / raw)
  To: Jens Axboe, io-uring, linux-block



On 06/11/2019 19:21, Jens Axboe wrote:
> Currently we drop completion events, if the CQ ring is full. That's fine
> for requests with bounded completion times, but it may make it harder to
> use io_uring with networked IO where request completion times are
> generally unbounded. Or with POLL, for example, which is also unbounded.
> 
> This patch adds IORING_SETUP_CQ_NODROP, which changes the behavior a bit
> for CQ ring overflows. First of all, it doesn't overflow the ring, it
> simply stores backlog of completions that we weren't able to put into
> the CQ ring. To prevent the backlog from growing indefinitely, if the
> backlog is non-empty, we apply back pressure on IO submissions. Any
> attempt to submit new IO with a non-empty backlog will get an -EBUSY
> return from the kernel.
> 
> I think that makes for a pretty sane API in terms of how the application
> can handle it. With CQ_NODROP enabled, we'll never drop a completion
> event (well unless we're totally out of memory...), but we'll also not
> allow submissions with a completion backlog.
> 
> ---
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index b647cdf0312c..12e9fe2479f4 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -207,6 +207,7 @@ struct io_ring_ctx {
>  
>  		struct list_head	defer_list;
>  		struct list_head	timeout_list;
> +		struct list_head	cq_overflow_list;
>  
>  		wait_queue_head_t	inflight_wait;
>  	} ____cacheline_aligned_in_smp;
> @@ -414,6 +415,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
>  
>  	ctx->flags = p->flags;
>  	init_waitqueue_head(&ctx->cq_wait);
> +	INIT_LIST_HEAD(&ctx->cq_overflow_list);
>  	init_completion(&ctx->ctx_done);
>  	init_completion(&ctx->sqo_thread_started);
>  	mutex_init(&ctx->uring_lock);
> @@ -588,6 +590,77 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
>  	return &rings->cqes[tail & ctx->cq_mask];
>  }
>  
> +static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
> +{
> +	if (waitqueue_active(&ctx->wait))
> +		wake_up(&ctx->wait);
> +	if (waitqueue_active(&ctx->sqo_wait))
> +		wake_up(&ctx->sqo_wait);
> +	if (ctx->cq_ev_fd)
> +		eventfd_signal(ctx->cq_ev_fd, 1);
> +}
> +
> +struct cqe_drop {
> +	struct list_head list;
> +	u64 user_data;
> +	s32 res;
> +};

How about using io_kiocb instead of a new structure?
It already has a valid req->user_data and the occasionally used
req->result. But this would probably take more work to do.

> +
> +static void io_cqring_overflow_flush(struct io_ring_ctx *ctx)
> +{
> +	struct io_rings *rings = ctx->rings;
> +	struct io_uring_cqe *cqe;
> +	struct cqe_drop *drop;
> +	unsigned long flags;
> +
> +	if (list_empty_careful(&ctx->cq_overflow_list))
> +		return;
> +	if (ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
> +	    rings->cq_ring_entries)
> +		return;
> +
> +	spin_lock_irqsave(&ctx->completion_lock, flags);
> +
> +	while (!list_empty(&ctx->cq_overflow_list)) {
> +		drop = list_first_entry(&ctx->cq_overflow_list, struct cqe_drop,
> +						list);
> +		cqe = io_get_cqring(ctx);
> +		if (!cqe)
> +			break;
> +		list_del(&drop->list);
> +		WRITE_ONCE(cqe->user_data, drop->user_data);
> +		WRITE_ONCE(cqe->res, drop->res);
> +		WRITE_ONCE(cqe->flags, 0);
> +		kfree(drop);
> +	}
> +
> +	io_commit_cqring(ctx);
> +	spin_unlock_irqrestore(&ctx->completion_lock, flags);
> +	io_cqring_ev_posted(ctx);
> +}
> +
> +static void io_cqring_overflow(struct io_ring_ctx *ctx, u64 ki_user_data,
> +			       long res)
> +	__must_hold(&ctx->completion_lock)
> +{
> +	struct cqe_drop *drop;
> +
> +	if (!(ctx->flags & IORING_SETUP_CQ_NODROP)) {
> +log_overflow:
> +		WRITE_ONCE(ctx->rings->cq_overflow,
> +				atomic_inc_return(&ctx->cached_cq_overflow));
> +		return;
> +	}
> +
> +	drop = kmalloc(sizeof(*drop), GFP_ATOMIC);
> +	if (!drop)
> +		goto log_overflow;
> +
> +	drop->user_data = ki_user_data;
> +	drop->res = res;
> +	list_add_tail(&drop->list, &ctx->cq_overflow_list);
> +}
> +
>  static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
>  				 long res)
>  {
> @@ -601,26 +674,15 @@ static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
>  	 * the ring.
>  	 */
>  	cqe = io_get_cqring(ctx);
> -	if (cqe) {
> +	if (likely(cqe)) {
>  		WRITE_ONCE(cqe->user_data, ki_user_data);
>  		WRITE_ONCE(cqe->res, res);
>  		WRITE_ONCE(cqe->flags, 0);
>  	} else {
> -		WRITE_ONCE(ctx->rings->cq_overflow,
> -				atomic_inc_return(&ctx->cached_cq_overflow));
> +		io_cqring_overflow(ctx, ki_user_data, res);
>  	}
>  }
>  
> -static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
> -{
> -	if (waitqueue_active(&ctx->wait))
> -		wake_up(&ctx->wait);
> -	if (waitqueue_active(&ctx->sqo_wait))
> -		wake_up(&ctx->sqo_wait);
> -	if (ctx->cq_ev_fd)
> -		eventfd_signal(ctx->cq_ev_fd, 1);
> -}
> -
>  static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data,
>  				long res)
>  {
> @@ -859,8 +921,13 @@ static void io_put_req(struct io_kiocb *req, struct io_kiocb **nxtptr)
>  	}
>  }
>  
> -static unsigned io_cqring_events(struct io_rings *rings)
> +static unsigned io_cqring_events(struct io_ring_ctx *ctx)
>  {
> +	struct io_rings *rings = ctx->rings;
> +
> +	if (ctx->flags & IORING_SETUP_CQ_NODROP)
> +		io_cqring_overflow_flush(ctx);
> +
>  	/* See comment at the top of this file */
>  	smp_rmb();
>  	return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
> @@ -1016,7 +1083,7 @@ static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
>  		 * If we do, we can potentially be spinning for commands that
>  		 * already triggered a CQE (eg in error).
>  		 */
> -		if (io_cqring_events(ctx->rings))
> +		if (io_cqring_events(ctx))
>  			break;
>  
>  		/*
> @@ -2873,6 +2940,10 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
>  	int i, submitted = 0;
>  	bool mm_fault = false;
>  
> +	if ((ctx->flags & IORING_SETUP_CQ_NODROP) &&
> +	    !list_empty(&ctx->cq_overflow_list))
> +		return -EBUSY;
> +
>  	if (nr > IO_PLUG_THRESHOLD) {
>  		io_submit_state_start(&state, ctx, nr);
>  		statep = &state;
> @@ -2952,6 +3023,7 @@ static int io_sq_thread(void *data)
>  	timeout = inflight = 0;
>  	while (!kthread_should_park()) {
>  		unsigned int to_submit;
> +		int ret;
>  
>  		if (inflight) {
>  			unsigned nr_events = 0;
> @@ -3036,8 +3108,10 @@ static int io_sq_thread(void *data)
>  		}
>  
>  		to_submit = min(to_submit, ctx->sq_entries);
> -		inflight += io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm,
> -					   true);
> +		ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
> +		if (ret < 0)
> +			continue;
> +		inflight += ret;
>  

After a rebase this could be simplified to:
if (ret >= 0)
	inflight += ret;


>  		/* Commit SQ ring head once we've consumed all SQEs */
>  		io_commit_sqring(ctx);
> @@ -3070,7 +3144,7 @@ static inline bool io_should_wake(struct io_wait_queue *iowq)
>  	 * started waiting. For timeouts, we always want to return to userspace,
>  	 * regardless of event count.
>  	 */
> -	return io_cqring_events(ctx->rings) >= iowq->to_wait ||
> +	return io_cqring_events(ctx) >= iowq->to_wait ||
>  			atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
>  }
>  
> @@ -3105,7 +3179,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
>  	struct io_rings *rings = ctx->rings;
>  	int ret = 0;
>  
> -	if (io_cqring_events(rings) >= min_events)
> +	if (io_cqring_events(ctx) >= min_events)
>  		return 0;
>  
>  	if (sig) {
> @@ -4406,7 +4480,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
>  	}
>  
>  	if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
> -			IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE))
> +			IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
> +			IORING_SETUP_CQ_NODROP))
>  		return -EINVAL;
>  
>  	ret = io_uring_create(entries, &p);
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index f1a118b01d18..3d8517eb376e 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -56,6 +56,7 @@ struct io_uring_sqe {
>  #define IORING_SETUP_SQPOLL	(1U << 1)	/* SQ poll thread */
>  #define IORING_SETUP_SQ_AFF	(1U << 2)	/* sq_thread_cpu is valid */
>  #define IORING_SETUP_CQSIZE	(1U << 3)	/* app defines CQ size */
> +#define IORING_SETUP_CQ_NODROP	(1U << 4)	/* no CQ drops */
>  
>  #define IORING_OP_NOP		0
>  #define IORING_OP_READV		1
> 

-- 
Pavel Begunkov




* Re: [RFC] io_uring CQ ring backpressure
  2019-11-06 19:12 ` Pavel Begunkov
@ 2019-11-06 19:43   ` Jens Axboe
  0 siblings, 0 replies; 9+ messages in thread
From: Jens Axboe @ 2019-11-06 19:43 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring, linux-block

On 11/6/19 12:12 PM, Pavel Begunkov wrote:
> On 06/11/2019 19:21, Jens Axboe wrote:
>> Currently we drop completion events, if the CQ ring is full. That's fine
>> for requests with bounded completion times, but it may make it harder to
>> use io_uring with networked IO where request completion times are
>> generally unbounded. Or with POLL, for example, which is also unbounded.
>>
>> This patch adds IORING_SETUP_CQ_NODROP, which changes the behavior a bit
>> for CQ ring overflows. First of all, it doesn't overflow the ring, it
>> simply stores backlog of completions that we weren't able to put into
>> the CQ ring. To prevent the backlog from growing indefinitely, if the
>> backlog is non-empty, we apply back pressure on IO submissions. Any
>> attempt to submit new IO with a non-empty backlog will get an -EBUSY
>> return from the kernel.
>>
>> I think that makes for a pretty sane API in terms of how the application
>> can handle it. With CQ_NODROP enabled, we'll never drop a completion
>> event (well unless we're totally out of memory...), but we'll also not
>> allow submissions with a completion backlog.
>>
>> ---
>>
>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>> index b647cdf0312c..12e9fe2479f4 100644
>> --- a/fs/io_uring.c
>> +++ b/fs/io_uring.c
>> @@ -207,6 +207,7 @@ struct io_ring_ctx {
>>   
>>   		struct list_head	defer_list;
>>   		struct list_head	timeout_list;
>> +		struct list_head	cq_overflow_list;
>>   
>>   		wait_queue_head_t	inflight_wait;
>>   	} ____cacheline_aligned_in_smp;
>> @@ -414,6 +415,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
>>   
>>   	ctx->flags = p->flags;
>>   	init_waitqueue_head(&ctx->cq_wait);
>> +	INIT_LIST_HEAD(&ctx->cq_overflow_list);
>>   	init_completion(&ctx->ctx_done);
>>   	init_completion(&ctx->sqo_thread_started);
>>   	mutex_init(&ctx->uring_lock);
>> @@ -588,6 +590,77 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
>>   	return &rings->cqes[tail & ctx->cq_mask];
>>   }
>>   
>> +static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
>> +{
>> +	if (waitqueue_active(&ctx->wait))
>> +		wake_up(&ctx->wait);
>> +	if (waitqueue_active(&ctx->sqo_wait))
>> +		wake_up(&ctx->sqo_wait);
>> +	if (ctx->cq_ev_fd)
>> +		eventfd_signal(ctx->cq_ev_fd, 1);
>> +}
>> +
>> +struct cqe_drop {
>> +	struct list_head list;
>> +	u64 user_data;
>> +	s32 res;
>> +};
> 
> How about to use io_kiocb instead of new structure?
> It already has valid req->user_data and occasionaly used
> req->result. But this would probably take more work to do.

I did consider that; we could use both those fields. But we can't just
take ownership of the io_kiocb at this point, so it's much easier (and
less error prone) to just stuff in this new structure. The downside is
of course that we could still have an overflow, but if an atomic
allocation of this size fails, then the system is hosed anyway.

This is also a somewhat slower path. We don't really expect to have
constant overflows, but we need to be able to handle them. Otherwise CQ
ring sizing must be worst case, and maybe that isn't even enough. This
allows use cases that we could not support before.
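
To make the sizing point concrete, a sketch (not from the patch) of what
worst-case sizing looks like today with IORING_SETUP_CQSIZE. MAX_INFLIGHT is
whatever bound the application can promise; with unbounded POLL or network
completions there often isn't a safe value at all, which is the gap this is
trying to close.

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

#define SQ_ENTRIES	128
#define MAX_INFLIGHT	4096	/* illustrative, app-chosen worst case */

static int setup_worst_case_ring(struct io_uring_params *p)
{
	memset(p, 0, sizeof(*p));
	p->flags = IORING_SETUP_CQSIZE;
	p->cq_entries = MAX_INFLIGHT;	/* CQ ring sized for the worst case */

	/* returns the ring fd, or -1 with errno set */
	return syscall(__NR_io_uring_setup, SQ_ENTRIES, p);
}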

>> @@ -3036,8 +3108,10 @@ static int io_sq_thread(void *data)
>>   		}
>>   
>>   		to_submit = min(to_submit, ctx->sq_entries);
>> -		inflight += io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm,
>> -					   true);
>> +		ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
>> +		if (ret < 0)
>> +			continue;
>> +		inflight += ret;
>>   
> 
> After rebase could be simplified to
> if (ret >= 0)
> 	inflight += ret;

Heh, that's exactly what I did:

https://git.kernel.dk/cgit/linux-block/commit/?h=for-5.5/io_uring-test&id=f491ca28c78b7ff2dcd288847a212bb49f256500

Thanks for the review!

-- 
Jens Axboe



* Re: [RFC] io_uring CQ ring backpressure
  2019-11-06 16:21 [RFC] io_uring CQ ring backpressure Jens Axboe
  2019-11-06 19:12 ` Pavel Begunkov
@ 2019-11-06 19:51 ` Jann Horn
  2019-11-06 20:08   ` Jens Axboe
  1 sibling, 1 reply; 9+ messages in thread
From: Jann Horn @ 2019-11-06 19:51 UTC (permalink / raw)
  To: Jens Axboe; +Cc: io-uring, linux-block

On Wed, Nov 6, 2019 at 5:23 PM Jens Axboe <axboe@kernel.dk> wrote:
> Currently we drop completion events, if the CQ ring is full. That's fine
> for requests with bounded completion times, but it may make it harder to
> use io_uring with networked IO where request completion times are
> generally unbounded. Or with POLL, for example, which is also unbounded.
>
> This patch adds IORING_SETUP_CQ_NODROP, which changes the behavior a bit
> for CQ ring overflows. First of all, it doesn't overflow the ring, it
> simply stores backlog of completions that we weren't able to put into
> the CQ ring. To prevent the backlog from growing indefinitely, if the
> backlog is non-empty, we apply back pressure on IO submissions. Any
> attempt to submit new IO with a non-empty backlog will get an -EBUSY
> return from the kernel.
>
> I think that makes for a pretty sane API in terms of how the application
> can handle it. With CQ_NODROP enabled, we'll never drop a completion
> event (well unless we're totally out of memory...), but we'll also not
> allow submissions with a completion backlog.
[...]
> +static void io_cqring_overflow(struct io_ring_ctx *ctx, u64 ki_user_data,
> +                              long res)
> +       __must_hold(&ctx->completion_lock)
> +{
> +       struct cqe_drop *drop;
> +
> +       if (!(ctx->flags & IORING_SETUP_CQ_NODROP)) {
> +log_overflow:
> +               WRITE_ONCE(ctx->rings->cq_overflow,
> +                               atomic_inc_return(&ctx->cached_cq_overflow));
> +               return;
> +       }
> +
> +       drop = kmalloc(sizeof(*drop), GFP_ATOMIC);
> +       if (!drop)
> +               goto log_overflow;
> +
> +       drop->user_data = ki_user_data;
> +       drop->res = res;
> +       list_add_tail(&drop->list, &ctx->cq_overflow_list);
> +}

This could potentially consume moderately large amounts of atomic
memory quickly and without any guarantee that the memory will be freed
anytime soon, right? That seems moderately bad. Is there no way to
e.g. pre-reserve memory for completion events, or something like that?


* Re: [RFC] io_uring CQ ring backpressure
  2019-11-06 19:51 ` Jann Horn
@ 2019-11-06 20:08   ` Jens Axboe
  2019-11-06 21:31     ` Jens Axboe
  0 siblings, 1 reply; 9+ messages in thread
From: Jens Axboe @ 2019-11-06 20:08 UTC (permalink / raw)
  To: Jann Horn; +Cc: io-uring, linux-block

On 11/6/19 12:51 PM, Jann Horn wrote:
> On Wed, Nov 6, 2019 at 5:23 PM Jens Axboe <axboe@kernel.dk> wrote:
>> Currently we drop completion events, if the CQ ring is full. That's fine
>> for requests with bounded completion times, but it may make it harder to
>> use io_uring with networked IO where request completion times are
>> generally unbounded. Or with POLL, for example, which is also unbounded.
>>
>> This patch adds IORING_SETUP_CQ_NODROP, which changes the behavior a bit
>> for CQ ring overflows. First of all, it doesn't overflow the ring, it
>> simply stores backlog of completions that we weren't able to put into
>> the CQ ring. To prevent the backlog from growing indefinitely, if the
>> backlog is non-empty, we apply back pressure on IO submissions. Any
>> attempt to submit new IO with a non-empty backlog will get an -EBUSY
>> return from the kernel.
>>
>> I think that makes for a pretty sane API in terms of how the application
>> can handle it. With CQ_NODROP enabled, we'll never drop a completion
>> event (well unless we're totally out of memory...), but we'll also not
>> allow submissions with a completion backlog.
> [...]
>> +static void io_cqring_overflow(struct io_ring_ctx *ctx, u64 ki_user_data,
>> +                              long res)
>> +       __must_hold(&ctx->completion_lock)
>> +{
>> +       struct cqe_drop *drop;
>> +
>> +       if (!(ctx->flags & IORING_SETUP_CQ_NODROP)) {
>> +log_overflow:
>> +               WRITE_ONCE(ctx->rings->cq_overflow,
>> +                               atomic_inc_return(&ctx->cached_cq_overflow));
>> +               return;
>> +       }
>> +
>> +       drop = kmalloc(sizeof(*drop), GFP_ATOMIC);
>> +       if (!drop)
>> +               goto log_overflow;
>> +
>> +       drop->user_data = ki_user_data;
>> +       drop->res = res;
>> +       list_add_tail(&drop->list, &ctx->cq_overflow_list);
>> +}
> 
> This could potentially consume moderately large amounts of atomic
> memory quickly and without any guarantee that the memory will be freed
> anytime soon, right? That seems moderately bad. Is there no way to
> e.g. pre-reserve memory for completion events, or something like that?

As soon as there's even one entry in that backlog, the ring won't accept
any more new IO. So I don't think it's a huge concern. If we pre-reserve,
we haven't really made much progress in making sure we don't drop events,
and we'll be tying up that memory all the time.
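
Back-of-the-envelope, assuming a 64-bit kernel: struct cqe_drop is a 16-byte
list_head plus 8 bytes of user_data and 4 bytes of res, padded to 32 bytes,
so even 64k backlogged completions only pin about 2MB. And since submission
is refused as soon as the first entry goes on the list, the backlog can't
grow past what was already in flight when the CQ ring filled up.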

The alternative, as Pavel also mentioned, is to re-use the io_kiocb
for this. But that'll tie up more memory, and it's a bit tricky with
the life times. Just because the request has completed doesn't mean
that someone isn't still holding a reference to it, and who knows
what they will do.

-- 
Jens Axboe



* Re: [RFC] io_uring CQ ring backpressure
  2019-11-06 20:08   ` Jens Axboe
@ 2019-11-06 21:31     ` Jens Axboe
  2019-11-06 21:54       ` Pavel Begunkov
  2019-11-06 22:42       ` Jens Axboe
  0 siblings, 2 replies; 9+ messages in thread
From: Jens Axboe @ 2019-11-06 21:31 UTC (permalink / raw)
  To: Jann Horn; +Cc: io-uring, linux-block, Pavel Begunkov (Silence)

On 11/6/19 1:08 PM, Jens Axboe wrote:
> On 11/6/19 12:51 PM, Jann Horn wrote:
>> On Wed, Nov 6, 2019 at 5:23 PM Jens Axboe <axboe@kernel.dk> wrote:
>>> Currently we drop completion events, if the CQ ring is full. That's fine
>>> for requests with bounded completion times, but it may make it harder to
>>> use io_uring with networked IO where request completion times are
>>> generally unbounded. Or with POLL, for example, which is also unbounded.
>>>
>>> This patch adds IORING_SETUP_CQ_NODROP, which changes the behavior a bit
>>> for CQ ring overflows. First of all, it doesn't overflow the ring, it
>>> simply stores backlog of completions that we weren't able to put into
>>> the CQ ring. To prevent the backlog from growing indefinitely, if the
>>> backlog is non-empty, we apply back pressure on IO submissions. Any
>>> attempt to submit new IO with a non-empty backlog will get an -EBUSY
>>> return from the kernel.
>>>
>>> I think that makes for a pretty sane API in terms of how the application
>>> can handle it. With CQ_NODROP enabled, we'll never drop a completion
>>> event (well unless we're totally out of memory...), but we'll also not
>>> allow submissions with a completion backlog.
>> [...]
>>> +static void io_cqring_overflow(struct io_ring_ctx *ctx, u64 ki_user_data,
>>> +                              long res)
>>> +       __must_hold(&ctx->completion_lock)
>>> +{
>>> +       struct cqe_drop *drop;
>>> +
>>> +       if (!(ctx->flags & IORING_SETUP_CQ_NODROP)) {
>>> +log_overflow:
>>> +               WRITE_ONCE(ctx->rings->cq_overflow,
>>> +                               atomic_inc_return(&ctx->cached_cq_overflow));
>>> +               return;
>>> +       }
>>> +
>>> +       drop = kmalloc(sizeof(*drop), GFP_ATOMIC);
>>> +       if (!drop)
>>> +               goto log_overflow;
>>> +
>>> +       drop->user_data = ki_user_data;
>>> +       drop->res = res;
>>> +       list_add_tail(&drop->list, &ctx->cq_overflow_list);
>>> +}
>>
>> This could potentially consume moderately large amounts of atomic
>> memory quickly and without any guarantee that the memory will be freed
>> anytime soon, right? That seems moderately bad. Is there no way to
>> e.g. pre-reserve memory for completion events, or something like that?
> 
> As soon as there's even one entry in that backlog, the ring won't accept
> anymore new IO. So I don't think it's a huge concern. If we pre-reserve,
> we haven't really made much progress in making sure we don't drop events,
> and we'll be tying up that memory all the time.
> 
> The alternative, as Pavel also mentioned, is to re-use the io_kiocb
> for this. But that'll tie up more memory, and it's a bit tricky with
> the life times. Just because the request has completed doesn't mean
> that someone isn't still holding a reference to it, and who knows
> what they will do.

OK, I took a stab at it, here's a brain dump of the "complications"

1) Some places now use __io_free_req() to drop both references, if we
   know we haven't issued a request yet. Needs double drop, not a big
   deal.
2) Some ordering changes between io_put_req() and the fill/add event
   logic. Again not a huge deal, easy to spot.
3) We have one failure case that does not have a request, exactly because
   we failed to allocate one. Don't look at that part in the below patch,
   I think what we should do here is just reserve a request for that case.
   It won't help with the submission, but it'll get it logged correctly
   for the overflow backlog. Any new submission can't proceed with that
   request in the overflow backlog anyway, so we need just the one.
   Not super pretty, but at least we can keep this out of the fast path,
   as the only one that will free this request is the overflow flush
   path.

I'll do a prep patch that makes the fill/add event path deal in requests,
then we can build the backpressure on top.

-- 
Jens Axboe



* Re: [RFC] io_uring CQ ring backpressure
  2019-11-06 21:31     ` Jens Axboe
@ 2019-11-06 21:54       ` Pavel Begunkov
  2019-11-06 21:56         ` Jens Axboe
  2019-11-06 22:42       ` Jens Axboe
  1 sibling, 1 reply; 9+ messages in thread
From: Pavel Begunkov @ 2019-11-06 21:54 UTC (permalink / raw)
  To: Jens Axboe, Jann Horn; +Cc: io-uring, linux-block



On 07/11/2019 00:31, Jens Axboe wrote:
> On 11/6/19 1:08 PM, Jens Axboe wrote:
>> On 11/6/19 12:51 PM, Jann Horn wrote:
>>> On Wed, Nov 6, 2019 at 5:23 PM Jens Axboe <axboe@kernel.dk> wrote:
>>>> Currently we drop completion events, if the CQ ring is full. That's fine
>>>> for requests with bounded completion times, but it may make it harder to
>>>> use io_uring with networked IO where request completion times are
>>>> generally unbounded. Or with POLL, for example, which is also unbounded.
>>>>
>>>> This patch adds IORING_SETUP_CQ_NODROP, which changes the behavior a bit
>>>> for CQ ring overflows. First of all, it doesn't overflow the ring, it
>>>> simply stores backlog of completions that we weren't able to put into
>>>> the CQ ring. To prevent the backlog from growing indefinitely, if the
>>>> backlog is non-empty, we apply back pressure on IO submissions. Any
>>>> attempt to submit new IO with a non-empty backlog will get an -EBUSY
>>>> return from the kernel.
>>>>
>>>> I think that makes for a pretty sane API in terms of how the application
>>>> can handle it. With CQ_NODROP enabled, we'll never drop a completion
>>>> event (well unless we're totally out of memory...), but we'll also not
>>>> allow submissions with a completion backlog.
>>> [...]
>>>> +static void io_cqring_overflow(struct io_ring_ctx *ctx, u64 ki_user_data,
>>>> +                              long res)
>>>> +       __must_hold(&ctx->completion_lock)
>>>> +{
>>>> +       struct cqe_drop *drop;
>>>> +
>>>> +       if (!(ctx->flags & IORING_SETUP_CQ_NODROP)) {
>>>> +log_overflow:
>>>> +               WRITE_ONCE(ctx->rings->cq_overflow,
>>>> +                               atomic_inc_return(&ctx->cached_cq_overflow));
>>>> +               return;
>>>> +       }
>>>> +
>>>> +       drop = kmalloc(sizeof(*drop), GFP_ATOMIC);
>>>> +       if (!drop)
>>>> +               goto log_overflow;
>>>> +
>>>> +       drop->user_data = ki_user_data;
>>>> +       drop->res = res;
>>>> +       list_add_tail(&drop->list, &ctx->cq_overflow_list);
>>>> +}
>>>
>>> This could potentially consume moderately large amounts of atomic
>>> memory quickly and without any guarantee that the memory will be freed
>>> anytime soon, right? That seems moderately bad. Is there no way to
>>> e.g. pre-reserve memory for completion events, or something like that?
>>
>> As soon as there's even one entry in that backlog, the ring won't accept
>> anymore new IO. So I don't think it's a huge concern. If we pre-reserve,
>> we haven't really made much progress in making sure we don't drop events,
>> and we'll be tying up that memory all the time.
>>
>> The alternative, as Pavel also mentioned, is to re-use the io_kiocb
>> for this. But that'll tie up more memory, and it's a bit tricky with
>> the life times. Just because the request has completed doesn't mean
>> that someone isn't still holding a reference to it, and who knows
>> what they will do.
> 
> OK, I took a stab at it, here's a brain dump of the "complications"
> 
> 1) Some places now use __io_free_req() to drop both references, if we
>    know we haven't issued a request yet. Needs double drop, not a big
>    deal.
> 2) Some ordering changes between io_put_req() and the fill/add event
>    logic. Again not a huge deal, easy to spot.
> 3) We have one failure case that does not have a request, exactly because
>    we failed to allocate one. Don't look at that part in the below patch,
>    I think what we should do here is just reserve a request for that case.
>    It won't help with the submission, but it'll get it logged correctly
>    for the overflow backlog. Any new submission can't proceed with that
>    request in the overflow backlog anyway, so we need just the one.
>    Not super pretty, but at least we can keep this out of the fast path,
>    as the only one that will free this request is the overflow flush
>    path.
> 

2 (maybe partially) and 3 will hopefully be solved by the patchset that
removes passing sqe_submit around. I'll resend it in a minute.

> I'll do a prep patch that makes the fill/add event path deal in requests,
> then we can build the backpressure on top.
> 

-- 
Pavel Begunkov




* Re: [RFC] io_uring CQ ring backpressure
  2019-11-06 21:54       ` Pavel Begunkov
@ 2019-11-06 21:56         ` Jens Axboe
  0 siblings, 0 replies; 9+ messages in thread
From: Jens Axboe @ 2019-11-06 21:56 UTC (permalink / raw)
  To: Pavel Begunkov, Jann Horn; +Cc: io-uring, linux-block

On 11/6/19 2:54 PM, Pavel Begunkov wrote:
> On 07/11/2019 00:31, Jens Axboe wrote:
>> On 11/6/19 1:08 PM, Jens Axboe wrote:
>>> On 11/6/19 12:51 PM, Jann Horn wrote:
>>>> On Wed, Nov 6, 2019 at 5:23 PM Jens Axboe <axboe@kernel.dk> wrote:
>>>>> Currently we drop completion events, if the CQ ring is full. That's fine
>>>>> for requests with bounded completion times, but it may make it harder to
>>>>> use io_uring with networked IO where request completion times are
>>>>> generally unbounded. Or with POLL, for example, which is also unbounded.
>>>>>
>>>>> This patch adds IORING_SETUP_CQ_NODROP, which changes the behavior a bit
>>>>> for CQ ring overflows. First of all, it doesn't overflow the ring, it
>>>>> simply stores backlog of completions that we weren't able to put into
>>>>> the CQ ring. To prevent the backlog from growing indefinitely, if the
>>>>> backlog is non-empty, we apply back pressure on IO submissions. Any
>>>>> attempt to submit new IO with a non-empty backlog will get an -EBUSY
>>>>> return from the kernel.
>>>>>
>>>>> I think that makes for a pretty sane API in terms of how the application
>>>>> can handle it. With CQ_NODROP enabled, we'll never drop a completion
>>>>> event (well unless we're totally out of memory...), but we'll also not
>>>>> allow submissions with a completion backlog.
>>>> [...]
>>>>> +static void io_cqring_overflow(struct io_ring_ctx *ctx, u64 ki_user_data,
>>>>> +                              long res)
>>>>> +       __must_hold(&ctx->completion_lock)
>>>>> +{
>>>>> +       struct cqe_drop *drop;
>>>>> +
>>>>> +       if (!(ctx->flags & IORING_SETUP_CQ_NODROP)) {
>>>>> +log_overflow:
>>>>> +               WRITE_ONCE(ctx->rings->cq_overflow,
>>>>> +                               atomic_inc_return(&ctx->cached_cq_overflow));
>>>>> +               return;
>>>>> +       }
>>>>> +
>>>>> +       drop = kmalloc(sizeof(*drop), GFP_ATOMIC);
>>>>> +       if (!drop)
>>>>> +               goto log_overflow;
>>>>> +
>>>>> +       drop->user_data = ki_user_data;
>>>>> +       drop->res = res;
>>>>> +       list_add_tail(&drop->list, &ctx->cq_overflow_list);
>>>>> +}
>>>>
>>>> This could potentially consume moderately large amounts of atomic
>>>> memory quickly and without any guarantee that the memory will be freed
>>>> anytime soon, right? That seems moderately bad. Is there no way to
>>>> e.g. pre-reserve memory for completion events, or something like that?
>>>
>>> As soon as there's even one entry in that backlog, the ring won't accept
>>> anymore new IO. So I don't think it's a huge concern. If we pre-reserve,
>>> we haven't really made much progress in making sure we don't drop events,
>>> and we'll be tying up that memory all the time.
>>>
>>> The alternative, as Pavel also mentioned, is to re-use the io_kiocb
>>> for this. But that'll tie up more memory, and it's a bit tricky with
>>> the life times. Just because the request has completed doesn't mean
>>> that someone isn't still holding a reference to it, and who knows
>>> what they will do.
>>
>> OK, I took a stab at it, here's a brain dump of the "complications"
>>
>> 1) Some places now use __io_free_req() to drop both references, if we
>>     know we haven't issued a request yet. Needs double drop, not a big
>>     deal.
>> 2) Some ordering changes between io_put_req() and the fill/add event
>>     logic. Again not a huge deal, easy to spot.
>> 3) We have one failure case that does not have a request, exactly because
>>     we failed to allocate one. Don't look at that part in the below patch,
>>     I think what we should do here is just reserve a request for that case.
>>     It won't help with the submission, but it'll get it logged correctly
>>     for the overflow backlog. Any new submission can't proceed with that
>>     request in the overflow backlog anyway, so we need just the one.
>>     Not super pretty, but at least we can keep this out of the fast path,
>>     as the only one that will free this request is the overflow flush
>>     path.
>>
> 
> 2 (maybe partially) and 3 will hopefully be solved by the patchset
> removing passing sqe_submit. I'll resend it in a minute.

Please do, it'll definitely make a few things easier. Then I'll base the
cleanup/prep patch on top of that, and then the backpressure patch.

-- 
Jens Axboe



* Re: [RFC] io_uring CQ ring backpressure
  2019-11-06 21:31     ` Jens Axboe
  2019-11-06 21:54       ` Pavel Begunkov
@ 2019-11-06 22:42       ` Jens Axboe
  1 sibling, 0 replies; 9+ messages in thread
From: Jens Axboe @ 2019-11-06 22:42 UTC (permalink / raw)
  To: Jann Horn; +Cc: io-uring, linux-block, Pavel Begunkov (Silence)

On 11/6/19 2:31 PM, Jens Axboe wrote:
> On 11/6/19 1:08 PM, Jens Axboe wrote:
>> On 11/6/19 12:51 PM, Jann Horn wrote:
>>> On Wed, Nov 6, 2019 at 5:23 PM Jens Axboe <axboe@kernel.dk> wrote:
>>>> Currently we drop completion events, if the CQ ring is full. That's fine
>>>> for requests with bounded completion times, but it may make it harder to
>>>> use io_uring with networked IO where request completion times are
>>>> generally unbounded. Or with POLL, for example, which is also unbounded.
>>>>
>>>> This patch adds IORING_SETUP_CQ_NODROP, which changes the behavior a bit
>>>> for CQ ring overflows. First of all, it doesn't overflow the ring, it
>>>> simply stores backlog of completions that we weren't able to put into
>>>> the CQ ring. To prevent the backlog from growing indefinitely, if the
>>>> backlog is non-empty, we apply back pressure on IO submissions. Any
>>>> attempt to submit new IO with a non-empty backlog will get an -EBUSY
>>>> return from the kernel.
>>>>
>>>> I think that makes for a pretty sane API in terms of how the application
>>>> can handle it. With CQ_NODROP enabled, we'll never drop a completion
>>>> event (well unless we're totally out of memory...), but we'll also not
>>>> allow submissions with a completion backlog.
>>> [...]
>>>> +static void io_cqring_overflow(struct io_ring_ctx *ctx, u64 ki_user_data,
>>>> +                              long res)
>>>> +       __must_hold(&ctx->completion_lock)
>>>> +{
>>>> +       struct cqe_drop *drop;
>>>> +
>>>> +       if (!(ctx->flags & IORING_SETUP_CQ_NODROP)) {
>>>> +log_overflow:
>>>> +               WRITE_ONCE(ctx->rings->cq_overflow,
>>>> +                               atomic_inc_return(&ctx->cached_cq_overflow));
>>>> +               return;
>>>> +       }
>>>> +
>>>> +       drop = kmalloc(sizeof(*drop), GFP_ATOMIC);
>>>> +       if (!drop)
>>>> +               goto log_overflow;
>>>> +
>>>> +       drop->user_data = ki_user_data;
>>>> +       drop->res = res;
>>>> +       list_add_tail(&drop->list, &ctx->cq_overflow_list);
>>>> +}
>>>
>>> This could potentially consume moderately large amounts of atomic
>>> memory quickly and without any guarantee that the memory will be freed
>>> anytime soon, right? That seems moderately bad. Is there no way to
>>> e.g. pre-reserve memory for completion events, or something like that?
>>
>> As soon as there's even one entry in that backlog, the ring won't accept
>> anymore new IO. So I don't think it's a huge concern. If we pre-reserve,
>> we haven't really made much progress in making sure we don't drop events,
>> and we'll be tying up that memory all the time.
>>
>> The alternative, as Pavel also mentioned, is to re-use the io_kiocb
>> for this. But that'll tie up more memory, and it's a bit tricky with
>> the life times. Just because the request has completed doesn't mean
>> that someone isn't still holding a reference to it, and who knows
>> what they will do.
> 
> OK, I took a stab at it, here's a brain dump of the "complications"
> 
> 1) Some places now use __io_free_req() to drop both references, if we
>     know we haven't issued a request yet. Needs double drop, not a big
>     deal.
> 2) Some ordering changes between io_put_req() and the fill/add event
>     logic. Again not a huge deal, easy to spot.
> 3) We have one failure case that does not have a request, exactly because
>     we failed to allocate one. Don't look at that part in the below patch,
>     I think what we should do here is just reserve a request for that case.
>     It won't help with the submission, but it'll get it logged correctly
>     for the overflow backlog. Any new submission can't proceed with that
>     request in the overflow backlog anyway, so we need just the one.
>     Not super pretty, but at least we can keep this out of the fast path,
>     as the only one that will free this request is the overflow flush
>     path.
> 
> I'll do a prep patch that makes the fill/add event path deal in requests,
> then we can build the backpressure on top.

This is on top of Pavel's cleanup, using io_kiocb instead. It turned out
much nicer that way. Totally untested, will go through the motions with all
of it now.

commit 38d95aad728b139160359ff519ec4b2262c90121
Author: Jens Axboe <axboe@kernel.dk>
Date:   Wed Nov 6 11:31:17 2019 -0700

    io_uring: add support for backlogged CQ ring
    
    Currently we drop completion events, if the CQ ring is full. That's fine
    for requests with bounded completion times, but it may make it harder to
    use io_uring with networked IO where request completion times are
    generally unbounded. Or with POLL, for example, which is also unbounded.
    
    This patch adds IORING_SETUP_CQ_NODROP, which changes the behavior a bit
    for CQ ring overflows. First of all, it doesn't overflow the ring, it
    simply stores a backlog of completions that we weren't able to put into
    the CQ ring. To prevent the backlog from growing indefinitely, if the
    backlog is non-empty, we apply back pressure on IO submissions. Any
    attempt to submit new IO with a non-empty backlog will get an -EBUSY
    return from the kernel. This is a signal to the application that it has
    backlogged CQ events, and that it must reap those before being allowed
    to submit more IO.
    
    Signed-off-by: Jens Axboe <axboe@kernel.dk>

diff --git a/fs/io_uring.c b/fs/io_uring.c
index fb621a564dcf..22373b7b3db0 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -207,6 +207,7 @@ struct io_ring_ctx {
 
 		struct list_head	defer_list;
 		struct list_head	timeout_list;
+		struct list_head	cq_overflow_list;
 
 		wait_queue_head_t	inflight_wait;
 	} ____cacheline_aligned_in_smp;
@@ -413,6 +414,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 
 	ctx->flags = p->flags;
 	init_waitqueue_head(&ctx->cq_wait);
+	INIT_LIST_HEAD(&ctx->cq_overflow_list);
 	init_completion(&ctx->ctx_done);
 	init_completion(&ctx->sqo_thread_started);
 	mutex_init(&ctx->uring_lock);
@@ -587,6 +589,72 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
 	return &rings->cqes[tail & ctx->cq_mask];
 }
 
+static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+{
+	if (waitqueue_active(&ctx->wait))
+		wake_up(&ctx->wait);
+	if (waitqueue_active(&ctx->sqo_wait))
+		wake_up(&ctx->sqo_wait);
+	if (ctx->cq_ev_fd)
+		eventfd_signal(ctx->cq_ev_fd, 1);
+}
+
+static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
+{
+	struct io_rings *rings = ctx->rings;
+	struct io_uring_cqe *cqe;
+	struct io_kiocb *req;
+	unsigned long flags;
+	LIST_HEAD(list);
+
+	if (list_empty_careful(&ctx->cq_overflow_list))
+		return;
+	if (ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
+	    rings->cq_ring_entries)
+		return;
+
+	spin_lock_irqsave(&ctx->completion_lock, flags);
+
+	while (!list_empty(&ctx->cq_overflow_list)) {
+		cqe = io_get_cqring(ctx);
+		if (!cqe && !force)
+			break;
+
+		req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb,
+						list);
+		list_move(&req->list, &list);
+		if (cqe) {
+			WRITE_ONCE(cqe->user_data, req->user_data);
+			WRITE_ONCE(cqe->res, req->result);
+			WRITE_ONCE(cqe->flags, 0);
+		}
+	}
+
+	io_commit_cqring(ctx);
+	spin_unlock_irqrestore(&ctx->completion_lock, flags);
+	io_cqring_ev_posted(ctx);
+
+	while (!list_empty(&list)) {
+		req = list_first_entry(&list, struct io_kiocb, list);
+		list_del(&req->list);
+		io_put_req(req, NULL);
+	}
+}
+
+static void io_cqring_overflow(struct io_ring_ctx *ctx, struct io_kiocb *req,
+			       long res)
+	__must_hold(&ctx->completion_lock)
+{
+	if (!(ctx->flags & IORING_SETUP_CQ_NODROP)) {
+		WRITE_ONCE(ctx->rings->cq_overflow,
+				atomic_inc_return(&ctx->cached_cq_overflow));
+	} else {
+		refcount_inc(&req->refs);
+		req->result = res;
+		list_add_tail(&req->list, &ctx->cq_overflow_list);
+	}
+}
+
 static void io_cqring_fill_event(struct io_kiocb *req, long res)
 {
 	struct io_ring_ctx *ctx = req->ctx;
@@ -600,26 +668,15 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res)
 	 * the ring.
 	 */
 	cqe = io_get_cqring(ctx);
-	if (cqe) {
+	if (likely(cqe)) {
 		WRITE_ONCE(cqe->user_data, req->user_data);
 		WRITE_ONCE(cqe->res, res);
 		WRITE_ONCE(cqe->flags, 0);
 	} else {
-		WRITE_ONCE(ctx->rings->cq_overflow,
-				atomic_inc_return(&ctx->cached_cq_overflow));
+		io_cqring_overflow(ctx, req, res);
 	}
 }
 
-static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
-{
-	if (waitqueue_active(&ctx->wait))
-		wake_up(&ctx->wait);
-	if (waitqueue_active(&ctx->sqo_wait))
-		wake_up(&ctx->sqo_wait);
-	if (ctx->cq_ev_fd)
-		eventfd_signal(ctx->cq_ev_fd, 1);
-}
-
 static void io_cqring_add_event(struct io_kiocb *req, long res)
 {
 	struct io_ring_ctx *ctx = req->ctx;
@@ -864,6 +921,9 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx)
 {
 	struct io_rings *rings = ctx->rings;
 
+	if (ctx->flags & IORING_SETUP_CQ_NODROP)
+		io_cqring_overflow_flush(ctx, false);
+
 	/* See comment at the top of this file */
 	smp_rmb();
 	return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
@@ -2863,6 +2923,10 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 	int i, submitted = 0;
 	bool mm_fault = false;
 
+	if ((ctx->flags & IORING_SETUP_CQ_NODROP) &&
+	    !list_empty(&ctx->cq_overflow_list))
+		return -EBUSY;
+
 	if (nr > IO_PLUG_THRESHOLD) {
 		io_submit_state_start(&state, ctx, nr);
 		statep = &state;
@@ -2951,6 +3015,7 @@ static int io_sq_thread(void *data)
 	timeout = inflight = 0;
 	while (!kthread_should_park()) {
 		unsigned int to_submit;
+		int ret;
 
 		if (inflight) {
 			unsigned nr_events = 0;
@@ -3035,8 +3100,9 @@ static int io_sq_thread(void *data)
 		}
 
 		to_submit = min(to_submit, ctx->sq_entries);
-		inflight += io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm,
-					   true);
+		ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
+		if (ret > 0)
+			inflight += ret;
 	}
 
 	set_fs(old_fs);
@@ -4100,8 +4166,10 @@ static int io_uring_flush(struct file *file, void *data)
 	struct io_ring_ctx *ctx = file->private_data;
 
 	io_uring_cancel_files(ctx, data);
-	if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
+	if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) {
+		io_cqring_overflow_flush(ctx, true);
 		io_wq_cancel_all(ctx->io_wq);
+	}
 	return 0;
 }
 
@@ -4402,7 +4470,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
 	}
 
 	if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
-			IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE))
+			IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
+			IORING_SETUP_CQ_NODROP))
 		return -EINVAL;
 
 	ret = io_uring_create(entries, &p);
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index f1a118b01d18..3d8517eb376e 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -56,6 +56,7 @@ struct io_uring_sqe {
 #define IORING_SETUP_SQPOLL	(1U << 1)	/* SQ poll thread */
 #define IORING_SETUP_SQ_AFF	(1U << 2)	/* sq_thread_cpu is valid */
 #define IORING_SETUP_CQSIZE	(1U << 3)	/* app defines CQ size */
+#define IORING_SETUP_CQ_NODROP	(1U << 4)	/* no CQ drops */
 
 #define IORING_OP_NOP		0
 #define IORING_OP_READV		1

-- 
Jens Axboe


