Re: [PATCH RFC] io_uring: limit inflight IO

From: Pavel Begunkov <asml.silence@gmail.com>
To: Jens Axboe <axboe@kernel.dk>, io-uring@vger.kernel.org
Subject: Re: [PATCH RFC] io_uring: limit inflight IO
Date: Fri, 8 Nov 2019 12:56:49 +0300	[thread overview]
Message-ID: <273391a6-1def-3c29-986c-8efca569fc16@gmail.com> (raw)
In-Reply-To: <33958a03-6714-ce51-9856-1dcbf53f67d5@kernel.dk>

[-- Attachment #1.1: Type: text/plain, Size: 5362 bytes --]

On 08/11/2019 03:19, Jens Axboe wrote:
> On 11/7/19 4:21 PM, Jens Axboe wrote:
>> I'd like some feedback on this one. Even tith the overflow backpressure
>> patch, we still have a potentially large gap where applications can
>> submit IO before we get any dropped events in the CQ ring. This is
>> especially true if the execution time of those requests are long
>> (unbounded).
>>
>> This adds IORING_SETUP_INFLIGHT, which if set, will return -EBUSY if we
>> have more IO pending than we can feasibly support. This is normally the
>> CQ ring size, but of IORING_SETUP_CQ_NODROP is enabled, then it's twice
>> the CQ ring size.
>>
>> This helps manage the pending queue size instead of letting it grow
>> indefinitely.
>>
>> Note that we could potentially just make this the default behavior -
>> applications need to handle -EAGAIN returns already, in case we run out
>> of memory, and if we change this to return -EAGAIN as well, then it
>> doesn't introduce any new failure cases. I'm tempted to do that...
>>
>> Anyway, comments solicited!
What's wrong with giving away overflow handling to the userspace? It
knows its inflight count, and it shouldn't be a problem to allocate
large enough rings. The same goes for the backpressure patch. Do you
have a particular requirement/user for this?

Awhile something could be done {efficiently,securely,etc} in the
userspace, I would prefer to keep the kernel part simpler.

> 
> After a little deliberation, I think we should go with the one that
> doesn't require users to opt-in. As mentioned, let's change it to
> -EAGAIN to not introduce a new errno for this. They essentially mean
> the same thing anyway.
> 
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index f8344f95817e..4c488bf6e889 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -203,6 +203,7 @@ struct io_ring_ctx {
>  		unsigned		sq_mask;
>  		unsigned		sq_thread_idle;
>  		unsigned		cached_sq_dropped;
> +		atomic_t		cached_cq_overflow;
>  		struct io_uring_sqe	*sq_sqes;
>  
>  		struct list_head	defer_list;
> @@ -221,13 +222,12 @@ struct io_ring_ctx {
>  
>  	struct {
>  		unsigned		cached_cq_tail;
> -		atomic_t		cached_cq_overflow;
>  		unsigned		cq_entries;
>  		unsigned		cq_mask;
> +		atomic_t		cq_timeouts;
>  		struct wait_queue_head	cq_wait;
>  		struct fasync_struct	*cq_fasync;
>  		struct eventfd_ctx	*cq_ev_fd;
> -		atomic_t		cq_timeouts;
>  	} ____cacheline_aligned_in_smp;
>  
>  	struct io_rings	*rings;
> @@ -705,16 +705,39 @@ static void io_cqring_add_event(struct io_kiocb *req, long res)
>  	io_cqring_ev_posted(ctx);
>  }
>  
> +static bool io_req_over_limit(struct io_ring_ctx *ctx)
> +{
> +	unsigned limit, inflight;
> +
> +	/*
> +	 * This doesn't need to be super precise, so only check every once
> +	 * in a while.
> +	 */
> +	if (ctx->cached_sq_head & ctx->sq_mask)
> +		return false;
> +
> +	if (ctx->flags & IORING_SETUP_CQ_NODROP)
> +		limit = 2 * ctx->cq_entries;
> +	else
> +		limit = ctx->cq_entries;
> +
> +	inflight = ctx->cached_sq_head -
> +		  (ctx->cached_cq_tail + atomic_read(&ctx->cached_cq_overflow));
> +	return inflight >= limit;
> +}
> +
>  static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
> -				   struct io_submit_state *state)
> +				   struct io_submit_state *state, bool force)
>  {
>  	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
>  	struct io_kiocb *req;
>  
>  	if (!percpu_ref_tryget(&ctx->refs))
> -		return NULL;
> +		return ERR_PTR(-ENXIO);
>  
>  	if (!state) {
> +		if (unlikely(!force && io_req_over_limit(ctx)))
> +			goto out;
>  		req = kmem_cache_alloc(req_cachep, gfp);
>  		if (unlikely(!req))
>  			goto out;
> @@ -722,6 +745,8 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
>  		size_t sz;
>  		int ret;
>  
> +		if (unlikely(!force && io_req_over_limit(ctx)))
> +			goto out;
>  		sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
>  		ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
>  
> @@ -754,7 +779,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
>  	return req;
>  out:
>  	percpu_ref_put(&ctx->refs);
> -	return NULL;
> +	return ERR_PTR(-EAGAIN);
>  }
>  
>  static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
> @@ -2963,10 +2988,11 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
>  		struct io_kiocb *req;
>  		unsigned int sqe_flags;
>  
> -		req = io_get_req(ctx, statep);
> -		if (unlikely(!req)) {
> +		req = io_get_req(ctx, statep, false);
> +		if (unlikely(IS_ERR(req))) {
>  			if (!submitted)
> -				submitted = -EAGAIN;
> +				submitted = PTR_ERR(req);
> +			req = NULL;
>  			break;
>  		}
>  		if (!io_get_sqring(ctx, &req->submit)) {
> @@ -2986,9 +3012,11 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
>  
>  		if (link && (sqe_flags & IOSQE_IO_DRAIN)) {
>  			if (!shadow_req) {
> -				shadow_req = io_get_req(ctx, NULL);
> -				if (unlikely(!shadow_req))
> +				shadow_req = io_get_req(ctx, NULL, true);
> +				if (unlikely(IS_ERR(shadow_req))) {
> +					shadow_req = NULL;
>  					goto out;
> +				}
>  				shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
>  				refcount_dec(&shadow_req->refs);
>  			}
> 

-- 
Pavel Begunkov

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 833 bytes --]