From: Pavel Begunkov <asml.silence@gmail.com>
To: Jens Axboe <axboe@kernel.dk>,
	linux-block@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: Re: [PATCH 2/3] io_uring: Fix broken links with offloading
Date: Fri, 25 Oct 2019 12:36:14 +0300	[thread overview]
Message-ID: <dd88f5be-930b-53e5-c3b6-12927d6634b1@gmail.com> (raw)
In-Reply-To: <bd0eaa7729e3b8b599a25167df0a4ee583da69cc.1571991701.git.asml.silence@gmail.com>


On 25/10/2019 12:31, Pavel Begunkov (Silence) wrote:
> From: Pavel Begunkov <asml.silence@gmail.com>
> 
> io_sq_thread() processes sqes in batches of 8 without considering links.
> As a result, links will be randomly subdivided.
> 
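To illustrate the breakage, here is a minimal userspace sketch (assuming
liburing's API; the helper name is made up and error handling is elided).
The two SQEs below form one chain via IOSQE_IO_LINK, but nothing stops the
fixed batch of 8 in io_sq_thread() from cutting the chain between them:

#include <liburing.h>
#include <sys/uio.h>

static int submit_linked_pair(struct io_uring *ring, int fd,
			      struct iovec *iov)
{
	struct io_uring_sqe *sqe;

	/* first link in the chain: write the buffer */
	sqe = io_uring_get_sqe(ring);
	io_uring_prep_writev(sqe, fd, iov, 1, 0);
	sqe->flags |= IOSQE_IO_LINK;	/* the next SQE depends on this one */

	/* second link: fsync must run only after the write completes */
	sqe = io_uring_get_sqe(ring);
	io_uring_prep_fsync(sqe, fd, 0);

	/* with IORING_SETUP_SQPOLL this publishes the new tail and wakes
	 * the SQ thread if needed */
	return io_uring_submit(ring);
}

If the old code's batch boundary fell between the writev and the fsync,
the second half started a fresh chain instead of continuing the first one.
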
> The easiest way to fix it is to call io_get_sqring() inside
> io_submit_sqes(), as io_ring_submit() does.
> 
> Downsides:
> 1. It removes the optimisation of not grabbing mm_struct for fixed files.
> 2. It submits all sqes in one go, without finer-grained scheduling
> with CQ processing.
> 
Is the optimisation of not grabbing the mm for fixed files critical?
I'd like to put it back later, after some cleanup.
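One way it could come back after the cleanup is to grab the mm lazily,
per SQE, inside the new io_submit_sqes() loop. A rough, untested sketch
(reusing io_sqe_needs_user() from the hunk removed below):

	for (i = 0; i < nr; i++) {
		struct sqe_submit s;

		if (!io_get_sqring(ctx, &s))
			break;

		/* take the mm only once an SQE actually needs user memory */
		if (io_sqe_needs_user(s.sqe) && !cur_mm) {
			mm_fault = !mmget_not_zero(ctx->sqo_mm);
			if (!mm_fault) {
				use_mm(ctx->sqo_mm);
				cur_mm = ctx->sqo_mm;
			}
		}

		/* rest of the loop unchanged */
	}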

> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
> ---
>  fs/io_uring.c | 62 +++++++++++++++++++++++++++------------------------
>  1 file changed, 33 insertions(+), 29 deletions(-)
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 0e141d905a5b..949c82a40d16 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -735,6 +735,14 @@ static unsigned io_cqring_events(struct io_rings *rings)
>  	return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
>  }
>  
> +static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
> +{
> +	struct io_rings *rings = ctx->rings;
> +
> +	/* make sure SQ entry isn't read before tail */
> +	return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
> +}
> +
>  /*
>   * Find and free completed poll iocbs
>   */
> @@ -2560,8 +2568,8 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
>  	return false;
>  }
>  
> -static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
> -			  unsigned int nr, bool has_user, bool mm_fault)
> +static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
> +			  bool has_user, bool mm_fault)
>  {
>  	struct io_submit_state state, *statep = NULL;
>  	struct io_kiocb *link = NULL;
> @@ -2575,6 +2583,11 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
>  	}
>  
>  	for (i = 0; i < nr; i++) {
> +		struct sqe_submit s;
> +
> +		if (!io_get_sqring(ctx, &s))
> +			break;
> +
>  		/*
>  		 * If previous wasn't linked and we have a linked command,
>  		 * that's the end of the chain. Submit the previous link.
> @@ -2584,9 +2597,9 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
>  			link = NULL;
>  			shadow_req = NULL;
>  		}
> -		prev_was_link = (sqes[i].sqe->flags & IOSQE_IO_LINK) != 0;
> +		prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0;
>  
> -		if (link && (sqes[i].sqe->flags & IOSQE_IO_DRAIN)) {
> +		if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) {
>  			if (!shadow_req) {
>  				shadow_req = io_get_req(ctx, NULL);
>  				if (unlikely(!shadow_req))
> @@ -2594,18 +2607,18 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
>  				shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
>  				refcount_dec(&shadow_req->refs);
>  			}
> -			shadow_req->sequence = sqes[i].sequence;
> +			shadow_req->sequence = s.sequence;
>  		}
>  
>  out:
>  		if (unlikely(mm_fault)) {
> -			io_cqring_add_event(ctx, sqes[i].sqe->user_data,
> +			io_cqring_add_event(ctx, s.sqe->user_data,
>  						-EFAULT);
>  		} else {
> -			sqes[i].has_user = has_user;
> -			sqes[i].needs_lock = true;
> -			sqes[i].needs_fixed_file = true;
> -			io_submit_sqe(ctx, &sqes[i], statep, &link);
> +			s.has_user = has_user;
> +			s.needs_lock = true;
> +			s.needs_fixed_file = true;
> +			io_submit_sqe(ctx, &s, statep, &link);
>  			submitted++;
>  		}
>  	}
> @@ -2620,7 +2633,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
>  
>  static int io_sq_thread(void *data)
>  {
> -	struct sqe_submit sqes[IO_IOPOLL_BATCH];
>  	struct io_ring_ctx *ctx = data;
>  	struct mm_struct *cur_mm = NULL;
>  	mm_segment_t old_fs;
> @@ -2635,8 +2647,8 @@ static int io_sq_thread(void *data)
>  
>  	timeout = inflight = 0;
>  	while (!kthread_should_park()) {
> -		bool all_fixed, mm_fault = false;
> -		int i;
> +		bool mm_fault = false;
> +		unsigned int to_submit;
>  
>  		if (inflight) {
>  			unsigned nr_events = 0;
> @@ -2656,7 +2668,8 @@ static int io_sq_thread(void *data)
>  				timeout = jiffies + ctx->sq_thread_idle;
>  		}
>  
> -		if (!io_get_sqring(ctx, &sqes[0])) {
> +		to_submit = io_sqring_entries(ctx);
> +		if (!to_submit) {
>  			/*
>  			 * We're polling. If we're within the defined idle
>  			 * period, then let us spin without work before going
> @@ -2687,7 +2700,8 @@ static int io_sq_thread(void *data)
>  			/* make sure to read SQ tail after writing flags */
>  			smp_mb();
>  
> -			if (!io_get_sqring(ctx, &sqes[0])) {
> +			to_submit = io_sqring_entries(ctx);
> +			if (!to_submit) {
>  				if (kthread_should_park()) {
>  					finish_wait(&ctx->sqo_wait, &wait);
>  					break;
> @@ -2705,19 +2719,8 @@ static int io_sq_thread(void *data)
>  			ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
>  		}
>  
> -		i = 0;
> -		all_fixed = true;
> -		do {
> -			if (all_fixed && io_sqe_needs_user(sqes[i].sqe))
> -				all_fixed = false;
> -
> -			i++;
> -			if (i == ARRAY_SIZE(sqes))
> -				break;
> -		} while (io_get_sqring(ctx, &sqes[i]));
> -
>  		/* Unless all new commands are FIXED regions, grab mm */
> -		if (!all_fixed && !cur_mm) {
> +		if (!cur_mm) {
>  			mm_fault = !mmget_not_zero(ctx->sqo_mm);
>  			if (!mm_fault) {
>  				use_mm(ctx->sqo_mm);
> @@ -2725,8 +2728,9 @@ static int io_sq_thread(void *data)
>  			}
>  		}
>  
> -		inflight += io_submit_sqes(ctx, sqes, i, cur_mm != NULL,
> -						mm_fault);
> +		to_submit = min(to_submit, ctx->sq_entries);
> +		inflight += io_submit_sqes(ctx, to_submit, cur_mm != NULL,
> +					   mm_fault);
>  
>  		/* Commit SQ ring head once we've consumed all SQEs */
>  		io_commit_sqring(ctx);
> 
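
One more note on io_sqring_entries(): the smp_load_acquire() on sq.tail
pairs with the release store the producer does when it publishes new
entries, so every SQE counted here is fully written before we look at it.
The userspace side, in rough C11 terms (names illustrative; liburing does
the equivalent when it bumps the tail):

#include <linux/io_uring.h>
#include <stdatomic.h>

static void publish_sqe(struct io_uring_sqe *sq_array,
			_Atomic unsigned *ktail, unsigned mask,
			const struct io_uring_sqe *src)
{
	/* only the producer writes the tail, so a relaxed load is enough */
	unsigned tail = atomic_load_explicit(ktail, memory_order_relaxed);

	sq_array[tail & mask] = *src;	/* fill the entry first... */
	/* ...then publish it; pairs with the kernel's smp_load_acquire() */
	atomic_store_explicit(ktail, tail + 1, memory_order_release);
}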

-- 
Yours sincerely,
Pavel Begunkov

