* [PATCH] io_uring: introduce inline reqs for IORING_SETUP_IOPOLL & direct_io
From: Jianchao Wang @ 2019-04-02  3:10 UTC
  To: axboe; +Cc: viro, linux-block, linux-fsdevel, linux-kernel

For the IORING_SETUP_IOPOLL & direct_io case, all of the submission
and completion handling runs under ctx->uring_lock or in the SQ poll
thread context, so io_get_req and io_put_req are already well
serialized.

Based on this, we introduce a per-ctx ring of preallocated reqs and
need no lock to serialize the updates of its head and tail.
Performance benefits from this. The result of the following fio
command

fio --name=io_uring_test --ioengine=io_uring --hipri --fixedbufs
--iodepth=16 --direct=1 --numjobs=1 --filename=/dev/nvme0n1 --bs=4k
--group_reporting --runtime=10

shows IOPS improving from 197K to 206K.

Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
---
 fs/io_uring.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 78 insertions(+), 18 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6aaa3058..40837e4 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -104,11 +104,17 @@ struct async_list {
 	size_t			io_pages;
 };
 
+#define INLINE_REQS_TOTAL 128
+
 struct io_ring_ctx {
 	struct {
 		struct percpu_ref	refs;
 	} ____cacheline_aligned_in_smp;
 
+	struct io_kiocb *inline_reqs[INLINE_REQS_TOTAL];
+	struct io_kiocb *inline_req_array;
+	unsigned long inline_reqs_h, inline_reqs_t;
+
 	struct {
 		unsigned int		flags;
 		bool			compat;
@@ -183,7 +189,9 @@ struct io_ring_ctx {
 
 struct sqe_submit {
 	const struct io_uring_sqe	*sqe;
+	struct file 			*file;
 	unsigned short			index;
+	bool 				is_fixed;
 	bool				has_user;
 	bool				needs_lock;
 	bool				needs_fixed_file;
@@ -228,7 +236,7 @@ struct io_kiocb {
 #define REQ_F_PREPPED		16	/* prep already done */
 	u64			user_data;
 	u64			error;
-
+	bool 			ctx_inline;
 	struct work_struct	work;
 };
 
@@ -397,7 +405,8 @@ static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
 }
 
 static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
-				   struct io_submit_state *state)
+				   struct io_submit_state *state,
+				   bool direct_io)
 {
 	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
 	struct io_kiocb *req;
@@ -405,10 +414,19 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 	if (!percpu_ref_tryget(&ctx->refs))
 		return NULL;
 
-	if (!state) {
+	/*
+	 * Avoid race with workqueue context that handles buffered IO.
+	 */
+	if (direct_io &&
+	    ctx->inline_reqs_h - ctx->inline_reqs_t < INLINE_REQS_TOTAL) {
+	    req = ctx->inline_reqs[ctx->inline_reqs_h % INLINE_REQS_TOTAL];
+	    ctx->inline_reqs_h++;
+	    req->ctx_inline = true;
+	} else if (!state) {
 		req = kmem_cache_alloc(req_cachep, gfp);
 		if (unlikely(!req))
 			goto out;
+		req->ctx_inline = false;
 	} else if (!state->free_reqs) {
 		size_t sz;
 		int ret;
@@ -429,10 +447,12 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 		state->free_reqs = ret - 1;
 		state->cur_req = 1;
 		req = state->reqs[0];
+		req->ctx_inline = false;
 	} else {
 		req = state->reqs[state->cur_req];
 		state->free_reqs--;
 		state->cur_req++;
+		req->ctx_inline = false;
 	}
 
 	req->ctx = ctx;
@@ -456,10 +476,17 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
 
 static void io_free_req(struct io_kiocb *req)
 {
+	struct io_ring_ctx	*ctx = req->ctx;
+
 	if (req->file && !(req->flags & REQ_F_FIXED_FILE))
 		fput(req->file);
 	io_ring_drop_ctx_refs(req->ctx, 1);
-	kmem_cache_free(req_cachep, req);
+	if (req->ctx_inline) {
+	    ctx->inline_reqs[ctx->inline_reqs_t % INLINE_REQS_TOTAL] = req;
+	    ctx->inline_reqs_t++;
+	} else {
+	    kmem_cache_free(req_cachep, req);
+	}
 }
 
 static void io_put_req(struct io_kiocb *req)
@@ -492,7 +519,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 			 * completions for those, only batch free for fixed
 			 * file.
 			 */
-			if (req->flags & REQ_F_FIXED_FILE) {
+			if (!req->ctx_inline && req->flags & REQ_F_FIXED_FILE) {
 				reqs[to_free++] = req;
 				if (to_free == ARRAY_SIZE(reqs))
 					io_free_req_many(ctx, reqs, &to_free);
@@ -1562,7 +1589,7 @@ static bool io_op_needs_file(const struct io_uring_sqe *sqe)
 	}
 }
 
-static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
+static int io_req_set_file(struct io_ring_ctx *ctx, struct sqe_submit *s,
 			   struct io_submit_state *state, struct io_kiocb *req)
 {
 	unsigned flags;
@@ -1572,7 +1599,7 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
 	fd = READ_ONCE(s->sqe->fd);
 
 	if (!io_op_needs_file(s->sqe)) {
-		req->file = NULL;
+		s->file = NULL;
 		return 0;
 	}
 
@@ -1580,13 +1607,13 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
 		if (unlikely(!ctx->user_files ||
 		    (unsigned) fd >= ctx->nr_user_files))
 			return -EBADF;
-		req->file = ctx->user_files[fd];
-		req->flags |= REQ_F_FIXED_FILE;
+		s->file = ctx->user_files[fd];
+		s->is_fixed = true;
 	} else {
 		if (s->needs_fixed_file)
 			return -EBADF;
-		req->file = io_file_get(state, fd);
-		if (unlikely(!req->file))
+		s->file = io_file_get(state, fd);
+		if (unlikely(!s->file))
 			return -EBADF;
 	}
 
@@ -1603,13 +1630,20 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
 	if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE))
 		return -EINVAL;
 
-	req = io_get_req(ctx, state);
-	if (unlikely(!req))
-		return -EAGAIN;
-
 	ret = io_req_set_file(ctx, s, state, req);
 	if (unlikely(ret))
-		goto out;
+		return ret;
+
+	req = io_get_req(ctx, state, io_is_direct(s->file));
+	if (unlikely(!req)) {
+		if (s->file && !s->is_fixed)
+			fput(s->file);
+		return -EAGAIN;
+	}
+
+	req->file = s->file;
+	if (s->is_fixed)
+	    req->flags |= REQ_F_FIXED_FILE;
 
 	ret = __io_submit_sqe(ctx, req, s, true, state);
 	if (ret == -EAGAIN) {
@@ -1640,7 +1674,6 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
 		}
 	}
 
-out:
 	/* drop submission reference */
 	io_put_req(req);
 
@@ -2520,6 +2553,9 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 		sock_release(ctx->ring_sock);
 #endif
 
+	if (ctx->inline_req_array)
+		kfree(ctx->inline_req_array);
+
 	io_mem_free(ctx->sq_ring);
 	io_mem_free(ctx->sq_sqes);
 	io_mem_free(ctx->cq_ring);
@@ -2783,7 +2819,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
 	struct user_struct *user = NULL;
 	struct io_ring_ctx *ctx;
 	bool account_mem;
-	int ret;
+	int ret, i;
 
 	if (!entries || entries > IORING_MAX_ENTRIES)
 		return -EINVAL;
@@ -2817,6 +2853,30 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
 		free_uid(user);
 		return -ENOMEM;
 	}
+
+	/*
+	 * With IORING_SETUP_IOPOLL and direct_io, all of the submission
+	 * and completion handling runs under ctx->uring_lock or in the
+	 * SQ poll thread context, so io_get_req and io_put_req are
+	 * already well serialized. We can thus update inline_reqs_h and
+	 * inline_reqs_t without any lock and benefit from the inline reqs.
+	 */
+	if (ctx->flags & IORING_SETUP_IOPOLL) {
+		ctx->inline_req_array = kmalloc(
+			sizeof(struct io_kiocb) * INLINE_REQS_TOTAL,
+			GFP_KERNEL);
+		if (ctx->inline_req_array) {
+			for (i = 0; i < INLINE_REQS_TOTAL; i++)
+				ctx->inline_reqs[i] = ctx->inline_req_array + i;
+			ctx->inline_reqs_h = ctx->inline_reqs_t = 0;
+		}
+	}
+
+	if (!ctx->inline_req_array) {
+		ctx->inline_reqs_h = INLINE_REQS_TOTAL;
+		ctx->inline_reqs_t = 0;
+	}
+
 	ctx->compat = in_compat_syscall();
 	ctx->account_mem = account_mem;
 	ctx->user = user;
-- 
2.7.4


* Re: [PATCH] io_uring: introduce inline reqs for IORING_SETUP_IOPOLL & direct_io
From: Jens Axboe @ 2019-04-02  3:47 UTC
  To: Jianchao Wang; +Cc: viro, linux-block, linux-fsdevel, linux-kernel

On 4/1/19 9:10 PM, Jianchao Wang wrote:
> For the IORING_SETUP_IOPOLL & direct_io case, all of the submission
> and completion handling runs under ctx->uring_lock or in the SQ poll
> thread context, so io_get_req and io_put_req are already well
> serialized.
> 
> Based on this, we introduce a per-ctx ring of preallocated reqs and
> need no lock to serialize the updates of its head and tail.
> Performance benefits from this. The result of the following fio
> command
> 
> fio --name=io_uring_test --ioengine=io_uring --hipri --fixedbufs
> --iodepth=16 --direct=1 --numjobs=1 --filename=/dev/nvme0n1 --bs=4k
> --group_reporting --runtime=10
> 
> shows IOPS improving from 197K to 206K.

I like this idea, but not a fan of the execution of it. See below.

> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 6aaa3058..40837e4 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -104,11 +104,17 @@ struct async_list {
>  	size_t			io_pages;
>  };
>  
> +#define INLINE_REQS_TOTAL 128
> +
>  struct io_ring_ctx {
>  	struct {
>  		struct percpu_ref	refs;
>  	} ____cacheline_aligned_in_smp;
>  
> +	struct io_kiocb *inline_reqs[INLINE_REQS_TOTAL];
> +	struct io_kiocb *inline_req_array;
> +	unsigned long inline_reqs_h, inline_reqs_t;

Why not just use a list? The req has a list member anyway. Then you don't
need a huge array, just a count.
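
Roughly like this, just as a sketch with placeholder field names (none of
this is meant as final code):

/* new fields in io_ring_ctx */
	struct list_head	inline_req_list;	/* cached free reqs */
	unsigned int		nr_inline_reqs;		/* current cache size */

/* io_get_req(): reuse a cached req when one is available */
if (ctx->nr_inline_reqs) {
	req = list_first_entry(&ctx->inline_req_list, struct io_kiocb, list);
	list_del(&req->list);
	ctx->nr_inline_reqs--;
}

/* io_free_req(): cached reqs go back on the list, the rest to the slab */
if (req->ctx_inline) {
	list_add(&req->list, &ctx->inline_req_list);
	ctx->nr_inline_reqs++;
} else {
	kmem_cache_free(req_cachep, req);
}

That way the head/tail arithmetic and the fixed-size pointer array go away,
and the cache size is simply however many reqs you preallocate at setup.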

> +
>  	struct {
>  		unsigned int		flags;
>  		bool			compat;
> @@ -183,7 +189,9 @@ struct io_ring_ctx {
>  
>  struct sqe_submit {
>  	const struct io_uring_sqe	*sqe;
> +	struct file 			*file;
>  	unsigned short			index;
> +	bool 				is_fixed;
>  	bool				has_user;
>  	bool				needs_lock;
>  	bool				needs_fixed_file;

Not sure why you're moving these to the sqe_submit?

> @@ -228,7 +236,7 @@ struct io_kiocb {
>  #define REQ_F_PREPPED		16	/* prep already done */
>  	u64			user_data;
>  	u64			error;
> -
> +	bool 			ctx_inline;
>  	struct work_struct	work;
>  };

ctx_inline should just be a req flag.
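
E.g., as a sketch (the flag value and the helper below are made up for
illustration):

#define REQ_F_CTX_INLINE	32	/* req came from the per-ctx cache */

/* io_get_req(), after req->flags has been initialised */
req->flags |= REQ_F_CTX_INLINE;

/* io_free_req() */
if (req->flags & REQ_F_CTX_INLINE)
	io_req_cache_put(ctx, req);	/* hypothetical helper */
else
	kmem_cache_free(req_cachep, req);

That avoids growing io_kiocb with another bool and keeps the state together
with the other request flags.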

>  
> @@ -397,7 +405,8 @@ static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
>  }
>  
>  static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
> -				   struct io_submit_state *state)
> +				   struct io_submit_state *state,
> +				   bool direct_io)
>  {
>  	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
>  	struct io_kiocb *req;
> @@ -405,10 +414,19 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
>  	if (!percpu_ref_tryget(&ctx->refs))
>  		return NULL;
>  
> -	if (!state) {
> +	/*
> +	 * Avoid race with workqueue context that handles buffered IO.
> +	 */
> +	if (direct_io &&
> +	    ctx->inline_reqs_h - ctx->inline_reqs_t < INLINE_REQS_TOTAL) {
> +	    req = ctx->inline_reqs[ctx->inline_reqs_h % INLINE_REQS_TOTAL];
> +	    ctx->inline_reqs_h++;
> +	    req->ctx_inline = true;
> +	} else if (!state) {

What happens for O_DIRECT that ends up being punted to async context?
We need a clearer indication of whether or not we're under the lock,
and then we can get rid of the direct_io "limitation" for this. Arguably,
cached buffered IO needs this even more than O_DIRECT does, since that
is much faster.
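
As a rough sketch of that direction (io_req_cache_get() and the list/count
fields are assumptions, not existing code), the cache lookup could take an
explicit "are we serialized" flag instead of testing direct_io:

/*
 * ctx_locked would be true when the caller holds ctx->uring_lock or runs
 * in the SQ poll thread; the punted/async path passes false and falls
 * back to the slab.
 */
static struct io_kiocb *io_req_cache_get(struct io_ring_ctx *ctx,
					 bool ctx_locked)
{
	struct io_kiocb *req;

	if (!ctx_locked || !ctx->nr_inline_reqs)
		return NULL;

	req = list_first_entry(&ctx->inline_req_list, struct io_kiocb, list);
	list_del(&req->list);
	ctx->nr_inline_reqs--;
	return req;
}

io_get_req() would try this first and only hit kmem_cache_alloc() when it
returns NULL, so the buffered submission path, which also runs under
uring_lock, gets the fast path as well.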

-- 
Jens Axboe


* Re: [PATCH] io_uring: introduce inline reqs for IORING_SETUP_IOPOLL & direct_io
From: jianchao.wang @ 2019-04-02  8:29 UTC
  To: Jens Axboe; +Cc: viro, linux-block, linux-fsdevel, linux-kernel

Hi Jens

On 4/2/19 11:47 AM, Jens Axboe wrote:
> On 4/1/19 9:10 PM, Jianchao Wang wrote:
>> For the IORING_SETUP_IOPOLL & direct_io case, all of the submission
>> and completion handling runs under ctx->uring_lock or in the SQ poll
>> thread context, so io_get_req and io_put_req are already well
>> serialized.
>>
>> Based on this, we introduce a per-ctx ring of preallocated reqs and
>> need no lock to serialize the updates of its head and tail.
>> Performance benefits from this. The result of the following fio
>> command
>>
>> fio --name=io_uring_test --ioengine=io_uring --hipri --fixedbufs
>> --iodepth=16 --direct=1 --numjobs=1 --filename=/dev/nvme0n1 --bs=4k
>> --group_reporting --runtime=10
>>
>> shows IOPS improving from 197K to 206K.
> 
> I like this idea, but not a fan of the execution of it. See below.
> 
>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>> index 6aaa3058..40837e4 100644
>> --- a/fs/io_uring.c
>> +++ b/fs/io_uring.c
>> @@ -104,11 +104,17 @@ struct async_list {
>>  	size_t			io_pages;
>>  };
>>  
>> +#define INLINE_REQS_TOTAL 128
>> +
>>  struct io_ring_ctx {
>>  	struct {
>>  		struct percpu_ref	refs;
>>  	} ____cacheline_aligned_in_smp;
>>  
>> +	struct io_kiocb *inline_reqs[INLINE_REQS_TOTAL];
>> +	struct io_kiocb *inline_req_array;
>> +	unsigned long inline_reqs_h, inline_reqs_t;
> 
> Why not just use a list? The req has a list member anyway. Then you don't
> need a huge array, just a count.

Yes, indeed.

> 
>> +
>>  	struct {
>>  		unsigned int		flags;
>>  		bool			compat;
>> @@ -183,7 +189,9 @@ struct io_ring_ctx {
>>  
>>  struct sqe_submit {
>>  	const struct io_uring_sqe	*sqe;
>> +	struct file 			*file;
>>  	unsigned short			index;
>> +	bool 				is_fixed;
>>  	bool				has_user;
>>  	bool				needs_lock;
>>  	bool				needs_fixed_file;
> 
> Not sure why you're moving these to the sqe_submit?

I just wanted to get the file before io_get_req so that I know whether it is
direct IO. This is unnecessary if we eliminate the direct IO limitation.

> 
>> @@ -228,7 +236,7 @@ struct io_kiocb {
>>  #define REQ_F_PREPPED		16	/* prep already done */
>>  	u64			user_data;
>>  	u64			error;
>> -
>> +	bool 			ctx_inline;
>>  	struct work_struct	work;
>>  };
> 
> ctx_inline should just be a req flag.

Yes.

> 
>>  
>> @@ -397,7 +405,8 @@ static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
>>  }
>>  
>>  static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
>> -				   struct io_submit_state *state)
>> +				   struct io_submit_state *state,
>> +				   bool direct_io)
>>  {
>>  	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
>>  	struct io_kiocb *req;
>> @@ -405,10 +414,19 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
>>  	if (!percpu_ref_tryget(&ctx->refs))
>>  		return NULL;
>>  
>> -	if (!state) {
>> +	/*
>> +	 * Avoid race with workqueue context that handles buffered IO.
>> +	 */
>> +	if (direct_io &&
>> +	    ctx->inline_reqs_h - ctx->inline_reqs_t < INLINE_REQS_TOTAL) {
>> +	    req = ctx->inline_reqs[ctx->inline_reqs_h % INLINE_REQS_TOTAL];
>> +	    ctx->inline_reqs_h++;
>> +	    req->ctx_inline = true;
>> +	} else if (!state) {
> 
> What happens for O_DIRECT that ends up being punted to async context?

I misunderstood; I thought only buffered IO would be punted to the async
workqueue context.

> We need a clearer indication of whether or not we're under the lock,
> and then we can get rid of the direct_io "limitation" for this. Arguably,
> cached buffered IO needs this even more than O_DIRECT does, since that
> is much faster.
> 

Before the IO is punted to the async workqueue context, an sqe_copy is
allocated. How about allocating a structure that carries both an sqe and an
io_kiocb? We could then use the newly allocated io_kiocb to replace the
preallocated one and release the latter, which would eliminate the wrong
direct_io limitation, roughly as sketched below.
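
Something like this, with made-up names, just to illustrate the idea:

/* hypothetical: one allocation carrying both the sqe copy and a req */
struct io_async_submit {
	struct io_uring_sqe	sqe;	/* copy of the user sqe for the punt */
	struct io_kiocb		req;	/* takes over from the cached req */
};

The punt path would allocate one of these, copy the user sqe into it, move
the state over from the cached io_kiocb into the embedded one, queue the
async work against that, and hand the cached req straight back to the ctx,
so the cache never has to survive async context.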

Thanks
Jianchao
