* [PATCH 5.11 0/2] registered files improvements for IOPOLL mode
@ 2020-11-17  6:17 Xiaoguang Wang
  2020-11-17  6:17 ` [PATCH 5.11 1/2] io_uring: keep a pointer ref_node in io_kiocb Xiaoguang Wang
  2020-11-17  6:17 ` [PATCH 5.11 2/2] io_uring: don't take percpu_ref operations for registered files in IOPOLL mode Xiaoguang Wang
  0 siblings, 2 replies; 14+ messages in thread
From: Xiaoguang Wang @ 2020-11-17  6:17 UTC (permalink / raw)
  To: io-uring; +Cc: axboe, joseph.qi

Xiaoguang Wang (2):
  io_uring: keep a pointer ref_node in io_kiocb
  io_uring: don't take percpu_ref operations for registered files in
    IOPOLL mode

 fs/io_uring.c | 44 ++++++++++++++++++++++++++++++++++++--------
 1 file changed, 36 insertions(+), 8 deletions(-)

-- 
2.17.2



* [PATCH 5.11 1/2] io_uring: keep a pointer ref_node in io_kiocb
  2020-11-17  6:17 [PATCH 5.11 0/2] registered files improvements for IOPOLL mode Xiaoguang Wang
@ 2020-11-17  6:17 ` Xiaoguang Wang
  2020-11-17  6:17 ` [PATCH 5.11 2/2] io_uring: don't take percpu_ref operations for registered files in IOPOLL mode Xiaoguang Wang
  1 sibling, 0 replies; 14+ messages in thread
From: Xiaoguang Wang @ 2020-11-17  6:17 UTC (permalink / raw)
  To: io-uring; +Cc: axboe, joseph.qi

Basically no functional changes in this patch, just preparation for a
later patch.

Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
---
 fs/io_uring.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index edfd7c3b8de6..219609c38e48 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -714,7 +714,7 @@ struct io_kiocb {
 	u64				user_data;
 
 	struct io_kiocb			*link;
-	struct percpu_ref		*fixed_file_refs;
+	struct fixed_file_ref_node      *ref_node;
 
 	/*
 	 * 1. used with ctx->iopoll_list with reads/writes
@@ -1927,7 +1927,7 @@ static inline void io_put_file(struct io_kiocb *req, struct file *file,
 			  bool fixed)
 {
 	if (fixed)
-		percpu_ref_put(req->fixed_file_refs);
+		percpu_ref_put(&req->ref_node->refs);
 	else
 		fput(file);
 }
@@ -6344,8 +6344,8 @@ static struct file *io_file_get(struct io_submit_state *state,
 		fd = array_index_nospec(fd, ctx->nr_user_files);
 		file = io_file_from_index(ctx, fd);
 		if (file) {
-			req->fixed_file_refs = &ctx->file_data->node->refs;
-			percpu_ref_get(req->fixed_file_refs);
+			req->ref_node = ctx->file_data->node;
+			percpu_ref_get(&req->ref_node->refs);
 		}
 	} else {
 		trace_io_uring_file_get(ctx, fd);
-- 
2.17.2



* [PATCH 5.11 2/2] io_uring: don't take percpu_ref operations for registered files in IOPOLL mode
  2020-11-17  6:17 [PATCH 5.11 0/2] registered files improvements for IOPOLL mode Xiaoguang Wang
  2020-11-17  6:17 ` [PATCH 5.11 1/2] io_uring: keep a pointer ref_node in io_kiocb Xiaoguang Wang
@ 2020-11-17  6:17 ` Xiaoguang Wang
  2020-11-17 10:43   ` Pavel Begunkov
  1 sibling, 1 reply; 14+ messages in thread
From: Xiaoguang Wang @ 2020-11-17  6:17 UTC (permalink / raw)
  To: io-uring; +Cc: axboe, joseph.qi

In io_file_get() and io_put_file(), currently we use percpu_ref_get() and
percpu_ref_put() for registered files, but it's hard to say they're very
light-weight synchronization primitives. On one of our x86 machines, I get the
perf data below (registered files enabled):
Samples: 480K of event 'cycles', Event count (approx.): 298552867297
Overhead  Comman  Shared Object     Symbol
   0.45%  :53243  [kernel.vmlinux]  [k] io_file_get

I haven't yet found a good and generic solution for this issue, but in
IOPOLL mode, given that get/put of registered files always happens under
uring_lock, we can use a simple, plain u64 counter to synchronize with
registered-files update operations in __io_sqe_files_update().

With this patch, perf data shows:
Samples: 480K of event 'cycles', Event count (approx.): 298811248406
Overhead  Comma  Shared Object     Symbol
   0.25%  :4182  [kernel.vmlinux]  [k] io_file_get

Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
---
 fs/io_uring.c | 40 ++++++++++++++++++++++++++++++++++------
 1 file changed, 34 insertions(+), 6 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 219609c38e48..0fa48ea50ff9 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -201,6 +201,11 @@ struct fixed_file_table {
 
 struct fixed_file_ref_node {
 	struct percpu_ref		refs;
+	/*
+	 * Track the number of reqs that reference this node, currently it's
+	 * only used in IOPOLL mode.
+	 */
+	u64				count;
 	struct list_head		node;
 	struct list_head		file_list;
 	struct fixed_file_data		*file_data;
@@ -1926,10 +1931,17 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
 static inline void io_put_file(struct io_kiocb *req, struct file *file,
 			  bool fixed)
 {
-	if (fixed)
-		percpu_ref_put(&req->ref_node->refs);
-	else
+	if (fixed) {
+		/* See same comments in io_file_get(). */
+		if (req->ctx->flags & IORING_SETUP_IOPOLL) {
+			if (!--req->ref_node->count)
+				percpu_ref_kill(&req->ref_node->refs);
+		} else {
+			percpu_ref_put(&req->ref_node->refs);
+		}
+	} else {
 		fput(file);
+	}
 }
 
 static void io_dismantle_req(struct io_kiocb *req)
@@ -6344,8 +6356,16 @@ static struct file *io_file_get(struct io_submit_state *state,
 		fd = array_index_nospec(fd, ctx->nr_user_files);
 		file = io_file_from_index(ctx, fd);
 		if (file) {
+			/*
+			 * IOPOLL mode can always ensure get/put registered files under uring_lock,
+			 * so we can use a simple plain u64 counter to synchronize with registered
+			 * files update operations in __io_sqe_files_update.
+			 */
 			req->ref_node = ctx->file_data->node;
-			percpu_ref_get(&req->ref_node->refs);
+			if (ctx->flags & IORING_SETUP_IOPOLL)
+				req->ref_node->count++;
+			else
+				percpu_ref_get(&req->ref_node->refs);
 		}
 	} else {
 		trace_io_uring_file_get(ctx, fd);
@@ -7215,7 +7235,12 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 		ref_node = list_first_entry(&data->ref_list,
 				struct fixed_file_ref_node, node);
 	spin_unlock(&data->lock);
-	if (ref_node)
+	/*
+	 * If count is not zero, that means we're in IOPOLL mode, and there are
+	 * still reqs that reference this ref_node, let the final req do the
+	 * percpu_ref_kill job.
+	 */
+	if (ref_node && (!--ref_node->count))
 		percpu_ref_kill(&ref_node->refs);
 
 	percpu_ref_kill(&data->refs);
@@ -7625,6 +7650,7 @@ static struct fixed_file_ref_node *alloc_fixed_file_ref_node(
 	INIT_LIST_HEAD(&ref_node->node);
 	INIT_LIST_HEAD(&ref_node->file_list);
 	ref_node->file_data = ctx->file_data;
+	ref_node->count = 1;
 	return ref_node;
 }
 
@@ -7877,7 +7903,9 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 	}
 
 	if (needs_switch) {
-		percpu_ref_kill(&data->node->refs);
+		/* See same comments in io_sqe_files_unregister(). */
+		if (!--data->node->count)
+			percpu_ref_kill(&data->node->refs);
 		spin_lock(&data->lock);
 		list_add(&ref_node->node, &data->ref_list);
 		data->node = ref_node;
-- 
2.17.2



* Re: [PATCH 5.11 2/2] io_uring: don't take percpu_ref operations for registered files in IOPOLL mode
  2020-11-17  6:17 ` [PATCH 5.11 2/2] io_uring: don't take percpu_ref operations for registered files in IOPOLL mode Xiaoguang Wang
@ 2020-11-17 10:43   ` Pavel Begunkov
  2020-11-17 16:21     ` Xiaoguang Wang
  2020-11-17 16:30     ` Jens Axboe
  0 siblings, 2 replies; 14+ messages in thread
From: Pavel Begunkov @ 2020-11-17 10:43 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: axboe, joseph.qi

On 17/11/2020 06:17, Xiaoguang Wang wrote:
> In io_file_get() and io_put_file(), currently we use percpu_ref_get() and
> percpu_ref_put() for registered files, but it's hard to say they're very
> light-weight synchronization primitives. In one our x86 machine, I get below
> perf data(registered files enabled):
> Samples: 480K of event 'cycles', Event count (approx.): 298552867297
> Overhead  Comman  Shared Object     Symbol
>    0.45%  :53243  [kernel.vmlinux]  [k] io_file_get

Do you have throughput/latency numbers? In my experience with polling, for
such small overheads all the CPU cycles you win earlier in the stack will
just be burned on polling, because you would still wait the same fixed*
time for the next response from the device. fixed* here means post-factum
but still mostly independent of how your host machine behaves.

e.g. you kick io_uring at a moment K, and the device responds at a moment
K+M. Regardless of what you do in io_uring, the event won't complete
earlier than after M.

That's not the whole story, but since you're penalising non-IOPOLL and
complicating it, I just want to confirm that you really get any extra
performance from it. Moreover, your drop (0.45%->0.25%) is just 0.20%,
which is comparable with what's left of the function (0.25%), i.e. just a
dozen instructions.


-- 
Pavel Begunkov


* Re: [PATCH 5.11 2/2] io_uring: don't take percpu_ref operations for registered files in IOPOLL mode
  2020-11-17 10:43   ` Pavel Begunkov
@ 2020-11-17 16:21     ` Xiaoguang Wang
  2020-11-17 16:42       ` Pavel Begunkov
  2020-11-17 16:30     ` Jens Axboe
  1 sibling, 1 reply; 14+ messages in thread
From: Xiaoguang Wang @ 2020-11-17 16:21 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring; +Cc: axboe, joseph.qi

hi,

> On 17/11/2020 06:17, Xiaoguang Wang wrote:
>> In io_file_get() and io_put_file(), currently we use percpu_ref_get() and
>> percpu_ref_put() for registered files, but it's hard to say they're very
>> light-weight synchronization primitives. In one our x86 machine, I get below
>> perf data(registered files enabled):
>> Samples: 480K of event 'cycles', Event count (approx.): 298552867297
>> Overhead  Comman  Shared Object     Symbol
>>     0.45%  :53243  [kernel.vmlinux]  [k] io_file_get
> 
> Do you have throughput/latency numbers? In my experience for polling for
> such small overheads all CPU cycles you win earlier in the stack will be
> just burned on polling, because it would still wait for the same fixed*
> time for the next response by device. fixed* here means post-factum but
> still mostly independent of how your host machine behaves.
> 
> e.g. you kick io_uring at a moment K, at device responses at a moment
> K+M. And regardless of what you do in io_uring, the event won't complete
> earlier than after M.
I'm not sure this assumption is correct for a real device. IO requests can
complete at any time; there doesn't seem to be a so-called fixed* time. Say
we're submitting sqe-2 while sqe-1 has already been issued to the device:
the sooner we finish submitting sqe-2, the sooner we can start to poll for
it, and the sooner sqe-2 can complete.

> 
> That's not the whole story, but as you penalising non-IOPOLL and complicate
> it, I just want to confirm that you really get any extra performance from it.
> Moreover, your drop (0.45%->0.25%) is just 0.20%, and it's comparable with
> the rest of the function (left 0.25%), that's just a dozen of instructions.
I agree that this improvement is minor and will penalise non-IOPOLL a bit,
so I'm fine with dropping this patchset.

Here I'd like to explain why I submitted this patch set in the first place.
I found that on one of our arm machines, whose computing power is not that
strong, io_uring (sqpoll and iopoll enabled) couldn't even reach the
capacity of the nvme ssd (but spdk can), so I tried to reduce extra overhead
in IOPOLL mode. Indeed there are many factors that influence io performance
beyond the io_uring framework itself, such as block-layer merge operations,
various io statistics, etc.

Sometimes I even wonder whether there should be a lightweight io_uring
focused mainly on iopoll mode, in which everything runs in one kernel task
context; then we could get rid of many atomic operations, memory barriers,
etc. I wonder whether we can provide a high-performance io stack based on
io_uring that stands shoulder to shoulder with spdk.

As for throughput/latency numbers for this patch set, I ran some tests on a
real nvme ssd but didn't get a steady result: sometimes it shows minor
improvements, sometimes it does not. My nvme ssd spec says 4k rand read
iops is 880k; maybe a faster nvme ssd is needed to test...
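
Roughly, the tests are 4k polled random reads against a registered file. A
minimal liburing sketch of that shape is below (not my exact test program:
the device path, queue depth and loop count are placeholders, offsets are
kept sequential for brevity, and most error handling is trimmed):

#define _GNU_SOURCE
#include <fcntl.h>
#include <liburing.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BS 4096

int main(void)
{
	struct io_uring_params p;
	struct io_uring ring;
	struct io_uring_cqe *cqe;
	struct io_uring_sqe *sqe;
	void *buf;
	int fd, i, ret;

	fd = open("/dev/nvme0n1", O_RDONLY | O_DIRECT);	/* placeholder device */
	if (fd < 0 || posix_memalign(&buf, BS, BS))
		return 1;

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_IOPOLL;			/* completions are polled */
	if (io_uring_queue_init_params(32, &ring, &p))
		return 1;
	/* register the fd so the fixed-file path in io_file_get() is taken */
	if (io_uring_register_files(&ring, &fd, 1))
		return 1;

	for (i = 0; i < 1024; i++) {
		sqe = io_uring_get_sqe(&ring);
		/* with IOSQE_FIXED_FILE the fd argument is an index into the registered table */
		io_uring_prep_read(sqe, 0, buf, BS, (unsigned long long)i * BS);
		sqe->flags |= IOSQE_FIXED_FILE;

		io_uring_submit(&ring);
		ret = io_uring_wait_cqe(&ring, &cqe);	/* spins in IOPOLL mode */
		if (ret || cqe->res != BS) {
			fprintf(stderr, "read %d: ret %d res %d\n", i, ret, ret ? 0 : cqe->res);
			break;
		}
		io_uring_cqe_seen(&ring, cqe);
	}

	io_uring_queue_exit(&ring);
	return 0;
}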

Regards,
Xiaoguang Wang



* Re: [PATCH 5.11 2/2] io_uring: don't take percpu_ref operations for registered files in IOPOLL mode
  2020-11-17 10:43   ` Pavel Begunkov
  2020-11-17 16:21     ` Xiaoguang Wang
@ 2020-11-17 16:30     ` Jens Axboe
  2020-11-17 16:58       ` Pavel Begunkov
  1 sibling, 1 reply; 14+ messages in thread
From: Jens Axboe @ 2020-11-17 16:30 UTC (permalink / raw)
  To: Pavel Begunkov, Xiaoguang Wang, io-uring; +Cc: joseph.qi

On 11/17/20 3:43 AM, Pavel Begunkov wrote:
> On 17/11/2020 06:17, Xiaoguang Wang wrote:
>> In io_file_get() and io_put_file(), currently we use percpu_ref_get() and
>> percpu_ref_put() for registered files, but it's hard to say they're very
>> light-weight synchronization primitives. In one our x86 machine, I get below
>> perf data(registered files enabled):
>> Samples: 480K of event 'cycles', Event count (approx.): 298552867297
>> Overhead  Comman  Shared Object     Symbol
>>    0.45%  :53243  [kernel.vmlinux]  [k] io_file_get
> 
> Do you have throughput/latency numbers? In my experience for polling for
> such small overheads all CPU cycles you win earlier in the stack will be
> just burned on polling, because it would still wait for the same fixed*
> time for the next response by device. fixed* here means post-factum but
> still mostly independent of how your host machine behaves. 

That's only true if you can max out the device with a single core.
Freeing any cycles directly translates into a performance win otherwise,
if your device isn't the bottleneck. For the high performance testing
I've done, the actual polling isn't the bottleneck, it's the rest of the
stack.

-- 
Jens Axboe



* Re: [PATCH 5.11 2/2] io_uring: don't take percpu_ref operations for registered files in IOPOLL mode
  2020-11-17 16:21     ` Xiaoguang Wang
@ 2020-11-17 16:42       ` Pavel Begunkov
  0 siblings, 0 replies; 14+ messages in thread
From: Pavel Begunkov @ 2020-11-17 16:42 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: axboe, joseph.qi

On 17/11/2020 16:21, Xiaoguang Wang wrote:
> hi,
> 
>> On 17/11/2020 06:17, Xiaoguang Wang wrote:
>>> In io_file_get() and io_put_file(), currently we use percpu_ref_get() and
>>> percpu_ref_put() for registered files, but it's hard to say they're very
>>> light-weight synchronization primitives. In one our x86 machine, I get below
>>> perf data(registered files enabled):
>>> Samples: 480K of event 'cycles', Event count (approx.): 298552867297
>>> Overhead  Comman  Shared Object     Symbol
>>>     0.45%  :53243  [kernel.vmlinux]  [k] io_file_get
>>
>> Do you have throughput/latency numbers? In my experience for polling for
>> such small overheads all CPU cycles you win earlier in the stack will be
>> just burned on polling, because it would still wait for the same fixed*
>> time for the next response by device. fixed* here means post-factum but
>> still mostly independent of how your host machine behaves.
>>
>> e.g. you kick io_uring at a moment K, at device responses at a moment
>> K+M. And regardless of what you do in io_uring, the event won't complete
>> earlier than after M.
> I'm not sure this assumption is correct for real device. IO requests can be
> completed in any time, seems that there isn't so-called fixed* time. Say
> we're submitting sqe-2 and sqe-1 has been issued to device, the sooner we finish
> submitting sqe-2, the sooner and better we start to poll sqe-2 and sqe-2 can be
> completed timely.

Definitely, that's what I meant by "That's not the whole story". There are
even several patterns of how polling is used in general:

- trying to poll prior to completions to reduce latency
- polling as a means to coalesce, reduce IRQ overhead, etc.

That's why I asked for numbers, to see whether you get anything from it :)

> 
>>
>> That's not the whole story, but as you penalising non-IOPOLL and complicate
>> it, I just want to confirm that you really get any extra performance from it.
>> Moreover, your drop (0.45%->0.25%) is just 0.20%, and it's comparable with
>> the rest of the function (left 0.25%), that's just a dozen of instructions.
> I agree that this improvement is minor, and will penalise non-IOPOLL a bit, so I'm
> very ok that we drop this patchset.
> 
> Here I'd like to have some explanations why I submitted such patch set.
> I found in some our arm machine, whose computing power is not that strong,
> io_uring(sqpoll and iopoll enabled) even couldn't achieve the capacity of
> nvme ssd(but spdk can), so I tried to reduce extral overhead in IOPOLL mode.
> Indeed there're are many factors which will influence io performance, not just
> io_uring framework, such as block-layer merge operations, various io statistics, etc.
> 
> Sometimes I even think whether there should be a light io_uring mainly foucs
> on iopoll mode, in which it works like in one kernel task context, then we may get
> rid of many atomic operations, memory-barrier, etc. I wonder whether we can
> provide a high performance io stack based on io_uring, which will stand shoulder
> to shoulder with spdk.
> 
> As for the throughput/latency numbers for this patch set, I tried to have
> some tests in a real nvme ssd, but don't get a steady resule, sometimes it
> shows minor improvements, sometimes it does not. My nvme ssd spec says 4k
> rand read iops is 880k, maybe needs a higher nvme ssd to test...

Did you tune your host machine for consistency? Pinning threads, fixing CPU
clocks in place, setting priorities, etc. How minor are your improvements?
I'm not asking you to disclose actual numbers, but there is a huge
difference between a 0.0000001% throughput gain and a visible 1%.
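
For the thread part, even something as small as the following, done from
inside the test program itself, helps with run-to-run variance (a Linux-only
sketch with arbitrary values; SCHED_FIFO needs root/CAP_SYS_NICE):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

/*
 * Pin the calling thread to one CPU and give it a real-time priority,
 * so run-to-run variance comes from the device rather than the scheduler.
 */
static int pin_and_prioritise(int cpu)
{
	cpu_set_t mask;
	struct sched_param sp = { .sched_priority = 1 };

	CPU_ZERO(&mask);
	CPU_SET(cpu, &mask);
	if (sched_setaffinity(0, sizeof(mask), &mask))
		return -1;
	return sched_setscheduler(0, SCHED_FIFO, &sp);
}

int main(void)
{
	if (pin_and_prioritise(2))	/* CPU 2 is an arbitrary choice */
		perror("pin_and_prioritise");
	/* ... run the benchmark loop here ... */
	return 0;
}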

> 
> Regards,
> Xiaoguang Wang
> 
>>
>>>
>>> Currently I don't find any good and generic solution for this issue, but
>>> in IOPOLL mode, given that we can always ensure get/put registered files
>>> under uring_lock, we can use a simple and plain u64 counter to synchronize
>>> with registered files update operations in __io_sqe_files_update().
>>>
>>> With this patch, perf data show shows:
>>> Samples: 480K of event 'cycles', Event count (approx.): 298811248406
>>> Overhead  Comma  Shared Object     Symbol
>>>     0.25%  :4182  [kernel.vmlinux]  [k] io_file_get
>>>
>>> Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
>>> ---
>>>   fs/io_uring.c | 40 ++++++++++++++++++++++++++++++++++------
>>>   1 file changed, 34 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>> index 219609c38e48..0fa48ea50ff9 100644
>>> --- a/fs/io_uring.c
>>> +++ b/fs/io_uring.c
>>> @@ -201,6 +201,11 @@ struct fixed_file_table {
>>>     struct fixed_file_ref_node {
>>>       struct percpu_ref        refs;
>>> +    /*
>>> +     * Track the number of reqs that reference this node, currently it's
>>> +     * only used in IOPOLL mode.
>>> +     */
>>> +    u64                count;
>>>       struct list_head        node;
>>>       struct list_head        file_list;
>>>       struct fixed_file_data        *file_data;
>>> @@ -1926,10 +1931,17 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
>>>   static inline void io_put_file(struct io_kiocb *req, struct file *file,
>>>                 bool fixed)
>>>   {
>>> -    if (fixed)
>>> -        percpu_ref_put(&req->ref_node->refs);
>>> -    else
>>> +    if (fixed) {
>>> +        /* See same comments in io_file_get(). */
>>> +        if (req->ctx->flags & IORING_SETUP_IOPOLL) {
>>> +            if (!--req->ref_node->count)
>>> +                percpu_ref_kill(&req->ref_node->refs);
>>> +        } else {
>>> +            percpu_ref_put(&req->ref_node->refs);
>>> +        }
>>> +    } else {
>>>           fput(file);
>>> +    }
>>>   }
>>>     static void io_dismantle_req(struct io_kiocb *req)
>>> @@ -6344,8 +6356,16 @@ static struct file *io_file_get(struct io_submit_state *state,
>>>           fd = array_index_nospec(fd, ctx->nr_user_files);
>>>           file = io_file_from_index(ctx, fd);
>>>           if (file) {
>>> +            /*
>>> +             * IOPOLL mode can always ensure get/put registered files under uring_lock,
>>> +             * so we can use a simple plain u64 counter to synchronize with registered
>>> +             * files update operations in __io_sqe_files_update.
>>> +             */
>>>               req->ref_node = ctx->file_data->node;
>>> -            percpu_ref_get(&req->ref_node->refs);
>>> +            if (ctx->flags & IORING_SETUP_IOPOLL)
>>> +                req->ref_node->count++;
>>> +            else
>>> +                percpu_ref_get(&req->ref_node->refs);
>>>           }
>>>       } else {
>>>           trace_io_uring_file_get(ctx, fd);
>>> @@ -7215,7 +7235,12 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
>>>           ref_node = list_first_entry(&data->ref_list,
>>>                   struct fixed_file_ref_node, node);
>>>       spin_unlock(&data->lock);
>>> -    if (ref_node)
>>> +    /*
>>> +     * If count is not zero, that means we're in IOPOLL mode, and there are
>>> +     * still reqs that reference this ref_node, let the final req do the
>>> +     * percpu_ref_kill job.
>>> +     */
>>> +    if (ref_node && (!--ref_node->count))
>>>           percpu_ref_kill(&ref_node->refs);
>>>         percpu_ref_kill(&data->refs);
>>> @@ -7625,6 +7650,7 @@ static struct fixed_file_ref_node *alloc_fixed_file_ref_node(
>>>       INIT_LIST_HEAD(&ref_node->node);
>>>       INIT_LIST_HEAD(&ref_node->file_list);
>>>       ref_node->file_data = ctx->file_data;
>>> +    ref_node->count = 1;
>>>       return ref_node;
>>>   }
>>>   @@ -7877,7 +7903,9 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
>>>       }
>>>         if (needs_switch) {
>>> -        percpu_ref_kill(&data->node->refs);
>>> +        /* See same comments in io_sqe_files_unregister(). */
>>> +        if (!--data->node->count)
>>> +            percpu_ref_kill(&data->node->refs);
>>>           spin_lock(&data->lock);
>>>           list_add(&ref_node->node, &data->ref_list);
>>>           data->node = ref_node;
>>>
>>

-- 
Pavel Begunkov


* Re: [PATCH 5.11 2/2] io_uring: don't take percpu_ref operations for registered files in IOPOLL mode
  2020-11-17 16:30     ` Jens Axboe
@ 2020-11-17 16:58       ` Pavel Begunkov
  2020-11-18  1:42         ` Jens Axboe
  0 siblings, 1 reply; 14+ messages in thread
From: Pavel Begunkov @ 2020-11-17 16:58 UTC (permalink / raw)
  To: Jens Axboe, Xiaoguang Wang, io-uring; +Cc: joseph.qi

On 17/11/2020 16:30, Jens Axboe wrote:
> On 11/17/20 3:43 AM, Pavel Begunkov wrote:
>> On 17/11/2020 06:17, Xiaoguang Wang wrote:
>>> In io_file_get() and io_put_file(), currently we use percpu_ref_get() and
>>> percpu_ref_put() for registered files, but it's hard to say they're very
>>> light-weight synchronization primitives. In one our x86 machine, I get below
>>> perf data(registered files enabled):
>>> Samples: 480K of event 'cycles', Event count (approx.): 298552867297
>>> Overhead  Comman  Shared Object     Symbol
>>>    0.45%  :53243  [kernel.vmlinux]  [k] io_file_get
>>
>> Do you have throughput/latency numbers? In my experience for polling for
>> such small overheads all CPU cycles you win earlier in the stack will be
>> just burned on polling, because it would still wait for the same fixed*
>> time for the next response by device. fixed* here means post-factum but
>> still mostly independent of how your host machine behaves. 
> 
> That's only true if you can max out the device with a single core.
> Freeing any cycles directly translate into a performance win otherwise,
> if your device isn't the bottleneck. For the high performance testing

Agreed, that's what happens if a host can't keep up with a device, or e.g.
in case 2 of my other reply. Why don't you mention throwing many cores at a
single SSD with many (poll) queues?

> I've done, the actual polling isn't the bottleneck, it's the rest of the
> stack.
> 

-- 
Pavel Begunkov


* Re: [PATCH 5.11 2/2] io_uring: don't take percpu_ref operations for registered files in IOPOLL mode
  2020-11-17 16:58       ` Pavel Begunkov
@ 2020-11-18  1:42         ` Jens Axboe
  2020-11-18 13:59           ` Pavel Begunkov
  0 siblings, 1 reply; 14+ messages in thread
From: Jens Axboe @ 2020-11-18  1:42 UTC (permalink / raw)
  To: Pavel Begunkov, Xiaoguang Wang, io-uring; +Cc: joseph.qi

On 11/17/20 9:58 AM, Pavel Begunkov wrote:
> On 17/11/2020 16:30, Jens Axboe wrote:
>> On 11/17/20 3:43 AM, Pavel Begunkov wrote:
>>> On 17/11/2020 06:17, Xiaoguang Wang wrote:
>>>> In io_file_get() and io_put_file(), currently we use percpu_ref_get() and
>>>> percpu_ref_put() for registered files, but it's hard to say they're very
>>>> light-weight synchronization primitives. In one our x86 machine, I get below
>>>> perf data(registered files enabled):
>>>> Samples: 480K of event 'cycles', Event count (approx.): 298552867297
>>>> Overhead  Comman  Shared Object     Symbol
>>>>    0.45%  :53243  [kernel.vmlinux]  [k] io_file_get
>>>
>>> Do you have throughput/latency numbers? In my experience for polling for
>>> such small overheads all CPU cycles you win earlier in the stack will be
>>> just burned on polling, because it would still wait for the same fixed*
>>> time for the next response by device. fixed* here means post-factum but
>>> still mostly independent of how your host machine behaves. 
>>
>> That's only true if you can max out the device with a single core.
>> Freeing any cycles directly translate into a performance win otherwise,
>> if your device isn't the bottleneck. For the high performance testing
> 
> Agree, that's what happens if a host can't keep up with a device, or e.g.

Right, and it's a direct measure of the efficiency. Moving cycles _to_
polling is a good thing! It means that the rest of the stack got more
efficient. And if the device is fast enough, then that'll directly
result in higher peak IOPS and lower latencies.

> in case 2. of my other reply. Why don't you mention throwing many-cores
> into a single many (poll) queue SSD?

Not really relevant imho, you can obviously always increase performance
if you are core-limited by utilizing multiple cores.

I haven't tested these patches yet, will try and see if I get some time
to do so tomorrow.

-- 
Jens Axboe



* Re: [PATCH 5.11 2/2] io_uring: don't take percpu_ref operations for registered files in IOPOLL mode
  2020-11-18  1:42         ` Jens Axboe
@ 2020-11-18 13:59           ` Pavel Begunkov
  2020-11-18 14:59             ` Jens Axboe
  0 siblings, 1 reply; 14+ messages in thread
From: Pavel Begunkov @ 2020-11-18 13:59 UTC (permalink / raw)
  To: Jens Axboe, Xiaoguang Wang, io-uring; +Cc: joseph.qi

On 18/11/2020 01:42, Jens Axboe wrote:
> On 11/17/20 9:58 AM, Pavel Begunkov wrote:
>> On 17/11/2020 16:30, Jens Axboe wrote:
>>> On 11/17/20 3:43 AM, Pavel Begunkov wrote:
>>>> On 17/11/2020 06:17, Xiaoguang Wang wrote:
>>>>> In io_file_get() and io_put_file(), currently we use percpu_ref_get() and
>>>>> percpu_ref_put() for registered files, but it's hard to say they're very
>>>>> light-weight synchronization primitives. In one our x86 machine, I get below
>>>>> perf data(registered files enabled):
>>>>> Samples: 480K of event 'cycles', Event count (approx.): 298552867297
>>>>> Overhead  Comman  Shared Object     Symbol
>>>>>    0.45%  :53243  [kernel.vmlinux]  [k] io_file_get
>>>>
>>>> Do you have throughput/latency numbers? In my experience for polling for
>>>> such small overheads all CPU cycles you win earlier in the stack will be
>>>> just burned on polling, because it would still wait for the same fixed*
>>>> time for the next response by device. fixed* here means post-factum but
>>>> still mostly independent of how your host machine behaves. 
>>>
>>> That's only true if you can max out the device with a single core.
>>> Freeing any cycles directly translate into a performance win otherwise,
>>> if your device isn't the bottleneck. For the high performance testing
>>
>> Agree, that's what happens if a host can't keep up with a device, or e.g.
> 
> Right, and it's a direct measure of the efficiency. Moving cycles _to_
> polling is a good thing! It means that the rest of the stack got more

Absolutely, but the patch makes the code a bit more complex and adds some
overhead to the non-iopoll path. Definitely not huge, but the shown
overhead reduction (i.e. 0.20%) doesn't do much either; compared with the
remaining 0.25% it costs just a couple of instructions.

And that's why I wanted to see if there is any real visible impact.

> efficient. And if the device is fast enough, then that'll directly
> result in higher peak IOPS and lower latencies.
> 
>> in case 2. of my other reply. Why don't you mention throwing many-cores
>> into a single many (poll) queue SSD?
> 
> Not really relevant imho, you can obviously always increase performance
> if you are core limited by utilizing multiple cores. 
> 
> I haven't tested these patches yet, will try and see if I get some time
> to do so tomorrow.

Great

-- 
Pavel Begunkov


* Re: [PATCH 5.11 2/2] io_uring: don't take percpu_ref operations for registered files in IOPOLL mode
  2020-11-18 13:59           ` Pavel Begunkov
@ 2020-11-18 14:59             ` Jens Axboe
  2020-11-18 15:36               ` Xiaoguang Wang
  0 siblings, 1 reply; 14+ messages in thread
From: Jens Axboe @ 2020-11-18 14:59 UTC (permalink / raw)
  To: Pavel Begunkov, Xiaoguang Wang, io-uring; +Cc: joseph.qi

On 11/18/20 6:59 AM, Pavel Begunkov wrote:
> On 18/11/2020 01:42, Jens Axboe wrote:
>> On 11/17/20 9:58 AM, Pavel Begunkov wrote:
>>> On 17/11/2020 16:30, Jens Axboe wrote:
>>>> On 11/17/20 3:43 AM, Pavel Begunkov wrote:
>>>>> On 17/11/2020 06:17, Xiaoguang Wang wrote:
>>>>>> In io_file_get() and io_put_file(), currently we use percpu_ref_get() and
>>>>>> percpu_ref_put() for registered files, but it's hard to say they're very
>>>>>> light-weight synchronization primitives. In one our x86 machine, I get below
>>>>>> perf data(registered files enabled):
>>>>>> Samples: 480K of event 'cycles', Event count (approx.): 298552867297
>>>>>> Overhead  Comman  Shared Object     Symbol
>>>>>>    0.45%  :53243  [kernel.vmlinux]  [k] io_file_get
>>>>>
>>>>> Do you have throughput/latency numbers? In my experience for polling for
>>>>> such small overheads all CPU cycles you win earlier in the stack will be
>>>>> just burned on polling, because it would still wait for the same fixed*
>>>>> time for the next response by device. fixed* here means post-factum but
>>>>> still mostly independent of how your host machine behaves. 
>>>>
>>>> That's only true if you can max out the device with a single core.
>>>> Freeing any cycles directly translate into a performance win otherwise,
>>>> if your device isn't the bottleneck. For the high performance testing
>>>
>>> Agree, that's what happens if a host can't keep up with a device, or e.g.
>>
>> Right, and it's a direct measure of the efficiency. Moving cycles _to_
>> polling is a good thing! It means that the rest of the stack got more
> 
> Absolutely, but the patch makes code a bit more complex and adds some
> overhead for non-iopoll path, definitely not huge, but the showed overhead
> reduction (i.e. 0.20%) doesn't do much either. Comparing with left 0.25%
> it costs just a couple of instructions.
> 
> And that's why I wanted to see if there is any real visible impact.

Definitely, it's always a tradeoff between the size of the win and
complexity and other factors. Especially adding to io_kiocb is a big
negative in my book.

>> efficient. And if the device is fast enough, then that'll directly
>> result in higher peak IOPS and lower latencies.
>>
>>> in case 2. of my other reply. Why don't you mention throwing many-cores
>>> into a single many (poll) queue SSD?
>>
>> Not really relevant imho, you can obviously always increase performance
>> if you are core limited by utilizing multiple cores. 
>>
>> I haven't tested these patches yet, will try and see if I get some time
>> to do so tomorrow.
> 
> Great

Ran it through the polled testing which is core limited, and I didn't
see any changes...

-- 
Jens Axboe



* Re: [PATCH 5.11 2/2] io_uring: don't take percpu_ref operations for registered files in IOPOLL mode
  2020-11-18 14:59             ` Jens Axboe
@ 2020-11-18 15:36               ` Xiaoguang Wang
  2020-11-18 15:52                 ` Pavel Begunkov
  0 siblings, 1 reply; 14+ messages in thread
From: Xiaoguang Wang @ 2020-11-18 15:36 UTC (permalink / raw)
  To: Jens Axboe, Pavel Begunkov, io-uring; +Cc: joseph.qi

hi,

> On 11/18/20 6:59 AM, Pavel Begunkov wrote:
>> On 18/11/2020 01:42, Jens Axboe wrote:
>>> On 11/17/20 9:58 AM, Pavel Begunkov wrote:
>>>> On 17/11/2020 16:30, Jens Axboe wrote:
>>>>> On 11/17/20 3:43 AM, Pavel Begunkov wrote:
>>>>>> On 17/11/2020 06:17, Xiaoguang Wang wrote:
>>>>>>> In io_file_get() and io_put_file(), currently we use percpu_ref_get() and
>>>>>>> percpu_ref_put() for registered files, but it's hard to say they're very
>>>>>>> light-weight synchronization primitives. In one our x86 machine, I get below
>>>>>>> perf data(registered files enabled):
>>>>>>> Samples: 480K of event 'cycles', Event count (approx.): 298552867297
>>>>>>> Overhead  Comman  Shared Object     Symbol
>>>>>>>     0.45%  :53243  [kernel.vmlinux]  [k] io_file_get
>>>>>>
>>>>>> Do you have throughput/latency numbers? In my experience for polling for
>>>>>> such small overheads all CPU cycles you win earlier in the stack will be
>>>>>> just burned on polling, because it would still wait for the same fixed*
>>>>>> time for the next response by device. fixed* here means post-factum but
>>>>>> still mostly independent of how your host machine behaves.
>>>>>
>>>>> That's only true if you can max out the device with a single core.
>>>>> Freeing any cycles directly translate into a performance win otherwise,
>>>>> if your device isn't the bottleneck. For the high performance testing
>>>>
>>>> Agree, that's what happens if a host can't keep up with a device, or e.g.
>>>
>>> Right, and it's a direct measure of the efficiency. Moving cycles _to_
>>> polling is a good thing! It means that the rest of the stack got more
>>
>> Absolutely, but the patch makes code a bit more complex and adds some
>> overhead for non-iopoll path, definitely not huge, but the showed overhead
>> reduction (i.e. 0.20%) doesn't do much either. Comparing with left 0.25%
>> it costs just a couple of instructions.
>>
>> And that's why I wanted to see if there is any real visible impact.
> 
> Definitely, it's always a tradeoff between the size of the win and
> complexity and other factors. Especially adding to io_kiocb is a big
> negative in my book.
> 
>>> efficient. And if the device is fast enough, then that'll directly
>>> result in higher peak IOPS and lower latencies.
>>>
>>>> in case 2. of my other reply. Why don't you mention throwing many-cores
>>>> into a single many (poll) queue SSD?
>>>
>>> Not really relevant imho, you can obviously always increase performance
>>> if you are core limited by utilizing multiple cores.
>>>
>>> I haven't tested these patches yet, will try and see if I get some time
>>> to do so tomorrow.
>>
>> Great
> 
> Ran it through the polled testing which is core limited, and I didn't
> see any changes...
Jens and Pavel, sorry for the noise...
I also ran some tests today. On the upstream kernel I don't see any changes
either, but on our internal 4.19 kernel I got a steady iops improvement of
about 1%, and that kernel doesn't include Ming Lei's patch "2b0d3d3e4fcf
percpu_ref: reduce memory footprint of percpu_ref in fast path".

Regards,
Xiaoguang Wang

> 


* Re: [PATCH 5.11 2/2] io_uring: don't take percpu_ref operations for registered files in IOPOLL mode
  2020-11-18 15:36               ` Xiaoguang Wang
@ 2020-11-18 15:52                 ` Pavel Begunkov
  2020-11-18 15:57                   ` Jens Axboe
  0 siblings, 1 reply; 14+ messages in thread
From: Pavel Begunkov @ 2020-11-18 15:52 UTC (permalink / raw)
  To: Xiaoguang Wang, Jens Axboe, io-uring; +Cc: joseph.qi

On 18/11/2020 15:36, Xiaoguang Wang wrote:
>> On 11/18/20 6:59 AM, Pavel Begunkov wrote:
>> Ran it through the polled testing which is core limited, and I didn't
>> see any changes...
> Jens and Pavel, sorry for the noise...

Not at all, it's great that you're trying all means to improve performance;
some just aren't worth the effort.

> I also have some tests today, in upstream kernel, I also don't see any changes,
> but in our internal 4.19 kernel, I got a steady about 1% iops improvement, and

hmm, 1% is actually a good result

> our kernel don't include Ming Lei's patch "2b0d3d3e4fcf percpu_ref: reduce memory
> footprint of percpu_ref in fast path".

-- 
Pavel Begunkov


* Re: [PATCH 5.11 2/2] io_uring: don't take percpu_ref operations for registered files in IOPOLL mode
  2020-11-18 15:52                 ` Pavel Begunkov
@ 2020-11-18 15:57                   ` Jens Axboe
  0 siblings, 0 replies; 14+ messages in thread
From: Jens Axboe @ 2020-11-18 15:57 UTC (permalink / raw)
  To: Pavel Begunkov, Xiaoguang Wang, io-uring; +Cc: joseph.qi

On 11/18/20 8:52 AM, Pavel Begunkov wrote:
> On 18/11/2020 15:36, Xiaoguang Wang wrote:
>>> On 11/18/20 6:59 AM, Pavel Begunkov wrote:
>>> Ran it through the polled testing which is core limited, and I didn't
>>> see any changes...
>> Jens and Pavel, sorry for the noise...
> 
> Not at all, that's great you're trying all means to improve
> performance, some are just don't worth the effort.

Exactly, never stop trying, it's just that not all efforts pan out.
That's pretty typical for software development :-)

>> I also have some tests today, in upstream kernel, I also don't see
>> any changes, but in our internal 4.19 kernel, I got a steady about 1%
>> iops improvement, and
> 
> hmm, 1% is actually a good result

It is, I wonder why there's such a big discrepancy between the 4.19 base
and current -git in terms of the win on that. Might be changes outside
of io_uring.

-- 
Jens Axboe


