All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] block: optimise bvec_iter_advance()
       [not found] <cover.1574974574.git.asml.silence@gmail.com>
@ 2019-11-28 21:04 ` Pavel Begunkov
  2019-11-29 22:17   ` Arvind Sankar
  0 siblings, 1 reply; 9+ messages in thread
From: Pavel Begunkov @ 2019-11-28 21:04 UTC (permalink / raw)
  To: Jens Axboe, Ming Lei, linux-block, linux-kernel

bvec_iter_advance() is quite popular, but compilers fail to do proper
alias analysis and optimise it good enough. The assembly is checked
for gcc 9.2, x86-64.

- remove @iter->bi_size from min(...), as it's always less than @bytes.
Modify at the beginning and forget about it.

- the compiler isn't able to collapse memory dependencies and remove
writes in the loop. Help it by explicitely using local vars.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 include/linux/bvec.h | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index a032f01e928c..7b2f05faae14 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -87,26 +87,31 @@ struct bvec_iter_all {
 static inline bool bvec_iter_advance(const struct bio_vec *bv,
 		struct bvec_iter *iter, unsigned bytes)
 {
+	unsigned int done = iter->bi_bvec_done;
+	unsigned int idx = iter->bi_idx;
+
 	if (WARN_ONCE(bytes > iter->bi_size,
 		     "Attempted to advance past end of bvec iter\n")) {
 		iter->bi_size = 0;
 		return false;
 	}
 
+	iter->bi_size -= bytes;
+
 	while (bytes) {
-		const struct bio_vec *cur = bv + iter->bi_idx;
-		unsigned len = min3(bytes, iter->bi_size,
-				    cur->bv_len - iter->bi_bvec_done);
+		const struct bio_vec *cur = bv + idx;
+		unsigned int len = min(bytes, cur->bv_len - done);
 
 		bytes -= len;
-		iter->bi_size -= len;
-		iter->bi_bvec_done += len;
-
-		if (iter->bi_bvec_done == cur->bv_len) {
-			iter->bi_bvec_done = 0;
-			iter->bi_idx++;
+		done += len;
+		if (done == cur->bv_len) {
+			idx++;
+			done = 0;
 		}
 	}
+
+	iter->bi_idx = idx;
+	iter->bi_bvec_done = done;
 	return true;
 }
 
-- 
2.24.0


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH] block: optimise bvec_iter_advance()
  2019-11-28 21:04 ` [PATCH] block: optimise bvec_iter_advance() Pavel Begunkov
@ 2019-11-29 22:17   ` Arvind Sankar
  2019-11-29 22:47     ` Pavel Begunkov
  0 siblings, 1 reply; 9+ messages in thread
From: Arvind Sankar @ 2019-11-29 22:17 UTC (permalink / raw)
  To: Pavel Begunkov; +Cc: Jens Axboe, Ming Lei, linux-block, linux-kernel

On Fri, Nov 29, 2019 at 12:04:37AM +0300, Pavel Begunkov wrote:
> bvec_iter_advance() is quite popular, but compilers fail to do proper
> alias analysis and optimise it good enough. The assembly is checked
> for gcc 9.2, x86-64.
> 
> - remove @iter->bi_size from min(...), as it's always less than @bytes.
> Modify at the beginning and forget about it.
> 
> - the compiler isn't able to collapse memory dependencies and remove
> writes in the loop. Help it by explicitely using local vars.
> 
> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
> ---
>  include/linux/bvec.h | 23 ++++++++++++++---------
>  1 file changed, 14 insertions(+), 9 deletions(-)
> 
> diff --git a/include/linux/bvec.h b/include/linux/bvec.h
> index a032f01e928c..7b2f05faae14 100644
> --- a/include/linux/bvec.h
> +++ b/include/linux/bvec.h
> @@ -87,26 +87,31 @@ struct bvec_iter_all {
>  static inline bool bvec_iter_advance(const struct bio_vec *bv,
>  		struct bvec_iter *iter, unsigned bytes)
>  {
> +	unsigned int done = iter->bi_bvec_done;
> +	unsigned int idx = iter->bi_idx;
> +
>  	if (WARN_ONCE(bytes > iter->bi_size,
>  		     "Attempted to advance past end of bvec iter\n")) {
>  		iter->bi_size = 0;
>  		return false;
>  	}
>  
> +	iter->bi_size -= bytes;
> +
>  	while (bytes) {
> -		const struct bio_vec *cur = bv + iter->bi_idx;
> -		unsigned len = min3(bytes, iter->bi_size,
> -				    cur->bv_len - iter->bi_bvec_done);
> +		const struct bio_vec *cur = bv + idx;
> +		unsigned int len = min(bytes, cur->bv_len - done);
>  
>  		bytes -= len;
> -		iter->bi_size -= len;
> -		iter->bi_bvec_done += len;
> -
> -		if (iter->bi_bvec_done == cur->bv_len) {
> -			iter->bi_bvec_done = 0;
> -			iter->bi_idx++;
> +		done += len;
> +		if (done == cur->bv_len) {
> +			idx++;
> +			done = 0;
>  		}
>  	}
> +
> +	iter->bi_idx = idx;
> +	iter->bi_bvec_done = done;
>  	return true;
>  }
>  
> -- 
> 2.24.0
> 

The loop can be simplified a bit further, as done has to be 0 once we go
beyond the current bio_vec. See below for the simplified version.

I also check if bi_size became zero so we can skip the rest of the
calculations in that case. If we want to preserve the current behavior of
updating iter->bi_idx and iter->bi_bvec_done even if bi_size is going to
become zero, the loop condition should change to

		while (bytes && bytes >= cur->bv_len)

to ensure that we don't try to access past the end of the bio_vec array.

diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index a032f01e928c..380d188dfbd2 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -87,25 +87,26 @@ struct bvec_iter_all {
 static inline bool bvec_iter_advance(const struct bio_vec *bv,
 		struct bvec_iter *iter, unsigned bytes)
 {
+	unsigned int idx = iter->bi_idx;
+	const struct bio_vec *cur = bv + idx;
+
 	if (WARN_ONCE(bytes > iter->bi_size,
 		     "Attempted to advance past end of bvec iter\n")) {
 		iter->bi_size = 0;
 		return false;
 	}
 
-	while (bytes) {
-		const struct bio_vec *cur = bv + iter->bi_idx;
-		unsigned len = min3(bytes, iter->bi_size,
-				    cur->bv_len - iter->bi_bvec_done);
-
-		bytes -= len;
-		iter->bi_size -= len;
-		iter->bi_bvec_done += len;
-
-		if (iter->bi_bvec_done == cur->bv_len) {
-			iter->bi_bvec_done = 0;
-			iter->bi_idx++;
+	iter->bi_size -= bytes;
+	if (iter->bi_size != 0) {
+		bytes += iter->bi_bvec_done;
+		while (bytes >= cur->bv_len) {
+			bytes -= cur->bv_len;
+			idx++;
+			cur++;
 		}
+
+		iter->bi_idx = idx;
+		iter->bi_bvec_done = bytes;
 	}
 	return true;
 }

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH] block: optimise bvec_iter_advance()
  2019-11-29 22:17   ` Arvind Sankar
@ 2019-11-29 22:47     ` Pavel Begunkov
  2019-11-29 23:24       ` Arvind Sankar
  0 siblings, 1 reply; 9+ messages in thread
From: Pavel Begunkov @ 2019-11-29 22:47 UTC (permalink / raw)
  To: Arvind Sankar; +Cc: Jens Axboe, Ming Lei, linux-block, linux-kernel


[-- Attachment #1.1: Type: text/plain, Size: 2375 bytes --]

On 30/11/2019 01:17, Arvind Sankar wrote:
> 
> The loop can be simplified a bit further, as done has to be 0 once we go
> beyond the current bio_vec. See below for the simplified version.
> 

Thanks for the suggestion! I thought about it, and decided to not
for several reasons. I prefer to not fine-tune and give compilers
more opportunity to do their job. And it's already fast enough with
modern architectures (MOVcc, complex addressing, etc).

Also need to consider code clarity and the fact, that this is inline,
so should be brief and register-friendly.


> I also check if bi_size became zero so we can skip the rest of the
> calculations in that case. If we want to preserve the current behavior of
> updating iter->bi_idx and iter->bi_bvec_done even if bi_size is going to
> become zero, the loop condition should change to
> 
> 		while (bytes && bytes >= cur->bv_len)

Probably, it's better to leave it in a consistent state. Shouldn't be
a problem, but never know when and who will screw it up. 

> 
> to ensure that we don't try to access past the end of the bio_vec array.
> 
> diff --git a/include/linux/bvec.h b/include/linux/bvec.h
> index a032f01e928c..380d188dfbd2 100644
> --- a/include/linux/bvec.h
> +++ b/include/linux/bvec.h
> @@ -87,25 +87,26 @@ struct bvec_iter_all {
>  static inline bool bvec_iter_advance(const struct bio_vec *bv,
>  		struct bvec_iter *iter, unsigned bytes)
>  {
> +	unsigned int idx = iter->bi_idx;
> +	const struct bio_vec *cur = bv + idx;
> +
>  	if (WARN_ONCE(bytes > iter->bi_size,
>  		     "Attempted to advance past end of bvec iter\n")) {
>  		iter->bi_size = 0;
>  		return false;
>  	}
>  
> -	while (bytes) {
> -		const struct bio_vec *cur = bv + iter->bi_idx;
> -		unsigned len = min3(bytes, iter->bi_size,
> -				    cur->bv_len - iter->bi_bvec_done);
> -
> -		bytes -= len;
> -		iter->bi_size -= len;
> -		iter->bi_bvec_done += len;
> -
> -		if (iter->bi_bvec_done == cur->bv_len) {
> -			iter->bi_bvec_done = 0;
> -			iter->bi_idx++;
> +	iter->bi_size -= bytes;
> +	if (iter->bi_size != 0) {
> +		bytes += iter->bi_bvec_done;
> +		while (bytes >= cur->bv_len) {
> +			bytes -= cur->bv_len;
> +			idx++;
> +			cur++;
>  		}
> +
> +		iter->bi_idx = idx;
> +		iter->bi_bvec_done = bytes;
>  	}
>  	return true;
>  }
> 

-- 
Pavel Begunkov


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] block: optimise bvec_iter_advance()
  2019-11-29 22:47     ` Pavel Begunkov
@ 2019-11-29 23:24       ` Arvind Sankar
  2019-11-30  9:22         ` Pavel Begunkov
  0 siblings, 1 reply; 9+ messages in thread
From: Arvind Sankar @ 2019-11-29 23:24 UTC (permalink / raw)
  To: Pavel Begunkov
  Cc: Arvind Sankar, Jens Axboe, Ming Lei, linux-block, linux-kernel

On Sat, Nov 30, 2019 at 01:47:16AM +0300, Pavel Begunkov wrote:
> On 30/11/2019 01:17, Arvind Sankar wrote:
> > 
> > The loop can be simplified a bit further, as done has to be 0 once we go
> > beyond the current bio_vec. See below for the simplified version.
> > 
> 
> Thanks for the suggestion! I thought about it, and decided to not
> for several reasons. I prefer to not fine-tune and give compilers
> more opportunity to do their job. And it's already fast enough with
> modern architectures (MOVcc, complex addressing, etc).
> 
> Also need to consider code clarity and the fact, that this is inline,
> so should be brief and register-friendly.
> 

It should be more register-friendly, as it uses fewer variables, and I
think it's easier to see what the loop is doing, i.e. that we advance
one bio_vec per iteration: in the existing code, it takes a bit of
thinking to see that we won't spend more than one iteration within the
same bio_vec.

I don't see this as fine-tuning, rather simplifying the code. I do agree
that it's not going to make much difference for performance of the loop
itself, as the most common case I think is that we either stay in the
current bio_vec or advance by one.

> 
> > I also check if bi_size became zero so we can skip the rest of the
> > calculations in that case. If we want to preserve the current behavior of
> > updating iter->bi_idx and iter->bi_bvec_done even if bi_size is going to
> > become zero, the loop condition should change to
> > 
> > 		while (bytes && bytes >= cur->bv_len)
> 
> Probably, it's better to leave it in a consistent state. Shouldn't be
> a problem, but never know when and who will screw it up. 
> 

The WARN_ONCE case does leave it inconsistent, though that's not
supposed to happen, so less of a pitfall there.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] block: optimise bvec_iter_advance()
  2019-11-29 23:24       ` Arvind Sankar
@ 2019-11-30  9:22         ` Pavel Begunkov
  2019-11-30 18:56           ` Arvind Sankar
  0 siblings, 1 reply; 9+ messages in thread
From: Pavel Begunkov @ 2019-11-30  9:22 UTC (permalink / raw)
  To: Arvind Sankar; +Cc: Jens Axboe, Ming Lei, linux-block, linux-kernel


[-- Attachment #1.1: Type: text/plain, Size: 2325 bytes --]

On 30/11/2019 02:24, Arvind Sankar wrote:
> On Sat, Nov 30, 2019 at 01:47:16AM +0300, Pavel Begunkov wrote:
>> On 30/11/2019 01:17, Arvind Sankar wrote:
>>>
>>> The loop can be simplified a bit further, as done has to be 0 once we go
>>> beyond the current bio_vec. See below for the simplified version.
>>>
>>
>> Thanks for the suggestion! I thought about it, and decided to not
>> for several reasons. I prefer to not fine-tune and give compilers
>> more opportunity to do their job. And it's already fast enough with
>> modern architectures (MOVcc, complex addressing, etc).
>>
>> Also need to consider code clarity and the fact, that this is inline,
>> so should be brief and register-friendly.
>>
> 
> It should be more register-friendly, as it uses fewer variables, and I
> think it's easier to see what the loop is doing, i.e. that we advance
> one bio_vec per iteration: in the existing code, it takes a bit of
> thinking to see that we won't spend more than one iteration within the
> same bio_vec.

Yeah, may be. It's more the matter of preference then. I don't think
it's simpler, and performance is entirely depends on a compiler and 
input. But, that's rather subjective and IMHO not worth of time.

Anyway, thanks for thinking this through!

> 
> I don't see this as fine-tuning, rather simplifying the code. I do agree
> that it's not going to make much difference for performance of the loop
> itself, as the most common case I think is that we either stay in the
> current bio_vec or advance by one.
> 
>>
>>> I also check if bi_size became zero so we can skip the rest of the
>>> calculations in that case. If we want to preserve the current behavior of
>>> updating iter->bi_idx and iter->bi_bvec_done even if bi_size is going to
>>> become zero, the loop condition should change to
>>>
>>> 		while (bytes && bytes >= cur->bv_len)
>>
>> Probably, it's better to leave it in a consistent state. Shouldn't be
>> a problem, but never know when and who will screw it up. 
>>
> 
> The WARN_ONCE case does leave it inconsistent, though that's not
> supposed to happen, so less of a pitfall there.
> 

But I hope, this WARN_ONCE won't ever happen, but I wouldn't be
suprised by code like:

last_page = (bv + iter->idx - 1)->page. 


-- 
Pavel Begunkov


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] block: optimise bvec_iter_advance()
  2019-11-30  9:22         ` Pavel Begunkov
@ 2019-11-30 18:56           ` Arvind Sankar
  2019-11-30 18:57             ` Jens Axboe
  0 siblings, 1 reply; 9+ messages in thread
From: Arvind Sankar @ 2019-11-30 18:56 UTC (permalink / raw)
  To: Pavel Begunkov
  Cc: Arvind Sankar, Jens Axboe, Ming Lei, linux-block, linux-kernel

On Sat, Nov 30, 2019 at 12:22:27PM +0300, Pavel Begunkov wrote:
> On 30/11/2019 02:24, Arvind Sankar wrote:
> > On Sat, Nov 30, 2019 at 01:47:16AM +0300, Pavel Begunkov wrote:
> >> On 30/11/2019 01:17, Arvind Sankar wrote:
> >>>
> >>> The loop can be simplified a bit further, as done has to be 0 once we go
> >>> beyond the current bio_vec. See below for the simplified version.
> >>>
> >>
> >> Thanks for the suggestion! I thought about it, and decided to not
> >> for several reasons. I prefer to not fine-tune and give compilers
> >> more opportunity to do their job. And it's already fast enough with
> >> modern architectures (MOVcc, complex addressing, etc).
> >>
> >> Also need to consider code clarity and the fact, that this is inline,
> >> so should be brief and register-friendly.
> >>
> > 
> > It should be more register-friendly, as it uses fewer variables, and I
> > think it's easier to see what the loop is doing, i.e. that we advance
> > one bio_vec per iteration: in the existing code, it takes a bit of
> > thinking to see that we won't spend more than one iteration within the
> > same bio_vec.
> 
> Yeah, may be. It's more the matter of preference then. I don't think
> it's simpler, and performance is entirely depends on a compiler and 
> input. But, that's rather subjective and IMHO not worth of time.
> 
> Anyway, thanks for thinking this through!
> 

You don't find listing 1 simpler than listing 2? It does save one
register, as it doesn't have to keep track of done independently from
bytes. This is always going to be the case unless the compiler can
eliminate done by transforming Listing 2 into Listing 1. Unfortunately,
even if it gets much smarter, it's unlikely to be able to do that,
because they're equivalent only if there is no overflow, so it would
need to know that bytes + iter->bi_bvec_done cannot overflow, and that
iter->bi_bvec_done must be smaller than cur->bv_len initially.

Listing 1:

	bytes += iter->bi_bvec_done;
	while (bytes) {
		const struct bio_vec *cur = bv + idx;

		if (bytes < cur->bv_len)
			break;
		bytes -= cur->bv_len;
		idx++;
	}

	iter->bi_idx = idx;
	iter->bi_bvec_done = bytes;

Listing 2:

	while (bytes) {
		const struct bio_vec *cur = bv + idx;
		unsigned int len = min(bytes, cur->bv_len - done);

		bytes -= len;
		done += len;
		if (done == cur->bv_len) {
			idx++;
			done = 0;
		}
	}

	iter->bi_idx = idx;
	iter->bi_bvec_done = done;

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] block: optimise bvec_iter_advance()
  2019-11-30 18:56           ` Arvind Sankar
@ 2019-11-30 18:57             ` Jens Axboe
  2019-11-30 20:11               ` Pavel Begunkov
  0 siblings, 1 reply; 9+ messages in thread
From: Jens Axboe @ 2019-11-30 18:57 UTC (permalink / raw)
  To: Arvind Sankar, Pavel Begunkov; +Cc: Ming Lei, linux-block, linux-kernel

On 11/30/19 10:56 AM, Arvind Sankar wrote:
> On Sat, Nov 30, 2019 at 12:22:27PM +0300, Pavel Begunkov wrote:
>> On 30/11/2019 02:24, Arvind Sankar wrote:
>>> On Sat, Nov 30, 2019 at 01:47:16AM +0300, Pavel Begunkov wrote:
>>>> On 30/11/2019 01:17, Arvind Sankar wrote:
>>>>>
>>>>> The loop can be simplified a bit further, as done has to be 0 once we go
>>>>> beyond the current bio_vec. See below for the simplified version.
>>>>>
>>>>
>>>> Thanks for the suggestion! I thought about it, and decided to not
>>>> for several reasons. I prefer to not fine-tune and give compilers
>>>> more opportunity to do their job. And it's already fast enough with
>>>> modern architectures (MOVcc, complex addressing, etc).
>>>>
>>>> Also need to consider code clarity and the fact, that this is inline,
>>>> so should be brief and register-friendly.
>>>>
>>>
>>> It should be more register-friendly, as it uses fewer variables, and I
>>> think it's easier to see what the loop is doing, i.e. that we advance
>>> one bio_vec per iteration: in the existing code, it takes a bit of
>>> thinking to see that we won't spend more than one iteration within the
>>> same bio_vec.
>>
>> Yeah, may be. It's more the matter of preference then. I don't think
>> it's simpler, and performance is entirely depends on a compiler and
>> input. But, that's rather subjective and IMHO not worth of time.
>>
>> Anyway, thanks for thinking this through!
>>
> 
> You don't find listing 1 simpler than listing 2? It does save one
> register, as it doesn't have to keep track of done independently from
> bytes. This is always going to be the case unless the compiler can
> eliminate done by transforming Listing 2 into Listing 1. Unfortunately,
> even if it gets much smarter, it's unlikely to be able to do that,
> because they're equivalent only if there is no overflow, so it would
> need to know that bytes + iter->bi_bvec_done cannot overflow, and that
> iter->bi_bvec_done must be smaller than cur->bv_len initially.
> 
> Listing 1:
> 
> 	bytes += iter->bi_bvec_done;
> 	while (bytes) {
> 		const struct bio_vec *cur = bv + idx;
> 
> 		if (bytes < cur->bv_len)
> 			break;
> 		bytes -= cur->bv_len;
> 		idx++;
> 	}
> 
> 	iter->bi_idx = idx;
> 	iter->bi_bvec_done = bytes;
> 
> Listing 2:
> 
> 	while (bytes) {
> 		const struct bio_vec *cur = bv + idx;
> 		unsigned int len = min(bytes, cur->bv_len - done);
> 
> 		bytes -= len;
> 		done += len;
> 		if (done == cur->bv_len) {
> 			idx++;
> 			done = 0;
> 		}
> 	}
> 
> 	iter->bi_idx = idx;
> 	iter->bi_bvec_done = done;

Have yet to take a closer look (and benchmark) and the patches and
the generated code, but fwiw I do agree that case #1 is easier to
read.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] block: optimise bvec_iter_advance()
  2019-11-30 18:57             ` Jens Axboe
@ 2019-11-30 20:11               ` Pavel Begunkov
  2019-11-30 20:56                 ` Jens Axboe
  0 siblings, 1 reply; 9+ messages in thread
From: Pavel Begunkov @ 2019-11-30 20:11 UTC (permalink / raw)
  To: Jens Axboe, Arvind Sankar; +Cc: linux-block, linux-kernel


[-- Attachment #1.1: Type: text/plain, Size: 3226 bytes --]

On 30/11/2019 21:57, Jens Axboe wrote:
> On 11/30/19 10:56 AM, Arvind Sankar wrote:
>> On Sat, Nov 30, 2019 at 12:22:27PM +0300, Pavel Begunkov wrote:
>>> On 30/11/2019 02:24, Arvind Sankar wrote:
>>>> On Sat, Nov 30, 2019 at 01:47:16AM +0300, Pavel Begunkov wrote:
>>>>> On 30/11/2019 01:17, Arvind Sankar wrote:
>>>>>>
>>>>>> The loop can be simplified a bit further, as done has to be 0 once we go
>>>>>> beyond the current bio_vec. See below for the simplified version.
>>>>>>
>>>>>
>>>>> Thanks for the suggestion! I thought about it, and decided to not
>>>>> for several reasons. I prefer to not fine-tune and give compilers
>>>>> more opportunity to do their job. And it's already fast enough with
>>>>> modern architectures (MOVcc, complex addressing, etc).
>>>>>
>>>>> Also need to consider code clarity and the fact, that this is inline,
>>>>> so should be brief and register-friendly.
>>>>>
>>>>
>>>> It should be more register-friendly, as it uses fewer variables, and I
>>>> think it's easier to see what the loop is doing, i.e. that we advance
>>>> one bio_vec per iteration: in the existing code, it takes a bit of
>>>> thinking to see that we won't spend more than one iteration within the
>>>> same bio_vec.
>>>
>>> Yeah, may be. It's more the matter of preference then. I don't think
>>> it's simpler, and performance is entirely depends on a compiler and
>>> input. But, that's rather subjective and IMHO not worth of time.
>>>
>>> Anyway, thanks for thinking this through!
>>>
>>
>> You don't find listing 1 simpler than listing 2? It does save one
>> register, as it doesn't have to keep track of done independently from
>> bytes. This is always going to be the case unless the compiler can
>> eliminate done by transforming Listing 2 into Listing 1. Unfortunately,
>> even if it gets much smarter, it's unlikely to be able to do that,
>> because they're equivalent only if there is no overflow, so it would
>> need to know that bytes + iter->bi_bvec_done cannot overflow, and that
>> iter->bi_bvec_done must be smaller than cur->bv_len initially.
>>
>> Listing 1:
>>
>>     bytes += iter->bi_bvec_done;
>>     while (bytes) {
>>         const struct bio_vec *cur = bv + idx;
>>
>>         if (bytes < cur->bv_len)
>>             break;
>>         bytes -= cur->bv_len;
>>         idx++;
>>     }
>>
>>     iter->bi_idx = idx;
>>     iter->bi_bvec_done = bytes;
>>
>> Listing 2:
>>
>>     while (bytes) {
>>         const struct bio_vec *cur = bv + idx;
>>         unsigned int len = min(bytes, cur->bv_len - done);
>>
>>         bytes -= len;
>>         done += len;
>>         if (done == cur->bv_len) {
>>             idx++;
>>             done = 0;
>>         }
>>     }
>>
>>     iter->bi_idx = idx;
>>     iter->bi_bvec_done = done;
> 
> Have yet to take a closer look (and benchmark) and the patches and
> the generated code, but fwiw I do agree that case #1 is easier to
> read.
> 
Ok, ok, I'm not keen on bike-shedding. I'll resend a simplified version

-- 
Pavel Begunkov


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] block: optimise bvec_iter_advance()
  2019-11-30 20:11               ` Pavel Begunkov
@ 2019-11-30 20:56                 ` Jens Axboe
  0 siblings, 0 replies; 9+ messages in thread
From: Jens Axboe @ 2019-11-30 20:56 UTC (permalink / raw)
  To: Pavel Begunkov, Arvind Sankar; +Cc: linux-block, linux-kernel

On 11/30/19 12:11 PM, Pavel Begunkov wrote:
> On 30/11/2019 21:57, Jens Axboe wrote:
>> On 11/30/19 10:56 AM, Arvind Sankar wrote:
>>> On Sat, Nov 30, 2019 at 12:22:27PM +0300, Pavel Begunkov wrote:
>>>> On 30/11/2019 02:24, Arvind Sankar wrote:
>>>>> On Sat, Nov 30, 2019 at 01:47:16AM +0300, Pavel Begunkov wrote:
>>>>>> On 30/11/2019 01:17, Arvind Sankar wrote:
>>>>>>>
>>>>>>> The loop can be simplified a bit further, as done has to be 0 once we go
>>>>>>> beyond the current bio_vec. See below for the simplified version.
>>>>>>>
>>>>>>
>>>>>> Thanks for the suggestion! I thought about it, and decided to not
>>>>>> for several reasons. I prefer to not fine-tune and give compilers
>>>>>> more opportunity to do their job. And it's already fast enough with
>>>>>> modern architectures (MOVcc, complex addressing, etc).
>>>>>>
>>>>>> Also need to consider code clarity and the fact, that this is inline,
>>>>>> so should be brief and register-friendly.
>>>>>>
>>>>>
>>>>> It should be more register-friendly, as it uses fewer variables, and I
>>>>> think it's easier to see what the loop is doing, i.e. that we advance
>>>>> one bio_vec per iteration: in the existing code, it takes a bit of
>>>>> thinking to see that we won't spend more than one iteration within the
>>>>> same bio_vec.
>>>>
>>>> Yeah, may be. It's more the matter of preference then. I don't think
>>>> it's simpler, and performance is entirely depends on a compiler and
>>>> input. But, that's rather subjective and IMHO not worth of time.
>>>>
>>>> Anyway, thanks for thinking this through!
>>>>
>>>
>>> You don't find listing 1 simpler than listing 2? It does save one
>>> register, as it doesn't have to keep track of done independently from
>>> bytes. This is always going to be the case unless the compiler can
>>> eliminate done by transforming Listing 2 into Listing 1. Unfortunately,
>>> even if it gets much smarter, it's unlikely to be able to do that,
>>> because they're equivalent only if there is no overflow, so it would
>>> need to know that bytes + iter->bi_bvec_done cannot overflow, and that
>>> iter->bi_bvec_done must be smaller than cur->bv_len initially.
>>>
>>> Listing 1:
>>>
>>>      bytes += iter->bi_bvec_done;
>>>      while (bytes) {
>>>          const struct bio_vec *cur = bv + idx;
>>>
>>>          if (bytes < cur->bv_len)
>>>              break;
>>>          bytes -= cur->bv_len;
>>>          idx++;
>>>      }
>>>
>>>      iter->bi_idx = idx;
>>>      iter->bi_bvec_done = bytes;
>>>
>>> Listing 2:
>>>
>>>      while (bytes) {
>>>          const struct bio_vec *cur = bv + idx;
>>>          unsigned int len = min(bytes, cur->bv_len - done);
>>>
>>>          bytes -= len;
>>>          done += len;
>>>          if (done == cur->bv_len) {
>>>              idx++;
>>>              done = 0;
>>>          }
>>>      }
>>>
>>>      iter->bi_idx = idx;
>>>      iter->bi_bvec_done = done;
>>
>> Have yet to take a closer look (and benchmark) and the patches and
>> the generated code, but fwiw I do agree that case #1 is easier to
>> read.
>>
> Ok, ok, I'm not keen on bike-shedding. I'll resend a simplified version

Sweet thanks. Make sure it's green.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2019-11-30 20:56 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <cover.1574974574.git.asml.silence@gmail.com>
2019-11-28 21:04 ` [PATCH] block: optimise bvec_iter_advance() Pavel Begunkov
2019-11-29 22:17   ` Arvind Sankar
2019-11-29 22:47     ` Pavel Begunkov
2019-11-29 23:24       ` Arvind Sankar
2019-11-30  9:22         ` Pavel Begunkov
2019-11-30 18:56           ` Arvind Sankar
2019-11-30 18:57             ` Jens Axboe
2019-11-30 20:11               ` Pavel Begunkov
2019-11-30 20:56                 ` Jens Axboe

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.