All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] timers: Optimize usleep_range()
@ 2022-07-29 20:28 Christophe JAILLET
  2022-07-29 21:22 ` John Stultz
  2022-08-01  8:18 ` David Laight
  0 siblings, 2 replies; 4+ messages in thread
From: Christophe JAILLET @ 2022-07-29 20:28 UTC (permalink / raw)
  To: John Stultz, Thomas Gleixner, Stephen Boyd
  Cc: linux-kernel, kernel-janitors, Christophe JAILLET

Most of the time the 'min' and 'max' parameters of usleep_range() are
constant. We can take advantage of it to pre-compute at compile time
some values otherwise computer at run-time in usleep_range_state().

Replace usleep_range_state() by a new __nsleep_range_delta_state() function
that takes as parameters the pre-computed values.

The main benefit is to save a few instructions, especially 2
multiplications (x1000 when converting us to ns).


Some hand simplified diff of the generated asm are given below. They were
produced on a Intel(R) Core(TM) i7-3770, with gcc 11.2.0.

drivers/clk/clk-si514.c (taken as an example)
-----------------------
In this driver we have:
   usleep_range(10000, 12000);

--- clk_before.asm	2022-07-29 21:49:05.702289425 +0200
+++ clk_after.asm	2022-07-29 21:50:23.801548963 +0200
@@ -972,8 +972,8 @@
  ea0:	45 85 e4             	test   %r12d,%r12d
  ea3:	0f 88 f6 fc ff ff    	js     b9f <si514_set_rate+0x9f>
  ea9:	e8 00 00 00 00       	call   eae <si514_set_rate+0x3ae>
- eae:	be e0 2e 00 00       	mov    $0x2ee0,%esi             ;     12.000
- eb3:	bf 10 27 00 00       	mov    $0x2710,%edi             ;     10.000
+ eae:	be 80 84 1e 00       	mov    $0x1e8480,%esi           ;  2.000.000
+ eb3:	bf 80 96 98 00       	mov    $0x989680,%edi           ; 10.000.000
  eb8:	ba 02 00 00 00       	mov    $0x2,%edx
  ebd:	e8 00 00 00 00       	call   ec2 <si514_set_rate+0x3c2>
  ec2:	44 8b 74 24 30       	mov    0x30(%rsp),%r14d

The asm produced in the caller is mostly the same. Only constant values
passed to usleep_range_state() or __nsleep_range_delta_state() are
different. No other instructions or whatever is different.


kernel/time/timer.c
-------------------
-0000000000000000 <usleep_range_state>:
+0000000000000000 <__nsleep_range_delta_state>:
  f3 0f 1e fa          	endbr64
  e8 00 00 00 00       	call   ...
  48 b8 00 00 00 00 00 	movabs $0xdffffc0000000000,%rax
@@ -10692,16 +10692,14 @@
  41 56                	push   %r14
  49 c7 c6 00 00 00 00 	mov    $0x0,%r14
  41 55                	push   %r13
- 41 89 d5             	mov    %edx,%r13d
+ 49 89 f5             	mov    %rsi,%r13
  41 54                	push   %r12
- 49 89 f4             	mov    %rsi,%r12
+ 41 89 d4             	mov    %edx,%r12d
  55                   	push   %rbp
- 44 89 ed             	mov    %r13d,%ebp
+ 44 89 e5             	mov    %r12d,%ebp
  53                   	push   %rbx
  48 89 fb             	mov    %rdi,%rbx
  81 e5 cc 00 00 00    	and    $0xcc,%ebp
- 49 29 dc             	sub    %rbx,%r12              ; (max - min)
- 4d 69 e4 e8 03 00 00 	imul   $0x3e8,%r12,%r12       ; us --> ns (x 1000)
  48 83 ec 68          	sub    $0x68,%rsp
  48 c7 44 24 08 b3 8a 	movq   $0x41b58ab3,0x8(%rsp)
  b5 41
@@ -10721,18 +10719,16 @@
  31 c0                	xor    %eax,%eax
  e8 00 00 00 00       	call   ...
  e8 00 00 00 00       	call   ...
- 49 89 c0             	mov    %rax,%r8
- 48 69 c3 e8 03 00 00 	imul   $0x3e8,%rbx,%rax       ; us --> ns (x 1000)
+ 48 01 d8             	add    %rbx,%rax
+ 48 89 44 24 28       	mov    %rax,0x28(%rsp)
  65 48 8b 1c 25 00 00 	mov    %gs:0x0,%rbx
  00 00
- 4c 01 c0             	add    %r8,%rax
- 48 89 44 24 28       	mov    %rax,0x28(%rsp)
  e8 00 00 00 00       	call   ...
  31 ff                	xor    %edi,%edi
  89 ee                	mov    %ebp,%esi

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
---
v1 -> v2
  - Simplify and avoid use of __buildint_constant_p() [John Stultz <jstultz@google.com>]
  - Also update usleep_idle_range()
  - Axe usleep_range_state()  [John Stultz <jstultz@google.com>]
  - Fix kerneldoc  [John Stultz <jstultz@google.com>]
  - Update log message accordingly
https://lore.kernel.org/all/d7fc85736adee02ce52ee88a54fa7477fbd18ed2.1653236802.git.christophe.jaillet@wanadoo.fr/
---
 include/linux/delay.h | 17 +++++++++++++----
 kernel/time/timer.c   | 17 ++++++++---------
 2 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/include/linux/delay.h b/include/linux/delay.h
index 039e7e0c7378..27938a49c701 100644
--- a/include/linux/delay.h
+++ b/include/linux/delay.h
@@ -59,17 +59,26 @@ void calibrate_delay(void);
 void __attribute__((weak)) calibration_delay_done(void);
 void msleep(unsigned int msecs);
 unsigned long msleep_interruptible(unsigned int msecs);
-void usleep_range_state(unsigned long min, unsigned long max,
-			unsigned int state);
+void __nsleep_range_delta_state(u64 min, u64 delta, unsigned int state);
 
 static inline void usleep_range(unsigned long min, unsigned long max)
 {
-	usleep_range_state(min, max, TASK_UNINTERRUPTIBLE);
+	/*
+	 * Most of the time min and max are constant, so the time delta and the
+	 * convertion to ns will be optimized-out at compile time.
+	 */
+	u64 delta = (u64)(max - min) * NSEC_PER_USEC;
+
+	__nsleep_range_delta_state(min * NSEC_PER_USEC, delta,
+				   TASK_UNINTERRUPTIBLE);
 }
 
 static inline void usleep_idle_range(unsigned long min, unsigned long max)
 {
-	usleep_range_state(min, max, TASK_IDLE);
+	u64 delta = (u64)(max - min) * NSEC_PER_USEC;
+
+	__nsleep_range_delta_state(min * NSEC_PER_USEC, delta,
+				   TASK_IDLE);
 }
 
 static inline void ssleep(unsigned int seconds)
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 717fcb9fb14a..475b1c0406d7 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -2109,22 +2109,21 @@ unsigned long msleep_interruptible(unsigned int msecs)
 EXPORT_SYMBOL(msleep_interruptible);
 
 /**
- * usleep_range_state - Sleep for an approximate time in a given state
- * @min:	Minimum time in usecs to sleep
- * @max:	Maximum time in usecs to sleep
+ * __nsleep_range_delta_state - Sleep for an approximate time in a given state
+ * @min:	Minimum time in nsecs to sleep
+ * @delta:	Duration in nsecs that can be tolerated after @min
  * @state:	State of the current task that will be while sleeping
  *
  * In non-atomic context where the exact wakeup time is flexible, use
- * usleep_range_state() instead of udelay().  The sleep improves responsiveness
+ * usleep_[idle_]range() instead of udelay().  The sleep improves responsiveness
  * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces
  * power usage by allowing hrtimers to take advantage of an already-
  * scheduled interrupt instead of scheduling a new one just for this sleep.
  */
-void __sched usleep_range_state(unsigned long min, unsigned long max,
-				unsigned int state)
+void __sched __nsleep_range_delta_state(u64 min, u64 delta,
+				        unsigned int state)
 {
-	ktime_t exp = ktime_add_us(ktime_get(), min);
-	u64 delta = (u64)(max - min) * NSEC_PER_USEC;
+	ktime_t exp = ktime_add_ns(ktime_get(), min);
 
 	for (;;) {
 		__set_current_state(state);
@@ -2133,4 +2132,4 @@ void __sched usleep_range_state(unsigned long min, unsigned long max,
 			break;
 	}
 }
-EXPORT_SYMBOL(usleep_range_state);
+EXPORT_SYMBOL(__nsleep_range_delta_state);
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH v2] timers: Optimize usleep_range()
  2022-07-29 20:28 [PATCH v2] timers: Optimize usleep_range() Christophe JAILLET
@ 2022-07-29 21:22 ` John Stultz
  2022-08-01  8:18 ` David Laight
  1 sibling, 0 replies; 4+ messages in thread
From: John Stultz @ 2022-07-29 21:22 UTC (permalink / raw)
  To: Christophe JAILLET; +Cc: Thomas Gleixner, Stephen Boyd, LKML, kernel-janitors

On Fri, Jul 29, 2022 at 1:29 PM Christophe JAILLET
<christophe.jaillet@wanadoo.fr> wrote:
>
> Most of the time the 'min' and 'max' parameters of usleep_range() are
> constant. We can take advantage of it to pre-compute at compile time
> some values otherwise computer at run-time in usleep_range_state().
>
> Replace usleep_range_state() by a new __nsleep_range_delta_state() function
> that takes as parameters the pre-computed values.
>
> The main benefit is to save a few instructions, especially 2
> multiplications (x1000 when converting us to ns).
>
>
> Some hand simplified diff of the generated asm are given below. They were
> produced on a Intel(R) Core(TM) i7-3770, with gcc 11.2.0.
>
> drivers/clk/clk-si514.c (taken as an example)
> -----------------------
> In this driver we have:
>    usleep_range(10000, 12000);
>
> --- clk_before.asm      2022-07-29 21:49:05.702289425 +0200
> +++ clk_after.asm       2022-07-29 21:50:23.801548963 +0200
> @@ -972,8 +972,8 @@
>   ea0:  45 85 e4                test   %r12d,%r12d
>   ea3:  0f 88 f6 fc ff ff       js     b9f <si514_set_rate+0x9f>
>   ea9:  e8 00 00 00 00          call   eae <si514_set_rate+0x3ae>
> - eae:  be e0 2e 00 00          mov    $0x2ee0,%esi             ;     12.000
> - eb3:  bf 10 27 00 00          mov    $0x2710,%edi             ;     10.000
> + eae:  be 80 84 1e 00          mov    $0x1e8480,%esi           ;  2.000.000
> + eb3:  bf 80 96 98 00          mov    $0x989680,%edi           ; 10.000.000
>   eb8:  ba 02 00 00 00          mov    $0x2,%edx
>   ebd:  e8 00 00 00 00          call   ec2 <si514_set_rate+0x3c2>
>   ec2:  44 8b 74 24 30          mov    0x30(%rsp),%r14d
>
> The asm produced in the caller is mostly the same. Only constant values
> passed to usleep_range_state() or __nsleep_range_delta_state() are
> different. No other instructions or whatever is different.
>
>
> kernel/time/timer.c
> -------------------
> -0000000000000000 <usleep_range_state>:
> +0000000000000000 <__nsleep_range_delta_state>:
>   f3 0f 1e fa           endbr64
>   e8 00 00 00 00        call   ...
>   48 b8 00 00 00 00 00  movabs $0xdffffc0000000000,%rax
> @@ -10692,16 +10692,14 @@
>   41 56                 push   %r14
>   49 c7 c6 00 00 00 00  mov    $0x0,%r14
>   41 55                 push   %r13
> - 41 89 d5              mov    %edx,%r13d
> + 49 89 f5              mov    %rsi,%r13
>   41 54                 push   %r12
> - 49 89 f4              mov    %rsi,%r12
> + 41 89 d4              mov    %edx,%r12d
>   55                    push   %rbp
> - 44 89 ed              mov    %r13d,%ebp
> + 44 89 e5              mov    %r12d,%ebp
>   53                    push   %rbx
>   48 89 fb              mov    %rdi,%rbx
>   81 e5 cc 00 00 00     and    $0xcc,%ebp
> - 49 29 dc              sub    %rbx,%r12              ; (max - min)
> - 4d 69 e4 e8 03 00 00  imul   $0x3e8,%r12,%r12       ; us --> ns (x 1000)
>   48 83 ec 68           sub    $0x68,%rsp
>   48 c7 44 24 08 b3 8a  movq   $0x41b58ab3,0x8(%rsp)
>   b5 41
> @@ -10721,18 +10719,16 @@
>   31 c0                 xor    %eax,%eax
>   e8 00 00 00 00        call   ...
>   e8 00 00 00 00        call   ...
> - 49 89 c0              mov    %rax,%r8
> - 48 69 c3 e8 03 00 00  imul   $0x3e8,%rbx,%rax       ; us --> ns (x 1000)
> + 48 01 d8              add    %rbx,%rax
> + 48 89 44 24 28        mov    %rax,0x28(%rsp)
>   65 48 8b 1c 25 00 00  mov    %gs:0x0,%rbx
>   00 00
> - 4c 01 c0              add    %r8,%rax
> - 48 89 44 24 28        mov    %rax,0x28(%rsp)
>   e8 00 00 00 00        call   ...
>   31 ff                 xor    %edi,%edi
>   89 ee                 mov    %ebp,%esi
>
> Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
> ---
> v1 -> v2
>   - Simplify and avoid use of __buildint_constant_p() [John Stultz <jstultz@google.com>]
>   - Also update usleep_idle_range()
>   - Axe usleep_range_state()  [John Stultz <jstultz@google.com>]
>   - Fix kerneldoc  [John Stultz <jstultz@google.com>]
>   - Update log message accordingly
> https://lore.kernel.org/all/d7fc85736adee02ce52ee88a54fa7477fbd18ed2.1653236802.git.christophe.jaillet@wanadoo.fr/
> ---

Thanks for taking the time to rework this! It looks much better to me!

The only nit I have is you still have a few checkpatch issues to resolve:

WARNING: 'convertion' may be misspelled - perhaps 'conversion'?
#154: FILE: include/linux/delay.h:68:
+        * convertion to ns will be optimized-out at compile time.
           ^^^^^^^^^^

ERROR: code indent should use tabs where possible
#198: FILE: kernel/time/timer.c:2124:
+^I^I^I^I        unsigned int state)$

CHECK: Alignment should match open parenthesis
#198: FILE: kernel/time/timer.c:2124:
+void __sched __nsleep_range_delta_state(u64 min, u64 delta,
+                                       unsigned int state)


(also the path lines in the commit message is confusing checkpatch a bit)

With those resolved, when you resubmit, you can add my:
  Acked-by: John Stultz <jstultz@google.com>

thanks
-john

^ permalink raw reply	[flat|nested] 4+ messages in thread

* RE: [PATCH v2] timers: Optimize usleep_range()
  2022-07-29 20:28 [PATCH v2] timers: Optimize usleep_range() Christophe JAILLET
  2022-07-29 21:22 ` John Stultz
@ 2022-08-01  8:18 ` David Laight
  2022-08-01 18:01   ` Christophe JAILLET
  1 sibling, 1 reply; 4+ messages in thread
From: David Laight @ 2022-08-01  8:18 UTC (permalink / raw)
  To: 'Christophe JAILLET', John Stultz, Thomas Gleixner, Stephen Boyd
  Cc: linux-kernel, kernel-janitors

From: Christophe JAILLET
> Sent: 29 July 2022 21:29
> 
> Most of the time the 'min' and 'max' parameters of usleep_range() are
> constant. We can take advantage of it to pre-compute at compile time
> some values otherwise computer at run-time in usleep_range_state().
> 
> Replace usleep_range_state() by a new __nsleep_range_delta_state() function
> that takes as parameters the pre-computed values.
> 
> The main benefit is to save a few instructions, especially 2
> multiplications (x1000 when converting us to ns).
...
>   53                   	push   %rbx
>   48 89 fb             	mov    %rdi,%rbx
>   81 e5 cc 00 00 00    	and    $0xcc,%ebp
> - 49 29 dc             	sub    %rbx,%r12              ; (max - min)
> - 4d 69 e4 e8 03 00 00 	imul   $0x3e8,%r12,%r12       ; us --> ns (x 1000)
>   48 83 ec 68          	sub    $0x68,%rsp
>   48 c7 44 24 08 b3 8a 	movq   $0x41b58ab3,0x8(%rsp)
>   b5 41
> @@ -10721,18 +10719,16 @@
>   31 c0                	xor    %eax,%eax
>   e8 00 00 00 00       	call   ...
>   e8 00 00 00 00       	call   ...
> - 49 89 c0             	mov    %rax,%r8
> - 48 69 c3 e8 03 00 00 	imul   $0x3e8,%rbx,%rax       ; us --> ns (x 1000)
> + 48 01 d8             	add    %rbx,%rax
> + 48 89 44 24 28       	mov    %rax,0x28(%rsp)
>   65 48 8b 1c 25 00 00 	mov    %gs:0x0,%rbx
>   00 00
> - 4c 01 c0             	add    %r8,%rax
> - 48 89 44 24 28       	mov    %rax,0x28(%rsp)
>   e8 00 00 00 00       	call   ...
...

Is that really measurable in any test?
Integer multiply is one clock on almost every modern cpu.

By the time you've allowed for superscaler cpu there is
probably no difference at all on anything except the simplest
cpus.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH v2] timers: Optimize usleep_range()
  2022-08-01  8:18 ` David Laight
@ 2022-08-01 18:01   ` Christophe JAILLET
  0 siblings, 0 replies; 4+ messages in thread
From: Christophe JAILLET @ 2022-08-01 18:01 UTC (permalink / raw)
  To: David Laight, John Stultz, Thomas Gleixner, Stephen Boyd
  Cc: linux-kernel, kernel-janitors

Le 01/08/2022 à 10:18, David Laight a écrit :
> From: Christophe JAILLET
>> Sent: 29 July 2022 21:29
>>
>> Most of the time the 'min' and 'max' parameters of usleep_range() are
>> constant. We can take advantage of it to pre-compute at compile time
>> some values otherwise computer at run-time in usleep_range_state().
>>
>> Replace usleep_range_state() by a new __nsleep_range_delta_state() function
>> that takes as parameters the pre-computed values.
>>
>> The main benefit is to save a few instructions, especially 2
>> multiplications (x1000 when converting us to ns).
> ...
>>    53                   	push   %rbx
>>    48 89 fb             	mov    %rdi,%rbx
>>    81 e5 cc 00 00 00    	and    $0xcc,%ebp
>> - 49 29 dc             	sub    %rbx,%r12              ; (max - min)
>> - 4d 69 e4 e8 03 00 00 	imul   $0x3e8,%r12,%r12       ; us --> ns (x 1000)
>>    48 83 ec 68          	sub    $0x68,%rsp
>>    48 c7 44 24 08 b3 8a 	movq   $0x41b58ab3,0x8(%rsp)
>>    b5 41
>> @@ -10721,18 +10719,16 @@
>>    31 c0                	xor    %eax,%eax
>>    e8 00 00 00 00       	call   ...
>>    e8 00 00 00 00       	call   ...
>> - 49 89 c0             	mov    %rax,%r8
>> - 48 69 c3 e8 03 00 00 	imul   $0x3e8,%rbx,%rax       ; us --> ns (x 1000)
>> + 48 01 d8             	add    %rbx,%rax
>> + 48 89 44 24 28       	mov    %rax,0x28(%rsp)
>>    65 48 8b 1c 25 00 00 	mov    %gs:0x0,%rbx
>>    00 00
>> - 4c 01 c0             	add    %r8,%rax
>> - 48 89 44 24 28       	mov    %rax,0x28(%rsp)
>>    e8 00 00 00 00       	call   ...
> ...
> 
> Is that really measurable in any test?

I don't think so, even on 32 bits arch.

> Integer multiply is one clock on almost every modern cpu.
> 
> By the time you've allowed for superscaler cpu there is
> probably no difference at all on anything except the simplest
> cpus.

My point is that it is a low hanging fruit.
Just moving some simple computations from one function to another, to 
have the compiler do the job instead of at runtime.

I won't argue the value of the patch itself.
I spotted a potential opportunity and proposed a patch for it.

If someone finds it valuable enough, just take it.
If no-one care, just forget about it.

Both alternative are fine for me.

Best regards,
CJ

> 
> 	David
> 
> -
> Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
> Registration No: 1397386 (Wales)
> 
> 


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2022-08-01 18:01 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-07-29 20:28 [PATCH v2] timers: Optimize usleep_range() Christophe JAILLET
2022-07-29 21:22 ` John Stultz
2022-08-01  8:18 ` David Laight
2022-08-01 18:01   ` Christophe JAILLET

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.