linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] include: kernel.h: rewrite min3, max3 and clamp using min and max
@ 2014-06-16 21:07 Michal Nazarewicz
  2014-06-16 23:18 ` Andrew Morton
  2014-06-17 11:37 ` Hagen Paul Pfeifer
  0 siblings, 2 replies; 9+ messages in thread
From: Michal Nazarewicz @ 2014-06-16 21:07 UTC (permalink / raw)
  To: Steven Rostedt, Andrew Morton, Hagen Paul Pfeifer
  Cc: linux-kernel, Michal Nazarewicz

It appears that gcc is better at optimising a double call to min
and max rather than open coded min3 and max3.  This can be observed
here:

    $ cat min-max.c
    #define min(x, y) ({				\
    	typeof(x) _min1 = (x);			\
    	typeof(y) _min2 = (y);			\
    	(void) (&_min1 == &_min2);		\
    	_min1 < _min2 ? _min1 : _min2; })
    #define min3(x, y, z) ({			\
    	typeof(x) _min1 = (x);			\
    	typeof(y) _min2 = (y);			\
    	typeof(z) _min3 = (z);			\
    	(void) (&_min1 == &_min2);		\
    	(void) (&_min1 == &_min3);		\
    	_min1 < _min2 ? (_min1 < _min3 ? _min1 : _min3) : \
    		(_min2 < _min3 ? _min2 : _min3); })

    int fmin3(int x, int y, int z) { return min3(x, y, z); }
    int fmin2(int x, int y, int z) { return min(min(x, y), z); }

    $ gcc -O2 -o min-max.s -S min-max.c; cat min-max.s
    	.file	"min-max.c"
    	.text
    	.p2align 4,,15
    	.globl	fmin3
    	.type	fmin3, @function
    fmin3:
    .LFB0:
    	.cfi_startproc
    	cmpl	%esi, %edi
    	jl	.L5
    	cmpl	%esi, %edx
    	movl	%esi, %eax
    	cmovle	%edx, %eax
    	ret
    	.p2align 4,,10
    	.p2align 3
    .L5:
    	cmpl	%edi, %edx
    	movl	%edi, %eax
    	cmovle	%edx, %eax
    	ret
    	.cfi_endproc
    .LFE0:
    	.size	fmin3, .-fmin3
    	.p2align 4,,15
    	.globl	fmin2
    	.type	fmin2, @function
    fmin2:
    .LFB1:
    	.cfi_startproc
    	cmpl	%edi, %esi
    	movl	%edx, %eax
    	cmovle	%esi, %edi
    	cmpl	%edx, %edi
    	cmovle	%edi, %eax
    	ret
    	.cfi_endproc
    .LFE1:
    	.size	fmin2, .-fmin2
    	.ident	"GCC: (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3"
    	.section	.note.GNU-stack,"",@progbits

fmin3 function, which uses open-coded min3 macro, is compiled into
total of ten instructions including a conditional branch, whereas fmin2
function, which uses two calls to min2 macro, is compiled into six
instructions with no branches.

Similarly, open-coded clamp produces the same code as clamp using min
and max macros, but the latter is much shorter:

    $ cat clamp.c
    #define clamp(val, min, max) ({			\
    	typeof(val) __val = (val);		\
    	typeof(min) __min = (min);		\
    	typeof(max) __max = (max);		\
    	(void) (&__val == &__min);		\
    	(void) (&__val == &__max);		\
    	__val = __val < __min ? __min: __val;	\
    	__val > __max ? __max: __val; })
    #define min(x, y) ({				\
    	typeof(x) _min1 = (x);			\
    	typeof(y) _min2 = (y);			\
    	(void) (&_min1 == &_min2);		\
    	_min1 < _min2 ? _min1 : _min2; })
    #define max(x, y) ({				\
    	typeof(x) _max1 = (x);			\
    	typeof(y) _max2 = (y);			\
    	(void) (&_max1 == &_max2);		\
    	_max1 > _max2 ? _max1 : _max2; })

    int fclamp(int v, int min, int max) { return clamp(v, min, max); }
    int fclampmm(int v, int min, int max) { return min(max(v, min), max); }

    $ gcc -O2 -o clamp.s -S clamp.c; cat clamp.s
    	.file	"clamp.c"
    	.text
    	.p2align 4,,15
    	.globl	fclamp
    	.type	fclamp, @function
    fclamp:
    .LFB0:
    	.cfi_startproc
    	cmpl	%edi, %esi
    	movl	%edx, %eax
    	cmovge	%esi, %edi
    	cmpl	%edx, %edi
    	cmovle	%edi, %eax
    	ret
    	.cfi_endproc
    .LFE0:
    	.size	fclamp, .-fclamp
    	.p2align 4,,15
    	.globl	fclampmm
    	.type	fclampmm, @function
    fclampmm:
    .LFB1:
    	.cfi_startproc
    	cmpl	%edi, %esi
    	cmovge	%esi, %edi
    	cmpl	%edi, %edx
    	movl	%edi, %eax
    	cmovle	%edx, %eax
    	ret
    	.cfi_endproc
    .LFE1:
    	.size	fclampmm, .-fclampmm
    	.ident	"GCC: (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3"
    	.section	.note.GNU-stack,"",@progbits

Furthermore, after “make allmodconfig && make bzImage modules” this is the
comparison of image and modules sizes:

    # Without this patch applied
    $ ls -l arch/x86/boot/bzImage **/*.ko |awk '{size += $5} END {print size}'
    350715800

    # With this patch applied
    $ ls -l arch/x86/boot/bzImage **/*.ko |awk '{size += $5} END {print size}'
    349856528

The above builds were done on:

    $ uname -a; gcc --version
    Linux mpn-glaptop 3.13.0-29-generic #53~precise1-Ubuntu SMP Wed Jun 4 22:06:25 UTC 2014 x86_64 x86_64 x86_64 GNU/Linux
    gcc (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3
    Copyright (C) 2011 Free Software Foundation, Inc.
    This is free software; see the source for copying conditions.  There is NO
    warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

Signed-off-by: Michal Nazarewicz <mina86@mina86.com>
---
 include/linux/kernel.h | 32 +++++---------------------------
 1 file changed, 5 insertions(+), 27 deletions(-)

 Interestingly commit [f27c85c56b: “add {min,max}3 macros”] claims
 thatthe open-coded min3/max3 “will save some cycles as well as some
 bytes on the stack”, but as far as I can see that statement is false.
 But maybe there's something Hagen knows that I missed?

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 4c52907..44649e0 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -719,23 +719,8 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
 	(void) (&_max1 == &_max2);		\
 	_max1 > _max2 ? _max1 : _max2; })
 
-#define min3(x, y, z) ({			\
-	typeof(x) _min1 = (x);			\
-	typeof(y) _min2 = (y);			\
-	typeof(z) _min3 = (z);			\
-	(void) (&_min1 == &_min2);		\
-	(void) (&_min1 == &_min3);		\
-	_min1 < _min2 ? (_min1 < _min3 ? _min1 : _min3) : \
-		(_min2 < _min3 ? _min2 : _min3); })
-
-#define max3(x, y, z) ({			\
-	typeof(x) _max1 = (x);			\
-	typeof(y) _max2 = (y);			\
-	typeof(z) _max3 = (z);			\
-	(void) (&_max1 == &_max2);		\
-	(void) (&_max1 == &_max3);		\
-	_max1 > _max2 ? (_max1 > _max3 ? _max1 : _max3) : \
-		(_max2 > _max3 ? _max2 : _max3); })
+#define min3(x, y, z) min(min(x, y), z)
+#define max3(x, y, z) max(max(x, y), z)
 
 /**
  * min_not_zero - return the minimum that is _not_ zero, unless both are zero
@@ -750,20 +735,13 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
 /**
  * clamp - return a value clamped to a given range with strict typechecking
  * @val: current value
- * @min: minimum allowable value
- * @max: maximum allowable value
+ * @lo: lowest allowable value
+ * @hi: highest allowable value
  *
  * This macro does strict typechecking of min/max to make sure they are of the
  * same type as val.  See the unnecessary pointer comparisons.
  */
-#define clamp(val, min, max) ({			\
-	typeof(val) __val = (val);		\
-	typeof(min) __min = (min);		\
-	typeof(max) __max = (max);		\
-	(void) (&__val == &__min);		\
-	(void) (&__val == &__max);		\
-	__val = __val < __min ? __min: __val;	\
-	__val > __max ? __max: __val; })
+#define clamp(val, lo, hi) min(max(val, lo), hi)
 
 /*
  * ..and if you can't take the strict
-- 
2.0.0.526.g5318336


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH] include: kernel.h: rewrite min3, max3 and clamp using min and max
  2014-06-16 21:07 [PATCH] include: kernel.h: rewrite min3, max3 and clamp using min and max Michal Nazarewicz
@ 2014-06-16 23:18 ` Andrew Morton
  2014-06-16 23:25   ` David Rientjes
  2014-06-17 13:49   ` Michal Nazarewicz
  2014-06-17 11:37 ` Hagen Paul Pfeifer
  1 sibling, 2 replies; 9+ messages in thread
From: Andrew Morton @ 2014-06-16 23:18 UTC (permalink / raw)
  To: Michal Nazarewicz; +Cc: Steven Rostedt, Hagen Paul Pfeifer, linux-kernel

On Mon, 16 Jun 2014 23:07:22 +0200 Michal Nazarewicz <mina86@mina86.com> wrote:

> It appears that gcc is better at optimising a double call to min
> and max rather than open coded min3 and max3.  This can be observed
> here:
> 
> ...
>
> Furthermore, after ___make allmodconfig && make bzImage modules___ this is the
> comparison of image and modules sizes:
> 
>     # Without this patch applied
>     $ ls -l arch/x86/boot/bzImage **/*.ko |awk '{size += $5} END {print size}'
>     350715800
> 
>     # With this patch applied
>     $ ls -l arch/x86/boot/bzImage **/*.ko |awk '{size += $5} END {print size}'
>     349856528

We saved nearly a megabyte by optimising min3(), max3() and clamp()? 

I'm counting a grand total of 182 callsites for those macros.  So the
saving is 4700 bytes per invokation?  I don't believe it...



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] include: kernel.h: rewrite min3, max3 and clamp using min and max
  2014-06-16 23:18 ` Andrew Morton
@ 2014-06-16 23:25   ` David Rientjes
  2014-06-16 23:35     ` Andrew Morton
  2014-06-17 13:49   ` Michal Nazarewicz
  1 sibling, 1 reply; 9+ messages in thread
From: David Rientjes @ 2014-06-16 23:25 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Michal Nazarewicz, Steven Rostedt, Hagen Paul Pfeifer, linux-kernel

On Mon, 16 Jun 2014, Andrew Morton wrote:

> > It appears that gcc is better at optimising a double call to min
> > and max rather than open coded min3 and max3.  This can be observed
> > here:
> > 
> > ...
> >
> > Furthermore, after ___make allmodconfig && make bzImage modules___ this is the
> > comparison of image and modules sizes:
> > 
> >     # Without this patch applied
> >     $ ls -l arch/x86/boot/bzImage **/*.ko |awk '{size += $5} END {print size}'
> >     350715800
> > 
> >     # With this patch applied
> >     $ ls -l arch/x86/boot/bzImage **/*.ko |awk '{size += $5} END {print size}'
> >     349856528
> 
> We saved nearly a megabyte by optimising min3(), max3() and clamp()? 
> 
> I'm counting a grand total of 182 callsites for those macros.  So the
> saving is 4700 bytes per invokation?  I don't believe it...
> 

I was checking just the instances of min3() in mm/ and gcc ends up 
inlining transfer_objects() in mm/slab.c as a result of this change and 
increases its text size:

   text	   data	    bss	    dec	    hex	filename
  28369	  21559	      4	  49932	   c30c	slab.o.before
  28399	  21559	      4	  49962	   c32a	slab.o.after

It also seems to use one additional temp variable of type typeof(x) on the 
stack, so I do think the old version was superior.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] include: kernel.h: rewrite min3, max3 and clamp using min and max
  2014-06-16 23:25   ` David Rientjes
@ 2014-06-16 23:35     ` Andrew Morton
  2014-06-16 23:54       ` David Rientjes
  0 siblings, 1 reply; 9+ messages in thread
From: Andrew Morton @ 2014-06-16 23:35 UTC (permalink / raw)
  To: David Rientjes
  Cc: Michal Nazarewicz, Steven Rostedt, Hagen Paul Pfeifer, linux-kernel

On Mon, 16 Jun 2014 16:25:15 -0700 (PDT) David Rientjes <rientjes@google.com> wrote:

> On Mon, 16 Jun 2014, Andrew Morton wrote:
> 
> > > It appears that gcc is better at optimising a double call to min
> > > and max rather than open coded min3 and max3.  This can be observed
> > > here:
> > > 
> > > ...
> > >
> > > Furthermore, after ___make allmodconfig && make bzImage modules___ this is the
> > > comparison of image and modules sizes:
> > > 
> > >     # Without this patch applied
> > >     $ ls -l arch/x86/boot/bzImage **/*.ko |awk '{size += $5} END {print size}'
> > >     350715800
> > > 
> > >     # With this patch applied
> > >     $ ls -l arch/x86/boot/bzImage **/*.ko |awk '{size += $5} END {print size}'
> > >     349856528
> > 
> > We saved nearly a megabyte by optimising min3(), max3() and clamp()? 
> > 
> > I'm counting a grand total of 182 callsites for those macros.  So the
> > saving is 4700 bytes per invokation?  I don't believe it...
> > 
> 
> I was checking just the instances of min3() in mm/ and gcc ends up 
> inlining transfer_objects() in mm/slab.c as a result of this change and 
> increases its text size:
> 
>    text	   data	    bss	    dec	    hex	filename
>   28369	  21559	      4	  49932	   c30c	slab.o.before
>   28399	  21559	      4	  49962	   c32a	slab.o.after

Maybe that's a good thing in disguise: gcc said "hey this thing is now
small enough to inline it".

> It also seems to use one additional temp variable of type typeof(x) on the 
> stack, so I do think the old version was superior.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] include: kernel.h: rewrite min3, max3 and clamp using min and max
  2014-06-16 23:35     ` Andrew Morton
@ 2014-06-16 23:54       ` David Rientjes
  2014-06-17  0:21         ` Steven Rostedt
  0 siblings, 1 reply; 9+ messages in thread
From: David Rientjes @ 2014-06-16 23:54 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Michal Nazarewicz, Steven Rostedt, Hagen Paul Pfeifer, linux-kernel

On Mon, 16 Jun 2014, Andrew Morton wrote:

> On Mon, 16 Jun 2014 16:25:15 -0700 (PDT) David Rientjes <rientjes@google.com> wrote:
> 
> > On Mon, 16 Jun 2014, Andrew Morton wrote:
> > 
> > > > It appears that gcc is better at optimising a double call to min
> > > > and max rather than open coded min3 and max3.  This can be observed
> > > > here:
> > > > 
> > > > ...
> > > >
> > > > Furthermore, after ___make allmodconfig && make bzImage modules___ this is the
> > > > comparison of image and modules sizes:
> > > > 
> > > >     # Without this patch applied
> > > >     $ ls -l arch/x86/boot/bzImage **/*.ko |awk '{size += $5} END {print size}'
> > > >     350715800
> > > > 
> > > >     # With this patch applied
> > > >     $ ls -l arch/x86/boot/bzImage **/*.ko |awk '{size += $5} END {print size}'
> > > >     349856528
> > > 
> > > We saved nearly a megabyte by optimising min3(), max3() and clamp()? 
> > > 
> > > I'm counting a grand total of 182 callsites for those macros.  So the
> > > saving is 4700 bytes per invokation?  I don't believe it...
> > > 
> > 
> > I was checking just the instances of min3() in mm/ and gcc ends up 
> > inlining transfer_objects() in mm/slab.c as a result of this change and 
> > increases its text size:
> > 
> >    text	   data	    bss	    dec	    hex	filename
> >   28369	  21559	      4	  49932	   c30c	slab.o.before
> >   28399	  21559	      4	  49962	   c32a	slab.o.after
> 
> Maybe that's a good thing in disguise: gcc said "hey this thing is now
> small enough to inline it".
> 

On linux-next, allyesconfig has a 0.0001% savings as a result of the 
patch, but I'd be worried about the extra temp variable it allocates on 
the stack that is evident in the mm/slab.c disassembly unless all cases 
can be audited to show that we're not potentially deep.

   text	   data	    bss	    dec	    hex	filename
108573045	23488016	51580928	183641989	af22785	vmlinux.before
108572908	23488016	51580928	183641852	af226fc	vmlinux.after

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] include: kernel.h: rewrite min3, max3 and clamp using min and max
  2014-06-16 23:54       ` David Rientjes
@ 2014-06-17  0:21         ` Steven Rostedt
  2014-06-17  4:01           ` Steven Rostedt
  0 siblings, 1 reply; 9+ messages in thread
From: Steven Rostedt @ 2014-06-17  0:21 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Michal Nazarewicz, Hagen Paul Pfeifer, linux-kernel

On Mon, 16 Jun 2014 16:54:32 -0700 (PDT)
David Rientjes <rientjes@google.com> wrote:

> 
> 
> On linux-next, allyesconfig has a 0.0001% savings as a result of the 
> patch, but I'd be worried about the extra temp variable it allocates on 
> the stack that is evident in the mm/slab.c disassembly unless all cases 
> can be audited to show that we're not potentially deep.

A 0.0001% change means it's not worth changing, and we may be able to
mark this up as a fluke in Michal's results.

I'll give it a try on my 4.6.3 compiler.

-- Steve

> 
>    text	   data	    bss	    dec	    hex	filename
> 108573045	23488016	51580928	183641989	af22785	vmlinux.before
> 108572908	23488016	51580928	183641852	af226fc	vmlinux.after


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] include: kernel.h: rewrite min3, max3 and clamp using min and max
  2014-06-17  0:21         ` Steven Rostedt
@ 2014-06-17  4:01           ` Steven Rostedt
  0 siblings, 0 replies; 9+ messages in thread
From: Steven Rostedt @ 2014-06-17  4:01 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: David Rientjes, Andrew Morton, Michal Nazarewicz,
	Hagen Paul Pfeifer, linux-kernel

On Mon, 16 Jun 2014 20:21:20 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Mon, 16 Jun 2014 16:54:32 -0700 (PDT)
> David Rientjes <rientjes@google.com> wrote:
> 
> > 
> > 
> > On linux-next, allyesconfig has a 0.0001% savings as a result of the 
> > patch, but I'd be worried about the extra temp variable it allocates on 
> > the stack that is evident in the mm/slab.c disassembly unless all cases 
> > can be audited to show that we're not potentially deep.
> 
> A 0.0001% change means it's not worth changing, and we may be able to
> mark this up as a fluke in Michal's results.
> 
> I'll give it a try on my 4.6.3 compiler.
> 
> -- Steve
> 
> > 
> >    text	   data	    bss	    dec	    hex	filename
> > 108573045	23488016	51580928	183641989	af22785	vmlinux.before
> > 108572908	23488016	51580928	183641852	af226fc	vmlinux.after
> 

Here's my results:

   text    data     bss     dec     hex filename
108662851       23470256        51580928        183714035       af340f3 /tmp/vmlinux-orig
108662714       23470224        51580928        183713866	af3404a /tmp/vmlinux-patched

The patched version saved a total of 169 bytes.

Doing a diff on the vmlinux objdumps of do_fault_around, I get this:

  mov    -0x68(%rbp),%rdi
  mov    -0x70(%rbp),%r8d
- jae    ffffffff81302bf5 <do_fault_around+0xbd>
- cmp    %r15,%rax
- mov    %r15,%r14
- cmovbe %rax,%r14
- jmp    ffffffff81302c03 <do_fault_around+0xcb>
- cmp    %r14,%rax
- cmovbe %rax,%r14
- incq   0x8d384b5(%rip)        # ffffffff8a03b0b8 <high_memory+0x14e8>
- mov    0x8d384be(%rip),%rsi        # ffffffff8a03b0c8 <high_memory+0x14f8>
- mov    0x8d384bf(%rip),%rdx        # ffffffff8a03b0d0 <high_memory+0x1500>
- mov    0x8d384a8(%rip),%rax        # ffffffff8a03b0c0 <high_memory+0x14f0>
+ cmp    %rax,%r15
+ cmova  %rax,%r15
+ mov    0x8d384c0(%rip),%rax        # ffffffff8a03b0b8 <high_memory+0x14e8>
+ cmp    %r14,%r15
+ cmovbe %r15,%r14
  sub    %r12,%rsi


So for gcc 4.6.3 it does seem to produce nicer assembly. I haven't
tried it with other versions though (or with clang).

But I'll still give it a:

Acked-by: Steven Rostedt <rostedt@goodmis.org>

-- Steve

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] include: kernel.h: rewrite min3, max3 and clamp using min and max
  2014-06-16 21:07 [PATCH] include: kernel.h: rewrite min3, max3 and clamp using min and max Michal Nazarewicz
  2014-06-16 23:18 ` Andrew Morton
@ 2014-06-17 11:37 ` Hagen Paul Pfeifer
  1 sibling, 0 replies; 9+ messages in thread
From: Hagen Paul Pfeifer @ 2014-06-17 11:37 UTC (permalink / raw)
  To: Michal Nazarewicz; +Cc: Steven Rostedt, Andrew Morton, linux-kernel

Wow, back in the days where I submitted the patch I saw improvement in
the generated code (focus on number of instructions). Don't know what
happends with gcc in the meantime. Anyway, patch and analysis looks
good. So from a clean-patch perspective you get an:

Signed-off-by: Hagen Paul Pfeifer <hagen@jauu.net>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] include: kernel.h: rewrite min3, max3 and clamp using min and max
  2014-06-16 23:18 ` Andrew Morton
  2014-06-16 23:25   ` David Rientjes
@ 2014-06-17 13:49   ` Michal Nazarewicz
  1 sibling, 0 replies; 9+ messages in thread
From: Michal Nazarewicz @ 2014-06-17 13:49 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Steven Rostedt, Hagen Paul Pfeifer, linux-kernel

On Mon, Jun 16 2014, Andrew Morton <akpm@linux-foundation.org> wrote:
> On Mon, 16 Jun 2014 23:07:22 +0200 Michal Nazarewicz <mina86@mina86.com> wrote:
>
>> It appears that gcc is better at optimising a double call to min
>> and max rather than open coded min3 and max3.  This can be observed
>> here:
>> 
>> ...
>>
>> Furthermore, after ___make allmodconfig && make bzImage modules___ this is the
>> comparison of image and modules sizes:
>> 
>>     # Without this patch applied
>>     $ ls -l arch/x86/boot/bzImage **/*.ko |awk '{size += $5} END {print size}'
>>     350715800
>> 
>>     # With this patch applied
>>     $ ls -l arch/x86/boot/bzImage **/*.ko |awk '{size += $5} END {print size}'
>>     349856528
>
> We saved nearly a megabyte by optimising min3(), max3() and clamp()? 
>
> I'm counting a grand total of 182 callsites for those macros.  So the
> saving is 4700 bytes per invokation?  I don't believe it...

You're absolutely right.  I must have messed something up here.  This
portion of the commit message should be removed.

So I've redone this just on just the kernel image with allmodconfig:

    Linux mpn-glaptop 3.13.0-29-generic #53~precise1-Ubuntu SMP Wed Jun 4 22:06:25 UTC 2014 x86_64 x86_64 x86_64 GNU/Linux
    gcc (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3
    Copyright (C) 2011 Free Software Foundation, Inc.
    This is free software; see the source for copying conditions.  There is NO
    warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
    
    -rwx------ 1 mpn eng 51224656 Jun 17 14:15 vmlinux.before
    -rwx------ 1 mpn eng 51224608 Jun 17 13:57 vmlinux.after

48 bytes reduction.  The do_fault_around was a few instruction shorter
and as far as I can tell saved 12 bytes on the stack, i.e.:

    $ grep -e rsp -e pop -e push do_fault_around.*
    do_fault_around.before.s:push   %rbp
    do_fault_around.before.s:mov    %rsp,%rbp
    do_fault_around.before.s:push   %r13
    do_fault_around.before.s:push   %r12
    do_fault_around.before.s:push   %rbx
    do_fault_around.before.s:sub    $0x38,%rsp
    do_fault_around.before.s:add    $0x38,%rsp
    do_fault_around.before.s:pop    %rbx
    do_fault_around.before.s:pop    %r12
    do_fault_around.before.s:pop    %r13
    do_fault_around.before.s:pop    %rbp

    do_fault_around.after.s:push   %rbp
    do_fault_around.after.s:mov    %rsp,%rbp
    do_fault_around.after.s:push   %r12
    do_fault_around.after.s:push   %rbx
    do_fault_around.after.s:sub    $0x30,%rsp
    do_fault_around.after.s:add    $0x30,%rsp
    do_fault_around.after.s:pop    %rbx
    do_fault_around.after.s:pop    %r12
    do_fault_around.after.s:pop    %rbp

or here side-by-side:

    Before                    After
    push   %rbp               push   %rbp          
    mov    %rsp,%rbp          mov    %rsp,%rbp     
    push   %r13                                    
    push   %r12               push   %r12          
    push   %rbx               push   %rbx          
    sub    $0x38,%rsp         sub    $0x30,%rsp    
    add    $0x38,%rsp         add    $0x30,%rsp    
    pop    %rbx               pop    %rbx          
    pop    %r12               pop    %r12          
    pop    %r13                                    
    pop    %rbp               pop    %rbp          

There are also fewer branches:

    $ grep ^j do_fault_around.*
    do_fault_around.before.s:jae    ffffffff812079b7
    do_fault_around.before.s:jmp    ffffffff812079c5
    do_fault_around.before.s:jmp    ffffffff81207a14
    do_fault_around.before.s:ja     ffffffff812079f9
    do_fault_around.before.s:jb     ffffffff81207a10
    do_fault_around.before.s:jmp    ffffffff81207a63
    do_fault_around.before.s:jne    ffffffff812079df
    
    do_fault_around.after.s:jmp    ffffffff812079fd
    do_fault_around.after.s:ja     ffffffff812079e2
    do_fault_around.after.s:jb     ffffffff812079f9
    do_fault_around.after.s:jmp    ffffffff81207a4c
    do_fault_around.after.s:jne    ffffffff812079c8

And here's with allyesconfig on a different machine:

    $ uname -a; gcc --version; ls -l vmlinux.*
    Linux erwin 3.14.7-mn #54 SMP Sun Jun 15 11:25:08 CEST 2014 x86_64 AMD Phenom(tm) II X3 710 Processor AuthenticAMD GNU/Linux
    gcc (GCC) 4.8.3
    Copyright (C) 2013 Free Software Foundation, Inc.
    This is free software; see the source for copying conditions.  There is NO
    warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
    
    -rwx------ 1 mina86 mina86 230616126 Jun 17 15:39 vmlinux.before*
    -rwx------ 1 mina86 mina86 230614861 Jun 17 14:36 vmlinux.after*

1265 bytes reduction.

-- 
Best regards,                                         _     _
.o. | Liege of Serenely Enlightened Majesty of      o' \,=./ `o
..o | Computer Science,  Michał “mina86” Nazarewicz    (o o)
ooo +--<mpn@google.com>--<xmpp:mina86@jabber.org>--ooO--(_)--Ooo--

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2014-06-17 13:49 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-06-16 21:07 [PATCH] include: kernel.h: rewrite min3, max3 and clamp using min and max Michal Nazarewicz
2014-06-16 23:18 ` Andrew Morton
2014-06-16 23:25   ` David Rientjes
2014-06-16 23:35     ` Andrew Morton
2014-06-16 23:54       ` David Rientjes
2014-06-17  0:21         ` Steven Rostedt
2014-06-17  4:01           ` Steven Rostedt
2014-06-17 13:49   ` Michal Nazarewicz
2014-06-17 11:37 ` Hagen Paul Pfeifer

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).