All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] force inlining of spinlock ops
@ 2015-05-11 17:57 Denys Vlasenko
  2015-05-11 18:53 ` Josh Triplett
                   ` (2 more replies)
  0 siblings, 3 replies; 16+ messages in thread
From: Denys Vlasenko @ 2015-05-11 17:57 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Denys Vlasenko, Thomas Graf, David S. Miller, Bart Van Assche,
	Peter Zijlstra, David Rientjes, Andrew Morton, Oleg Nesterov,
	Paul E. McKenney, Ingo Molnar, linux-kernel

With both gcc 4.7.2 and 4.9.2, sometimes gcc mysteriously doesn't inline
very small functions we expect to be inlined. In particular,
with this config: http://busybox.net/~vda/kernel_config
there are more than a thousand copies of tiny spinlock-related functions:

$ nm --size-sort vmlinux | grep -iF ' t ' | uniq -c | grep -v '^ *1 ' | sort -rn | grep ' spin'
    473 000000000000000b t spin_unlock_irqrestore
    292 000000000000000b t spin_unlock
    215 000000000000000b t spin_lock
    134 000000000000000b t spin_unlock_irq
    130 000000000000000b t spin_unlock_bh
    120 000000000000000b t spin_lock_irq
    106 000000000000000b t spin_lock_bh

Disassembly:

ffffffff81004720 <spin_lock>:
ffffffff81004720:       55                      push   %rbp
ffffffff81004721:       48 89 e5                mov    %rsp,%rbp
ffffffff81004724:       e8 f8 4e e2 02          callq  <_raw_spin_lock>
ffffffff81004729:       5d                      pop    %rbp
ffffffff8100472a:       c3                      retq

This patch fixes this via s/inline/__always_inline/ in spinlock.h.
This decreases vmlinux by about 30k:

    text     data      bss       dec     hex filename
82375570 22255544 20627456 125258570 7774b4a vmlinux.before
82335059 22255416 20627456 125217931 776ac8b vmlinux

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: David S. Miller <davem@davemloft.net>
Cc: Bart Van Assche <bvanassche@acm.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: David Rientjes <rientjes@google.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
CC: linux-kernel@vger.kernel.org
---
 include/linux/spinlock.h | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 3e18379..073925d 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -296,7 +296,7 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
  * Map the spin_lock functions to the raw variants for PREEMPT_RT=n
  */
 
-static inline raw_spinlock_t *spinlock_check(spinlock_t *lock)
+static __always_inline raw_spinlock_t *spinlock_check(spinlock_t *lock)
 {
 	return &lock->rlock;
 }
@@ -307,17 +307,17 @@ do {							\
 	raw_spin_lock_init(&(_lock)->rlock);		\
 } while (0)
 
-static inline void spin_lock(spinlock_t *lock)
+static __always_inline void spin_lock(spinlock_t *lock)
 {
 	raw_spin_lock(&lock->rlock);
 }
 
-static inline void spin_lock_bh(spinlock_t *lock)
+static __always_inline void spin_lock_bh(spinlock_t *lock)
 {
 	raw_spin_lock_bh(&lock->rlock);
 }
 
-static inline int spin_trylock(spinlock_t *lock)
+static __always_inline int spin_trylock(spinlock_t *lock)
 {
 	return raw_spin_trylock(&lock->rlock);
 }
@@ -337,7 +337,7 @@ do {									\
 	raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock);	\
 } while (0)
 
-static inline void spin_lock_irq(spinlock_t *lock)
+static __always_inline void spin_lock_irq(spinlock_t *lock)
 {
 	raw_spin_lock_irq(&lock->rlock);
 }
@@ -352,32 +352,32 @@ do {									\
 	raw_spin_lock_irqsave_nested(spinlock_check(lock), flags, subclass); \
 } while (0)
 
-static inline void spin_unlock(spinlock_t *lock)
+static __always_inline void spin_unlock(spinlock_t *lock)
 {
 	raw_spin_unlock(&lock->rlock);
 }
 
-static inline void spin_unlock_bh(spinlock_t *lock)
+static __always_inline void spin_unlock_bh(spinlock_t *lock)
 {
 	raw_spin_unlock_bh(&lock->rlock);
 }
 
-static inline void spin_unlock_irq(spinlock_t *lock)
+static __always_inline void spin_unlock_irq(spinlock_t *lock)
 {
 	raw_spin_unlock_irq(&lock->rlock);
 }
 
-static inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
+static __always_inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
 {
 	raw_spin_unlock_irqrestore(&lock->rlock, flags);
 }
 
-static inline int spin_trylock_bh(spinlock_t *lock)
+static __always_inline int spin_trylock_bh(spinlock_t *lock)
 {
 	return raw_spin_trylock_bh(&lock->rlock);
 }
 
-static inline int spin_trylock_irq(spinlock_t *lock)
+static __always_inline int spin_trylock_irq(spinlock_t *lock)
 {
 	return raw_spin_trylock_irq(&lock->rlock);
 }
@@ -387,22 +387,22 @@ static inline int spin_trylock_irq(spinlock_t *lock)
 	raw_spin_trylock_irqsave(spinlock_check(lock), flags); \
 })
 
-static inline void spin_unlock_wait(spinlock_t *lock)
+static __always_inline void spin_unlock_wait(spinlock_t *lock)
 {
 	raw_spin_unlock_wait(&lock->rlock);
 }
 
-static inline int spin_is_locked(spinlock_t *lock)
+static __always_inline int spin_is_locked(spinlock_t *lock)
 {
 	return raw_spin_is_locked(&lock->rlock);
 }
 
-static inline int spin_is_contended(spinlock_t *lock)
+static __always_inline int spin_is_contended(spinlock_t *lock)
 {
 	return raw_spin_is_contended(&lock->rlock);
 }
 
-static inline int spin_can_lock(spinlock_t *lock)
+static __always_inline int spin_can_lock(spinlock_t *lock)
 {
 	return raw_spin_can_lock(&lock->rlock);
 }
-- 
1.8.1.4


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH] force inlining of spinlock ops
  2015-05-11 17:57 [PATCH] force inlining of spinlock ops Denys Vlasenko
@ 2015-05-11 18:53 ` Josh Triplett
  2015-05-11 22:19 ` Andrew Morton
  2015-05-12  7:44 ` Ingo Molnar
  2 siblings, 0 replies; 16+ messages in thread
From: Josh Triplett @ 2015-05-11 18:53 UTC (permalink / raw)
  To: Denys Vlasenko
  Cc: Linus Torvalds, Thomas Graf, David S. Miller, Bart Van Assche,
	Peter Zijlstra, David Rientjes, Andrew Morton, linux-kernel

On Mon, May 11, 2015 at 07:57:22PM +0200, Denys Vlasenko wrote:
> With both gcc 4.7.2 and 4.9.2, sometimes gcc mysteriously doesn't inline
> very small functions we expect to be inlined. In particular,
> with this config: http://busybox.net/~vda/kernel_config
> there are more than a thousand copies of tiny spinlock-related functions:
> 
> $ nm --size-sort vmlinux | grep -iF ' t ' | uniq -c | grep -v '^ *1 ' | sort -rn | grep ' spin'
>     473 000000000000000b t spin_unlock_irqrestore
>     292 000000000000000b t spin_unlock
>     215 000000000000000b t spin_lock
>     134 000000000000000b t spin_unlock_irq
>     130 000000000000000b t spin_unlock_bh
>     120 000000000000000b t spin_lock_irq
>     106 000000000000000b t spin_lock_bh
> 
> Disassembly:
> 
> ffffffff81004720 <spin_lock>:
> ffffffff81004720:       55                      push   %rbp
> ffffffff81004721:       48 89 e5                mov    %rsp,%rbp
> ffffffff81004724:       e8 f8 4e e2 02          callq  <_raw_spin_lock>
> ffffffff81004729:       5d                      pop    %rbp
> ffffffff8100472a:       c3                      retq

Frame pointers make this even more awful, since without them this could
just become a single jmp.  (Assuming _raw_spin_lock shouldn't be
inlined too.)

> This patch fixes this via s/inline/__always_inline/ in spinlock.h.
> This decreases vmlinux by about 30k:
> 
>     text     data      bss       dec     hex filename
> 82375570 22255544 20627456 125258570 7774b4a vmlinux.before
> 82335059 22255416 20627456 125217931 776ac8b vmlinux

Nice improvement.  Given that this actually makes the kernel *smaller*,
presumably in addition to faster, this forced inlining seems completely
reasonable.

> Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
> Cc: Thomas Graf <tgraf@suug.ch>
> Cc: David S. Miller <davem@davemloft.net>
> Cc: Bart Van Assche <bvanassche@acm.org>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: David Rientjes <rientjes@google.com>
> Cc: David S. Miller <davem@davemloft.net>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Linus Torvalds <torvalds@linux-foundation.org>
> Cc: Oleg Nesterov <oleg@redhat.com>
> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> Cc: Ingo Molnar <mingo@kernel.org>
> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> CC: linux-kernel@vger.kernel.org

Reviewed-by: Josh Triplett <josh@joshtriplett.org>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] force inlining of spinlock ops
  2015-05-11 17:57 [PATCH] force inlining of spinlock ops Denys Vlasenko
  2015-05-11 18:53 ` Josh Triplett
@ 2015-05-11 22:19 ` Andrew Morton
  2015-05-12  8:16   ` Hagen Paul Pfeifer
  2015-05-12  9:44   ` Denys Vlasenko
  2015-05-12  7:44 ` Ingo Molnar
  2 siblings, 2 replies; 16+ messages in thread
From: Andrew Morton @ 2015-05-11 22:19 UTC (permalink / raw)
  To: Denys Vlasenko
  Cc: Linus Torvalds, Thomas Graf, David S. Miller, Bart Van Assche,
	Peter Zijlstra, David Rientjes, Oleg Nesterov, Paul E. McKenney,
	Ingo Molnar, linux-kernel, Hagen Paul Pfeifer

On Mon, 11 May 2015 19:57:22 +0200 Denys Vlasenko <dvlasenk@redhat.com> wrote:

> With both gcc 4.7.2 and 4.9.2, sometimes gcc mysteriously doesn't inline
> very small functions we expect to be inlined. In particular,
> with this config: http://busybox.net/~vda/kernel_config
> there are more than a thousand copies of tiny spinlock-related functions:
> 
> $ nm --size-sort vmlinux | grep -iF ' t ' | uniq -c | grep -v '^ *1 ' | sort -rn | grep ' spin'
>     473 000000000000000b t spin_unlock_irqrestore
>     292 000000000000000b t spin_unlock
>     215 000000000000000b t spin_lock
>     134 000000000000000b t spin_unlock_irq
>     130 000000000000000b t spin_unlock_bh
>     120 000000000000000b t spin_lock_irq
>     106 000000000000000b t spin_lock_bh
> 
> Disassembly:
> 
> ffffffff81004720 <spin_lock>:
> ffffffff81004720:       55                      push   %rbp
> ffffffff81004721:       48 89 e5                mov    %rsp,%rbp
> ffffffff81004724:       e8 f8 4e e2 02          callq  <_raw_spin_lock>
> ffffffff81004729:       5d                      pop    %rbp
> ffffffff8100472a:       c3                      retq
> 
> This patch fixes this via s/inline/__always_inline/ in spinlock.h.
> This decreases vmlinux by about 30k:
> 
>     text     data      bss       dec     hex filename
> 82375570 22255544 20627456 125258570 7774b4a vmlinux.before
> 82335059 22255416 20627456 125217931 776ac8b vmlinux

See also https://lkml.org/lkml/2015/4/23/598 ("enforce function
inlining for hot functions").

Presumably Hagen didn't see the issue with spinlock functions.  I
wonder why not.

I suppose we should get both these consolidated into a coherent whole.

It's a bit irritating to have to do this: presumably gcc will get fixed
and the huge sprinkling of __always_inline will become less and less
relevant over time and people will have trouble distinguishing "real
__always_inline which was put here for a purpose" from "dopey
__always_inline to work around a short-term gcc glitch".

__always_inline is one of those things where a usage site should always
be commented, because it's near impossible to work out why someone
chose to use it.  Quick, tell me what's happening in include/linux/slab.h.




Perhaps we should do

/*
 * Comment goes here.  It is very specific about gcc versions.
 */
#define inline_for_broken_gcc __always_inline

and then use inline_for_broken_gcc everywhere.  That way, the reason
for the marker is self-explanatory and we can later hunt all these
things down and remove them.

Also, the inline_for_broken_gcc definition can be made dependent on
particular gcc versions, which will allow us to easily keep an eye on
the behaviour of later gcc versions.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] force inlining of spinlock ops
  2015-05-11 17:57 [PATCH] force inlining of spinlock ops Denys Vlasenko
  2015-05-11 18:53 ` Josh Triplett
  2015-05-11 22:19 ` Andrew Morton
@ 2015-05-12  7:44 ` Ingo Molnar
  2015-05-12 11:02   ` Denys Vlasenko
  2 siblings, 1 reply; 16+ messages in thread
From: Ingo Molnar @ 2015-05-12  7:44 UTC (permalink / raw)
  To: Denys Vlasenko
  Cc: Linus Torvalds, Thomas Graf, David S. Miller, Bart Van Assche,
	Peter Zijlstra, David Rientjes, Andrew Morton, Oleg Nesterov,
	Paul E. McKenney, linux-kernel


* Denys Vlasenko <dvlasenk@redhat.com> wrote:

> With both gcc 4.7.2 and 4.9.2, sometimes gcc mysteriously doesn't inline
> very small functions we expect to be inlined. In particular,
> with this config: http://busybox.net/~vda/kernel_config
> there are more than a thousand copies of tiny spinlock-related functions:

That's an x86-64 allyesconfig AFAICS, right?

It's not mysterious, but an effect of -Os plus allowing GCC to do 
inlining heuristics:

  CONFIG_CC_OPTIMIZE_FOR_SIZE=y
  CONFIG_OPTIMIZE_INLINING=y

Does the problem go away if you unset one of these config options?

Furthermore, what is the size win on x86 defconfig with these options 
set? allyesconfig has all sorts of crazy stuff enabled while defconfig 
on x86 tries to track typical distro configs.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] force inlining of spinlock ops
  2015-05-11 22:19 ` Andrew Morton
@ 2015-05-12  8:16   ` Hagen Paul Pfeifer
  2015-05-12  9:44   ` Denys Vlasenko
  1 sibling, 0 replies; 16+ messages in thread
From: Hagen Paul Pfeifer @ 2015-05-12  8:16 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Denys Vlasenko, Linus Torvalds, Thomas Graf, David S. Miller,
	Bart Van Assche, Peter Zijlstra, David Rientjes, Oleg Nesterov,
	Paul E. McKenney, Ingo Molnar, linux-kernel

* Andrew Morton | 2015-05-11 15:19:13 [-0700]:

>Presumably Hagen didn't see the issue with spinlock functions.  I
>wonder why not.

I think it is a compiler version thing. Not sure why I didn't see it.

>I suppose we should get both these consolidated into a coherent whole.

+1 (let's wait for a moment and delay patch inclusion)

>It's a bit irritating to have to do this: presumably gcc will get fixed
>and the huge sprinkling of __always_inline will become less and less
>relevant over time and people will have trouble distinguishing "real
>__always_inline which was put here for a purpose" from "dopey
>__always_inline to work around a short-term gcc glitch".
>
>__always_inline is one of those things where a usage site should always
>be commented, because it's near impossible to work out why someone
>chose to use it.  Quick, tell me what's happening in include/linux/slab.h.
>
>
>Perhaps we should do
>
>/*
> * Comment goes here.  It is very specific about gcc versions.
> */
>#define inline_for_broken_gcc __always_inline

yeah, but name it in a compiler independent way. Sometimes we may see similar
misbehavior with clang too. But see my other comments

#define inline_for_broken_cc __always_inline

>and then use inline_for_broken_gcc everywhere.  That way, the reason
>for the marker is self-explanatory and we can later hunt all these
>things down and remvoe them.
>
>Also, the inline_for_broken_gcc definition can be made dependent on
>particular gcc versions, which will allow us to easily keep an eye on
>the behaviour of later gcc versions.

Mhh, I am not a big fan of this. I think we maneuver into an unmaintainable
area with this approach. We must test and check this for all compiler versions,
new versions, all kinds of compiler flags, etc.

Another idea: we are talking about roughly 50 functions where inlining is mission
critical (and correct) but gcc sometimes has trouble doing so. Why not
enforce __always_inline there?  E.g. annotate these rare functions with
enforce_inline to highlight that these functions are always inlined. No matter
what optimization and what compiler flags:

#define enforce_inline __always_inline

Developers are encouraged to use inline - because then the compiler can decide
based on his algorithms/heuristics if a function should be inlined or not. For
some really hot & short function the developer can use enforce_inline
- but this should be an exception.

Hagen


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] force inlining of spinlock ops
  2015-05-11 22:19 ` Andrew Morton
  2015-05-12  8:16   ` Hagen Paul Pfeifer
@ 2015-05-12  9:44   ` Denys Vlasenko
  2015-05-12  9:48     ` Ingo Molnar
  1 sibling, 1 reply; 16+ messages in thread
From: Denys Vlasenko @ 2015-05-12  9:44 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, Thomas Graf, David S. Miller, Bart Van Assche,
	Peter Zijlstra, David Rientjes, Oleg Nesterov, Paul E. McKenney,
	Ingo Molnar, linux-kernel, Hagen Paul Pfeifer

On 05/12/2015 12:19 AM, Andrew Morton wrote:
> On Mon, 11 May 2015 19:57:22 +0200 Denys Vlasenko <dvlasenk@redhat.com> wrote:
> 
>> With both gcc 4.7.2 and 4.9.2, sometimes gcc mysteriously doesn't inline
>> very small functions we expect to be inlined. In particular,
>> with this config: http://busybox.net/~vda/kernel_config
>> there are more than a thousand copies of tiny spinlock-related functions:
>>
>> $ nm --size-sort vmlinux | grep -iF ' t ' | uniq -c | grep -v '^ *1 ' | sort -rn | grep ' spin'
>>     473 000000000000000b t spin_unlock_irqrestore
>>     292 000000000000000b t spin_unlock
>>     215 000000000000000b t spin_lock
>>     134 000000000000000b t spin_unlock_irq
>>     130 000000000000000b t spin_unlock_bh
>>     120 000000000000000b t spin_lock_irq
>>     106 000000000000000b t spin_lock_bh
>>
>> Disassembly:
>>
>> ffffffff81004720 <spin_lock>:
>> ffffffff81004720:       55                      push   %rbp
>> ffffffff81004721:       48 89 e5                mov    %rsp,%rbp
>> ffffffff81004724:       e8 f8 4e e2 02          callq  <_raw_spin_lock>
>> ffffffff81004729:       5d                      pop    %rbp
>> ffffffff8100472a:       c3                      retq
>>
>> This patch fixes this via s/inline/__always_inline/ in spinlock.h.
>> This decreases vmlinux by about 30k:
>>
>>     text     data      bss       dec     hex filename
>> 82375570 22255544 20627456 125258570 7774b4a vmlinux.before
>> 82335059 22255416 20627456 125217931 776ac8b vmlinux
> 
> See also https://lkml.org/lkml/2015/4/23/598 ("enforce function
> inlining for hot functions").
> 
> Presumably Hagen didn't see the issue with spinlock functions.  I
> wonder why not.
> 
> I suppose we should get both these consolidated into a coherent whole.
> 
> It's a bit irritating to have to do this: presumably gcc will get fixed
> and the huge sprinkling of __always_inline will become less and less
> relevant over time and people will have trouble distinguishing "real
> __always_inline which was put here for a purpose" from "dopey
> __always_inline to work around a short-term gcc glitch".

In my patches, I put __always_inline *only* on functions
where my measurements show a large size decrease from doing so.
*Not* on functions where "I think it may be a good idea".

So far, all such functions were so trivial that inlining decision there
is a no-brainer.

> and then use inline_for_broken_gcc everywhere.  That way, the reason
> for the marker is self-explanatory and we can later hunt all these
> things down and remvoe them.
> 
> Also, the inline_for_broken_gcc definition can be made dependent on
> particular gcc versions, which will allow us to easily keep an eye on
> the behaviour of later gcc versions.

I've seen it on gcc-4.7.2 and gcc-4.9.2, so this behavior is not
limited to a narrow range of gcc versions. I'd say by now about half
of running kernels can easily be affected.

-- 
vda


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] force inlining of spinlock ops
  2015-05-12  9:44   ` Denys Vlasenko
@ 2015-05-12  9:48     ` Ingo Molnar
  0 siblings, 0 replies; 16+ messages in thread
From: Ingo Molnar @ 2015-05-12  9:48 UTC (permalink / raw)
  To: Denys Vlasenko
  Cc: Andrew Morton, Linus Torvalds, Thomas Graf, David S. Miller,
	Bart Van Assche, Peter Zijlstra, David Rientjes, Oleg Nesterov,
	Paul E. McKenney, linux-kernel, Hagen Paul Pfeifer


* Denys Vlasenko <dvlasenk@redhat.com> wrote:

> > Also, the inline_for_broken_gcc definition can be made dependent 
> > on particular gcc versions, which will allow us to easily keep an 
> > eye on the behaviour of later gcc versions.
> 
> I've seen it on gcc-4.7.2 and gcc-4.9.2, so this behavior is not 
> limited to a narrow range of gcc versions. I'd say by now about half 
> of running kernels can easily be affected.

Please do the measurements on x86 defconfig (with OPTIMIZE_FOR_SIZE 
and OPTIMIZE_INLINING enabled if necessary), to make sure we are truly 
getting a decrease in kernel size on common distro configs as well.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] force inlining of spinlock ops
  2015-05-12  7:44 ` Ingo Molnar
@ 2015-05-12 11:02   ` Denys Vlasenko
  2015-05-12 11:43     ` Ingo Molnar
  0 siblings, 1 reply; 16+ messages in thread
From: Denys Vlasenko @ 2015-05-12 11:02 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Thomas Graf, David S. Miller, Bart Van Assche,
	Peter Zijlstra, David Rientjes, Andrew Morton, Oleg Nesterov,
	Paul E. McKenney, linux-kernel

On 05/12/2015 09:44 AM, Ingo Molnar wrote:
> 
> * Denys Vlasenko <dvlasenk@redhat.com> wrote:
> 
>> With both gcc 4.7.2 and 4.9.2, sometimes gcc mysteriously doesn't inline
>> very small functions we expect to be inlined. In particular,
>> with this config: http://busybox.net/~vda/kernel_config
>> there are more than a thousand copies of tiny spinlock-related functions:
> 
> That's an x86-64 allyesconfig AFAICS, right?

Close, but I disabled options which are clearly "heavy debugging" stuff.
IOW: many developers run their work machines with lock debugging etc,
but few would constantly use something which slows the kernel down by a factor of 3!

So, CONFIG_KASAN is off. CONFIG_STAGING is also off. And a few others I forgot.

I'm using this config to see which inlines should be deinlined.
For that, I need to cover all callsites of each inline.
Thus, I need ~allyesconfig.

The discovery that there also exists the opposite problem (wrongly
*un*inlined functions) was accidental.


> It's not mysterious, but an effect of -Os plus allowing GCC to do 
> inlining heuristics:
> 
>   CONFIG_CC_OPTIMIZE_FOR_SIZE=y
>   CONFIG_OPTIMIZE_INLINING=y
> 
> Does the problem go away if you unset of these config options?

With CONFIG_CC_OPTIMIZE_FOR_SIZE off,
problem greatly diminishes, but is not eliminated.
Testing allyesconfig would take too long, so I just took defconfig.

On defconfig kernel, the following functions below 16 bytes
of machine code are auto-deinlined:

#Calls_ Size(hex)_______   Name____________________
      7 000000000000000b t hweight_long
      5 000000000000000f t init_once
      4 000000000000000d t cpumask_set_cpu
      4 000000000000000b t udp_lib_close
      4 0000000000000006 t udp_lib_hash
      3 000000000000000a t nofill
      3 0000000000000006 t sg_set_page.part.7
      2 000000000000000f t udplite_sk_init
      2 000000000000000f t ct_seq_next
      2 000000000000000e t encode_cookie
      2 000000000000000d t ktime_get_real
      2 000000000000000b t spin_lock
      2 000000000000000b t device_create_release
      2 000000000000000b t cpu_smt_flags
      2 000000000000000b t cpu_core_flags
      2 0000000000000009 t default_write_file
      2 0000000000000008 t __initcall_pl_driver_init6
      2 0000000000000008 t __initcall_nf_defrag_init6
      2 0000000000000008 t __initcall_hid_init6
      2 0000000000000008 t __initcall_ch_driver_init6
      2 0000000000000008 t default_read_file
      2 0000000000000006 t wiphy_to_rdev.part.4
      2 0000000000000006 t s_stop
      2 0000000000000006 t sg_set_page.part.3
      2 0000000000000006 t generic_print_tuple
      2 0000000000000006 t exp_seq_stop
      2 0000000000000006 t ct_seq_stop
      2 0000000000000006 t ct_cpu_seq_stop

In particular, one of the functions from my patches,
spin_lock(), has been auto-deinlined:

ffffffff8108adb0 <spin_lock>:
ffffffff8108adb0:       55                      push   %rbp
ffffffff8108adb1:       48 89 e5                mov    %rsp,%rbp
ffffffff8108adb4:       e8 37 db 81 00          callq  ffffffff818a88f0 <_raw_spin_lock>
ffffffff8108adb9:       5d                      pop    %rbp
ffffffff8108adba:       c3                      retq


> Furtermore, what is the size win on x86 defconfig with these options 
> set?

CONFIG_OPTIMIZE_INLINING=y is in defconfig.

Size difference for CC_OPTIMIZE_FOR_SIZE:

    text    data     bss      dec    hex filename
12335864 1746152 1081344 15163360 e75fe0 vmlinux.CC_OPTIMIZE_FOR_SIZE=y
10373764 1684200 1077248 13135212 c86d6c vmlinux.CC_OPTIMIZE_FOR_SIZE=n

Decrease by about 19%.

-- 
vda


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] force inlining of spinlock ops
  2015-05-12 11:02   ` Denys Vlasenko
@ 2015-05-12 11:43     ` Ingo Molnar
  2015-05-12 13:13       ` Denys Vlasenko
  0 siblings, 1 reply; 16+ messages in thread
From: Ingo Molnar @ 2015-05-12 11:43 UTC (permalink / raw)
  To: Denys Vlasenko
  Cc: Linus Torvalds, Thomas Graf, David S. Miller, Bart Van Assche,
	Peter Zijlstra, David Rientjes, Andrew Morton, Oleg Nesterov,
	Paul E. McKenney, linux-kernel


* Denys Vlasenko <dvlasenk@redhat.com> wrote:

> On 05/12/2015 09:44 AM, Ingo Molnar wrote:
> > 
> > * Denys Vlasenko <dvlasenk@redhat.com> wrote:
> > 
> >> With both gcc 4.7.2 and 4.9.2, sometimes gcc mysteriously doesn't inline
> >> very small functions we expect to be inlined. In particular,
> >> with this config: http://busybox.net/~vda/kernel_config
> >> there are more than a thousand copies of tiny spinlock-related functions:
> > 
> > That's an x86-64 allyesconfig AFAICS, right?
> 
> Close, but I disabled options which are clearly "heavy debugging" stuff.
> IOW: many developers run their work machines with lock debugging etc,
> but few would constantly use something which slows kernel down by a factor of 3!
> 
> So, CONFIG_KASAN is off. CONFIG_STAGING is also off. And a few others I forgot.
> 
> I'm using this config to see which inlines should be deinlined.
> For that, I need to cover all callsites of each inline.
> Thus, I need ~allyesconfig.
> 
> The discovery that there also exists the opposite problem (wrongly
> *un*inlined functions) was accidental.
> 
> 
> > It's not mysterious, but an effect of -Os plus allowing GCC to do 
> > inlining heuristics:
> > 
> >   CONFIG_CC_OPTIMIZE_FOR_SIZE=y
> >   CONFIG_OPTIMIZE_INLINING=y
> > 
> > Does the problem go away if you unset of these config options?
> 
> With CONFIG_CC_OPTIMIZE_FOR_SIZE off,
> problem greatly diminishes, but is not eliminated.
> Testing allyesconfig would take too long, so I just took defconfig.
> 
> On defconfig kernel, the following functions below 16 bytes
> of machine code are auto-deinlined:
> 
> #Calls_ Size(hex)_______   Name____________________
>       7 000000000000000b t hweight_long
>       5 000000000000000f t init_once
>       4 000000000000000d t cpumask_set_cpu
>       4 000000000000000b t udp_lib_close
>       4 0000000000000006 t udp_lib_hash
>       3 000000000000000a t nofill
>       3 0000000000000006 t sg_set_page.part.7
>       2 000000000000000f t udplite_sk_init
>       2 000000000000000f t ct_seq_next
>       2 000000000000000e t encode_cookie
>       2 000000000000000d t ktime_get_real
>       2 000000000000000b t spin_lock
>       2 000000000000000b t device_create_release
>       2 000000000000000b t cpu_smt_flags
>       2 000000000000000b t cpu_core_flags
>       2 0000000000000009 t default_write_file
>       2 0000000000000008 t __initcall_pl_driver_init6
>       2 0000000000000008 t __initcall_nf_defrag_init6
>       2 0000000000000008 t __initcall_hid_init6
>       2 0000000000000008 t __initcall_ch_driver_init6
>       2 0000000000000008 t default_read_file
>       2 0000000000000006 t wiphy_to_rdev.part.4
>       2 0000000000000006 t s_stop
>       2 0000000000000006 t sg_set_page.part.3
>       2 0000000000000006 t generic_print_tuple
>       2 0000000000000006 t exp_seq_stop
>       2 0000000000000006 t ct_seq_stop
>       2 0000000000000006 t ct_cpu_seq_stop
> 
> In particular, one of the functions from my patches,
> spin_lock(), has been auto-deinlined:
> 
> ffffffff8108adb0 <spin_lock>:
> ffffffff8108adb0:       55                      push   %rbp
> ffffffff8108adb1:       48 89 e5                mov    %rsp,%rbp
> ffffffff8108adb4:       e8 37 db 81 00          callq  ffffffff818a88f0 <_raw_spin_lock>
> ffffffff8108adb9:       5d                      pop    %rbp
> ffffffff8108adba:       c3                      retq
> 
> 
> > Furtermore, what is the size win on x86 defconfig with these options 
> > set?
> 
> CONFIG_OPTIMIZE_INLINING=y is in defconfig.
> 
> Size difference for CC_OPTIMIZE_FOR_SIZE:
> 
>     text    data     bss      dec    hex filename
> 12335864 1746152 1081344 15163360 e75fe0 vmlinux.CC_OPTIMIZE_FOR_SIZE=y
> 10373764 1684200 1077248 13135212 c86d6c vmlinux.CC_OPTIMIZE_FOR_SIZE=n
> 
> Decrease by about 19%.

I suspect the 'filename' field wants to be flipped?

In any case, the interesting measurement would not be -Os comparisons 
(which causes GCC to be too crazy), but to see the size effect of your 
_patch_ that always-inlines spinlock ops, on plain defconfig and on 
defconfig-Os.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] force inlining of spinlock ops
  2015-05-12 11:43     ` Ingo Molnar
@ 2015-05-12 13:13       ` Denys Vlasenko
  2015-05-13 10:17         ` Ingo Molnar
  0 siblings, 1 reply; 16+ messages in thread
From: Denys Vlasenko @ 2015-05-12 13:13 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Thomas Graf, David S. Miller, Bart Van Assche,
	Peter Zijlstra, David Rientjes, Andrew Morton, Oleg Nesterov,
	Paul E. McKenney, linux-kernel

On 05/12/2015 01:43 PM, Ingo Molnar wrote:
>>> Furtermore, what is the size win on x86 defconfig with these options 
>>> set?
>>
>> CONFIG_OPTIMIZE_INLINING=y is in defconfig.
>>
>> Size difference for CC_OPTIMIZE_FOR_SIZE:
>>
>>     text    data     bss      dec    hex filename
>> 12335864 1746152 1081344 15163360 e75fe0 vmlinux.CC_OPTIMIZE_FOR_SIZE=y
>> 10373764 1684200 1077248 13135212 c86d6c vmlinux.CC_OPTIMIZE_FOR_SIZE=n
>>
>> Decrease by about 19%.
> 
> I suspect the 'filename' field wants to be flipped?

Yes.

> In any case, the interesting measurement would not be -Os comparisons 
> (which causes GCC to be too crazy), but to see the size effect of your 
> _patch_ that always-inlines spinlock ops, on plain defconfig and on 
> defconfig-Os.

Here it is:

    text    data     bss      dec    hex filename
12335864 1746152 1081344 15163360 e75fe0 vmlinuxO2.before
12335930 1746152 1081344 15163426 e76022 vmlinux

    text    data     bss      dec    hex filename
10373764 1684200 1077248 13135212 c86d6c vmlinuxOs.before
10363621 1684200 1077248 13125069 c845cd vmlinux


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] force inlining of spinlock ops
  2015-05-12 13:13       ` Denys Vlasenko
@ 2015-05-13 10:17         ` Ingo Molnar
  2015-05-13 10:28           ` Denys Vlasenko
  0 siblings, 1 reply; 16+ messages in thread
From: Ingo Molnar @ 2015-05-13 10:17 UTC (permalink / raw)
  To: Denys Vlasenko
  Cc: Linus Torvalds, Thomas Graf, David S. Miller, Bart Van Assche,
	Peter Zijlstra, David Rientjes, Andrew Morton, Oleg Nesterov,
	Paul E. McKenney, linux-kernel


* Denys Vlasenko <dvlasenk@redhat.com> wrote:

> On 05/12/2015 01:43 PM, Ingo Molnar wrote:
> >>> Furthermore, what is the size win on x86 defconfig with these options 
> >>> set?
> >>
> >> CONFIG_OPTIMIZE_INLINING=y is in defconfig.
> >>
> >> Size difference for CC_OPTIMIZE_FOR_SIZE:
> >>
> >>     text    data     bss      dec    hex filename
> >> 12335864 1746152 1081344 15163360 e75fe0 vmlinux.CC_OPTIMIZE_FOR_SIZE=y
> >> 10373764 1684200 1077248 13135212 c86d6c vmlinux.CC_OPTIMIZE_FOR_SIZE=n
> >>
> >> Decrease by about 19%.
> > 
> > I suspect the 'filename' field wants to be flipped?
> 
> Yes.
> 
> > In any case, the interesting measurement would not be -Os comparisons 
> > (which causes GCC to be too crazy), but to see the size effect of your 
> > _patch_ that always-inlines spinlock ops, on plain defconfig and on 
> > defconfig-Os.
> 
> Here it is:
> 
>     text    data     bss      dec    hex filename
> 12335864 1746152 1081344 15163360 e75fe0 vmlinuxO2.before
> 12335930 1746152 1081344 15163426 e76022 vmlinux

Hm, that's a (small) size increase on O2.

That might be a net positive though: because now we've eliminated 
quite a few function calls. Do we know which individual functions 
bloat and which debloat?

>     text    data     bss      dec    hex filename
> 10373764 1684200 1077248 13135212 c86d6c vmlinuxOs.before
> 10363621 1684200 1077248 13125069 c845cd vmlinux

A decrease - which gets exploded on allyesconfig.

So as long as the -O2 case does not get hurt we can do -Os fixes.

I think this needs a bit more work to ensure that the O2 case is a net 
win.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] force inlining of spinlock ops
  2015-05-13 10:17         ` Ingo Molnar
@ 2015-05-13 10:28           ` Denys Vlasenko
  2015-05-13 10:43             ` Ingo Molnar
  0 siblings, 1 reply; 16+ messages in thread
From: Denys Vlasenko @ 2015-05-13 10:28 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Thomas Graf, David S. Miller, Bart Van Assche,
	Peter Zijlstra, David Rientjes, Andrew Morton, Oleg Nesterov,
	Paul E. McKenney, linux-kernel

On 05/13/2015 12:17 PM, Ingo Molnar wrote:
>>> In any case, the interesting measurement would not be -Os comparisons 
>>> (which causes GCC to be too crazy), but to see the size effect of your 
>>> _patch_ that always-inlines spinlock ops, on plain defconfig and on 
>>> defconfig-Os.
>>
>> Here it is:
>>
>>     text    data     bss      dec    hex filename
>> 12335864 1746152 1081344 15163360 e75fe0 vmlinuxO2.before
>> 12335930 1746152 1081344 15163426 e76022 vmlinux
> 
> Hm, that's a (small) size increase on O2.
> 
> That might be a net positive though: because now we've eliminated 
> quite a few function calls. Do we know which individual functions 
> bloat and which debloat?

>>     text    data     bss      dec    hex filename
>> 10373764 1684200 1077248 13135212 c86d6c vmlinuxOs.before
>> 10363621 1684200 1077248 13125069 c845cd vmlinux
> 
> A decrease - which gets exploded on allyesconfig.
> 
> So as long as the -O2 case does not get hurt we can do -Os fixes.
> 
> I think this needs a bit more work to ensure that the O2 case is a net 
> win.

I think the O2 difference is just noise: with -O2, gcc is far less prone
to bogus deinlining, so my patch should have a negligible effect.
And the effect is indeed negligible: +70 bytes out of 12 megabytes.



^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] force inlining of spinlock ops
  2015-05-13 10:28           ` Denys Vlasenko
@ 2015-05-13 10:43             ` Ingo Molnar
  2015-05-13 14:09               ` Denys Vlasenko
  0 siblings, 1 reply; 16+ messages in thread
From: Ingo Molnar @ 2015-05-13 10:43 UTC (permalink / raw)
  To: Denys Vlasenko
  Cc: Linus Torvalds, Thomas Graf, David S. Miller, Bart Van Assche,
	Peter Zijlstra, David Rientjes, Andrew Morton, Oleg Nesterov,
	Paul E. McKenney, linux-kernel


* Denys Vlasenko <dvlasenk@redhat.com> wrote:

> On 05/13/2015 12:17 PM, Ingo Molnar wrote:
> >>> In any case, the interesting measurement would not be -Os comparisons 
> >>> (which causes GCC to be too crazy), but to see the size effect of your 
> >>> _patch_ that always-inlines spinlock ops, on plain defconfig and on 
> >>> defconfig-Os.
> >>
> >> Here it is:
> >>
> >>     text    data     bss      dec    hex filename
> >> 12335864 1746152 1081344 15163360 e75fe0 vmlinuxO2.before
> >> 12335930 1746152 1081344 15163426 e76022 vmlinux
> > 
> > Hm, that's a (small) size increase on O2.
> > 
> > That might be a net positive though: because now we've eliminated 
> > quite a few function calls. Do we know which individual functions 
> > bloat and which debloat?
> 
> >>     text    data     bss      dec    hex filename
> >> 10373764 1684200 1077248 13135212 c86d6c vmlinuxOs.before
> >> 10363621 1684200 1077248 13125069 c845cd vmlinux
> > 
> > A decrease - which gets exploded on allyesconfig.
> > 
> > So as long as the -O2 case does not get hurt we can do -Os fixes.
> > 
> > I think this needs a bit more work to ensure that the O2 case is a 
> > net win.
> 
> I think O2 difference is just noise: with -O2 gcc is far less prone 
> to bogus deinlining, my patch should have negligible effect. And 
> effect is indeed negligible: +70 bytes on 12 megabytes.

So the patch force-inlines about a dozen locking APIs:

 - Some of those decrease the defconfig kernel size.
   Which ones and by how much?

 - Some of those increase the defconfig kernel size.
   Which ones and by how much?

We only know that the net effect is +70 bytes. Does that come out of:

 - large fluctuations such as -1000-1000+1000+1070, which happens to 
   net out into a small net number?

 - or does it come from much smaller fluctuations?

So to make an informed decision we need to know those details. When I 
deinline or reinline functions I usually do it on a per-function 
basis, to avoid such ambiguity.

In the end what we want to have is only those deinlining/reinlining 
changes that decrease the defconfig kernel size, or at worst only 
increase it marginally.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] force inlining of spinlock ops
  2015-05-13 10:43             ` Ingo Molnar
@ 2015-05-13 14:09               ` Denys Vlasenko
  2015-05-15  7:20                 ` Heiko Carstens
  0 siblings, 1 reply; 16+ messages in thread
From: Denys Vlasenko @ 2015-05-13 14:09 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Thomas Graf, David S. Miller, Bart Van Assche,
	Peter Zijlstra, David Rientjes, Andrew Morton, Oleg Nesterov,
	Paul E. McKenney, linux-kernel

On 05/13/2015 12:43 PM, Ingo Molnar wrote:
> 
> * Denys Vlasenko <dvlasenk@redhat.com> wrote:
> 
>> On 05/13/2015 12:17 PM, Ingo Molnar wrote:
>>>>> In any case, the interesting measurement would not be -Os comparisons 
>>>>> (which causes GCC to be too crazy), but to see the size effect of your 
>>>>> _patch_ that always-inlines spinlock ops, on plain defconfig and on 
>>>>> defconfig-Os.
>>>>
>>>> Here it is:
>>>>
>>>>     text    data     bss      dec    hex filename
>>>> 12335864 1746152 1081344 15163360 e75fe0 vmlinuxO2.before
>>>> 12335930 1746152 1081344 15163426 e76022 vmlinux
>>>
>>> Hm, that's a (small) size increase on O2.
>>>
>>> That might be a net positive though: because now we've eliminated 
>>> quite a few function calls. Do we know which individual functions 
>>> bloat and which debloat?
>>
>>>>     text    data     bss      dec    hex filename
>>>> 10373764 1684200 1077248 13135212 c86d6c vmlinuxOs.before
>>>> 10363621 1684200 1077248 13125069 c845cd vmlinux
>>>
>>> A decrease - which gets exploded on allyesconfig.
>>>
>>> So as long as the -O2 case does not get hurt we can do -Os fixes.
>>>
>>> I think this needs a bit more work to ensure that the O2 case is a 
>>> net win.
>>
>> I think O2 difference is just noise: with -O2 gcc is far less prone 
>> to bogus deinlining, my patch should have negligible effect. And 
>> effect is indeed negligible: +70 bytes on 12 megabytes.
> 
> So the patch force-inlines about a dozen locking APIs:
> 
>  - Some of those decrease the defconfig kernel size.
>    Which ones and by how much?
> 
>  - Some of those increase the defconfig kernel size.
>    Which ones and by how much?
> 
> We only know that the net effect is +70 bytes. Does that come out of:
> 
>  - large fluctuations such as -1000-1000+1000+1070, which happens to 
>    net out into a small net number?
> 
>  - or does it come from much smaller fluctuations?
> 
> So to make an informed decision we need to know those details.

Fair enough. Let's investigate.

I produced a list of functions with their sizes from each vmlinux,
and diffed them:

$ nm --size-sort vmlinux | sed 's/\.[0-9]*.*/.NNN/' >vmlinux.nm
$ nm --size-sort vmlinuxO2.before | sed 's/\.[0-9]*.*/.NNN/' >vmlinuxO2.before.nm
$ diff -u vmlinuxO2.before.nm vmlinux.nm | grep -v '^[ @]' >vmlinux.nm.dif

I see the following:

- spin_[un]lock_foo's are gone as expected.
- some other functions got spuriously deinlined
  (such as __raw_spin_unlock).
- yet other functions which were spuriously deinlined before,
  now aren't deinlined (such as nf_conntrack_put).

--- vmlinuxO2.before.nm	2015-05-13 15:46:37.147058665 +0200
+++ vmlinux.nm	2015-05-13 15:46:26.079086233 +0200
+0000000000000009 t ipc_unlock_object
+0000000000000009 t __raw_spin_unlock
+0000000000000009 t __raw_spin_unlock
-0000000000000009 t spin_unlock
-000000000000000b t spin_lock
-000000000000000b t spin_lock
-000000000000000b t spin_unlock_irqrestore
+000000000000000d t task_unlock
+0000000000000011 t arch_spin_is_locked
+0000000000000013 t double_unlock_hb
-000000000000001c t nf_conntrack_put
-000000000000001d t ctnetlink_done_list
+000000000000001e t unix_state_double_unlock
+0000000000000025 t check_and_drop
-0000000000000025 t hugetlbfs_inc_free_inodes.NNN
-0000000000000025 t sem_lock.NNN
-0000000000000027 t check_and_drop

- many other functions now have slightly different sizes.
  For example:
  ext4_mb_pa_callback grew from 0x28 to 0x2a bytes (+2 bytes)
  hid_alloc_report_buf shrank from 0x2a to 0x28 bytes (-2 bytes)

-0000000000000028 t ext4_mb_pa_callback
+0000000000000028 T hid_alloc_report_buf
+0000000000000028 t nf_conntrack_double_unlock
+0000000000000029 t do_shm_rmid
+000000000000002a t ext4_mb_pa_callback
-000000000000002a T hid_alloc_report_buf

Let's take a look at ext4_mb_pa_callback.
The difference stems from gcc choosing a different scratch register:

<ext4_mb_pa_callback>:                                          <ext4_mb_pa_callback>:
8b 47 14                mov    0x14(%rdi),%eax                  8b 47 14                mov    0x14(%rdi),%eax
48 8d 77 e0             lea    -0x20(%rdi),%rsi                 48 8d 77 e0             lea    -0x20(%rdi),%rsi
85 c0                   test   %eax,%eax                        85 c0                   test   %eax,%eax
75 1b                   jne    <ext4_mb_pa_callback+0x26>       75 19                   jne    <ext4_mb_pa_callback+0x24>
44 8b 57 18             mov    0x18(%rdi),%r10d                 8b 7f 18                mov    0x18(%rdi),%edi
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
45 85 d2                test   %r10d,%r10d                      85 ff                   test   %edi,%edi
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
74 14                   je     <ext4_mb_pa_callback+0x28>       74 14                   je     <ext4_mb_pa_callback+0x26>
55                      push   %rbp                             55                      push   %rbp
48 8b 3d e4 b8 eb 00    mov    0xebb8e4(%rip),%rdi              48 8b 3d 96 c0 eb 00    mov    0xebc096(%rip),%rdi
48 89 e5                mov    %rsp,%rbp                        48 89 e5                mov    %rsp,%rbp
e8 8c 3f f4 ff          callq  <kmem_cache_free>                e8 fe 46 f4 ff          callq  <kmem_cache_free>
5d                      pop    %rbp                             5d                      pop    %rbp
c3                      retq                                    c3                      retq
0f 0b                   ud2                                     0f 0b                   ud2
0f 0b                   ud2                                     0f 0b                   ud2

Working with r10 requires REX prefix in two underlined instructions,
which caused function to grow by 2 bytes.

This is not in any way related to my patch.

I think this confirms my hypothesis that the difference of +70 bytes is noise.

The remainder of the diff is below:

-000000000000002b t do_shm_rmid
+000000000000002b T jbd2_journal_ack_err
-000000000000002c t ctnetlink_done
+000000000000002d t ctnetlink_done_list
-000000000000002d T jbd2_journal_ack_err
-000000000000002e t amd_iommu_stats_add
-000000000000002e t hid_parser_reserved
+0000000000000030 t amd_iommu_stats_add
+0000000000000030 t hid_parser_reserved
-0000000000000034 T acpi_ec_unblock_transactions
+0000000000000034 T usb_hub_to_struct_hub
-0000000000000036 t acpi_ec_stopped
+0000000000000036 T acpi_ec_unblock_transactions
-0000000000000037 T completion_done
-000000000000003c T usb_hub_to_struct_hub
+000000000000003e T completion_done
-000000000000003e t ehci_clear_tt_buffer.NNN
+000000000000003f t acpi_ec_gpe_handler
+000000000000003f t ehci_clear_tt_buffer.NNN
-0000000000000041 t acpi_ec_gpe_handler
+0000000000000042 T usb_wakeup_notification
-0000000000000044 T acpi_boot_ec_enable
-0000000000000044 T usb_wakeup_notification
+0000000000000045 T acpi_boot_ec_enable
-0000000000000046 t hugetlbfs_destroy_inode
+0000000000000047 t ctnetlink_done
+0000000000000049 T efivar_entry_iter
+000000000000004a t turn_on_io_watchdog
-000000000000004b T efivar_entry_iter
-000000000000004c t init_fat_fs
-000000000000004c t turn_on_io_watchdog
+000000000000004e t init_fat_fs
+0000000000000054 T shm_destroy_orphaned
-0000000000000055 t sem_wait_array
+0000000000000056 t dm_dirty_log_init
-0000000000000056 T shm_destroy_orphaned
-0000000000000057 t dm_dirty_log_init
+0000000000000058 t fat_evict_inode
+0000000000000058 t mirror_available
-0000000000000059 t mirror_available
-000000000000005a t fat_evict_inode
+000000000000005b t sem_wait_array
-000000000000005c T blk_pre_runtime_suspend
+000000000000005c t free_dev_data
-000000000000005d t free_dev_data
+000000000000005f t hugetlbfs_destroy_inode
+0000000000000064 T blk_pre_runtime_suspend
+0000000000000071 t metadata_show
+0000000000000071 t update_changeattr
-0000000000000072 t update_changeattr
-0000000000000073 t hugetlbfs_alloc_inode
-0000000000000076 t fat_calc_dir_size
+0000000000000079 t fat_calc_dir_size
-0000000000000079 t metadata_show
+000000000000007c t rtc_dev_open
+000000000000007d t hugetlbfs_alloc_inode
-000000000000007e t ptrace_unfreeze_traced.NNN
-000000000000007f t rtc_dev_open
+0000000000000080 t ptrace_unfreeze_traced.NNN
-0000000000000083 T vfs_whiteout
+0000000000000086 T vfs_whiteout
-0000000000000087 t read_disk_sb.NNN
+0000000000000089 T assert_forcewakes_inactive
+000000000000008a t read_disk_sb.NNN
-000000000000008b T assert_forcewakes_inactive
-000000000000008e t cache_ioctl.NNN
+0000000000000093 T usbhid_close
-0000000000000094 t nf_nat_init
+0000000000000095 t nf_nat_init
+0000000000000096 t cache_ioctl.NNN
+0000000000000096 t __lock_sock
-0000000000000096 T usbhid_close
-0000000000000097 t __lock_sock
+0000000000000098 T jbd2_trans_will_send_data_barrier
-000000000000009a T jbd2_trans_will_send_data_barrier
+000000000000009b T drm_crtc_vblank_reset
-000000000000009d T drm_crtc_vblank_reset
+00000000000000a6 T md_setup_cluster
+00000000000000aa t hugetlbfs_statfs
+00000000000000ab t uhci_hcd_init
-00000000000000ac t hugetlbfs_statfs
-00000000000000ac t uhci_hcd_init
+00000000000000b1 t gro_cell_poll
-00000000000000b2 t gro_cell_poll
+00000000000000b2 t nf_conntrack_double_lock
-00000000000000b4 T md_setup_cluster
-00000000000000b4 t pps_init
+00000000000000b5 t pps_init
-00000000000000bb T svc_authenticate
+00000000000000bd T svc_authenticate
-00000000000000be t ehci_poll_ASS
+00000000000000c1 t ehci_poll_PSS
+00000000000000c1 t loop_queue_write_work
+00000000000000c5 T usb_disable_lpm
+00000000000000c6 t ehci_poll_ASS
-00000000000000c7 t nf_conntrack_double_lock
-00000000000000c7 T usb_disable_lpm
-00000000000000c9 t ehci_poll_PSS
+00000000000000cb t xfrm_del_sa
+00000000000000cd t snd_timer_user_ccallback
-00000000000000d0 t xfrm_del_sa
+00000000000000d2 T d_prune_aliases
-00000000000000d4 T usb_sg_cancel
-00000000000000d5 t pci_pm_freeze_noirq
-00000000000000d5 t snd_timer_user_ccallback
+00000000000000d5 T usb_sg_cancel
+00000000000000d7 t pci_pm_freeze_noirq
-00000000000000d9 T d_prune_aliases
-00000000000000db t loop_queue_write_work
-00000000000000e9 t dentry_lru_isolate
+00000000000000ed t dentry_lru_isolate
+00000000000000f6 t max_sync_store
+00000000000000fa t slab_sysfs_init
+00000000000000fc T fat_attach
-00000000000000fc t slab_sysfs_init
-00000000000000fe T fat_attach
-00000000000000fe t max_sync_store
-00000000000000ff t slab_out_of_memory
+0000000000000100 t slab_out_of_memory
+0000000000000100 t tg3_nway_reset
-0000000000000102 t ctnetlink_dump_exp_ct
+0000000000000103 t calgary_fixup_tce_spaces
+0000000000000103 t ext4_inode_csum_set
-0000000000000103 t tg3_nway_reset
-0000000000000104 t calgary_fixup_tce_spaces
-0000000000000105 t __unmap_single.NNN
-0000000000000106 t acpi_ec_stop
-0000000000000107 t pktsched_init
+0000000000000107 t __unmap_single.NNN
+0000000000000108 t pktsched_init
-000000000000010b t ext4_inode_csum_set
+000000000000010f T drm_vblank_on
+0000000000000113 t ctnetlink_dump_exp_ct
-0000000000000115 t mddev_put
+0000000000000117 t nl80211_set_mac_acl
-0000000000000119 T drm_vblank_on
+000000000000011a t mddev_put
+000000000000011b T wait_for_completion_interruptible_timeout
-000000000000011c T wait_for_completion_interruptible_timeout
+0000000000000123 t acpi_ec_stop
+0000000000000125 t ext4_mb_simple_scan_group
-0000000000000125 t nfs_do_filldir
+0000000000000125 T remove_proc_subtree
-0000000000000127 t nl80211_set_mac_acl
+000000000000012b t azx_irq_pending_work
+000000000000012d t nfs_do_filldir
-000000000000012e T remove_proc_subtree
+0000000000000131 t store_uframe_periodic_max
-0000000000000133 t store_uframe_periodic_max
-0000000000000137 t ext4_mb_simple_scan_group
-0000000000000139 T jbd2_journal_put_journal_head
-000000000000013b t azx_irq_pending_work
+000000000000013b T jbd2_journal_put_journal_head
+000000000000013d t xfrm_add_policy
+0000000000000142 t serial8250_backup_timeout
-0000000000000145 t xfrm_add_policy
+000000000000014a T hid_dump_device
-000000000000014c t alloc_buddy_huge_page
+0000000000000151 t finish_urb
+0000000000000151 t submit_flushes
-0000000000000152 T hid_dump_device
-0000000000000152 t serial8250_backup_timeout
+0000000000000154 t alloc_buddy_huge_page
-0000000000000159 t submit_flushes
+000000000000015a T vfs_mknod
+000000000000015c T remove_proc_entry
-000000000000015e T remove_proc_entry
-0000000000000161 t finish_urb
+0000000000000162 t nv_remove
-0000000000000162 T vfs_mknod
+0000000000000165 T nf_ct_delete
-0000000000000166 t nv_nic_irq_rx
+0000000000000167 t nv_nic_irq_rx
-000000000000016a t nv_remove
-0000000000000174 T iommu_tbl_pool_init
-0000000000000175 t queue_process
+000000000000017c T iommu_tbl_pool_init
+0000000000000180 t queue_process
-0000000000000183 t ctnetlink_del_conntrack
-0000000000000185 T nf_ct_delete
-000000000000019f t ctnetlink_new_conntrack
-00000000000001aa t azx_attach_pcm_stream
-00000000000001aa t dump_header.NNN
+00000000000001ab t azx_attach_pcm_stream
+00000000000001ab t dump_header.NNN
+00000000000001ae t acpi_ec_add
-00000000000001b0 t acpi_ec_add
+00000000000001b2 t ctnetlink_del_conntrack
-00000000000001b8 t nv_close
+00000000000001c9 t ctnetlink_new_conntrack
-00000000000001d1 t ctnetlink_create_expect.NNN
+00000000000001dc t nv_close
+00000000000001e1 t ctnetlink_create_expect.NNN
-00000000000001e3 t ctnetlink_get_conntrack
-00000000000001f0 t ctnetlink_dump_table
+00000000000001f0 t hid_debug_rdesc_show
-00000000000001f8 t hid_debug_rdesc_show
-00000000000001fa T ata_task_ioctl
-00000000000001fa T dm_kcopyd_copy
+00000000000001fb t ctnetlink_dump_table
+00000000000001fc T dm_kcopyd_copy
+0000000000000202 T ata_task_ioctl
-0000000000000207 T usb_reset_device
-000000000000020c T inet_bind
+000000000000020f T usb_reset_device
+0000000000000212 t ctnetlink_get_conntrack
-000000000000021a t nl80211_send_mlme_event.NNN
+000000000000021c T inet_bind
+000000000000021c t nl80211_send_mlme_event.NNN
+0000000000000226 t nv_nic_irq_other
-000000000000022e T destroy_workqueue
-000000000000022e t nv_nic_irq_other
+0000000000000230 T destroy_workqueue
-0000000000000233 T netpoll_send_skb_on_dev
+0000000000000242 t snd_timer_user_read
-0000000000000245 t snd_timer_user_read
-000000000000024a t univ8250_setup_irq
+000000000000024b T netpoll_send_skb_on_dev
-000000000000024c t ctnetlink_dump_list
+000000000000024f t ctnetlink_dump_list
+0000000000000252 t univ8250_setup_irq
+0000000000000253 t unix_dgram_connect
-000000000000026f t ieee80211_tx_h_select_key
-0000000000000275 t remove_and_add_spares
+0000000000000277 t ieee80211_tx_h_select_key
+0000000000000277 t remove_and_add_spares
+0000000000000278 t slot_store
-0000000000000280 t slot_store
-0000000000000286 t d_walk
-0000000000000288 T sys_shmctl
-0000000000000288 T SyS_shmctl
-0000000000000291 t unix_dgram_connect
+00000000000002a3 t prepend_path
+00000000000002a4 t d_walk
+00000000000002a4 T sys_shmctl
+00000000000002a4 T SyS_shmctl
+00000000000002b6 t ext4_mb_find_by_goal
+00000000000002b6 T nf_conntrack_hash_check_insert
-00000000000002b7 t prepend_path
+00000000000002bf t nl80211_set_reg
-00000000000002c6 t ext4_mb_find_by_goal
+00000000000002c7 T __nf_conntrack_confirm
-00000000000002cf t nl80211_set_reg
-00000000000002f6 T nf_conntrack_hash_check_insert
-00000000000002ff T __nf_conntrack_confirm
-0000000000000308 T detect_calgary
+0000000000000309 T detect_calgary
-0000000000000326 t hiddev_read
-0000000000000328 t ext4_mb_generate_buddy
+0000000000000328 t hiddev_read
+0000000000000330 t ext4_mb_generate_buddy
-0000000000000345 t alps_process_touchpad_packet_v3_v5
-0000000000000346 t bsg_map_hdr.NNN
+0000000000000347 t bsg_map_hdr.NNN
+000000000000034c t nfs_end_delegation_return
+000000000000034d t alps_process_touchpad_packet_v3_v5
-000000000000035d t nfs_end_delegation_return
-0000000000000372 T oom_kill_process
+0000000000000373 T oom_kill_process
+0000000000000379 t acpi_ec_transaction
-0000000000000394 t acpi_ec_transaction
+00000000000003a3 t chv_read16
-00000000000003ab t chv_read16
-00000000000003d9 t hiddev_ioctl_usage.NNN
+00000000000003e9 t do_timerfd_settime
+00000000000003e9 t hiddev_ioctl_usage.NNN
-00000000000003eb t do_timerfd_settime
-0000000000000405 T ohci_hub_status_data
-0000000000000406 t chv_write16
-0000000000000406 t super_90_load
+0000000000000407 t super_90_load
-000000000000040b t loop_clr_fd
+000000000000040e t chv_write16
+0000000000000415 T ohci_hub_status_data
+000000000000041b t loop_clr_fd
+0000000000000425 t worker_thread
-0000000000000428 t worker_thread
+0000000000000444 T azx_init_chip
-000000000000044c T azx_init_chip
-0000000000000453 T ext4_discard_preallocations
-0000000000000475 T do_shmat
+000000000000047a T do_shmat
+000000000000047b T ext4_discard_preallocations
-0000000000000488 t ext4_direct_IO
+0000000000000498 t ext4_direct_IO
+00000000000004f9 t ext4_mb_normalize_request
-0000000000000509 t ext4_mb_normalize_request
-000000000000050b t __dev_queue_xmit
+0000000000000511 t __dev_queue_xmit
-00000000000005d6 T serial8250_do_startup
+00000000000005e6 T serial8250_do_startup
-000000000000061c t nv_update_linkspeed
+000000000000061b t nv_napi_poll
-0000000000000625 t nv_napi_poll
+000000000000062c t nv_update_linkspeed
-00000000000007e8 t nv_start_xmit_optimized
+00000000000007ea t nv_start_xmit_optimized
-0000000000000893 t futex_requeue
+0000000000000891 t futex_requeue
+0000000000000898 t nl80211_parse_sched_scan.NNN
-00000000000008a0 t nl80211_parse_sched_scan.NNN
-000000000000094c T sock_setsockopt
+0000000000000950 T sock_setsockopt
-00000000000009ee T do_futex
+00000000000009fe T do_futex
-0000000000000a66 t nv_self_test
+0000000000000aaa t nv_self_test
-0000000000000af3 T sata_pmp_error_handler
+0000000000000afb T sata_pmp_error_handler
-0000000000000b5d t ohci_urb_enqueue
+0000000000000b65 t ohci_urb_enqueue
-0000000000000b72 t display_crc_ctl_write
+0000000000000b79 t display_crc_ctl_write
-0000000000000ca7 t SYSC_semtimedop
+0000000000000c9b t SYSC_semtimedop
+0000000000000e31 T md_do_sync
-0000000000000e3f T md_do_sync
+0000000000000fd3 t ehci_urb_enqueue
-0000000000000ffb t ehci_urb_enqueue
-0000000000001027 t packet_sendmsg
+0000000000001029 t packet_sendmsg
-0000000000001e17 t do_blockdev_direct_IO
+0000000000001e07 t do_blockdev_direct_IO
-000000000000261e t nl80211_send_wiphy
+000000000000262e t nl80211_send_wiphy

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] force inlining of spinlock ops
  2015-05-13 14:09               ` Denys Vlasenko
@ 2015-05-15  7:20                 ` Heiko Carstens
  0 siblings, 0 replies; 16+ messages in thread
From: Heiko Carstens @ 2015-05-15  7:20 UTC (permalink / raw)
  To: Denys Vlasenko
  Cc: Ingo Molnar, Linus Torvalds, Thomas Graf, David S. Miller,
	Bart Van Assche, Peter Zijlstra, David Rientjes, Andrew Morton,
	Oleg Nesterov, Paul E. McKenney, linux-kernel

On Wed, May 13, 2015 at 04:09:18PM +0200, Denys Vlasenko wrote:
> On 05/13/2015 12:43 PM, Ingo Molnar wrote:
> > We only know that the net effect is +70 bytes. Does that come out of:
> > 
> >  - large fluctuations such as -1000-1000+1000+1070, which happens to 
> >    net out into a small net number?
> > 
> >  - or does it come from much smaller fluctuations?
> > 
> > So to make an informed decision we need to know those details.
> 
> Fair enough. Let's investigate.
> 
> I produced a list of functions with their sizes from each vmlinux,
> and diffed them:
> 
> $ nm --size-sort vmlinux | sed 's/\.[0-9]*.*/.NNN/' >vmlinux.nm
> $ nm --size-sort vmlinuxO2.before | sed 's/\.[0-9]*.*/.NNN/' >vmlinuxO2.before.nm
> $ diff -u vmlinuxO2.before.nm vmlinux.nm | grep -v '^[ @]' >vmlinux.nm.dif

FWIW, scripts/bloat-o-meter is a nice tool to examine the size differences
of two vmlinux images.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH] force inlining of spinlock ops
@ 2015-07-13 18:31 Denys Vlasenko
  0 siblings, 0 replies; 16+ messages in thread
From: Denys Vlasenko @ 2015-07-13 18:31 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Denys Vlasenko, Thomas Graf, Bart Van Assche, Peter Zijlstra,
	David Rientjes, Andrew Morton, Paul E. McKenney, linux-kernel

With both gcc 4.7.2 and 4.9.2, sometimes gcc mysteriously doesn't inline
very small functions we expect to be inlined. See
    https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122

In particular,
with this config: http://busybox.net/~vda/kernel_config
there are more than a thousand copies of tiny spinlock-related functions:

$ nm --size-sort vmlinux | grep -iF ' t ' | uniq -c | grep -v '^ *1 ' | sort -rn | grep ' spin'
    473 000000000000000b t spin_unlock_irqrestore
    292 000000000000000b t spin_unlock
    215 000000000000000b t spin_lock
    134 000000000000000b t spin_unlock_irq
    130 000000000000000b t spin_unlock_bh
    120 000000000000000b t spin_lock_irq
    106 000000000000000b t spin_lock_bh

Disassembly:

ffffffff81004720 <spin_lock>:
ffffffff81004720:       55                      push   %rbp
ffffffff81004721:       48 89 e5                mov    %rsp,%rbp
ffffffff81004724:       e8 f8 4e e2 02          callq  <_raw_spin_lock>
ffffffff81004729:       5d                      pop    %rbp
ffffffff8100472a:       c3                      retq

This patch fixes this via s/inline/__always_inline/ in spinlock.h.
This decreases vmlinux by about 40k:

    text     data      bss       dec     hex filename
82375570 22255544 20627456 125258570 7774b4a vmlinux.before
82335059 22255416 20627456 125217931 776ac8b vmlinux

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: Bart Van Assche <bvanassche@acm.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: linux-kernel@vger.kernel.org
---
 include/linux/spinlock.h | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 3e18379..073925d 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -296,7 +296,7 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
  * Map the spin_lock functions to the raw variants for PREEMPT_RT=n
  */
 
-static inline raw_spinlock_t *spinlock_check(spinlock_t *lock)
+static __always_inline raw_spinlock_t *spinlock_check(spinlock_t *lock)
 {
 	return &lock->rlock;
 }
@@ -307,17 +307,17 @@ do {							\
 	raw_spin_lock_init(&(_lock)->rlock);		\
 } while (0)
 
-static inline void spin_lock(spinlock_t *lock)
+static __always_inline void spin_lock(spinlock_t *lock)
 {
 	raw_spin_lock(&lock->rlock);
 }
 
-static inline void spin_lock_bh(spinlock_t *lock)
+static __always_inline void spin_lock_bh(spinlock_t *lock)
 {
 	raw_spin_lock_bh(&lock->rlock);
 }
 
-static inline int spin_trylock(spinlock_t *lock)
+static __always_inline int spin_trylock(spinlock_t *lock)
 {
 	return raw_spin_trylock(&lock->rlock);
 }
@@ -337,7 +337,7 @@ do {									\
 	raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock);	\
 } while (0)
 
-static inline void spin_lock_irq(spinlock_t *lock)
+static __always_inline void spin_lock_irq(spinlock_t *lock)
 {
 	raw_spin_lock_irq(&lock->rlock);
 }
@@ -352,32 +352,32 @@ do {									\
 	raw_spin_lock_irqsave_nested(spinlock_check(lock), flags, subclass); \
 } while (0)
 
-static inline void spin_unlock(spinlock_t *lock)
+static __always_inline void spin_unlock(spinlock_t *lock)
 {
 	raw_spin_unlock(&lock->rlock);
 }
 
-static inline void spin_unlock_bh(spinlock_t *lock)
+static __always_inline void spin_unlock_bh(spinlock_t *lock)
 {
 	raw_spin_unlock_bh(&lock->rlock);
 }
 
-static inline void spin_unlock_irq(spinlock_t *lock)
+static __always_inline void spin_unlock_irq(spinlock_t *lock)
 {
 	raw_spin_unlock_irq(&lock->rlock);
 }
 
-static inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
+static __always_inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
 {
 	raw_spin_unlock_irqrestore(&lock->rlock, flags);
 }
 
-static inline int spin_trylock_bh(spinlock_t *lock)
+static __always_inline int spin_trylock_bh(spinlock_t *lock)
 {
 	return raw_spin_trylock_bh(&lock->rlock);
 }
 
-static inline int spin_trylock_irq(spinlock_t *lock)
+static __always_inline int spin_trylock_irq(spinlock_t *lock)
 {
 	return raw_spin_trylock_irq(&lock->rlock);
 }
@@ -387,22 +387,22 @@ static inline int spin_trylock_irq(spinlock_t *lock)
 	raw_spin_trylock_irqsave(spinlock_check(lock), flags); \
 })
 
-static inline void spin_unlock_wait(spinlock_t *lock)
+static __always_inline void spin_unlock_wait(spinlock_t *lock)
 {
 	raw_spin_unlock_wait(&lock->rlock);
 }
 
-static inline int spin_is_locked(spinlock_t *lock)
+static __always_inline int spin_is_locked(spinlock_t *lock)
 {
 	return raw_spin_is_locked(&lock->rlock);
 }
 
-static inline int spin_is_contended(spinlock_t *lock)
+static __always_inline int spin_is_contended(spinlock_t *lock)
 {
 	return raw_spin_is_contended(&lock->rlock);
 }
 
-static inline int spin_can_lock(spinlock_t *lock)
+static __always_inline int spin_can_lock(spinlock_t *lock)
 {
 	return raw_spin_can_lock(&lock->rlock);
 }
-- 
1.8.1.4


^ permalink raw reply related	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2015-07-13 18:31 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-05-11 17:57 [PATCH] force inlining of spinlock ops Denys Vlasenko
2015-05-11 18:53 ` Josh Triplett
2015-05-11 22:19 ` Andrew Morton
2015-05-12  8:16   ` Hagen Paul Pfeifer
2015-05-12  9:44   ` Denys Vlasenko
2015-05-12  9:48     ` Ingo Molnar
2015-05-12  7:44 ` Ingo Molnar
2015-05-12 11:02   ` Denys Vlasenko
2015-05-12 11:43     ` Ingo Molnar
2015-05-12 13:13       ` Denys Vlasenko
2015-05-13 10:17         ` Ingo Molnar
2015-05-13 10:28           ` Denys Vlasenko
2015-05-13 10:43             ` Ingo Molnar
2015-05-13 14:09               ` Denys Vlasenko
2015-05-15  7:20                 ` Heiko Carstens
2015-07-13 18:31 Denys Vlasenko

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.