All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] x86: force inlining of test_and_set_bit and friends
@ 2016-02-07 21:51 Denys Vlasenko
  2016-02-09 12:19 ` [tip:x86/asm] x86/asm/bitops: Force " tip-bot for Denys Vlasenko
  0 siblings, 1 reply; 2+ messages in thread
From: Denys Vlasenko @ 2016-02-07 21:51 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Denys Vlasenko, Thomas Graf, Peter Zijlstra, David Rientjes,
	Andrew Morton, H. Peter Anvin, Andi Kleen, x86, linux-kernel

Sometimes gcc mysteriously doesn't inline
very small functions we expect to be inlined. See
    https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122
Arguably, gcc should do better, but gcc people aren't willing
to invest time into it, asking to use __always_inline instead.

With this .config:
http://busybox.net/~vda/kernel_config_OPTIMIZE_INLINING_and_Os,
the following functions get deinlined many times.
Examples of disassembly:

test_and_set_bit (166 copies, ~1260 calls)
       55                      push   %rbp
       48 89 e5                mov    %rsp,%rbp
       f0 48 0f ab 3e          lock bts %rdi,(%rsi)
       72 04                   jb     <test_and_set_bit+0xf>
       31 c0                   xor    %eax,%eax
       eb 05                   jmp    <test_and_set_bit+0x14>
       b8 01 00 00 00          mov    $0x1,%eax
       5d                      pop    %rbp
       c3                      retq

test_and_clear_bit (124 copies, ~1000 calls)
       55                      push   %rbp
       48 89 e5                mov    %rsp,%rbp
       f0 48 0f b3 3e          lock btr %rdi,(%rsi)
       72 04                   jb     <test_and_clear_bit+0xf>
       31 c0                   xor    %eax,%eax
       eb 05                   jmp    <test_and_clear_bit+0x14>
       b8 01 00 00 00          mov    $0x1,%eax
       5d                      pop    %rbp
       c3                      retq

change_bit (3 copies, 8 calls)
       55                      push   %rbp
       48 89 e5                mov    %rsp,%rbp
       f0 48 0f bb 3e          lock btc %rdi,(%rsi)
       5d                      pop    %rbp
       c3                      retq

clear_bit_unlock (2 copies, 11 calls)
       55                      push   %rbp
       48 89 e5                mov    %rsp,%rbp
       f0 48 0f b3 3e          lock btr %rdi,(%rsi)
       5d                      pop    %rbp
       c3                      retq

This patch fixes this via s/inline/__always_inline/.

Code size decrease after the patch is ~13.5k:

    text     data      bss       dec     hex filename
92110727 20826144 36417536 149354407 8e6f7a7 vmlinux
92097234 20826176 36417536 149340946 8e6c312 vmlinux9_bitops_after

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: x86@kernel.org
Cc: linux-kernel@vger.kernel.org
---
 arch/x86/include/asm/bitops.h | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index cfe3b95..7766d1c 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -91,7 +91,7 @@ set_bit(long nr, volatile unsigned long *addr)
  * If it's called on the same region of memory simultaneously, the effect
  * may be that only one operation succeeds.
  */
-static inline void __set_bit(long nr, volatile unsigned long *addr)
+static __always_inline void __set_bit(long nr, volatile unsigned long *addr)
 {
 	asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory");
 }
@@ -128,13 +128,13 @@ clear_bit(long nr, volatile unsigned long *addr)
  * clear_bit() is atomic and implies release semantics before the memory
  * operation. It can be used for an unlock.
  */
-static inline void clear_bit_unlock(long nr, volatile unsigned long *addr)
+static __always_inline void clear_bit_unlock(long nr, volatile unsigned long *addr)
 {
 	barrier();
 	clear_bit(nr, addr);
 }
 
-static inline void __clear_bit(long nr, volatile unsigned long *addr)
+static __always_inline void __clear_bit(long nr, volatile unsigned long *addr)
 {
 	asm volatile("btr %1,%0" : ADDR : "Ir" (nr));
 }
@@ -151,7 +151,7 @@ static inline void __clear_bit(long nr, volatile unsigned long *addr)
  * No memory barrier is required here, because x86 cannot reorder stores past
  * older loads. Same principle as spin_unlock.
  */
-static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr)
+static __always_inline void __clear_bit_unlock(long nr, volatile unsigned long *addr)
 {
 	barrier();
 	__clear_bit(nr, addr);
@@ -166,7 +166,7 @@ static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr)
  * If it's called on the same region of memory simultaneously, the effect
  * may be that only one operation succeeds.
  */
-static inline void __change_bit(long nr, volatile unsigned long *addr)
+static __always_inline void __change_bit(long nr, volatile unsigned long *addr)
 {
 	asm volatile("btc %1,%0" : ADDR : "Ir" (nr));
 }
@@ -180,7 +180,7 @@ static inline void __change_bit(long nr, volatile unsigned long *addr)
  * Note that @nr may be almost arbitrarily large; this function is not
  * restricted to acting on a single-word quantity.
  */
-static inline void change_bit(long nr, volatile unsigned long *addr)
+static __always_inline void change_bit(long nr, volatile unsigned long *addr)
 {
 	if (IS_IMMEDIATE(nr)) {
 		asm volatile(LOCK_PREFIX "xorb %1,%0"
@@ -201,7 +201,7 @@ static inline void change_bit(long nr, volatile unsigned long *addr)
  * This operation is atomic and cannot be reordered.
  * It also implies a memory barrier.
  */
-static inline int test_and_set_bit(long nr, volatile unsigned long *addr)
+static __always_inline int test_and_set_bit(long nr, volatile unsigned long *addr)
 {
 	GEN_BINARY_RMWcc(LOCK_PREFIX "bts", *addr, "Ir", nr, "%0", "c");
 }
@@ -228,7 +228,7 @@ test_and_set_bit_lock(long nr, volatile unsigned long *addr)
  * If two examples of this operation race, one can appear to succeed
  * but actually fail.  You must protect multiple accesses with a lock.
  */
-static inline int __test_and_set_bit(long nr, volatile unsigned long *addr)
+static __always_inline int __test_and_set_bit(long nr, volatile unsigned long *addr)
 {
 	int oldbit;
 
@@ -247,7 +247,7 @@ static inline int __test_and_set_bit(long nr, volatile unsigned long *addr)
  * This operation is atomic and cannot be reordered.
  * It also implies a memory barrier.
  */
-static inline int test_and_clear_bit(long nr, volatile unsigned long *addr)
+static __always_inline int test_and_clear_bit(long nr, volatile unsigned long *addr)
 {
 	GEN_BINARY_RMWcc(LOCK_PREFIX "btr", *addr, "Ir", nr, "%0", "c");
 }
@@ -268,7 +268,7 @@ static inline int test_and_clear_bit(long nr, volatile unsigned long *addr)
  * accessed from a hypervisor on the same CPU if running in a VM: don't change
  * this without also updating arch/x86/kernel/kvm.c
  */
-static inline int __test_and_clear_bit(long nr, volatile unsigned long *addr)
+static __always_inline int __test_and_clear_bit(long nr, volatile unsigned long *addr)
 {
 	int oldbit;
 
@@ -280,7 +280,7 @@ static inline int __test_and_clear_bit(long nr, volatile unsigned long *addr)
 }
 
 /* WARNING: non atomic and it can be reordered! */
-static inline int __test_and_change_bit(long nr, volatile unsigned long *addr)
+static __always_inline int __test_and_change_bit(long nr, volatile unsigned long *addr)
 {
 	int oldbit;
 
@@ -300,7 +300,7 @@ static inline int __test_and_change_bit(long nr, volatile unsigned long *addr)
  * This operation is atomic and cannot be reordered.
  * It also implies a memory barrier.
  */
-static inline int test_and_change_bit(long nr, volatile unsigned long *addr)
+static __always_inline int test_and_change_bit(long nr, volatile unsigned long *addr)
 {
 	GEN_BINARY_RMWcc(LOCK_PREFIX "btc", *addr, "Ir", nr, "%0", "c");
 }
@@ -311,7 +311,7 @@ static __always_inline int constant_test_bit(long nr, const volatile unsigned lo
 		(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
 }
 
-static inline int variable_test_bit(long nr, volatile const unsigned long *addr)
+static __always_inline int variable_test_bit(long nr, volatile const unsigned long *addr)
 {
 	int oldbit;
 
@@ -343,7 +343,7 @@ static int test_bit(int nr, const volatile unsigned long *addr);
  *
  * Undefined if no bit exists, so code should check against 0 first.
  */
-static inline unsigned long __ffs(unsigned long word)
+static __always_inline unsigned long __ffs(unsigned long word)
 {
 	asm("rep; bsf %1,%0"
 		: "=r" (word)
@@ -357,7 +357,7 @@ static inline unsigned long __ffs(unsigned long word)
  *
  * Undefined if no zero exists, so code should check against ~0UL first.
  */
-static inline unsigned long ffz(unsigned long word)
+static __always_inline unsigned long ffz(unsigned long word)
 {
 	asm("rep; bsf %1,%0"
 		: "=r" (word)
@@ -371,7 +371,7 @@ static inline unsigned long ffz(unsigned long word)
  *
  * Undefined if no set bit exists, so code should check against 0 first.
  */
-static inline unsigned long __fls(unsigned long word)
+static __always_inline unsigned long __fls(unsigned long word)
 {
 	asm("bsr %1,%0"
 	    : "=r" (word)
@@ -393,7 +393,7 @@ static inline unsigned long __fls(unsigned long word)
  * set bit if value is nonzero. The first (least significant) bit
  * is at position 1.
  */
-static inline int ffs(int x)
+static __always_inline int ffs(int x)
 {
 	int r;
 
@@ -434,7 +434,7 @@ static inline int ffs(int x)
  * set bit if value is nonzero. The last (most significant) bit is
  * at position 32.
  */
-static inline int fls(int x)
+static __always_inline int fls(int x)
 {
 	int r;
 
-- 
1.8.1.4

^ permalink raw reply related	[flat|nested] 2+ messages in thread

* [tip:x86/asm] x86/asm/bitops: Force inlining of test_and_set_bit and friends
  2016-02-07 21:51 [PATCH] x86: force inlining of test_and_set_bit and friends Denys Vlasenko
@ 2016-02-09 12:19 ` tip-bot for Denys Vlasenko
  0 siblings, 0 replies; 2+ messages in thread
From: tip-bot for Denys Vlasenko @ 2016-02-09 12:19 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: rientjes, hpa, linux-kernel, akpm, bp, peterz, mingo, tglx,
	brgerst, dvlasenk, torvalds, luto, tgraf

Commit-ID:  8dd5032d9c540111dd673078738d137a998d6c3f
Gitweb:     http://git.kernel.org/tip/8dd5032d9c540111dd673078738d137a998d6c3f
Author:     Denys Vlasenko <dvlasenk@redhat.com>
AuthorDate: Sun, 7 Feb 2016 22:51:27 +0100
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Tue, 9 Feb 2016 10:31:54 +0100

x86/asm/bitops: Force inlining of test_and_set_bit and friends

Sometimes GCC mysteriously doesn't inline very small functions
we expect to be inlined, see:

  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122

Arguably, GCC should do better, but GCC people aren't willing
to invest time into it and are asking to use __always_inline
instead.

With this .config:

  http://busybox.net/~vda/kernel_config_OPTIMIZE_INLINING_and_Os

here's an example of functions getting deinlined many times:

  test_and_set_bit (166 copies, ~1260 calls)
         55                      push   %rbp
         48 89 e5                mov    %rsp,%rbp
         f0 48 0f ab 3e          lock bts %rdi,(%rsi)
         72 04                   jb     <test_and_set_bit+0xf>
         31 c0                   xor    %eax,%eax
         eb 05                   jmp    <test_and_set_bit+0x14>
         b8 01 00 00 00          mov    $0x1,%eax
         5d                      pop    %rbp
         c3                      retq

  test_and_clear_bit (124 copies, ~1000 calls)
         55                      push   %rbp
         48 89 e5                mov    %rsp,%rbp
         f0 48 0f b3 3e          lock btr %rdi,(%rsi)
         72 04                   jb     <test_and_clear_bit+0xf>
         31 c0                   xor    %eax,%eax
         eb 05                   jmp    <test_and_clear_bit+0x14>
         b8 01 00 00 00          mov    $0x1,%eax
         5d                      pop    %rbp
         c3                      retq

  change_bit (3 copies, 8 calls)
         55                      push   %rbp
         48 89 e5                mov    %rsp,%rbp
         f0 48 0f bb 3e          lock btc %rdi,(%rsi)
         5d                      pop    %rbp
         c3                      retq

  clear_bit_unlock (2 copies, 11 calls)
         55                      push   %rbp
         48 89 e5                mov    %rsp,%rbp
         f0 48 0f b3 3e          lock btr %rdi,(%rsi)
         5d                      pop    %rbp
         c3                      retq

This patch works it around via s/inline/__always_inline/.

Code size decrease by ~13.5k after the patch:

      text     data      bss       dec    filename
  92110727 20826144 36417536 149354407    vmlinux.before
  92097234 20826176 36417536 149340946    vmlinux.after

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: David Rientjes <rientjes@google.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Graf <tgraf@suug.ch>
Link: http://lkml.kernel.org/r/1454881887-1367-1-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/bitops.h | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index cfe3b95..7766d1c 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -91,7 +91,7 @@ set_bit(long nr, volatile unsigned long *addr)
  * If it's called on the same region of memory simultaneously, the effect
  * may be that only one operation succeeds.
  */
-static inline void __set_bit(long nr, volatile unsigned long *addr)
+static __always_inline void __set_bit(long nr, volatile unsigned long *addr)
 {
 	asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory");
 }
@@ -128,13 +128,13 @@ clear_bit(long nr, volatile unsigned long *addr)
  * clear_bit() is atomic and implies release semantics before the memory
  * operation. It can be used for an unlock.
  */
-static inline void clear_bit_unlock(long nr, volatile unsigned long *addr)
+static __always_inline void clear_bit_unlock(long nr, volatile unsigned long *addr)
 {
 	barrier();
 	clear_bit(nr, addr);
 }
 
-static inline void __clear_bit(long nr, volatile unsigned long *addr)
+static __always_inline void __clear_bit(long nr, volatile unsigned long *addr)
 {
 	asm volatile("btr %1,%0" : ADDR : "Ir" (nr));
 }
@@ -151,7 +151,7 @@ static inline void __clear_bit(long nr, volatile unsigned long *addr)
  * No memory barrier is required here, because x86 cannot reorder stores past
  * older loads. Same principle as spin_unlock.
  */
-static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr)
+static __always_inline void __clear_bit_unlock(long nr, volatile unsigned long *addr)
 {
 	barrier();
 	__clear_bit(nr, addr);
@@ -166,7 +166,7 @@ static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr)
  * If it's called on the same region of memory simultaneously, the effect
  * may be that only one operation succeeds.
  */
-static inline void __change_bit(long nr, volatile unsigned long *addr)
+static __always_inline void __change_bit(long nr, volatile unsigned long *addr)
 {
 	asm volatile("btc %1,%0" : ADDR : "Ir" (nr));
 }
@@ -180,7 +180,7 @@ static inline void __change_bit(long nr, volatile unsigned long *addr)
  * Note that @nr may be almost arbitrarily large; this function is not
  * restricted to acting on a single-word quantity.
  */
-static inline void change_bit(long nr, volatile unsigned long *addr)
+static __always_inline void change_bit(long nr, volatile unsigned long *addr)
 {
 	if (IS_IMMEDIATE(nr)) {
 		asm volatile(LOCK_PREFIX "xorb %1,%0"
@@ -201,7 +201,7 @@ static inline void change_bit(long nr, volatile unsigned long *addr)
  * This operation is atomic and cannot be reordered.
  * It also implies a memory barrier.
  */
-static inline int test_and_set_bit(long nr, volatile unsigned long *addr)
+static __always_inline int test_and_set_bit(long nr, volatile unsigned long *addr)
 {
 	GEN_BINARY_RMWcc(LOCK_PREFIX "bts", *addr, "Ir", nr, "%0", "c");
 }
@@ -228,7 +228,7 @@ test_and_set_bit_lock(long nr, volatile unsigned long *addr)
  * If two examples of this operation race, one can appear to succeed
  * but actually fail.  You must protect multiple accesses with a lock.
  */
-static inline int __test_and_set_bit(long nr, volatile unsigned long *addr)
+static __always_inline int __test_and_set_bit(long nr, volatile unsigned long *addr)
 {
 	int oldbit;
 
@@ -247,7 +247,7 @@ static inline int __test_and_set_bit(long nr, volatile unsigned long *addr)
  * This operation is atomic and cannot be reordered.
  * It also implies a memory barrier.
  */
-static inline int test_and_clear_bit(long nr, volatile unsigned long *addr)
+static __always_inline int test_and_clear_bit(long nr, volatile unsigned long *addr)
 {
 	GEN_BINARY_RMWcc(LOCK_PREFIX "btr", *addr, "Ir", nr, "%0", "c");
 }
@@ -268,7 +268,7 @@ static inline int test_and_clear_bit(long nr, volatile unsigned long *addr)
  * accessed from a hypervisor on the same CPU if running in a VM: don't change
  * this without also updating arch/x86/kernel/kvm.c
  */
-static inline int __test_and_clear_bit(long nr, volatile unsigned long *addr)
+static __always_inline int __test_and_clear_bit(long nr, volatile unsigned long *addr)
 {
 	int oldbit;
 
@@ -280,7 +280,7 @@ static inline int __test_and_clear_bit(long nr, volatile unsigned long *addr)
 }
 
 /* WARNING: non atomic and it can be reordered! */
-static inline int __test_and_change_bit(long nr, volatile unsigned long *addr)
+static __always_inline int __test_and_change_bit(long nr, volatile unsigned long *addr)
 {
 	int oldbit;
 
@@ -300,7 +300,7 @@ static inline int __test_and_change_bit(long nr, volatile unsigned long *addr)
  * This operation is atomic and cannot be reordered.
  * It also implies a memory barrier.
  */
-static inline int test_and_change_bit(long nr, volatile unsigned long *addr)
+static __always_inline int test_and_change_bit(long nr, volatile unsigned long *addr)
 {
 	GEN_BINARY_RMWcc(LOCK_PREFIX "btc", *addr, "Ir", nr, "%0", "c");
 }
@@ -311,7 +311,7 @@ static __always_inline int constant_test_bit(long nr, const volatile unsigned lo
 		(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
 }
 
-static inline int variable_test_bit(long nr, volatile const unsigned long *addr)
+static __always_inline int variable_test_bit(long nr, volatile const unsigned long *addr)
 {
 	int oldbit;
 
@@ -343,7 +343,7 @@ static int test_bit(int nr, const volatile unsigned long *addr);
  *
  * Undefined if no bit exists, so code should check against 0 first.
  */
-static inline unsigned long __ffs(unsigned long word)
+static __always_inline unsigned long __ffs(unsigned long word)
 {
 	asm("rep; bsf %1,%0"
 		: "=r" (word)
@@ -357,7 +357,7 @@ static inline unsigned long __ffs(unsigned long word)
  *
  * Undefined if no zero exists, so code should check against ~0UL first.
  */
-static inline unsigned long ffz(unsigned long word)
+static __always_inline unsigned long ffz(unsigned long word)
 {
 	asm("rep; bsf %1,%0"
 		: "=r" (word)
@@ -371,7 +371,7 @@ static inline unsigned long ffz(unsigned long word)
  *
  * Undefined if no set bit exists, so code should check against 0 first.
  */
-static inline unsigned long __fls(unsigned long word)
+static __always_inline unsigned long __fls(unsigned long word)
 {
 	asm("bsr %1,%0"
 	    : "=r" (word)
@@ -393,7 +393,7 @@ static inline unsigned long __fls(unsigned long word)
  * set bit if value is nonzero. The first (least significant) bit
  * is at position 1.
  */
-static inline int ffs(int x)
+static __always_inline int ffs(int x)
 {
 	int r;
 
@@ -434,7 +434,7 @@ static inline int ffs(int x)
  * set bit if value is nonzero. The last (most significant) bit is
  * at position 32.
  */
-static inline int fls(int x)
+static __always_inline int fls(int x)
 {
 	int r;
 

^ permalink raw reply related	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2016-02-09 12:20 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-02-07 21:51 [PATCH] x86: force inlining of test_and_set_bit and friends Denys Vlasenko
2016-02-09 12:19 ` [tip:x86/asm] x86/asm/bitops: Force " tip-bot for Denys Vlasenko

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.