From: Uros Bizjak <ubizjak@gmail.com>
To: x86@kernel.org, linux-kernel@vger.kernel.org
Cc: Uros Bizjak <ubizjak@gmail.com>,
Andy Lutomirski <luto@kernel.org>, Ingo Molnar <mingo@kernel.org>,
Nadav Amit <namit@vmware.com>, Brian Gerst <brgerst@gmail.com>,
Denys Vlasenko <dvlasenk@redhat.com>,
"H . Peter Anvin" <hpa@zytor.com>,
Linus Torvalds <torvalds@linux-foundation.org>,
Peter Zijlstra <peterz@infradead.org>,
Thomas Gleixner <tglx@linutronix.de>,
Borislav Petkov <bp@alien8.de>,
Josh Poimboeuf <jpoimboe@redhat.com>
Subject: [PATCH v2 4/4] x86/percpu: Use C for percpu read/write accessors
Date: Wed, 4 Oct 2023 21:23:08 +0200 [thread overview]
Message-ID: <20231004192404.31733-1-ubizjak@gmail.com> (raw)
In-Reply-To: <ZR2U4DLycLT5xFH6@gmail.com>
The percpu code mostly uses inline assembly. Using segment qualifiers
allows to use C code instead, which enables the compiler to perform
various optimizations (e.g. propagation of memory arguments). Convert
percpu read and write accessors to C code, so the memory argument can
be propagated to the instruction that uses this argument.
Some examples of propagations:
a) into sign/zero extensions:
the code improves from:
65 8a 05 00 00 00 00 mov %gs:0x0(%rip),%al
0f b6 c0 movzbl %al,%eax
to:
65 0f b6 05 00 00 00 movzbl %gs:0x0(%rip),%eax
00
and in a similar way for:
movzbl %gs:0x0(%rip),%edx
movzwl %gs:0x0(%rip),%esi
movzbl %gs:0x78(%rbx),%eax
movslq %gs:0x0(%rip),%rdx
movslq %gs:(%rdi),%rbx
b) into compares:
the code improves from:
65 8b 05 00 00 00 00 mov %gs:0x0(%rip),%eax
a9 00 00 0f 00 test $0xf0000,%eax
to:
65 f7 05 00 00 00 00 testl $0xf0000,%gs:0x0(%rip)
00 00 0f 00
and in a similar way for:
testl $0xf0000,%gs:0x0(%rip)
testb $0x1,%gs:0x0(%rip)
testl $0xff00,%gs:0x0(%rip)
cmpb $0x0,%gs:0x0(%rip)
cmp %gs:0x0(%rip),%r14d
cmpw $0x8,%gs:0x0(%rip)
cmpb $0x0,%gs:(%rax)
c) into other insns:
the code improves from:
1a355: 83 fa ff cmp $0xffffffff,%edx
1a358: 75 07 jne 1a361 <...>
1a35a: 65 8b 15 00 00 00 00 mov %gs:0x0(%rip),%edx
1a361:
to:
1a35a: 83 fa ff cmp $0xffffffff,%edx
1a35d: 65 0f 44 15 00 00 00 cmove %gs:0x0(%rip),%edx
1a364: 00
The above propagations result in the following code size
improvements for current mainline kernel (with the default config),
compiled with
gcc (GCC) 12.3.1 20230508 (Red Hat 12.3.1-1)
text data bss dec hex filename
25508862 4386540 808388 30703790 1d480ae vmlinux-vanilla.o
25500922 4386532 808388 30695842 1d461a2 vmlinux-new.o
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Nadav Amit <namit@vmware.com>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Co-developed-by: Nadav Amit <namit@vmware.com>
Signed-off-by: Nadav Amit <namit@vmware.com>
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
---
v2: Rewrite code examples in the commit message.
---
arch/x86/include/asm/percpu.h | 65 +++++++++++++++++++++++++++++------
1 file changed, 54 insertions(+), 11 deletions(-)
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index da451202a1b9..60ea7755c0fe 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -400,13 +400,66 @@ do { \
#define this_cpu_read_stable_8(pcp) percpu_stable_op(8, "mov", pcp)
#define this_cpu_read_stable(pcp) __pcpu_size_call_return(this_cpu_read_stable_, pcp)
+#ifdef CONFIG_USE_X86_SEG_SUPPORT
+
+#define __raw_cpu_read(qual, pcp) \
+({ \
+ *(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp)); \
+})
+
+#define __raw_cpu_write(qual, pcp, val) \
+do { \
+ *(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp)) = (val); \
+} while (0)
+
+#define raw_cpu_read_1(pcp) __raw_cpu_read(, pcp)
+#define raw_cpu_read_2(pcp) __raw_cpu_read(, pcp)
+#define raw_cpu_read_4(pcp) __raw_cpu_read(, pcp)
+#define raw_cpu_write_1(pcp, val) __raw_cpu_write(, pcp, val)
+#define raw_cpu_write_2(pcp, val) __raw_cpu_write(, pcp, val)
+#define raw_cpu_write_4(pcp, val) __raw_cpu_write(, pcp, val)
+
+#define this_cpu_read_1(pcp) __raw_cpu_read(volatile, pcp)
+#define this_cpu_read_2(pcp) __raw_cpu_read(volatile, pcp)
+#define this_cpu_read_4(pcp) __raw_cpu_read(volatile, pcp)
+#define this_cpu_write_1(pcp, val) __raw_cpu_write(volatile, pcp, val)
+#define this_cpu_write_2(pcp, val) __raw_cpu_write(volatile, pcp, val)
+#define this_cpu_write_4(pcp, val) __raw_cpu_write(volatile, pcp, val)
+
+#ifdef CONFIG_X86_64
+#define raw_cpu_read_8(pcp) __raw_cpu_read(, pcp)
+#define raw_cpu_write_8(pcp, val) __raw_cpu_write(, pcp, val)
+
+#define this_cpu_read_8(pcp) __raw_cpu_read(volatile, pcp)
+#define this_cpu_write_8(pcp, val) __raw_cpu_write(volatile, pcp, val)
+#endif
+
+#else /* CONFIG_USE_X86_SEG_SUPPORT */
+
#define raw_cpu_read_1(pcp) percpu_from_op(1, , "mov", pcp)
#define raw_cpu_read_2(pcp) percpu_from_op(2, , "mov", pcp)
#define raw_cpu_read_4(pcp) percpu_from_op(4, , "mov", pcp)
-
#define raw_cpu_write_1(pcp, val) percpu_to_op(1, , "mov", (pcp), val)
#define raw_cpu_write_2(pcp, val) percpu_to_op(2, , "mov", (pcp), val)
#define raw_cpu_write_4(pcp, val) percpu_to_op(4, , "mov", (pcp), val)
+
+#define this_cpu_read_1(pcp) percpu_from_op(1, volatile, "mov", pcp)
+#define this_cpu_read_2(pcp) percpu_from_op(2, volatile, "mov", pcp)
+#define this_cpu_read_4(pcp) percpu_from_op(4, volatile, "mov", pcp)
+#define this_cpu_write_1(pcp, val) percpu_to_op(1, volatile, "mov", (pcp), val)
+#define this_cpu_write_2(pcp, val) percpu_to_op(2, volatile, "mov", (pcp), val)
+#define this_cpu_write_4(pcp, val) percpu_to_op(4, volatile, "mov", (pcp), val)
+
+#ifdef CONFIG_X86_64
+#define raw_cpu_read_8(pcp) percpu_from_op(8, , "mov", pcp)
+#define raw_cpu_write_8(pcp, val) percpu_to_op(8, , "mov", (pcp), val)
+
+#define this_cpu_read_8(pcp) percpu_from_op(8, volatile, "mov", pcp)
+#define this_cpu_write_8(pcp, val) percpu_to_op(8, volatile, "mov", (pcp), val)
+#endif
+
+#endif /* CONFIG_USE_X86_SEG_SUPPORT */
+
#define raw_cpu_add_1(pcp, val) percpu_add_op(1, , (pcp), val)
#define raw_cpu_add_2(pcp, val) percpu_add_op(2, , (pcp), val)
#define raw_cpu_add_4(pcp, val) percpu_add_op(4, , (pcp), val)
@@ -432,12 +485,6 @@ do { \
#define raw_cpu_xchg_2(pcp, val) raw_percpu_xchg_op(pcp, val)
#define raw_cpu_xchg_4(pcp, val) raw_percpu_xchg_op(pcp, val)
-#define this_cpu_read_1(pcp) percpu_from_op(1, volatile, "mov", pcp)
-#define this_cpu_read_2(pcp) percpu_from_op(2, volatile, "mov", pcp)
-#define this_cpu_read_4(pcp) percpu_from_op(4, volatile, "mov", pcp)
-#define this_cpu_write_1(pcp, val) percpu_to_op(1, volatile, "mov", (pcp), val)
-#define this_cpu_write_2(pcp, val) percpu_to_op(2, volatile, "mov", (pcp), val)
-#define this_cpu_write_4(pcp, val) percpu_to_op(4, volatile, "mov", (pcp), val)
#define this_cpu_add_1(pcp, val) percpu_add_op(1, volatile, (pcp), val)
#define this_cpu_add_2(pcp, val) percpu_add_op(2, volatile, (pcp), val)
#define this_cpu_add_4(pcp, val) percpu_add_op(4, volatile, (pcp), val)
@@ -476,8 +523,6 @@ do { \
* 32 bit must fall back to generic operations.
*/
#ifdef CONFIG_X86_64
-#define raw_cpu_read_8(pcp) percpu_from_op(8, , "mov", pcp)
-#define raw_cpu_write_8(pcp, val) percpu_to_op(8, , "mov", (pcp), val)
#define raw_cpu_add_8(pcp, val) percpu_add_op(8, , (pcp), val)
#define raw_cpu_and_8(pcp, val) percpu_to_op(8, , "and", (pcp), val)
#define raw_cpu_or_8(pcp, val) percpu_to_op(8, , "or", (pcp), val)
@@ -486,8 +531,6 @@ do { \
#define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, , pcp, oval, nval)
#define raw_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, , pcp, ovalp, nval)
-#define this_cpu_read_8(pcp) percpu_from_op(8, volatile, "mov", pcp)
-#define this_cpu_write_8(pcp, val) percpu_to_op(8, volatile, "mov", (pcp), val)
#define this_cpu_add_8(pcp, val) percpu_add_op(8, volatile, (pcp), val)
#define this_cpu_and_8(pcp, val) percpu_to_op(8, volatile, "and", (pcp), val)
#define this_cpu_or_8(pcp, val) percpu_to_op(8, volatile, "or", (pcp), val)
--
2.41.0
next prev parent reply other threads:[~2023-10-04 19:24 UTC|newest]
Thread overview: 37+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-10-04 14:49 [PATCH 0/4] x86/percpu: Use segment qualifiers Uros Bizjak
2023-10-04 14:49 ` [PATCH 1/4] x86/percpu: Update arch/x86/include/asm/percpu.h to the current tip Uros Bizjak
2023-10-04 14:49 ` [PATCH 2/4] x86/percpu: Enable named address spaces with known compiler version Uros Bizjak
2023-10-05 7:20 ` [tip: x86/percpu] " tip-bot2 for Uros Bizjak
2023-10-04 14:49 ` [PATCH 3/4] x86/percpu: Use compiler segment prefix qualifier Uros Bizjak
2023-10-05 7:20 ` [tip: x86/percpu] " tip-bot2 for Nadav Amit
2023-10-04 14:49 ` [PATCH 4/4] x86/percpu: Use C for percpu read/write accessors Uros Bizjak
2023-10-04 16:37 ` Ingo Molnar
2023-10-04 16:40 ` Ingo Molnar
2023-10-04 19:23 ` Uros Bizjak [this message]
2023-10-04 19:42 ` [PATCH v2 " Linus Torvalds
2023-10-04 20:07 ` Uros Bizjak
2023-10-04 20:12 ` Linus Torvalds
2023-10-04 20:19 ` Linus Torvalds
2023-10-04 20:22 ` Uros Bizjak
2023-10-05 7:06 ` Ingo Molnar
2023-10-05 7:40 ` Uros Bizjak
2023-10-05 7:20 ` [tip: x86/percpu] " tip-bot2 for Uros Bizjak
2023-10-08 17:59 ` [PATCH 4/4] " Linus Torvalds
2023-10-08 19:17 ` Uros Bizjak
2023-10-08 20:13 ` Linus Torvalds
2023-10-08 20:48 ` Linus Torvalds
2023-10-08 21:41 ` Uros Bizjak
2023-10-09 11:41 ` Ingo Molnar
2023-10-09 11:51 ` Ingo Molnar
2023-10-09 12:00 ` Uros Bizjak
2023-10-09 12:20 ` Ingo Molnar
2023-10-09 12:21 ` Nadav Amit
2023-10-09 12:42 ` Uros Bizjak
2023-10-09 12:53 ` Nadav Amit
2023-10-09 12:27 ` Uros Bizjak
2023-10-09 14:35 ` Uros Bizjak
2024-04-10 11:11 ` Andrey Konovalov
2024-04-10 11:21 ` Uros Bizjak
2024-04-10 11:24 ` Andrey Konovalov
2023-10-09 11:42 ` Ingo Molnar
2023-10-10 6:37 ` Uros Bizjak
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20231004192404.31733-1-ubizjak@gmail.com \
--to=ubizjak@gmail.com \
--cc=bp@alien8.de \
--cc=brgerst@gmail.com \
--cc=dvlasenk@redhat.com \
--cc=hpa@zytor.com \
--cc=jpoimboe@redhat.com \
--cc=linux-kernel@vger.kernel.org \
--cc=luto@kernel.org \
--cc=mingo@kernel.org \
--cc=namit@vmware.com \
--cc=peterz@infradead.org \
--cc=tglx@linutronix.de \
--cc=torvalds@linux-foundation.org \
--cc=x86@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).