* [PATCH] x86: branchless clear_page()
@ 2016-08-22 20:52 Alexey Dobriyan
  2016-08-22 20:54 ` Alexey Dobriyan
                   ` (3 more replies)
  0 siblings, 4 replies; 9+ messages in thread
From: Alexey Dobriyan @ 2016-08-22 20:52 UTC (permalink / raw)
  To: x86, linux-kernel

Apply alternatives at the call site instead of in the function body.
Save a branch per cleared page.

Bonus: tell gcc to clobber only RDI, RAX and RCX instead of
the whole shebang of registers.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---

 arch/x86/include/asm/page_64.h |   16 +++++++++++++++-
 arch/x86/lib/clear_page_64.S   |   18 ++++++------------
 2 files changed, 21 insertions(+), 13 deletions(-)

--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -4,6 +4,7 @@
 #include <asm/page_64_types.h>
 
 #ifndef __ASSEMBLY__
+#include <asm/alternative.h>
 
 /* duplicated to the one in bootmem.h */
 extern unsigned long max_pfn;
@@ -34,7 +35,20 @@ extern unsigned long __phys_addr_symbol(unsigned long);
 #define pfn_valid(pfn)          ((pfn) < max_pfn)
 #endif
 
-void clear_page(void *page);
+void clear_page_mov(void *page);
+void clear_page_rep_stosq(void *page);
+void clear_page_rep_stosb(void *page);
+static __always_inline void clear_page(void *page)
+{
+	alternative_call_2(
+		clear_page_mov,
+		clear_page_rep_stosq, X86_FEATURE_REP_GOOD,
+		clear_page_rep_stosb, X86_FEATURE_ERMS,
+		"=D" (page),
+		"0" (page)
+		: "rax", "rcx", "memory"
+	);
+}
 void copy_page(void *to, void *from);
 
 #endif	/* !__ASSEMBLY__ */
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,6 +1,4 @@
 #include <linux/linkage.h>
-#include <asm/cpufeatures.h>
-#include <asm/alternative-asm.h>
 
 /*
  * Most CPUs support enhanced REP MOVSB/STOSB instructions. It is
@@ -13,18 +11,14 @@
  * Zero a page.
  * %rdi	- page
  */
-ENTRY(clear_page)
-
-	ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \
-		      "jmp clear_page_c_e", X86_FEATURE_ERMS
-
+ENTRY(clear_page_rep_stosq)
 	movl $4096/8,%ecx
 	xorl %eax,%eax
 	rep stosq
 	ret
-ENDPROC(clear_page)
+ENDPROC(clear_page_rep_stosq)
 
-ENTRY(clear_page_orig)
+ENTRY(clear_page_mov)
 
 	xorl   %eax,%eax
 	movl   $4096/64,%ecx
@@ -44,11 +38,11 @@ ENTRY(clear_page_orig)
 	jnz	.Lloop
 	nop
 	ret
-ENDPROC(clear_page_orig)
+ENDPROC(clear_page_mov)
 
-ENTRY(clear_page_c_e)
+ENTRY(clear_page_rep_stosb)
 	movl $4096,%ecx
 	xorl %eax,%eax
 	rep stosb
 	ret
-ENDPROC(clear_page_c_e)
+ENDPROC(clear_page_rep_stosb)
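
For readers who haven't met alternative_call_2(): at boot,
apply_alternatives() rewrites the CALL target at each call site
according to CPU features, so selection happens once at patch time and
no branch survives at runtime.  A simplified C model of the selection
order (clear_page_model is a hypothetical name; this is not the actual
macro expansion):

	/*
	 * What the patched call site above behaves like; the later,
	 * more specific feature wins: ERMS over REP_GOOD.
	 */
	static void clear_page_model(void *page)
	{
		if (boot_cpu_has(X86_FEATURE_ERMS))
			clear_page_rep_stosb(page);
		else if (boot_cpu_has(X86_FEATURE_REP_GOOD))
			clear_page_rep_stosq(page);
		else
			clear_page_mov(page);
	}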

* Re: [PATCH] x86: branchless clear_page()
  2016-08-22 20:52 [PATCH] x86: branchless clear_page() Alexey Dobriyan
@ 2016-08-22 20:54 ` Alexey Dobriyan
  2016-08-22 21:06 ` Borislav Petkov
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 9+ messages in thread
From: Alexey Dobriyan @ 2016-08-22 20:54 UTC (permalink / raw)
  To: x86, linux-kernel

On Mon, Aug 22, 2016 at 11:52:29PM +0300, Alexey Dobriyan wrote:
> Apply alternatives at the call site instead of in the function body.
> Save a branch per cleared page.
> 
> Bonus: tell gcc to clobber only RDI, RAX and RCX instead of
> the whole shebang of registers.

Rename the individual clear_page() functions after their
implementation details (the _c_e suffix sucks).

* Re: [PATCH] x86: branchless clear_page()
  2016-08-22 20:52 [PATCH] x86: branchless clear_page() Alexey Dobriyan
  2016-08-22 20:54 ` Alexey Dobriyan
@ 2016-08-22 21:06 ` Borislav Petkov
  2016-08-23 10:46   ` Alexey Dobriyan
  2016-08-23 13:43 ` Brian Gerst
  2016-08-23 16:19 ` [PATCH 1/3] " Alexey Dobriyan
  3 siblings, 1 reply; 9+ messages in thread
From: Borislav Petkov @ 2016-08-22 21:06 UTC (permalink / raw)
  To: Alexey Dobriyan; +Cc: x86, linux-kernel

On Mon, Aug 22, 2016 at 11:52:29PM +0300, Alexey Dobriyan wrote:
> Apply alternatives at the call site instead of in the function body.
> Save a branch per cleared page.

Any size increase to an allyesconfig vmlinux and increase to
alternatives application time?

Booting with "debug-alternative" should help.

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.
--

* Re: [PATCH] x86: branchless clear_page()
  2016-08-22 21:06 ` Borislav Petkov
@ 2016-08-23 10:46   ` Alexey Dobriyan
  0 siblings, 0 replies; 9+ messages in thread
From: Alexey Dobriyan @ 2016-08-23 10:46 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: x86, Linux Kernel

On Tue, Aug 23, 2016 at 12:06 AM, Borislav Petkov <bp@alien8.de> wrote:
> On Mon, Aug 22, 2016 at 11:52:29PM +0300, Alexey Dobriyan wrote:
>> Apply alternatives at the call site instead of in the function body.
>> Save a branch per cleared page.
>
> Any size increase to an allyesconfig vmlinux and increase to
> alternatives application time?
>
> Booting with "debug-alternative" should help.

An x86_64 allyesconfig kernel lists a whole 56 call sites for
clear_page().  My usual kernel lists 8.

There are tons of copy_from_user/copy_to_user entries plus POPCNT
and CLAC/STAC, so I think those several dozen clear_page() sites
don't matter.

* Re: [PATCH] x86: branchless clear_page()
  2016-08-22 20:52 [PATCH] x86: branchless clear_page() Alexey Dobriyan
  2016-08-22 20:54 ` Alexey Dobriyan
  2016-08-22 21:06 ` Borislav Petkov
@ 2016-08-23 13:43 ` Brian Gerst
  2016-08-23 16:15   ` Alexey Dobriyan
  2016-08-23 16:19 ` [PATCH 1/3] " Alexey Dobriyan
  3 siblings, 1 reply; 9+ messages in thread
From: Brian Gerst @ 2016-08-23 13:43 UTC (permalink / raw)
  To: Alexey Dobriyan; +Cc: the arch/x86 maintainers, Linux Kernel Mailing List

On Mon, Aug 22, 2016 at 4:52 PM, Alexey Dobriyan <adobriyan@gmail.com> wrote:
> Apply alternatives at the call site instead of in the function body.
> Save a branch per cleared page.
>
> Bonus: tell gcc to clobber only RDI, RAX and RCX instead of
> the whole shebang of registers.
>
> Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
> ---
>
>  arch/x86/include/asm/page_64.h |   16 +++++++++++++++-
>  arch/x86/lib/clear_page_64.S   |   18 ++++++------------
>  2 files changed, 21 insertions(+), 13 deletions(-)
>
> --- a/arch/x86/include/asm/page_64.h
> +++ b/arch/x86/include/asm/page_64.h
> @@ -4,6 +4,7 @@
>  #include <asm/page_64_types.h>
>
>  #ifndef __ASSEMBLY__
> +#include <asm/alternative.h>
>
>  /* duplicated to the one in bootmem.h */
>  extern unsigned long max_pfn;
> @@ -34,7 +35,20 @@ extern unsigned long __phys_addr_symbol(unsigned long);
>  #define pfn_valid(pfn)          ((pfn) < max_pfn)
>  #endif
>
> -void clear_page(void *page);
> +void clear_page_mov(void *page);
> +void clear_page_rep_stosq(void *page);
> +void clear_page_rep_stosb(void *page);
> +static __always_inline void clear_page(void *page)
> +{
> +       alternative_call_2(
> +               clear_page_mov,
> +               clear_page_rep_stosq, X86_FEATURE_REP_GOOD,
> +               clear_page_rep_stosb, X86_FEATURE_ERMS,
> +               "=D" (page),
> +               "0" (page)
> +               : "rax", "rcx", "memory"
> +       );
> +}
>  void copy_page(void *to, void *from);
>
>  #endif /* !__ASSEMBLY__ */
> --- a/arch/x86/lib/clear_page_64.S
> +++ b/arch/x86/lib/clear_page_64.S
> @@ -1,6 +1,4 @@
>  #include <linux/linkage.h>
> -#include <asm/cpufeatures.h>
> -#include <asm/alternative-asm.h>
>
>  /*
>   * Most CPUs support enhanced REP MOVSB/STOSB instructions. It is
> @@ -13,18 +11,14 @@
>   * Zero a page.
>   * %rdi        - page
>   */
> -ENTRY(clear_page)
> -
> -       ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \
> -                     "jmp clear_page_c_e", X86_FEATURE_ERMS
> -
> +ENTRY(clear_page_rep_stosq)
>         movl $4096/8,%ecx
>         xorl %eax,%eax
>         rep stosq
>         ret
> -ENDPROC(clear_page)
> +ENDPROC(clear_page_rep_stosq)
>
> -ENTRY(clear_page_orig)
> +ENTRY(clear_page_mov)
>
>         xorl   %eax,%eax
>         movl   $4096/64,%ecx
> @@ -44,11 +38,11 @@ ENTRY(clear_page_orig)
>         jnz     .Lloop
>         nop
>         ret
> -ENDPROC(clear_page_orig)
> +ENDPROC(clear_page_mov)
>
> -ENTRY(clear_page_c_e)
> +ENTRY(clear_page_rep_stosb)
>         movl $4096,%ecx
>         xorl %eax,%eax
>         rep stosb
>         ret
> -ENDPROC(clear_page_c_e)
> +ENDPROC(clear_page_rep_stosb)

I like this idea, but does it make sense to take it a step further and
inline the string instruction alternatives to avoid a call altogether?

Also, 32-bit should be converted to do the same thing as 64-bit.

--
Brian Gerst

* Re: [PATCH] x86: branchless clear_page()
  2016-08-23 13:43 ` Brian Gerst
@ 2016-08-23 16:15   ` Alexey Dobriyan
  0 siblings, 0 replies; 9+ messages in thread
From: Alexey Dobriyan @ 2016-08-23 16:15 UTC (permalink / raw)
  To: Brian Gerst; +Cc: the arch/x86 maintainers, Linux Kernel Mailing List

On Tue, Aug 23, 2016 at 09:43:25AM -0400, Brian Gerst wrote:
> > -ENTRY(clear_page_c_e)
> > +ENTRY(clear_page_rep_stosb)
> >         movl $4096,%ecx
> >         xorl %eax,%eax
> >         rep stosb
> >         ret
> > -ENDPROC(clear_page_c_e)
> > +ENDPROC(clear_page_rep_stosb)
> 
> I like this idea, but does it make sense to take it a step further and
> inline the string instruction alternatives to avoid a call altogether?

That is easy, but then you can't do runtime patching.
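
For the record, inlining the string variants as suggested could look
like the sketch below, assuming the kernel's ALTERNATIVE() macro from
<asm/alternative.h> (hypothetical code, not from this thread; the
unrolled-mov fallback for !REP_GOOD CPUs is omitted since it is far
too large to inline):

	static __always_inline void clear_page_inline(void *page)
	{
		asm volatile(ALTERNATIVE(
				"movl $4096/8, %%ecx\n\trep stosq",
				"movl $4096, %%ecx\n\trep stosb",
				X86_FEATURE_ERMS)
			     : "+D" (page)	/* rep stos* advances RDI */
			     : "a" (0)		/* the byte/qword to store */
			     : "rcx", "memory");
	}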

> Also, 32-bit should be converted to do the same thing as 64-bit.

I don't run 32-bit. :-)

* [PATCH 1/3] x86: branchless clear_page()
  2016-08-22 20:52 [PATCH] x86: branchless clear_page() Alexey Dobriyan
                   ` (2 preceding siblings ...)
  2016-08-23 13:43 ` Brian Gerst
@ 2016-08-23 16:19 ` Alexey Dobriyan
  2016-08-23 16:25   ` [PATCH 2/3] x86: support REP MOVSB copy_page() Alexey Dobriyan
  3 siblings, 1 reply; 9+ messages in thread
From: Alexey Dobriyan @ 2016-08-23 16:19 UTC (permalink / raw)
  To: x86, linux-kernel

Apply alternatives at the call site instead of in the function body.
Save a branch per cleared page.

Bonus: tell gcc to clobber only RDI, RAX and RCX instead of
the whole shebang of registers.

Rename the individual clear_page() functions after their
implementation details.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---

	Export the individual functions.

 arch/x86/include/asm/page_64.h   |   16 +++++++++++++++-
 arch/x86/kernel/x8664_ksyms_64.c |    4 +++-
 arch/x86/lib/clear_page_64.S     |   18 ++++++------------
 3 files changed, 24 insertions(+), 14 deletions(-)

--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -4,6 +4,7 @@
 #include <asm/page_64_types.h>
 
 #ifndef __ASSEMBLY__
+#include <asm/alternative.h>
 
 /* duplicated to the one in bootmem.h */
 extern unsigned long max_pfn;
@@ -34,7 +35,20 @@ extern unsigned long __phys_addr_symbol(unsigned long);
 #define pfn_valid(pfn)          ((pfn) < max_pfn)
 #endif
 
-void clear_page(void *page);
+void clear_page_mov(void *page);
+void clear_page_rep_stosq(void *page);
+void clear_page_rep_stosb(void *page);
+static __always_inline void clear_page(void *page)
+{
+	alternative_call_2(
+		clear_page_mov,
+		clear_page_rep_stosq, X86_FEATURE_REP_GOOD,
+		clear_page_rep_stosb, X86_FEATURE_ERMS,
+		"=D" (page),
+		"0" (page)
+		: "rax", "rcx", "memory"
+	);
+}
 void copy_page(void *to, void *from);
 
 #endif	/* !__ASSEMBLY__ */
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -41,7 +41,9 @@ EXPORT_SYMBOL(_copy_to_user);
 EXPORT_SYMBOL_GPL(memcpy_mcsafe);
 
 EXPORT_SYMBOL(copy_page);
-EXPORT_SYMBOL(clear_page);
+EXPORT_SYMBOL(clear_page_mov);
+EXPORT_SYMBOL(clear_page_rep_stosq);
+EXPORT_SYMBOL(clear_page_rep_stosb);
 
 EXPORT_SYMBOL(csum_partial);
 
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,6 +1,4 @@
 #include <linux/linkage.h>
-#include <asm/cpufeatures.h>
-#include <asm/alternative-asm.h>
 
 /*
  * Most CPUs support enhanced REP MOVSB/STOSB instructions. It is
@@ -13,18 +11,14 @@
  * Zero a page.
  * %rdi	- page
  */
-ENTRY(clear_page)
-
-	ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \
-		      "jmp clear_page_c_e", X86_FEATURE_ERMS
-
+ENTRY(clear_page_rep_stosq)
 	movl $4096/8,%ecx
 	xorl %eax,%eax
 	rep stosq
 	ret
-ENDPROC(clear_page)
+ENDPROC(clear_page_rep_stosq)
 
-ENTRY(clear_page_orig)
+ENTRY(clear_page_mov)
 
 	xorl   %eax,%eax
 	movl   $4096/64,%ecx
@@ -44,11 +38,11 @@ ENTRY(clear_page_orig)
 	jnz	.Lloop
 	nop
 	ret
-ENDPROC(clear_page_orig)
+ENDPROC(clear_page_mov)
 
-ENTRY(clear_page_c_e)
+ENTRY(clear_page_rep_stosb)
 	movl $4096,%ecx
 	xorl %eax,%eax
 	rep stosb
 	ret
-ENDPROC(clear_page_c_e)
+ENDPROC(clear_page_rep_stosb)

* [PATCH 2/3] x86: support REP MOVSB copy_page()
  2016-08-23 16:19 ` [PATCH 1/3] " Alexey Dobriyan
@ 2016-08-23 16:25   ` Alexey Dobriyan
  2016-08-23 16:28     ` [PATCH 3/3] x86: branchless copy_page() Alexey Dobriyan
  0 siblings, 1 reply; 9+ messages in thread
From: Alexey Dobriyan @ 2016-08-23 16:25 UTC (permalink / raw)
  To: x86, linux-kernel

A microbenchmark shows that the "REP MOVSB" copy_page() is faster
than the "REP MOVSQ" version on an Intel i5-something Haswell CPU
with REP_GOOD/ERMS.

N=1<<27
rep movsq:	6.758841901 ± 0.04%
rep movsb:	6.253927309 ± 0.02%
-----------------------------------
			-7.5%

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---

 arch/x86/lib/copy_page_64.S |   11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -12,12 +12,21 @@
  */
 	ALIGN
 ENTRY(copy_page)
-	ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD
+	ALTERNATIVE_2 "jmp copy_page_regs",	\
+		"", X86_FEATURE_REP_GOOD,	\
+		"jmp copy_page_rep_movsb", X86_FEATURE_ERMS
+
 	movl	$4096/8, %ecx
 	rep	movsq
 	ret
 ENDPROC(copy_page)
 
+ENTRY(copy_page_rep_movsb)
+	mov	$4096, %ecx
+	rep movsb
+	ret
+ENDPROC(copy_page_rep_movsb)
+
 ENTRY(copy_page_regs)
 	subq	$2*8,	%rsp
 	movq	%rbx,	(%rsp)
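
The harness itself isn't shown in the thread; a minimal userspace
sketch of the kind of loop behind these numbers (hypothetical: names,
timing and page alignment are elided, and the movsq variant would use
a count of 4096/8):

	#include <stdint.h>

	#define N (1UL << 27)

	static uint8_t src[4096], dst[4096];

	/* Copy one 4 KiB page with REP MOVSB. */
	static inline void copy_page_movsb(void *to, void *from)
	{
		uint64_t cnt = 4096;

		asm volatile("rep movsb"
			     : "+D" (to), "+S" (from), "+c" (cnt)
			     : : "memory");
	}

	int main(void)
	{
		for (uint64_t i = 0; i < N; i++)
			copy_page_movsb(dst, src);	/* time this loop */
		return 0;
	}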

* [PATCH 3/3] x86: branchless copy_page()
  2016-08-23 16:25   ` [PATCH 2/3] x86: support REP MOVSB copy_page() Alexey Dobriyan
@ 2016-08-23 16:28     ` Alexey Dobriyan
  0 siblings, 0 replies; 9+ messages in thread
From: Alexey Dobriyan @ 2016-08-23 16:28 UTC (permalink / raw)
  To: x86, linux-kernel

Apply alternatives at the call site instead of in the function body.
Save a branch per copied page.

Rename the individual functions so profiles immediately show
which method is being used.

RBX and R12 are saved and restored by the unrolled-mov
implementation, so don't clobber them explicitly.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---

 arch/x86/include/asm/page_64.h   |   15 ++++++++++++++-
 arch/x86/kernel/x8664_ksyms_64.c |    4 +++-
 arch/x86/lib/copy_page_64.S      |   14 ++++----------
 3 files changed, 21 insertions(+), 12 deletions(-)

--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -49,8 +49,21 @@ static __always_inline void clear_page(void *page)
 		: "rax", "rcx", "memory"
 	);
 }
-void copy_page(void *to, void *from);
 
+void copy_page_mov(void *to, void *from);
+void copy_page_rep_movsq(void *to, void *from);
+void copy_page_rep_movsb(void *to, void *from);
+static __always_inline void copy_page(void *to, void *from)
+{
+	alternative_call_2(
+		copy_page_mov,
+		copy_page_rep_movsq, X86_FEATURE_REP_GOOD,
+		copy_page_rep_movsb, X86_FEATURE_ERMS,
+		ASM_OUTPUT2("=D" (to), "=S" (from)),
+		"0" (to), "1" (from)
+		: "rax", "rcx", "rdx", "r8", "r9", "r10", "r11", "memory"
+	);
+}
 #endif	/* !__ASSEMBLY__ */
 
 #ifdef CONFIG_X86_VSYSCALL_EMULATION
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -40,7 +40,9 @@ EXPORT_SYMBOL(_copy_to_user);
 
 EXPORT_SYMBOL_GPL(memcpy_mcsafe);
 
-EXPORT_SYMBOL(copy_page);
+EXPORT_SYMBOL(copy_page_mov);
+EXPORT_SYMBOL(copy_page_rep_movsq);
+EXPORT_SYMBOL(copy_page_rep_movsb);
 EXPORT_SYMBOL(clear_page_mov);
 EXPORT_SYMBOL(clear_page_rep_stosq);
 EXPORT_SYMBOL(clear_page_rep_stosb);
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -1,8 +1,6 @@
 /* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
 
 #include <linux/linkage.h>
-#include <asm/cpufeatures.h>
-#include <asm/alternative-asm.h>
 
 /*
  * Some CPUs run faster using the string copy instructions (sane microcode).
@@ -11,15 +9,11 @@
  * prefetch distance based on SMP/UP.
  */
 	ALIGN
-ENTRY(copy_page)
-	ALTERNATIVE_2 "jmp copy_page_regs",	\
-		"", X86_FEATURE_REP_GOOD,	\
-		"jmp copy_page_rep_movsb", X86_FEATURE_ERMS
-
+ENTRY(copy_page_rep_movsq)
 	movl	$4096/8, %ecx
 	rep	movsq
 	ret
-ENDPROC(copy_page)
+ENDPROC(copy_page_rep_movsq)
 
 ENTRY(copy_page_rep_movsb)
 	mov	$4096, %ecx
@@ -27,7 +21,7 @@ ENTRY(copy_page_rep_movsb)
 	ret
 ENDPROC(copy_page_rep_movsb)
 
-ENTRY(copy_page_regs)
+ENTRY(copy_page_mov)
 	subq	$2*8,	%rsp
 	movq	%rbx,	(%rsp)
 	movq	%r12,	1*8(%rsp)
@@ -92,4 +86,4 @@ ENTRY(copy_page_regs)
 	movq	1*8(%rsp), %r12
 	addq	$2*8, %rsp
 	ret
-ENDPROC(copy_page_regs)
+ENDPROC(copy_page_mov)
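
Usage note: callers are unchanged by this series; each of the lines
below compiles to a single boot-patched CALL with no dispatch branch
(illustrative caller, not from the thread):

	static void refresh_page(void *dst, void *src)
	{
		clear_page(dst);	/* patched CALL, no branch */
		copy_page(dst, src);	/* likewise */
	}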
