* [PATCH] x86/usercopy: speed up 64-bit __clear_user() with stos{b,q}
@ 2021-05-23 18:04 Samuel Neves
  2021-05-23 19:03 ` Borislav Petkov
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: Samuel Neves @ 2021-05-23 18:04 UTC (permalink / raw)
  To: x86, ak, linux-kernel; +Cc: Samuel Neves

The current 64-bit implementation of __clear_user consists of a simple loop
writing an 8-byte register per iteration. On typical x86_64 chips, this will
result in a rate of ~8 bytes per cycle.

On those same typical chips, much better is often possible, ranging from 16
to 32 to 64 bytes per cycle. Here we want to avoid bringing in vector
instructions for this, but we can still achieve something close to those fill
rates using `rep stos{b,q}`. This is actually how it is already done in
usercopy_32.c.
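
For illustration, the fill primitive itself is tiny. A user-space sketch of
the `rep stosb` variant (just the instruction, none of the uaccess/fault
machinery the kernel code needs) would be:

    #include <stddef.h>

    /* zero `len` bytes at `dst`: rep stosb takes RDI = destination,
     * RCX = count, AL = value to store */
    static void zero_rep_stosb(void *dst, size_t len)
    {
            asm volatile("rep stosb"
                         : "+D"(dst), "+c"(len)
                         : "a"(0)
                         : "memory");
    }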

This patch does precisely this. But because `rep stosb` can be slower for
short fills, I've retained the old loop for sizes below 256 bytes. This is a
somewhat arbitrary threshold; some documents say that `rep stosb` should be
faster after 128 bytes, whereas glibc puts the threshold at 2048 bytes (but
there it is competing against vector instructions). My measurements on
various (but not an exhaustive variety of) machines suggest this is a
reasonable threshold, but I could be mistaken.
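
Anyone wanting to reproduce the comparison can do it entirely in user space.
A rough sketch of such a harness (not the code behind the numbers above),
driving the zero_rep_stosb() routine sketched earlier against a plain
8-byte-store loop, looks like:

    #include <stdint.h>
    #include <stddef.h>
    #include <stdio.h>

    static inline uint64_t rdtsc(void)
    {
            uint32_t lo, hi;
            asm volatile("rdtsc" : "=a"(lo), "=d"(hi));
            return ((uint64_t)hi << 32) | lo;
    }

    /* old-style path: one 8-byte store per iteration (tail handling
     * omitted; the sizes below are all multiples of 8) */
    static void zero_loop(void *dst, size_t len)
    {
            uint64_t *p = dst;
            for (size_t i = 0; i < len / 8; i++)
                    p[i] = 0;
    }

    /* average cycles per fill of `len` bytes */
    static uint64_t cycles_per_fill(void (*fill)(void *, size_t),
                                    void *buf, size_t len)
    {
            enum { REPS = 1 << 16 };
            uint64_t t0 = rdtsc();

            for (int i = 0; i < REPS; i++)
                    fill(buf, len);
            return (rdtsc() - t0) / REPS;
    }

    int main(void)
    {
            static uint64_t buf[512];

            for (size_t len = 8; len <= sizeof(buf); len *= 2)
                    printf("%4zu bytes: loop %3llu cycles, stosb %3llu cycles\n", len,
                           (unsigned long long)cycles_per_fill(zero_loop, buf, len),
                           (unsigned long long)cycles_per_fill(zero_rep_stosb, buf, len));
            return 0;
    }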

It should also be mentioned that the existing code contains a bug. In the loop

    "0: movq $0,(%[dst])\n"
    "   addq   $8,%[dst]\n"
    "   decl %%ecx ; jnz   0b\n"

the `decl %%ecx` instruction truncates the register containing `size/8` to
32 bits, which means that calling __clear_user on a buffer longer than 32 GiB
would leave part of it unzeroed.
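
A minimal standalone fix for just that bug (this patch instead replaces the
loop wholesale) would be to decrement the full 64-bit register, e.g.:

    "0: movq $0,(%[dst])\n"
    "   addq   $8,%[dst]\n"
    "   decq %%rcx ; jnz   0b\n"

since `size/8` is bound to %rcx by the existing "=&c" constraint.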

This change is noticeable from userspace. That is in fact how I spotted it; in
a hashing benchmark that read from /dev/zero, around 10-15% of the CPU time
was spent in __clear_user. On a Skylake CPU, these are the before/after
figures:

Before:
$ dd if=/dev/zero of=/dev/null bs=1024k status=progress
94402248704 bytes (94 GB, 88 GiB) copied, 6 s, 15.7 GB/s

After:
$ dd if=/dev/zero of=/dev/null bs=1024k status=progress
446476320768 bytes (446 GB, 416 GiB) copied, 15 s, 29.8 GB/s

The difference decreases when reading in smaller increments, but I have
observed no slowdowns.
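
The __clear_user() hotspot itself can be confirmed with something along the
lines of (block count arbitrary):

$ perf record -- dd if=/dev/zero of=/dev/null bs=1024k count=65536
$ perf report --sort=symbol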

Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
---
 arch/x86/lib/usercopy_64.c | 59 +++++++++++++++++++++++++-------------
 1 file changed, 39 insertions(+), 20 deletions(-)

diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 508c81e97..af0f3089a 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -9,6 +9,7 @@
 #include <linux/export.h>
 #include <linux/uaccess.h>
 #include <linux/highmem.h>
+#include <asm/alternative.h>
 
 /*
  * Zero Userspace
@@ -16,33 +17,51 @@
 
 unsigned long __clear_user(void __user *addr, unsigned long size)
 {
-	long __d0;
+	long __d0, __d1;
 	might_fault();
 	/* no memory constraint because it doesn't change any memory gcc knows
 	   about */
 	stac();
 	asm volatile(
-		"	testq  %[size8],%[size8]\n"
-		"	jz     4f\n"
-		"	.align 16\n"
-		"0:	movq $0,(%[dst])\n"
-		"	addq   $8,%[dst]\n"
-		"	decl %%ecx ; jnz   0b\n"
-		"4:	movq  %[size1],%%rcx\n"
-		"	testl %%ecx,%%ecx\n"
-		"	jz     2f\n"
-		"1:	movb   $0,(%[dst])\n"
-		"	incq   %[dst]\n"
-		"	decl %%ecx ; jnz  1b\n"
-		"2:\n"
+		"	cmp    $256, %[size]\n"
+		"	jae    3f\n"  /* size >= 256 */
+		"	mov    %k[size], %k[aux]\n"
+		"	and    $7, %k[aux]\n"
+		"	shr    $3, %[size]\n"
+		"	jz     1f\n"  /* size < 8 */
+		".align 16\n"
+		"0:	movq   %%rax,(%[dst])\n"
+		"	add    $8,%[dst]\n"
+		"	dec    %[size]; jnz 0b\n"
+		"1:	mov    %k[aux],%k[size]\n"
+		"	test   %k[aux], %k[aux]\n"
+		"	jz     6f\n"
+		"2:	movb   %%al,(%[dst])\n"
+		"	inc    %[dst]\n"
+		"	dec    %k[size]; jnz 2b\n"
+		"	jmp	   6f\n"
+		"3:	\n"
+		ALTERNATIVE(
+			"mov   %k[size], %k[aux]\n"
+			"shr   $3, %[size]\n"
+			"and   $7, %k[aux]\n"
+			"4:    rep stosq\n"
+			"mov   %k[aux], %k[size]\n",
+			"",
+			X86_FEATURE_ERMS
+		)
+		"5: rep stosb\n"
+		"6:	\n"
 		".section .fixup,\"ax\"\n"
-		"3:	lea 0(%[size1],%[size8],8),%[size8]\n"
-		"	jmp 2b\n"
+		"7:	lea 0(%[aux],%[size],8),%[size]\n"
+		"	jmp    6b\n"
 		".previous\n"
-		_ASM_EXTABLE_UA(0b, 3b)
-		_ASM_EXTABLE_UA(1b, 2b)
-		: [size8] "=&c"(size), [dst] "=&D" (__d0)
-		: [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr));
+		_ASM_EXTABLE_UA(0b, 7b)
+		_ASM_EXTABLE_UA(2b, 6b)
+		_ASM_EXTABLE_UA(4b, 7b)
+		_ASM_EXTABLE_UA(5b, 6b)
+		: [size] "=&c"(size), [dst] "=&D" (__d0), [aux] "=&r"(__d1)
+		: "[size]" (size), "[dst]"(addr), "a"(0));
 	clac();
 	return size;
 }
-- 
2.31.1



* Re: [PATCH] x86/usercopy: speed up 64-bit __clear_user() with stos{b,q}
  2021-05-23 18:04 [PATCH] x86/usercopy: speed up 64-bit __clear_user() with stos{b,q} Samuel Neves
@ 2021-05-23 19:03 ` Borislav Petkov
  2021-05-24  3:43 ` Andi Kleen
  2021-05-24  8:29 ` David Laight
  2 siblings, 0 replies; 4+ messages in thread
From: Borislav Petkov @ 2021-05-23 19:03 UTC (permalink / raw)
  To: Samuel Neves; +Cc: x86, ak, linux-kernel

On Sun, May 23, 2021 at 07:04:23PM +0100, Samuel Neves wrote:
> The current 64-bit implementation of __clear_user consists of a simple loop
> writing an 8-byte register per iteration. On typical x86_64 chips, this will
> result in a rate of ~8 bytes per cycle.
> 
> On those same typical chips, much better is often possible, ranging from 16
> to 32 to 64 bytes per cycle. Here we want to avoid bringing vector
> instructions for this, but we can still achieve something close to those fill
> rates using `rep stos{b,q}`. This is actually how it is already done in
> usercopy_32.c.
> 
> This patch does precisely this. But because `rep stosb` can be slower for
> short fills, I've retained the old loop for sizes below 256 bytes.

Oh yes, you wanna retain the old code for old machines.

But instead of adding more unreadable asm, you can test the size and if
it is > 256 or whatever we decide is the magic value, call a separate
function which contains the ERMS alternative. Similar to how those
different functions are done in arch/x86/lib/copy_user_64.S.
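
I.e., something of this shape (names made up, just to show the structure):

    unsigned long __clear_user(void __user *addr, unsigned long size)
    {
            if (size < 256)
                    return clear_user_original(addr, size);  /* current qword + byte loops */
            return clear_user_rep(addr, size);  /* rep stosq/stosb, ERMS alternative inside */
    }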

> This is a somewhat arbitrary threshold; some documents say that `rep
> stosb` should be faster after 128 bytes, whereas glibc puts the
> threshold at 2048 bytes (but there it is competing against vector
> instructions). My measurements on various (but not an exhaustive
> variety of) machines suggest this is a reasonable threshold, but I
> could be mistaken.

Those measurements should be part of this commit message. Also, you
wanna test on the currently widely used microarchitectures.

> It should also be mentioned that the existent code contains a bug. In the loop
> 
>     "0: movq $0,(%[dst])\n"
>     "   addq   $8,%[dst]\n"
>     "   decl %%ecx ; jnz   0b\n"
> 
> The `decl %%ecx` instruction truncates the register containing `size/8` to
> 32 bits, which means that calling __clear_user on a buffer longer than 32 GiB
> would leave part of it unzeroed.

That needs to be a separate pre-patch fixing only this.

> This change is noticeable from userspace. That is in fact how I spotted it; in
> a hashing benchmark that read from /dev/zero, around 10-15% of the CPU time
> was spent in __clear_user. After this patch, on a Skylake CPU, these are the
> before/after figures:

I'm guessing you got those 10-15% with perf profiles?

It is a lot more persuasive when you have a before/after perf profile in
your commit message showing how __clear_user() disappears from the list
of hot functions.

> $ dd if=/dev/zero of=/dev/null bs=1024k status=progress
> 94402248704 bytes (94 GB, 88 GiB) copied, 6 s, 15.7 GB/s
> 
> $ dd if=/dev/zero of=/dev/null bs=1024k status=progress
> 446476320768 bytes (446 GB, 416 GiB) copied, 15 s, 29.8 GB/s

As said, you wanna test a couple of currently widespread architectures
and also use a proper benchmark (not dd) to make sure you're not
introducing regressions.

Thx.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette


* Re: [PATCH] x86/usercopy: speed up 64-bit __clear_user() with stos{b,q}
  2021-05-23 18:04 [PATCH] x86/usercopy: speed up 64-bit __clear_user() with stos{b,q} Samuel Neves
  2021-05-23 19:03 ` Borislav Petkov
@ 2021-05-24  3:43 ` Andi Kleen
  2021-05-24  8:29 ` David Laight
  2 siblings, 0 replies; 4+ messages in thread
From: Andi Kleen @ 2021-05-24  3:43 UTC (permalink / raw)
  To: Samuel Neves, x86, linux-kernel


On 5/23/2021 11:04 AM, Samuel Neves wrote:
> This change is noticeable from userspace. That is in fact how I spotted it; in
> a hashing benchmark that read from /dev/zero, around 10-15% of the CPU time
> was spent in __clear_user. After this patch, on a Skylake CPU, these are the
> before/after figures:
>
> $ dd if=/dev/zero of=/dev/null bs=1024k status=progress
> 94402248704 bytes (94 GB, 88 GiB) copied, 6 s, 15.7 GB/s

The question is of course if this actually represents any real workload.

-Andi


* RE: [PATCH] x86/usercopy: speed up 64-bit __clear_user() with stos{b,q}
  2021-05-23 18:04 [PATCH] x86/usercopy: speed up 64-bit __clear_user() with stos{b,q} Samuel Neves
  2021-05-23 19:03 ` Borislav Petkov
  2021-05-24  3:43 ` Andi Kleen
@ 2021-05-24  8:29 ` David Laight
  2 siblings, 0 replies; 4+ messages in thread
From: David Laight @ 2021-05-24  8:29 UTC (permalink / raw)
  To: 'Samuel Neves', x86, ak, linux-kernel

From: Samuel Neves
> Sent: 23 May 2021 19:04
> 
> The current 64-bit implementation of __clear_user consists of a simple loop
> writing an 8-byte register per iteration. On typical x86_64 chips, this will
> result in a rate of ~8 bytes per cycle.
> 
> On those same typical chips, much better is often possible, ranging from 16
> to 32 to 64 bytes per cycle. Here we want to avoid bringing vector
> instructions for this, but we can still achieve something close to those fill
> rates using `rep stos{b,q}`. This is actually how it is already done in
> usercopy_32.c.
> 
> This patch does precisely this. But because `rep stosb` can be slower for
> short fills, I've retained the old loop for sizes below 256 bytes. This is a
> somewhat arbitrary threshold; some documents say that `rep stosb` should be
> faster after 128 bytes, whereas glibc puts the threshold at 2048 bytes (but
> there it is competing against vector instructions). My measurements on
> various (but not an exhaustive variety of) machines suggest this is a
> reasonable threshold, but I could be mistaken.
> 
> It should also be mentioned that the existent code contains a bug. In the loop
> 
>     "0: movq $0,(%[dst])\n"
>     "   addq   $8,%[dst]\n"
>     "   decl %%ecx ; jnz   0b\n"
> 
> The `decl %%ecx` instruction truncates the register containing `size/8` to
> 32 bits, which means that calling __clear_user on a buffer longer than 32 GiB
> would leave part of it unzeroed.
> 
> This change is noticeable from userspace. That is in fact how I spotted it; in
> a hashing benchmark that read from /dev/zero, around 10-15% of the CPU time
> was spent in __clear_user. After this patch, on a Skylake CPU, these are the
> before/after figures:
> 
> $ dd if=/dev/zero of=/dev/null bs=1024k status=progress
> 94402248704 bytes (94 GB, 88 GiB) copied, 6 s, 15.7 GB/s
> 
> $ dd if=/dev/zero of=/dev/null bs=1024k status=progress
> 446476320768 bytes (446 GB, 416 GiB) copied, 15 s, 29.8 GB/s
> 
> The difference decreases when reading in smaller increments, but I have
> observed no slowdowns.
> 
> Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
> ---
>  arch/x86/lib/usercopy_64.c | 59 +++++++++++++++++++++++++-------------
>  1 file changed, 39 insertions(+), 20 deletions(-)
> 
> diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
> index 508c81e97..af0f3089a 100644
> --- a/arch/x86/lib/usercopy_64.c
> +++ b/arch/x86/lib/usercopy_64.c
> @@ -9,6 +9,7 @@
>  #include <linux/export.h>
>  #include <linux/uaccess.h>
>  #include <linux/highmem.h>
> +#include <asm/alternative.h>
> 
>  /*
>   * Zero Userspace
> @@ -16,33 +17,51 @@
> 
>  unsigned long __clear_user(void __user *addr, unsigned long size)
>  {
> -	long __d0;
> +	long __d0, __d1;
>  	might_fault();
>  	/* no memory constraint because it doesn't change any memory gcc knows
>  	   about */
>  	stac();
>  	asm volatile(
> -		"	testq  %[size8],%[size8]\n"
> -		"	jz     4f\n"
> -		"	.align 16\n"
> -		"0:	movq $0,(%[dst])\n"
> -		"	addq   $8,%[dst]\n"
> -		"	decl %%ecx ; jnz   0b\n"
> -		"4:	movq  %[size1],%%rcx\n"
> -		"	testl %%ecx,%%ecx\n"
> -		"	jz     2f\n"
> -		"1:	movb   $0,(%[dst])\n"
> -		"	incq   %[dst]\n"
> -		"	decl %%ecx ; jnz  1b\n"
> -		"2:\n"
> +		"	cmp    $256, %[size]\n"
> +		"	jae    3f\n"  /* size >= 256 */
> +		"	mov    %k[size], %k[aux]\n"
> +		"	and    $7, %k[aux]\n"
> +		"	shr    $3, %[size]\n"
> +		"	jz     1f\n"  /* size < 8 */
> +		".align 16\n"
> +		"0:	movq   %%rax,(%[dst])\n"
> +		"	add    $8,%[dst]\n"
> +		"	dec    %[size]; jnz 0b\n"

No need for a loop, just write zeros to the end of the buffer.
It may be worth doing that even if the size is a multiple of
8 and the last 'block zero' clears the same bytes.
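
I.e. something like this (untested; %[end] is a made-up operand holding the
original dst + size, and it assumes size >= 8):

    /* one possibly-overlapping 8-byte store covers the 1..7 byte tail */
    "	movq   %%rax, -8(%[end])\n"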

> +		"1:	mov    %k[aux],%k[size]\n"
> +		"	test   %k[aux], %k[aux]\n"
> +		"	jz     6f\n"
> +		"2:	movb   %%al,(%[dst])\n"
> +		"	inc    %[dst]\n"
> +		"	dec    %k[size]; jnz 2b\n"
> +		"	jmp	   6f\n"
> +		"3:	\n"
> +		ALTERNATIVE(
> +			"mov   %k[size], %k[aux]\n"
> +			"shr   $3, %[size]\n"
> +			"and   $7, %k[aux]\n"
> +			"4:    rep stosq\n"
> +			"mov   %k[aux], %k[size]\n",

You really don't want to use 'rep stosb' here.
There are a large class of x86 cpu where it is really horrid.
IIRC there is one small set (just before the ERMS ones) where
short 'rep movsb' isn't too bad.

	David

> +			"",
> +			X86_FEATURE_ERMS
> +		)
> +		"5: rep stosb\n"
> +		"6:	\n"
>  		".section .fixup,\"ax\"\n"
> -		"3:	lea 0(%[size1],%[size8],8),%[size8]\n"
> -		"	jmp 2b\n"
> +		"7:	lea 0(%[aux],%[size],8),%[size]\n"
> +		"	jmp    6b\n"
>  		".previous\n"
> -		_ASM_EXTABLE_UA(0b, 3b)
> -		_ASM_EXTABLE_UA(1b, 2b)
> -		: [size8] "=&c"(size), [dst] "=&D" (__d0)
> -		: [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr));
> +		_ASM_EXTABLE_UA(0b, 7b)
> +		_ASM_EXTABLE_UA(2b, 6b)
> +		_ASM_EXTABLE_UA(4b, 7b)
> +		_ASM_EXTABLE_UA(5b, 6b)
> +		: [size] "=&c"(size), [dst] "=&D" (__d0), [aux] "=&r"(__d1)
> +		: "[size]" (size), "[dst]"(addr), "a"(0));
>  	clac();
>  	return size;
>  }
> --
> 2.31.1



