linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* x86: should clear_user() have alternatives?
@ 2022-02-08  5:45 Hugh Dickins
  2022-02-08 11:45 ` David Laight
  2022-02-08 12:52 ` Borislav Petkov
  0 siblings, 2 replies; 6+ messages in thread
From: Hugh Dickins @ 2022-02-08  5:45 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: Peter Zijlstra, linux-kernel, x86

Hi Boris,

I've noticed that clear_user() is slower than it need be:

dd if=/dev/zero of=/dev/null bs=1M count=1M
1099511627776 bytes (1.1 TB) copied, 45.9641 s, 23.9 GB/s
whereas with the hacked patch below
1099511627776 bytes (1.1 TB) copied, 33.4 s, 32.9 GB/s

That was on some Intel machine: IIRC an AMD went faster.

It's because clear_user() lacks alternatives, and uses a
nowadays suboptimal implementation; whereas clear_page()
and copy_user() do support alternatives.

I came across this because reading a sparse file on tmpfs uses
copy_page_to_iter() from the ZERO_PAGE(), and I wanted to change that
to iov_iter_zero(), which sounds obviously faster - but in fact slower.

I realize that dd'ing from /dev/zero to /dev/null, and sparse files on
tmpfs, are not prime candidates for optimization; and I've no idea how
much clear_user() normally gets used for long clears.

If I were capable of compiler asm, with alternatives, and knew at what
length ERMS becomes advantageous when clearing, I would be sending you
a proper patch.  As it is, I'm hoping to tempt someone else to do the
work!  Or reject it as too niche to bother with.

Thanks,
Hugh

Hacked patch against 5.16 (5.17-rc is a little different there):

 arch/x86/lib/copy_user_64.S |   26 ++++++++++++++++++++++++++
 arch/x86/lib/usercopy_64.c  |   36 +-----------------------------------
 2 files changed, 27 insertions(+), 35 deletions(-)

--- 5.16/arch/x86/lib/copy_user_64.S	2022-01-09 14:55:34.000000000 -0800
+++ erms/arch/x86/lib/copy_user_64.S	2022-01-22 16:57:21.156968363 -0800
@@ -392,3 +392,29 @@ SYM_FUNC_START(__copy_user_nocache)
 	_ASM_EXTABLE_CPY(41b, .L_fixup_1b_copy)
 SYM_FUNC_END(__copy_user_nocache)
 EXPORT_SYMBOL(__copy_user_nocache)
+
+/*
+ * Recent CPUs have added enhanced REP MOVSB/STOSB instructions.
+ * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled.
+ * Assume that's best for __clear_user(), until alternatives are provided
+ * (though would be better to avoid REP STOSB for short clears, if no FSRM).
+ *
+ * Input:
+ * rdi destination
+ * rsi count
+ *
+ * Output:
+ * rax uncopied bytes or 0 if successful.
+ */
+SYM_FUNC_START(__clear_user)
+	ASM_STAC
+	movl %esi,%ecx
+	xorq %rax,%rax
+1:	rep stosb
+2:	movl %ecx,%eax
+	ASM_CLAC
+	ret
+
+	_ASM_EXTABLE_UA(1b, 2b)
+SYM_FUNC_END(__clear_user)
+EXPORT_SYMBOL(__clear_user)
--- 5.16/arch/x86/lib/usercopy_64.c	2020-12-13 14:41:30.000000000 -0800
+++ erms/arch/x86/lib/usercopy_64.c	2022-01-20 18:40:04.125752474 -0800
@@ -13,43 +13,9 @@
 /*
  * Zero Userspace
  */
-
-unsigned long __clear_user(void __user *addr, unsigned long size)
-{
-	long __d0;
-	might_fault();
-	/* no memory constraint because it doesn't change any memory gcc knows
-	   about */
-	stac();
-	asm volatile(
-		"	testq  %[size8],%[size8]\n"
-		"	jz     4f\n"
-		"	.align 16\n"
-		"0:	movq $0,(%[dst])\n"
-		"	addq   $8,%[dst]\n"
-		"	decl %%ecx ; jnz   0b\n"
-		"4:	movq  %[size1],%%rcx\n"
-		"	testl %%ecx,%%ecx\n"
-		"	jz     2f\n"
-		"1:	movb   $0,(%[dst])\n"
-		"	incq   %[dst]\n"
-		"	decl %%ecx ; jnz  1b\n"
-		"2:\n"
-		".section .fixup,\"ax\"\n"
-		"3:	lea 0(%[size1],%[size8],8),%[size8]\n"
-		"	jmp 2b\n"
-		".previous\n"
-		_ASM_EXTABLE_UA(0b, 3b)
-		_ASM_EXTABLE_UA(1b, 2b)
-		: [size8] "=&c"(size), [dst] "=&D" (__d0)
-		: [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr));
-	clac();
-	return size;
-}
-EXPORT_SYMBOL(__clear_user);
-
 unsigned long clear_user(void __user *to, unsigned long n)
 {
+	might_fault();
 	if (access_ok(to, n))
 		return __clear_user(to, n);
 	return n;

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2022-04-07 23:22 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-02-08  5:45 x86: should clear_user() have alternatives? Hugh Dickins
2022-02-08 11:45 ` David Laight
2022-02-08 12:52 ` Borislav Petkov
2022-02-08 23:36   ` David Laight
2022-02-09  6:18   ` Hugh Dickins
2022-04-07 23:21     ` Jason A. Donenfeld

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).