All of lore.kernel.org
 help / color / mirror / Atom feed
From: Hugh Dickins <hughd@google.com>
To: Borislav Petkov <bp@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>,
	linux-kernel@vger.kernel.org, x86@kernel.org
Subject: x86: should clear_user() have alternatives?
Date: Mon, 7 Feb 2022 21:45:36 -0800 (PST)	[thread overview]
Message-ID: <2f5ca5e4-e250-a41c-11fb-a7f4ebc7e1c9@google.com> (raw)

Hi Boris,

I've noticed that clear_user() is slower than it need be:

dd if=/dev/zero of=/dev/null bs=1M count=1M
1099511627776 bytes (1.1 TB) copied, 45.9641 s, 23.9 GB/s
whereas with the hacked patch below
1099511627776 bytes (1.1 TB) copied, 33.4 s, 32.9 GB/s

That was on some Intel machine: IIRC an AMD went faster.

It's because clear_user() lacks alternatives, and uses a
nowadays suboptimal implementation; whereas clear_page()
and copy_user() do support alternatives.

I came across this because reading a sparse file on tmpfs uses
copy_page_to_iter() from the ZERO_PAGE(), and I wanted to change that
to iov_iter_zero(), which sounds obviously faster - but in fact slower.

I realize that dd'ing from /dev/zero to /dev/null, and sparse files on
tmpfs, are not prime candidates for optimization; and I've no idea how
much clear_user() normally gets used for long clears.

If I were capable of compiler asm, with alternatives, and knew at what
length ERMS becomes advantageous when clearing, I would be sending you
a proper patch.  As it is, I'm hoping to tempt someone else to do the
work!  Or reject it as too niche to bother with.

Thanks,
Hugh

Hacked patch against 5.16 (5.17-rc is a little different there):

 arch/x86/lib/copy_user_64.S |   26 ++++++++++++++++++++++++++
 arch/x86/lib/usercopy_64.c  |   36 +-----------------------------------
 2 files changed, 27 insertions(+), 35 deletions(-)

--- 5.16/arch/x86/lib/copy_user_64.S	2022-01-09 14:55:34.000000000 -0800
+++ erms/arch/x86/lib/copy_user_64.S	2022-01-22 16:57:21.156968363 -0800
@@ -392,3 +392,29 @@ SYM_FUNC_START(__copy_user_nocache)
 	_ASM_EXTABLE_CPY(41b, .L_fixup_1b_copy)
 SYM_FUNC_END(__copy_user_nocache)
 EXPORT_SYMBOL(__copy_user_nocache)
+
+/*
+ * Recent CPUs have added enhanced REP MOVSB/STOSB instructions.
+ * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled.
+ * Assume that's best for __clear_user(), until alternatives are provided
+ * (though would be better to avoid REP STOSB for short clears, if no FSRM).
+ *
+ * Input:
+ * rdi destination
+ * rsi count
+ *
+ * Output:
+ * rax uncopied bytes or 0 if successful.
+ */
+SYM_FUNC_START(__clear_user)
+	ASM_STAC
+	movl %esi,%ecx
+	xorq %rax,%rax
+1:	rep stosb
+2:	movl %ecx,%eax
+	ASM_CLAC
+	ret
+
+	_ASM_EXTABLE_UA(1b, 2b)
+SYM_FUNC_END(__clear_user)
+EXPORT_SYMBOL(__clear_user)
--- 5.16/arch/x86/lib/usercopy_64.c	2020-12-13 14:41:30.000000000 -0800
+++ erms/arch/x86/lib/usercopy_64.c	2022-01-20 18:40:04.125752474 -0800
@@ -13,43 +13,9 @@
 /*
  * Zero Userspace
  */
-
-unsigned long __clear_user(void __user *addr, unsigned long size)
-{
-	long __d0;
-	might_fault();
-	/* no memory constraint because it doesn't change any memory gcc knows
-	   about */
-	stac();
-	asm volatile(
-		"	testq  %[size8],%[size8]\n"
-		"	jz     4f\n"
-		"	.align 16\n"
-		"0:	movq $0,(%[dst])\n"
-		"	addq   $8,%[dst]\n"
-		"	decl %%ecx ; jnz   0b\n"
-		"4:	movq  %[size1],%%rcx\n"
-		"	testl %%ecx,%%ecx\n"
-		"	jz     2f\n"
-		"1:	movb   $0,(%[dst])\n"
-		"	incq   %[dst]\n"
-		"	decl %%ecx ; jnz  1b\n"
-		"2:\n"
-		".section .fixup,\"ax\"\n"
-		"3:	lea 0(%[size1],%[size8],8),%[size8]\n"
-		"	jmp 2b\n"
-		".previous\n"
-		_ASM_EXTABLE_UA(0b, 3b)
-		_ASM_EXTABLE_UA(1b, 2b)
-		: [size8] "=&c"(size), [dst] "=&D" (__d0)
-		: [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr));
-	clac();
-	return size;
-}
-EXPORT_SYMBOL(__clear_user);
-
 unsigned long clear_user(void __user *to, unsigned long n)
 {
+	might_fault();
 	if (access_ok(to, n))
 		return __clear_user(to, n);
 	return n;

             reply	other threads:[~2022-02-08  5:45 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-02-08  5:45 Hugh Dickins [this message]
2022-02-08 11:45 ` x86: should clear_user() have alternatives? David Laight
2022-02-08 12:52 ` Borislav Petkov
2022-02-08 23:36   ` David Laight
2022-02-09  6:18   ` Hugh Dickins
2022-04-07 23:21     ` Jason A. Donenfeld

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=2f5ca5e4-e250-a41c-11fb-a7f4ebc7e1c9@google.com \
    --to=hughd@google.com \
    --cc=bp@suse.de \
    --cc=linux-kernel@vger.kernel.org \
    --cc=peterz@infradead.org \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.