From: Alexey Dobriyan <adobriyan@gmail.com>
To: tglx@linutronix.de, mingo@redhat.com, bp@alien8.de, hpa@zytor.com
Cc: linux-kernel@vger.kernel.org, x86@kernel.org,
	adobriyan@gmail.com, linux-kbuild@vger.kernel.org,
	yamada.masahiro@socionext.com, michal.lkml@markovi.net
Subject: [PATCH 4/5] x86_64, -march=native: REP STOSB support
Date: Mon, 22 Jul 2019 23:27:22 +0300
Message-ID: <20190722202723.13408-4-adobriyan@gmail.com>
In-Reply-To: <20190722202723.13408-1-adobriyan@gmail.com>

Use REP STOSB everywhere if the CPU advertises fast REP STOSB (the ERMS
feature flag, CPUID.(EAX=7,ECX=0):EBX bit 9).
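
For reference, the feature check used by scripts/kconfig/cpuid.c further
down boils down to the following user-space sketch (illustrative only,
not part of the patch; the cpuid2() helper is local to the example):

	#include <stdint.h>
	#include <stdio.h>

	/* CPUID leaf 7, subleaf 0: EBX bit 9 is ERMS (Enhanced REP MOVSB/STOSB). */
	static void cpuid2(uint32_t leaf, uint32_t subleaf, uint32_t *eax,
			   uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
	{
		asm volatile ("cpuid"
			      : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
			      : "0" (leaf), "2" (subleaf));
	}

	int main(void)
	{
		uint32_t eax, ebx, ecx, edx;

		cpuid2(0, 0, &eax, &ebx, &ecx, &edx);
		if (eax < 7)
			return 1;	/* leaf 7 not implemented */
		cpuid2(7, 0, &eax, &ebx, &ecx, &edx);
		printf("fast REP STOSB (ERMS): %s\n",
		       (ebx & (1 << 9)) ? "yes" : "no");
		return 0;
	}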

GCC loves to unroll memset(); forcing a plain REP STOSB expansion via
-mmemset-strategy saves a large amount of .text.
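
To illustrate what the -mmemset-strategy option added in the Makefile hunk
below changes (a sketch only: the struct and its size are made up, and the
exact default expansion depends on GCC version and tuning):

	#include <string.h>

	struct request {		/* hypothetical ~200 byte object */
		char buf[192];
		int flags;
	};

	void request_init(struct request *r)
	{
		/*
		 * At -O2 GCC tends to expand this builtin memset() inline as an
		 * unrolled/vectorized store sequence.  Built with
		 *   -mmemset-strategy=rep_byte:-1:align,rep_byte:-1:noalign
		 * the expansion becomes a single REP STOSB for any size, which
		 * is much smaller.
		 */
		memset(r, 0, sizeof(*r));
		r->flags = 1;
	}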

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 Makefile                           |  3 +++
 arch/x86/boot/compressed/head_64.S |  4 ++++
 arch/x86/crypto/sha1_ssse3_asm.S   |  7 ++++++-
 arch/x86/include/asm/page_64.h     | 13 +++++++++++++
 arch/x86/kernel/verify_cpu.S       |  2 +-
 arch/x86/lib/Makefile              |  2 ++
 arch/x86/lib/memset_64.S           | 15 +++++++++++++++
 arch/x86/lib/usercopy_64.c         | 16 +++++++++++++++-
 scripts/kconfig/cpuid.c            |  6 +++++-
 scripts/march-native.sh            |  1 +
 10 files changed, 65 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index aa194c96d27c..31a6375d0e31 100644
--- a/Makefile
+++ b/Makefile
@@ -612,6 +612,9 @@ endif
 ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
 KBUILD_CFLAGS += -mmemcpy-strategy=rep_byte:-1:align,rep_byte:-1:noalign
 endif
+ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+KBUILD_CFLAGS += -mmemset-strategy=rep_byte:-1:align,rep_byte:-1:noalign
+endif
 
 ifeq ($(KBUILD_EXTMOD),)
 # Objects we will link into vmlinux / subdirs we need to visit
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 6233ae35d0d9..a350d265e8af 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -520,8 +520,12 @@ relocated:
 	leaq    _bss(%rip), %rdi
 	leaq    _ebss(%rip), %rcx
 	subq	%rdi, %rcx
+#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+	rep stosb
+#else
 	shrq	$3, %rcx
 	rep	stosq
+#endif
 
 /*
  * Do the extraction, and jump to the new kernel..
diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
index 99c5b8c4dc38..c98f8f2aead6 100644
--- a/arch/x86/crypto/sha1_ssse3_asm.S
+++ b/arch/x86/crypto/sha1_ssse3_asm.S
@@ -90,10 +90,15 @@
 	SHA1_PIPELINED_MAIN_BODY
 
 	# cleanup workspace
-	mov	$8, %ecx
 	mov	%rsp, %rdi
 	xor	%eax, %eax
+#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+	mov	$64, %ecx
+	rep stosb
+#else
+	mov	$8, %ecx
 	rep stosq
+#endif
 
 	mov	%rbp, %rsp		# deallocate workspace
 	pop	%rbp
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 051da768273d..7654d5544e0b 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -40,6 +40,18 @@ extern unsigned long __phys_addr_symbol(unsigned long);
 #define pfn_valid(pfn)          ((pfn) < max_pfn)
 #endif
 
+#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+static __always_inline void clear_page(void *page)
+{
+	uint32_t len = PAGE_SIZE;
+	asm volatile (
+		"rep stosb"
+		: "+D" (page), "+c" (len)
+		: "a" (0)
+		: "memory"
+	);
+}
+#else
 void clear_page_orig(void *page);
 void clear_page_rep(void *page);
 void clear_page_erms(void *page);
@@ -53,6 +65,7 @@ static inline void clear_page(void *page)
 			   "0" (page)
 			   : "cc", "memory", "rax", "rcx");
 }
+#endif
 
 #ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
 static __always_inline void copy_page(void *to, void *from)
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index 57b41dafc592..d3f3370e7dab 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -142,7 +142,7 @@ ENTRY(verify_cpu)
 	jnc	.Lverify_cpu_no_longmode
 #endif
 
-#ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
+#if defined(CONFIG_MARCH_NATIVE_REP_MOVSB) || defined(CONFIG_MARCH_NATIVE_REP_STOSB)
 	xor	%eax, %eax
 	cpuid
 	cmp	$7, %eax
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index fa24cc717fb1..ed71e88cb859 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -59,7 +59,9 @@ endif
 else
         obj-y += iomap_copy_64.o
         lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
+ifneq ($(CONFIG_MARCH_NATIVE_REP_STOSB),y)
         lib-y += clear_page_64.o
+endif
 ifneq ($(CONFIG_MARCH_NATIVE_REP_MOVSB),y)
 	lib-y += copy_page_64.o
 endif
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 9bc861c71e75..7786d1a65423 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -8,6 +8,20 @@
 
 .weak memset
 
+#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+ENTRY(memset)
+ENTRY(__memset)
+	mov	%esi, %eax
+	mov	%rdi, %rsi
+	mov	%rdx, %rcx
+	rep stosb
+	mov	%rsi, %rax
+	ret
+ENDPROC(memset)
+ENDPROC(__memset)
+EXPORT_SYMBOL(memset)
+EXPORT_SYMBOL(__memset)
+#else
 /*
  * ISO C memset - set a memory block to a byte value. This function uses fast
  * string to get better performance than the original function. The code is
@@ -140,3 +154,4 @@ ENTRY(memset_orig)
 	jmp .Lafter_bad_alignment
 .Lfinal:
 ENDPROC(memset_orig)
+#endif
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index fff28c6f73a2..a90779b12d89 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -16,11 +16,23 @@
 
 unsigned long __clear_user(void __user *addr, unsigned long size)
 {
-	long __d0;
 	might_fault();
 	/* no memory constraint because it doesn't change any memory gcc knows
 	   about */
 	stac();
+
+#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+	asm volatile (
+		"0:     rep stosb\n"
+		"1:\n"
+		_ASM_EXTABLE(0b,1b)
+		: "+D" (addr), "+c" (size)
+		: "a" (0)
+		: "memory"
+	);
+#else
+	{
+	long __d0;
 	asm volatile(
 		"	testq  %[size8],%[size8]\n"
 		"	jz     4f\n"
@@ -42,6 +54,8 @@ unsigned long __clear_user(void __user *addr, unsigned long size)
 		_ASM_EXTABLE_UA(1b, 2b)
 		: [size8] "=&c"(size), [dst] "=&D" (__d0)
 		: [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr));
+	}
+#endif
 	clac();
 	return size;
 }
diff --git a/scripts/kconfig/cpuid.c b/scripts/kconfig/cpuid.c
index 2d78fba1dcc7..58d09bda61e5 100644
--- a/scripts/kconfig/cpuid.c
+++ b/scripts/kconfig/cpuid.c
@@ -45,6 +45,7 @@ static inline void cpuid2(uint32_t eax0, uint32_t ecx0, uint32_t *eax, uint32_t
 
 static bool popcnt	= false;
 static bool rep_movsb	= false;
+static bool rep_stosb	= false;
 
 static uint32_t eax0_max;
 
@@ -64,8 +65,10 @@ static void intel(void)
 		cpuid2(7, 0, &eax, &ecx, &edx, &ebx);
 //		printf("%08x %08x %08x %08x\n", eax, ecx, edx, ebx);
 
-		if (ebx & (1 << 9))
+		if (ebx & (1 << 9)) {
 			rep_movsb = true;
+			rep_stosb = true;
+		}
 	}
 }
 
@@ -88,6 +91,7 @@ int main(int argc, char *argv[])
 #define _(x)	if (streq(opt, #x)) return x ? EXIT_SUCCESS : EXIT_FAILURE
 	_(popcnt);
 	_(rep_movsb);
+	_(rep_stosb);
 #undef _
 
 	return EXIT_FAILURE;
diff --git a/scripts/march-native.sh b/scripts/march-native.sh
index 87f00cdb8e10..a41a15a64df4 100755
--- a/scripts/march-native.sh
+++ b/scripts/march-native.sh
@@ -43,6 +43,7 @@ echo "-march=native: $COLLECT_GCC_OPTIONS"
 
 "$CPUID" popcnt		&& option "CONFIG_MARCH_NATIVE_POPCNT"
 "$CPUID" rep_movsb	&& option "CONFIG_MARCH_NATIVE_REP_MOVSB"
+"$CPUID" rep_stosb	&& option "CONFIG_MARCH_NATIVE_REP_STOSB"
 
 for i in $COLLECT_GCC_OPTIONS; do
 	case $i in
-- 
2.21.0


