linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/5] x86_64: -march=native support
@ 2019-07-22 20:27 Alexey Dobriyan
  2019-07-22 20:27 ` [PATCH 2/5] x86_64, -march=native: POPCNT support Alexey Dobriyan
                   ` (3 more replies)
  0 siblings, 4 replies; 11+ messages in thread
From: Alexey Dobriyan @ 2019-07-22 20:27 UTC (permalink / raw)
  To: tglx, mingo, bp, hpa
  Cc: linux-kernel, x86, adobriyan, linux-kbuild, yamada.masahiro, michal.lkml

I'm tired of rebasing it, so...

"-march=native" has been available in userspace for a long time and is
trivial to enable in Gentoo:

	$ grep -e ^CFLAGS /etc/portage/make.conf
	CFLAGS="-march=native -O2 -pipe"

Patchset enables kernel compile with "-march=native" and does additional
optimizations based on CPU detection. Unfortunately most of the fun is in
SSE2/AVX2 instructions and kernel can't use those. But I have ideas for
at least BMI2.

This is intended to be an alternative to old school MCORE2 options.
Gentoo also ships a patch unrolling all those individual -march= options
into kernel config options. This patch should deprecate it.

See the link for more information:

	https://www.shlomifish.org/humour/by-others/funroll-loops/Gentoo-is-Rice.html

Patch adds:
* -mgeneral-regs-only
	with -march=native all those shiny AVX42-666 instructions
	may suddenly become available

* small compile time partial CPUID detection,
* detect L1 cache shift at compile time,
* show "-march=native" line in /proc/config.gz,
* bump Kconfig "shell" output buffer to accommodate the option,

* inject individual MARCH_NATIVE options at compile time,
	see other patches.

Currently only Intel and gcc are supported.

Intel, because I never had an AMD box.

Gcc, because clang emits detailed "march=native" information in
a different way, so I need to think how to extract it reliably.

Size benchmarks, my trimmed down kernel:

	add/remove: 1/11 grow/shrink: 1856/5598 up/down: 14452/-65830 (-51378)
	Function                                     old     new   delta
	sha_transform                               4302    4606    +304
				...
	udf_write_fi                                1907    1023    -884
	Total: Before=7814760, After=7763382, chg -0.66%

This is mostly due to memset() un-unrolling.

In general, say, crypto and hash code becomes bigger because all those
rotations and shifts become RORX and SHLX instructions and those are 5+
bytes. Older compilers may also emit "REP RET" on generic kernels
because AMD, but upon detecting Intel those REP prefixes may go.

Users are advised to enable it and do their own benchmarks to decide if
it is worth the hassle.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 Makefile                     |  4 ++
 arch/x86/Kconfig.cpu         | 20 +++++++++
 arch/x86/Makefile            |  1 +
 scripts/kconfig/.gitignore   |  1 +
 scripts/kconfig/Makefile     |  7 ++-
 scripts/kconfig/cpuid.c      | 85 ++++++++++++++++++++++++++++++++++++
 scripts/kconfig/preprocess.c |  2 +-
 scripts/march-native.sh      | 66 ++++++++++++++++++++++++++++
 8 files changed, 184 insertions(+), 2 deletions(-)
 create mode 100644 scripts/kconfig/cpuid.c
 create mode 100755 scripts/march-native.sh

diff --git a/Makefile b/Makefile
index 0623c6f88b5b..690f70afa74e 100644
--- a/Makefile
+++ b/Makefile
@@ -606,6 +606,10 @@ ifeq ($(dot-config),1)
 include include/config/auto.conf
 endif
 
+ifdef CONFIG_MARCH_NATIVE
+KBUILD_CFLAGS += -march=native
+endif
+
 ifeq ($(KBUILD_EXTMOD),)
 # Objects we will link into vmlinux / subdirs we need to visit
 init-y		:= init/
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 8e29c991ba3e..d71928d636be 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -287,8 +287,26 @@ config GENERIC_CPU
 	  Generic x86-64 CPU.
 	  Run equally well on all x86-64 CPUs.
 
+config MARCH_NATIVE
+	bool "-march=native"
+	depends on X86_64 && CC_IS_GCC
+	---help---
+	  Compile with -march=native.
+
+	  Optimise for the machine where compilation is done at. Resulting
+	  kernel and modules will not run reliably on a different machine
+	  unless exactly identical CPUs are used.
+
+	  Select only if you're self-compiling kernels and never share
+	  the binaries. If unsure, select "Generic x86_64".
+
 endchoice
 
+config MARCH_NATIVE_CC_FLAGS
+	string
+	depends on MARCH_NATIVE && CC_IS_GCC
+	default "$(shell,$(CC) -march=native -v -E -x c /dev/null 2>&1 | sed -ne '/^COLLECT_GCC_OPTIONS=/{n;p}' | awk '{$1=$1};1')"
+
 config X86_GENERIC
 	bool "Generic x86 support"
 	depends on X86_32
@@ -307,6 +325,7 @@ config X86_INTERNODE_CACHE_SHIFT
 	int
 	default "12" if X86_VSMP
 	default X86_L1_CACHE_SHIFT
+	depends on !MARCH_NATIVE
 
 config X86_L1_CACHE_SHIFT
 	int
@@ -314,6 +333,7 @@ config X86_L1_CACHE_SHIFT
 	default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
 	default "4" if MELAN || M486 || MGEODEGX1
 	default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
+	depends on !MARCH_NATIVE
 
 config X86_F00F_BUG
 	def_bool y
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 56e748a7679f..5d7355c88142 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -59,6 +59,7 @@ endif
 #
 KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
 KBUILD_CFLAGS += $(call cc-option,-mno-avx,)
+KBUILD_CFLAGS += $(call cc-option,-mgeneral-regs-only)
 
 ifeq ($(CONFIG_X86_32),y)
         BITS := 32
diff --git a/scripts/kconfig/.gitignore b/scripts/kconfig/.gitignore
index b5bf92f66d11..411a885ad9b1 100644
--- a/scripts/kconfig/.gitignore
+++ b/scripts/kconfig/.gitignore
@@ -8,6 +8,7 @@
 # configuration programs
 #
 conf
+cpuid
 mconf
 nconf
 qconf
diff --git a/scripts/kconfig/Makefile b/scripts/kconfig/Makefile
index 7656e1137b6b..cebd1711ba7a 100644
--- a/scripts/kconfig/Makefile
+++ b/scripts/kconfig/Makefile
@@ -69,8 +69,9 @@ simple-targets := oldconfig allnoconfig allyesconfig allmodconfig \
 	alldefconfig randconfig listnewconfig olddefconfig syncconfig
 PHONY += $(simple-targets)
 
-$(simple-targets): $(obj)/conf
+$(simple-targets): $(obj)/conf $(obj)/cpuid
 	$< $(silent) --$@ $(Kconfig)
+	$(Q)$(srctree)/scripts/march-native.sh $(CC) $(obj)/cpuid
 
 PHONY += savedefconfig defconfig
 
@@ -150,6 +151,10 @@ $(obj)/lexer.lex.o: $(obj)/parser.tab.h
 HOSTCFLAGS_lexer.lex.o	:= -I $(srctree)/$(src)
 HOSTCFLAGS_parser.tab.o	:= -I $(srctree)/$(src)
 
+# cpuid: -march=native, CONFIG_MARCH_NATIVE_* detection
+hostprogs-y	+= cpuid
+cpuid-objs	:= cpuid.o
+
 # conf: Used for defconfig, oldconfig and related targets
 hostprogs-y	+= conf
 conf-objs	:= conf.o $(common-objs)
diff --git a/scripts/kconfig/cpuid.c b/scripts/kconfig/cpuid.c
new file mode 100644
index 000000000000..81b292382e26
--- /dev/null
+++ b/scripts/kconfig/cpuid.c
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2017, 2019 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#ifdef __x86_64__
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+static inline bool streq(const char *s1, const char *s2)
+{
+	return strcmp(s1, s2) == 0;
+}
+
+static inline void cpuid(uint32_t eax0, uint32_t *eax, uint32_t *ecx, uint32_t *edx, uint32_t *ebx)
+{
+	asm volatile (
+		"cpuid"
+		: "=a" (*eax), "=c" (*ecx), "=d" (*edx), "=b" (*ebx)
+		: "a" (eax0)
+	);
+}
+
+static inline void cpuid2(uint32_t eax0, uint32_t ecx0, uint32_t *eax, uint32_t *ecx, uint32_t *edx, uint32_t *ebx)
+{
+	asm volatile (
+		"cpuid"
+		: "=a" (*eax), "=c" (*ecx), "=d" (*edx), "=b" (*ebx)
+		: "a" (eax0), "c" (ecx0)
+	);
+}
+
+static uint32_t eax0_max;
+
+static void intel(void)
+{
+	uint32_t eax, ecx, edx, ebx;
+
+	if (eax0_max >= 1) {
+		cpuid(1, &eax, &ecx, &edx, &ebx);
+//		printf("%08x %08x %08x %08x\n", eax, ecx, edx, ebx);
+	}
+}
+
+int main(int argc, char *argv[])
+{
+	const char *opt = argv[1];
+	uint32_t eax, ecx, edx, ebx;
+
+	if (argc != 2)
+		return EXIT_FAILURE;
+
+	cpuid(0, &eax, &ecx, &edx, &ebx);
+//	printf("%08x %08x %08x %08x\n", eax, ecx, edx, ebx);
+	eax0_max = eax;
+
+	if (ecx == 0x6c65746e && edx == 0x49656e69 && ebx == 0x756e6547) {
+		intel();
+	}
+
+#define _(x)	if (streq(opt, #x)) return x ? EXIT_SUCCESS : EXIT_FAILURE
+#undef _
+
+	return EXIT_FAILURE;
+}
+#else
+#include <stdlib.h>
+int main(void)
+{
+	return EXIT_FAILURE;
+}
+#endif
diff --git a/scripts/kconfig/preprocess.c b/scripts/kconfig/preprocess.c
index 0243086fb168..75a819d3abaa 100644
--- a/scripts/kconfig/preprocess.c
+++ b/scripts/kconfig/preprocess.c
@@ -141,7 +141,7 @@ static char *do_lineno(int argc, char *argv[])
 static char *do_shell(int argc, char *argv[])
 {
 	FILE *p;
-	char buf[256];
+	char buf[2048];
 	char *cmd;
 	size_t nread;
 	int i;
diff --git a/scripts/march-native.sh b/scripts/march-native.sh
new file mode 100755
index 000000000000..29a33c80b62b
--- /dev/null
+++ b/scripts/march-native.sh
@@ -0,0 +1,66 @@
+#!/bin/sh
+# Copyright (c) 2017-2019 Alexey Dobriyan <adobriyan@gmail.com>
+if test "$(uname -m)" != "x86_64"; then
+	exit 0
+fi
+
+CC="$1"
+CPUID="$2"
+CONFIG=".config"
+AUTOCONF1="include/config/auto.conf"
+AUTOCONF2="include/generated/autoconf.h"
+
+if ! grep -q -e '^CONFIG_MARCH_NATIVE=y$' "$CONFIG"; then
+	sed -i -e '/^CONFIG_MARCH_NATIVE/d' "$AUTOCONF1" "$AUTOCONF2" >/dev/null 2>&1
+	exit 0
+fi
+
+if ! "$CC" -march=native -x c -c -o /dev/null /dev/null >/dev/null 2>&1; then
+	echo >&2 "error: unsupported '-march=native' compiler option"
+	exit 1
+fi
+
+_option() {
+	echo "$1=$2"		>>"$AUTOCONF1"
+	echo "#define $1 $2"	>>"$AUTOCONF2"
+}
+
+option() {
+	echo "$1=y"		>>"$AUTOCONF1"
+	echo "#define $1 1"	>>"$AUTOCONF2"
+}
+
+if test ! -f "$CONFIG" -o ! -f "$AUTOCONF1" -o ! -f "$AUTOCONF2"; then
+	exit 0
+fi
+
+COLLECT_GCC_OPTIONS=$(
+	"$CC" -march=native -v -E -x c /dev/null 2>&1	|\
+	sed -ne '/^COLLECT_GCC_OPTIONS=/{n;p}'		|\
+	awk '{$1=$1};1'
+)
+echo "-march=native: $COLLECT_GCC_OPTIONS"
+
+for i in $COLLECT_GCC_OPTIONS; do
+	case $i in
+		*/cc1|-E|-quiet|-v|/dev/null|--param|-fstack-protector*)
+			;;
+
+		l1-cache-line-size=64)
+			_option "CONFIG_X86_L1_CACHE_SHIFT"		6
+			_option "CONFIG_X86_INTERNODE_CACHE_SHIFT"	6
+			;;
+
+		l1-cache-size=*);;
+		l2-cache-size=*);;
+
+		-march=*);;
+		-mtune=*);;
+
+		-m*);;
+		-mno-*);;
+
+		*)
+			echo >&2 "warning: unexpected -march=native option '$i'"
+	esac
+done
-- 
2.21.0


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 2/5] x86_64, -march=native: POPCNT support
  2019-07-22 20:27 [PATCH 1/5] x86_64: -march=native support Alexey Dobriyan
@ 2019-07-22 20:27 ` Alexey Dobriyan
  2019-07-22 21:12   ` Peter Zijlstra
  2019-07-22 20:27 ` [PATCH 3/5] x86_64, -march=native: REP MOVSB support Alexey Dobriyan
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 11+ messages in thread
From: Alexey Dobriyan @ 2019-07-22 20:27 UTC (permalink / raw)
  To: tglx, mingo, bp, hpa
  Cc: linux-kernel, x86, adobriyan, linux-kbuild, yamada.masahiro, michal.lkml

Detect POPCNT instruction support and inline hweight*() functions
if it is supported by CPU.

Detect POPCNT at boot time and conditionally refuse to boot.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 arch/x86/include/asm/arch_hweight.h           | 24 +++++++++++++++++++
 arch/x86/include/asm/segment.h                |  1 +
 arch/x86/kernel/verify_cpu.S                  |  8 +++++++
 arch/x86/lib/Makefile                         |  5 +++-
 .../drm/i915/display/intel_display_power.c    |  2 +-
 drivers/misc/sgi-gru/grumain.c                |  2 +-
 fs/btrfs/tree-checker.c                       |  4 ++--
 include/linux/bitops.h                        |  2 ++
 lib/Makefile                                  |  2 ++
 scripts/kconfig/cpuid.c                       |  7 ++++++
 scripts/march-native.sh                       |  2 ++
 11 files changed, 54 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h
index ba88edd0d58b..3797aa57baa5 100644
--- a/arch/x86/include/asm/arch_hweight.h
+++ b/arch/x86/include/asm/arch_hweight.h
@@ -2,6 +2,28 @@
 #ifndef _ASM_X86_HWEIGHT_H
 #define _ASM_X86_HWEIGHT_H
 
+#ifdef CONFIG_MARCH_NATIVE_POPCNT
+static inline unsigned int __arch_hweight64(uint64_t x)
+{
+	return __builtin_popcountll(x);
+}
+
+static inline unsigned int __arch_hweight32(uint32_t x)
+{
+	return __builtin_popcount(x);
+}
+
+static inline unsigned int __arch_hweight16(uint16_t x)
+{
+	return __builtin_popcount(x);
+}
+
+static inline unsigned int __arch_hweight8(uint8_t x)
+{
+	return __builtin_popcount(x);
+}
+#else
+
 #include <asm/cpufeatures.h>
 
 #ifdef CONFIG_64BIT
@@ -53,3 +75,5 @@ static __always_inline unsigned long __arch_hweight64(__u64 w)
 #endif /* CONFIG_X86_32 */
 
 #endif
+
+#endif
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index ac3892920419..d314c6b9b632 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -4,6 +4,7 @@
 
 #include <linux/const.h>
 #include <asm/alternative.h>
+#include <asm/cpufeatures.h>
 
 /*
  * Constructor for a conventional segment GDT (or LDT) entry.
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index a024c4f7ba56..a9be8904faa3 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -134,6 +134,14 @@ ENTRY(verify_cpu)
 	movl $1,%eax
 	ret
 .Lverify_cpu_sse_ok:
+
+#ifdef CONFIG_MARCH_NATIVE_POPCNT
+	mov	$1, %eax
+	cpuid
+	bt	$23, %ecx
+	jnc	.Lverify_cpu_no_longmode
+#endif
+
 	popf				# Restore caller passed flags
 	xorl %eax, %eax
 	ret
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 5246db42de45..7dc0e71b0ef3 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -40,7 +40,10 @@ lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
 lib-$(CONFIG_FUNCTION_ERROR_INJECTION)	+= error-inject.o
 lib-$(CONFIG_RETPOLINE) += retpoline.o
 
-obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
+obj-y += msr.o msr-reg.o msr-reg-export.o
+ifneq ($(CONFIG_MARCH_NATIVE_POPCNT),y)
+	obj-y += hweight.o
+endif
 obj-y += iomem.o
 
 ifeq ($(CONFIG_X86_32),y)
diff --git a/drivers/gpu/drm/i915/display/intel_display_power.c b/drivers/gpu/drm/i915/display/intel_display_power.c
index c93ad512014c..9066105f2fea 100644
--- a/drivers/gpu/drm/i915/display/intel_display_power.c
+++ b/drivers/gpu/drm/i915/display/intel_display_power.c
@@ -1570,7 +1570,7 @@ static void print_power_domains(struct i915_power_domains *power_domains,
 {
 	enum intel_display_power_domain domain;
 
-	DRM_DEBUG_DRIVER("%s (%lu):\n", prefix, hweight64(mask));
+	DRM_DEBUG_DRIVER("%s (%u):\n", prefix, hweight64(mask));
 	for_each_power_domain(domain, mask)
 		DRM_DEBUG_DRIVER("%s use_count %d\n",
 				 intel_display_power_domain_str(domain),
diff --git a/drivers/misc/sgi-gru/grumain.c b/drivers/misc/sgi-gru/grumain.c
index 40ac59dd018c..30cfeeb28e74 100644
--- a/drivers/misc/sgi-gru/grumain.c
+++ b/drivers/misc/sgi-gru/grumain.c
@@ -833,7 +833,7 @@ void gru_steal_context(struct gru_thread_state *gts)
 	}
 	gru_dbg(grudev,
 		"stole gid %d, ctxnum %d from gts %p. Need cb %d, ds %d;"
-		" avail cb %ld, ds %ld\n",
+		" avail cb %u, ds %u\n",
 		gru->gs_gid, ctxnum, ngts, cbr, dsr, hweight64(gru->gs_cbr_map),
 		hweight64(gru->gs_dsr_map));
 }
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index ccd5706199d7..2d33c6ae0e61 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -478,7 +478,7 @@ static int check_block_group_item(struct extent_buffer *leaf,
 	flags = btrfs_block_group_flags(&bgi);
 	if (hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) > 1) {
 		block_group_err(leaf, slot,
-"invalid profile flags, have 0x%llx (%lu bits set) expect no more than 1 bit set",
+"invalid profile flags, have 0x%llx (%u bits set) expect no more than 1 bit set",
 			flags & BTRFS_BLOCK_GROUP_PROFILE_MASK,
 			hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK));
 		return -EUCLEAN;
@@ -491,7 +491,7 @@ static int check_block_group_item(struct extent_buffer *leaf,
 	    type != (BTRFS_BLOCK_GROUP_METADATA |
 			   BTRFS_BLOCK_GROUP_DATA)) {
 		block_group_err(leaf, slot,
-"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx",
+"invalid type, have 0x%llx (%u bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx",
 			type, hweight64(type),
 			BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA,
 			BTRFS_BLOCK_GROUP_SYSTEM,
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index cf074bce3eb3..655b120bba66 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -7,10 +7,12 @@
 #define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE)
 #define BITS_TO_LONGS(nr)	DIV_ROUND_UP(nr, BITS_PER_TYPE(long))
 
+#ifndef CONFIG_MARCH_NATIVE_POPCNT
 extern unsigned int __sw_hweight8(unsigned int w);
 extern unsigned int __sw_hweight16(unsigned int w);
 extern unsigned int __sw_hweight32(unsigned int w);
 extern unsigned long __sw_hweight64(__u64 w);
+#endif
 
 /*
  * Include this here because some architectures need generic_ffs/fls in
diff --git a/lib/Makefile b/lib/Makefile
index 095601ce371d..32400f3a3328 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -114,7 +114,9 @@ obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o
 
 obj-y += logic_pio.o
 
+ifneq ($(CONFIG_MARCH_NATIVE_POPCNT),y)
 obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
+endif
 
 obj-$(CONFIG_BTREE) += btree.o
 obj-$(CONFIG_INTERVAL_TREE) += interval_tree.o
diff --git a/scripts/kconfig/cpuid.c b/scripts/kconfig/cpuid.c
index 81b292382e26..9efc0d9464d8 100644
--- a/scripts/kconfig/cpuid.c
+++ b/scripts/kconfig/cpuid.c
@@ -43,6 +43,8 @@ static inline void cpuid2(uint32_t eax0, uint32_t ecx0, uint32_t *eax, uint32_t
 	);
 }
 
+static bool popcnt	= false;
+
 static uint32_t eax0_max;
 
 static void intel(void)
@@ -52,6 +54,10 @@ static void intel(void)
 	if (eax0_max >= 1) {
 		cpuid(1, &eax, &ecx, &edx, &ebx);
 //		printf("%08x %08x %08x %08x\n", eax, ecx, edx, ebx);
+
+		if (ecx & (1 << 23)) {
+			popcnt = true;
+		}
 	}
 }
 
@@ -72,6 +78,7 @@ int main(int argc, char *argv[])
 	}
 
 #define _(x)	if (streq(opt, #x)) return x ? EXIT_SUCCESS : EXIT_FAILURE
+	_(popcnt);
 #undef _
 
 	return EXIT_FAILURE;
diff --git a/scripts/march-native.sh b/scripts/march-native.sh
index 29a33c80b62b..c3059f93ed2b 100755
--- a/scripts/march-native.sh
+++ b/scripts/march-native.sh
@@ -41,6 +41,8 @@ COLLECT_GCC_OPTIONS=$(
 )
 echo "-march=native: $COLLECT_GCC_OPTIONS"
 
+"$CPUID" popcnt		&& option "CONFIG_MARCH_NATIVE_POPCNT"
+
 for i in $COLLECT_GCC_OPTIONS; do
 	case $i in
 		*/cc1|-E|-quiet|-v|/dev/null|--param|-fstack-protector*)
-- 
2.21.0


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 3/5] x86_64, -march=native: REP MOVSB support
  2019-07-22 20:27 [PATCH 1/5] x86_64: -march=native support Alexey Dobriyan
  2019-07-22 20:27 ` [PATCH 2/5] x86_64, -march=native: POPCNT support Alexey Dobriyan
@ 2019-07-22 20:27 ` Alexey Dobriyan
  2019-07-22 20:27 ` [PATCH 4/5] x86_64, -march=native: REP STOSB support Alexey Dobriyan
  2019-07-22 20:27 ` [PATCH 5/5] x86_64, -march=native: MOVBE support Alexey Dobriyan
  3 siblings, 0 replies; 11+ messages in thread
From: Alexey Dobriyan @ 2019-07-22 20:27 UTC (permalink / raw)
  To: tglx, mingo, bp, hpa
  Cc: linux-kernel, x86, adobriyan, linux-kbuild, yamada.masahiro, michal.lkml

Detect fast REP MOVSB support and use it for page copying.

Inline copy_page(), this saves alternative entry and a function call
overhead which should hopefully improve code generation.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 Makefile                             |  3 +++
 arch/x86/include/asm/page_64.h       | 13 +++++++++++++
 arch/x86/kernel/relocate_kernel_64.S | 15 +++++++++++++++
 arch/x86/kernel/verify_cpu.S         | 12 ++++++++++++
 arch/x86/lib/Makefile                |  5 ++++-
 arch/x86/lib/memcpy_64.S             | 13 +++++++++++++
 arch/x86/platform/pvh/head.S         |  4 ++++
 scripts/kconfig/cpuid.c              |  9 +++++++++
 scripts/march-native.sh              |  1 +
 9 files changed, 74 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 690f70afa74e..aa194c96d27c 100644
--- a/Makefile
+++ b/Makefile
@@ -609,6 +609,9 @@ endif
 ifdef CONFIG_MARCH_NATIVE
 KBUILD_CFLAGS += -march=native
 endif
+ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
+KBUILD_CFLAGS += -mmemcpy-strategy=rep_byte:-1:align,rep_byte:-1:noalign
+endif
 
 ifeq ($(KBUILD_EXTMOD),)
 # Objects we will link into vmlinux / subdirs we need to visit
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 939b1cff4a7b..051da768273d 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -54,7 +54,20 @@ static inline void clear_page(void *page)
 			   : "cc", "memory", "rax", "rcx");
 }
 
+#ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
+static __always_inline void copy_page(void *to, void *from)
+{
+	uint32_t len = PAGE_SIZE;
+	asm volatile (
+		"rep movsb"
+		: "+D" (to), "+S" (from), "+c" (len)
+		:
+		: "memory"
+	);
+}
+#else
 void copy_page(void *to, void *from);
+#endif
 
 #endif	/* !__ASSEMBLY__ */
 
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index c51ccff5cd01..822f7a3d035a 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -266,18 +266,33 @@ swap_pages:
 	movq	%rsi, %rax
 
 	movq	%r10, %rdi
+#ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
+	mov	$4096, %ecx
+	rep movsb
+#else
 	movl	$512, %ecx
 	rep ; movsq
+#endif
 
 	movq	%rax, %rdi
 	movq	%rdx, %rsi
+#ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
+	mov	$4096, %ecx
+	rep movsb
+#else
 	movl	$512, %ecx
 	rep ; movsq
+#endif
 
 	movq	%rdx, %rdi
 	movq	%r10, %rsi
+#ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
+	mov	$4096, %ecx
+	rep movsb
+#else
 	movl	$512, %ecx
 	rep ; movsq
+#endif
 
 	lea	PAGE_SIZE(%rax), %rsi
 	jmp	0b
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index a9be8904faa3..57b41dafc592 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -142,6 +142,18 @@ ENTRY(verify_cpu)
 	jnc	.Lverify_cpu_no_longmode
 #endif
 
+#ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
+	xor	%eax, %eax
+	cpuid
+	cmp	$7, %eax
+	jb	.Lverify_cpu_no_longmode
+	mov	$7, %eax
+	xor	%ecx, %ecx
+	cpuid
+	bt	$9, %ebx
+	jnc	.Lverify_cpu_no_longmode
+#endif
+
 	popf				# Restore caller passed flags
 	xorl %eax, %eax
 	ret
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 7dc0e71b0ef3..fa24cc717fb1 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -59,7 +59,10 @@ endif
 else
         obj-y += iomap_copy_64.o
         lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
-        lib-y += clear_page_64.o copy_page_64.o
+        lib-y += clear_page_64.o
+ifneq ($(CONFIG_MARCH_NATIVE_REP_MOVSB),y)
+	lib-y += copy_page_64.o
+endif
         lib-y += memmove_64.o memset_64.o
         lib-y += copy_user_64.o
 	lib-y += cmpxchg16b_emu.o
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 92748660ba51..ab5b9662b348 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -17,6 +17,18 @@
 
 .weak memcpy
 
+#ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
+ENTRY(__memcpy)
+ENTRY(memcpy)
+	mov	%rdi, %rax
+	mov	%rdx, %rcx
+	rep movsb
+	ret
+ENDPROC(memcpy)
+ENDPROC(__memcpy)
+EXPORT_SYMBOL(memcpy)
+EXPORT_SYMBOL(__memcpy)
+#else
 /*
  * memcpy - Copy a memory block.
  *
@@ -183,6 +195,7 @@ ENTRY(memcpy_orig)
 .Lend:
 	retq
 ENDPROC(memcpy_orig)
+#endif
 
 #ifndef CONFIG_UML
 
diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S
index 1f8825bbaffb..2737f3e8c021 100644
--- a/arch/x86/platform/pvh/head.S
+++ b/arch/x86/platform/pvh/head.S
@@ -64,9 +64,13 @@ ENTRY(pvh_start_xen)
 	mov $_pa(pvh_start_info), %edi
 	mov %ebx, %esi
 	mov _pa(pvh_start_info_sz), %ecx
+#ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
+	rep movsb
+#else
 	shr $2,%ecx
 	rep
 	movsl
+#endif
 
 	mov $_pa(early_stack_end), %esp
 
diff --git a/scripts/kconfig/cpuid.c b/scripts/kconfig/cpuid.c
index 9efc0d9464d8..2d78fba1dcc7 100644
--- a/scripts/kconfig/cpuid.c
+++ b/scripts/kconfig/cpuid.c
@@ -44,6 +44,7 @@ static inline void cpuid2(uint32_t eax0, uint32_t ecx0, uint32_t *eax, uint32_t
 }
 
 static bool popcnt	= false;
+static bool rep_movsb	= false;
 
 static uint32_t eax0_max;
 
@@ -59,6 +60,13 @@ static void intel(void)
 			popcnt = true;
 		}
 	}
+	if (eax0_max >= 7) {
+		cpuid2(7, 0, &eax, &ecx, &edx, &ebx);
+//		printf("%08x %08x %08x %08x\n", eax, ecx, edx, ebx);
+
+		if (ebx & (1 << 9))
+			rep_movsb = true;
+	}
 }
 
 int main(int argc, char *argv[])
@@ -79,6 +87,7 @@ int main(int argc, char *argv[])
 
 #define _(x)	if (streq(opt, #x)) return x ? EXIT_SUCCESS : EXIT_FAILURE
 	_(popcnt);
+	_(rep_movsb);
 #undef _
 
 	return EXIT_FAILURE;
diff --git a/scripts/march-native.sh b/scripts/march-native.sh
index c3059f93ed2b..87f00cdb8e10 100755
--- a/scripts/march-native.sh
+++ b/scripts/march-native.sh
@@ -42,6 +42,7 @@ COLLECT_GCC_OPTIONS=$(
 echo "-march=native: $COLLECT_GCC_OPTIONS"
 
 "$CPUID" popcnt		&& option "CONFIG_MARCH_NATIVE_POPCNT"
+"$CPUID" rep_movsb	&& option "CONFIG_MARCH_NATIVE_REP_MOVSB"
 
 for i in $COLLECT_GCC_OPTIONS; do
 	case $i in
-- 
2.21.0


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 4/5] x86_64, -march=native: REP STOSB support
  2019-07-22 20:27 [PATCH 1/5] x86_64: -march=native support Alexey Dobriyan
  2019-07-22 20:27 ` [PATCH 2/5] x86_64, -march=native: POPCNT support Alexey Dobriyan
  2019-07-22 20:27 ` [PATCH 3/5] x86_64, -march=native: REP MOVSB support Alexey Dobriyan
@ 2019-07-22 20:27 ` Alexey Dobriyan
  2019-07-22 20:27 ` [PATCH 5/5] x86_64, -march=native: MOVBE support Alexey Dobriyan
  3 siblings, 0 replies; 11+ messages in thread
From: Alexey Dobriyan @ 2019-07-22 20:27 UTC (permalink / raw)
  To: tglx, mingo, bp, hpa
  Cc: linux-kernel, x86, adobriyan, linux-kbuild, yamada.masahiro, michal.lkml

Use REP STOSB everywhere if CPU advertises fast REP STOSB.

Gcc LOVES to unroll memset(), using -mmemset-strategy saves terabytes of
.text.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 Makefile                           |  3 +++
 arch/x86/boot/compressed/head_64.S |  4 ++++
 arch/x86/crypto/sha1_ssse3_asm.S   |  7 ++++++-
 arch/x86/include/asm/page_64.h     | 13 +++++++++++++
 arch/x86/kernel/verify_cpu.S       |  2 +-
 arch/x86/lib/Makefile              |  2 ++
 arch/x86/lib/memset_64.S           | 15 +++++++++++++++
 arch/x86/lib/usercopy_64.c         | 16 +++++++++++++++-
 scripts/kconfig/cpuid.c            |  6 +++++-
 scripts/march-native.sh            |  1 +
 10 files changed, 65 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index aa194c96d27c..31a6375d0e31 100644
--- a/Makefile
+++ b/Makefile
@@ -612,6 +612,9 @@ endif
 ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
 KBUILD_CFLAGS += -mmemcpy-strategy=rep_byte:-1:align,rep_byte:-1:noalign
 endif
+ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+KBUILD_CFLAGS += -mmemset-strategy=rep_byte:-1:align,rep_byte:-1:noalign
+endif
 
 ifeq ($(KBUILD_EXTMOD),)
 # Objects we will link into vmlinux / subdirs we need to visit
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 6233ae35d0d9..a350d265e8af 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -520,8 +520,12 @@ relocated:
 	leaq    _bss(%rip), %rdi
 	leaq    _ebss(%rip), %rcx
 	subq	%rdi, %rcx
+#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+	rep stosb
+#else
 	shrq	$3, %rcx
 	rep	stosq
+#endif
 
 /*
  * Do the extraction, and jump to the new kernel..
diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
index 99c5b8c4dc38..c98f8f2aead6 100644
--- a/arch/x86/crypto/sha1_ssse3_asm.S
+++ b/arch/x86/crypto/sha1_ssse3_asm.S
@@ -90,10 +90,15 @@
 	SHA1_PIPELINED_MAIN_BODY
 
 	# cleanup workspace
-	mov	$8, %ecx
 	mov	%rsp, %rdi
 	xor	%eax, %eax
+#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+	mov	$64, %ecx
+	rep stosb
+#else
+	mov	$8, %ecx
 	rep stosq
+#endif
 
 	mov	%rbp, %rsp		# deallocate workspace
 	pop	%rbp
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 051da768273d..7654d5544e0b 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -40,6 +40,18 @@ extern unsigned long __phys_addr_symbol(unsigned long);
 #define pfn_valid(pfn)          ((pfn) < max_pfn)
 #endif
 
+#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+static __always_inline void clear_page(void *page)
+{
+	uint32_t len = PAGE_SIZE;
+	asm volatile (
+		"rep stosb"
+		: "+D" (page), "+c" (len)
+		: "a" (0)
+		: "memory"
+	);
+}
+#else
 void clear_page_orig(void *page);
 void clear_page_rep(void *page);
 void clear_page_erms(void *page);
@@ -53,6 +65,7 @@ static inline void clear_page(void *page)
 			   "0" (page)
 			   : "cc", "memory", "rax", "rcx");
 }
+#endif
 
 #ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
 static __always_inline void copy_page(void *to, void *from)
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index 57b41dafc592..d3f3370e7dab 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -142,7 +142,7 @@ ENTRY(verify_cpu)
 	jnc	.Lverify_cpu_no_longmode
 #endif
 
-#ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
+#if defined(CONFIG_MARCH_NATIVE_REP_MOVSB) || defined(CONFIG_MARCH_NATIVE_REP_STOSB)
 	xor	%eax, %eax
 	cpuid
 	cmp	$7, %eax
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index fa24cc717fb1..ed71e88cb859 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -59,7 +59,9 @@ endif
 else
         obj-y += iomap_copy_64.o
         lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
+ifneq ($(CONFIG_MARCH_NATIVE_REP_STOSB),y)
         lib-y += clear_page_64.o
+endif
 ifneq ($(CONFIG_MARCH_NATIVE_REP_MOVSB),y)
 	lib-y += copy_page_64.o
 endif
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 9bc861c71e75..7786d1a65423 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -8,6 +8,20 @@
 
 .weak memset
 
+#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+ENTRY(memset)
+ENTRY(__memset)
+	mov	%esi, %eax
+	mov	%rdi, %rsi
+	mov	%rdx, %rcx
+	rep stosb
+	mov	%rsi, %rax
+	ret
+ENDPROC(memset)
+ENDPROC(__memset)
+EXPORT_SYMBOL(memset)
+EXPORT_SYMBOL(__memset)
+#else
 /*
  * ISO C memset - set a memory block to a byte value. This function uses fast
  * string to get better performance than the original function. The code is
@@ -140,3 +154,4 @@ ENTRY(memset_orig)
 	jmp .Lafter_bad_alignment
 .Lfinal:
 ENDPROC(memset_orig)
+#endif
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index fff28c6f73a2..a90779b12d89 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -16,11 +16,23 @@
 
 unsigned long __clear_user(void __user *addr, unsigned long size)
 {
-	long __d0;
 	might_fault();
 	/* no memory constraint because it doesn't change any memory gcc knows
 	   about */
 	stac();
+
+#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+	asm volatile (
+		"0:     rep stosb\n"
+		"1:\n"
+		_ASM_EXTABLE(0b,1b)
+		: "+D" (addr), "+c" (size)
+		: "a" (0)
+		: "memory"
+	);
+#else
+	{
+	long __d0;
 	asm volatile(
 		"	testq  %[size8],%[size8]\n"
 		"	jz     4f\n"
@@ -42,6 +54,8 @@ unsigned long __clear_user(void __user *addr, unsigned long size)
 		_ASM_EXTABLE_UA(1b, 2b)
 		: [size8] "=&c"(size), [dst] "=&D" (__d0)
 		: [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr));
+	}
+#endif
 	clac();
 	return size;
 }
diff --git a/scripts/kconfig/cpuid.c b/scripts/kconfig/cpuid.c
index 2d78fba1dcc7..58d09bda61e5 100644
--- a/scripts/kconfig/cpuid.c
+++ b/scripts/kconfig/cpuid.c
@@ -45,6 +45,7 @@ static inline void cpuid2(uint32_t eax0, uint32_t ecx0, uint32_t *eax, uint32_t
 
 static bool popcnt	= false;
 static bool rep_movsb	= false;
+static bool rep_stosb	= false;
 
 static uint32_t eax0_max;
 
@@ -64,8 +65,10 @@ static void intel(void)
 		cpuid2(7, 0, &eax, &ecx, &edx, &ebx);
 //		printf("%08x %08x %08x %08x\n", eax, ecx, edx, ebx);
 
-		if (ebx & (1 << 9))
+		if (ebx & (1 << 9)) {
 			rep_movsb = true;
+			rep_stosb = true;
+		}
 	}
 }
 
@@ -88,6 +91,7 @@ int main(int argc, char *argv[])
 #define _(x)	if (streq(opt, #x)) return x ? EXIT_SUCCESS : EXIT_FAILURE
 	_(popcnt);
 	_(rep_movsb);
+	_(rep_stosb);
 #undef _
 
 	return EXIT_FAILURE;
diff --git a/scripts/march-native.sh b/scripts/march-native.sh
index 87f00cdb8e10..a41a15a64df4 100755
--- a/scripts/march-native.sh
+++ b/scripts/march-native.sh
@@ -43,6 +43,7 @@ echo "-march=native: $COLLECT_GCC_OPTIONS"
 
 "$CPUID" popcnt		&& option "CONFIG_MARCH_NATIVE_POPCNT"
 "$CPUID" rep_movsb	&& option "CONFIG_MARCH_NATIVE_REP_MOVSB"
+"$CPUID" rep_stosb	&& option "CONFIG_MARCH_NATIVE_REP_STOSB"
 
 for i in $COLLECT_GCC_OPTIONS; do
 	case $i in
-- 
2.21.0


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 5/5] x86_64, -march=native: MOVBE support
  2019-07-22 20:27 [PATCH 1/5] x86_64: -march=native support Alexey Dobriyan
                   ` (2 preceding siblings ...)
  2019-07-22 20:27 ` [PATCH 4/5] x86_64, -march=native: REP STOSB support Alexey Dobriyan
@ 2019-07-22 20:27 ` Alexey Dobriyan
  3 siblings, 0 replies; 11+ messages in thread
From: Alexey Dobriyan @ 2019-07-22 20:27 UTC (permalink / raw)
  To: tglx, mingo, bp, hpa
  Cc: linux-kernel, x86, adobriyan, linux-kbuild, yamada.masahiro, michal.lkml

Use MOVBE if it is available.

Internally MOVBE probably translates to MOV+BSWAP anyway, but who knows.

Do it because it is easy to do...

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 arch/x86/crypto/des3_ede-asm_64.S | 28 ++++++++++++++++++++++++++++
 arch/x86/kernel/verify_cpu.S      |  7 +++++++
 scripts/kconfig/cpuid.c           |  5 +++++
 scripts/march-native.sh           |  1 +
 4 files changed, 41 insertions(+)

diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S
index 7fca43099a5f..2fd310e98b0b 100644
--- a/arch/x86/crypto/des3_ede-asm_64.S
+++ b/arch/x86/crypto/des3_ede-asm_64.S
@@ -150,6 +150,15 @@
 
 #define dummy2(a, b) /*_*/
 
+#ifdef CONFIG_MARCH_NATIVE_MOVBE
+#define read_block(io, left, right) \
+	movbe	 (io), left##d; \
+	movbe	4(io), right##d;
+
+#define write_block(io, left, right) \
+	movbe	left##d,   (io); \
+	movbe	right##d, 4(io);
+#else
 #define read_block(io, left, right) \
 	movl    (io), left##d; \
 	movl   4(io), right##d; \
@@ -161,6 +170,7 @@
 	bswapl right##d; \
 	movl   left##d,   (io); \
 	movl   right##d, 4(io);
+#endif
 
 ENTRY(des3_ede_x86_64_crypt_blk)
 	/* input:
@@ -434,6 +444,14 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
 	pushq %rsi /* dst */
 
 	/* load input */
+#ifdef CONFIG_MARCH_NATIVE_MOVBE
+	movbe 0 * 4(%rdx), RL0d;
+	movbe 1 * 4(%rdx), RR0d;
+	movbe 2 * 4(%rdx), RL1d;
+	movbe 3 * 4(%rdx), RR1d;
+	movbe 4 * 4(%rdx), RL2d;
+	movbe 5 * 4(%rdx), RR2d;
+#else
 	movl 0 * 4(%rdx), RL0d;
 	movl 1 * 4(%rdx), RR0d;
 	movl 2 * 4(%rdx), RL1d;
@@ -447,6 +465,7 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
 	bswapl RR1d;
 	bswapl RL2d;
 	bswapl RR2d;
+#endif
 
 	initial_permutation3(RL, RR);
 
@@ -507,6 +526,14 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
 
 	final_permutation3(RR, RL);
 
+#ifdef CONFIG_MARCH_NATIVE_MOVBE
+	movbe RR0d, 0 * 4(%rsi);
+	movbe RL0d, 1 * 4(%rsi);
+	movbe RR1d, 2 * 4(%rsi);
+	movbe RL1d, 3 * 4(%rsi);
+	movbe RR2d, 4 * 4(%rsi);
+	movbe RL2d, 5 * 4(%rsi);
+#else
 	bswapl RR0d;
 	bswapl RL0d;
 	bswapl RR1d;
@@ -521,6 +548,7 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
 	movl RL1d, 3 * 4(%rsi);
 	movl RR2d, 4 * 4(%rsi);
 	movl RL2d, 5 * 4(%rsi);
+#endif
 
 	popq %r15;
 	popq %r14;
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index d3f3370e7dab..f8ff130edfb3 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -142,6 +142,13 @@ ENTRY(verify_cpu)
 	jnc	.Lverify_cpu_no_longmode
 #endif
 
+#ifdef CONFIG_MARCH_NATIVE_MOVBE
+	mov	$1, %eax
+	cpuid
+	bt	$22, %ecx
+	jnc	.Lverify_cpu_no_longmode
+#endif
+
 #if defined(CONFIG_MARCH_NATIVE_REP_MOVSB) || defined(CONFIG_MARCH_NATIVE_REP_STOSB)
 	xor	%eax, %eax
 	cpuid
diff --git a/scripts/kconfig/cpuid.c b/scripts/kconfig/cpuid.c
index 58d09bda61e5..0da1142a59da 100644
--- a/scripts/kconfig/cpuid.c
+++ b/scripts/kconfig/cpuid.c
@@ -43,6 +43,7 @@ static inline void cpuid2(uint32_t eax0, uint32_t ecx0, uint32_t *eax, uint32_t
 	);
 }
 
+static bool movbe	= false;
 static bool popcnt	= false;
 static bool rep_movsb	= false;
 static bool rep_stosb	= false;
@@ -57,6 +58,9 @@ static void intel(void)
 		cpuid(1, &eax, &ecx, &edx, &ebx);
 //		printf("%08x %08x %08x %08x\n", eax, ecx, edx, ebx);
 
+		if (ecx & (1 << 22)) {
+			movbe = true;
+		}
 		if (ecx & (1 << 23)) {
 			popcnt = true;
 		}
@@ -89,6 +93,7 @@ int main(int argc, char *argv[])
 	}
 
 #define _(x)	if (streq(opt, #x)) return x ? EXIT_SUCCESS : EXIT_FAILURE
+	_(movbe);
 	_(popcnt);
 	_(rep_movsb);
 	_(rep_stosb);
diff --git a/scripts/march-native.sh b/scripts/march-native.sh
index a41a15a64df4..530bac22fa07 100755
--- a/scripts/march-native.sh
+++ b/scripts/march-native.sh
@@ -41,6 +41,7 @@ COLLECT_GCC_OPTIONS=$(
 )
 echo "-march=native: $COLLECT_GCC_OPTIONS"
 
+"$CPUID" movbe		&& option "CONFIG_MARCH_NATIVE_MOVBE"
 "$CPUID" popcnt		&& option "CONFIG_MARCH_NATIVE_POPCNT"
 "$CPUID" rep_movsb	&& option "CONFIG_MARCH_NATIVE_REP_MOVSB"
 "$CPUID" rep_stosb	&& option "CONFIG_MARCH_NATIVE_REP_STOSB"
-- 
2.21.0


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 2/5] x86_64, -march=native: POPCNT support
  2019-07-22 20:27 ` [PATCH 2/5] x86_64, -march=native: POPCNT support Alexey Dobriyan
@ 2019-07-22 21:12   ` Peter Zijlstra
  2019-07-22 21:15     ` Alexey Dobriyan
  0 siblings, 1 reply; 11+ messages in thread
From: Peter Zijlstra @ 2019-07-22 21:12 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: tglx, mingo, bp, hpa, linux-kernel, x86, linux-kbuild,
	yamada.masahiro, michal.lkml

On Mon, Jul 22, 2019 at 11:27:20PM +0300, Alexey Dobriyan wrote:
> Detect POPCNT instruction support and inline hweight*() functions
> if it is supported by CPU.
> 
> Detect POPCNT at boot time and conditionally refuse to boot.
> 
> Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
> ---
>  arch/x86/include/asm/arch_hweight.h           | 24 +++++++++++++++++++
>  arch/x86/include/asm/segment.h                |  1 +
>  arch/x86/kernel/verify_cpu.S                  |  8 +++++++
>  arch/x86/lib/Makefile                         |  5 +++-
>  .../drm/i915/display/intel_display_power.c    |  2 +-
>  drivers/misc/sgi-gru/grumain.c                |  2 +-
>  fs/btrfs/tree-checker.c                       |  4 ++--
>  include/linux/bitops.h                        |  2 ++
>  lib/Makefile                                  |  2 ++
>  scripts/kconfig/cpuid.c                       |  7 ++++++
>  scripts/march-native.sh                       |  2 ++
>  11 files changed, 54 insertions(+), 5 deletions(-)

*WHY* ?

AFAICT this just adds lines and complexity and wins absolutely nothing.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 2/5] x86_64, -march=native: POPCNT support
  2019-07-22 21:12   ` Peter Zijlstra
@ 2019-07-22 21:15     ` Alexey Dobriyan
  2019-07-22 21:27       ` Alexey Dobriyan
  2019-07-23  7:20       ` Peter Zijlstra
  0 siblings, 2 replies; 11+ messages in thread
From: Alexey Dobriyan @ 2019-07-22 21:15 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, bp, hpa, linux-kernel, x86, linux-kbuild,
	yamada.masahiro, michal.lkml

On Mon, Jul 22, 2019 at 11:12:10PM +0200, Peter Zijlstra wrote:
> On Mon, Jul 22, 2019 at 11:27:20PM +0300, Alexey Dobriyan wrote:
> > Detect POPCNT instruction support and inline hweight*() functions
> > if it is supported by CPU.
> > 
> > Detect POPCNT at boot time and conditionally refuse to boot.
> > 
> > Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
> > ---
> >  arch/x86/include/asm/arch_hweight.h           | 24 +++++++++++++++++++
> >  arch/x86/include/asm/segment.h                |  1 +
> >  arch/x86/kernel/verify_cpu.S                  |  8 +++++++
> >  arch/x86/lib/Makefile                         |  5 +++-
> >  .../drm/i915/display/intel_display_power.c    |  2 +-
> >  drivers/misc/sgi-gru/grumain.c                |  2 +-
> >  fs/btrfs/tree-checker.c                       |  4 ++--
> >  include/linux/bitops.h                        |  2 ++
> >  lib/Makefile                                  |  2 ++
> >  scripts/kconfig/cpuid.c                       |  7 ++++++
> >  scripts/march-native.sh                       |  2 ++
> >  11 files changed, 54 insertions(+), 5 deletions(-)
> 
> *WHY* ?
> 
> AFAICT this just adds lines and complexity and wins absolutely nothing.

If CPU is known to have POPCNT, it doesn't make sense to go through RDI.
Additionally some CPUs (still?) have fake dependency on the destination,
so "popcnt rax, rdi" is suboptimal.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 2/5] x86_64, -march=native: POPCNT support
  2019-07-22 21:15     ` Alexey Dobriyan
@ 2019-07-22 21:27       ` Alexey Dobriyan
  2019-07-23  7:20       ` Peter Zijlstra
  1 sibling, 0 replies; 11+ messages in thread
From: Alexey Dobriyan @ 2019-07-22 21:27 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, bp, hpa, linux-kernel, x86, linux-kbuild,
	yamada.masahiro, michal.lkml

On Tue, Jul 23, 2019 at 12:15:39AM +0300, Alexey Dobriyan wrote:
> On Mon, Jul 22, 2019 at 11:12:10PM +0200, Peter Zijlstra wrote:
> > On Mon, Jul 22, 2019 at 11:27:20PM +0300, Alexey Dobriyan wrote:
> > > Detect POPCNT instruction support and inline hweight*() functions
> > > if it is supported by CPU.
> > > 
> > > Detect POPCNT at boot time and conditionally refuse to boot.
> > > 
> > > Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
> > > ---
> > >  arch/x86/include/asm/arch_hweight.h           | 24 +++++++++++++++++++
> > >  arch/x86/include/asm/segment.h                |  1 +
> > >  arch/x86/kernel/verify_cpu.S                  |  8 +++++++
> > >  arch/x86/lib/Makefile                         |  5 +++-
> > >  .../drm/i915/display/intel_display_power.c    |  2 +-
> > >  drivers/misc/sgi-gru/grumain.c                |  2 +-
> > >  fs/btrfs/tree-checker.c                       |  4 ++--
> > >  include/linux/bitops.h                        |  2 ++
> > >  lib/Makefile                                  |  2 ++
> > >  scripts/kconfig/cpuid.c                       |  7 ++++++
> > >  scripts/march-native.sh                       |  2 ++
> > >  11 files changed, 54 insertions(+), 5 deletions(-)
> > 
> > *WHY* ?
> > 
> > AFAICT this just adds lines and complexity and wins absolutely nothing.
> 
> If CPU is known to have POPCNT, it doesn't make sense to go through RDI.
> Additionally some CPUs (still?) have fake dependency on the destination,
> so "popcnt rax, rdi" is suboptimal.

More general argument is that if -march=native is accepted, compiler will
generate new instructions which will throw #UD on CPUs which aren't
capable, so it doesn't make sense to _not_ go deeper and use all the
knowledge about current CPU.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 2/5] x86_64, -march=native: POPCNT support
  2019-07-22 21:15     ` Alexey Dobriyan
  2019-07-22 21:27       ` Alexey Dobriyan
@ 2019-07-23  7:20       ` Peter Zijlstra
  2019-07-23 20:04         ` Alexey Dobriyan
  1 sibling, 1 reply; 11+ messages in thread
From: Peter Zijlstra @ 2019-07-23  7:20 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: tglx, mingo, bp, hpa, linux-kernel, x86, linux-kbuild,
	yamada.masahiro, michal.lkml

On Tue, Jul 23, 2019 at 12:15:39AM +0300, Alexey Dobriyan wrote:
> On Mon, Jul 22, 2019 at 11:12:10PM +0200, Peter Zijlstra wrote:
> > On Mon, Jul 22, 2019 at 11:27:20PM +0300, Alexey Dobriyan wrote:
> > > Detect POPCNT instruction support and inline hweight*() functions
> > > if it is supported by CPU.
> > > 
> > > Detect POPCNT at boot time and conditionally refuse to boot.
> > > 
> > > Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
> > > ---
> > >  arch/x86/include/asm/arch_hweight.h           | 24 +++++++++++++++++++
> > >  arch/x86/include/asm/segment.h                |  1 +
> > >  arch/x86/kernel/verify_cpu.S                  |  8 +++++++
> > >  arch/x86/lib/Makefile                         |  5 +++-
> > >  .../drm/i915/display/intel_display_power.c    |  2 +-
> > >  drivers/misc/sgi-gru/grumain.c                |  2 +-
> > >  fs/btrfs/tree-checker.c                       |  4 ++--
> > >  include/linux/bitops.h                        |  2 ++
> > >  lib/Makefile                                  |  2 ++
> > >  scripts/kconfig/cpuid.c                       |  7 ++++++
> > >  scripts/march-native.sh                       |  2 ++
> > >  11 files changed, 54 insertions(+), 5 deletions(-)
> > 
> > *WHY* ?
> > 
> > AFAICT this just adds lines and complexity and wins absolutely nothing.
> 
> If CPU is known to have POPCNT, it doesn't make sense to go through RDI.
> Additionally some CPUs (still?) have fake dependency on the destination,
> so "popcnt rax, rdi" is suboptimal.

You completely forgot to mention any of that in your Changelog, also I
doubt you can find code where this makes a measurable difference. IOW, I
still doubt it makes any kind of sense.


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 2/5] x86_64, -march=native: POPCNT support
  2019-07-23  7:20       ` Peter Zijlstra
@ 2019-07-23 20:04         ` Alexey Dobriyan
  0 siblings, 0 replies; 11+ messages in thread
From: Alexey Dobriyan @ 2019-07-23 20:04 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, mingo, bp, hpa, linux-kernel, x86, linux-kbuild,
	yamada.masahiro, michal.lkml

On Tue, Jul 23, 2019 at 09:20:43AM +0200, Peter Zijlstra wrote:
> On Tue, Jul 23, 2019 at 12:15:39AM +0300, Alexey Dobriyan wrote:
> > On Mon, Jul 22, 2019 at 11:12:10PM +0200, Peter Zijlstra wrote:
> > > On Mon, Jul 22, 2019 at 11:27:20PM +0300, Alexey Dobriyan wrote:
> > > > Detect POPCNT instruction support and inline hweight*() functions
> > > > if it is supported by CPU.
> > > > 
> > > > Detect POPCNT at boot time and conditionally refuse to boot.
> > > > 
> > > > Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
> > > > ---
> > > >  arch/x86/include/asm/arch_hweight.h           | 24 +++++++++++++++++++
> > > >  arch/x86/include/asm/segment.h                |  1 +
> > > >  arch/x86/kernel/verify_cpu.S                  |  8 +++++++
> > > >  arch/x86/lib/Makefile                         |  5 +++-
> > > >  .../drm/i915/display/intel_display_power.c    |  2 +-
> > > >  drivers/misc/sgi-gru/grumain.c                |  2 +-
> > > >  fs/btrfs/tree-checker.c                       |  4 ++--
> > > >  include/linux/bitops.h                        |  2 ++
> > > >  lib/Makefile                                  |  2 ++
> > > >  scripts/kconfig/cpuid.c                       |  7 ++++++
> > > >  scripts/march-native.sh                       |  2 ++
> > > >  11 files changed, 54 insertions(+), 5 deletions(-)
> > > 
> > > *WHY* ?
> > > 
> > > AFAICT this just adds lines and complexity and wins absolutely nothing.
> > 
> > If CPU is known to have POPCNT, it doesn't make sense to go through RDI.
> > Additionally some CPUs (still?) have fake dependency on the destination,
> > so "popcnt rax, rdi" is suboptimal.
> 
> You completely forgot to mention any of that in your Changelog, also I
> doubt you can find code where this makes a measurable difference. IOW, I
> still doubt it makes any kind of sense.

It saves some space, although not much. gcc likes to use 64-bit version
even where 32-bit version should suffice.

Regardless I found some problems with POPCNT patch, so hold off the
series.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 4/5] x86_64, -march=native: REP STOSB support
  2019-07-04 20:47 [PATCH 1/5] x86_64: -march=native support Alexey Dobriyan
@ 2019-07-04 20:47 ` Alexey Dobriyan
  0 siblings, 0 replies; 11+ messages in thread
From: Alexey Dobriyan @ 2019-07-04 20:47 UTC (permalink / raw)
  To: tglx, mingo, bp, hpa; +Cc: linux-kernel, x86, adobriyan

Use REP STOSB everywhere if CPU advertises fast REP STOSB.

Gcc LOVES to unroll memset(), using -mmemset-strategy saves terabytes of
.text.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 Makefile                           |  3 +++
 arch/x86/boot/compressed/head_64.S |  4 ++++
 arch/x86/crypto/sha1_ssse3_asm.S   |  7 ++++++-
 arch/x86/include/asm/page_64.h     | 13 +++++++++++++
 arch/x86/kernel/verify_cpu.S       |  2 +-
 arch/x86/lib/Makefile              |  2 ++
 arch/x86/lib/memset_64.S           | 15 +++++++++++++++
 arch/x86/lib/usercopy_64.c         | 16 +++++++++++++++-
 scripts/kconfig/cpuid.c            |  6 +++++-
 scripts/march-native.sh            |  1 +
 10 files changed, 65 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index 4422dcf1254b..8b7e6769886e 100644
--- a/Makefile
+++ b/Makefile
@@ -609,6 +609,9 @@ endif
 ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
 KBUILD_CFLAGS += -mmemcpy-strategy=rep_byte:-1:align,rep_byte:-1:noalign
 endif
+ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+KBUILD_CFLAGS += -mmemset-strategy=rep_byte:-1:align,rep_byte:-1:noalign
+endif
 
 ifeq ($(KBUILD_EXTMOD),)
 # Objects we will link into vmlinux / subdirs we need to visit
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index fafb75c6c592..72ed646301b0 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -520,8 +520,12 @@ relocated:
 	leaq    _bss(%rip), %rdi
 	leaq    _ebss(%rip), %rcx
 	subq	%rdi, %rcx
+#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+	rep stosb
+#else
 	shrq	$3, %rcx
 	rep	stosq
+#endif
 
 /*
  * Do the extraction, and jump to the new kernel..
diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
index 99c5b8c4dc38..c98f8f2aead6 100644
--- a/arch/x86/crypto/sha1_ssse3_asm.S
+++ b/arch/x86/crypto/sha1_ssse3_asm.S
@@ -90,10 +90,15 @@
 	SHA1_PIPELINED_MAIN_BODY
 
 	# cleanup workspace
-	mov	$8, %ecx
 	mov	%rsp, %rdi
 	xor	%eax, %eax
+#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+	mov	$64, %ecx
+	rep stosb
+#else
+	mov	$8, %ecx
 	rep stosq
+#endif
 
 	mov	%rbp, %rsp		# deallocate workspace
 	pop	%rbp
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 051da768273d..7654d5544e0b 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -40,6 +40,18 @@ extern unsigned long __phys_addr_symbol(unsigned long);
 #define pfn_valid(pfn)          ((pfn) < max_pfn)
 #endif
 
+#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+static __always_inline void clear_page(void *page)
+{
+	uint32_t len = PAGE_SIZE;
+	asm volatile (
+		"rep stosb"
+		: "+D" (page), "+c" (len)
+		: "a" (0)
+		: "memory"
+	);
+}
+#else
 void clear_page_orig(void *page);
 void clear_page_rep(void *page);
 void clear_page_erms(void *page);
@@ -53,6 +65,7 @@ static inline void clear_page(void *page)
 			   "0" (page)
 			   : "cc", "memory", "rax", "rcx");
 }
+#endif
 
 #ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
 static __always_inline void copy_page(void *to, void *from)
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index 57b41dafc592..d3f3370e7dab 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -142,7 +142,7 @@ ENTRY(verify_cpu)
 	jnc	.Lverify_cpu_no_longmode
 #endif
 
-#ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
+#if defined(CONFIG_MARCH_NATIVE_REP_MOVSB) || defined(CONFIG_MARCH_NATIVE_REP_STOSB)
 	xor	%eax, %eax
 	cpuid
 	cmp	$7, %eax
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index fa24cc717fb1..ed71e88cb859 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -59,7 +59,9 @@ endif
 else
         obj-y += iomap_copy_64.o
         lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
+ifneq ($(CONFIG_MARCH_NATIVE_REP_STOSB),y)
         lib-y += clear_page_64.o
+endif
 ifneq ($(CONFIG_MARCH_NATIVE_REP_MOVSB),y)
 	lib-y += copy_page_64.o
 endif
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 9bc861c71e75..7786d1a65423 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -8,6 +8,20 @@
 
 .weak memset
 
+#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+ENTRY(memset)
+ENTRY(__memset)
+	mov	%esi, %eax
+	mov	%rdi, %rsi
+	mov	%rdx, %rcx
+	rep stosb
+	mov	%rsi, %rax
+	ret
+ENDPROC(memset)
+ENDPROC(__memset)
+EXPORT_SYMBOL(memset)
+EXPORT_SYMBOL(__memset)
+#else
 /*
  * ISO C memset - set a memory block to a byte value. This function uses fast
  * string to get better performance than the original function. The code is
@@ -140,3 +154,4 @@ ENTRY(memset_orig)
 	jmp .Lafter_bad_alignment
 .Lfinal:
 ENDPROC(memset_orig)
+#endif
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index e0e006f1624e..9e9d957bd8b3 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -16,11 +16,23 @@
 
 unsigned long __clear_user(void __user *addr, unsigned long size)
 {
-	long __d0;
 	might_fault();
 	/* no memory constraint because it doesn't change any memory gcc knows
 	   about */
 	stac();
+
+#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+	asm volatile (
+		"0:     rep stosb\n"
+		"1:\n"
+		_ASM_EXTABLE(0b,1b)
+		: "+D" (addr), "+c" (size)
+		: "a" (0)
+		: "memory"
+	);
+#else
+	{
+	long __d0;
 	asm volatile(
 		"	testq  %[size8],%[size8]\n"
 		"	jz     4f\n"
@@ -42,6 +54,8 @@ unsigned long __clear_user(void __user *addr, unsigned long size)
 		_ASM_EXTABLE_UA(1b, 2b)
 		: [size8] "=&c"(size), [dst] "=&D" (__d0)
 		: [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr));
+	}
+#endif
 	clac();
 	return size;
 }
diff --git a/scripts/kconfig/cpuid.c b/scripts/kconfig/cpuid.c
index 2d78fba1dcc7..58d09bda61e5 100644
--- a/scripts/kconfig/cpuid.c
+++ b/scripts/kconfig/cpuid.c
@@ -45,6 +45,7 @@ static inline void cpuid2(uint32_t eax0, uint32_t ecx0, uint32_t *eax, uint32_t
 
 static bool popcnt	= false;
 static bool rep_movsb	= false;
+static bool rep_stosb	= false;
 
 static uint32_t eax0_max;
 
@@ -64,8 +65,10 @@ static void intel(void)
 		cpuid2(7, 0, &eax, &ecx, &edx, &ebx);
 //		printf("%08x %08x %08x %08x\n", eax, ecx, edx, ebx);
 
-		if (ebx & (1 << 9))
+		if (ebx & (1 << 9)) {
 			rep_movsb = true;
+			rep_stosb = true;
+		}
 	}
 }
 
@@ -88,6 +91,7 @@ int main(int argc, char *argv[])
 #define _(x)	if (streq(opt, #x)) return x ? EXIT_SUCCESS : EXIT_FAILURE
 	_(popcnt);
 	_(rep_movsb);
+	_(rep_stosb);
 #undef _
 
 	return EXIT_FAILURE;
diff --git a/scripts/march-native.sh b/scripts/march-native.sh
index 87f00cdb8e10..a41a15a64df4 100755
--- a/scripts/march-native.sh
+++ b/scripts/march-native.sh
@@ -43,6 +43,7 @@ echo "-march=native: $COLLECT_GCC_OPTIONS"
 
 "$CPUID" popcnt		&& option "CONFIG_MARCH_NATIVE_POPCNT"
 "$CPUID" rep_movsb	&& option "CONFIG_MARCH_NATIVE_REP_MOVSB"
+"$CPUID" rep_stosb	&& option "CONFIG_MARCH_NATIVE_REP_STOSB"
 
 for i in $COLLECT_GCC_OPTIONS; do
 	case $i in
-- 
2.21.0


^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2019-07-23 20:04 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-07-22 20:27 [PATCH 1/5] x86_64: -march=native support Alexey Dobriyan
2019-07-22 20:27 ` [PATCH 2/5] x86_64, -march=native: POPCNT support Alexey Dobriyan
2019-07-22 21:12   ` Peter Zijlstra
2019-07-22 21:15     ` Alexey Dobriyan
2019-07-22 21:27       ` Alexey Dobriyan
2019-07-23  7:20       ` Peter Zijlstra
2019-07-23 20:04         ` Alexey Dobriyan
2019-07-22 20:27 ` [PATCH 3/5] x86_64, -march=native: REP MOVSB support Alexey Dobriyan
2019-07-22 20:27 ` [PATCH 4/5] x86_64, -march=native: REP STOSB support Alexey Dobriyan
2019-07-22 20:27 ` [PATCH 5/5] x86_64, -march=native: MOVBE support Alexey Dobriyan
  -- strict thread matches above, loose matches on Subject: below --
2019-07-04 20:47 [PATCH 1/5] x86_64: -march=native support Alexey Dobriyan
2019-07-04 20:47 ` [PATCH 4/5] x86_64, -march=native: REP STOSB support Alexey Dobriyan

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).