[2/5] -march=native: POPCNT support
diff mbox series

Message ID 20171207224154.4687-2-adobriyan@gmail.com
State New, archived
Headers show
Series
  • [v0,1/5] x86_64: march=native support
Related show

Commit Message

Alexey Dobriyan Dec. 7, 2017, 10:41 p.m. UTC
The mainline kernel can only generate the "popcnt rax, rdi" instruction,
via an alternative masquerading as a function call. This patch allows
generating all POPCNT variations and inlines the hweight*() family of functions.

	$ objdump  -dr ../obj/vmlinux | grep popcnt
	ffffffff81004f6d:       f3 48 0f b8 c9          popcnt rcx,rcx
	ffffffff81008484:       f3 48 0f b8 03          popcnt rax,QWORD PTR [rbx]
	ffffffff81073aae:       f3 48 0f b8 d8          popcnt rbx,rax
		...

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 arch/x86/include/asm/arch_hweight.h | 32 ++++++++++++++++++++++++++++++--
 arch/x86/lib/Makefile               |  5 ++++-
 include/linux/bitops.h              |  2 ++
 lib/Makefile                        |  2 ++
 scripts/kconfig/cpuid.c             |  6 ++++++
 scripts/march-native.sh             |  6 +++++-
 6 files changed, 49 insertions(+), 4 deletions(-)

Comments

H. Peter Anvin Dec. 7, 2017, 11:07 p.m. UTC | #1
On 12/07/17 14:41, Alexey Dobriyan wrote:
> Mainline kernel can only generate "popcnt rax, rdi" instruction
> with alternative masquerading as function call. Patch allows
> to generate all POPCNT variations and inlines hweight*() family of functions.
> 
> 	$ objdump  -dr ../obj/vmlinux | grep popcnt
> 	ffffffff81004f6d:       f3 48 0f b8 c9          popcnt rcx,rcx
> 	ffffffff81008484:       f3 48 0f b8 03          popcnt rax,QWORD PTR [rbx]
> 	ffffffff81073aae:       f3 48 0f b8 d8          popcnt rbx,rax
> 		...
> 
> Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
> ---
>  arch/x86/include/asm/arch_hweight.h | 32 ++++++++++++++++++++++++++++++--
>  arch/x86/lib/Makefile               |  5 ++++-
>  include/linux/bitops.h              |  2 ++
>  lib/Makefile                        |  2 ++
>  scripts/kconfig/cpuid.c             |  6 ++++++
>  scripts/march-native.sh             |  6 +++++-
>  6 files changed, 49 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h
> index 34a10b2d5b73..58e4f65d8665 100644
> --- a/arch/x86/include/asm/arch_hweight.h
> +++ b/arch/x86/include/asm/arch_hweight.h
> @@ -2,6 +2,34 @@
>  #ifndef _ASM_X86_HWEIGHT_H
>  #define _ASM_X86_HWEIGHT_H
>  
> +#define __HAVE_ARCH_SW_HWEIGHT
> +
> +#ifdef CONFIG_MARCH_NATIVE_POPCNT
> +static inline unsigned int __arch_hweight64(uint64_t x)
> +{
> +	uint64_t rv;
> +	asm ("popcnt %1, %0" : "=r" (rv) : "rm" (x));
> +	return rv;
> +}
> +
> +static inline unsigned int __arch_hweight32(uint32_t x)
> +{
> +	uint32_t rv;
> +	asm ("popcnt %1, %0" : "=r" (rv) : "rm" (x));
> +	return rv;
> +}
> +
> +static inline unsigned int __arch_hweight16(uint16_t x)
> +{
> +	return __arch_hweight32(x);
> +}
> +
> +static inline unsigned int __arch_hweight8(uint8_t x)
> +{
> +	return __arch_hweight32(x);
> +}


-march=native really would be better implemented by examining the macros
generated by gcc which correspond to the selected -m options
(-march=native really just selects a combination of -m options.)  It
seems bizarre to just reimplement the mechanism that already exists.

Now, this specific case would be better done with alternatives; we can
patch in a JMP to an out-of-line stub to mangle the arguments.  Then you
get the benefit on all systems and don't need to decide at compile time.

The reason to use -m options for this would be to be able to use the
__builtin_popcount() and __builtin_popcountll() intrinsics, which would
allow gcc to schedule it and optimize arbitrarily.

So, much more something like:

#ifdef __POPCNT__

static inline unsigned int __arch_hweight64(uint64_t x)
{
	return __builtin_popcountll(x);
}

static inline unsigned int __arch_hweight32(uint32_t x)
{
	return __builtin_popcount(x);
}

#else

/* Assembly code with alternatives */

	/* Enabled alternative */
	popcnt %1, %0

	/* Non-enabled alternative */
	jmp 1f
2:
	.pushsection .altinstr_aux
1:
	pushq %q1		/* pushl %k1 for i386 */
	call ___arch_hweight%z1
	popq %q0		/* popl %k0 for i386 */
	jmp 2b
	.popsection

#endif


The ___arch_hweight[bwlq] functions would have to be written in assembly
with all registers preserved.  The input and output is a common word on
the stack -- 8(%rsp) or 4(%esp) for x86-64 v. i386.

	-hpa
Alexey Dobriyan Dec. 8, 2017, 10:09 a.m. UTC | #2
On 12/8/17, H. Peter Anvin <hpa@zytor.com> wrote:
> On 12/07/17 14:41, Alexey Dobriyan wrote:
>> Mainline kernel can only generate "popcnt rax, rdi" instruction
>> with alternative masquerading as function call. Patch allows
>> to generate all POPCNT variations and inlines hweight*() family of
>> functions.

> -march=native really would be better implemented by examining the macros
> generated by gcc which correspond to the selected -m options
> (-march=native really just selects a combination of -m options.)  It
> seems bizarre to just reimplement the mechanism that already exists.

This mechanism can only do the feature-detection part.

Code generation tweaks (uop fusing etc.) require passing -march=native,
and can hardly be expressed through defines.

Some things aren't recorded in defines (--param l1-cache-size).
It is not clear how and what to optimize based on cache sizes,
but if the kernel controls the CPU detection code there is no need to wait
until someone smart comes up with an idea.

Again, clang emits slightly different defines.

> Now, this specific case would be better done with alternatives; we can
> patch in a JMP to an out-of-line stub to mangle the arguments.  Then you
> get the benefit on all systems and don't need to decide at compile time.
>
> The reason to use -m options for this would be to be able to use the
> __builtin_popcount() and __builtin_popcountl() intrinsics, which would
> allow gcc to schedule it and optimize arbitrarily.
>
> So, much more something like:
>
> #ifdef __POPCNT__
>
> static inline unsigned int __arch_hweight64(uint64_t x)
> {
> 	return __builtin_popcountll(x);

OK

Patch
diff mbox series

diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h
index 34a10b2d5b73..58e4f65d8665 100644
--- a/arch/x86/include/asm/arch_hweight.h
+++ b/arch/x86/include/asm/arch_hweight.h
@@ -2,6 +2,34 @@ 
 #ifndef _ASM_X86_HWEIGHT_H
 #define _ASM_X86_HWEIGHT_H
 
+#define __HAVE_ARCH_SW_HWEIGHT
+
+#ifdef CONFIG_MARCH_NATIVE_POPCNT
+static inline unsigned int __arch_hweight64(uint64_t x)
+{
+	uint64_t rv;
+	asm ("popcnt %1, %0" : "=r" (rv) : "rm" (x));
+	return rv;
+}
+
+static inline unsigned int __arch_hweight32(uint32_t x)
+{
+	uint32_t rv;
+	asm ("popcnt %1, %0" : "=r" (rv) : "rm" (x));
+	return rv;
+}
+
+static inline unsigned int __arch_hweight16(uint16_t x)
+{
+	return __arch_hweight32(x);
+}
+
+static inline unsigned int __arch_hweight8(uint8_t x)
+{
+	return __arch_hweight32(x);
+}
+#else
+
 #include <asm/cpufeatures.h>
 
 #ifdef CONFIG_64BIT
@@ -18,8 +46,6 @@ 
 #define REG_OUT "a"
 #endif
 
-#define __HAVE_ARCH_SW_HWEIGHT
-
 static __always_inline unsigned int __arch_hweight32(unsigned int w)
 {
 	unsigned int res;
@@ -61,3 +87,5 @@  static __always_inline unsigned long __arch_hweight64(__u64 w)
 #endif /* CONFIG_X86_32 */
 
 #endif
+
+#endif
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 7b181b61170e..c26ad76e7048 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -27,7 +27,10 @@  lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
 lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o
 lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
 
-obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
+obj-y += msr.o msr-reg.o msr-reg-export.o
+ifneq ($(CONFIG_MARCH_NATIVE),y)
+	obj-y += hweight.o
+endif
 
 ifeq ($(CONFIG_X86_32),y)
         obj-y += atomic64_32.o
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 4cac4e1a72ff..ab58fed4ab90 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -26,10 +26,12 @@ 
 	(((~0ULL) - (1ULL << (l)) + 1) & \
 	 (~0ULL >> (BITS_PER_LONG_LONG - 1 - (h))))
 
+#ifndef CONFIG_MARCH_NATIVE_POPCNT
 extern unsigned int __sw_hweight8(unsigned int w);
 extern unsigned int __sw_hweight16(unsigned int w);
 extern unsigned int __sw_hweight32(unsigned int w);
 extern unsigned long __sw_hweight64(__u64 w);
+#endif
 
 /*
  * Include this here because some architectures need generic_ffs/fls in
diff --git a/lib/Makefile b/lib/Makefile
index d11c48ec8ffd..3867b73721aa 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -81,7 +81,9 @@  obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o
 obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o
 obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o
 
+ifneq ($(CONFIG_MARCH_NATIVE_POPCNT),y)
 obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
+endif
 
 obj-$(CONFIG_BTREE) += btree.o
 obj-$(CONFIG_INTERVAL_TREE) += interval_tree.o
diff --git a/scripts/kconfig/cpuid.c b/scripts/kconfig/cpuid.c
index f1983027fe2b..e565dd446bdf 100644
--- a/scripts/kconfig/cpuid.c
+++ b/scripts/kconfig/cpuid.c
@@ -42,6 +42,8 @@  static inline void cpuid2(uint32_t eax0, uint32_t ecx0, uint32_t *eax, uint32_t
 	);
 }
 
+static bool popcnt	= false;
+
 static uint32_t eax0_max;
 
 static void intel(void)
@@ -51,6 +53,9 @@  static void intel(void)
 	if (eax0_max >= 1) {
 		cpuid(1, &eax, &ecx, &edx, &ebx);
 //		printf("%08x %08x %08x %08x\n", eax, ecx, edx, ebx);
+
+		if (ecx & (1 << 23))
+			popcnt = true;
 	}
 }
 
@@ -70,6 +75,7 @@  int main(int argc, char *argv[])
 		intel();
 
 #define _(x)	if (streq(opt, #x)) return x ? EXIT_SUCCESS : EXIT_FAILURE
+	_(popcnt);
 #undef _
 
 	return EXIT_FAILURE;
diff --git a/scripts/march-native.sh b/scripts/march-native.sh
index 4f0fc82f7722..6641e356b646 100755
--- a/scripts/march-native.sh
+++ b/scripts/march-native.sh
@@ -29,6 +29,10 @@  option() {
 	echo "#define $1 1"	>>"$AUTOCONF2"
 }
 
+if test -x "$CPUID"; then
+	"$CPUID" popcnt		&& option "CONFIG_MARCH_NATIVE_POPCNT"
+fi
+
 if test ! -f "$AUTOCONF1" -o ! -f "$AUTOCONF2"; then
 	exit 0
 fi
@@ -72,7 +76,7 @@  for i in $COLLECT_GCC_OPTIONS; do
 		-mmmx)		option "CONFIG_MARCH_NATIVE_MMX"	;;
 		-mmovbe)	option "CONFIG_MARCH_NATIVE_MOVBE"	;;
 		-mpclmul)	option "CONFIG_MARCH_NATIVE_PCLMUL"	;;
-		-mpopcnt)	option "CONFIG_MATCH_NATIVE_POPCNT"	;;
+		-mpopcnt);;
 		-mprfchw)	option "CONFIG_MARCH_NATIVE_PREFETCHW"	;;
 		-mrdrnd)	option "CONFIG_MARCH_NATIVE_RDRND"	;;
 		-mrdseed)	option "CONFIG_MARCH_NATIVE_RDSEED"	;;