[2/5] x86_64, -march=native: POPCNT support
diff mbox series

Message ID 20190722202723.13408-2-adobriyan@gmail.com
State New
Headers show
Series
  • [1/5] x86_64: -march=native support
Related show

Commit Message

Alexey Dobriyan July 22, 2019, 8:27 p.m. UTC
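Detect POPCNT instruction support at build time and inline the hweight*()
functions if the instruction is supported by the CPU.

When the kernel is built with CONFIG_MARCH_NATIVE_POPCNT, check for POPCNT
at boot time and refuse to boot if the CPU does not support it.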

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 arch/x86/include/asm/arch_hweight.h           | 24 +++++++++++++++++++
 arch/x86/include/asm/segment.h                |  1 +
 arch/x86/kernel/verify_cpu.S                  |  8 +++++++
 arch/x86/lib/Makefile                         |  5 +++-
 .../drm/i915/display/intel_display_power.c    |  2 +-
 drivers/misc/sgi-gru/grumain.c                |  2 +-
 fs/btrfs/tree-checker.c                       |  4 ++--
 include/linux/bitops.h                        |  2 ++
 lib/Makefile                                  |  2 ++
 scripts/kconfig/cpuid.c                       |  7 ++++++
 scripts/march-native.sh                       |  2 ++
 11 files changed, 54 insertions(+), 5 deletions(-)

Comments

Peter Zijlstra July 22, 2019, 9:12 p.m. UTC | #1
On Mon, Jul 22, 2019 at 11:27:20PM +0300, Alexey Dobriyan wrote:
> Detect POPCNT instruction support and inline hweigth*() functions
> if it is supported by CPU.
> 
> Detect POPCNT at boot time and conditionally refuse to boot.
> 
> Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
> ---
>  arch/x86/include/asm/arch_hweight.h           | 24 +++++++++++++++++++
>  arch/x86/include/asm/segment.h                |  1 +
>  arch/x86/kernel/verify_cpu.S                  |  8 +++++++
>  arch/x86/lib/Makefile                         |  5 +++-
>  .../drm/i915/display/intel_display_power.c    |  2 +-
>  drivers/misc/sgi-gru/grumain.c                |  2 +-
>  fs/btrfs/tree-checker.c                       |  4 ++--
>  include/linux/bitops.h                        |  2 ++
>  lib/Makefile                                  |  2 ++
>  scripts/kconfig/cpuid.c                       |  7 ++++++
>  scripts/march-native.sh                       |  2 ++
>  11 files changed, 54 insertions(+), 5 deletions(-)

*WHY* ?

AFAICT this just adds lines and complexity and wins absolutely nothing.
Alexey Dobriyan July 22, 2019, 9:15 p.m. UTC | #2
On Mon, Jul 22, 2019 at 11:12:10PM +0200, Peter Zijlstra wrote:
> On Mon, Jul 22, 2019 at 11:27:20PM +0300, Alexey Dobriyan wrote:
> > Detect POPCNT instruction support and inline hweigth*() functions
> > if it is supported by CPU.
> > 
> > Detect POPCNT at boot time and conditionally refuse to boot.
> > 
> > Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
> > ---
> >  arch/x86/include/asm/arch_hweight.h           | 24 +++++++++++++++++++
> >  arch/x86/include/asm/segment.h                |  1 +
> >  arch/x86/kernel/verify_cpu.S                  |  8 +++++++
> >  arch/x86/lib/Makefile                         |  5 +++-
> >  .../drm/i915/display/intel_display_power.c    |  2 +-
> >  drivers/misc/sgi-gru/grumain.c                |  2 +-
> >  fs/btrfs/tree-checker.c                       |  4 ++--
> >  include/linux/bitops.h                        |  2 ++
> >  lib/Makefile                                  |  2 ++
> >  scripts/kconfig/cpuid.c                       |  7 ++++++
> >  scripts/march-native.sh                       |  2 ++
> >  11 files changed, 54 insertions(+), 5 deletions(-)
> 
> *WHY* ?
> 
> AFAICT this just adds lines and complexity and wins aboslutely nothing.

If the CPU is known to have POPCNT, it doesn't make sense to go through RDI.
Additionally, some CPUs (still?) have a false dependency on the destination,
so "popcnt rax, rdi" is suboptimal.
Alexey Dobriyan July 22, 2019, 9:27 p.m. UTC | #3
On Tue, Jul 23, 2019 at 12:15:39AM +0300, Alexey Dobriyan wrote:
> On Mon, Jul 22, 2019 at 11:12:10PM +0200, Peter Zijlstra wrote:
> > On Mon, Jul 22, 2019 at 11:27:20PM +0300, Alexey Dobriyan wrote:
> > > Detect POPCNT instruction support and inline hweigth*() functions
> > > if it is supported by CPU.
> > > 
> > > Detect POPCNT at boot time and conditionally refuse to boot.
> > > 
> > > Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
> > > ---
> > >  arch/x86/include/asm/arch_hweight.h           | 24 +++++++++++++++++++
> > >  arch/x86/include/asm/segment.h                |  1 +
> > >  arch/x86/kernel/verify_cpu.S                  |  8 +++++++
> > >  arch/x86/lib/Makefile                         |  5 +++-
> > >  .../drm/i915/display/intel_display_power.c    |  2 +-
> > >  drivers/misc/sgi-gru/grumain.c                |  2 +-
> > >  fs/btrfs/tree-checker.c                       |  4 ++--
> > >  include/linux/bitops.h                        |  2 ++
> > >  lib/Makefile                                  |  2 ++
> > >  scripts/kconfig/cpuid.c                       |  7 ++++++
> > >  scripts/march-native.sh                       |  2 ++
> > >  11 files changed, 54 insertions(+), 5 deletions(-)
> > 
> > *WHY* ?
> > 
> > AFAICT this just adds lines and complexity and wins aboslutely nothing.
> 
> If CPU is know to have POPCNT, it doesn't make sense to go through RDI.
> Additionally some CPUs (still?) have fake dependency on the destination,
> so "popcnt rax, rdi" is suboptimal.

A more general argument is that if -march=native is accepted, the compiler
will generate new instructions which will throw #UD on CPUs that aren't
capable of executing them, so it doesn't make sense to _not_ go deeper and
use all the knowledge about the current CPU.
Peter Zijlstra July 23, 2019, 7:20 a.m. UTC | #4
On Tue, Jul 23, 2019 at 12:15:39AM +0300, Alexey Dobriyan wrote:
> On Mon, Jul 22, 2019 at 11:12:10PM +0200, Peter Zijlstra wrote:
> > On Mon, Jul 22, 2019 at 11:27:20PM +0300, Alexey Dobriyan wrote:
> > > Detect POPCNT instruction support and inline hweigth*() functions
> > > if it is supported by CPU.
> > > 
> > > Detect POPCNT at boot time and conditionally refuse to boot.
> > > 
> > > Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
> > > ---
> > >  arch/x86/include/asm/arch_hweight.h           | 24 +++++++++++++++++++
> > >  arch/x86/include/asm/segment.h                |  1 +
> > >  arch/x86/kernel/verify_cpu.S                  |  8 +++++++
> > >  arch/x86/lib/Makefile                         |  5 +++-
> > >  .../drm/i915/display/intel_display_power.c    |  2 +-
> > >  drivers/misc/sgi-gru/grumain.c                |  2 +-
> > >  fs/btrfs/tree-checker.c                       |  4 ++--
> > >  include/linux/bitops.h                        |  2 ++
> > >  lib/Makefile                                  |  2 ++
> > >  scripts/kconfig/cpuid.c                       |  7 ++++++
> > >  scripts/march-native.sh                       |  2 ++
> > >  11 files changed, 54 insertions(+), 5 deletions(-)
> > 
> > *WHY* ?
> > 
> > AFAICT this just adds lines and complexity and wins aboslutely nothing.
> 
> If CPU is know to have POPCNT, it doesn't make sense to go through RDI.
> Additionally some CPUs (still?) have fake dependency on the destination,
> so "popcnt rax, rdi" is suboptimal.

You completely forgot to mention any of that in your Changelog, also I
doubt you can find code where this makes a measurable difference. IOW, I
still doubt it makes any kind of sense.
Alexey Dobriyan July 23, 2019, 8:04 p.m. UTC | #5
On Tue, Jul 23, 2019 at 09:20:43AM +0200, Peter Zijlstra wrote:
> On Tue, Jul 23, 2019 at 12:15:39AM +0300, Alexey Dobriyan wrote:
> > On Mon, Jul 22, 2019 at 11:12:10PM +0200, Peter Zijlstra wrote:
> > > On Mon, Jul 22, 2019 at 11:27:20PM +0300, Alexey Dobriyan wrote:
> > > > Detect POPCNT instruction support and inline hweigth*() functions
> > > > if it is supported by CPU.
> > > > 
> > > > Detect POPCNT at boot time and conditionally refuse to boot.
> > > > 
> > > > Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
> > > > ---
> > > >  arch/x86/include/asm/arch_hweight.h           | 24 +++++++++++++++++++
> > > >  arch/x86/include/asm/segment.h                |  1 +
> > > >  arch/x86/kernel/verify_cpu.S                  |  8 +++++++
> > > >  arch/x86/lib/Makefile                         |  5 +++-
> > > >  .../drm/i915/display/intel_display_power.c    |  2 +-
> > > >  drivers/misc/sgi-gru/grumain.c                |  2 +-
> > > >  fs/btrfs/tree-checker.c                       |  4 ++--
> > > >  include/linux/bitops.h                        |  2 ++
> > > >  lib/Makefile                                  |  2 ++
> > > >  scripts/kconfig/cpuid.c                       |  7 ++++++
> > > >  scripts/march-native.sh                       |  2 ++
> > > >  11 files changed, 54 insertions(+), 5 deletions(-)
> > > 
> > > *WHY* ?
> > > 
> > > AFAICT this just adds lines and complexity and wins aboslutely nothing.
> > 
> > If CPU is know to have POPCNT, it doesn't make sense to go through RDI.
> > Additionally some CPUs (still?) have fake dependency on the destination,
> > so "popcnt rax, rdi" is suboptimal.
> 
> You completely forgot to mention any of that in your Changelog, also I
> doubt you can find code where this makes a measurable difference. IOW, I
> still doubt it makes any kind of sense.

It saves some space, although not much. gcc likes to use the 64-bit version
even where the 32-bit version should suffice.

Regardless, I found some problems with the POPCNT patch, so please hold off
on the series.

Patch
diff mbox series

diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h
index ba88edd0d58b..3797aa57baa5 100644
--- a/arch/x86/include/asm/arch_hweight.h
+++ b/arch/x86/include/asm/arch_hweight.h
@@ -2,6 +2,28 @@ 
 #ifndef _ASM_X86_HWEIGHT_H
 #define _ASM_X86_HWEIGHT_H
 
+#ifdef CONFIG_MARCH_NATIVE_POPCNT
+static inline unsigned int __arch_hweight64(uint64_t x)
+{
+	return __builtin_popcountll(x);
+}
+
+static inline unsigned int __arch_hweight32(uint32_t x)
+{
+	return __builtin_popcount(x);
+}
+
+static inline unsigned int __arch_hweight16(uint16_t x)
+{
+	return __builtin_popcount(x);
+}
+
+static inline unsigned int __arch_hweight8(uint8_t x)
+{
+	return __builtin_popcount(x);
+}
+#else
+
 #include <asm/cpufeatures.h>
 
 #ifdef CONFIG_64BIT
@@ -53,3 +75,5 @@  static __always_inline unsigned long __arch_hweight64(__u64 w)
 #endif /* CONFIG_X86_32 */
 
 #endif
+
+#endif
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index ac3892920419..d314c6b9b632 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -4,6 +4,7 @@ 
 
 #include <linux/const.h>
 #include <asm/alternative.h>
+#include <asm/cpufeatures.h>
 
 /*
  * Constructor for a conventional segment GDT (or LDT) entry.
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index a024c4f7ba56..a9be8904faa3 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -134,6 +134,14 @@  ENTRY(verify_cpu)
 	movl $1,%eax
 	ret
 .Lverify_cpu_sse_ok:
+
+#ifdef CONFIG_MARCH_NATIVE_POPCNT
+	mov	$1, %eax
+	cpuid
+	bt	$23, %ecx
+	jnc	.Lverify_cpu_no_longmode
+#endif
+
 	popf				# Restore caller passed flags
 	xorl %eax, %eax
 	ret
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 5246db42de45..7dc0e71b0ef3 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -40,7 +40,10 @@  lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
 lib-$(CONFIG_FUNCTION_ERROR_INJECTION)	+= error-inject.o
 lib-$(CONFIG_RETPOLINE) += retpoline.o
 
-obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
+obj-y += msr.o msr-reg.o msr-reg-export.o
+ifneq ($(CONFIG_MARCH_NATIVE_POPCNT),y)
+	obj-y += hweight.o
+endif
 obj-y += iomem.o
 
 ifeq ($(CONFIG_X86_32),y)
diff --git a/drivers/gpu/drm/i915/display/intel_display_power.c b/drivers/gpu/drm/i915/display/intel_display_power.c
index c93ad512014c..9066105f2fea 100644
--- a/drivers/gpu/drm/i915/display/intel_display_power.c
+++ b/drivers/gpu/drm/i915/display/intel_display_power.c
@@ -1570,7 +1570,7 @@  static void print_power_domains(struct i915_power_domains *power_domains,
 {
 	enum intel_display_power_domain domain;
 
-	DRM_DEBUG_DRIVER("%s (%lu):\n", prefix, hweight64(mask));
+	DRM_DEBUG_DRIVER("%s (%u):\n", prefix, hweight64(mask));
 	for_each_power_domain(domain, mask)
 		DRM_DEBUG_DRIVER("%s use_count %d\n",
 				 intel_display_power_domain_str(domain),
diff --git a/drivers/misc/sgi-gru/grumain.c b/drivers/misc/sgi-gru/grumain.c
index 40ac59dd018c..30cfeeb28e74 100644
--- a/drivers/misc/sgi-gru/grumain.c
+++ b/drivers/misc/sgi-gru/grumain.c
@@ -833,7 +833,7 @@  void gru_steal_context(struct gru_thread_state *gts)
 	}
 	gru_dbg(grudev,
 		"stole gid %d, ctxnum %d from gts %p. Need cb %d, ds %d;"
-		" avail cb %ld, ds %ld\n",
+		" avail cb %u, ds %u\n",
 		gru->gs_gid, ctxnum, ngts, cbr, dsr, hweight64(gru->gs_cbr_map),
 		hweight64(gru->gs_dsr_map));
 }
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index ccd5706199d7..2d33c6ae0e61 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -478,7 +478,7 @@  static int check_block_group_item(struct extent_buffer *leaf,
 	flags = btrfs_block_group_flags(&bgi);
 	if (hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) > 1) {
 		block_group_err(leaf, slot,
-"invalid profile flags, have 0x%llx (%lu bits set) expect no more than 1 bit set",
+"invalid profile flags, have 0x%llx (%u bits set) expect no more than 1 bit set",
 			flags & BTRFS_BLOCK_GROUP_PROFILE_MASK,
 			hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK));
 		return -EUCLEAN;
@@ -491,7 +491,7 @@  static int check_block_group_item(struct extent_buffer *leaf,
 	    type != (BTRFS_BLOCK_GROUP_METADATA |
 			   BTRFS_BLOCK_GROUP_DATA)) {
 		block_group_err(leaf, slot,
-"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx",
+"invalid type, have 0x%llx (%u bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx",
 			type, hweight64(type),
 			BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA,
 			BTRFS_BLOCK_GROUP_SYSTEM,
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index cf074bce3eb3..655b120bba66 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -7,10 +7,12 @@ 
 #define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE)
 #define BITS_TO_LONGS(nr)	DIV_ROUND_UP(nr, BITS_PER_TYPE(long))
 
+#ifndef CONFIG_MARCH_NATIVE_POPCNT
 extern unsigned int __sw_hweight8(unsigned int w);
 extern unsigned int __sw_hweight16(unsigned int w);
 extern unsigned int __sw_hweight32(unsigned int w);
 extern unsigned long __sw_hweight64(__u64 w);
+#endif
 
 /*
  * Include this here because some architectures need generic_ffs/fls in
diff --git a/lib/Makefile b/lib/Makefile
index 095601ce371d..32400f3a3328 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -114,7 +114,9 @@  obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o
 
 obj-y += logic_pio.o
 
+ifneq ($(CONFIG_MARCH_NATIVE_POPCNT),y)
 obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
+endif
 
 obj-$(CONFIG_BTREE) += btree.o
 obj-$(CONFIG_INTERVAL_TREE) += interval_tree.o
diff --git a/scripts/kconfig/cpuid.c b/scripts/kconfig/cpuid.c
index 81b292382e26..9efc0d9464d8 100644
--- a/scripts/kconfig/cpuid.c
+++ b/scripts/kconfig/cpuid.c
@@ -43,6 +43,8 @@  static inline void cpuid2(uint32_t eax0, uint32_t ecx0, uint32_t *eax, uint32_t
 	);
 }
 
+static bool popcnt	= false;
+
 static uint32_t eax0_max;
 
 static void intel(void)
@@ -52,6 +54,10 @@  static void intel(void)
 	if (eax0_max >= 1) {
 		cpuid(1, &eax, &ecx, &edx, &ebx);
 //		printf("%08x %08x %08x %08x\n", eax, ecx, edx, ebx);
+
+		if (ecx & (1 << 23)) {
+			popcnt = true;
+		}
 	}
 }
 
@@ -72,6 +78,7 @@  int main(int argc, char *argv[])
 	}
 
 #define _(x)	if (streq(opt, #x)) return x ? EXIT_SUCCESS : EXIT_FAILURE
+	_(popcnt);
 #undef _
 
 	return EXIT_FAILURE;
diff --git a/scripts/march-native.sh b/scripts/march-native.sh
index 29a33c80b62b..c3059f93ed2b 100755
--- a/scripts/march-native.sh
+++ b/scripts/march-native.sh
@@ -41,6 +41,8 @@  COLLECT_GCC_OPTIONS=$(
 )
 echo "-march=native: $COLLECT_GCC_OPTIONS"
 
+"$CPUID" popcnt		&& option "CONFIG_MARCH_NATIVE_POPCNT"
+
 for i in $COLLECT_GCC_OPTIONS; do
 	case $i in
 		*/cc1|-E|-quiet|-v|/dev/null|--param|-fstack-protector*)