[Xen-devel] [PATCH v2] x86: use POPCNT for hweight<N>() when available

* [Xen-devel] [PATCH v2] x86: use POPCNT for hweight<N>() when available
@ 2019-07-15 14:39 Jan Beulich
  2020-05-14 14:05 ` Roger Pau Monné
  2023-03-17 11:22 ` Roger Pau Monné
  0 siblings, 2 replies; 21+ messages in thread
From: Jan Beulich @ 2019-07-15 14:39 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monné

[-- Attachment #1: Type: text/plain, Size: 4300 bytes --]

This is faster than using the software implementation, and the insn is
available on all half-way recent hardware. Therefore convert
generic_hweight<N>() to out-of-line functions (without affecting Arm)
and use alternatives patching to replace the function calls.

Note that the approach doesn#t work for clang, due to it not recognizing
-ffixed-*.

Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Also suppress UB sanitizer instrumentation. Reduce macroization in
     hweight.c. Exclude clang builds.
---
Note: Using "g" instead of "X" as the dummy constraint in hweight64()
       and hweight32(), other than expected, produces slightly better
       code with gcc 8.

--- a/xen/arch/x86/Makefile
+++ b/xen/arch/x86/Makefile
@@ -31,6 +31,10 @@ obj-y += emul-i8254.o
  obj-y += extable.o
  obj-y += flushtlb.o
  obj-$(CONFIG_CRASH_DEBUG) += gdbstub.o
+# clang doesn't appear to know of -ffixed-*
+hweight-$(gcc) := hweight.o
+hweight-$(clang) :=
+obj-y += $(hweight-y)
  obj-y += hypercall.o
  obj-y += i387.o
  obj-y += i8259.o
@@ -251,6 +255,10 @@ boot/mkelf32: boot/mkelf32.c
  efi/mkreloc: efi/mkreloc.c
  	$(HOSTCC) $(HOSTCFLAGS) -g -o $@ $<
  
+nocov-y += hweight.o
+noubsan-y += hweight.o
+hweight.o: CFLAGS += $(foreach reg,cx dx si 8 9 10 11,-ffixed-r$(reg))
+
  .PHONY: clean
  clean::
  	rm -f asm-offsets.s *.lds boot/*.o boot/*~ boot/core boot/mkelf32
--- /dev/null
+++ b/xen/arch/x86/hweight.c
@@ -0,0 +1,21 @@
+#define generic_hweight64 _hweight64
+#define generic_hweight32 _hweight32
+#define generic_hweight16 _hweight16
+#define generic_hweight8  _hweight8
+
+#include <xen/compiler.h>
+
+#undef inline
+#define inline always_inline
+
+#include <xen/bitops.h>
+
+#undef generic_hweight8
+#undef generic_hweight16
+#undef generic_hweight32
+#undef generic_hweight64
+
+unsigned int generic_hweight8 (unsigned int x) { return _hweight8 (x); }
+unsigned int generic_hweight16(unsigned int x) { return _hweight16(x); }
+unsigned int generic_hweight32(unsigned int x) { return _hweight32(x); }
+unsigned int generic_hweight64(uint64_t x)     { return _hweight64(x); }
--- a/xen/include/asm-x86/bitops.h
+++ b/xen/include/asm-x86/bitops.h
@@ -475,9 +475,36 @@ static inline int fls(unsigned int x)
   *
   * The Hamming Weight of a number is the total number of bits set in it.
   */
+#ifndef __clang__
+/* POPCNT encodings with %{r,e}di input and %{r,e}ax output: */
+#define POPCNT_64 ".byte 0xF3, 0x48, 0x0F, 0xB8, 0xC7"
+#define POPCNT_32 ".byte 0xF3, 0x0F, 0xB8, 0xC7"
+
+#define hweight_(n, x, insn, setup, cout, cin) ({                         \
+    unsigned int res_;                                                    \
+    /*                                                                    \
+     * For the function call the POPCNT input register needs to be marked \
+     * modified as well. Set up a local variable of appropriate type      \
+     * for this purpose.                                                  \
+     */                                                                   \
+    typeof((uint##n##_t)(x) + 0U) val_ = (x);                             \
+    alternative_io(setup "; call generic_hweight" #n,                     \
+                   insn, X86_FEATURE_POPCNT,                              \
+                   ASM_OUTPUT2([res] "=a" (res_), [val] cout (val_)),     \
+                   [src] cin (val_));                                     \
+    res_;                                                                 \
+})
+#define hweight64(x) hweight_(64, x, POPCNT_64, "", "+D", "g")
+#define hweight32(x) hweight_(32, x, POPCNT_32, "", "+D", "g")
+#define hweight16(x) hweight_(16, x, "movzwl %w[src], %[val]; " POPCNT_32, \
+                              "mov %[src], %[val]", "=&D", "rm")
+#define hweight8(x)  hweight_( 8, x, "movzbl %b[src], %[val]; " POPCNT_32, \
+                              "mov %[src], %[val]", "=&D", "rm")
+#else
  #define hweight64(x) generic_hweight64(x)
  #define hweight32(x) generic_hweight32(x)
  #define hweight16(x) generic_hweight16(x)
  #define hweight8(x) generic_hweight8(x)
+#endif
  
  #endif /* _X86_BITOPS_H */

[-- Attachment #2: x86-hweight-popcnt.patch --]
[-- Type: text/plain, Size: 4220 bytes --]

x86: use POPCNT for hweight<N>() when available

This is faster than using the software implementation, and the insn is
available on all half-way recent hardware. Therefore convert
generic_hweight<N>() to out-of-line functions (without affecting Arm)
and use alternatives patching to replace the function calls.

Note that the approach doesn#t work for clang, due to it not recognizing
-ffixed-*.

Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Also suppress UB sanitizer instrumentation. Reduce macroization in
    hweight.c. Exclude clang builds.
---
Note: Using "g" instead of "X" as the dummy constraint in hweight64()
      and hweight32(), other than expected, produces slightly better
      code with gcc 8.

--- a/xen/arch/x86/Makefile
+++ b/xen/arch/x86/Makefile
@@ -31,6 +31,10 @@ obj-y += emul-i8254.o
 obj-y += extable.o
 obj-y += flushtlb.o
 obj-$(CONFIG_CRASH_DEBUG) += gdbstub.o
+# clang doesn't appear to know of -ffixed-*
+hweight-$(gcc) := hweight.o
+hweight-$(clang) :=
+obj-y += $(hweight-y)
 obj-y += hypercall.o
 obj-y += i387.o
 obj-y += i8259.o
@@ -251,6 +255,10 @@ boot/mkelf32: boot/mkelf32.c
 efi/mkreloc: efi/mkreloc.c
 	$(HOSTCC) $(HOSTCFLAGS) -g -o $@ $<
 
+nocov-y += hweight.o
+noubsan-y += hweight.o
+hweight.o: CFLAGS += $(foreach reg,cx dx si 8 9 10 11,-ffixed-r$(reg))
+
 .PHONY: clean
 clean::
 	rm -f asm-offsets.s *.lds boot/*.o boot/*~ boot/core boot/mkelf32
--- /dev/null
+++ b/xen/arch/x86/hweight.c
@@ -0,0 +1,21 @@
+#define generic_hweight64 _hweight64
+#define generic_hweight32 _hweight32
+#define generic_hweight16 _hweight16
+#define generic_hweight8  _hweight8
+
+#include <xen/compiler.h>
+
+#undef inline
+#define inline always_inline
+
+#include <xen/bitops.h>
+
+#undef generic_hweight8
+#undef generic_hweight16
+#undef generic_hweight32
+#undef generic_hweight64
+
+unsigned int generic_hweight8 (unsigned int x) { return _hweight8 (x); }
+unsigned int generic_hweight16(unsigned int x) { return _hweight16(x); }
+unsigned int generic_hweight32(unsigned int x) { return _hweight32(x); }
+unsigned int generic_hweight64(uint64_t x)     { return _hweight64(x); }
--- a/xen/include/asm-x86/bitops.h
+++ b/xen/include/asm-x86/bitops.h
@@ -475,9 +475,36 @@ static inline int fls(unsigned int x)
  *
  * The Hamming Weight of a number is the total number of bits set in it.
  */
+#ifndef __clang__
+/* POPCNT encodings with %{r,e}di input and %{r,e}ax output: */
+#define POPCNT_64 ".byte 0xF3, 0x48, 0x0F, 0xB8, 0xC7"
+#define POPCNT_32 ".byte 0xF3, 0x0F, 0xB8, 0xC7"
+
+#define hweight_(n, x, insn, setup, cout, cin) ({                         \
+    unsigned int res_;                                                    \
+    /*                                                                    \
+     * For the function call the POPCNT input register needs to be marked \
+     * modified as well. Set up a local variable of appropriate type      \
+     * for this purpose.                                                  \
+     */                                                                   \
+    typeof((uint##n##_t)(x) + 0U) val_ = (x);                             \
+    alternative_io(setup "; call generic_hweight" #n,                     \
+                   insn, X86_FEATURE_POPCNT,                              \
+                   ASM_OUTPUT2([res] "=a" (res_), [val] cout (val_)),     \
+                   [src] cin (val_));                                     \
+    res_;                                                                 \
+})
+#define hweight64(x) hweight_(64, x, POPCNT_64, "", "+D", "g")
+#define hweight32(x) hweight_(32, x, POPCNT_32, "", "+D", "g")
+#define hweight16(x) hweight_(16, x, "movzwl %w[src], %[val]; " POPCNT_32, \
+                              "mov %[src], %[val]", "=&D", "rm")
+#define hweight8(x)  hweight_( 8, x, "movzbl %b[src], %[val]; " POPCNT_32, \
+                              "mov %[src], %[val]", "=&D", "rm")
+#else
 #define hweight64(x) generic_hweight64(x)
 #define hweight32(x) generic_hweight32(x)
 #define hweight16(x) generic_hweight16(x)
 #define hweight8(x) generic_hweight8(x)
+#endif
 
 #endif /* _X86_BITOPS_H */

[-- Attachment #3: Type: text/plain, Size: 157 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 21+ messages in thread