From: Jan Beulich <JBeulich@suse.com>
To: "xen-devel@lists.xenproject.org" <xen-devel@lists.xenproject.org>
Cc: "Andrew Cooper" <andrew.cooper3@citrix.com>,
"Wei Liu" <wl@xen.org>, "Roger Pau Monné" <roger.pau@citrix.com>
Subject: [Xen-devel] [PATCH v2] x86: use POPCNT for hweight<N>() when available
Date: Mon, 15 Jul 2019 14:39:04 +0000 [thread overview]
Message-ID: <55a4a24d-7fac-527c-6bcf-8d689136bac2@suse.com> (raw)
[-- Attachment #1: Type: text/plain, Size: 4300 bytes --]
This is faster than using the software implementation, and the insn is
available on all half-way recent hardware. Therefore convert
generic_hweight<N>() to out-of-line functions (without affecting Arm)
and use alternatives patching to replace the function calls.
Note that the approach doesn't work for clang, due to it not recognizing
-ffixed-*.
Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Also suppress UB sanitizer instrumentation. Reduce macroization in
hweight.c. Exclude clang builds.
---
Note: Using "g" instead of "X" as the dummy constraint in hweight64()
and hweight32(), contrary to expectation, produces slightly better
code with gcc 8.
--- a/xen/arch/x86/Makefile
+++ b/xen/arch/x86/Makefile
@@ -31,6 +31,10 @@ obj-y += emul-i8254.o
obj-y += extable.o
obj-y += flushtlb.o
obj-$(CONFIG_CRASH_DEBUG) += gdbstub.o
+# clang doesn't appear to know of -ffixed-*
+hweight-$(gcc) := hweight.o
+hweight-$(clang) :=
+obj-y += $(hweight-y)
obj-y += hypercall.o
obj-y += i387.o
obj-y += i8259.o
@@ -251,6 +255,10 @@ boot/mkelf32: boot/mkelf32.c
efi/mkreloc: efi/mkreloc.c
$(HOSTCC) $(HOSTCFLAGS) -g -o $@ $<
+nocov-y += hweight.o
+noubsan-y += hweight.o
+hweight.o: CFLAGS += $(foreach reg,cx dx si 8 9 10 11,-ffixed-r$(reg))
+
.PHONY: clean
clean::
rm -f asm-offsets.s *.lds boot/*.o boot/*~ boot/core boot/mkelf32
--- /dev/null
+++ b/xen/arch/x86/hweight.c
@@ -0,0 +1,21 @@
+#define generic_hweight64 _hweight64
+#define generic_hweight32 _hweight32
+#define generic_hweight16 _hweight16
+#define generic_hweight8 _hweight8
+
+#include <xen/compiler.h>
+
+#undef inline
+#define inline always_inline
+
+#include <xen/bitops.h>
+
+#undef generic_hweight8
+#undef generic_hweight16
+#undef generic_hweight32
+#undef generic_hweight64
+
+unsigned int generic_hweight8 (unsigned int x) { return _hweight8 (x); }
+unsigned int generic_hweight16(unsigned int x) { return _hweight16(x); }
+unsigned int generic_hweight32(unsigned int x) { return _hweight32(x); }
+unsigned int generic_hweight64(uint64_t x) { return _hweight64(x); }
--- a/xen/include/asm-x86/bitops.h
+++ b/xen/include/asm-x86/bitops.h
@@ -475,9 +475,36 @@ static inline int fls(unsigned int x)
*
* The Hamming Weight of a number is the total number of bits set in it.
*/
+#ifndef __clang__
+/* POPCNT encodings with %{r,e}di input and %{r,e}ax output: */
+#define POPCNT_64 ".byte 0xF3, 0x48, 0x0F, 0xB8, 0xC7"
+#define POPCNT_32 ".byte 0xF3, 0x0F, 0xB8, 0xC7"
+
+#define hweight_(n, x, insn, setup, cout, cin) ({ \
+ unsigned int res_; \
+ /* \
+ * For the function call the POPCNT input register needs to be marked \
+ * modified as well. Set up a local variable of appropriate type \
+ * for this purpose. \
+ */ \
+ typeof((uint##n##_t)(x) + 0U) val_ = (x); \
+ alternative_io(setup "; call generic_hweight" #n, \
+ insn, X86_FEATURE_POPCNT, \
+ ASM_OUTPUT2([res] "=a" (res_), [val] cout (val_)), \
+ [src] cin (val_)); \
+ res_; \
+})
+#define hweight64(x) hweight_(64, x, POPCNT_64, "", "+D", "g")
+#define hweight32(x) hweight_(32, x, POPCNT_32, "", "+D", "g")
+#define hweight16(x) hweight_(16, x, "movzwl %w[src], %[val]; " POPCNT_32, \
+ "mov %[src], %[val]", "=&D", "rm")
+#define hweight8(x) hweight_( 8, x, "movzbl %b[src], %[val]; " POPCNT_32, \
+ "mov %[src], %[val]", "=&D", "rm")
+#else
#define hweight64(x) generic_hweight64(x)
#define hweight32(x) generic_hweight32(x)
#define hweight16(x) generic_hweight16(x)
#define hweight8(x) generic_hweight8(x)
+#endif
#endif /* _X86_BITOPS_H */
[-- Attachment #2: x86-hweight-popcnt.patch --]
[-- Type: text/plain, Size: 4220 bytes --]
x86: use POPCNT for hweight<N>() when available
This is faster than using the software implementation, and the insn is
available on all half-way recent hardware. Therefore convert
generic_hweight<N>() to out-of-line functions (without affecting Arm)
and use alternatives patching to replace the function calls.
Note that the approach doesn't work for clang, due to it not recognizing
-ffixed-*.
Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Also suppress UB sanitizer instrumentation. Reduce macroization in
hweight.c. Exclude clang builds.
---
Note: Using "g" instead of "X" as the dummy constraint in hweight64()
and hweight32(), contrary to expectation, produces slightly better
code with gcc 8.
--- a/xen/arch/x86/Makefile
+++ b/xen/arch/x86/Makefile
@@ -31,6 +31,10 @@ obj-y += emul-i8254.o
obj-y += extable.o
obj-y += flushtlb.o
obj-$(CONFIG_CRASH_DEBUG) += gdbstub.o
+# clang doesn't appear to know of -ffixed-*
+hweight-$(gcc) := hweight.o
+hweight-$(clang) :=
+obj-y += $(hweight-y)
obj-y += hypercall.o
obj-y += i387.o
obj-y += i8259.o
@@ -251,6 +255,10 @@ boot/mkelf32: boot/mkelf32.c
efi/mkreloc: efi/mkreloc.c
$(HOSTCC) $(HOSTCFLAGS) -g -o $@ $<
+nocov-y += hweight.o
+noubsan-y += hweight.o
+hweight.o: CFLAGS += $(foreach reg,cx dx si 8 9 10 11,-ffixed-r$(reg))
+
.PHONY: clean
clean::
rm -f asm-offsets.s *.lds boot/*.o boot/*~ boot/core boot/mkelf32
--- /dev/null
+++ b/xen/arch/x86/hweight.c
@@ -0,0 +1,21 @@
+#define generic_hweight64 _hweight64
+#define generic_hweight32 _hweight32
+#define generic_hweight16 _hweight16
+#define generic_hweight8 _hweight8
+
+#include <xen/compiler.h>
+
+#undef inline
+#define inline always_inline
+
+#include <xen/bitops.h>
+
+#undef generic_hweight8
+#undef generic_hweight16
+#undef generic_hweight32
+#undef generic_hweight64
+
+unsigned int generic_hweight8 (unsigned int x) { return _hweight8 (x); }
+unsigned int generic_hweight16(unsigned int x) { return _hweight16(x); }
+unsigned int generic_hweight32(unsigned int x) { return _hweight32(x); }
+unsigned int generic_hweight64(uint64_t x) { return _hweight64(x); }
--- a/xen/include/asm-x86/bitops.h
+++ b/xen/include/asm-x86/bitops.h
@@ -475,9 +475,36 @@ static inline int fls(unsigned int x)
*
* The Hamming Weight of a number is the total number of bits set in it.
*/
+#ifndef __clang__
+/* POPCNT encodings with %{r,e}di input and %{r,e}ax output: */
+#define POPCNT_64 ".byte 0xF3, 0x48, 0x0F, 0xB8, 0xC7"
+#define POPCNT_32 ".byte 0xF3, 0x0F, 0xB8, 0xC7"
+
+#define hweight_(n, x, insn, setup, cout, cin) ({ \
+ unsigned int res_; \
+ /* \
+ * For the function call the POPCNT input register needs to be marked \
+ * modified as well. Set up a local variable of appropriate type \
+ * for this purpose. \
+ */ \
+ typeof((uint##n##_t)(x) + 0U) val_ = (x); \
+ alternative_io(setup "; call generic_hweight" #n, \
+ insn, X86_FEATURE_POPCNT, \
+ ASM_OUTPUT2([res] "=a" (res_), [val] cout (val_)), \
+ [src] cin (val_)); \
+ res_; \
+})
+#define hweight64(x) hweight_(64, x, POPCNT_64, "", "+D", "g")
+#define hweight32(x) hweight_(32, x, POPCNT_32, "", "+D", "g")
+#define hweight16(x) hweight_(16, x, "movzwl %w[src], %[val]; " POPCNT_32, \
+ "mov %[src], %[val]", "=&D", "rm")
+#define hweight8(x) hweight_( 8, x, "movzbl %b[src], %[val]; " POPCNT_32, \
+ "mov %[src], %[val]", "=&D", "rm")
+#else
#define hweight64(x) generic_hweight64(x)
#define hweight32(x) generic_hweight32(x)
#define hweight16(x) generic_hweight16(x)
#define hweight8(x) generic_hweight8(x)
+#endif
#endif /* _X86_BITOPS_H */
[-- Attachment #3: Type: text/plain, Size: 157 bytes --]
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
next reply other threads:[~2019-07-15 14:40 UTC|newest]
Thread overview: 21+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-07-15 14:39 Jan Beulich [this message]
2020-05-14 14:05 ` [PATCH v2] x86: use POPCNT for hweight<N>() when available Roger Pau Monné
2020-05-20 8:31 ` Jan Beulich
2020-05-20 9:31 ` Roger Pau Monné
2020-05-20 10:17 ` Jan Beulich
2020-05-20 10:28 ` Roger Pau Monné
2020-05-20 10:57 ` Jan Beulich
2020-05-20 11:43 ` Roger Pau Monné
2020-05-20 13:12 ` Jan Beulich
2020-05-20 17:18 ` Roger Pau Monné
2020-05-22 9:58 ` Jan Beulich
2020-05-22 10:19 ` Roger Pau Monné
2023-03-17 11:22 ` Roger Pau Monné
2023-03-17 12:26 ` Andrew Cooper
2023-03-20 9:48 ` Jan Beulich
2023-03-21 14:57 ` Roger Pau Monné
2023-03-21 15:35 ` Jan Beulich
2023-03-21 15:44 ` Juergen Gross
2023-03-21 16:31 ` Roger Pau Monné
2023-03-21 16:41 ` Jan Beulich
2023-03-21 17:02 ` Roger Pau Monné
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=55a4a24d-7fac-527c-6bcf-8d689136bac2@suse.com \
--to=jbeulich@suse.com \
--cc=andrew.cooper3@citrix.com \
--cc=roger.pau@citrix.com \
--cc=wl@xen.org \
--cc=xen-devel@lists.xenproject.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.