All of lore.kernel.org
 help / color / mirror / Atom feed
From: Borislav Petkov <bp@suse.de>
To: "H. Peter Anvin" <hpa@zytor.com>, Peter Zijlstra <peterz@infradead.org>
Cc: Brian Gerst <brgerst@gmail.com>, x86-ml <x86@kernel.org>,
	Denys Vlasenko <dvlasenk@redhat.com>,
	LKML <linux-kernel@vger.kernel.org>,
	Dmitry Vyukov <dvyukov@google.com>,
	Andi Kleen <andi@firstfloor.org>,
	zengzhaoxiu@163.com, Thomas Gleixner <tglx@linutronix.de>,
	Ingo Molnar <mingo@redhat.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Kees Cook <keescook@chromium.org>,
	Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>,
	Andy Lutomirski <luto@amacapital.net>
Subject: Re: [PATCH -v2] x86/hweight: Get rid of the special calling convention
Date: Wed, 18 May 2016 12:38:46 +0200	[thread overview]
Message-ID: <20160518103846.GA30633@pd.tnic> (raw)
In-Reply-To: <20160512130932.GC10056@pd.tnic>

On Thu, May 12, 2016 at 03:09:32PM +0200, Borislav Petkov wrote:
> I wanted to have gcc use %[w] and this way not hardcode the reg but the
> ABI kinda hardcodes it to rAX. And you're right about tracing funkyness
> adding glue so we're probably better off doing the .S thing directly and
> making it more robust this way.

Ok, here's a new version. Let me know what you think before I hammer
on it more seriously. Booting in kvm both with "-popcnt" and without it
seems to work: no splats, corruptions or whatnot.

I've also been running POPCNT vs the asm versions in userspace and
comparing results, looks good too, seems to work correctly. :)

Thanks.

---
From: Borislav Petkov <bp@suse.de>
Date: Wed, 4 May 2016 18:52:09 +0200
Subject: [PATCH] x86/hweight: Get rid of the special calling convention

People complained about ARCH_HWEIGHT_CFLAGS and how it throws a
wrench into kcov, lto, etc. experimentation. Add asm versions of
__sw_hweight{32,64}() which explicitly save and restore the registers
they clobber. This gets rid of the special calling convention.

We still need to hardcode the POPCNT opcode bytes and their register
operands because some old gas versions which we still support do not
know about POPCNT.

Btw, remove the redundant REX prefix from the 32-bit POPCNT because
alternatives can do padding now.

Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/Kconfig                    |  5 ---
 arch/x86/include/asm/arch_hweight.h | 24 +++++-------
 arch/x86/kernel/i386_ksyms_32.c     |  2 +
 arch/x86/kernel/x8664_ksyms_64.c    |  3 ++
 arch/x86/lib/Makefile               |  2 +-
 arch/x86/lib/hweight.S              | 77 +++++++++++++++++++++++++++++++++++++
 lib/Makefile                        |  5 ---
 lib/hweight.c                       |  4 ++
 8 files changed, 97 insertions(+), 25 deletions(-)
 create mode 100644 arch/x86/lib/hweight.S

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2dc18605831f..c3a8f360683b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -296,11 +296,6 @@ config X86_32_LAZY_GS
 	def_bool y
 	depends on X86_32 && !CC_STACKPROTECTOR
 
-config ARCH_HWEIGHT_CFLAGS
-	string
-	default "-fcall-saved-ecx -fcall-saved-edx" if X86_32
-	default "-fcall-saved-rdi -fcall-saved-rsi -fcall-saved-rdx -fcall-saved-rcx -fcall-saved-r8 -fcall-saved-r9 -fcall-saved-r10 -fcall-saved-r11" if X86_64
-
 config ARCH_SUPPORTS_UPROBES
 	def_bool y
 
diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h
index 02e799fa43d1..e7cd63175de4 100644
--- a/arch/x86/include/asm/arch_hweight.h
+++ b/arch/x86/include/asm/arch_hweight.h
@@ -4,8 +4,8 @@
 #include <asm/cpufeatures.h>
 
 #ifdef CONFIG_64BIT
-/* popcnt %edi, %eax -- redundant REX prefix for alignment */
-#define POPCNT32 ".byte 0xf3,0x40,0x0f,0xb8,0xc7"
+/* popcnt %edi, %eax */
+#define POPCNT32 ".byte 0xf3,0x0f,0xb8,0xc7"
 /* popcnt %rdi, %rax */
 #define POPCNT64 ".byte 0xf3,0x48,0x0f,0xb8,0xc7"
 #define REG_IN "D"
@@ -17,19 +17,15 @@
 #define REG_OUT "a"
 #endif
 
-/*
- * __sw_hweightXX are called from within the alternatives below
- * and callee-clobbered registers need to be taken care of. See
- * ARCH_HWEIGHT_CFLAGS in <arch/x86/Kconfig> for the respective
- * compiler switches.
- */
+#define __HAVE_ARCH_SW_HWEIGHT
+
 static __always_inline unsigned int __arch_hweight32(unsigned int w)
 {
-	unsigned int res = 0;
+	unsigned int res;
 
 	asm (ALTERNATIVE("call __sw_hweight32", POPCNT32, X86_FEATURE_POPCNT)
-		     : "="REG_OUT (res)
-		     : REG_IN (w));
+			 : "="REG_OUT (res)
+			 : REG_IN (w));
 
 	return res;
 }
@@ -53,11 +49,11 @@ static inline unsigned long __arch_hweight64(__u64 w)
 #else
 static __always_inline unsigned long __arch_hweight64(__u64 w)
 {
-	unsigned long res = 0;
+	unsigned long res;
 
 	asm (ALTERNATIVE("call __sw_hweight64", POPCNT64, X86_FEATURE_POPCNT)
-		     : "="REG_OUT (res)
-		     : REG_IN (w));
+			 : "="REG_OUT (res)
+			 : REG_IN (w));
 
 	return res;
 }
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index 64341aa485ae..d40ee8a38fed 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -42,3 +42,5 @@ EXPORT_SYMBOL(empty_zero_page);
 EXPORT_SYMBOL(___preempt_schedule);
 EXPORT_SYMBOL(___preempt_schedule_notrace);
 #endif
+
+EXPORT_SYMBOL(__sw_hweight32);
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index cd05942bc918..f1aebfb49c36 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -44,6 +44,9 @@ EXPORT_SYMBOL(clear_page);
 
 EXPORT_SYMBOL(csum_partial);
 
+EXPORT_SYMBOL(__sw_hweight32);
+EXPORT_SYMBOL(__sw_hweight64);
+
 /*
  * Export string functions. We normally rely on gcc builtin for most of these,
  * but gcc sometimes decides not to inline them.
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 72a576752a7e..ec969cc3eb20 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -25,7 +25,7 @@ lib-y += memcpy_$(BITS).o
 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
 lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
 
-obj-y += msr.o msr-reg.o msr-reg-export.o
+obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
 
 ifeq ($(CONFIG_X86_32),y)
         obj-y += atomic64_32.o
diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S
new file mode 100644
index 000000000000..8cacaf6aa74d
--- /dev/null
+++ b/arch/x86/lib/hweight.S
@@ -0,0 +1,77 @@
+#include <linux/linkage.h>
+
+#include <asm/asm.h>
+
+/*
+ * unsigned int __sw_hweight32(unsigned int w)
+ * w is passed in %edi on 64-bit and in %eax on 32-bit; the result is in %eax.
+ */
+ENTRY(__sw_hweight32)
+
+#ifdef CONFIG_X86_64
+	movl %edi, %eax				# w
+#endif
+	__ASM_SIZE(push,) %__ASM_REG(dx)
+	movl %eax, %edx				# w -> t
+	shrl %edx				# t >>= 1
+	andl $0x55555555, %edx			# t &= 0x55555555
+	subl %edx, %eax				# w -= t
+
+	movl %eax, %edx				# w -> t
+	shrl $2, %eax				# w_tmp >>= 2
+	andl $0x33333333, %edx			# t	&= 0x33333333
+	andl $0x33333333, %eax			# w_tmp &= 0x33333333
+	addl %edx, %eax				# w = w_tmp + t
+
+	movl %eax, %edx				# w -> t
+	shrl $4, %edx				# t >>= 4
+	addl %edx, %eax				# w_tmp += t
+	andl  $0x0f0f0f0f, %eax			# w_tmp &= 0x0f0f0f0f
+	imull $0x01010101, %eax, %eax		# w_tmp *= 0x01010101
+	shrl $24, %eax				# w = w_tmp >> 24
+	__ASM_SIZE(pop,) %__ASM_REG(dx)
+	ret
+ENDPROC(__sw_hweight32)
+
+ENTRY(__sw_hweight64)
+#ifdef CONFIG_X86_64
+	pushq   %rdx
+
+	movq    %rdi, %rdx                      # w -> t
+	movabsq $0x5555555555555555, %rax
+	shrq    %rdx                            # t >>= 1
+	andq    %rdx, %rax                      # t &= 0x5555555555555555
+	movabsq $0x3333333333333333, %rdx
+	subq    %rax, %rdi                      # w -= t
+
+	movq    %rdi, %rax                      # w -> t
+	shrq    $2, %rdi                        # w_tmp >>= 2
+	andq    %rdx, %rax                      # t     &= 0x3333333333333333
+	andq    %rdi, %rdx                      # w_tmp &= 0x3333333333333333
+	addq    %rdx, %rax                      # w = w_tmp + t
+
+	movq    %rax, %rdx                      # w -> t
+	shrq    $4, %rdx                        # t >>= 4
+	addq    %rdx, %rax                      # w_tmp += t
+	movabsq $0x0f0f0f0f0f0f0f0f, %rdx
+	andq    %rdx, %rax                      # w_tmp &= 0x0f0f0f0f0f0f0f0f
+	movabsq $0x0101010101010101, %rdx
+	imulq   %rdx, %rax                      # w_tmp *= 0x0101010101010101
+	shrq    $56, %rax                       # w = w_tmp >> 56
+
+	popq    %rdx
+	ret
+#else /* CONFIG_X86_32 */
+        /* We're getting a u64 arg in (%eax,%edx): unsigned long hweight64(__u64 w) */
+        pushl   %ecx
+
+        call    __sw_hweight32
+        movl    %eax, %ecx                      # stash away result
+        movl    %edx, %eax                      # second part of input
+        call    __sw_hweight32
+        addl    %ecx, %eax                      # result
+
+        popl    %ecx
+        ret
+#endif
+ENDPROC(__sw_hweight64)
diff --git a/lib/Makefile b/lib/Makefile
index 7bd6fd436c97..08ea9f1c0c49 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -15,9 +15,6 @@ KCOV_INSTRUMENT_rbtree.o := n
 KCOV_INSTRUMENT_list_debug.o := n
 KCOV_INSTRUMENT_debugobjects.o := n
 KCOV_INSTRUMENT_dynamic_debug.o := n
-# Kernel does not boot if we instrument this file as it uses custom calling
-# convention (see CONFIG_ARCH_HWEIGHT_CFLAGS).
-KCOV_INSTRUMENT_hweight.o := n
 
 lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 rbtree.o radix-tree.o dump_stack.o timerqueue.o\
@@ -72,8 +69,6 @@ obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o
 obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o
 obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o
 
-GCOV_PROFILE_hweight.o := n
-CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS))
 obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
 
 obj-$(CONFIG_BTREE) += btree.o
diff --git a/lib/hweight.c b/lib/hweight.c
index 9a5c1f221558..43273a7d83cf 100644
--- a/lib/hweight.c
+++ b/lib/hweight.c
@@ -9,6 +9,7 @@
  * The Hamming Weight of a number is the total number of bits set in it.
  */
 
+#ifndef __HAVE_ARCH_SW_HWEIGHT
 unsigned int __sw_hweight32(unsigned int w)
 {
 #ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER
@@ -25,6 +26,7 @@ unsigned int __sw_hweight32(unsigned int w)
 #endif
 }
 EXPORT_SYMBOL(__sw_hweight32);
+#endif
 
 unsigned int __sw_hweight16(unsigned int w)
 {
@@ -43,6 +45,7 @@ unsigned int __sw_hweight8(unsigned int w)
 }
 EXPORT_SYMBOL(__sw_hweight8);
 
+#ifndef __HAVE_ARCH_SW_HWEIGHT
 unsigned long __sw_hweight64(__u64 w)
 {
 #if BITS_PER_LONG == 32
@@ -65,3 +68,4 @@ unsigned long __sw_hweight64(__u64 w)
 #endif
 }
 EXPORT_SYMBOL(__sw_hweight64);
+#endif
-- 
2.7.3

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
-- 

  reply	other threads:[~2016-05-18 10:39 UTC|newest]

Thread overview: 104+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-04-05  2:06 [PATCH V2 01/30] bitops: add parity functions Zeng Zhaoxiu
2016-04-05  4:23 ` [PATCH V2 02/30] Include generic parity.h in some architectures' bitops.h Zeng Zhaoxiu
2016-04-05  4:23 ` Zeng Zhaoxiu
2016-04-05  4:23   ` Zeng Zhaoxiu
2016-04-05  4:23   ` Zeng Zhaoxiu
2016-04-05  4:23   ` Zeng Zhaoxiu
2016-04-05  4:23   ` Zeng Zhaoxiu
2016-04-05  4:23   ` Zeng Zhaoxiu
2016-04-06  8:41   ` [PATCH v2 " zengzhaoxiu
2016-04-06  8:41   ` zengzhaoxiu
2016-04-06  8:41     ` zengzhaoxiu at 163.com
2016-04-06  8:41     ` zengzhaoxiu
2016-04-06  8:41     ` zengzhaoxiu
2016-04-06  8:41     ` zengzhaoxiu
2016-04-11 17:31     ` Alexey Brodkin
2016-04-11 17:31       ` Alexey Brodkin
2016-04-11 17:31       ` Alexey Brodkin
2016-04-11 17:31       ` Alexey Brodkin
2016-04-05 19:04 ` [PATCH V2 01/30] bitops: add parity functions Sam Ravnborg
2016-04-06  5:33   ` Zeng Zhaoxiu
2016-04-06  8:24     ` Sam Ravnborg
2016-04-06  8:22   ` [PATCH v2 " zengzhaoxiu
2016-04-06  8:46 ` [PATCH v2 03/30] Add alpha-specific " zengzhaoxiu
2016-04-06  8:53 ` [PATCH v2 04/30] Add blackfin-specific " zengzhaoxiu
2016-04-06  8:57 ` [PATCH v2 05/30] Add ia64-specific " zengzhaoxiu
2016-04-06  8:57   ` zengzhaoxiu
2016-04-06  8:59 ` [PATCH v2 06/30] Add mips-specific " zengzhaoxiu
2016-04-06 10:23   ` zengzhaoxiu
2016-04-06  9:03 ` [PATCH v2 07/30] Add powerpc-specific " zengzhaoxiu
2016-04-06  9:07 ` [PATCH v2 08/30] Add sparc-specific " zengzhaoxiu
2016-04-06  9:07   ` zengzhaoxiu
2016-04-06 16:37   ` Josip Rodin
2016-04-06 18:44   ` Sam Ravnborg
2016-04-06 18:44     ` Sam Ravnborg
2016-04-07  3:56     ` Zeng Zhaoxiu
2016-04-07  3:56       ` Zeng Zhaoxiu
2016-04-06  9:08 ` [PATCH v2 09/30] Add tile-specific " zengzhaoxiu
2016-04-06 13:27   ` Chris Metcalf
2016-04-07  3:55     ` Zeng Zhaoxiu
2016-04-06  9:14 ` [PATCH v2 10/30] Add x86-specific " zengzhaoxiu
2016-04-06 10:13   ` Borislav Petkov
2016-04-06 10:37     ` One Thousand Gnomes
2016-04-06 10:53       ` Borislav Petkov
2016-04-07  3:55         ` Zeng Zhaoxiu
2016-04-07  9:39           ` Borislav Petkov
2016-04-11  2:43       ` Zeng Zhaoxiu
2016-04-15  2:11         ` Borislav Petkov
2016-04-07  3:55     ` Zeng Zhaoxiu
2016-04-07  9:41       ` Borislav Petkov
2016-04-06 19:45   ` Andi Kleen
2016-04-07  3:56     ` Zeng Zhaoxiu
2016-04-07  6:31     ` Dmitry Vyukov
2016-04-07  9:43       ` Borislav Petkov
2016-05-04 18:46         ` [RFC PATCH] x86/hweight: Get rid of the special calling convention Borislav Petkov
2016-05-04 19:31           ` Brian Gerst
2016-05-04 19:33             ` H. Peter Anvin
2016-05-04 19:41               ` Borislav Petkov
2016-05-04 19:49                 ` H. Peter Anvin
2016-05-04 20:22                   ` Borislav Petkov
2016-05-04 20:51                     ` H. Peter Anvin
2016-05-04 21:09                     ` Andi Kleen
2016-05-05 13:02                     ` Denys Vlasenko
2016-05-05 14:04                       ` Borislav Petkov
2016-05-10 16:53                         ` [PATCH -v2] " Borislav Petkov
2016-05-10 17:23                           ` Peter Zijlstra
2016-05-10 19:02                             ` Borislav Petkov
2016-05-10 19:03                             ` H. Peter Anvin
2016-05-10 19:10                               ` Borislav Petkov
2016-05-10 22:30                                 ` H. Peter Anvin
2016-05-11  4:11                                   ` Borislav Petkov
2016-05-11 11:15                                     ` Brian Gerst
2016-05-11 11:24                                       ` Peter Zijlstra
2016-05-11 12:47                                         ` Borislav Petkov
2016-05-12  4:54                                         ` H. Peter Anvin
2016-05-12 11:57                                           ` Borislav Petkov
2016-05-12 12:14                                             ` Peter Zijlstra
2016-05-12 13:09                                               ` Borislav Petkov
2016-05-18 10:38                                                 ` Borislav Petkov [this message]
2016-04-07 14:10     ` [PATCH v2 10/30] Add x86-specific parity functions One Thousand Gnomes
2016-04-06  9:27 ` [PATCH v2 11/30] sunrpc: use parity8 zengzhaoxiu
2016-04-06  9:30 ` [PATCH v2 12/30] mips: use parity functions in cerr-sb1.c zengzhaoxiu
2016-04-06  9:36 ` [PATCH v2 13/30] bch: use parity32 zengzhaoxiu
2016-04-06  9:39 ` [PATCH v2 14/30] media: use parity8 in vivid-vbi-gen.c zengzhaoxiu
2016-04-06  9:41 ` [PATCH v2 15/30] media: use parity functions in saa7115 zengzhaoxiu
2016-04-06  9:43 ` [PATCH v2 16/30] input: use parity32 in grip_mp zengzhaoxiu
2016-04-06  9:44 ` [PATCH v2 17/30] input: use parity64 in sidewinder zengzhaoxiu
2016-04-06  9:45 ` [PATCH v2 18/30] input: use parity16 in ams_delta_serio zengzhaoxiu
2016-04-06  9:47 ` [PATCH v2 19/30] scsi: use parity32 in isci's phy zengzhaoxiu
2016-04-06  9:52 ` [PATCH v2 20/30] mtd: use parity16 in ssfdc zengzhaoxiu
2016-04-06  9:53 ` [PATCH v2 21/30] mtd: use parity functions in inftlcore zengzhaoxiu
2016-04-06  9:58 ` [PATCH v2 22/30] crypto: use parity functions in qat_hal zengzhaoxiu
2016-04-06 10:05 ` [PATCH v2 23/30] mtd: use parity16 in sm_ftl zengzhaoxiu
2016-04-06 10:11 ` [PATCH v2 24/30] ethernet: use parity8 in sun/niu.c zengzhaoxiu
2016-04-06 10:14 ` [PATCH v2 25/30] input: use parity8 in pcips2 zengzhaoxiu
2016-04-06 10:15 ` [PATCH v2 26/30] input: use parity8 in sa1111ps2 zengzhaoxiu
2016-04-06 10:16 ` [PATCH v2 27/30] iio: use parity32 in adxrs450 zengzhaoxiu
2016-04-10 14:37   ` Jonathan Cameron
2016-04-10 14:41     ` Lars-Peter Clausen
2016-04-10 15:13       ` Jonathan Cameron
2016-04-10 15:14         ` Jonathan Cameron
2016-04-06 10:18 ` [PATCH v2 28/30] serial: use parity32 in max3100 zengzhaoxiu
2016-04-06 10:25   ` Greg KH
2016-04-06 10:20 ` [PATCH v2 29/30] input: use parity8 in elantech zengzhaoxiu
2016-04-06 10:21 ` [PATCH v2 30/30] ethernet: use parity8 in broadcom/tg3.c zengzhaoxiu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20160518103846.GA30633@pd.tnic \
    --to=bp@suse.de \
    --cc=akpm@linux-foundation.org \
    --cc=andi@firstfloor.org \
    --cc=brgerst@gmail.com \
    --cc=dvlasenk@redhat.com \
    --cc=dvyukov@google.com \
    --cc=hpa@zytor.com \
    --cc=keescook@chromium.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=luto@amacapital.net \
    --cc=mingo@redhat.com \
    --cc=peterz@infradead.org \
    --cc=tglx@linutronix.de \
    --cc=x86@kernel.org \
    --cc=zengzhaoxiu@163.com \
    --cc=zhaoxiu.zeng@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.