linux-kernel.vger.kernel.org archive mirror
* [PATCH V2 01/30] bitops: add parity functions
@ 2016-04-05  2:06 Zeng Zhaoxiu
  2016-04-05  4:23 ` [PATCH V2 02/30] Include generic parity.h in some architectures' bitops.h Zeng Zhaoxiu
                   ` (29 more replies)
  0 siblings, 30 replies; 84+ messages in thread
From: Zeng Zhaoxiu @ 2016-04-05  2:06 UTC (permalink / raw)
  To: Arnd Bergmann, Andrew Morton, Martin Kepplinger,
	Rasmus Villemoes, Ingo Molnar, Yury Norov, Sasha Levin,
	Denys Vlasenko
  Cc: linux-kernel, linux-arch

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

These patches provide generic and architecture-specific odd parity
calculations.

I did not use GCC's __builtin_parity* functions, for the following
reasons:
   1. There is no easy way to determine, per architecture, which GCC
      version first supported __builtin_parity.
   2. For architectures without a popcount instruction, GCC instead emits
      "call __paritysi2" (__paritydi2 for 64 bits). So if we used
      __builtin_parity, we would have to provide __paritysi2 and
      __paritydi2 for these architectures (a sketch of what such a
      fallback would look like follows this list). Additionally,
      parity4/8/16 would have to be "__builtin_parity(x & mask)", and
      the "& mask" operation is totally unnecessary.
   3. For architectures that do have a popcount instruction, we do the
      same thing ourselves.
   4. For powerpc, sparc, and x86, we use runtime patching to select the
      popcount instruction when the CPU supports it.
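
For illustration only, such a fallback would have to look roughly like
this (the name follows the libgcc convention; this is a sketch, not part
of the series):

     int __paritysi2(unsigned int w)
     {
         w ^= w >> 16;
         w ^= w >> 8;
         w ^= w >> 4;
         return (0x6996 >> (w & 0xf)) & 1;  /* per-nibble parity lookup */
     }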

I have compiled successfully with x86_64_defconfig, i386_defconfig,
pseries_defconfig and sparc64_defconfig, and I used the following code
to test:

     #include <stdio.h>
     #include <stdlib.h>
     #include <stdint.h>
     #include <time.h>   /* for time(), used to seed srand() below */

     #ifdef __x86_64__
     /* popcnt %edi, %eax -- redundant REX prefix for alignment */
     #define POPCNT32 ".byte 0xf3,0x40,0x0f,0xb8,0xc7"
     /* popcnt %rdi, %rax */
     #define POPCNT64 ".byte 0xf3,0x48,0x0f,0xb8,0xc7"
     #define REG_IN "D"
     #define REG_OUT "a"
     #else
     /* popcnt %eax, %eax */
     #define POPCNT32 ".byte 0xf3,0x0f,0xb8,0xc0"
     #define REG_IN "a"
     #define REG_OUT "a"
     #endif

     static inline int c_parity4(unsigned int w)
     {
         w &= 0xf;
         return (0x6996 >> w) & 1;
     }

     static inline int c_parity8(unsigned int w)
     {
         w ^= w >> 4;
         return c_parity4(w);
     }

     static inline int c_parity16(unsigned int w)
     {
         w ^= w >> 8;
         return c_parity8(w);
     }

     static inline int c_parity32(unsigned int w)
     {
         w ^= w >> 16;
         return c_parity16(w);
     }

     static inline int c_parity64(uint64_t w)
     {
         return c_parity32((unsigned int)w ^ (unsigned int)(w >> 32));
     }

     static inline int asm_parity4(unsigned int w)
     {
         unsigned int res = 0;

         asm("test    $0xf, %1        \n"
             "setpo    %b0                \n"
             : "+q" (res)
             : "r" (w)
             : "cc");

         return res;
     }

     static inline int asm_parity8(unsigned int w)
     {
         unsigned int res = 0;

         asm("test    %1, %1            \n"
             "setpo    %b0            \n"
             : "+q" (res)
             : "r" (w)
             : "cc");

         return res;
     }

     static inline int asm_parity16(unsigned int w)
     {
         unsigned int res = 0;

         asm("xor    %h1, %b1        \n"
             "setpo    %b0            \n"
             : "+q" (res), "+q" (w)
             : : "cc");

         return res;
     }

     static inline int asm_parity32_1(unsigned int w)
     {
         unsigned int res;

         w ^= w >> 16;
         asm("xor    %%ah, %%al        \n"
             "mov    $0, %%eax        \n"
             "setpo    %%al            \n"
             : "=a" (res)
             : "a" (w)
             : "cc");

         return res;
     }

     static inline int asm_parity32_2(unsigned int w)
     {
         unsigned int res;

         asm(POPCNT32 "                \n"
             "andl    $1, %0            \n"
             : "="REG_OUT (res)
             : REG_IN (w)
             : "cc");

         return res;
     }

     #ifdef __x86_64__
     static inline int asm_parity64_1(uint64_t w)
     {
         unsigned int res = (unsigned int)w ^ (unsigned int)(w >> 32);

         res ^= res >> 16;
         asm("xor    %%ah, %%al        \n"
             "mov    $0, %%eax        \n"
             "setpo    %%al            \n"
             : "=a" (res)
             : "a" (res)
             : "cc");

         return res;
     }

     static inline int asm_parity64_2(uint64_t w)
     {
         unsigned int res;

         asm(POPCNT64 "                \n"
             "andl    $1, %0            \n"
             : "="REG_OUT (res)
             : REG_IN (w)
             : "cc");

         return res;
     }
     #else
     static inline int asm_parity64_1(uint64_t w)
     {
         return asm_parity32_1((unsigned int)(w >> 32) ^ (unsigned int)w);
     }

     static inline int asm_parity64_2(uint64_t w)
     {
         return asm_parity32_2((unsigned int)(w >> 32) ^ (unsigned int)w);
     }
     #endif

     int main(int argc, char **argv)
     {
         int ok = 1;
         int count = 1000, i;

         if (argc >= 2)
             count = atoi(argv[1]);

         srand((unsigned)time(NULL));

         for (i = 0; i < count; i++) {
             uint64_t w = rand() | (uint64_t)rand() << 32;
             int p4_1 = c_parity4(w);
             int p4_2 = asm_parity4(w);
             int p8_1 = c_parity8(w);
             int p8_2 = asm_parity8(w);
             int p16_1 = c_parity16(w);
             int p16_2 = asm_parity16(w);
             int p32_1 = c_parity32(w);
             int p32_2 = asm_parity32_1(w);
             int p32_3 = asm_parity32_2(w);
             int p64_1 = c_parity64(w);
             int p64_2 = asm_parity64_1(w);
             int p64_3 = asm_parity64_2(w);
             if (p4_1 != p4_2 ||
                 p8_1 != p8_2 ||
                 p16_1 != p16_2 ||
                 p32_1 != p32_2 || p32_1 != p32_3 ||
                 p64_1 != p64_2 || p64_1 != p64_3) {
                 fprintf(stderr, "Err: %llx\n"
                             "\tc_parity4 = %d, asm_parity4 = %d,\n"
                             "\tc_parity8 = %d, asm_parity8 = %d,\n"
                             "\tc_parity16 = %d, asm_parity16 = %d,\n"
                             "\tc_parity32 = %d, asm_parity32_1 = %d, asm_parity32_2 = %d\n"
                             "\tc_parity64 = %d, asm_parity64_1 = %d, asm_parity64_2 = %d\n",
                             w, p4_1, p4_2, p8_1, p8_2, p16_1, p16_2, p32_1, p32_2, p32_3, p64_1, p64_2, p64_3);
                 ok = 0;
             }
         }

         fprintf(stderr, "%s\n", ok ? "OK" : "FAIL");
         return 0;
     }
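
For reference, the test program is x86-only; it can be built and run
with something like the following (file name and iteration count are
arbitrary, and the popcnt-based variants need a CPU with POPCNT):

     gcc -O2 -o parity_test parity_test.c
     ./parity_test 1000000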

---
  include/asm-generic/bitops.h              |  1 +
  include/asm-generic/bitops/arch_parity.h  | 39 +++++++++++++++++++++++++++++++
  include/asm-generic/bitops/const_parity.h | 36 ++++++++++++++++++++++++++++
  include/asm-generic/bitops/parity.h       |  7 ++++++
  include/linux/bitops.h                    |  5 ++++
  5 files changed, 88 insertions(+)
  create mode 100644 include/asm-generic/bitops/arch_parity.h
  create mode 100644 include/asm-generic/bitops/const_parity.h
  create mode 100644 include/asm-generic/bitops/parity.h

diff --git a/include/asm-generic/bitops.h b/include/asm-generic/bitops.h
index dcdcacf..d85722f 100644
--- a/include/asm-generic/bitops.h
+++ b/include/asm-generic/bitops.h
@@ -27,6 +27,7 @@
  #include <asm-generic/bitops/sched.h>
  #include <asm-generic/bitops/ffs.h>
  #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
  #include <asm-generic/bitops/lock.h>

  #include <asm-generic/bitops/atomic.h>
diff --git a/include/asm-generic/bitops/arch_parity.h b/include/asm-generic/bitops/arch_parity.h
new file mode 100644
index 0000000..cddc555
--- /dev/null
+++ b/include/asm-generic/bitops/arch_parity.h
@@ -0,0 +1,39 @@
+#ifndef _ASM_GENERIC_BITOPS_ARCH_PARITY_H_
+#define _ASM_GENERIC_BITOPS_ARCH_PARITY_H_
+
+#include <asm/types.h>
+
+/*
+ * Reference to 'https://graphics.stanford.edu/~seander/bithacks.html#ParityParallel'.
+ */
+
+static inline unsigned int __arch_parity4(unsigned int w)
+{
+    w &= 0xf;
+    return (0x6996 >> w) & 1;
+}
+
+static inline unsigned int __arch_parity8(unsigned int w)
+{
+    w ^= w >> 4;
+    return __arch_parity4(w);
+}
+
+static inline unsigned int __arch_parity16(unsigned int w)
+{
+    w ^= w >> 8;
+    return __arch_parity8(w);
+}
+
+static inline unsigned int __arch_parity32(unsigned int w)
+{
+    w ^= w >> 16;
+    return __arch_parity16(w);
+}
+
+static inline unsigned int __arch_parity64(__u64 w)
+{
+    return __arch_parity32((unsigned int)(w >> 32) ^ (unsigned int)w);
+}
+
+#endif /* _ASM_GENERIC_BITOPS_ARCH_PARITY_H_ */
diff --git a/include/asm-generic/bitops/const_parity.h b/include/asm-generic/bitops/const_parity.h
new file mode 100644
index 0000000..6af7987
--- /dev/null
+++ b/include/asm-generic/bitops/const_parity.h
@@ -0,0 +1,36 @@
+#ifndef _ASM_GENERIC_BITOPS_CONST_PARITY_H_
+#define _ASM_GENERIC_BITOPS_CONST_PARITY_H_
+
+/*
+ * Compile time versions of __arch_parityN()
+ */
+#define __const_parity4(w)   ((0x6996 >> ((w) & 0xf)) & 1)
+#define __const_parity8(w)   (__const_parity4((w) ^ ((w) >> 4)))
+#define __const_parity16(w)  (__const_parity8((w) ^ ((w) >> 8)))
+#define __const_parity32(w)  (__const_parity16((w) ^ ((w) >> 16)))
+#define __const_parity64(w)  (__const_parity32((w) ^ ((w) >> 32)))
+
+/*
+ * Generic interface.
+ */
+#define parity4(w)   (__builtin_constant_p(w) ? __const_parity4(w)  : __arch_parity4(w))
+#define parity8(w)   (__builtin_constant_p(w) ? __const_parity8(w)  : __arch_parity8(w))
+#define parity16(w)  (__builtin_constant_p(w) ? __const_parity16(w) : __arch_parity16(w))
+#define parity32(w)  (__builtin_constant_p(w) ? __const_parity32(w) : __arch_parity32(w))
+#define parity64(w)  (__builtin_constant_p(w) ? __const_parity64(w) : __arch_parity64(w))
+
+/*
+ * Interface for known constant arguments
+ */
+#define PARITY4(w)   (BUILD_BUG_ON_ZERO(!__builtin_constant_p(w)) + __const_parity4(w))
+#define PARITY8(w)   (BUILD_BUG_ON_ZERO(!__builtin_constant_p(w)) + __const_parity8(w))
+#define PARITY16(w)  (BUILD_BUG_ON_ZERO(!__builtin_constant_p(w)) + __const_parity16(w))
+#define PARITY32(w)  (BUILD_BUG_ON_ZERO(!__builtin_constant_p(w)) + __const_parity32(w))
+#define PARITY64(w)  (BUILD_BUG_ON_ZERO(!__builtin_constant_p(w)) + __const_parity64(w))
+
+/*
+ * Type invariant interface to the compile time constant parity functions.
+ */
+#define PARITY(w)    PARITY64((u64)(w))
+
+#endif /* _ASM_GENERIC_BITOPS_CONST_PARITY_H_ */
diff --git a/include/asm-generic/bitops/parity.h b/include/asm-generic/bitops/parity.h
new file mode 100644
index 0000000..a91dce7
--- /dev/null
+++ b/include/asm-generic/bitops/parity.h
@@ -0,0 +1,7 @@
+#ifndef _ASM_GENERIC_BITOPS_PARITY_H_
+#define _ASM_GENERIC_BITOPS_PARITY_H_
+
+#include <asm-generic/bitops/arch_parity.h>
+#include <asm-generic/bitops/const_parity.h>
+
+#endif /* _ASM_GENERIC_BITOPS_PARITY_H_ */
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index defeaac..8952f88 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -80,6 +80,11 @@ static __always_inline unsigned long hweight_long(unsigned long w)
      return sizeof(w) == 4 ? hweight32(w) : hweight64(w);
  }

+static __always_inline unsigned int parity_long(unsigned long w)
+{
+    return sizeof(w) == 4 ? parity32(w) : parity64(w);
+}
+
  /**
   * rol64 - rotate a 64-bit value left
   * @word: value to rotate
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH V2 02/30] Include generic parity.h in some architectures' bitops.h
  2016-04-05  2:06 [PATCH V2 01/30] bitops: add parity functions Zeng Zhaoxiu
@ 2016-04-05  4:23 ` Zeng Zhaoxiu
  2016-04-06  8:41   ` [PATCH v2 " zengzhaoxiu
  2016-04-05 19:04 ` [PATCH V2 01/30] bitops: add parity functions Sam Ravnborg
                   ` (28 subsequent siblings)
  29 siblings, 1 reply; 84+ messages in thread
From: Zeng Zhaoxiu @ 2016-04-05  4:23 UTC (permalink / raw)
  To: Vineet Gupta, Russell King, Catalin Marinas, Will Deacon,
	Haavard Skinnemoen, Hans-Christian Egtvedt, Mark Salter,
	Aurelien Jacquiot, Mikael Starvik, Jesper Nilsson,
	Yoshinori Sato, Richard Kuo, Geert Uytterhoeven, James Hogan,
	David Howells, Koichi Yasutake, Jonas Bonn, James E.J. Bottomley,
	Helge Deller, Martin Schwidefsky, Heiko Carstens, Rich Felker,
	Chris Zankel, Max Filippov, Peter Zijlstra (Intel),
	Masahiro Yamada, Thomas Gleixner, Hendrik Brueckner
  Cc: linux-kernel, linux-snps-arc, linux-arm-kernel, linux-c6x-dev,
	linux-cris-kernel, uclinux-h8-devel, linux-hexagon, linux-m68k,
	linux-metag, linux-am33-list, linux, linux-parisc, linux-s390,
	linux-sh, linux-xtensa

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

Use the generic version
---
  arch/arc/include/asm/bitops.h      | 1 +
  arch/arm/include/asm/bitops.h      | 1 +
  arch/arm64/include/asm/bitops.h    | 1 +
  arch/avr32/include/asm/bitops.h    | 1 +
  arch/c6x/include/asm/bitops.h      | 1 +
  arch/cris/include/asm/bitops.h     | 1 +
  arch/frv/include/asm/bitops.h      | 1 +
  arch/h8300/include/asm/bitops.h    | 1 +
  arch/hexagon/include/asm/bitops.h  | 1 +
  arch/m32r/include/asm/bitops.h     | 1 +
  arch/m68k/include/asm/bitops.h     | 1 +
  arch/metag/include/asm/bitops.h    | 1 +
  arch/mn10300/include/asm/bitops.h  | 1 +
  arch/openrisc/include/asm/bitops.h | 1 +
  arch/parisc/include/asm/bitops.h   | 1 +
  arch/s390/include/asm/bitops.h     | 1 +
  arch/sh/include/asm/bitops.h       | 1 +
  arch/xtensa/include/asm/bitops.h   | 1 +
  18 files changed, 18 insertions(+)

diff --git a/arch/arc/include/asm/bitops.h b/arch/arc/include/asm/bitops.h
index 0352fb8..7967e47 100644
--- a/arch/arc/include/asm/bitops.h
+++ b/arch/arc/include/asm/bitops.h
@@ -370,6 +370,7 @@ static inline __attribute__ ((const)) int __ffs(unsigned long x)
  #define ffz(x)    __ffs(~(x))

  #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
  #include <asm-generic/bitops/fls64.h>
  #include <asm-generic/bitops/sched.h>
  #include <asm-generic/bitops/lock.h>
diff --git a/arch/arm/include/asm/bitops.h b/arch/arm/include/asm/bitops.h
index e943e6c..99f28a6 100644
--- a/arch/arm/include/asm/bitops.h
+++ b/arch/arm/include/asm/bitops.h
@@ -313,6 +313,7 @@ static inline unsigned long __ffs(unsigned long x)

  #include <asm-generic/bitops/sched.h>
  #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
  #include <asm-generic/bitops/lock.h>

  #ifdef __ARMEB__
diff --git a/arch/arm64/include/asm/bitops.h b/arch/arm64/include/asm/bitops.h
index 9c19594..eac4965 100644
--- a/arch/arm64/include/asm/bitops.h
+++ b/arch/arm64/include/asm/bitops.h
@@ -44,6 +44,7 @@ extern int test_and_change_bit(int nr, volatile unsigned long *p);

  #include <asm-generic/bitops/sched.h>
  #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
  #include <asm-generic/bitops/lock.h>

  #include <asm-generic/bitops/non-atomic.h>
diff --git a/arch/avr32/include/asm/bitops.h b/arch/avr32/include/asm/bitops.h
index 910d537..9f4a2ce 100644
--- a/arch/avr32/include/asm/bitops.h
+++ b/arch/avr32/include/asm/bitops.h
@@ -298,6 +298,7 @@ static inline int ffs(unsigned long word)
  #include <asm-generic/bitops/fls64.h>
  #include <asm-generic/bitops/sched.h>
  #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
  #include <asm-generic/bitops/lock.h>

  extern unsigned long find_next_zero_bit_le(const void *addr,
diff --git a/arch/c6x/include/asm/bitops.h b/arch/c6x/include/asm/bitops.h
index f0ab012..94eb0d1 100644
--- a/arch/c6x/include/asm/bitops.h
+++ b/arch/c6x/include/asm/bitops.h
@@ -87,6 +87,7 @@ static inline int ffs(int x)

  #include <asm-generic/bitops/sched.h>
  #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
  #include <asm-generic/bitops/lock.h>

  #include <asm-generic/bitops/atomic.h>
diff --git a/arch/cris/include/asm/bitops.h b/arch/cris/include/asm/bitops.h
index 8062cb5..06bc246 100644
--- a/arch/cris/include/asm/bitops.h
+++ b/arch/cris/include/asm/bitops.h
@@ -36,6 +36,7 @@
  #include <asm-generic/bitops/__fls.h>
  #include <asm-generic/bitops/fls64.h>
  #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
  #include <asm-generic/bitops/find.h>
  #include <asm-generic/bitops/lock.h>

diff --git a/arch/frv/include/asm/bitops.h b/arch/frv/include/asm/bitops.h
index 0df8e95..f2a7ee8 100644
--- a/arch/frv/include/asm/bitops.h
+++ b/arch/frv/include/asm/bitops.h
@@ -314,6 +314,7 @@ int __ilog2_u64(u64 n)

  #include <asm-generic/bitops/sched.h>
  #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
  #include <asm-generic/bitops/lock.h>

  #include <asm-generic/bitops/le.h>
diff --git a/arch/h8300/include/asm/bitops.h b/arch/h8300/include/asm/bitops.h
index 05999ab..e392db2 100644
--- a/arch/h8300/include/asm/bitops.h
+++ b/arch/h8300/include/asm/bitops.h
@@ -172,6 +172,7 @@ static inline unsigned long __ffs(unsigned long word)
  #include <asm-generic/bitops/find.h>
  #include <asm-generic/bitops/sched.h>
  #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
  #include <asm-generic/bitops/lock.h>
  #include <asm-generic/bitops/le.h>
  #include <asm-generic/bitops/ext2-atomic.h>
diff --git a/arch/hexagon/include/asm/bitops.h b/arch/hexagon/include/asm/bitops.h
index 5e4a59b..2df614e 100644
--- a/arch/hexagon/include/asm/bitops.h
+++ b/arch/hexagon/include/asm/bitops.h
@@ -290,6 +290,7 @@ static inline unsigned long __fls(unsigned long word)
  #include <asm-generic/bitops/fls64.h>
  #include <asm-generic/bitops/sched.h>
  #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>

  #include <asm-generic/bitops/le.h>
  #include <asm-generic/bitops/ext2-atomic.h>
diff --git a/arch/m32r/include/asm/bitops.h b/arch/m32r/include/asm/bitops.h
index 86ba2b4..e3cf46b 100644
--- a/arch/m32r/include/asm/bitops.h
+++ b/arch/m32r/include/asm/bitops.h
@@ -259,6 +259,7 @@ static __inline__ int test_and_change_bit(int nr, volatile void * addr)
  #include <asm-generic/bitops/find.h>
  #include <asm-generic/bitops/ffs.h>
  #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
  #include <asm-generic/bitops/lock.h>

  #endif /* __KERNEL__ */
diff --git a/arch/m68k/include/asm/bitops.h b/arch/m68k/include/asm/bitops.h
index b4a9b0d..fd673ea 100644
--- a/arch/m68k/include/asm/bitops.h
+++ b/arch/m68k/include/asm/bitops.h
@@ -519,6 +519,7 @@ static inline int __fls(int x)
  #include <asm-generic/bitops/fls64.h>
  #include <asm-generic/bitops/sched.h>
  #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
  #include <asm-generic/bitops/lock.h>
  #endif /* __KERNEL__ */

diff --git a/arch/metag/include/asm/bitops.h b/arch/metag/include/asm/bitops.h
index 2671134..ad13087 100644
--- a/arch/metag/include/asm/bitops.h
+++ b/arch/metag/include/asm/bitops.h
@@ -118,6 +118,7 @@ static inline int test_and_change_bit(unsigned int bit,
  #include <asm-generic/bitops/__fls.h>
  #include <asm-generic/bitops/fls64.h>
  #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
  #include <asm-generic/bitops/lock.h>
  #include <asm-generic/bitops/sched.h>
  #include <asm-generic/bitops/le.h>
diff --git a/arch/mn10300/include/asm/bitops.h b/arch/mn10300/include/asm/bitops.h
index fe6f8e2..60761b7 100644
--- a/arch/mn10300/include/asm/bitops.h
+++ b/arch/mn10300/include/asm/bitops.h
@@ -225,6 +225,7 @@ int ffs(int x)
  #include <asm-generic/bitops/find.h>
  #include <asm-generic/bitops/sched.h>
  #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
  #include <asm-generic/bitops/ext2-atomic-setbit.h>
  #include <asm-generic/bitops/le.h>

diff --git a/arch/openrisc/include/asm/bitops.h b/arch/openrisc/include/asm/bitops.h
index 3003cda..8c97642 100644
--- a/arch/openrisc/include/asm/bitops.h
+++ b/arch/openrisc/include/asm/bitops.h
@@ -43,6 +43,7 @@
  #include <asm-generic/bitops/sched.h>
  #include <asm/bitops/ffs.h>
  #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
  #include <asm-generic/bitops/lock.h>

  #include <asm-generic/bitops/atomic.h>
diff --git a/arch/parisc/include/asm/bitops.h b/arch/parisc/include/asm/bitops.h
index 3f9406d..867ba10 100644
--- a/arch/parisc/include/asm/bitops.h
+++ b/arch/parisc/include/asm/bitops.h
@@ -211,6 +211,7 @@ static __inline__ int fls(int x)
  #include <asm-generic/bitops/__fls.h>
  #include <asm-generic/bitops/fls64.h>
  #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
  #include <asm-generic/bitops/lock.h>
  #include <asm-generic/bitops/sched.h>

diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h
index 8043f10..198eead 100644
--- a/arch/s390/include/asm/bitops.h
+++ b/arch/s390/include/asm/bitops.h
@@ -460,6 +460,7 @@ static inline int fls(int word)
  #include <asm-generic/bitops/ffz.h>
  #include <asm-generic/bitops/find.h>
  #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
  #include <asm-generic/bitops/sched.h>
  #include <asm-generic/bitops/le.h>
  #include <asm-generic/bitops/ext2-atomic-setbit.h>
diff --git a/arch/sh/include/asm/bitops.h b/arch/sh/include/asm/bitops.h
index fc8e652..4bf0c35 100644
--- a/arch/sh/include/asm/bitops.h
+++ b/arch/sh/include/asm/bitops.h
@@ -86,6 +86,7 @@ static inline unsigned long ffz(unsigned long word)
  #include <asm-generic/bitops/find.h>
  #include <asm-generic/bitops/ffs.h>
  #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
  #include <asm-generic/bitops/lock.h>
  #include <asm-generic/bitops/sched.h>
  #include <asm-generic/bitops/le.h>
diff --git a/arch/xtensa/include/asm/bitops.h b/arch/xtensa/include/asm/bitops.h
index 3f44fa2..981fa83 100644
--- a/arch/xtensa/include/asm/bitops.h
+++ b/arch/xtensa/include/asm/bitops.h
@@ -229,6 +229,7 @@ test_and_change_bit(unsigned int bit, volatile unsigned long *p)
  #include <asm-generic/bitops/ext2-atomic-setbit.h>

  #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
  #include <asm-generic/bitops/lock.h>
  #include <asm-generic/bitops/sched.h>

-- 
2.5.0

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* Re: [PATCH V2 01/30] bitops: add parity functions
  2016-04-05  2:06 [PATCH V2 01/30] bitops: add parity functions Zeng Zhaoxiu
  2016-04-05  4:23 ` [PATCH V2 02/30] Include generic parity.h in some architectures' bitops.h Zeng Zhaoxiu
@ 2016-04-05 19:04 ` Sam Ravnborg
  2016-04-06  5:33   ` Zeng Zhaoxiu
  2016-04-06  8:22   ` [PATCH v2 " zengzhaoxiu
  2016-04-06  8:46 ` [PATCH v2 03/30] Add alpha-specific " zengzhaoxiu
                   ` (27 subsequent siblings)
  29 siblings, 2 replies; 84+ messages in thread
From: Sam Ravnborg @ 2016-04-05 19:04 UTC (permalink / raw)
  To: Zeng Zhaoxiu
  Cc: Arnd Bergmann, Andrew Morton, Martin Kepplinger,
	Rasmus Villemoes, Ingo Molnar, Yury Norov, Sasha Levin,
	Denys Vlasenko, linux-kernel, linux-arch

On Tue, Apr 05, 2016 at 10:06:21AM +0800, Zeng Zhaoxiu wrote:
> From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
> 
> These patches provide generic and architecture-specific odd parity
> calculations.

Hi Zeng.

Can you please fix your mail script.
I see only 1/30 (sent to linux-arch) - and the patch looks mangled (broken lines).
No mail was sent to sparclinux - but sparc was mentioned.

git send-email usually does the trick.

	Sam

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH V2 01/30] bitops: add parity functions
  2016-04-05 19:04 ` [PATCH V2 01/30] bitops: add parity functions Sam Ravnborg
@ 2016-04-06  5:33   ` Zeng Zhaoxiu
  2016-04-06  8:24     ` Sam Ravnborg
  2016-04-06  8:22   ` [PATCH v2 " zengzhaoxiu
  1 sibling, 1 reply; 84+ messages in thread
From: Zeng Zhaoxiu @ 2016-04-06  5:33 UTC (permalink / raw)
  To: Sam Ravnborg; +Cc: linux-kernel, linux-arch

On 2016-04-06 03:04, Sam Ravnborg wrote:
> On Tue, Apr 05, 2016 at 10:06:21AM +0800, Zeng Zhaoxiu wrote:
>> From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
>>
>> These patches provide generic and architecture-specific odd parity
>> calculations.
> Hi Zeng.
>
> Can you please fix your mail script.
> I see only 1/30 (sent to linux-arch) - and the patch looks mangled (broken lines).
> No mail was sent to sparclinux - but sparc was mentioned.
>
> git send-email usually does the trick.
>
> 	Sam

When I do "git send-email", I got:
...
5.7.14 JTibJDWdGxPcfa-E9KgtF-grMQl9w> Please log in via your web browser and
5.7.14 then try again.
5.7.14  Learn more at
5.7.14  https://support.google.com/mail/answer/78754 zp5sm464879pac.9 - gsmtp
...

So I used Thunderbird to send the mail instead, but it wrapped the text incorrectly.

^ permalink raw reply	[flat|nested] 84+ messages in thread

* [PATCH v2 01/30] bitops: add parity functions
  2016-04-05 19:04 ` [PATCH V2 01/30] bitops: add parity functions Sam Ravnborg
  2016-04-06  5:33   ` Zeng Zhaoxiu
@ 2016-04-06  8:22   ` zengzhaoxiu
  1 sibling, 0 replies; 84+ messages in thread
From: zengzhaoxiu @ 2016-04-06  8:22 UTC (permalink / raw)
  To: joe, sam, arnd, akpm, martink, linux, mingo, yury.norov,
	sasha.levin, dvlasenk
  Cc: linux-kernel, linux-arch, Zhaoxiu Zeng

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

These patches provide generic and architecture-specific odd parity calculations.

I did not use GCC's __builtin_parity* functions, for the following reasons:
  1. There is no easy way to determine, per architecture, which GCC version
     first supported __builtin_parity.
  2. For architectures without a popcount instruction, GCC instead emits
     "call __paritysi2" (__paritydi2 for 64 bits). So if we used __builtin_parity,
     we would have to provide __paritysi2 and __paritydi2 for these architectures.
     Additionally, parity4/8/16 would have to be "__builtin_parity(x & mask)",
     and the "& mask" operation is totally unnecessary.
  3. For architectures that do have a popcount instruction, we do the same
     thing ourselves.
  4. For powerpc64, sparc64, and x86, we use runtime patching to select the
     popcount instruction when the CPU supports it.
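
The resulting generic interface folds constant arguments at compile time
and sends variable arguments to the architecture helpers, along these
lines (illustrative use only; "byte" is just some non-constant variable):

	parity8(0x7c);       /* constant: __const_parity8 folds this to 1 */
	parity8(byte);       /* non-constant: becomes __arch_parity8(byte) */
	PARITY(0x12345678);  /* compile-time only; non-constant input is a build error */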

I have compiled successfully with x86_64_defconfig, i386_defconfig, pseries_defconfig
and sparc64_defconfig, and I used the following code to test:

	#include <stdio.h>
	#include <stdlib.h>
	#include <stdint.h>
	#include <time.h>	/* for time(), used to seed srand() below */

	#ifdef __x86_64__
	/* popcnt %edi, %eax -- redundant REX prefix for alignment */
	#define POPCNT32 ".byte 0xf3,0x40,0x0f,0xb8,0xc7"
	/* popcnt %rdi, %rax */
	#define POPCNT64 ".byte 0xf3,0x48,0x0f,0xb8,0xc7"
	#define REG_IN "D"
	#define REG_OUT "a"
	#else
	/* popcnt %eax, %eax */
	#define POPCNT32 ".byte 0xf3,0x0f,0xb8,0xc0"
	#define REG_IN "a"
	#define REG_OUT "a"
	#endif

	static inline int c_parity4(unsigned int w)
	{
		w &= 0xf;
		return (0x6996 >> w) & 1;
	}

	static inline int c_parity8(unsigned int w)
	{
		w ^= w >> 4;
		return c_parity4(w);
	}

	static inline int c_parity16(unsigned int w)
	{
		w ^= w >> 8;
		return c_parity8(w);
	}

	static inline int c_parity32(unsigned int w)
	{
		w ^= w >> 16;
		return c_parity16(w);
	}

	static inline int c_parity64(uint64_t w)
	{
		return c_parity32((unsigned int)w ^ (unsigned int)(w >> 32));
	}

	static inline int asm_parity4(unsigned int w)
	{
		unsigned int res = 0;

		asm("test	$0xf, %1		\n"
			"setpo	%b0				\n"
			: "+q" (res)
			: "r" (w)
			: "cc");

		return res;
	}

	static inline int asm_parity8(unsigned int w)
	{
		unsigned int res = 0;

		asm("test	%1, %1			\n"
			"setpo	%b0			\n"
			: "+q" (res)
			: "r" (w)
			: "cc");

		return res;
	}

	static inline int asm_parity16(unsigned int w)
	{
		unsigned int res = 0;

		asm("xor	%h1, %b1		\n"
			"setpo	%b0			\n"
			: "+q" (res), "+q" (w)
			: : "cc");

		return res;
	}

	static inline int asm_parity32_1(unsigned int w)
	{
		unsigned int res;

		w ^= w >> 16;
		asm("xor	%%ah, %%al		\n"
			"mov	$0, %%eax		\n"
			"setpo	%%al			\n"
			: "=a" (res)
			: "a" (w)
			: "cc");

		return res;
	}

	static inline int asm_parity32_2(unsigned int w)
	{
		unsigned int res;

		asm(POPCNT32 "				\n"
			"andl	$1, %0			\n"
			: "="REG_OUT (res)
			: REG_IN (w)
			: "cc");

		return res;
	}

	#ifdef __x86_64__
	static inline int asm_parity64_1(uint64_t w)
	{
		unsigned int res = (unsigned int)w ^ (unsigned int)(w >> 32);

		res ^= res >> 16;
		asm("xor	%%ah, %%al		\n"
			"mov	$0, %%eax		\n"
			"setpo	%%al			\n"
			: "=a" (res)
			: "a" (res)
			: "cc");

		return res;
	}

	static inline int asm_parity64_2(uint64_t w)
	{
		unsigned int res;

		asm(POPCNT64 "				\n"
			"andl	$1, %0			\n"
			: "="REG_OUT (res)
			: REG_IN (w)
			: "cc");

		return res;
	}
	#else
	static inline int asm_parity64_1(uint64_t w)
	{
		return asm_parity32_1((unsigned int)(w >> 32) ^ (unsigned int)w);
	}

	static inline int asm_parity64_2(uint64_t w)
	{
		return asm_parity32_2((unsigned int)(w >> 32) ^ (unsigned int)w);
	}
	#endif

	int main(int argc, char **argv)
	{
		int ok = 1;
		int count = 1000, i;

		if (argc >= 2)
			count = atoi(argv[1]);

		srand((unsigned)time(NULL));

		for (i = 0; i < count; i++) {
			uint64_t w = rand() | (uint64_t)rand() << 32;
			int p4_1 = c_parity4(w);
			int p4_2 = asm_parity4(w);
			int p8_1 = c_parity8(w);
			int p8_2 = asm_parity8(w);
			int p16_1 = c_parity16(w);
			int p16_2 = asm_parity16(w);
			int p32_1 = c_parity32(w);
			int p32_2 = asm_parity32_1(w);
			int p32_3 = asm_parity32_2(w);
			int p64_1 = c_parity64(w);
			int p64_2 = asm_parity64_1(w);
			int p64_3 = asm_parity64_2(w);
			if (p4_1 != p4_2 ||
				p8_1 != p8_2 ||
				p16_1 != p16_2 ||
				p32_1 != p32_2 || p32_1 != p32_3 ||
				p64_1 != p64_2 || p64_1 != p64_3) {
				fprintf(stderr, "Err: %llx\n"
							"\tc_parity4 = %d, asm_parity4 = %d,\n"
							"\tc_parity8 = %d, asm_parity8 = %d,\n"
							"\tc_parity16 = %d, asm_parity16 = %d,\n"
							"\tc_parity32 = %d, asm_parity32_1 = %d, asm_parity32_2 = %d\n"
							"\tc_parity64 = %d, asm_parity64_1 = %d, asm_parity64_2 = %d\n",
							w, p4_1, p4_2, p8_1, p8_2, p16_1, p16_2, p32_1, p32_2, p32_3, p64_1, p64_2, p64_3);
				ok = 0;
			}
		}

		fprintf(stderr, "%s\n", ok ? "OK" : "FAIL");
		return 0;
	}
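
For reference, the 0x6996 constant used by __arch_parity4() and
__const_parity4() is a 16-entry bitwise lookup table: bit i of 0x6996
(binary 0110 1001 1001 0110) is the parity of i, so the parity of a
nibble is one shift and one mask away. Worked example for parity8(0xa5),
which has four bits set and therefore even parity:

	w = 0xa5;
	w ^= w >> 4;                /* 0xa5 ^ 0x0a = 0xaf, low nibble 0xf */
	(0x6996 >> (w & 0xf)) & 1;  /* bit 15 of 0x6996 is 0 -> parity 0  */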

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
---
 include/asm-generic/bitops.h              |  1 +
 include/asm-generic/bitops/arch_parity.h  | 39 +++++++++++++++++++++++++++++++
 include/asm-generic/bitops/const_parity.h | 36 ++++++++++++++++++++++++++++
 include/asm-generic/bitops/parity.h       |  7 ++++++
 include/linux/bitops.h                    |  5 ++++
 5 files changed, 88 insertions(+)
 create mode 100644 include/asm-generic/bitops/arch_parity.h
 create mode 100644 include/asm-generic/bitops/const_parity.h
 create mode 100644 include/asm-generic/bitops/parity.h

diff --git a/include/asm-generic/bitops.h b/include/asm-generic/bitops.h
index dcdcacf..d85722f 100644
--- a/include/asm-generic/bitops.h
+++ b/include/asm-generic/bitops.h
@@ -27,6 +27,7 @@
 #include <asm-generic/bitops/sched.h>
 #include <asm-generic/bitops/ffs.h>
 #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
 #include <asm-generic/bitops/lock.h>
 
 #include <asm-generic/bitops/atomic.h>
diff --git a/include/asm-generic/bitops/arch_parity.h b/include/asm-generic/bitops/arch_parity.h
new file mode 100644
index 0000000..cddc555
--- /dev/null
+++ b/include/asm-generic/bitops/arch_parity.h
@@ -0,0 +1,39 @@
+#ifndef _ASM_GENERIC_BITOPS_ARCH_PARITY_H_
+#define _ASM_GENERIC_BITOPS_ARCH_PARITY_H_
+
+#include <asm/types.h>
+
+/*
+ * Reference to 'https://graphics.stanford.edu/~seander/bithacks.html#ParityParallel'.
+ */
+
+static inline unsigned int __arch_parity4(unsigned int w)
+{
+	w &= 0xf;
+	return (0x6996 >> w) & 1;
+}
+
+static inline unsigned int __arch_parity8(unsigned int w)
+{
+	w ^= w >> 4;
+	return __arch_parity4(w);
+}
+
+static inline unsigned int __arch_parity16(unsigned int w)
+{
+	w ^= w >> 8;
+	return __arch_parity8(w);
+}
+
+static inline unsigned int __arch_parity32(unsigned int w)
+{
+	w ^= w >> 16;
+	return __arch_parity16(w);
+}
+
+static inline unsigned int __arch_parity64(__u64 w)
+{
+	return __arch_parity32((unsigned int)(w >> 32) ^ (unsigned int)w);
+}
+
+#endif /* _ASM_GENERIC_BITOPS_ARCH_PARITY_H_ */
diff --git a/include/asm-generic/bitops/const_parity.h b/include/asm-generic/bitops/const_parity.h
new file mode 100644
index 0000000..6af7987
--- /dev/null
+++ b/include/asm-generic/bitops/const_parity.h
@@ -0,0 +1,36 @@
+#ifndef _ASM_GENERIC_BITOPS_CONST_PARITY_H_
+#define _ASM_GENERIC_BITOPS_CONST_PARITY_H_
+
+/*
+ * Compile time versions of __arch_parityN()
+ */
+#define __const_parity4(w)   ((0x6996 >> ((w) & 0xf)) & 1)
+#define __const_parity8(w)   (__const_parity4((w) ^ ((w) >> 4)))
+#define __const_parity16(w)  (__const_parity8((w) ^ ((w) >> 8)))
+#define __const_parity32(w)  (__const_parity16((w) ^ ((w) >> 16)))
+#define __const_parity64(w)  (__const_parity32((w) ^ ((w) >> 32)))
+
+/*
+ * Generic interface.
+ */
+#define parity4(w)   (__builtin_constant_p(w) ? __const_parity4(w)  : __arch_parity4(w))
+#define parity8(w)   (__builtin_constant_p(w) ? __const_parity8(w)  : __arch_parity8(w))
+#define parity16(w)  (__builtin_constant_p(w) ? __const_parity16(w) : __arch_parity16(w))
+#define parity32(w)  (__builtin_constant_p(w) ? __const_parity32(w) : __arch_parity32(w))
+#define parity64(w)  (__builtin_constant_p(w) ? __const_parity64(w) : __arch_parity64(w))
+
+/*
+ * Interface for known constant arguments
+ */
+#define PARITY4(w)   (BUILD_BUG_ON_ZERO(!__builtin_constant_p(w)) + __const_parity4(w))
+#define PARITY8(w)   (BUILD_BUG_ON_ZERO(!__builtin_constant_p(w)) + __const_parity8(w))
+#define PARITY16(w)  (BUILD_BUG_ON_ZERO(!__builtin_constant_p(w)) + __const_parity16(w))
+#define PARITY32(w)  (BUILD_BUG_ON_ZERO(!__builtin_constant_p(w)) + __const_parity32(w))
+#define PARITY64(w)  (BUILD_BUG_ON_ZERO(!__builtin_constant_p(w)) + __const_parity64(w))
+
+/*
+ * Type invariant interface to the compile time constant parity functions.
+ */
+#define PARITY(w)    PARITY64((u64)(w))
+
+#endif /* _ASM_GENERIC_BITOPS_CONST_PARITY_H_ */
diff --git a/include/asm-generic/bitops/parity.h b/include/asm-generic/bitops/parity.h
new file mode 100644
index 0000000..a91dce7
--- /dev/null
+++ b/include/asm-generic/bitops/parity.h
@@ -0,0 +1,7 @@
+#ifndef _ASM_GENERIC_BITOPS_PARITY_H_
+#define _ASM_GENERIC_BITOPS_PARITY_H_
+
+#include <asm-generic/bitops/arch_parity.h>
+#include <asm-generic/bitops/const_parity.h>
+
+#endif /* _ASM_GENERIC_BITOPS_PARITY_H_ */
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index defeaac..8952f88 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -80,6 +80,11 @@ static __always_inline unsigned long hweight_long(unsigned long w)
 	return sizeof(w) == 4 ? hweight32(w) : hweight64(w);
 }
 
+static __always_inline unsigned int parity_long(unsigned long w)
+{
+	return sizeof(w) == 4 ? parity32(w) : parity64(w);
+}
+
 /**
  * rol64 - rotate a 64-bit value left
  * @word: value to rotate
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* Re: [PATCH V2 01/30] bitops: add parity functions
  2016-04-06  5:33   ` Zeng Zhaoxiu
@ 2016-04-06  8:24     ` Sam Ravnborg
  0 siblings, 0 replies; 84+ messages in thread
From: Sam Ravnborg @ 2016-04-06  8:24 UTC (permalink / raw)
  To: Zeng Zhaoxiu; +Cc: linux-kernel, linux-arch

On Wed, Apr 06, 2016 at 01:33:35PM +0800, Zeng Zhaoxiu wrote:
> On 2016-04-06 03:04, Sam Ravnborg wrote:
> >On Tue, Apr 05, 2016 at 10:06:21AM +0800, Zeng Zhaoxiu wrote:
> >>From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
> >>
> >>These patches provide generic and architecture-specific odd parity
> >>calculations.
> >Hi Zeng.
> >
> >Can you please fix your mail script.
> >I see only 1/30 (sent to linux-arch) - and the patch looks mangled (broken lines).
> >No mail was sent to sparclinux - but sparc was mentioned.
> >
> >git send-email usually does the trick.
> >
> >	Sam
> 
> When I do "git send-email", I got:
> ...
> 5.7.14 JTibJDWdGxPcfa-E9KgtF-grMQl9w> Please log in via your web browser and
> 5.7.14 then try again.
> 5.7.14  Learn more at
> 5.7.14  https://support.google.com/mail/answer/78754 zp5sm464879pac.9 - gsmtp
> ...
> 
> So I used Thunderbird to send the mail instead, but it wrapped the text incorrectly.
Please try to search for "How to configure git send-email to use Gmail"
and follow the instructions given in some of the links.
Test with your own mail address before you send to a list.
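
Typically that boils down to something like this in ~/.gitconfig
(illustrative values only - adjust the user, and note that Gmail may
want an application-specific password if two-factor auth is enabled):

	[sendemail]
		smtpServer = smtp.gmail.com
		smtpServerPort = 587
		smtpEncryption = tls
		smtpUser = your.name@gmail.com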

Also linux-arch only saw 1/30 - make sure to address this too.

	Sam

^ permalink raw reply	[flat|nested] 84+ messages in thread

* [PATCH v2 02/30] Include generic parity.h in some architectures' bitops.h
  2016-04-05  4:23 ` [PATCH V2 02/30] Include generic parity.h in some architectures' bitops.h Zeng Zhaoxiu
@ 2016-04-06  8:41   ` zengzhaoxiu
  0 siblings, 0 replies; 84+ messages in thread
From: zengzhaoxiu @ 2016-04-06  8:41 UTC (permalink / raw)
  To: vgupta, linux, catalin.marinas, will.deacon, hskinnemoen,
	egtvedt, msalter, a-jacquiot, starvik, jesper.nilsson, ysato,
	rkuo, geert, james.hogan, dhowells, yasutake.koichi, jonas, jejb,
	deller, schwidefsky, heiko.carstens, dalias, chris, jcmvbkbc,
	peterz, yamada.masahiro, tglx, brueckner
  Cc: linux-kernel, linux-snps-arc, linux-arm-kernel, linux-c6x-dev,
	linux-cris-kernel, uclinux-h8-devel, linux-hexagon, linux-m68k,
	linux-metag, linux-am33-list, linux, linux-parisc, linux-s390,
	linux-sh, linux-xtensa, Zhaoxiu Zeng

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

Use the generic version

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
---
 arch/arc/include/asm/bitops.h      | 1 +
 arch/arm/include/asm/bitops.h      | 1 +
 arch/arm64/include/asm/bitops.h    | 1 +
 arch/avr32/include/asm/bitops.h    | 1 +
 arch/c6x/include/asm/bitops.h      | 1 +
 arch/cris/include/asm/bitops.h     | 1 +
 arch/frv/include/asm/bitops.h      | 1 +
 arch/h8300/include/asm/bitops.h    | 1 +
 arch/hexagon/include/asm/bitops.h  | 1 +
 arch/m32r/include/asm/bitops.h     | 1 +
 arch/m68k/include/asm/bitops.h     | 1 +
 arch/metag/include/asm/bitops.h    | 1 +
 arch/mn10300/include/asm/bitops.h  | 1 +
 arch/openrisc/include/asm/bitops.h | 1 +
 arch/parisc/include/asm/bitops.h   | 1 +
 arch/s390/include/asm/bitops.h     | 1 +
 arch/sh/include/asm/bitops.h       | 1 +
 arch/xtensa/include/asm/bitops.h   | 1 +
 18 files changed, 18 insertions(+)

diff --git a/arch/arc/include/asm/bitops.h b/arch/arc/include/asm/bitops.h
index 0352fb8..7967e47 100644
--- a/arch/arc/include/asm/bitops.h
+++ b/arch/arc/include/asm/bitops.h
@@ -370,6 +370,7 @@ static inline __attribute__ ((const)) int __ffs(unsigned long x)
 #define ffz(x)	__ffs(~(x))
 
 #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
 #include <asm-generic/bitops/fls64.h>
 #include <asm-generic/bitops/sched.h>
 #include <asm-generic/bitops/lock.h>
diff --git a/arch/arm/include/asm/bitops.h b/arch/arm/include/asm/bitops.h
index e943e6c..99f28a6 100644
--- a/arch/arm/include/asm/bitops.h
+++ b/arch/arm/include/asm/bitops.h
@@ -313,6 +313,7 @@ static inline unsigned long __ffs(unsigned long x)
 
 #include <asm-generic/bitops/sched.h>
 #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
 #include <asm-generic/bitops/lock.h>
 
 #ifdef __ARMEB__
diff --git a/arch/arm64/include/asm/bitops.h b/arch/arm64/include/asm/bitops.h
index 9c19594..eac4965 100644
--- a/arch/arm64/include/asm/bitops.h
+++ b/arch/arm64/include/asm/bitops.h
@@ -44,6 +44,7 @@ extern int test_and_change_bit(int nr, volatile unsigned long *p);
 
 #include <asm-generic/bitops/sched.h>
 #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
 #include <asm-generic/bitops/lock.h>
 
 #include <asm-generic/bitops/non-atomic.h>
diff --git a/arch/avr32/include/asm/bitops.h b/arch/avr32/include/asm/bitops.h
index 910d537..9f4a2ce 100644
--- a/arch/avr32/include/asm/bitops.h
+++ b/arch/avr32/include/asm/bitops.h
@@ -298,6 +298,7 @@ static inline int ffs(unsigned long word)
 #include <asm-generic/bitops/fls64.h>
 #include <asm-generic/bitops/sched.h>
 #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
 #include <asm-generic/bitops/lock.h>
 
 extern unsigned long find_next_zero_bit_le(const void *addr,
diff --git a/arch/c6x/include/asm/bitops.h b/arch/c6x/include/asm/bitops.h
index f0ab012..94eb0d1 100644
--- a/arch/c6x/include/asm/bitops.h
+++ b/arch/c6x/include/asm/bitops.h
@@ -87,6 +87,7 @@ static inline int ffs(int x)
 
 #include <asm-generic/bitops/sched.h>
 #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
 #include <asm-generic/bitops/lock.h>
 
 #include <asm-generic/bitops/atomic.h>
diff --git a/arch/cris/include/asm/bitops.h b/arch/cris/include/asm/bitops.h
index 8062cb5..06bc246 100644
--- a/arch/cris/include/asm/bitops.h
+++ b/arch/cris/include/asm/bitops.h
@@ -36,6 +36,7 @@
 #include <asm-generic/bitops/__fls.h>
 #include <asm-generic/bitops/fls64.h>
 #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
 #include <asm-generic/bitops/find.h>
 #include <asm-generic/bitops/lock.h>
 
diff --git a/arch/frv/include/asm/bitops.h b/arch/frv/include/asm/bitops.h
index 0df8e95..f2a7ee8 100644
--- a/arch/frv/include/asm/bitops.h
+++ b/arch/frv/include/asm/bitops.h
@@ -314,6 +314,7 @@ int __ilog2_u64(u64 n)
 
 #include <asm-generic/bitops/sched.h>
 #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
 #include <asm-generic/bitops/lock.h>
 
 #include <asm-generic/bitops/le.h>
diff --git a/arch/h8300/include/asm/bitops.h b/arch/h8300/include/asm/bitops.h
index 05999ab..e392db2 100644
--- a/arch/h8300/include/asm/bitops.h
+++ b/arch/h8300/include/asm/bitops.h
@@ -172,6 +172,7 @@ static inline unsigned long __ffs(unsigned long word)
 #include <asm-generic/bitops/find.h>
 #include <asm-generic/bitops/sched.h>
 #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
 #include <asm-generic/bitops/lock.h>
 #include <asm-generic/bitops/le.h>
 #include <asm-generic/bitops/ext2-atomic.h>
diff --git a/arch/hexagon/include/asm/bitops.h b/arch/hexagon/include/asm/bitops.h
index 5e4a59b..2df614e 100644
--- a/arch/hexagon/include/asm/bitops.h
+++ b/arch/hexagon/include/asm/bitops.h
@@ -290,6 +290,7 @@ static inline unsigned long __fls(unsigned long word)
 #include <asm-generic/bitops/fls64.h>
 #include <asm-generic/bitops/sched.h>
 #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
 
 #include <asm-generic/bitops/le.h>
 #include <asm-generic/bitops/ext2-atomic.h>
diff --git a/arch/m32r/include/asm/bitops.h b/arch/m32r/include/asm/bitops.h
index 86ba2b4..e3cf46b 100644
--- a/arch/m32r/include/asm/bitops.h
+++ b/arch/m32r/include/asm/bitops.h
@@ -259,6 +259,7 @@ static __inline__ int test_and_change_bit(int nr, volatile void * addr)
 #include <asm-generic/bitops/find.h>
 #include <asm-generic/bitops/ffs.h>
 #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
 #include <asm-generic/bitops/lock.h>
 
 #endif /* __KERNEL__ */
diff --git a/arch/m68k/include/asm/bitops.h b/arch/m68k/include/asm/bitops.h
index b4a9b0d..fd673ea 100644
--- a/arch/m68k/include/asm/bitops.h
+++ b/arch/m68k/include/asm/bitops.h
@@ -519,6 +519,7 @@ static inline int __fls(int x)
 #include <asm-generic/bitops/fls64.h>
 #include <asm-generic/bitops/sched.h>
 #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
 #include <asm-generic/bitops/lock.h>
 #endif /* __KERNEL__ */
 
diff --git a/arch/metag/include/asm/bitops.h b/arch/metag/include/asm/bitops.h
index 2671134..ad13087 100644
--- a/arch/metag/include/asm/bitops.h
+++ b/arch/metag/include/asm/bitops.h
@@ -118,6 +118,7 @@ static inline int test_and_change_bit(unsigned int bit,
 #include <asm-generic/bitops/__fls.h>
 #include <asm-generic/bitops/fls64.h>
 #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
 #include <asm-generic/bitops/lock.h>
 #include <asm-generic/bitops/sched.h>
 #include <asm-generic/bitops/le.h>
diff --git a/arch/mn10300/include/asm/bitops.h b/arch/mn10300/include/asm/bitops.h
index fe6f8e2..60761b7 100644
--- a/arch/mn10300/include/asm/bitops.h
+++ b/arch/mn10300/include/asm/bitops.h
@@ -225,6 +225,7 @@ int ffs(int x)
 #include <asm-generic/bitops/find.h>
 #include <asm-generic/bitops/sched.h>
 #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
 #include <asm-generic/bitops/ext2-atomic-setbit.h>
 #include <asm-generic/bitops/le.h>
 
diff --git a/arch/openrisc/include/asm/bitops.h b/arch/openrisc/include/asm/bitops.h
index 3003cda..8c97642 100644
--- a/arch/openrisc/include/asm/bitops.h
+++ b/arch/openrisc/include/asm/bitops.h
@@ -43,6 +43,7 @@
 #include <asm-generic/bitops/sched.h>
 #include <asm/bitops/ffs.h>
 #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
 #include <asm-generic/bitops/lock.h>
 
 #include <asm-generic/bitops/atomic.h>
diff --git a/arch/parisc/include/asm/bitops.h b/arch/parisc/include/asm/bitops.h
index 3f9406d..867ba10 100644
--- a/arch/parisc/include/asm/bitops.h
+++ b/arch/parisc/include/asm/bitops.h
@@ -211,6 +211,7 @@ static __inline__ int fls(int x)
 #include <asm-generic/bitops/__fls.h>
 #include <asm-generic/bitops/fls64.h>
 #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
 #include <asm-generic/bitops/lock.h>
 #include <asm-generic/bitops/sched.h>
 
diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h
index 8043f10..198eead 100644
--- a/arch/s390/include/asm/bitops.h
+++ b/arch/s390/include/asm/bitops.h
@@ -460,6 +460,7 @@ static inline int fls(int word)
 #include <asm-generic/bitops/ffz.h>
 #include <asm-generic/bitops/find.h>
 #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
 #include <asm-generic/bitops/sched.h>
 #include <asm-generic/bitops/le.h>
 #include <asm-generic/bitops/ext2-atomic-setbit.h>
diff --git a/arch/sh/include/asm/bitops.h b/arch/sh/include/asm/bitops.h
index fc8e652..4bf0c35 100644
--- a/arch/sh/include/asm/bitops.h
+++ b/arch/sh/include/asm/bitops.h
@@ -86,6 +86,7 @@ static inline unsigned long ffz(unsigned long word)
 #include <asm-generic/bitops/find.h>
 #include <asm-generic/bitops/ffs.h>
 #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
 #include <asm-generic/bitops/lock.h>
 #include <asm-generic/bitops/sched.h>
 #include <asm-generic/bitops/le.h>
diff --git a/arch/xtensa/include/asm/bitops.h b/arch/xtensa/include/asm/bitops.h
index 3f44fa2..981fa83 100644
--- a/arch/xtensa/include/asm/bitops.h
+++ b/arch/xtensa/include/asm/bitops.h
@@ -229,6 +229,7 @@ test_and_change_bit(unsigned int bit, volatile unsigned long *p)
 #include <asm-generic/bitops/ext2-atomic-setbit.h>
 
 #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
 #include <asm-generic/bitops/lock.h>
 #include <asm-generic/bitops/sched.h>
 
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v2 03/30] Add alpha-specific parity functions
  2016-04-05  2:06 [PATCH V2 01/30] bitops: add parity functions Zeng Zhaoxiu
  2016-04-05  4:23 ` [PATCH V2 02/30] Include generic parity.h in some architectures' bitops.h Zeng Zhaoxiu
  2016-04-05 19:04 ` [PATCH V2 01/30] bitops: add parity functions Sam Ravnborg
@ 2016-04-06  8:46 ` zengzhaoxiu
  2016-04-06  8:53 ` [PATCH v2 04/30] Add blackfin-specific " zengzhaoxiu
                   ` (26 subsequent siblings)
  29 siblings, 0 replies; 84+ messages in thread
From: zengzhaoxiu @ 2016-04-06  8:46 UTC (permalink / raw)
  To: rth, ink, mattst88; +Cc: linux-alpha, linux-kernel, Zhaoxiu Zeng

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
---
 arch/alpha/include/asm/bitops.h | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/arch/alpha/include/asm/bitops.h b/arch/alpha/include/asm/bitops.h
index 4bdfbd4..95a43fa 100644
--- a/arch/alpha/include/asm/bitops.h
+++ b/arch/alpha/include/asm/bitops.h
@@ -421,11 +421,38 @@ static inline unsigned int __arch_hweight8(unsigned int w)
 {
 	return __arch_hweight64(w & 0xff);
 }
+
+static inline unsigned int __arch_parity64(unsigned long w)
+{
+	return (unsigned int)__kernel_ctpop(w) & 1;
+}
+
+static inline unsigned int __arch_parity32(unsigned int w)
+{
+	return __arch_parity64(w);
+}
+
+static inline unsigned int __arch_parity16(unsigned int w)
+{
+	return __arch_parity64(w & 0xffff);
+}
+
+static inline unsigned int __arch_parity8(unsigned int w)
+{
+	return __arch_parity64(w & 0xff);
+}
+
+static inline unsigned int __arch_parity4(unsigned int w)
+{
+	return __arch_parity64(w & 0xf);
+}
 #else
 #include <asm-generic/bitops/arch_hweight.h>
+#include <asm-generic/bitops/arch_parity.h>
 #endif
 
 #include <asm-generic/bitops/const_hweight.h>
+#include <asm-generic/bitops/const_parity.h>
 
 #endif /* __KERNEL__ */
 
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v2 04/30] Add blackfin-specific parity functions
  2016-04-05  2:06 [PATCH V2 01/30] bitops: add parity functions Zeng Zhaoxiu
                   ` (2 preceding siblings ...)
  2016-04-06  8:46 ` [PATCH v2 03/30] Add alpha-specific " zengzhaoxiu
@ 2016-04-06  8:53 ` zengzhaoxiu
  2016-04-06  8:57 ` [PATCH v2 05/30] Add ia64-specific " zengzhaoxiu
                   ` (25 subsequent siblings)
  29 siblings, 0 replies; 84+ messages in thread
From: zengzhaoxiu @ 2016-04-06  8:53 UTC (permalink / raw)
  To: realmz6; +Cc: adi-buildroot-devel, linux-kernel, Zhaoxiu Zeng

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
---
 arch/blackfin/include/asm/bitops.h | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/arch/blackfin/include/asm/bitops.h b/arch/blackfin/include/asm/bitops.h
index b298b65..6609b7e 100644
--- a/arch/blackfin/include/asm/bitops.h
+++ b/arch/blackfin/include/asm/bitops.h
@@ -23,6 +23,7 @@
 #include <asm-generic/bitops/sched.h>
 #include <asm-generic/bitops/ffs.h>
 #include <asm-generic/bitops/const_hweight.h>
+#include <asm-generic/bitops/const_parity.h>
 #include <asm-generic/bitops/lock.h>
 
 #include <asm-generic/bitops/ext2-atomic.h>
@@ -137,4 +138,34 @@ static inline unsigned int __arch_hweight8(unsigned int w)
 	return __arch_hweight32(w & 0xff);
 }
 
+/*
+ * parityN: returns the parity of an N-bit word,
+ * i.e. the number of 1-bits in w modulo 2.
+ */
+
+static inline unsigned int __arch_parity32(unsigned int w)
+{
+	return __arch_hweight32(w) & 1;
+}
+
+static inline unsigned int __arch_parity64(__u64 w)
+{
+	return __arch_parity32((unsigned int)(w >> 32) ^ (unsigned int)w);
+}
+
+static inline unsigned int __arch_parity16(unsigned int w)
+{
+	return __arch_parity32(w & 0xffff);
+}
+
+static inline unsigned int __arch_parity8(unsigned int w)
+{
+	return __arch_parity32(w & 0xff);
+}
+
+static inline unsigned int __arch_parity4(unsigned int w)
+{
+	return __arch_parity32(w & 0xf);
+}
+
 #endif				/* _BLACKFIN_BITOPS_H */
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v2 05/30] Add ia64-specific parity functions
  2016-04-05  2:06 [PATCH V2 01/30] bitops: add parity functions Zeng Zhaoxiu
                   ` (3 preceding siblings ...)
  2016-04-06  8:53 ` [PATCH v2 04/30] Add blackfin-specific " zengzhaoxiu
@ 2016-04-06  8:57 ` zengzhaoxiu
  2016-04-06  8:59 ` [PATCH v2 06/30] Add mips-specific " zengzhaoxiu
                   ` (24 subsequent siblings)
  29 siblings, 0 replies; 84+ messages in thread
From: zengzhaoxiu @ 2016-04-06  8:57 UTC (permalink / raw)
  To: tony.luck, fenghua.yu; +Cc: linux-ia64, linux-kernel, Zhaoxiu Zeng

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
---
 arch/ia64/include/asm/bitops.h | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/arch/ia64/include/asm/bitops.h b/arch/ia64/include/asm/bitops.h
index 71e8145..041d1d6 100644
--- a/arch/ia64/include/asm/bitops.h
+++ b/arch/ia64/include/asm/bitops.h
@@ -439,6 +439,37 @@ static __inline__ unsigned long __arch_hweight64(unsigned long x)
 
 #include <asm-generic/bitops/const_hweight.h>
 
+/*
+ * parityN: returns the parity of an N-bit word,
+ * i.e. the number of 1-bits in x modulo 2.
+ */
+static __inline__ unsigned int __arch_parity64(unsigned long x)
+{
+	return (unsigned int)ia64_popcnt(x) & 1;
+}
+
+static __inline__ unsigned int __arch_parity32(unsigned int x)
+{
+	return __arch_parity64((unsigned long)x << 32);
+}
+
+static __inline__ unsigned int __arch_parity16(unsigned int x)
+{
+	return __arch_parity64((unsigned long)x << 48);
+}
+
+static __inline__ unsigned int __arch_parity8(unsigned int x)
+{
+	return __arch_parity64((unsigned long)x << 56);
+}
+
+static __inline__ unsigned int __arch_parity4(unsigned int x)
+{
+	return __arch_parity64((unsigned long)x << 60);
+}
+
+#include <asm-generic/bitops/const_parity.h>
+
 #endif /* __KERNEL__ */
 
 #include <asm-generic/bitops/find.h>
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v2 06/30] Add mips-specific parity functions
  2016-04-05  2:06 [PATCH V2 01/30] bitops: add parity functions Zeng Zhaoxiu
                   ` (4 preceding siblings ...)
  2016-04-06  8:57 ` [PATCH v2 05/30] Add ia64-specific " zengzhaoxiu
@ 2016-04-06  8:59 ` zengzhaoxiu
  2016-04-06 10:23   ` zengzhaoxiu
  2016-04-06  9:03 ` [PATCH v2 07/30] Add powerpc-specific " zengzhaoxiu
                   ` (23 subsequent siblings)
  29 siblings, 1 reply; 84+ messages in thread
From: zengzhaoxiu @ 2016-04-06  8:59 UTC (permalink / raw)
  To: ralf, Leonid.Yegoshin, macro; +Cc: linux-mips, linux-kernel, Zhaoxiu Zeng

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

Lifted from arch_hweight.h

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
---
 arch/mips/include/asm/arch_parity.h | 44 +++++++++++++++++++++++++++++++++++++
 arch/mips/include/asm/bitops.h      |  3 +++
 2 files changed, 47 insertions(+)
 create mode 100644 arch/mips/include/asm/arch_parity.h

diff --git a/arch/mips/include/asm/arch_parity.h b/arch/mips/include/asm/arch_parity.h
new file mode 100644
index 0000000..23b3c23
--- /dev/null
+++ b/arch/mips/include/asm/arch_parity.h
@@ -0,0 +1,44 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ */
+#ifndef _ASM_ARCH_PARITY_H
+#define _ASM_ARCH_PARITY_H
+
+#ifdef ARCH_HAS_USABLE_BUILTIN_POPCOUNT
+
+#include <asm/types.h>
+
+static inline unsigned int __arch_parity32(unsigned int w)
+{
+	return __builtin_popcount(w) & 1;
+}
+
+static inline unsigned int __arch_parity16(unsigned int w)
+{
+	return __arch_parity32(w & 0xffff);
+}
+
+static inline unsigned int __arch_parity8(unsigned int w)
+{
+	return __arch_parity32(w & 0xff);
+}
+
+static inline unsigned int __arch_parity4(unsigned int w)
+{
+	return __arch_parity32(w & 0xf);
+}
+
+static inline unsigned int __arch_parity64(__u64 w)
+{
+	return (unsigned int)__builtin_popcountll(w) & 1;
+}
+
+#else
+#include <asm-generic/bitops/arch_hweight.h>
+#include <asm-generic/bitops/arch_parity.h>
+#endif
+
+#endif /* _ASM_ARCH_PARITY_H */
diff --git a/arch/mips/include/asm/bitops.h b/arch/mips/include/asm/bitops.h
index ce9666c..0b87734 100644
--- a/arch/mips/include/asm/bitops.h
+++ b/arch/mips/include/asm/bitops.h
@@ -626,6 +626,9 @@ static inline int ffs(int word)
 #include <asm/arch_hweight.h>
 #include <asm-generic/bitops/const_hweight.h>
 
+#include <asm/arch_parity.h>
+#include <asm-generic/bitops/const_parity.h>
+
 #include <asm-generic/bitops/le.h>
 #include <asm-generic/bitops/ext2-atomic.h>
 
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v2 07/30] Add powerpc-specific parity functions
  2016-04-05  2:06 [PATCH V2 01/30] bitops: add parity functions Zeng Zhaoxiu
                   ` (5 preceding siblings ...)
  2016-04-06  8:59 ` [PATCH v2 06/30] Add mips-specific " zengzhaoxiu
@ 2016-04-06  9:03 ` zengzhaoxiu
  2016-04-06  9:07 ` [PATCH v2 08/30] Add sparc-specific " zengzhaoxiu
                   ` (22 subsequent siblings)
  29 siblings, 0 replies; 84+ messages in thread
From: zengzhaoxiu @ 2016-04-06  9:03 UTC (permalink / raw)
  To: benh, paulus, mpe, anton, oss, christophe.leroy, duwe
  Cc: linuxppc-dev, linux-kernel, Zhaoxiu Zeng

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

Use runtime patching for ppc64, lifted from hweight_64
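
The non-POPCNT fallback folds the word down to a nibble with xor/shift
and then indexes the constant 0x6996, which acts as a 16-entry parity
lookup table (bit n of 0x6996 is the parity of n).  A minimal C sketch
of the same idea, for illustration only (the helper name is made up and
not part of this patch):

	static inline unsigned int nibble_parity(unsigned int w)
	{
		/* 0x6996 = 0110 1001 1001 0110b: bit n holds the parity of n */
		return (0x6996 >> (w & 0xf)) & 1;
	}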

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
---
 arch/powerpc/include/asm/bitops.h |  11 ++++
 arch/powerpc/lib/Makefile         |   2 +-
 arch/powerpc/lib/parity_64.S      | 107 ++++++++++++++++++++++++++++++++++++++
 arch/powerpc/lib/ppc_ksyms.c      |   5 ++
 4 files changed, 124 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/lib/parity_64.S

diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h
index 59abc62..cb92783 100644
--- a/arch/powerpc/include/asm/bitops.h
+++ b/arch/powerpc/include/asm/bitops.h
@@ -269,8 +269,19 @@ unsigned int __arch_hweight16(unsigned int w);
 unsigned int __arch_hweight32(unsigned int w);
 unsigned long __arch_hweight64(__u64 w);
 #include <asm-generic/bitops/const_hweight.h>
+static inline unsigned int __arch_parity4(unsigned int w)
+{
+	w &= 0xf;
+	return (0x6996 >> w) & 1;
+}
+unsigned int __arch_parity8(unsigned int w);
+unsigned int __arch_parity16(unsigned int w);
+unsigned int __arch_parity32(unsigned int w);
+unsigned int __arch_parity64(__u64 w);
+#include <asm-generic/bitops/const_parity.h>
 #else
 #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
 #endif
 
 #include <asm-generic/bitops/find.h>
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index ba21be1..cae2e7f 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -16,7 +16,7 @@ obj-$(CONFIG_PPC32)	+= div64.o copy_32.o
 
 obj64-y	+= copypage_64.o copyuser_64.o usercopy_64.o mem_64.o hweight_64.o \
 	   copyuser_power7.o string_64.o copypage_power7.o memcpy_power7.o \
-	   memcpy_64.o memcmp_64.o
+	   memcpy_64.o memcmp_64.o parity_64.o
 
 obj64-$(CONFIG_SMP)	+= locks.o
 obj64-$(CONFIG_ALTIVEC)	+= vmx-helper.o
diff --git a/arch/powerpc/lib/parity_64.S b/arch/powerpc/lib/parity_64.S
new file mode 100644
index 0000000..f8a2771
--- /dev/null
+++ b/arch/powerpc/lib/parity_64.S
@@ -0,0 +1,107 @@
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+
+/* Note: This code relies on -mminimal-toc */
+
+_GLOBAL(__arch_parity8)
+BEGIN_FTR_SECTION
+	srdi	r4,r3,4
+	xor	r3,r3,r4
+	clrldi	r3,r3,64-4
+	li	r4,0x6996
+	srd	r3,r4,r3
+	clrldi	r3,r3,64-1
+	blr
+FTR_SECTION_ELSE
+	PPC_POPCNTB(R3,R3)
+	clrldi  r3,r3,64-1
+	blr
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB)
+
+_GLOBAL(__arch_parity16)
+BEGIN_FTR_SECTION
+	srdi	r4,r3,8
+	xor	r3,r3,r4
+	srdi	r4,r3,4
+	xor	r3,r3,r4
+	clrldi	r3,r3,64-4
+	li	r4,0x6996
+	srd	r3,r4,r3
+	clrldi	r3,r3,64-1
+	blr
+FTR_SECTION_ELSE
+  BEGIN_FTR_SECTION_NESTED(50)
+	PPC_POPCNTB(R3,R3)
+	srdi	r4,r3,8
+	add	r3,r4,r3
+	clrldi	r3,r3,64-1
+	blr
+  FTR_SECTION_ELSE_NESTED(50)
+	clrlwi  r3,r3,16
+	PPC_POPCNTW(R3,R3)
+	clrldi	r3,r3,64-1
+	blr
+  ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 50)
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB)
+
+_GLOBAL(__arch_parity32)
+BEGIN_FTR_SECTION
+	srdi	r4,r3,16
+	xor	r3,r3,r4
+	srdi	r4,r3,8
+	xor	r3,r3,r4
+	srdi	r4,r3,4
+	xor	r3,r3,r4
+	clrldi	r3,r3,64-4
+	li	r4,0x6996
+	srd	r3,r4,r3
+	clrldi	r3,r3,64-1
+	blr
+FTR_SECTION_ELSE
+  BEGIN_FTR_SECTION_NESTED(51)
+	PPC_POPCNTB(R3,R3)
+	srdi	r4,r3,16
+	add	r3,r4,r3
+	srdi	r4,r3,8
+	add	r3,r4,r3
+	clrldi	r3,r3,64-1
+	blr
+  FTR_SECTION_ELSE_NESTED(51)
+	PPC_POPCNTW(R3,R3)
+	clrldi	r3,r3,64-1
+	blr
+  ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 51)
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB)
+
+_GLOBAL(__arch_parity64)
+BEGIN_FTR_SECTION
+	srdi	r4,r3,32
+	xor	r3,r3,r4
+	srdi	r4,r3,16
+	xor	r3,r3,r4
+	srdi	r4,r3,8
+	xor	r3,r3,r4
+	srdi	r4,r3,4
+	xor	r3,r3,r4
+	clrldi	r3,r3,64-4
+	li	r4,0x6996
+	srd	r3,r4,r3
+	clrldi	r3,r3,64-1
+	blr
+FTR_SECTION_ELSE
+  BEGIN_FTR_SECTION_NESTED(52)
+	PPC_POPCNTB(R3,R3)
+	srdi	r4,r3,32
+	add	r3,r4,r3
+	srdi	r4,r3,16
+	add	r3,r4,r3
+	srdi	r4,r3,8
+	add	r3,r4,r3
+	clrldi	r3,r3,64-1
+	blr
+  FTR_SECTION_ELSE_NESTED(52)
+	PPC_POPCNTD(R3,R3)
+	clrldi	r3,r3,64-1
+	blr
+  ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 52)
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB)
diff --git a/arch/powerpc/lib/ppc_ksyms.c b/arch/powerpc/lib/ppc_ksyms.c
index c422812..1ccfc29 100644
--- a/arch/powerpc/lib/ppc_ksyms.c
+++ b/arch/powerpc/lib/ppc_ksyms.c
@@ -30,4 +30,9 @@ EXPORT_SYMBOL(__arch_hweight8);
 EXPORT_SYMBOL(__arch_hweight16);
 EXPORT_SYMBOL(__arch_hweight32);
 EXPORT_SYMBOL(__arch_hweight64);
+
+EXPORT_SYMBOL(__arch_parity8);
+EXPORT_SYMBOL(__arch_parity16);
+EXPORT_SYMBOL(__arch_parity32);
+EXPORT_SYMBOL(__arch_parity64);
 #endif
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v2 08/30] Add sparc-specific parity functions
  2016-04-05  2:06 [PATCH V2 01/30] bitops: add parity functions Zeng Zhaoxiu
                   ` (6 preceding siblings ...)
  2016-04-06  9:03 ` [PATCH v2 07/30] Add powerpc-specific " zengzhaoxiu
@ 2016-04-06  9:07 ` zengzhaoxiu
  2016-04-06 18:44   ` Sam Ravnborg
  2016-04-06  9:08 ` [PATCH v2 09/30] Add tile-specific " zengzhaoxiu
                   ` (21 subsequent siblings)
  29 siblings, 1 reply; 84+ messages in thread
From: zengzhaoxiu @ 2016-04-06  9:07 UTC (permalink / raw)
  To: davem, wim.coekaerts, linux, julian.calaby, sam
  Cc: sparclinux, linux-kernel, Zhaoxiu Zeng

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

Use runtime patching for sparc64, lifted from hweight
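
In the popc-patched sequences, popc counts bits across the whole 64-bit
register, so the word is first shifted left by (64 - N) to discard the
bits above the N-bit value being tested.  For illustration only
(assuming a 64-bit long, as on sparc64; not part of this patch):

	static inline unsigned int parity8_popc_equiv(unsigned long w)
	{
		/* keep only bits 0..7, count them, take the low bit */
		return __builtin_popcountl(w << (64 - 8)) & 1;
	}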

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
---
 arch/sparc/include/asm/bitops_32.h |  1 +
 arch/sparc/include/asm/bitops_64.h | 18 ++++++++
 arch/sparc/kernel/sparc_ksyms_64.c |  6 +++
 arch/sparc/lib/Makefile            |  2 +-
 arch/sparc/lib/parity.S            | 93 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 119 insertions(+), 1 deletion(-)
 create mode 100644 arch/sparc/lib/parity.S

diff --git a/arch/sparc/include/asm/bitops_32.h b/arch/sparc/include/asm/bitops_32.h
index 600ed1d..8c41896 100644
--- a/arch/sparc/include/asm/bitops_32.h
+++ b/arch/sparc/include/asm/bitops_32.h
@@ -98,6 +98,7 @@ static inline void change_bit(unsigned long nr, volatile unsigned long *addr)
 #include <asm-generic/bitops/__fls.h>
 #include <asm-generic/bitops/fls64.h>
 #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/parity.h>
 #include <asm-generic/bitops/lock.h>
 #include <asm-generic/bitops/find.h>
 #include <asm-generic/bitops/le.h>
diff --git a/arch/sparc/include/asm/bitops_64.h b/arch/sparc/include/asm/bitops_64.h
index 2d52240..5312fed 100644
--- a/arch/sparc/include/asm/bitops_64.h
+++ b/arch/sparc/include/asm/bitops_64.h
@@ -47,6 +47,24 @@ unsigned int __arch_hweight16(unsigned int w);
 unsigned int __arch_hweight8(unsigned int w);
 
 #include <asm-generic/bitops/const_hweight.h>
+
+/*
+ * parityN: returns the parity of a N-bit word,
+ * i.e. the number of 1-bits in w modulo 2.
+ */
+
+static inline unsigned int __arch_parity4(unsigned int w)
+{
+	w &= 0xf;
+	return (0x6996 >> w) & 1;
+}
+unsigned int __arch_parity8(unsigned int w);
+unsigned int __arch_parity16(unsigned int w);
+unsigned int __arch_parity32(unsigned int w);
+unsigned int __arch_parity64(__u64 w);
+
+#include <asm-generic/bitops/const_parity.h>
+
 #include <asm-generic/bitops/lock.h>
 #endif /* __KERNEL__ */
 
diff --git a/arch/sparc/kernel/sparc_ksyms_64.c b/arch/sparc/kernel/sparc_ksyms_64.c
index 9e034f2..7ee0818 100644
--- a/arch/sparc/kernel/sparc_ksyms_64.c
+++ b/arch/sparc/kernel/sparc_ksyms_64.c
@@ -45,6 +45,12 @@ EXPORT_SYMBOL(__arch_hweight16);
 EXPORT_SYMBOL(__arch_hweight32);
 EXPORT_SYMBOL(__arch_hweight64);
 
+/* from parity.S */
+EXPORT_SYMBOL(__arch_parity8);
+EXPORT_SYMBOL(__arch_parity16);
+EXPORT_SYMBOL(__arch_parity32);
+EXPORT_SYMBOL(__arch_parity64);
+
 /* from ffs_ffz.S */
 EXPORT_SYMBOL(ffs);
 EXPORT_SYMBOL(__ffs);
diff --git a/arch/sparc/lib/Makefile b/arch/sparc/lib/Makefile
index 3269b02..2dbbcb1 100644
--- a/arch/sparc/lib/Makefile
+++ b/arch/sparc/lib/Makefile
@@ -39,7 +39,7 @@ lib-$(CONFIG_SPARC64) += GENmemcpy.o GENcopy_from_user.o GENcopy_to_user.o
 lib-$(CONFIG_SPARC64) += GENpatch.o GENpage.o GENbzero.o
 
 lib-$(CONFIG_SPARC64) += copy_in_user.o user_fixup.o memmove.o
-lib-$(CONFIG_SPARC64) += mcount.o ipcsum.o xor.o hweight.o ffs.o
+lib-$(CONFIG_SPARC64) += mcount.o ipcsum.o xor.o hweight.o ffs.o parity.o
 
 obj-$(CONFIG_SPARC64) += iomap.o
 obj-$(CONFIG_SPARC32) += atomic32.o ucmpdi2.o
diff --git a/arch/sparc/lib/parity.S b/arch/sparc/lib/parity.S
new file mode 100644
index 0000000..b1945e3
--- /dev/null
+++ b/arch/sparc/lib/parity.S
@@ -0,0 +1,93 @@
+#include <linux/linkage.h>
+
+	.text
+	.align	32
+
+ENTRY(__arch_parity8)
+	srl		%o0, 4, %g1
+	xor		%o0, %g1, %o0
+	and		%o0, 0xf, %o0
+	sethi		%hi(0x6996), %g1
+	or		%g1, %lo(0x6996), %g1
+	srl		%g1, %o0, %o0
+	retl
+	 and		%o0, 1, %o0
+ENDPROC(__arch_parity8)
+	.section	.popc_6insn_patch, "ax"
+	.word		__arch_parity8
+	sllx		%o0, 64-8, %g1
+	popc		%g1, %o0
+	retl
+	 and		%o0, 1, %o0
+	nop
+	nop
+	.previous
+
+ENTRY(__arch_parity16)
+	srl		%o0, 8, %g1
+	xor		%o0, %g1, %o0
+	srl		%o0, 4, %g1
+	xor		%o0, %g1, %o0
+	and		%o0, 0xf, %o0
+	sethi		%hi(0x6996), %g1
+	or		%g1, %lo(0x6996), %g1
+	srl		%g1, %o0, %o0
+	retl
+	 and		%o0, 1, %o0
+ENDPROC(__arch_parity16)
+	.section	.popc_6insn_patch, "ax"
+	.word		__arch_parity16
+	sllx		%o0, 64-16, %g1
+	popc		%g1, %o0
+	retl
+	 and		%o0, 1, %o0
+	nop
+	nop
+	.previous
+
+ENTRY(__arch_parity32)
+	srl		%o0, 16, %g1
+	xor		%o0, %g1, %o0
+	srl		%o0, 8, %g1
+	xor		%o0, %g1, %o0
+	srl		%o0, 4, %g1
+	xor		%o0, %g1, %o0
+	and		%o0, 0xf, %o0
+	sethi		%hi(0x6996), %g1
+	or		%g1, %lo(0x6996), %g1
+	srl		%g1, %o0, %o0
+	retl
+	 and		%o0, 1, %o0
+ENDPROC(__arch_parity32)
+	.section	.popc_6insn_patch, "ax"
+	.word		__arch_parity32
+	sllx		%o0, 64-32, %g1
+	popc		%g1, %o0
+	retl
+	 and		%o0, 1, %o0
+	nop
+	nop
+	.previous
+
+ENTRY(__arch_parity64)
+	srlx		%o0, 32, %g1
+	xor		%o0, %g1, %o0
+	srl		%o0, 16, %g1
+	xor		%o0, %g1, %o0
+	srl		%o0, 8, %g1
+	xor		%o0, %g1, %o0
+	srl		%o0, 4, %g1
+	xor		%o0, %g1, %o0
+	and		%o0, 0xf, %o0
+	sethi		%hi(0x6996), %g1
+	or		%g1, %lo(0x6996), %g1
+	srl		%g1, %o0, %o0
+	retl
+	 and		%o0, 1, %o0
+ENDPROC(__arch_parity64)
+	.section	.popc_3insn_patch, "ax"
+	.word		__arch_parity64
+	popc		%o0, %o0
+	retl
+	 and		%o0, 1, %o0
+	.previous
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v2 09/30] Add tile-specific parity functions
  2016-04-05  2:06 [PATCH V2 01/30] bitops: add parity functions Zeng Zhaoxiu
                   ` (7 preceding siblings ...)
  2016-04-06  9:07 ` [PATCH v2 08/30] Add sparc-specific " zengzhaoxiu
@ 2016-04-06  9:08 ` zengzhaoxiu
  2016-04-06 13:27   ` Chris Metcalf
  2016-04-06  9:14 ` [PATCH v2 10/30] Add x86-specific " zengzhaoxiu
                   ` (20 subsequent siblings)
  29 siblings, 1 reply; 84+ messages in thread
From: zengzhaoxiu @ 2016-04-06  9:08 UTC (permalink / raw)
  To: cmetcalf; +Cc: linux-kernel, Zhaoxiu Zeng

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
---
 arch/tile/include/asm/bitops.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/arch/tile/include/asm/bitops.h b/arch/tile/include/asm/bitops.h
index 20caa34..370d007 100644
--- a/arch/tile/include/asm/bitops.h
+++ b/arch/tile/include/asm/bitops.h
@@ -81,10 +81,36 @@ static inline unsigned long __arch_hweight64(__u64 w)
 	return __builtin_popcountll(w);
 }
 
+static inline unsigned int __arch_parity32(unsigned int w)
+{
+	return __builtin_popcount(w) & 1;
+}
+
+static inline unsigned int __arch_parity16(unsigned int w)
+{
+	return __arch_parity32(w & 0xffff);
+}
+
+static inline unsigned int __arch_parity8(unsigned int w)
+{
+	return __arch_parity32(w & 0xff);
+}
+
+static inline unsigned int __arch_parity4(unsigned int w)
+{
+	return __arch_parity32(w & 0xf);
+}
+
+static inline unsigned int __arch_parity64(__u64 w)
+{
+	return (unsigned int)__builtin_popcountll(w) & 1;
+}
+
 #include <asm-generic/bitops/builtin-__ffs.h>
 #include <asm-generic/bitops/builtin-__fls.h>
 #include <asm-generic/bitops/builtin-ffs.h>
 #include <asm-generic/bitops/const_hweight.h>
+#include <asm-generic/bitops/const_parity.h>
 #include <asm-generic/bitops/lock.h>
 #include <asm-generic/bitops/find.h>
 #include <asm-generic/bitops/sched.h>
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v2 10/30] Add x86-specific parity functions
  2016-04-05  2:06 [PATCH V2 01/30] bitops: add parity functions Zeng Zhaoxiu
                   ` (8 preceding siblings ...)
  2016-04-06  9:08 ` [PATCH v2 09/30] Add tile-specific " zengzhaoxiu
@ 2016-04-06  9:14 ` zengzhaoxiu
  2016-04-06 10:13   ` Borislav Petkov
  2016-04-06 19:45   ` Andi Kleen
  2016-04-06  9:27 ` [PATCH v2 11/30] sunrpc: use parity8 zengzhaoxiu
                   ` (19 subsequent siblings)
  29 siblings, 2 replies; 84+ messages in thread
From: zengzhaoxiu @ 2016-04-06  9:14 UTC (permalink / raw)
  To: tglx, mingo, hpa, dvlasenk, bp, akpm, dvyukov, keescook
  Cc: linux-kernel, Zhaoxiu Zeng

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

Use alternatives, lifted from arch_hweight
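
A note on the sub-word helpers: the x86 parity flag only reflects the
low 8 bits of a result, so the 16-bit helper and the software fallbacks
first fold the word down to one byte with xor before sampling PF via
setpo.  A C sketch of the equivalent computation, for illustration only
(not part of this patch):

	static inline unsigned int parity32_by_folding(unsigned int w)
	{
		w ^= w >> 16;
		w ^= w >> 8;	/* the low byte now carries the word's parity */
		w ^= w >> 4;
		w ^= w >> 2;
		w ^= w >> 1;
		return w & 1;
	}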

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
---
 arch/x86/include/asm/arch_hweight.h |   5 ++
 arch/x86/include/asm/arch_parity.h  | 102 ++++++++++++++++++++++++++++++++++++
 arch/x86/include/asm/bitops.h       |   4 +-
 arch/x86/lib/Makefile               |   8 +++
 arch/x86/lib/parity.c               |  32 ++++++++++++
 5 files changed, 150 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/include/asm/arch_parity.h
 create mode 100644 arch/x86/lib/parity.c

diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h
index 02e799f..c79d50d 100644
--- a/arch/x86/include/asm/arch_hweight.h
+++ b/arch/x86/include/asm/arch_hweight.h
@@ -63,4 +63,9 @@ static __always_inline unsigned long __arch_hweight64(__u64 w)
 }
 #endif /* CONFIG_X86_32 */
 
+#undef POPCNT32
+#undef POPCNT64
+#undef REG_IN
+#undef REG_OUT
+
 #endif
diff --git a/arch/x86/include/asm/arch_parity.h b/arch/x86/include/asm/arch_parity.h
new file mode 100644
index 0000000..09463fd
--- /dev/null
+++ b/arch/x86/include/asm/arch_parity.h
@@ -0,0 +1,102 @@
+#ifndef _ASM_X86_PARITY_H
+#define _ASM_X86_PARITY_H
+
+#include <asm/cpufeatures.h>
+
+#ifdef CONFIG_64BIT
+/* popcnt %edi, %eax -- redundant REX prefix for alignment */
+#define POPCNT32 ".byte 0xf3,0x40,0x0f,0xb8,0xc7"
+/* popcnt %rdi, %rax */
+#define POPCNT64 ".byte 0xf3,0x48,0x0f,0xb8,0xc7"
+#define REG_IN "D"
+#define REG_OUT "a"
+#else
+/* popcnt %eax, %eax */
+#define POPCNT32 ".byte 0xf3,0x0f,0xb8,0xc0"
+#define REG_IN "a"
+#define REG_OUT "a"
+#endif
+
+/*
+ * __sw_parityXX are called from within the alternatives below
+ * and callee-clobbered registers need to be taken care of. See
+ * ARCH_HWEIGHT_CFLAGS in <arch/x86/Kconfig> for the respective
+ * compiler switches.
+ */
+unsigned int __sw_parity32(unsigned int w);
+#ifndef CONFIG_X86_32
+unsigned int __sw_parity64(__u64 w);
+#endif
+
+static inline unsigned int __arch_parity4(unsigned int w)
+{
+	unsigned int res = 0;
+
+	asm("test $0xf, %1; setpo %b0"
+		: "+q" (res)
+		: "r" (w)
+		: "cc");
+
+	return res;
+}
+
+static inline unsigned int __arch_parity8(unsigned int w)
+{
+	unsigned int res = 0;
+
+	asm("test %1, %1; setpo %b0"
+		: "+q" (res)
+		: "r" (w)
+		: "cc");
+
+	return res;
+}
+
+static inline unsigned int __arch_parity16(unsigned int w)
+{
+	unsigned int res = 0;
+
+	asm("xor %h1, %b1; setpo %b0"
+		: "+q" (res), "+q" (w)
+		: : "cc");
+
+	return res;
+}
+
+static __always_inline unsigned int __arch_parity32(unsigned int w)
+{
+	unsigned int res;
+
+	asm(ALTERNATIVE("call __sw_parity32", POPCNT32 "; and $1, %0", X86_FEATURE_POPCNT)
+		: "="REG_OUT (res)
+		: REG_IN (w)
+		: "cc");
+
+	return res;
+}
+
+#ifdef CONFIG_X86_32
+static inline unsigned long __arch_parity64(__u64 w)
+{
+	return __arch_parity32((u32)w ^ (u32)(w >> 32));
+}
+#else
+static __always_inline unsigned long __arch_parity64(__u64 w)
+{
+	unsigned long res;
+
+	asm(ALTERNATIVE("call __sw_parity64", POPCNT64 "; and $1, %0", X86_FEATURE_POPCNT)
+		: "="REG_OUT (res)
+		: REG_IN (w)
+		: "cc");
+
+	return res;
+}
+#endif /* CONFIG_X86_32 */
+
+#undef POPCNT32
+#undef POPCNT64
+#undef REG_IN
+#undef REG_OUT
+
+#endif
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 7766d1c..f5b0122 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -498,9 +498,11 @@ static __always_inline int fls64(__u64 x)
 #include <asm-generic/bitops/sched.h>
 
 #include <asm/arch_hweight.h>
-
 #include <asm-generic/bitops/const_hweight.h>
 
+#include <asm/arch_parity.h>
+#include <asm-generic/bitops/const_parity.h>
+
 #include <asm-generic/bitops/le.h>
 
 #include <asm-generic/bitops/ext2-atomic-setbit.h>
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 72a5767..5716295 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -4,6 +4,9 @@
 
 # Produces uninteresting flaky coverage.
 KCOV_INSTRUMENT_delay.o	:= n
+# Kernel does not boot if we instrument this file as it uses a custom
+# calling convention (see CONFIG_ARCH_HWEIGHT_CFLAGS).
+KCOV_INSTRUMENT_parity.o := n
 
 inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk
 inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt
@@ -45,3 +48,8 @@ else
         lib-y += copy_user_64.o
 	lib-y += cmpxchg16b_emu.o
 endif
+
+GCOV_PROFILE_parity.o := n
+CFLAGS_parity.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS))
+obj-y  += parity.o
+
diff --git a/arch/x86/lib/parity.c b/arch/x86/lib/parity.c
new file mode 100644
index 0000000..762117b
--- /dev/null
+++ b/arch/x86/lib/parity.c
@@ -0,0 +1,32 @@
+#include <linux/export.h>
+#include <linux/bitops.h>
+
+unsigned int __sw_parity32(unsigned int w)
+{
+	unsigned int res;
+	w ^= w >> 16;
+	asm("xor	%%ah, %%al		\n"
+		"mov	$0, %%eax		\n"
+		"setpo	%%al			\n"
+		: "=a" (res)
+		: "a" (w)
+		: "cc");
+	return res;
+}
+EXPORT_SYMBOL(__sw_parity32);
+
+#ifndef CONFIG_X86_32
+unsigned int __sw_parity64(__u64 w)
+{
+	unsigned int res = (unsigned int)w ^ (unsigned int)(w >> 32);
+	res ^= res >> 16;
+	asm("xor	%%ah, %%al		\n"
+		"mov	$0, %%eax		\n"
+		"setpo	%%al			\n"
+		: "=a" (res)
+		: "a" (res)
+		: "cc");
+	return res;
+}
+EXPORT_SYMBOL(__sw_parity64);
+#endif
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v2 11/30] sunrpc: use parity8
  2016-04-05  2:06 [PATCH V2 01/30] bitops: add parity functions Zeng Zhaoxiu
                   ` (9 preceding siblings ...)
  2016-04-06  9:14 ` [PATCH v2 10/30] Add x86-specific " zengzhaoxiu
@ 2016-04-06  9:27 ` zengzhaoxiu
  2016-04-06  9:30 ` [PATCH v2 12/30] mips: use parity functions in cerr-sb1.c zengzhaoxiu
                   ` (18 subsequent siblings)
  29 siblings, 0 replies; 84+ messages in thread
From: zengzhaoxiu @ 2016-04-06  9:27 UTC (permalink / raw)
  To: bfields, jlayton, trond.myklebust, anna.schumaker, davem, herbert
  Cc: linux-nfs, netdev, linux-kernel, Zhaoxiu Zeng

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
---
 net/sunrpc/auth_gss/gss_krb5_keys.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/net/sunrpc/auth_gss/gss_krb5_keys.c b/net/sunrpc/auth_gss/gss_krb5_keys.c
index 8701331..c41b389 100644
--- a/net/sunrpc/auth_gss/gss_krb5_keys.c
+++ b/net/sunrpc/auth_gss/gss_krb5_keys.c
@@ -243,16 +243,12 @@ err_return:
 	return ret;
 }
 
-#define smask(step) ((1<<step)-1)
-#define pstep(x, step) (((x)&smask(step))^(((x)>>step)&smask(step)))
-#define parity_char(x) pstep(pstep(pstep((x), 4), 2), 1)
-
 static void mit_des_fixup_key_parity(u8 key[8])
 {
 	int i;
 	for (i = 0; i < 8; i++) {
 		key[i] &= 0xfe;
-		key[i] |= 1^parity_char(key[i]);
+		key[i] |= !parity8(key[i]);
 	}
 }
 
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v2 12/30] mips: use parity functions in cerr-sb1.c
  2016-04-05  2:06 [PATCH V2 01/30] bitops: add parity functions Zeng Zhaoxiu
                   ` (10 preceding siblings ...)
  2016-04-06  9:27 ` [PATCH v2 11/30] sunrpc: use parity8 zengzhaoxiu
@ 2016-04-06  9:30 ` zengzhaoxiu
  2016-04-06  9:36 ` [PATCH v2 13/30] bch: use parity32 zengzhaoxiu
                   ` (17 subsequent siblings)
  29 siblings, 0 replies; 84+ messages in thread
From: zengzhaoxiu @ 2016-04-06  9:30 UTC (permalink / raw)
  To: ralf; +Cc: linux-mips, linux-kernel, Zhaoxiu Zeng

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
---
 arch/mips/mm/cerr-sb1.c | 67 +++++++++++++------------------------------------
 1 file changed, 17 insertions(+), 50 deletions(-)

diff --git a/arch/mips/mm/cerr-sb1.c b/arch/mips/mm/cerr-sb1.c
index ee5c1ff..2e7d660 100644
--- a/arch/mips/mm/cerr-sb1.c
+++ b/arch/mips/mm/cerr-sb1.c
@@ -264,27 +264,6 @@ asmlinkage void sb1_cache_error(void)
 #endif
 }
 
-
-/* Parity lookup table. */
-static const uint8_t parity[256] = {
-	0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
-	1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
-	1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
-	0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
-	1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
-	0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
-	0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
-	1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
-	1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
-	0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
-	0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
-	1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
-	0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
-	1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
-	1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
-	0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0
-};
-
 /* Masks to select bits for Hamming parity, mask_72_64[i] for bit[i] */
 static const uint64_t mask_72_64[8] = {
 	0x0738C808099264FFULL,
@@ -298,34 +277,28 @@ static const uint64_t mask_72_64[8] = {
 };
 
 /* Calculate the parity on a range of bits */
-static char range_parity(uint64_t dword, int max, int min)
+static inline char range_parity(uint64_t dword, int max, int min)
 {
-	char parity = 0;
-	int i;
-	dword >>= min;
-	for (i=max-min; i>=0; i--) {
-		if (dword & 0x1)
-			parity = !parity;
-		dword >>= 1;
+	int n = max - min + 1;
+	if (__builtin_constant_p(n)) {
+		if (n <= 8)
+			return parity8((unsigned int)(dword >> min) & ((1U << n) - 1));
+		if (n <= 16)
+			return parity16((unsigned int)(dword >> min) & ((1U << n) - 1));
+		if (n <= 32)
+			return parity32((unsigned int)(dword >> min) & ((1U << n) - 1));
 	}
-	return parity;
+	return parity64((dword >> min) & ((1ULL << n) - 1));
 }
 
 /* Calculate the 4-bit even byte-parity for an instruction */
-static unsigned char inst_parity(uint32_t word)
+static inline unsigned char inst_parity(uint32_t word)
 {
-	int i, j;
-	char parity = 0;
-	for (j=0; j<4; j++) {
-		char byte_parity = 0;
-		for (i=0; i<8; i++) {
-			if (word & 0x80000000)
-				byte_parity = !byte_parity;
-			word <<= 1;
-		}
-		parity <<= 1;
-		parity |= byte_parity;
-	}
+	char parity;
+	parity  = parity8(word >> 24) << 3;
+	parity |= parity8(word >> 16) << 2;
+	parity |= parity8(word >> 8) << 1;
+	parity |= parity8(word);
 	return parity;
 }
 
@@ -436,7 +409,6 @@ static uint32_t extract_ic(unsigned short addr, int data)
 static uint8_t dc_ecc(uint64_t dword)
 {
 	uint64_t t;
-	uint32_t w;
 	uint8_t	 p;
 	int	 i;
 
@@ -445,12 +417,7 @@ static uint8_t dc_ecc(uint64_t dword)
 	{
 		p <<= 1;
 		t = dword & mask_72_64[i];
-		w = (uint32_t)(t >> 32);
-		p ^= (parity[w>>24] ^ parity[(w>>16) & 0xFF]
-		      ^ parity[(w>>8) & 0xFF] ^ parity[w & 0xFF]);
-		w = (uint32_t)(t & 0xFFFFFFFF);
-		p ^= (parity[w>>24] ^ parity[(w>>16) & 0xFF]
-		      ^ parity[(w>>8) & 0xFF] ^ parity[w & 0xFF]);
+		p |= parity64(t);
 	}
 	return p;
 }
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v2 13/30] bch: use parity32
  2016-04-05  2:06 [PATCH V2 01/30] bitops: add parity functions Zeng Zhaoxiu
                   ` (11 preceding siblings ...)
  2016-04-06  9:30 ` [PATCH v2 12/30] mips: use parity functions in cerr-sb1.c zengzhaoxiu
@ 2016-04-06  9:36 ` zengzhaoxiu
  2016-04-06  9:39 ` [PATCH v2 14/30] media: use parity8 in vivid-vbi-gen.c zengzhaoxiu
                   ` (16 subsequent siblings)
  29 siblings, 0 replies; 84+ messages in thread
From: zengzhaoxiu @ 2016-04-06  9:36 UTC (permalink / raw)
  To: linux-kernel; +Cc: Zhaoxiu Zeng

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
---
 lib/bch.c | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/lib/bch.c b/lib/bch.c
index bc89dfe4..6c6e8d4 100644
--- a/lib/bch.c
+++ b/lib/bch.c
@@ -278,18 +278,6 @@ static inline int deg(unsigned int poly)
 	return fls(poly)-1;
 }
 
-static inline int parity(unsigned int x)
-{
-	/*
-	 * public domain code snippet, lifted from
-	 * http://www-graphics.stanford.edu/~seander/bithacks.html
-	 */
-	x ^= x >> 1;
-	x ^= x >> 2;
-	x = (x & 0x11111111U) * 0x11111111U;
-	return (x >> 28) & 1;
-}
-
 /* Galois field basic operations: multiply, divide, inverse, etc. */
 
 static inline unsigned int gf_mul(struct bch_control *bch, unsigned int a,
@@ -494,7 +482,7 @@ static int solve_linear_system(struct bch_control *bch, unsigned int *rows,
 		tmp = 0;
 		for (r = m-1; r >= 0; r--) {
 			mask = rows[r] & (tmp|1);
-			tmp |= parity(mask) << (m-r);
+			tmp |= parity32(mask) << (m-r);
 		}
 		sol[p] = tmp >> 1;
 	}
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v2 14/30] media: use parity8 in vivid-vbi-gen.c
  2016-04-05  2:06 [PATCH V2 01/30] bitops: add parity functions Zeng Zhaoxiu
                   ` (12 preceding siblings ...)
  2016-04-06  9:36 ` [PATCH v2 13/30] bch: use parity32 zengzhaoxiu
@ 2016-04-06  9:39 ` zengzhaoxiu
  2016-04-06  9:41 ` [PATCH v2 15/30] media: use parity functions in saa7115 zengzhaoxiu
                   ` (15 subsequent siblings)
  29 siblings, 0 replies; 84+ messages in thread
From: zengzhaoxiu @ 2016-04-06  9:39 UTC (permalink / raw)
  To: hverkuil, mchehab; +Cc: linux-media, linux-kernel, Zhaoxiu Zeng

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
---
 drivers/media/platform/vivid/vivid-vbi-gen.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/drivers/media/platform/vivid/vivid-vbi-gen.c b/drivers/media/platform/vivid/vivid-vbi-gen.c
index a2159de..d5ba0fc 100644
--- a/drivers/media/platform/vivid/vivid-vbi-gen.c
+++ b/drivers/media/platform/vivid/vivid-vbi-gen.c
@@ -175,14 +175,9 @@ static const u8 vivid_cc_sequence2[30] = {
 	0x14, 0x2f,	/* End of Caption */
 };
 
-static u8 calc_parity(u8 val)
+static inline u8 calc_parity(u8 val)
 {
-	unsigned i;
-	unsigned tot = 0;
-
-	for (i = 0; i < 7; i++)
-		tot += (val & (1 << i)) ? 1 : 0;
-	return val | ((tot & 1) ? 0 : 0x80);
+	return (!parity8(val) << 7) | val;
 }
 
 static void vivid_vbi_gen_set_time_of_day(u8 *packet)
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v2 15/30] media: use parity functions in saa7115
  2016-04-05  2:06 [PATCH V2 01/30] bitops: add parity functions Zeng Zhaoxiu
                   ` (13 preceding siblings ...)
  2016-04-06  9:39 ` [PATCH v2 14/30] media: use parity8 in vivid-vbi-gen.c zengzhaoxiu
@ 2016-04-06  9:41 ` zengzhaoxiu
  2016-04-06  9:43 ` [PATCH v2 16/30] input: use parity32 in grip_mp zengzhaoxiu
                   ` (14 subsequent siblings)
  29 siblings, 0 replies; 84+ messages in thread
From: zengzhaoxiu @ 2016-04-06  9:41 UTC (permalink / raw)
  To: mchehab, arnd, hans.verkuil, k.kozlowski
  Cc: linux-media, linux-kernel, Zhaoxiu Zeng

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
---
 drivers/media/i2c/saa7115.c | 17 ++---------------
 1 file changed, 2 insertions(+), 15 deletions(-)

diff --git a/drivers/media/i2c/saa7115.c b/drivers/media/i2c/saa7115.c
index d2a1ce2..4c22df8 100644
--- a/drivers/media/i2c/saa7115.c
+++ b/drivers/media/i2c/saa7115.c
@@ -672,15 +672,6 @@ static const unsigned char saa7115_init_misc[] = {
 	0x00, 0x00
 };
 
-static int saa711x_odd_parity(u8 c)
-{
-	c ^= (c >> 4);
-	c ^= (c >> 2);
-	c ^= (c >> 1);
-
-	return c & 1;
-}
-
 static int saa711x_decode_vps(u8 *dst, u8 *p)
 {
 	static const u8 biphase_tbl[] = {
@@ -733,7 +724,6 @@ static int saa711x_decode_wss(u8 *p)
 	static const int wss_bits[8] = {
 		0, 0, 0, 1, 0, 1, 1, 1
 	};
-	unsigned char parity;
 	int wss = 0;
 	int i;
 
@@ -745,11 +735,8 @@ static int saa711x_decode_wss(u8 *p)
 			return -1;
 		wss |= b2 << i;
 	}
-	parity = wss & 15;
-	parity ^= parity >> 2;
-	parity ^= parity >> 1;
 
-	if (!(parity & 1))
+	if (!parity4(wss))
 		return -1;
 
 	return wss;
@@ -1235,7 +1222,7 @@ static int saa711x_decode_vbi_line(struct v4l2_subdev *sd, struct v4l2_decode_vb
 		vbi->type = V4L2_SLICED_TELETEXT_B;
 		break;
 	case 4:
-		if (!saa711x_odd_parity(p[0]) || !saa711x_odd_parity(p[1]))
+		if (!parity8(p[0]) || !parity8(p[1]))
 			return 0;
 		vbi->type = V4L2_SLICED_CAPTION_525;
 		break;
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v2 16/30] input: use parity32 in grip_mp
  2016-04-05  2:06 [PATCH V2 01/30] bitops: add parity functions Zeng Zhaoxiu
                   ` (14 preceding siblings ...)
  2016-04-06  9:41 ` [PATCH v2 15/30] media: use parity functions in saa7115 zengzhaoxiu
@ 2016-04-06  9:43 ` zengzhaoxiu
  2016-04-06  9:44 ` [PATCH v2 17/30] input: use parity64 in sidewinder zengzhaoxiu
                   ` (13 subsequent siblings)
  29 siblings, 0 replies; 84+ messages in thread
From: zengzhaoxiu @ 2016-04-06  9:43 UTC (permalink / raw)
  To: dmitry.torokhov; +Cc: linux-input, linux-kernel, Zhaoxiu Zeng

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
---
 drivers/input/joystick/grip_mp.c | 16 +---------------
 1 file changed, 1 insertion(+), 15 deletions(-)

diff --git a/drivers/input/joystick/grip_mp.c b/drivers/input/joystick/grip_mp.c
index 573191d..3e29eb1 100644
--- a/drivers/input/joystick/grip_mp.c
+++ b/drivers/input/joystick/grip_mp.c
@@ -112,20 +112,6 @@ static const int axis_map[] = { 5, 9, 1, 5, 6, 10, 2, 6, 4, 8, 0, 4, 5, 9, 1, 5
 static int register_slot(int i, struct grip_mp *grip);
 
 /*
- * Returns whether an odd or even number of bits are on in pkt.
- */
-
-static int bit_parity(u32 pkt)
-{
-	int x = pkt ^ (pkt >> 16);
-	x ^= x >> 8;
-	x ^= x >> 4;
-	x ^= x >> 2;
-	x ^= x >> 1;
-	return x & 1;
-}
-
-/*
  * Poll gameport; return true if all bits set in 'onbits' are on and
  * all bits set in 'offbits' are off.
  */
@@ -235,7 +221,7 @@ static int mp_io(struct gameport* gameport, int sendflags, int sendcode, u32 *pa
 		pkt = (pkt >> 2) | 0xf0000000;
 	}
 
-	if (bit_parity(pkt) == 1)
+	if (parity32(pkt))
 		return IO_RESET;
 
 	/* Acknowledge packet receipt */
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v2 17/30] input: use parity64 in sidewinder
  2016-04-05  2:06 [PATCH V2 01/30] bitops: add parity functions Zeng Zhaoxiu
                   ` (15 preceding siblings ...)
  2016-04-06  9:43 ` [PATCH v2 16/30] input: use parity32 in grip_mp zengzhaoxiu
@ 2016-04-06  9:44 ` zengzhaoxiu
  2016-04-06  9:45 ` [PATCH v2 18/30] input: use parity16 in ams_delta_serio zengzhaoxiu
                   ` (12 subsequent siblings)
  29 siblings, 0 replies; 84+ messages in thread
From: zengzhaoxiu @ 2016-04-06  9:44 UTC (permalink / raw)
  To: dmitry.torokhov; +Cc: linux-input, linux-kernel, Zhaoxiu Zeng

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
---
 drivers/input/joystick/sidewinder.c | 24 ++++--------------------
 1 file changed, 4 insertions(+), 20 deletions(-)

diff --git a/drivers/input/joystick/sidewinder.c b/drivers/input/joystick/sidewinder.c
index 4a95b22..7ea486e 100644
--- a/drivers/input/joystick/sidewinder.c
+++ b/drivers/input/joystick/sidewinder.c
@@ -259,22 +259,6 @@ static void sw_init_digital(struct gameport *gameport)
 }
 
 /*
- * sw_parity() computes parity of __u64
- */
-
-static int sw_parity(__u64 t)
-{
-	int x = t ^ (t >> 32);
-
-	x ^= x >> 16;
-	x ^= x >> 8;
-	x ^= x >> 4;
-	x ^= x >> 2;
-	x ^= x >> 1;
-	return x & 1;
-}
-
-/*
  * sw_ccheck() checks synchronization bits and computes checksum of nibbles.
  */
 
@@ -334,7 +318,7 @@ static int sw_parse(unsigned char *buf, struct sw *sw)
 
 			for (i = 0; i < sw->number; i ++) {
 
-				if (sw_parity(GB(i*15,15)))
+				if (parity64(GB(i*15,15)))
 					return -1;
 
 				input_report_abs(sw->dev[i], ABS_X, GB(i*15+3,1) - GB(i*15+2,1));
@@ -351,7 +335,7 @@ static int sw_parse(unsigned char *buf, struct sw *sw)
 		case SW_ID_PP:
 		case SW_ID_FFP:
 
-			if (!sw_parity(GB(0,48)) || (hat = GB(42,4)) > 8)
+			if (!parity64(GB(0,48)) || (hat = GB(42,4)) > 8)
 				return -1;
 
 			dev = sw->dev[0];
@@ -372,7 +356,7 @@ static int sw_parse(unsigned char *buf, struct sw *sw)
 
 		case SW_ID_FSP:
 
-			if (!sw_parity(GB(0,43)) || (hat = GB(28,4)) > 8)
+			if (!parity64(GB(0,43)) || (hat = GB(28,4)) > 8)
 				return -1;
 
 			dev = sw->dev[0];
@@ -397,7 +381,7 @@ static int sw_parse(unsigned char *buf, struct sw *sw)
 
 		case SW_ID_FFW:
 
-			if (!sw_parity(GB(0,33)))
+			if (!parity64(GB(0,33)))
 				return -1;
 
 			dev = sw->dev[0];
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v2 18/30] input: use parity16 in ams_delta_serio
  2016-04-05  2:06 [PATCH V2 01/30] bitops: add parity functions Zeng Zhaoxiu
                   ` (16 preceding siblings ...)
  2016-04-06  9:44 ` [PATCH v2 17/30] input: use parity64 in sidewinder zengzhaoxiu
@ 2016-04-06  9:45 ` zengzhaoxiu
  2016-04-06  9:47 ` [PATCH v2 19/30] scsi: use parity32 in isci's phy zengzhaoxiu
                   ` (11 subsequent siblings)
  29 siblings, 0 replies; 84+ messages in thread
From: zengzhaoxiu @ 2016-04-06  9:45 UTC (permalink / raw)
  To: dmitry.torokhov; +Cc: linux-input, linux-kernel, Zhaoxiu Zeng

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
---
 drivers/input/serio/ams_delta_serio.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/drivers/input/serio/ams_delta_serio.c b/drivers/input/serio/ams_delta_serio.c
index 45887e3..85459b3 100644
--- a/drivers/input/serio/ams_delta_serio.c
+++ b/drivers/input/serio/ams_delta_serio.c
@@ -48,13 +48,9 @@ static int check_data(int data)
 				data);
 		return SERIO_FRAME;
 	}
-	/* calculate the parity */
-	for (i = 1; i < 10; i++) {
-		if (data & (1 << i))
-			parity++;
-	}
 	/* it should be odd */
-	if (!(parity & 0x01)) {
+	parity = parity16(data & 0x3fe);
+	if (!parity) {
 		dev_warn(&ams_delta_serio->dev,
 				"paritiy check failed, data=0x%X parity=0x%X\n",
 				data, parity);
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v2 19/30] scsi: use parity32 in isci's phy
  2016-04-05  2:06 [PATCH V2 01/30] bitops: add parity functions Zeng Zhaoxiu
                   ` (17 preceding siblings ...)
  2016-04-06  9:45 ` [PATCH v2 18/30] input: use parity16 in ams_delta_serio zengzhaoxiu
@ 2016-04-06  9:47 ` zengzhaoxiu
  2016-04-06  9:52 ` [PATCH v2 20/30] mtd: use parity16 in ssfdc zengzhaoxiu
                   ` (10 subsequent siblings)
  29 siblings, 0 replies; 84+ messages in thread
From: zengzhaoxiu @ 2016-04-06  9:47 UTC (permalink / raw)
  To: intel-linux-scu, artur.paszkiewicz, jejb, martin.petersen
  Cc: linux-scsi, linux-kernel, Zhaoxiu Zeng

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
---
 drivers/scsi/isci/phy.c | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/drivers/scsi/isci/phy.c b/drivers/scsi/isci/phy.c
index cb87b2e..a06aff6 100644
--- a/drivers/scsi/isci/phy.c
+++ b/drivers/scsi/isci/phy.c
@@ -122,8 +122,6 @@ sci_phy_link_layer_initialization(struct isci_phy *iphy,
 	int phy_idx = iphy->phy_index;
 	struct sci_phy_cap phy_cap;
 	u32 phy_configuration;
-	u32 parity_check = 0;
-	u32 parity_count = 0;
 	u32 llctl, link_rate;
 	u32 clksm_value = 0;
 	u32 sp_timeouts = 0;
@@ -225,18 +223,7 @@ sci_phy_link_layer_initialization(struct isci_phy *iphy,
 	/* The SAS specification indicates that the phy_capabilities that
 	 * are transmitted shall have an even parity.  Calculate the parity.
 	 */
-	parity_check = phy_cap.all;
-	while (parity_check != 0) {
-		if (parity_check & 0x1)
-			parity_count++;
-		parity_check >>= 1;
-	}
-
-	/* If parity indicates there are an odd number of bits set, then
-	 * set the parity bit to 1 in the phy capabilities.
-	 */
-	if ((parity_count % 2) != 0)
-		phy_cap.parity = 1;
+	phy_cap.parity = parity32(phy_cap.all);
 
 	writel(phy_cap.all, &llr->phy_capabilities);
 
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v2 20/30] mtd: use parity16 in ssfdc
  2016-04-05  2:06 [PATCH V2 01/30] bitops: add parity functions Zeng Zhaoxiu
                   ` (18 preceding siblings ...)
  2016-04-06  9:47 ` [PATCH v2 19/30] scsi: use parity32 in isci's phy zengzhaoxiu
@ 2016-04-06  9:52 ` zengzhaoxiu
  2016-04-06  9:53 ` [PATCH v2 21/30] mtd: use parity functions in inftlcore zengzhaoxiu
                   ` (9 subsequent siblings)
  29 siblings, 0 replies; 84+ messages in thread
From: zengzhaoxiu @ 2016-04-06  9:52 UTC (permalink / raw)
  To: dwmw2, computersforpeace; +Cc: linux-mtd, linux-kernel, Zhaoxiu Zeng

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

The original get_parity returns the even parity of the input number, so
the hweight of "block_address & 0x7ff" must be odd if block_address is
valid.

This patch uses parity16 instead and rearranges the code.
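
For illustration only (the helper name is made up and not part of this
patch), the old and new checks accept exactly the same values:

	/* addr is the OOB field already masked to its low 11 bits (0x7FF) */
	static int lba_field_valid(unsigned int addr)
	{
		/*
		 * Old: bit 0 (the stored parity bit) had to match the even
		 * parity of bits 1..10.  New: equivalently, the whole 11-bit
		 * field must have odd weight.
		 */
		return __builtin_popcount(addr) & 1;
	}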

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
---
 drivers/mtd/ssfdc.c | 31 ++++++++-----------------------
 1 file changed, 8 insertions(+), 23 deletions(-)

diff --git a/drivers/mtd/ssfdc.c b/drivers/mtd/ssfdc.c
index daf82ba..1d55f15 100644
--- a/drivers/mtd/ssfdc.c
+++ b/drivers/mtd/ssfdc.c
@@ -182,24 +182,10 @@ static int read_raw_oob(struct mtd_info *mtd, loff_t offs, uint8_t *buf)
 	return 0;
 }
 
-/* Parity calculator on a word of n bit size */
-static int get_parity(int number, int size)
-{
- 	int k;
-	int parity;
-
-	parity = 1;
-	for (k = 0; k < size; k++) {
-		parity += (number >> k);
-		parity &= 1;
-	}
-	return parity;
-}
-
 /* Read and validate the logical block address field stored in the OOB */
 static int get_logical_address(uint8_t *oob_buf)
 {
-	int block_address, parity;
+	int block_address;
 	int offset[2] = {6, 11}; /* offset of the 2 address fields within OOB */
 	int j;
 	int ok = 0;
@@ -215,18 +201,17 @@ static int get_logical_address(uint8_t *oob_buf)
 
 		/* Check for the signature bits in the address field (MSBits) */
 		if ((block_address & ~0x7FF) == 0x1000) {
-			parity = block_address & 0x01;
 			block_address &= 0x7FF;
-			block_address >>= 1;
 
-			if (get_parity(block_address, 10) != parity) {
-				pr_debug("SSFDC_RO: logical address field%d"
-					"parity error(0x%04X)\n", j+1,
-					block_address);
-			} else {
-				ok = 1;
+			ok = parity16(block_address);
+			if (ok) {
+				block_address >>= 1;
 				break;
 			}
+
+			pr_debug("SSFDC_RO: logical address field%d"
+				"parity error(0x%04X)\n", j+1,
+				block_address);
 		}
 	}
 
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v2 21/30] mtd: use parity functions in inftlcore
  2016-04-05  2:06 [PATCH V2 01/30] bitops: add parity functions Zeng Zhaoxiu
                   ` (19 preceding siblings ...)
  2016-04-06  9:52 ` [PATCH v2 20/30] mtd: use parity16 in ssfdc zengzhaoxiu
@ 2016-04-06  9:53 ` zengzhaoxiu
  2016-04-06  9:58 ` [PATCH v2 22/30] crypto: use parity functions in qat_hal zengzhaoxiu
                   ` (8 subsequent siblings)
  29 siblings, 0 replies; 84+ messages in thread
From: zengzhaoxiu @ 2016-04-06  9:53 UTC (permalink / raw)
  To: dwmw2, computersforpeace; +Cc: linux-mtd, linux-kernel, Zhaoxiu Zeng

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
---
 drivers/mtd/inftlcore.c | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/drivers/mtd/inftlcore.c b/drivers/mtd/inftlcore.c
index b66b541..8c9457b 100644
--- a/drivers/mtd/inftlcore.c
+++ b/drivers/mtd/inftlcore.c
@@ -457,15 +457,6 @@ static u16 INFTL_makefreeblock(struct INFTLrecord *inftl, unsigned pendingblock)
 	return INFTL_foldchain(inftl, LongestChain, pendingblock);
 }
 
-static int nrbits(unsigned int val, int bitcount)
-{
-	int i, total = 0;
-
-	for (i = 0; (i < bitcount); i++)
-		total += (((0x1 << i) & val) ? 1 : 0);
-	return total;
-}
-
 /*
  * INFTL_findwriteunit: Return the unit number into which we can write
  *                      for this block. Make it available if it isn't already.
@@ -593,10 +584,10 @@ hitused:
 		if (prev_block < inftl->nb_blocks)
 			prev_block -= inftl->firstEUN;
 
-		parity = (nrbits(thisVUC, 16) & 0x1) ? 0x1 : 0;
-		parity |= (nrbits(prev_block, 16) & 0x1) ? 0x2 : 0;
-		parity |= (nrbits(anac, 8) & 0x1) ? 0x4 : 0;
-		parity |= (nrbits(nacs, 8) & 0x1) ? 0x8 : 0;
+		parity  = parity16(thisVUC);
+		parity |= parity16(prev_block) << 1;
+		parity |= parity8(anac) << 2;
+		parity |= parity8(nacs) << 3;
 
 		oob.u.a.virtualUnitNo = cpu_to_le16(thisVUC);
 		oob.u.a.prevUnitNo = cpu_to_le16(prev_block);
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v2 22/30] crypto: use parity functions in qat_hal
  2016-04-05  2:06 [PATCH V2 01/30] bitops: add parity functions Zeng Zhaoxiu
                   ` (20 preceding siblings ...)
  2016-04-06  9:53 ` [PATCH v2 21/30] mtd: use parity functions in inftlcore zengzhaoxiu
@ 2016-04-06  9:58 ` zengzhaoxiu
  2016-04-06 10:05 ` [PATCH v2 23/30] mtd: use parity16 in sm_ftl zengzhaoxiu
                   ` (7 subsequent siblings)
  29 siblings, 0 replies; 84+ messages in thread
From: zengzhaoxiu @ 2016-04-06  9:58 UTC (permalink / raw)
  To: tadeusz.struk, herbert, davem, pingchao.yang, bruce.w.allan
  Cc: qat-linux, linux-crypto, linux-kernel, Zhaoxiu Zeng

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
---
 drivers/crypto/qat/qat_common/qat_hal.c | 32 ++++++++++----------------------
 1 file changed, 10 insertions(+), 22 deletions(-)

diff --git a/drivers/crypto/qat/qat_common/qat_hal.c b/drivers/crypto/qat/qat_common/qat_hal.c
index 1e480f1..318558f 100644
--- a/drivers/crypto/qat/qat_common/qat_hal.c
+++ b/drivers/crypto/qat/qat_common/qat_hal.c
@@ -546,17 +546,6 @@ static void qat_hal_disable_ctx(struct icp_qat_fw_loader_handle *handle,
 	qat_hal_wr_ae_csr(handle, ae, CTX_ENABLES, ctx);
 }
 
-static uint64_t qat_hal_parity_64bit(uint64_t word)
-{
-	word ^= word >> 1;
-	word ^= word >> 2;
-	word ^= word >> 4;
-	word ^= word >> 8;
-	word ^= word >> 16;
-	word ^= word >> 32;
-	return word & 1;
-}
-
 static uint64_t qat_hal_set_uword_ecc(uint64_t uword)
 {
 	uint64_t bit0_mask = 0xff800007fffULL, bit1_mask = 0x1f801ff801fULL,
@@ -566,13 +555,13 @@ static uint64_t qat_hal_set_uword_ecc(uint64_t uword)
 
 	/* clear the ecc bits */
 	uword &= ~(0x7fULL << 0x2C);
-	uword |= qat_hal_parity_64bit(bit0_mask & uword) << 0x2C;
-	uword |= qat_hal_parity_64bit(bit1_mask & uword) << 0x2D;
-	uword |= qat_hal_parity_64bit(bit2_mask & uword) << 0x2E;
-	uword |= qat_hal_parity_64bit(bit3_mask & uword) << 0x2F;
-	uword |= qat_hal_parity_64bit(bit4_mask & uword) << 0x30;
-	uword |= qat_hal_parity_64bit(bit5_mask & uword) << 0x31;
-	uword |= qat_hal_parity_64bit(bit6_mask & uword) << 0x32;
+	uword |= (uint64_t)parity64(bit0_mask & uword) << 0x2C;
+	uword |= (uint64_t)parity64(bit1_mask & uword) << 0x2D;
+	uword |= (uint64_t)parity64(bit2_mask & uword) << 0x2E;
+	uword |= (uint64_t)parity64(bit3_mask & uword) << 0x2F;
+	uword |= (uint64_t)parity64(bit4_mask & uword) << 0x30;
+	uword |= (uint64_t)parity64(bit5_mask & uword) << 0x31;
+	uword |= (uint64_t)parity64(bit6_mask & uword) << 0x32;
 	return uword;
 }
 
@@ -853,15 +842,14 @@ void qat_hal_wr_umem(struct icp_qat_fw_loader_handle *handle,
 	uaddr |= UA_ECS;
 	qat_hal_wr_ae_csr(handle, ae, USTORE_ADDRESS, uaddr);
 	for (i = 0; i < words_num; i++) {
-		unsigned int uwrd_lo, uwrd_hi, tmp;
+		unsigned int uwrd_lo, uwrd_hi;
 
 		uwrd_lo = ((data[i] & 0xfff0000) << 4) | (0x3 << 18) |
 			  ((data[i] & 0xff00) << 2) |
 			  (0x3 << 8) | (data[i] & 0xff);
 		uwrd_hi = (0xf << 4) | ((data[i] & 0xf0000000) >> 28);
-		uwrd_hi |= (hweight32(data[i] & 0xffff) & 0x1) << 8;
-		tmp = ((data[i] >> 0x10) & 0xffff);
-		uwrd_hi |= (hweight32(tmp) & 0x1) << 9;
+		uwrd_hi |= parity16(data[i]) << 8;
+		uwrd_hi |= parity16(data[i] >> 16) << 9;
 		qat_hal_wr_ae_csr(handle, ae, USTORE_DATA_LOWER, uwrd_lo);
 		qat_hal_wr_ae_csr(handle, ae, USTORE_DATA_UPPER, uwrd_hi);
 	}
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v2 23/30] mtd: use parity16 in sm_ftl
  2016-04-05  2:06 [PATCH V2 01/30] bitops: add parity functions Zeng Zhaoxiu
                   ` (21 preceding siblings ...)
  2016-04-06  9:58 ` [PATCH v2 22/30] crypto: use parity functions in qat_hal zengzhaoxiu
@ 2016-04-06 10:05 ` zengzhaoxiu
  2016-04-06 10:11 ` [PATCH v2 24/30] ethernet: use parity8 in sun/niu.c zengzhaoxiu
                   ` (6 subsequent siblings)
  29 siblings, 0 replies; 84+ messages in thread
From: zengzhaoxiu @ 2016-04-06 10:05 UTC (permalink / raw)
  To: dwmw2, computersforpeace; +Cc: linux-mtd, linux-kernel, Zhaoxiu Zeng

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
---
 drivers/mtd/sm_ftl.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/mtd/sm_ftl.c b/drivers/mtd/sm_ftl.c
index b096f8b..345ff1a 100644
--- a/drivers/mtd/sm_ftl.c
+++ b/drivers/mtd/sm_ftl.c
@@ -136,7 +136,7 @@ static int sm_get_lba(uint8_t *lba)
 		return -2;
 
 	/* check parity - endianness doesn't matter */
-	if (hweight16(*(uint16_t *)lba) & 1)
+	if (parity16(*(uint16_t *)lba))
 		return -2;
 
 	return (lba[1] >> 1) | ((lba[0] & 0x07) << 7);
@@ -183,8 +183,7 @@ static void sm_write_lba(struct sm_oob *oob, uint16_t lba)
 	tmp[0] = 0x10 | ((lba >> 7) & 0x07);
 	tmp[1] = (lba << 1) & 0xFF;
 
-	if (hweight16(*(uint16_t *)tmp) & 0x01)
-		tmp[1] |= 1;
+	tmp[1] |= parity16(*(uint16_t *)tmp);
 
 	oob->lba_copy1[0] = oob->lba_copy2[0] = tmp[0];
 	oob->lba_copy1[1] = oob->lba_copy2[1] = tmp[1];
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v2 24/30] ethernet: use parity8 in sun/niu.c
  2016-04-05  2:06 [PATCH V2 01/30] bitops: add parity functions Zeng Zhaoxiu
                   ` (22 preceding siblings ...)
  2016-04-06 10:05 ` [PATCH v2 23/30] mtd: use parity16 in sm_ftl zengzhaoxiu
@ 2016-04-06 10:11 ` zengzhaoxiu
  2016-04-06 10:14 ` [PATCH v2 25/30] input: use parity8 in pcips2 zengzhaoxiu
                   ` (5 subsequent siblings)
  29 siblings, 0 replies; 84+ messages in thread
From: zengzhaoxiu @ 2016-04-06 10:11 UTC (permalink / raw)
  To: iamjoonsoo.kim, akpm, vbabka, davem, jiri
  Cc: netdev, linux-kernel, Zhaoxiu Zeng

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
---
 drivers/net/ethernet/sun/niu.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/sun/niu.c b/drivers/net/ethernet/sun/niu.c
index 9cc4564..8c344ef 100644
--- a/drivers/net/ethernet/sun/niu.c
+++ b/drivers/net/ethernet/sun/niu.c
@@ -2742,18 +2742,12 @@ static int niu_set_alt_mac_rdc_table(struct niu *np, int idx,
 
 static u64 vlan_entry_set_parity(u64 reg_val)
 {
-	u64 port01_mask;
-	u64 port23_mask;
-
-	port01_mask = 0x00ff;
-	port23_mask = 0xff00;
-
-	if (hweight64(reg_val & port01_mask) & 1)
+	if (parity8(reg_val))
 		reg_val |= ENET_VLAN_TBL_PARITY0;
 	else
 		reg_val &= ~ENET_VLAN_TBL_PARITY0;
 
-	if (hweight64(reg_val & port23_mask) & 1)
+	if (parity8((unsigned int)reg_val >> 8))
 		reg_val |= ENET_VLAN_TBL_PARITY1;
 	else
 		reg_val &= ~ENET_VLAN_TBL_PARITY1;
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 10/30] Add x86-specific parity functions
  2016-04-06  9:14 ` [PATCH v2 10/30] Add x86-specific " zengzhaoxiu
@ 2016-04-06 10:13   ` Borislav Petkov
  2016-04-06 10:37     ` One Thousand Gnomes
  2016-04-07  3:55     ` Zeng Zhaoxiu
  2016-04-06 19:45   ` Andi Kleen
  1 sibling, 2 replies; 84+ messages in thread
From: Borislav Petkov @ 2016-04-06 10:13 UTC (permalink / raw)
  To: zengzhaoxiu
  Cc: tglx, mingo, hpa, dvlasenk, akpm, dvyukov, keescook,
	linux-kernel, Zhaoxiu Zeng

On Wed, Apr 06, 2016 at 05:14:45PM +0800, zengzhaoxiu@163.com wrote:
> From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
> 
> Use alternatives, lifted from arch_hweight
> 
> Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
> ---
>  arch/x86/include/asm/arch_hweight.h |   5 ++
>  arch/x86/include/asm/arch_parity.h  | 102 ++++++++++++++++++++++++++++++++++++
>  arch/x86/include/asm/bitops.h       |   4 +-
>  arch/x86/lib/Makefile               |   8 +++
>  arch/x86/lib/parity.c               |  32 ++++++++++++
>  5 files changed, 150 insertions(+), 1 deletion(-)
>  create mode 100644 arch/x86/include/asm/arch_parity.h
>  create mode 100644 arch/x86/lib/parity.c

...

> +static __always_inline unsigned int __arch_parity32(unsigned int w)
> +{
> +	unsigned int res;
> +
> +	asm(ALTERNATIVE("call __sw_parity32", POPCNT32 "; and $1, %0", X86_FEATURE_POPCNT)
> +		: "="REG_OUT (res)
> +		: REG_IN (w)
> +		: "cc");

So why all that churn instead of simply doing:

static __always_inline unsigned int __arch_parity32(unsigned int w)
{
	return hweight32(w) & 1;
}

Ditto for the 64-bit version.

-- 
Regards/Gruss,
    Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
-- 

^ permalink raw reply	[flat|nested] 84+ messages in thread

* [PATCH v2 25/30] input: use parity8 in pcips2
  2016-04-05  2:06 [PATCH V2 01/30] bitops: add parity functions Zeng Zhaoxiu
                   ` (23 preceding siblings ...)
  2016-04-06 10:11 ` [PATCH v2 24/30] ethernet: use parity8 in sun/niu.c zengzhaoxiu
@ 2016-04-06 10:14 ` zengzhaoxiu
  2016-04-06 10:15 ` [PATCH v2 26/30] input: use parity8 in sa1111ps2 zengzhaoxiu
                   ` (4 subsequent siblings)
  29 siblings, 0 replies; 84+ messages in thread
From: zengzhaoxiu @ 2016-04-06 10:14 UTC (permalink / raw)
  To: dmitry.torokhov; +Cc: linux-input, linux-kernel, Zhaoxiu Zeng

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
---
 drivers/input/serio/pcips2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/input/serio/pcips2.c b/drivers/input/serio/pcips2.c
index e862c6e..a51e7f0 100644
--- a/drivers/input/serio/pcips2.c
+++ b/drivers/input/serio/pcips2.c
@@ -77,7 +77,7 @@ static irqreturn_t pcips2_interrupt(int irq, void *devid)
 
 		flag = (status & PS2_STAT_PARITY) ? 0 : SERIO_PARITY;
 
-		if (hweight8(scancode) & 1)
+		if (parity8(scancode))
 			flag ^= SERIO_PARITY;
 
 		serio_interrupt(ps2if->io, scancode, flag);
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v2 26/30] input: use parity8 in sa1111ps2
  2016-04-05  2:06 [PATCH V2 01/30] bitops: add parity functions Zeng Zhaoxiu
                   ` (24 preceding siblings ...)
  2016-04-06 10:14 ` [PATCH v2 25/30] input: use parity8 in pcips2 zengzhaoxiu
@ 2016-04-06 10:15 ` zengzhaoxiu
  2016-04-06 10:16 ` [PATCH v2 27/30] iio: use parity32 in adxrs450 zengzhaoxiu
                   ` (3 subsequent siblings)
  29 siblings, 0 replies; 84+ messages in thread
From: zengzhaoxiu @ 2016-04-06 10:15 UTC (permalink / raw)
  To: dmitry.torokhov; +Cc: linux-input, linux-kernel, Zhaoxiu Zeng

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
---
 drivers/input/serio/sa1111ps2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/input/serio/sa1111ps2.c b/drivers/input/serio/sa1111ps2.c
index b3e6889..324b193 100644
--- a/drivers/input/serio/sa1111ps2.c
+++ b/drivers/input/serio/sa1111ps2.c
@@ -74,7 +74,7 @@ static irqreturn_t ps2_rxint(int irq, void *dev_id)
 
 		scancode = sa1111_readl(ps2if->base + PS2DATA) & 0xff;
 
-		if (hweight8(scancode) & 1)
+		if (parity8(scancode))
 			flag ^= SERIO_PARITY;
 
 		serio_interrupt(ps2if->io, scancode, flag);
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v2 27/30] iio: use parity32 in adxrs450
  2016-04-05  2:06 [PATCH V2 01/30] bitops: add parity functions Zeng Zhaoxiu
                   ` (25 preceding siblings ...)
  2016-04-06 10:15 ` [PATCH v2 26/30] input: use parity8 in sa1111ps2 zengzhaoxiu
@ 2016-04-06 10:16 ` zengzhaoxiu
  2016-04-10 14:37   ` Jonathan Cameron
  2016-04-06 10:18 ` [PATCH v2 28/30] serial: use parity32 in max3100 zengzhaoxiu
                   ` (2 subsequent siblings)
  29 siblings, 1 reply; 84+ messages in thread
From: zengzhaoxiu @ 2016-04-06 10:16 UTC (permalink / raw)
  To: lars, Michael.Hennerich, jic23, knaack.h, pmeerw
  Cc: linux-iio, linux-kernel, Zhaoxiu Zeng

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
---
 drivers/iio/gyro/adxrs450.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/iio/gyro/adxrs450.c b/drivers/iio/gyro/adxrs450.c
index a330d42..307f55b 100644
--- a/drivers/iio/gyro/adxrs450.c
+++ b/drivers/iio/gyro/adxrs450.c
@@ -109,7 +109,7 @@ static int adxrs450_spi_read_reg_16(struct iio_dev *indio_dev,
 	mutex_lock(&st->buf_lock);
 	tx = ADXRS450_READ_DATA | (reg_address << 17);
 
-	if (!(hweight32(tx) & 1))
+	if (!parity32(tx))
 		tx |= ADXRS450_P;
 
 	st->tx = cpu_to_be32(tx);
@@ -145,7 +145,7 @@ static int adxrs450_spi_write_reg_16(struct iio_dev *indio_dev,
 	mutex_lock(&st->buf_lock);
 	tx = ADXRS450_WRITE_DATA | (reg_address << 17) | (val << 1);
 
-	if (!(hweight32(tx) & 1))
+	if (!parity32(tx))
 		tx |= ADXRS450_P;
 
 	st->tx = cpu_to_be32(tx);
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v2 28/30] serial: use parity32 in max3100
  2016-04-05  2:06 [PATCH V2 01/30] bitops: add parity functions Zeng Zhaoxiu
                   ` (26 preceding siblings ...)
  2016-04-06 10:16 ` [PATCH v2 27/30] iio: use parity32 in adxrs450 zengzhaoxiu
@ 2016-04-06 10:18 ` zengzhaoxiu
  2016-04-06 10:25   ` Greg KH
  2016-04-06 10:20 ` [PATCH v2 29/30] input: use parity8 in elantech zengzhaoxiu
  2016-04-06 10:21 ` [PATCH v2 30/30] ethernet: use parity8 in broadcom/tg3.c zengzhaoxiu
  29 siblings, 1 reply; 84+ messages in thread
From: zengzhaoxiu @ 2016-04-06 10:18 UTC (permalink / raw)
  To: lars, gregkh, jslaby; +Cc: linux-serial, linux-kernel, Zhaoxiu Zeng

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
---
 drivers/tty/serial/max3100.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/tty/serial/max3100.c b/drivers/tty/serial/max3100.c
index 5c4c280..a0cc84a 100644
--- a/drivers/tty/serial/max3100.c
+++ b/drivers/tty/serial/max3100.c
@@ -155,7 +155,7 @@ static int max3100_do_parity(struct max3100_port *s, u16 c)
 	else
 		c &= 0xff;
 
-	parity = parity ^ (hweight8(c) & 1);
+	parity ^= parity8(c);
 	return parity;
 }
 
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v2 29/30] input: use parity8 in elantech
  2016-04-05  2:06 [PATCH V2 01/30] bitops: add parity functions Zeng Zhaoxiu
                   ` (27 preceding siblings ...)
  2016-04-06 10:18 ` [PATCH v2 28/30] serial: use parity32 in max3100 zengzhaoxiu
@ 2016-04-06 10:20 ` zengzhaoxiu
  2016-04-06 10:21 ` [PATCH v2 30/30] ethernet: use parity8 in broadcom/tg3.c zengzhaoxiu
  29 siblings, 0 replies; 84+ messages in thread
From: zengzhaoxiu @ 2016-04-06 10:20 UTC (permalink / raw)
  To: dmitry.torokhov, benjamin.tissoires, ulrik.debie-os, hdegoede,
	peter.hutterer, tiwai, dusonlin
  Cc: linux-input, linux-kernel, Zhaoxiu Zeng

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

Remove even parity table, use parity8 instead.
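
The removed table stored the complement of the bit parity: it was built
as parity[0] = 1, parity[i] = parity[i & (i - 1)] ^ 1, so parity[x] was
1 for even weight and 0 for odd weight, i.e. !parity8(x).  That is why
the checks flip from "== pN" to "!= pN".  A sketch of the equivalence,
for illustration only (not part of this patch):

	static unsigned char old_table_value(unsigned char x)
	{
		/* what etd->parity[x] used to contain */
		return !__builtin_parity(x);
	}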

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
---
 drivers/input/mouse/elantech.c | 10 +++-------
 drivers/input/mouse/elantech.h |  1 -
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/drivers/input/mouse/elantech.c b/drivers/input/mouse/elantech.c
index 78f93cf..bbb4aa6 100644
--- a/drivers/input/mouse/elantech.c
+++ b/drivers/input/mouse/elantech.c
@@ -693,9 +693,9 @@ static int elantech_packet_check_v1(struct psmouse *psmouse)
 
 	p3 = (packet[0] & 0x04) >> 2;
 
-	return etd->parity[packet[1]] == p1 &&
-	       etd->parity[packet[2]] == p2 &&
-	       etd->parity[packet[3]] == p3;
+	return parity8(packet[1]) != p1 &&
+	       parity8(packet[2]) != p2 &&
+	       parity8(packet[3]) != p3;
 }
 
 static int elantech_debounce_check_v2(struct psmouse *psmouse)
@@ -1635,10 +1635,6 @@ int elantech_init(struct psmouse *psmouse)
 
 	psmouse_reset(psmouse);
 
-	etd->parity[0] = 1;
-	for (i = 1; i < 256; i++)
-		etd->parity[i] = etd->parity[i & (i - 1)] ^ 1;
-
 	/*
 	 * Do the version query again so we can store the result
 	 */
diff --git a/drivers/input/mouse/elantech.h b/drivers/input/mouse/elantech.h
index e1cbf40..542c5d9 100644
--- a/drivers/input/mouse/elantech.h
+++ b/drivers/input/mouse/elantech.h
@@ -141,7 +141,6 @@ struct elantech_data {
 	unsigned int y_max;
 	unsigned int width;
 	struct finger_pos mt[ETP_MAX_FINGERS];
-	unsigned char parity[256];
 	int (*send_cmd)(struct psmouse *psmouse, unsigned char c, unsigned char *param);
 	void (*original_set_rate)(struct psmouse *psmouse, unsigned int rate);
 };
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v2 30/30] ethernet: use parity8 in broadcom/tg3.c
  2016-04-05  2:06 [PATCH V2 01/30] bitops: add parity functions Zeng Zhaoxiu
                   ` (28 preceding siblings ...)
  2016-04-06 10:20 ` [PATCH v2 29/30] input: use parity8 in elantech zengzhaoxiu
@ 2016-04-06 10:21 ` zengzhaoxiu
  29 siblings, 0 replies; 84+ messages in thread
From: zengzhaoxiu @ 2016-04-06 10:21 UTC (permalink / raw)
  To: siva.kallam, prashant, mchan; +Cc: netdev, linux-kernel, Zhaoxiu Zeng

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
---
 drivers/net/ethernet/broadcom/tg3.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index 3010080..802a429 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -12939,11 +12939,7 @@ static int tg3_test_nvram(struct tg3 *tp)
 
 		err = -EIO;
 		for (i = 0; i < NVRAM_SELFBOOT_DATA_SIZE; i++) {
-			u8 hw8 = hweight8(data[i]);
-
-			if ((hw8 & 0x1) && parity[i])
-				goto out;
-			else if (!(hw8 & 0x1) && !parity[i])
+			if (parity8(data[i]) == !!parity[i])
 				goto out;
 		}
 		err = 0;
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* [PATCH v2 06/30] Add mips-specific parity functions
  2016-04-06  8:59 ` [PATCH v2 06/30] Add mips-specific " zengzhaoxiu
@ 2016-04-06 10:23   ` zengzhaoxiu
  0 siblings, 0 replies; 84+ messages in thread
From: zengzhaoxiu @ 2016-04-06 10:23 UTC (permalink / raw)
  To: ralf, Leonid.Yegoshin, macro; +Cc: linux-mips, linux-kernel, Zhaoxiu Zeng

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

Lifted from arch_hweight.h

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
---
 arch/mips/include/asm/arch_parity.h | 43 +++++++++++++++++++++++++++++++++++++
 arch/mips/include/asm/bitops.h      |  3 +++
 2 files changed, 46 insertions(+)
 create mode 100644 arch/mips/include/asm/arch_parity.h

diff --git a/arch/mips/include/asm/arch_parity.h b/arch/mips/include/asm/arch_parity.h
new file mode 100644
index 0000000..23b3c23
--- /dev/null
+++ b/arch/mips/include/asm/arch_parity.h
@@ -0,0 +1,44 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ */
+#ifndef _ASM_ARCH_PARITY_H
+#define _ASM_ARCH_PARITY_H
+
+#ifdef ARCH_HAS_USABLE_BUILTIN_POPCOUNT
+
+#include <asm/types.h>
+
+static inline unsigned int __arch_parity32(unsigned int w)
+{
+	return __builtin_popcount(w) & 1;
+}
+
+static inline unsigned int __arch_parity16(unsigned int w)
+{
+	return __arch_parity32(w & 0xffff);
+}
+
+static inline unsigned int __arch_parity8(unsigned int w)
+{
+	return __arch_parity32(w & 0xff);
+}
+
+static inline unsigned int __arch_parity4(unsigned int w)
+{
+	return __arch_parity32(w & 0xf);
+}
+
+static inline unsigned int __arch_parity64(__u64 w)
+{
+	return (unsigned int)__builtin_popcountll(w) & 1;
+}
+
+#else
+#include <asm-generic/bitops/arch_parity.h>
+#endif
+
+#endif /* _ASM_ARCH_PARITY_H */
diff --git a/arch/mips/include/asm/bitops.h b/arch/mips/include/asm/bitops.h
index ce9666c..0b87734 100644
--- a/arch/mips/include/asm/bitops.h
+++ b/arch/mips/include/asm/bitops.h
@@ -626,6 +626,9 @@ static inline int ffs(int word)
 #include <asm/arch_hweight.h>
 #include <asm-generic/bitops/const_hweight.h>
 
+#include <asm/arch_parity.h>
+#include <asm-generic/bitops/const_parity.h>
+
 #include <asm-generic/bitops/le.h>
 #include <asm-generic/bitops/ext2-atomic.h>
 
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 28/30] serial: use parity32 in max3100
  2016-04-06 10:18 ` [PATCH v2 28/30] serial: use parity32 in max3100 zengzhaoxiu
@ 2016-04-06 10:25   ` Greg KH
  0 siblings, 0 replies; 84+ messages in thread
From: Greg KH @ 2016-04-06 10:25 UTC (permalink / raw)
  To: zengzhaoxiu; +Cc: lars, jslaby, linux-serial, linux-kernel, Zhaoxiu Zeng

On Wed, Apr 06, 2016 at 06:18:42PM +0800, zengzhaoxiu@163.com wrote:
> From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
> 
> Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
> ---
>  drivers/tty/serial/max3100.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)

I can't take patches without any changelog text :(

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 10/30] Add x86-specific parity functions
  2016-04-06 10:13   ` Borislav Petkov
@ 2016-04-06 10:37     ` One Thousand Gnomes
  2016-04-06 10:53       ` Borislav Petkov
  2016-04-11  2:43       ` Zeng Zhaoxiu
  2016-04-07  3:55     ` Zeng Zhaoxiu
  1 sibling, 2 replies; 84+ messages in thread
From: One Thousand Gnomes @ 2016-04-06 10:37 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: zengzhaoxiu, tglx, mingo, hpa, dvlasenk, akpm, dvyukov, keescook,
	linux-kernel, Zhaoxiu Zeng

On Wed, 6 Apr 2016 12:13:00 +0200
Borislav Petkov <bp@suse.de> wrote:

> On Wed, Apr 06, 2016 at 05:14:45PM +0800, zengzhaoxiu@163.com wrote:
> > From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
> > 
> > Use alternatives, lifted from arch_hweight
> > 
> > Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
> > ---
> >  arch/x86/include/asm/arch_hweight.h |   5 ++
> >  arch/x86/include/asm/arch_parity.h  | 102 ++++++++++++++++++++++++++++++++++++
> >  arch/x86/include/asm/bitops.h       |   4 +-
> >  arch/x86/lib/Makefile               |   8 +++
> >  arch/x86/lib/parity.c               |  32 ++++++++++++
> >  5 files changed, 150 insertions(+), 1 deletion(-)
> >  create mode 100644 arch/x86/include/asm/arch_parity.h
> >  create mode 100644 arch/x86/lib/parity.c  
> 
> ...
> 
> > +static __always_inline unsigned int __arch_parity32(unsigned int w)
> > +{
> > +	unsigned int res;
> > +
> > +	asm(ALTERNATIVE("call __sw_parity32", POPCNT32 "; and $1, %0", X86_FEATURE_POPCNT)
> > +		: "="REG_OUT (res)
> > +		: REG_IN (w)
> > +		: "cc");  
> 
> So why all that churn instead of simply doing:
> 
> static __always_inline unsigned int __arch_parity32(unsigned int w)
> {
> 	return hweight32(w) & 1;
> }
> 
> Ditto for the 64-bit version.

Even that would still be wrong for the smaller parity values. The CPU
supports 8bit parity directly going back to the 8086 so the
implementation for 8bit and I think 16bit is still wrong.

Alan

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 10/30] Add x86-specific parity functions
  2016-04-06 10:37     ` One Thousand Gnomes
@ 2016-04-06 10:53       ` Borislav Petkov
  2016-04-07  3:55         ` Zeng Zhaoxiu
  2016-04-11  2:43       ` Zeng Zhaoxiu
  1 sibling, 1 reply; 84+ messages in thread
From: Borislav Petkov @ 2016-04-06 10:53 UTC (permalink / raw)
  To: One Thousand Gnomes
  Cc: zengzhaoxiu, tglx, mingo, hpa, dvlasenk, akpm, dvyukov, keescook,
	linux-kernel, Zhaoxiu Zeng

On Wed, Apr 06, 2016 at 11:37:37AM +0100, One Thousand Gnomes wrote:
> Even that would still be wrong for the smaller parity values. The CPU
> supports 8bit parity directly going back to the 8086 so the
> implementation for 8bit and I think 16bit is still wrong.

I was objecting to the unnecessary replication of the hweight/popcnt
glue.

And yes, one could look up the definition of the parity flag on x86 and
then base the implementation of all those smaller ones on that as the
hardware does it for one practically for free there.

:-)

-- 
Regards/Gruss,
    Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
-- 

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 09/30] Add tile-specific parity functions
  2016-04-06  9:08 ` [PATCH v2 09/30] Add tile-specific " zengzhaoxiu
@ 2016-04-06 13:27   ` Chris Metcalf
  2016-04-07  3:55     ` Zeng Zhaoxiu
  0 siblings, 1 reply; 84+ messages in thread
From: Chris Metcalf @ 2016-04-06 13:27 UTC (permalink / raw)
  To: zengzhaoxiu; +Cc: linux-kernel, Zhaoxiu Zeng

On 4/6/2016 5:08 AM, zengzhaoxiu@163.com wrote:
> From: Zhaoxiu Zeng<zhaoxiu.zeng@gmail.com>
>
> Signed-off-by: Zhaoxiu Zeng<zhaoxiu.zeng@gmail.com>
> ---
>   arch/tile/include/asm/bitops.h | 26 ++++++++++++++++++++++++++
>   1 file changed, 26 insertions(+)

Since all the code you are adding here is architecture-independent,
I think it would make more sense to have it be in a file like
include/asm-generic/bitops/parity-popcount.h, which can then
be included from arch/tile/include/asm/bitops.h.
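
For illustration only, such a shared header might look roughly like the
sketch below. The file name follows the suggestion above, and basing the
helpers on the generic hweight*() functions is an assumption rather than
code from this series:

	/* include/asm-generic/bitops/parity-popcount.h (hypothetical sketch) */
	static inline unsigned int __arch_parity32(unsigned int w)
	{
		/* parity is the low bit of the population count */
		return hweight32(w) & 1;
	}

	static inline unsigned int __arch_parity64(__u64 w)
	{
		return (unsigned int)(hweight64(w) & 1);
	}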

-- 
Chris Metcalf, Mellanox Technologies
http://www.mellanox.com

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 08/30] Add sparc-specific parity functions
  2016-04-06  9:07 ` [PATCH v2 08/30] Add sparc-specific " zengzhaoxiu
@ 2016-04-06 18:44   ` Sam Ravnborg
  2016-04-07  3:56     ` Zeng Zhaoxiu
  0 siblings, 1 reply; 84+ messages in thread
From: Sam Ravnborg @ 2016-04-06 18:44 UTC (permalink / raw)
  To: zengzhaoxiu
  Cc: davem, wim.coekaerts, linux, julian.calaby, sparclinux,
	linux-kernel, Zhaoxiu Zeng

Hi Zeng.

> 
> Use runtime patching for sparc64, lifted from hweight
No errors found in patch - but a few comments.
In general patch looks good.

> +++ b/arch/sparc/include/asm/bitops_64.h
> @@ -47,6 +47,24 @@ unsigned int __arch_hweight16(unsigned int w);
>  unsigned int __arch_hweight8(unsigned int w);
>  
>  #include <asm-generic/bitops/const_hweight.h>
> +
> +/*
> + * parityN: returns the parity of a N-bit word,
> + * i.e. the number of 1-bits in w modulo 2.
> + */
> +
> +static inline unsigned int __arch_parity4(unsigned int w)
> +{
> +	w &= 0xf;
> +	return (0x6996 >> w) & 1;
> +}
As Josef already said - this constant should have a name.
PARITY_BIT
?
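
For reference (an editorial illustration, not part of the patch): 0x6996
works as a packed 16-entry lookup table, where bit n of 0x6996 is the odd
parity of the 4-bit value n. A small user-space check using GCC's
__builtin_parity shows the property:

	#include <assert.h>

	int main(void)
	{
		unsigned int n;

		/* bit n of 0x6996 equals the parity of the 4-bit value n */
		for (n = 0; n < 16; n++)
			assert(((0x6996u >> n) & 1) ==
			       (unsigned int)__builtin_parity(n));
		return 0;
	}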

> +++ b/arch/sparc/kernel/sparc_ksyms_64.c
> @@ -45,6 +45,12 @@ EXPORT_SYMBOL(__arch_hweight16);
>  EXPORT_SYMBOL(__arch_hweight32);
>  EXPORT_SYMBOL(__arch_hweight64);
>  
> +/* from parity.S */
> +EXPORT_SYMBOL(__arch_parity8);
> +EXPORT_SYMBOL(__arch_parity16);
> +EXPORT_SYMBOL(__arch_parity32);
> +EXPORT_SYMBOL(__arch_parity64);

Did you compile this?
I wonder if bitops_64.h is indirectly included.

> index 0000000..b1945e3
> --- /dev/null
> +++ b/arch/sparc/lib/parity.S
> @@ -0,0 +1,93 @@
> +#include <linux/linkage.h>
> +
> +	.text
> +	.align	32
> +
> +ENTRY(__arch_parity8)
> +	srl		%o0, 4, %g1
> +	xor		%o0, %g1, %o0
> +	and		%o0, 0xf, %o0
> +	sethi		%hi(0x6996), %g1
> +	or		%g1, %lo(0x6996), %g1
> +	srl		%g1, %o0, %o0
> +	retl
> +	 and		%o0, 1, %o0
> +ENDPROC(__arch_parity8)

I know the level of comments in hweight is equal to none.
But please do not follow this bad example.
At least add, for each function, a one-liner of the equivalent C code.
And at the top of the file maybe two lines noting that the functions
are patched at run-time if the processor has popc available.
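
For instance, such a per-function one-liner (an illustration only, not taken
from the patch) could look like:

	/*
	 * __arch_parity8 - C equivalent:
	 *	w ^= w >> 4;
	 *	return (0x6996 >> (w & 0xf)) & 1;
	 */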


	Sam

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 10/30] Add x86-specific parity functions
  2016-04-06  9:14 ` [PATCH v2 10/30] Add x86-specific " zengzhaoxiu
  2016-04-06 10:13   ` Borislav Petkov
@ 2016-04-06 19:45   ` Andi Kleen
  2016-04-07  3:56     ` Zeng Zhaoxiu
                       ` (2 more replies)
  1 sibling, 3 replies; 84+ messages in thread
From: Andi Kleen @ 2016-04-06 19:45 UTC (permalink / raw)
  To: zengzhaoxiu
  Cc: tglx, mingo, hpa, dvlasenk, bp, akpm, dvyukov, keescook,
	linux-kernel, Zhaoxiu Zeng

zengzhaoxiu@163.com writes:

> From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
>
> Use alternatives, lifted from arch_hweight

Is there actually anything performance critical in the kernel that uses
parity?

FWIW the arch hweight custom calling convention is a problem for LTO
because it needs different special flags, so I usually have to disable
it. Likely other reasonable usages, such as automatic source code
analysis, and other tool chain based usages have similar problems.

As far as I can tell both for hweight and likely for parity it is
badly overengineering and normal calling conventions would work as well,
and cause much less problems.

So if parity is really worth adding here (which I find doubtful,
but you may have numbers), please add it without these magic
calling hacks.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 10/30] Add x86-specific parity functions
  2016-04-06 10:13   ` Borislav Petkov
  2016-04-06 10:37     ` One Thousand Gnomes
@ 2016-04-07  3:55     ` Zeng Zhaoxiu
  2016-04-07  9:41       ` Borislav Petkov
  1 sibling, 1 reply; 84+ messages in thread
From: Zeng Zhaoxiu @ 2016-04-07  3:55 UTC (permalink / raw)
  To: Borislav Petkov, zengzhaoxiu
  Cc: tglx, mingo, hpa, dvlasenk, akpm, dvyukov, keescook, linux-kernel

On 2016-04-06 18:13, Borislav Petkov wrote:
> On Wed, Apr 06, 2016 at 05:14:45PM +0800, zengzhaoxiu@163.com wrote:
>> From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
>>
>> Use alternatives, lifted from arch_hweight
>>
>> Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
>> ---
>>   arch/x86/include/asm/arch_hweight.h |   5 ++
>>   arch/x86/include/asm/arch_parity.h  | 102 ++++++++++++++++++++++++++++++++++++
>>   arch/x86/include/asm/bitops.h       |   4 +-
>>   arch/x86/lib/Makefile               |   8 +++
>>   arch/x86/lib/parity.c               |  32 ++++++++++++
>>   5 files changed, 150 insertions(+), 1 deletion(-)
>>   create mode 100644 arch/x86/include/asm/arch_parity.h
>>   create mode 100644 arch/x86/lib/parity.c
> ...
>
>> +static __always_inline unsigned int __arch_parity32(unsigned int w)
>> +{
>> +	unsigned int res;
>> +
>> +	asm(ALTERNATIVE("call __sw_parity32", POPCNT32 "; and $1, %0", X86_FEATURE_POPCNT)
>> +		: "="REG_OUT (res)
>> +		: REG_IN (w)
>> +		: "cc");
> So why all that churn instead of simply doing:
>
> static __always_inline unsigned int __arch_parity32(unsigned int w)
> {
> 	return hweight32(w) & 1;
> }
>
> Ditto for the 64-bit version.
>

__sw_parity32 is faster than __sw_hweight32.
I don't know how many CPUs do not support popcnt; if they are all outdated,
using __arch_hweight32 is the easiest way.

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 10/30] Add x86-specific parity functions
  2016-04-06 10:53       ` Borislav Petkov
@ 2016-04-07  3:55         ` Zeng Zhaoxiu
  2016-04-07  9:39           ` Borislav Petkov
  0 siblings, 1 reply; 84+ messages in thread
From: Zeng Zhaoxiu @ 2016-04-07  3:55 UTC (permalink / raw)
  To: Borislav Petkov, One Thousand Gnomes
  Cc: zengzhaoxiu, tglx, mingo, hpa, dvlasenk, akpm, dvyukov, keescook,
	linux-kernel

On 2016-04-06 18:53, Borislav Petkov wrote:
> On Wed, Apr 06, 2016 at 11:37:37AM +0100, One Thousand Gnomes wrote:
>> Even that would still be wrong for the smaller parity values. The CPU
>> supports 8bit parity directly going back to the 8086 so the
>> implementation for 8bit and I think 16bit is still wrong.
> I was objecting to the unnecessary replication of the hweight/popcnt
> glue.
>
> And yes, one could look up the definition of the parity flag on x86 and
> then base the implementation of all those smaller ones on that as the
> hardware does it for one practically for free there.
>
> :-)
>

SETcc (SETPO etc.) added since 80386, is this a problem?

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 09/30] Add tile-specific parity functions
  2016-04-06 13:27   ` Chris Metcalf
@ 2016-04-07  3:55     ` Zeng Zhaoxiu
  0 siblings, 0 replies; 84+ messages in thread
From: Zeng Zhaoxiu @ 2016-04-07  3:55 UTC (permalink / raw)
  To: Chris Metcalf, zengzhaoxiu; +Cc: linux-kernel

On 2016-04-06 21:27, Chris Metcalf wrote:
> On 4/6/2016 5:08 AM, zengzhaoxiu@163.com wrote:
>> From: Zhaoxiu Zeng<zhaoxiu.zeng@gmail.com>
>>
>> Signed-off-by: Zhaoxiu Zeng<zhaoxiu.zeng@gmail.com>
>> ---
>>   arch/tile/include/asm/bitops.h | 26 ++++++++++++++++++++++++++
>>   1 file changed, 26 insertions(+)
>
> Since all the code you are adding here is architecture-independent,
> I think it would make more sense to have it be in a file like
> include/asm-generic/bitops/parity-popcount.h, which can then
> be included from arch/tile/include/asm/bitops.h.
>

Agreed, thanks!

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 08/30] Add sparc-specific parity functions
  2016-04-06 18:44   ` Sam Ravnborg
@ 2016-04-07  3:56     ` Zeng Zhaoxiu
  0 siblings, 0 replies; 84+ messages in thread
From: Zeng Zhaoxiu @ 2016-04-07  3:56 UTC (permalink / raw)
  To: Sam Ravnborg, zengzhaoxiu
  Cc: davem, wim.coekaerts, linux, julian.calaby, sparclinux, linux-kernel

On 2016-04-07 02:44, Sam Ravnborg wrote:
> Hi Zeng.
>
>> Use runtime patching for sparc64, lifted from hweight
> No errors found in patch - but a few comments.
> In general patch looks good.

Thanks. Sparc, powerpc, and x86 are all new to me.

>> +++ b/arch/sparc/include/asm/bitops_64.h
>> @@ -47,6 +47,24 @@ unsigned int __arch_hweight16(unsigned int w);
>>   unsigned int __arch_hweight8(unsigned int w);
>>   
>>   #include <asm-generic/bitops/const_hweight.h>
>> +
>> +/*
>> + * parityN: returns the parity of a N-bit word,
>> + * i.e. the number of 1-bits in w modulo 2.
>> + */
>> +
>> +static inline unsigned int __arch_parity4(unsigned int w)
>> +{
>> +	w &= 0xf;
>> +	return (0x6996 >> w) & 1;
>> +}
> As Josef already said - this constant should have a name.
> PARITY_BIT
> ?
>

Maybe PARITY_MAGIC?

>> +++ b/arch/sparc/kernel/sparc_ksyms_64.c
>> @@ -45,6 +45,12 @@ EXPORT_SYMBOL(__arch_hweight16);
>>   EXPORT_SYMBOL(__arch_hweight32);
>>   EXPORT_SYMBOL(__arch_hweight64);
>>   
>> +/* from parity.S */
>> +EXPORT_SYMBOL(__arch_parity8);
>> +EXPORT_SYMBOL(__arch_parity16);
>> +EXPORT_SYMBOL(__arch_parity32);
>> +EXPORT_SYMBOL(__arch_parity64);
> Did you compile this?
> I wonder if bitops_64.h is indirectly included.

Yes.

>> index 0000000..b1945e3
>> --- /dev/null
>> +++ b/arch/sparc/lib/parity.S
>> @@ -0,0 +1,93 @@
>> +#include <linux/linkage.h>
>> +
>> +	.text
>> +	.align	32
>> +
>> +ENTRY(__arch_parity8)
>> +	srl		%o0, 4, %g1
>> +	xor		%o0, %g1, %o0
>> +	and		%o0, 0xf, %o0
>> +	sethi		%hi(0x6996), %g1
>> +	or		%g1, %lo(0x6996), %g1
>> +	srl		%g1, %o0, %o0
>> +	retl
>> +	 and		%o0, 1, %o0
>> +ENDPROC(__arch_parity8)
> I know the level of comments in hweight is equal to none.
> But please do not follow this bad example.
> At least for each function a one-liner of the C code.
> And in the top of the file maybe two lines that the functions
> are patched at run-time if the processor has popc available.
>
>
> 	Sam

OK. I will try, but my English is very poor! :-)

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 10/30] Add x86-specific parity functions
  2016-04-06 19:45   ` Andi Kleen
@ 2016-04-07  3:56     ` Zeng Zhaoxiu
  2016-04-07  6:31     ` Dmitry Vyukov
  2016-04-07 14:10     ` [PATCH v2 10/30] Add x86-specific parity functions One Thousand Gnomes
  2 siblings, 0 replies; 84+ messages in thread
From: Zeng Zhaoxiu @ 2016-04-07  3:56 UTC (permalink / raw)
  To: Andi Kleen, zengzhaoxiu
  Cc: tglx, mingo, hpa, dvlasenk, bp, akpm, dvyukov, keescook, linux-kernel

On 2016-04-07 03:45, Andi Kleen wrote:
> zengzhaoxiu@163.com writes:
>
>> From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
>>
>> Use alternatives, lifted from arch_hweight
> Is there actually anything performance critical in the kernel that uses
> parity?
>
> FWIW the arch hweight custom calling convention is a problem for LTO
> because it needs different special flags, so I usually have to disable
> it. Likely other reasonable usages, such as automatic source code
> analysis, and other tool chain based usages have similar problems.
>
> As far as I can tell both for hweight and likely for parity it is
> badly overengineering and normal calling conventions would work as well,
> and cause much less problems.
>
> So if parity is really worth adding here (which I find doubtful,
> but you may have numbers), please add it without these magic
> calling hacks.
>
> -Andi
>

Thanks. I will instead use __arch_hweight.

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 10/30] Add x86-specific parity functions
  2016-04-06 19:45   ` Andi Kleen
  2016-04-07  3:56     ` Zeng Zhaoxiu
@ 2016-04-07  6:31     ` Dmitry Vyukov
  2016-04-07  9:43       ` Borislav Petkov
  2016-04-07 14:10     ` [PATCH v2 10/30] Add x86-specific parity functions One Thousand Gnomes
  2 siblings, 1 reply; 84+ messages in thread
From: Dmitry Vyukov @ 2016-04-07  6:31 UTC (permalink / raw)
  To: Andi Kleen
  Cc: zengzhaoxiu, Thomas Gleixner, Ingo Molnar, H. Peter Anvin,
	Denys Vlasenko, bp, Andrew Morton, Kees Cook, LKML, Zhaoxiu Zeng

On Wed, Apr 6, 2016 at 9:45 PM, Andi Kleen <andi@firstfloor.org> wrote:
> zengzhaoxiu@163.com writes:
>
>> From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
>>
>> Use alternatives, lifted from arch_hweight
>
> Is there actually anything performance critical in the kernel that uses
> parity?
>
> FWIW the arch hweight custom calling convention is a problem for LTO
> because it needs different special flags, so I usually have to disable
> it. Likely other reasonable usages, such as automatic source code
> analysis, and other tool chain based usages have similar problems.
>
> As far as I can tell both for hweight and likely for parity it is
> badly overengineering and normal calling conventions would work as well,
> and cause much less problems.
>
> So if parity is really worth adding here (which I find doubtful,
> but you may have numbers), please add it without these magic
> calling hacks.


Hweight custom calling convention caused crashes with KCOV coverage.
We had to disable instrumentation of the file.

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 10/30] Add x86-specific parity functions
  2016-04-07  3:55         ` Zeng Zhaoxiu
@ 2016-04-07  9:39           ` Borislav Petkov
  0 siblings, 0 replies; 84+ messages in thread
From: Borislav Petkov @ 2016-04-07  9:39 UTC (permalink / raw)
  To: Zeng Zhaoxiu
  Cc: One Thousand Gnomes, zengzhaoxiu, tglx, mingo, hpa, dvlasenk,
	akpm, dvyukov, keescook, linux-kernel

On Thu, Apr 07, 2016 at 11:55:51AM +0800, Zeng Zhaoxiu wrote:
> SETcc (SETPO etc.) added since 80386, is this a problem?

Sounds to me you didn't make the effort to look up the definition of the
Parity Flag...

-- 
Regards/Gruss,
    Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
-- 

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 10/30] Add x86-specific parity functions
  2016-04-07  3:55     ` Zeng Zhaoxiu
@ 2016-04-07  9:41       ` Borislav Petkov
  0 siblings, 0 replies; 84+ messages in thread
From: Borislav Petkov @ 2016-04-07  9:41 UTC (permalink / raw)
  To: Zeng Zhaoxiu
  Cc: zengzhaoxiu, tglx, mingo, hpa, dvlasenk, akpm, dvyukov, keescook,
	linux-kernel

On Thu, Apr 07, 2016 at 11:55:22AM +0800, Zeng Zhaoxiu wrote:
> __sw_parity32 is faster than __sw_hweight32.
> I don't know how many CPUs do not support the popc, if they are outdated,
> use __arch_hweight32 is the easiest way.

I don't really understand what you're trying to tell me here. And it's
not like I didn't try.

-- 
Regards/Gruss,
    Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
-- 

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 10/30] Add x86-specific parity functions
  2016-04-07  6:31     ` Dmitry Vyukov
@ 2016-04-07  9:43       ` Borislav Petkov
  2016-05-04 18:46         ` [RFC PATCH] x86/hweight: Get rid of the special calling convention Borislav Petkov
  0 siblings, 1 reply; 84+ messages in thread
From: Borislav Petkov @ 2016-04-07  9:43 UTC (permalink / raw)
  To: Dmitry Vyukov
  Cc: Andi Kleen, zengzhaoxiu, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, Denys Vlasenko, Andrew Morton, Kees Cook, LKML,
	Zhaoxiu Zeng

On Thu, Apr 07, 2016 at 08:31:09AM +0200, Dmitry Vyukov wrote:
> Hweight custom calling convention caused crashes with KCOV coverage.
> We had to disable instrumentation of the file.

I guess we can do something like this:

       if (likely(static_cpu_has(X86_FEATURE_POPCNT)))
               asm volatile(POPCNT32
                            : "="REG_OUT (res)
                            : REG_IN (w));
       else
               res = __sw_hweight32(w);

and get rid of the custom calling convention.

Along with some numbers showing that the change doesn't cause any
noticeable slowdown...
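
For illustration, the same pattern applied to the parity helpers discussed
in this thread might look like the sketch below. This assumes the
__sw_parity32() fallback from this series; it is a sketch, not code from the
patch:

	static __always_inline unsigned int __arch_parity32(unsigned int w)
	{
		unsigned int res;

		if (likely(static_cpu_has(X86_FEATURE_POPCNT))) {
			/* the low bit of the population count is the parity */
			asm volatile("popcnt %1, %0" : "=r" (res) : "r" (w));
			return res & 1;
		}
		return __sw_parity32(w);
	}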

-- 
Regards/Gruss,
    Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
-- 

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 10/30] Add x86-specific parity functions
  2016-04-06 19:45   ` Andi Kleen
  2016-04-07  3:56     ` Zeng Zhaoxiu
  2016-04-07  6:31     ` Dmitry Vyukov
@ 2016-04-07 14:10     ` One Thousand Gnomes
  2 siblings, 0 replies; 84+ messages in thread
From: One Thousand Gnomes @ 2016-04-07 14:10 UTC (permalink / raw)
  To: Andi Kleen
  Cc: zengzhaoxiu, tglx, mingo, hpa, dvlasenk, bp, akpm, dvyukov,
	keescook, linux-kernel, Zhaoxiu Zeng

On Wed, 06 Apr 2016 12:45:27 -0700
Andi Kleen <andi@firstfloor.org> wrote:

> zengzhaoxiu@163.com writes:
> 
> > From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
> >
> > Use alternatives, lifted from arch_hweight  
> 
> Is there actually anything performance critical in the kernel that uses
> parity?

On very low end devices some of the flash code does, but the implementation
here is ironically pretty much worst-case for such x86 devices 8)

Alan

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 27/30] iio: use parity32 in adxrs450
  2016-04-06 10:16 ` [PATCH v2 27/30] iio: use parity32 in adxrs450 zengzhaoxiu
@ 2016-04-10 14:37   ` Jonathan Cameron
  2016-04-10 14:41     ` Lars-Peter Clausen
  0 siblings, 1 reply; 84+ messages in thread
From: Jonathan Cameron @ 2016-04-10 14:37 UTC (permalink / raw)
  To: zengzhaoxiu, lars, Michael.Hennerich, knaack.h, pmeerw
  Cc: linux-iio, linux-kernel, Zhaoxiu Zeng

On 06/04/16 11:16, zengzhaoxiu@163.com wrote:
> From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
> 
> Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
Applied to the togreg branch of iio.git as you seem to have addressed
Lars' comments.

Thanks,

Jonathan
> ---
>  drivers/iio/gyro/adxrs450.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/iio/gyro/adxrs450.c b/drivers/iio/gyro/adxrs450.c
> index a330d42..307f55b 100644
> --- a/drivers/iio/gyro/adxrs450.c
> +++ b/drivers/iio/gyro/adxrs450.c
> @@ -109,7 +109,7 @@ static int adxrs450_spi_read_reg_16(struct iio_dev *indio_dev,
>  	mutex_lock(&st->buf_lock);
>  	tx = ADXRS450_READ_DATA | (reg_address << 17);
>  
> -	if (!(hweight32(tx) & 1))
> +	if (!parity32(tx))
>  		tx |= ADXRS450_P;
>  
>  	st->tx = cpu_to_be32(tx);
> @@ -145,7 +145,7 @@ static int adxrs450_spi_write_reg_16(struct iio_dev *indio_dev,
>  	mutex_lock(&st->buf_lock);
>  	tx = ADXRS450_WRITE_DATA | (reg_address << 17) | (val << 1);
>  
> -	if (!(hweight32(tx) & 1))
> +	if (!parity32(tx))
>  		tx |= ADXRS450_P;
>  
>  	st->tx = cpu_to_be32(tx);
> 

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 27/30] iio: use parity32 in adxrs450
  2016-04-10 14:37   ` Jonathan Cameron
@ 2016-04-10 14:41     ` Lars-Peter Clausen
  2016-04-10 15:13       ` Jonathan Cameron
  0 siblings, 1 reply; 84+ messages in thread
From: Lars-Peter Clausen @ 2016-04-10 14:41 UTC (permalink / raw)
  To: Jonathan Cameron, zengzhaoxiu, Michael.Hennerich, knaack.h, pmeerw
  Cc: linux-iio, linux-kernel, Zhaoxiu Zeng

On 04/10/2016 04:37 PM, Jonathan Cameron wrote:
> On 06/04/16 11:16, zengzhaoxiu@163.com wrote:
>> From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
>>
>> Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
> Applied to the togreg branch of iio.git as you seem to have addressed
> Lars' comments.

The whole series needs to go through the same tree since the new functions
are introduced in the beginning.

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 27/30] iio: use parity32 in adxrs450
  2016-04-10 14:41     ` Lars-Peter Clausen
@ 2016-04-10 15:13       ` Jonathan Cameron
  2016-04-10 15:14         ` Jonathan Cameron
  0 siblings, 1 reply; 84+ messages in thread
From: Jonathan Cameron @ 2016-04-10 15:13 UTC (permalink / raw)
  To: Lars-Peter Clausen, zengzhaoxiu, Michael.Hennerich, knaack.h, pmeerw
  Cc: linux-iio, linux-kernel, Zhaoxiu Zeng

On 10/04/16 15:41, Lars-Peter Clausen wrote:
> On 04/10/2016 04:37 PM, Jonathan Cameron wrote:
>> On 06/04/16 11:16, zengzhaoxiu@163.com wrote:
>>> From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
>>>
>>> Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
>> Applied to the togreg branch of iio.git as you seem to have addressed
>> Lars' comments.
> 
> The whole series needs to go through the same tree since the new functions
> are introduced in the beginning.
Yeah, I just realized that when my build test failed!

Acked-by: Jonathan Cameron <jic23@kernel.org>

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 27/30] iio: use parity32 in adxrs450
  2016-04-10 15:13       ` Jonathan Cameron
@ 2016-04-10 15:14         ` Jonathan Cameron
  0 siblings, 0 replies; 84+ messages in thread
From: Jonathan Cameron @ 2016-04-10 15:14 UTC (permalink / raw)
  To: Lars-Peter Clausen, zengzhaoxiu, Michael.Hennerich, knaack.h, pmeerw
  Cc: linux-iio, linux-kernel, Zhaoxiu Zeng

On 10/04/16 16:13, Jonathan Cameron wrote:
> On 10/04/16 15:41, Lars-Peter Clausen wrote:
>> On 04/10/2016 04:37 PM, Jonathan Cameron wrote:
>>> On 06/04/16 11:16, zengzhaoxiu@163.com wrote:
>>>> From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
>>>>
>>>> Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
>>> Applied to the togreg branch of iio.git as you seem to have addressed
>>> Lars' comments.
>>
>> The whole series needs to go through the same tree since the new functions
>> are introduced in the beginning.
> Yeah, I just realized that when my build test failed!
> 
> Acked-by: Jonathan Cameron <jic23@kernel.org>
Backed out of the togreg branch of iio.git...

oops and thanks Lars!

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 10/30] Add x86-specific parity functions
  2016-04-06 10:37     ` One Thousand Gnomes
  2016-04-06 10:53       ` Borislav Petkov
@ 2016-04-11  2:43       ` Zeng Zhaoxiu
  2016-04-15  2:11         ` Borislav Petkov
  1 sibling, 1 reply; 84+ messages in thread
From: Zeng Zhaoxiu @ 2016-04-11  2:43 UTC (permalink / raw)
  To: One Thousand Gnomes, Borislav Petkov
  Cc: zengzhaoxiu, tglx, mingo, hpa, dvlasenk, akpm, dvyukov, keescook,
	linux-kernel

On 2016-04-06 18:37, One Thousand Gnomes wrote:
> Even that would still be wrong for the smaller parity values. The CPU
> supports 8bit parity directly going back to the 8086 so the
> implementation for 8bit and I think 16bit is still wrong.
>
> Alan

I don't know where the errors are. x86 is new to me.

The definition of the parity flag on x86 from Wikipedia:

In x86 processors, the parity flag reflects the parity only of the least significant
byte of the result, and is set if the number of set bits of ones is even. According to
80386 Intel manual, the parity flag is changed in the x86 processor family by the
following instructions:
     All arithmetic instructions;
     Compare instruction (equivalent to a subtract instruction without storing the result);
     Logical instructions - XOR, AND, OR;
     the TEST instruction (equivalent to the AND instruction without storing the result).
     the POPF instruction

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH v2 10/30] Add x86-specific parity functions
  2016-04-11  2:43       ` Zeng Zhaoxiu
@ 2016-04-15  2:11         ` Borislav Petkov
  0 siblings, 0 replies; 84+ messages in thread
From: Borislav Petkov @ 2016-04-15  2:11 UTC (permalink / raw)
  To: Zeng Zhaoxiu
  Cc: One Thousand Gnomes, zengzhaoxiu, tglx, mingo, hpa, dvlasenk,
	akpm, dvyukov, keescook, linux-kernel

On Mon, Apr 11, 2016 at 10:43:32AM +0800, Zeng Zhaoxiu wrote:
> I don't know where the errors. X86 is new to me.

Ok, let me try again by pasting here the relevant text from the manual:

"Parity Flag (PF). Bit 2. Hardware sets the parity flag to 1 if there
is an even number of 1 bits in the least-significant byte of the last
result of certain operations. Otherwise (i.e., for an odd number of 1
bits), hardware clears the flag to 0. Software can read the flag to
implement parity checking."

-- 
Regards/Gruss,
    Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
-- 

^ permalink raw reply	[flat|nested] 84+ messages in thread

* [RFC PATCH] x86/hweight: Get rid of the special calling convention
  2016-04-07  9:43       ` Borislav Petkov
@ 2016-05-04 18:46         ` Borislav Petkov
  2016-05-04 19:31           ` Brian Gerst
  0 siblings, 1 reply; 84+ messages in thread
From: Borislav Petkov @ 2016-05-04 18:46 UTC (permalink / raw)
  To: LKML
  Cc: Dmitry Vyukov, Andi Kleen, zengzhaoxiu, Thomas Gleixner,
	Ingo Molnar, H. Peter Anvin, Denys Vlasenko, Andrew Morton,
	Kees Cook, Zhaoxiu Zeng, Andy Lutomirski, Peter Zijlstra

On Thu, Apr 07, 2016 at 11:43:33AM +0200, Borislav Petkov wrote:
> I guess we can do something like this:
> 
>        if (likely(static_cpu_has(X86_FEATURE_POPCNT)))
>                asm volatile(POPCNT32
>                             : "="REG_OUT (res)
>                             : REG_IN (w));
>        else
>                res = __sw_hweight32(w);
> 
> and get rid of the custom calling convention.
> 
> Along with some numbers showing that the change doesn't cause any
> noticeable slowdown...

Ok, here's something which seems to build and boot in kvm.

I like how we don't need the special calling conventions anymore and we
can actually say "popcnt .." and gcc selects registers.

The include files hackery is kinda nasty but I had to do it because I
needed to be able to use static_cpu_has() in a header and including
asm/cpufeature.h pulls in all kinds of nasty dependencies. I'm certainly
open for better ideas...

---
From: Borislav Petkov <bp@suse.de>
Date: Wed, 4 May 2016 18:52:09 +0200
Subject: [PATCH] x86/hweight: Get rid of the special calling convention

People complained about ARCH_HWEIGHT_CFLAGS and how it throws a wrench
into kcov, lto, etc, experimentation.

And it's not like we absolutely need it, so let's get rid of it and
streamline it a bit. I had to do some carving out of facilities so
that the include hell doesn't swallow me but other than that, the new
__arch_hweight*() versions look much more palatable and gcc is more free
to select registers than us hardcoding them in the insn bytes.

Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/Kconfig                      |   5 --
 arch/x86/include/asm/arch_hweight.h   |  43 ++++---------
 arch/x86/include/asm/cpufeature.h     | 112 +-------------------------------
 arch/x86/include/asm/cpuinfo.h        |  65 +++++++++++++++++++
 arch/x86/include/asm/processor.h      |  63 +-----------------
 arch/x86/include/asm/static_cpu_has.h | 116 ++++++++++++++++++++++++++++++++++
 lib/Makefile                          |   5 --
 7 files changed, 197 insertions(+), 212 deletions(-)
 create mode 100644 arch/x86/include/asm/cpuinfo.h
 create mode 100644 arch/x86/include/asm/static_cpu_has.h

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7bb15747fea2..79e0bcd61cb1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -292,11 +292,6 @@ config X86_32_LAZY_GS
 	def_bool y
 	depends on X86_32 && !CC_STACKPROTECTOR
 
-config ARCH_HWEIGHT_CFLAGS
-	string
-	default "-fcall-saved-ecx -fcall-saved-edx" if X86_32
-	default "-fcall-saved-rdi -fcall-saved-rsi -fcall-saved-rdx -fcall-saved-rcx -fcall-saved-r8 -fcall-saved-r9 -fcall-saved-r10 -fcall-saved-r11" if X86_64
-
 config ARCH_SUPPORTS_UPROBES
 	def_bool y
 
diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h
index 02e799fa43d1..6c1a2d500c4c 100644
--- a/arch/x86/include/asm/arch_hweight.h
+++ b/arch/x86/include/asm/arch_hweight.h
@@ -2,36 +2,18 @@
 #define _ASM_X86_HWEIGHT_H
 
 #include <asm/cpufeatures.h>
+#include <asm/static_cpu_has.h>
 
-#ifdef CONFIG_64BIT
-/* popcnt %edi, %eax -- redundant REX prefix for alignment */
-#define POPCNT32 ".byte 0xf3,0x40,0x0f,0xb8,0xc7"
-/* popcnt %rdi, %rax */
-#define POPCNT64 ".byte 0xf3,0x48,0x0f,0xb8,0xc7"
-#define REG_IN "D"
-#define REG_OUT "a"
-#else
-/* popcnt %eax, %eax */
-#define POPCNT32 ".byte 0xf3,0x0f,0xb8,0xc0"
-#define REG_IN "a"
-#define REG_OUT "a"
-#endif
-
-/*
- * __sw_hweightXX are called from within the alternatives below
- * and callee-clobbered registers need to be taken care of. See
- * ARCH_HWEIGHT_CFLAGS in <arch/x86/Kconfig> for the respective
- * compiler switches.
- */
 static __always_inline unsigned int __arch_hweight32(unsigned int w)
 {
-	unsigned int res = 0;
+	unsigned int res;
 
-	asm (ALTERNATIVE("call __sw_hweight32", POPCNT32, X86_FEATURE_POPCNT)
-		     : "="REG_OUT (res)
-		     : REG_IN (w));
+	if (likely(static_cpu_has(X86_FEATURE_POPCNT))) {
+		asm volatile("popcnt %[w], %[res]" : [res] "=r" (res) : [w] "r" (w));
 
-	return res;
+		return res;
+	}
+	return __sw_hweight32(w);
 }
 
 static inline unsigned int __arch_hweight16(unsigned int w)
@@ -53,13 +35,14 @@ static inline unsigned long __arch_hweight64(__u64 w)
 #else
 static __always_inline unsigned long __arch_hweight64(__u64 w)
 {
-	unsigned long res = 0;
+	unsigned long res;
 
-	asm (ALTERNATIVE("call __sw_hweight64", POPCNT64, X86_FEATURE_POPCNT)
-		     : "="REG_OUT (res)
-		     : REG_IN (w));
+	if (likely(static_cpu_has(X86_FEATURE_POPCNT))) {
+		asm volatile("popcnt %[w], %[res]" : [res] "=r" (res) : [w] "r" (w));
 
-	return res;
+		return res;
+	}
+	return __sw_hweight64(w);
 }
 #endif /* CONFIG_X86_32 */
 
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 07c942d84662..9a70b12ae8df 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -6,6 +6,8 @@
 #if defined(__KERNEL__) && !defined(__ASSEMBLY__)
 
 #include <asm/asm.h>
+#include <asm/static_cpu_has.h>
+
 #include <linux/bitops.h>
 
 enum cpuid_leafs
@@ -45,51 +47,6 @@ extern const char * const x86_power_flags[32];
  */
 extern const char * const x86_bug_flags[NBUGINTS*32];
 
-#define test_cpu_cap(c, bit)						\
-	 test_bit(bit, (unsigned long *)((c)->x86_capability))
-
-#define REQUIRED_MASK_BIT_SET(bit)					\
-	 ( (((bit)>>5)==0  && (1UL<<((bit)&31) & REQUIRED_MASK0 )) ||	\
-	   (((bit)>>5)==1  && (1UL<<((bit)&31) & REQUIRED_MASK1 )) ||	\
-	   (((bit)>>5)==2  && (1UL<<((bit)&31) & REQUIRED_MASK2 )) ||	\
-	   (((bit)>>5)==3  && (1UL<<((bit)&31) & REQUIRED_MASK3 )) ||	\
-	   (((bit)>>5)==4  && (1UL<<((bit)&31) & REQUIRED_MASK4 )) ||	\
-	   (((bit)>>5)==5  && (1UL<<((bit)&31) & REQUIRED_MASK5 )) ||	\
-	   (((bit)>>5)==6  && (1UL<<((bit)&31) & REQUIRED_MASK6 )) ||	\
-	   (((bit)>>5)==7  && (1UL<<((bit)&31) & REQUIRED_MASK7 )) ||	\
-	   (((bit)>>5)==8  && (1UL<<((bit)&31) & REQUIRED_MASK8 )) ||	\
-	   (((bit)>>5)==9  && (1UL<<((bit)&31) & REQUIRED_MASK9 )) ||	\
-	   (((bit)>>5)==10 && (1UL<<((bit)&31) & REQUIRED_MASK10)) ||	\
-	   (((bit)>>5)==11 && (1UL<<((bit)&31) & REQUIRED_MASK11)) ||	\
-	   (((bit)>>5)==12 && (1UL<<((bit)&31) & REQUIRED_MASK12)) ||	\
-	   (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK13)) ||	\
-	   (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK14)) ||	\
-	   (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK15)) ||	\
-	   (((bit)>>5)==14 && (1UL<<((bit)&31) & REQUIRED_MASK16)) )
-
-#define DISABLED_MASK_BIT_SET(bit)					\
-	 ( (((bit)>>5)==0  && (1UL<<((bit)&31) & DISABLED_MASK0 )) ||	\
-	   (((bit)>>5)==1  && (1UL<<((bit)&31) & DISABLED_MASK1 )) ||	\
-	   (((bit)>>5)==2  && (1UL<<((bit)&31) & DISABLED_MASK2 )) ||	\
-	   (((bit)>>5)==3  && (1UL<<((bit)&31) & DISABLED_MASK3 )) ||	\
-	   (((bit)>>5)==4  && (1UL<<((bit)&31) & DISABLED_MASK4 )) ||	\
-	   (((bit)>>5)==5  && (1UL<<((bit)&31) & DISABLED_MASK5 )) ||	\
-	   (((bit)>>5)==6  && (1UL<<((bit)&31) & DISABLED_MASK6 )) ||	\
-	   (((bit)>>5)==7  && (1UL<<((bit)&31) & DISABLED_MASK7 )) ||	\
-	   (((bit)>>5)==8  && (1UL<<((bit)&31) & DISABLED_MASK8 )) ||	\
-	   (((bit)>>5)==9  && (1UL<<((bit)&31) & DISABLED_MASK9 )) ||	\
-	   (((bit)>>5)==10 && (1UL<<((bit)&31) & DISABLED_MASK10)) ||	\
-	   (((bit)>>5)==11 && (1UL<<((bit)&31) & DISABLED_MASK11)) ||	\
-	   (((bit)>>5)==12 && (1UL<<((bit)&31) & DISABLED_MASK12)) ||	\
-	   (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK13)) ||	\
-	   (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK14)) ||	\
-	   (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK15)) ||	\
-	   (((bit)>>5)==14 && (1UL<<((bit)&31) & DISABLED_MASK16)) )
-
-#define cpu_has(c, bit)							\
-	(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 :	\
-	 test_cpu_cap(c, bit))
-
 #define this_cpu_has(bit)						\
 	(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : 	\
 	 x86_this_cpu_test_bit(bit, (unsigned long *)&cpu_info.x86_capability))
@@ -105,8 +62,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
 #define cpu_feature_enabled(bit)	\
 	(__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : static_cpu_has(bit))
 
-#define boot_cpu_has(bit)	cpu_has(&boot_cpu_data, bit)
-
 #define set_cpu_cap(c, bit)	set_bit(bit, (unsigned long *)((c)->x86_capability))
 #define clear_cpu_cap(c, bit)	clear_bit(bit, (unsigned long *)((c)->x86_capability))
 #define setup_clear_cpu_cap(bit) do { \
@@ -118,69 +73,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
 	set_bit(bit, (unsigned long *)cpu_caps_set);	\
 } while (0)
 
-#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS)
-/*
- * Static testing of CPU features.  Used the same as boot_cpu_has().
- * These will statically patch the target code for additional
- * performance.
- */
-static __always_inline __pure bool _static_cpu_has(u16 bit)
-{
-		asm_volatile_goto("1: jmp 6f\n"
-			 "2:\n"
-			 ".skip -(((5f-4f) - (2b-1b)) > 0) * "
-			         "((5f-4f) - (2b-1b)),0x90\n"
-			 "3:\n"
-			 ".section .altinstructions,\"a\"\n"
-			 " .long 1b - .\n"		/* src offset */
-			 " .long 4f - .\n"		/* repl offset */
-			 " .word %P1\n"			/* always replace */
-			 " .byte 3b - 1b\n"		/* src len */
-			 " .byte 5f - 4f\n"		/* repl len */
-			 " .byte 3b - 2b\n"		/* pad len */
-			 ".previous\n"
-			 ".section .altinstr_replacement,\"ax\"\n"
-			 "4: jmp %l[t_no]\n"
-			 "5:\n"
-			 ".previous\n"
-			 ".section .altinstructions,\"a\"\n"
-			 " .long 1b - .\n"		/* src offset */
-			 " .long 0\n"			/* no replacement */
-			 " .word %P0\n"			/* feature bit */
-			 " .byte 3b - 1b\n"		/* src len */
-			 " .byte 0\n"			/* repl len */
-			 " .byte 0\n"			/* pad len */
-			 ".previous\n"
-			 ".section .altinstr_aux,\"ax\"\n"
-			 "6:\n"
-			 " testb %[bitnum],%[cap_byte]\n"
-			 " jnz %l[t_yes]\n"
-			 " jmp %l[t_no]\n"
-			 ".previous\n"
-			 : : "i" (bit), "i" (X86_FEATURE_ALWAYS),
-			     [bitnum] "i" (1 << (bit & 7)),
-			     [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3])
-			 : : t_yes, t_no);
-	t_yes:
-		return true;
-	t_no:
-		return false;
-}
-
-#define static_cpu_has(bit)					\
-(								\
-	__builtin_constant_p(boot_cpu_has(bit)) ?		\
-		boot_cpu_has(bit) :				\
-		_static_cpu_has(bit)				\
-)
-#else
-/*
- * Fall back to dynamic for gcc versions which don't support asm goto. Should be
- * a minority now anyway.
- */
-#define static_cpu_has(bit)		boot_cpu_has(bit)
-#endif
-
 #define cpu_has_bug(c, bit)		cpu_has(c, (bit))
 #define set_cpu_bug(c, bit)		set_cpu_cap(c, (bit))
 #define clear_cpu_bug(c, bit)		clear_cpu_cap(c, (bit))
diff --git a/arch/x86/include/asm/cpuinfo.h b/arch/x86/include/asm/cpuinfo.h
new file mode 100644
index 000000000000..a6632044f199
--- /dev/null
+++ b/arch/x86/include/asm/cpuinfo.h
@@ -0,0 +1,65 @@
+#ifndef _ASM_X86_CPUINFO_H_
+#define _ASM_X86_CPUINFO_H_
+
+/*
+ *  CPU type and hardware bug flags. Kept separately for each CPU.
+ *  Members of this structure are referenced in head.S, so think twice
+ *  before touching them. [mj]
+ */
+struct cpuinfo_x86 {
+	__u8			x86;		/* CPU family */
+	__u8			x86_vendor;	/* CPU vendor */
+	__u8			x86_model;
+	__u8			x86_mask;
+#ifdef CONFIG_X86_32
+	char			wp_works_ok;	/* It doesn't on 386's */
+
+	/* Problems on some 486Dx4's and old 386's: */
+	char			rfu;
+	char			pad0;
+	char			pad1;
+#else
+	/* Number of 4K pages in DTLB/ITLB combined(in pages): */
+	int			x86_tlbsize;
+#endif
+	__u8			x86_virt_bits;
+	__u8			x86_phys_bits;
+	/* CPUID returned core id bits: */
+	__u8			x86_coreid_bits;
+	/* Max extended CPUID function supported: */
+	__u32			extended_cpuid_level;
+	/* Maximum supported CPUID level, -1=no CPUID: */
+	int			cpuid_level;
+	__u32			x86_capability[NCAPINTS + NBUGINTS];
+	char			x86_vendor_id[16];
+	char			x86_model_id[64];
+	/* in KB - valid for CPUS which support this call: */
+	int			x86_cache_size;
+	int			x86_cache_alignment;	/* In bytes */
+	/* Cache QoS architectural values: */
+	int			x86_cache_max_rmid;	/* max index */
+	int			x86_cache_occ_scale;	/* scale to bytes */
+	int			x86_power;
+	unsigned long		loops_per_jiffy;
+	/* cpuid returned max cores value: */
+	u16			 x86_max_cores;
+	u16			apicid;
+	u16			initial_apicid;
+	u16			x86_clflush_size;
+	/* number of cores as seen by the OS: */
+	u16			booted_cores;
+	/* Physical processor id: */
+	u16			phys_proc_id;
+	/* Logical processor id: */
+	u16			logical_proc_id;
+	/* Core id: */
+	u16			cpu_core_id;
+	/* Index into per_cpu list: */
+	u16			cpu_index;
+	u32			microcode;
+};
+
+extern struct cpuinfo_x86	boot_cpu_data;
+extern struct cpuinfo_x86	new_cpu_data;
+
+#endif /* _ASM_X86_CPUINFO_H_ */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 62c6cc3cc5d3..6f6555b20e3d 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -22,6 +22,7 @@ struct vm86;
 #include <asm/nops.h>
 #include <asm/special_insns.h>
 #include <asm/fpu/types.h>
+#include <asm/cpuinfo.h>
 
 #include <linux/personality.h>
 #include <linux/cache.h>
@@ -78,65 +79,6 @@ extern u16 __read_mostly tlb_lld_2m[NR_INFO];
 extern u16 __read_mostly tlb_lld_4m[NR_INFO];
 extern u16 __read_mostly tlb_lld_1g[NR_INFO];
 
-/*
- *  CPU type and hardware bug flags. Kept separately for each CPU.
- *  Members of this structure are referenced in head.S, so think twice
- *  before touching them. [mj]
- */
-
-struct cpuinfo_x86 {
-	__u8			x86;		/* CPU family */
-	__u8			x86_vendor;	/* CPU vendor */
-	__u8			x86_model;
-	__u8			x86_mask;
-#ifdef CONFIG_X86_32
-	char			wp_works_ok;	/* It doesn't on 386's */
-
-	/* Problems on some 486Dx4's and old 386's: */
-	char			rfu;
-	char			pad0;
-	char			pad1;
-#else
-	/* Number of 4K pages in DTLB/ITLB combined(in pages): */
-	int			x86_tlbsize;
-#endif
-	__u8			x86_virt_bits;
-	__u8			x86_phys_bits;
-	/* CPUID returned core id bits: */
-	__u8			x86_coreid_bits;
-	/* Max extended CPUID function supported: */
-	__u32			extended_cpuid_level;
-	/* Maximum supported CPUID level, -1=no CPUID: */
-	int			cpuid_level;
-	__u32			x86_capability[NCAPINTS + NBUGINTS];
-	char			x86_vendor_id[16];
-	char			x86_model_id[64];
-	/* in KB - valid for CPUS which support this call: */
-	int			x86_cache_size;
-	int			x86_cache_alignment;	/* In bytes */
-	/* Cache QoS architectural values: */
-	int			x86_cache_max_rmid;	/* max index */
-	int			x86_cache_occ_scale;	/* scale to bytes */
-	int			x86_power;
-	unsigned long		loops_per_jiffy;
-	/* cpuid returned max cores value: */
-	u16			 x86_max_cores;
-	u16			apicid;
-	u16			initial_apicid;
-	u16			x86_clflush_size;
-	/* number of cores as seen by the OS: */
-	u16			booted_cores;
-	/* Physical processor id: */
-	u16			phys_proc_id;
-	/* Logical processor id: */
-	u16			logical_proc_id;
-	/* Core id: */
-	u16			cpu_core_id;
-	/* Index into per_cpu list: */
-	u16			cpu_index;
-	u32			microcode;
-};
-
 #define X86_VENDOR_INTEL	0
 #define X86_VENDOR_CYRIX	1
 #define X86_VENDOR_AMD		2
@@ -151,9 +93,6 @@ struct cpuinfo_x86 {
 /*
  * capabilities of CPUs
  */
-extern struct cpuinfo_x86	boot_cpu_data;
-extern struct cpuinfo_x86	new_cpu_data;
-
 extern struct tss_struct	doublefault_tss;
 extern __u32			cpu_caps_cleared[NCAPINTS];
 extern __u32			cpu_caps_set[NCAPINTS];
diff --git a/arch/x86/include/asm/static_cpu_has.h b/arch/x86/include/asm/static_cpu_has.h
new file mode 100644
index 000000000000..648ada0c7ffe
--- /dev/null
+++ b/arch/x86/include/asm/static_cpu_has.h
@@ -0,0 +1,116 @@
+#ifndef _ASM_X86_STATIC_CPU_HAS_H
+#define _ASM_X86_STATIC_CPU_HAS_H
+
+#include <asm/cpuinfo.h>
+
+#define test_cpu_cap(c, bit)						\
+	 test_bit(bit, (unsigned long *)((c)->x86_capability))
+
+#define REQUIRED_MASK_BIT_SET(bit)					\
+	 ( (((bit)>>5)==0  && (1UL<<((bit)&31) & REQUIRED_MASK0 )) ||	\
+	   (((bit)>>5)==1  && (1UL<<((bit)&31) & REQUIRED_MASK1 )) ||	\
+	   (((bit)>>5)==2  && (1UL<<((bit)&31) & REQUIRED_MASK2 )) ||	\
+	   (((bit)>>5)==3  && (1UL<<((bit)&31) & REQUIRED_MASK3 )) ||	\
+	   (((bit)>>5)==4  && (1UL<<((bit)&31) & REQUIRED_MASK4 )) ||	\
+	   (((bit)>>5)==5  && (1UL<<((bit)&31) & REQUIRED_MASK5 )) ||	\
+	   (((bit)>>5)==6  && (1UL<<((bit)&31) & REQUIRED_MASK6 )) ||	\
+	   (((bit)>>5)==7  && (1UL<<((bit)&31) & REQUIRED_MASK7 )) ||	\
+	   (((bit)>>5)==8  && (1UL<<((bit)&31) & REQUIRED_MASK8 )) ||	\
+	   (((bit)>>5)==9  && (1UL<<((bit)&31) & REQUIRED_MASK9 )) ||	\
+	   (((bit)>>5)==10 && (1UL<<((bit)&31) & REQUIRED_MASK10)) ||	\
+	   (((bit)>>5)==11 && (1UL<<((bit)&31) & REQUIRED_MASK11)) ||	\
+	   (((bit)>>5)==12 && (1UL<<((bit)&31) & REQUIRED_MASK12)) ||	\
+	   (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK13)) ||	\
+	   (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK14)) ||	\
+	   (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK15)) ||	\
+	   (((bit)>>5)==14 && (1UL<<((bit)&31) & REQUIRED_MASK16)) )
+
+#define DISABLED_MASK_BIT_SET(bit)					\
+	 ( (((bit)>>5)==0  && (1UL<<((bit)&31) & DISABLED_MASK0 )) ||	\
+	   (((bit)>>5)==1  && (1UL<<((bit)&31) & DISABLED_MASK1 )) ||	\
+	   (((bit)>>5)==2  && (1UL<<((bit)&31) & DISABLED_MASK2 )) ||	\
+	   (((bit)>>5)==3  && (1UL<<((bit)&31) & DISABLED_MASK3 )) ||	\
+	   (((bit)>>5)==4  && (1UL<<((bit)&31) & DISABLED_MASK4 )) ||	\
+	   (((bit)>>5)==5  && (1UL<<((bit)&31) & DISABLED_MASK5 )) ||	\
+	   (((bit)>>5)==6  && (1UL<<((bit)&31) & DISABLED_MASK6 )) ||	\
+	   (((bit)>>5)==7  && (1UL<<((bit)&31) & DISABLED_MASK7 )) ||	\
+	   (((bit)>>5)==8  && (1UL<<((bit)&31) & DISABLED_MASK8 )) ||	\
+	   (((bit)>>5)==9  && (1UL<<((bit)&31) & DISABLED_MASK9 )) ||	\
+	   (((bit)>>5)==10 && (1UL<<((bit)&31) & DISABLED_MASK10)) ||	\
+	   (((bit)>>5)==11 && (1UL<<((bit)&31) & DISABLED_MASK11)) ||	\
+	   (((bit)>>5)==12 && (1UL<<((bit)&31) & DISABLED_MASK12)) ||	\
+	   (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK13)) ||	\
+	   (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK14)) ||	\
+	   (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK15)) ||	\
+	   (((bit)>>5)==14 && (1UL<<((bit)&31) & DISABLED_MASK16)) )
+
+#define cpu_has(c, bit)							\
+	(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 :	\
+	 test_cpu_cap(c, bit))
+
+#define boot_cpu_has(bit)	cpu_has(&boot_cpu_data, bit)
+
+#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS)
+/*
+ * Static testing of CPU features.  Used the same as boot_cpu_has().
+ * These will statically patch the target code for additional
+ * performance.
+ */
+static __always_inline __pure bool _static_cpu_has(u16 bit)
+{
+		asm_volatile_goto("1: jmp 6f\n"
+			 "2:\n"
+			 ".skip -(((5f-4f) - (2b-1b)) > 0) * "
+			         "((5f-4f) - (2b-1b)),0x90\n"
+			 "3:\n"
+			 ".section .altinstructions,\"a\"\n"
+			 " .long 1b - .\n"		/* src offset */
+			 " .long 4f - .\n"		/* repl offset */
+			 " .word %P1\n"			/* always replace */
+			 " .byte 3b - 1b\n"		/* src len */
+			 " .byte 5f - 4f\n"		/* repl len */
+			 " .byte 3b - 2b\n"		/* pad len */
+			 ".previous\n"
+			 ".section .altinstr_replacement,\"ax\"\n"
+			 "4: jmp %l[t_no]\n"
+			 "5:\n"
+			 ".previous\n"
+			 ".section .altinstructions,\"a\"\n"
+			 " .long 1b - .\n"		/* src offset */
+			 " .long 0\n"			/* no replacement */
+			 " .word %P0\n"			/* feature bit */
+			 " .byte 3b - 1b\n"		/* src len */
+			 " .byte 0\n"			/* repl len */
+			 " .byte 0\n"			/* pad len */
+			 ".previous\n"
+			 ".section .altinstr_aux,\"ax\"\n"
+			 "6:\n"
+			 " testb %[bitnum],%[cap_byte]\n"
+			 " jnz %l[t_yes]\n"
+			 " jmp %l[t_no]\n"
+			 ".previous\n"
+			 : : "i" (bit), "i" (X86_FEATURE_ALWAYS),
+			     [bitnum] "i" (1 << (bit & 7)),
+			     [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3])
+			 : : t_yes, t_no);
+	t_yes:
+		return true;
+	t_no:
+		return false;
+}
+
+#define static_cpu_has(bit)					\
+(								\
+	__builtin_constant_p(boot_cpu_has(bit)) ?		\
+		boot_cpu_has(bit) :				\
+		_static_cpu_has(bit)				\
+)
+#else
+/*
+ * Fall back to dynamic for gcc versions which don't support asm goto. Should be
+ * a minority now anyway.
+ */
+#define static_cpu_has(bit)		boot_cpu_has(bit)
+#endif
+
+#endif /* _ASM_X86_STATIC_CPU_HAS_H */
diff --git a/lib/Makefile b/lib/Makefile
index a65e9a861535..55ad20701dc0 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -15,9 +15,6 @@ KCOV_INSTRUMENT_rbtree.o := n
 KCOV_INSTRUMENT_list_debug.o := n
 KCOV_INSTRUMENT_debugobjects.o := n
 KCOV_INSTRUMENT_dynamic_debug.o := n
-# Kernel does not boot if we instrument this file as it uses custom calling
-# convention (see CONFIG_ARCH_HWEIGHT_CFLAGS).
-KCOV_INSTRUMENT_hweight.o := n
 
 lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 rbtree.o radix-tree.o dump_stack.o timerqueue.o\
@@ -72,8 +69,6 @@ obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o
 obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o
 obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o
 
-GCOV_PROFILE_hweight.o := n
-CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS))
 obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
 
 obj-$(CONFIG_BTREE) += btree.o

-- 
2.7.3

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
-- 

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* Re: [RFC PATCH] x86/hweight: Get rid of the special calling convention
  2016-05-04 18:46         ` [RFC PATCH] x86/hweight: Get rid of the special calling convention Borislav Petkov
@ 2016-05-04 19:31           ` Brian Gerst
  2016-05-04 19:33             ` H. Peter Anvin
  0 siblings, 1 reply; 84+ messages in thread
From: Brian Gerst @ 2016-05-04 19:31 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: LKML, Dmitry Vyukov, Andi Kleen, zengzhaoxiu, Thomas Gleixner,
	Ingo Molnar, H. Peter Anvin, Denys Vlasenko, Andrew Morton,
	Kees Cook, Zhaoxiu Zeng, Andy Lutomirski, Peter Zijlstra

On Wed, May 4, 2016 at 2:46 PM, Borislav Petkov <bp@suse.de> wrote:
> On Thu, Apr 07, 2016 at 11:43:33AM +0200, Borislav Petkov wrote:
>> I guess we can do something like this:
>>
>>        if (likely(static_cpu_has(X86_FEATURE_POPCNT)))
>>                asm volatile(POPCNT32
>>                             : "="REG_OUT (res)
>>                             : REG_IN (w));
>>        else
>>                res = __sw_hweight32(w);
>>
>> and get rid of the custom calling convention.
>>
>> Along with some numbers showing that the change doesn't cause any
>> noticeable slowdown...
>
> Ok, here's something which seems to build and boot in kvm.
>
> I like how we don't need the special calling conventions anymore and we
> can actually say "popcnt .." and gcc selects registers.
>
> The include files hackery is kinda nasty but I had to do it because I
> needed to be able to use static_cpu_has() in a header and including
> asm/cpufeature.h pulls in all kinds of nasty dependencies. I'm certainly
> open for better ideas...
>
> ---
> From: Borislav Petkov <bp@suse.de>
> Date: Wed, 4 May 2016 18:52:09 +0200
> Subject: [PATCH] x86/hweight: Get rid of the special calling convention
>
> People complained about ARCH_HWEIGHT_CFLAGS and how it throws a wrench
> into kcov, lto, etc, experimentation.
>
> And it's not like we absolutely need it, so let's get rid of it and
> streamline it a bit. I had to do some carving out of facilities so
> that the include hell doesn't swallow me but other than that, the new
> __arch_hweight*() versions look much more palatable and gcc is freer
> to select registers than when we hardcode them in the insn bytes.
>
> Signed-off-by: Borislav Petkov <bp@suse.de>
> ---
>  arch/x86/Kconfig                      |   5 --
>  arch/x86/include/asm/arch_hweight.h   |  43 ++++---------
>  arch/x86/include/asm/cpufeature.h     | 112 +-------------------------------
>  arch/x86/include/asm/cpuinfo.h        |  65 +++++++++++++++++++
>  arch/x86/include/asm/processor.h      |  63 +-----------------
>  arch/x86/include/asm/static_cpu_has.h | 116 ++++++++++++++++++++++++++++++++++
>  lib/Makefile                          |   5 --
>  7 files changed, 197 insertions(+), 212 deletions(-)
>  create mode 100644 arch/x86/include/asm/cpuinfo.h
>  create mode 100644 arch/x86/include/asm/static_cpu_has.h
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 7bb15747fea2..79e0bcd61cb1 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -292,11 +292,6 @@ config X86_32_LAZY_GS
>         def_bool y
>         depends on X86_32 && !CC_STACKPROTECTOR
>
> -config ARCH_HWEIGHT_CFLAGS
> -       string
> -       default "-fcall-saved-ecx -fcall-saved-edx" if X86_32
> -       default "-fcall-saved-rdi -fcall-saved-rsi -fcall-saved-rdx -fcall-saved-rcx -fcall-saved-r8 -fcall-saved-r9 -fcall-saved-r10 -fcall-saved-r11" if X86_64
> -
>  config ARCH_SUPPORTS_UPROBES
>         def_bool y
>
> diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h
> index 02e799fa43d1..6c1a2d500c4c 100644
> --- a/arch/x86/include/asm/arch_hweight.h
> +++ b/arch/x86/include/asm/arch_hweight.h
> @@ -2,36 +2,18 @@
>  #define _ASM_X86_HWEIGHT_H
>
>  #include <asm/cpufeatures.h>
> +#include <asm/static_cpu_has.h>
>
> -#ifdef CONFIG_64BIT
> -/* popcnt %edi, %eax -- redundant REX prefix for alignment */
> -#define POPCNT32 ".byte 0xf3,0x40,0x0f,0xb8,0xc7"
> -/* popcnt %rdi, %rax */
> -#define POPCNT64 ".byte 0xf3,0x48,0x0f,0xb8,0xc7"
> -#define REG_IN "D"
> -#define REG_OUT "a"
> -#else
> -/* popcnt %eax, %eax */
> -#define POPCNT32 ".byte 0xf3,0x0f,0xb8,0xc0"
> -#define REG_IN "a"
> -#define REG_OUT "a"
> -#endif
> -
> -/*
> - * __sw_hweightXX are called from within the alternatives below
> - * and callee-clobbered registers need to be taken care of. See
> - * ARCH_HWEIGHT_CFLAGS in <arch/x86/Kconfig> for the respective
> - * compiler switches.
> - */
>  static __always_inline unsigned int __arch_hweight32(unsigned int w)
>  {
> -       unsigned int res = 0;
> +       unsigned int res;
>
> -       asm (ALTERNATIVE("call __sw_hweight32", POPCNT32, X86_FEATURE_POPCNT)
> -                    : "="REG_OUT (res)
> -                    : REG_IN (w));
> +       if (likely(static_cpu_has(X86_FEATURE_POPCNT))) {
> +               asm volatile("popcnt %[w], %[res]" : [res] "=r" (res) : [w] "r" (w));

Do all supported versions of the assembler know of the popcnt
instruction?  That's why it was open coded before.  The problem is
Intel and AMD are constantly adding new instructions and it's a long
cycle for the user's assembler to get updated.

--
Brian Gerst

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [RFC PATCH] x86/hweight: Get rid of the special calling convention
  2016-05-04 19:31           ` Brian Gerst
@ 2016-05-04 19:33             ` H. Peter Anvin
  2016-05-04 19:41               ` Borislav Petkov
  0 siblings, 1 reply; 84+ messages in thread
From: H. Peter Anvin @ 2016-05-04 19:33 UTC (permalink / raw)
  To: Brian Gerst, Borislav Petkov
  Cc: LKML, Dmitry Vyukov, Andi Kleen, zengzhaoxiu, Thomas Gleixner,
	Ingo Molnar, Denys Vlasenko, Andrew Morton, Kees Cook,
	Zhaoxiu Zeng, Andy Lutomirski, Peter Zijlstra

On 05/04/2016 12:31 PM, Brian Gerst wrote:
>>
>> -       asm (ALTERNATIVE("call __sw_hweight32", POPCNT32, X86_FEATURE_POPCNT)
>> -                    : "="REG_OUT (res)
>> -                    : REG_IN (w));
>> +       if (likely(static_cpu_has(X86_FEATURE_POPCNT))) {
>> +               asm volatile("popcnt %[w], %[res]" : [res] "=r" (res) : [w] "r" (w));
> 
> Do all supported versions of the assembler know of the popcnt
> instruction?  That's why it was open coded before.  The problem is
> Intel and AMD are constantly adding new instructions and it's a long
> cycle for the user's assembler to get updated.
> 

Most likely not.  It would be nice to have some more uniform solution to
that.  I'm wondering if we could use the -Wa option to load some kind of
macro package.

	-hpa

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [RFC PATCH] x86/hweight: Get rid of the special calling convention
  2016-05-04 19:33             ` H. Peter Anvin
@ 2016-05-04 19:41               ` Borislav Petkov
  2016-05-04 19:49                 ` H. Peter Anvin
  0 siblings, 1 reply; 84+ messages in thread
From: Borislav Petkov @ 2016-05-04 19:41 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Brian Gerst, LKML, Dmitry Vyukov, Andi Kleen, zengzhaoxiu,
	Thomas Gleixner, Ingo Molnar, Denys Vlasenko, Andrew Morton,
	Kees Cook, Zhaoxiu Zeng, Andy Lutomirski, Peter Zijlstra

On Wed, May 04, 2016 at 12:33:24PM -0700, H. Peter Anvin wrote:
> Most likely not.  It would be nice to have some more uniform solution to
> that.  I'm wondering if we could use the -Wa option to load some kind of
> macro package.

Lemme try out some old compilers first, I'm guessing 3.2 won't know
about popcnt...

-- 
Regards/Gruss,
    Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
-- 

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [RFC PATCH] x86/hweight: Get rid of the special calling convention
  2016-05-04 19:41               ` Borislav Petkov
@ 2016-05-04 19:49                 ` H. Peter Anvin
  2016-05-04 20:22                   ` Borislav Petkov
  0 siblings, 1 reply; 84+ messages in thread
From: H. Peter Anvin @ 2016-05-04 19:49 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Brian Gerst, LKML, Dmitry Vyukov, Andi Kleen, zengzhaoxiu,
	Thomas Gleixner, Ingo Molnar, Denys Vlasenko, Andrew Morton,
	Kees Cook, Zhaoxiu Zeng, Andy Lutomirski, Peter Zijlstra

On 05/04/2016 12:41 PM, Borislav Petkov wrote:
> On Wed, May 04, 2016 at 12:33:24PM -0700, H. Peter Anvin wrote:
>> Most likely not.  It would be nice to have some more uniform solution to
>> that.  I'm wondering if we could use the -Wa option to load some kind of
>> macro package.
> 
> Lemme try out some old compilers first, I'm guessing 3.2 won't know
> about popcnt...
> 

Sigh.  Doesn't look like -Wa is going to help due to the lack of the
equivalent of an -include option in gas.

	-hpa

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [RFC PATCH] x86/hweight: Get rid of the special calling convention
  2016-05-04 19:49                 ` H. Peter Anvin
@ 2016-05-04 20:22                   ` Borislav Petkov
  2016-05-04 20:51                     ` H. Peter Anvin
                                       ` (2 more replies)
  0 siblings, 3 replies; 84+ messages in thread
From: Borislav Petkov @ 2016-05-04 20:22 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Brian Gerst, LKML, Dmitry Vyukov, Andi Kleen, zengzhaoxiu,
	Thomas Gleixner, Ingo Molnar, Denys Vlasenko, Andrew Morton,
	Kees Cook, Zhaoxiu Zeng, Andy Lutomirski, Peter Zijlstra

On Wed, May 04, 2016 at 12:49:17PM -0700, H. Peter Anvin wrote:
> Sigh.  Doesn't look like -Wa is going to help due to the lack of the
> equivalent of an -include option in gas.

So much for the register "freedom" - I'll resurrect the hardcoded insn
bytes. :-\

Unless my gcc friends have some other ideas...

sarge:~# gcc --version
gcc (GCC) 3.3.5 (Debian 1:3.3.5-13)
Copyright (C) 2003 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

sarge:~# cat popcnt.c
int main(void)
{
        int a, b;

        asm volatile("popcnt %0, %1" : "=r" (a) : "r" (b));

        return 0;
}
sarge:~# gcc -Wall -o popcnt{,.c}
/tmp/ccHmmgjH.s: Assembler messages:
/tmp/ccHmmgjH.s:14: Error: no such instruction: `popcnt %eax,%eax'
sarge:~#

-- 
Regards/Gruss,
    Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
-- 

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [RFC PATCH] x86/hweight: Get rid of the special calling convention
  2016-05-04 20:22                   ` Borislav Petkov
@ 2016-05-04 20:51                     ` H. Peter Anvin
  2016-05-04 21:09                     ` Andi Kleen
  2016-05-05 13:02                     ` Denys Vlasenko
  2 siblings, 0 replies; 84+ messages in thread
From: H. Peter Anvin @ 2016-05-04 20:51 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Brian Gerst, LKML, Dmitry Vyukov, Andi Kleen, zengzhaoxiu,
	Thomas Gleixner, Ingo Molnar, Denys Vlasenko, Andrew Morton,
	Kees Cook, Zhaoxiu Zeng, Andy Lutomirski, Peter Zijlstra

On 05/04/2016 01:22 PM, Borislav Petkov wrote:
> On Wed, May 04, 2016 at 12:49:17PM -0700, H. Peter Anvin wrote:
>> Sigh.  Doesn't look like -Wa is going to help due to the lack of the
>> equivalent of an -include option in gas.
> 
> So much for the register "freedom" - I'll resurrect the hardcoded insn
> bytes. :-\
> 
> Unless my gcc friends have some other ideas...

There is the option of looking for assembler support for popcnt and only
hard-coding the registers if it is not supported.  This is where being able
to insert a macro package would help...

	-hpa

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [RFC PATCH] x86/hweight: Get rid of the special calling convention
  2016-05-04 20:22                   ` Borislav Petkov
  2016-05-04 20:51                     ` H. Peter Anvin
@ 2016-05-04 21:09                     ` Andi Kleen
  2016-05-05 13:02                     ` Denys Vlasenko
  2 siblings, 0 replies; 84+ messages in thread
From: Andi Kleen @ 2016-05-04 21:09 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: H. Peter Anvin, Brian Gerst, LKML, Dmitry Vyukov, Andi Kleen,
	zengzhaoxiu, Thomas Gleixner, Ingo Molnar, Denys Vlasenko,
	Andrew Morton, Kees Cook, Zhaoxiu Zeng, Andy Lutomirski,
	Peter Zijlstra

On Wed, May 04, 2016 at 10:22:13PM +0200, Borislav Petkov wrote:
> On Wed, May 04, 2016 at 12:49:17PM -0700, H. Peter Anvin wrote:
> > Sigh.  Doesn't look like -Wa is going to help due to the lack of the
> > equivalent of an -include option in gas.
> 
> So much for the register "freedom" - I'll resurrect the hardcoded insn
> bytes. :-\
> 
> Unless my gcc friends have some other ideas...

You can probe the assembler in the Makefile and pass a define,
like it is done by the dwarf code.  When the define is not
set, use the hard-coded registers.

Not very scalable, but it may work in this case.

Longer term we would probably need compiler probes at Kconfig
time (this would be useful for a lot of things).

-Andi
-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 84+ messages in thread
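
A minimal sketch of what Andi's suggestion could look like on the C side, assuming a
hypothetical CONFIG_AS_POPCNT define set by an as-instr probe in the Makefile (neither the
define nor the probe exists in the tree at this point); 32-bit case only, reusing the byte
encoding from the patch:

    static inline unsigned int hweight32_popcnt(unsigned int w)
    {
        unsigned int res;

    #ifdef CONFIG_AS_POPCNT
        /* new enough gas: use the mnemonic and let gcc pick registers */
        asm volatile("popcnt %1, %0" : "=r" (res) : "r" (w));
    #else
        /* old gas: emit popcnt %eax,%eax by hand, registers hardcoded */
        asm volatile(".byte 0xf3,0x0f,0xb8,0xc0" : "=a" (res) : "a" (w));
    #endif
        return res;
    }

The Makefile half would mirror the existing CONFIG_AS_* instruction probes; both halves are
assumptions for illustration only, not part of any patch in this thread.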

* Re: [RFC PATCH] x86/hweight: Get rid of the special calling convention
  2016-05-04 20:22                   ` Borislav Petkov
  2016-05-04 20:51                     ` H. Peter Anvin
  2016-05-04 21:09                     ` Andi Kleen
@ 2016-05-05 13:02                     ` Denys Vlasenko
  2016-05-05 14:04                       ` Borislav Petkov
  2 siblings, 1 reply; 84+ messages in thread
From: Denys Vlasenko @ 2016-05-05 13:02 UTC (permalink / raw)
  To: Borislav Petkov, H. Peter Anvin
  Cc: Brian Gerst, LKML, Dmitry Vyukov, Andi Kleen, zengzhaoxiu,
	Thomas Gleixner, Ingo Molnar, Andrew Morton, Kees Cook,
	Zhaoxiu Zeng, Andy Lutomirski, Peter Zijlstra

On 05/04/2016 10:22 PM, Borislav Petkov wrote:
> On Wed, May 04, 2016 at 12:49:17PM -0700, H. Peter Anvin wrote:
>> Sigh.  Doesn't look like -Wa is going to help due to the lack of the
>> equivalent of an -include option in gas.
> 
> So much for the register "freedom" - I'll resurrect the hardcoded insn
> bytes. :-\
> 
> Unless my gcc friends have some other ideas...
> 
> sarge:~# gcc --version
> gcc (GCC) 3.3.5 (Debian 1:3.3.5-13)
> Copyright (C) 2003 Free Software Foundation, Inc.
> This is free software; see the source for copying conditions.  There is NO
> warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
> 
> sarge:~# cat popcnt.c
> int main(void)
> {
>         int a, b;
> 
>         asm volatile("popcnt %0, %1" : "=r" (a) : "r" (b));
> 
>         return 0;
> }
> sarge:~# gcc -Wall -o popcnt{,.c}
> /tmp/ccHmmgjH.s: Assembler messages:
> /tmp/ccHmmgjH.s:14: Error: no such instruction: `popcnt %eax,%eax'
> sarge:~#

You are looking at the version of the wrong program.
gcc doesn't process assembly, it generates it.
as is part of binutils, not gcc. "as --version".

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [RFC PATCH] x86/hweight: Get rid of the special calling convention
  2016-05-05 13:02                     ` Denys Vlasenko
@ 2016-05-05 14:04                       ` Borislav Petkov
  2016-05-10 16:53                         ` [PATCH -v2] " Borislav Petkov
  0 siblings, 1 reply; 84+ messages in thread
From: Borislav Petkov @ 2016-05-05 14:04 UTC (permalink / raw)
  To: Denys Vlasenko
  Cc: H. Peter Anvin, Brian Gerst, LKML, Dmitry Vyukov, Andi Kleen,
	zengzhaoxiu, Thomas Gleixner, Ingo Molnar, Andrew Morton,
	Kees Cook, Zhaoxiu Zeng, Andy Lutomirski, Peter Zijlstra

On Thu, May 05, 2016 at 03:02:37PM +0200, Denys Vlasenko wrote:
> You are looking at the version of a wrong program.
> gcc doesn't process assembly, it generates it.
> as is part of binutils, not gcc. "as --version".

I know. It doesn't matter a whole lot in this case if there's a subset
of gas versions which simply don't know about POPCNT and we do use
those in the kernel build.

Pending a better solution, I'll simply revert to the old, spelled-out POPCNT
bytes and not bother with versions. Especially if someone tries to
build the kernel with some other compiler...

-- 
Regards/Gruss,
    Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
-- 

^ permalink raw reply	[flat|nested] 84+ messages in thread

* [PATCH -v2] x86/hweight: Get rid of the special calling convention
  2016-05-05 14:04                       ` Borislav Petkov
@ 2016-05-10 16:53                         ` Borislav Petkov
  2016-05-10 17:23                           ` Peter Zijlstra
  0 siblings, 1 reply; 84+ messages in thread
From: Borislav Petkov @ 2016-05-10 16:53 UTC (permalink / raw)
  To: x86-ml
  Cc: Denys Vlasenko, H. Peter Anvin, Brian Gerst, LKML, Dmitry Vyukov,
	Andi Kleen, zengzhaoxiu, Thomas Gleixner, Ingo Molnar,
	Andrew Morton, Kees Cook, Zhaoxiu Zeng, Andy Lutomirski,
	Peter Zijlstra

From: Borislav Petkov <bp@suse.de>
Date: Wed, 4 May 2016 18:52:09 +0200
Subject: [PATCH -v2] x86/hweight: Get rid of the special calling convention

People complained about ARCH_HWEIGHT_CFLAGS and how it throws a wrench
into kcov, lto, etc, experimentation.

And it's not like we absolutely need it, so let's get rid of it and
streamline it a bit. I had to do some carving out of facilities so that
the include hell doesn't swallow me.

We still need to hardcode POPCNT and the register operands, as some old gas
versions which we support do not know about POPCNT.

Signed-off-by: Borislav Petkov <bp@suse.de>
---

-v2: Revert to the old spelled-out POPCNT insn bytes.

 arch/x86/Kconfig                      |   5 --
 arch/x86/include/asm/arch_hweight.h   |  38 +++++------
 arch/x86/include/asm/cpufeature.h     | 112 +-------------------------------
 arch/x86/include/asm/cpuinfo.h        |  65 +++++++++++++++++++
 arch/x86/include/asm/processor.h      |  63 +-----------------
 arch/x86/include/asm/static_cpu_has.h | 116 ++++++++++++++++++++++++++++++++++
 lib/Makefile                          |   5 --
 7 files changed, 204 insertions(+), 200 deletions(-)
 create mode 100644 arch/x86/include/asm/cpuinfo.h
 create mode 100644 arch/x86/include/asm/static_cpu_has.h

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7bb15747fea2..79e0bcd61cb1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -292,11 +292,6 @@ config X86_32_LAZY_GS
 	def_bool y
 	depends on X86_32 && !CC_STACKPROTECTOR
 
-config ARCH_HWEIGHT_CFLAGS
-	string
-	default "-fcall-saved-ecx -fcall-saved-edx" if X86_32
-	default "-fcall-saved-rdi -fcall-saved-rsi -fcall-saved-rdx -fcall-saved-rcx -fcall-saved-r8 -fcall-saved-r9 -fcall-saved-r10 -fcall-saved-r11" if X86_64
-
 config ARCH_SUPPORTS_UPROBES
 	def_bool y
 
diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h
index 02e799fa43d1..324f5fb30392 100644
--- a/arch/x86/include/asm/arch_hweight.h
+++ b/arch/x86/include/asm/arch_hweight.h
@@ -2,10 +2,11 @@
 #define _ASM_X86_HWEIGHT_H
 
 #include <asm/cpufeatures.h>
+#include <asm/static_cpu_has.h>
 
 #ifdef CONFIG_64BIT
-/* popcnt %edi, %eax -- redundant REX prefix for alignment */
-#define POPCNT32 ".byte 0xf3,0x40,0x0f,0xb8,0xc7"
+/* popcnt %edi, %eax */
+#define POPCNT32 ".byte 0xf3,0x0f,0xb8,0xc7"
 /* popcnt %rdi, %rax */
 #define POPCNT64 ".byte 0xf3,0x48,0x0f,0xb8,0xc7"
 #define REG_IN "D"
@@ -17,21 +18,19 @@
 #define REG_OUT "a"
 #endif
 
-/*
- * __sw_hweightXX are called from within the alternatives below
- * and callee-clobbered registers need to be taken care of. See
- * ARCH_HWEIGHT_CFLAGS in <arch/x86/Kconfig> for the respective
- * compiler switches.
- */
 static __always_inline unsigned int __arch_hweight32(unsigned int w)
 {
-	unsigned int res = 0;
+	unsigned int res;
 
-	asm (ALTERNATIVE("call __sw_hweight32", POPCNT32, X86_FEATURE_POPCNT)
-		     : "="REG_OUT (res)
-		     : REG_IN (w));
+	if (likely(static_cpu_has(X86_FEATURE_POPCNT))) {
+		/* popcnt %eax, %eax */
+		asm volatile(POPCNT32
+				: "="REG_OUT (res)
+				: REG_IN (w));
 
-	return res;
+		return res;
+	}
+	return __sw_hweight32(w);
 }
 
 static inline unsigned int __arch_hweight16(unsigned int w)
@@ -53,13 +52,16 @@ static inline unsigned long __arch_hweight64(__u64 w)
 #else
 static __always_inline unsigned long __arch_hweight64(__u64 w)
 {
-	unsigned long res = 0;
+	unsigned long res;
 
-	asm (ALTERNATIVE("call __sw_hweight64", POPCNT64, X86_FEATURE_POPCNT)
-		     : "="REG_OUT (res)
-		     : REG_IN (w));
+	if (likely(static_cpu_has(X86_FEATURE_POPCNT))) {
+		asm volatile(POPCNT64
+				: "="REG_OUT (res)
+				: REG_IN (w));
 
-	return res;
+		return res;
+	}
+	return __sw_hweight64(w);
 }
 #endif /* CONFIG_X86_32 */
 
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 07c942d84662..9a70b12ae8df 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -6,6 +6,8 @@
 #if defined(__KERNEL__) && !defined(__ASSEMBLY__)
 
 #include <asm/asm.h>
+#include <asm/static_cpu_has.h>
+
 #include <linux/bitops.h>
 
 enum cpuid_leafs
@@ -45,51 +47,6 @@ extern const char * const x86_power_flags[32];
  */
 extern const char * const x86_bug_flags[NBUGINTS*32];
 
-#define test_cpu_cap(c, bit)						\
-	 test_bit(bit, (unsigned long *)((c)->x86_capability))
-
-#define REQUIRED_MASK_BIT_SET(bit)					\
-	 ( (((bit)>>5)==0  && (1UL<<((bit)&31) & REQUIRED_MASK0 )) ||	\
-	   (((bit)>>5)==1  && (1UL<<((bit)&31) & REQUIRED_MASK1 )) ||	\
-	   (((bit)>>5)==2  && (1UL<<((bit)&31) & REQUIRED_MASK2 )) ||	\
-	   (((bit)>>5)==3  && (1UL<<((bit)&31) & REQUIRED_MASK3 )) ||	\
-	   (((bit)>>5)==4  && (1UL<<((bit)&31) & REQUIRED_MASK4 )) ||	\
-	   (((bit)>>5)==5  && (1UL<<((bit)&31) & REQUIRED_MASK5 )) ||	\
-	   (((bit)>>5)==6  && (1UL<<((bit)&31) & REQUIRED_MASK6 )) ||	\
-	   (((bit)>>5)==7  && (1UL<<((bit)&31) & REQUIRED_MASK7 )) ||	\
-	   (((bit)>>5)==8  && (1UL<<((bit)&31) & REQUIRED_MASK8 )) ||	\
-	   (((bit)>>5)==9  && (1UL<<((bit)&31) & REQUIRED_MASK9 )) ||	\
-	   (((bit)>>5)==10 && (1UL<<((bit)&31) & REQUIRED_MASK10)) ||	\
-	   (((bit)>>5)==11 && (1UL<<((bit)&31) & REQUIRED_MASK11)) ||	\
-	   (((bit)>>5)==12 && (1UL<<((bit)&31) & REQUIRED_MASK12)) ||	\
-	   (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK13)) ||	\
-	   (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK14)) ||	\
-	   (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK15)) ||	\
-	   (((bit)>>5)==14 && (1UL<<((bit)&31) & REQUIRED_MASK16)) )
-
-#define DISABLED_MASK_BIT_SET(bit)					\
-	 ( (((bit)>>5)==0  && (1UL<<((bit)&31) & DISABLED_MASK0 )) ||	\
-	   (((bit)>>5)==1  && (1UL<<((bit)&31) & DISABLED_MASK1 )) ||	\
-	   (((bit)>>5)==2  && (1UL<<((bit)&31) & DISABLED_MASK2 )) ||	\
-	   (((bit)>>5)==3  && (1UL<<((bit)&31) & DISABLED_MASK3 )) ||	\
-	   (((bit)>>5)==4  && (1UL<<((bit)&31) & DISABLED_MASK4 )) ||	\
-	   (((bit)>>5)==5  && (1UL<<((bit)&31) & DISABLED_MASK5 )) ||	\
-	   (((bit)>>5)==6  && (1UL<<((bit)&31) & DISABLED_MASK6 )) ||	\
-	   (((bit)>>5)==7  && (1UL<<((bit)&31) & DISABLED_MASK7 )) ||	\
-	   (((bit)>>5)==8  && (1UL<<((bit)&31) & DISABLED_MASK8 )) ||	\
-	   (((bit)>>5)==9  && (1UL<<((bit)&31) & DISABLED_MASK9 )) ||	\
-	   (((bit)>>5)==10 && (1UL<<((bit)&31) & DISABLED_MASK10)) ||	\
-	   (((bit)>>5)==11 && (1UL<<((bit)&31) & DISABLED_MASK11)) ||	\
-	   (((bit)>>5)==12 && (1UL<<((bit)&31) & DISABLED_MASK12)) ||	\
-	   (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK13)) ||	\
-	   (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK14)) ||	\
-	   (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK15)) ||	\
-	   (((bit)>>5)==14 && (1UL<<((bit)&31) & DISABLED_MASK16)) )
-
-#define cpu_has(c, bit)							\
-	(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 :	\
-	 test_cpu_cap(c, bit))
-
 #define this_cpu_has(bit)						\
 	(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : 	\
 	 x86_this_cpu_test_bit(bit, (unsigned long *)&cpu_info.x86_capability))
@@ -105,8 +62,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
 #define cpu_feature_enabled(bit)	\
 	(__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : static_cpu_has(bit))
 
-#define boot_cpu_has(bit)	cpu_has(&boot_cpu_data, bit)
-
 #define set_cpu_cap(c, bit)	set_bit(bit, (unsigned long *)((c)->x86_capability))
 #define clear_cpu_cap(c, bit)	clear_bit(bit, (unsigned long *)((c)->x86_capability))
 #define setup_clear_cpu_cap(bit) do { \
@@ -118,69 +73,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
 	set_bit(bit, (unsigned long *)cpu_caps_set);	\
 } while (0)
 
-#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS)
-/*
- * Static testing of CPU features.  Used the same as boot_cpu_has().
- * These will statically patch the target code for additional
- * performance.
- */
-static __always_inline __pure bool _static_cpu_has(u16 bit)
-{
-		asm_volatile_goto("1: jmp 6f\n"
-			 "2:\n"
-			 ".skip -(((5f-4f) - (2b-1b)) > 0) * "
-			         "((5f-4f) - (2b-1b)),0x90\n"
-			 "3:\n"
-			 ".section .altinstructions,\"a\"\n"
-			 " .long 1b - .\n"		/* src offset */
-			 " .long 4f - .\n"		/* repl offset */
-			 " .word %P1\n"			/* always replace */
-			 " .byte 3b - 1b\n"		/* src len */
-			 " .byte 5f - 4f\n"		/* repl len */
-			 " .byte 3b - 2b\n"		/* pad len */
-			 ".previous\n"
-			 ".section .altinstr_replacement,\"ax\"\n"
-			 "4: jmp %l[t_no]\n"
-			 "5:\n"
-			 ".previous\n"
-			 ".section .altinstructions,\"a\"\n"
-			 " .long 1b - .\n"		/* src offset */
-			 " .long 0\n"			/* no replacement */
-			 " .word %P0\n"			/* feature bit */
-			 " .byte 3b - 1b\n"		/* src len */
-			 " .byte 0\n"			/* repl len */
-			 " .byte 0\n"			/* pad len */
-			 ".previous\n"
-			 ".section .altinstr_aux,\"ax\"\n"
-			 "6:\n"
-			 " testb %[bitnum],%[cap_byte]\n"
-			 " jnz %l[t_yes]\n"
-			 " jmp %l[t_no]\n"
-			 ".previous\n"
-			 : : "i" (bit), "i" (X86_FEATURE_ALWAYS),
-			     [bitnum] "i" (1 << (bit & 7)),
-			     [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3])
-			 : : t_yes, t_no);
-	t_yes:
-		return true;
-	t_no:
-		return false;
-}
-
-#define static_cpu_has(bit)					\
-(								\
-	__builtin_constant_p(boot_cpu_has(bit)) ?		\
-		boot_cpu_has(bit) :				\
-		_static_cpu_has(bit)				\
-)
-#else
-/*
- * Fall back to dynamic for gcc versions which don't support asm goto. Should be
- * a minority now anyway.
- */
-#define static_cpu_has(bit)		boot_cpu_has(bit)
-#endif
-
 #define cpu_has_bug(c, bit)		cpu_has(c, (bit))
 #define set_cpu_bug(c, bit)		set_cpu_cap(c, (bit))
 #define clear_cpu_bug(c, bit)		clear_cpu_cap(c, (bit))
diff --git a/arch/x86/include/asm/cpuinfo.h b/arch/x86/include/asm/cpuinfo.h
new file mode 100644
index 000000000000..a6632044f199
--- /dev/null
+++ b/arch/x86/include/asm/cpuinfo.h
@@ -0,0 +1,65 @@
+#ifndef _ASM_X86_CPUINFO_H_
+#define _ASM_X86_CPUINFO_H_
+
+/*
+ *  CPU type and hardware bug flags. Kept separately for each CPU.
+ *  Members of this structure are referenced in head.S, so think twice
+ *  before touching them. [mj]
+ */
+struct cpuinfo_x86 {
+	__u8			x86;		/* CPU family */
+	__u8			x86_vendor;	/* CPU vendor */
+	__u8			x86_model;
+	__u8			x86_mask;
+#ifdef CONFIG_X86_32
+	char			wp_works_ok;	/* It doesn't on 386's */
+
+	/* Problems on some 486Dx4's and old 386's: */
+	char			rfu;
+	char			pad0;
+	char			pad1;
+#else
+	/* Number of 4K pages in DTLB/ITLB combined(in pages): */
+	int			x86_tlbsize;
+#endif
+	__u8			x86_virt_bits;
+	__u8			x86_phys_bits;
+	/* CPUID returned core id bits: */
+	__u8			x86_coreid_bits;
+	/* Max extended CPUID function supported: */
+	__u32			extended_cpuid_level;
+	/* Maximum supported CPUID level, -1=no CPUID: */
+	int			cpuid_level;
+	__u32			x86_capability[NCAPINTS + NBUGINTS];
+	char			x86_vendor_id[16];
+	char			x86_model_id[64];
+	/* in KB - valid for CPUS which support this call: */
+	int			x86_cache_size;
+	int			x86_cache_alignment;	/* In bytes */
+	/* Cache QoS architectural values: */
+	int			x86_cache_max_rmid;	/* max index */
+	int			x86_cache_occ_scale;	/* scale to bytes */
+	int			x86_power;
+	unsigned long		loops_per_jiffy;
+	/* cpuid returned max cores value: */
+	u16			 x86_max_cores;
+	u16			apicid;
+	u16			initial_apicid;
+	u16			x86_clflush_size;
+	/* number of cores as seen by the OS: */
+	u16			booted_cores;
+	/* Physical processor id: */
+	u16			phys_proc_id;
+	/* Logical processor id: */
+	u16			logical_proc_id;
+	/* Core id: */
+	u16			cpu_core_id;
+	/* Index into per_cpu list: */
+	u16			cpu_index;
+	u32			microcode;
+};
+
+extern struct cpuinfo_x86	boot_cpu_data;
+extern struct cpuinfo_x86	new_cpu_data;
+
+#endif /* _ASM_X86_CPUINFO_H_ */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 62c6cc3cc5d3..6f6555b20e3d 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -22,6 +22,7 @@ struct vm86;
 #include <asm/nops.h>
 #include <asm/special_insns.h>
 #include <asm/fpu/types.h>
+#include <asm/cpuinfo.h>
 
 #include <linux/personality.h>
 #include <linux/cache.h>
@@ -78,65 +79,6 @@ extern u16 __read_mostly tlb_lld_2m[NR_INFO];
 extern u16 __read_mostly tlb_lld_4m[NR_INFO];
 extern u16 __read_mostly tlb_lld_1g[NR_INFO];
 
-/*
- *  CPU type and hardware bug flags. Kept separately for each CPU.
- *  Members of this structure are referenced in head.S, so think twice
- *  before touching them. [mj]
- */
-
-struct cpuinfo_x86 {
-	__u8			x86;		/* CPU family */
-	__u8			x86_vendor;	/* CPU vendor */
-	__u8			x86_model;
-	__u8			x86_mask;
-#ifdef CONFIG_X86_32
-	char			wp_works_ok;	/* It doesn't on 386's */
-
-	/* Problems on some 486Dx4's and old 386's: */
-	char			rfu;
-	char			pad0;
-	char			pad1;
-#else
-	/* Number of 4K pages in DTLB/ITLB combined(in pages): */
-	int			x86_tlbsize;
-#endif
-	__u8			x86_virt_bits;
-	__u8			x86_phys_bits;
-	/* CPUID returned core id bits: */
-	__u8			x86_coreid_bits;
-	/* Max extended CPUID function supported: */
-	__u32			extended_cpuid_level;
-	/* Maximum supported CPUID level, -1=no CPUID: */
-	int			cpuid_level;
-	__u32			x86_capability[NCAPINTS + NBUGINTS];
-	char			x86_vendor_id[16];
-	char			x86_model_id[64];
-	/* in KB - valid for CPUS which support this call: */
-	int			x86_cache_size;
-	int			x86_cache_alignment;	/* In bytes */
-	/* Cache QoS architectural values: */
-	int			x86_cache_max_rmid;	/* max index */
-	int			x86_cache_occ_scale;	/* scale to bytes */
-	int			x86_power;
-	unsigned long		loops_per_jiffy;
-	/* cpuid returned max cores value: */
-	u16			 x86_max_cores;
-	u16			apicid;
-	u16			initial_apicid;
-	u16			x86_clflush_size;
-	/* number of cores as seen by the OS: */
-	u16			booted_cores;
-	/* Physical processor id: */
-	u16			phys_proc_id;
-	/* Logical processor id: */
-	u16			logical_proc_id;
-	/* Core id: */
-	u16			cpu_core_id;
-	/* Index into per_cpu list: */
-	u16			cpu_index;
-	u32			microcode;
-};
-
 #define X86_VENDOR_INTEL	0
 #define X86_VENDOR_CYRIX	1
 #define X86_VENDOR_AMD		2
@@ -151,9 +93,6 @@ struct cpuinfo_x86 {
 /*
  * capabilities of CPUs
  */
-extern struct cpuinfo_x86	boot_cpu_data;
-extern struct cpuinfo_x86	new_cpu_data;
-
 extern struct tss_struct	doublefault_tss;
 extern __u32			cpu_caps_cleared[NCAPINTS];
 extern __u32			cpu_caps_set[NCAPINTS];
diff --git a/arch/x86/include/asm/static_cpu_has.h b/arch/x86/include/asm/static_cpu_has.h
new file mode 100644
index 000000000000..648ada0c7ffe
--- /dev/null
+++ b/arch/x86/include/asm/static_cpu_has.h
@@ -0,0 +1,116 @@
+#ifndef _ASM_X86_STATIC_CPU_HAS_H
+#define _ASM_X86_STATIC_CPU_HAS_H
+
+#include <asm/cpuinfo.h>
+
+#define test_cpu_cap(c, bit)						\
+	 test_bit(bit, (unsigned long *)((c)->x86_capability))
+
+#define REQUIRED_MASK_BIT_SET(bit)					\
+	 ( (((bit)>>5)==0  && (1UL<<((bit)&31) & REQUIRED_MASK0 )) ||	\
+	   (((bit)>>5)==1  && (1UL<<((bit)&31) & REQUIRED_MASK1 )) ||	\
+	   (((bit)>>5)==2  && (1UL<<((bit)&31) & REQUIRED_MASK2 )) ||	\
+	   (((bit)>>5)==3  && (1UL<<((bit)&31) & REQUIRED_MASK3 )) ||	\
+	   (((bit)>>5)==4  && (1UL<<((bit)&31) & REQUIRED_MASK4 )) ||	\
+	   (((bit)>>5)==5  && (1UL<<((bit)&31) & REQUIRED_MASK5 )) ||	\
+	   (((bit)>>5)==6  && (1UL<<((bit)&31) & REQUIRED_MASK6 )) ||	\
+	   (((bit)>>5)==7  && (1UL<<((bit)&31) & REQUIRED_MASK7 )) ||	\
+	   (((bit)>>5)==8  && (1UL<<((bit)&31) & REQUIRED_MASK8 )) ||	\
+	   (((bit)>>5)==9  && (1UL<<((bit)&31) & REQUIRED_MASK9 )) ||	\
+	   (((bit)>>5)==10 && (1UL<<((bit)&31) & REQUIRED_MASK10)) ||	\
+	   (((bit)>>5)==11 && (1UL<<((bit)&31) & REQUIRED_MASK11)) ||	\
+	   (((bit)>>5)==12 && (1UL<<((bit)&31) & REQUIRED_MASK12)) ||	\
+	   (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK13)) ||	\
+	   (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK14)) ||	\
+	   (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK15)) ||	\
+	   (((bit)>>5)==14 && (1UL<<((bit)&31) & REQUIRED_MASK16)) )
+
+#define DISABLED_MASK_BIT_SET(bit)					\
+	 ( (((bit)>>5)==0  && (1UL<<((bit)&31) & DISABLED_MASK0 )) ||	\
+	   (((bit)>>5)==1  && (1UL<<((bit)&31) & DISABLED_MASK1 )) ||	\
+	   (((bit)>>5)==2  && (1UL<<((bit)&31) & DISABLED_MASK2 )) ||	\
+	   (((bit)>>5)==3  && (1UL<<((bit)&31) & DISABLED_MASK3 )) ||	\
+	   (((bit)>>5)==4  && (1UL<<((bit)&31) & DISABLED_MASK4 )) ||	\
+	   (((bit)>>5)==5  && (1UL<<((bit)&31) & DISABLED_MASK5 )) ||	\
+	   (((bit)>>5)==6  && (1UL<<((bit)&31) & DISABLED_MASK6 )) ||	\
+	   (((bit)>>5)==7  && (1UL<<((bit)&31) & DISABLED_MASK7 )) ||	\
+	   (((bit)>>5)==8  && (1UL<<((bit)&31) & DISABLED_MASK8 )) ||	\
+	   (((bit)>>5)==9  && (1UL<<((bit)&31) & DISABLED_MASK9 )) ||	\
+	   (((bit)>>5)==10 && (1UL<<((bit)&31) & DISABLED_MASK10)) ||	\
+	   (((bit)>>5)==11 && (1UL<<((bit)&31) & DISABLED_MASK11)) ||	\
+	   (((bit)>>5)==12 && (1UL<<((bit)&31) & DISABLED_MASK12)) ||	\
+	   (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK13)) ||	\
+	   (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK14)) ||	\
+	   (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK15)) ||	\
+	   (((bit)>>5)==14 && (1UL<<((bit)&31) & DISABLED_MASK16)) )
+
+#define cpu_has(c, bit)							\
+	(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 :	\
+	 test_cpu_cap(c, bit))
+
+#define boot_cpu_has(bit)	cpu_has(&boot_cpu_data, bit)
+
+#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS)
+/*
+ * Static testing of CPU features.  Used the same as boot_cpu_has().
+ * These will statically patch the target code for additional
+ * performance.
+ */
+static __always_inline __pure bool _static_cpu_has(u16 bit)
+{
+		asm_volatile_goto("1: jmp 6f\n"
+			 "2:\n"
+			 ".skip -(((5f-4f) - (2b-1b)) > 0) * "
+			         "((5f-4f) - (2b-1b)),0x90\n"
+			 "3:\n"
+			 ".section .altinstructions,\"a\"\n"
+			 " .long 1b - .\n"		/* src offset */
+			 " .long 4f - .\n"		/* repl offset */
+			 " .word %P1\n"			/* always replace */
+			 " .byte 3b - 1b\n"		/* src len */
+			 " .byte 5f - 4f\n"		/* repl len */
+			 " .byte 3b - 2b\n"		/* pad len */
+			 ".previous\n"
+			 ".section .altinstr_replacement,\"ax\"\n"
+			 "4: jmp %l[t_no]\n"
+			 "5:\n"
+			 ".previous\n"
+			 ".section .altinstructions,\"a\"\n"
+			 " .long 1b - .\n"		/* src offset */
+			 " .long 0\n"			/* no replacement */
+			 " .word %P0\n"			/* feature bit */
+			 " .byte 3b - 1b\n"		/* src len */
+			 " .byte 0\n"			/* repl len */
+			 " .byte 0\n"			/* pad len */
+			 ".previous\n"
+			 ".section .altinstr_aux,\"ax\"\n"
+			 "6:\n"
+			 " testb %[bitnum],%[cap_byte]\n"
+			 " jnz %l[t_yes]\n"
+			 " jmp %l[t_no]\n"
+			 ".previous\n"
+			 : : "i" (bit), "i" (X86_FEATURE_ALWAYS),
+			     [bitnum] "i" (1 << (bit & 7)),
+			     [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3])
+			 : : t_yes, t_no);
+	t_yes:
+		return true;
+	t_no:
+		return false;
+}
+
+#define static_cpu_has(bit)					\
+(								\
+	__builtin_constant_p(boot_cpu_has(bit)) ?		\
+		boot_cpu_has(bit) :				\
+		_static_cpu_has(bit)				\
+)
+#else
+/*
+ * Fall back to dynamic for gcc versions which don't support asm goto. Should be
+ * a minority now anyway.
+ */
+#define static_cpu_has(bit)		boot_cpu_has(bit)
+#endif
+
+#endif /* _ASM_X86_STATIC_CPU_HAS_H */
diff --git a/lib/Makefile b/lib/Makefile
index a65e9a861535..55ad20701dc0 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -15,9 +15,6 @@ KCOV_INSTRUMENT_rbtree.o := n
 KCOV_INSTRUMENT_list_debug.o := n
 KCOV_INSTRUMENT_debugobjects.o := n
 KCOV_INSTRUMENT_dynamic_debug.o := n
-# Kernel does not boot if we instrument this file as it uses custom calling
-# convention (see CONFIG_ARCH_HWEIGHT_CFLAGS).
-KCOV_INSTRUMENT_hweight.o := n
 
 lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 rbtree.o radix-tree.o dump_stack.o timerqueue.o\
@@ -72,8 +69,6 @@ obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o
 obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o
 obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o
 
-GCOV_PROFILE_hweight.o := n
-CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS))
 obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
 
 obj-$(CONFIG_BTREE) += btree.o
-- 
2.7.3

^ permalink raw reply related	[flat|nested] 84+ messages in thread
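
For context on why the calling convention of __arch_hweight32()/__arch_hweight64() ripples
through the whole kernel (not part of the patch, paraphrased from
include/asm-generic/bitops/const_hweight.h): the generic hweight32() falls back to the arch
helper for any non-constant argument, so every runtime popcount ends up inlining the asm
above:

    /* paraphrased sketch; __const_hweight32() is the compile-time evaluator */
    #define hweight32(w) \
        (__builtin_constant_p(w) ? __const_hweight32(w) : __arch_hweight32(w))

That is why a register-clobbering call inside that asm needed either the special CFLAGS or
the static_cpu_has()/thunk alternatives discussed below.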

* Re: [PATCH -v2] x86/hweight: Get rid of the special calling convention
  2016-05-10 16:53                         ` [PATCH -v2] " Borislav Petkov
@ 2016-05-10 17:23                           ` Peter Zijlstra
  2016-05-10 19:02                             ` Borislav Petkov
  2016-05-10 19:03                             ` H. Peter Anvin
  0 siblings, 2 replies; 84+ messages in thread
From: Peter Zijlstra @ 2016-05-10 17:23 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: x86-ml, Denys Vlasenko, H. Peter Anvin, Brian Gerst, LKML,
	Dmitry Vyukov, Andi Kleen, zengzhaoxiu, Thomas Gleixner,
	Ingo Molnar, Andrew Morton, Kees Cook, Zhaoxiu Zeng,
	Andy Lutomirski

On Tue, May 10, 2016 at 06:53:18PM +0200, Borislav Petkov wrote:
>  static __always_inline unsigned int __arch_hweight32(unsigned int w)
>  {
> -	unsigned int res = 0;
> +	unsigned int res;
>  
> -	asm (ALTERNATIVE("call __sw_hweight32", POPCNT32, X86_FEATURE_POPCNT)
> -		     : "="REG_OUT (res)
> -		     : REG_IN (w));
> +	if (likely(static_cpu_has(X86_FEATURE_POPCNT))) {
> +		/* popcnt %eax, %eax */
> +		asm volatile(POPCNT32
> +				: "="REG_OUT (res)
> +				: REG_IN (w));
>  
> -	return res;
> +		return res;
> +	}
> +	return __sw_hweight32(w);
>  }

So what was wrong with using the normal thunk_*.S wrappers for the
calls? That would allow you to use the alternative() stuff which does
generate smaller code.

^ permalink raw reply	[flat|nested] 84+ messages in thread
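
One way to read the thunk suggestion (sketch only: __sw_hweight32_thunk is a hypothetical
symbol here; the real wrapper would sit next to the other thunk_*.S helpers and save/restore
every register the C __sw_hweight32() may clobber) is to keep the original single-instruction
alternative and only change the call target:

    static __always_inline unsigned int __arch_hweight32(unsigned int w)
    {
        unsigned int res;

        /*
         * Patched to POPCNT32 on CPUs with the feature; otherwise it stays
         * a 5-byte call to the register-preserving thunk, which in turn
         * calls __sw_hweight32(), so lib/hweight.c needs no special CFLAGS.
         */
        asm (ALTERNATIVE("call __sw_hweight32_thunk", POPCNT32, X86_FEATURE_POPCNT)
             : "="REG_OUT (res)
             : REG_IN (w));

        return res;
    }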

* Re: [PATCH -v2] x86/hweight: Get rid of the special calling convention
  2016-05-10 17:23                           ` Peter Zijlstra
@ 2016-05-10 19:02                             ` Borislav Petkov
  2016-05-10 19:03                             ` H. Peter Anvin
  1 sibling, 0 replies; 84+ messages in thread
From: Borislav Petkov @ 2016-05-10 19:02 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: x86-ml, Denys Vlasenko, H. Peter Anvin, Brian Gerst, LKML,
	Dmitry Vyukov, Andi Kleen, zengzhaoxiu, Thomas Gleixner,
	Ingo Molnar, Andrew Morton, Kees Cook, Zhaoxiu Zeng,
	Andy Lutomirski

On Tue, May 10, 2016 at 07:23:13PM +0200, Peter Zijlstra wrote:
> So what was wrong with using the normal thunk_*.S wrappers for the
> calls? That would allow you to use the alternative() stuff which does
> generate smaller code.

Yeah, so a full allyesconfig vmlinux gives ~22K .text size increase:

   text    data     bss     dec     hex filename
85391772        105899159       70717440        262008371       f9dee33 vmlinux 	before
85413991        105899223       70746112        262059326       f9eb53e vmlinux		after
--------
   22219

I guess I better try the thunk stuff, might make it smaller.

Also, in the next version I'll split out the static_cpu_has() move into
a separate patch, as you requested on IRC.

Thanks.

-- 
Regards/Gruss,
    Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
-- 

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH -v2] x86/hweight: Get rid of the special calling convention
  2016-05-10 17:23                           ` Peter Zijlstra
  2016-05-10 19:02                             ` Borislav Petkov
@ 2016-05-10 19:03                             ` H. Peter Anvin
  2016-05-10 19:10                               ` Borislav Petkov
  1 sibling, 1 reply; 84+ messages in thread
From: H. Peter Anvin @ 2016-05-10 19:03 UTC (permalink / raw)
  To: Peter Zijlstra, Borislav Petkov
  Cc: x86-ml, Denys Vlasenko, Brian Gerst, LKML, Dmitry Vyukov,
	Andi Kleen, zengzhaoxiu, Thomas Gleixner, Ingo Molnar,
	Andrew Morton, Kees Cook, Zhaoxiu Zeng, Andy Lutomirski

On May 10, 2016 10:23:13 AM PDT, Peter Zijlstra <peterz@infradead.org> wrote:
>On Tue, May 10, 2016 at 06:53:18PM +0200, Borislav Petkov wrote:
>>  static __always_inline unsigned int __arch_hweight32(unsigned int w)
>>  {
>> -	unsigned int res = 0;
>> +	unsigned int res;
>>  
>> -	asm (ALTERNATIVE("call __sw_hweight32", POPCNT32,
>X86_FEATURE_POPCNT)
>> -		     : "="REG_OUT (res)
>> -		     : REG_IN (w));
>> +	if (likely(static_cpu_has(X86_FEATURE_POPCNT))) {
>> +		/* popcnt %eax, %eax */
>> +		asm volatile(POPCNT32
>> +				: "="REG_OUT (res)
>> +				: REG_IN (w));
>>  
>> -	return res;
>> +		return res;
>> +	}
>> +	return __sw_hweight32(w);
>>  }
>
>So what was wrong with using the normal thunk_*.S wrappers for the
>calls? That would allow you to use the alternative() stuff which does
>generate smaller code.

Also, to be fair... if the problem is with these being in C then we could just do it in assembly easily enough.
-- 
Sent from my Android device with K-9 Mail. Please excuse brevity and formatting.

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH -v2] x86/hweight: Get rid of the special calling convention
  2016-05-10 19:03                             ` H. Peter Anvin
@ 2016-05-10 19:10                               ` Borislav Petkov
  2016-05-10 22:30                                 ` H. Peter Anvin
  0 siblings, 1 reply; 84+ messages in thread
From: Borislav Petkov @ 2016-05-10 19:10 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Peter Zijlstra, x86-ml, Denys Vlasenko, Brian Gerst, LKML,
	Dmitry Vyukov, Andi Kleen, zengzhaoxiu, Thomas Gleixner,
	Ingo Molnar, Andrew Morton, Kees Cook, Zhaoxiu Zeng,
	Andy Lutomirski

On Tue, May 10, 2016 at 12:03:48PM -0700, H. Peter Anvin wrote:
> Also, to be fair... if the problem is with these being in C then we
> could just do it in assembly easily enough.

I thought about converting the __sw_hweight* variants to asm but
__sw_hweight32, for example, is 55 bytes here and that's a lot.

Or do you have a better idea?

peterz's sounds ok to me: we call a thunk which then calls __sw_hweight*
after having saved regs properly - problem solved.

-- 
Regards/Gruss,
    Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
-- 

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH -v2] x86/hweight: Get rid of the special calling convention
  2016-05-10 19:10                               ` Borislav Petkov
@ 2016-05-10 22:30                                 ` H. Peter Anvin
  2016-05-11  4:11                                   ` Borislav Petkov
  0 siblings, 1 reply; 84+ messages in thread
From: H. Peter Anvin @ 2016-05-10 22:30 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Peter Zijlstra, x86-ml, Denys Vlasenko, Brian Gerst, LKML,
	Dmitry Vyukov, Andi Kleen, zengzhaoxiu, Thomas Gleixner,
	Ingo Molnar, Andrew Morton, Kees Cook, Zhaoxiu Zeng,
	Andy Lutomirski

On 05/10/16 12:10, Borislav Petkov wrote:
> On Tue, May 10, 2016 at 12:03:48PM -0700, H. Peter Anvin wrote:
>> Also, to be fair... if the problem is with these being in C then we
>> could just do it in assembly easily enough.
> 
> I thought about converting the __sw_hweight* variants to asm but
> __sw_hweight32, for example, is 55 bytes here and that's a lot.
> 
> Or do you have a better idea?
> 
> peterz's sounds ok to me: we call a thunk which then calls __sw_hweight*
> after having saved regs properly - problem solved.
> 

I didn't mean inline assembly.

	-hpa

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH -v2] x86/hweight: Get rid of the special calling convention
  2016-05-10 22:30                                 ` H. Peter Anvin
@ 2016-05-11  4:11                                   ` Borislav Petkov
  2016-05-11 11:15                                     ` Brian Gerst
  0 siblings, 1 reply; 84+ messages in thread
From: Borislav Petkov @ 2016-05-11  4:11 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Peter Zijlstra, x86-ml, Denys Vlasenko, Brian Gerst, LKML,
	Dmitry Vyukov, Andi Kleen, zengzhaoxiu, Thomas Gleixner,
	Ingo Molnar, Andrew Morton, Kees Cook, Zhaoxiu Zeng,
	Andy Lutomirski

On Tue, May 10, 2016 at 03:30:48PM -0700, H. Peter Anvin wrote:
> I didn't mean inline assembly.

How does that matter?

The problem is having as few insn bytes as possible and the minimal
size we can do is issuing POPCNT everywhere which is 4 or 5 bytes. The
alternatives then replace that with a CALL which is also 5 bytes.

The way I did it now, it adds 22K more to allyesconfig vmlinux due to
the static_cpu_has doubled alternatives sections and the JMPs. The
thunks will keep those 5 bytes *and* get rid of the calling convention
without the growth.

Or?

-- 
Regards/Gruss,
    Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
-- 

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH -v2] x86/hweight: Get rid of the special calling convention
  2016-05-11  4:11                                   ` Borislav Petkov
@ 2016-05-11 11:15                                     ` Brian Gerst
  2016-05-11 11:24                                       ` Peter Zijlstra
  0 siblings, 1 reply; 84+ messages in thread
From: Brian Gerst @ 2016-05-11 11:15 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: H. Peter Anvin, Peter Zijlstra, x86-ml, Denys Vlasenko, LKML,
	Dmitry Vyukov, Andi Kleen, zengzhaoxiu, Thomas Gleixner,
	Ingo Molnar, Andrew Morton, Kees Cook, Zhaoxiu Zeng,
	Andy Lutomirski

On Wed, May 11, 2016 at 12:11 AM, Borislav Petkov <bp@suse.de> wrote:
> On Tue, May 10, 2016 at 03:30:48PM -0700, H. Peter Anvin wrote:
>> I didn't mean inline assembly.
>
> How does that matter?
>
> The problem is having as few insn bytes as possible and the minimal
> size we can do is issuing POPCNT everywhere which is 4 or 5 bytes. The
> alternatives then replace that with a CALL which is also 5 bytes.
>
> The way I did it now, it adds 22K more to allyesconfig vmlinux due to
> the static_cpu_has doubled alternatives sections and the JMPs. The
> thunks will keep those 5 bytes *and* get rid of the calling convention
> without the growth.
>
> Or?

I think he meant the out of line version would be asm, so you could
control what registers were clobbered.

--
Brian Gerst

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH -v2] x86/hweight: Get rid of the special calling convention
  2016-05-11 11:15                                     ` Brian Gerst
@ 2016-05-11 11:24                                       ` Peter Zijlstra
  2016-05-11 12:47                                         ` Borislav Petkov
  2016-05-12  4:54                                         ` H. Peter Anvin
  0 siblings, 2 replies; 84+ messages in thread
From: Peter Zijlstra @ 2016-05-11 11:24 UTC (permalink / raw)
  To: Brian Gerst
  Cc: Borislav Petkov, H. Peter Anvin, x86-ml, Denys Vlasenko, LKML,
	Dmitry Vyukov, Andi Kleen, zengzhaoxiu, Thomas Gleixner,
	Ingo Molnar, Andrew Morton, Kees Cook, Zhaoxiu Zeng,
	Andy Lutomirski

On Wed, May 11, 2016 at 07:15:19AM -0400, Brian Gerst wrote:

> I think he meant the out of line version would be asm, so you could
> control what registers were clobbered.

Yeah, it might save a few cycles on the call, but given that most
machines should have popcnt these days is it worth the hassle/cost of
duplicating the lib/hweight.c magic in asm (and remember, twice, once
for 32bit and once for 64bit) ?

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH -v2] x86/hweight: Get rid of the special calling convention
  2016-05-11 11:24                                       ` Peter Zijlstra
@ 2016-05-11 12:47                                         ` Borislav Petkov
  2016-05-12  4:54                                         ` H. Peter Anvin
  1 sibling, 0 replies; 84+ messages in thread
From: Borislav Petkov @ 2016-05-11 12:47 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Brian Gerst, H. Peter Anvin, x86-ml, Denys Vlasenko, LKML,
	Dmitry Vyukov, Andi Kleen, zengzhaoxiu, Thomas Gleixner,
	Ingo Molnar, Andrew Morton, Kees Cook, Zhaoxiu Zeng,
	Andy Lutomirski

On Wed, May 11, 2016 at 01:24:09PM +0200, Peter Zijlstra wrote:
> Yeah, it might save a few cycles on the call, but given that most
> machines should have popcnt these days is it worth the hassle/cost of
> duplicating the lib/hweight.c magic in asm (and remember, twice, once
> for 32bit and once for 64bit) ?

Makes sense to me - we can do the funky cool stuff but considering the
fact that we have all the required bits already, I think we should do
the thunking game and be done with it.

I guess we're older, lazier and more realistic now...

-- 
Regards/Gruss,
    Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
-- 

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH -v2] x86/hweight: Get rid of the special calling convention
  2016-05-11 11:24                                       ` Peter Zijlstra
  2016-05-11 12:47                                         ` Borislav Petkov
@ 2016-05-12  4:54                                         ` H. Peter Anvin
  2016-05-12 11:57                                           ` Borislav Petkov
  1 sibling, 1 reply; 84+ messages in thread
From: H. Peter Anvin @ 2016-05-12  4:54 UTC (permalink / raw)
  To: Peter Zijlstra, Brian Gerst
  Cc: Borislav Petkov, x86-ml, Denys Vlasenko, LKML, Dmitry Vyukov,
	Andi Kleen, zengzhaoxiu, Thomas Gleixner, Ingo Molnar,
	Andrew Morton, Kees Cook, Zhaoxiu Zeng, Andy Lutomirski

On May 11, 2016 4:24:09 AM PDT, Peter Zijlstra <peterz@infradead.org> wrote:
>On Wed, May 11, 2016 at 07:15:19AM -0400, Brian Gerst wrote:
>
>> I think he meant the out of line version would be asm, so you could
>> control what registers were clobbered.
>
>Yeah, it might save a few cycles on the call, but given that most
>machines should have popcnt these days is it worth the hassle/cost of
>duplicating the lib/hweight.c magic in asm (and remember, twice, once
>for 32bit and once for 64bit) ?

I was thinking it isn't really very complex code even in assembly as it is super-regular; you can even crib the gcc-generated code if you wish.
-- 
Sent from my Android device with K-9 Mail. Please excuse brevity and formatting.

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH -v2] x86/hweight: Get rid of the special calling convention
  2016-05-12  4:54                                         ` H. Peter Anvin
@ 2016-05-12 11:57                                           ` Borislav Petkov
  2016-05-12 12:14                                             ` Peter Zijlstra
  0 siblings, 1 reply; 84+ messages in thread
From: Borislav Petkov @ 2016-05-12 11:57 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Peter Zijlstra, Brian Gerst, x86-ml, Denys Vlasenko, LKML,
	Dmitry Vyukov, Andi Kleen, zengzhaoxiu, Thomas Gleixner,
	Ingo Molnar, Andrew Morton, Kees Cook, Zhaoxiu Zeng,
	Andy Lutomirski

On Wed, May 11, 2016 at 09:54:50PM -0700, H. Peter Anvin wrote:
> I was thinking it isn't really very complex code even in assembly as
> it is super-regular; you can even crib the gcc-generated code if you
> wish.

Do I wanna do experiments in asm? Always! :-)

Ok, so I did steal gcc -m32 -O3 output because there it uses only one
additional register. So how about this (only __sw_hweight32 today):

#ifdef CONFIG_X86_32
# define PUSH_DX        "pushl %%edx\n\t"
# define POP_DX         "popl %%edx\n\t"
#else
# define PUSH_DX        "pushq %%rdx\n\t"
# define POP_DX         "popq %%rdx\n\t"
#endif

unsigned int __sw_hweight32(unsigned int w)
{
        asm volatile(PUSH_DX
                     "movl %[w], %%edx\n\t"                     /* w -> t */
                     "shrl %%edx\n\t"                           /* t >> 1 */
                     "andl $0x55555555, %%edx\n\t"              /* t & 0x55555555 */
                     "subl %%edx, %[w]\n"                       /* w -= t */
                     "\n\t"
                     "movl %[w], %%edx\n\t"                     /* w -> t */
                     "shrl $2, %[w]\n\t"                        /* w_tmp >> 2 */
                     "andl $0x33333333, %%edx\n\t"              /* t & 0x33333333 */
                     "andl $0x33333333, %[w]\n\t"               /* w_tmp & 0x33333333 */
                     "addl %%edx, %[w]\n"                       /* w = w_tmp + t */
                     "\n\t"
                     "movl %[w], %%edx\n\t"                     /* w -> t */
                     "shrl $4, %%edx\n\t"                       /* t >> 4 */
                     "addl %%edx, %[w]\n\t"                     /* w_tmp += t */
                     "andl  $0x0f0f0f0f, %[w]\n\t"              /* w_tmp &= 0x0f0f0f0f */
                     "imull $0x01010101, %[w], %[w]\n\t"        /* w_tmp *= 0x01010101 */
                     "shrl $24, %[w]\n\t"                       /* w = w_tmp >> 24 */
                     POP_DX
                     : [w] "+r" (w));

        return w;
}

I've chosen rDX as a temp because gcc takes that one but it doesn't
matter which - we're stashing it.

And then we rely on gcc to figure out which reg to use for w. It ends up
using rAX as that is the return reg which fits nicely with our intention
of returning POPCNT values in rAX.

I'm guessing we can just as well write %%rax in the asm because we're
returning that value and that's ABI.

Generated asm looks ok, only on 64-bit it does one

	movl    %edi, %eax      # w, w

before the inline asm in order to stick w in rAX.

Complaints?
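
For reference while reading the asm, the bit-sliced algorithm it transcribes is the usual one
from lib/hweight.c (the fast-multiplier variant); a plain-C rendering, included here purely
for comparison, is:

    static inline unsigned int c_hweight32(unsigned int w)
    {
        w -= (w >> 1) & 0x55555555;                       /* 2-bit sums */
        w  = (w & 0x33333333) + ((w >> 2) & 0x33333333);  /* 4-bit sums */
        w  = (w + (w >> 4)) & 0x0f0f0f0f;                 /* per-byte sums */
        return (w * 0x01010101) >> 24;                    /* add the four bytes */
    }

Each line maps to one instruction group above: the shr/and/sub, the pair of 0x33333333 masks
plus the add, the shift-add-and with 0x0f0f0f0f, and the final imul/shr by 24.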

Full diff:

---
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7bb15747fea2..79e0bcd61cb1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -292,11 +292,6 @@ config X86_32_LAZY_GS
 	def_bool y
 	depends on X86_32 && !CC_STACKPROTECTOR
 
-config ARCH_HWEIGHT_CFLAGS
-	string
-	default "-fcall-saved-ecx -fcall-saved-edx" if X86_32
-	default "-fcall-saved-rdi -fcall-saved-rsi -fcall-saved-rdx -fcall-saved-rcx -fcall-saved-r8 -fcall-saved-r9 -fcall-saved-r10 -fcall-saved-r11" if X86_64
-
 config ARCH_SUPPORTS_UPROBES
 	def_bool y
 
diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h
index 02e799fa43d1..7dd97eaba67d 100644
--- a/arch/x86/include/asm/arch_hweight.h
+++ b/arch/x86/include/asm/arch_hweight.h
@@ -2,10 +2,11 @@
 #define _ASM_X86_HWEIGHT_H
 
 #include <asm/cpufeatures.h>
+#include <asm/static_cpu_has.h>
 
 #ifdef CONFIG_64BIT
-/* popcnt %edi, %eax -- redundant REX prefix for alignment */
-#define POPCNT32 ".byte 0xf3,0x40,0x0f,0xb8,0xc7"
+/* popcnt %edi, %eax */
+#define POPCNT32 ".byte 0xf3,0x0f,0xb8,0xc7"
 /* popcnt %rdi, %rax */
 #define POPCNT64 ".byte 0xf3,0x48,0x0f,0xb8,0xc7"
 #define REG_IN "D"
@@ -17,19 +18,15 @@
 #define REG_OUT "a"
 #endif
 
-/*
- * __sw_hweightXX are called from within the alternatives below
- * and callee-clobbered registers need to be taken care of. See
- * ARCH_HWEIGHT_CFLAGS in <arch/x86/Kconfig> for the respective
- * compiler switches.
- */
+#define __HAVE_ARCH_SW_HWEIGHT
+
 static __always_inline unsigned int __arch_hweight32(unsigned int w)
 {
-	unsigned int res = 0;
+	unsigned int res;
 
 	asm (ALTERNATIVE("call __sw_hweight32", POPCNT32, X86_FEATURE_POPCNT)
-		     : "="REG_OUT (res)
-		     : REG_IN (w));
+			: "="REG_OUT (res)
+			: REG_IN (w));
 
 	return res;
 }
@@ -53,13 +50,16 @@ static inline unsigned long __arch_hweight64(__u64 w)
 #else
 static __always_inline unsigned long __arch_hweight64(__u64 w)
 {
-	unsigned long res = 0;
+	unsigned long res;
 
-	asm (ALTERNATIVE("call __sw_hweight64", POPCNT64, X86_FEATURE_POPCNT)
-		     : "="REG_OUT (res)
-		     : REG_IN (w));
+	if (likely(static_cpu_has(X86_FEATURE_POPCNT))) {
+		asm volatile(POPCNT64
+				: "="REG_OUT (res)
+				: REG_IN (w));
 
-	return res;
+		return res;
+	}
+	return __sw_hweight64(w);
 }
 #endif /* CONFIG_X86_32 */
 
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 07c942d84662..9a70b12ae8df 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -6,6 +6,8 @@
 #if defined(__KERNEL__) && !defined(__ASSEMBLY__)
 
 #include <asm/asm.h>
+#include <asm/static_cpu_has.h>
+
 #include <linux/bitops.h>
 
 enum cpuid_leafs
@@ -45,51 +47,6 @@ extern const char * const x86_power_flags[32];
  */
 extern const char * const x86_bug_flags[NBUGINTS*32];
 
-#define test_cpu_cap(c, bit)						\
-	 test_bit(bit, (unsigned long *)((c)->x86_capability))
-
-#define REQUIRED_MASK_BIT_SET(bit)					\
-	 ( (((bit)>>5)==0  && (1UL<<((bit)&31) & REQUIRED_MASK0 )) ||	\
-	   (((bit)>>5)==1  && (1UL<<((bit)&31) & REQUIRED_MASK1 )) ||	\
-	   (((bit)>>5)==2  && (1UL<<((bit)&31) & REQUIRED_MASK2 )) ||	\
-	   (((bit)>>5)==3  && (1UL<<((bit)&31) & REQUIRED_MASK3 )) ||	\
-	   (((bit)>>5)==4  && (1UL<<((bit)&31) & REQUIRED_MASK4 )) ||	\
-	   (((bit)>>5)==5  && (1UL<<((bit)&31) & REQUIRED_MASK5 )) ||	\
-	   (((bit)>>5)==6  && (1UL<<((bit)&31) & REQUIRED_MASK6 )) ||	\
-	   (((bit)>>5)==7  && (1UL<<((bit)&31) & REQUIRED_MASK7 )) ||	\
-	   (((bit)>>5)==8  && (1UL<<((bit)&31) & REQUIRED_MASK8 )) ||	\
-	   (((bit)>>5)==9  && (1UL<<((bit)&31) & REQUIRED_MASK9 )) ||	\
-	   (((bit)>>5)==10 && (1UL<<((bit)&31) & REQUIRED_MASK10)) ||	\
-	   (((bit)>>5)==11 && (1UL<<((bit)&31) & REQUIRED_MASK11)) ||	\
-	   (((bit)>>5)==12 && (1UL<<((bit)&31) & REQUIRED_MASK12)) ||	\
-	   (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK13)) ||	\
-	   (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK14)) ||	\
-	   (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK15)) ||	\
-	   (((bit)>>5)==14 && (1UL<<((bit)&31) & REQUIRED_MASK16)) )
-
-#define DISABLED_MASK_BIT_SET(bit)					\
-	 ( (((bit)>>5)==0  && (1UL<<((bit)&31) & DISABLED_MASK0 )) ||	\
-	   (((bit)>>5)==1  && (1UL<<((bit)&31) & DISABLED_MASK1 )) ||	\
-	   (((bit)>>5)==2  && (1UL<<((bit)&31) & DISABLED_MASK2 )) ||	\
-	   (((bit)>>5)==3  && (1UL<<((bit)&31) & DISABLED_MASK3 )) ||	\
-	   (((bit)>>5)==4  && (1UL<<((bit)&31) & DISABLED_MASK4 )) ||	\
-	   (((bit)>>5)==5  && (1UL<<((bit)&31) & DISABLED_MASK5 )) ||	\
-	   (((bit)>>5)==6  && (1UL<<((bit)&31) & DISABLED_MASK6 )) ||	\
-	   (((bit)>>5)==7  && (1UL<<((bit)&31) & DISABLED_MASK7 )) ||	\
-	   (((bit)>>5)==8  && (1UL<<((bit)&31) & DISABLED_MASK8 )) ||	\
-	   (((bit)>>5)==9  && (1UL<<((bit)&31) & DISABLED_MASK9 )) ||	\
-	   (((bit)>>5)==10 && (1UL<<((bit)&31) & DISABLED_MASK10)) ||	\
-	   (((bit)>>5)==11 && (1UL<<((bit)&31) & DISABLED_MASK11)) ||	\
-	   (((bit)>>5)==12 && (1UL<<((bit)&31) & DISABLED_MASK12)) ||	\
-	   (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK13)) ||	\
-	   (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK14)) ||	\
-	   (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK15)) ||	\
-	   (((bit)>>5)==14 && (1UL<<((bit)&31) & DISABLED_MASK16)) )
-
-#define cpu_has(c, bit)							\
-	(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 :	\
-	 test_cpu_cap(c, bit))
-
 #define this_cpu_has(bit)						\
 	(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : 	\
 	 x86_this_cpu_test_bit(bit, (unsigned long *)&cpu_info.x86_capability))
@@ -105,8 +62,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
 #define cpu_feature_enabled(bit)	\
 	(__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : static_cpu_has(bit))
 
-#define boot_cpu_has(bit)	cpu_has(&boot_cpu_data, bit)
-
 #define set_cpu_cap(c, bit)	set_bit(bit, (unsigned long *)((c)->x86_capability))
 #define clear_cpu_cap(c, bit)	clear_bit(bit, (unsigned long *)((c)->x86_capability))
 #define setup_clear_cpu_cap(bit) do { \
@@ -118,69 +73,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
 	set_bit(bit, (unsigned long *)cpu_caps_set);	\
 } while (0)
 
-#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS)
-/*
- * Static testing of CPU features.  Used the same as boot_cpu_has().
- * These will statically patch the target code for additional
- * performance.
- */
-static __always_inline __pure bool _static_cpu_has(u16 bit)
-{
-		asm_volatile_goto("1: jmp 6f\n"
-			 "2:\n"
-			 ".skip -(((5f-4f) - (2b-1b)) > 0) * "
-			         "((5f-4f) - (2b-1b)),0x90\n"
-			 "3:\n"
-			 ".section .altinstructions,\"a\"\n"
-			 " .long 1b - .\n"		/* src offset */
-			 " .long 4f - .\n"		/* repl offset */
-			 " .word %P1\n"			/* always replace */
-			 " .byte 3b - 1b\n"		/* src len */
-			 " .byte 5f - 4f\n"		/* repl len */
-			 " .byte 3b - 2b\n"		/* pad len */
-			 ".previous\n"
-			 ".section .altinstr_replacement,\"ax\"\n"
-			 "4: jmp %l[t_no]\n"
-			 "5:\n"
-			 ".previous\n"
-			 ".section .altinstructions,\"a\"\n"
-			 " .long 1b - .\n"		/* src offset */
-			 " .long 0\n"			/* no replacement */
-			 " .word %P0\n"			/* feature bit */
-			 " .byte 3b - 1b\n"		/* src len */
-			 " .byte 0\n"			/* repl len */
-			 " .byte 0\n"			/* pad len */
-			 ".previous\n"
-			 ".section .altinstr_aux,\"ax\"\n"
-			 "6:\n"
-			 " testb %[bitnum],%[cap_byte]\n"
-			 " jnz %l[t_yes]\n"
-			 " jmp %l[t_no]\n"
-			 ".previous\n"
-			 : : "i" (bit), "i" (X86_FEATURE_ALWAYS),
-			     [bitnum] "i" (1 << (bit & 7)),
-			     [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3])
-			 : : t_yes, t_no);
-	t_yes:
-		return true;
-	t_no:
-		return false;
-}
-
-#define static_cpu_has(bit)					\
-(								\
-	__builtin_constant_p(boot_cpu_has(bit)) ?		\
-		boot_cpu_has(bit) :				\
-		_static_cpu_has(bit)				\
-)
-#else
-/*
- * Fall back to dynamic for gcc versions which don't support asm goto. Should be
- * a minority now anyway.
- */
-#define static_cpu_has(bit)		boot_cpu_has(bit)
-#endif
-
 #define cpu_has_bug(c, bit)		cpu_has(c, (bit))
 #define set_cpu_bug(c, bit)		set_cpu_cap(c, (bit))
 #define clear_cpu_bug(c, bit)		clear_cpu_cap(c, (bit))
diff --git a/arch/x86/include/asm/cpuinfo.h b/arch/x86/include/asm/cpuinfo.h
new file mode 100644
index 000000000000..a6632044f199
--- /dev/null
+++ b/arch/x86/include/asm/cpuinfo.h
@@ -0,0 +1,65 @@
+#ifndef _ASM_X86_CPUINFO_H_
+#define _ASM_X86_CPUINFO_H_
+
+/*
+ *  CPU type and hardware bug flags. Kept separately for each CPU.
+ *  Members of this structure are referenced in head.S, so think twice
+ *  before touching them. [mj]
+ */
+struct cpuinfo_x86 {
+	__u8			x86;		/* CPU family */
+	__u8			x86_vendor;	/* CPU vendor */
+	__u8			x86_model;
+	__u8			x86_mask;
+#ifdef CONFIG_X86_32
+	char			wp_works_ok;	/* It doesn't on 386's */
+
+	/* Problems on some 486Dx4's and old 386's: */
+	char			rfu;
+	char			pad0;
+	char			pad1;
+#else
+	/* Number of 4K pages in DTLB/ITLB combined(in pages): */
+	int			x86_tlbsize;
+#endif
+	__u8			x86_virt_bits;
+	__u8			x86_phys_bits;
+	/* CPUID returned core id bits: */
+	__u8			x86_coreid_bits;
+	/* Max extended CPUID function supported: */
+	__u32			extended_cpuid_level;
+	/* Maximum supported CPUID level, -1=no CPUID: */
+	int			cpuid_level;
+	__u32			x86_capability[NCAPINTS + NBUGINTS];
+	char			x86_vendor_id[16];
+	char			x86_model_id[64];
+	/* in KB - valid for CPUS which support this call: */
+	int			x86_cache_size;
+	int			x86_cache_alignment;	/* In bytes */
+	/* Cache QoS architectural values: */
+	int			x86_cache_max_rmid;	/* max index */
+	int			x86_cache_occ_scale;	/* scale to bytes */
+	int			x86_power;
+	unsigned long		loops_per_jiffy;
+	/* cpuid returned max cores value: */
+	u16			 x86_max_cores;
+	u16			apicid;
+	u16			initial_apicid;
+	u16			x86_clflush_size;
+	/* number of cores as seen by the OS: */
+	u16			booted_cores;
+	/* Physical processor id: */
+	u16			phys_proc_id;
+	/* Logical processor id: */
+	u16			logical_proc_id;
+	/* Core id: */
+	u16			cpu_core_id;
+	/* Index into per_cpu list: */
+	u16			cpu_index;
+	u32			microcode;
+};
+
+extern struct cpuinfo_x86	boot_cpu_data;
+extern struct cpuinfo_x86	new_cpu_data;
+
+#endif /* _ASM_X86_CPUINFO_H_ */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 62c6cc3cc5d3..6f6555b20e3d 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -22,6 +22,7 @@ struct vm86;
 #include <asm/nops.h>
 #include <asm/special_insns.h>
 #include <asm/fpu/types.h>
+#include <asm/cpuinfo.h>
 
 #include <linux/personality.h>
 #include <linux/cache.h>
@@ -78,65 +79,6 @@ extern u16 __read_mostly tlb_lld_2m[NR_INFO];
 extern u16 __read_mostly tlb_lld_4m[NR_INFO];
 extern u16 __read_mostly tlb_lld_1g[NR_INFO];
 
-/*
- *  CPU type and hardware bug flags. Kept separately for each CPU.
- *  Members of this structure are referenced in head.S, so think twice
- *  before touching them. [mj]
- */
-
-struct cpuinfo_x86 {
-	__u8			x86;		/* CPU family */
-	__u8			x86_vendor;	/* CPU vendor */
-	__u8			x86_model;
-	__u8			x86_mask;
-#ifdef CONFIG_X86_32
-	char			wp_works_ok;	/* It doesn't on 386's */
-
-	/* Problems on some 486Dx4's and old 386's: */
-	char			rfu;
-	char			pad0;
-	char			pad1;
-#else
-	/* Number of 4K pages in DTLB/ITLB combined(in pages): */
-	int			x86_tlbsize;
-#endif
-	__u8			x86_virt_bits;
-	__u8			x86_phys_bits;
-	/* CPUID returned core id bits: */
-	__u8			x86_coreid_bits;
-	/* Max extended CPUID function supported: */
-	__u32			extended_cpuid_level;
-	/* Maximum supported CPUID level, -1=no CPUID: */
-	int			cpuid_level;
-	__u32			x86_capability[NCAPINTS + NBUGINTS];
-	char			x86_vendor_id[16];
-	char			x86_model_id[64];
-	/* in KB - valid for CPUS which support this call: */
-	int			x86_cache_size;
-	int			x86_cache_alignment;	/* In bytes */
-	/* Cache QoS architectural values: */
-	int			x86_cache_max_rmid;	/* max index */
-	int			x86_cache_occ_scale;	/* scale to bytes */
-	int			x86_power;
-	unsigned long		loops_per_jiffy;
-	/* cpuid returned max cores value: */
-	u16			 x86_max_cores;
-	u16			apicid;
-	u16			initial_apicid;
-	u16			x86_clflush_size;
-	/* number of cores as seen by the OS: */
-	u16			booted_cores;
-	/* Physical processor id: */
-	u16			phys_proc_id;
-	/* Logical processor id: */
-	u16			logical_proc_id;
-	/* Core id: */
-	u16			cpu_core_id;
-	/* Index into per_cpu list: */
-	u16			cpu_index;
-	u32			microcode;
-};
-
 #define X86_VENDOR_INTEL	0
 #define X86_VENDOR_CYRIX	1
 #define X86_VENDOR_AMD		2
@@ -151,9 +93,6 @@ struct cpuinfo_x86 {
 /*
  * capabilities of CPUs
  */
-extern struct cpuinfo_x86	boot_cpu_data;
-extern struct cpuinfo_x86	new_cpu_data;
-
 extern struct tss_struct	doublefault_tss;
 extern __u32			cpu_caps_cleared[NCAPINTS];
 extern __u32			cpu_caps_set[NCAPINTS];
diff --git a/arch/x86/include/asm/static_cpu_has.h b/arch/x86/include/asm/static_cpu_has.h
new file mode 100644
index 000000000000..648ada0c7ffe
--- /dev/null
+++ b/arch/x86/include/asm/static_cpu_has.h
@@ -0,0 +1,116 @@
+#ifndef _ASM_X86_STATIC_CPU_HAS_H
+#define _ASM_X86_STATIC_CPU_HAS_H
+
+#include <asm/cpuinfo.h>
+
+#define test_cpu_cap(c, bit)						\
+	 test_bit(bit, (unsigned long *)((c)->x86_capability))
+
+#define REQUIRED_MASK_BIT_SET(bit)					\
+	 ( (((bit)>>5)==0  && (1UL<<((bit)&31) & REQUIRED_MASK0 )) ||	\
+	   (((bit)>>5)==1  && (1UL<<((bit)&31) & REQUIRED_MASK1 )) ||	\
+	   (((bit)>>5)==2  && (1UL<<((bit)&31) & REQUIRED_MASK2 )) ||	\
+	   (((bit)>>5)==3  && (1UL<<((bit)&31) & REQUIRED_MASK3 )) ||	\
+	   (((bit)>>5)==4  && (1UL<<((bit)&31) & REQUIRED_MASK4 )) ||	\
+	   (((bit)>>5)==5  && (1UL<<((bit)&31) & REQUIRED_MASK5 )) ||	\
+	   (((bit)>>5)==6  && (1UL<<((bit)&31) & REQUIRED_MASK6 )) ||	\
+	   (((bit)>>5)==7  && (1UL<<((bit)&31) & REQUIRED_MASK7 )) ||	\
+	   (((bit)>>5)==8  && (1UL<<((bit)&31) & REQUIRED_MASK8 )) ||	\
+	   (((bit)>>5)==9  && (1UL<<((bit)&31) & REQUIRED_MASK9 )) ||	\
+	   (((bit)>>5)==10 && (1UL<<((bit)&31) & REQUIRED_MASK10)) ||	\
+	   (((bit)>>5)==11 && (1UL<<((bit)&31) & REQUIRED_MASK11)) ||	\
+	   (((bit)>>5)==12 && (1UL<<((bit)&31) & REQUIRED_MASK12)) ||	\
+	   (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK13)) ||	\
+	   (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK14)) ||	\
+	   (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK15)) ||	\
+	   (((bit)>>5)==14 && (1UL<<((bit)&31) & REQUIRED_MASK16)) )
+
+#define DISABLED_MASK_BIT_SET(bit)					\
+	 ( (((bit)>>5)==0  && (1UL<<((bit)&31) & DISABLED_MASK0 )) ||	\
+	   (((bit)>>5)==1  && (1UL<<((bit)&31) & DISABLED_MASK1 )) ||	\
+	   (((bit)>>5)==2  && (1UL<<((bit)&31) & DISABLED_MASK2 )) ||	\
+	   (((bit)>>5)==3  && (1UL<<((bit)&31) & DISABLED_MASK3 )) ||	\
+	   (((bit)>>5)==4  && (1UL<<((bit)&31) & DISABLED_MASK4 )) ||	\
+	   (((bit)>>5)==5  && (1UL<<((bit)&31) & DISABLED_MASK5 )) ||	\
+	   (((bit)>>5)==6  && (1UL<<((bit)&31) & DISABLED_MASK6 )) ||	\
+	   (((bit)>>5)==7  && (1UL<<((bit)&31) & DISABLED_MASK7 )) ||	\
+	   (((bit)>>5)==8  && (1UL<<((bit)&31) & DISABLED_MASK8 )) ||	\
+	   (((bit)>>5)==9  && (1UL<<((bit)&31) & DISABLED_MASK9 )) ||	\
+	   (((bit)>>5)==10 && (1UL<<((bit)&31) & DISABLED_MASK10)) ||	\
+	   (((bit)>>5)==11 && (1UL<<((bit)&31) & DISABLED_MASK11)) ||	\
+	   (((bit)>>5)==12 && (1UL<<((bit)&31) & DISABLED_MASK12)) ||	\
+	   (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK13)) ||	\
+	   (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK14)) ||	\
+	   (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK15)) ||	\
+	   (((bit)>>5)==14 && (1UL<<((bit)&31) & DISABLED_MASK16)) )
+
+#define cpu_has(c, bit)							\
+	(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 :	\
+	 test_cpu_cap(c, bit))
+
+#define boot_cpu_has(bit)	cpu_has(&boot_cpu_data, bit)
+
+#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS)
+/*
+ * Static testing of CPU features.  Used the same as boot_cpu_has().
+ * These will statically patch the target code for additional
+ * performance.
+ */
+static __always_inline __pure bool _static_cpu_has(u16 bit)
+{
+		asm_volatile_goto("1: jmp 6f\n"
+			 "2:\n"
+			 ".skip -(((5f-4f) - (2b-1b)) > 0) * "
+			         "((5f-4f) - (2b-1b)),0x90\n"
+			 "3:\n"
+			 ".section .altinstructions,\"a\"\n"
+			 " .long 1b - .\n"		/* src offset */
+			 " .long 4f - .\n"		/* repl offset */
+			 " .word %P1\n"			/* always replace */
+			 " .byte 3b - 1b\n"		/* src len */
+			 " .byte 5f - 4f\n"		/* repl len */
+			 " .byte 3b - 2b\n"		/* pad len */
+			 ".previous\n"
+			 ".section .altinstr_replacement,\"ax\"\n"
+			 "4: jmp %l[t_no]\n"
+			 "5:\n"
+			 ".previous\n"
+			 ".section .altinstructions,\"a\"\n"
+			 " .long 1b - .\n"		/* src offset */
+			 " .long 0\n"			/* no replacement */
+			 " .word %P0\n"			/* feature bit */
+			 " .byte 3b - 1b\n"		/* src len */
+			 " .byte 0\n"			/* repl len */
+			 " .byte 0\n"			/* pad len */
+			 ".previous\n"
+			 ".section .altinstr_aux,\"ax\"\n"
+			 "6:\n"
+			 " testb %[bitnum],%[cap_byte]\n"
+			 " jnz %l[t_yes]\n"
+			 " jmp %l[t_no]\n"
+			 ".previous\n"
+			 : : "i" (bit), "i" (X86_FEATURE_ALWAYS),
+			     [bitnum] "i" (1 << (bit & 7)),
+			     [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3])
+			 : : t_yes, t_no);
+	t_yes:
+		return true;
+	t_no:
+		return false;
+}
+
+#define static_cpu_has(bit)					\
+(								\
+	__builtin_constant_p(boot_cpu_has(bit)) ?		\
+		boot_cpu_has(bit) :				\
+		_static_cpu_has(bit)				\
+)
+#else
+/*
+ * Fall back to dynamic for gcc versions which don't support asm goto. Should be
+ * a minority now anyway.
+ */
+#define static_cpu_has(bit)		boot_cpu_has(bit)
+#endif
+
+#endif /* _ASM_X86_STATIC_CPU_HAS_H */
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 72a576752a7e..ec969cc3eb20 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -25,7 +25,7 @@ lib-y += memcpy_$(BITS).o
 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
 lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
 
-obj-y += msr.o msr-reg.o msr-reg-export.o
+obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
 
 ifeq ($(CONFIG_X86_32),y)
         obj-y += atomic64_32.o
diff --git a/arch/x86/lib/hweight.c b/arch/x86/lib/hweight.c
new file mode 100644
index 000000000000..5834fb9af6ff
--- /dev/null
+++ b/arch/x86/lib/hweight.c
@@ -0,0 +1,37 @@
+#include <linux/export.h>
+#include <linux/compiler.h>
+
+#ifdef CONFIG_X86_32
+# define PUSH_DX	"pushl %%edx\n\t"
+# define POP_DX		"popl %%edx\n\t"
+#else
+# define PUSH_DX	"pushq %%rdx\n\t"
+# define POP_DX		"popq %%rdx\n\t"
+#endif
+
+unsigned int __sw_hweight32(unsigned int w)
+{
+	asm volatile(PUSH_DX
+		     "movl %[w], %%edx\n\t"			/* w -> t */
+		     "shrl %%edx\n\t"				/* t >> 1 */
+		     "andl $0x55555555, %%edx\n\t"		/* t & 0x55555555 */
+		     "subl %%edx, %[w]\n"			/* w -= t */
+		     "\n\t"
+		     "movl %[w], %%edx\n\t"			/* w -> t */
+		     "shrl $2, %[w]\n\t"			/* w_tmp >> 2 */
+		     "andl $0x33333333, %%edx\n\t"		/* t & 0x33333333 */
+		     "andl $0x33333333, %[w]\n\t"		/* w_tmp & 0x33333333 */
+		     "addl %%edx, %[w]\n"			/* w = w_tmp + t */
+		     "\n\t"
+		     "movl %[w], %%edx\n\t"			/* w -> t */
+		     "shrl $4, %%edx\n\t"			/* t >> 4 */
+		     "addl %%edx, %[w]\n\t"			/* w_tmp += t */
+		     "andl  $0x0f0f0f0f, %[w]\n\t"		/* w_tmp &= 0x0f0f0f0f */
+		     "imull $0x01010101, %[w], %[w]\n\t"	/* w_tmp *= 0x01010101 */
+		     "shrl $24, %[w]\n\t"			/* w = w_tmp >> 24 */
+		     POP_DX
+		     : [w] "+r" (w));
+
+	return w;
+}
+EXPORT_SYMBOL(__sw_hweight32);
diff --git a/lib/Makefile b/lib/Makefile
index a65e9a861535..55ad20701dc0 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -15,9 +15,6 @@ KCOV_INSTRUMENT_rbtree.o := n
 KCOV_INSTRUMENT_list_debug.o := n
 KCOV_INSTRUMENT_debugobjects.o := n
 KCOV_INSTRUMENT_dynamic_debug.o := n
-# Kernel does not boot if we instrument this file as it uses custom calling
-# convention (see CONFIG_ARCH_HWEIGHT_CFLAGS).
-KCOV_INSTRUMENT_hweight.o := n
 
 lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 rbtree.o radix-tree.o dump_stack.o timerqueue.o\
@@ -72,8 +69,6 @@ obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o
 obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o
 obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o
 
-GCOV_PROFILE_hweight.o := n
-CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS))
 obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
 
 obj-$(CONFIG_BTREE) += btree.o
diff --git a/lib/hweight.c b/lib/hweight.c
index 9a5c1f221558..d53137a8def4 100644
--- a/lib/hweight.c
+++ b/lib/hweight.c
@@ -9,6 +9,7 @@
  * The Hamming Weight of a number is the total number of bits set in it.
  */
 
+#ifndef __HAVE_ARCH_SW_HWEIGHT
 unsigned int __sw_hweight32(unsigned int w)
 {
 #ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER
@@ -25,6 +26,7 @@ unsigned int __sw_hweight32(unsigned int w)
 #endif
 }
 EXPORT_SYMBOL(__sw_hweight32);
+#endif
 
 unsigned int __sw_hweight16(unsigned int w)
 {

-- 
Regards/Gruss,
    Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
-- 

^ permalink raw reply related	[flat|nested] 84+ messages in thread

* Re: [PATCH -v2] x86/hweight: Get rid of the special calling convention
  2016-05-12 11:57                                           ` Borislav Petkov
@ 2016-05-12 12:14                                             ` Peter Zijlstra
  2016-05-12 13:09                                               ` Borislav Petkov
  0 siblings, 1 reply; 84+ messages in thread
From: Peter Zijlstra @ 2016-05-12 12:14 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: H. Peter Anvin, Brian Gerst, x86-ml, Denys Vlasenko, LKML,
	Dmitry Vyukov, Andi Kleen, zengzhaoxiu, Thomas Gleixner,
	Ingo Molnar, Andrew Morton, Kees Cook, Zhaoxiu Zeng,
	Andy Lutomirski

On Thu, May 12, 2016 at 01:57:38PM +0200, Borislav Petkov wrote:
> #ifdef CONFIG_X86_32
> # define PUSH_DX        "pushl %%edx\n\t"
> # define POP_DX         "popl %%edx\n\t"
> #else
> # define PUSH_DX        "pushq %%rdx\n\t"
> # define POP_DX         "popq %%rdx\n\t"
> #endif
> 
> unsigned int __sw_hweight32(unsigned int w)
> {
>         asm volatile(PUSH_DX
>                      "movl %[w], %%edx\n\t"                     /* w -> t */
>                      "shrl %%edx\n\t"                           /* t >> 1 */
>                      "andl $0x55555555, %%edx\n\t"              /* t & 0x55555555 */
>                      "subl %%edx, %[w]\n"                       /* w -= t */
>                      "\n\t"
>                      "movl %[w], %%edx\n\t"                     /* w -> t */
>                      "shrl $2, %[w]\n\t"                        /* w_tmp >> 2 */
>                      "andl $0x33333333, %%edx\n\t"              /* t & 0x33333333 */
>                      "andl $0x33333333, %[w]\n\t"               /* w_tmp & 0x33333333 */
>                      "addl %%edx, %[w]\n"                       /* w = w_tmp + t */
>                      "\n\t"
>                      "movl %[w], %%edx\n\t"                     /* w -> t */
>                      "shrl $4, %%edx\n\t"                       /* t >> 4 */
>                      "addl %%edx, %[w]\n\t"                     /* w_tmp += t */
>                      "andl  $0x0f0f0f0f, %[w]\n\t"              /* w_tmp &= 0x0f0f0f0f */
>                      "imull $0x01010101, %[w], %[w]\n\t"        /* w_tmp *= 0x01010101 */
>                      "shrl $24, %[w]\n\t"                       /* w = w_tmp >> 24 */
>                      POP_DX
>                      : [w] "+r" (w));
> 
>         return w;
> }

But this is a C function, with C calling convention. You're now assuming
GCC doesn't clobber anything with its prologue/epilogue.

I think hpa meant to put it in an .S file and avoid all that.
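
To make the objection concrete: a normal C function only promises the
standard ABI, so an inline-asm call site that wants to keep other
registers live must either declare every caller-saved register as
clobbered, or call code whose register usage is fully explicit. A sketch
of the first option (illustration only, 64-bit assumed, not part of any
posted patch):

    static __always_inline unsigned int hweight32_c_callee(unsigned int w)
    {
    	unsigned int res;

    	/*
    	 * If __sw_hweight32 stays ordinary C, the call site has to
    	 * assume the full C calling convention: every caller-saved
    	 * register may be gone after the call, so list them all.
    	 */
    	asm (ALTERNATIVE("call __sw_hweight32", POPCNT32, X86_FEATURE_POPCNT)
    		: "="REG_OUT (res)
    		: REG_IN (w)
    		: "rcx", "rdx", "rsi", "r8", "r9", "r10", "r11");

    	return res;
    }

The price is that the clobber list applies even when the alternative
patches in the single POPCNT instruction, so the fast path spills
registers it never needed to; that is why doing the helper in pure
assembly is the more attractive route.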

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH -v2] x86/hweight: Get rid of the special calling convention
  2016-05-12 12:14                                             ` Peter Zijlstra
@ 2016-05-12 13:09                                               ` Borislav Petkov
  2016-05-18 10:38                                                 ` Borislav Petkov
  0 siblings, 1 reply; 84+ messages in thread
From: Borislav Petkov @ 2016-05-12 13:09 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: H. Peter Anvin, Brian Gerst, x86-ml, Denys Vlasenko, LKML,
	Dmitry Vyukov, Andi Kleen, zengzhaoxiu, Thomas Gleixner,
	Ingo Molnar, Andrew Morton, Kees Cook, Zhaoxiu Zeng,
	Andy Lutomirski

On Thu, May 12, 2016 at 02:14:52PM +0200, Peter Zijlstra wrote:
> But this is a C function, with C calling convention. You're now assuming
> GCC doesn't clobber anything with its prologue/epilogue.
> 
> I think hpa meant to put it in an .S file and avoid all that.

I wanted to have gcc use %[w] and thus not hardcode the register, but the
ABI kinda hardcodes it to rAX anyway. And you're right about the tracing
funkiness adding glue, so we're probably better off doing the .S thing
directly and making it more robust that way.
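
For illustration, here is a user-space sketch of what the pure-assembly
route buys (x86-64 only; the file and symbol names are made up). The
helper saves and restores the one scratch register it uses itself, so a
call site that declares nothing but its input and output registers stays
correct:

    /*
     * hweight_sketch.c - build with: gcc -O2 -mno-red-zone hweight_sketch.c
     * (-mno-red-zone mirrors the kernel build and keeps the "call" issued
     *  from inline asm below from stepping on red-zone locals.)
     */
    #include <stdio.h>

    /*
     * Pure-assembly helper: input in %edi, result in %eax, and the one
     * scratch register (%rdx) is pushed/popped by hand, so nothing else
     * is touched.
     */
    asm (".text\n"
         ".globl my_sw_hweight32\n"
         "my_sw_hweight32:\n"
         "	pushq	%rdx\n"
         "	movl	%edi, %eax\n"
         "	movl	%eax, %edx\n"		/* w -> t */
         "	shrl	%edx\n"			/* t >>= 1 */
         "	andl	$0x55555555, %edx\n"
         "	subl	%edx, %eax\n"		/* w -= t */
         "	movl	%eax, %edx\n"
         "	shrl	$2, %eax\n"
         "	andl	$0x33333333, %edx\n"
         "	andl	$0x33333333, %eax\n"
         "	addl	%edx, %eax\n"
         "	movl	%eax, %edx\n"
         "	shrl	$4, %edx\n"
         "	addl	%edx, %eax\n"
         "	andl	$0x0f0f0f0f, %eax\n"
         "	imull	$0x01010101, %eax, %eax\n"
         "	shrl	$24, %eax\n"
         "	popq	%rdx\n"
         "	ret\n");

    unsigned int my_sw_hweight32(unsigned int w);

    static unsigned int arch_hweight32_sketch(unsigned int w)
    {
    	unsigned int res;

    	/*
    	 * Only %rdi (in) and %rax (out) are declared, like the kernel's
    	 * ALTERNATIVE() call site; the asm helper guarantees the rest.
    	 */
    	asm ("call my_sw_hweight32" : "=a" (res) : "D" (w));

    	return res;
    }

    int main(void)
    {
    	printf("%u\n", arch_hweight32_sketch(0xdeadbeef));	/* prints 24 */
    	return 0;
    }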

-- 
Regards/Gruss,
    Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
-- 

^ permalink raw reply	[flat|nested] 84+ messages in thread

* Re: [PATCH -v2] x86/hweight: Get rid of the special calling convention
  2016-05-12 13:09                                               ` Borislav Petkov
@ 2016-05-18 10:38                                                 ` Borislav Petkov
  0 siblings, 0 replies; 84+ messages in thread
From: Borislav Petkov @ 2016-05-18 10:38 UTC (permalink / raw)
  To: H. Peter Anvin, Peter Zijlstra
  Cc: Brian Gerst, x86-ml, Denys Vlasenko, LKML, Dmitry Vyukov,
	Andi Kleen, zengzhaoxiu, Thomas Gleixner, Ingo Molnar,
	Andrew Morton, Kees Cook, Zhaoxiu Zeng, Andy Lutomirski

On Thu, May 12, 2016 at 03:09:32PM +0200, Borislav Petkov wrote:
> I wanted to have gcc use %[w] and this way not hardcode the reg but the
> ABI kinda hardcodes it to rAX. And you're right about tracing funkyness
> adding glue so we're probably better off doing the .S thing directly and
> making it more robust this way.

Ok, here's a new version. Let me know what you think before I hammer
on it more seriously. Booting in kvm with "-popcnt" and without seems
to work: no splats, corruptions or whatnot.

I've also been running POPCNT vs the asm versions in userspace and
comparing results; that looks good too and seems to work correctly. :)
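
A harness along these lines is enough for that user-space comparison (a
sketch: the file name is made up and __builtin_popcount*() stands in for
the raw POPCNT encoding):

    /*
     * hweight_check.c - compare the C bit-trick against the CPU popcount.
     * Build: gcc -O2 -mpopcnt hweight_check.c  (assumes a POPCNT-capable CPU)
     */
    #include <stdio.h>
    #include <stdlib.h>
    #include <stdint.h>

    /* Essentially the fast-multiplier variant of the kernel's C fallback. */
    static unsigned int ref_hweight32(unsigned int w)
    {
    	w -= (w >> 1) & 0x55555555;
    	w  = (w & 0x33333333) + ((w >> 2) & 0x33333333);
    	w  = (w + (w >> 4)) & 0x0f0f0f0f;
    	return (w * 0x01010101) >> 24;
    }

    static unsigned long ref_hweight64(uint64_t w)
    {
    	return ref_hweight32((uint32_t)w) + ref_hweight32((uint32_t)(w >> 32));
    }

    int main(void)
    {
    	for (int i = 0; i < 10000000; i++) {
    		uint64_t w = ((uint64_t)rand() << 32) ^ rand();

    		if (ref_hweight32((uint32_t)w) != (unsigned int)__builtin_popcount((uint32_t)w) ||
    		    ref_hweight64(w) != (unsigned long)__builtin_popcountll(w)) {
    			printf("mismatch at %#llx\n", (unsigned long long)w);
    			return 1;
    		}
    	}
    	printf("OK\n");
    	return 0;
    }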

Thanks.

---
From: Borislav Petkov <bp@suse.de>
Date: Wed, 4 May 2016 18:52:09 +0200
Subject: [PATCH] x86/hweight: Get rid of the special calling convention

People complained about ARCH_HWEIGHT_CFLAGS and how it throws a
wrench into kcov, LTO, etc. experimentation. Add asm versions of
__sw_hweight{32,64}() which explicitly save and restore the registers
they clobber. This gets rid of the special calling convention.

We still need to hardcode the POPCNT opcode bytes and register operands
because some old gas versions we support do not know about POPCNT.

Btw, remove redundant REX prefix from 32-bit POPCNT because alternatives
can do padding now.

Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/Kconfig                    |  5 ---
 arch/x86/include/asm/arch_hweight.h | 24 +++++-------
 arch/x86/kernel/i386_ksyms_32.c     |  2 +
 arch/x86/kernel/x8664_ksyms_64.c    |  3 ++
 arch/x86/lib/Makefile               |  2 +-
 arch/x86/lib/hweight.S              | 77 +++++++++++++++++++++++++++++++++++++
 lib/Makefile                        |  5 ---
 lib/hweight.c                       |  4 ++
 8 files changed, 97 insertions(+), 25 deletions(-)
 create mode 100644 arch/x86/lib/hweight.S

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2dc18605831f..c3a8f360683b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -296,11 +296,6 @@ config X86_32_LAZY_GS
 	def_bool y
 	depends on X86_32 && !CC_STACKPROTECTOR
 
-config ARCH_HWEIGHT_CFLAGS
-	string
-	default "-fcall-saved-ecx -fcall-saved-edx" if X86_32
-	default "-fcall-saved-rdi -fcall-saved-rsi -fcall-saved-rdx -fcall-saved-rcx -fcall-saved-r8 -fcall-saved-r9 -fcall-saved-r10 -fcall-saved-r11" if X86_64
-
 config ARCH_SUPPORTS_UPROBES
 	def_bool y
 
diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h
index 02e799fa43d1..e7cd63175de4 100644
--- a/arch/x86/include/asm/arch_hweight.h
+++ b/arch/x86/include/asm/arch_hweight.h
@@ -4,8 +4,8 @@
 #include <asm/cpufeatures.h>
 
 #ifdef CONFIG_64BIT
-/* popcnt %edi, %eax -- redundant REX prefix for alignment */
-#define POPCNT32 ".byte 0xf3,0x40,0x0f,0xb8,0xc7"
+/* popcnt %edi, %eax */
+#define POPCNT32 ".byte 0xf3,0x0f,0xb8,0xc7"
 /* popcnt %rdi, %rax */
 #define POPCNT64 ".byte 0xf3,0x48,0x0f,0xb8,0xc7"
 #define REG_IN "D"
@@ -17,19 +17,15 @@
 #define REG_OUT "a"
 #endif
 
-/*
- * __sw_hweightXX are called from within the alternatives below
- * and callee-clobbered registers need to be taken care of. See
- * ARCH_HWEIGHT_CFLAGS in <arch/x86/Kconfig> for the respective
- * compiler switches.
- */
+#define __HAVE_ARCH_SW_HWEIGHT
+
 static __always_inline unsigned int __arch_hweight32(unsigned int w)
 {
-	unsigned int res = 0;
+	unsigned int res;
 
 	asm (ALTERNATIVE("call __sw_hweight32", POPCNT32, X86_FEATURE_POPCNT)
-		     : "="REG_OUT (res)
-		     : REG_IN (w));
+			 : "="REG_OUT (res)
+			 : REG_IN (w));
 
 	return res;
 }
@@ -53,11 +49,11 @@ static inline unsigned long __arch_hweight64(__u64 w)
 #else
 static __always_inline unsigned long __arch_hweight64(__u64 w)
 {
-	unsigned long res = 0;
+	unsigned long res;
 
 	asm (ALTERNATIVE("call __sw_hweight64", POPCNT64, X86_FEATURE_POPCNT)
-		     : "="REG_OUT (res)
-		     : REG_IN (w));
+			 : "="REG_OUT (res)
+			 : REG_IN (w));
 
 	return res;
 }
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index 64341aa485ae..d40ee8a38fed 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -42,3 +42,5 @@ EXPORT_SYMBOL(empty_zero_page);
 EXPORT_SYMBOL(___preempt_schedule);
 EXPORT_SYMBOL(___preempt_schedule_notrace);
 #endif
+
+EXPORT_SYMBOL(__sw_hweight32);
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index cd05942bc918..f1aebfb49c36 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -44,6 +44,9 @@ EXPORT_SYMBOL(clear_page);
 
 EXPORT_SYMBOL(csum_partial);
 
+EXPORT_SYMBOL(__sw_hweight32);
+EXPORT_SYMBOL(__sw_hweight64);
+
 /*
  * Export string functions. We normally rely on gcc builtin for most of these,
  * but gcc sometimes decides not to inline them.
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 72a576752a7e..ec969cc3eb20 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -25,7 +25,7 @@ lib-y += memcpy_$(BITS).o
 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
 lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
 
-obj-y += msr.o msr-reg.o msr-reg-export.o
+obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
 
 ifeq ($(CONFIG_X86_32),y)
         obj-y += atomic64_32.o
diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S
new file mode 100644
index 000000000000..8cacaf6aa74d
--- /dev/null
+++ b/arch/x86/lib/hweight.S
@@ -0,0 +1,77 @@
+#include <linux/linkage.h>
+
+#include <asm/asm.h>
+
+/*
+ * unsigned int __sw_hweight32(unsigned int w)
+ * %rdi: w
+ */
+ENTRY(__sw_hweight32)
+
+#ifdef CONFIG_X86_64
+	movl %edi, %eax				# w
+#endif
+	__ASM_SIZE(push,) %__ASM_REG(dx)
+	movl %eax, %edx				# w -> t
+	shrl %edx				# t >>= 1
+	andl $0x55555555, %edx			# t &= 0x55555555
+	subl %edx, %eax				# w -= t
+
+	movl %eax, %edx				# w -> t
+	shrl $2, %eax				# w_tmp >>= 2
+	andl $0x33333333, %edx			# t	&= 0x33333333
+	andl $0x33333333, %eax			# w_tmp &= 0x33333333
+	addl %edx, %eax				# w = w_tmp + t
+
+	movl %eax, %edx				# w -> t
+	shrl $4, %edx				# t >>= 4
+	addl %edx, %eax				# w_tmp += t
+	andl  $0x0f0f0f0f, %eax			# w_tmp &= 0x0f0f0f0f
+	imull $0x01010101, %eax, %eax		# w_tmp *= 0x01010101
+	shrl $24, %eax				# w = w_tmp >> 24
+	__ASM_SIZE(pop,) %__ASM_REG(dx)
+	ret
+ENDPROC(__sw_hweight32)
+
+ENTRY(__sw_hweight64)
+#ifdef CONFIG_X86_64
+	pushq   %rdx
+
+	movq    %rdi, %rdx                      # w -> t
+	movabsq $0x5555555555555555, %rax
+	shrq    %rdx                            # t >>= 1
+	andq    %rdx, %rax                      # t &= 0x5555555555555555
+	movabsq $0x3333333333333333, %rdx
+	subq    %rax, %rdi                      # w -= t
+
+	movq    %rdi, %rax                      # w -> t
+	shrq    $2, %rdi                        # w_tmp >>= 2
+	andq    %rdx, %rax                      # t     &= 0x3333333333333333
+	andq    %rdi, %rdx                      # w_tmp &= 0x3333333333333333
+	addq    %rdx, %rax                      # w = w_tmp + t
+
+	movq    %rax, %rdx                      # w -> t
+	shrq    $4, %rdx                        # t >>= 4
+	addq    %rdx, %rax                      # w_tmp += t
+	movabsq $0x0f0f0f0f0f0f0f0f, %rdx
+	andq    %rdx, %rax                      # w_tmp &= 0x0f0f0f0f0f0f0f0f
+	movabsq $0x0101010101010101, %rdx
+	imulq   %rdx, %rax                      # w_tmp *= 0x0101010101010101
+	shrq    $56, %rax                       # w = w_tmp >> 56
+
+	popq    %rdx
+	ret
+#else /* CONFIG_X86_32 */
+        /* We're getting an u64 arg in (%eax,%edx): unsigned long hweight64(__u64 w) */
+        pushl   %ecx
+
+        call    __sw_hweight32
+        movl    %eax, %ecx                      # stash away result
+        movl    %edx, %eax                      # second part of input
+        call    __sw_hweight32
+        addl    %ecx, %eax                      # result
+
+        popl    %ecx
+        ret
+#endif
+ENDPROC(__sw_hweight64)
diff --git a/lib/Makefile b/lib/Makefile
index 7bd6fd436c97..08ea9f1c0c49 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -15,9 +15,6 @@ KCOV_INSTRUMENT_rbtree.o := n
 KCOV_INSTRUMENT_list_debug.o := n
 KCOV_INSTRUMENT_debugobjects.o := n
 KCOV_INSTRUMENT_dynamic_debug.o := n
-# Kernel does not boot if we instrument this file as it uses custom calling
-# convention (see CONFIG_ARCH_HWEIGHT_CFLAGS).
-KCOV_INSTRUMENT_hweight.o := n
 
 lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 rbtree.o radix-tree.o dump_stack.o timerqueue.o\
@@ -72,8 +69,6 @@ obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o
 obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o
 obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o
 
-GCOV_PROFILE_hweight.o := n
-CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS))
 obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
 
 obj-$(CONFIG_BTREE) += btree.o
diff --git a/lib/hweight.c b/lib/hweight.c
index 9a5c1f221558..43273a7d83cf 100644
--- a/lib/hweight.c
+++ b/lib/hweight.c
@@ -9,6 +9,7 @@
  * The Hamming Weight of a number is the total number of bits set in it.
  */
 
+#ifndef __HAVE_ARCH_SW_HWEIGHT
 unsigned int __sw_hweight32(unsigned int w)
 {
 #ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER
@@ -25,6 +26,7 @@ unsigned int __sw_hweight32(unsigned int w)
 #endif
 }
 EXPORT_SYMBOL(__sw_hweight32);
+#endif
 
 unsigned int __sw_hweight16(unsigned int w)
 {
@@ -43,6 +45,7 @@ unsigned int __sw_hweight8(unsigned int w)
 }
 EXPORT_SYMBOL(__sw_hweight8);
 
+#ifndef __HAVE_ARCH_SW_HWEIGHT
 unsigned long __sw_hweight64(__u64 w)
 {
 #if BITS_PER_LONG == 32
@@ -65,3 +68,4 @@ unsigned long __sw_hweight64(__u64 w)
 #endif
 }
 EXPORT_SYMBOL(__sw_hweight64);
+#endif
-- 
2.7.3

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
-- 

^ permalink raw reply related	[flat|nested] 84+ messages in thread

Thread overview: 84+ messages
2016-04-05  2:06 [PATCH V2 01/30] bitops: add parity functions Zeng Zhaoxiu
2016-04-05  4:23 ` [PATCH V2 02/30] Include generic parity.h in some architectures' bitops.h Zeng Zhaoxiu
2016-04-06  8:41   ` [PATCH v2 " zengzhaoxiu
2016-04-05 19:04 ` [PATCH V2 01/30] bitops: add parity functions Sam Ravnborg
2016-04-06  5:33   ` Zeng Zhaoxiu
2016-04-06  8:24     ` Sam Ravnborg
2016-04-06  8:22   ` [PATCH v2 " zengzhaoxiu
2016-04-06  8:46 ` [PATCH v2 03/30] Add alpha-specific " zengzhaoxiu
2016-04-06  8:53 ` [PATCH v2 04/30] Add blackfin-specific " zengzhaoxiu
2016-04-06  8:57 ` [PATCH v2 05/30] Add ia64-specific " zengzhaoxiu
2016-04-06  8:59 ` [PATCH v2 06/30] Add mips-specific " zengzhaoxiu
2016-04-06 10:23   ` zengzhaoxiu
2016-04-06  9:03 ` [PATCH v2 07/30] Add powerpc-specific " zengzhaoxiu
2016-04-06  9:07 ` [PATCH v2 08/30] Add sparc-specific " zengzhaoxiu
2016-04-06 18:44   ` Sam Ravnborg
2016-04-07  3:56     ` Zeng Zhaoxiu
2016-04-06  9:08 ` [PATCH v2 09/30] Add tile-specific " zengzhaoxiu
2016-04-06 13:27   ` Chris Metcalf
2016-04-07  3:55     ` Zeng Zhaoxiu
2016-04-06  9:14 ` [PATCH v2 10/30] Add x86-specific " zengzhaoxiu
2016-04-06 10:13   ` Borislav Petkov
2016-04-06 10:37     ` One Thousand Gnomes
2016-04-06 10:53       ` Borislav Petkov
2016-04-07  3:55         ` Zeng Zhaoxiu
2016-04-07  9:39           ` Borislav Petkov
2016-04-11  2:43       ` Zeng Zhaoxiu
2016-04-15  2:11         ` Borislav Petkov
2016-04-07  3:55     ` Zeng Zhaoxiu
2016-04-07  9:41       ` Borislav Petkov
2016-04-06 19:45   ` Andi Kleen
2016-04-07  3:56     ` Zeng Zhaoxiu
2016-04-07  6:31     ` Dmitry Vyukov
2016-04-07  9:43       ` Borislav Petkov
2016-05-04 18:46         ` [RFC PATCH] x86/hweight: Get rid of the special calling convention Borislav Petkov
2016-05-04 19:31           ` Brian Gerst
2016-05-04 19:33             ` H. Peter Anvin
2016-05-04 19:41               ` Borislav Petkov
2016-05-04 19:49                 ` H. Peter Anvin
2016-05-04 20:22                   ` Borislav Petkov
2016-05-04 20:51                     ` H. Peter Anvin
2016-05-04 21:09                     ` Andi Kleen
2016-05-05 13:02                     ` Denys Vlasenko
2016-05-05 14:04                       ` Borislav Petkov
2016-05-10 16:53                         ` [PATCH -v2] " Borislav Petkov
2016-05-10 17:23                           ` Peter Zijlstra
2016-05-10 19:02                             ` Borislav Petkov
2016-05-10 19:03                             ` H. Peter Anvin
2016-05-10 19:10                               ` Borislav Petkov
2016-05-10 22:30                                 ` H. Peter Anvin
2016-05-11  4:11                                   ` Borislav Petkov
2016-05-11 11:15                                     ` Brian Gerst
2016-05-11 11:24                                       ` Peter Zijlstra
2016-05-11 12:47                                         ` Borislav Petkov
2016-05-12  4:54                                         ` H. Peter Anvin
2016-05-12 11:57                                           ` Borislav Petkov
2016-05-12 12:14                                             ` Peter Zijlstra
2016-05-12 13:09                                               ` Borislav Petkov
2016-05-18 10:38                                                 ` Borislav Petkov
2016-04-07 14:10     ` [PATCH v2 10/30] Add x86-specific parity functions One Thousand Gnomes
2016-04-06  9:27 ` [PATCH v2 11/30] sunrpc: use parity8 zengzhaoxiu
2016-04-06  9:30 ` [PATCH v2 12/30] mips: use parity functions in cerr-sb1.c zengzhaoxiu
2016-04-06  9:36 ` [PATCH v2 13/30] bch: use parity32 zengzhaoxiu
2016-04-06  9:39 ` [PATCH v2 14/30] media: use parity8 in vivid-vbi-gen.c zengzhaoxiu
2016-04-06  9:41 ` [PATCH v2 15/30] media: use parity functions in saa7115 zengzhaoxiu
2016-04-06  9:43 ` [PATCH v2 16/30] input: use parity32 in grip_mp zengzhaoxiu
2016-04-06  9:44 ` [PATCH v2 17/30] input: use parity64 in sidewinder zengzhaoxiu
2016-04-06  9:45 ` [PATCH v2 18/30] input: use parity16 in ams_delta_serio zengzhaoxiu
2016-04-06  9:47 ` [PATCH v2 19/30] scsi: use parity32 in isci's phy zengzhaoxiu
2016-04-06  9:52 ` [PATCH v2 20/30] mtd: use parity16 in ssfdc zengzhaoxiu
2016-04-06  9:53 ` [PATCH v2 21/30] mtd: use parity functions in inftlcore zengzhaoxiu
2016-04-06  9:58 ` [PATCH v2 22/30] crypto: use parity functions in qat_hal zengzhaoxiu
2016-04-06 10:05 ` [PATCH v2 23/30] mtd: use parity16 in sm_ftl zengzhaoxiu
2016-04-06 10:11 ` [PATCH v2 24/30] ethernet: use parity8 in sun/niu.c zengzhaoxiu
2016-04-06 10:14 ` [PATCH v2 25/30] input: use parity8 in pcips2 zengzhaoxiu
2016-04-06 10:15 ` [PATCH v2 26/30] input: use parity8 in sa1111ps2 zengzhaoxiu
2016-04-06 10:16 ` [PATCH v2 27/30] iio: use parity32 in adxrs450 zengzhaoxiu
2016-04-10 14:37   ` Jonathan Cameron
2016-04-10 14:41     ` Lars-Peter Clausen
2016-04-10 15:13       ` Jonathan Cameron
2016-04-10 15:14         ` Jonathan Cameron
2016-04-06 10:18 ` [PATCH v2 28/30] serial: use parity32 in max3100 zengzhaoxiu
2016-04-06 10:25   ` Greg KH
2016-04-06 10:20 ` [PATCH v2 29/30] input: use parity8 in elantech zengzhaoxiu
2016-04-06 10:21 ` [PATCH v2 30/30] ethernet: use parity8 in broadcom/tg3.c zengzhaoxiu
