linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [patch V3] lib: GCD: add binary GCD algorithm
@ 2016-04-28 11:43 zengzhaoxiu
  2016-04-28 12:18 ` kbuild test robot
                   ` (4 more replies)
  0 siblings, 5 replies; 11+ messages in thread
From: zengzhaoxiu @ 2016-04-28 11:43 UTC (permalink / raw)
  To: akpm, linux, peterz
  Cc: Zhaoxiu Zeng, Richard Henderson, Ivan Kokshaysky, Matt Turner,
	Russell King, Yoshinori Sato, Geert Uytterhoeven, James Hogan,
	Michal Simek, Ralf Baechle, Ley Foon Tan, Jonas Bonn,
	James E.J. Bottomley, Helge Deller, Chen Liqin, Lennox Wu,
	Rich Felker, David S. Miller, linux-kernel, linux-alpha,
	linux-arm-kernel, uclinux-h8-devel, linux-m68k, linux-metag,
	linux-mips, nios2-dev, linux, linux-parisc, linux-sh, sparclinux

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

Because some architectures (alpha, armv6, etc.) don't provide hardware division,
the mod operation is slow! Binary GCD algorithm uses simple arithmetic operations,
it replaces division with arithmetic shifts, comparisons, and subtractions.

I have compiled successfully with x86_64_defconfig and i386_defconfig.

Changes to V2:
- Add a new Kconfig variable CPU_NO_EFFICIENT_FFS
- Separate into two versions by CPU_NO_EFFICIENT_FFS
- Return directly from the loop, rather than using break().
- Use "r &= -r" mostly because it's clearer.
- Improve a little bit in even/odd version

Changes to V1:
- Don't touch Kconfig, remove the Euclidean algorithm implementation
- Don't use the "even-odd" variant
- Use __ffs if the CPU has efficient __ffs

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
Signed-off-by: George Spelvin <linux@horizon.com>
---
 arch/Kconfig                         |  3 ++
 arch/alpha/Kconfig                   |  1 +
 arch/arm/mm/Kconfig                  |  3 ++
 arch/h8300/Kconfig                   |  1 +
 arch/m32r/Kconfig                    |  1 +
 arch/m68k/Kconfig.cpu                | 11 ++++++
 arch/metag/Kconfig                   |  1 +
 arch/microblaze/Kconfig              |  1 +
 arch/mips/include/asm/cpu-features.h |  3 ++
 arch/nios2/Kconfig                   |  1 +
 arch/openrisc/Kconfig                |  1 +
 arch/parisc/Kconfig                  |  1 +
 arch/score/Kconfig                   |  1 +
 arch/sh/Kconfig                      |  1 +
 arch/sparc/Kconfig                   |  1 +
 lib/gcd.c                            | 66 +++++++++++++++++++++++++++++++-----
 16 files changed, 88 insertions(+), 9 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 81869a5..275f17d 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -638,4 +638,7 @@ config COMPAT_OLD_SIGACTION
 config ARCH_NO_COHERENT_DMA_MMAP
 	bool
 
+config CPU_NO_EFFICIENT_FFS
+	def_bool n
+
 source "kernel/gcov/Kconfig"
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 9d8a858..44e6f05 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -27,6 +27,7 @@ config ALPHA
 	select MODULES_USE_ELF_RELA
 	select ODD_RT_SIGACTION
 	select OLD_SIGSUSPEND
+	select CPU_NO_EFFICIENT_FFS if !ALPHA_EV67
 	help
 	  The Alpha is a 64-bit general-purpose processor designed and
 	  marketed by the Digital Equipment Corporation of blessed memory,
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index 5534766..cb569b6 100644
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -421,18 +421,21 @@ config CPU_32v3
 	select CPU_USE_DOMAINS if MMU
 	select NEED_KUSER_HELPERS
 	select TLS_REG_EMUL if SMP || !MMU
+	select CPU_NO_EFFICIENT_FFS
 
 config CPU_32v4
 	bool
 	select CPU_USE_DOMAINS if MMU
 	select NEED_KUSER_HELPERS
 	select TLS_REG_EMUL if SMP || !MMU
+	select CPU_NO_EFFICIENT_FFS
 
 config CPU_32v4T
 	bool
 	select CPU_USE_DOMAINS if MMU
 	select NEED_KUSER_HELPERS
 	select TLS_REG_EMUL if SMP || !MMU
+	select CPU_NO_EFFICIENT_FFS
 
 config CPU_32v5
 	bool
diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig
index 986ea84..aa232de 100644
--- a/arch/h8300/Kconfig
+++ b/arch/h8300/Kconfig
@@ -20,6 +20,7 @@ config H8300
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_LZO
 	select HAVE_ARCH_KGDB
+	select CPU_NO_EFFICIENT_FFS
 
 config RWSEM_GENERIC_SPINLOCK
 	def_bool y
diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig
index c82b292..3cc8498 100644
--- a/arch/m32r/Kconfig
+++ b/arch/m32r/Kconfig
@@ -17,6 +17,7 @@ config M32R
 	select ARCH_USES_GETTIMEOFFSET
 	select MODULES_USE_ELF_RELA
 	select HAVE_DEBUG_STACKOVERFLOW
+	select CPU_NO_EFFICIENT_FFS
 
 config SBUS
 	bool
diff --git a/arch/m68k/Kconfig.cpu b/arch/m68k/Kconfig.cpu
index 0dfcf12..0b6efe8 100644
--- a/arch/m68k/Kconfig.cpu
+++ b/arch/m68k/Kconfig.cpu
@@ -40,6 +40,7 @@ config M68000
 	select CPU_HAS_NO_MULDIV64
 	select CPU_HAS_NO_UNALIGNED
 	select GENERIC_CSUM
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  The Freescale (was Motorola) 68000 CPU is the first generation of
 	  the well known M68K family of processors. The CPU core as well as
@@ -51,6 +52,7 @@ config MCPU32
 	bool
 	select CPU_HAS_NO_BITFIELDS
 	select CPU_HAS_NO_UNALIGNED
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  The Freescale (was then Motorola) CPU32 is a CPU core that is
 	  based on the 68020 processor. For the most part it is used in
@@ -130,6 +132,7 @@ config M5206
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5206 processor support.
 
@@ -138,6 +141,7 @@ config M5206e
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5206e processor support.
 
@@ -163,6 +167,7 @@ config M5249
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5249 processor support.
 
@@ -171,6 +176,7 @@ config M525x
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Freescale (Motorola) Coldfire 5251/5253 processor support.
 
@@ -189,6 +195,7 @@ config M5272
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5272 processor support.
 
@@ -217,6 +224,7 @@ config M5307
 	select COLDFIRE_SW_A7
 	select HAVE_CACHE_CB
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5307 processor support.
 
@@ -242,6 +250,7 @@ config M5407
 	select COLDFIRE_SW_A7
 	select HAVE_CACHE_CB
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5407 processor support.
 
@@ -251,6 +260,7 @@ config M547x
 	select MMU_COLDFIRE if MMU
 	select HAVE_CACHE_CB
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Freescale ColdFire 5470/5471/5472/5473/5474/5475 processor support.
 
@@ -260,6 +270,7 @@ config M548x
 	select M54xx
 	select HAVE_CACHE_CB
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Freescale ColdFire 5480/5481/5482/5483/5484/5485 processor support.
 
diff --git a/arch/metag/Kconfig b/arch/metag/Kconfig
index a0fa88d..2ac2de6 100644
--- a/arch/metag/Kconfig
+++ b/arch/metag/Kconfig
@@ -29,6 +29,7 @@ config METAG
 	select OF
 	select OF_EARLY_FLATTREE
 	select SPARSE_IRQ
+	select CPU_NO_EFFICIENT_FFS
 
 config STACKTRACE_SUPPORT
 	def_bool y
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index 3d793b5..f17c3a4 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -32,6 +32,7 @@ config MICROBLAZE
 	select OF_EARLY_FLATTREE
 	select TRACING_SUPPORT
 	select VIRT_TO_BUS
+	select CPU_NO_EFFICIENT_FFS
 
 config SWAP
 	def_bool n
diff --git a/arch/mips/include/asm/cpu-features.h b/arch/mips/include/asm/cpu-features.h
index eeec8c8..fd4ae7d 100644
--- a/arch/mips/include/asm/cpu-features.h
+++ b/arch/mips/include/asm/cpu-features.h
@@ -288,6 +288,9 @@
 #ifndef cpu_has_clo_clz
 #define cpu_has_clo_clz	cpu_has_mips_r
 #endif
+#if !cpu_has_clo_clz
+#define CONFIG_CPU_NO_EFFICIENT_FFS 1
+#endif
 
 /*
  * MIPS32 R2, MIPS64 R2, Loongson 3A and Octeon have WSBH.
diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig
index 4375554..f10bd2c 100644
--- a/arch/nios2/Kconfig
+++ b/arch/nios2/Kconfig
@@ -16,6 +16,7 @@ config NIOS2
 	select SOC_BUS
 	select SPARSE_IRQ
 	select USB_ARCH_HAS_HCD if USB_SUPPORT
+	select CPU_NO_EFFICIENT_FFS
 
 config GENERIC_CSUM
 	def_bool y
diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig
index e118c02..142cb05 100644
--- a/arch/openrisc/Kconfig
+++ b/arch/openrisc/Kconfig
@@ -25,6 +25,7 @@ config OPENRISC
 	select MODULES_USE_ELF_RELA
 	select HAVE_DEBUG_STACKOVERFLOW
 	select OR1K_PIC
+	select CPU_NO_EFFICIENT_FFS if !OPENRISC_HAVE_INST_FF1
 
 config MMU
 	def_bool y
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 88cfaa8..3d498a6 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -32,6 +32,7 @@ config PARISC
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_ARCH_SECCOMP_FILTER
 	select ARCH_NO_COHERENT_DMA_MMAP
+	select CPU_NO_EFFICIENT_FFS
 
 	help
 	  The PA-RISC microprocessor is designed by Hewlett-Packard and used
diff --git a/arch/score/Kconfig b/arch/score/Kconfig
index 366e1b5..507d631 100644
--- a/arch/score/Kconfig
+++ b/arch/score/Kconfig
@@ -14,6 +14,7 @@ config SCORE
 	select VIRT_TO_BUS
 	select MODULES_USE_ELF_REL
 	select CLONE_BACKWARDS
+	select CPU_NO_EFFICIENT_FFS
 
 choice
 	prompt "System type"
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 7ed20fc..56cf5e5 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -44,6 +44,7 @@ config SUPERH
 	select OLD_SIGSUSPEND
 	select OLD_SIGACTION
 	select HAVE_ARCH_AUDITSYSCALL
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  The SuperH is a RISC processor targeted for use in embedded systems
 	  and consumer electronics; it was also used in the Sega Dreamcast
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 57ffaf2..ca675ed 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -42,6 +42,7 @@ config SPARC
 	select ODD_RT_SIGACTION
 	select OLD_SIGSUSPEND
 	select ARCH_HAS_SG_CHAIN
+	select CPU_NO_EFFICIENT_FFS
 
 config SPARC32
 	def_bool !64BIT
diff --git a/lib/gcd.c b/lib/gcd.c
index 3657f12..eba7d4e 100644
--- a/lib/gcd.c
+++ b/lib/gcd.c
@@ -2,20 +2,68 @@
 #include <linux/gcd.h>
 #include <linux/export.h>
 
-/* Greatest common divisor */
+/*
+ * This implements the binary GCD algorithm. (Often attributed to Stein,
+ * but as Knuth has noted, appears a first-century Chinese math text.)
+ *
+ * This is faster than the division-based algorithm even on x86, which
+ * has decent hardware division.
+ */
+
+#if !defined(CONFIG_CPU_NO_EFFICIENT_FFS)
+
+/* If __ffs is available, the even/odd algorithm benchmarks slower. */
 unsigned long gcd(unsigned long a, unsigned long b)
 {
-	unsigned long r;
+	unsigned long r = a | b;
+
+	if (!a || !b)
+		return r;
 
-	if (a < b)
-		swap(a, b);
+	b >>= __ffs(b);
 
-	if (!b)
-		return a;
-	while ((r = a % b) != 0) {
-		a = b;
-		b = r;
+	for (;;) {
+		a >>= __ffs(a);
+		if (a == b)
+			return a << __ffs(r);
+		if (a < b)
+			swap(a, b);
+		a -= b;
 	}
+}
+
+#else
+
+/* If normalization is done by loops, the even/odd algorithm is a win. */
+unsigned long gcd(unsigned long a, unsigned long b)
+{
+	unsigned long r = a | b;
+
+	if (!a || !b)
+		return r;
+
+	/* Isolate lsbit of r */
+	r &= -r;
+
+	while (!(a & r))
+		a >>= 1;
+	while (!(b & r))
+		b >>= 1;
+
+	while (a != b) {
+		if (a < b)
+			swap(a, b);
+		a -= b;
+
+		a >>= 1;
+		if (a & r)
+			a += b;
+		do a >>= 1; while (!(a & r));
+	}
+
 	return b;
 }
+
+#endif
+
 EXPORT_SYMBOL_GPL(gcd);
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [patch V3] lib: GCD: add binary GCD algorithm
  2016-04-28 11:43 [patch V3] lib: GCD: add binary GCD algorithm zengzhaoxiu
@ 2016-04-28 12:18 ` kbuild test robot
  2016-04-28 16:48 ` George Spelvin
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 11+ messages in thread
From: kbuild test robot @ 2016-04-28 12:18 UTC (permalink / raw)
  To: zengzhaoxiu
  Cc: kbuild-all, akpm, linux, peterz, Zhaoxiu Zeng, Richard Henderson,
	Ivan Kokshaysky, Matt Turner, Russell King, Yoshinori Sato,
	Geert Uytterhoeven, James Hogan, Michal Simek, Ralf Baechle,
	Ley Foon Tan, Jonas Bonn, James E.J. Bottomley, Helge Deller,
	Chen Liqin, Lennox Wu, Rich Felker, David S. Miller,
	linux-kernel, linux-alpha, linux-arm-kernel, uclinux-h8-devel,
	linux-m68k, linux-metag, linux-mips, nios2-dev, linux,
	linux-parisc, linux-sh, sparclinux

[-- Attachment #1: Type: text/plain, Size: 14162 bytes --]

Hi,

[auto build test ERROR on v4.6-rc5]
[cannot apply to next-20160428]
[if your patch is applied to the wrong git tree, please drop us a note to help improving the system]

url:    https://github.com/0day-ci/linux/commits/zengzhaoxiu-163-com/lib-GCD-add-binary-GCD-algorithm/20160428-195527
config: mips-allyesconfig (attached as .config)
reproduce:
        wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=mips 

All error/warnings (new ones prefixed by >>):

   In file included from arch/mips/include/asm/bitops.h:21:0,
                    from include/linux/bitops.h:36,
                    from include/linux/kernel.h:10,
                    from include/asm-generic/bug.h:13,
                    from arch/mips/include/asm/bug.h:41,
                    from include/linux/bug.h:4,
                    from include/linux/page-flags.h:9,
                    from kernel/bounds.c:9:
>> arch/mips/include/asm/cpu-features.h:205:28: warning: "cpu_data" is not defined [-Wundef]
    # define cpu_has_mips32r6 (cpu_data[0].isa_level & MIPS_CPU_ISA_M32R6)
                               ^
>> arch/mips/include/asm/cpu-features.h:241:5: note: in expansion of macro 'cpu_has_mips32r6'
        cpu_has_mips32r6 | cpu_has_mips64r1 | \
        ^
>> arch/mips/include/asm/cpu-features.h:289:25: note: in expansion of macro 'cpu_has_mips_r'
    #define cpu_has_clo_clz cpu_has_mips_r
                            ^
>> arch/mips/include/asm/cpu-features.h:291:6: note: in expansion of macro 'cpu_has_clo_clz'
    #if !cpu_has_clo_clz
         ^
>> arch/mips/include/asm/cpu-features.h:205:36: error: token "[" is not valid in preprocessor expressions
    # define cpu_has_mips32r6 (cpu_data[0].isa_level & MIPS_CPU_ISA_M32R6)
                                       ^
>> arch/mips/include/asm/cpu-features.h:241:5: note: in expansion of macro 'cpu_has_mips32r6'
        cpu_has_mips32r6 | cpu_has_mips64r1 | \
        ^
>> arch/mips/include/asm/cpu-features.h:289:25: note: in expansion of macro 'cpu_has_mips_r'
    #define cpu_has_clo_clz cpu_has_mips_r
                            ^
>> arch/mips/include/asm/cpu-features.h:291:6: note: in expansion of macro 'cpu_has_clo_clz'
    #if !cpu_has_clo_clz
         ^
   make[2]: *** [kernel/bounds.s] Error 1
   make[2]: Target '__build' not remade because of errors.
   make[1]: *** [prepare0] Error 2
   make[1]: Target 'prepare' not remade because of errors.
   make: *** [sub-make] Error 2

vim +205 arch/mips/include/asm/cpu-features.h

0401572a9 include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  199  # define cpu_has_mips32r1	(cpu_data[0].isa_level & MIPS_CPU_ISA_M32R1)
0401572a9 include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  200  #endif
0401572a9 include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  201  #ifndef cpu_has_mips32r2
0401572a9 include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  202  # define cpu_has_mips32r2	(cpu_data[0].isa_level & MIPS_CPU_ISA_M32R2)
0401572a9 include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  203  #endif
34c56fc1c arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  204  #ifndef cpu_has_mips32r6
34c56fc1c arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13 @205  # define cpu_has_mips32r6	(cpu_data[0].isa_level & MIPS_CPU_ISA_M32R6)
34c56fc1c arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  206  #endif
0401572a9 include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  207  #ifndef cpu_has_mips64r1
0401572a9 include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  208  # define cpu_has_mips64r1	(cpu_data[0].isa_level & MIPS_CPU_ISA_M64R1)
0401572a9 include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  209  #endif
0401572a9 include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  210  #ifndef cpu_has_mips64r2
0401572a9 include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  211  # define cpu_has_mips64r2	(cpu_data[0].isa_level & MIPS_CPU_ISA_M64R2)
0401572a9 include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  212  #endif
34c56fc1c arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  213  #ifndef cpu_has_mips64r6
34c56fc1c arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  214  # define cpu_has_mips64r6	(cpu_data[0].isa_level & MIPS_CPU_ISA_M64R6)
34c56fc1c arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  215  #endif
0401572a9 include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  216  
0401572a9 include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  217  /*
0401572a9 include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  218   * Shortcuts ...
0401572a9 include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  219   */
08a07904e arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  220  #define cpu_has_mips_2_3_4_5	(cpu_has_mips_2 | cpu_has_mips_3_4_5)
08a07904e arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  221  #define cpu_has_mips_3_4_5	(cpu_has_mips_3 | cpu_has_mips_4_5)
08a07904e arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  222  #define cpu_has_mips_4_5	(cpu_has_mips_4 | cpu_has_mips_5)
08a07904e arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  223  
08a07904e arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  224  #define cpu_has_mips_2_3_4_5_r	(cpu_has_mips_2 | cpu_has_mips_3_4_5_r)
08a07904e arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  225  #define cpu_has_mips_3_4_5_r	(cpu_has_mips_3 | cpu_has_mips_4_5_r)
08a07904e arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  226  #define cpu_has_mips_4_5_r	(cpu_has_mips_4 | cpu_has_mips_5_r)
08a07904e arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  227  #define cpu_has_mips_5_r	(cpu_has_mips_5 | cpu_has_mips_r)
08a07904e arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  228  
2d83fea78 arch/mips/include/asm/cpu-features.h Maciej W. Rozycki 2015-04-03  229  #define cpu_has_mips_3_4_5_64_r2_r6					\
2d83fea78 arch/mips/include/asm/cpu-features.h Maciej W. Rozycki 2015-04-03  230  				(cpu_has_mips_3 | cpu_has_mips_4_5_64_r2_r6)
2d83fea78 arch/mips/include/asm/cpu-features.h Maciej W. Rozycki 2015-04-03  231  #define cpu_has_mips_4_5_64_r2_r6					\
2d83fea78 arch/mips/include/asm/cpu-features.h Maciej W. Rozycki 2015-04-03  232  				(cpu_has_mips_4_5 | cpu_has_mips64r1 |	\
2d83fea78 arch/mips/include/asm/cpu-features.h Maciej W. Rozycki 2015-04-03  233  				 cpu_has_mips_r2 | cpu_has_mips_r6)
08a07904e arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  234  
34c56fc1c arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  235  #define cpu_has_mips32	(cpu_has_mips32r1 | cpu_has_mips32r2 | cpu_has_mips32r6)
34c56fc1c arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  236  #define cpu_has_mips64	(cpu_has_mips64r1 | cpu_has_mips64r2 | cpu_has_mips64r6)
0401572a9 include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  237  #define cpu_has_mips_r1 (cpu_has_mips32r1 | cpu_has_mips64r1)
0401572a9 include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  238  #define cpu_has_mips_r2 (cpu_has_mips32r2 | cpu_has_mips64r2)
34c56fc1c arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  239  #define cpu_has_mips_r6	(cpu_has_mips32r6 | cpu_has_mips64r6)
c46b302b9 arch/mips/include/asm/cpu-features.h Ralf Baechle      2008-10-28  240  #define cpu_has_mips_r	(cpu_has_mips32r1 | cpu_has_mips32r2 | \
34c56fc1c arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13 @241  			 cpu_has_mips32r6 | cpu_has_mips64r1 | \
34c56fc1c arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  242  			 cpu_has_mips64r2 | cpu_has_mips64r6)
34c56fc1c arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  243  
34c56fc1c arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  244  /* MIPSR2 and MIPSR6 have a lot of similarities */
34c56fc1c arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  245  #define cpu_has_mips_r2_r6	(cpu_has_mips_r2 | cpu_has_mips_r6)
0401572a9 include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  246  
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  247  /*
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  248   * cpu_has_mips_r2_exec_hazard - return if IHB is required on current processor
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  249   *
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  250   * Returns non-zero value if the current processor implementation requires
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  251   * an IHB instruction to deal with an instruction hazard as per MIPS R2
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  252   * architecture specification, zero otherwise.
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  253   */
41f0e4d04 arch/mips/include/asm/cpu-features.h David Daney       2009-05-12  254  #ifndef cpu_has_mips_r2_exec_hazard
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  255  #define cpu_has_mips_r2_exec_hazard					\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  256  ({									\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  257  	int __res;							\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  258  									\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  259  	switch (current_cpu_type()) {					\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  260  	case CPU_M14KC:							\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  261  	case CPU_74K:							\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  262  	case CPU_1074K:							\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  263  	case CPU_PROAPTIV:						\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  264  	case CPU_P5600:							\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  265  	case CPU_M5150:							\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  266  	case CPU_QEMU_GENERIC:						\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  267  	case CPU_CAVIUM_OCTEON:						\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  268  	case CPU_CAVIUM_OCTEON_PLUS:					\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  269  	case CPU_CAVIUM_OCTEON2:					\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  270  	case CPU_CAVIUM_OCTEON3:					\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  271  		__res = 0;						\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  272  		break;							\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  273  									\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  274  	default:							\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  275  		__res = 1;						\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  276  	}								\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  277  									\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  278  	__res;								\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  279  })
41f0e4d04 arch/mips/include/asm/cpu-features.h David Daney       2009-05-12  280  #endif
41f0e4d04 arch/mips/include/asm/cpu-features.h David Daney       2009-05-12  281  
47740eb88 arch/mips/include/asm/cpu-features.h Ralf Baechle      2009-04-19  282  /*
47740eb88 arch/mips/include/asm/cpu-features.h Ralf Baechle      2009-04-19  283   * MIPS32, MIPS64, VR5500, IDT32332, IDT32334 and maybe a few other
becee6b8c arch/mips/include/asm/cpu-features.h Maciej W. Rozycki 2013-09-22  284   * pre-MIPS32/MIPS64 processors have CLO, CLZ.	The IDT RC64574 is 64-bit and
417a5eb02 arch/mips/include/asm/cpu-features.h Ralf Baechle      2010-08-05  285   * has CLO and CLZ but not DCLO nor DCLZ.  For 64-bit kernels
47740eb88 arch/mips/include/asm/cpu-features.h Ralf Baechle      2009-04-19  286   * cpu_has_clo_clz also indicates the availability of DCLO and DCLZ.
47740eb88 arch/mips/include/asm/cpu-features.h Ralf Baechle      2009-04-19  287   */
47740eb88 arch/mips/include/asm/cpu-features.h Ralf Baechle      2009-04-19  288  #ifndef cpu_has_clo_clz
47740eb88 arch/mips/include/asm/cpu-features.h Ralf Baechle      2009-04-19 @289  #define cpu_has_clo_clz	cpu_has_mips_r
47740eb88 arch/mips/include/asm/cpu-features.h Ralf Baechle      2009-04-19  290  #endif
35e1a24e8 arch/mips/include/asm/cpu-features.h Zhaoxiu Zeng      2016-04-28 @291  #if !cpu_has_clo_clz
35e1a24e8 arch/mips/include/asm/cpu-features.h Zhaoxiu Zeng      2016-04-28  292  #define CONFIG_CPU_NO_EFFICIENT_FFS 1
35e1a24e8 arch/mips/include/asm/cpu-features.h Zhaoxiu Zeng      2016-04-28  293  #endif
47740eb88 arch/mips/include/asm/cpu-features.h Ralf Baechle      2009-04-19  294  

:::::: The code at line 205 was first introduced by commit
:::::: 34c56fc1c167facc375d927687df0a3891d164ac MIPS: asm: cpu: Add MIPSR6 ISA definitions

:::::: TO: Leonid Yegoshin <Leonid.Yegoshin@imgtec.com>
:::::: CC: Markos Chandras <markos.chandras@imgtec.com>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/octet-stream, Size: 41439 bytes --]

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [patch V3] lib: GCD: add binary GCD algorithm
  2016-04-28 11:43 [patch V3] lib: GCD: add binary GCD algorithm zengzhaoxiu
  2016-04-28 12:18 ` kbuild test robot
@ 2016-04-28 16:48 ` George Spelvin
  2016-04-28 17:51   ` Geert Uytterhoeven
  2016-04-28 19:54   ` Sam Ravnborg
  2016-04-28 17:10 ` Josh Juran
                   ` (2 subsequent siblings)
  4 siblings, 2 replies; 11+ messages in thread
From: George Spelvin @ 2016-04-28 16:48 UTC (permalink / raw)
  To: akpm, linux, peterz, zengzhaoxiu
  Cc: dalias, davem, deller, geert, ink, james.hogan, jejb, jonas,
	lennox.wu, lftan, linux-alpha, linux-arm-kernel, linux-kernel,
	linux-m68k, linux-metag, linux-mips, linux-parisc, linux-sh,
	linux, linux, liqin.linux, mattst88, monstr, nios2-dev, ralf,
	rth, sparclinux, uclinux-h8-devel, ysato, zhaoxiu.zeng

Another few comments:

1. Would ARCH_HAS_FAST_FFS involve fewer changes than CPU_NO_EFFICIENT_FFS?
   
   Rather than updating all the Kconfig files, it could be #defined in
   the arch/*/asm/bitops.h files where the __ffs macro is defined.  E.g.

diff --git a/arch/alpha/include/asm/bitops.h b/arch/alpha/include/asm/bitops.h
index 4bdfbd44..c9c307a8 100644
--- a/arch/alpha/include/asm/bitops.h
+++ b/arch/alpha/include/asm/bitops.h
@@ -333,6 +333,7 @@ static inline unsigned long ffz(unsigned long word)
 static inline unsigned long __ffs(unsigned long word)
 {
 #if defined(CONFIG_ALPHA_EV6) && defined(CONFIG_ALPHA_EV67)
+#define ARCH_HAS_FAST_FFS 1
 	/* Whee.  EV67 can calculate it directly.  */
 	return __kernel_cttz(word);
 #else

Looking at the available architectures (list below), it looks like we
have 9 with fast __ffs, 13 without, and 7 where it depends on the model.
Three of the architectures without could have one written.

In terms of code changes, ARCH_HAS_SLOW_FFS would be slightly smaller,
inserting it into the asm-generic version and the few arch-specific
versions (marked "NO" below) which have optimized but bit-at-a-time code.

__ffs on the available architectures:
	Alpha: sometimes (CONFIG_ALPHA_EV6, CONFIG_ALPHA_EV67)
	ARC: sometimes (!CONFIG_ISA_ARCOMPACT)
	ARM: sometimes (V5+)
	ARM64: NO, could be written using RBIT and CLZ
	AVR: yes
	Blackfin: NO, could be written using hweight()
	C6x: yes
	CRIS: NO
	FR-V: yes
	H8300: NO
	Hexagon: yes
	IA64: yes
	M32R: NO
	M68k: sometimes
	MetaG: NO
	Microblaze: NO
	MIPS: sometimes
	MN10300: yes
	OpenRISC: NO
	PA-RISC: NO?  Interesting code, but I think it's a net loss.
	PowerPC: yes
	S390: sometimes (CONFIG_HAVE_MARCH_Z9_109_FEATURES)
	Score: NO
	SH: NO
	SPARC: NO
	Tile: NO, could be written using hweight()
	Unicore32: yes
	x86: yes
	Xtensa: sometimes (XCHAL_HAVE_NSA)


2. The documentation could definitely be improved.  If I may humbly
   recommend something like the following.  I think it's particularly
   important to say in the summary line that this is replacing something
   rather than adding (which sets off code bloat alarms).

+Subject: lib: GCD: Use binary GCD algorithm instead of Euclidean
+
+Even on x86 machines with reasonable division hardware, the binary
+algorithm runs about 25% faster (80% the execution time) than the
+division-based Euclidian algorithm.
+
+On platforms like Alpha and ARMv6 where division is a function call to
+emulation code, it's even more significant.
+
+There are two variants of the code here, depending on whether a
+fast __ffs (find least significant set bit) instruction is available.
+This allows the unpredictable branches in the bit-at-a-time shifting
+loop to be eliminated.
+
+If fast __ffs is not available, the "even/odd" GCD variant is used.
+This adds an additional test in the loop to choose between
+(a-b)/4 and (a+b)/4, dividing by 4 each iteration.  If fast __ffs
+is available, this doesn't help.


3. The benchmarking could be made more realistic.  Zhaoxiu Zeng's test
   program uses full-width random inputs.  However, if there is a large
   difference in the size of the inputs, it takes the binary algorithm
   many steps to do what division does in one.
   
   (Aside: I'd use informal address, but I don't know which name to use.
   Are those names in Western given-family order Zhaoxiu ZENG, or Eastern
   family-given order ZHAOXIU Zeng?)

For example, if I benchmark
	gcd(random64(), 1000000)
the binary code is barely faster on a Phenom, and if I drop that to
1000, it's actually 25% slower.  On Ivy Bridge, the binary code is
still consistently faster in both cases.

On a Pentium 4, if anyone cares, the binary code is 2.4x faster
on full-width inputs (32 bits in this case!), 25% faster with a fixed
1,000,000 and about 7% slower with a fixed 1,000.

It still seems like a net win to me, especially as the large speedups
apply to the worst case (so you're saving large*large), while the
slowdowns apply to the best case (you're losing small*small).
But if someone wants to do suggest more realistic benchmark conditions,
it would be interesting.

This entire function isn't actually used in any performance-sensitive
places AFAICT, so it's not very important one way or another, but I
understand the urge.


One way I changed the benchmark program was to eliminate the sleep(1)
calls, bump the iteration count 100-fold, and do two passes over the
list of, discarding the first one.

Without that, the Euclidean algorithm, being the first to run, gets a
huge penalty due to the CPU throttling up in the middle of its run.

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [patch V3] lib: GCD: add binary GCD algorithm
  2016-04-28 11:43 [patch V3] lib: GCD: add binary GCD algorithm zengzhaoxiu
  2016-04-28 12:18 ` kbuild test robot
  2016-04-28 16:48 ` George Spelvin
@ 2016-04-28 17:10 ` Josh Juran
  2016-04-28 17:22 ` James Bottomley
  2016-05-20 10:27 ` kbuild test robot
  4 siblings, 0 replies; 11+ messages in thread
From: Josh Juran @ 2016-04-28 17:10 UTC (permalink / raw)
  To: zengzhaoxiu
  Cc: akpm, linux, peterz, Zhaoxiu Zeng, Richard Henderson,
	Ivan Kokshaysky, Matt Turner, Russell King, Yoshinori Sato,
	Geert Uytterhoeven, James Hogan, Michal Simek, Ralf Baechle,
	Ley Foon Tan, Jonas Bonn, James E.J. Bottomley, Helge Deller,
	Chen Liqin, Lennox Wu, Rich Felker, David S. Miller,
	linux-kernel, linux-alpha, linux-arm-kernel, uclinux-h8-devel,
	linux-m68k, linux-metag, linux-mips, nios2-dev, linux,
	linux-parisc, linux-sh, sparclinux

On Apr 28, 2016, at 7:43 AM, zengzhaoxiu@163.com wrote:

> + * This implements the binary GCD algorithm. (Often attributed to Stein,
> + * but as Knuth has noted, appears a first-century Chinese math text.)

Should this be "appears in a"?

Josh

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [patch V3] lib: GCD: add binary GCD algorithm
  2016-04-28 11:43 [patch V3] lib: GCD: add binary GCD algorithm zengzhaoxiu
                   ` (2 preceding siblings ...)
  2016-04-28 17:10 ` Josh Juran
@ 2016-04-28 17:22 ` James Bottomley
  2016-05-20 10:27 ` kbuild test robot
  4 siblings, 0 replies; 11+ messages in thread
From: James Bottomley @ 2016-04-28 17:22 UTC (permalink / raw)
  To: zengzhaoxiu, akpm, linux, peterz
  Cc: Zhaoxiu Zeng, Richard Henderson, Ivan Kokshaysky, Matt Turner,
	Russell King, Yoshinori Sato, Geert Uytterhoeven, James Hogan,
	Michal Simek, Ralf Baechle, Ley Foon Tan, Jonas Bonn,
	James E.J. Bottomley, Helge Deller, Chen Liqin, Lennox Wu,
	Rich Felker, David S. Miller, linux-kernel, linux-alpha,
	linux-arm-kernel, uclinux-h8-devel, linux-m68k, linux-metag,
	linux-mips, nios2-dev, linux, linux-parisc, linux-sh, sparclinux

On Thu, 2016-04-28 at 19:43 +0800, zengzhaoxiu@163.com wrote:
> From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
> 
> Because some architectures (alpha, armv6, etc.) don't provide 
> hardware division, the mod operation is slow! Binary GCD algorithm 
> uses simple arithmetic operations, it replaces division with 
> arithmetic shifts, comparisons, and subtractions.
> 
> I have compiled successfully with x86_64_defconfig and 
> i386_defconfig.

What's the reason for wanting to optimize this and thus have to
maintain (and test) two separate code paths, which is a significant
expense? As far as I can see, gcd() is mosly used in finding optimal
clocks for operations, which is usually done at start of day and not
time critical.

James

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [patch V3] lib: GCD: add binary GCD algorithm
  2016-04-28 16:48 ` George Spelvin
@ 2016-04-28 17:51   ` Geert Uytterhoeven
  2016-04-28 17:58     ` Rich Felker
  2016-04-28 19:54   ` Sam Ravnborg
  1 sibling, 1 reply; 11+ messages in thread
From: Geert Uytterhoeven @ 2016-04-28 17:51 UTC (permalink / raw)
  To: George Spelvin
  Cc: Andrew Morton, Peter Zijlstra, zengzhaoxiu, Rich Felker,
	David S. Miller, Helge Deller, Ivan Kokshaysky, James Hogan,
	James E.J. Bottomley, Jonas Bonn, Lennox Wu, Ley Foon Tan, alpha,
	linux-arm-kernel, linux-kernel, linux-m68k,
	open list:METAG ARCHITECTURE, Linux MIPS Mailing List,
	Parisc List, Linux-sh list, Russell King, linux, Chen Liqin,
	Matt Turner, Michal Simek, nios2-dev, Ralf Baechle,
	Richard Henderson, sparclinux, uclinux-h8-devel, Yoshinori Sato,
	zhaoxiu.zeng

On Thu, Apr 28, 2016 at 6:48 PM, George Spelvin <linux@horizon.com> wrote:
> Another few comments:
>
> 1. Would ARCH_HAS_FAST_FFS involve fewer changes than CPU_NO_EFFICIENT_FFS?

No, as you want to _disable_ ARCH_HAS_FAST_FFS / _enable_
CPU_NO_EFFICIENT_FFS as soon as you're enabling support for a
CPU that doesn't support it.

Logical OR is easier in both the Kconfig and C preprocessor languages
than logical NAND.

E.g. in Kconfig, a CPU core not supporting it can just select
CPU_NO_EFFICIENT_FFS.

Gr{oetje,eeting}s,

                        Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [patch V3] lib: GCD: add binary GCD algorithm
  2016-04-28 17:51   ` Geert Uytterhoeven
@ 2016-04-28 17:58     ` Rich Felker
  2016-04-28 18:11       ` Geert Uytterhoeven
  2016-04-28 19:15       ` George Spelvin
  0 siblings, 2 replies; 11+ messages in thread
From: Rich Felker @ 2016-04-28 17:58 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: George Spelvin, Andrew Morton, Peter Zijlstra, zengzhaoxiu,
	David S. Miller, Helge Deller, Ivan Kokshaysky, James Hogan,
	James E.J. Bottomley, Jonas Bonn, Lennox Wu, Ley Foon Tan, alpha,
	linux-arm-kernel, linux-kernel, linux-m68k,
	open list:METAG ARCHITECTURE, Linux MIPS Mailing List,
	Parisc List, Linux-sh list, Russell King, linux, Chen Liqin,
	Matt Turner, Michal Simek, nios2-dev, Ralf Baechle,
	Richard Henderson, sparclinux, uclinux-h8-devel, Yoshinori Sato,
	zhaoxiu.zeng

On Thu, Apr 28, 2016 at 07:51:06PM +0200, Geert Uytterhoeven wrote:
> On Thu, Apr 28, 2016 at 6:48 PM, George Spelvin <linux@horizon.com> wrote:
> > Another few comments:
> >
> > 1. Would ARCH_HAS_FAST_FFS involve fewer changes than CPU_NO_EFFICIENT_FFS?
> 
> No, as you want to _disable_ ARCH_HAS_FAST_FFS / _enable_
> CPU_NO_EFFICIENT_FFS as soon as you're enabling support for a
> CPU that doesn't support it.
> 
> Logical OR is easier in both the Kconfig and C preprocessor languages
> than logical NAND.
> 
> E.g. in Kconfig, a CPU core not supporting it can just select
> CPU_NO_EFFICIENT_FFS.

How does a CPU lack an efficient ffs/ctz anyway? There are all sorts
of ways to implement it without a native insn, some of which are
almost or just as fast as the native insn on cpus that have the
latter. On anything with a fast multiply, the de Bruijn sequence
approach is near-optimal, and otherwise one of the binary-search type
approaches (possibly branchless) can be used. If the compiler doesn't
generate an appropriate one for __builtin_ctz, that's arguably a
compiler bug.

Rich

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [patch V3] lib: GCD: add binary GCD algorithm
  2016-04-28 17:58     ` Rich Felker
@ 2016-04-28 18:11       ` Geert Uytterhoeven
  2016-04-28 19:15       ` George Spelvin
  1 sibling, 0 replies; 11+ messages in thread
From: Geert Uytterhoeven @ 2016-04-28 18:11 UTC (permalink / raw)
  To: Rich Felker
  Cc: George Spelvin, Andrew Morton, Peter Zijlstra, zengzhaoxiu,
	David S. Miller, Helge Deller, Ivan Kokshaysky, James Hogan,
	James E.J. Bottomley, Jonas Bonn, Lennox Wu, Ley Foon Tan, alpha,
	linux-arm-kernel, linux-kernel, linux-m68k,
	open list:METAG ARCHITECTURE, Linux MIPS Mailing List,
	Parisc List, Linux-sh list, Russell King, linux, Chen Liqin,
	Matt Turner, Michal Simek, nios2-dev, Ralf Baechle,
	Richard Henderson, sparclinux, uclinux-h8-devel, Yoshinori Sato,
	zhaoxiu.zeng

On Thu, Apr 28, 2016 at 7:58 PM, Rich Felker <dalias@libc.org> wrote:
> On Thu, Apr 28, 2016 at 07:51:06PM +0200, Geert Uytterhoeven wrote:
>> On Thu, Apr 28, 2016 at 6:48 PM, George Spelvin <linux@horizon.com> wrote:
>> > Another few comments:
>> >
>> > 1. Would ARCH_HAS_FAST_FFS involve fewer changes than CPU_NO_EFFICIENT_FFS?
>>
>> No, as you want to _disable_ ARCH_HAS_FAST_FFS / _enable_
>> CPU_NO_EFFICIENT_FFS as soon as you're enabling support for a
>> CPU that doesn't support it.
>>
>> Logical OR is easier in both the Kconfig and C preprocessor languages
>> than logical NAND.
>>
>> E.g. in Kconfig, a CPU core not supporting it can just select
>> CPU_NO_EFFICIENT_FFS.
>
> How does a CPU lack an efficient ffs/ctz anyway? There are all sorts
> of ways to implement it without a native insn, some of which are
> almost or just as fast as the native insn on cpus that have the
> latter. On anything with a fast multiply, the de Bruijn sequence
> approach is near-optimal, and otherwise one of the binary-search type
> approaches (possibly branchless) can be used. If the compiler doesn't
> generate an appropriate one for __builtin_ctz, that's arguably a
> compiler bug.

m68k-linux-gcc 4.6.3 generates:

        jsr __ctzsi2

Gr{oetje,eeting}s,

                        Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [patch V3] lib: GCD: add binary GCD algorithm
  2016-04-28 17:58     ` Rich Felker
  2016-04-28 18:11       ` Geert Uytterhoeven
@ 2016-04-28 19:15       ` George Spelvin
  1 sibling, 0 replies; 11+ messages in thread
From: George Spelvin @ 2016-04-28 19:15 UTC (permalink / raw)
  To: dalias, geert
  Cc: akpm, davem, deller, ink, james.hogan, jejb, jonas, lennox.wu,
	lftan, linux-alpha, linux-arm-kernel, linux-kernel, linux-m68k,
	linux-metag, linux-mips, linux-parisc, linux-sh, linux, linux,
	linux, liqin.linux, mattst88, monstr, nios2-dev, peterz, ralf,
	rth, sparclinux, uclinux-h8-devel, ysato, zengzhaoxiu,
	zhaoxiu.zeng

> How does a CPU lack an efficient ffs/ctz anyway? There are all sorts
> of ways to implement it without a native insn, some of which are
> almost or just as fast as the native insn on cpus that have the
> latter. On anything with a fast multiply, the de Bruijn sequence
> approach is near-optimal, and otherwise one of the binary-search type
> approaches (possibly branchless) can be used. If the compiler doesn't
> generate an appropriate one for __builtin_ctz, that's arguably a
> compiler bug.

What's wanted here is something faster than any of those.
Yes, there's a simple constant-time branch-free implementation:

unsigned inline __attribute__((const))
hweight32(uint32_t x)
{
	x -= (x >> 1) & 0x55555555;
	x  = ((x >> 2) & 0x33333333) + (x & 0x33333333);
	x += x >> 4;
	x &= 0x0f0f0f0f;
	x += x >> 8;
	x += x >> 16;
	return x & 63;
}

unsigned inline __attribute__((const))
__ffs32(uint32_t x)
{
	return hweight(~x & (x-1));
}

but if you work it through, that's about 19 instructions; a few more on
platforms without 32-bit immediates.  The shift itself makes an even 20,
and there are a lot of sequential dependencies (I count a 17-op chain
including the shift) limiting execution time.

The de Bruijn hack reduces the length but adds a memory access for
the table lookup.  (http://supertech.csail.mit.edu/papers/debruijn.pdf)

In the GCD code, the number to normalize is basically random, so the
normalization loop shifts an average of 1 bit.  One bit half the time,
a second bit 1/4 of the time, etc.

(The posted code in the FAST_FFS case omits one guaranteed shift at the
end of the loop because the normalization code is constant-time.)

So "fast __ffs" basically means faster than *one* iteration of
"while (!(x & 1)) x >>= 1;".

In this case "fast" means cheaper than *one* unpredictable branch, which
is a very small handful of instructions.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [patch V3] lib: GCD: add binary GCD algorithm
  2016-04-28 16:48 ` George Spelvin
  2016-04-28 17:51   ` Geert Uytterhoeven
@ 2016-04-28 19:54   ` Sam Ravnborg
  1 sibling, 0 replies; 11+ messages in thread
From: Sam Ravnborg @ 2016-04-28 19:54 UTC (permalink / raw)
  To: George Spelvin
  Cc: akpm, peterz, zengzhaoxiu, dalias, davem, deller, geert, ink,
	james.hogan, jejb, jonas, lennox.wu, lftan, linux-alpha,
	linux-arm-kernel, linux-kernel, linux-m68k, linux-metag,
	linux-mips, linux-parisc, linux-sh, linux, liqin.linux, mattst88,
	monstr, nios2-dev, ralf, rth, sparclinux, uclinux-h8-devel,
	ysato, zhaoxiu.zeng

> __ffs on the available architectures:
> 	Alpha: sometimes (CONFIG_ALPHA_EV6, CONFIG_ALPHA_EV67)
> 	ARC: sometimes (!CONFIG_ISA_ARCOMPACT)
> 	ARM: sometimes (V5+)
> 	ARM64: NO, could be written using RBIT and CLZ
> 	AVR: yes
> 	Blackfin: NO, could be written using hweight()
> 	C6x: yes
> 	CRIS: NO
> 	FR-V: yes
> 	H8300: NO
> 	Hexagon: yes
> 	IA64: yes
> 	M32R: NO
> 	M68k: sometimes
> 	MetaG: NO
> 	Microblaze: NO
> 	MIPS: sometimes
> 	MN10300: yes
> 	OpenRISC: NO
> 	PA-RISC: NO?  Interesting code, but I think it's a net loss.
> 	PowerPC: yes
> 	S390: sometimes (CONFIG_HAVE_MARCH_Z9_109_FEATURES)
> 	Score: NO
> 	SH: NO
> 	SPARC: NO
SPARC: sparc64: YES, sparc32: NO
Patch needs to be updated to refelct this.

	Sam

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [patch V3] lib: GCD: add binary GCD algorithm
  2016-04-28 11:43 [patch V3] lib: GCD: add binary GCD algorithm zengzhaoxiu
                   ` (3 preceding siblings ...)
  2016-04-28 17:22 ` James Bottomley
@ 2016-05-20 10:27 ` kbuild test robot
  4 siblings, 0 replies; 11+ messages in thread
From: kbuild test robot @ 2016-05-20 10:27 UTC (permalink / raw)
  To: zengzhaoxiu
  Cc: kbuild-all, akpm, linux, peterz, Zhaoxiu Zeng, Richard Henderson,
	Ivan Kokshaysky, Matt Turner, Russell King, Yoshinori Sato,
	Geert Uytterhoeven, James Hogan, Michal Simek, Ralf Baechle,
	Ley Foon Tan, Jonas Bonn, James E.J. Bottomley, Helge Deller,
	Chen Liqin, Lennox Wu, Rich Felker, David S. Miller,
	linux-kernel, linux-alpha, linux-arm-kernel, uclinux-h8-devel,
	linux-m68k, linux-metag, linux-mips, nios2-dev, linux,
	linux-parisc, linux-sh, sparclinux

[-- Attachment #1: Type: text/plain, Size: 8871 bytes --]

Hi,

[auto build test ERROR on v4.6-rc5]
[cannot apply to next-20160519]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/zengzhaoxiu-163-com/lib-GCD-add-binary-GCD-algorithm/20160428-195527
config: mips-jz4740 (attached as .config)
compiler: mips-linux-gnu-gcc (Debian 5.3.1-8) 5.3.1 20160205
reproduce:
        wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=mips 

Note: the linux-review/zengzhaoxiu-163-com/lib-GCD-add-binary-GCD-algorithm/20160428-195527 HEAD 35e1a24e8fc3a5308b053ed3c744f02ec2a76820 builds fine.
      It only hurts bisectibility.

All errors (new ones prefixed by >>):

   In file included from arch/mips/include/asm/bitops.h:21:0,
                    from include/linux/bitops.h:36,
                    from include/linux/kernel.h:10,
                    from include/asm-generic/bug.h:13,
                    from arch/mips/include/asm/bug.h:41,
                    from include/linux/bug.h:4,
                    from include/linux/page-flags.h:9,
                    from kernel/bounds.c:9:
   arch/mips/include/asm/cpu-features.h:205:28: warning: "cpu_data" is not defined [-Wundef]
    # define cpu_has_mips32r6 (cpu_data[0].isa_level & MIPS_CPU_ISA_M32R6)
                               ^
   arch/mips/include/asm/cpu-features.h:241:5: note: in expansion of macro 'cpu_has_mips32r6'
        cpu_has_mips32r6 | cpu_has_mips64r1 | \
        ^
   arch/mips/include/asm/cpu-features.h:289:25: note: in expansion of macro 'cpu_has_mips_r'
    #define cpu_has_clo_clz cpu_has_mips_r
                            ^
   arch/mips/include/asm/cpu-features.h:291:6: note: in expansion of macro 'cpu_has_clo_clz'
    #if !cpu_has_clo_clz
         ^
>> arch/mips/include/asm/cpu-features.h:205:36: error: token "[" is not valid in preprocessor expressions
    # define cpu_has_mips32r6 (cpu_data[0].isa_level & MIPS_CPU_ISA_M32R6)
                                       ^
   arch/mips/include/asm/cpu-features.h:241:5: note: in expansion of macro 'cpu_has_mips32r6'
        cpu_has_mips32r6 | cpu_has_mips64r1 | \
        ^
   arch/mips/include/asm/cpu-features.h:289:25: note: in expansion of macro 'cpu_has_mips_r'
    #define cpu_has_clo_clz cpu_has_mips_r
                            ^
   arch/mips/include/asm/cpu-features.h:291:6: note: in expansion of macro 'cpu_has_clo_clz'
    #if !cpu_has_clo_clz
         ^
   make[2]: *** [kernel/bounds.s] Error 1
   make[2]: Target '__build' not remade because of errors.
   make[1]: *** [prepare0] Error 2
   make[1]: Target 'prepare' not remade because of errors.
   make: *** [sub-make] Error 2

vim +205 arch/mips/include/asm/cpu-features.h

0401572a include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  199  # define cpu_has_mips32r1	(cpu_data[0].isa_level & MIPS_CPU_ISA_M32R1)
0401572a include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  200  #endif
0401572a include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  201  #ifndef cpu_has_mips32r2
0401572a include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  202  # define cpu_has_mips32r2	(cpu_data[0].isa_level & MIPS_CPU_ISA_M32R2)
0401572a include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  203  #endif
34c56fc1 arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  204  #ifndef cpu_has_mips32r6
34c56fc1 arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13 @205  # define cpu_has_mips32r6	(cpu_data[0].isa_level & MIPS_CPU_ISA_M32R6)
34c56fc1 arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  206  #endif
0401572a include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  207  #ifndef cpu_has_mips64r1
0401572a include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  208  # define cpu_has_mips64r1	(cpu_data[0].isa_level & MIPS_CPU_ISA_M64R1)
0401572a include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  209  #endif
0401572a include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  210  #ifndef cpu_has_mips64r2
0401572a include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  211  # define cpu_has_mips64r2	(cpu_data[0].isa_level & MIPS_CPU_ISA_M64R2)
0401572a include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  212  #endif
34c56fc1 arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  213  #ifndef cpu_has_mips64r6
34c56fc1 arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  214  # define cpu_has_mips64r6	(cpu_data[0].isa_level & MIPS_CPU_ISA_M64R6)
34c56fc1 arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  215  #endif
0401572a include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  216  
0401572a include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  217  /*
0401572a include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  218   * Shortcuts ...
0401572a include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  219   */
08a07904 arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  220  #define cpu_has_mips_2_3_4_5	(cpu_has_mips_2 | cpu_has_mips_3_4_5)
08a07904 arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  221  #define cpu_has_mips_3_4_5	(cpu_has_mips_3 | cpu_has_mips_4_5)
08a07904 arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  222  #define cpu_has_mips_4_5	(cpu_has_mips_4 | cpu_has_mips_5)
08a07904 arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  223  
08a07904 arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  224  #define cpu_has_mips_2_3_4_5_r	(cpu_has_mips_2 | cpu_has_mips_3_4_5_r)
08a07904 arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  225  #define cpu_has_mips_3_4_5_r	(cpu_has_mips_3 | cpu_has_mips_4_5_r)
08a07904 arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  226  #define cpu_has_mips_4_5_r	(cpu_has_mips_4 | cpu_has_mips_5_r)
08a07904 arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  227  #define cpu_has_mips_5_r	(cpu_has_mips_5 | cpu_has_mips_r)
08a07904 arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  228  
2d83fea7 arch/mips/include/asm/cpu-features.h Maciej W. Rozycki 2015-04-03  229  #define cpu_has_mips_3_4_5_64_r2_r6					\
2d83fea7 arch/mips/include/asm/cpu-features.h Maciej W. Rozycki 2015-04-03  230  				(cpu_has_mips_3 | cpu_has_mips_4_5_64_r2_r6)
2d83fea7 arch/mips/include/asm/cpu-features.h Maciej W. Rozycki 2015-04-03  231  #define cpu_has_mips_4_5_64_r2_r6					\
2d83fea7 arch/mips/include/asm/cpu-features.h Maciej W. Rozycki 2015-04-03  232  				(cpu_has_mips_4_5 | cpu_has_mips64r1 |	\
2d83fea7 arch/mips/include/asm/cpu-features.h Maciej W. Rozycki 2015-04-03  233  				 cpu_has_mips_r2 | cpu_has_mips_r6)
08a07904 arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  234  
34c56fc1 arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  235  #define cpu_has_mips32	(cpu_has_mips32r1 | cpu_has_mips32r2 | cpu_has_mips32r6)
34c56fc1 arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  236  #define cpu_has_mips64	(cpu_has_mips64r1 | cpu_has_mips64r2 | cpu_has_mips64r6)
0401572a include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  237  #define cpu_has_mips_r1 (cpu_has_mips32r1 | cpu_has_mips64r1)
0401572a include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  238  #define cpu_has_mips_r2 (cpu_has_mips32r2 | cpu_has_mips64r2)
34c56fc1 arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  239  #define cpu_has_mips_r6	(cpu_has_mips32r6 | cpu_has_mips64r6)
c46b302b arch/mips/include/asm/cpu-features.h Ralf Baechle      2008-10-28  240  #define cpu_has_mips_r	(cpu_has_mips32r1 | cpu_has_mips32r2 | \
34c56fc1 arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13 @241  			 cpu_has_mips32r6 | cpu_has_mips64r1 | \
34c56fc1 arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  242  			 cpu_has_mips64r2 | cpu_has_mips64r6)
34c56fc1 arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  243  
34c56fc1 arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  244  /* MIPSR2 and MIPSR6 have a lot of similarities */

:::::: The code at line 205 was first introduced by commit
:::::: 34c56fc1c167facc375d927687df0a3891d164ac MIPS: asm: cpu: Add MIPSR6 ISA definitions

:::::: TO: Leonid Yegoshin <Leonid.Yegoshin@imgtec.com>
:::::: CC: Markos Chandras <markos.chandras@imgtec.com>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/octet-stream, Size: 18282 bytes --]

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2016-05-20 10:28 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-04-28 11:43 [patch V3] lib: GCD: add binary GCD algorithm zengzhaoxiu
2016-04-28 12:18 ` kbuild test robot
2016-04-28 16:48 ` George Spelvin
2016-04-28 17:51   ` Geert Uytterhoeven
2016-04-28 17:58     ` Rich Felker
2016-04-28 18:11       ` Geert Uytterhoeven
2016-04-28 19:15       ` George Spelvin
2016-04-28 19:54   ` Sam Ravnborg
2016-04-28 17:10 ` Josh Juran
2016-04-28 17:22 ` James Bottomley
2016-05-20 10:27 ` kbuild test robot

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).