* [patch V4] lib: GCD: Use binary GCD algorithm instead of Euclidean
@ 2016-05-06  9:42 ` zengzhaoxiu
From: zengzhaoxiu @ 2016-05-06  9:42 UTC (permalink / raw)
  To: linux-arm-kernel

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

The binary GCD algorithm is based on the following facts:
	1. If a and b are both even, then gcd(a,b) = 2 * gcd(a/2, b/2)
	2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
	3. If a and b are both odd, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)
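
For illustration only (not part of the patch), the three facts transcribe
directly into a recursive sketch; for example, gcd(48, 18) = 2 * gcd(24, 9)
= 2 * gcd(3, 9) = 2 * gcd(3, 3) = 6. The patch itself uses the iterative
forms shown further below.

	/* Illustrative recursive transcription of facts 1-3 above. */
	unsigned long gcd_recursive(unsigned long a, unsigned long b)
	{
		if (a == 0)
			return b;
		if (b == 0)
			return a;
		if (!(a & 1) && !(b & 1))	/* fact 1: both even */
			return gcd_recursive(a >> 1, b >> 1) << 1;
		if (!(a & 1))			/* fact 2: a even, b odd */
			return gcd_recursive(a >> 1, b);
		if (!(b & 1))			/* fact 2: b even, a odd */
			return gcd_recursive(a, b >> 1);
		if (a >= b)			/* fact 3: both odd */
			return gcd_recursive((a - b) >> 1, b);
		return gcd_recursive((b - a) >> 1, a);
	}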

Even on x86 machines with reasonable division hardware, the binary
algorithm runs about 25% faster (80% of the execution time) than the
division-based Euclidean algorithm.

On platforms like Alpha and ARMv6 where division is a function call to
emulation code, it's even more significant.

There are two variants of the code here, depending on whether a fast
__ffs (find least significant set bit) instruction is available. When
it is, the unpredictable branches in the bit-at-a-time shifting loop
can be eliminated.

If a fast __ffs is not available, the "even/odd" GCD variant is used
instead; it relies on fact 3 above (exactly one of (a-b)/2 and (a+b)/2
is even), so every subtraction is followed by at least two halvings
using only single-bit tests.
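
As a rough sketch (not from the patch), the normalization step differs
between the two variants roughly as follows, for a non-zero a:

	/* Without a fast __ffs: one data-dependent branch per trailing zero. */
	while (!(a & 1))
		a >>= 1;

	/* With a fast __ffs: a single count-trailing-zeros plus one shift. */
	a >>= __ffs(a);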

I used the following code to benchmark:

	#include <stdio.h>
	#include <stdlib.h>
	#include <stdint.h>
	#include <string.h>
	#include <time.h>
	#include <unistd.h>

	#define swap(a, b) \
		do { \
			a ^= b; \
			b ^= a; \
			a ^= b; \
		} while (0)

	unsigned long gcd0(unsigned long a, unsigned long b)
	{
		unsigned long r;

		if (a < b) {
			swap(a, b);
		}

		if (b == 0)
			return a;

		while ((r = a % b) != 0) {
			a = b;
			b = r;
		}

		return b;
	}

	unsigned long gcd1(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		b >>= __builtin_ctzl(b);

		for (;;) {
			a >>= __builtin_ctzl(a);
			if (a == b)
				return a << __builtin_ctzl(r);

			if (a < b)
				swap(a, b);
			a -= b;
		}
	}

	unsigned long gcd2(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		r &= -r;

		while (!(b & r))
			b >>= 1;

		for (;;) {
			while (!(a & r))
				a >>= 1;
			if (a == b)
				return a;

			if (a < b)
				swap(a, b);
			a -= b;
			a >>= 1;
			if (a & r)
				a += b;
			a >>= 1;
		}
	}

	unsigned long gcd3(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		b >>= __builtin_ctzl(b);
		if (b == 1)
			return r & -r;

		for (;;) {
			a >>= __builtin_ctzl(a);
			if (a == 1)
				return r & -r;
			if (a == b)
				return a << __builtin_ctzl(r);

			if (a < b)
				swap(a, b);
			a -= b;
		}
	}

	unsigned long gcd4(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		r &= -r;

		while (!(b & r))
			b >>= 1;
		if (b == r)
			return r;

		for (;;) {
			while (!(a & r))
				a >>= 1;
			if (a == r)
				return r;
			if (a == b)
				return a;

			if (a < b)
				swap(a, b);
			a -= b;
			a >>= 1;
			if (a & r)
				a += b;
			a >>= 1;
		}
	}

	static unsigned long (*gcd_func[])(unsigned long a, unsigned long b) = {
		gcd0, gcd1, gcd2, gcd3, gcd4,
	};

	#define TEST_ENTRIES (sizeof(gcd_func) / sizeof(gcd_func[0]))

	#if defined(__x86_64__)

	#define rdtscll(val) do { \
		unsigned long __a,__d; \
		__asm__ __volatile__("rdtsc" : "=a" (__a), "=d" (__d)); \
		(val) = ((unsigned long long)__a) | (((unsigned long long)__d)<<32); \
	} while(0)

	static unsigned long long benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long),
								unsigned long a, unsigned long b, unsigned long *res)
	{
		unsigned long long start, end;
		unsigned long long ret;
		unsigned long gcd_res;

		rdtscll(start);
		gcd_res = gcd(a, b);
		rdtscll(end);

		if (end >= start)
			ret = end - start;
		else
			ret = ~0ULL - start + 1 + end;

		*res = gcd_res;
		return ret;
	}

	#else

	static inline struct timespec read_time(void)
	{
		struct timespec time;
		clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time);
		return time;
	}

	static inline unsigned long long diff_time(struct timespec start, struct timespec end)
	{
		struct timespec temp;

		if ((end.tv_nsec - start.tv_nsec) < 0) {
			temp.tv_sec = end.tv_sec - start.tv_sec - 1;
			temp.tv_nsec = 1000000000ULL + end.tv_nsec - start.tv_nsec;
		} else {
			temp.tv_sec = end.tv_sec - start.tv_sec;
			temp.tv_nsec = end.tv_nsec - start.tv_nsec;
		}

		return temp.tv_sec * 1000000000ULL + temp.tv_nsec;
	}

	static unsigned long long benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long),
								unsigned long a, unsigned long b, unsigned long *res)
	{
		struct timespec start, end;
		unsigned long gcd_res;

		start = read_time();
		gcd_res = gcd(a, b);
		end = read_time();

		*res = gcd_res;
		return diff_time(start, end);
	}

	#endif

	static inline unsigned long get_rand()
	{
		if (sizeof(long) == 8)
			return (unsigned long)rand() << 32 | rand();
		else
			return rand();
	}

	int main(int argc, char **argv)
	{
		unsigned int seed = time(0);
		int loops = 100;
		int repeats = 1000;
		unsigned long (*res)[TEST_ENTRIES];
		unsigned long long elapsed[TEST_ENTRIES];
		int i, j, k;

		for (;;) {
			int opt = getopt(argc, argv, "n:r:s:");
			/* End condition always first */
			if (opt == -1)
				break;

			switch (opt) {
			case 'n':
				loops = atoi(optarg);
				break;
			case 'r':
				repeats = atoi(optarg);
				break;
			case 's':
				seed = strtoul(optarg, NULL, 10);
				break;
			default:
				/* You won't actually get here. */
				break;
			}
		}

		res = malloc(sizeof(unsigned long) * TEST_ENTRIES * loops);
		memset(elapsed, 0, sizeof(elapsed));

		srand(seed);
		for (j = 0; j < loops; j++) {
			unsigned long a = get_rand();
			/* Do we have args? */
			unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
			unsigned long long min_elapsed[TEST_ENTRIES];
			for (k = 0; k < repeats; k++) {
				for (i = 0; i < TEST_ENTRIES; i++) {
					unsigned long long tmp = benchmark_gcd_func(gcd_func[i], a, b, &res[j][i]);
					if (k == 0 || min_elapsed[i] > tmp)
						min_elapsed[i] = tmp;
				}
			}
			for (i = 0; i < TEST_ENTRIES; i++)
				elapsed[i] += min_elapsed[i];
		}

		for (i = 0; i < TEST_ENTRIES; i++)
			printf("gcd%d: elapsed %llu\n", i, elapsed[i]);

		k = 0;
		srand(seed);
		for (j = 0; j < loops; j++) {
			unsigned long a = get_rand();
			unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
			for (i = 1; i < TEST_ENTRIES; i++) {
				if (res[j][i] != res[j][0])
					break;
			}
			if (i < TEST_ENTRIES) {
				if (k == 0) {
					k = 1;
					fprintf(stderr, "Error:\n");
				}
				fprintf(stderr, "gcd(%lu, %lu): ", a, b);
				for (i = 0; i < TEST_ENTRIES; i++)
					fprintf(stderr, "%lu%s", res[j][i], i < TEST_ENTRIES - 1 ? ", " : "\n");
			}
		}

		if (k == 0)
			fprintf(stderr, "PASS\n");

		free(res);

		return 0;
	}
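
(A build command along these lines should work, assuming gcc; older glibc
may also need -lrt for the clock_gettime() path on non-x86_64 builds.)

	gcc -O2 -o gcd gcd.c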

Compiled with "-O2", running under VirtualBox on "4.4.0-22-generic #38-Ubuntu x86_64", I got:

zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
gcd0: elapsed 10174
gcd1: elapsed 2120
gcd2: elapsed 2902
gcd3: elapsed 2039
gcd4: elapsed 2812
PASS
zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
gcd0: elapsed 9309
gcd1: elapsed 2280
gcd2: elapsed 2822
gcd3: elapsed 2217
gcd4: elapsed 2710
PASS
zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
gcd0: elapsed 9589
gcd1: elapsed 2098
gcd2: elapsed 2815
gcd3: elapsed 2030
gcd4: elapsed 2718
PASS
zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
gcd0: elapsed 9914
gcd1: elapsed 2309
gcd2: elapsed 2779
gcd3: elapsed 2228
gcd4: elapsed 2709
PASS

Changes since V3:
- Fix build error
- Select CPU_NO_EFFICIENT_FFS if CONFIG_ARC && CONFIG_ISA_ARCOMPACT
- Select CPU_NO_EFFICIENT_FFS if CONFIG_S390 && !CONFIG_HAVE_MARCH_Z9_109_FEATURES
- Redo the benchmark
- Return immediately if one number becomes a power of 2
- Add comments written by George Spelvin

Changes since V2:
- Add a new Kconfig variable CPU_NO_EFFICIENT_FFS
- Separate into two versions by CPU_NO_EFFICIENT_FFS
- Return directly from the loop, rather than using break.
- Use "r &= -r" mostly because it's clearer.

Changes since V1:
- Don't touch Kconfig, remove the Euclidean algorithm implementation
- Don't use the "even-odd" variant
- Use __ffs if the CPU has efficient __ffs

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
Signed-off-by: George Spelvin <linux@horizon.com>
---
 arch/Kconfig                         |  3 ++
 arch/alpha/Kconfig                   |  1 +
 arch/arc/Kconfig                     |  1 +
 arch/arm/mm/Kconfig                  |  3 ++
 arch/h8300/Kconfig                   |  1 +
 arch/m32r/Kconfig                    |  1 +
 arch/m68k/Kconfig.cpu                | 11 ++++++
 arch/metag/Kconfig                   |  1 +
 arch/microblaze/Kconfig              |  1 +
 arch/mips/include/asm/cpu-features.h | 10 +++++
 arch/nios2/Kconfig                   |  1 +
 arch/openrisc/Kconfig                |  1 +
 arch/parisc/Kconfig                  |  1 +
 arch/s390/Kconfig                    |  2 +-
 arch/score/Kconfig                   |  1 +
 arch/sh/Kconfig                      |  1 +
 arch/sparc/Kconfig                   |  1 +
 lib/gcd.c                            | 77 +++++++++++++++++++++++++++++++-----
 18 files changed, 107 insertions(+), 11 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 81869a5..275f17d 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -638,4 +638,7 @@ config COMPAT_OLD_SIGACTION
 config ARCH_NO_COHERENT_DMA_MMAP
 	bool
 
+config CPU_NO_EFFICIENT_FFS
+	def_bool n
+
 source "kernel/gcov/Kconfig"
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 9d8a858..44e6f05 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -27,6 +27,7 @@ config ALPHA
 	select MODULES_USE_ELF_RELA
 	select ODD_RT_SIGACTION
 	select OLD_SIGSUSPEND
+	select CPU_NO_EFFICIENT_FFS if !ALPHA_EV67
 	help
 	  The Alpha is a 64-bit general-purpose processor designed and
 	  marketed by the Digital Equipment Corporation of blessed memory,
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index ec4791e..f41eb4c 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -101,6 +101,7 @@ choice
 
 config ISA_ARCOMPACT
 	bool "ARCompact ISA"
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  The original ARC ISA of ARC600/700 cores
 
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index 5534766..cb569b6 100644
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -421,18 +421,21 @@ config CPU_32v3
 	select CPU_USE_DOMAINS if MMU
 	select NEED_KUSER_HELPERS
 	select TLS_REG_EMUL if SMP || !MMU
+	select CPU_NO_EFFICIENT_FFS
 
 config CPU_32v4
 	bool
 	select CPU_USE_DOMAINS if MMU
 	select NEED_KUSER_HELPERS
 	select TLS_REG_EMUL if SMP || !MMU
+	select CPU_NO_EFFICIENT_FFS
 
 config CPU_32v4T
 	bool
 	select CPU_USE_DOMAINS if MMU
 	select NEED_KUSER_HELPERS
 	select TLS_REG_EMUL if SMP || !MMU
+	select CPU_NO_EFFICIENT_FFS
 
 config CPU_32v5
 	bool
diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig
index 986ea84..aa232de 100644
--- a/arch/h8300/Kconfig
+++ b/arch/h8300/Kconfig
@@ -20,6 +20,7 @@ config H8300
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_LZO
 	select HAVE_ARCH_KGDB
+	select CPU_NO_EFFICIENT_FFS
 
 config RWSEM_GENERIC_SPINLOCK
 	def_bool y
diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig
index c82b292..3cc8498 100644
--- a/arch/m32r/Kconfig
+++ b/arch/m32r/Kconfig
@@ -17,6 +17,7 @@ config M32R
 	select ARCH_USES_GETTIMEOFFSET
 	select MODULES_USE_ELF_RELA
 	select HAVE_DEBUG_STACKOVERFLOW
+	select CPU_NO_EFFICIENT_FFS
 
 config SBUS
 	bool
diff --git a/arch/m68k/Kconfig.cpu b/arch/m68k/Kconfig.cpu
index 0dfcf12..0b6efe8 100644
--- a/arch/m68k/Kconfig.cpu
+++ b/arch/m68k/Kconfig.cpu
@@ -40,6 +40,7 @@ config M68000
 	select CPU_HAS_NO_MULDIV64
 	select CPU_HAS_NO_UNALIGNED
 	select GENERIC_CSUM
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  The Freescale (was Motorola) 68000 CPU is the first generation of
 	  the well known M68K family of processors. The CPU core as well as
@@ -51,6 +52,7 @@ config MCPU32
 	bool
 	select CPU_HAS_NO_BITFIELDS
 	select CPU_HAS_NO_UNALIGNED
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  The Freescale (was then Motorola) CPU32 is a CPU core that is
 	  based on the 68020 processor. For the most part it is used in
@@ -130,6 +132,7 @@ config M5206
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5206 processor support.
 
@@ -138,6 +141,7 @@ config M5206e
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5206e processor support.
 
@@ -163,6 +167,7 @@ config M5249
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5249 processor support.
 
@@ -171,6 +176,7 @@ config M525x
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Freescale (Motorola) Coldfire 5251/5253 processor support.
 
@@ -189,6 +195,7 @@ config M5272
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5272 processor support.
 
@@ -217,6 +224,7 @@ config M5307
 	select COLDFIRE_SW_A7
 	select HAVE_CACHE_CB
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5307 processor support.
 
@@ -242,6 +250,7 @@ config M5407
 	select COLDFIRE_SW_A7
 	select HAVE_CACHE_CB
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5407 processor support.
 
@@ -251,6 +260,7 @@ config M547x
 	select MMU_COLDFIRE if MMU
 	select HAVE_CACHE_CB
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Freescale ColdFire 5470/5471/5472/5473/5474/5475 processor support.
 
@@ -260,6 +270,7 @@ config M548x
 	select M54xx
 	select HAVE_CACHE_CB
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Freescale ColdFire 5480/5481/5482/5483/5484/5485 processor support.
 
diff --git a/arch/metag/Kconfig b/arch/metag/Kconfig
index a0fa88d..2ac2de6 100644
--- a/arch/metag/Kconfig
+++ b/arch/metag/Kconfig
@@ -29,6 +29,7 @@ config METAG
 	select OF
 	select OF_EARLY_FLATTREE
 	select SPARSE_IRQ
+	select CPU_NO_EFFICIENT_FFS
 
 config STACKTRACE_SUPPORT
 	def_bool y
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index 3d793b5..f17c3a4 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -32,6 +32,7 @@ config MICROBLAZE
 	select OF_EARLY_FLATTREE
 	select TRACING_SUPPORT
 	select VIRT_TO_BUS
+	select CPU_NO_EFFICIENT_FFS
 
 config SWAP
 	def_bool n
diff --git a/arch/mips/include/asm/cpu-features.h b/arch/mips/include/asm/cpu-features.h
index eeec8c8..e20e100 100644
--- a/arch/mips/include/asm/cpu-features.h
+++ b/arch/mips/include/asm/cpu-features.h
@@ -180,6 +180,16 @@
 #endif
 #endif
 
+/* __builtin_constant_p(cpu_has_mips_r) && cpu_has_mips_r */
+#if !((defined(cpu_has_mips32r1) && cpu_has_mips32r1) || \
+	  (defined(cpu_has_mips32r2) && cpu_has_mips32r2) || \
+	  (defined(cpu_has_mips32r6) && cpu_has_mips32r6) || \
+	  (defined(cpu_has_mips64r1) && cpu_has_mips64r1) || \
+	  (defined(cpu_has_mips64r2) && cpu_has_mips64r2) || \
+	  (defined(cpu_has_mips64r6) && cpu_has_mips64r6))
+#define CONFIG_CPU_NO_EFFICIENT_FFS 1
+#endif
+
 #ifndef cpu_has_mips_1
 # define cpu_has_mips_1		(!cpu_has_mips_r6)
 #endif
diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig
index 4375554..f10bd2c 100644
--- a/arch/nios2/Kconfig
+++ b/arch/nios2/Kconfig
@@ -16,6 +16,7 @@ config NIOS2
 	select SOC_BUS
 	select SPARSE_IRQ
 	select USB_ARCH_HAS_HCD if USB_SUPPORT
+	select CPU_NO_EFFICIENT_FFS
 
 config GENERIC_CSUM
 	def_bool y
diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig
index e118c02..142cb05 100644
--- a/arch/openrisc/Kconfig
+++ b/arch/openrisc/Kconfig
@@ -25,6 +25,7 @@ config OPENRISC
 	select MODULES_USE_ELF_RELA
 	select HAVE_DEBUG_STACKOVERFLOW
 	select OR1K_PIC
+	select CPU_NO_EFFICIENT_FFS if !OPENRISC_HAVE_INST_FF1
 
 config MMU
 	def_bool y
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 88cfaa8..3d498a6 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -32,6 +32,7 @@ config PARISC
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_ARCH_SECCOMP_FILTER
 	select ARCH_NO_COHERENT_DMA_MMAP
+	select CPU_NO_EFFICIENT_FFS
 
 	help
 	  The PA-RISC microprocessor is designed by Hewlett-Packard and used
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index bf24ab1..9eb3932 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -164,7 +164,7 @@ config S390
 	select TTY
 	select VIRT_CPU_ACCOUNTING
 	select VIRT_TO_BUS
-
+	select CPU_NO_EFFICIENT_FFS if !HAVE_MARCH_Z9_109_FEATURES
 
 config SCHED_OMIT_FRAME_POINTER
 	def_bool y
diff --git a/arch/score/Kconfig b/arch/score/Kconfig
index 366e1b5..507d631 100644
--- a/arch/score/Kconfig
+++ b/arch/score/Kconfig
@@ -14,6 +14,7 @@ config SCORE
 	select VIRT_TO_BUS
 	select MODULES_USE_ELF_REL
 	select CLONE_BACKWARDS
+	select CPU_NO_EFFICIENT_FFS
 
 choice
 	prompt "System type"
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 7ed20fc..56cf5e5 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -44,6 +44,7 @@ config SUPERH
 	select OLD_SIGSUSPEND
 	select OLD_SIGACTION
 	select HAVE_ARCH_AUDITSYSCALL
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  The SuperH is a RISC processor targeted for use in embedded systems
 	  and consumer electronics; it was also used in the Sega Dreamcast
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 57ffaf2..ca675ed 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -42,6 +42,7 @@ config SPARC
 	select ODD_RT_SIGACTION
 	select OLD_SIGSUSPEND
 	select ARCH_HAS_SG_CHAIN
+	select CPU_NO_EFFICIENT_FFS
 
 config SPARC32
 	def_bool !64BIT
diff --git a/lib/gcd.c b/lib/gcd.c
index 3657f12..b9fbe73 100644
--- a/lib/gcd.c
+++ b/lib/gcd.c
@@ -2,20 +2,77 @@
 #include <linux/gcd.h>
 #include <linux/export.h>
 
-/* Greatest common divisor */
+/*
+ * This implements the binary GCD algorithm. (Often attributed to Stein,
+ * but as Knuth has noted, appears in a first-century Chinese math text.)
+ *
+ * This is faster than the division-based algorithm even on x86, which
+ * has decent hardware division.
+ */
+
+#if !defined(CONFIG_CPU_NO_EFFICIENT_FFS)
+
+/* If __ffs is available, the even/odd algorithm benchmarks slower. */
 unsigned long gcd(unsigned long a, unsigned long b)
 {
-	unsigned long r;
+	unsigned long r = a | b;
+
+	if (!a || !b)
+		return r;
 
-	if (a < b)
-		swap(a, b);
+	b >>= __ffs(b);
+	if (b == 1)
+		return r & -r;
 
-	if (!b)
-		return a;
-	while ((r = a % b) != 0) {
-		a = b;
-		b = r;
+	for (;;) {
+		a >>= __ffs(a);
+		if (a == 1)
+			return r & -r;
+		if (a == b)
+			return a << __ffs(r);
+
+		if (a < b)
+			swap(a, b);
+		a -= b;
 	}
-	return b;
 }
+
+#else
+
+/* If normalization is done by loops, the even/odd algorithm is a win. */
+unsigned long gcd(unsigned long a, unsigned long b)
+{
+	unsigned long r = a | b;
+
+	if (!a || !b)
+		return r;
+
+	/* Isolate lsbit of r */
+	r &= -r;
+
+	while (!(b & r))
+		b >>= 1;
+	if (b == r)
+		return r;
+
+	for (;;) {
+		while (!(a & r))
+			a >>= 1;
+		if (a == r)
+			return r;
+		if (a == b)
+			return a;
+
+		if (a < b)
+			swap(a, b);
+		a -= b;
+		a >>= 1;
+		if (a & r)
+			a += b;
+		a >>= 1;
+	}
+}
+
+#endif
+
 EXPORT_SYMBOL_GPL(gcd);
-- 
2.7.4



^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [patch V4] lib: GCD: Use binary GCD algorithm instead of Euclidean
@ 2016-05-06  9:42 ` zengzhaoxiu
  0 siblings, 0 replies; 41+ messages in thread
From: zengzhaoxiu @ 2016-05-06  9:42 UTC (permalink / raw)
  To: akpm, linux, peterz, jjuran, James.Bottomley, geert, dalias, sam, davem
  Cc: Zhaoxiu Zeng, Richard Henderson, Ivan Kokshaysky, Matt Turner,
	Vineet Gupta, Russell King, Yoshinori Sato, James Hogan,
	Michal Simek, Ralf Baechle, Ley Foon Tan, Jonas Bonn,
	James E.J. Bottomley, Helge Deller, Martin Schwidefsky,
	Heiko Carstens, Chen Liqin, Lennox Wu, linux-kernel, linux-alpha,
	linux-snps

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

The binary GCD algorithm is based on the following facts:
	1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2)
	2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
	3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)

Even on x86 machines with reasonable division hardware, the binary
algorithm runs about 25% faster (80% the execution time) than the
division-based Euclidian algorithm.

On platforms like Alpha and ARMv6 where division is a function call to
emulation code, it's even more significant.

There are two variants of the code here, depending on whether a
fast __ffs (find least significant set bit) instruction is available.
This allows the unpredictable branches in the bit-at-a-time shifting
loop to be eliminated.

If fast __ffs is not available, the "even/odd" GCD variant is used.

I use the following code to benchmark:

	#include <stdio.h>
	#include <stdlib.h>
	#include <stdint.h>
	#include <string.h>
	#include <time.h>
	#include <unistd.h>

	#define swap(a, b) \
		do { \
			a ^= b; \
			b ^= a; \
			a ^= b; \
		} while (0)

	unsigned long gcd0(unsigned long a, unsigned long b)
	{
		unsigned long r;

		if (a < b) {
			swap(a, b);
		}

		if (b == 0)
			return a;

		while ((r = a % b) != 0) {
			a = b;
			b = r;
		}

		return b;
	}

	unsigned long gcd1(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		b >>= __builtin_ctzl(b);

		for (;;) {
			a >>= __builtin_ctzl(a);
			if (a == b)
				return a << __builtin_ctzl(r);

			if (a < b)
				swap(a, b);
			a -= b;
		}
	}

	unsigned long gcd2(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		r &= -r;

		while (!(b & r))
			b >>= 1;

		for (;;) {
			while (!(a & r))
				a >>= 1;
			if (a == b)
				return a;

			if (a < b)
				swap(a, b);
			a -= b;
			a >>= 1;
			if (a & r)
				a += b;
			a >>= 1;
		}
	}

	unsigned long gcd3(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		b >>= __builtin_ctzl(b);
		if (b == 1)
			return r & -r;

		for (;;) {
			a >>= __builtin_ctzl(a);
			if (a == 1)
				return r & -r;
			if (a == b)
				return a << __builtin_ctzl(r);

			if (a < b)
				swap(a, b);
			a -= b;
		}
	}

	unsigned long gcd4(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		r &= -r;

		while (!(b & r))
			b >>= 1;
		if (b == r)
			return r;

		for (;;) {
			while (!(a & r))
				a >>= 1;
			if (a == r)
				return r;
			if (a == b)
				return a;

			if (a < b)
				swap(a, b);
			a -= b;
			a >>= 1;
			if (a & r)
				a += b;
			a >>= 1;
		}
	}

	static unsigned long (*gcd_func[])(unsigned long a, unsigned long b) = {
		gcd0, gcd1, gcd2, gcd3, gcd4,
	};

	#define TEST_ENTRIES (sizeof(gcd_func) / sizeof(gcd_func[0]))

	#if defined(__x86_64__)

	#define rdtscll(val) do { \
		unsigned long __a,__d; \
		__asm__ __volatile__("rdtsc" : "=a" (__a), "=d" (__d)); \
		(val) = ((unsigned long long)__a) | (((unsigned long long)__d)<<32); \
	} while(0)

	static unsigned long long benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long),
								unsigned long a, unsigned long b, unsigned long *res)
	{
		unsigned long long start, end;
		unsigned long long ret;
		unsigned long gcd_res;

		rdtscll(start);
		gcd_res = gcd(a, b);
		rdtscll(end);

		if (end >= start)
			ret = end - start;
		else
			ret = ~0ULL - start + 1 + end;

		*res = gcd_res;
		return ret;
	}

	#else

	static inline struct timespec read_time(void)
	{
		struct timespec time;
		clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time);
		return time;
	}

	static inline unsigned long long diff_time(struct timespec start, struct timespec end)
	{
		struct timespec temp;

		if ((end.tv_nsec - start.tv_nsec) < 0) {
			temp.tv_sec = end.tv_sec - start.tv_sec - 1;
			temp.tv_nsec = 1000000000ULL + end.tv_nsec - start.tv_nsec;
		} else {
			temp.tv_sec = end.tv_sec - start.tv_sec;
			temp.tv_nsec = end.tv_nsec - start.tv_nsec;
		}

		return temp.tv_sec * 1000000000ULL + temp.tv_nsec;
	}

	static unsigned long long benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long),
								unsigned long a, unsigned long b, unsigned long *res)
	{
		struct timespec start, end;
		unsigned long gcd_res;

		start = read_time();
		gcd_res = gcd(a, b);
		end = read_time();

		*res = gcd_res;
		return diff_time(start, end);
	}

	#endif

	static inline unsigned long get_rand()
	{
		if (sizeof(long) == 8)
			return (unsigned long)rand() << 32 | rand();
		else
			return rand();
	}

	int main(int argc, char **argv)
	{
		unsigned int seed = time(0);
		int loops = 100;
		int repeats = 1000;
		unsigned long (*res)[TEST_ENTRIES];
		unsigned long long elapsed[TEST_ENTRIES];
		int i, j, k;

		for (;;) {
			int opt = getopt(argc, argv, "n:r:s:");
			/* End condition always first */
			if (opt == -1)
				break;

			switch (opt) {
			case 'n':
				loops = atoi(optarg);
				break;
			case 'r':
				repeats = atoi(optarg);
				break;
			case 's':
				seed = strtoul(optarg, NULL, 10);
				break;
			default:
				/* You won't actually get here. */
				break;
			}
		}

		res = malloc(sizeof(unsigned long) * TEST_ENTRIES * loops);
		memset(elapsed, 0, sizeof(elapsed));

		srand(seed);
		for (j = 0; j < loops; j++) {
			unsigned long a = get_rand();
			/* Do we have args? */
			unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
			unsigned long long min_elapsed[TEST_ENTRIES];
			for (k = 0; k < repeats; k++) {
				for (i = 0; i < TEST_ENTRIES; i++) {
					unsigned long long tmp = benchmark_gcd_func(gcd_func[i], a, b, &res[j][i]);
					if (k == 0 || min_elapsed[i] > tmp)
						min_elapsed[i] = tmp;
				}
			}
			for (i = 0; i < TEST_ENTRIES; i++)
				elapsed[i] += min_elapsed[i];
		}

		for (i = 0; i < TEST_ENTRIES; i++)
			printf("gcd%d: elapsed %llu\n", i, elapsed[i]);

		k = 0;
		srand(seed);
		for (j = 0; j < loops; j++) {
			unsigned long a = get_rand();
			unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
			for (i = 1; i < TEST_ENTRIES; i++) {
				if (res[j][i] != res[j][0])
					break;
			}
			if (i < TEST_ENTRIES) {
				if (k == 0) {
					k = 1;
					fprintf(stderr, "Error:\n");
				}
				fprintf(stderr, "gcd(%lu, %lu): ", a, b);
				for (i = 0; i < TEST_ENTRIES; i++)
					fprintf(stderr, "%ld%s", res[j][i], i < TEST_ENTRIES - 1 ? ", " : "\n");
			}
		}

		if (k == 0)
			fprintf(stderr, "PASS\n");

		free(res);

		return 0;
	}

Compiled with "-O2", on "VirtualBox 4.4.0-22-generic #38-Ubuntu x86_64" got:

zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
gcd0: elapsed 10174
gcd1: elapsed 2120
gcd2: elapsed 2902
gcd3: elapsed 2039
gcd4: elapsed 2812
PASS
zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
gcd0: elapsed 9309
gcd1: elapsed 2280
gcd2: elapsed 2822
gcd3: elapsed 2217
gcd4: elapsed 2710
PASS
zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
gcd0: elapsed 9589
gcd1: elapsed 2098
gcd2: elapsed 2815
gcd3: elapsed 2030
gcd4: elapsed 2718
PASS
zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
gcd0: elapsed 9914
gcd1: elapsed 2309
gcd2: elapsed 2779
gcd3: elapsed 2228
gcd4: elapsed 2709
PASS

Changes to V3:
- Fix build error
- Select CPU_NO_EFFICIENT_FFS if CONFIG_ARC && CONFIG_ISA_ARCOMPACT
- Select CPU_NO_EFFICIENT_FFS if CONFIG_S390 && !CONFIG_HAVE_MARCH_Z9_109_FEATURES
- Do new brnchmark
- Return immediately if one number becomes a power of 2
- Add comments written by George Spelvin

Changes to V2:
- Add a new Kconfig variable CPU_NO_EFFICIENT_FFS
- Separate into two versions by CPU_NO_EFFICIENT_FFS
- Return directly from the loop, rather than using break().
- Use "r &= -r" mostly because it's clearer.

Changes to V1:
- Don't touch Kconfig, remove the Euclidean algorithm implementation
- Don't use the "even-odd" variant
- Use __ffs if the CPU has efficient __ffs

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
Signed-off-by: George Spelvin <linux@horizon.com>
---
 arch/Kconfig                         |  3 ++
 arch/alpha/Kconfig                   |  1 +
 arch/arc/Kconfig                     |  1 +
 arch/arm/mm/Kconfig                  |  3 ++
 arch/h8300/Kconfig                   |  1 +
 arch/m32r/Kconfig                    |  1 +
 arch/m68k/Kconfig.cpu                | 11 ++++++
 arch/metag/Kconfig                   |  1 +
 arch/microblaze/Kconfig              |  1 +
 arch/mips/include/asm/cpu-features.h | 10 +++++
 arch/nios2/Kconfig                   |  1 +
 arch/openrisc/Kconfig                |  1 +
 arch/parisc/Kconfig                  |  1 +
 arch/s390/Kconfig                    |  2 +-
 arch/score/Kconfig                   |  1 +
 arch/sh/Kconfig                      |  1 +
 arch/sparc/Kconfig                   |  1 +
 lib/gcd.c                            | 77 +++++++++++++++++++++++++++++++-----
 18 files changed, 107 insertions(+), 11 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 81869a5..275f17d 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -638,4 +638,7 @@ config COMPAT_OLD_SIGACTION
 config ARCH_NO_COHERENT_DMA_MMAP
 	bool
 
+config CPU_NO_EFFICIENT_FFS
+	def_bool n
+
 source "kernel/gcov/Kconfig"
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 9d8a858..44e6f05 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -27,6 +27,7 @@ config ALPHA
 	select MODULES_USE_ELF_RELA
 	select ODD_RT_SIGACTION
 	select OLD_SIGSUSPEND
+	select CPU_NO_EFFICIENT_FFS if !ALPHA_EV67
 	help
 	  The Alpha is a 64-bit general-purpose processor designed and
 	  marketed by the Digital Equipment Corporation of blessed memory,
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index ec4791e..f41eb4c 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -101,6 +101,7 @@ choice
 
 config ISA_ARCOMPACT
 	bool "ARCompact ISA"
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  The original ARC ISA of ARC600/700 cores
 
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index 5534766..cb569b6 100644
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -421,18 +421,21 @@ config CPU_32v3
 	select CPU_USE_DOMAINS if MMU
 	select NEED_KUSER_HELPERS
 	select TLS_REG_EMUL if SMP || !MMU
+	select CPU_NO_EFFICIENT_FFS
 
 config CPU_32v4
 	bool
 	select CPU_USE_DOMAINS if MMU
 	select NEED_KUSER_HELPERS
 	select TLS_REG_EMUL if SMP || !MMU
+	select CPU_NO_EFFICIENT_FFS
 
 config CPU_32v4T
 	bool
 	select CPU_USE_DOMAINS if MMU
 	select NEED_KUSER_HELPERS
 	select TLS_REG_EMUL if SMP || !MMU
+	select CPU_NO_EFFICIENT_FFS
 
 config CPU_32v5
 	bool
diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig
index 986ea84..aa232de 100644
--- a/arch/h8300/Kconfig
+++ b/arch/h8300/Kconfig
@@ -20,6 +20,7 @@ config H8300
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_LZO
 	select HAVE_ARCH_KGDB
+	select CPU_NO_EFFICIENT_FFS
 
 config RWSEM_GENERIC_SPINLOCK
 	def_bool y
diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig
index c82b292..3cc8498 100644
--- a/arch/m32r/Kconfig
+++ b/arch/m32r/Kconfig
@@ -17,6 +17,7 @@ config M32R
 	select ARCH_USES_GETTIMEOFFSET
 	select MODULES_USE_ELF_RELA
 	select HAVE_DEBUG_STACKOVERFLOW
+	select CPU_NO_EFFICIENT_FFS
 
 config SBUS
 	bool
diff --git a/arch/m68k/Kconfig.cpu b/arch/m68k/Kconfig.cpu
index 0dfcf12..0b6efe8 100644
--- a/arch/m68k/Kconfig.cpu
+++ b/arch/m68k/Kconfig.cpu
@@ -40,6 +40,7 @@ config M68000
 	select CPU_HAS_NO_MULDIV64
 	select CPU_HAS_NO_UNALIGNED
 	select GENERIC_CSUM
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  The Freescale (was Motorola) 68000 CPU is the first generation of
 	  the well known M68K family of processors. The CPU core as well as
@@ -51,6 +52,7 @@ config MCPU32
 	bool
 	select CPU_HAS_NO_BITFIELDS
 	select CPU_HAS_NO_UNALIGNED
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  The Freescale (was then Motorola) CPU32 is a CPU core that is
 	  based on the 68020 processor. For the most part it is used in
@@ -130,6 +132,7 @@ config M5206
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5206 processor support.
 
@@ -138,6 +141,7 @@ config M5206e
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5206e processor support.
 
@@ -163,6 +167,7 @@ config M5249
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5249 processor support.
 
@@ -171,6 +176,7 @@ config M525x
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Freescale (Motorola) Coldfire 5251/5253 processor support.
 
@@ -189,6 +195,7 @@ config M5272
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5272 processor support.
 
@@ -217,6 +224,7 @@ config M5307
 	select COLDFIRE_SW_A7
 	select HAVE_CACHE_CB
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5307 processor support.
 
@@ -242,6 +250,7 @@ config M5407
 	select COLDFIRE_SW_A7
 	select HAVE_CACHE_CB
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5407 processor support.
 
@@ -251,6 +260,7 @@ config M547x
 	select MMU_COLDFIRE if MMU
 	select HAVE_CACHE_CB
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Freescale ColdFire 5470/5471/5472/5473/5474/5475 processor support.
 
@@ -260,6 +270,7 @@ config M548x
 	select M54xx
 	select HAVE_CACHE_CB
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Freescale ColdFire 5480/5481/5482/5483/5484/5485 processor support.
 
diff --git a/arch/metag/Kconfig b/arch/metag/Kconfig
index a0fa88d..2ac2de6 100644
--- a/arch/metag/Kconfig
+++ b/arch/metag/Kconfig
@@ -29,6 +29,7 @@ config METAG
 	select OF
 	select OF_EARLY_FLATTREE
 	select SPARSE_IRQ
+	select CPU_NO_EFFICIENT_FFS
 
 config STACKTRACE_SUPPORT
 	def_bool y
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index 3d793b5..f17c3a4 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -32,6 +32,7 @@ config MICROBLAZE
 	select OF_EARLY_FLATTREE
 	select TRACING_SUPPORT
 	select VIRT_TO_BUS
+	select CPU_NO_EFFICIENT_FFS
 
 config SWAP
 	def_bool n
diff --git a/arch/mips/include/asm/cpu-features.h b/arch/mips/include/asm/cpu-features.h
index eeec8c8..e20e100 100644
--- a/arch/mips/include/asm/cpu-features.h
+++ b/arch/mips/include/asm/cpu-features.h
@@ -180,6 +180,16 @@
 #endif
 #endif
 
+/* __builtin_constant_p(cpu_has_mips_r) && cpu_has_mips_r */
+#if !((defined(cpu_has_mips32r1) && cpu_has_mips32r1) || \
+	  (defined(cpu_has_mips32r2) && cpu_has_mips32r2) || \
+	  (defined(cpu_has_mips32r6) && cpu_has_mips32r6) || \
+	  (defined(cpu_has_mips64r1) && cpu_has_mips64r1) || \
+	  (defined(cpu_has_mips64r2) && cpu_has_mips64r2) || \
+	  (defined(cpu_has_mips64r6) && cpu_has_mips64r6))
+#define CONFIG_CPU_NO_EFFICIENT_FFS 1
+#endif
+
 #ifndef cpu_has_mips_1
 # define cpu_has_mips_1		(!cpu_has_mips_r6)
 #endif
diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig
index 4375554..f10bd2c 100644
--- a/arch/nios2/Kconfig
+++ b/arch/nios2/Kconfig
@@ -16,6 +16,7 @@ config NIOS2
 	select SOC_BUS
 	select SPARSE_IRQ
 	select USB_ARCH_HAS_HCD if USB_SUPPORT
+	select CPU_NO_EFFICIENT_FFS
 
 config GENERIC_CSUM
 	def_bool y
diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig
index e118c02..142cb05 100644
--- a/arch/openrisc/Kconfig
+++ b/arch/openrisc/Kconfig
@@ -25,6 +25,7 @@ config OPENRISC
 	select MODULES_USE_ELF_RELA
 	select HAVE_DEBUG_STACKOVERFLOW
 	select OR1K_PIC
+	select CPU_NO_EFFICIENT_FFS if !OPENRISC_HAVE_INST_FF1
 
 config MMU
 	def_bool y
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 88cfaa8..3d498a6 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -32,6 +32,7 @@ config PARISC
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_ARCH_SECCOMP_FILTER
 	select ARCH_NO_COHERENT_DMA_MMAP
+	select CPU_NO_EFFICIENT_FFS
 
 	help
 	  The PA-RISC microprocessor is designed by Hewlett-Packard and used
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index bf24ab1..9eb3932 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -164,7 +164,7 @@ config S390
 	select TTY
 	select VIRT_CPU_ACCOUNTING
 	select VIRT_TO_BUS
-
+	select CPU_NO_EFFICIENT_FFS if !HAVE_MARCH_Z9_109_FEATURES
 
 config SCHED_OMIT_FRAME_POINTER
 	def_bool y
diff --git a/arch/score/Kconfig b/arch/score/Kconfig
index 366e1b5..507d631 100644
--- a/arch/score/Kconfig
+++ b/arch/score/Kconfig
@@ -14,6 +14,7 @@ config SCORE
 	select VIRT_TO_BUS
 	select MODULES_USE_ELF_REL
 	select CLONE_BACKWARDS
+	select CPU_NO_EFFICIENT_FFS
 
 choice
 	prompt "System type"
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 7ed20fc..56cf5e5 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -44,6 +44,7 @@ config SUPERH
 	select OLD_SIGSUSPEND
 	select OLD_SIGACTION
 	select HAVE_ARCH_AUDITSYSCALL
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  The SuperH is a RISC processor targeted for use in embedded systems
 	  and consumer electronics; it was also used in the Sega Dreamcast
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 57ffaf2..ca675ed 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -42,6 +42,7 @@ config SPARC
 	select ODD_RT_SIGACTION
 	select OLD_SIGSUSPEND
 	select ARCH_HAS_SG_CHAIN
+	select CPU_NO_EFFICIENT_FFS
 
 config SPARC32
 	def_bool !64BIT
diff --git a/lib/gcd.c b/lib/gcd.c
index 3657f12..b9fbe73 100644
--- a/lib/gcd.c
+++ b/lib/gcd.c
@@ -2,20 +2,77 @@
 #include <linux/gcd.h>
 #include <linux/export.h>
 
-/* Greatest common divisor */
+/*
+ * This implements the binary GCD algorithm. (Often attributed to Stein,
+ * but as Knuth has noted, appears in a first-century Chinese math text.)
+ *
+ * This is faster than the division-based algorithm even on x86, which
+ * has decent hardware division.
+ */
+
+#if !defined(CONFIG_CPU_NO_EFFICIENT_FFS)
+
+/* If __ffs is available, the even/odd algorithm benchmarks slower. */
 unsigned long gcd(unsigned long a, unsigned long b)
 {
-	unsigned long r;
+	unsigned long r = a | b;
+
+	if (!a || !b)
+		return r;
 
-	if (a < b)
-		swap(a, b);
+	b >>= __ffs(b);
+	if (b == 1)
+		return r & -r;
 
-	if (!b)
-		return a;
-	while ((r = a % b) != 0) {
-		a = b;
-		b = r;
+	for (;;) {
+		a >>= __ffs(a);
+		if (a == 1)
+			return r & -r;
+		if (a == b)
+			return a << __ffs(r);
+
+		if (a < b)
+			swap(a, b);
+		a -= b;
 	}
-	return b;
 }
+
+#else
+
+/* If normalization is done by loops, the even/odd algorithm is a win. */
+unsigned long gcd(unsigned long a, unsigned long b)
+{
+	unsigned long r = a | b;
+
+	if (!a || !b)
+		return r;
+
+	/* Isolate lsbit of r */
+	r &= -r;
+
+	while (!(b & r))
+		b >>= 1;
+	if (b == r)
+		return r;
+
+	for (;;) {
+		while (!(a & r))
+			a >>= 1;
+		if (a == r)
+			return r;
+		if (a == b)
+			return a;
+
+		if (a < b)
+			swap(a, b);
+		a -= b;
+		a >>= 1;
+		if (a & r)
+			a += b;
+		a >>= 1;
+	}
+}
+
+#endif
+
 EXPORT_SYMBOL_GPL(gcd);
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [patch V4] lib: GCD: Use binary GCD algorithm instead of Euclidean
@ 2016-05-06  9:42 ` zengzhaoxiu
  0 siblings, 0 replies; 41+ messages in thread
From: zengzhaoxiu @ 2016-05-06  9:42 UTC (permalink / raw)
  To: akpm, linux, peterz, jjuran, James.Bottomley, geert, dalias, sam, davem
  Cc: Zhaoxiu Zeng, Richard Henderson, Ivan Kokshaysky, Matt Turner,
	Vineet Gupta, Russell King, Yoshinori Sato, James Hogan,
	Michal Simek, Ralf Baechle, Ley Foon Tan, Jonas Bonn,
	James E.J. Bottomley, Helge Deller, Martin Schwidefsky,
	Heiko Carstens, Chen Liqin, Lennox Wu, linux-kernel, linux-alpha,
	linux-snps-arc, linux-arm-kernel, uclinux-h8-devel, linux-m68k,
	linux-metag, linux-mips, nios2-dev, linux, linux-parisc,
	linux-s390, linux-sh, sparclinux

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

The binary GCD algorithm is based on the following facts:
	1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2)
	2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
	3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)

Even on x86 machines with reasonable division hardware, the binary
algorithm runs about 25% faster (80% the execution time) than the
division-based Euclidian algorithm.

On platforms like Alpha and ARMv6 where division is a function call to
emulation code, it's even more significant.

There are two variants of the code here, depending on whether a
fast __ffs (find least significant set bit) instruction is available.
This allows the unpredictable branches in the bit-at-a-time shifting
loop to be eliminated.

If fast __ffs is not available, the "even/odd" GCD variant is used.

I use the following code to benchmark:

	#include <stdio.h>
	#include <stdlib.h>
	#include <stdint.h>
	#include <string.h>
	#include <time.h>
	#include <unistd.h>

	#define swap(a, b) \
		do { \
			a ^= b; \
			b ^= a; \
			a ^= b; \
		} while (0)

	unsigned long gcd0(unsigned long a, unsigned long b)
	{
		unsigned long r;

		if (a < b) {
			swap(a, b);
		}

		if (b == 0)
			return a;

		while ((r = a % b) != 0) {
			a = b;
			b = r;
		}

		return b;
	}

	unsigned long gcd1(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		b >>= __builtin_ctzl(b);

		for (;;) {
			a >>= __builtin_ctzl(a);
			if (a == b)
				return a << __builtin_ctzl(r);

			if (a < b)
				swap(a, b);
			a -= b;
		}
	}

	unsigned long gcd2(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		r &= -r;

		while (!(b & r))
			b >>= 1;

		for (;;) {
			while (!(a & r))
				a >>= 1;
			if (a == b)
				return a;

			if (a < b)
				swap(a, b);
			a -= b;
			a >>= 1;
			if (a & r)
				a += b;
			a >>= 1;
		}
	}

	unsigned long gcd3(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		b >>= __builtin_ctzl(b);
		if (b == 1)
			return r & -r;

		for (;;) {
			a >>= __builtin_ctzl(a);
			if (a == 1)
				return r & -r;
			if (a == b)
				return a << __builtin_ctzl(r);

			if (a < b)
				swap(a, b);
			a -= b;
		}
	}

	unsigned long gcd4(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		r &= -r;

		while (!(b & r))
			b >>= 1;
		if (b == r)
			return r;

		for (;;) {
			while (!(a & r))
				a >>= 1;
			if (a == r)
				return r;
			if (a == b)
				return a;

			if (a < b)
				swap(a, b);
			a -= b;
			a >>= 1;
			if (a & r)
				a += b;
			a >>= 1;
		}
	}

	static unsigned long (*gcd_func[])(unsigned long a, unsigned long b) = {
		gcd0, gcd1, gcd2, gcd3, gcd4,
	};

	#define TEST_ENTRIES (sizeof(gcd_func) / sizeof(gcd_func[0]))

	#if defined(__x86_64__)

	#define rdtscll(val) do { \
		unsigned long __a,__d; \
		__asm__ __volatile__("rdtsc" : "=a" (__a), "=d" (__d)); \
		(val) = ((unsigned long long)__a) | (((unsigned long long)__d)<<32); \
	} while(0)

	static unsigned long long benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long),
								unsigned long a, unsigned long b, unsigned long *res)
	{
		unsigned long long start, end;
		unsigned long long ret;
		unsigned long gcd_res;

		rdtscll(start);
		gcd_res = gcd(a, b);
		rdtscll(end);

		if (end >= start)
			ret = end - start;
		else
			ret = ~0ULL - start + 1 + end;

		*res = gcd_res;
		return ret;
	}

	#else

	static inline struct timespec read_time(void)
	{
		struct timespec time;
		clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time);
		return time;
	}

	static inline unsigned long long diff_time(struct timespec start, struct timespec end)
	{
		struct timespec temp;

		if ((end.tv_nsec - start.tv_nsec) < 0) {
			temp.tv_sec = end.tv_sec - start.tv_sec - 1;
			temp.tv_nsec = 1000000000ULL + end.tv_nsec - start.tv_nsec;
		} else {
			temp.tv_sec = end.tv_sec - start.tv_sec;
			temp.tv_nsec = end.tv_nsec - start.tv_nsec;
		}

		return temp.tv_sec * 1000000000ULL + temp.tv_nsec;
	}

	static unsigned long long benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long),
								unsigned long a, unsigned long b, unsigned long *res)
	{
		struct timespec start, end;
		unsigned long gcd_res;

		start = read_time();
		gcd_res = gcd(a, b);
		end = read_time();

		*res = gcd_res;
		return diff_time(start, end);
	}

	#endif

	static inline unsigned long get_rand()
	{
		if (sizeof(long) == 8)
			return (unsigned long)rand() << 32 | rand();
		else
			return rand();
	}

	int main(int argc, char **argv)
	{
		unsigned int seed = time(0);
		int loops = 100;
		int repeats = 1000;
		unsigned long (*res)[TEST_ENTRIES];
		unsigned long long elapsed[TEST_ENTRIES];
		int i, j, k;

		for (;;) {
			int opt = getopt(argc, argv, "n:r:s:");
			/* End condition always first */
			if (opt == -1)
				break;

			switch (opt) {
			case 'n':
				loops = atoi(optarg);
				break;
			case 'r':
				repeats = atoi(optarg);
				break;
			case 's':
				seed = strtoul(optarg, NULL, 10);
				break;
			default:
				/* You won't actually get here. */
				break;
			}
		}

		res = malloc(sizeof(unsigned long) * TEST_ENTRIES * loops);
		memset(elapsed, 0, sizeof(elapsed));

		srand(seed);
		for (j = 0; j < loops; j++) {
			unsigned long a = get_rand();
			/* Do we have args? */
			unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
			unsigned long long min_elapsed[TEST_ENTRIES];
			for (k = 0; k < repeats; k++) {
				for (i = 0; i < TEST_ENTRIES; i++) {
					unsigned long long tmp = benchmark_gcd_func(gcd_func[i], a, b, &res[j][i]);
					if (k == 0 || min_elapsed[i] > tmp)
						min_elapsed[i] = tmp;
				}
			}
			for (i = 0; i < TEST_ENTRIES; i++)
				elapsed[i] += min_elapsed[i];
		}

		for (i = 0; i < TEST_ENTRIES; i++)
			printf("gcd%d: elapsed %llu\n", i, elapsed[i]);

		k = 0;
		srand(seed);
		for (j = 0; j < loops; j++) {
			unsigned long a = get_rand();
			unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
			for (i = 1; i < TEST_ENTRIES; i++) {
				if (res[j][i] != res[j][0])
					break;
			}
			if (i < TEST_ENTRIES) {
				if (k == 0) {
					k = 1;
					fprintf(stderr, "Error:\n");
				}
				fprintf(stderr, "gcd(%lu, %lu): ", a, b);
				for (i = 0; i < TEST_ENTRIES; i++)
					fprintf(stderr, "%ld%s", res[j][i], i < TEST_ENTRIES - 1 ? ", " : "\n");
			}
		}

		if (k == 0)
			fprintf(stderr, "PASS\n");

		free(res);

		return 0;
	}

Compiled with "-O2", on "VirtualBox 4.4.0-22-generic #38-Ubuntu x86_64" got:

zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
gcd0: elapsed 10174
gcd1: elapsed 2120
gcd2: elapsed 2902
gcd3: elapsed 2039
gcd4: elapsed 2812
PASS
zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
gcd0: elapsed 9309
gcd1: elapsed 2280
gcd2: elapsed 2822
gcd3: elapsed 2217
gcd4: elapsed 2710
PASS
zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
gcd0: elapsed 9589
gcd1: elapsed 2098
gcd2: elapsed 2815
gcd3: elapsed 2030
gcd4: elapsed 2718
PASS
zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
gcd0: elapsed 9914
gcd1: elapsed 2309
gcd2: elapsed 2779
gcd3: elapsed 2228
gcd4: elapsed 2709
PASS

Changes to V3:
- Fix build error
- Select CPU_NO_EFFICIENT_FFS if CONFIG_ARC && CONFIG_ISA_ARCOMPACT
- Select CPU_NO_EFFICIENT_FFS if CONFIG_S390 && !CONFIG_HAVE_MARCH_Z9_109_FEATURES
- Do new brnchmark
- Return immediately if one number becomes a power of 2
- Add comments written by George Spelvin

Changes to V2:
- Add a new Kconfig variable CPU_NO_EFFICIENT_FFS
- Separate into two versions by CPU_NO_EFFICIENT_FFS
- Return directly from the loop, rather than using break().
- Use "r &= -r" mostly because it's clearer.

Changes to V1:
- Don't touch Kconfig, remove the Euclidean algorithm implementation
- Don't use the "even-odd" variant
- Use __ffs if the CPU has efficient __ffs

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
Signed-off-by: George Spelvin <linux@horizon.com>
---
 arch/Kconfig                         |  3 ++
 arch/alpha/Kconfig                   |  1 +
 arch/arc/Kconfig                     |  1 +
 arch/arm/mm/Kconfig                  |  3 ++
 arch/h8300/Kconfig                   |  1 +
 arch/m32r/Kconfig                    |  1 +
 arch/m68k/Kconfig.cpu                | 11 ++++++
 arch/metag/Kconfig                   |  1 +
 arch/microblaze/Kconfig              |  1 +
 arch/mips/include/asm/cpu-features.h | 10 +++++
 arch/nios2/Kconfig                   |  1 +
 arch/openrisc/Kconfig                |  1 +
 arch/parisc/Kconfig                  |  1 +
 arch/s390/Kconfig                    |  2 +-
 arch/score/Kconfig                   |  1 +
 arch/sh/Kconfig                      |  1 +
 arch/sparc/Kconfig                   |  1 +
 lib/gcd.c                            | 77 +++++++++++++++++++++++++++++++-----
 18 files changed, 107 insertions(+), 11 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 81869a5..275f17d 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -638,4 +638,7 @@ config COMPAT_OLD_SIGACTION
 config ARCH_NO_COHERENT_DMA_MMAP
 	bool
 
+config CPU_NO_EFFICIENT_FFS
+	def_bool n
+
 source "kernel/gcov/Kconfig"
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 9d8a858..44e6f05 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -27,6 +27,7 @@ config ALPHA
 	select MODULES_USE_ELF_RELA
 	select ODD_RT_SIGACTION
 	select OLD_SIGSUSPEND
+	select CPU_NO_EFFICIENT_FFS if !ALPHA_EV67
 	help
 	  The Alpha is a 64-bit general-purpose processor designed and
 	  marketed by the Digital Equipment Corporation of blessed memory,
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index ec4791e..f41eb4c 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -101,6 +101,7 @@ choice
 
 config ISA_ARCOMPACT
 	bool "ARCompact ISA"
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  The original ARC ISA of ARC600/700 cores
 
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index 5534766..cb569b6 100644
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -421,18 +421,21 @@ config CPU_32v3
 	select CPU_USE_DOMAINS if MMU
 	select NEED_KUSER_HELPERS
 	select TLS_REG_EMUL if SMP || !MMU
+	select CPU_NO_EFFICIENT_FFS
 
 config CPU_32v4
 	bool
 	select CPU_USE_DOMAINS if MMU
 	select NEED_KUSER_HELPERS
 	select TLS_REG_EMUL if SMP || !MMU
+	select CPU_NO_EFFICIENT_FFS
 
 config CPU_32v4T
 	bool
 	select CPU_USE_DOMAINS if MMU
 	select NEED_KUSER_HELPERS
 	select TLS_REG_EMUL if SMP || !MMU
+	select CPU_NO_EFFICIENT_FFS
 
 config CPU_32v5
 	bool
diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig
index 986ea84..aa232de 100644
--- a/arch/h8300/Kconfig
+++ b/arch/h8300/Kconfig
@@ -20,6 +20,7 @@ config H8300
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_LZO
 	select HAVE_ARCH_KGDB
+	select CPU_NO_EFFICIENT_FFS
 
 config RWSEM_GENERIC_SPINLOCK
 	def_bool y
diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig
index c82b292..3cc8498 100644
--- a/arch/m32r/Kconfig
+++ b/arch/m32r/Kconfig
@@ -17,6 +17,7 @@ config M32R
 	select ARCH_USES_GETTIMEOFFSET
 	select MODULES_USE_ELF_RELA
 	select HAVE_DEBUG_STACKOVERFLOW
+	select CPU_NO_EFFICIENT_FFS
 
 config SBUS
 	bool
diff --git a/arch/m68k/Kconfig.cpu b/arch/m68k/Kconfig.cpu
index 0dfcf12..0b6efe8 100644
--- a/arch/m68k/Kconfig.cpu
+++ b/arch/m68k/Kconfig.cpu
@@ -40,6 +40,7 @@ config M68000
 	select CPU_HAS_NO_MULDIV64
 	select CPU_HAS_NO_UNALIGNED
 	select GENERIC_CSUM
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  The Freescale (was Motorola) 68000 CPU is the first generation of
 	  the well known M68K family of processors. The CPU core as well as
@@ -51,6 +52,7 @@ config MCPU32
 	bool
 	select CPU_HAS_NO_BITFIELDS
 	select CPU_HAS_NO_UNALIGNED
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  The Freescale (was then Motorola) CPU32 is a CPU core that is
 	  based on the 68020 processor. For the most part it is used in
@@ -130,6 +132,7 @@ config M5206
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5206 processor support.
 
@@ -138,6 +141,7 @@ config M5206e
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5206e processor support.
 
@@ -163,6 +167,7 @@ config M5249
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5249 processor support.
 
@@ -171,6 +176,7 @@ config M525x
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Freescale (Motorola) Coldfire 5251/5253 processor support.
 
@@ -189,6 +195,7 @@ config M5272
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5272 processor support.
 
@@ -217,6 +224,7 @@ config M5307
 	select COLDFIRE_SW_A7
 	select HAVE_CACHE_CB
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5307 processor support.
 
@@ -242,6 +250,7 @@ config M5407
 	select COLDFIRE_SW_A7
 	select HAVE_CACHE_CB
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5407 processor support.
 
@@ -251,6 +260,7 @@ config M547x
 	select MMU_COLDFIRE if MMU
 	select HAVE_CACHE_CB
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Freescale ColdFire 5470/5471/5472/5473/5474/5475 processor support.
 
@@ -260,6 +270,7 @@ config M548x
 	select M54xx
 	select HAVE_CACHE_CB
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Freescale ColdFire 5480/5481/5482/5483/5484/5485 processor support.
 
diff --git a/arch/metag/Kconfig b/arch/metag/Kconfig
index a0fa88d..2ac2de6 100644
--- a/arch/metag/Kconfig
+++ b/arch/metag/Kconfig
@@ -29,6 +29,7 @@ config METAG
 	select OF
 	select OF_EARLY_FLATTREE
 	select SPARSE_IRQ
+	select CPU_NO_EFFICIENT_FFS
 
 config STACKTRACE_SUPPORT
 	def_bool y
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index 3d793b5..f17c3a4 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -32,6 +32,7 @@ config MICROBLAZE
 	select OF_EARLY_FLATTREE
 	select TRACING_SUPPORT
 	select VIRT_TO_BUS
+	select CPU_NO_EFFICIENT_FFS
 
 config SWAP
 	def_bool n
diff --git a/arch/mips/include/asm/cpu-features.h b/arch/mips/include/asm/cpu-features.h
index eeec8c8..e20e100 100644
--- a/arch/mips/include/asm/cpu-features.h
+++ b/arch/mips/include/asm/cpu-features.h
@@ -180,6 +180,16 @@
 #endif
 #endif
 
+/* __builtin_constant_p(cpu_has_mips_r) && cpu_has_mips_r */
+#if !((defined(cpu_has_mips32r1) && cpu_has_mips32r1) || \
+	  (defined(cpu_has_mips32r2) && cpu_has_mips32r2) || \
+	  (defined(cpu_has_mips32r6) && cpu_has_mips32r6) || \
+	  (defined(cpu_has_mips64r1) && cpu_has_mips64r1) || \
+	  (defined(cpu_has_mips64r2) && cpu_has_mips64r2) || \
+	  (defined(cpu_has_mips64r6) && cpu_has_mips64r6))
+#define CONFIG_CPU_NO_EFFICIENT_FFS 1
+#endif
+
 #ifndef cpu_has_mips_1
 # define cpu_has_mips_1		(!cpu_has_mips_r6)
 #endif
diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig
index 4375554..f10bd2c 100644
--- a/arch/nios2/Kconfig
+++ b/arch/nios2/Kconfig
@@ -16,6 +16,7 @@ config NIOS2
 	select SOC_BUS
 	select SPARSE_IRQ
 	select USB_ARCH_HAS_HCD if USB_SUPPORT
+	select CPU_NO_EFFICIENT_FFS
 
 config GENERIC_CSUM
 	def_bool y
diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig
index e118c02..142cb05 100644
--- a/arch/openrisc/Kconfig
+++ b/arch/openrisc/Kconfig
@@ -25,6 +25,7 @@ config OPENRISC
 	select MODULES_USE_ELF_RELA
 	select HAVE_DEBUG_STACKOVERFLOW
 	select OR1K_PIC
+	select CPU_NO_EFFICIENT_FFS if !OPENRISC_HAVE_INST_FF1
 
 config MMU
 	def_bool y
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 88cfaa8..3d498a6 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -32,6 +32,7 @@ config PARISC
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_ARCH_SECCOMP_FILTER
 	select ARCH_NO_COHERENT_DMA_MMAP
+	select CPU_NO_EFFICIENT_FFS
 
 	help
 	  The PA-RISC microprocessor is designed by Hewlett-Packard and used
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index bf24ab1..9eb3932 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -164,7 +164,7 @@ config S390
 	select TTY
 	select VIRT_CPU_ACCOUNTING
 	select VIRT_TO_BUS
-
+	select CPU_NO_EFFICIENT_FFS if !HAVE_MARCH_Z9_109_FEATURES
 
 config SCHED_OMIT_FRAME_POINTER
 	def_bool y
diff --git a/arch/score/Kconfig b/arch/score/Kconfig
index 366e1b5..507d631 100644
--- a/arch/score/Kconfig
+++ b/arch/score/Kconfig
@@ -14,6 +14,7 @@ config SCORE
 	select VIRT_TO_BUS
 	select MODULES_USE_ELF_REL
 	select CLONE_BACKWARDS
+	select CPU_NO_EFFICIENT_FFS
 
 choice
 	prompt "System type"
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 7ed20fc..56cf5e5 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -44,6 +44,7 @@ config SUPERH
 	select OLD_SIGSUSPEND
 	select OLD_SIGACTION
 	select HAVE_ARCH_AUDITSYSCALL
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  The SuperH is a RISC processor targeted for use in embedded systems
 	  and consumer electronics; it was also used in the Sega Dreamcast
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 57ffaf2..ca675ed 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -42,6 +42,7 @@ config SPARC
 	select ODD_RT_SIGACTION
 	select OLD_SIGSUSPEND
 	select ARCH_HAS_SG_CHAIN
+	select CPU_NO_EFFICIENT_FFS
 
 config SPARC32
 	def_bool !64BIT
diff --git a/lib/gcd.c b/lib/gcd.c
index 3657f12..b9fbe73 100644
--- a/lib/gcd.c
+++ b/lib/gcd.c
@@ -2,20 +2,77 @@
 #include <linux/gcd.h>
 #include <linux/export.h>
 
-/* Greatest common divisor */
+/*
+ * This implements the binary GCD algorithm. (Often attributed to Stein,
+ * but as Knuth has noted, appears in a first-century Chinese math text.)
+ *
+ * This is faster than the division-based algorithm even on x86, which
+ * has decent hardware division.
+ */
+
+#if !defined(CONFIG_CPU_NO_EFFICIENT_FFS)
+
+/* If __ffs is available, the even/odd algorithm benchmarks slower. */
 unsigned long gcd(unsigned long a, unsigned long b)
 {
-	unsigned long r;
+	unsigned long r = a | b;
+
+	if (!a || !b)
+		return r;
 
-	if (a < b)
-		swap(a, b);
+	b >>= __ffs(b);
+	if (b == 1)
+		return r & -r;
 
-	if (!b)
-		return a;
-	while ((r = a % b) != 0) {
-		a = b;
-		b = r;
+	for (;;) {
+		a >>= __ffs(a);
+		if (a == 1)
+			return r & -r;
+		if (a == b)
+			return a << __ffs(r);
+
+		if (a < b)
+			swap(a, b);
+		a -= b;
 	}
-	return b;
 }
+
+#else
+
+/* If normalization is done by loops, the even/odd algorithm is a win. */
+unsigned long gcd(unsigned long a, unsigned long b)
+{
+	unsigned long r = a | b;
+
+	if (!a || !b)
+		return r;
+
+	/* Isolate lsbit of r */
+	r &= -r;
+
+	while (!(b & r))
+		b >>= 1;
+	if (b == r)
+		return r;
+
+	for (;;) {
+		while (!(a & r))
+			a >>= 1;
+		if (a == r)
+			return r;
+		if (a == b)
+			return a;
+
+		if (a < b)
+			swap(a, b);
+		a -= b;
+		a >>= 1;
+		if (a & r)
+			a += b;
+		a >>= 1;
+	}
+}
+
+#endif
+
 EXPORT_SYMBOL_GPL(gcd);
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 41+ messages in thread
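
A minimal kernel-side usage sketch (a hypothetical caller, not part of the patch): the
exported gcd() is typically used to reduce a fraction to lowest terms, and since the
prototype declared in <linux/gcd.h> is untouched by this change, existing callers keep
working unmodified.

	#include <linux/gcd.h>

	/*
	 * Hypothetical example: reduce num/den to lowest terms, e.g. for a
	 * pixel-clock or frame-rate ratio.  gcd(0, 0) returns 0 with either
	 * implementation, hence the check before dividing.
	 */
	static void reduce_ratio(unsigned long *num, unsigned long *den)
	{
		unsigned long g = gcd(*num, *den);

		if (g) {
			*num /= g;
			*den /= g;
		}
	}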

* [patch V4] lib: GCD: Use binary GCD algorithm instead of Euclidean
@ 2016-05-06  9:42 ` zengzhaoxiu
  0 siblings, 0 replies; 41+ messages in thread
From: zengzhaoxiu at 163.com @ 2016-05-06  9:42 UTC (permalink / raw)
  To: linux-arm-kernel

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

The binary GCD algorithm is based on the following facts:
	1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2)
	2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
	3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)

Even on x86 machines with reasonable division hardware, the binary
algorithm runs about 25% faster (80% the execution time) than the
division-based Euclidian algorithm.

On platforms like Alpha and ARMv6 where division is a function call to
emulation code, it's even more significant.

There are two variants of the code here, depending on whether a
fast __ffs (find least significant set bit) instruction is available.
This allows the unpredictable branches in the bit-at-a-time shifting
loop to be eliminated.

If fast __ffs is not available, the "even/odd" GCD variant is used.

I use the following code to benchmark:

	#include <stdio.h>
	#include <stdlib.h>
	#include <stdint.h>
	#include <string.h>
	#include <time.h>
	#include <unistd.h>

	#define swap(a, b) \
		do { \
			a ^= b; \
			b ^= a; \
			a ^= b; \
		} while (0)

	unsigned long gcd0(unsigned long a, unsigned long b)
	{
		unsigned long r;

		if (a < b) {
			swap(a, b);
		}

		if (b == 0)
			return a;

		while ((r = a % b) != 0) {
			a = b;
			b = r;
		}

		return b;
	}

	unsigned long gcd1(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		b >>= __builtin_ctzl(b);

		for (;;) {
			a >>= __builtin_ctzl(a);
			if (a == b)
				return a << __builtin_ctzl(r);

			if (a < b)
				swap(a, b);
			a -= b;
		}
	}

	unsigned long gcd2(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		r &= -r;

		while (!(b & r))
			b >>= 1;

		for (;;) {
			while (!(a & r))
				a >>= 1;
			if (a == b)
				return a;

			if (a < b)
				swap(a, b);
			a -= b;
			a >>= 1;
			if (a & r)
				a += b;
			a >>= 1;
		}
	}

	unsigned long gcd3(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		b >>= __builtin_ctzl(b);
		if (b == 1)
			return r & -r;

		for (;;) {
			a >>= __builtin_ctzl(a);
			if (a == 1)
				return r & -r;
			if (a == b)
				return a << __builtin_ctzl(r);

			if (a < b)
				swap(a, b);
			a -= b;
		}
	}

	unsigned long gcd4(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		r &= -r;

		while (!(b & r))
			b >>= 1;
		if (b == r)
			return r;

		for (;;) {
			while (!(a & r))
				a >>= 1;
			if (a == r)
				return r;
			if (a == b)
				return a;

			if (a < b)
				swap(a, b);
			a -= b;
			a >>= 1;
			if (a & r)
				a += b;
			a >>= 1;
		}
	}

	static unsigned long (*gcd_func[])(unsigned long a, unsigned long b) = {
		gcd0, gcd1, gcd2, gcd3, gcd4,
	};

	#define TEST_ENTRIES (sizeof(gcd_func) / sizeof(gcd_func[0]))

	#if defined(__x86_64__)

	#define rdtscll(val) do { \
		unsigned long __a,__d; \
		__asm__ __volatile__("rdtsc" : "=a" (__a), "=d" (__d)); \
		(val) = ((unsigned long long)__a) | (((unsigned long long)__d)<<32); \
	} while(0)

	static unsigned long long benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long),
								unsigned long a, unsigned long b, unsigned long *res)
	{
		unsigned long long start, end;
		unsigned long long ret;
		unsigned long gcd_res;

		rdtscll(start);
		gcd_res = gcd(a, b);
		rdtscll(end);

		if (end >= start)
			ret = end - start;
		else
			ret = ~0ULL - start + 1 + end;

		*res = gcd_res;
		return ret;
	}

	#else

	static inline struct timespec read_time(void)
	{
		struct timespec time;
		clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time);
		return time;
	}

	static inline unsigned long long diff_time(struct timespec start, struct timespec end)
	{
		struct timespec temp;

		if ((end.tv_nsec - start.tv_nsec) < 0) {
			temp.tv_sec = end.tv_sec - start.tv_sec - 1;
			temp.tv_nsec = 1000000000ULL + end.tv_nsec - start.tv_nsec;
		} else {
			temp.tv_sec = end.tv_sec - start.tv_sec;
			temp.tv_nsec = end.tv_nsec - start.tv_nsec;
		}

		return temp.tv_sec * 1000000000ULL + temp.tv_nsec;
	}

	static unsigned long long benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long),
								unsigned long a, unsigned long b, unsigned long *res)
	{
		struct timespec start, end;
		unsigned long gcd_res;

		start = read_time();
		gcd_res = gcd(a, b);
		end = read_time();

		*res = gcd_res;
		return diff_time(start, end);
	}

	#endif

	static inline unsigned long get_rand()
	{
		if (sizeof(long) == 8)
			return (unsigned long)rand() << 32 | rand();
		else
			return rand();
	}

	int main(int argc, char **argv)
	{
		unsigned int seed = time(0);
		int loops = 100;
		int repeats = 1000;
		unsigned long (*res)[TEST_ENTRIES];
		unsigned long long elapsed[TEST_ENTRIES];
		int i, j, k;

		for (;;) {
			int opt = getopt(argc, argv, "n:r:s:");
			/* End condition always first */
			if (opt == -1)
				break;

			switch (opt) {
			case 'n':
				loops = atoi(optarg);
				break;
			case 'r':
				repeats = atoi(optarg);
				break;
			case 's':
				seed = strtoul(optarg, NULL, 10);
				break;
			default:
				/* You won't actually get here. */
				break;
			}
		}

		res = malloc(sizeof(unsigned long) * TEST_ENTRIES * loops);
		memset(elapsed, 0, sizeof(elapsed));

		srand(seed);
		for (j = 0; j < loops; j++) {
			unsigned long a = get_rand();
			/* Do we have args? */
			unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
			unsigned long long min_elapsed[TEST_ENTRIES];
			for (k = 0; k < repeats; k++) {
				for (i = 0; i < TEST_ENTRIES; i++) {
					unsigned long long tmp = benchmark_gcd_func(gcd_func[i], a, b, &res[j][i]);
					if (k == 0 || min_elapsed[i] > tmp)
						min_elapsed[i] = tmp;
				}
			}
			for (i = 0; i < TEST_ENTRIES; i++)
				elapsed[i] += min_elapsed[i];
		}

		for (i = 0; i < TEST_ENTRIES; i++)
			printf("gcd%d: elapsed %llu\n", i, elapsed[i]);

		k = 0;
		srand(seed);
		for (j = 0; j < loops; j++) {
			unsigned long a = get_rand();
			unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
			for (i = 1; i < TEST_ENTRIES; i++) {
				if (res[j][i] != res[j][0])
					break;
			}
			if (i < TEST_ENTRIES) {
				if (k == 0) {
					k = 1;
					fprintf(stderr, "Error:\n");
				}
				fprintf(stderr, "gcd(%lu, %lu): ", a, b);
				for (i = 0; i < TEST_ENTRIES; i++)
					fprintf(stderr, "%ld%s", res[j][i], i < TEST_ENTRIES - 1 ? ", " : "\n");
			}
		}

		if (k == 0)
			fprintf(stderr, "PASS\n");

		free(res);

		return 0;
	}

Compiled with "-O2", on "VirtualBox 4.4.0-22-generic #38-Ubuntu x86_64" got:

zhaoxiuzeng at zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
gcd0: elapsed 10174
gcd1: elapsed 2120
gcd2: elapsed 2902
gcd3: elapsed 2039
gcd4: elapsed 2812
PASS
zhaoxiuzeng at zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
gcd0: elapsed 9309
gcd1: elapsed 2280
gcd2: elapsed 2822
gcd3: elapsed 2217
gcd4: elapsed 2710
PASS
zhaoxiuzeng at zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
gcd0: elapsed 9589
gcd1: elapsed 2098
gcd2: elapsed 2815
gcd3: elapsed 2030
gcd4: elapsed 2718
PASS
zhaoxiuzeng at zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
gcd0: elapsed 9914
gcd1: elapsed 2309
gcd2: elapsed 2779
gcd3: elapsed 2228
gcd4: elapsed 2709
PASS

Changes to V3:
- Fix build error
- Select CPU_NO_EFFICIENT_FFS if CONFIG_ARC && CONFIG_ISA_ARCOMPACT
- Select CPU_NO_EFFICIENT_FFS if CONFIG_S390 && !CONFIG_HAVE_MARCH_Z9_109_FEATURES
- Do new brnchmark
- Return immediately if one number becomes a power of 2
- Add comments written by George Spelvin

Changes to V2:
- Add a new Kconfig variable CPU_NO_EFFICIENT_FFS
- Separate into two versions by CPU_NO_EFFICIENT_FFS
- Return directly from the loop, rather than using break().
- Use "r &= -r" mostly because it's clearer.

Changes to V1:
- Don't touch Kconfig, remove the Euclidean algorithm implementation
- Don't use the "even-odd" variant
- Use __ffs if the CPU has efficient __ffs

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
Signed-off-by: George Spelvin <linux@horizon.com>
---
 arch/Kconfig                         |  3 ++
 arch/alpha/Kconfig                   |  1 +
 arch/arc/Kconfig                     |  1 +
 arch/arm/mm/Kconfig                  |  3 ++
 arch/h8300/Kconfig                   |  1 +
 arch/m32r/Kconfig                    |  1 +
 arch/m68k/Kconfig.cpu                | 11 ++++++
 arch/metag/Kconfig                   |  1 +
 arch/microblaze/Kconfig              |  1 +
 arch/mips/include/asm/cpu-features.h | 10 +++++
 arch/nios2/Kconfig                   |  1 +
 arch/openrisc/Kconfig                |  1 +
 arch/parisc/Kconfig                  |  1 +
 arch/s390/Kconfig                    |  2 +-
 arch/score/Kconfig                   |  1 +
 arch/sh/Kconfig                      |  1 +
 arch/sparc/Kconfig                   |  1 +
 lib/gcd.c                            | 77 +++++++++++++++++++++++++++++++-----
 18 files changed, 107 insertions(+), 11 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 81869a5..275f17d 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -638,4 +638,7 @@ config COMPAT_OLD_SIGACTION
 config ARCH_NO_COHERENT_DMA_MMAP
 	bool
 
+config CPU_NO_EFFICIENT_FFS
+	def_bool n
+
 source "kernel/gcov/Kconfig"
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 9d8a858..44e6f05 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -27,6 +27,7 @@ config ALPHA
 	select MODULES_USE_ELF_RELA
 	select ODD_RT_SIGACTION
 	select OLD_SIGSUSPEND
+	select CPU_NO_EFFICIENT_FFS if !ALPHA_EV67
 	help
 	  The Alpha is a 64-bit general-purpose processor designed and
 	  marketed by the Digital Equipment Corporation of blessed memory,
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index ec4791e..f41eb4c 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -101,6 +101,7 @@ choice
 
 config ISA_ARCOMPACT
 	bool "ARCompact ISA"
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  The original ARC ISA of ARC600/700 cores
 
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index 5534766..cb569b6 100644
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -421,18 +421,21 @@ config CPU_32v3
 	select CPU_USE_DOMAINS if MMU
 	select NEED_KUSER_HELPERS
 	select TLS_REG_EMUL if SMP || !MMU
+	select CPU_NO_EFFICIENT_FFS
 
 config CPU_32v4
 	bool
 	select CPU_USE_DOMAINS if MMU
 	select NEED_KUSER_HELPERS
 	select TLS_REG_EMUL if SMP || !MMU
+	select CPU_NO_EFFICIENT_FFS
 
 config CPU_32v4T
 	bool
 	select CPU_USE_DOMAINS if MMU
 	select NEED_KUSER_HELPERS
 	select TLS_REG_EMUL if SMP || !MMU
+	select CPU_NO_EFFICIENT_FFS
 
 config CPU_32v5
 	bool
diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig
index 986ea84..aa232de 100644
--- a/arch/h8300/Kconfig
+++ b/arch/h8300/Kconfig
@@ -20,6 +20,7 @@ config H8300
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_LZO
 	select HAVE_ARCH_KGDB
+	select CPU_NO_EFFICIENT_FFS
 
 config RWSEM_GENERIC_SPINLOCK
 	def_bool y
diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig
index c82b292..3cc8498 100644
--- a/arch/m32r/Kconfig
+++ b/arch/m32r/Kconfig
@@ -17,6 +17,7 @@ config M32R
 	select ARCH_USES_GETTIMEOFFSET
 	select MODULES_USE_ELF_RELA
 	select HAVE_DEBUG_STACKOVERFLOW
+	select CPU_NO_EFFICIENT_FFS
 
 config SBUS
 	bool
diff --git a/arch/m68k/Kconfig.cpu b/arch/m68k/Kconfig.cpu
index 0dfcf12..0b6efe8 100644
--- a/arch/m68k/Kconfig.cpu
+++ b/arch/m68k/Kconfig.cpu
@@ -40,6 +40,7 @@ config M68000
 	select CPU_HAS_NO_MULDIV64
 	select CPU_HAS_NO_UNALIGNED
 	select GENERIC_CSUM
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  The Freescale (was Motorola) 68000 CPU is the first generation of
 	  the well known M68K family of processors. The CPU core as well as
@@ -51,6 +52,7 @@ config MCPU32
 	bool
 	select CPU_HAS_NO_BITFIELDS
 	select CPU_HAS_NO_UNALIGNED
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  The Freescale (was then Motorola) CPU32 is a CPU core that is
 	  based on the 68020 processor. For the most part it is used in
@@ -130,6 +132,7 @@ config M5206
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5206 processor support.
 
@@ -138,6 +141,7 @@ config M5206e
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5206e processor support.
 
@@ -163,6 +167,7 @@ config M5249
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5249 processor support.
 
@@ -171,6 +176,7 @@ config M525x
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Freescale (Motorola) Coldfire 5251/5253 processor support.
 
@@ -189,6 +195,7 @@ config M5272
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5272 processor support.
 
@@ -217,6 +224,7 @@ config M5307
 	select COLDFIRE_SW_A7
 	select HAVE_CACHE_CB
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5307 processor support.
 
@@ -242,6 +250,7 @@ config M5407
 	select COLDFIRE_SW_A7
 	select HAVE_CACHE_CB
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5407 processor support.
 
@@ -251,6 +260,7 @@ config M547x
 	select MMU_COLDFIRE if MMU
 	select HAVE_CACHE_CB
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Freescale ColdFire 5470/5471/5472/5473/5474/5475 processor support.
 
@@ -260,6 +270,7 @@ config M548x
 	select M54xx
 	select HAVE_CACHE_CB
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Freescale ColdFire 5480/5481/5482/5483/5484/5485 processor support.
 
diff --git a/arch/metag/Kconfig b/arch/metag/Kconfig
index a0fa88d..2ac2de6 100644
--- a/arch/metag/Kconfig
+++ b/arch/metag/Kconfig
@@ -29,6 +29,7 @@ config METAG
 	select OF
 	select OF_EARLY_FLATTREE
 	select SPARSE_IRQ
+	select CPU_NO_EFFICIENT_FFS
 
 config STACKTRACE_SUPPORT
 	def_bool y
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index 3d793b5..f17c3a4 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -32,6 +32,7 @@ config MICROBLAZE
 	select OF_EARLY_FLATTREE
 	select TRACING_SUPPORT
 	select VIRT_TO_BUS
+	select CPU_NO_EFFICIENT_FFS
 
 config SWAP
 	def_bool n
diff --git a/arch/mips/include/asm/cpu-features.h b/arch/mips/include/asm/cpu-features.h
index eeec8c8..e20e100 100644
--- a/arch/mips/include/asm/cpu-features.h
+++ b/arch/mips/include/asm/cpu-features.h
@@ -180,6 +180,16 @@
 #endif
 #endif
 
+/* __builtin_constant_p(cpu_has_mips_r) && cpu_has_mips_r */
+#if !((defined(cpu_has_mips32r1) && cpu_has_mips32r1) || \
+	  (defined(cpu_has_mips32r2) && cpu_has_mips32r2) || \
+	  (defined(cpu_has_mips32r6) && cpu_has_mips32r6) || \
+	  (defined(cpu_has_mips64r1) && cpu_has_mips64r1) || \
+	  (defined(cpu_has_mips64r2) && cpu_has_mips64r2) || \
+	  (defined(cpu_has_mips64r6) && cpu_has_mips64r6))
+#define CONFIG_CPU_NO_EFFICIENT_FFS 1
+#endif
+
 #ifndef cpu_has_mips_1
 # define cpu_has_mips_1		(!cpu_has_mips_r6)
 #endif
diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig
index 4375554..f10bd2c 100644
--- a/arch/nios2/Kconfig
+++ b/arch/nios2/Kconfig
@@ -16,6 +16,7 @@ config NIOS2
 	select SOC_BUS
 	select SPARSE_IRQ
 	select USB_ARCH_HAS_HCD if USB_SUPPORT
+	select CPU_NO_EFFICIENT_FFS
 
 config GENERIC_CSUM
 	def_bool y
diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig
index e118c02..142cb05 100644
--- a/arch/openrisc/Kconfig
+++ b/arch/openrisc/Kconfig
@@ -25,6 +25,7 @@ config OPENRISC
 	select MODULES_USE_ELF_RELA
 	select HAVE_DEBUG_STACKOVERFLOW
 	select OR1K_PIC
+	select CPU_NO_EFFICIENT_FFS if !OPENRISC_HAVE_INST_FF1
 
 config MMU
 	def_bool y
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 88cfaa8..3d498a6 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -32,6 +32,7 @@ config PARISC
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_ARCH_SECCOMP_FILTER
 	select ARCH_NO_COHERENT_DMA_MMAP
+	select CPU_NO_EFFICIENT_FFS
 
 	help
 	  The PA-RISC microprocessor is designed by Hewlett-Packard and used
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index bf24ab1..9eb3932 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -164,7 +164,7 @@ config S390
 	select TTY
 	select VIRT_CPU_ACCOUNTING
 	select VIRT_TO_BUS
-
+	select CPU_NO_EFFICIENT_FFS if !HAVE_MARCH_Z9_109_FEATURES
 
 config SCHED_OMIT_FRAME_POINTER
 	def_bool y
diff --git a/arch/score/Kconfig b/arch/score/Kconfig
index 366e1b5..507d631 100644
--- a/arch/score/Kconfig
+++ b/arch/score/Kconfig
@@ -14,6 +14,7 @@ config SCORE
 	select VIRT_TO_BUS
 	select MODULES_USE_ELF_REL
 	select CLONE_BACKWARDS
+	select CPU_NO_EFFICIENT_FFS
 
 choice
 	prompt "System type"
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 7ed20fc..56cf5e5 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -44,6 +44,7 @@ config SUPERH
 	select OLD_SIGSUSPEND
 	select OLD_SIGACTION
 	select HAVE_ARCH_AUDITSYSCALL
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  The SuperH is a RISC processor targeted for use in embedded systems
 	  and consumer electronics; it was also used in the Sega Dreamcast
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 57ffaf2..ca675ed 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -42,6 +42,7 @@ config SPARC
 	select ODD_RT_SIGACTION
 	select OLD_SIGSUSPEND
 	select ARCH_HAS_SG_CHAIN
+	select CPU_NO_EFFICIENT_FFS
 
 config SPARC32
 	def_bool !64BIT
diff --git a/lib/gcd.c b/lib/gcd.c
index 3657f12..b9fbe73 100644
--- a/lib/gcd.c
+++ b/lib/gcd.c
@@ -2,20 +2,77 @@
 #include <linux/gcd.h>
 #include <linux/export.h>
 
-/* Greatest common divisor */
+/*
+ * This implements the binary GCD algorithm. (Often attributed to Stein,
+ * but as Knuth has noted, appears in a first-century Chinese math text.)
+ *
+ * This is faster than the division-based algorithm even on x86, which
+ * has decent hardware division.
+ */
+
+#if !defined(CONFIG_CPU_NO_EFFICIENT_FFS)
+
+/* If __ffs is available, the even/odd algorithm benchmarks slower. */
 unsigned long gcd(unsigned long a, unsigned long b)
 {
-	unsigned long r;
+	unsigned long r = a | b;
+
+	if (!a || !b)
+		return r;
 
-	if (a < b)
-		swap(a, b);
+	b >>= __ffs(b);
+	if (b == 1)
+		return r & -r;
 
-	if (!b)
-		return a;
-	while ((r = a % b) != 0) {
-		a = b;
-		b = r;
+	for (;;) {
+		a >>= __ffs(a);
+		if (a == 1)
+			return r & -r;
+		if (a == b)
+			return a << __ffs(r);
+
+		if (a < b)
+			swap(a, b);
+		a -= b;
 	}
-	return b;
 }
+
+#else
+
+/* If normalization is done by loops, the even/odd algorithm is a win. */
+unsigned long gcd(unsigned long a, unsigned long b)
+{
+	unsigned long r = a | b;
+
+	if (!a || !b)
+		return r;
+
+	/* Isolate lsbit of r */
+	r &= -r;
+
+	while (!(b & r))
+		b >>= 1;
+	if (b == r)
+		return r;
+
+	for (;;) {
+		while (!(a & r))
+			a >>= 1;
+		if (a == r)
+			return r;
+		if (a == b)
+			return a;
+
+		if (a < b)
+			swap(a, b);
+		a -= b;
+		a >>= 1;
+		if (a & r)
+			a += b;
+		a >>= 1;
+	}
+}
+
+#endif
+
 EXPORT_SYMBOL_GPL(gcd);
-- 
2.7.4
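
For readers following along outside the kernel tree, here is a minimal
userspace sketch of the __ffs-based variant above, worked through on a
small pair of inputs; __builtin_ctzl stands in for the kernel's __ffs
and swap() is open-coded, both of which are assumptions made purely so
the example builds on its own:

	#include <assert.h>
	#include <stdio.h>

	/* Userspace rendering of the __ffs-based variant from lib/gcd.c;
	 * __builtin_ctzl replaces the kernel's __ffs (sketch only). */
	static unsigned long bgcd(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;
		unsigned long t;

		if (!a || !b)
			return r;

		b >>= __builtin_ctzl(b);
		if (b == 1)
			return r & -r;

		for (;;) {
			a >>= __builtin_ctzl(a);
			if (a == 1)
				return r & -r;
			if (a == b)
				return a << __builtin_ctzl(r);

			if (a < b) {
				t = a;
				a = b;
				b = t;
			}
			a -= b;
		}
	}

	int main(void)
	{
		/* gcd(36, 24): both even twice -> 4 * gcd(9, 6);
		 * 6 is even -> 4 * gcd(9, 3); both odd -> (9 - 3) / 2 = 3,
		 * so 4 * gcd(3, 3) = 12. */
		assert(bgcd(36, 24) == 12);
		printf("gcd(36, 24) = %lu\n", bgcd(36, 24));
		return 0;
	}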

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* Re: [patch V4] lib: GCD: Use binary GCD algorithm instead of Euclidean
  2016-05-06  9:42 ` zengzhaoxiu
                     ` (2 preceding siblings ...)
  (?)
@ 2016-05-06 23:00   ` Andrew Morton
  -1 siblings, 0 replies; 41+ messages in thread
From: Andrew Morton @ 2016-05-06 23:00 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri,  6 May 2016 17:42:42 +0800 zengzhaoxiu@163.com wrote:

> From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
> 
> The binary GCD algorithm is based on the following facts:
> 	1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2)
> 	2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
> 	3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)
> 
> Even on x86 machines with reasonable division hardware, the binary
> algorithm runs about 25% faster (80% the execution time) than the
> division-based Euclidian algorithm.
> 
> On platforms like Alpha and ARMv6 where division is a function call to
> emulation code, it's even more significant.
> 
> There are two variants of the code here, depending on whether a
> fast __ffs (find least significant set bit) instruction is available.
> This allows the unpredictable branches in the bit-at-a-time shifting
> loop to be eliminated.
> 
> ...
>
> --- a/arch/alpha/Kconfig
> +++ b/arch/alpha/Kconfig
> @@ -27,6 +27,7 @@ config ALPHA
>  	select MODULES_USE_ELF_RELA
>  	select ODD_RT_SIGACTION
>  	select OLD_SIGSUSPEND
> +	select CPU_NO_EFFICIENT_FFS if !ALPHA_EV67
>  	help

argh.  Please don't always put new items at end-of-list.  That's the
perfect way of maximizing the number of patch collisions.  Insert it at
a random position, avoiding the end (if the list isn't alpha-sorted,
which it should be).

<fixes it all up>

>
> ...
>
> --- a/arch/mips/include/asm/cpu-features.h
> +++ b/arch/mips/include/asm/cpu-features.h
> @@ -180,6 +180,16 @@
>  #endif
>  #endif
>  
> +/* __builtin_constant_p(cpu_has_mips_r) && cpu_has_mips_r */
> +#if !((defined(cpu_has_mips32r1) && cpu_has_mips32r1) || \
> +	  (defined(cpu_has_mips32r2) && cpu_has_mips32r2) || \
> +	  (defined(cpu_has_mips32r6) && cpu_has_mips32r6) || \
> +	  (defined(cpu_has_mips64r1) && cpu_has_mips64r1) || \
> +	  (defined(cpu_has_mips64r2) && cpu_has_mips64r2) || \
> +	  (defined(cpu_has_mips64r6) && cpu_has_mips64r6))
> +#define CONFIG_CPU_NO_EFFICIENT_FFS 1
> +#endif

#defining a CONFIG_ variable is pretty rude - defining these is the
role of the Kconfig system, not of header-file macros.

This was easy:

--- a/arch/mips/include/asm/cpu-features.h~lib-gcd-use-binary-gcd-algorithm-instead-of-euclidean-fix
+++ a/arch/mips/include/asm/cpu-features.h
@@ -187,7 +187,7 @@
 	  (defined(cpu_has_mips64r1) && cpu_has_mips64r1) || \
 	  (defined(cpu_has_mips64r2) && cpu_has_mips64r2) || \
 	  (defined(cpu_has_mips64r6) && cpu_has_mips64r6))
-#define CONFIG_CPU_NO_EFFICIENT_FFS 1
+#define CPU_NO_EFFICIENT_FFS 1
 #endif
 
 #ifndef cpu_has_mips_1
--- a/lib/gcd.c~lib-gcd-use-binary-gcd-algorithm-instead-of-euclidean-fix
+++ a/lib/gcd.c
@@ -10,7 +10,7 @@
  * has decent hardware division.
  */
 
-#if !defined(CONFIG_CPU_NO_EFFICIENT_FFS)
+#if !defined(CONFIG_CPU_NO_EFFICIENT_FFS) && !defined(CPU_NO_EFFICIENT_FFS)
 
 /* If __ffs is available, the even/odd algorithm benchmarks slower. */
 unsigned long gcd(unsigned long a, unsigned long b)
_
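
Put schematically (a hypothetical, self-contained illustration of the
resulting convention, not a further change to the tree): Kconfig emits
the CONFIG_* macros into a generated header at build time, so an arch
header that wants to steer lib/ code defines its own macro name and the
consumer checks both:

	#include <stdio.h>

	/* Hypothetical stand-in for what an arch header might define: */
	#define CPU_NO_EFFICIENT_FFS 1

	int main(void)
	{
	#if !defined(CONFIG_CPU_NO_EFFICIENT_FFS) && !defined(CPU_NO_EFFICIENT_FFS)
		puts("fast __ffs variant would be built");
	#else
		puts("even/odd variant would be built");
	#endif
		return 0;
	}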


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [patch V4] lib: GCD: Use binary GCD algorithm instead of Euclidean
  2016-05-06  9:42 ` zengzhaoxiu
                     ` (2 preceding siblings ...)
  (?)
@ 2016-05-07  8:41   ` George Spelvin
  -1 siblings, 0 replies; 41+ messages in thread
From: George Spelvin @ 2016-05-07  8:41 UTC (permalink / raw)
  To: linux-arm-kernel

Nothing critical, but a bit of kibitzing.
(That is Yiddish slang for a person who offers annoying
and unwanted advice.)

> The binary GCD algorithm is based on the following facts:
> 	1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2)
>	2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
>	3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)

1) "even" and "odd" are adjectives.  In English, adjectives to not have
   plural suffixes.  Thus, "they are even" or "they are odd".
2) Although "all" is not exactly wrong, it sounds odd.  Since there are
   exactly two of them it's clearer to say "both".

If I also rephrase the last line to fit into 80 columns, you get:

  The binary GCD algorithm is based on the following facts:
- 	1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2)
+ 	1. If a and b are both even, then gcd(a,b) = 2 * gcd(a/2, b/2)
 	2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
-	3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)
+	3. If both are odd, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)

3) Negative config options are always confusing.

Would it be better to call it CONFIG_INEFFICIENT_FFS, or even simpler
CONFIG_SLOW_FFS?

Also, you're allowed to add "help" to a non-interactive config option,
and some documentation might be useful.
E.g.

+config CPU_SLOW_FFS
+	def_bool n
+	help
+	  If n, the CPU supports a fast __ffs (__builtin_ctz) operation,
+	  either directly or via a short code sequence using a count
+	  leading zeros or population count instruction.  If y, the
+	  operation is emulated by slower software, such as an unrolled
+	  binary search.
+
+	  This is purely an optimization option; the kernel
+	  will function correctly regardless of how it is set.
+

Your benchmark code doesn't have to have a separate code path if
__x86_64__; rdtsc works on 32-bit code just as well.  And the
way you subtract the end and start times is unnecessarily complicated.
The C language guarantees that unsigned arithmetic simply wraps modulo
2^bits as expected.
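
For instance (a standalone illustration with made-up counter values,
not part of the benchmark itself):

	#include <stdio.h>

	int main(void)
	{
		/* Hypothetical TSC readings straddling a 64-bit wraparound. */
		unsigned long long start = 0xffffffffffffff00ULL;
		unsigned long long end   = 0x0000000000000100ULL;

		/* Prints 512: the subtraction wraps modulo 2^64, so the
		 * elapsed count comes out right with no special casing. */
		printf("%llu\n", end - start);
		return 0;
	}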

Here's a simplified version:

	#include <stdio.h>
	#include <stdlib.h>
	#include <stdint.h>
	#include <string.h>
	#include <time.h>
	#include <unistd.h>

	#define swap(a, b) \
		do { \
			a ^= b; \
			b ^= a; \
			a ^= b; \
		} while (0)

	/* The Euclidean GCD algorithm */
	unsigned long gcd0(unsigned long a, unsigned long b)
	{
		if (a < b)
			swap(a, b);

		while (b != 0) {
			unsigned long r = a % b;
			a = b;
			b = r;
		}
		return a;
	}

	/* The binary GCD algorithm, using __builtin_ctzl */
	unsigned long gcd1(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		b >>= __builtin_ctzl(b);

		do {
			a >>= __builtin_ctzl(a);

			if (a < b)
				swap(a, b);
			a -= b;
		} while (a);
		return b << __builtin_ctzl(r);
	}

	/* Binary GCD algorithm, even/odd variant, without __builtin_ctzl */
	unsigned long gcd2(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		r &= -r;

		while (!(b & r))
			b >>= 1;

		for (;;) {
			while (!(a & r))
				a >>= 1;
			if (a < b)
				swap(a, b);
			else if (a == b)
				return a;
			a -= b;
			a >>= 1;
			if (a & r)
				a += b;
			a >>= 1;
		}
	}

	/* A variant of gcd1, with early out for gcd = 1 */
	unsigned long gcd3(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		b >>= __builtin_ctzl(b);
		if (b == 1)
			return r & -r;

		for (;;) {
			a >>= __builtin_ctzl(a);
			if (a == b || a == 1)
				return a << __builtin_ctzl(r);

			if (a < b)
				swap(a, b);
			a -= b;
		}
	}

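	/* A variant of gcd2, with the same early out as gcd3 */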
	unsigned long gcd4(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		r &= -r;

		while (!(b & r))
			b >>= 1;
		if (b == r)
			return r;

		for (;;) {
			while (!(a & r))
				a >>= 1;
			if (a == b || a == r)
				return a;

			if (a < b)
				swap(a, b);
			a -= b;
			a >>= 1;
			if (a & r)
				a += b;
			a >>= 1;
		}
	}

	static unsigned long (*gcd_func[])(unsigned long a, unsigned long b) = {
		gcd0, gcd1, gcd2, gcd3, gcd4,
	};

	#define TEST_ENTRIES (int)(sizeof(gcd_func) / sizeof(gcd_func[0]))

	#define rdtscll(val) do { \
		unsigned __a,__d; \
		__asm__ __volatile__("rdtsc" : "=a" (__a), "=d" (__d)); \
		(val) = __a | (unsigned long long)__d << 32; \
	} while(0)

	static unsigned long long
	benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long),
			   unsigned long a, unsigned long b, unsigned long *res)
	{
		unsigned long long start, end;
		unsigned long gcd_res;

		rdtscll(start);
		gcd_res = gcd(a, b);
		rdtscll(end);

		*res = gcd_res;
		return end - start;
	}

	static inline unsigned long get_rand()
	{
		if (sizeof(long) == 8)
			return (unsigned long)rand() << 32 | rand();
		else
			return rand();
	}

	int main(int argc, char **argv)
	{
		unsigned int seed = time(0);
		int loops = 100;
		int repeats = 1000;
		unsigned long (*res)[TEST_ENTRIES];
		unsigned long long elapsed[TEST_ENTRIES];
		int i, j, k;

		for (;;) {
			int opt = getopt(argc, argv, "n:r:s:");
			/* End condition always first */
			if (opt == -1)
				break;

			switch (opt) {
			case 'n':
				loops = atoi(optarg);
				break;
			case 'r':
				repeats = atoi(optarg);
				break;
			case 's':
				seed = strtoul(optarg, NULL, 10);
				break;
			default:
				/* You won't actually get here. */
				break;
			}
		}

		res = malloc(sizeof(unsigned long) * TEST_ENTRIES * loops);
		memset(elapsed, 0, sizeof(elapsed));

		srand(seed);
		for (j = 0; j < loops; j++) {
			unsigned long a = get_rand();
			/* Do we have args? */
			unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
			unsigned long long min_elapsed[TEST_ENTRIES];
			for (k = 0; k < repeats; k++) {
				for (i = 0; i < TEST_ENTRIES; i++) {
					unsigned long long tmp = benchmark_gcd_func(gcd_func[i], a, b, &res[j][i]);
					if (k == 0 || min_elapsed[i] > tmp)
						min_elapsed[i] = tmp;
				}
			}
			for (i = 0; i < TEST_ENTRIES; i++)
				elapsed[i] += min_elapsed[i];
		}

		for (i = 0; i < TEST_ENTRIES; i++)
			printf("gcd%d: elapsed %llu\n", i, elapsed[i]);

		k = 0;
		srand(seed);
		for (j = 0; j < loops; j++) {
			unsigned long a = get_rand();
			unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
			for (i = 1; i < TEST_ENTRIES; i++) {
				if (res[j][i] != res[j][0])
					break;
			}
			if (i < TEST_ENTRIES) {
				if (k == 0) {
					k = 1;
					fprintf(stderr, "Error:\n");
				}
				fprintf(stderr, "gcd(%lu, %lu): ", a, b);
				for (i = 0; i < TEST_ENTRIES; i++)
					fprintf(stderr, "%ld%s", res[j][i], i < TEST_ENTRIES - 1 ? ", " : "\n");
			}
		}

		if (k == 0)
			fprintf(stderr, "PASS\n");

		free(res);

		return 0;
	}

Here are some more timings, with the same flags as your tests:

First, 32-bit code:
		gcd0	gcd1	gcd2	gcd3	gcd4
Ivy Bridge	3156	1192	1740	1160	1640	PASS
AMD Phenom	7150	2564	2348	2975	2843	PASS
Core 2		4176	2592	4164	2604	3900	PASS
Pentium 4	11492	4784	7632	4852	6452	PASS

And 64-bit (longer times because the inputs are larger):
Ivy Bridge	10636	2496	3500	2432	3360	PASS
AMD Phenom	19482	4058	6030	5001	6845	PASS

Looking at those, I'm not sure how much better the gcd3/4 versions are
than gcd1/2.  The difference seems pretty minor and sometimes negative.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [patch V4] lib: GCD: Use binary GCD algorithm instead of Euclidean
@ 2016-05-07  8:41   ` George Spelvin
  0 siblings, 0 replies; 41+ messages in thread
From: George Spelvin @ 2016-05-07  8:41 UTC (permalink / raw)
  To: akpm, dalias, davem, geert, James.Bottomley, jjuran, linux,
	peterz, sam, zengzhaoxiu
  Cc: deller, heiko.carstens, ink, james.hogan, jejb, jonas, lennox.wu,
	lftan, linux-alpha, linux-arm-kernel, linux-kernel, linux-m68k,
	linux-metag, linux-mips, linux-parisc, linux-s390, linux-sh,
	linux-snps-arc, linux, linux, liqin.linux, mattst88, monstr,
	nios2-dev, ralf, rth, schwidefsky, sparclinux, uclinux-h8-devel,
	vgupta, ysato, zhaoxiu.zeng

Nothing critical, but a bit of kibitzing.
(That is slang in the Yiddish language for a person
who offers annoying and unwanted advice.)

> The binary GCD algorithm is based on the following facts:
> 	1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2)
>	2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
>	3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)

1) "even" and "odd" are adjectives.  In English, adjectives to not have
   plural suffixes.  Thus, "they are even" or "they are odd".
2) Although "all" is not exactly wrong, it sounds odd.  Since there are
   exactly two of them it's clearer to say "both".

If I also rephrase the last line to fit into 80 columns, you get:

  The binary GCD algorithm is based on the following facts:
- 	1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2)
+ 	1. If a and b are both even, then gcd(a,b) = 2 * gcd(a/2, b/2)
 	2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
-	3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)
+	3. If both are odd, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)

3) Negative config options are always confusing.

Would it be better to call it CONFIG_INEFFICIEBNT_FFS, or even simpler
CONFIG_SLOW_FFS?

Also, you're allowed to add "help" to a non-interactive config option,
and some documentation might be useful.
E.g.

+config CPU_SLOW_FFS
+	def_bool n
+	help
+	  If n, the CPU supports a fast __ffs (__builtin_ctz) operation,
+	  either directly or via a short code sequence using a count
+	  leading zeros or population count instruction.  If y, the
+	  operation is emulated by slower software, such as an unrolled
+	  binary search.
+
+	  This is purely an optimization option; the kernel
+	  will function correctly regardless of how it is set.
+

Your benchmark code doesn't have to have a separate code path if
__x86_64__; rdtsc works on 32-bit code just as well.  paths.  And the
way you subtract the end and start times is unnecessarily complicated.
The C language guarantees that unsigned arithmetic simply wraps modulo
2^bits as expected.

Here's a simplified version:

	#include <stdio.h>
	#include <stdlib.h>
	#include <stdint.h>
	#include <string.h>
	#include <time.h>
	#include <unistd.h>

	#define swap(a, b) \
		do { \
			a ^= b; \
			b ^= a; \
			a ^= b; \
		} while (0)

	/* The Euclidean GCD algorithm */
	unsigned long gcd0(unsigned long a, unsigned long b)
	{
		if (a < b)
			swap(a, b);

		while (b != 0) {
			unsigned long r = a % b;
			a = b;
			b = r;
		}
		return a;
	}

	/* The binary GCD algorithm, using __builtin_ctzl */
	unsigned long gcd1(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		b >>= __builtin_ctzl(b);

		do {
			a >>= __builtin_ctzl(a);

			if (a < b)
				swap(a, b);
			a -= b;
		} while (a);
		return b << __builtin_ctzl(r);
	}

	/* Binary GCD algorithm, even/odd variant, without __builtin_ctzl */
	unsigned long gcd2(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		r &= -r;

		while (!(b & r))
			b >>= 1;

		for (;;) {
			while (!(a & r))
				a >>= 1;
			if (a < b)
				swap(a, b);
			else if (a == b)
				return a;
			a -= b;
			a >>= 1;
			if (a & r)
				a += b;
			a >>= 1;
		}
	}

	/* A variant of gcd1, with early out for gcd = 1 */
	unsigned long gcd3(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		b >>= __builtin_ctzl(b);
		if (b == 1)
			return r & -r;

		for (;;) {
			a >>= __builtin_ctzl(a);
			if (a == b || a == 1)
				return a << __builtin_ctzl(r);

			if (a < b)
				swap(a, b);
			a -= b;
		}
	}

	unsigned long gcd4(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		r &= -r;

		while (!(b & r))
			b >>= 1;
		if (b == r)
			return r;

		for (;;) {
			while (!(a & r))
				a >>= 1;
			if (a == b || a == r)
				return a;

			if (a < b)
				swap(a, b);
			a -= b;
			a >>= 1;
			if (a & r)
				a += b;
			a >>= 1;
		}
	}

	static unsigned long (*gcd_func[])(unsigned long a, unsigned long b) = {
		gcd0, gcd1, gcd2, gcd3, gcd4,
	};

	#define TEST_ENTRIES (int)(sizeof(gcd_func) / sizeof(gcd_func[0]))

	#define rdtscll(val) do { \
		unsigned __a,__d; \
		__asm__ __volatile__("rdtsc" : "=a" (__a), "=d" (__d)); \
		(val) = __a | (unsigned long long)__d << 32; \
	} while(0)

	static unsigned long long
	benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long),
			   unsigned long a, unsigned long b, unsigned long *res)
	{
		unsigned long long start, end;
		unsigned long gcd_res;

		rdtscll(start);
		gcd_res = gcd(a, b);
		rdtscll(end);

		*res = gcd_res;
		return end - start;
	}

	static inline unsigned long get_rand()
	{
		if (sizeof(long) == 8)
			return (unsigned long)rand() << 32 | rand();
		else
			return rand();
	}

	int main(int argc, char **argv)
	{
		unsigned int seed = time(0);
		int loops = 100;
		int repeats = 1000;
		unsigned long (*res)[TEST_ENTRIES];
		unsigned long long elapsed[TEST_ENTRIES];
		int i, j, k;

		for (;;) {
			int opt = getopt(argc, argv, "n:r:s:");
			/* End condition always first */
			if (opt == -1)
				break;

			switch (opt) {
			case 'n':
				loops = atoi(optarg);
				break;
			case 'r':
				repeats = atoi(optarg);
				break;
			case 's':
				seed = strtoul(optarg, NULL, 10);
				break;
			default:
				/* You won't actually get here. */
				break;
			}
		}

		res = malloc(sizeof(unsigned long) * TEST_ENTRIES * loops);
		memset(elapsed, 0, sizeof(elapsed));

		srand(seed);
		for (j = 0; j < loops; j++) {
			unsigned long a = get_rand();
			/* Do we have args? */
			unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
			unsigned long long min_elapsed[TEST_ENTRIES];
			for (k = 0; k < repeats; k++) {
				for (i = 0; i < TEST_ENTRIES; i++) {
					unsigned long long tmp = benchmark_gcd_func(gcd_func[i], a, b, &res[j][i]);
					if (k == 0 || min_elapsed[i] > tmp)
						min_elapsed[i] = tmp;
				}
			}
			for (i = 0; i < TEST_ENTRIES; i++)
				elapsed[i] += min_elapsed[i];
		}

		for (i = 0; i < TEST_ENTRIES; i++)
			printf("gcd%d: elapsed %llu\n", i, elapsed[i]);

		k = 0;
		srand(seed);
		for (j = 0; j < loops; j++) {
			unsigned long a = get_rand();
			unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
			for (i = 1; i < TEST_ENTRIES; i++) {
				if (res[j][i] != res[j][0])
					break;
			}
			if (i < TEST_ENTRIES) {
				if (k == 0) {
					k = 1;
					fprintf(stderr, "Error:\n");
				}
				fprintf(stderr, "gcd(%lu, %lu): ", a, b);
				for (i = 0; i < TEST_ENTRIES; i++)
					fprintf(stderr, "%ld%s", res[j][i], i < TEST_ENTRIES - 1 ? ", " : "\n");
			}
		}

		if (k == 0)
			fprintf(stderr, "PASS\n");

		free(res);

		return 0;
	}

Here are some more timings, with the same flags as your tests:

First, 32 bit code:
		gcd0	gcd1	gcd2	gcd3	gcd4
Ivy Bridge	3156	1192	1740	1160	1640	PASS
AMD Phenom	7150	2564	2348	2975	2843	PASS
Core 2		4176	2592	4164	2604	3900	PASS
Pentium 4	11492	4784	7632	4852	6452	PASS

And 64-bit (longer times becuase the inputs are larger):
Ivy Bridge	10636	2496	3500	2432	3360	PASS
AMD Phenom	19482	4058	6030	5001	6845	PASS

Looking at those, I'm not sure how much better the gcd3/4 versions are
than gcd1/2.  The difference seems pretty minor and sometimes negative.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [patch V4] lib: GCD: Use binary GCD algorithm instead of Euclidean
@ 2016-05-07  8:41   ` George Spelvin
  0 siblings, 0 replies; 41+ messages in thread
From: George Spelvin @ 2016-05-07  8:41 UTC (permalink / raw)
  To: akpm, dalias, davem, geert, James.Bottomley, jjuran, linux,
	peterz, sam, zengzhaoxiu
  Cc: deller, heiko.carstens, ink, james.hogan, jejb, jonas, lennox.wu,
	lftan, linux-alpha, linux-arm-kernel, linux-kernel, linux-m68k,
	linux-metag, linux-mips, linux-parisc, linux-s390, linux-sh,
	linux-snps-arc, linux, linux, liqin.linux, mattst88, monstr,
	nios2-dev, ralf, rth, schwidefsky, sparclinux, uclinux-h8-devel,
	vgupta, ysato, zhaoxiu.zeng

Nothing critical, but a bit of kibitzing.
(That is slang in the Yiddish language for a person
who offers annoying and unwanted advice.)

> The binary GCD algorithm is based on the following facts:
> 	1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2)
>	2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
>	3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)

1) "even" and "odd" are adjectives.  In English, adjectives to not have
   plural suffixes.  Thus, "they are even" or "they are odd".
2) Although "all" is not exactly wrong, it sounds odd.  Since there are
   exactly two of them it's clearer to say "both".

If I also rephrase the last line to fit into 80 columns, you get:

  The binary GCD algorithm is based on the following facts:
- 	1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2)
+ 	1. If a and b are both even, then gcd(a,b) = 2 * gcd(a/2, b/2)
 	2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
-	3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)
+	3. If both are odd, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)

3) Negative config options are always confusing.

Would it be better to call it CONFIG_INEFFICIEBNT_FFS, or even simpler
CONFIG_SLOW_FFS?

Also, you're allowed to add "help" to a non-interactive config option,
and some documentation might be useful.
E.g.

+config CPU_SLOW_FFS
+	def_bool n
+	help
+	  If n, the CPU supports a fast __ffs (__builtin_ctz) operation,
+	  either directly or via a short code sequence using a count
+	  leading zeros or population count instruction.  If y, the
+	  operation is emulated by slower software, such as an unrolled
+	  binary search.
+
+	  This is purely an optimization option; the kernel
+	  will function correctly regardless of how it is set.
+

Your benchmark code doesn't have to have a separate code path if
__x86_64__; rdtsc works on 32-bit code just as well.  paths.  And the
way you subtract the end and start times is unnecessarily complicated.
The C language guarantees that unsigned arithmetic simply wraps modulo
2^bits as expected.

Here's a simplified version:

	#include <stdio.h>
	#include <stdlib.h>
	#include <stdint.h>
	#include <string.h>
	#include <time.h>
	#include <unistd.h>

	#define swap(a, b) \
		do { \
			a ^= b; \
			b ^= a; \
			a ^= b; \
		} while (0)

	/* The Euclidean GCD algorithm */
	unsigned long gcd0(unsigned long a, unsigned long b)
	{
		if (a < b)
			swap(a, b);

		while (b != 0) {
			unsigned long r = a % b;
			a = b;
			b = r;
		}
		return a;
	}

	/* The binary GCD algorithm, using __builtin_ctzl */
	unsigned long gcd1(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		b >>= __builtin_ctzl(b);

		do {
			a >>= __builtin_ctzl(a);

			if (a < b)
				swap(a, b);
			a -= b;
		} while (a);
		return b << __builtin_ctzl(r);
	}

	/* Binary GCD algorithm, even/odd variant, without __builtin_ctzl */
	unsigned long gcd2(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		r &= -r;

		while (!(b & r))
			b >>= 1;

		for (;;) {
			while (!(a & r))
				a >>= 1;
			if (a < b)
				swap(a, b);
			else if (a == b)
				return a;
			a -= b;
			a >>= 1;
			if (a & r)
				a += b;
			a >>= 1;
		}
	}

	/* A variant of gcd1, with early out for gcd = 1 */
	unsigned long gcd3(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		b >>= __builtin_ctzl(b);
		if (b == 1)
			return r & -r;

		for (;;) {
			a >>= __builtin_ctzl(a);
			if (a == b || a == 1)
				return a << __builtin_ctzl(r);

			if (a < b)
				swap(a, b);
			a -= b;
		}
	}

	unsigned long gcd4(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		r &= -r;

		while (!(b & r))
			b >>= 1;
		if (b == r)
			return r;

		for (;;) {
			while (!(a & r))
				a >>= 1;
			if (a == b || a == r)
				return a;

			if (a < b)
				swap(a, b);
			a -= b;
			a >>= 1;
			if (a & r)
				a += b;
			a >>= 1;
		}
	}

	static unsigned long (*gcd_func[])(unsigned long a, unsigned long b) = {
		gcd0, gcd1, gcd2, gcd3, gcd4,
	};

	#define TEST_ENTRIES (int)(sizeof(gcd_func) / sizeof(gcd_func[0]))

	#define rdtscll(val) do { \
		unsigned __a,__d; \
		__asm__ __volatile__("rdtsc" : "=a" (__a), "=d" (__d)); \
		(val) = __a | (unsigned long long)__d << 32; \
	} while(0)

	static unsigned long long
	benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long),
			   unsigned long a, unsigned long b, unsigned long *res)
	{
		unsigned long long start, end;
		unsigned long gcd_res;

		rdtscll(start);
		gcd_res = gcd(a, b);
		rdtscll(end);

		*res = gcd_res;
		return end - start;
	}

	static inline unsigned long get_rand()
	{
		if (sizeof(long) == 8)
			return (unsigned long)rand() << 32 | rand();
		else
			return rand();
	}

	int main(int argc, char **argv)
	{
		unsigned int seed = time(0);
		int loops = 100;
		int repeats = 1000;
		unsigned long (*res)[TEST_ENTRIES];
		unsigned long long elapsed[TEST_ENTRIES];
		int i, j, k;

		for (;;) {
			int opt = getopt(argc, argv, "n:r:s:");
			/* End condition always first */
			if (opt == -1)
				break;

			switch (opt) {
			case 'n':
				loops = atoi(optarg);
				break;
			case 'r':
				repeats = atoi(optarg);
				break;
			case 's':
				seed = strtoul(optarg, NULL, 10);
				break;
			default:
				/* You won't actually get here. */
				break;
			}
		}

		res = malloc(sizeof(unsigned long) * TEST_ENTRIES * loops);
		memset(elapsed, 0, sizeof(elapsed));

		srand(seed);
		for (j = 0; j < loops; j++) {
			unsigned long a = get_rand();
			/* Do we have args? */
			unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
			unsigned long long min_elapsed[TEST_ENTRIES];
			for (k = 0; k < repeats; k++) {
				for (i = 0; i < TEST_ENTRIES; i++) {
					unsigned long long tmp = benchmark_gcd_func(gcd_func[i], a, b, &res[j][i]);
					if (k == 0 || min_elapsed[i] > tmp)
						min_elapsed[i] = tmp;
				}
			}
			for (i = 0; i < TEST_ENTRIES; i++)
				elapsed[i] += min_elapsed[i];
		}

		for (i = 0; i < TEST_ENTRIES; i++)
			printf("gcd%d: elapsed %llu\n", i, elapsed[i]);

		k = 0;
		srand(seed);
		for (j = 0; j < loops; j++) {
			unsigned long a = get_rand();
			unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
			for (i = 1; i < TEST_ENTRIES; i++) {
				if (res[j][i] != res[j][0])
					break;
			}
			if (i < TEST_ENTRIES) {
				if (k == 0) {
					k = 1;
					fprintf(stderr, "Error:\n");
				}
				fprintf(stderr, "gcd(%lu, %lu): ", a, b);
				for (i = 0; i < TEST_ENTRIES; i++)
					fprintf(stderr, "%ld%s", res[j][i], i < TEST_ENTRIES - 1 ? ", " : "\n");
			}
		}

		if (k == 0)
			fprintf(stderr, "PASS\n");

		free(res);

		return 0;
	}

Here are some more timings, with the same flags as your tests:

First, 32 bit code:
		gcd0	gcd1	gcd2	gcd3	gcd4
Ivy Bridge	3156	1192	1740	1160	1640	PASS
AMD Phenom	7150	2564	2348	2975	2843	PASS
Core 2		4176	2592	4164	2604	3900	PASS
Pentium 4	11492	4784	7632	4852	6452	PASS

And 64-bit (longer times becuase the inputs are larger):
Ivy Bridge	10636	2496	3500	2432	3360	PASS
AMD Phenom	19482	4058	6030	5001	6845	PASS

Looking at those, I'm not sure how much better the gcd3/4 versions are
than gcd1/2.  The difference seems pretty minor and sometimes negative.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [patch V4] lib: GCD: Use binary GCD algorithm instead of Euclidean
  2016-05-06  9:42 ` zengzhaoxiu
                   ` (5 preceding siblings ...)
  (?)
@ 2016-05-07  8:41 ` George Spelvin
  -1 siblings, 0 replies; 41+ messages in thread
From: George Spelvin @ 2016-05-07  8:41 UTC (permalink / raw)
  To: akpm, dalias, davem, geert, James.Bottomley, jjuran, linux,
	peterz, sam, zengzhaoxiu
  Cc: deller, heiko.carstens, ink, james.hogan, jejb, jonas, lennox.wu,
	lftan, linux-alpha, linux-arm-kernel, linux-kernel, linux-m68k,
	linux-metag, linux-mips, linux-parisc, linux-s390, linux-sh,
	linux-snps-arc, linux, linux, liqin.linux, mattst88, monstr,
	nios2-dev, ralf, rth, schwidefsky, sparclinux, uclinux-h8-devel,
	vgupta, ysato, zhaoxiu.zeng

Nothing critical, but a bit of kibitzing.
(That is slang in the Yiddish language for a person
who offers annoying and unwanted advice.)

> The binary GCD algorithm is based on the following facts:
> 	1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2)
>	2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
>	3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)

1) "even" and "odd" are adjectives.  In English, adjectives to not have
   plural suffixes.  Thus, "they are even" or "they are odd".
2) Although "all" is not exactly wrong, it sounds odd.  Since there are
   exactly two of them it's clearer to say "both".

If I also rephrase the last line to fit into 80 columns, you get:

  The binary GCD algorithm is based on the following facts:
- 	1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2)
+ 	1. If a and b are both even, then gcd(a,b) = 2 * gcd(a/2, b/2)
 	2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
-	3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)
+	3. If both are odd, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)

3) Negative config options are always confusing.

Would it be better to call it CONFIG_INEFFICIEBNT_FFS, or even simpler
CONFIG_SLOW_FFS?

Also, you're allowed to add "help" to a non-interactive config option,
and some documentation might be useful.
E.g.

+config CPU_SLOW_FFS
+	def_bool n
+	help
+	  If n, the CPU supports a fast __ffs (__builtin_ctz) operation,
+	  either directly or via a short code sequence using a count
+	  leading zeros or population count instruction.  If y, the
+	  operation is emulated by slower software, such as an unrolled
+	  binary search.
+
+	  This is purely an optimization option; the kernel
+	  will function correctly regardless of how it is set.
+

Your benchmark code doesn't have to have a separate code path if
__x86_64__; rdtsc works on 32-bit code just as well.  paths.  And the
way you subtract the end and start times is unnecessarily complicated.
The C language guarantees that unsigned arithmetic simply wraps modulo
2^bits as expected.

Here's a simplified version:

	#include <stdio.h>
	#include <stdlib.h>
	#include <stdint.h>
	#include <string.h>
	#include <time.h>
	#include <unistd.h>

	#define swap(a, b) \
		do { \
			a ^= b; \
			b ^= a; \
			a ^= b; \
		} while (0)

	/* The Euclidean GCD algorithm */
	unsigned long gcd0(unsigned long a, unsigned long b)
	{
		if (a < b)
			swap(a, b);

		while (b != 0) {
			unsigned long r = a % b;
			a = b;
			b = r;
		}
		return a;
	}

	/* The binary GCD algorithm, using __builtin_ctzl */
	unsigned long gcd1(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		b >>= __builtin_ctzl(b);

		do {
			a >>= __builtin_ctzl(a);

			if (a < b)
				swap(a, b);
			a -= b;
		} while (a);
		return b << __builtin_ctzl(r);
	}

	/* Binary GCD algorithm, even/odd variant, without __builtin_ctzl */
	unsigned long gcd2(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		r &= -r;

		while (!(b & r))
			b >>= 1;

		for (;;) {
			while (!(a & r))
				a >>= 1;
			if (a < b)
				swap(a, b);
			else if (a == b)
				return a;
			a -= b;
			a >>= 1;
			if (a & r)
				a += b;
			a >>= 1;
		}
	}

	/* A variant of gcd1, with early out for gcd = 1 */
	unsigned long gcd3(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		b >>= __builtin_ctzl(b);
		if (b == 1)
			return r & -r;

		for (;;) {
			a >>= __builtin_ctzl(a);
			if (a == b || a == 1)
				return a << __builtin_ctzl(r);

			if (a < b)
				swap(a, b);
			a -= b;
		}
	}

	unsigned long gcd4(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		r &= -r;

		while (!(b & r))
			b >>= 1;
		if (b == r)
			return r;

		for (;;) {
			while (!(a & r))
				a >>= 1;
			if (a == b || a == r)
				return a;

			if (a < b)
				swap(a, b);
			a -= b;
			a >>= 1;
			if (a & r)
				a += b;
			a >>= 1;
		}
	}

	static unsigned long (*gcd_func[])(unsigned long a, unsigned long b) = {
		gcd0, gcd1, gcd2, gcd3, gcd4,
	};

	#define TEST_ENTRIES (int)(sizeof(gcd_func) / sizeof(gcd_func[0]))

	#define rdtscll(val) do { \
		unsigned __a,__d; \
		__asm__ __volatile__("rdtsc" : "=a" (__a), "=d" (__d)); \
		(val) = __a | (unsigned long long)__d << 32; \
	} while(0)

	static unsigned long long
	benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long),
			   unsigned long a, unsigned long b, unsigned long *res)
	{
		unsigned long long start, end;
		unsigned long gcd_res;

		rdtscll(start);
		gcd_res = gcd(a, b);
		rdtscll(end);

		*res = gcd_res;
		return end - start;
	}

	static inline unsigned long get_rand()
	{
		if (sizeof(long) == 8)
			return (unsigned long)rand() << 32 | rand();
		else
			return rand();
	}

	int main(int argc, char **argv)
	{
		unsigned int seed = time(0);
		int loops = 100;
		int repeats = 1000;
		unsigned long (*res)[TEST_ENTRIES];
		unsigned long long elapsed[TEST_ENTRIES];
		int i, j, k;

		for (;;) {
			int opt = getopt(argc, argv, "n:r:s:");
			/* End condition always first */
			if (opt == -1)
				break;

			switch (opt) {
			case 'n':
				loops = atoi(optarg);
				break;
			case 'r':
				repeats = atoi(optarg);
				break;
			case 's':
				seed = strtoul(optarg, NULL, 10);
				break;
			default:
				/* You won't actually get here. */
				break;
			}
		}

		res = malloc(sizeof(unsigned long) * TEST_ENTRIES * loops);
		memset(elapsed, 0, sizeof(elapsed));

		srand(seed);
		for (j = 0; j < loops; j++) {
			unsigned long a = get_rand();
			/* Do we have args? */
			unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
			unsigned long long min_elapsed[TEST_ENTRIES];
			for (k = 0; k < repeats; k++) {
				for (i = 0; i < TEST_ENTRIES; i++) {
					unsigned long long tmp = benchmark_gcd_func(gcd_func[i], a, b, &res[j][i]);
					if (k == 0 || min_elapsed[i] > tmp)
						min_elapsed[i] = tmp;
				}
			}
			for (i = 0; i < TEST_ENTRIES; i++)
				elapsed[i] += min_elapsed[i];
		}

		for (i = 0; i < TEST_ENTRIES; i++)
			printf("gcd%d: elapsed %llu\n", i, elapsed[i]);

		k = 0;
		srand(seed);
		for (j = 0; j < loops; j++) {
			unsigned long a = get_rand();
			unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
			for (i = 1; i < TEST_ENTRIES; i++) {
				if (res[j][i] != res[j][0])
					break;
			}
			if (i < TEST_ENTRIES) {
				if (k == 0) {
					k = 1;
					fprintf(stderr, "Error:\n");
				}
				fprintf(stderr, "gcd(%lu, %lu): ", a, b);
				for (i = 0; i < TEST_ENTRIES; i++)
					fprintf(stderr, "%ld%s", res[j][i], i < TEST_ENTRIES - 1 ? ", " : "\n");
			}
		}

		if (k == 0)
			fprintf(stderr, "PASS\n");

		free(res);

		return 0;
	}

Here are some more timings, with the same flags as your tests:

First, 32 bit code:
		gcd0	gcd1	gcd2	gcd3	gcd4
Ivy Bridge	3156	1192	1740	1160	1640	PASS
AMD Phenom	7150	2564	2348	2975	2843	PASS
Core 2		4176	2592	4164	2604	3900	PASS
Pentium 4	11492	4784	7632	4852	6452	PASS

And 64-bit (longer times becuase the inputs are larger):
Ivy Bridge	10636	2496	3500	2432	3360	PASS
AMD Phenom	19482	4058	6030	5001	6845	PASS

Looking at those, I'm not sure how much better the gcd3/4 versions are
than gcd1/2.  The difference seems pretty minor and sometimes negative.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [patch V4] lib: GCD: Use binary GCD algorithm instead of Euclidean
@ 2016-05-07  8:41   ` George Spelvin
  0 siblings, 0 replies; 41+ messages in thread
From: George Spelvin @ 2016-05-07  8:41 UTC (permalink / raw)
  To: linux-snps-arc

Nothing critical, but a bit of kibitzing.
(That is slang in the Yiddish language for a person
who offers annoying and unwanted advice.)

> The binary GCD algorithm is based on the following facts:
> 	1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2)
>	2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
>	3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)

1) "even" and "odd" are adjectives.  In English, adjectives to not have
   plural suffixes.  Thus, "they are even" or "they are odd".
2) Although "all" is not exactly wrong, it sounds odd.  Since there are
   exactly two of them it's clearer to say "both".

If I also rephrase the last line to fit into 80 columns, you get:

  The binary GCD algorithm is based on the following facts:
- 	1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2)
+ 	1. If a and b are both even, then gcd(a,b) = 2 * gcd(a/2, b/2)
 	2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
-	3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)
+	3. If both are odd, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)

3) Negative config options are always confusing.

Would it be better to call it CONFIG_INEFFICIEBNT_FFS, or even simpler
CONFIG_SLOW_FFS?

Also, you're allowed to add "help" to a non-interactive config option,
and some documentation might be useful.
E.g.

+config CPU_SLOW_FFS
+	def_bool n
+	help
+	  If n, the CPU supports a fast __ffs (__builtin_ctz) operation,
+	  either directly or via a short code sequence using a count
+	  leading zeros or population count instruction.  If y, the
+	  operation is emulated by slower software, such as an unrolled
+	  binary search.
+
+	  This is purely an optimization option; the kernel
+	  will function correctly regardless of how it is set.
+

Your benchmark code doesn't have to have a separate code path if
__x86_64__; rdtsc works on 32-bit code just as well.  paths.  And the
way you subtract the end and start times is unnecessarily complicated.
The C language guarantees that unsigned arithmetic simply wraps modulo
2^bits as expected.

Here's a simplified version:

	#include <stdio.h>
	#include <stdlib.h>
	#include <stdint.h>
	#include <string.h>
	#include <time.h>
	#include <unistd.h>

	#define swap(a, b) \
		do { \
			a ^= b; \
			b ^= a; \
			a ^= b; \
		} while (0)

	/* The Euclidean GCD algorithm */
	unsigned long gcd0(unsigned long a, unsigned long b)
	{
		if (a < b)
			swap(a, b);

		while (b != 0) {
			unsigned long r = a % b;
			a = b;
			b = r;
		}
		return a;
	}

	/* The binary GCD algorithm, using __builtin_ctzl */
	unsigned long gcd1(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		b >>= __builtin_ctzl(b);

		do {
			a >>= __builtin_ctzl(a);

			if (a < b)
				swap(a, b);
			a -= b;
		} while (a);
		return b << __builtin_ctzl(r);
	}

	/* Binary GCD algorithm, even/odd variant, without __builtin_ctzl */
	unsigned long gcd2(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		r &= -r;

		while (!(b & r))
			b >>= 1;

		for (;;) {
			while (!(a & r))
				a >>= 1;
			if (a < b)
				swap(a, b);
			else if (a == b)
				return a;
			a -= b;
			a >>= 1;
			if (a & r)
				a += b;
			a >>= 1;
		}
	}

	/* A variant of gcd1, with early out for gcd = 1 */
	unsigned long gcd3(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		b >>= __builtin_ctzl(b);
		if (b == 1)
			return r & -r;

		for (;;) {
			a >>= __builtin_ctzl(a);
			if (a == b || a == 1)
				return a << __builtin_ctzl(r);

			if (a < b)
				swap(a, b);
			a -= b;
		}
	}

	unsigned long gcd4(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		r &= -r;

		while (!(b & r))
			b >>= 1;
		if (b == r)
			return r;

		for (;;) {
			while (!(a & r))
				a >>= 1;
			if (a == b || a == r)
				return a;

			if (a < b)
				swap(a, b);
			a -= b;
			a >>= 1;
			if (a & r)
				a += b;
			a >>= 1;
		}
	}

	static unsigned long (*gcd_func[])(unsigned long a, unsigned long b) = {
		gcd0, gcd1, gcd2, gcd3, gcd4,
	};

	#define TEST_ENTRIES (int)(sizeof(gcd_func) / sizeof(gcd_func[0]))

	#define rdtscll(val) do { \
		unsigned __a,__d; \
		__asm__ __volatile__("rdtsc" : "=a" (__a), "=d" (__d)); \
		(val) = __a | (unsigned long long)__d << 32; \
	} while(0)

	static unsigned long long
	benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long),
			   unsigned long a, unsigned long b, unsigned long *res)
	{
		unsigned long long start, end;
		unsigned long gcd_res;

		rdtscll(start);
		gcd_res = gcd(a, b);
		rdtscll(end);

		*res = gcd_res;
		return end - start;
	}

	static inline unsigned long get_rand()
	{
		if (sizeof(long) == 8)
			return (unsigned long)rand() << 32 | rand();
		else
			return rand();
	}

	int main(int argc, char **argv)
	{
		unsigned int seed = time(0);
		int loops = 100;
		int repeats = 1000;
		unsigned long (*res)[TEST_ENTRIES];
		unsigned long long elapsed[TEST_ENTRIES];
		int i, j, k;

		for (;;) {
			int opt = getopt(argc, argv, "n:r:s:");
			/* End condition always first */
			if (opt == -1)
				break;

			switch (opt) {
			case 'n':
				loops = atoi(optarg);
				break;
			case 'r':
				repeats = atoi(optarg);
				break;
			case 's':
				seed = strtoul(optarg, NULL, 10);
				break;
			default:
				/* You won't actually get here. */
				break;
			}
		}

		res = malloc(sizeof(unsigned long) * TEST_ENTRIES * loops);
		memset(elapsed, 0, sizeof(elapsed));

		srand(seed);
		for (j = 0; j < loops; j++) {
			unsigned long a = get_rand();
			/* Do we have args? */
			unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
			unsigned long long min_elapsed[TEST_ENTRIES];
			for (k = 0; k < repeats; k++) {
				for (i = 0; i < TEST_ENTRIES; i++) {
					unsigned long long tmp = benchmark_gcd_func(gcd_func[i], a, b, &res[j][i]);
					if (k == 0 || min_elapsed[i] > tmp)
						min_elapsed[i] = tmp;
				}
			}
			for (i = 0; i < TEST_ENTRIES; i++)
				elapsed[i] += min_elapsed[i];
		}

		for (i = 0; i < TEST_ENTRIES; i++)
			printf("gcd%d: elapsed %llu\n", i, elapsed[i]);

		k = 0;
		srand(seed);
		for (j = 0; j < loops; j++) {
			unsigned long a = get_rand();
			unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
			for (i = 1; i < TEST_ENTRIES; i++) {
				if (res[j][i] != res[j][0])
					break;
			}
			if (i < TEST_ENTRIES) {
				if (k == 0) {
					k = 1;
					fprintf(stderr, "Error:\n");
				}
				fprintf(stderr, "gcd(%lu, %lu): ", a, b);
				for (i = 0; i < TEST_ENTRIES; i++)
					fprintf(stderr, "%ld%s", res[j][i], i < TEST_ENTRIES - 1 ? ", " : "\n");
			}
		}

		if (k == 0)
			fprintf(stderr, "PASS\n");

		free(res);

		return 0;
	}

Here are some more timings, with the same flags as your tests:

First, 32 bit code:
		gcd0	gcd1	gcd2	gcd3	gcd4
Ivy Bridge	3156	1192	1740	1160	1640	PASS
AMD Phenom	7150	2564	2348	2975	2843	PASS
Core 2		4176	2592	4164	2604	3900	PASS
Pentium 4	11492	4784	7632	4852	6452	PASS

And 64-bit (longer times because the inputs are larger):
Ivy Bridge	10636	2496	3500	2432	3360	PASS
AMD Phenom	19482	4058	6030	5001	6845	PASS

Looking at those, I'm not sure how much better the gcd3/4 versions are
than gcd1/2.  The difference seems pretty minor and sometimes negative.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [patch V4] lib: GCD: Use binary GCD algorithm instead of Euclidean
  2016-05-07  8:41   ` George Spelvin
                       ` (3 preceding siblings ...)
  (?)
@ 2016-05-07 10:46     ` Andreas Schwab
  -1 siblings, 0 replies; 41+ messages in thread
From: Andreas Schwab @ 2016-05-07 10:46 UTC (permalink / raw)
  To: linux-arm-kernel

"George Spelvin" <linux@horizon.com> writes:

> Your benchmark code doesn't have to have a separate code path if
> __x86_64__; rdtsc works on 32-bit code just as well.

Take a look at the CC: list.

Andreas.

-- 
Andreas Schwab, schwab@linux-m68k.org
GPG Key fingerprint = 58CA 54C7 6D53 942B 1756  01D3 44D5 214B 8276 4ED5
"And now for something completely different."

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [patch V4] lib: GCD: Use binary GCD algorithm instead of Euclidean
  2016-05-06  9:42 ` zengzhaoxiu
                     ` (3 preceding siblings ...)
  (?)
@ 2016-05-07 11:23   ` Sam Ravnborg
  -1 siblings, 0 replies; 41+ messages in thread
From: Sam Ravnborg @ 2016-05-07 11:23 UTC (permalink / raw)
  To: linux-arm-kernel

> diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
> index 57ffaf2..ca675ed 100644
> --- a/arch/sparc/Kconfig
> +++ b/arch/sparc/Kconfig
> @@ -42,6 +42,7 @@ config SPARC
>  	select ODD_RT_SIGACTION
>  	select OLD_SIGSUSPEND
>  	select ARCH_HAS_SG_CHAIN
> +	select CPU_NO_EFFICIENT_FFS
>  
>  config SPARC32
>  	def_bool !64BIT

sparc64 has an efficient ffs implementation.
We use run-time patching to pick the proper version
depending on the actual sparc cpu.

As this is determined at config time, let the
sparc cpus that have the efficient ffs benefit from it.

In other words - select CPU_NO_EFFICIENT_FFS only for SPARC32.
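
Roughly like this (sketch only, not build-tested; context lines are taken
from the hunk quoted above, and the real SPARC32 entry may carry other
selects):

 config SPARC
 	select ODD_RT_SIGACTION
 	select OLD_SIGSUSPEND
 	select ARCH_HAS_SG_CHAIN
-	select CPU_NO_EFFICIENT_FFS
 
 config SPARC32
 	def_bool !64BIT
+	select CPU_NO_EFFICIENT_FFS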

	Sam

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [patch V4] lib: GCD: Use binary GCD algorithm instead of Euclidean
  2016-05-07 10:46     ` Andreas Schwab
                       ` (4 preceding siblings ...)
  (?)
@ 2016-05-07 20:41     ` George Spelvin
  -1 siblings, 0 replies; 41+ messages in thread
From: George Spelvin @ 2016-05-07 20:41 UTC (permalink / raw)
  To: schwab; +Cc: linux-kernel, linux

> Take a look at the Cc: list.

Yes, I realized how long it was about half a second after hitting "send".

Although I didn't really think it through at the time, even now that I
do, it's not entirely clear who should have been trimmed, either.

In general lkml tradition is to not drop people from the Cc: list unless
you're really sure they're not interested, and my suggestion to rename the
CONFIG_ option is something that really wants multiple people's opinions.

Do you think it's borderline (and I'll think harder in future), or
am I misjudging and I was *clearly* over the line?

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [patch V4] lib: GCD: Use binary GCD algorithm instead of Euclidean
  2016-05-07 11:23   ` Sam Ravnborg
@ 2016-05-07 22:30     ` George Spelvin
  -1 siblings, 0 replies; 41+ messages in thread
From: George Spelvin @ 2016-05-07 22:30 UTC (permalink / raw)
  To: sam, zengzhaoxiu; +Cc: akpm, linux, linux-kernel, sparclinux

Sam Ravnborg wrote:
> sparc64 have an efficient ffs implementation.
> We use run-time patching to use the proper version
> depending on the actual sparc cpu.
> 
> As this is determinded at config time, then let the
> sparc cpu that has the efficient ffs benefit from this.
> 
> In other words - select CPU_NO_EFFICIENT_FFS only for SPARC32.

I'm not sure this is the right thing.

It's always a function call, and there's boot-time code patching to use
either an unrolled binary search or a POPC instruction on processors that
have that instruction.

The NO_EFFICIENT_FFS isn't much slower than the __ffs version, so the
call/return alone might eat the difference, and if the CPU doesn't have
POPC support it's definitely a lose.

Quite simply, gcd isn't important enough to be worth the same boot-time
code patching, and if we have to use one on both types of CPU then the
NO_EFFICIENT_FFS path is the safer alternative in case of uncertainty.

Would you be willing to try benchmarking it?  The baseline code, plus two
versions of the __ffs code using the two different __ffs implementations
(forced out of line by compiling from assembler source or using
inline asm and __attribute__((noinline))).



By the way, the SPARC64 implementation could be improved.

It's currently 5 instructions:

__ffs:
        neg     %o0, %g1
        xnor    %o0, %g1, %o1
        popc    %o1, %o0
        retl
         sub    %o0, 1, %o0

That could be improved to 4:
	sub	%o0, 1, %g1
	andn	%g1, %o0, %g1
	retl
	 popc	%g1, %o0
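
In C terms the trick is (my illustration, not code from the tree):
(x - 1) & ~x sets exactly the bits below the lowest set bit of x, so a
population count of that mask gives the bit's index.  Like __ffs, it is
undefined for x == 0.

	/* Illustrative C equivalent of the 4-instruction sequence above;
	 * __builtin_popcountl should compile to a single popc on sparc64
	 * CPUs that have the instruction.
	 */
	static inline unsigned long ffs_via_popc(unsigned long x)
	{
		return __builtin_popcountl((x - 1) & ~x);
	}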

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [patch V4] lib: GCD: Use binary GCD algorithm instead of Euclidean
  2016-05-07  8:41   ` George Spelvin
                       ` (3 preceding siblings ...)
  (?)
@ 2016-05-08 12:52     ` Zhaoxiu Zeng
  -1 siblings, 0 replies; 41+ messages in thread
From: Zhaoxiu Zeng @ 2016-05-08 12:52 UTC (permalink / raw)
  To: linux-arm-kernel

On 2016/5/7 16:41, George Spelvin wrote:
> Nothing critical, but a bit of kibitzing.
> (That is slang in the Yiddish language for a person
> who offers annoying and unwanted advice.)
>
>> The binary GCD algorithm is based on the following facts:
>> 	1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2)
>> 	2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
>> 	3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)
> 1) "even" and "odd" are adjectives.  In English, adjectives to not have
>    plural suffixes.  Thus, "they are even" or "they are odd".
> 2) Although "all" is not exactly wrong, it sounds odd.  Since there are
>    exactly two of them it's clearer to say "both".
>
> If I also rephrase the last line to fit into 80 columns, you get:
>
>   The binary GCD algorithm is based on the following facts:
> - 	1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2)
> + 	1. If a and b are both even, then gcd(a,b) = 2 * gcd(a/2, b/2)
>  	2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
> -	3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)
> +	3. If both are odd, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)
>
> 3) Negative config options are always confusing.
>
> Would it be better to call it CONFIG_INEFFICIENT_FFS, or even simpler
> CONFIG_SLOW_FFS?
>
> Also, you're allowed to add "help" to a non-interactive config option,
> and some documentation might be useful.
> E.g.
>
> +config CPU_SLOW_FFS
> +	def_bool n
> +	help
> +	  If n, the CPU supports a fast __ffs (__builtin_ctz) operation,
> +	  either directly or via a short code sequence using a count
> +	  leading zeros or population count instruction.  If y, the
> +	  operation is emulated by slower software, such as an unrolled
> +	  binary search.
> +
> +	  This is purely an optimization option; the kernel
> +	  will function correctly regardless of how it is set.
> +

Thanks a lot.

> Your benchmark code doesn't need a separate code path for
> __x86_64__; rdtsc works on 32-bit code just as well.  And the
> way you subtract the end and start times is unnecessarily complicated.
> The C language guarantees that unsigned arithmetic simply wraps modulo
> 2^bits as expected.

rdtsc works on x86; the other path is prepared for other architectures.

> Here are some more timings, with the same flags as your tests:
>
> First, 32 bit code:
> 		gcd0	gcd1	gcd2	gcd3	gcd4
> Ivy Bridge	3156	1192	1740	1160	1640	PASS
> AMD Phenom	7150	2564	2348	2975	2843	PASS
> Core 2		4176	2592	4164	2604	3900	PASS
> Pentium 4	11492	4784	7632	4852	6452	PASS
>
> And 64-bit (longer times because the inputs are larger):
> Ivy Bridge	10636	2496	3500	2432	3360	PASS
> AMD Phenom	19482	4058	6030	5001	6845	PASS
>
> Looking at those, I'm not sure how much better the gcd3/4 versions are
> than gcd1/2.  The difference seems pretty minor and sometimes negative.
>

The worst case for binary GCD is when one number is a power of 2,
for example a is 0xffffffff (0xcccccccc for the "even/odd" variant) and b is 1.

The gcd3/4 versions can handle this properly.
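
(To make that concrete, an illustrative trace of my own: with a = 0xffffffff
and b = 1, gcd1 does roughly one subtract-and-shift round per bit of a,
about 32 iterations on a 32-bit word, before a finally reaches b; gcd3
notices b == 1 right after the initial shift and returns at once, and gcd4
does the same through its b == r test.)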



^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [patch V4] lib: GCD: Use binary GCD algorithm instead of Euclidean
@ 2016-05-08 12:52     ` Zhaoxiu Zeng
  0 siblings, 0 replies; 41+ messages in thread
From: Zhaoxiu Zeng @ 2016-05-08 12:52 UTC (permalink / raw)
  To: George Spelvin, akpm, dalias, davem, geert, James.Bottomley,
	jjuran, peterz, sam
  Cc: linux-mips, linux-sh, heiko.carstens, linux, sparclinux,
	lennox.wu, jonas, linux-s390, linux, ysato, deller, jejb,
	mattst88, linux-snps-arc, uclinux-h8-devel, james.hogan,
	liqin.linux, linux-m68k, ink, zhaoxiu.zeng, nios2-dev,
	linux-metag, linux-arm-kernel, rth, monstr, linux-parisc, vgupta,
	linux-kernel, ralf, linux-alpha, schwidefsky, lftan

1NogMjAxNi81LzcgMTY6NDEsIEdlb3JnZSBTcGVsdmluINC0tcA6Cj4gTm90aGluZyBjcml0aWNh
bCwgYnV0IGEgYml0IG9mIGtpYml0emluZy4KPiAoVGhhdCBpcyBzbGFuZyBpbiB0aGUgWWlkZGlz
aCBsYW5ndWFnZSBmb3IgYSBwZXJzb24KPiB3aG8gb2ZmZXJzIGFubm95aW5nIGFuZCB1bndhbnRl
ZCBhZHZpY2UuKQo+Cj4+IFRoZSBiaW5hcnkgR0NEIGFsZ29yaXRobSBpcyBiYXNlZCBvbiB0aGUg
Zm9sbG93aW5nIGZhY3RzOgo+PiAJMS4gSWYgYSBhbmQgYiBhcmUgYWxsIGV2ZW5zLCB0aGVuIGdj
ZChhLGIpID0gMiAqIGdjZChhLzIsIGIvMikKPj4gCTIuIElmIGEgaXMgZXZlbiBhbmQgYiBpcyBv
ZGQsIHRoZW4gZ2NkKGEsYikgPSBnY2QoYS8yLCBiKQo+PiAJMy4gSWYgYSBhbmQgYiBhcmUgYWxs
IG9kZHMsIHRoZW4gZ2NkKGEsYikgPSBnY2QoKGEtYikvMiwgYikgPSBnY2QoKGErYikvMiwgYikK
PiAxKSAiZXZlbiIgYW5kICJvZGQiIGFyZSBhZGplY3RpdmVzLiAgSW4gRW5nbGlzaCwgYWRqZWN0
aXZlcyB0byBub3QgaGF2ZQo+ICAgIHBsdXJhbCBzdWZmaXhlcy4gIFRodXMsICJ0aGV5IGFyZSBl
dmVuIiBvciAidGhleSBhcmUgb2RkIi4KPiAyKSBBbHRob3VnaCAiYWxsIiBpcyBub3QgZXhhY3Rs
eSB3cm9uZywgaXQgc291bmRzIG9kZC4gIFNpbmNlIHRoZXJlIGFyZQo+ICAgIGV4YWN0bHkgdHdv
IG9mIHRoZW0gaXQncyBjbGVhcmVyIHRvIHNheSAiYm90aCIuCj4KPiBJZiBJIGFsc28gcmVwaHJh
c2UgdGhlIGxhc3QgbGluZSB0byBmaXQgaW50byA4MCBjb2x1bW5zLCB5b3UgZ2V0Ogo+Cj4gICBU
aGUgYmluYXJ5IEdDRCBhbGdvcml0aG0gaXMgYmFzZWQgb24gdGhlIGZvbGxvd2luZyBmYWN0czoK
PiAtIAkxLiBJZiBhIGFuZCBiIGFyZSBhbGwgZXZlbnMsIHRoZW4gZ2NkKGEsYikgPSAyICogZ2Nk
KGEvMiwgYi8yKQo+ICsgCTEuIElmIGEgYW5kIGIgYXJlIGJvdGggZXZlbiwgdGhlbiBnY2QoYSxi
KSA9IDIgKiBnY2QoYS8yLCBiLzIpCj4gIAkyLiBJZiBhIGlzIGV2ZW4gYW5kIGIgaXMgb2RkLCB0
aGVuIGdjZChhLGIpID0gZ2NkKGEvMiwgYikKPiAtCTMuIElmIGEgYW5kIGIgYXJlIGFsbCBvZGRz
LCB0aGVuIGdjZChhLGIpID0gZ2NkKChhLWIpLzIsIGIpID0gZ2NkKChhK2IpLzIsIGIpCj4gKwkz
LiBJZiBib3RoIGFyZSBvZGQsIHRoZW4gZ2NkKGEsYikgPSBnY2QoKGEtYikvMiwgYikgPSBnY2Qo
KGErYikvMiwgYikKPgo+IDMpIE5lZ2F0aXZlIGNvbmZpZyBvcHRpb25zIGFyZSBhbHdheXMgY29u
ZnVzaW5nLgo+Cj4gV291bGQgaXQgYmUgYmV0dGVyIHRvIGNhbGwgaXQgQ09ORklHX0lORUZGSUNJ
RUJOVF9GRlMsIG9yIGV2ZW4gc2ltcGxlcgo+IENPTkZJR19TTE9XX0ZGUz8KPgo+IEFsc28sIHlv
dSdyZSBhbGxvd2VkIHRvIGFkZCAiaGVscCIgdG8gYSBub24taW50ZXJhY3RpdmUgY29uZmlnIG9w
dGlvbiwKPiBhbmQgc29tZSBkb2N1bWVudGF0aW9uIG1pZ2h0IGJlIHVzZWZ1bC4KPiBFLmcuCj4K
PiArY29uZmlnIENQVV9TTE9XX0ZGUwo+ICsJZGVmX2Jvb2wgbgo+ICsJaGVscAo+ICsJICBJZiBu
LCB0aGUgQ1BVIHN1cHBvcnRzIGEgZmFzdCBfX2ZmcyAoX19idWlsdGluX2N0eikgb3BlcmF0aW9u
LAo+ICsJICBlaXRoZXIgZGlyZWN0bHkgb3IgdmlhIGEgc2hvcnQgY29kZSBzZXF1ZW5jZSB1c2lu
ZyBhIGNvdW50Cj4gKwkgIGxlYWRpbmcgemVyb3Mgb3IgcG9wdWxhdGlvbiBjb3VudCBpbnN0cnVj
dGlvbi4gIElmIHksIHRoZQo+ICsJICBvcGVyYXRpb24gaXMgZW11bGF0ZWQgYnkgc2xvd2VyIHNv
ZnR3YXJlLCBzdWNoIGFzIGFuIHVucm9sbGVkCj4gKwkgIGJpbmFyeSBzZWFyY2guCj4gKwo+ICsJ
ICBUaGlzIGlzIHB1cmVseSBhbiBvcHRpbWl6YXRpb24gb3B0aW9uOyB0aGUga2VybmVsCj4gKwkg
IHdpbGwgZnVuY3Rpb24gY29ycmVjdGx5IHJlZ2FyZGxlc3Mgb2YgaG93IGl0IGlzIHNldC4KPiAr
CgpUaGFua3MgYSBsb3QuCgo+IFlvdXIgYmVuY2htYXJrIGNvZGUgZG9lc24ndCBoYXZlIHRvIGhh
dmUgYSBzZXBhcmF0ZSBjb2RlIHBhdGggaWYKPiBfX3g4Nl82NF9fOyByZHRzYyB3b3JrcyBvbiAz
Mi1iaXQgY29kZSBqdXN0IGFzIHdlbGwuICBwYXRocy4gIEFuZCB0aGUKPiB3YXkgeW91IHN1YnRy
YWN0IHRoZSBlbmQgYW5kIHN0YXJ0IHRpbWVzIGlzIHVubmVjZXNzYXJpbHkgY29tcGxpY2F0ZWQu
Cj4gVGhlIEMgbGFuZ3VhZ2UgZ3VhcmFudGVlcyB0aGF0IHVuc2lnbmVkIGFyaXRobWV0aWMgc2lt
cGx5IHdyYXBzIG1vZHVsbwo+IDJeYml0cyBhcyBleHBlY3RlZC4KCnJkc2Mgd29ya3Mgb24geDg2
LCB0aGUgb3RoZXIgcGF0aCBpcyBwcmVwYXJlZCBmb3Igb3RoZXIgYXJjaGl0ZWN0dXJlcy4KCj4g
SGVyZSBhcmUgc29tZSBtb3JlIHRpbWluZ3MsIHdpdGggdGhlIHNhbWUgZmxhZ3MgYXMgeW91ciB0
ZXN0czoKPgo+IEZpcnN0LCAzMiBiaXQgY29kZToKPiAJCWdjZDAJZ2NkMQlnY2QyCWdjZDMJZ2Nk
NAo+IEl2eSBCcmlkZ2UJMzE1NgkxMTkyCTE3NDAJMTE2MAkxNjQwCVBBU1MKPiBBTUQgUGhlbm9t
CTcxNTAJMjU2NAkyMzQ4CTI5NzUJMjg0MwlQQVNTCj4gQ29yZSAyCQk0MTc2CTI1OTIJNDE2NAky
NjA0CTM5MDAJUEFTUwo+IFBlbnRpdW0gNAkxMTQ5Mgk0Nzg0CTc2MzIJNDg1Mgk2NDUyCVBBU1MK
Pgo+IEFuZCA2NC1iaXQgKGxvbmdlciB0aW1lcyBiZWN1YXNlIHRoZSBpbnB1dHMgYXJlIGxhcmdl
cik6Cj4gSXZ5IEJyaWRnZQkxMDYzNgkyNDk2CTM1MDAJMjQzMgkzMzYwCVBBU1MKPiBBTUQgUGhl
bm9tCTE5NDgyCTQwNTgJNjAzMAk1MDAxCTY4NDUJUEFTUwo+Cj4gTG9va2luZyBhdCB0aG9zZSwg
SSdtIG5vdCBzdXJlIGhvdyBtdWNoIGJldHRlciB0aGUgZ2NkMy80IHZlcnNpb25zIGFyZQo+IHRo
YW4gZ2NkMS8yLiAgVGhlIGRpZmZlcmVuY2Ugc2VlbXMgcHJldHR5IG1pbm9yIGFuZCBzb21ldGlt
ZXMgbmVnYXRpdmUuCj4KClRoZSB3b3JzdCBjYXNlIG9mIGJpbmFyeSBHQ0QgaXMgdGhhdCBvbmUg
bnVtYmVyIGlzIHBvd2VyIG9mIDIsCmZvciBleGFtcGxlIGEgaXMgMHhmZmZmZmZmZiAoMHhjY2Nj
Y2NjYyBmb3IgdGhlICJldmVuL29kZCIgdmFyaWFudCkgYW5kIGIgaXMgMS4KClRoZSBnY2QzLzQg
dmVyc2lvbnMgY2FuIGhhbmRsZSB0aGlzIHByb3Blcmx5LgoKCgpfX19fX19fX19fX19fX19fX19f
X19fX19fX19fX19fX19fX19fX19fX19fX19fXwpsaW51eC1zbnBzLWFyYyBtYWlsaW5nIGxpc3QK
bGludXgtc25wcy1hcmNAbGlzdHMuaW5mcmFkZWFkLm9yZwpodHRwOi8vbGlzdHMuaW5mcmFkZWFk
Lm9yZy9tYWlsbWFuL2xpc3RpbmZvL2xpbnV4LXNucHMtYXJj

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [patch V4] lib: GCD: Use binary GCD algorithm instead of Euclidean
@ 2016-05-08 12:52     ` Zhaoxiu Zeng
  0 siblings, 0 replies; 41+ messages in thread
From: Zhaoxiu Zeng @ 2016-05-08 12:52 UTC (permalink / raw)
  To: George Spelvin, akpm, dalias, davem, geert, James.Bottomley,
	jjuran, peterz, sam
  Cc: deller, heiko.carstens, ink, james.hogan, jejb, jonas, lennox.wu,
	lftan, linux-alpha, linux-arm-kernel, linux-kernel, linux-m68k,
	linux-metag, linux-mips, linux-parisc, linux-s390, linux-sh,
	linux-snps-arc, linux, linux, liqin.linux, mattst88, monstr,
	nios2-dev, ralf, rth, schwidefsky, sparclinux, uclinux-h8-devel,
	vgupta, ysato, zhaoxiu.zeng

在 2016/5/7 16:41, George Spelvin 写道:
> Nothing critical, but a bit of kibitzing.
> (That is slang in the Yiddish language for a person
> who offers annoying and unwanted advice.)
>
>> The binary GCD algorithm is based on the following facts:
>> 	1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2)
>> 	2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
>> 	3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)
> 1) "even" and "odd" are adjectives.  In English, adjectives to not have
>    plural suffixes.  Thus, "they are even" or "they are odd".
> 2) Although "all" is not exactly wrong, it sounds odd.  Since there are
>    exactly two of them it's clearer to say "both".
>
> If I also rephrase the last line to fit into 80 columns, you get:
>
>   The binary GCD algorithm is based on the following facts:
> - 	1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2)
> + 	1. If a and b are both even, then gcd(a,b) = 2 * gcd(a/2, b/2)
>  	2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
> -	3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)
> +	3. If both are odd, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)
>
> 3) Negative config options are always confusing.
>
> Would it be better to call it CONFIG_INEFFICIEBNT_FFS, or even simpler
> CONFIG_SLOW_FFS?
>
> Also, you're allowed to add "help" to a non-interactive config option,
> and some documentation might be useful.
> E.g.
>
> +config CPU_SLOW_FFS
> +	def_bool n
> +	help
> +	  If n, the CPU supports a fast __ffs (__builtin_ctz) operation,
> +	  either directly or via a short code sequence using a count
> +	  leading zeros or population count instruction.  If y, the
> +	  operation is emulated by slower software, such as an unrolled
> +	  binary search.
> +
> +	  This is purely an optimization option; the kernel
> +	  will function correctly regardless of how it is set.
> +

Thanks a lot.

> Your benchmark code doesn't have to have a separate code path if
> __x86_64__; rdtsc works on 32-bit code just as well.  paths.  And the
> way you subtract the end and start times is unnecessarily complicated.
> The C language guarantees that unsigned arithmetic simply wraps modulo
> 2^bits as expected.

rdsc works on x86, the other path is prepared for other architectures.

> Here are some more timings, with the same flags as your tests:
>
> First, 32 bit code:
> 		gcd0	gcd1	gcd2	gcd3	gcd4
> Ivy Bridge	3156	1192	1740	1160	1640	PASS
> AMD Phenom	7150	2564	2348	2975	2843	PASS
> Core 2		4176	2592	4164	2604	3900	PASS
> Pentium 4	11492	4784	7632	4852	6452	PASS
>
> And 64-bit (longer times becuase the inputs are larger):
> Ivy Bridge	10636	2496	3500	2432	3360	PASS
> AMD Phenom	19482	4058	6030	5001	6845	PASS
>
> Looking at those, I'm not sure how much better the gcd3/4 versions are
> than gcd1/2.  The difference seems pretty minor and sometimes negative.
>

The worst case of binary GCD is that one number is power of 2,
for example a is 0xffffffff (0xcccccccc for the "even/odd" variant) and b is 1.

The gcd3/4 versions can handle this properly.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [patch V4] lib: GCD: Use binary GCD algorithm instead of Euclidean
  2016-05-07  8:41   ` George Spelvin
                     ` (6 preceding siblings ...)
  (?)
@ 2016-05-08 12:52   ` Zhaoxiu Zeng
  -1 siblings, 0 replies; 41+ messages in thread
From: Zhaoxiu Zeng @ 2016-05-08 12:52 UTC (permalink / raw)
  To: George Spelvin, akpm, dalias, davem, geert, James.Bottomley,
	jjuran, peterz, sam
  Cc: deller, heiko.carstens, ink, james.hogan, jejb, jonas, lennox.wu,
	lftan, linux-alpha, linux-arm-kernel, linux-kernel, linux-m68k,
	linux-metag, linux-mips, linux-parisc, linux-s390, linux-sh,
	linux-snps-arc, linux, linux, liqin.linux, mattst88, monstr,
	nios2-dev, ralf, rth, schwidefsky, sparclinux, uclinux-h8-devel,
	vgupta, ysato, zhaoxiu.zeng

在 2016/5/7 16:41, George Spelvin 写道:
> Nothing critical, but a bit of kibitzing.
> (That is slang in the Yiddish language for a person
> who offers annoying and unwanted advice.)
>
>> The binary GCD algorithm is based on the following facts:
>> 	1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2)
>> 	2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
>> 	3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)
> 1) "even" and "odd" are adjectives.  In English, adjectives to not have
>    plural suffixes.  Thus, "they are even" or "they are odd".
> 2) Although "all" is not exactly wrong, it sounds odd.  Since there are
>    exactly two of them it's clearer to say "both".
>
> If I also rephrase the last line to fit into 80 columns, you get:
>
>   The binary GCD algorithm is based on the following facts:
> - 	1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2)
> + 	1. If a and b are both even, then gcd(a,b) = 2 * gcd(a/2, b/2)
>  	2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
> -	3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)
> +	3. If both are odd, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)
>
> 3) Negative config options are always confusing.
>
> Would it be better to call it CONFIG_INEFFICIEBNT_FFS, or even simpler
> CONFIG_SLOW_FFS?
>
> Also, you're allowed to add "help" to a non-interactive config option,
> and some documentation might be useful.
> E.g.
>
> +config CPU_SLOW_FFS
> +	def_bool n
> +	help
> +	  If n, the CPU supports a fast __ffs (__builtin_ctz) operation,
> +	  either directly or via a short code sequence using a count
> +	  leading zeros or population count instruction.  If y, the
> +	  operation is emulated by slower software, such as an unrolled
> +	  binary search.
> +
> +	  This is purely an optimization option; the kernel
> +	  will function correctly regardless of how it is set.
> +

Thanks a lot.

> Your benchmark code doesn't have to have a separate code path if
> __x86_64__; rdtsc works on 32-bit code just as well.  paths.  And the
> way you subtract the end and start times is unnecessarily complicated.
> The C language guarantees that unsigned arithmetic simply wraps modulo
> 2^bits as expected.

rdsc works on x86, the other path is prepared for other architectures.

> Here are some more timings, with the same flags as your tests:
>
> First, 32 bit code:
> 		gcd0	gcd1	gcd2	gcd3	gcd4
> Ivy Bridge	3156	1192	1740	1160	1640	PASS
> AMD Phenom	7150	2564	2348	2975	2843	PASS
> Core 2		4176	2592	4164	2604	3900	PASS
> Pentium 4	11492	4784	7632	4852	6452	PASS
>
> And 64-bit (longer times becuase the inputs are larger):
> Ivy Bridge	10636	2496	3500	2432	3360	PASS
> AMD Phenom	19482	4058	6030	5001	6845	PASS
>
> Looking at those, I'm not sure how much better the gcd3/4 versions are
> than gcd1/2.  The difference seems pretty minor and sometimes negative.
>

The worst case of binary GCD is that one number is power of 2,
for example a is 0xffffffff (0xcccccccc for the "even/odd" variant) and b is 1.

The gcd3/4 versions can handle this properly.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [patch V4] lib: GCD: Use binary GCD algorithm instead of Euclidean
@ 2016-05-08 12:52     ` Zhaoxiu Zeng
  0 siblings, 0 replies; 41+ messages in thread
From: Zhaoxiu Zeng @ 2016-05-08 12:52 UTC (permalink / raw)
  To: linux-snps-arc

? 2016/5/7 16:41, George Spelvin ??:
> Nothing critical, but a bit of kibitzing.
> (That is slang in the Yiddish language for a person
> who offers annoying and unwanted advice.)
>
>> The binary GCD algorithm is based on the following facts:
>> 	1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2)
>> 	2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
>> 	3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)
> 1) "even" and "odd" are adjectives.  In English, adjectives to not have
>    plural suffixes.  Thus, "they are even" or "they are odd".
> 2) Although "all" is not exactly wrong, it sounds odd.  Since there are
>    exactly two of them it's clearer to say "both".
>
> If I also rephrase the last line to fit into 80 columns, you get:
>
>   The binary GCD algorithm is based on the following facts:
> - 	1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2)
> + 	1. If a and b are both even, then gcd(a,b) = 2 * gcd(a/2, b/2)
>  	2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
> -	3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)
> +	3. If both are odd, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)
>
> 3) Negative config options are always confusing.
>
> Would it be better to call it CONFIG_INEFFICIEBNT_FFS, or even simpler
> CONFIG_SLOW_FFS?
>
> Also, you're allowed to add "help" to a non-interactive config option,
> and some documentation might be useful.
> E.g.
>
> +config CPU_SLOW_FFS
> +	def_bool n
> +	help
> +	  If n, the CPU supports a fast __ffs (__builtin_ctz) operation,
> +	  either directly or via a short code sequence using a count
> +	  leading zeros or population count instruction.  If y, the
> +	  operation is emulated by slower software, such as an unrolled
> +	  binary search.
> +
> +	  This is purely an optimization option; the kernel
> +	  will function correctly regardless of how it is set.
> +

Thanks a lot.

> Your benchmark code doesn't have to have a separate code path if
> __x86_64__; rdtsc works on 32-bit code just as well.  paths.  And the
> way you subtract the end and start times is unnecessarily complicated.
> The C language guarantees that unsigned arithmetic simply wraps modulo
> 2^bits as expected.

rdsc works on x86, the other path is prepared for other architectures.

> Here are some more timings, with the same flags as your tests:
>
> First, 32 bit code:
> 		gcd0	gcd1	gcd2	gcd3	gcd4
> Ivy Bridge	3156	1192	1740	1160	1640	PASS
> AMD Phenom	7150	2564	2348	2975	2843	PASS
> Core 2		4176	2592	4164	2604	3900	PASS
> Pentium 4	11492	4784	7632	4852	6452	PASS
>
> And 64-bit (longer times becuase the inputs are larger):
> Ivy Bridge	10636	2496	3500	2432	3360	PASS
> AMD Phenom	19482	4058	6030	5001	6845	PASS
>
> Looking at those, I'm not sure how much better the gcd3/4 versions are
> than gcd1/2.  The difference seems pretty minor and sometimes negative.
>

The worst case of binary GCD is that one number is power of 2,
for example a is 0xffffffff (0xcccccccc for the "even/odd" variant) and b is 1.

The gcd3/4 versions can handle this properly.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [patch V4] lib: GCD: Use binary GCD algorithm instead of Euclidean
@ 2016-05-08 12:52     ` Zhaoxiu Zeng
  0 siblings, 0 replies; 41+ messages in thread
From: Zhaoxiu Zeng @ 2016-05-08 12:52 UTC (permalink / raw)
  To: linux-arm-kernel

? 2016/5/7 16:41, George Spelvin ??:
> Nothing critical, but a bit of kibitzing.
> (That is slang in the Yiddish language for a person
> who offers annoying and unwanted advice.)
>
>> The binary GCD algorithm is based on the following facts:
>> 	1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2)
>> 	2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
>> 	3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)
> 1) "even" and "odd" are adjectives.  In English, adjectives to not have
>    plural suffixes.  Thus, "they are even" or "they are odd".
> 2) Although "all" is not exactly wrong, it sounds odd.  Since there are
>    exactly two of them it's clearer to say "both".
>
> If I also rephrase the last line to fit into 80 columns, you get:
>
>   The binary GCD algorithm is based on the following facts:
> - 	1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2)
> + 	1. If a and b are both even, then gcd(a,b) = 2 * gcd(a/2, b/2)
>  	2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
> -	3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)
> +	3. If both are odd, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)
>
> 3) Negative config options are always confusing.
>
> Would it be better to call it CONFIG_INEFFICIEBNT_FFS, or even simpler
> CONFIG_SLOW_FFS?
>
> Also, you're allowed to add "help" to a non-interactive config option,
> and some documentation might be useful.
> E.g.
>
> +config CPU_SLOW_FFS
> +	def_bool n
> +	help
> +	  If n, the CPU supports a fast __ffs (__builtin_ctz) operation,
> +	  either directly or via a short code sequence using a count
> +	  leading zeros or population count instruction.  If y, the
> +	  operation is emulated by slower software, such as an unrolled
> +	  binary search.
> +
> +	  This is purely an optimization option; the kernel
> +	  will function correctly regardless of how it is set.
> +

Thanks a lot.

> Your benchmark code doesn't have to have a separate code path if
> __x86_64__; rdtsc works on 32-bit code just as well.  paths.  And the
> way you subtract the end and start times is unnecessarily complicated.
> The C language guarantees that unsigned arithmetic simply wraps modulo
> 2^bits as expected.

rdsc works on x86, the other path is prepared for other architectures.

> Here are some more timings, with the same flags as your tests:
>
> First, 32 bit code:
> 		gcd0	gcd1	gcd2	gcd3	gcd4
> Ivy Bridge	3156	1192	1740	1160	1640	PASS
> AMD Phenom	7150	2564	2348	2975	2843	PASS
> Core 2		4176	2592	4164	2604	3900	PASS
> Pentium 4	11492	4784	7632	4852	6452	PASS
>
> And 64-bit (longer times becuase the inputs are larger):
> Ivy Bridge	10636	2496	3500	2432	3360	PASS
> AMD Phenom	19482	4058	6030	5001	6845	PASS
>
> Looking at those, I'm not sure how much better the gcd3/4 versions are
> than gcd1/2.  The difference seems pretty minor and sometimes negative.
>

The worst case of binary GCD is that one number is power of 2,
for example a is 0xffffffff (0xcccccccc for the "even/odd" variant) and b is 1.

The gcd3/4 versions can handle this properly.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [patch V4] lib: GCD: Use binary GCD algorithm instead of Euclidean
@ 2016-05-08 12:52     ` Zhaoxiu Zeng
  0 siblings, 0 replies; 41+ messages in thread
From: Zhaoxiu Zeng @ 2016-05-08 12:52 UTC (permalink / raw)
  To: George Spelvin, akpm, dalias, davem, geert, James.Bottomley,
	jjuran, peterz, sam
  Cc: linux-mips, linux-sh, heiko.carstens, linux, sparclinux,
	lennox.wu, jonas, linux-s390, linux, ysato, deller, jejb,
	mattst88, linux-snps-arc, uclinux-h8-devel, james.hogan,
	liqin.linux, linux-m68k, ink, zhaoxiu.zeng, nios2-dev,
	linux-metag, linux-arm-kernel, rth, monstr, linux-parisc, vgupta,
	linux-kernel, ralf, linux-alpha, schwidefsky, lftan

ÔÚ 2016/5/7 16:41, George Spelvin дµÀ:
> Nothing critical, but a bit of kibitzing.
> (That is slang in the Yiddish language for a person
> who offers annoying and unwanted advice.)
>
>> The binary GCD algorithm is based on the following facts:
>> 	1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2)
>> 	2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
>> 	3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)
> 1) "even" and "odd" are adjectives.  In English, adjectives to not have
>    plural suffixes.  Thus, "they are even" or "they are odd".
> 2) Although "all" is not exactly wrong, it sounds odd.  Since there are
>    exactly two of them it's clearer to say "both".
>
> If I also rephrase the last line to fit into 80 columns, you get:
>
>   The binary GCD algorithm is based on the following facts:
> - 	1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2)
> + 	1. If a and b are both even, then gcd(a,b) = 2 * gcd(a/2, b/2)
>  	2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
> -	3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)
> +	3. If both are odd, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)
>
> 3) Negative config options are always confusing.
>
> Would it be better to call it CONFIG_INEFFICIEBNT_FFS, or even simpler
> CONFIG_SLOW_FFS?
>
> Also, you're allowed to add "help" to a non-interactive config option,
> and some documentation might be useful.
> E.g.
>
> +config CPU_SLOW_FFS
> +	def_bool n
> +	help
> +	  If n, the CPU supports a fast __ffs (__builtin_ctz) operation,
> +	  either directly or via a short code sequence using a count
> +	  leading zeros or population count instruction.  If y, the
> +	  operation is emulated by slower software, such as an unrolled
> +	  binary search.
> +
> +	  This is purely an optimization option; the kernel
> +	  will function correctly regardless of how it is set.
> +

Thanks a lot.

> Your benchmark code doesn't have to have a separate code path if
> __x86_64__; rdtsc works on 32-bit code just as well.  paths.  And the
> way you subtract the end and start times is unnecessarily complicated.
> The C language guarantees that unsigned arithmetic simply wraps modulo
> 2^bits as expected.

rdsc works on x86, the other path is prepared for other architectures.

> Here are some more timings, with the same flags as your tests:
>
> First, 32 bit code:
> 		gcd0	gcd1	gcd2	gcd3	gcd4
> Ivy Bridge	3156	1192	1740	1160	1640	PASS
> AMD Phenom	7150	2564	2348	2975	2843	PASS
> Core 2		4176	2592	4164	2604	3900	PASS
> Pentium 4	11492	4784	7632	4852	6452	PASS
>
> And 64-bit (longer times becuase the inputs are larger):
> Ivy Bridge	10636	2496	3500	2432	3360	PASS
> AMD Phenom	19482	4058	6030	5001	6845	PASS
>
> Looking at those, I'm not sure how much better the gcd3/4 versions are
> than gcd1/2.  The difference seems pretty minor and sometimes negative.
>

The worst case for the binary GCD is when one number is a power of 2,
for example a = 0xffffffff (0xcccccccc for the "even/odd" variant) and b = 1.

The gcd3/gcd4 versions handle this properly: they return as soon as one
operand has been reduced to a power of 2.
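
To make the difference concrete, here is a small sketch that mirrors the
gcd1 variant from the benchmark and counts its subtract-and-shift rounds
for this worst case (illustrative only; gcd1_count and the driver are made
up for this note, and __builtin_ctzl assumes gcc/clang):

	#include <stdio.h>

	/* gcd1-style loop (no power-of-2 early exit), instrumented with a
	 * round counter. */
	static unsigned long gcd1_count(unsigned long a, unsigned long b,
					unsigned int *rounds)
	{
		unsigned long r = a | b;

		*rounds = 0;
		if (!a || !b)
			return r;

		b >>= __builtin_ctzl(b);
		for (;;) {
			a >>= __builtin_ctzl(a);
			if (a == b)
				return a << __builtin_ctzl(r);
			if (a < b) {
				unsigned long t = a;
				a = b;
				b = t;
			}
			a -= b;
			++*rounds;
		}
	}

	int main(void)
	{
		unsigned int n;
		unsigned long g = gcd1_count(0xffffffffUL, 1UL, &n);

		/* gcd3 returns r & -r == 1 before entering its loop, because b
		 * reduces to 1 up front; gcd1 grinds through ~31 rounds. */
		printf("gcd1(0xffffffff, 1) = %lu after %u rounds\n", g, n);
		return 0;
	}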




^ permalink raw reply	[flat|nested] 41+ messages in thread

* [patch V4] lib: GCD: Use binary GCD algorithm instead of Euclidean
@ 2016-05-06  9:42 zengzhaoxiu
  0 siblings, 0 replies; 41+ messages in thread
From: zengzhaoxiu @ 2016-05-06  9:42 UTC (permalink / raw)
  To: akpm, linux, peterz, jjuran, James.Bottomley, geert, dalias, sam, davem
  Cc: Zhaoxiu Zeng, Richard Henderson, Ivan Kokshaysky, Matt Turner,
	Vineet Gupta, Russell King, Yoshinori Sato, James Hogan,
	Michal Simek, Ralf Baechle, Ley Foon Tan, Jonas Bonn,
	James E.J. Bottomley, Helge Deller, Martin Schwidefsky,
	Heiko Carstens, Chen Liqin, Lennox Wu, linux-kernel, linux-alpha,
	linux-snps-arc, linux-arm-kernel, uclinux-h8-devel, linux-m68k

From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

The binary GCD algorithm is based on the following facts (a recursive
sketch of them follows the list):
	1. If a and b are both even, then gcd(a,b) = 2 * gcd(a/2, b/2)
	2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
	3. If both are odd, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)
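
For illustration, the three facts translate directly into the following
recursive sketch (not part of the patch; the iterative versions actually
used and benchmarked appear below):

	/* Recursive rendering of facts 1-3 above, for illustration only. */
	unsigned long gcd_by_facts(unsigned long a, unsigned long b)
	{
		if (a == 0)
			return b;
		if (b == 0)
			return a;
		if (!(a & 1) && !(b & 1))	/* fact 1: both even */
			return gcd_by_facts(a >> 1, b >> 1) << 1;
		if (!(a & 1))			/* fact 2: a even, b odd */
			return gcd_by_facts(a >> 1, b);
		if (!(b & 1))			/* fact 2: a odd, b even */
			return gcd_by_facts(a, b >> 1);
		if (a >= b)			/* fact 3: both odd */
			return gcd_by_facts((a - b) >> 1, b);
		return gcd_by_facts((b - a) >> 1, a);
	}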

Even on x86 machines with reasonable division hardware, the binary
algorithm runs about 25% faster (80% of the execution time) than the
division-based Euclidean algorithm.

On platforms like Alpha and ARMv6 where division is a function call to
emulation code, it's even more significant.

There are two variants of the code here, depending on whether a
fast __ffs (find least significant set bit) instruction is available.
This allows the unpredictable branches in the bit-at-a-time shifting
loop to be eliminated.

If fast __ffs is not available, the "even/odd" GCD variant is used.

I use the following code to benchmark:

	#include <stdio.h>
	#include <stdlib.h>
	#include <stdint.h>
	#include <string.h>
	#include <time.h>
	#include <unistd.h>

	#define swap(a, b) \
		do { \
			a ^= b; \
			b ^= a; \
			a ^= b; \
		} while (0)

	unsigned long gcd0(unsigned long a, unsigned long b)
	{
		unsigned long r;

		if (a < b) {
			swap(a, b);
		}

		if (b == 0)
			return a;

		while ((r = a % b) != 0) {
			a = b;
			b = r;
		}

		return b;
	}

	unsigned long gcd1(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;
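		/* r = a|b is returned as-is when a or b is 0; its lowest set bit
		 * is the largest power of 2 dividing both, shifted back in on
		 * return. */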

		if (!a || !b)
			return r;

		b >>= __builtin_ctzl(b);

		for (;;) {
			a >>= __builtin_ctzl(a);
			if (a == b)
				return a << __builtin_ctzl(r);

			if (a < b)
				swap(a, b);
			a -= b;
		}
	}

	unsigned long gcd2(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		r &= -r;

		while (!(b & r))
			b >>= 1;

		for (;;) {
			while (!(a & r))
				a >>= 1;
			if (a == b)
				return a;

			if (a < b)
				swap(a, b);
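			/* By fact 3, either (a-b)/2 or (a+b)/2 may be used; they
			 * differ by b, an odd multiple of r, so exactly one of them
			 * is a multiple of 2r.  Pick that one so the second shift
			 * keeps a a multiple of r. */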
			a -= b;
			a >>= 1;
			if (a & r)
				a += b;
			a >>= 1;
		}
	}

	unsigned long gcd3(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		b >>= __builtin_ctzl(b);
		if (b == 1)
			return r & -r;

		for (;;) {
			a >>= __builtin_ctzl(a);
			if (a == 1)
				return r & -r;
			if (a == b)
				return a << __builtin_ctzl(r);

			if (a < b)
				swap(a, b);
			a -= b;
		}
	}

	unsigned long gcd4(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		r &= -r;

		while (!(b & r))
			b >>= 1;
		if (b == r)
			return r;

		for (;;) {
			while (!(a & r))
				a >>= 1;
			if (a == r)
				return r;
			if (a == b)
				return a;

			if (a < b)
				swap(a, b);
			a -= b;
			a >>= 1;
			if (a & r)
				a += b;
			a >>= 1;
		}
	}

	static unsigned long (*gcd_func[])(unsigned long a, unsigned long b) = {
		gcd0, gcd1, gcd2, gcd3, gcd4,
	};

	#define TEST_ENTRIES (sizeof(gcd_func) / sizeof(gcd_func[0]))

	#if defined(__x86_64__)

	#define rdtscll(val) do { \
		unsigned long __a,__d; \
		__asm__ __volatile__("rdtsc" : "=a" (__a), "=d" (__d)); \
		(val) = ((unsigned long long)__a) | (((unsigned long long)__d)<<32); \
	} while(0)

	static unsigned long long benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long),
								unsigned long a, unsigned long b, unsigned long *res)
	{
		unsigned long long start, end;
		unsigned long long ret;
		unsigned long gcd_res;

		rdtscll(start);
		gcd_res = gcd(a, b);
		rdtscll(end);

		if (end >= start)
			ret = end - start;
		else
			ret = ~0ULL - start + 1 + end;

		*res = gcd_res;
		return ret;
	}

	#else

	static inline struct timespec read_time(void)
	{
		struct timespec time;
		clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time);
		return time;
	}

	static inline unsigned long long diff_time(struct timespec start, struct timespec end)
	{
		struct timespec temp;

		if ((end.tv_nsec - start.tv_nsec) < 0) {
			temp.tv_sec = end.tv_sec - start.tv_sec - 1;
			temp.tv_nsec = 1000000000ULL + end.tv_nsec - start.tv_nsec;
		} else {
			temp.tv_sec = end.tv_sec - start.tv_sec;
			temp.tv_nsec = end.tv_nsec - start.tv_nsec;
		}

		return temp.tv_sec * 1000000000ULL + temp.tv_nsec;
	}

	static unsigned long long benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long),
								unsigned long a, unsigned long b, unsigned long *res)
	{
		struct timespec start, end;
		unsigned long gcd_res;

		start = read_time();
		gcd_res = gcd(a, b);
		end = read_time();

		*res = gcd_res;
		return diff_time(start, end);
	}

	#endif

	static inline unsigned long get_rand()
	{
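		/* rand() returns at most RAND_MAX (commonly 2^31 - 1); on 64-bit
		 * targets, combine two calls to fill more of the word. */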
		if (sizeof(long) == 8)
			return (unsigned long)((unsigned long long)rand() << 32 | rand());
		else
			return rand();
	}

	int main(int argc, char **argv)
	{
		unsigned int seed = time(0);
		int loops = 100;
		int repeats = 1000;
		unsigned long (*res)[TEST_ENTRIES];
		unsigned long long elapsed[TEST_ENTRIES];
		int i, j, k;

		for (;;) {
			int opt = getopt(argc, argv, "n:r:s:");
			/* End condition always first */
			if (opt == -1)
				break;

			switch (opt) {
			case 'n':
				loops = atoi(optarg);
				break;
			case 'r':
				repeats = atoi(optarg);
				break;
			case 's':
				seed = strtoul(optarg, NULL, 10);
				break;
			default:
				/* getopt() returns '?' for unrecognized options; ignore them. */
				break;
			}
		}

		res = malloc(sizeof(unsigned long) * TEST_ENTRIES * loops);
		memset(elapsed, 0, sizeof(elapsed));

		srand(seed);
		for (j = 0; j < loops; j++) {
			unsigned long a = get_rand();
			/* Do we have args? */
			unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
			unsigned long long min_elapsed[TEST_ENTRIES];
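			/* Keep the fastest of all repeats to filter out scheduler
			 * and timer noise. */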
			for (k = 0; k < repeats; k++) {
				for (i = 0; i < TEST_ENTRIES; i++) {
					unsigned long long tmp = benchmark_gcd_func(gcd_func[i], a, b, &res[j][i]);
					if (k == 0 || min_elapsed[i] > tmp)
						min_elapsed[i] = tmp;
				}
			}
			for (i = 0; i < TEST_ENTRIES; i++)
				elapsed[i] += min_elapsed[i];
		}

		for (i = 0; i < TEST_ENTRIES; i++)
			printf("gcd%d: elapsed %llu\n", i, elapsed[i]);

		k = 0;
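		/* Replay the same pseudo-random inputs (same seed) and check that
		 * every gcd variant produced the same result; k doubles as an
		 * error flag. */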
		srand(seed);
		for (j = 0; j < loops; j++) {
			unsigned long a = get_rand();
			unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
			for (i = 1; i < TEST_ENTRIES; i++) {
				if (res[j][i] != res[j][0])
					break;
			}
			if (i < TEST_ENTRIES) {
				if (k == 0) {
					k = 1;
					fprintf(stderr, "Error:\n");
				}
				fprintf(stderr, "gcd(%lu, %lu): ", a, b);
				for (i = 0; i < TEST_ENTRIES; i++)
				fprintf(stderr, "%lu%s", res[j][i], i < TEST_ENTRIES - 1 ? ", " : "\n");
			}
		}

		if (k == 0)
			fprintf(stderr, "PASS\n");

		free(res);

		return 0;
	}

Compiled with "-O2" and run on "VirtualBox 4.4.0-22-generic #38-Ubuntu x86_64", I got:

zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
gcd0: elapsed 10174
gcd1: elapsed 2120
gcd2: elapsed 2902
gcd3: elapsed 2039
gcd4: elapsed 2812
PASS
zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
gcd0: elapsed 9309
gcd1: elapsed 2280
gcd2: elapsed 2822
gcd3: elapsed 2217
gcd4: elapsed 2710
PASS
zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
gcd0: elapsed 9589
gcd1: elapsed 2098
gcd2: elapsed 2815
gcd3: elapsed 2030
gcd4: elapsed 2718
PASS
zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
gcd0: elapsed 9914
gcd1: elapsed 2309
gcd2: elapsed 2779
gcd3: elapsed 2228
gcd4: elapsed 2709
PASS

Changes to V3:
- Fix build error
- Select CPU_NO_EFFICIENT_FFS if CONFIG_ARC && CONFIG_ISA_ARCOMPACT
- Select CPU_NO_EFFICIENT_FFS if CONFIG_S390 && !CONFIG_HAVE_MARCH_Z9_109_FEATURES
- Do new benchmarks
- Return immediately if one number becomes a power of 2
- Add comments written by George Spelvin

Changes to V2:
- Add a new Kconfig variable CPU_NO_EFFICIENT_FFS
- Separate into two versions by CPU_NO_EFFICIENT_FFS
- Return directly from the loop, rather than using break.
- Use "r &= -r" mostly because it's clearer.

Changes to V1:
- Don't touch Kconfig, remove the Euclidean algorithm implementation
- Don't use the "even-odd" variant
- Use __ffs if the CPU has efficient __ffs

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
Signed-off-by: George Spelvin <linux@horizon.com>
---
 arch/Kconfig                         |  3 ++
 arch/alpha/Kconfig                   |  1 +
 arch/arc/Kconfig                     |  1 +
 arch/arm/mm/Kconfig                  |  3 ++
 arch/h8300/Kconfig                   |  1 +
 arch/m32r/Kconfig                    |  1 +
 arch/m68k/Kconfig.cpu                | 11 ++++++
 arch/metag/Kconfig                   |  1 +
 arch/microblaze/Kconfig              |  1 +
 arch/mips/include/asm/cpu-features.h | 10 +++++
 arch/nios2/Kconfig                   |  1 +
 arch/openrisc/Kconfig                |  1 +
 arch/parisc/Kconfig                  |  1 +
 arch/s390/Kconfig                    |  2 +-
 arch/score/Kconfig                   |  1 +
 arch/sh/Kconfig                      |  1 +
 arch/sparc/Kconfig                   |  1 +
 lib/gcd.c                            | 77 +++++++++++++++++++++++++++++++-----
 18 files changed, 107 insertions(+), 11 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 81869a5..275f17d 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -638,4 +638,7 @@ config COMPAT_OLD_SIGACTION
 config ARCH_NO_COHERENT_DMA_MMAP
 	bool
 
+config CPU_NO_EFFICIENT_FFS
+	def_bool n
+
 source "kernel/gcov/Kconfig"
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 9d8a858..44e6f05 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -27,6 +27,7 @@ config ALPHA
 	select MODULES_USE_ELF_RELA
 	select ODD_RT_SIGACTION
 	select OLD_SIGSUSPEND
+	select CPU_NO_EFFICIENT_FFS if !ALPHA_EV67
 	help
 	  The Alpha is a 64-bit general-purpose processor designed and
 	  marketed by the Digital Equipment Corporation of blessed memory,
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index ec4791e..f41eb4c 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -101,6 +101,7 @@ choice
 
 config ISA_ARCOMPACT
 	bool "ARCompact ISA"
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  The original ARC ISA of ARC600/700 cores
 
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index 5534766..cb569b6 100644
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -421,18 +421,21 @@ config CPU_32v3
 	select CPU_USE_DOMAINS if MMU
 	select NEED_KUSER_HELPERS
 	select TLS_REG_EMUL if SMP || !MMU
+	select CPU_NO_EFFICIENT_FFS
 
 config CPU_32v4
 	bool
 	select CPU_USE_DOMAINS if MMU
 	select NEED_KUSER_HELPERS
 	select TLS_REG_EMUL if SMP || !MMU
+	select CPU_NO_EFFICIENT_FFS
 
 config CPU_32v4T
 	bool
 	select CPU_USE_DOMAINS if MMU
 	select NEED_KUSER_HELPERS
 	select TLS_REG_EMUL if SMP || !MMU
+	select CPU_NO_EFFICIENT_FFS
 
 config CPU_32v5
 	bool
diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig
index 986ea84..aa232de 100644
--- a/arch/h8300/Kconfig
+++ b/arch/h8300/Kconfig
@@ -20,6 +20,7 @@ config H8300
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_LZO
 	select HAVE_ARCH_KGDB
+	select CPU_NO_EFFICIENT_FFS
 
 config RWSEM_GENERIC_SPINLOCK
 	def_bool y
diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig
index c82b292..3cc8498 100644
--- a/arch/m32r/Kconfig
+++ b/arch/m32r/Kconfig
@@ -17,6 +17,7 @@ config M32R
 	select ARCH_USES_GETTIMEOFFSET
 	select MODULES_USE_ELF_RELA
 	select HAVE_DEBUG_STACKOVERFLOW
+	select CPU_NO_EFFICIENT_FFS
 
 config SBUS
 	bool
diff --git a/arch/m68k/Kconfig.cpu b/arch/m68k/Kconfig.cpu
index 0dfcf12..0b6efe8 100644
--- a/arch/m68k/Kconfig.cpu
+++ b/arch/m68k/Kconfig.cpu
@@ -40,6 +40,7 @@ config M68000
 	select CPU_HAS_NO_MULDIV64
 	select CPU_HAS_NO_UNALIGNED
 	select GENERIC_CSUM
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  The Freescale (was Motorola) 68000 CPU is the first generation of
 	  the well known M68K family of processors. The CPU core as well as
@@ -51,6 +52,7 @@ config MCPU32
 	bool
 	select CPU_HAS_NO_BITFIELDS
 	select CPU_HAS_NO_UNALIGNED
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  The Freescale (was then Motorola) CPU32 is a CPU core that is
 	  based on the 68020 processor. For the most part it is used in
@@ -130,6 +132,7 @@ config M5206
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5206 processor support.
 
@@ -138,6 +141,7 @@ config M5206e
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5206e processor support.
 
@@ -163,6 +167,7 @@ config M5249
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5249 processor support.
 
@@ -171,6 +176,7 @@ config M525x
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Freescale (Motorola) Coldfire 5251/5253 processor support.
 
@@ -189,6 +195,7 @@ config M5272
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5272 processor support.
 
@@ -217,6 +224,7 @@ config M5307
 	select COLDFIRE_SW_A7
 	select HAVE_CACHE_CB
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5307 processor support.
 
@@ -242,6 +250,7 @@ config M5407
 	select COLDFIRE_SW_A7
 	select HAVE_CACHE_CB
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5407 processor support.
 
@@ -251,6 +260,7 @@ config M547x
 	select MMU_COLDFIRE if MMU
 	select HAVE_CACHE_CB
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Freescale ColdFire 5470/5471/5472/5473/5474/5475 processor support.
 
@@ -260,6 +270,7 @@ config M548x
 	select M54xx
 	select HAVE_CACHE_CB
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Freescale ColdFire 5480/5481/5482/5483/5484/5485 processor support.
 
diff --git a/arch/metag/Kconfig b/arch/metag/Kconfig
index a0fa88d..2ac2de6 100644
--- a/arch/metag/Kconfig
+++ b/arch/metag/Kconfig
@@ -29,6 +29,7 @@ config METAG
 	select OF
 	select OF_EARLY_FLATTREE
 	select SPARSE_IRQ
+	select CPU_NO_EFFICIENT_FFS
 
 config STACKTRACE_SUPPORT
 	def_bool y
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index 3d793b5..f17c3a4 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -32,6 +32,7 @@ config MICROBLAZE
 	select OF_EARLY_FLATTREE
 	select TRACING_SUPPORT
 	select VIRT_TO_BUS
+	select CPU_NO_EFFICIENT_FFS
 
 config SWAP
 	def_bool n
diff --git a/arch/mips/include/asm/cpu-features.h b/arch/mips/include/asm/cpu-features.h
index eeec8c8..e20e100 100644
--- a/arch/mips/include/asm/cpu-features.h
+++ b/arch/mips/include/asm/cpu-features.h
@@ -180,6 +180,16 @@
 #endif
 #endif
 
+/* __builtin_constant_p(cpu_has_mips_r) && cpu_has_mips_r */
+#if !((defined(cpu_has_mips32r1) && cpu_has_mips32r1) || \
+	  (defined(cpu_has_mips32r2) && cpu_has_mips32r2) || \
+	  (defined(cpu_has_mips32r6) && cpu_has_mips32r6) || \
+	  (defined(cpu_has_mips64r1) && cpu_has_mips64r1) || \
+	  (defined(cpu_has_mips64r2) && cpu_has_mips64r2) || \
+	  (defined(cpu_has_mips64r6) && cpu_has_mips64r6))
+#define CONFIG_CPU_NO_EFFICIENT_FFS 1
+#endif
+
 #ifndef cpu_has_mips_1
 # define cpu_has_mips_1		(!cpu_has_mips_r6)
 #endif
diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig
index 4375554..f10bd2c 100644
--- a/arch/nios2/Kconfig
+++ b/arch/nios2/Kconfig
@@ -16,6 +16,7 @@ config NIOS2
 	select SOC_BUS
 	select SPARSE_IRQ
 	select USB_ARCH_HAS_HCD if USB_SUPPORT
+	select CPU_NO_EFFICIENT_FFS
 
 config GENERIC_CSUM
 	def_bool y
diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig
index e118c02..142cb05 100644
--- a/arch/openrisc/Kconfig
+++ b/arch/openrisc/Kconfig
@@ -25,6 +25,7 @@ config OPENRISC
 	select MODULES_USE_ELF_RELA
 	select HAVE_DEBUG_STACKOVERFLOW
 	select OR1K_PIC
+	select CPU_NO_EFFICIENT_FFS if !OPENRISC_HAVE_INST_FF1
 
 config MMU
 	def_bool y
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 88cfaa8..3d498a6 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -32,6 +32,7 @@ config PARISC
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_ARCH_SECCOMP_FILTER
 	select ARCH_NO_COHERENT_DMA_MMAP
+	select CPU_NO_EFFICIENT_FFS
 
 	help
 	  The PA-RISC microprocessor is designed by Hewlett-Packard and used
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index bf24ab1..9eb3932 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -164,7 +164,7 @@ config S390
 	select TTY
 	select VIRT_CPU_ACCOUNTING
 	select VIRT_TO_BUS
-
+	select CPU_NO_EFFICIENT_FFS if !HAVE_MARCH_Z9_109_FEATURES
 
 config SCHED_OMIT_FRAME_POINTER
 	def_bool y
diff --git a/arch/score/Kconfig b/arch/score/Kconfig
index 366e1b5..507d631 100644
--- a/arch/score/Kconfig
+++ b/arch/score/Kconfig
@@ -14,6 +14,7 @@ config SCORE
 	select VIRT_TO_BUS
 	select MODULES_USE_ELF_REL
 	select CLONE_BACKWARDS
+	select CPU_NO_EFFICIENT_FFS
 
 choice
 	prompt "System type"
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 7ed20fc..56cf5e5 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -44,6 +44,7 @@ config SUPERH
 	select OLD_SIGSUSPEND
 	select OLD_SIGACTION
 	select HAVE_ARCH_AUDITSYSCALL
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  The SuperH is a RISC processor targeted for use in embedded systems
 	  and consumer electronics; it was also used in the Sega Dreamcast
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 57ffaf2..ca675ed 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -42,6 +42,7 @@ config SPARC
 	select ODD_RT_SIGACTION
 	select OLD_SIGSUSPEND
 	select ARCH_HAS_SG_CHAIN
+	select CPU_NO_EFFICIENT_FFS
 
 config SPARC32
 	def_bool !64BIT
diff --git a/lib/gcd.c b/lib/gcd.c
index 3657f12..b9fbe73 100644
--- a/lib/gcd.c
+++ b/lib/gcd.c
@@ -2,20 +2,77 @@
 #include <linux/gcd.h>
 #include <linux/export.h>
 
-/* Greatest common divisor */
+/*
+ * This implements the binary GCD algorithm. (Often attributed to Stein,
+ * but as Knuth has noted, appears in a first-century Chinese math text.)
+ *
+ * This is faster than the division-based algorithm even on x86, which
+ * has decent hardware division.
+ */
+
+#if !defined(CONFIG_CPU_NO_EFFICIENT_FFS)
+
+/* If __ffs is available, the even/odd algorithm benchmarks slower. */
 unsigned long gcd(unsigned long a, unsigned long b)
 {
-	unsigned long r;
+	unsigned long r = a | b;
+
+	if (!a || !b)
+		return r;
 
-	if (a < b)
-		swap(a, b);
+	b >>= __ffs(b);
+	if (b == 1)
+		return r & -r;
 
-	if (!b)
-		return a;
-	while ((r = a % b) != 0) {
-		a = b;
-		b = r;
+	for (;;) {
+		a >>= __ffs(a);
+		if (a == 1)
+			return r & -r;
+		if (a == b)
+			return a << __ffs(r);
+
+		if (a < b)
+			swap(a, b);
+		a -= b;
 	}
-	return b;
 }
+
+#else
+
+/* If normalization is done by loops, the even/odd algorithm is a win. */
+unsigned long gcd(unsigned long a, unsigned long b)
+{
+	unsigned long r = a | b;
+
+	if (!a || !b)
+		return r;
+
+	/* Isolate lsbit of r */
+	r &= -r;
+
+	while (!(b & r))
+		b >>= 1;
+	if (b == r)
+		return r;
+
+	for (;;) {
+		while (!(a & r))
+			a >>= 1;
+		if (a == r)
+			return r;
+		if (a == b)
+			return a;
+
+		if (a < b)
+			swap(a, b);
+		a -= b;
+		a >>= 1;
+		if (a & r)
+			a += b;
+		a >>= 1;
+	}
+}
+
+#endif
+
 EXPORT_SYMBOL_GPL(gcd);
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 41+ messages in thread

end of thread, other threads:[~2016-05-08 13:17 UTC | newest]

Thread overview: 41+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-05-06  9:42 [patch V4] lib: GCD: Use binary GCD algorithm instead of Euclidean zengzhaoxiu
2016-05-06  9:42 ` zengzhaoxiu at 163.com
2016-05-06  9:42 ` zengzhaoxiu
2016-05-06  9:42 ` zengzhaoxiu
2016-05-06  9:42 ` zengzhaoxiu
2016-05-06  9:42 ` zengzhaoxiu
2016-05-06 23:00 ` Andrew Morton
2016-05-06 23:00   ` Andrew Morton
2016-05-06 23:00   ` Andrew Morton
2016-05-06 23:00   ` Andrew Morton
2016-05-06 23:00   ` Andrew Morton
2016-05-07  8:41 ` George Spelvin
2016-05-07  8:41 ` George Spelvin
2016-05-07  8:41   ` George Spelvin
2016-05-07  8:41   ` George Spelvin
2016-05-07  8:41   ` George Spelvin
2016-05-07  8:41   ` George Spelvin
2016-05-07 10:46   ` Andreas Schwab
2016-05-07 10:46     ` Andreas Schwab
2016-05-07 10:46     ` Andreas Schwab
2016-05-07 10:46     ` Andreas Schwab
2016-05-07 10:46     ` Andreas Schwab
2016-05-07 10:46     ` Andreas Schwab
2016-05-07 20:41     ` George Spelvin
2016-05-07 10:46   ` Andreas Schwab
2016-05-08 12:52   ` Zhaoxiu Zeng
2016-05-08 12:52     ` Zhaoxiu Zeng
2016-05-08 12:52     ` Zhaoxiu Zeng
2016-05-08 12:52     ` Zhaoxiu Zeng
2016-05-08 12:52     ` Zhaoxiu Zeng
2016-05-08 12:52     ` Zhaoxiu Zeng
2016-05-08 12:52   ` Zhaoxiu Zeng
2016-05-07 11:23 ` Sam Ravnborg
2016-05-07 11:23   ` Sam Ravnborg
2016-05-07 11:23   ` Sam Ravnborg
2016-05-07 11:23   ` Sam Ravnborg
2016-05-07 11:23   ` Sam Ravnborg
2016-05-07 11:23   ` Sam Ravnborg
2016-05-07 22:30   ` George Spelvin
2016-05-07 22:30     ` George Spelvin
2016-05-06  9:42 zengzhaoxiu
