linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] x86: generic versions of find_first_(zero_)bit, convert i386
@ 2008-03-31 17:15 Alexander van Heukelum
  2008-03-31 17:22 ` Stephen Hemminger
  2008-04-01  8:47 ` Ingo Molnar
  0 siblings, 2 replies; 34+ messages in thread
From: Alexander van Heukelum @ 2008-03-31 17:15 UTC (permalink / raw)
  To: Ingo Molnar, Andi Kleen, LKML; +Cc: Alexander van Heukelum

Generic versions of __find_first_bit and __find_first_zero_bit
are introduced as simplified versions of __find_next_bit and
__find_next_zero_bit. Their compilation and use are guarded by
a new config variable GENERIC_FIND_FIRST_BIT.

The generic versions of find_first_bit and find_first_zero_bit
are implemented in terms of the newly introduced __find_first_bit
and __find_first_zero_bit.

This patch also converts i386 to the generic functions. The text
size shrinks slightly due to uninlining of the find_*_bit functions.

   text    data     bss     dec     hex filename
4764939  480324  622592 5867855  59894f vmlinux  (i386 defconfig before)
4764645  480324  622592 5867561  598829 vmlinux  (i386 defconfig after)

Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm>

---

Hi Ingo,

Here is another step in the unification of the bitops for i386
and x86_64. This patch implements a minimal conversion to a
generic implementation of find_first_bit/find_first_zero_bit
for i386. The optimization for small bitmaps and the conversion
of x86_64 will follow soon.

Compiles and runs fine on i386 and x86_64 (current x86#testing).

Greetings,
	Alexander

 arch/x86/Kconfig            |    3 ++
 include/asm-x86/bitops_32.h |   56 -----------------------------------------
 include/linux/bitops.h      |   34 +++++++++++++++++++++++++
 lib/Makefile                |    1 +
 lib/find_next_bit.c         |   58 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 96 insertions(+), 56 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 6b3626d..fa7d16d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -80,6 +80,9 @@ config GENERIC_BUG
 	def_bool y
 	depends on BUG
 
+config GENERIC_FIND_FIRST_BIT
+	def_bool X86_32
+
 config GENERIC_FIND_NEXT_BIT
 	def_bool y
 
diff --git a/include/asm-x86/bitops_32.h b/include/asm-x86/bitops_32.h
index 3ed64b2..2e86302 100644
--- a/include/asm-x86/bitops_32.h
+++ b/include/asm-x86/bitops_32.h
@@ -4,62 +4,6 @@
 /*
  * Copyright 1992, Linus Torvalds.
  */
-
-/**
- * find_first_zero_bit - find the first zero bit in a memory region
- * @addr: The address to start the search at
- * @size: The maximum size to search
- *
- * Returns the bit number of the first zero bit, not the number of the byte
- * containing a bit.
- */
-static inline int find_first_zero_bit(const unsigned long *addr, unsigned size)
-{
-	int d0, d1, d2;
-	int res;
-
-	if (!size)
-		return 0;
-	/* This looks at memory.
-	 * Mark it volatile to tell gcc not to move it around
-	 */
-	asm volatile("movl $-1,%%eax\n\t"
-		     "xorl %%edx,%%edx\n\t"
-		     "repe; scasl\n\t"
-		     "je 1f\n\t"
-		     "xorl -4(%%edi),%%eax\n\t"
-		     "subl $4,%%edi\n\t"
-		     "bsfl %%eax,%%edx\n"
-		     "1:\tsubl %%ebx,%%edi\n\t"
-		     "shll $3,%%edi\n\t"
-		     "addl %%edi,%%edx"
-		     : "=d" (res), "=&c" (d0), "=&D" (d1), "=&a" (d2)
-		     : "1" ((size + 31) >> 5), "2" (addr),
-		       "b" (addr) : "memory");
-	return res;
-}
-
-/**
- * find_first_bit - find the first set bit in a memory region
- * @addr: The address to start the search at
- * @size: The maximum size to search
- *
- * Returns the bit number of the first set bit, not the number of the byte
- * containing a bit.
- */
-static inline unsigned find_first_bit(const unsigned long *addr, unsigned size)
-{
-	unsigned x = 0;
-
-	while (x < size) {
-		unsigned long val = *addr++;
-		if (val)
-			return __ffs(val) + x;
-		x += sizeof(*addr) << 3;
-	}
-	return x;
-}
-
 #ifdef __KERNEL__
 
 #include <asm-generic/bitops/sched.h>
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 3865f2c..355d67b 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -113,6 +113,40 @@ static inline unsigned fls_long(unsigned long l)
 }
 
 #ifdef __KERNEL__
+#ifdef CONFIG_GENERIC_FIND_FIRST_BIT
+extern unsigned long __find_first_bit(const unsigned long *addr,
+		unsigned long size);
+
+/**
+ * find_first_bit - find the first set bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The maximum size to search
+ *
+ * Returns the bit number of the first set bit.
+ */
+static __always_inline unsigned long
+find_first_bit(const unsigned long *addr, unsigned long size)
+{
+	return __find_first_bit(addr, size);
+}
+
+extern unsigned long __find_first_zero_bit(const unsigned long *addr,
+		unsigned long size);
+
+/**
+ * find_first_zero_bit - find the first cleared bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The maximum size to search
+ *
+ * Returns the bit number of the first cleared bit.
+ */
+static __always_inline unsigned long
+find_first_zero_bit(const unsigned long *addr, unsigned long size)
+{
+	return __find_first_zero_bit(addr, size);
+}
+#endif /* CONFIG_GENERIC_FIND_FIRST_BIT */
+
 #ifdef CONFIG_GENERIC_FIND_NEXT_BIT
 extern unsigned long __find_next_bit(const unsigned long *addr,
 		unsigned long size, unsigned long offset);
diff --git a/lib/Makefile b/lib/Makefile
index 23de261..14c93e1 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -30,6 +30,7 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
 lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
 lib-$(CONFIG_SEMAPHORE_SLEEPERS) += semaphore-sleepers.o
+lib-$(CONFIG_GENERIC_FIND_FIRST_BIT) += find_next_bit.o
 lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o
 obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
 obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o
diff --git a/lib/find_next_bit.c b/lib/find_next_bit.c
index ce94c4c..d3f5784 100644
--- a/lib/find_next_bit.c
+++ b/lib/find_next_bit.c
@@ -16,6 +16,7 @@
 
 #define BITOP_WORD(nr)		((nr) / BITS_PER_LONG)
 
+#ifdef CONFIG_GENERIC_FIND_NEXT_BIT
 /*
  * Find the next set bit in a memory region.
  */
@@ -102,6 +103,63 @@ found_middle:
 	return result + ffz(tmp);
 }
 EXPORT_SYMBOL(__find_next_zero_bit);
+#endif /* CONFIG_GENERIC_FIND_NEXT_BIT */
+
+#ifdef CONFIG_GENERIC_FIND_FIRST_BIT
+/*
+ * Find the first set bit in a memory region.
+ */
+unsigned long __find_first_bit(const unsigned long *addr,
+		unsigned long size)
+{
+	const unsigned long *p = addr;
+	unsigned long result = 0;
+	unsigned long tmp;
+
+	while (size & ~(BITS_PER_LONG-1)) {
+		if ((tmp = *(p++)))
+			goto found;
+		result += BITS_PER_LONG;
+		size -= BITS_PER_LONG;
+	}
+	if (!size)
+		return result;
+
+	tmp = (*p) & (~0UL >> (BITS_PER_LONG - size));
+	if (tmp == 0UL)		/* Are any bits set? */
+		return result + size;	/* Nope. */
+found:
+	return result + __ffs(tmp);
+}
+EXPORT_SYMBOL(__find_first_bit);
+
+/*
+ * Find the first cleared bit in a memory region.
+ */
+unsigned long __find_first_zero_bit(const unsigned long *addr,
+		unsigned long size)
+{
+	const unsigned long *p = addr;
+	unsigned long result = 0;
+	unsigned long tmp;
+
+	while (size & ~(BITS_PER_LONG-1)) {
+		if (~(tmp = *(p++)))
+			goto found;
+		result += BITS_PER_LONG;
+		size -= BITS_PER_LONG;
+	}
+	if (!size)
+		return result;
+
+	tmp = (*p) | (~0UL << size);
+	if (tmp == ~0UL)	/* Are any bits zero? */
+		return result + size;	/* Nope. */
+found:
+	return result + ffz(tmp);
+}
+EXPORT_SYMBOL(__find_first_zero_bit);
+#endif /* CONFIG_GENERIC_FIND_FIRST_BIT */
 
 #ifdef __BIG_ENDIAN
 

^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: [PATCH] x86: generic versions of find_first_(zero_)bit, convert i386
  2008-03-31 17:15 [PATCH] x86: generic versions of find_first_(zero_)bit, convert i386 Alexander van Heukelum
@ 2008-03-31 17:22 ` Stephen Hemminger
  2008-03-31 19:38   ` Alexander van Heukelum
  2008-04-01  8:47 ` Ingo Molnar
  1 sibling, 1 reply; 34+ messages in thread
From: Stephen Hemminger @ 2008-03-31 17:22 UTC (permalink / raw)
  To: Alexander van Heukelum; +Cc: linux-kernel

On Mon, 31 Mar 2008 19:15:06 +0200
Alexander van Heukelum <heukelum@mailshack.com> wrote:

> Generic versions of __find_first_bit and __find_first_zero_bit
> are introduced as simplified versions of __find_next_bit and
> __find_next_zero_bit. Their compilation and use are guarded by
> a new config variable GENERIC_FIND_FIRST_BIT.
> 
> The generic versions of find_first_bit and find_first_zero_bit
> are implemented in terms of the newly introduced __find_first_bit
> and __find_first_zero_bit.
> 
> This patch also converts i386 to the generic functions. The text
> size shrinks slightly due to uninlining of the find_*_bit functions.
> 
>    text    data     bss     dec     hex filename
> 4764939  480324  622592 5867855  59894f vmlinux  (i386 defconfig before)
> 4764645  480324  622592 5867561  598829 vmlinux  (i386 defconfig after)
> 
> Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm>
> 

Size isn't everything, what is the performance difference?

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] x86: generic versions of find_first_(zero_)bit, convert i386
  2008-03-31 17:22 ` Stephen Hemminger
@ 2008-03-31 19:38   ` Alexander van Heukelum
  2008-03-31 21:58     ` Andi Kleen
  0 siblings, 1 reply; 34+ messages in thread
From: Alexander van Heukelum @ 2008-03-31 19:38 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: linux-kernel

On Mon, Mar 31, 2008 at 10:22:40AM -0700, Stephen Hemminger wrote:
> On Mon, 31 Mar 2008 19:15:06 +0200
> Alexander van Heukelum <heukelum@mailshack.com> wrote:
> 
> > Generic versions of __find_first_bit and __find_first_zero_bit
> > are introduced as simplified versions of __find_next_bit and
> > __find_next_zero_bit. Their compilation and use are guarded by
> > a new config variable GENERIC_FIND_FIRST_BIT.
> > 
> > The generic versions of find_first_bit and find_first_zero_bit
> > are implemented in terms of the newly introduced __find_first_bit
> > and __find_first_zero_bit.
> > 
> > This patch also converts i386 to the generic functions. The text
> > size shrinks slightly due to uninlining of the find_*_bit functions.
> > 
> >    text    data     bss     dec     hex filename
> > 4764939  480324  622592 5867855  59894f vmlinux  (i386 defconfig before)
> > 4764645  480324  622592 5867561  598829 vmlinux  (i386 defconfig after)
> > 
> > Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm>
> > 
> 
> Size isn't everything, what is the performance difference?

Hi,

Performance should not change too much. Uninlining of the functions has
some impact, of course, but this should be visible only for small bitmap
sizes. Measuring the performance impact by doing artificial benchmarks
is a bit problematic too, because it is hard to guess what patterns are
important. Anyhow, I hacked together a program (in userspace) that
searches for a bit in a bitmap. In pseudo code:

bitmap <- [0...]
for bitmapsize=1 to 512
	for bitposition=0 to bitmapsize-1
		find_first_bit in bitmap
		bitmap[bitposition] <- 1
		find_first_bit in bitmap
		bitmap[bitposition] <- 0

After each find_first_bit, the result is checked against the expected result.
A similar test is done for searching zero bits. The two tests are performed
1000 times in a loop. On a 2.4GHz (P-IV-type) Xeon, I get the following
results:

$ gcc -DNEW -fomit-frame-pointer -Os find_first_bit.c && time ./a.out
real    0m15.006s
$ nm -nStd
0000000134513492 0000000000000065 T find_first_bit
0000000134513557 0000000000000062 T find_first_zero_bit
0000000134513619 0000000000000190 T testzerobit
0000000134513809 0000000000000187 T testonebit
0000000134513996 0000000000000045 T main

and

$ gcc -fomit-frame-pointer -Os find_first_bit.c && time ./a.out
real    0m17.617s
0000000134513492 0000000000000293 T testzerobit
0000000134513785 0000000000000240 T testonebit
0000000134514025 0000000000000045 T main

So in this particular case, on this particular machine, with this
particular mix of searches, the new code is somewhat faster, even
though it is out-of-line.

A similar, but more convincing, change was seen when the assembly
versions for find_next_bit and find_next_zero_bit where replaced
by the generic ones.

Greetings,
	Alexander

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] x86: generic versions of find_first_(zero_)bit, convert i386
  2008-03-31 19:38   ` Alexander van Heukelum
@ 2008-03-31 21:58     ` Andi Kleen
  0 siblings, 0 replies; 34+ messages in thread
From: Andi Kleen @ 2008-03-31 21:58 UTC (permalink / raw)
  To: Alexander van Heukelum; +Cc: Stephen Hemminger, linux-kernel

Alexander van Heukelum <heukelum@mailshack.com> writes:
> 
> bitmap <- [0...]
> for bitmapsize=1 to 512
> 	for bitposition=0 to bitmapsize-1

I don't think that's a useful benchmark because you're testing 
mostly values that a real kernel will never use.

-Andi

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] x86: generic versions of find_first_(zero_)bit, convert i386
  2008-03-31 17:15 [PATCH] x86: generic versions of find_first_(zero_)bit, convert i386 Alexander van Heukelum
  2008-03-31 17:22 ` Stephen Hemminger
@ 2008-04-01  8:47 ` Ingo Molnar
  2008-04-01  9:46   ` Alexander van Heukelum
  1 sibling, 1 reply; 34+ messages in thread
From: Ingo Molnar @ 2008-04-01  8:47 UTC (permalink / raw)
  To: Alexander van Heukelum; +Cc: Andi Kleen, LKML, Alexander van Heukelum


* Alexander van Heukelum <heukelum@mailshack.com> wrote:

> Generic versions of __find_first_bit and __find_first_zero_bit are 
> introduced as simplified versions of __find_next_bit and 
> __find_next_zero_bit. Their compilation and use are guarded by a new 
> config variable GENERIC_FIND_FIRST_BIT.
> 
> The generic versions of find_first_bit and find_first_zero_bit are 
> implemented in terms of the newly introduced __find_first_bit and 
> __find_first_zero_bit.
> 
> This patch also converts i386 to the generic functions. The text size 
> shrinks slightly due to uninlining of the find_*_bit functions.
> 
>    text    data     bss     dec     hex filename
> 4764939  480324  622592 5867855  59894f vmlinux  (i386 defconfig before)
> 4764645  480324  622592 5867561  598829 vmlinux  (i386 defconfig after)
> 
> Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm>
> 
> ---
> 
> Hi Ingo,
> 
> Here is another step in the unification of the bitops for i386 and 
> x86_64. This patch implements a minimal conversion to a generic 
> implementation of find_first_bit/find_first_zero_bit for i386. The 
> optimization for small bitmaps and the conversion of x86_64 will 
> follow soon.
> 
> Compiles and runs fine on i386 and x86_64 (current x86#testing).

thanks, applied.

I guess we should keep the bitops.h portions alive for the moment though 
(surrounded by #ifndef GENERIC_FIND_FIRST_BIT), to make it easy for 
anyone to flip the Kconfig bit around via a oneliner patch and do some 
benchmarking? At least initially - i'm convinced that we want the 
generic versions in the long run. (especially if, as your patches do it, 
the generic code picks up the easy constant tests and optimizes them at 
build time)

	Ingo

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] x86: generic versions of find_first_(zero_)bit, convert i386
  2008-04-01  8:47 ` Ingo Molnar
@ 2008-04-01  9:46   ` Alexander van Heukelum
  2008-04-01 15:41     ` [PATCH] x86: switch x86_64 to generic find_first_bit Alexander van Heukelum
  2008-04-06 17:03     ` [PATCH] x86: generic versions of find_first_(zero_)bit, convert i386 dean gaudet
  0 siblings, 2 replies; 34+ messages in thread
From: Alexander van Heukelum @ 2008-04-01  9:46 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Andi Kleen, LKML, Alexander van Heukelum

x86: generic versions of find_first_(zero_)bit, convert i386

Generic versions of __find_first_bit and __find_first_zero_bit
are introduced as simplified versions of __find_next_bit and
__find_next_zero_bit. Their compilation and use are guarded by
a new config variable GENERIC_FIND_FIRST_BIT.

The generic versions of find_first_bit and find_first_zero_bit
are implemented in terms of the newly introduced __find_first_bit
and __find_first_zero_bit.

This patch does not remove the i386-specific implementation,
but it does switch i386 to use the generic functions by setting
GENERIC_FIND_FIRST_BIT=y for X86_32.

Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm>


 arch/x86/Kconfig            |    3 ++
 include/asm-x86/bitops_32.h |    2 +
 include/linux/bitops.h      |   34 +++++++++++++++++++++++++
 lib/Makefile                |    1 +
 lib/find_next_bit.c         |   58 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 98 insertions(+), 0 deletions(-)

---

> I guess we should keep the bitops.h portions alive for the moment 
> though (surrounded by #ifndef GENERIC_FIND_FIRST_BIT), to make it
> easy for anyone to flip the Kconfig bit around via a oneliner patch
> and do some benchmarking? At least initially - i'm convinced that
> we want the generic versions in the long run. (especially if, as
> your patches do it, the generic code picks up the easy constant
> tests and optimizes them at build time)
>
>        Ingo 

Keeping the old version alive for the moment is fine with me. Here
is a replacement patch that does exactly that. Runs on i386 with
this patch and also with GENERIC_FIND_FIRST_BIT=n.

I'll finish the patches for X86_64 and the constant-size bitmap
optimizations after work.

Greetings,
	Alexander

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 6b3626d..fa7d16d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -80,6 +80,9 @@ config GENERIC_BUG
 	def_bool y
 	depends on BUG
 
+config GENERIC_FIND_FIRST_BIT
+	def_bool X86_32
+
 config GENERIC_FIND_NEXT_BIT
 	def_bool y
 
diff --git a/include/asm-x86/bitops_32.h b/include/asm-x86/bitops_32.h
index 3ed64b2..ba2c0de 100644
--- a/include/asm-x86/bitops_32.h
+++ b/include/asm-x86/bitops_32.h
@@ -5,6 +5,7 @@
  * Copyright 1992, Linus Torvalds.
  */
 
+#ifndef CONFIG_GENERIC_FIND_FIRST_BIT
 /**
  * find_first_zero_bit - find the first zero bit in a memory region
  * @addr: The address to start the search at
@@ -59,6 +60,7 @@ static inline unsigned find_first_bit(const unsigned long *addr, unsigned size)
 	}
 	return x;
 }
+#endif
 
 #ifdef __KERNEL__
 
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 3865f2c..355d67b 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -113,6 +113,40 @@ static inline unsigned fls_long(unsigned long l)
 }
 
 #ifdef __KERNEL__
+#ifdef CONFIG_GENERIC_FIND_FIRST_BIT
+extern unsigned long __find_first_bit(const unsigned long *addr,
+		unsigned long size);
+
+/**
+ * find_first_bit - find the first set bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The maximum size to search
+ *
+ * Returns the bit number of the first set bit.
+ */
+static __always_inline unsigned long
+find_first_bit(const unsigned long *addr, unsigned long size)
+{
+	return __find_first_bit(addr, size);
+}
+
+extern unsigned long __find_first_zero_bit(const unsigned long *addr,
+		unsigned long size);
+
+/**
+ * find_first_zero_bit - find the first cleared bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The maximum size to search
+ *
+ * Returns the bit number of the first cleared bit.
+ */
+static __always_inline unsigned long
+find_first_zero_bit(const unsigned long *addr, unsigned long size)
+{
+	return __find_first_zero_bit(addr, size);
+}
+#endif /* CONFIG_GENERIC_FIND_FIRST_BIT */
+
 #ifdef CONFIG_GENERIC_FIND_NEXT_BIT
 extern unsigned long __find_next_bit(const unsigned long *addr,
 		unsigned long size, unsigned long offset);
diff --git a/lib/Makefile b/lib/Makefile
index 23de261..14c93e1 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -30,6 +30,7 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
 lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
 lib-$(CONFIG_SEMAPHORE_SLEEPERS) += semaphore-sleepers.o
+lib-$(CONFIG_GENERIC_FIND_FIRST_BIT) += find_next_bit.o
 lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o
 obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
 obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o
diff --git a/lib/find_next_bit.c b/lib/find_next_bit.c
index ce94c4c..d3f5784 100644
--- a/lib/find_next_bit.c
+++ b/lib/find_next_bit.c
@@ -16,6 +16,7 @@
 
 #define BITOP_WORD(nr)		((nr) / BITS_PER_LONG)
 
+#ifdef CONFIG_GENERIC_FIND_NEXT_BIT
 /*
  * Find the next set bit in a memory region.
  */
@@ -102,6 +103,63 @@ found_middle:
 	return result + ffz(tmp);
 }
 EXPORT_SYMBOL(__find_next_zero_bit);
+#endif /* CONFIG_GENERIC_FIND_NEXT_BIT */
+
+#ifdef CONFIG_GENERIC_FIND_FIRST_BIT
+/*
+ * Find the first set bit in a memory region.
+ */
+unsigned long __find_first_bit(const unsigned long *addr,
+		unsigned long size)
+{
+	const unsigned long *p = addr;
+	unsigned long result = 0;
+	unsigned long tmp;
+
+	while (size & ~(BITS_PER_LONG-1)) {
+		if ((tmp = *(p++)))
+			goto found;
+		result += BITS_PER_LONG;
+		size -= BITS_PER_LONG;
+	}
+	if (!size)
+		return result;
+
+	tmp = (*p) & (~0UL >> (BITS_PER_LONG - size));
+	if (tmp == 0UL)		/* Are any bits set? */
+		return result + size;	/* Nope. */
+found:
+	return result + __ffs(tmp);
+}
+EXPORT_SYMBOL(__find_first_bit);
+
+/*
+ * Find the first cleared bit in a memory region.
+ */
+unsigned long __find_first_zero_bit(const unsigned long *addr,
+		unsigned long size)
+{
+	const unsigned long *p = addr;
+	unsigned long result = 0;
+	unsigned long tmp;
+
+	while (size & ~(BITS_PER_LONG-1)) {
+		if (~(tmp = *(p++)))
+			goto found;
+		result += BITS_PER_LONG;
+		size -= BITS_PER_LONG;
+	}
+	if (!size)
+		return result;
+
+	tmp = (*p) | (~0UL << size);
+	if (tmp == ~0UL)	/* Are any bits zero? */
+		return result + size;	/* Nope. */
+found:
+	return result + ffz(tmp);
+}
+EXPORT_SYMBOL(__find_first_zero_bit);
+#endif /* CONFIG_GENERIC_FIND_FIRST_BIT */
 
 #ifdef __BIG_ENDIAN
 

^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH] x86: switch x86_64 to generic find_first_bit
  2008-04-01  9:46   ` Alexander van Heukelum
@ 2008-04-01 15:41     ` Alexander van Heukelum
  2008-04-01 15:42       ` [PATCH] x86: optimize find_first_bit for small bitmaps Alexander van Heukelum
  2008-04-06 17:03     ` [PATCH] x86: generic versions of find_first_(zero_)bit, convert i386 dean gaudet
  1 sibling, 1 reply; 34+ messages in thread
From: Alexander van Heukelum @ 2008-04-01 15:41 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Andi Kleen, LKML, Alexander van Heukelum

[PATCH] x86: switch x86_64 to generic find_first_bit

Switch x86_64 to generic find_first_bit. The x86_64-specific
implementation is not removed.

Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm>

---
 arch/x86/Kconfig            |    2 +-
 arch/x86/lib/bitops_64.c    |    2 ++
 include/asm-x86/bitops_64.h |    2 ++
 3 files changed, 5 insertions(+), 1 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fa7d16d..e37e286 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -81,7 +81,7 @@ config GENERIC_BUG
 	depends on BUG
 
 config GENERIC_FIND_FIRST_BIT
-	def_bool X86_32
+	def_bool y
 
 config GENERIC_FIND_NEXT_BIT
 	def_bool y
diff --git a/arch/x86/lib/bitops_64.c b/arch/x86/lib/bitops_64.c
index 0eeb704..568467d 100644
--- a/arch/x86/lib/bitops_64.c
+++ b/arch/x86/lib/bitops_64.c
@@ -1,3 +1,4 @@
+#ifndef CONFIG_GENERIC_FIND_FIRST_BIT
 #include <linux/bitops.h>
 
 #undef find_first_zero_bit
@@ -105,3 +106,4 @@ long find_first_bit(const unsigned long * addr, unsigned long size)
 
 EXPORT_SYMBOL(find_first_bit);
 EXPORT_SYMBOL(find_first_zero_bit);
+#endif
diff --git a/include/asm-x86/bitops_64.h b/include/asm-x86/bitops_64.h
index d133520..4081d7e 100644
--- a/include/asm-x86/bitops_64.h
+++ b/include/asm-x86/bitops_64.h
@@ -5,6 +5,7 @@
  * Copyright 1992, Linus Torvalds.
  */
 
+#ifndef CONFIG_GENERIC_FIND_FIRST_BIT
 extern long find_first_zero_bit(const unsigned long *addr, unsigned long size);
 extern long find_first_bit(const unsigned long *addr, unsigned long size);
 
@@ -24,6 +25,7 @@ static inline long __scanbit(unsigned long val, unsigned long max)
 	((__builtin_constant_p((size)) && (size) <= BITS_PER_LONG	\
 	  ? (__scanbit(~*(unsigned long *)(addr), (size)))		\
 	  : find_first_zero_bit((addr), (size))))
+#endif
 
 static inline void set_bit_string(unsigned long *bitmap, unsigned long i,
 				  int len)
-- 
1.5.2.5



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH] x86: optimize find_first_bit for small bitmaps
  2008-04-01 15:41     ` [PATCH] x86: switch x86_64 to generic find_first_bit Alexander van Heukelum
@ 2008-04-01 15:42       ` Alexander van Heukelum
  2008-04-01 15:47         ` [PATCH] x86: remove x86-specific implementations of find_first_bit Alexander van Heukelum
  0 siblings, 1 reply; 34+ messages in thread
From: Alexander van Heukelum @ 2008-04-01 15:42 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Andi Kleen, LKML, Alexander van Heukelum

[PATCH] x86: optimize find_first_bit for small bitmaps

Avoid a call to find_first_bit if the bitmap size is know at
compile time and small enough to fit in a single long integer.
Modeled after an optimization in the original x86_64-specific
code.

Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm>

---
 include/linux/bitops.h |   29 +++++++++++++++++++++++++++++
 1 files changed, 29 insertions(+), 0 deletions(-)

diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 355d67b..48bde60 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -127,6 +127,20 @@ extern unsigned long __find_first_bit(const unsigned long *addr,
 static __always_inline unsigned long
 find_first_bit(const unsigned long *addr, unsigned long size)
 {
+	/* Avoid a function call if the bitmap size is a constant */
+	/* and not bigger than BITS_PER_LONG. */
+
+	/* insert a sentinel so that __ffs returns size if there */
+	/* are no set bits in the bitmap */
+	if (__builtin_constant_p(size) && (size < BITS_PER_LONG))
+		return __ffs((*addr) | (1ul << size));
+
+	/* the result of __ffs(0) is undefined, so it needs to be */
+	/* handled separately */
+	if (__builtin_constant_p(size) && (size == BITS_PER_LONG))
+		return ((*addr) == 0) ? BITS_PER_LONG : __ffs(*addr);
+
+	/* size is not constant or too big */
 	return __find_first_bit(addr, size);
 }
 
@@ -143,6 +157,21 @@ extern unsigned long __find_first_zero_bit(const unsigned long *addr,
 static __always_inline unsigned long
 find_first_zero_bit(const unsigned long *addr, unsigned long size)
 {
+	/* Avoid a function call if the bitmap size is a constant */
+	/* and not bigger than BITS_PER_LONG. */
+
+	/* insert a sentinel so that __ffs returns size if there */
+	/* are no set bits in the bitmap */
+	if (__builtin_constant_p(size) && (size < BITS_PER_LONG)) {
+		return __ffs(~(*addr) | (1ul << size));
+	}
+
+	/* the result of __ffs(0) is undefined, so it needs to be */
+	/* handled separately */
+	if (__builtin_constant_p(size) && (size == BITS_PER_LONG))
+		return (~(*addr) == 0) ? BITS_PER_LONG : __ffs(~(*addr));
+
+	/* size is not constant or too big */
 	return __find_first_zero_bit(addr, size);
 }
 #endif /* CONFIG_GENERIC_FIND_FIRST_BIT */
-- 
1.5.2.5



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH] x86: remove x86-specific implementations of find_first_bit
  2008-04-01 15:42       ` [PATCH] x86: optimize find_first_bit for small bitmaps Alexander van Heukelum
@ 2008-04-01 15:47         ` Alexander van Heukelum
  2008-04-03  9:34           ` Alexander van Heukelum
  2008-04-04  8:47           ` Ingo Molnar
  0 siblings, 2 replies; 34+ messages in thread
From: Alexander van Heukelum @ 2008-04-01 15:47 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Andi Kleen, LKML, Alexander van Heukelum

[PATCH] x86: remove x86-specific implementations of find_first_bit

x86 has been switched to the generic versions of find_first_bit
and find_first_zero_bit, but the original versions were retained.
This patch just removes the now unused x86-specific versions.

Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm>

---
 arch/x86/lib/Makefile       |    1 -
 arch/x86/lib/bitops_64.c    |  109 -------------------------------------------
 include/asm-x86/bitops_32.h |   58 -----------------------
 include/asm-x86/bitops_64.h |   23 ---------
 4 files changed, 0 insertions(+), 191 deletions(-)
 delete mode 100644 arch/x86/lib/bitops_64.c

This patch should of course wait until it is decided if the generic
version is good enough to replace the x86-specific ones. I hacked
together a benchmark that people can run on different machines. It
can be downloaded from:

   http://heukelum.fastmail.fm/find_first_bit

Greetings,
	Alexander

diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 4360932..76f60f5 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -21,7 +21,6 @@ else
 
         lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
         lib-y += thunk_64.o clear_page_64.o copy_page_64.o
-        lib-y += bitops_64.o
         lib-y += memmove_64.o memset_64.o
         lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o
 endif
diff --git a/arch/x86/lib/bitops_64.c b/arch/x86/lib/bitops_64.c
deleted file mode 100644
index 568467d..0000000
--- a/arch/x86/lib/bitops_64.c
+++ /dev/null
@@ -1,109 +0,0 @@
-#ifndef CONFIG_GENERIC_FIND_FIRST_BIT
-#include <linux/bitops.h>
-
-#undef find_first_zero_bit
-#undef find_first_bit
-
-static inline long
-__find_first_zero_bit(const unsigned long * addr, unsigned long size)
-{
-	long d0, d1, d2;
-	long res;
-
-	/*
-	 * We must test the size in words, not in bits, because
-	 * otherwise incoming sizes in the range -63..-1 will not run
-	 * any scasq instructions, and then the flags used by the je
-	 * instruction will have whatever random value was in place
-	 * before.  Nobody should call us like that, but
-	 * find_next_zero_bit() does when offset and size are at the
-	 * same word and it fails to find a zero itself.
-	 */
-	size += 63;
-	size >>= 6;
-	if (!size)
-		return 0;
-	asm volatile(
-		"  repe; scasq\n"
-		"  je 1f\n"
-		"  xorq -8(%%rdi),%%rax\n"
-		"  subq $8,%%rdi\n"
-		"  bsfq %%rax,%%rdx\n"
-		"1:  subq %[addr],%%rdi\n"
-		"  shlq $3,%%rdi\n"
-		"  addq %%rdi,%%rdx"
-		:"=d" (res), "=&c" (d0), "=&D" (d1), "=&a" (d2)
-		:"0" (0ULL), "1" (size), "2" (addr), "3" (-1ULL),
-		 [addr] "S" (addr) : "memory");
-	/*
-	 * Any register would do for [addr] above, but GCC tends to
-	 * prefer rbx over rsi, even though rsi is readily available
-	 * and doesn't have to be saved.
-	 */
-	return res;
-}
-
-/**
- * find_first_zero_bit - find the first zero bit in a memory region
- * @addr: The address to start the search at
- * @size: The maximum size to search
- *
- * Returns the bit-number of the first zero bit, not the number of the byte
- * containing a bit.
- */
-long find_first_zero_bit(const unsigned long * addr, unsigned long size)
-{
-	return __find_first_zero_bit (addr, size);
-}
-
-static inline long
-__find_first_bit(const unsigned long * addr, unsigned long size)
-{
-	long d0, d1;
-	long res;
-
-	/*
-	 * We must test the size in words, not in bits, because
-	 * otherwise incoming sizes in the range -63..-1 will not run
-	 * any scasq instructions, and then the flags used by the jz
-	 * instruction will have whatever random value was in place
-	 * before.  Nobody should call us like that, but
-	 * find_next_bit() does when offset and size are at the same
-	 * word and it fails to find a one itself.
-	 */
-	size += 63;
-	size >>= 6;
-	if (!size)
-		return 0;
-	asm volatile(
-		"   repe; scasq\n"
-		"   jz 1f\n"
-		"   subq $8,%%rdi\n"
-		"   bsfq (%%rdi),%%rax\n"
-		"1: subq %[addr],%%rdi\n"
-		"   shlq $3,%%rdi\n"
-		"   addq %%rdi,%%rax"
-		:"=a" (res), "=&c" (d0), "=&D" (d1)
-		:"0" (0ULL), "1" (size), "2" (addr),
-		 [addr] "r" (addr) : "memory");
-	return res;
-}
-
-/**
- * find_first_bit - find the first set bit in a memory region
- * @addr: The address to start the search at
- * @size: The maximum size to search
- *
- * Returns the bit-number of the first set bit, not the number of the byte
- * containing a bit.
- */
-long find_first_bit(const unsigned long * addr, unsigned long size)
-{
-	return __find_first_bit(addr,size);
-}
-
-#include <linux/module.h>
-
-EXPORT_SYMBOL(find_first_bit);
-EXPORT_SYMBOL(find_first_zero_bit);
-#endif
diff --git a/include/asm-x86/bitops_32.h b/include/asm-x86/bitops_32.h
index ba2c0de..2e86302 100644
--- a/include/asm-x86/bitops_32.h
+++ b/include/asm-x86/bitops_32.h
@@ -4,64 +4,6 @@
 /*
  * Copyright 1992, Linus Torvalds.
  */
-
-#ifndef CONFIG_GENERIC_FIND_FIRST_BIT
-/**
- * find_first_zero_bit - find the first zero bit in a memory region
- * @addr: The address to start the search at
- * @size: The maximum size to search
- *
- * Returns the bit number of the first zero bit, not the number of the byte
- * containing a bit.
- */
-static inline int find_first_zero_bit(const unsigned long *addr, unsigned size)
-{
-	int d0, d1, d2;
-	int res;
-
-	if (!size)
-		return 0;
-	/* This looks at memory.
-	 * Mark it volatile to tell gcc not to move it around
-	 */
-	asm volatile("movl $-1,%%eax\n\t"
-		     "xorl %%edx,%%edx\n\t"
-		     "repe; scasl\n\t"
-		     "je 1f\n\t"
-		     "xorl -4(%%edi),%%eax\n\t"
-		     "subl $4,%%edi\n\t"
-		     "bsfl %%eax,%%edx\n"
-		     "1:\tsubl %%ebx,%%edi\n\t"
-		     "shll $3,%%edi\n\t"
-		     "addl %%edi,%%edx"
-		     : "=d" (res), "=&c" (d0), "=&D" (d1), "=&a" (d2)
-		     : "1" ((size + 31) >> 5), "2" (addr),
-		       "b" (addr) : "memory");
-	return res;
-}
-
-/**
- * find_first_bit - find the first set bit in a memory region
- * @addr: The address to start the search at
- * @size: The maximum size to search
- *
- * Returns the bit number of the first set bit, not the number of the byte
- * containing a bit.
- */
-static inline unsigned find_first_bit(const unsigned long *addr, unsigned size)
-{
-	unsigned x = 0;
-
-	while (x < size) {
-		unsigned long val = *addr++;
-		if (val)
-			return __ffs(val) + x;
-		x += sizeof(*addr) << 3;
-	}
-	return x;
-}
-#endif
-
 #ifdef __KERNEL__
 
 #include <asm-generic/bitops/sched.h>
diff --git a/include/asm-x86/bitops_64.h b/include/asm-x86/bitops_64.h
index 4081d7e..cb23122 100644
--- a/include/asm-x86/bitops_64.h
+++ b/include/asm-x86/bitops_64.h
@@ -4,29 +4,6 @@
 /*
  * Copyright 1992, Linus Torvalds.
  */
-
-#ifndef CONFIG_GENERIC_FIND_FIRST_BIT
-extern long find_first_zero_bit(const unsigned long *addr, unsigned long size);
-extern long find_first_bit(const unsigned long *addr, unsigned long size);
-
-/* return index of first bet set in val or max when no bit is set */
-static inline long __scanbit(unsigned long val, unsigned long max)
-{
-	asm("bsfq %1,%0 ; cmovz %2,%0" : "=&r" (val) : "r" (val), "r" (max));
-	return val;
-}
-
-#define find_first_bit(addr, size)					\
-	((__builtin_constant_p((size)) && (size) <= BITS_PER_LONG	\
-	  ? (__scanbit(*(unsigned long *)(addr), (size)))		\
-	  : find_first_bit((addr), (size))))
-
-#define find_first_zero_bit(addr, size)					\
-	((__builtin_constant_p((size)) && (size) <= BITS_PER_LONG	\
-	  ? (__scanbit(~*(unsigned long *)(addr), (size)))		\
-	  : find_first_zero_bit((addr), (size))))
-#endif
-
 static inline void set_bit_string(unsigned long *bitmap, unsigned long i,
 				  int len)
 {
-- 
1.5.2.5


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: [PATCH] x86: remove x86-specific implementations of find_first_bit
  2008-04-01 15:47         ` [PATCH] x86: remove x86-specific implementations of find_first_bit Alexander van Heukelum
@ 2008-04-03  9:34           ` Alexander van Heukelum
  2008-04-04  8:47           ` Ingo Molnar
  1 sibling, 0 replies; 34+ messages in thread
From: Alexander van Heukelum @ 2008-04-03  9:34 UTC (permalink / raw)
  To: Andi Kleen, Ingo Molnar; +Cc: LKML, Alexander van Heukelum

On Tue, Apr 01, 2008 at 05:47:57PM +0200, Alexander van Heukelum wrote:
> This patch should of course wait until it is decided if the generic
> version is good enough to replace the x86-specific ones. I hacked
> together a benchmark that people can run on different machines. It can
> be downloaded from:
> 
>    http://heukelum.fastmail.fm/find_first_bit

I now put a much more sound benchmarking program there. And its
results for P-IV-like Xeon, Athlon XP and Opteron for the current
and generic implementations.

Greetings,
	Alexander

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] x86: remove x86-specific implementations of find_first_bit
  2008-04-01 15:47         ` [PATCH] x86: remove x86-specific implementations of find_first_bit Alexander van Heukelum
  2008-04-03  9:34           ` Alexander van Heukelum
@ 2008-04-04  8:47           ` Ingo Molnar
  1 sibling, 0 replies; 34+ messages in thread
From: Ingo Molnar @ 2008-04-04  8:47 UTC (permalink / raw)
  To: Alexander van Heukelum; +Cc: Andi Kleen, LKML, Alexander van Heukelum


* Alexander van Heukelum <heukelum@mailshack.com> wrote:

> [PATCH] x86: remove x86-specific implementations of find_first_bit
> 
> x86 has been switched to the generic versions of find_first_bit and 
> find_first_zero_bit, but the original versions were retained. This 
> patch just removes the now unused x86-specific versions.

thanks Alexander, i've added your patches to x86.git/latest.

	Ingo

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] x86: generic versions of find_first_(zero_)bit, convert i386
  2008-04-01  9:46   ` Alexander van Heukelum
  2008-04-01 15:41     ` [PATCH] x86: switch x86_64 to generic find_first_bit Alexander van Heukelum
@ 2008-04-06 17:03     ` dean gaudet
  2008-04-06 18:51       ` Alexander van Heukelum
  1 sibling, 1 reply; 34+ messages in thread
From: dean gaudet @ 2008-04-06 17:03 UTC (permalink / raw)
  To: Alexander van Heukelum
  Cc: Ingo Molnar, Andi Kleen, LKML, Alexander van Heukelum

fwiw there's a way to do ffz / ntz which can do lg(n) conditional moves in 
parallel... i'm not sure what (non-x86) architectures this might be best 
on, but it might be a good choice for the generic code... although maybe 
the large number of constants required will be a burden on RISC 
processors.

take a look at figure 5-17 here http://hackersdelight.org/revisions.pdf

int ntz(unsigned x) {
	unsigned y, bz, b4, b3, b2, b1, b0;
	y = x & -x; // Isolate rightmost 1-bit.
	bz = y ? 0 : 1; // 1 if y = 0.
	b4 = (y & 0x0000FFFF) ? 0 : 16;
	b3 = (y & 0x00FF00FF) ? 0 : 8;
	b2 = (y & 0x0F0F0F0F) ? 0 : 4;
	b1 = (y & 0x33333333) ? 0 : 2;
	b0 = (y & 0x55555555) ? 0 : 1;
	return bz + b4 + b3 + b2 + b1 + b0;
}

-dean

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] x86: generic versions of find_first_(zero_)bit, convert   i386
  2008-04-06 17:03     ` [PATCH] x86: generic versions of find_first_(zero_)bit, convert i386 dean gaudet
@ 2008-04-06 18:51       ` Alexander van Heukelum
  2008-04-06 20:22         ` dean gaudet
  0 siblings, 1 reply; 34+ messages in thread
From: Alexander van Heukelum @ 2008-04-06 18:51 UTC (permalink / raw)
  To: dean gaudet, Alexander van Heukelum; +Cc: Ingo Molnar, Andi Kleen, LKML

On Sun, 6 Apr 2008 10:03:43 -0700 (PDT), "dean gaudet" <dean@arctic.org>
said:
> fwiw there's a way to do ffz / ntz which can do lg(n) conditional moves in 
> parallel... i'm not sure what (non-x86) architectures this might be best 
> on, but it might be a good choice for the generic code... although maybe 
> the large number of constants required will be a burden on RISC 
> processors.

Hello Dean,

The current generic implementation of ffz is O(lg(n)) already, but
the version you suggest might indeed be a bit faster if the compiler
recognises that is can use conditional moves and the architecture
can handle large constants efficiently.

On the other had, the bit-search functions tend to be avoided as
much as possible, because they are often not implemented as a
hardware instruction and even if they are implemented in hardware,
they might be slow. The generic version is slow anyhow. That's
why the bitmap searches first test if a word in the bitmap is
all-0-bits/all-1-bits. The single-word version of ffz might even
be better off if it was optimized for size instead of being fully
unrolled!

> take a look at figure 5-17 here http://hackersdelight.org/revisions.pdf
> 
> int ntz(unsigned x) {
> 	unsigned y, bz, b4, b3, b2, b1, b0;
> 	y = x & -x; // Isolate rightmost 1-bit.
> 	bz = y ? 0 : 1; // 1 if y = 0.
> 	b4 = (y & 0x0000FFFF) ? 0 : 16;
> 	b3 = (y & 0x00FF00FF) ? 0 : 8;
> 	b2 = (y & 0x0F0F0F0F) ? 0 : 4;
> 	b1 = (y & 0x33333333) ? 0 : 2;
> 	b0 = (y & 0x55555555) ? 0 : 1;
> 	return bz + b4 + b3 + b2 + b1 + b0;
> }

Note: mask32 = ~0ul; mask16 = mask32 ^ (mask32 << 16), mask8 = ...

Greetings,
    Alexander
> 
> -dean
-- 
  Alexander van Heukelum
  heukelum@fastmail.fm

-- 
http://www.fastmail.fm - Or how I learned to stop worrying and
                          love email again


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] x86: generic versions of find_first_(zero_)bit, convert i386
  2008-04-06 18:51       ` Alexander van Heukelum
@ 2008-04-06 20:22         ` dean gaudet
  2008-04-07  8:43           ` Ingo Molnar
  2008-04-07 10:25           ` Alexander van Heukelum
  0 siblings, 2 replies; 34+ messages in thread
From: dean gaudet @ 2008-04-06 20:22 UTC (permalink / raw)
  To: Alexander van Heukelum
  Cc: Alexander van Heukelum, Ingo Molnar, Andi Kleen, LKML

On Sun, 6 Apr 2008, Alexander van Heukelum wrote:

> The current generic implementation of ffz is O(lg(n)) already

it's O(lg(n)) time... the operations all depend on each other.

the implementation i pointed to is O(lg(n)) code space... and the time 
depends on how parallel the machine is, they're not dependent on each 
other.

-dean

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] x86: generic versions of find_first_(zero_)bit, convert i386
  2008-04-06 20:22         ` dean gaudet
@ 2008-04-07  8:43           ` Ingo Molnar
  2008-04-07 10:25           ` Alexander van Heukelum
  1 sibling, 0 replies; 34+ messages in thread
From: Ingo Molnar @ 2008-04-07  8:43 UTC (permalink / raw)
  To: dean gaudet
  Cc: Alexander van Heukelum, Alexander van Heukelum, Andi Kleen, LKML


* dean gaudet <dean@arctic.org> wrote:

> On Sun, 6 Apr 2008, Alexander van Heukelum wrote:
> 
> > The current generic implementation of ffz is O(lg(n)) already
> 
> it's O(lg(n)) time... the operations all depend on each other.
> 
> the implementation i pointed to is O(lg(n)) code space... and the time 
> depends on how parallel the machine is, they're not dependent on each 
> other.

yep, i'd suggest we gravitate towards such a no-dependencies 
implementation for the generic code - especially modern CPUs would be 
able to execute them rather efficiently.

	Ingo

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] x86: generic versions of find_first_(zero_)bit, convert     i386
  2008-04-06 20:22         ` dean gaudet
  2008-04-07  8:43           ` Ingo Molnar
@ 2008-04-07 10:25           ` Alexander van Heukelum
  2008-04-18 20:18             ` Alternative implementation of the generic __ffs Alexander van Heukelum
  1 sibling, 1 reply; 34+ messages in thread
From: Alexander van Heukelum @ 2008-04-07 10:25 UTC (permalink / raw)
  To: dean gaudet; +Cc: Alexander van Heukelum, Ingo Molnar, Andi Kleen, LKML


On Sun, 6 Apr 2008 13:22:58 -0700 (PDT), "dean gaudet" <dean@arctic.org>
said:
> On Sun, 6 Apr 2008, Alexander van Heukelum wrote:
> 
> > The current generic implementation of ffz is O(lg(n)) already
> 
> it's O(lg(n)) time... the operations all depend on each other.
> 
> the implementation i pointed to is O(lg(n)) code space... and the time 
> depends on how parallel the machine is, they're not dependent on each 
> other.

Indeed. The worst dependencies are in the sum of all the partial
results in this implementation. And addition is associative, so
partial results can be written as ((a+b)+(c+d))+(e+f). Assuming
perfect parallel execution this would lead to O(ln(ln(n))). Good.

Care to implement ffs and __ffs like this?

Greetings,
    Alexander

> -dean
-- 
  Alexander van Heukelum
  heukelum@fastmail.fm

-- 
http://www.fastmail.fm - The professional email service


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Alternative implementation of the generic __ffs
  2008-04-07 10:25           ` Alexander van Heukelum
@ 2008-04-18 20:18             ` Alexander van Heukelum
  2008-04-18 23:46               ` dean gaudet
  0 siblings, 1 reply; 34+ messages in thread
From: Alexander van Heukelum @ 2008-04-18 20:18 UTC (permalink / raw)
  To: Alexander van Heukelum; +Cc: dean gaudet, Ingo Molnar, Andi Kleen, LKML

On Mon, Apr 07, 2008 at 12:25:50PM +0200, Alexander van Heukelum wrote:
> On Sun, 6 Apr 2008 13:22:58 -0700 (PDT), "dean gaudet" <dean@arctic.org> said:
> > On Sun, 6 Apr 2008, Alexander van Heukelum wrote:
> > > The current generic implementation of ffz is O(lg(n)) already
> > 
> > it's O(lg(n)) time... the operations all depend on each other.
> > 
> > the implementation i pointed to is O(lg(n)) code space... and the time 
> > depends on how parallel the machine is, they're not dependent on each 
> > other.
> 
> Indeed. The worst dependencies are in the sum of all the partial
> results in this implementation. And addition is associative, so
> partial results can be written as ((a+b)+(c+d))+(e+f). Assuming
> perfect parallel execution this would lead to O(ln(ln(n))). Good.

Hello all,

I've implemented ffs (find first set bit) like it is shown
in http://www.hackersdelight.org/ (see revisions, page 21).
I would be interested to see how it compares with the generic
implementation of __ffs in the linux kernel, in particular
on architectures that do not have an obvious fast way of
determining the first set bit of a word.

I have included a simple benchmark program that should give
a reasonable estimate of the performance ratio of the two
implementations. Please test and report :).

Is this implementation suitable to replace the current one?

Greetings,
	Alexander

P.S. Some results for some machines I could test on:

-----------

On a 2.1 GHz Athlon XP:
$ gcc -m32 -Os -fomit-frame-pointer ffs.c && time ./a.out
Original:        396 tics,   756 tics
New:             378 tics,   851 tics
Assembly:        262 tics,   383 tics
Empty loop:      128 tics,   203 tics

real    0m33.862s
user    0m33.026s
sys     0m0.344s

-----------

On a 2.33 GHz Xeon:
$ gcc -m64 -Os ffs.c && time ./a.out 
Original:        277 tics,   324 tics
New:             230 tics,   236 tics
Assembly:         90 tics,    84 tics
Empty loop:       90 tics,    82 tics

real    0m14.490s
user    0m14.270s
sys     0m0.220s
$ gcc -m32 -Os -fomit-frame-pointer ffs.c && time ./a.out
Original:        303 tics,   449 tics
New:             231 tics,   394 tics
Assembly:         90 tics,   144 tics
Empty loop:      102 tics,   116 tics

real    0m18.521s
user    0m18.380s
sys     0m0.140s

-----------

On an alpha EV6.7 (21264A) operating at 667 MHz:
$ gcc -Os ffs.c && time ./a.out
Original:        622 tics,   633 tics
New:             431 tics,   465 tics
Assembly:        235 tics,   210 tics
Empty loop:      199 tics,   218 tics
50.358u 0.259s 1:14.28 68.1%    0+1k 2+0io 2pf+0w

-----------

#include <stdio.h>
#include <sys/times.h>

#define LOOPS32 (1<<(30-5-1))
#define LOOPS64 (1<<(30-6-1))
#define ATTR __attribute__((noinline))

static ATTR int __ffs32_orig(unsigned int word)
{
	int num = 0;

	if ((word & 0xffff) == 0) {
		num += 16;
		word >>= 16;
	}
	if ((word & 0xff) == 0) {
		num += 8;
		word >>= 8;
	}
	if ((word & 0xf) == 0) {
		num += 4;
		word >>= 4;
	}
	if ((word & 0x3) == 0) {
		num += 2;
		word >>= 2;
	}
	if ((word & 0x1) == 0)
		num += 1;

	return num;
}

static ATTR int __ffs64_orig(unsigned long long word)
{
	int num = 0;

	if ((word & 0xffffffff) == 0) {
		num += 32;
		word >>= 32;
	}
	if ((word & 0xffff) == 0) {
		num += 16;
		word >>= 16;
	}
	if ((word & 0xff) == 0) {
		num += 8;
		word >>= 8;
	}
	if ((word & 0xf) == 0) {
		num += 4;
		word >>= 4;
	}
	if ((word & 0x3) == 0) {
		num += 2;
		word >>= 2;
	}
	if ((word & 0x1) == 0)
		num += 1;

	return num;
}

static ATTR int __ffs32_new(unsigned int value)
{
	int x0, x1, x2, x3, x4;

	value &= -value;
	x0 = (value & 0x55555555) ? 0 : 1;
	x1 = (value & 0x33333333) ? 0 : 2;
	x2 = (value & 0x0f0f0f0f) ? 0 : 4;
	x3 = (value & 0x00ff00ff) ? 0 : 8;
	x4 = (value & 0x0000ffff) ? 0 : 16;

	return x0 | x1 | x2 | x3 | x4;
}

static ATTR int __ffs64_new(unsigned long long value)
{
	int x0, x1, x2, x3, x4, x5;

	value &= -value;
	x0 = (value & 0x5555555555555555ull) ? 0 : 1;
	x1 = (value & 0x3333333333333333ull) ? 0 : 2;
	x2 = (value & 0x0f0f0f0f0f0f0f0full) ? 0 : 4;
	x3 = (value & 0x00ff00ff00ff00ffull) ? 0 : 8;
	x4 = (value & 0x0000ffff0000ffffull) ? 0 : 16;
	x5 = (value & 0x00000000ffffffffull) ? 0 : 32;

	return x0 | x1 | x2 | x3 | x4 | x5;
}

#ifdef __x86_64__
#define ASSEMBLYVERSION
static ATTR int __ffs32_asm(unsigned int value)
{
	int res;

	asm volatile("bsf %[value], %[res]"
		: [res] "=r" (res)
		: [value] "r" (value)
	);	

	return res;
}

static ATTR int __ffs64_asm(unsigned long long value)
{
	long res;

	asm volatile("bsf %[value], %[res]"
		: [res] "=r" (res)
		: [value] "r" (value)
	);	

	return res;
}
#endif

#ifdef __i386__
#define ASSEMBLYVERSION
static ATTR int __ffs32_asm(unsigned int value)
{
	int res;

	asm volatile("bsf %[value], %[res]"
		: [res] "=r" (res)
		: [value] "r" (value)
	);	

	return res;
}

static ATTR int __ffs64_asm(unsigned long long value)
{
	unsigned int low, high;
	int res;

	low = value;
	if (low) {
		asm volatile("bsf %[value], %[res]"
			: [res] "=r" (res)
			: [value] "r" (low)
		);
		return res;
	}

	high = value >> 32;
	asm volatile("bsf %[value], %[res]"
		: [res] "=r" (res)
		: [value] "r" (high)
	);

	return res | 32;
}
#endif

#ifdef __alpha__
#define ASSEMBLYVERSION
static ATTR int __ffs32_asm(unsigned int value)
{
	int res;

	asm volatile("cttz %[value], %[res]"
		: [res] "=r" (res)
		: [value] "r" (value)
	);	

	return res;
}

static ATTR int __ffs64_asm(unsigned long long value)
{
	long res;

	asm volatile("cttz %[value], %[res]"
		: [res] "=r" (res)
		: [value] "r" (value)
	);	

	return res;
}
#endif

static ATTR int __nothing32(unsigned int value)
{
	return value;
}

static ATTR int __nothing64(unsigned long long value)
{
	return (int)value;
}

/* Random numbers: modified version of libc mrand48 */
static unsigned long long myrand_next = 0x330eull;
static const unsigned long long myrand_a = 0x5deece66dull;
static const unsigned long long myrand_b = 0xbull;
unsigned int myrand(void)
{
	myrand_next = myrand_a * myrand_next + myrand_b;
	return (unsigned int)(myrand_next >> 16);
}

unsigned long long myrand64(void)
{
	return ((unsigned long long)myrand() << 32) | myrand();
}

void myrand_seed(unsigned int seed)
{
	int n;
	myrand_next = ((unsigned long long)seed << 16) + 0x330eull;
	for (n=0; n<100; n++) myrand(); /* warm it up a bit */
}

/* tic: wait for next clock tick and save current tick */
clock_t tictime;
void tic(void)
{
	struct tms t;
	times(&t);
	tictime = t.tms_utime;
	do {
		times(&t);
	} while (tictime == t.tms_utime);
	tictime = t.tms_utime;
}

/* toc: report number of ticks since tic() */
long toc(void)
{
	struct tms t;
	times(&t);
	return (long)(t.tms_utime - tictime);
}

__attribute__((noinline))
int bench_orig32(void)
{
	unsigned int i;
	int n, res = 0;

	myrand_seed(0);
	for (n=0; n<LOOPS32; n++) {
		i = myrand();
		while (i) {
			res ^= __ffs32_orig(i);
			i &= i - 1;	
		}	
	}
	return res;
}

__attribute__((noinline))
int bench_orig64(void)
{
	unsigned long long i;
	int n, res = 0;

	myrand_seed(0);
	for (n=0; n<LOOPS64; n++) {
		i = myrand64();
		while (i) {
			res ^= __ffs64_orig(i);
			i &= i - 1;	
		}	
	}
	return res;
}

__attribute__((noinline))
int bench_new32(void)
{
	unsigned int i;
	int n, res = 0;

	myrand_seed(0);
	for (n=0; n<LOOPS32; n++) {
		i = myrand();
		while (i) {
			res ^= __ffs32_new(i);
			i &= i - 1;	
		}	
	}
	return res;
}

__attribute__((noinline))
int bench_new64(void)
{
	unsigned long long i;
	int n, res = 0;

	myrand_seed(0);
	for (n=0; n<LOOPS64; n++) {
		i = myrand64();
		while (i) {
			res ^= __ffs64_new(i);
			i &= i - 1;	
		}	
	}
	return res;
}

#ifdef ASSEMBLYVERSION
__attribute__((noinline))
int bench_asm32(void)
{
	unsigned int i;
	int n, res = 0;

	myrand_seed(0);
	for (n=0; n<LOOPS32; n++) {
		i = myrand();
		while (i) {
			res ^= __ffs32_asm(i);
			i &= i - 1;	
		}	
	}
	return res;
}

__attribute__((noinline))
int bench_asm64(void)
{
	unsigned long long i;
	int n, res = 0;

	myrand_seed(0);
	for (n=0; n<LOOPS64; n++) {
		i = myrand64();
		while (i) {
			res ^= __ffs64_asm(i);
			i &= i - 1;	
		}	
	}
	return res;
}
#endif

__attribute__((noinline))
int bench_none32(void)
{
	unsigned int i;
	int n, res = 0;

	myrand_seed(0);
	for (n=0; n<LOOPS32; n++) {
		i = myrand();
		while (i) {
			res ^= __nothing32(i);
			i &= i - 1;	
		}	
	}
	return res;
}

__attribute__((noinline))
int bench_none64(void)
{
	unsigned long long i;
	int n, res = 0;

	myrand_seed(0);
	for (n=0; n<LOOPS64; n++) {
		i = myrand64();
		while (i) {
			res ^= __nothing64(i);
			i &= i - 1;
		}	
	}
	return res;
}

void test(void)
{
	unsigned long long int i;
	int n, res1, res2;

	myrand_seed(0);

	for (n=0; n<1000; n++) {
		i = myrand64();
		while (i) {
			res1 = __ffs64_orig(i);
			res2 = __ffs64_new(i);
			if (res1 != res2) {
				printf("%llu %i %i\n", i,
						res1, res2);
			}
			i &= i - 1;
		}
	}
}

int main(void)
{
	long tics;

	setvbuf(stdout, NULL, _IONBF, 0);
	test();

	printf("%-14s", "Original:");
	tic(); bench_orig32(); tics = toc();
	printf("%6lu tics,", tics);
	tic(); bench_orig64(); tics = toc();
	printf("%6lu tics\n", tics);

	printf("%-14s", "New:");
	tic(); bench_new32(); tics = toc();
	printf("%6lu tics,", tics);
	tic(); bench_new64(); tics = toc();
	printf("%6lu tics\n", tics);

#ifdef ASSEMBLYVERSION
	printf("%-14s", "Assembly:");
	tic(); bench_asm32(); tics = toc();
	printf("%6lu tics,", tics);
	tic(); bench_asm64(); tics = toc();
	printf("%6lu tics\n", tics);
#endif

	printf("%-14s", "Empty loop:");
	tic(); bench_none32(); tics = toc();
	printf("%6lu tics,", tics);
	tic(); bench_none64(); tics = toc();
	printf("%6lu tics\n", tics);

	return 0;
}

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: Alternative implementation of the generic __ffs
  2008-04-18 20:18             ` Alternative implementation of the generic __ffs Alexander van Heukelum
@ 2008-04-18 23:46               ` dean gaudet
  2008-04-19  0:09                 ` Harvey Harrison
  0 siblings, 1 reply; 34+ messages in thread
From: dean gaudet @ 2008-04-18 23:46 UTC (permalink / raw)
  To: Alexander van Heukelum
  Cc: Alexander van Heukelum, Ingo Molnar, Andi Kleen, LKML

On Fri, 18 Apr 2008, Alexander van Heukelum wrote:

> On Mon, Apr 07, 2008 at 12:25:50PM +0200, Alexander van Heukelum wrote:
> > On Sun, 6 Apr 2008 13:22:58 -0700 (PDT), "dean gaudet" <dean@arctic.org> said:
> > > On Sun, 6 Apr 2008, Alexander van Heukelum wrote:
> > > > The current generic implementation of ffz is O(lg(n)) already
> > > 
> > > it's O(lg(n)) time... the operations all depend on each other.
> > > 
> > > the implementation i pointed to is O(lg(n)) code space... and the time 
> > > depends on how parallel the machine is, they're not dependent on each 
> > > other.
> > 
> > Indeed. The worst dependencies are in the sum of all the partial
> > results in this implementation. And addition is associative, so
> > partial results can be written as ((a+b)+(c+d))+(e+f). Assuming
> > perfect parallel execution this would lead to O(ln(ln(n))). Good.
> 
> Hello all,
> 
> I've implemented ffs (find first set bit) like it is shown
> in http://www.hackersdelight.org/ (see revisions, page 21).

sweet!  thanks for doing this.


> static ATTR int __ffs32_new(unsigned int value)
> {
> 	int x0, x1, x2, x3, x4;
> 
> 	value &= -value;
> 	x0 = (value & 0x55555555) ? 0 : 1;
> 	x1 = (value & 0x33333333) ? 0 : 2;
> 	x2 = (value & 0x0f0f0f0f) ? 0 : 4;
> 	x3 = (value & 0x00ff00ff) ? 0 : 8;
> 	x4 = (value & 0x0000ffff) ? 0 : 16;

technically you can compute x4 with the original value prior to isolating 
the least-significant one-bit -- the compiler probably can't figure this 
out on its own though, so it's probably worth hoisting it manually.


> 	return x0 | x1 | x2 | x3 | x4;

i'm never sure if it's better to use | or + here... i bet it depends on 
what native operations the processor has... and depends on how ?: are 
implemented.

-dean

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: Alternative implementation of the generic __ffs
  2008-04-18 23:46               ` dean gaudet
@ 2008-04-19  0:09                 ` Harvey Harrison
  2008-04-19  0:20                   ` dean gaudet
  0 siblings, 1 reply; 34+ messages in thread
From: Harvey Harrison @ 2008-04-19  0:09 UTC (permalink / raw)
  To: dean gaudet
  Cc: Alexander van Heukelum, Alexander van Heukelum, Ingo Molnar,
	Andi Kleen, LKML

On Fri, 2008-04-18 at 16:46 -0700, dean gaudet wrote:
> On Fri, 18 Apr 2008, Alexander van Heukelum wrote:
> 
> > On Mon, Apr 07, 2008 at 12:25:50PM +0200, Alexander van Heukelum wrote:
> > > On Sun, 6 Apr 2008 13:22:58 -0700 (PDT), "dean gaudet" <dean@arctic.org> said:
> > > > On Sun, 6 Apr 2008, Alexander van Heukelum wrote:
> > > > > The current generic implementation of ffz is O(lg(n)) already
> > > > 
> > > > it's O(lg(n)) time... the operations all depend on each other.
> > > > 
> > > > the implementation i pointed to is O(lg(n)) code space... and the time 
> > > > depends on how parallel the machine is, they're not dependent on each 
> > > > other.
> > > 
> > > Indeed. The worst dependencies are in the sum of all the partial
> > > results in this implementation. And addition is associative, so
> > > partial results can be written as ((a+b)+(c+d))+(e+f). Assuming
> > > perfect parallel execution this would lead to O(ln(ln(n))). Good.
> > 
> > Hello all,
> > 
> > I've implemented ffs (find first set bit) like it is shown
> > in http://www.hackersdelight.org/ (see revisions, page 21).
> 
> sweet!  thanks for doing this.
> 
> 
> > static ATTR int __ffs32_new(unsigned int value)
> > {
> > 	int x0, x1, x2, x3, x4;
> > 
> > 	value &= -value;
> > 	x0 = (value & 0x55555555) ? 0 : 1;
> > 	x1 = (value & 0x33333333) ? 0 : 2;
> > 	x2 = (value & 0x0f0f0f0f) ? 0 : 4;
> > 	x3 = (value & 0x00ff00ff) ? 0 : 8;
> > 	x4 = (value & 0x0000ffff) ? 0 : 16;

How about:
	u8 x;

	value &= -value;
	x = (value & 0x55555555) ? 0 : 1;
	x |= (value & 0x33333333) ? 0 : 2;
	x |= (value & 0x0f0f0f0f) ? 0 : 4;
	x |= (value & 0x00ff00ff) ? 0 : 8;
	x |= (value & 0x0000ffff) ? 0 : 16;

	return x;

Harvey


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: Alternative implementation of the generic __ffs
  2008-04-19  0:09                 ` Harvey Harrison
@ 2008-04-19  0:20                   ` dean gaudet
  2008-04-19  0:58                     ` Joe Perches
  0 siblings, 1 reply; 34+ messages in thread
From: dean gaudet @ 2008-04-19  0:20 UTC (permalink / raw)
  To: Harvey Harrison
  Cc: Alexander van Heukelum, Alexander van Heukelum, Ingo Molnar,
	Andi Kleen, LKML

On Fri, 18 Apr 2008, Harvey Harrison wrote:

> On Fri, 2008-04-18 at 16:46 -0700, dean gaudet wrote:
> > On Fri, 18 Apr 2008, Alexander van Heukelum wrote:
> > 
> > > On Mon, Apr 07, 2008 at 12:25:50PM +0200, Alexander van Heukelum wrote:
> > > > On Sun, 6 Apr 2008 13:22:58 -0700 (PDT), "dean gaudet" <dean@arctic.org> said:
> > > > > On Sun, 6 Apr 2008, Alexander van Heukelum wrote:
> > > > > > The current generic implementation of ffz is O(lg(n)) already
> > > > > 
> > > > > it's O(lg(n)) time... the operations all depend on each other.
> > > > > 
> > > > > the implementation i pointed to is O(lg(n)) code space... and the time 
> > > > > depends on how parallel the machine is, they're not dependent on each 
> > > > > other.
> > > > 
> > > > Indeed. The worst dependencies are in the sum of all the partial
> > > > results in this implementation. And addition is associative, so
> > > > partial results can be written as ((a+b)+(c+d))+(e+f). Assuming
> > > > perfect parallel execution this would lead to O(ln(ln(n))). Good.
> > > 
> > > Hello all,
> > > 
> > > I've implemented ffs (find first set bit) like it is shown
> > > in http://www.hackersdelight.org/ (see revisions, page 21).
> > 
> > sweet!  thanks for doing this.
> > 
> > 
> > > static ATTR int __ffs32_new(unsigned int value)
> > > {
> > > 	int x0, x1, x2, x3, x4;
> > > 
> > > 	value &= -value;
> > > 	x0 = (value & 0x55555555) ? 0 : 1;
> > > 	x1 = (value & 0x33333333) ? 0 : 2;
> > > 	x2 = (value & 0x0f0f0f0f) ? 0 : 4;
> > > 	x3 = (value & 0x00ff00ff) ? 0 : 8;
> > > 	x4 = (value & 0x0000ffff) ? 0 : 16;
> 
> How about:
> 	u8 x;
> 
> 	value &= -value;
> 	x = (value & 0x55555555) ? 0 : 1;
> 	x |= (value & 0x33333333) ? 0 : 2;
> 	x |= (value & 0x0f0f0f0f) ? 0 : 4;
> 	x |= (value & 0x00ff00ff) ? 0 : 8;
> 	x |= (value & 0x0000ffff) ? 0 : 16;

any reasonable compiler should figure out the two are the same... but i 
really prefer spelling out the lack of dependencies of the computations by 
breaking it out per-bit.

-dean

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: Alternative implementation of the generic __ffs
  2008-04-19  0:20                   ` dean gaudet
@ 2008-04-19  0:58                     ` Joe Perches
  2008-04-19  1:04                       ` Harvey Harrison
  0 siblings, 1 reply; 34+ messages in thread
From: Joe Perches @ 2008-04-19  0:58 UTC (permalink / raw)
  To: dean gaudet
  Cc: Harvey Harrison, Alexander van Heukelum, Alexander van Heukelum,
	Ingo Molnar, Andi Kleen, LKML

On Fri, 2008-04-18 at 17:20 -0700, dean gaudet wrote:
> any reasonable compiler should figure out the two are the same... but i 
> really prefer spelling out the lack of dependencies of the computations by 
> breaking it out per-bit.

It seems gcc 4.3 (-Os or -O2) isn't a reasonable compiler.

I think this might be best:

int ffs32(unsigned int value)
{
	int x;

	value &= -value;
	if (!(value & 0x55555555))
		x = 1;
	else
		x = 0;
	if (!(value & 0x33333333))
		x |= 2;
	if (!(value & 0x0f0f0f0f))
		x |= 4;
	if (!(value & 0x00ff00ff))
		x |= 8;
	if (!(value & 0x0000ffff))
		x |= 16;

	return x;
}


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: Alternative implementation of the generic __ffs
  2008-04-19  0:58                     ` Joe Perches
@ 2008-04-19  1:04                       ` Harvey Harrison
  2008-04-19  1:11                         ` dean gaudet
  0 siblings, 1 reply; 34+ messages in thread
From: Harvey Harrison @ 2008-04-19  1:04 UTC (permalink / raw)
  To: Joe Perches
  Cc: dean gaudet, Alexander van Heukelum, Alexander van Heukelum,
	Ingo Molnar, Andi Kleen, LKML

On Fri, 2008-04-18 at 17:58 -0700, Joe Perches wrote:
> On Fri, 2008-04-18 at 17:20 -0700, dean gaudet wrote:
> > any reasonable compiler should figure out the two are the same... but i 
> > really prefer spelling out the lack of dependencies of the computations by 
> > breaking it out per-bit.
> 
> It seems gcc 4.3 (-Os or -O2) isn't a reasonable compiler.
> 
> I think this might be best:
> 
> int ffs32(unsigned int value)
> {
> 	int x;
> 
> 	value &= -value;
> 	if (!(value & 0x55555555))
> 		x = 1;
> 	else
> 		x = 0;
> 	if (!(value & 0x33333333))
> 		x |= 2;
> 	if (!(value & 0x0f0f0f0f))
> 		x |= 4;
> 	if (!(value & 0x00ff00ff))
> 		x |= 8;
> 	if (!(value & 0x0000ffff))
> 		x |= 16;
> 
> 	return x;
> }
> 

That produces the shortest assembly for me, also uses the fewest
registers.

Harvey


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: Alternative implementation of the generic __ffs
  2008-04-19  1:04                       ` Harvey Harrison
@ 2008-04-19  1:11                         ` dean gaudet
  2008-04-19  2:55                           ` Joe Perches
  0 siblings, 1 reply; 34+ messages in thread
From: dean gaudet @ 2008-04-19  1:11 UTC (permalink / raw)
  To: Harvey Harrison
  Cc: Joe Perches, Alexander van Heukelum, Alexander van Heukelum,
	Ingo Molnar, Andi Kleen, LKML

On Fri, 18 Apr 2008, Harvey Harrison wrote:

> On Fri, 2008-04-18 at 17:58 -0700, Joe Perches wrote:
> > On Fri, 2008-04-18 at 17:20 -0700, dean gaudet wrote:
> > > any reasonable compiler should figure out the two are the same... but i 
> > > really prefer spelling out the lack of dependencies of the computations by 
> > > breaking it out per-bit.
> > 
> > It seems gcc 4.3 (-Os or -O2) isn't a reasonable compiler.
> > 
> > I think this might be best:
> > 
> > int ffs32(unsigned int value)
> > {
> > 	int x;
> > 
> > 	value &= -value;
> > 	if (!(value & 0x55555555))
> > 		x = 1;
> > 	else
> > 		x = 0;
> > 	if (!(value & 0x33333333))
> > 		x |= 2;
> > 	if (!(value & 0x0f0f0f0f))
> > 		x |= 4;
> > 	if (!(value & 0x00ff00ff))
> > 		x |= 8;
> > 	if (!(value & 0x0000ffff))
> > 		x |= 16;
> > 
> > 	return x;
> > }
> > 
> 
> That produces the shortest assembly for me, also uses the fewest
> registers.

unfortunately it kind of defeats the purpose of the original code... which 
is high parallelism / no-dependencies.

have you benchmarked it?

-dean


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: Alternative implementation of the generic __ffs
  2008-04-19  1:11                         ` dean gaudet
@ 2008-04-19  2:55                           ` Joe Perches
  2008-04-19  4:13                             ` dean gaudet
  2008-04-19 22:29                             ` Matti Aarnio
  0 siblings, 2 replies; 34+ messages in thread
From: Joe Perches @ 2008-04-19  2:55 UTC (permalink / raw)
  To: dean gaudet
  Cc: Harvey Harrison, Alexander van Heukelum, Alexander van Heukelum,
	Ingo Molnar, Andi Kleen, LKML

On Fri, 2008-04-18 at 18:11 -0700, dean gaudet wrote: 
> have you benchmarked it?

I modified Alexander's benchmark:
http://lkml.org/lkml/2008/4/18/267
to include 32 and 64 bit variants called smallest.

On an old ARM:

$ gcc --version
gcc (GCC) 3.4.6

$ cat /proc/cpuinfo 
Processor	: Intel StrongARM-110 rev 4 (v4l)
BogoMIPS	: 262.14
Hardware	: Rebel-NetWinder
Revision	: 57ff
Serial		: 000000000000185c

$ gcc -Os -fomit-frame-pointer ffs.c
$ ./a.out
Original:       3180 tics,  8379 tics
New:            4280 tics,  8890 tics
Smallest:       4027 tics,  7835 tics
Empty loop:     1543 tics,  2260 tics

$ gcc -O2 -fomit-frame-pointer ffs.c
$ ./a.out
Original:       3161 tics,  7843 tics
New:            4778 tics,  8783 tics
Smallest:       4408 tics,  7149 tics
Empty loop:     1515 tics,  2140 tics

$ gcc -O3 -fomit-frame-pointer ffs.c
$ ./a.out
Original:       3078 tics,  7692 tics
New:            4714 tics,  8671 tics
Smallest:       4344 tics,  7117 tics
Empty loop:     1444 tics,  2024 tics

On my old laptop:

$ cat /proc/cpuinfo 
processor       : 0
vendor_id       : GenuineIntel
cpu family      : 15
model           : 2
model name      : Intel(R) Pentium(R) 4 Mobile CPU 1.60GHz
stepping        : 4
cpu MHz         : 1200.000
cache size      : 512 KB
fdiv_bug        : no
hlt_bug         : no
f00f_bug        : no
coma_bug        : no
fpu             : yes
fpu_exception   : yes
cpuid level     : 2
wp              : yes
flags           : fpu vme de pse tsc msr pae mce cx8 sep mtrr pge mca
cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm up
bogomips        : 2400.03
clflush size    : 64

$ gcc --version
gcc (GCC) 4.3.0

$ gcc -Os -fomit-frame-pointer ffs.c
$ ./a.out
Original:        901 tics,  1426 tics
New:             862 tics,  1244 tics
Smallest:        911 tics,  1331 tics
Assembly:        208 tics,   402 tics
Empty loop:      208 tics,   304 tics

$ gcc -O2 -fomit-frame-pointer ffs.c
$ ./a.out
Original:        918 tics,  1386 tics
New:             872 tics,  1193 tics
Smallest:        906 tics,  1309 tics
Assembly:        202 tics,   396 tics
Empty loop:      207 tics,   299 tics

$ gcc -O3 -fomit-frame-pointer ffs.c
$ ./a.out
Original:        865 tics,  1389 tics
New:             852 tics,  1183 tics
Smallest:        907 tics,  1296 tics
Assembly:        200 tics,   390 tics
Empty loop:      211 tics,   296 tics



^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: Alternative implementation of the generic __ffs
  2008-04-19  2:55                           ` Joe Perches
@ 2008-04-19  4:13                             ` dean gaudet
  2008-04-19 10:05                               ` Mikael Pettersson
  2008-04-19 12:10                               ` Alexander van Heukelum
  2008-04-19 22:29                             ` Matti Aarnio
  1 sibling, 2 replies; 34+ messages in thread
From: dean gaudet @ 2008-04-19  4:13 UTC (permalink / raw)
  To: Joe Perches
  Cc: Harvey Harrison, Alexander van Heukelum, Alexander van Heukelum,
	Ingo Molnar, Andi Kleen, LKML

On Fri, 18 Apr 2008, Joe Perches wrote:

> On Fri, 2008-04-18 at 18:11 -0700, dean gaudet wrote: 
> > have you benchmarked it?
> 
> I modified Alexander's benchmark:
> http://lkml.org/lkml/2008/4/18/267
> to include 32 and 64 bit variants called smallest.
> 
> On an old ARM:

i'm guessing the 32-bit constants suck :(

the code could be modified to use 16-bit constants only -- it would add 
some dependent operations though (to move the hot bit into the low 
16-bits).

-dean


> 
> $ gcc --version gcc (GCC) 3.4.6
> 
> $ cat /proc/cpuinfo 
> Processor	: Intel StrongARM-110 rev 4 (v4l)
> BogoMIPS	: 262.14
> Hardware	: Rebel-NetWinder
> Revision	: 57ff
> Serial		: 000000000000185c
> 
> $ gcc -Os -fomit-frame-pointer ffs.c
> $ ./a.out
> Original:       3180 tics,  8379 tics
> New:            4280 tics,  8890 tics
> Smallest:       4027 tics,  7835 tics
> Empty loop:     1543 tics,  2260 tics
> 
> $ gcc -O2 -fomit-frame-pointer ffs.c
> $ ./a.out
> Original:       3161 tics,  7843 tics
> New:            4778 tics,  8783 tics
> Smallest:       4408 tics,  7149 tics
> Empty loop:     1515 tics,  2140 tics
> 
> $ gcc -O3 -fomit-frame-pointer ffs.c
> $ ./a.out
> Original:       3078 tics,  7692 tics
> New:            4714 tics,  8671 tics
> Smallest:       4344 tics,  7117 tics
> Empty loop:     1444 tics,  2024 tics

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: Alternative implementation of the generic __ffs
  2008-04-19  4:13                             ` dean gaudet
@ 2008-04-19 10:05                               ` Mikael Pettersson
  2008-04-19 12:10                               ` Alexander van Heukelum
  1 sibling, 0 replies; 34+ messages in thread
From: Mikael Pettersson @ 2008-04-19 10:05 UTC (permalink / raw)
  To: dean gaudet
  Cc: Joe Perches, Harvey Harrison, Alexander van Heukelum,
	Alexander van Heukelum, Ingo Molnar, Andi Kleen, LKML

dean gaudet writes:
 > On Fri, 18 Apr 2008, Joe Perches wrote:
 > 
 > > On Fri, 2008-04-18 at 18:11 -0700, dean gaudet wrote: 
 > > > have you benchmarked it?
 > > 
 > > I modified Alexander's benchmark:
 > > http://lkml.org/lkml/2008/4/18/267
 > > to include 32 and 64 bit variants called smallest.
 > > 
 > > On an old ARM:
 > 
 > i'm guessing the 32-bit constants suck :(
 > 
 > the code could be modified to use 16-bit constants only

16-bit immediates don't help, as ARMs express immediate
operands in arithmetic instructions as 8-bit values plus
a 4-bit rotation count (which is multiplied by 2).

Very new ARMs can construct full 16-bit immediates, but
that still takes an additional instruction and an additional
register.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: Alternative implementation of the generic __ffs
  2008-04-19  4:13                             ` dean gaudet
  2008-04-19 10:05                               ` Mikael Pettersson
@ 2008-04-19 12:10                               ` Alexander van Heukelum
  2008-04-19 18:17                                 ` Joe Perches
  1 sibling, 1 reply; 34+ messages in thread
From: Alexander van Heukelum @ 2008-04-19 12:10 UTC (permalink / raw)
  To: dean gaudet, Joe Perches
  Cc: Harvey Harrison, Alexander van Heukelum, Ingo Molnar, Andi Kleen, LKML

On Fri, 18 Apr 2008 21:13:47 -0700 (PDT), "dean gaudet"
<dean@arctic.org> said:
> On Fri, 18 Apr 2008, Joe Perches wrote:
> > On Fri, 2008-04-18 at 18:11 -0700, dean gaudet wrote: 
> > > have you benchmarked it?
> > 
> > I modified Alexander's benchmark:
> > http://lkml.org/lkml/2008/4/18/267
> > to include 32 and 64 bit variants called smallest.
> > 
> > On an old ARM:
> 
> i'm guessing the 32-bit constants suck :(
> 
> the code could be modified to use 16-bit constants only -- it would add 
> some dependent operations though (to move the hot bit into the low 
> 16-bits).
> 
> -dean

That would look like this (although I chose to reduce to less than 128,
due to completely irrelevant x86 considerations ;) ).

static ATTR int __ffs32_smallconstant(unsigned int value)
{
        int x0, x1, x2, x3, x4;
        unsigned int t2, t4;

        value &= -value;
        t2 = value | (value >> 16);
        t4 = t2 | (t2 >> 8);
        x4 = (value << 16) ? 0 : 16;
        x3 = (t2 << 24) ? 0 : 8;
        x2 = (t4 & 0x0f) ? 0 : 4;
        x1 = (t4 & 0x33) ? 0 : 2;
        x0 = (t4 & 0x55) ? 0 : 1;

        return x4 | x3 | x2 | x1 | x0;
}

I've added that to the benchmark, which you can now find here:
http://heukelum.fastmail.fm/ffs/. Testing the same with 
"return x4 + x3 + x2 + x1 + x0;" as the last line would be
interesting too.

Greetings,
    Alexander

> > $ gcc --version gcc (GCC) 3.4.6
> > 
> > $ cat /proc/cpuinfo 
> > Processor	: Intel StrongARM-110 rev 4 (v4l)
> > BogoMIPS	: 262.14
> > Hardware	: Rebel-NetWinder
> > Revision	: 57ff
> > Serial		: 000000000000185c
> > 
> > $ gcc -Os -fomit-frame-pointer ffs.c
> > $ ./a.out
> > Original:       3180 tics,  8379 tics
> > New:            4280 tics,  8890 tics
> > Smallest:       4027 tics,  7835 tics
> > Empty loop:     1543 tics,  2260 tics
> > 
> > $ gcc -O2 -fomit-frame-pointer ffs.c
> > $ ./a.out
> > Original:       3161 tics,  7843 tics
> > New:            4778 tics,  8783 tics
> > Smallest:       4408 tics,  7149 tics
> > Empty loop:     1515 tics,  2140 tics
> > 
> > $ gcc -O3 -fomit-frame-pointer ffs.c
> > $ ./a.out
> > Original:       3078 tics,  7692 tics
> > New:            4714 tics,  8671 tics
> > Smallest:       4344 tics,  7117 tics
> > Empty loop:     1444 tics,  2024 tics

Thanks for testing, Harvey!
-- 
  Alexander van Heukelum
  heukelum@fastmail.fm

-- 
http://www.fastmail.fm - IMAP accessible web-mail


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: Alternative implementation of the generic __ffs
  2008-04-19 12:10                               ` Alexander van Heukelum
@ 2008-04-19 18:17                                 ` Joe Perches
  2008-04-19 20:26                                   ` Alexander van Heukelum
  0 siblings, 1 reply; 34+ messages in thread
From: Joe Perches @ 2008-04-19 18:17 UTC (permalink / raw)
  To: Alexander van Heukelum
  Cc: dean gaudet, Harvey Harrison, Alexander van Heukelum,
	Ingo Molnar, Andi Kleen, LKML

On Sat, 2008-04-19 at 14:10 +0200, Alexander van Heukelum wrote:
> I've added that to the benchmark, which you can now find here:
> http://heukelum.fastmail.fm/ffs/.

retested on arm:

$ gcc -Os -fomit-frame-pointer ffs.c
$ ./a.out
Original:       3170 tics,  8326 tics
New:            4214 tics,  8793 tics
Smallest:       4023 tics,  7733 tics
Small const:    3442 tics,  6188 tics
Empty loop:     1517 tics,  2243 tics

$ gcc -O2 -fomit-frame-pointer ffs.c
$ ./a.out
Original:       3172 tics,  7832 tics
New:            4805 tics,  8790 tics
Smallest:       4405 tics,  7154 tics
Small const:    3442 tics,  5612 tics
Empty loop:     1516 tics,  2145 tics

$ gcc -O3 -fomit-frame-pointer ffs.c
$ ./a.out
Original:       3080 tics,  7709 tics
New:            4723 tics,  8656 tics
Smallest:       4333 tics,  7121 tics
Small const:    3379 tics,  5483 tics
Empty loop:     1447 tics,  2016 tics

>  Testing the same with 
> "return x4 + x3 + x2 + x1 + x0;" as the last line would be
> interesting too.

Adding is slower:

$ gcc -Os -fomit-frame-pointer ffs.c
$ ./a.out
Original:       3152 tics,  8310 tics
New:            4214 tics,  8789 tics
Smallest:       4024 tics,  7737 tics
Small const:    3538 tics,  6295 tics
Empty loop:     1517 tics,  2243 tics

$ gcc -O2 -fomit-frame-pointer ffs.c
$ ./a.out
Original:       3184 tics,  7849 tics
New:            4790 tics,  8814 tics
Smallest:       4406 tics,  7161 tics
Small const:    3538 tics,  5806 tics
Empty loop:     1521 tics,  2153 tics

$ gcc -O3 -fomit-frame-pointer ffs.c
$ ./a.out
Original:       3091 tics,  7694 tics
New:            4718 tics,  8656 tics
Smallest:       4333 tics,  7124 tics
Small const:    3467 tics,  5687 tics
Empty loop:     1445 tics,  2066 tics



^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: Alternative implementation of the generic __ffs
  2008-04-19 18:17                                 ` Joe Perches
@ 2008-04-19 20:26                                   ` Alexander van Heukelum
  0 siblings, 0 replies; 34+ messages in thread
From: Alexander van Heukelum @ 2008-04-19 20:26 UTC (permalink / raw)
  To: Joe Perches
  Cc: dean gaudet, Harvey Harrison, Alexander van Heukelum,
	Ingo Molnar, Andi Kleen, LKML

On Sat, 19 Apr 2008 11:17:01 -0700, "Joe Perches" <joe@perches.com>
said:
> On Sat, 2008-04-19 at 14:10 +0200, Alexander van Heukelum wrote:
> > I've added that to the benchmark, which you can now find here:
> > http://heukelum.fastmail.fm/ffs/.

Thanks! Added the version you sent to the program and added the results
of the ARM processor to the page.

More ideas welcome ;).

> retested on arm:
> 
> $ gcc -Os -fomit-frame-pointer ffs.c
> $ ./a.out
> Original:       3170 tics,  8326 tics
> New:            4214 tics,  8793 tics
> Smallest:       4023 tics,  7733 tics
> Small const:    3442 tics,  6188 tics
> Empty loop:     1517 tics,  2243 tics
> 
> $ gcc -O2 -fomit-frame-pointer ffs.c
> $ ./a.out
> Original:       3172 tics,  7832 tics
> New:            4805 tics,  8790 tics
> Smallest:       4405 tics,  7154 tics
> Small const:    3442 tics,  5612 tics
> Empty loop:     1516 tics,  2145 tics
> 
> $ gcc -O3 -fomit-frame-pointer ffs.c
> $ ./a.out
> Original:       3080 tics,  7709 tics
> New:            4723 tics,  8656 tics
> Smallest:       4333 tics,  7121 tics
> Small const:    3379 tics,  5483 tics
> Empty loop:     1447 tics,  2016 tics
> 
> >  Testing the same with 
> > "return x4 + x3 + x2 + x1 + x0;" as the last line would be
> > interesting too.
> 
> Adding is slower:
> 
> $ gcc -Os -fomit-frame-pointer ffs.c
> $ ./a.out
> Original:       3152 tics,  8310 tics
> New:            4214 tics,  8789 tics
> Smallest:       4024 tics,  7737 tics
> Small const:    3538 tics,  6295 tics
> Empty loop:     1517 tics,  2243 tics
> 
> $ gcc -O2 -fomit-frame-pointer ffs.c
> $ ./a.out
> Original:       3184 tics,  7849 tics
> New:            4790 tics,  8814 tics
> Smallest:       4406 tics,  7161 tics
> Small const:    3538 tics,  5806 tics
> Empty loop:     1521 tics,  2153 tics
> 
> $ gcc -O3 -fomit-frame-pointer ffs.c
> $ ./a.out
> Original:       3091 tics,  7694 tics
> New:            4718 tics,  8656 tics
> Smallest:       4333 tics,  7124 tics
> Small const:    3467 tics,  5687 tics
> Empty loop:     1445 tics,  2066 tics
> 
> 
-- 
  Alexander van Heukelum
  heukelum@fastmail.fm

-- 
http://www.fastmail.fm - And now for something completely different…


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: Alternative implementation of the generic __ffs
  2008-04-19  2:55                           ` Joe Perches
  2008-04-19  4:13                             ` dean gaudet
@ 2008-04-19 22:29                             ` Matti Aarnio
  2008-04-20  3:06                               ` Joe Perches
  1 sibling, 1 reply; 34+ messages in thread
From: Matti Aarnio @ 2008-04-19 22:29 UTC (permalink / raw)
  To: Joe Perches
  Cc: Harvey Harrison, Alexander van Heukelum, Alexander van Heukelum, LKML

On Fri, Apr 18, 2008 at 07:55:28PM -0700, Joe Perches wrote:
> On Fri, 2008-04-18 at 18:11 -0700, dean gaudet wrote: 
> > have you benchmarked it?
> 
> I modified Alexander's benchmark:
> http://lkml.org/lkml/2008/4/18/267
> to include 32 and 64 bit variants called smallest.
> 
> On an old ARM:

...

I am curious, why not take the code already in glibc ffs() for ARM ?
That is, if the ffs() is all that important detail in kernel ?

  /Matti Aarnio

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: Alternative implementation of the generic __ffs
  2008-04-19 22:29                             ` Matti Aarnio
@ 2008-04-20  3:06                               ` Joe Perches
  2008-04-20  8:42                                 ` Alexander van Heukelum
  0 siblings, 1 reply; 34+ messages in thread
From: Joe Perches @ 2008-04-20  3:06 UTC (permalink / raw)
  To: Matti Aarnio
  Cc: Harvey Harrison, Alexander van Heukelum, Alexander van Heukelum, LKML

On Sun, 2008-04-20 at 01:29 +0300, Matti Aarnio wrote:
> I am curious, why not take the code already in glibc ffs() for ARM ?
> That is, if the ffs() is all that important detail in kernel ?

Here's test results with the glibc ffs implementation.
(small const is still using slower add rather than or)

$ gcc -Os -fomit-frame-pointer ffs.c
$ ./a.out
Original:       3155 tics,  8331 tics
New:            4211 tics,  8793 tics
Smallest:       4019 tics,  7754 tics
Small const:    3552 tics,  6308 tics
glibc:          2816 tics,  6911 tics
Empty loop:     1516 tics,  2244 tics

$ gcc -O2 -fomit-frame-pointer ffs.c
$ ./a.out
Original:       3155 tics,  7828 tics
New:            4792 tics,  8825 tics
Smallest:       4401 tics,  7155 tics
Small const:    3539 tics,  5805 tics
glibc:          2720 tics,  7061 tics
Empty loop:     1516 tics,  2148 tics

$ gcc -O3 -fomit-frame-pointer ffs.c
$ ./a.out
Original:       3080 tics,  7706 tics
New:            4721 tics,  8663 tics
Smallest:       4334 tics,  7116 tics
Small const:    3466 tics,  5672 tics
glibc:          2649 tics,  6939 tics
Empty loop:     1444 tics,  2012 tics



^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: Alternative implementation of the generic __ffs
  2008-04-20  3:06                               ` Joe Perches
@ 2008-04-20  8:42                                 ` Alexander van Heukelum
  2008-04-20 12:31                                   ` Matti Aarnio
  0 siblings, 1 reply; 34+ messages in thread
From: Alexander van Heukelum @ 2008-04-20  8:42 UTC (permalink / raw)
  To: Joe Perches, Matti Aarnio; +Cc: Harvey Harrison, Alexander van Heukelum, LKML

On Sat, 19 Apr 2008 20:06:57 -0700, "Joe Perches" <joe@perches.com>
said:
> On Sun, 2008-04-20 at 01:29 +0300, Matti Aarnio wrote:
> > I am curious, why not take the code already in glibc ffs() for ARM ?
> > That is, if the ffs() is all that important detail in kernel ?

Hi,

The glibc version is based on a table-lookup. This makes it
behave differently in hot and cold cache situations. That's
fine if __ffs is used in tight loops, but in the kernel such
use of __ffs is avoided because it might be slow. I added it
to the benchmark, but it would need testing for the cold
cache case too.

As for the importance of __ffs in the kernel: as far as I
know the hot-spots in the kernel using __ffs are the
schedular (sched_find_first_bit) and the cpu mask walking
code (for_each_cpu_mask).

Greetings,
    Alexander

> Here's test results with the glibc ffs implementation.
> (small const is still using slower add rather than or)

Added, thanks.

> $ gcc -Os -fomit-frame-pointer ffs.c
> $ ./a.out
> Original:       3155 tics,  8331 tics
> New:            4211 tics,  8793 tics
> Smallest:       4019 tics,  7754 tics
> Small const:    3552 tics,  6308 tics
> glibc:          2816 tics,  6911 tics
> Empty loop:     1516 tics,  2244 tics
> 
> $ gcc -O2 -fomit-frame-pointer ffs.c
> $ ./a.out
> Original:       3155 tics,  7828 tics
> New:            4792 tics,  8825 tics
> Smallest:       4401 tics,  7155 tics
> Small const:    3539 tics,  5805 tics
> glibc:          2720 tics,  7061 tics
> Empty loop:     1516 tics,  2148 tics
> 
> $ gcc -O3 -fomit-frame-pointer ffs.c
> $ ./a.out
> Original:       3080 tics,  7706 tics
> New:            4721 tics,  8663 tics
> Smallest:       4334 tics,  7116 tics
> Small const:    3466 tics,  5672 tics
> glibc:          2649 tics,  6939 tics
> Empty loop:     1444 tics,  2012 tics
> 
> 
-- 
  Alexander van Heukelum
  heukelum@fastmail.fm

-- 
http://www.fastmail.fm - A no graphics, no pop-ups email service


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: Alternative implementation of the generic __ffs
  2008-04-20  8:42                                 ` Alexander van Heukelum
@ 2008-04-20 12:31                                   ` Matti Aarnio
  2008-04-21 11:43                                     ` Alexander van Heukelum
  0 siblings, 1 reply; 34+ messages in thread
From: Matti Aarnio @ 2008-04-20 12:31 UTC (permalink / raw)
  To: Alexander van Heukelum
  Cc: Joe Perches, Harvey Harrison, Alexander van Heukelum, LKML

On Sun, Apr 20, 2008 at 10:42:21AM +0200, Alexander van Heukelum wrote:
> On Sat, 19 Apr 2008 20:06:57 -0700, "Joe Perches" <joe@perches.com>
> said:
> > On Sun, 2008-04-20 at 01:29 +0300, Matti Aarnio wrote:
> > > I am curious, why not take the code already in glibc ffs() for ARM ?
> > > That is, if the ffs() is all that important detail in kernel ?
> 
> Hi,
> 
> The glibc version is based on a table-lookup. This makes it
> behave differently in hot and cold cache situations. That's
> fine if __ffs is used in tight loops, but in the kernel such
> use of __ffs is avoided because it might be slow. I added it
> to the benchmark, but it would need testing for the cold
> cache case too.
> 
> As for the importance of __ffs in the kernel: as far as I
> know the hot-spots in the kernel using __ffs are the
> schedular (sched_find_first_bit) and the cpu mask walking
> code (for_each_cpu_mask).

Perhaps those hot-spots would benefit from more broadly
accelerable algorithms.   ARM architecture v5 introduced
a CLZ instruction -- Count Leading Zeroes.

Well, gcc's  __builtin_ffs() for ARM Arch5 and up (including
XScale) does things in a bit more interesting way:

  http://mail-index.netbsd.org/port-arm/2002/08/20/0001.html

$ cat try.c
int foo(int i)
{
        return __builtin_ffs(i);
}
$ arm-gp2x-linux-gcc -S -O -march=armv5 try.c 
$ more try.s 
        .file   "try.c"
        .text
        .align  2
        .global foo
        .type   foo, %function
foo:
        @ args = 0, pretend = 0, frame = 0
        @ frame_needed = 0, uses_anonymous_args = 0
        @ link register save eliminated.
        @ lr needed for prologue
        rsb     r3, r0, #0
        and     r3, r3, r0
        clz     r3, r3
        rsb     r0, r3, #32
        bx      lr
        .size   foo, .-foo
        .ident  "GCC: (GNU) 4.1.2 (Fedora GP2X 4.1.2-8.fc9)"


> Greetings,
>     Alexander

/Matti Aarnio

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: Alternative implementation of the generic __ffs
  2008-04-20 12:31                                   ` Matti Aarnio
@ 2008-04-21 11:43                                     ` Alexander van Heukelum
  0 siblings, 0 replies; 34+ messages in thread
From: Alexander van Heukelum @ 2008-04-21 11:43 UTC (permalink / raw)
  To: Matti Aarnio; +Cc: Joe Perches, Harvey Harrison, Alexander van Heukelum, LKML

On Sun, 20 Apr 2008 15:31:46 +0300, "Matti Aarnio"
<matti.aarnio@zmailer.org> said:
> On Sun, Apr 20, 2008 at 10:42:21AM +0200, Alexander van Heukelum wrote:
> > On Sat, 19 Apr 2008 20:06:57 -0700, "Joe Perches" <joe@perches.com>
> > said:
> > > On Sun, 2008-04-20 at 01:29 +0300, Matti Aarnio wrote:
> > > > I am curious, why not take the code already in glibc ffs() for ARM ?
> > > > That is, if the ffs() is all that important detail in kernel ?
> > 
> > Hi,
> > 
> > The glibc version is based on a table-lookup. This makes it
> > behave differently in hot and cold cache situations. That's
> > fine if __ffs is used in tight loops, but in the kernel such
> > use of __ffs is avoided because it might be slow. I added it
> > to the benchmark, but it would need testing for the cold
> > cache case too.
> > 
> > As for the importance of __ffs in the kernel: as far as I
> > know the hot-spots in the kernel using __ffs are the
> > schedular (sched_find_first_bit) and the cpu mask walking
> > code (for_each_cpu_mask).
> 
> Perhaps those hot-spots would benefit from more broadly
> accelerable algorithms.

That would be a possibility too ;). The advantages of bitmaps
are that they are so compact and so easy to understand.

>                      ARM architecture v5 introduced
> a CLZ instruction -- Count Leading Zeroes.

Yeah, if such an instruction exist, the arch should provide
optimized versions for the bit functions. The interest here
was to provide a (beter) generic version in pure C for
architectures without such instructions.

Inline assembly versions are indeed provided in
asm-arm/bitops.h for ARM5+.

Greetings,
    Alexander

> Well, gcc's  __builtin_ffs() for ARM Arch5 and up (including
> XScale) does things in a bit more interesting way:
> 
>   http://mail-index.netbsd.org/port-arm/2002/08/20/0001.html
> 
> $ cat try.c
> int foo(int i)
> {
>         return __builtin_ffs(i);
> }
> $ arm-gp2x-linux-gcc -S -O -march=armv5 try.c 
> $ more try.s 
>         .file   "try.c"
>         .text
>         .align  2
>         .global foo
>         .type   foo, %function
> foo:
>         @ args = 0, pretend = 0, frame = 0
>         @ frame_needed = 0, uses_anonymous_args = 0
>         @ link register save eliminated.
>         @ lr needed for prologue
>         rsb     r3, r0, #0
>         and     r3, r3, r0
>         clz     r3, r3
>         rsb     r0, r3, #32
>         bx      lr
>         .size   foo, .-foo
>         .ident  "GCC: (GNU) 4.1.2 (Fedora GP2X 4.1.2-8.fc9)"
> 
> 
> > Greetings,
> >     Alexander
> 
> /Matti Aarnio
-- 
  Alexander van Heukelum
  heukelum@fastmail.fm

-- 
http://www.fastmail.fm - I mean, what is it about a decent email service?


^ permalink raw reply	[flat|nested] 34+ messages in thread

end of thread, other threads:[~2008-04-21 11:43 UTC | newest]

Thread overview: 34+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-03-31 17:15 [PATCH] x86: generic versions of find_first_(zero_)bit, convert i386 Alexander van Heukelum
2008-03-31 17:22 ` Stephen Hemminger
2008-03-31 19:38   ` Alexander van Heukelum
2008-03-31 21:58     ` Andi Kleen
2008-04-01  8:47 ` Ingo Molnar
2008-04-01  9:46   ` Alexander van Heukelum
2008-04-01 15:41     ` [PATCH] x86: switch x86_64 to generic find_first_bit Alexander van Heukelum
2008-04-01 15:42       ` [PATCH] x86: optimize find_first_bit for small bitmaps Alexander van Heukelum
2008-04-01 15:47         ` [PATCH] x86: remove x86-specific implementations of find_first_bit Alexander van Heukelum
2008-04-03  9:34           ` Alexander van Heukelum
2008-04-04  8:47           ` Ingo Molnar
2008-04-06 17:03     ` [PATCH] x86: generic versions of find_first_(zero_)bit, convert i386 dean gaudet
2008-04-06 18:51       ` Alexander van Heukelum
2008-04-06 20:22         ` dean gaudet
2008-04-07  8:43           ` Ingo Molnar
2008-04-07 10:25           ` Alexander van Heukelum
2008-04-18 20:18             ` Alternative implementation of the generic __ffs Alexander van Heukelum
2008-04-18 23:46               ` dean gaudet
2008-04-19  0:09                 ` Harvey Harrison
2008-04-19  0:20                   ` dean gaudet
2008-04-19  0:58                     ` Joe Perches
2008-04-19  1:04                       ` Harvey Harrison
2008-04-19  1:11                         ` dean gaudet
2008-04-19  2:55                           ` Joe Perches
2008-04-19  4:13                             ` dean gaudet
2008-04-19 10:05                               ` Mikael Pettersson
2008-04-19 12:10                               ` Alexander van Heukelum
2008-04-19 18:17                                 ` Joe Perches
2008-04-19 20:26                                   ` Alexander van Heukelum
2008-04-19 22:29                             ` Matti Aarnio
2008-04-20  3:06                               ` Joe Perches
2008-04-20  8:42                                 ` Alexander van Heukelum
2008-04-20 12:31                                   ` Matti Aarnio
2008-04-21 11:43                                     ` Alexander van Heukelum

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).