All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC] div64_64 support
@ 2007-02-24  1:05 Stephen Hemminger
  2007-02-24 16:19 ` Sami Farin
  2007-02-26 20:09 ` Jan Engelhardt
  0 siblings, 2 replies; 62+ messages in thread
From: Stephen Hemminger @ 2007-02-24  1:05 UTC (permalink / raw)
  To: linux-kernel, netdev

Since there are already two users of full 64 bit division in the kernel,
and other places may be hiding out as well. Add a full 64/64 bit divide.

Yes this is expensive, but there are places where it is necessary.
It is not clear if doing the scaling buys any advantage on 64 bit platforms,
so for them a full divide is done.

---
 include/asm-arm/div64.h      |    2 ++
 include/asm-generic/div64.h  |    8 ++++++++
 include/asm-m68k/div64.h     |    2 ++
 include/asm-mips/div64.h     |    8 ++++++++
 include/asm-um/div64.h       |    1 +
 include/asm-xtensa/div64.h   |    1 +
 lib/div64.c                  |   22 ++++++++++++++++++++++
 net/ipv4/tcp_cubic.c         |   22 ----------------------
 net/netfilter/xt_connbytes.c |   16 ----------------
 9 files changed, 44 insertions(+), 38 deletions(-)

--- linux-2.6.21-rc1.orig/include/asm-arm/div64.h	2007-02-23 16:44:41.000000000 -0800
+++ linux-2.6.21-rc1/include/asm-arm/div64.h	2007-02-23 16:57:36.000000000 -0800
@@ -221,6 +221,8 @@
 	__nr;								\
 })
 
+extern uint64_t div64_64(uint64_t dividend, uint64_t divisor);
+
 #endif
 
 #endif
--- linux-2.6.21-rc1.orig/include/asm-generic/div64.h	2007-02-23 16:35:27.000000000 -0800
+++ linux-2.6.21-rc1/include/asm-generic/div64.h	2007-02-23 16:56:40.000000000 -0800
@@ -30,9 +30,17 @@
 	__rem;							\
  })
 
+/*
+ * div64_64 - Divide two 64 bit numbers
+ */
+static inline uint64_t div64_64(uint64_t dividend, uint64_t divisor)
+{
+	return dividend / divisor;
+}
 #elif BITS_PER_LONG == 32
 
 extern uint32_t __div64_32(uint64_t *dividend, uint32_t divisor);
+extern uint64_t div64_64(uint64_t dividend, uint64_t divisor);
 
 /* The unnecessary pointer compare is there
  * to check for type safety (n must be 64bit)
--- linux-2.6.21-rc1.orig/include/asm-m68k/div64.h	2007-02-23 16:45:20.000000000 -0800
+++ linux-2.6.21-rc1/include/asm-m68k/div64.h	2007-02-23 16:56:27.000000000 -0800
@@ -23,4 +23,6 @@
 	__rem;							\
 })
 
+extern uint64_t div64_64(uint64_t dividend, uint64_t divisor);
+
 #endif /* _M68K_DIV64_H */
--- linux-2.6.21-rc1.orig/include/asm-mips/div64.h	2007-02-23 16:45:25.000000000 -0800
+++ linux-2.6.21-rc1/include/asm-mips/div64.h	2007-02-23 16:59:52.000000000 -0800
@@ -78,6 +78,9 @@
 	__quot = __quot << 32 | __low; \
 	(n) = __quot; \
 	__mod; })
+
+extern uint64_t div64_64(uint64_t dividend, uint64_t divisor);
+
 #endif /* (_MIPS_SZLONG == 32) */
 
 #if (_MIPS_SZLONG == 64)
@@ -101,6 +104,11 @@
 	(n) = __quot; \
 	__mod; })
 
+static inline uint64_t div64_64(uint64_t dividend, uint64_t divisor)
+{
+	return dividend / divisor;
+}
+
 #endif /* (_MIPS_SZLONG == 64) */
 
 #endif /* _ASM_DIV64_H */
--- linux-2.6.21-rc1.orig/include/asm-um/div64.h	2007-02-23 16:45:37.000000000 -0800
+++ linux-2.6.21-rc1/include/asm-um/div64.h	2007-02-23 16:58:29.000000000 -0800
@@ -3,4 +3,5 @@
 
 #include "asm/arch/div64.h"
 
+extern uint64_t div64_64(uint64_t dividend, uint64_t divisor);
 #endif
--- linux-2.6.21-rc1.orig/include/asm-xtensa/div64.h	2007-02-23 16:45:43.000000000 -0800
+++ linux-2.6.21-rc1/include/asm-xtensa/div64.h	2007-02-23 16:58:04.000000000 -0800
@@ -16,4 +16,5 @@
 	n /= (unsigned int) base; \
 	__res; })
 
+extern uint64_t div64_64(uint64_t dividend, uint64_t divisor);
 #endif
--- linux-2.6.21-rc1.orig/lib/div64.c	2007-02-23 16:50:40.000000000 -0800
+++ linux-2.6.21-rc1/lib/div64.c	2007-02-23 16:54:56.000000000 -0800
@@ -18,6 +18,7 @@
 
 #include <linux/types.h>
 #include <linux/module.h>
+#include <asm/bitops.h>
 #include <asm/div64.h>
 
 /* Not needed on 64bit architectures */
@@ -58,4 +59,25 @@
 
 EXPORT_SYMBOL(__div64_32);
 
+/* Use scaling to do a full 64 bit division  */
+uint64_t div64_64(uint64_t dividend, uint64_t divisor)
+{
+	uint32_t d = divisor;
+
+	if (divisor > 0xffffffffULL) {
+		unsigned int shift = fls(divisor >> 32);
+
+		d = divisor >> shift;
+		dividend >>= shift;
+	}
+
+	/* avoid 64 bit division if possible */
+	if (dividend >> 32)
+		do_div(dividend, d);
+	else
+		dividend = (uint32_t) dividend / d;
+
+	return dividend;
+}
+
 #endif /* BITS_PER_LONG == 32 */
--- linux-2.6.21-rc1.orig/net/ipv4/tcp_cubic.c	2007-02-23 16:33:52.000000000 -0800
+++ linux-2.6.21-rc1/net/ipv4/tcp_cubic.c	2007-02-23 16:45:50.000000000 -0800
@@ -51,7 +51,6 @@
 module_param(tcp_friendliness, int, 0644);
 MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness");
 
-#include <asm/div64.h>
 
 /* BIC TCP Parameters */
 struct bictcp {
@@ -93,27 +92,6 @@
 		tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
 }
 
-/* 64bit divisor, dividend and result. dynamic precision */
-static inline u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor)
-{
-	u_int32_t d = divisor;
-
-	if (divisor > 0xffffffffULL) {
-		unsigned int shift = fls(divisor >> 32);
-
-		d = divisor >> shift;
-		dividend >>= shift;
-	}
-
-	/* avoid 64 bit division if possible */
-	if (dividend >> 32)
-		do_div(dividend, d);
-	else
-		dividend = (uint32_t) dividend / d;
-
-	return dividend;
-}
-
 /*
  * calculate the cubic root of x using Newton-Raphson
  */
--- linux-2.6.21-rc1.orig/net/netfilter/xt_connbytes.c	2007-02-23 16:40:57.000000000 -0800
+++ linux-2.6.21-rc1/net/netfilter/xt_connbytes.c	2007-02-23 16:41:09.000000000 -0800
@@ -24,22 +24,6 @@
 MODULE_DESCRIPTION("iptables match for matching number of pkts/bytes per connection");
 MODULE_ALIAS("ipt_connbytes");
 
-/* 64bit divisor, dividend and result. dynamic precision */
-static u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor)
-{
-	u_int32_t d = divisor;
-
-	if (divisor > 0xffffffffULL) {
-		unsigned int shift = fls(divisor >> 32);
-
-		d = divisor >> shift;
-		dividend >>= shift;
-	}
-
-	do_div(dividend, d);
-	return dividend;
-}
-
 static int
 match(const struct sk_buff *skb,
       const struct net_device *in,

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support
  2007-02-24  1:05 [RFC] div64_64 support Stephen Hemminger
@ 2007-02-24 16:19 ` Sami Farin
  2007-02-26 19:28   ` Stephen Hemminger
  2007-02-26 20:09 ` Jan Engelhardt
  1 sibling, 1 reply; 62+ messages in thread
From: Sami Farin @ 2007-02-24 16:19 UTC (permalink / raw)
  To: linux-kernel, netdev

On Fri, Feb 23, 2007 at 17:05:27 -0800, Stephen Hemminger wrote:
> Since there already two users of full 64 bit division in the kernel,
> and other places maybe hiding out as well. Add a full 64/64 bit divide.
> 
> Yes this expensive, but there are places where it is necessary.
> It is not clear if doing the scaling buys any advantage on 64 bit platforms,
> so for them a full divide is done.

Still does not work after these fixes... how come?

WARNING: "div64_64" [net/netfilter/xt_connbytes.ko] undefined!
WARNING: "div64_64" [net/ipv4/tcp_cubic.ko] undefined!

--- linux-2.6.19/include/asm-i386/div64.h.bak	2006-11-29 23:57:37.000000000 +0200
+++ linux-2.6.19/include/asm-i386/div64.h	2007-02-24 16:24:55.822529880 +0200
@@ -45,4 +45,7 @@ div_ll_X_l_rem(long long divs, long div,
 	return dum2;
 
 }
+
+extern uint64_t div64_64(uint64_t dividend, uint64_t divisor);
+
 #endif
--- linux-2.6.19/lib/div64.c.bak	2007-02-24 16:10:03.686084000 +0200
+++ linux-2.6.19/lib/div64.c	2007-02-24 17:01:11.224517353 +0200
@@ -80,4 +80,6 @@ uint64_t div64_64(uint64_t dividend, uin
 	return dividend;
 }
 
+EXPORT_SYMBOL(div64_64);
+
 #endif /* BITS_PER_LONG == 32 */

-- 

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support
  2007-02-24 16:19 ` Sami Farin
@ 2007-02-26 19:28   ` Stephen Hemminger
  2007-02-26 19:39     ` David Miller
  0 siblings, 1 reply; 62+ messages in thread
From: Stephen Hemminger @ 2007-02-26 19:28 UTC (permalink / raw)
  To: linux-kernel

asm-i386.h/div64 and div64.o need to move in the Makefile to get this to work
on i386.



---
 include/asm-arm/div64.h      |    2 ++
 include/asm-generic/div64.h  |    8 ++++++++
 include/asm-i386/div64.h     |    5 +++++
 include/asm-m68k/div64.h     |    2 ++
 include/asm-mips/div64.h     |    8 ++++++++
 include/asm-um/div64.h       |    1 +
 include/asm-xtensa/div64.h   |    1 +
 lib/Makefile                 |    5 +++--
 lib/div64.c                  |   23 +++++++++++++++++++++++
 net/ipv4/tcp_cubic.c         |   22 ----------------------
 net/netfilter/xt_connbytes.c |   16 ----------------
 11 files changed, 53 insertions(+), 40 deletions(-)

--- sky2.orig/include/asm-arm/div64.h	2007-02-26 10:17:33.000000000 -0800
+++ sky2/include/asm-arm/div64.h	2007-02-26 10:17:50.000000000 -0800
@@ -221,6 +221,8 @@
 	__nr;								\
 })
 
+extern uint64_t div64_64(uint64_t dividend, uint64_t divisor);
+
 #endif
 
 #endif
--- sky2.orig/include/asm-generic/div64.h	2007-02-26 10:17:33.000000000 -0800
+++ sky2/include/asm-generic/div64.h	2007-02-26 10:17:50.000000000 -0800
@@ -30,9 +30,17 @@
 	__rem;							\
  })
 
+/*
+ * div64_64 - Divide two 64 bit numbers
+ */
+static inline uint64_t div64_64(uint64_t dividend, uint64_t divisor)
+{
+	return dividend / divisor;
+}
 #elif BITS_PER_LONG == 32
 
 extern uint32_t __div64_32(uint64_t *dividend, uint32_t divisor);
+extern uint64_t div64_64(uint64_t dividend, uint64_t divisor);
 
 /* The unnecessary pointer compare is there
  * to check for type safety (n must be 64bit)
--- sky2.orig/include/asm-m68k/div64.h	2007-02-26 10:17:33.000000000 -0800
+++ sky2/include/asm-m68k/div64.h	2007-02-26 10:17:50.000000000 -0800
@@ -23,4 +23,6 @@
 	__rem;							\
 })
 
+extern uint64_t div64_64(uint64_t dividend, uint64_t divisor);
+
 #endif /* _M68K_DIV64_H */
--- sky2.orig/include/asm-mips/div64.h	2007-02-26 10:17:33.000000000 -0800
+++ sky2/include/asm-mips/div64.h	2007-02-26 10:17:50.000000000 -0800
@@ -78,6 +78,9 @@
 	__quot = __quot << 32 | __low; \
 	(n) = __quot; \
 	__mod; })
+
+extern uint64_t div64_64(uint64_t dividend, uint64_t divisor);
+
 #endif /* (_MIPS_SZLONG == 32) */
 
 #if (_MIPS_SZLONG == 64)
@@ -101,6 +104,11 @@
 	(n) = __quot; \
 	__mod; })
 
+static inline uint64_t div64_64(uint64_t dividend, uint64_t divisor)
+{
+	return dividend / divisor;
+}
+
 #endif /* (_MIPS_SZLONG == 64) */
 
 #endif /* _ASM_DIV64_H */
--- sky2.orig/include/asm-um/div64.h	2007-02-26 10:17:33.000000000 -0800
+++ sky2/include/asm-um/div64.h	2007-02-26 10:17:50.000000000 -0800
@@ -3,4 +3,5 @@
 
 #include "asm/arch/div64.h"
 
+extern uint64_t div64_64(uint64_t dividend, uint64_t divisor);
 #endif
--- sky2.orig/include/asm-xtensa/div64.h	2007-02-26 10:17:33.000000000 -0800
+++ sky2/include/asm-xtensa/div64.h	2007-02-26 10:17:50.000000000 -0800
@@ -16,4 +16,5 @@
 	n /= (unsigned int) base; \
 	__res; })
 
+extern uint64_t div64_64(uint64_t dividend, uint64_t divisor);
 #endif
--- sky2.orig/lib/div64.c	2007-02-26 10:17:33.000000000 -0800
+++ sky2/lib/div64.c	2007-02-26 10:31:49.000000000 -0800
@@ -18,6 +18,7 @@
 
 #include <linux/types.h>
 #include <linux/module.h>
+#include <asm/bitops.h>
 #include <asm/div64.h>
 
 /* Not needed on 64bit architectures */
@@ -58,4 +59,26 @@
 
 EXPORT_SYMBOL(__div64_32);
 
+/* Use scaling to do a full 64 bit division  */
+uint64_t div64_64(uint64_t dividend, uint64_t divisor)
+{
+	uint32_t d = divisor;
+
+	if (divisor > 0xffffffffULL) {
+		unsigned int shift = fls(divisor >> 32);
+
+		d = divisor >> shift;
+		dividend >>= shift;
+	}
+
+	/* avoid 64 bit division if possible */
+	if (dividend >> 32)
+		do_div(dividend, d);
+	else
+		dividend = (uint32_t) dividend / d;
+
+	return dividend;
+}
+EXPORT_SYMBOL(div64_64);
+
 #endif /* BITS_PER_LONG == 32 */
--- sky2.orig/net/ipv4/tcp_cubic.c	2007-02-26 10:17:33.000000000 -0800
+++ sky2/net/ipv4/tcp_cubic.c	2007-02-26 10:17:50.000000000 -0800
@@ -51,7 +51,6 @@
 module_param(tcp_friendliness, int, 0644);
 MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness");
 
-#include <asm/div64.h>
 
 /* BIC TCP Parameters */
 struct bictcp {
@@ -93,27 +92,6 @@
 		tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
 }
 
-/* 64bit divisor, dividend and result. dynamic precision */
-static inline u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor)
-{
-	u_int32_t d = divisor;
-
-	if (divisor > 0xffffffffULL) {
-		unsigned int shift = fls(divisor >> 32);
-
-		d = divisor >> shift;
-		dividend >>= shift;
-	}
-
-	/* avoid 64 bit division if possible */
-	if (dividend >> 32)
-		do_div(dividend, d);
-	else
-		dividend = (uint32_t) dividend / d;
-
-	return dividend;
-}
-
 /*
  * calculate the cubic root of x using Newton-Raphson
  */
--- sky2.orig/net/netfilter/xt_connbytes.c	2007-02-26 10:17:33.000000000 -0800
+++ sky2/net/netfilter/xt_connbytes.c	2007-02-26 10:17:50.000000000 -0800
@@ -24,22 +24,6 @@
 MODULE_DESCRIPTION("iptables match for matching number of pkts/bytes per connection");
 MODULE_ALIAS("ipt_connbytes");
 
-/* 64bit divisor, dividend and result. dynamic precision */
-static u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor)
-{
-	u_int32_t d = divisor;
-
-	if (divisor > 0xffffffffULL) {
-		unsigned int shift = fls(divisor >> 32);
-
-		d = divisor >> shift;
-		dividend >>= shift;
-	}
-
-	do_div(dividend, d);
-	return dividend;
-}
-
 static int
 match(const struct sk_buff *skb,
       const struct net_device *in,
--- sky2.orig/include/asm-i386/div64.h	2007-02-26 10:36:29.000000000 -0800
+++ sky2/include/asm-i386/div64.h	2007-02-26 11:07:47.000000000 -0800
@@ -1,6 +1,8 @@
 #ifndef __I386_DIV64
 #define __I386_DIV64
 
+#include <linux/types.h>
+
 /*
  * do_div() is NOT a C function. It wants to return
  * two values (the quotient and the remainder), but
@@ -45,4 +47,7 @@
 	return dum2;
 
 }
+
+extern uint64_t div64_64(uint64_t dividend, uint64_t divisor);
+
 #endif
--- sky2.orig/lib/Makefile	2007-02-26 11:24:48.000000000 -0800
+++ sky2/lib/Makefile	2007-02-26 11:26:57.000000000 -0800
@@ -4,7 +4,7 @@
 
 lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 rbtree.o radix-tree.o dump_stack.o \
-	 idr.o div64.o int_sqrt.o bitmap.o extable.o prio_tree.o \
+	 idr.o int_sqrt.o bitmap.o extable.o prio_tree.o \
 	 sha1.o irq_regs.o reciprocal_div.o
 
 lib-$(CONFIG_MMU) += ioremap.o
@@ -12,7 +12,8 @@
 
 lib-y	+= kobject.o kref.o kobject_uevent.o klist.o
 
-obj-y += sort.o parser.o halfmd4.o debug_locks.o random32.o bust_spinlocks.o
+obj-y += sort.o parser.o div64.o halfmd4.o debug_locks.o random32.o \
+	 bust_spinlocks.o
 
 ifeq ($(CONFIG_DEBUG_KOBJECT),y)
 CFLAGS_kobject.o += -DDEBUG

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support
  2007-02-26 19:28   ` Stephen Hemminger
@ 2007-02-26 19:39     ` David Miller
  0 siblings, 0 replies; 62+ messages in thread
From: David Miller @ 2007-02-26 19:39 UTC (permalink / raw)
  To: shemminger; +Cc: linux-kernel

From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Mon, 26 Feb 2007 11:28:18 -0800

> asm-i386.h/div64 and div64.o needs to move in Makefile to get this to work
> on i386.

This looks great to me:

Signed-off-by: David S. Miller <davem@davemloft.net>

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support
  2007-02-24  1:05 [RFC] div64_64 support Stephen Hemminger
  2007-02-24 16:19 ` Sami Farin
@ 2007-02-26 20:09 ` Jan Engelhardt
  2007-02-26 21:28   ` Stephen Hemminger
  2007-02-26 22:31   ` Stephen Hemminger
  1 sibling, 2 replies; 62+ messages in thread
From: Jan Engelhardt @ 2007-02-26 20:09 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: linux-kernel, netdev


On Feb 23 2007 17:05, Stephen Hemminger wrote:
>
>Since there already two users of full 64 bit division in the kernel,
>and other places maybe hiding out as well. Add a full 64/64 bit divide.
>
>Yes this expensive, but there are places where it is necessary.
>It is not clear if doing the scaling buys any advantage on 64 bit platforms,
>so for them a full divide is done.
>
>---
> include/asm-arm/div64.h      |    2 ++
> include/asm-generic/div64.h  |    8 ++++++++
> include/asm-m68k/div64.h     |    2 ++
> include/asm-mips/div64.h     |    8 ++++++++
> include/asm-um/div64.h       |    1 +
> include/asm-xtensa/div64.h   |    1 +
> lib/div64.c                  |   22 ++++++++++++++++++++++
> net/ipv4/tcp_cubic.c         |   22 ----------------------
> net/netfilter/xt_connbytes.c |   16 ----------------
> 9 files changed, 44 insertions(+), 38 deletions(-)

Actually, there is udivdi3 support in the kernel

./arch/arm26/lib/udivdi3.c
./arch/sh/lib/udivdi3.c
./arch/sparc/lib/udivdi3.S

should not this be consolidated too?




Jan
-- 
ft: http://freshmeat.net/p/chaostables/

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support
  2007-02-26 20:09 ` Jan Engelhardt
@ 2007-02-26 21:28   ` Stephen Hemminger
  2007-02-27  1:20     ` H. Peter Anvin
  2007-02-26 22:31   ` Stephen Hemminger
  1 sibling, 1 reply; 62+ messages in thread
From: Stephen Hemminger @ 2007-02-26 21:28 UTC (permalink / raw)
  To: Jan Engelhardt; +Cc: linux-kernel, netdev

On Mon, 26 Feb 2007 21:09:26 +0100 (MET)
Jan Engelhardt <jengelh@linux01.gwdg.de> wrote:

> 
> On Feb 23 2007 17:05, Stephen Hemminger wrote:
> >
> >Since there already two users of full 64 bit division in the kernel,
> >and other places maybe hiding out as well. Add a full 64/64 bit divide.
> >
> >Yes this expensive, but there are places where it is necessary.
> >It is not clear if doing the scaling buys any advantage on 64 bit platforms,
> >so for them a full divide is done.
> >
> >---
> > include/asm-arm/div64.h      |    2 ++
> > include/asm-generic/div64.h  |    8 ++++++++
> > include/asm-m68k/div64.h     |    2 ++
> > include/asm-mips/div64.h     |    8 ++++++++
> > include/asm-um/div64.h       |    1 +
> > include/asm-xtensa/div64.h   |    1 +
> > lib/div64.c                  |   22 ++++++++++++++++++++++
> > net/ipv4/tcp_cubic.c         |   22 ----------------------
> > net/netfilter/xt_connbytes.c |   16 ----------------
> > 9 files changed, 44 insertions(+), 38 deletions(-)
> 
> Actually, there is udivdi3 support in the kernel
> 
> ./arch/arm26/lib/udivdi3.c
> ./arch/sh/lib/udivdi3.c
> ./arch/sparc/lib/udivdi3.S
> 
> should not this be consolidated too?


Hmm. Those are the GCC internal versions, that are picked up but
doing divide in place. Do we want to allow general 64 bit in kernel to
be easily used? It could cause sloppy slow code, but it would look
cleaner.

-- 
Stephen Hemminger <shemminger@linux-foundation.org>

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support
  2007-02-26 20:09 ` Jan Engelhardt
  2007-02-26 21:28   ` Stephen Hemminger
@ 2007-02-26 22:31   ` Stephen Hemminger
  2007-02-26 23:02     ` Jan Engelhardt
                       ` (2 more replies)
  1 sibling, 3 replies; 62+ messages in thread
From: Stephen Hemminger @ 2007-02-26 22:31 UTC (permalink / raw)
  To: Jan Engelhardt; +Cc: linux-kernel, netdev

Here is another way to handle the 64 bit divide case.
It allows full 64 bit divide by adding the support routine
GCC needs.

---
 arch/alpha/Kconfig           |    4 ++++
 arch/arm/Kconfig             |    4 ++++
 arch/arm26/Kconfig           |    4 ++++
 arch/avr32/Kconfig           |    4 ++++
 arch/cris/Kconfig            |    4 ++++
 arch/frv/Kconfig             |    4 ++++
 arch/h8300/Kconfig           |    4 ++++
 arch/i386/Kconfig            |    4 ++++
 arch/m32r/Kconfig            |    4 ++++
 arch/m68k/Kconfig            |    4 ++++
 arch/m68knommu/Kconfig       |    4 ++++
 arch/mips/Kconfig            |    4 ++++
 arch/parisc/Kconfig          |    4 ++++
 arch/powerpc/Kconfig         |    4 ++++
 arch/ppc/Kconfig             |    4 ++++
 arch/s390/Kconfig            |    4 ++++
 arch/sh64/Kconfig            |    4 ++++
 arch/v850/Kconfig            |    3 +++
 arch/xtensa/Kconfig          |    4 ++++
 lib/Makefile                 |    1 +
 lib/udivdi3.c                |   37 +++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_cubic.c         |   26 ++------------------------
 net/netfilter/xt_connbytes.c |   19 +------------------
 23 files changed, 116 insertions(+), 42 deletions(-)

--- pktgen.orig/net/ipv4/tcp_cubic.c	2007-02-26 13:40:08.000000000 -0800
+++ pktgen/net/ipv4/tcp_cubic.c	2007-02-26 14:30:00.000000000 -0800
@@ -51,7 +51,6 @@
 module_param(tcp_friendliness, int, 0644);
 MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness");
 
-#include <asm/div64.h>
 
 /* BIC TCP Parameters */
 struct bictcp {
@@ -93,27 +92,6 @@
 		tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
 }
 
-/* 64bit divisor, dividend and result. dynamic precision */
-static inline u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor)
-{
-	u_int32_t d = divisor;
-
-	if (divisor > 0xffffffffULL) {
-		unsigned int shift = fls(divisor >> 32);
-
-		d = divisor >> shift;
-		dividend >>= shift;
-	}
-
-	/* avoid 64 bit division if possible */
-	if (dividend >> 32)
-		do_div(dividend, d);
-	else
-		dividend = (uint32_t) dividend / d;
-
-	return dividend;
-}
-
 /*
  * calculate the cubic root of x using Newton-Raphson
  */
@@ -134,7 +112,7 @@
 	 */
 	do {
 		x1 = x;
-		x = (2 * x + (uint32_t) div64_64(a, x*x)) / 3;
+		x = (2 * x + (u32) (a / x*x)) / 3;
 	} while (abs(x1 - x) > 1);
 
 	return x;
--- pktgen.orig/net/netfilter/xt_connbytes.c	2007-02-26 13:40:08.000000000 -0800
+++ pktgen/net/netfilter/xt_connbytes.c	2007-02-26 14:29:13.000000000 -0800
@@ -16,7 +16,6 @@
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/xt_connbytes.h>
 
-#include <asm/div64.h>
 #include <asm/bitops.h>
 
 MODULE_LICENSE("GPL");
@@ -24,22 +23,6 @@
 MODULE_DESCRIPTION("iptables match for matching number of pkts/bytes per connection");
 MODULE_ALIAS("ipt_connbytes");
 
-/* 64bit divisor, dividend and result. dynamic precision */
-static u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor)
-{
-	u_int32_t d = divisor;
-
-	if (divisor > 0xffffffffULL) {
-		unsigned int shift = fls(divisor >> 32);
-
-		d = divisor >> shift;
-		dividend >>= shift;
-	}
-
-	do_div(dividend, d);
-	return dividend;
-}
-
 static int
 match(const struct sk_buff *skb,
       const struct net_device *in,
@@ -106,7 +89,7 @@
 			break;
 		}
 		if (pkts != 0)
-			what = div64_64(bytes, pkts);
+			what = bytes / pkts;
 		break;
 	}
 
--- pktgen.orig/lib/Makefile	2007-02-26 13:40:08.000000000 -0800
+++ pktgen/lib/Makefile	2007-02-26 14:17:31.000000000 -0800
@@ -28,6 +28,7 @@
 lib-$(CONFIG_SEMAPHORE_SLEEPERS) += semaphore-sleepers.o
 lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o
 obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
+obj-$(CONFIG_GENERIC_UDIVDI3) += udivdi3.o
 obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o
 obj-$(CONFIG_PLIST) += plist.o
 obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o
--- pktgen.orig/arch/alpha/Kconfig	2007-02-26 13:51:29.000000000 -0800
+++ pktgen/arch/alpha/Kconfig	2007-02-26 13:54:35.000000000 -0800
@@ -33,6 +33,10 @@
 	bool
 	default n
 
+config GENERIC_UDIVDI3
+	bool
+	default y
+
 config GENERIC_FIND_NEXT_BIT
 	bool
 	default y
--- pktgen.orig/arch/arm/Kconfig	2007-02-26 13:51:29.000000000 -0800
+++ pktgen/arch/arm/Kconfig	2007-02-26 13:54:57.000000000 -0800
@@ -90,6 +90,10 @@
 	bool
 	default n
 
+config GENERIC_UDIVDI3
+	bool
+	default y
+
 config GENERIC_HWEIGHT
 	bool
 	default y
--- pktgen.orig/arch/arm26/Kconfig	2007-02-26 13:48:46.000000000 -0800
+++ pktgen/arch/arm26/Kconfig	2007-02-26 13:55:24.000000000 -0800
@@ -49,6 +49,10 @@
 	bool
 	default n
 
+config GENERIC_UDIVDI3
+	bool
+	default y
+
 config GENERIC_HWEIGHT
 	bool
 	default y
--- pktgen.orig/arch/avr32/Kconfig	2007-02-26 13:51:29.000000000 -0800
+++ pktgen/arch/avr32/Kconfig	2007-02-26 13:55:39.000000000 -0800
@@ -53,6 +53,10 @@
 	bool
 	default n
 
+config GENERIC_UDIVDI3
+	bool
+	default y
+
 config GENERIC_BUST_SPINLOCK
 	bool
 
--- pktgen.orig/arch/cris/Kconfig	2007-02-26 13:51:29.000000000 -0800
+++ pktgen/arch/cris/Kconfig	2007-02-26 13:55:53.000000000 -0800
@@ -28,6 +28,10 @@
 	bool
 	default n
 
+config GENERIC_UDIVDI3
+	bool
+	default y
+
 config GENERIC_FIND_NEXT_BIT
 	bool
 	default y
--- pktgen.orig/arch/frv/Kconfig	2007-02-26 13:51:29.000000000 -0800
+++ pktgen/arch/frv/Kconfig	2007-02-26 13:56:09.000000000 -0800
@@ -53,6 +53,10 @@
 	bool
 	default y
 
+config GENERIC_UDIVDI3
+	bool
+	default y
+
 mainmenu "Fujitsu FR-V Kernel Configuration"
 
 source "init/Kconfig"
--- pktgen.orig/arch/h8300/Kconfig	2007-02-26 13:51:29.000000000 -0800
+++ pktgen/arch/h8300/Kconfig	2007-02-26 13:56:20.000000000 -0800
@@ -41,6 +41,10 @@
 	bool
 	default n
 
+config GENERIC_UDIVDI3
+	bool
+	default y
+
 config GENERIC_FIND_NEXT_BIT
 	bool
 	default y
--- pktgen.orig/arch/i386/Kconfig	2007-02-26 14:01:59.000000000 -0800
+++ pktgen/arch/i386/Kconfig	2007-02-26 14:02:28.000000000 -0800
@@ -75,6 +75,10 @@
 	bool
 	default y
 
+config GENERIC_UDIVDI3
+	bool
+	default y
+
 config ARCH_MAY_HAVE_PC_FDC
 	bool
 	default y
--- pktgen.orig/arch/m32r/Kconfig	2007-02-26 13:51:29.000000000 -0800
+++ pktgen/arch/m32r/Kconfig	2007-02-26 13:56:43.000000000 -0800
@@ -229,6 +229,10 @@
 	bool
 	default n
 
+config GENERIC_UDIVDI3
+	bool
+	default y
+
 config GENERIC_FIND_NEXT_BIT
 	bool
 	default y
--- pktgen.orig/arch/m68k/Kconfig	2007-02-26 13:51:29.000000000 -0800
+++ pktgen/arch/m68k/Kconfig	2007-02-26 13:56:56.000000000 -0800
@@ -25,6 +25,10 @@
 	bool
 	default n
 
+config GENERIC_UDIVDI3
+	bool
+	default y
+
 config GENERIC_HWEIGHT
 	bool
 	default y
--- pktgen.orig/arch/m68knommu/Kconfig	2007-02-26 13:51:29.000000000 -0800
+++ pktgen/arch/m68knommu/Kconfig	2007-02-26 13:57:05.000000000 -0800
@@ -37,6 +37,10 @@
 	bool
 	default n
 
+config GENERIC_UDIVDI3
+	bool
+	default y
+
 config GENERIC_FIND_NEXT_BIT
 	bool
 	default y
--- pktgen.orig/arch/mips/Kconfig	2007-02-26 13:51:29.000000000 -0800
+++ pktgen/arch/mips/Kconfig	2007-02-26 13:57:13.000000000 -0800
@@ -843,6 +843,10 @@
 	bool
 	default n
 
+config GENERIC_UDIVDI3
+	bool
+	default y
+
 config GENERIC_FIND_NEXT_BIT
 	bool
 	default y
--- pktgen.orig/arch/parisc/Kconfig	2007-02-26 13:51:29.000000000 -0800
+++ pktgen/arch/parisc/Kconfig	2007-02-26 13:57:21.000000000 -0800
@@ -33,6 +33,10 @@
 	bool
 	default n
 
+config GENERIC_UDIVDI3
+	bool
+	default y
+
 config GENERIC_FIND_NEXT_BIT
 	bool
 	default y
--- pktgen.orig/arch/powerpc/Kconfig	2007-02-26 13:51:29.000000000 -0800
+++ pktgen/arch/powerpc/Kconfig	2007-02-26 13:57:31.000000000 -0800
@@ -49,6 +49,10 @@
 	bool
 	default y if 64BIT
 
+config GENERIC_UDIVDI3
+	bool
+	default y
+
 config GENERIC_HWEIGHT
 	bool
 	default y
--- pktgen.orig/arch/ppc/Kconfig	2007-02-26 13:51:29.000000000 -0800
+++ pktgen/arch/ppc/Kconfig	2007-02-26 13:57:44.000000000 -0800
@@ -27,6 +27,10 @@
 	bool
 	default n
 
+config GENERIC_UDIVDI3
+	bool
+	default y
+
 config GENERIC_HWEIGHT
 	bool
 	default y
--- pktgen.orig/arch/s390/Kconfig	2007-02-26 13:51:29.000000000 -0800
+++ pktgen/arch/s390/Kconfig	2007-02-26 13:57:53.000000000 -0800
@@ -34,6 +34,10 @@
 	bool
 	default n
 
+config GENERIC_UDIVDI3
+	bool
+	default y
+
 config GENERIC_HWEIGHT
 	bool
 	default y
--- pktgen.orig/arch/sh64/Kconfig	2007-02-26 13:51:29.000000000 -0800
+++ pktgen/arch/sh64/Kconfig	2007-02-26 14:00:32.000000000 -0800
@@ -21,6 +21,10 @@
 	bool
 	default y
 
+config GENERIC_UDIVDI3
+	bool
+	default y
+
 config GENERIC_FIND_NEXT_BIT
 	bool
 	default y
--- pktgen.orig/arch/v850/Kconfig	2007-02-26 13:51:29.000000000 -0800
+++ pktgen/arch/v850/Kconfig	2007-02-26 13:59:29.000000000 -0800
@@ -19,6 +19,9 @@
 config RWSEM_XCHGADD_ALGORITHM
 	bool
 	default n
+config GENERIC_UDIVDI3
+	bool
+	default y
 config GENERIC_FIND_NEXT_BIT
 	bool
 	default y
--- pktgen.orig/arch/xtensa/Kconfig	2007-02-26 13:51:29.000000000 -0800
+++ pktgen/arch/xtensa/Kconfig	2007-02-26 13:59:45.000000000 -0800
@@ -26,6 +26,10 @@
 	bool
 	default y
 
+config GENERIC_UDIVDI3
+	bool
+	default y
+
 config GENERIC_FIND_NEXT_BIT
 	bool
 	default y
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ pktgen/lib/udivdi3.c	2007-02-26 14:14:13.000000000 -0800
@@ -0,0 +1,37 @@
+/*
+ * Generic C  version of full 64 bit by 64 bit division
+ * Extracted from version used by netfilter connection tracking
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * Code generated for this function might be very inefficient
+ * for some CPUs, can be overridden by linking arch-specific
+ * assembly versions such as arch/sparc/lib/udivdi.S
+ */
+#include <linux/types.h>
+#include <linux/module.h>
+#include <asm/div64.h>
+
+uint64_t __udivdi3(uint64_t dividend, uint64_t divisor)
+{
+	uint32_t d = divisor;
+
+	/* Scale divisor to 32 bits */
+	if (divisor > 0xffffffffULL) {
+		unsigned int shift = fls(divisor >> 32);
+
+		d = divisor >> shift;
+		dividend >>= shift;
+	}
+
+	/* avoid 64 bit division if possible */
+	if (dividend >> 32)
+		do_div(dividend, d);
+	else
+		dividend = (uint32_t) dividend / d;
+
+	return dividend;
+}
+EXPORT_SYMBOL(__udivdi3);


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support
  2007-02-26 22:31   ` Stephen Hemminger
@ 2007-02-26 23:02     ` Jan Engelhardt
  2007-02-26 23:44       ` Stephen Hemminger
  2007-02-27  6:21     ` Dan Williams
  2007-03-03  2:31     ` Andi Kleen
  2 siblings, 1 reply; 62+ messages in thread
From: Jan Engelhardt @ 2007-02-26 23:02 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: linux-kernel, netdev


On Feb 26 2007 13:28, Stephen Hemminger wrote:
>> 
>> ./arch/arm26/lib/udivdi3.c
>> ./arch/sh/lib/udivdi3.c
>> ./arch/sparc/lib/udivdi3.S
>> 
>> should not this be consolidated too?
>
>Hmm. Those are the GCC internal versions, that are picked up but
>doing divide in place. Do we want to allow general 64 bit in kernel to
>be easily used? It could cause sloppy slow code, but it would look
>cleaner.

Then our reviewers should catch it, and if not, the janitors will
(/me winks at R.P.J.Day and trivial@).

>@@ -134,7 +112,7 @@
> 	 */
> 	do {
> 		x1 = x;
>-		x = (2 * x + (uint32_t) div64_64(a, x*x)) / 3;
>+		x = (2 * x + (u32) (a / x*x)) / 3;

Eye see a bug.

Previously there was div64_64(a, x*x) which is equivalent to
(a)/(x*x), or just: a/(x^2). But now you do a/x*x, which is
equivalent to a*x/x (in the domain of real numbers). Furthermore,
a/x*x is a-(a%x), which does not even remotely match a/(x^2).

Please keep the math intact, thank you ;-)



Jan
-- 

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support
  2007-02-26 23:02     ` Jan Engelhardt
@ 2007-02-26 23:44       ` Stephen Hemminger
  2007-02-27  0:05         ` Jan Engelhardt
  0 siblings, 1 reply; 62+ messages in thread
From: Stephen Hemminger @ 2007-02-26 23:44 UTC (permalink / raw)
  To: Jan Engelhardt; +Cc: linux-kernel, netdev

On Tue, 27 Feb 2007 00:02:50 +0100 (MET)
Jan Engelhardt <jengelh@linux01.gwdg.de> wrote:

> 
> On Feb 26 2007 13:28, Stephen Hemminger wrote:
> >> 
> >> ./arch/arm26/lib/udivdi3.c
> >> ./arch/sh/lib/udivdi3.c
> >> ./arch/sparc/lib/udivdi3.S
> >> 
> >> should not this be consolidated too?
> >
> >Hmm. Those are the GCC internal versions, that are picked up but
> >doing divide in place. Do we want to allow general 64 bit in kernel to
> >be easily used? It could cause sloppy slow code, but it would look
> >cleaner.
> 
> Then our reviewers should catch it, and if not, the janitors will
> (/me winks at R.P.J.Day and trivial@).
> 
> >@@ -134,7 +112,7 @@
> > 	 */
> > 	do {
> > 		x1 = x;
> >-		x = (2 * x + (uint32_t) div64_64(a, x*x)) / 3;
> >+		x = (2 * x + (u32) (a / x*x)) / 3;
> 
> Eye see a bug.
> 
> Previously there was div64_64(a, x*x) which is equivalent to
> (a)/(x*x), or just: a/(x^2). But now you do a/x*x, which is
> equivalent to a*x/x (in the domain of real numbers). Furthermore,
> a/x*x is a-(a%x), which does not even remotely match a/(x^2).
> 
> Please keep the math intact, thank you ;-)

Been there, done that, don't want to repeat it...

-- 
Stephen Hemminger <shemminger@linux-foundation.org>

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support
  2007-02-26 23:44       ` Stephen Hemminger
@ 2007-02-27  0:05         ` Jan Engelhardt
  2007-02-27  0:07           ` Stephen Hemminger
  0 siblings, 1 reply; 62+ messages in thread
From: Jan Engelhardt @ 2007-02-27  0:05 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: linux-kernel, netdev


On Feb 26 2007 15:44, Stephen Hemminger wrote:
>> >-		x = (2 * x + (uint32_t) div64_64(a, x*x)) / 3;
>> >+		x = (2 * x + (u32) (a / x*x)) / 3;
>> 
>> Previously there was div64_64(a, x*x) which is equivalent to
>> (a)/(x*x), or just: a/(x^2). But now you do a/x*x, which is
>> equivalent to a*x/x (in the domain of real numbers). Furthermore,
>> a/x*x is a-(a%x), which does not even remotely match a/(x^2).
>> 
>Been there, done that, don't want to repeat it...

I am sorry I don't quite follow.


Jan
-- 

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support
  2007-02-27  0:05         ` Jan Engelhardt
@ 2007-02-27  0:07           ` Stephen Hemminger
  2007-02-27  0:14             ` Jan Engelhardt
  0 siblings, 1 reply; 62+ messages in thread
From: Stephen Hemminger @ 2007-02-27  0:07 UTC (permalink / raw)
  To: Jan Engelhardt; +Cc: linux-kernel, netdev

On Tue, 27 Feb 2007 01:05:26 +0100 (MET)
Jan Engelhardt <jengelh@linux01.gwdg.de> wrote:

> 
> On Feb 26 2007 15:44, Stephen Hemminger wrote:
> >> >-		x = (2 * x + (uint32_t) div64_64(a, x*x)) / 3;
> >> >+		x = (2 * x + (u32) (a / x*x)) / 3;
> >> 
> >> Previously there was div64_64(a, x*x) which is equivalent to
> >> (a)/(x*x), or just: a/(x^2). But now you do a/x*x, which is
> >> equivalent to a*x/x (in the domain of real numbers). Furthermore,
> >> a/x*x is a-(a%x), which does not even remotely match a/(x^2).
> >> 
> >Been there, done that, don't want to repeat it...
> 
> I am sorry I don't quite follow.

Once before a missed paren's caused a TCP congestion window bug that
took 6 months before it was found...

-- 
Stephen Hemminger <shemminger@linux-foundation.org>

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support
  2007-02-27  0:07           ` Stephen Hemminger
@ 2007-02-27  0:14             ` Jan Engelhardt
  0 siblings, 0 replies; 62+ messages in thread
From: Jan Engelhardt @ 2007-02-27  0:14 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: linux-kernel, netdev


On Feb 26 2007 16:07, Stephen Hemminger wrote:
>> On Feb 26 2007 15:44, Stephen Hemminger wrote:
>> >> >-		x = (2 * x + (uint32_t) div64_64(a, x*x)) / 3;
>> >> >+		x = (2 * x + (u32) (a / x*x)) / 3;
>> >> 
>> >> Previously there was div64_64(a, x*x) which is equivalent to
>> >> (a)/(x*x), or just: a/(x^2). But now you do a/x*x, which is
>> >> equivalent to a*x/x (in the domain of real numbers). Furthermore,
>> >> a/x*x is a-(a%x), which does not even remotely match a/(x^2).
>> >> 
>> >Been there, done that, don't want to repeat it...
>> 
>> I am sorry I don't quite follow.
>
>Once before a missed paren's caused a TCP congestion window bug that
>took 6 months before it was found...

Hah, just as I expected.

|On Tue, 27 Feb 2007 00:02:50 +0100 (MET), Jan Engelhardt wrote:
|>Then our reviewers should catch it, and if not, the janitors will.


Jan
-- 

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support
  2007-02-26 21:28   ` Stephen Hemminger
@ 2007-02-27  1:20     ` H. Peter Anvin
  2007-02-27  3:45       ` Segher Boessenkool
  0 siblings, 1 reply; 62+ messages in thread
From: H. Peter Anvin @ 2007-02-27  1:20 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Jan Engelhardt, linux-kernel, netdev

Stephen Hemminger wrote:
> 
> Hmm. Those are the GCC internal versions, that are picked up but
> doing divide in place. Do we want to allow general 64 bit in kernel to
> be easily used? It could cause sloppy slow code, but it would look
> cleaner.
> 

... and it would handle datatypes which may be architecture-dependent a 
lot cleaner.

I thought the motivation for div64() was that a 64:32->32 divide could 
be done a lot faster on a number of platforms (including the important 
x86) than a generic 64:64->64 divide, but gcc doesn't handle the 
devolution automatically -- there is no such libgcc function.

	-hpa

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support
  2007-02-27  1:20     ` H. Peter Anvin
@ 2007-02-27  3:45       ` Segher Boessenkool
  0 siblings, 0 replies; 62+ messages in thread
From: Segher Boessenkool @ 2007-02-27  3:45 UTC (permalink / raw)
  To: H. Peter Anvin; +Cc: Jan Engelhardt, netdev, Stephen Hemminger, linux-kernel

> I thought the motivation for div64() was that a 64:32->32 divide could 
> be done a lot faster on a number of platforms (including the important 
> x86) than a generic 64:64->64 divide, but gcc doesn't handle the 
> devolution automatically -- there is no such libgcc function.

That there's no such function in libgcc doesn't mean GCC
cannot handle this; libgcc is a bunch of library functions
that are really needed for generated code (because you
really don't want to expand those functions inline
everywhere) -- you won't find an addsi3 in libgcc either.

There does exist a divmoddisi4, sort of.

It used to be defined in three GCC targets, but commented
out in all three.  The NS32k is long gone.  For Vax, a
comment says the machine insn for this isn't used because
it is just too slow.  And the i386 version is disabled
because it returns the wrong result on overflow (not the
truncated 64-bit result, required by the implicit cast
to 32-bit, but the i386 arch traps to the overflow handler).

The only way to express the semantics you want in (GNU) C
is to use asm() -- and that's exactly what div64() does :-)

Blame it on the C language, but not on GCC.  Not this time.


Segher


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support
  2007-02-26 22:31   ` Stephen Hemminger
  2007-02-26 23:02     ` Jan Engelhardt
@ 2007-02-27  6:21     ` Dan Williams
  2007-03-03  2:31     ` Andi Kleen
  2 siblings, 0 replies; 62+ messages in thread
From: Dan Williams @ 2007-02-27  6:21 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Jan Engelhardt, linux-kernel, netdev, Nicolas Pitre,
	Russell King - ARM Linux

On 2/26/07, Stephen Hemminger <shemminger@linux-foundation.org> wrote:
> Here is another way to handle the 64 bit divide case.
> It allows full 64 bit divide by adding the support routine
> GCC needs.
>
<snip>

I know ARM already went through the process of removing __udivdi3 support:
http://www.arm.linux.org.uk/developer/patches/viewpatch.php?id=2723/2

Copying Russell and Nicolas as a heads up.

--
Dan

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support
  2007-02-26 22:31   ` Stephen Hemminger
  2007-02-26 23:02     ` Jan Engelhardt
  2007-02-27  6:21     ` Dan Williams
@ 2007-03-03  2:31     ` Andi Kleen
  2007-03-05 23:57       ` Stephen Hemminger
  2 siblings, 1 reply; 62+ messages in thread
From: Andi Kleen @ 2007-03-03  2:31 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Jan Engelhardt, linux-kernel, netdev

Stephen Hemminger <shemminger@linux-foundation.org> writes:

> Here is another way to handle the 64 bit divide case.
> It allows full 64 bit divide by adding the support routine
> GCC needs.

Not supplying that was intentional by Linus so that people
think twice (or more often) before they use such expensive
operations. A plain / looks too innocent.

Is it really needed by CUBIC anyways?  It uses it for getting
the cubic root, but the algorithm recommended by Hacker's Delight
(great book) doesn't use any divisions at all. Probably better 
to use a better algorithm without divisions.

-Andi

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support
  2007-03-03  2:31     ` Andi Kleen
@ 2007-03-05 23:57       ` Stephen Hemminger
  2007-03-06  0:25         ` David Miller
  2007-03-06 13:34         ` [RFC] div64_64 support Andi Kleen
  0 siblings, 2 replies; 62+ messages in thread
From: Stephen Hemminger @ 2007-03-05 23:57 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Jan Engelhardt, linux-kernel, netdev

On 03 Mar 2007 03:31:52 +0100
Andi Kleen <andi@firstfloor.org> wrote:

> Stephen Hemminger <shemminger@linux-foundation.org> writes:
> 
> > Here is another way to handle the 64 bit divide case.
> > It allows full 64 bit divide by adding the support routine
> > GCC needs.
> 
> Not supplying that was intentional by Linus so that people
> think twice (or more often) before they using such expensive
> operations. A plain / looks too innocent.
> 
> Is it really needed by CUBIC anyways?  It uses it for getting
> the cubic root, but the algorithm recommended by Hacker's Delight
> (great book) doesn't use any divisions at all. Probably better 
> to use a better algorithm without divisions.
> 

I tried the code from Hacker's Delight.
It is cool, but performance is CPU (and data) dependent:

Average # of usecs per operation:

		Hacker		Newton
Pentium 3	68.6	<	90.4
T2050		98.6	>	92.0
U1400		450	>	415
Xeon		70 	<	90
Xeon (newer)	71	<	78

EM64T		21.8	<	24.6
AMD64		23.4	<	32.0

It might be worth the change for code size reduction though.


-- 
Stephen Hemminger <shemminger@linux-foundation.org>

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support
  2007-03-05 23:57       ` Stephen Hemminger
@ 2007-03-06  0:25         ` David Miller
  2007-03-06 13:36           ` Andi Kleen
  2007-03-06 14:04           ` [RFC] div64_64 support II Andi Kleen
  2007-03-06 13:34         ` [RFC] div64_64 support Andi Kleen
  1 sibling, 2 replies; 62+ messages in thread
From: David Miller @ 2007-03-06  0:25 UTC (permalink / raw)
  To: shemminger; +Cc: andi, jengelh, linux-kernel, netdev

From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Mon, 5 Mar 2007 15:57:14 -0800

> I tried the code from Hacker's Delight.
> It is cool, but performance is CPU (and data) dependent:
> 
> Average # of usecs per operation:

Interesting results.

The problem with these algorithms that trade off one or more
multiplies in order to avoid a divide is that they don't
gain anything and often lose when both multiplies and
divides are emulated in software.

This is particularly true in this cube-root case from Hacker's
Delight, because it's using 3 multiplies per iteration in place of one
divide per iteration.

Actually, sorry, there is only one real multiply in there since the
other two can be computed using addition and shifts.

Another thing is that the non-Hacker's Delight version iterates
differently for different input values, so the input value space is
very important to consider when comparing these two pieces of code.

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support
  2007-03-05 23:57       ` Stephen Hemminger
  2007-03-06  0:25         ` David Miller
@ 2007-03-06 13:34         ` Andi Kleen
  2007-03-06 14:19           ` Eric Dumazet
  1 sibling, 1 reply; 62+ messages in thread
From: Andi Kleen @ 2007-03-06 13:34 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Andi Kleen, Jan Engelhardt, linux-kernel, netdev

On Mon, Mar 05, 2007 at 03:57:14PM -0800, Stephen Hemminger wrote:
> On 03 Mar 2007 03:31:52 +0100
> Andi Kleen <andi@firstfloor.org> wrote:
> 
> > Stephen Hemminger <shemminger@linux-foundation.org> writes:
> > 
> > > Here is another way to handle the 64 bit divide case.
> > > It allows full 64 bit divide by adding the support routine
> > > GCC needs.
> > 
> > Not supplying that was intentional by Linus so that people
> > think twice (or more often) before they using such expensive
> > operations. A plain / looks too innocent.
> > 
> > Is it really needed by CUBIC anyways?  It uses it for getting
> > the cubic root, but the algorithm recommended by Hacker's Delight
> > (great book) doesn't use any divisions at all. Probably better 
> > to use a better algorithm without divisions.
> > 
> 
> I tried the code from Hacker's Delight.
> It is cool, but performance is CPU (and data) dependent:

I did too. My experiences were mixed: on 32bit it was slower,
on 64bit faster on average.  Strangely the 32bit version ran
faster again without -fomit-frame-pointer, so it's likely
some funny interaction with 32bit long long code generation.
The difference is never more than 100 cycles so it shouldn't
be a big issue either way.

For some input arguments (<1% in my testing)
it also gave an answer 1 off from the existing code, 
but I don't think that's a problem.

But more importantly during testing I found that the cubic
code gives a division by zero for input arguments >2^43. If you
have a system with >16TB of memory this could actually be a remotely
exploitable bug :)

I still think it's a good idea to switch to the new function,
especially since it's shorter code.

Here's the patch. Note I didn't verify it with real large window
TCP operations; only unit testing.

-Andi

Use Hacker's delight cube root algorithm in cubic TCP

Shorter code and fixes a theoretically remote exploitable bug.

Signed-off-by: Andi Kleen <ak@suse.de>


Index: linux-2.6.21-rc1-net/net/ipv4/tcp_cubic.c
===================================================================
--- linux-2.6.21-rc1-net.orig/net/ipv4/tcp_cubic.c
+++ linux-2.6.21-rc1-net/net/ipv4/tcp_cubic.c
@@ -93,51 +93,24 @@ static void bictcp_init(struct sock *sk)
 		tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
 }
 
-/* 64bit divisor, dividend and result. dynamic precision */
-static inline u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor)
+static u32 cubic_root(u64 x)
 {
-	u_int32_t d = divisor;
-
-	if (divisor > 0xffffffffULL) {
-		unsigned int shift = fls(divisor >> 32);
-
-		d = divisor >> shift;
-		dividend >>= shift;
-	}
-
-	/* avoid 64 bit division if possible */
-	if (dividend >> 32)
-		do_div(dividend, d);
-	else
-		dividend = (uint32_t) dividend / d;
-
-	return dividend;
-}
-
-/*
- * calculate the cubic root of x using Newton-Raphson
- */
-static u32 cubic_root(u64 a)
-{
-	u32 x, x1;
-
-	/* Initial estimate is based on:
-	 * cbrt(x) = exp(log(x) / 3)
-	 */
-	x = 1u << (fls64(a)/3);
-
-	/*
-	 * Iteration based on:
-	 *                         2
-	 * x    = ( 2 * x  +  a / x  ) / 3
-	 *  k+1          k         k
-	 */
-	do {
-		x1 = x;
-		x = (2 * x + (uint32_t) div64_64(a, x*x)) / 3;
-	} while (abs(x1 - x) > 1);
-
-	return x;
+	int s;
+	u32 y;
+	u64 b;
+	u64 bs;
+
+	y = 0;
+	for (s = 63; s >= 0; s -= 3) {
+		y = 2 * y;
+		b = 3 * y * (y+1) + 1;
+		bs = b << s;
+		if (x >= bs && (b == (bs>>s))) {  /* avoid overflow */
+			x -= bs;
+			y++;
+		}
+	}
+	return y;
 }
 
 /*

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support
  2007-03-06  0:25         ` David Miller
@ 2007-03-06 13:36           ` Andi Kleen
  2007-03-06 14:04           ` [RFC] div64_64 support II Andi Kleen
  1 sibling, 0 replies; 62+ messages in thread
From: Andi Kleen @ 2007-03-06 13:36 UTC (permalink / raw)
  To: David Miller; +Cc: shemminger, andi, jengelh, linux-kernel, netdev

On Mon, Mar 05, 2007 at 04:25:51PM -0800, David Miller wrote:
> Another thing is that the non-Hacker's Delight version iterates
> differently for different input values, so the input value space is
> very important to consider when comparing these two pieces of code.

I did some stochastic testing on my version. It gave 1 off for < 1% of
the arguments.  Probably not an issue.

Besides it actually works for >2^43 @)

-Andi

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support II
  2007-03-06  0:25         ` David Miller
  2007-03-06 13:36           ` Andi Kleen
@ 2007-03-06 14:04           ` Andi Kleen
  2007-03-06 17:43             ` Dagfinn Ilmari Mannsåker
  2007-03-06 18:48             ` H. Peter Anvin
  1 sibling, 2 replies; 62+ messages in thread
From: Andi Kleen @ 2007-03-06 14:04 UTC (permalink / raw)
  To: David Miller; +Cc: shemminger, andi, jengelh, linux-kernel, netdev

> The problem with these algorithms that tradoff one or more
> multiplies in order to avoid a divide is that they don't
> give anything and often lose when both multiplies and
> divides are emulated in software.

Actually on rereading this: is there really any Linux port
that emulates multiplies in software? I thought that was only
done on really small microcontrollers or smart cards; but anything
32bit+ that runs Linux should have hardware multiply, shouldn't it?

-Andi

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support
  2007-03-06 13:34         ` [RFC] div64_64 support Andi Kleen
@ 2007-03-06 14:19           ` Eric Dumazet
  2007-03-06 14:45             ` Andi Kleen
  0 siblings, 1 reply; 62+ messages in thread
From: Eric Dumazet @ 2007-03-06 14:19 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Stephen Hemminger, Jan Engelhardt, linux-kernel, netdev

On Tuesday 06 March 2007 14:34, Andi Kleen wrote:

> -	return x;
> +	int s;
> +	u32 y;
> +	u64 b;
> +	u64 bs;
> +
> +	y = 0;
> +	for (s = 63; s >= 0; s -= 3) {
> +		y = 2 * y;
> +		b = 3 * y * (y+1) + 1;
> +		bs = b << s;
> +		if (x >= bs && (b == (bs>>s))) {  /* avoid overflow */
> +			x -= bs;
> +			y++;
> +		}
> +	}
> +	return y;
>  }
>

Andi

<rant>
Let me see... You throw code like that and expect someone to actually 
understand it in one year, and be able to correct a bug ?
</rant>

Please add something, an URL or even better a nice explanation, per favor...

Thank you

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support
  2007-03-06 14:19           ` Eric Dumazet
@ 2007-03-06 14:45             ` Andi Kleen
  2007-03-06 15:10               ` Roland Kuhn
  2007-03-06 18:50               ` [RFC] div64_64 support H. Peter Anvin
  0 siblings, 2 replies; 62+ messages in thread
From: Andi Kleen @ 2007-03-06 14:45 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Andi Kleen, Stephen Hemminger, Jan Engelhardt, linux-kernel, netdev

> <rant>
> Let me see... You throw code like that and expect someone to actually 
> understand it in one year, and be able to correct a bug ?

To be honest I don't expect any bugs in this function.

> </rant>
> 
> Please add something, an URL or even better a nice explanation, per favor...

It's straight out of Hacker's delight which is referenced in the commit
log.

-Andi

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support
  2007-03-06 14:45             ` Andi Kleen
@ 2007-03-06 15:10               ` Roland Kuhn
  2007-03-06 18:29                 ` Stephen Hemminger
  2007-03-06 18:50               ` [RFC] div64_64 support H. Peter Anvin
  1 sibling, 1 reply; 62+ messages in thread
From: Roland Kuhn @ 2007-03-06 15:10 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Eric Dumazet, Stephen Hemminger, Jan Engelhardt, linux-kernel, netdev


[-- Attachment #1.1: Type: text/plain, Size: 1341 bytes --]

Hi Andi!

On 6 Mar 2007, at 15:45, Andi Kleen wrote:

>> <rant>
>> Let me see... You throw code like that and expect someone to actually
>> understand it in one year, and be able to correct a bug ?
>
> To be honest I don't expect any bugs in this function.
>
>> </rant>
>>
>> Please add something, an URL or even better a nice explanation,  
>> per favor...
>
> It's straight out of Hacker's delight which is referenced in the  
> commit
> log.
>
And it's pretty neat, too. Hint: (y+1)**3 = y**3 + 3*y**2 + 3*y + 1.  
The algorithm is exactly the same as for calculating the cubic root  
on paper, digit by digit. I found that algo in the school notebook of  
my grandpa (late 1920ies), a pity that it's not taught anymore...  
pocket calculators _do_ have downsides ;-)

Ciao,
                     Roland

--
TU Muenchen, Physik-Department E18, James-Franck-Str., 85748 Garching
Telefon 089/289-12575; Telefax 089/289-12570
--
CERN office: 892-1-D23 phone: +41 22 7676540 mobile: +41 76 487 4482
--
Any society that would give up a little liberty to gain a little
security will deserve neither and lose both.  - Benjamin Franklin
-----BEGIN GEEK CODE BLOCK-----
Version: 3.12
GS/CS/M/MU d-(++) s:+ a-> C+++ UL++++ P+++ L+++ E(+) W+ !N K- w--- M 
+ !V Y+
PGP++ t+(++) 5 R+ tv-- b+ DI++ e+++>++++ h---- y+++
------END GEEK CODE BLOCK------



[-- Attachment #1.2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 4324 bytes --]

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 186 bytes --]

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support II
  2007-03-06 14:04           ` [RFC] div64_64 support II Andi Kleen
@ 2007-03-06 17:43             ` Dagfinn Ilmari Mannsåker
  2007-03-06 18:25               ` David Miller
  2007-03-06 18:48             ` H. Peter Anvin
  1 sibling, 1 reply; 62+ messages in thread
From: Dagfinn Ilmari Mannsåker @ 2007-03-06 17:43 UTC (permalink / raw)
  To: Andi Kleen; +Cc: David Miller, shemminger, jengelh, linux-kernel, netdev

Andi Kleen <andi@firstfloor.org> writes:

> Actually on rereading this: is there really any Linux port
> that emulates multiplies in software? I thought that was only
> done on really small microcontrollers or smart cards; but anything
> 32bit+ that runs Linux should have hardware multiply, shouldn't it?

SPARCv7 (sun4/sun4c) doesn't have hardware mul/div. This includes
SparcStation 1, 1+, 2, SLC, ELC, IPC and IPX.

-- 
ilmari
"A disappointingly low fraction of the human race is,
 at any given time, on fire." - Stig Sandbeck Mathisen

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support II
  2007-03-06 17:43             ` Dagfinn Ilmari Mannsåker
@ 2007-03-06 18:25               ` David Miller
  0 siblings, 0 replies; 62+ messages in thread
From: David Miller @ 2007-03-06 18:25 UTC (permalink / raw)
  To: ilmari; +Cc: andi, shemminger, jengelh, linux-kernel, netdev

From: ilmari@ilmari.org (Dagfinn Ilmari Mannsåker)
Date: Tue, 06 Mar 2007 18:43:14 +0100

> Andi Kleen <andi@firstfloor.org> writes:
> 
> > Actually on rereading this: is there really any Linux port
> > that emulates multiplies in software? I thought that was only
> > done on really small microcontrollers or smart cards; but anything
> > 32bit+ that runs Linux should have hardware multiply, shouldn't it?
> 
> SPARCv7 (sun4/sun4c) doesn't have hardware mul/div. This includes
> SparcStation 1, 1+, 2, SLC, ELC, IPC and IPX.

Right and the cypress sun4m parts don't do multiply/divide in
hardware either.

I believe the Alpha does divide in software too, see
arch/alpha/lib/divide.S

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support
  2007-03-06 15:10               ` Roland Kuhn
@ 2007-03-06 18:29                 ` Stephen Hemminger
  2007-03-06 19:48                   ` Andi Kleen
                                     ` (2 more replies)
  0 siblings, 3 replies; 62+ messages in thread
From: Stephen Hemminger @ 2007-03-06 18:29 UTC (permalink / raw)
  To: Roland Kuhn
  Cc: Andi Kleen, Eric Dumazet, Jan Engelhardt, linux-kernel, netdev

Don't count the existing Newton-Raphson out. It turns out that to get enough
precision for 32 bits, only 4 iterations are needed. By unrolling those, it
gets much better timing.

Slightly gross test program (with original cubic wraparound bug fixed).

---
/* Test and measure perf of cube root algorithms.  */
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#ifdef __x86_64

/*
 * rdtscll - read the CPU timestamp counter into val (64-bit cycle count).
 * RDTSC leaves the low 32 bits in EAX and the high 32 bits in EDX;
 * the two halves are recombined into a single 64-bit value.
 */
#define rdtscll(val) do { \
     unsigned int __a,__d; \
     asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \
     (val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \
} while(0)

/*
 * do_div - divide the 64-bit value n in place by the 32-bit base.
 * The quotient is written back into n; the remainder is the value
 * of the macro.  On x86-64 plain hardware 64-bit division suffices.
 */
# define do_div(n,base) ({					\
	uint32_t __base = (base);				\
	uint32_t __rem;						\
	__rem = ((uint64_t)(n)) % __base;			\
	(n) = ((uint64_t)(n)) / __base;				\
	__rem;							\
 })


/**
 * __ffs - find first bit in word.
 * @word: The word to search
 *
 * Returns the 0-based index of the least significant set bit,
 * computed with the x86-64 BSF instruction.
 * Undefined if no bit exists, so code should check against 0 first.
 */
static __inline__ unsigned long __ffs(unsigned long word)
{
	__asm__("bsfq %1,%0"
		:"=r" (word)
		:"rm" (word));
	return word;
}

/*
 * __fls: find last bit set.
 * @word: The word to search
 *
 * Returns the 0-based index of the most significant set bit,
 * computed with the x86-64 BSR instruction.
 * Undefined if no zero exists, so code should check against ~0UL first.
 */
static inline unsigned long __fls(unsigned long word)
{
	__asm__("bsrq %1,%0"
		:"=r" (word)
		:"rm" (word));
	return word;
}

/**
 * ffs - find first bit set
 * @x: the word to search
 *
 * Returns 1 + the index of the least significant set bit, or 0 when
 * x == 0: BSF leaves ZF set for a zero input, so the CMOVZ substitutes
 * -1 and r+1 becomes 0.
 *
 * This is defined the same way as
 * the libc and compiler builtin ffs routines, therefore
 * differs in spirit from the above ffz (man ffs).
 *
 * NOTE(review): the output constraint here is "=r" without the
 * early-clobber '&' that fls() below uses; if the compiler ever
 * allocated %0 and the -1 input to the same register, BSF would
 * clobber it before CMOVZ reads it — confirm this is intended.
 */
static __inline__ int ffs(int x)
{
	int r;

	__asm__("bsfl %1,%0\n\t"
		"cmovzl %2,%0" 
		: "=r" (r) : "rm" (x), "r" (-1));
	return r+1;
}

/**
 * fls - find last bit set
 * @x: the word to search
 *
 * Returns 1 + the index of the most significant set bit, or 0 when
 * x == 0 (BSR sets ZF for zero input and CMOVZ substitutes -1).
 * The "=&r" early-clobber keeps the output register distinct from
 * the -1 input, since BSR writes %0 before CMOVZ reads %2.
 *
 * This is defined the same way as ffs.
 */
static inline int fls(int x)
{
	int r;

	__asm__("bsrl %1,%0\n\t"
		"cmovzl %2,%0"
		: "=&r" (r) : "rm" (x), "rm" (-1));
	return r+1;
}

/**
 * fls64 - find last bit set in 64 bit word
 * @x: the word to search
 *
 * Bit positions are numbered 1..64 from the least significant end;
 * an all-zero word yields 0, matching the fls() convention.
 */
static inline int fls64(uint64_t x)
{
	return x == 0 ? 0 : (int)(__fls(x) + 1);
}

/*
 * div64_64 - full 64/64-bit unsigned division.
 * Native on x86-64, so no scaling tricks are needed here.
 */
static inline uint64_t div64_64(uint64_t dividend, uint64_t divisor)
{
	uint64_t quotient;

	quotient = dividend / divisor;
	return quotient;
}

#elif __i386

/* Read the CPU timestamp counter; "=A" binds the 64-bit result to EDX:EAX. */
#define rdtscll(val) \
     __asm__ __volatile__("rdtsc" : "=A" (val))

/**
 * ffs - find first bit set
 * @x: the word to search
 *
 * Returns 1 + the index of the least significant set bit, or 0 when
 * x == 0: BSF leaves ZF set for a zero input, the JNZ falls through,
 * and -1 is loaded so that r+1 becomes 0.
 *
 * This is defined the same way as
 * the libc and compiler builtin ffs routines, therefore
 * differs in spirit from the above ffz() (man ffs).
 */
static inline int ffs(int x)
{
	int r;

	__asm__("bsfl %1,%0\n\t"
		"jnz 1f\n\t"
		"movl $-1,%0\n"
		"1:" : "=r" (r) : "rm" (x));
	return r+1;
}

/**
 * fls - find last bit set
 * @x: the word to search
 *
 * Returns 1 + the index of the most significant set bit, or 0 when
 * x == 0 (BSR sets ZF, the branch skips to 1: after loading -1).
 *
 * This is defined the same way as ffs().
 */
static inline int fls(int x)
{
	int r;

	__asm__("bsrl %1,%0\n\t"
		"jnz 1f\n\t"
		"movl $-1,%0\n"
		"1:" : "=r" (r) : "rm" (x));
	return r+1;
}

/*
 * fls64 - find last bit set in a 64-bit word, built from two 32-bit
 * fls() calls: if any high-word bit is set the answer lies there
 * (offset by 32), otherwise fall back to the low word.
 */
static inline int fls64(uint64_t x)
{
	uint32_t high = x >> 32;

	return high ? fls(high) + 32 : fls(x);
}


/*
 * do_div - divide the 64-bit value n in place by the 32-bit base;
 * the quotient goes back into n and the remainder is the macro's value.
 * The high word is reduced modulo __base first so that the DIVL below
 * cannot overflow: its implicit upper half (__upper) is always < __base.
 */
#define do_div(n,base) ({ \
	unsigned long __upper, __low, __high, __mod, __base; \
	__base = (base); \
	asm("":"=a" (__low), "=d" (__high):"A" (n)); \
	__upper = __high; \
	if (__high) { \
		__upper = __high % (__base); \
		__high = __high / (__base); \
	} \
	asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), "1" (__upper)); \
	asm("":"=A" (n):"a" (__low),"d" (__high)); \
	__mod; \
})


/* 64bit divisor, dividend and result. dynamic precision */
static uint64_t div64_64(uint64_t dividend, uint64_t divisor)
{
	uint32_t d = divisor;

	/* When the divisor does not fit in 32 bits, shift both operands
	 * right until it does.  The quotient is then only approximate
	 * (low divisor bits are discarded) — precision is traded for a
	 * cheap 32-bit divide. */
	if (divisor > 0xffffffffULL) {
		unsigned int shift = fls(divisor >> 32);

		d = divisor >> shift;
		dividend >>= shift;
	}

	/* avoid 64 bit division if possible */
	if (dividend >> 32)
		do_div(dividend, d);
	else
		dividend = (uint32_t) dividend / d;

	return dividend;
}
#endif

/* Andi Kleen's version (Hacker's Delight digit-by-digit cube root).
 *
 * Computes floor(cbrt(x)) one result bit per iteration, using
 * (y+1)^3 - y^3 = 3*y*(y+1) + 1: b << s is the cost of setting the
 * next root bit at octal digit position s/3.
 *
 * Fix: the original evaluated b = 3*y*(y+1)+1 in 32-bit arithmetic
 * (y is uint32_t), which wraps once y exceeds ~37837 — i.e. for
 * inputs above roughly 2^46 — yielding wrong digits.  The product
 * is now widened to 64 bits before multiplying.
 */
uint32_t acbrt(uint64_t x)
{
	uint32_t y = 0;
	int s;

	for (s = 63; s >= 0; s -= 3) {
		uint64_t b, bs;

		y = 2 * y;
		b = 3 * (uint64_t)y * (y+1) + 1;
		bs = b << s;
		/* Take the digit only if b << s did not overflow 64 bits;
		 * if it did, the true bs exceeds any possible x anyway. */
		if (x >= bs && (b == (bs>>s))) {
			x -= bs;
			y++;
		}
	}
	return y;
}

/* My version of hacker's delight (digit-by-digit cube root).
 *
 * Fixes relative to the original:
 *  - b = 3*y*(y+1)+1 was evaluated in 32-bit arithmetic before the
 *    cast to uint64_t, wrapping for y > ~37837 (inputs above ~2^46);
 *    the product is now done in 64 bits.
 *  - s started at 60, so the top three bits of x were never examined
 *    and results were wrong for x >= 2^63; s now starts at 63, which
 *    in turn requires the same shift-overflow guard as acbrt().
 */
uint32_t hcbrt(uint64_t x)
{
	int s = 63;
	uint32_t y = 0;

	do {
		uint64_t b;
		y = 2*y;
		b = 3*(uint64_t)y*(y + 1) + 1;
		/* The digit fits only if b << s did not overflow. */
		if (x >= (b << s) && b == ((b << s) >> s)) {
			x = x - (b << s);
			y = y + 1;
		}
		s = s - 3;
	} while(s >= 0);

	return y;
}

/* calculate the cubic root of x using Newton-Raphson
 * (original tcp_cubic algorithm, with the x*x widening bug fixed:
 * the square is accumulated in a 64-bit temporary). */
static uint32_t ocubic(uint64_t a)
{
	uint32_t x, x1;

	/* Initial estimate is based on:
	 * cbrt(x) = exp(log(x) / 3)
	 */
	x = 1u << (fls64(a)/3);

	/*
	 * Iteration based on:
	 *                         2
	 * x    = ( 2 * x  +  a / x  ) / 3
	 *  k+1          k         k
	 */
	do {
		uint64_t x2;

		/* widen before squaring so x*x cannot wrap in 32 bits */
		x2 = x;
		x2 *= x;
		x1 = x;

		x = (2 * x + div64_64(a, x2)) / 3;
	} while (abs(x1 - x) > 1);
	/* NOTE(review): the iteration count is data-dependent, and abs()
	 * is applied to an unsigned difference converted to int — confirm
	 * termination over the full input range of interest. */

	return x;
}

/*
 * Cube root via Newton-Raphson with a fixed, unrolled-equivalent
 * iteration count: the seed 2^(fls64(a)/3) is close enough that
 * three refinement steps reach better-than-32-bit precision.
 */
static uint32_t ncubic(uint64_t a)
{
	uint64_t est;
	int pass;

	/* Seed from the bit length: cbrt(a) ~= 2^(log2(a)/3) */
	est = 1u << (fls64(a)/3);

	/* x_{k+1} = (2*x_k + a/x_k^2) / 3, applied three times */
	for (pass = 0; pass < 3; pass++)
		est = (2 * est + div64_64(a, est * est)) / 3;

	return est;
}

/* Known-answer table: input value and expected floor(cbrt(input)).
 * Covers small values, exact cubes and their neighbours, powers of
 * two, and 100 random large values.
 * NOTE(review): the {~0, 2097151} entry looks suspect — ~0 converts
 * to 2^64-1, whose cube root truncates to 2642245, while 2097151 is
 * floor(cbrt(2^63-1)); confirm which input was intended. */
static const struct cbrt {
	uint64_t	in;
	uint32_t	result;
} cases[] = {
	{1, 1}, {2, 1}, {3, 1}, {4, 1}, {5, 1}, {6, 1}, {7, 1}, 
	{8, 2}, {9, 2}, {10, 2}, {11, 2}, {12, 2}, {13, 2}, {14, 2},
	{15, 2}, {16, 2}, {17, 2}, {18, 2}, {19, 2}, {20, 2}, {21, 2}, {22, 2},
	{23, 2}, {24, 2}, {25, 2}, {26, 2}, {27, 3}, {28, 3}, {29, 3}, {30, 3},
	{31, 3}, {32, 3}, {33, 3}, {34, 3}, {35, 3}, {36, 3}, {37, 3}, {38, 3},
	{39, 3}, {40, 3}, {99, 4}, {100, 4}, {101, 4}, 
	{ 125ull, 5 }, { 216ull, 6 },  { 343ull, 7 }, { 512ull, 8 }, 
	{ 1000ull, 10 },  { 1331ull, 11 },
	{ 8000ull, 20 },  { 9261ull, 21 },
	{32767, 31},	 {32768, 32}, {32769, 32}, 
	{ 64000ull, 40 },  { 68921ull, 41 },
	{ 512000ull, 80 },  { 531441ull, 81 },
	{ 1000000ull, 100 },  { 1030301ull, 101 },
	{ 4096000ull, 160 },  { 4173281ull, 161 },
	{ 16387064ull, 254 },  { 16581375ull, 255 },
	{ 16777216ull, 256 },  { 16974593ull, 257 },
	{ 131096512ull, 508 },  { 131872229ull, 509 },
	{ 132651000ull, 510 },  { 133432831ull, 511 },
	{ 134217728ull, 512 },  { 135005697ull, 513 },
	{ 1000000000ull, 1000 },  { 1003003001ull, 1001 },
	{ 1006012008ull, 1002 },  { 1009027027ull, 1003 },
	{ 1061208000ull, 1020 },  { 1064332261ull, 1021 },
	{ 1067462648ull, 1022 },  { 1070599167ull, 1023 },
	
	{1073741823, 1023}, {1073741824, 1024}, {1073741825, 1024},
	{~0, 2097151},
/* 100 random values */
 { 7749363893351949254ull, 1978891},  { 7222815480849057907ull, 1933016},
 { 8408462745175416063ull, 2033475},  { 3091884191388096748ull, 1456826},
 { 2562019500164152525ull, 1368340},  { 4403210617922443179ull, 1639041},
 { 3364542905362882299ull, 1498449},  { 8782769017716072774ull, 2063211},
 { 5863405773976003266ull, 1803225},  { 1306053050111174648ull, 1093084},
 { 150346236956174824ull, 531737},  { 1265737889039205261ull, 1081719},
 { 1445109530774087002ull, 1130577},  { 1197105577171186275ull, 1061803},
 { 9213452462461015967ull, 2096399},  { 4730966302945445786ull, 1678739},
 { 5650605098630667570ull, 1781141},  { 5880381756353009591ull, 1804963},
 { 4552499520046621784ull, 1657359},  { 2697991130065918298ull, 1392131},
 { 4858364911220984157ull, 1693674},  { 3691457481531040535ull, 1545489},
 { 2613117305472506601ull, 1377377},  { 7449943749836318932ull, 1953069},
 { 643378865959570610ull, 863287},  { 4851450802679832774ull, 1692871},
 { 1772859812839988916ull, 1210295},  { 8210946489571640849ull, 2017426},
 { 591875965497384322ull, 839608},  { 4221553402965100097ull, 1616183},
 { 2197744667347238205ull, 1300146},  { 8321400714356781191ull, 2026432},
 { 2459557415995497961ull, 1349850},  { 3460673533926954145ull, 1512586},
 { 4727304344741345505ull, 1678306},  { 4903203917250634599ull, 1698869},
 { 4036494370831490817ull, 1592214},  { 8585205035691420311ull, 2047624},
 { 2622143824319236828ull, 1378961},  { 5902762718897731478ull, 1807250},
 { 6344401509618197560ull, 1851243},  { 4059247793194552874ull, 1595200},
 { 7648030174294342832ull, 1970228},  { 2111858627070002939ull, 1282985},
 { 3231502273651985583ull, 1478432},  { 8821862535190318932ull, 2066268},
 { 6062559696943389464ull, 1823414},  { 4054224670122353756ull, 1594541},
 { 3674929609692563482ull, 1543179},  { 6310802012126231363ull, 1847969},
 { 4450190829039920890ull, 1644849},  { 8764531173541462842ull, 2061782},
 { 1361923252301505833ull, 1108453},  { 5912924843615600614ull, 1808287},
 { 5714768882048811324ull, 1787857},  { 7249589769047033748ull, 1935401},
 { 4123157012528828376ull, 1603528},  { 1729687638268160097ull, 1200390},
 { 5132287771298228729ull, 1724925},  { 1564349257200314043ull, 1160854},
 { 951586254223522969ull, 983594},  { 4569664949094662293ull, 1659439},
 { 9082730968228181483ull, 2086437},  { 6312891027251024051ull, 1848173},
 { 6915415788559031791ull, 1905194},  { 2713150456497618688ull, 1394733},
 { 5390954890749602465ull, 1753430},  { 1405547745908296421ull, 1120164},
 { 1157301728707637259ull, 1049902},  { 1513573187112042448ull, 1148156},
 { 687416080475161159ull, 882551},  { 484496930861389501ull, 785411},
 { 1625256440396143907ull, 1175729},  { 7358388240824901288ull, 1945035},
 { 6055730836615196283ull, 1822729},  { 5897962221937294789ull, 1806760},
 { 862205218853780339ull, 951780},  { 4798091009445823173ull, 1686641},
 { 644772714391937867ull, 863910},  { 4255852691293155171ull, 1620549},
 { 5287931004512034672ull, 1742188},  { 479051048987854372ull, 782457},
 { 9223312736680112286ull, 2097147},  { 8208392001457969628ull, 2017217},
 { 9203071384420047828ull, 2095612},  { 8029313043584389618ull, 2002439},
 { 38384068872053008ull, 337326},  { 5477688516749455419ull, 1762784},
 { 1504622508868036557ull, 1145888},  { 8421184723110053200ull, 2034500},
 { 3312070181890020423ull, 1490618},  { 5344298403762143580ull, 1748357},
 { 6340030040222269807ull, 1850818},  { 4895839553118470425ull, 1698018},
 { 2806627376195262363ull, 1410570},  { 5321619225005368821ull, 1745880},
 { 6897323351052656353ull, 1903532},  { 326700202259382556ull, 688731},
 { 7685269066741890339ull, 1973420},  { 8054506481558450217ull, 2004531},
};
#define NCASES (sizeof(cases)/sizeof(cases[0]))


static double ticks_per_usec;

/*
 * show - print one statistics row for a cube-root implementation
 * @func: label for the row
 * @sum:  total TSC ticks over all NCASES calls
 * @sq:   sum of squared per-call tick counts (second moment)
 * @mx:   worst single-call tick count
 * @err:  accumulated absolute error versus the reference results
 *
 * Converts ticks to microseconds via the calibrated ticks_per_usec.
 * std = sqrt(E[t^2] - E[t]^2); the second moment is in ticks^2, so it
 * must be scaled by ticks_per_usec squared (the old code scaled it only
 * once, printing a std deviation in inconsistent units).
 */
static void show(const char *func, uint64_t sum, uint64_t sq, 
		 unsigned long long mx, unsigned long err)
{
	double mean, std;

	mean = (double) sum / ticks_per_usec / NCASES ;
	std = sqrtl( (double) sq / (ticks_per_usec * ticks_per_usec) / NCASES
		     - mean * mean);

	printf("%-10s %8llu %8.2f %8.2f %8.2f %lu\n", func, 
	       (unsigned long long) sum / NCASES, mean, std, 
	       (double) mx / ticks_per_usec, err);
}


/*
 * Benchmark driver: calibrates the TSC against sleep(2), then times
 * each cube-root implementation over the precomputed cases[] table.
 */
int main(int argc, char **argv)
{
	int i;
	uint32_t c;
	unsigned long long start, end, t, mx;
	unsigned long long err, sum, sum_sq;

	/* TSC ticks across a 2 second sleep -> ticks per microsecond */
	rdtscll(start);
	sleep(2);
	rdtscll(end);
	ticks_per_usec = (double) (end - start) / 2000000.;
	printf("Function     clocks  mean(us) max(us)  std(us)  total error\n");

/*
 * Sanity-check func(27) == 3, then time each table entry, accumulating
 * total ticks, squared ticks (for std dev), the worst case, and the
 * absolute error against the precomputed reference result.
 */
#define DOTEST(func) \
	if (func(27) != 3) printf("ouch\n"); \
	sum = sum_sq = mx = 0; \
	for (err = 0, i = 0; i < NCASES; i++) { \
		rdtscll(start);						\
		c = func(cases[i].in); \
		rdtscll(end); \
		t = end - start; sum += t; sum_sq += t*t; \
		if (t > mx) mx = t; \
		err += abs((long) cases[i].result - c); \
	} \
	show(#func, sum, sum_sq, mx, err);

	DOTEST(ocubic);
	DOTEST(ncubic);
	DOTEST(acbrt);
	DOTEST(hcbrt);
	return 0;
}

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support II
  2007-03-06 14:04           ` [RFC] div64_64 support II Andi Kleen
  2007-03-06 17:43             ` Dagfinn Ilmari Mannsåker
@ 2007-03-06 18:48             ` H. Peter Anvin
  1 sibling, 0 replies; 62+ messages in thread
From: H. Peter Anvin @ 2007-03-06 18:48 UTC (permalink / raw)
  To: Andi Kleen; +Cc: David Miller, shemminger, jengelh, linux-kernel, netdev

Andi Kleen wrote:
>> The problem with these algorithms that trade off one or more
>> multiplies in order to avoid a divide is that they don't
>> give anything and often lose when both multiplies and
>> divides are emulated in software.
> 
> Actually on rereading this: is there really any Linux port
> that emulates multiplies in software? I thought that was only
> done on really small microcontrollers or smart cards; but anything
> 32bit+ that runs Linux should have hardware multiply, shouldn't it?

SPARC < v8 does multiplies using an MSTEP instruction.

	-hpa

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support
  2007-03-06 14:45             ` Andi Kleen
  2007-03-06 15:10               ` Roland Kuhn
@ 2007-03-06 18:50               ` H. Peter Anvin
  1 sibling, 0 replies; 62+ messages in thread
From: H. Peter Anvin @ 2007-03-06 18:50 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Eric Dumazet, Stephen Hemminger, Jan Engelhardt, linux-kernel, netdev

Andi Kleen wrote:
>> <rant>
>> Let me see... You throw code like that and expect someone to actually 
>> understand it in one year, and be able to correct a bug ?
> 
> To be honest I don't expect any bugs in this function.
> 
>> </rant>
>>
>> Please add something, an URL or even better a nice explanation, per favor...
> 
> It's straight out of Hacker's delight which is referenced in the commit
> log.

Referencing it in a comment would have been a better idea.

	-hpa

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support
  2007-03-06 18:29                 ` Stephen Hemminger
@ 2007-03-06 19:48                   ` Andi Kleen
  2007-03-06 20:04                     ` Stephen Hemminger
  2007-03-06 21:53                   ` Sami Farin
  2007-03-06 21:58                   ` [RFC] div64_64 support David Miller
  2 siblings, 1 reply; 62+ messages in thread
From: Andi Kleen @ 2007-03-06 19:48 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Roland Kuhn, Andi Kleen, Eric Dumazet, Jan Engelhardt,
	linux-kernel, netdev

On Tue, Mar 06, 2007 at 10:29:41AM -0800, Stephen Hemminger wrote:
> Don't count the existing Newton-Raphson out. It turns out that to get enough
> precision for 32 bits, only 4 iterations are needed. By unrolling those, it
> gets much better timing.

But did you fix the >2^43 bug too?

SGI has already shipped 10TB Altixen, so it's not entirely theoretical.

-Andi


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support
  2007-03-06 19:48                   ` Andi Kleen
@ 2007-03-06 20:04                     ` Stephen Hemminger
  0 siblings, 0 replies; 62+ messages in thread
From: Stephen Hemminger @ 2007-03-06 20:04 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Roland Kuhn, Andi Kleen, Eric Dumazet, Jan Engelhardt,
	linux-kernel, netdev

On Tue, 6 Mar 2007 20:48:41 +0100
Andi Kleen <andi@firstfloor.org> wrote:

> On Tue, Mar 06, 2007 at 10:29:41AM -0800, Stephen Hemminger wrote:
> > Don't count the existing Newton-Raphson out. It turns out that to get enough
> > precision for 32 bits, only 4 iterations are needed. By unrolling those, it
> > gets much better timing.
> 
> But did you fix the >2^43 bug too?

It was caused by not doing x^2 in 64 bit.

> 
> SGI has already shipped 10TB Altixen, so it's not entirely theoretical.
> 
> -Andi
> 


-- 
Stephen Hemminger <shemminger@linux-foundation.org>

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support
  2007-03-06 18:29                 ` Stephen Hemminger
  2007-03-06 19:48                   ` Andi Kleen
@ 2007-03-06 21:53                   ` Sami Farin
  2007-03-06 22:24                     ` Sami Farin
  2007-03-06 21:58                   ` [RFC] div64_64 support David Miller
  2 siblings, 1 reply; 62+ messages in thread
From: Sami Farin @ 2007-03-06 21:53 UTC (permalink / raw)
  To: linux-kernel, netdev

On Tue, Mar 06, 2007 at 10:29:41 -0800, Stephen Hemminger wrote:
> Don't count the existing Newton-Raphson out. It turns out that to get enough
> precision for 32 bits, only 4 iterations are needed. By unrolling those, it
> gets much better timing.
> 
> Slightly gross test program (with original cubic wraparound bug fixed).
...
> 	{~0, 2097151},
             ^^^^^^^
this should be 2642245.

Without serializing instruction before rdtsc and with one loop
I do not get very accurate results (104 for ncubic, > 1000 for others).

#define rdtscll_serialize(val) \
  __asm__ __volatile__("movl $0, %%eax\n\tcpuid\n\trdtsc\n" : "=A" (val) : : "ebx", "ecx")

Here Pentium D timings for 1000 loops. 

~0, 2097151

Function     clocks mean(us)  max(us)  std(us) total error
ocubic          912    0.306   20.317    0.730      545101
ncubic          777    0.261   14.799    0.486      576263
acbrt          1168    0.392   21.681    0.547      547562
hcbrt           827    0.278   15.244    0.387        2410

~0, 2642245

Function     clocks mean(us)  max(us)  std(us) total error
ocubic          908    0.305   20.210    0.656           7
ncubic          775    0.260   14.792    0.550       31169
acbrt          1176    0.395   22.017    0.970        2468
hcbrt           826    0.278   15.326    0.670      547504

And I found bug in gcc-4.1.2, it gave 0 for ncubic results
when doing 1000 loops test... gcc-4.0.3 works.

-- 

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support
  2007-03-06 18:29                 ` Stephen Hemminger
  2007-03-06 19:48                   ` Andi Kleen
  2007-03-06 21:53                   ` Sami Farin
@ 2007-03-06 21:58                   ` David Miller
  2007-03-06 22:47                     ` [PATCH] tcp_cubic: faster cube root Stephen Hemminger
  2 siblings, 1 reply; 62+ messages in thread
From: David Miller @ 2007-03-06 21:58 UTC (permalink / raw)
  To: shemminger; +Cc: rkuhn, andi, dada1, jengelh, linux-kernel, netdev

From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Tue, 6 Mar 2007 10:29:41 -0800

> /* calculate the cubic root of x using Newton-Raphson */
> static uint32_t ncubic(uint64_t a)
> {
> 	uint64_t x;
> 
> 	/* Initial estimate is based on:
> 	 * cbrt(x) = exp(log(x) / 3)
> 	 */
> 	x = 1u << (fls64(a)/3);
> 
> 	/* Converges in 3 iterations to > 32 bits */
> 
> 	x = (2 * x + div64_64(a, x*x)) / 3;
> 	x = (2 * x + div64_64(a, x*x)) / 3;
> 	x = (2 * x + div64_64(a, x*x)) / 3;
> 
> 	return x;
> }

Indeed that will be the fastest variant for cpus with hw
integer division.

I did a quick sparc64 port, here is what I got:

Function     clocks  mean(us) max(us)  std(us)  total error
ocubic          529     0.35    15.16     0.66 545101
ncubic          498     0.33    12.83     0.36 576263
acbrt           427     0.28    11.04     0.33 547562
hcbrt           393     0.26    10.18     0.47 2410

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support
  2007-03-06 21:53                   ` Sami Farin
@ 2007-03-06 22:24                     ` Sami Farin
  2007-03-07  0:00                       ` Stephen Hemminger
                                         ` (2 more replies)
  0 siblings, 3 replies; 62+ messages in thread
From: Sami Farin @ 2007-03-06 22:24 UTC (permalink / raw)
  To: linux-kernel, netdev

On Tue, Mar 06, 2007 at 23:53:49 +0200, Sami Farin wrote:
...
> And I found bug in gcc-4.1.2, it gave 0 for ncubic results
> when doing 1000 loops test... gcc-4.0.3 works.

Found it.

--- cbrt-test.c~	2007-03-07 00:20:54.735248105 +0200
+++ cbrt-test.c	2007-03-07 00:21:03.964864343 +0200
@@ -209,7 +209,7 @@
 
 	__asm__("bsrl %1,%0\n\t"
 		"cmovzl %2,%0"
-		: "=&r" (r) : "rm" (x), "rm" (-1));
+		: "=&r" (r) : "rm" (x), "rm" (-1) : "memory");
 	return r+1;
 }
 
Now Linux 2.6 does not have "memory" in fls, maybe it causes
some gcc funnies some people are seeing.

-- 

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [PATCH] tcp_cubic: faster cube root
  2007-03-06 21:58                   ` [RFC] div64_64 support David Miller
@ 2007-03-06 22:47                     ` Stephen Hemminger
  2007-03-06 22:58                       ` cube root benchmark code Stephen Hemminger
  2007-03-07  4:20                       ` [PATCH] tcp_cubic: faster cube root David Miller
  0 siblings, 2 replies; 62+ messages in thread
From: Stephen Hemminger @ 2007-03-06 22:47 UTC (permalink / raw)
  To: David Miller; +Cc: rkuhn, andi, dada1, jengelh, linux-kernel, netdev

The Newton-Raphson method is quadratically convergent so
only a small fixed number of steps are necessary.
Therefore it is faster to unroll the loop. Since div64_64 is no longer
inline it won't cause code explosion.

Also fixes a bug that can occur if x^2 was bigger than 32 bits.

Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>

---
 net/ipv4/tcp_cubic.c |   16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

--- net-2.6.22.orig/net/ipv4/tcp_cubic.c	2007-03-06 12:24:34.000000000 -0800
+++ net-2.6.22/net/ipv4/tcp_cubic.c	2007-03-06 14:43:37.000000000 -0800
@@ -96,23 +96,17 @@
  */
 static u32 cubic_root(u64 a)
 {
-	u32 x, x1;
+	u64 x;
 
 	/* Initial estimate is based on:
 	 * cbrt(x) = exp(log(x) / 3)
 	 */
 	x = 1u << (fls64(a)/3);
 
-	/*
-	 * Iteration based on:
-	 *                         2
-	 * x    = ( 2 * x  +  a / x  ) / 3
-	 *  k+1          k         k
-	 */
-	do {
-		x1 = x;
-		x = (2 * x + (uint32_t) div64_64(a, x*x)) / 3;
-	} while (abs(x1 - x) > 1);
+	/* converges to 32 bits in 3 iterations */
+	x = (2 * x + div64_64(a, x*x)) / 3;
+	x = (2 * x + div64_64(a, x*x)) / 3;
+	x = (2 * x + div64_64(a, x*x)) / 3;
 
 	return x;
 }

^ permalink raw reply	[flat|nested] 62+ messages in thread

* cube root benchmark code
  2007-03-06 22:47                     ` [PATCH] tcp_cubic: faster cube root Stephen Hemminger
@ 2007-03-06 22:58                       ` Stephen Hemminger
  2007-03-07  6:08                         ` Update to " Willy Tarreau
  2007-03-07  4:20                       ` [PATCH] tcp_cubic: faster cube root David Miller
  1 sibling, 1 reply; 62+ messages in thread
From: Stephen Hemminger @ 2007-03-06 22:58 UTC (permalink / raw)
  To: David Miller; +Cc: rkuhn, andi, dada1, jengelh, linux-kernel, netdev

Here is a better version of the benchmark code.
It has the original code used in 2.4 version of Cubic for comparison

-----------------------------------------------------------
/* Test and measure perf of cube root algorithms.  */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>
#include <unistd.h>

#ifdef __x86_64

#define rdtscll(val) do { \
     unsigned int __a,__d; \
     asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \
     (val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \
} while(0)

# define do_div(n,base) ({					\
	uint32_t __base = (base);				\
	uint32_t __rem;						\
	__rem = ((uint64_t)(n)) % __base;			\
	(n) = ((uint64_t)(n)) / __base;				\
	__rem;							\
 })


/**
 * __ffs - find first bit in word.
 * @word: The word to search
 *
 * Undefined if no bit exists, so code should check against 0 first.
 */
static __inline__ unsigned long __ffs(unsigned long word)
{
	/* bsfq: index of the lowest set bit of the 64-bit word;
	 * result is undefined for word == 0 (caller must check) */
	__asm__("bsfq %1,%0"
		:"=r" (word)
		:"rm" (word));
	return word;
}

/*
 * __fls: find last bit set.
 * @word: The word to search
 *
 * Undefined if no zero exists, so code should check against ~0UL first.
 */
static inline unsigned long __fls(unsigned long word)
{
	/* bsrq: index of the highest set bit of the 64-bit word;
	 * result is undefined for word == 0 (caller must check) */
	__asm__("bsrq %1,%0"
		:"=r" (word)
		:"rm" (word));
	return word;
}

/**
 * ffs - find first bit set
 * @x: the word to search
 *
 * This is defined the same way as
 * the libc and compiler builtin ffs routines, therefore
 * differs in spirit from the above ffz (man ffs).
 *
 * The output must be early-clobbered ("=&r"): bsfl writes %0 before
 * cmovzl reads %2, so the compiler must not allocate %0 and %2 to the
 * same register.  The fls() variant below already gets this right;
 * without "&" the -1 fallback can be silently corrupted.
 */
static __inline__ int ffs(int x)
{
	int r;

	__asm__("bsfl %1,%0\n\t"
		"cmovzl %2,%0" 
		: "=&r" (r) : "rm" (x), "rm" (-1));
	return r+1;
}

/**
 * fls - find last bit set
 * @x: the word to search
 *
 * This is defined the same way as ffs.
 */
static inline int fls(int x)
{
	int r;

	/* bsrl: index of the highest set bit; when x == 0, ZF is set and
	 * cmovzl substitutes -1 so that r+1 yields 0 */
	__asm__("bsrl %1,%0\n\t"
		"cmovzl %2,%0"
		: "=&r" (r) : "rm" (x), "rm" (-1));
	return r+1;
}

/**
 * fls64 - find last bit set in 64 bit word
 * @x: the word to search
 *
 * This is defined the same way as fls.
 */
/* fls64 - 1-based position of the highest set bit, or 0 when x == 0 */
static inline int fls64(uint64_t x)
{
	return x ? (int)__fls(x) + 1 : 0;
}

/* div64_64 - full 64/64-bit divide; native on 64-bit, so just divide.
 * Divisor of 0 is the caller's problem, as with plain '/'. */
static inline uint64_t div64_64(uint64_t dividend, uint64_t divisor)
{
	return dividend / divisor;
}

#elif __i386

#define rdtscll(val) \
     __asm__ __volatile__("rdtsc" : "=A" (val))

/**
 * ffs - find first bit set
 * @x: the word to search
 *
 * This is defined the same way as
 * the libc and compiler builtin ffs routines, therefore
 * differs in spirit from the above ffz() (man ffs).
 */
static inline int ffs(int x)
{
	int r;

	/* bsfl leaves %0 undefined when x == 0; the jnz/movl pair
	 * substitutes -1 so that r+1 returns 0 in that case */
	__asm__("bsfl %1,%0\n\t"
		"jnz 1f\n\t"
		"movl $-1,%0\n"
		"1:" : "=r" (r) : "rm" (x));
	return r+1;
}

/**
 * fls - find last bit set
 * @x: the word to search
 *
 * This is defined the same way as ffs().
 */
static inline int fls(int x)
{
	int r;

	/* bsrl leaves %0 undefined when x == 0; the jnz/movl pair
	 * substitutes -1 so that r+1 returns 0 in that case */
	__asm__("bsrl %1,%0\n\t"
		"jnz 1f\n\t"
		"movl $-1,%0\n"
		"1:" : "=r" (r) : "rm" (x));
	return r+1;
}

/* fls64 - 1-based position of the highest set bit of a 64-bit value,
 * 0 when no bit is set; built from the 32-bit fls() */
static inline int fls64(uint64_t x)
{
	uint32_t hi = x >> 32;

	return hi ? fls(hi) + 32 : fls((uint32_t)x);
}


#define do_div(n,base) ({ \
	unsigned long __upper, __low, __high, __mod, __base; \
	__base = (base); \
	asm("":"=a" (__low), "=d" (__high):"A" (n)); \
	__upper = __high; \
	if (__high) { \
		__upper = __high % (__base); \
		__high = __high / (__base); \
	} \
	asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), "1" (__upper)); \
	asm("":"=A" (n):"a" (__low),"d" (__high)); \
	__mod; \
})


/* 64bit divisor, dividend and result. dynamic precision */
static uint64_t div64_64(uint64_t dividend, uint64_t divisor)
{
	uint32_t d = divisor;

	if (divisor > 0xffffffffULL) {
		/* Scale both operands down until the divisor fits in 32
		 * bits.  Truncating the divisor makes the quotient an
		 * approximation (it can be slightly high) whenever the
		 * divisor does not fit in 32 bits. */
		unsigned int shift = fls(divisor >> 32);

		d = divisor >> shift;
		dividend >>= shift;
	}

	/* avoid 64 bit division if possible */
	if (dividend >> 32)
		do_div(dividend, d);
	else
		dividend = (uint32_t) dividend / d;

	return dividend;
}
#endif

/* Andi Kleen's version: digit-by-digit (base 8) integer cube root.
 * Returns floor(cbrt(x)) for the full 64-bit range; the b == (bs>>s)
 * test rejects probes whose left shift overflowed 64 bits (such a
 * probe always exceeds x anyway). */
uint32_t acbrt(uint64_t x)
{
	uint32_t y = 0;
	int s;

	for (s = 63; s >= 0; s -= 3) {
		uint64_t b, bs;

		y = 2 * y;
		/* must be 64-bit arithmetic: as a uint32_t expression
		 * 3*y*(y+1) overflows once y exceeds ~37837, which made
		 * the old code wrong for large x */
		b = 3ull * y * (y+1) + 1;
		bs = b << s;
		if (x >= bs && (b == (bs>>s))) {  /* avoid overflow */
			x -= bs;
			y++;
		}
	}
	return y;
}

/* My version of hacker's delight */
/* Digit-by-digit (base 8) integer cube root: exact floor(cbrt(x)) for
 * x < 2^63 (s starts at 60, so the result has at most 21 bits). */
uint32_t hcbrt(uint64_t x)
{
	int s = 60;
	uint32_t y = 0;

	do {
		uint64_t b;
		y = 2*y;
		/* must be 64-bit arithmetic: as a uint32_t expression
		 * 3*y*(y+1) overflows once y exceeds ~37837, which made
		 * the old code wrong for large x */
		b = (3ull*y*(y + 1) + 1) << s;
		s = s - 3;
		if (x >= b) {
			x = x - b;
			y = y + 1;
		}
	} while(s >= 0);

	return y;
}

/* calculate the cubic root of x using Newton-Raphson */
static uint32_t ocubic(uint64_t a)
{
	uint32_t x, x1;

	/* Initial estimate is based on:
	 * cbrt(x) = exp(log(x) / 3)
	 */
	x = 1u << (fls64(a)/3);

	/*
	 * Iteration based on:
	 *                         2
	 * x    = ( 2 * x  +  a / x  ) / 3
	 *  k+1          k         k
	 */
	do {
		x1 = x;

		/* x1 - x wraps modulo 2^32 and is then narrowed to int for
		 * abs(); this is what lets the +/-1 oscillation between
		 * successive iterates terminate the loop */
		x = (2 * x + div64_64(a, (uint64_t)x * x)) / 3;
	} while (abs(x1 - x) > 1);

	return x;
}

/* calculate the cubic root of x using Newton-Raphson */
static uint32_t ncubic(uint64_t a)
{
	uint64_t x;

	/* For a == 0 the first iteration maps x = 1 to 0, and the next
	 * div64_64(a, x*x) would divide by zero; bail out early. */
	if (a == 0)
		return 0;

	/* Initial estimate is based on:
	 * cbrt(x) = exp(log(x) / 3)
	 */
	x = 1u << (fls64(a)/3);

	/* Converges in 3 iterations to > 32 bits */
	x = (2 * x + div64_64(a, x*x)) / 3;
	x = (2 * x + div64_64(a, x*x)) / 3;
	x = (2 * x + div64_64(a, x*x)) / 3;

	return x;
}

/* 65536 times the cubic root of 0,    1,     2,     3,      4,      5,      6,      7*/
static uint64_t bictcp_table[8] = {0, 65536, 82570, 94519, 104030, 112063, 119087, 125367};

/* calculate the cubic root of x
   the basic idea is that x can be expressed as i*8^j
   so cubic_root(x) = cubic_root(i)*2^j
   in the following code, x is i, and y is 2^j
   because of integer calculation, there are errors in calculation
   so finally use binary search to find out the exact solution*/
static uint32_t bictcp(uint64_t x)
{
        uint64_t y, app, target, start, end, mid, start_diff, end_diff;

        if (x == 0)
                return 0;

        target = x;

        /*first estimate lower and upper bound*/
        y = 1;
        while (x >= 8){
                x = (x >> 3);
                y = (y << 1);
        }
        /* here x is the leading base-8 digit (1..7) and y = 2^j */
        start = (y*bictcp_table[x])>>16;
        if (x==7)
                end = (y<<1);
        else
                end = (y*bictcp_table[x+1]+65535)>>16;

        /*binary search for more accurate one*/
        while (start < end-1) {
                mid = (start+end) >> 1;
                app = mid*mid*mid;
                if (app < target)
                        start = mid;
                else if (app > target)
                        end = mid;
                else
                        return mid;
        }

        /*find the most accurate one from start and end*/
        app = start*start*start;
        if (app < target)
                start_diff = target - app;
        else
                start_diff = app - target;
        app = end*end*end;
        if (app < target)
                end_diff = target - app;
        else
                end_diff = app - target;

        return (start_diff < end_diff) ? start : end;
}


#define NCASES 1000
static uint64_t cases[NCASES];
static double results[NCASES];

static double ticks_per_usec;
static unsigned long long start, end;

/*
 * dotest - time one cube-root implementation over all test cases
 * @name: label printed in the results row
 * @func: implementation under test
 *
 * Measures the TSC delta around each call, tracking total ticks,
 * sum of squared ticks, and the maximum, then accumulates the relative
 * error against the libm cbrt() reference stored in results[].
 */
static void dotest(const char *name, uint32_t (*func)(uint64_t))
{
	int i;
	unsigned long long t, mx = 0, sum = 0, sum_sq = 0;
	double mean, std, err = 0;

	for (i = 0; i < NCASES; i++) {
		uint64_t x = cases[i];
		uint32_t v;

		rdtscll(start);
		v = (*func)(x);
		rdtscll(end);

		t = end - start;
		if (t > mx) mx = t;
		sum += t; sum_sq += t*t;

		err += fabs(((double) v - results[i]) / results[i]);
	}

	/* convert tick statistics to microseconds for display */
	/* NOTE(review): sum_sq is scaled by ticks_per_usec once rather
	 * than squared, so the printed std is in mixed units — confirm */
	mean = (double) sum / ticks_per_usec / NCASES ;
	std = sqrtl( (double) sum_sq / ticks_per_usec / NCASES - mean * mean);

	printf("%-10s %8llu %8.2f %8.2f %8.2f %.03f%%\n", name, 
	       (unsigned long long) sum / NCASES, mean, std, 
	       (double) mx / ticks_per_usec, err * 100./ NCASES);
}


/*
 * Benchmark driver: calibrates the TSC, builds a table of edge-case and
 * random 64-bit inputs with libm cbrt() reference answers, then times
 * each implementation via dotest().
 */
int main(int argc, char **argv)
{
	uint64_t x;
	int i;

	/* TSC ticks across a 2 second sleep -> ticks per microsecond */
	printf("Calibrating\n");
	rdtscll(start);
	sleep(2);
	rdtscll(end);
	ticks_per_usec = (double) (end - start) / 2000000.;

	/* deterministic edge cases: powers of two, then runs of all-ones
	 * shifted right and left across the whole 64-bit range */
	for (i = 0; i < 63; i++) 
		cases[i] = 1ull << i;
	x = ~0;
	while (x != 0) {
		cases[i++] = x;
		x >>= 1;
	}
	x = ~0;
	while (x != 0) {
		cases[i++] = x;
		x <<= 1;
	}

	/* pad the remainder with pseudo-random 62-bit products */
	while (i < NCASES)
		cases[i++] = (uint64_t) random()  * (uint64_t) random();

	/* reference answers in double precision from libm */
	for (i = 0; i < NCASES; i++) 
		results[i] = cbrt((double)cases[i]);

	printf("Function     clocks  mean(us) max(us)  std(us)  Avg error\n");
	
#define DOTEST(x)	dotest(#x, x)
	DOTEST(bictcp);
	DOTEST(ocubic);
	DOTEST(ncubic);
	DOTEST(acbrt);
	DOTEST(hcbrt);
	return 0;
}

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support
  2007-03-06 22:24                     ` Sami Farin
@ 2007-03-07  0:00                       ` Stephen Hemminger
  2007-03-07  0:05                         ` David Miller
  2007-03-07  0:05                         ` Sami Farin
  2007-03-07 16:11                       ` Chuck Ebbert
  2007-03-08 18:23                       ` asm volatile [Was: [RFC] div64_64 support] Sami Farin
  2 siblings, 2 replies; 62+ messages in thread
From: Stephen Hemminger @ 2007-03-07  0:00 UTC (permalink / raw)
  To: linux-kernel

On Wed, 7 Mar 2007 00:24:35 +0200
Sami Farin <7atbggg02@sneakemail.com> wrote:

> On Tue, Mar 06, 2007 at 23:53:49 +0200, Sami Farin wrote:
> ...
> > And I found bug in gcc-4.1.2, it gave 0 for ncubic results
> > when doing 1000 loops test... gcc-4.0.3 works.
> 
> Found it.
> 
> --- cbrt-test.c~	2007-03-07 00:20:54.735248105 +0200
> +++ cbrt-test.c	2007-03-07 00:21:03.964864343 +0200
> @@ -209,7 +209,7 @@
>  
>  	__asm__("bsrl %1,%0\n\t"
>  		"cmovzl %2,%0"
> -		: "=&r" (r) : "rm" (x), "rm" (-1));
> +		: "=&r" (r) : "rm" (x), "rm" (-1) : "memory");
>  	return r+1;
>  }
>  
> Now Linux 2.6 does not have "memory" in fls, maybe it causes
> some gcc funnies some people are seeing.
> 

That code was copy-paste from:
	include/asm-x86_64/bitops.h

So shouldn't both fls() and ffs() be fixed there as well?

-- 
Stephen Hemminger <shemminger@linux-foundation.org>

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support
  2007-03-07  0:00                       ` Stephen Hemminger
@ 2007-03-07  0:05                         ` David Miller
  2007-03-07  0:05                         ` Sami Farin
  1 sibling, 0 replies; 62+ messages in thread
From: David Miller @ 2007-03-07  0:05 UTC (permalink / raw)
  To: shemminger; +Cc: linux-kernel

From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Tue, 6 Mar 2007 16:00:55 -0800

> On Wed, 7 Mar 2007 00:24:35 +0200
> Sami Farin <7atbggg02@sneakemail.com> wrote:
> 
> > On Tue, Mar 06, 2007 at 23:53:49 +0200, Sami Farin wrote:
> > ...
> > > And I found bug in gcc-4.1.2, it gave 0 for ncubic results
> > > when doing 1000 loops test... gcc-4.0.3 works.
> > 
> > Found it.
> > 
> > --- cbrt-test.c~	2007-03-07 00:20:54.735248105 +0200
> > +++ cbrt-test.c	2007-03-07 00:21:03.964864343 +0200
> > @@ -209,7 +209,7 @@
> >  
> >  	__asm__("bsrl %1,%0\n\t"
> >  		"cmovzl %2,%0"
> > -		: "=&r" (r) : "rm" (x), "rm" (-1));
> > +		: "=&r" (r) : "rm" (x), "rm" (-1) : "memory");
> >  	return r+1;
> >  }
> >  
> > Now Linux 2.6 does not have "memory" in fls, maybe it causes
> > some gcc funnies some people are seeing.
> > 
> 
> That code was copy-paste from:
> 	include/asm-x86_64/bitops.h
> 
> So shouldn't both fls() and ffs() be fixed there as well?

This code doesn't modify memory behind gcc's back, so this
"memory" clobber shouldn't really be needed.

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support
  2007-03-07  0:00                       ` Stephen Hemminger
  2007-03-07  0:05                         ` David Miller
@ 2007-03-07  0:05                         ` Sami Farin
  1 sibling, 0 replies; 62+ messages in thread
From: Sami Farin @ 2007-03-07  0:05 UTC (permalink / raw)
  To: linux-kernel

On Tue, Mar 06, 2007 at 16:00:55 -0800, Stephen Hemminger wrote:
...
> > Now Linux 2.6 does not have "memory" in fls, maybe it causes
> > some gcc funnies some people are seeing.
> > 
> 
> That code was copy-paste from:
> 	include/asm-x86_64/bitops.h
> 
> So shouldn't both fls() and ffs() be fixed there as well?

Yes.  And for both x86_64 and i386.

-- 

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH] tcp_cubic: faster cube root
  2007-03-06 22:47                     ` [PATCH] tcp_cubic: faster cube root Stephen Hemminger
  2007-03-06 22:58                       ` cube root benchmark code Stephen Hemminger
@ 2007-03-07  4:20                       ` David Miller
  2007-03-07 12:12                         ` Andi Kleen
  1 sibling, 1 reply; 62+ messages in thread
From: David Miller @ 2007-03-07  4:20 UTC (permalink / raw)
  To: shemminger; +Cc: rkuhn, andi, dada1, jengelh, linux-kernel, netdev

From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Tue, 6 Mar 2007 14:47:06 -0800

> The Newton-Raphson method is quadratically convergent so
> only a small fixed number of steps are necessary.
> Therefore it is faster to unroll the loop. Since div64_64 is no longer
> inline it won't cause code explosion.
> 
> Also fixes a bug that can occur if x^2 was bigger than 32 bits.
> 
> Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>

Applied, thanks Stephen.

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Update to cube root benchmark code
  2007-03-06 22:58                       ` cube root benchmark code Stephen Hemminger
@ 2007-03-07  6:08                         ` Willy Tarreau
  2007-03-08  1:07                           ` [PATCH] tcp_cubic: use 32 bit math Stephen Hemminger
  0 siblings, 1 reply; 62+ messages in thread
From: Willy Tarreau @ 2007-03-07  6:08 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: David Miller, rkuhn, andi, dada1, jengelh, linux-kernel, netdev

Hi Stephen,

Thanks for this code, it's easy to experiment with it.
Let me propose this simple update with a variation on your ncubic() function.
I noticed that all intermediate results were far below 32 bits, so I did a
new version which is 30% faster on my athlon with the same results. This is
because we only use x and a/x^2 in the function, with x very close to cbrt(a).
So a/x^2 is very close to cbrt(a) which is at most 22 bits. So we only use
the 32 lower bits of the result of div64_64(), and all intermediate
computations can be done on 32 bits (including multiplies and divides).

willy@pcw:~$ ./bictcp 
Calibrating
Function     clocks  mean(us) max(us)  std(us)  Avg error
bictcp         1085     0.70    28.19     2.30 0.172%
ocubic          869     0.56    22.76     1.23 0.274%
ncubic          637     0.41    16.29     1.41 0.247%
ncubic32        435     0.28    11.18     1.03 0.247%
acbrt           824     0.53    21.03     0.85 0.275%
hcbrt           547     0.35    13.96     0.42 1.580%

I also tried to improve a bit by checking for early convergence and
returning before last divide, but it is worthless because it almost
never happens so it does not make the code any faster.

Here's the code. I think that it would be fine if we merged this
version since it's supposed to behave better on most 32 bits machines.

Best regards,
Willy

/*
Here is a better version of the benchmark code.
It has the original code used in 2.4 version of Cubic for comparison

-----------------------------------------------------------
*/
/* Test and measure perf of cube root algorithms.  */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>
#include <unistd.h>

#ifdef __x86_64

#define rdtscll(val) do { \
     unsigned int __a,__d; \
     asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \
     (val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \
} while(0)

# define do_div(n,base) ({					\
	uint32_t __base = (base);				\
	uint32_t __rem;						\
	__rem = ((uint64_t)(n)) % __base;			\
	(n) = ((uint64_t)(n)) / __base;				\
	__rem;							\
 })


/**
 * __ffs - find first bit in word.
 * @word: The word to search
 *
 * Undefined if no bit exists, so code should check against 0 first.
 */
static __inline__ unsigned long __ffs(unsigned long word)
{
	__asm__("bsfq %1,%0"
		:"=r" (word)
		:"rm" (word));
	return word;
}

/*
 * __fls: find last bit set.
 * @word: The word to search
 *
 * Undefined if no zero exists, so code should check against ~0UL first.
 */
static inline unsigned long __fls(unsigned long word)
{
	__asm__("bsrq %1,%0"
		:"=r" (word)
		:"rm" (word));
	return word;
}

/**
 * ffs - find first bit set
 * @x: the word to search
 *
 * This is defined the same way as
 * the libc and compiler builtin ffs routines, therefore
 * differs in spirit from the above ffz (man ffs).
 */
static __inline__ int ffs(int x)
{
	int r;

	__asm__("bsfl %1,%0\n\t"
		"cmovzl %2,%0" 
		: "=r" (r) : "rm" (x), "r" (-1));
	return r+1;
}

/**
 * fls - find last bit set
 * @x: the word to search
 *
 * This is defined the same way as ffs.
 */
static inline int fls(int x)
{
	int r;

	__asm__("bsrl %1,%0\n\t"
		"cmovzl %2,%0"
		: "=&r" (r) : "rm" (x), "rm" (-1));
	return r+1;
}

/**
 * fls64 - find last bit set in 64 bit word
 * @x: the word to search
 *
 * This is defined the same way as fls.
 */
static inline int fls64(uint64_t x)
{
	if (x == 0)
		return 0;
	return __fls(x) + 1;
}

/* div64_64 - full 64/64-bit divide; native on 64-bit, so just divide.
 * Divisor of 0 is the caller's problem, as with plain '/'. */
static inline uint64_t div64_64(uint64_t dividend, uint64_t divisor)
{
	return dividend / divisor;
}

#elif __i386

#define rdtscll(val) \
     __asm__ __volatile__("rdtsc" : "=A" (val))

/**
 * ffs - find first bit set
 * @x: the word to search
 *
 * This is defined the same way as
 * the libc and compiler builtin ffs routines, therefore
 * differs in spirit from the above ffz() (man ffs).
 */
static inline int ffs(int x)
{
	int r;

	__asm__("bsfl %1,%0\n\t"
		"jnz 1f\n\t"
		"movl $-1,%0\n"
		"1:" : "=r" (r) : "rm" (x));
	return r+1;
}

/**
 * fls - find last bit set
 * @x: the word to search
 *
 * This is defined the same way as ffs().
 */
static inline int fls(int x)
{
	int r;

	__asm__("bsrl %1,%0\n\t"
		"jnz 1f\n\t"
		"movl $-1,%0\n"
		"1:" : "=r" (r) : "rm" (x));
	return r+1;
}

static inline int fls64(uint64_t x)
{
	uint32_t h = x >> 32;
	if (h)
		return fls(h) + 32;
	return fls(x);
}


#define do_div(n,base) ({ \
	unsigned long __upper, __low, __high, __mod, __base; \
	__base = (base); \
	asm("":"=a" (__low), "=d" (__high):"A" (n)); \
	__upper = __high; \
	if (__high) { \
		__upper = __high % (__base); \
		__high = __high / (__base); \
	} \
	asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), "1" (__upper)); \
	asm("":"=A" (n):"a" (__low),"d" (__high)); \
	__mod; \
})


/* 64bit divisor, dividend and result. dynamic precision */
static uint64_t div64_64(uint64_t dividend, uint64_t divisor)
{
	uint32_t d = divisor;

	if (divisor > 0xffffffffULL) {
		/* Scale both operands down until the divisor fits in 32
		 * bits.  Truncating the divisor makes the quotient an
		 * approximation (it can be slightly high) whenever the
		 * divisor does not fit in 32 bits. */
		unsigned int shift = fls(divisor >> 32);

		d = divisor >> shift;
		dividend >>= shift;
	}

	/* avoid 64 bit division if possible */
	if (dividend >> 32)
		do_div(dividend, d);
	else
		dividend = (uint32_t) dividend / d;

	return dividend;
}
#endif

/* Andi Kleen's version: digit-by-digit (base 8) integer cube root.
 * Returns floor(cbrt(x)) for the full 64-bit range; the b == (bs>>s)
 * test rejects probes whose left shift overflowed 64 bits (such a
 * probe always exceeds x anyway). */
uint32_t acbrt(uint64_t x)
{
	uint32_t y = 0;
	int s;

	for (s = 63; s >= 0; s -= 3) {
		uint64_t b, bs;

		y = 2 * y;
		/* must be 64-bit arithmetic: as a uint32_t expression
		 * 3*y*(y+1) overflows once y exceeds ~37837, which made
		 * the old code wrong for large x */
		b = 3ull * y * (y+1) + 1;
		bs = b << s;
		if (x >= bs && (b == (bs>>s))) {  /* avoid overflow */
			x -= bs;
			y++;
		}
	}
	return y;
}

/* My version of hacker's delight */
/* Digit-by-digit (base 8) integer cube root: exact floor(cbrt(x)) for
 * x < 2^63 (s starts at 60, so the result has at most 21 bits). */
uint32_t hcbrt(uint64_t x)
{
	int s = 60;
	uint32_t y = 0;

	do {
		uint64_t b;
		y = 2*y;
		/* must be 64-bit arithmetic: as a uint32_t expression
		 * 3*y*(y+1) overflows once y exceeds ~37837, which made
		 * the old code wrong for large x */
		b = (3ull*y*(y + 1) + 1) << s;
		s = s - 3;
		if (x >= b) {
			x = x - b;
			y = y + 1;
		}
	} while(s >= 0);

	return y;
}

/* calculate the cubic root of x using Newton-Raphson */
static uint32_t ocubic(uint64_t a)
{
	uint32_t x, x1;

	/* Initial estimate is based on:
	 * cbrt(x) = exp(log(x) / 3)
	 */
	x = 1u << (fls64(a)/3);

	/*
	 * Iteration based on:
	 *                         2
	 * x    = ( 2 * x  +  a / x  ) / 3
	 *  k+1          k         k
	 */
	do {
		x1 = x;

		/* x1 - x wraps modulo 2^32 and is then narrowed to int for
		 * abs(); this is what lets the +/-1 oscillation between
		 * successive iterates terminate the loop */
		x = (2 * x + div64_64(a, (uint64_t)x * x)) / 3;
	} while (abs(x1 - x) > 1);

	return x;
}

/* calculate the cubic root of x using Newton-Raphson */
static uint32_t ncubic(uint64_t a)
{
	uint64_t x;

	/* For a == 0 the first iteration maps x = 1 to 0, and the next
	 * div64_64(a, x*x) would divide by zero; bail out early. */
	if (a == 0)
		return 0;

	/* Initial estimate is based on:
	 * cbrt(x) = exp(log(x) / 3)
	 */
	x = 1u << (fls64(a)/3);

	/* Converges in 3 iterations to > 32 bits */
	x = (2 * x + div64_64(a, x*x)) / 3;
	x = (2 * x + div64_64(a, x*x)) / 3;
	x = (2 * x + div64_64(a, x*x)) / 3;

	return x;
}

/* calculate the cubic root of x using Newton-Raphson */
static uint32_t ncubic32(uint64_t a)
{
	uint32_t est;

	/* Seed with a power of two near cbrt(a) = exp(log(a) / 3). */
	est = 1u << (fls64(a)/3);

	/* Three unrolled Newton steps, est' = (2*est + a/est^2) / 3.
	 * The estimate stays around 22 bits, so only the square needs
	 * 64-bit arithmetic; the rest is done in 32 bits. */
	est = (2 * est + (uint32_t)div64_64(a, (uint64_t)est * (uint64_t)est)) / 3;
	est = (2 * est + (uint32_t)div64_64(a, (uint64_t)est * (uint64_t)est)) / 3;
	est = (2 * est + (uint32_t)div64_64(a, (uint64_t)est * (uint64_t)est)) / 3;

	return est;
}

/* 65536 times the cubic root of 0,    1,     2,     3,      4,      5,      6,      7*/
/* i.e. bictcp_table[i] ~= 2^16 * cbrt(i); bictcp() uses adjacent entries
 * to bracket the root before its binary search. */
static uint64_t bictcp_table[8] = {0, 65536, 82570, 94519, 104030, 112063, 119087, 125367};

/* calculate the cubic root of x
   the basic idea is that x can be expressed as i*8^j
   so cubic_root(x) = cubic_root(i)*2^j
   the table supplies cubic_root(i) scaled by 2^16; integer rounding
   leaves an error, so a binary search pins down the exact answer */
static uint32_t bictcp(uint64_t x)
{
        uint64_t scale, cube, target, lo, hi, mid, lo_diff, hi_diff;

        if (x == 0)
                return 0;

        target = x;

        /* reduce x to [1,7] while collecting the 2^j factor in scale */
        scale = 1;
        while (x >= 8){
                x = (x >> 3);
                scale = (scale << 1);
        }

        /* bracket the root: lo from table[x], hi from table[x+1]
           (rounded up), or 2*scale when x == 7 has no next entry */
        lo = (scale*bictcp_table[x])>>16;
        if (x==7)
                hi = (scale<<1);
        else
                hi = (scale*bictcp_table[x+1]+65535)>>16;

        /* binary search for the exact (or nearest) integer root */
        while (lo < hi-1) {
                mid = (lo+hi) >> 1;
                cube = mid*mid*mid;
                if (cube < target)
                        lo = mid;
                else if (cube > target)
                        hi = mid;
                else
                        return mid;
        }

        /* neither endpoint matched exactly: return the closer one */
        cube = lo*lo*lo;
        lo_diff = (cube < target) ? target - cube : cube - target;
        cube = hi*hi*hi;
        hi_diff = (cube < target) ? target - cube : cube - target;

        return (lo_diff < hi_diff) ? lo : hi;
}


#define NCASES 1000			/* number of benchmark inputs */
static uint64_t cases[NCASES];		/* inputs: single bits, runs of ones, randoms (filled in main) */
static double results[NCASES];		/* reference answers from libm cbrt() */

static double ticks_per_usec;		/* TSC rate, measured by the sleep(2) calibration in main */
static unsigned long long start, end;	/* rdtscll() timestamps used by dotest() */

/*
 * dotest - time one cube-root implementation over every test case and
 * print a summary row: average clocks, mean/max/std in microseconds,
 * and the average relative error against the libm cbrt() reference.
 *
 * Fixes: (1) the variance used sum_sq / ticks_per_usec, i.e. divided
 * the squared-ticks sum by the tick rate only once, mixing units; it is
 * now divided by ticks_per_usec^2 so the result is in us^2.  (2) the
 * printed column order was clocks, mean, std, max while the header in
 * main() reads "clocks mean(us) max(us) std(us)"; max and std are now
 * printed in header order.
 */
static void dotest(const char *name, uint32_t (*func)(uint64_t))
{
	int i;
	unsigned long long t, mx = 0, sum = 0, sum_sq = 0;
	double mean, std, err = 0;

	for (i = 0; i < NCASES; i++) {
		uint64_t x = cases[i];
		uint32_t v;

		rdtscll(start);
		v = (*func)(x);
		rdtscll(end);

		t = end - start;
		if (t > mx) mx = t;
		sum += t; sum_sq += t*t;

		/* accumulate relative error vs. the libm reference */
		err += fabs(((double) v - results[i]) / results[i]);
	}

	mean = (double) sum / ticks_per_usec / NCASES;
	/* E[t^2] - E[t]^2, with t^2 converted by ticks_per_usec^2 */
	std = sqrtl((double) sum_sq / (ticks_per_usec * ticks_per_usec) / NCASES
		    - mean * mean);

	printf("%-10s %8llu %8.2f %8.2f %8.2f %.03f%%\n", name,
	       (unsigned long long) sum / NCASES, mean,
	       (double) mx / ticks_per_usec, std,
	       err * 100. / NCASES);
}


/*
 * Calibrate the TSC, build the input and reference tables, then run
 * dotest() over every cube-root implementation.
 */
int main(int argc, char **argv)
{
	uint64_t x;
	int i;

	/* measure TSC ticks per microsecond across a 2 s sleep */
	printf("Calibrating\n");
	rdtscll(start);
	sleep(2);
	rdtscll(end);
	ticks_per_usec = (double) (end - start) / 2000000.;

	/* inputs: each single bit, then descending and ascending runs
	 * of ones covering the high bits */
	for (i = 0; i < 63; i++) 
		cases[i] = 1ull << i;
	x = ~0;
	while (x != 0) {
		cases[i++] = x;
		x >>= 1;
	}
	x = ~0;
	while (x != 0) {
		cases[i++] = x;
		x <<= 1;
	}

	/* fill the rest with products of two random() values; random()
	 * yields at most 31 bits, so these cover up to ~62 bits */
	while (i < NCASES)
		cases[i++] = (uint64_t) random()  * (uint64_t) random();

	/* reference answers via floating-point cbrt() */
	for (i = 0; i < NCASES; i++) 
		results[i] = cbrt((double)cases[i]);

	printf("Function     clocks  mean(us) max(us)  std(us)  Avg error\n");
	
#define DOTEST(x)	dotest(#x, x)
	DOTEST(bictcp);
	DOTEST(ocubic);
	DOTEST(ncubic);
	DOTEST(ncubic32);
	DOTEST(acbrt);
	DOTEST(hcbrt);
	return 0;
}



^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH] tcp_cubic: faster cube root
  2007-03-07  4:20                       ` [PATCH] tcp_cubic: faster cube root David Miller
@ 2007-03-07 12:12                         ` Andi Kleen
  2007-03-07 19:33                           ` David Miller
  0 siblings, 1 reply; 62+ messages in thread
From: Andi Kleen @ 2007-03-07 12:12 UTC (permalink / raw)
  To: David Miller
  Cc: shemminger, rkuhn, andi, dada1, jengelh, linux-kernel, netdev

On Tue, Mar 06, 2007 at 08:20:52PM -0800, David Miller wrote:
> From: Stephen Hemminger <shemminger@linux-foundation.org>
> Date: Tue, 6 Mar 2007 14:47:06 -0800
> 
> > The Newton-Raphson method is quadratically convergent so
> > only a small fixed number of steps are necessary.
> > Therefore it is faster to unroll the loop. Since div64_64 is no longer
> > inline it won't cause code explosion.
> > 
> > Also fixes a bug that can occur if x^2 was bigger than 32 bits.
> > 
> > Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>
> 
> Applied, thanks Stephen.

Well that still needs the ugly div64_64 function. At least my goal was to 
eliminate that, not make it faster (I don't see any evidence this function 
is particularly performance critical). You prefer to keep div64_64? 

-Andi

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support
  2007-03-06 22:24                     ` Sami Farin
  2007-03-07  0:00                       ` Stephen Hemminger
@ 2007-03-07 16:11                       ` Chuck Ebbert
  2007-03-07 18:32                         ` Sami Farin
  2007-03-08 18:23                       ` asm volatile [Was: [RFC] div64_64 support] Sami Farin
  2 siblings, 1 reply; 62+ messages in thread
From: Chuck Ebbert @ 2007-03-07 16:11 UTC (permalink / raw)
  To: Sami Farin; +Cc: linux-kernel, netdev

Sami Farin wrote:
> On Tue, Mar 06, 2007 at 23:53:49 +0200, Sami Farin wrote:
> ...
>> And I found bug in gcc-4.1.2, it gave 0 for ncubic results
>> when doing 1000 loops test... gcc-4.0.3 works.
> 
> Found it.
> 
> --- cbrt-test.c~	2007-03-07 00:20:54.735248105 +0200
> +++ cbrt-test.c	2007-03-07 00:21:03.964864343 +0200
> @@ -209,7 +209,7 @@
>  
>  	__asm__("bsrl %1,%0\n\t"
>  		"cmovzl %2,%0"
> -		: "=&r" (r) : "rm" (x), "rm" (-1));
> +		: "=&r" (r) : "rm" (x), "rm" (-1) : "memory");
>  	return r+1;
>  }
>  
> Now Linux 2.6 does not have "memory" in fls, maybe it causes
> some gcc funnies some people are seeing.

Can you post the difference in the generated code with that change?


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC] div64_64 support
  2007-03-07 16:11                       ` Chuck Ebbert
@ 2007-03-07 18:32                         ` Sami Farin
  0 siblings, 0 replies; 62+ messages in thread
From: Sami Farin @ 2007-03-07 18:32 UTC (permalink / raw)
  To: linux-kernel, netdev

[-- Attachment #1: Type: text/plain, Size: 6133 bytes --]

On Wed, Mar 07, 2007 at 11:11:49 -0500, Chuck Ebbert wrote:
> Sami Farin wrote:
> > On Tue, Mar 06, 2007 at 23:53:49 +0200, Sami Farin wrote:
> > ...
> >> And I found bug in gcc-4.1.2, it gave 0 for ncubic results
> >> when doing 1000 loops test... gcc-4.0.3 works.
> > 
> > Found it.
> > 
> > --- cbrt-test.c~	2007-03-07 00:20:54.735248105 +0200
> > +++ cbrt-test.c	2007-03-07 00:21:03.964864343 +0200
> > @@ -209,7 +209,7 @@
> >  
> >  	__asm__("bsrl %1,%0\n\t"
> >  		"cmovzl %2,%0"
> > -		: "=&r" (r) : "rm" (x), "rm" (-1));
> > +		: "=&r" (r) : "rm" (x), "rm" (-1) : "memory");
> >  	return r+1;
> >  }
> >  
> > Now Linux 2.6 does not have "memory" in fls, maybe it causes
> > some gcc funnies some people are seeing.
> 
> Can you post the difference in the generated code with that change?

Fun.. looks when not using "memory" gcc does not even bother
calling ncubic() 666 times.  So it gets better timings ( 42/666=0 ) =)

--- cbrt-test-no_memory.s	2007-03-07 20:22:27.838466385 +0200
+++ cbrt-test-using_memory.s	2007-03-07 20:22:38.237013197 +0200
...
 main:
 	leal	4(%esp), %ecx
 	andl	$-16, %esp
 	pushl	-4(%ecx)
 	pushl	%ebp
 	pushl	%edi
 	pushl	%esi
 	pushl	%ebx
 	pushl	%ecx
-	subl	$136, %esp
+	subl	$152, %esp
 	movl	$.LC0, (%esp)
 	call	puts
 	xorl	%edx, %edx
 	movl	$27, %eax
 	call	ncubic
 	cmpl	$3, %eax
-	je	.L83
+	je	.L87
 	movl	$.LC1, (%esp)
 	call	puts
-.L83:
-	xorl	%eax, %eax
-	xorl	%edi, %edi
-	movl	%eax, 88(%esp)
+.L87:
 	xorl	%eax, %eax
-	xorl	%esi, %esi
+	xorl	%ebp, %ebp
 	movl	%eax, 92(%esp)
 	xorl	%eax, %eax
-	xorl	%ebp, %ebp
+	xorl	%edi, %edi
 	movl	%eax, 96(%esp)
 	xorl	%eax, %eax
+	xorl	%esi, %esi
 	movl	%eax, 100(%esp)
 	xorl	%eax, %eax
 	movl	%eax, 104(%esp)
 	xorl	%eax, %eax
 	movl	%eax, 108(%esp)
-	movl	%edi, 112(%esp)
-	movl	%esi, 116(%esp)
-	.p2align 4,,15
-.L84:
+	xorl	%eax, %eax
+	movl	%eax, 112(%esp)
+	movl	%ebp, 116(%esp)
+	movl	%edi, 120(%esp)
+	movl	%esi, 124(%esp)
+.L88:
 #APP
 	movl $0, %eax
 	cpuid
 	rdtsc
 
 #NO_APP
 	movl	%eax, 56(%esp)
 	movl	%edx, 60(%esp)
 #APP
 	movl $0, %eax
 	cpuid
 	rdtsc
 
 #NO_APP
 	movl	%eax, %esi
 	movl	%edx, %edi
 	subl	56(%esp), %esi
 	sbbl	60(%esp), %edi
 	cmpl	$0, %edi
 	ja	.L66
 	cmpl	$999, %esi
-	jbe	.L84
+	jbe	.L88
 .L66:
+	movl	92(%esp), %edx
+	leal	(%edx,%edx,2), %eax
+	movl	cases+4(,%eax,4), %edi
+	movl	cases(,%eax,4), %esi
+	movl	%edi, %edx
+	movl	%esi, %eax
+	call	ncubic
 #APP
 	movl $0, %eax
 	cpuid
 	rdtsc
 
 #NO_APP
-	movl	%eax, %esi
-	movl	%edx, %edi
+	movl	$666, %ebx
+	movl	%eax, 128(%esp)
+	movl	%edx, 132(%esp)
+	.p2align 4,,15
+.L67:
+	movl	%esi, %eax
+	movl	%edi, %edx
+	call	ncubic
+	decl	%ebx
+	movl	%eax, %ebp
+	jne	.L67
 #APP
 	movl $0, %eax
 	cpuid
 	rdtsc
 
 #NO_APP
-	subl	%esi, %eax
+	subl	128(%esp), %eax
 	movl	$666, %ebx
-	sbbl	%edi, %edx
-	xorl	%ecx, %ecx
 	movl	%ebx, 8(%esp)
+	sbbl	132(%esp), %edx
+	xorl	%ecx, %ecx
 	movl	%ecx, 12(%esp)
 	movl	%eax, (%esp)
 	movl	%edx, 4(%esp)
 	call	__udivdi3
-	addl	%eax, 104(%esp)
+	addl	%eax, 112(%esp)
 	movl	%edx, %ecx
 	movl	%eax, %ebx
 	movl	%edx, %esi
-	adcl	%edx, 108(%esp)
+	adcl	%edx, 116(%esp)
 	imull	%eax, %ecx
 	mull	%ebx
 	addl	%ecx, %ecx
 	movl	%eax, 56(%esp)
 	addl	%ecx, %edx
 	movl	56(%esp), %eax
-	addl	%eax, 112(%esp)
+	addl	%eax, 120(%esp)
 	movl	%edx, 60(%esp)
 	movl	60(%esp), %edx
-	adcl	%edx, 116(%esp)
-	cmpl	%esi, 92(%esp)
-	ja	.L67
-	jb	.L68
-	cmpl	%ebx, 88(%esp)
-	jae	.L67
-.L68:
-	movl	%ebx, 88(%esp)
-	movl	%esi, 92(%esp)
-.L67:
-	leal	(%ebp,%ebp,2), %ebx
-	sall	$2, %ebx
-	movl	cases+4(%ebx), %edx
-	movl	cases(%ebx), %eax
-	call	ncubic
-	movl	cases+8(%ebx), %edx
-	subl	%eax, %edx
-	movl	%edx, %eax
-	sarl	$31, %eax
-	xorl	%eax, %edx
-	subl	%eax, %edx
-	movl	%edx, %ecx
-	sarl	$31, %ecx
-	addl	%edx, 96(%esp)
-	adcl	%ecx, 100(%esp)
-	incl	%ebp
-	cmpl	$183, %ebp
-	jbe	.L84
-	movl	108(%esp), %eax
-	fildll	104(%esp)
-	testl	%eax, %eax
-	js	.L85
+	adcl	%edx, 124(%esp)
+	cmpl	%esi, 100(%esp)
+	ja	.L69
+	jb	.L70
+	cmpl	%ebx, 96(%esp)
+	jae	.L69
 .L70:
-	fstpl	120(%esp)
+	movl	%ebx, 96(%esp)
+	movl	%esi, 100(%esp)
+.L69:
+	movl	92(%esp), %edx
+	leal	(%edx,%edx,2), %eax
+	movl	cases+8(,%eax,4), %eax
+	subl	%ebp, %eax
+	movl	%eax, %ecx
+	sarl	$31, %ecx
+	xorl	%ecx, %eax
+	subl	%ecx, %eax
+	cltd
+	addl	%eax, 104(%esp)
+	adcl	%edx, 108(%esp)
+	incl	92(%esp)
+	cmpl	$183, 92(%esp)
+	jbe	.L88
 	movl	116(%esp), %eax
-	fldl	120(%esp)
+	fildll	112(%esp)
+	testl	%eax, %eax
+	js	.L89
+.L72:
+	fstpl	136(%esp)
+	movl	124(%esp), %eax
+	fldl	136(%esp)
 	fdivl	.LC7
 	testl	%eax, %eax
 	flds	.LC4
 	fdivr	%st, %st(1)
-	fildll	112(%esp)
-	js	.L86
-.L71:
-	fstpl	120(%esp)
-	fldl	120(%esp)
+	fildll	120(%esp)
+	js	.L90
+.L73:
+	fstpl	136(%esp)
+	fldl	136(%esp)
 	fdivl	.LC7
 	fdivp	%st, %st(1)
 	fld	%st(1)
 	fmul	%st(2), %st
 	fsubrp	%st, %st(1)
 	fld	%st(0)
 	fsqrt
 	fucomi	%st(0), %st
-	jp	.L88
-	je	.L89
-.L88:
+	jp	.L92
+	je	.L93
+.L92:
 	fstp	%st(0)
 	fstpl	(%esp)
 	fstpl	64(%esp)
 	call	sqrt
 	fldl	64(%esp)
 	fxch	%st(1)
-.L72:
-	movl	96(%esp), %eax
-	movl	100(%esp), %edx
-	fildll	88(%esp)
+.L74:
+	movl	104(%esp), %eax
+	movl	108(%esp), %edx
+	fildll	96(%esp)
 	movl	%eax, 40(%esp)
-	movl	92(%esp), %eax
+	movl	100(%esp), %eax
 	movl	%edx, 44(%esp)
 	testl	%eax, %eax
-	js	.L87
-.L73:
-	fstpl	120(%esp)
-	movl	104(%esp), %eax
+	js	.L91
+.L75:
+	fstpl	136(%esp)
+	movl	112(%esp), %eax
 	movl	$184, %ebp
-	fldl	120(%esp)
+	fldl	136(%esp)
 	xorl	%edi, %edi
 	movl	$.LC5, %esi
 	fdivl	.LC7
-	movl	108(%esp), %edx
+	movl	116(%esp), %edx
 	movl	%ebp, 8(%esp)
 	movl	%edi, 12(%esp)
 	movl	%eax, (%esp)
 	movl	%edx, 4(%esp)
 	fstpl	32(%esp)
 	fstpl	24(%esp)
 	fstpl	16(%esp)
 	call	__udivdi3
 	movl	%esi, 4(%esp)
 	movl	$.LC6, (%esp)
 	movl	%eax, 8(%esp)
 	movl	%edx, 12(%esp)
 	call	printf
-	addl	$136, %esp
+	addl	$152, %esp
 	xorl	%eax, %eax
 	popl	%ecx
 	popl	%ebx
 	popl	%esi
 	popl	%edi
 	popl	%ebp
 	leal	-4(%ecx), %esp
 	ret
-.L89:
+.L93:
 	fstp	%st(1)
+	jmp	.L74
+.L89:
+	fadds	.LC2
 	jmp	.L72
-.L85:
+.L91:
 	fadds	.LC2
-	jmp	.L70
-.L87:
+	jmp	.L75
+.L90:
 	fadds	.LC2
 	jmp	.L73
-.L86:
-	fadds	.LC2
-	jmp	.L71
 	.size	main, .-main
 	.section	.rodata
 	.align 32
 	.type	cases, @object
 	.size	cases, 2208
 cases:
...


-- 

[-- Attachment #2: cbrt-test.c --]
[-- Type: text/plain, Size: 11463 bytes --]

/*

Don't count the existing Newton-Raphson out. It turns out that to get enough
precision for 32 bits, only 4 iterations are needed. By unrolling those, it
gets much better timing.

Slightly gross test program (with original cubic wraparound bug fixed).

---

*/

/* Test and measure perf of cube root algorithms.  */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>

#define MEMCLOBBER 1

/**
 * ffs - find first bit set
 * @x: the word to search
 *
 * This is defined the same way as
 * the libc and compiler builtin ffs routines, therefore
 * differs in spirit from the above ffz (man ffs).
 */
static inline int ffs(int x)
{
	int r;

	/* bsfl finds the lowest set bit; for x == 0 bsf sets ZF and
	 * cmovz substitutes -1, making the function return 0 */
	__asm__("bsfl %1,%0\n\t"
		"cmovzl %2,%0" 
#if MEMCLOBBER
		/* "memory" clobber: reported in this thread to work
		 * around gcc-4.1.2 dropping code around the asm */
		: "=&r" (r) : "rm" (x), "r" (-1) : "memory");
#else
		: "=&r" (r) : "rm" (x), "r" (-1));
#endif
	return r+1;
}

/**
 * fls - find last bit set
 * @x: the word to search
 *
 * This is defined the same way as ffs.
 */
static inline int fls(int x)
{
	int r;

	/* bsrl finds the highest set bit; for x == 0 bsr sets ZF and
	 * cmovz substitutes -1, making the function return 0 */
	__asm__("bsrl %1,%0\n\t"
		"cmovzl %2,%0"
#if MEMCLOBBER
		/* "memory" clobber: reported in this thread to work
		 * around gcc-4.1.2 dropping code around the asm */
		: "=&r" (r) : "rm" (x), "r" (-1) : "memory" );
#else
		: "=&r" (r) : "rm" (x), "r" (-1));
#endif
	return r+1;
}

#ifdef __x86_64

/* Read the 64-bit TSC on x86-64: rdtsc leaves the low/high halves in
 * eax/edx, which are recombined into val. */
#define rdtscll(val) do { \
     unsigned int __a,__d; \
     asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \
     (val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \
} while(0)

/* do_div(n, base): divide the 64-bit lvalue n by the 32-bit base in
 * place (quotient left in n), evaluating to the remainder.  On a
 * 64-bit host plain C division is sufficient. */
# define do_div(n,base) ({					\
	uint32_t __base = (base);				\
	uint32_t __rem;						\
	__rem = ((uint64_t)(n)) % __base;			\
	(n) = ((uint64_t)(n)) / __base;				\
	__rem;							\
 })


/**
 * __ffs - find first bit in word.
 * @word: The word to search
 *
 * Undefined if no bit exists, so code should check against 0 first.
 */
static inline unsigned long __ffs(unsigned long word)
{
	__asm__("bsfq %1,%0"	/* index of the lowest set bit (0-based) */
		:"=r" (word)
		:"rm" (word));
	return word;
}

/*
 * __fls: find last bit set.
 * @word: The word to search
 *
 * Undefined if word is 0 (bsr does not write the output then), so code
 * should check against 0 first.
 */
static inline unsigned long __fls(unsigned long word)
{
	__asm__("bsrq %1,%0"	/* index of the highest set bit (0-based) */
		:"=r" (word)
		:"rm" (word));
	return word;
}

/**
 * fls64 - find last bit set in 64 bit word
 * @x: the word to search
 *
 * This is defined the same way as fls: returns 1..64, or 0 for x == 0.
 */
static inline int fls64(uint64_t x)
{
	return x ? (int)__fls(x) + 1 : 0;
}

/* div64_64 - full 64/64-bit unsigned division; trivial on 64-bit hosts. */
static inline uint64_t div64_64(uint64_t dividend, uint64_t divisor)
{
	uint64_t quotient = dividend / divisor;

	return quotient;
}

#elif __i386

/* Serialized TSC read: the leading cpuid acts as a barrier so the code
 * being measured cannot be reordered across the timestamp.  cpuid
 * clobbers ebx/ecx (eax/edx are covered by the "A" output). */
#define rdtscll_start(val) \
     __asm__ __volatile__("movl $0, %%eax\n\tcpuid\n\trdtsc\n" : "=A" (val) : : "ebx", "ecx")

/* Plain, unserialized TSC read into the 64-bit lvalue val. */
#define rdtscll(val) \
     __asm__ __volatile__("rdtsc" : "=A" (val))

/* fls64 - most significant set bit of a 64-bit value (1..64), or 0 for
 * x == 0; built from the 32-bit fls() on i386. */
static inline int fls64(uint64_t x)
{
	uint32_t hi = x >> 32;

	if (hi != 0)
		return 32 + fls(hi);
	return fls((uint32_t)x);
}


/*
 * do_div(n, base) - in-place 64-by-32 unsigned division for i386.
 *
 * Divides the 64-bit lvalue n by the 32-bit base, stores the quotient
 * back into n, and evaluates to the 32-bit remainder.
 *
 * The empty asm statements with the "A" constraint only split/join n
 * between EAX (low half) and EDX (high half) without emitting code.
 * The high word is reduced modulo base in C first so the single "divl"
 * (EDX:EAX / r/m32) cannot fault on quotient overflow.
 */
#define do_div(n,base) ({ \
	unsigned long __upper, __low, __high, __mod, __base; \
	__base = (base); \
	asm("":"=a" (__low), "=d" (__high):"A" (n)); \
	__upper = __high; \
	if (__high) { \
		__upper = __high % (__base); \
		__high = __high / (__base); \
	} \
	asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), "1" (__upper)); \
	asm("":"=A" (n):"a" (__low),"d" (__high)); \
	__mod; \
})


/* 64bit divisor, dividend and result. dynamic precision */
/*
 * div64_64 - 64/64-bit unsigned division for 32-bit hosts.
 *
 * Exact when the divisor fits in 32 bits.  When it does not, both
 * operands are shifted right until the divisor fits; the divisor bits
 * discarded by the shift make the quotient an approximation (it can
 * differ slightly from the true quotient in that case).
 */
static uint64_t div64_64(uint64_t dividend, uint64_t divisor)
{
	uint32_t d = divisor;	/* low 32 bits; the exact divisor when it fits */

	if (divisor > 0xffffffffULL) {
		/* number of significant divisor bits above bit 31 */
		unsigned int shift = fls(divisor >> 32);

		/* scale both operands down so d fits in 32 bits;
		 * this is where precision is lost */
		d = divisor >> shift;
		dividend >>= shift;
	}

	/* avoid 64 bit division if possible */
	if (dividend >> 32)
		do_div(dividend, d);	/* 64/32 divide, quotient left in dividend */
	else
		dividend = (uint32_t) dividend / d;

	return dividend;
}
#endif

/* Andi Kleen's version */
/*
 * acbrt - digit-by-digit (shift-and-subtract) integer cube root.
 * Returns floor(cbrt(x)) for any 64-bit x.
 *
 * Fix: the original computed 3 * y * (y+1) + 1 entirely in 32-bit
 * arithmetic; once y exceeds ~37837 that product wraps, so inputs with
 * roots wider than ~16 bits produced wrong results (e.g. it returned
 * 49153 for 49152^3 + 2952937473).  The multiply is now widened to
 * 64 bits before it can wrap.
 */
uint32_t acbrt(uint64_t x)
{
	uint32_t y = 0;
	int s;

	for (s = 63; s >= 0; s -= 3) {
		uint64_t b, bs;

		y = 2 * y;
		/* b = (y+1)^3 - y^3: cost of setting this root bit.
		 * Widen before multiplying so it cannot wrap in 32 bits. */
		b = 3 * (uint64_t)y * (y + 1) + 1;
		bs = b << s;
		/* b << s may overflow 64 bits; the shift-back compare
		 * rejects that case (the true value would exceed x anyway). */
		if (x >= bs && (b == (bs >> s))) {
			x -= bs;
			y++;
		}
	}
	return y;
}

/* My version of hacker's delight */
/*
 * hcbrt - digit-by-digit integer cube root, top bit-group at bit 60.
 *
 * Fix: the original evaluated 3*y*(y + 1) + 1 in 32-bit arithmetic and
 * cast to 64 bits only afterwards, so the product wrapped once y
 * exceeded ~37837 and large roots came out wrong (e.g. it returned
 * 49153 for 49152^3 + 2952937473).  The multiply is now widened.
 *
 * NOTE(review): with s starting at 60 the top group covers bits 60-62
 * only, so this looks exact for x < 2^63 but presumably not when bit 63
 * is set -- confirm before using on the full range (acbrt() starts at
 * s = 63 with an overflow guard and covers all 64 bits).
 */
uint32_t hcbrt(uint64_t x)
{
	int s = 60;
	uint32_t y = 0;

	do {
		uint64_t b;
		y = 2*y;
		/* (y+1)^3 - y^3, widened so the multiply cannot wrap */
		b = (3 * (uint64_t)y * (y + 1) + 1) << s;
		s = s - 3;
		if (x >= b) {
			x = x - b;
			y = y + 1;
		}
	} while(s >= 0);

	return y;
}

/* calculate the cubic root of x using Newton-Raphson */
/*
 * ocubic - iterate until two successive estimates differ by at most 1.
 * Seeded with a power of two near the root, so convergence is fast.
 */
static uint32_t ocubic(uint64_t a)
{
	uint32_t x, x1;

	/* Initial estimate is based on:
	 * cbrt(x) = exp(log(x) / 3)
	 */
	x = 1u << (fls64(a)/3);

	/*
	 * Iteration based on:
	 *                         2
	 * x    = ( 2 * x  +  a / x  ) / 3
	 *  k+1          k         k
	 */
	do {
		uint64_t x2;

		/* square in 64 bits: x can exceed 16 bits, so a 32-bit
		 * x*x would overflow */
		x2 = x;
		x2 *= x;
		x1 = x;

		x = (2 * x + div64_64(a, x2)) / 3;
	} while (abs(x1 - x) > 1);
	/* NOTE(review): x1 - x is an unsigned difference converted to int
	 * for abs(); intended as |x1 - x| <= 1 termination -- confirm it
	 * terminates for all inputs (Newton steps can oscillate by 1). */

	return x;
}

/* calculate the cubic root of x using Newton-Raphson */
static uint32_t ncubic(uint64_t a)
{
	uint64_t r;

	/* Seed with a power of two near cbrt(a) = exp(log(a) / 3). */
	r = 1u << (fls64(a)/3);

	/* Three unrolled Newton steps, r' = (2r + a/r^2) / 3; quadratic
	 * convergence gives more than 32 correct bits by the third. */
	r = (2 * r + div64_64(a, r * r)) / 3;
	r = (2 * r + div64_64(a, r * r)) / 3;
	r = (2 * r + div64_64(a, r * r)) / 3;

	return r;
}

static const struct cbrt {
	uint64_t	in;
	uint32_t	result;
} cases[] = {
	{1, 1}, {2, 1}, {3, 1}, {4, 1}, {5, 1}, {6, 1}, {7, 1}, 
	{8, 2}, {9, 2}, {10, 2}, {11, 2}, {12, 2}, {13, 2}, {14, 2},
	{15, 2}, {16, 2}, {17, 2}, {18, 2}, {19, 2}, {20, 2}, {21, 2}, {22, 2},
	{23, 2}, {24, 2}, {25, 2}, {26, 2}, {27, 3}, {28, 3}, {29, 3}, {30, 3},
	{31, 3}, {32, 3}, {33, 3}, {34, 3}, {35, 3}, {36, 3}, {37, 3}, {38, 3},
	{39, 3}, {40, 3}, {99, 4}, {100, 4}, {101, 4}, 
	{ 125ull, 5 }, { 216ull, 6 },  { 343ull, 7 }, { 512ull, 8 }, 
	{ 1000ull, 10 },  { 1331ull, 11 },
	{ 8000ull, 20 },  { 9261ull, 21 },
	{32767, 31},	 {32768, 32}, {32769, 32}, 
	{ 64000ull, 40 },  { 68921ull, 41 },
	{ 512000ull, 80 },  { 531441ull, 81 },
	{ 1000000ull, 100 },  { 1030301ull, 101 },
	{ 4096000ull, 160 },  { 4173281ull, 161 },
	{ 16387064ull, 254 },  { 16581375ull, 255 },
	{ 16777216ull, 256 },  { 16974593ull, 257 },
	{ 131096512ull, 508 },  { 131872229ull, 509 },
	{ 132651000ull, 510 },  { 133432831ull, 511 },
	{ 134217728ull, 512 },  { 135005697ull, 513 },
	{ 1000000000ull, 1000 },  { 1003003001ull, 1001 },
	{ 1006012008ull, 1002 },  { 1009027027ull, 1003 },
	{ 1061208000ull, 1020 },  { 1064332261ull, 1021 },
	{ 1067462648ull, 1022 },  { 1070599167ull, 1023 },
	
	{1073741823, 1023}, {1073741824, 1024}, {1073741825, 1024},
	{~0, 2642245},
/* 100 random values */
 { 7749363893351949254ull, 1978891},  { 7222815480849057907ull, 1933016},
 { 8408462745175416063ull, 2033475},  { 3091884191388096748ull, 1456826},
 { 2562019500164152525ull, 1368340},  { 4403210617922443179ull, 1639041},
 { 3364542905362882299ull, 1498449},  { 8782769017716072774ull, 2063211},
 { 5863405773976003266ull, 1803225},  { 1306053050111174648ull, 1093084},
 { 150346236956174824ull, 531737},  { 1265737889039205261ull, 1081719},
 { 1445109530774087002ull, 1130577},  { 1197105577171186275ull, 1061803},
 { 9213452462461015967ull, 2096399},  { 4730966302945445786ull, 1678739},
 { 5650605098630667570ull, 1781141},  { 5880381756353009591ull, 1804963},
 { 4552499520046621784ull, 1657359},  { 2697991130065918298ull, 1392131},
 { 4858364911220984157ull, 1693674},  { 3691457481531040535ull, 1545489},
 { 2613117305472506601ull, 1377377},  { 7449943749836318932ull, 1953069},
 { 643378865959570610ull, 863287},  { 4851450802679832774ull, 1692871},
 { 1772859812839988916ull, 1210295},  { 8210946489571640849ull, 2017426},
 { 591875965497384322ull, 839608},  { 4221553402965100097ull, 1616183},
 { 2197744667347238205ull, 1300146},  { 8321400714356781191ull, 2026432},
 { 2459557415995497961ull, 1349850},  { 3460673533926954145ull, 1512586},
 { 4727304344741345505ull, 1678306},  { 4903203917250634599ull, 1698869},
 { 4036494370831490817ull, 1592214},  { 8585205035691420311ull, 2047624},
 { 2622143824319236828ull, 1378961},  { 5902762718897731478ull, 1807250},
 { 6344401509618197560ull, 1851243},  { 4059247793194552874ull, 1595200},
 { 7648030174294342832ull, 1970228},  { 2111858627070002939ull, 1282985},
 { 3231502273651985583ull, 1478432},  { 8821862535190318932ull, 2066268},
 { 6062559696943389464ull, 1823414},  { 4054224670122353756ull, 1594541},
 { 3674929609692563482ull, 1543179},  { 6310802012126231363ull, 1847969},
 { 4450190829039920890ull, 1644849},  { 8764531173541462842ull, 2061782},
 { 1361923252301505833ull, 1108453},  { 5912924843615600614ull, 1808287},
 { 5714768882048811324ull, 1787857},  { 7249589769047033748ull, 1935401},
 { 4123157012528828376ull, 1603528},  { 1729687638268160097ull, 1200390},
 { 5132287771298228729ull, 1724925},  { 1564349257200314043ull, 1160854},
 { 951586254223522969ull, 983594},  { 4569664949094662293ull, 1659439},
 { 9082730968228181483ull, 2086437},  { 6312891027251024051ull, 1848173},
 { 6915415788559031791ull, 1905194},  { 2713150456497618688ull, 1394733},
 { 5390954890749602465ull, 1753430},  { 1405547745908296421ull, 1120164},
 { 1157301728707637259ull, 1049902},  { 1513573187112042448ull, 1148156},
 { 687416080475161159ull, 882551},  { 484496930861389501ull, 785411},
 { 1625256440396143907ull, 1175729},  { 7358388240824901288ull, 1945035},
 { 6055730836615196283ull, 1822729},  { 5897962221937294789ull, 1806760},
 { 862205218853780339ull, 951780},  { 4798091009445823173ull, 1686641},
 { 644772714391937867ull, 863910},  { 4255852691293155171ull, 1620549},
 { 5287931004512034672ull, 1742188},  { 479051048987854372ull, 782457},
 { 9223312736680112286ull, 2097147},  { 8208392001457969628ull, 2017217},
 { 9203071384420047828ull, 2095612},  { 8029313043584389618ull, 2002439},
 { 38384068872053008ull, 337326},  { 5477688516749455419ull, 1762784},
 { 1504622508868036557ull, 1145888},  { 8421184723110053200ull, 2034500},
 { 3312070181890020423ull, 1490618},  { 5344298403762143580ull, 1748357},
 { 6340030040222269807ull, 1850818},  { 4895839553118470425ull, 1698018},
 { 2806627376195262363ull, 1410570},  { 5321619225005368821ull, 1745880},
 { 6897323351052656353ull, 1903532},  { 326700202259382556ull, 688731},
 { 7685269066741890339ull, 1973420},  { 8054506481558450217ull, 2004531},
};
#define NCASES (sizeof(cases)/sizeof(cases[0]))


static double ticks_per_usec = 2979.0;	/* TSC ticks per microsecond; hard-coded here, see the disabled calibration (#if 0) in main() */

/*
 * show - print one benchmark row: average clocks per call, mean/max/std
 * in microseconds, and the total absolute error against the reference
 * results, in the column order of the header printed by main().
 *
 * Fixes: (1) the variance divided the squared-ticks sum sq by
 * ticks_per_usec only once, mixing units; it is now divided by
 * ticks_per_usec^2 so the result is in us^2.  (2) std and max were
 * printed in the opposite order from the header
 * "clocks mean(us) max(us) std(us)".
 */
static void show(const char *func, uint64_t sum, uint64_t sq,
		 unsigned long long mx, unsigned long long err)
{
	double mean, std;

	mean = (double) sum / ticks_per_usec / NCASES;
	/* E[t^2] - E[t]^2, with t^2 converted by ticks_per_usec^2 */
	std = sqrtl((double) sq / (ticks_per_usec * ticks_per_usec) / NCASES
		    - mean * mean);

	printf("%-10s %8llu %8.3f %8.3f %8.3f %11llu\n", func,
	       (unsigned long long) sum / NCASES, mean,
	       (double) mx / ticks_per_usec, std, err);
}


/*
 * Run the selected cube-root implementations over the fixed test
 * vector, timing each with the serialized TSC reads, and print one
 * summary row per function via show().
 */
int main(int argc, char **argv)
{
	int i;
	uint32_t c;
	unsigned long long start, end, t, mx;
	unsigned long long err, sum, sum_sq, cerr;
	int loops;

	/* calibration disabled; ticks_per_usec is hard-coded above */
#if 0
	rdtscll(start);
	sleep(2);
	rdtscll(end);
	ticks_per_usec = (double) (end - start) / 2000000.;
#endif

	printf("Function     clocks mean(us)  max(us)  std(us) total error\n");

	/* Benchmark one function: sanity-check func(27), spin until two
	 * back-to-back serialized TSC reads are close (quiet window),
	 * then time 666 calls per test case, accumulating sum/sum_sq/max
	 * and the absolute error against the expected results table. */
#define DOTEST(func) \
	if (func(27) != 3) printf("ouch\n"); \
	sum = sum_sq = mx = 0; \
	for (err = 0, i = 0; i < NCASES; i++) { \
		do { \
			rdtscll_start(start); \
			rdtscll_start(end); \
		} while ((end - start) < 1000); \
		c = func(cases[i].in); \
		loops = 667; \
		rdtscll_start(start);		\
		while (--loops > 0) c = func(cases[i].in); \
		rdtscll_start(end); \
		t = (end - start) / 666; sum += t; sum_sq += t*t; \
		if (t > mx) mx = t; \
		cerr = abs((long) cases[i].result - c); \
		err += cerr; \
	} \
	show(#func, sum, sum_sq, mx, err);

	//DOTEST(ocubic);
	DOTEST(ncubic);
	//DOTEST(acbrt);
	//DOTEST(hcbrt);
	return 0;
}


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH] tcp_cubic: faster cube root
  2007-03-07 12:12                         ` Andi Kleen
@ 2007-03-07 19:33                           ` David Miller
  0 siblings, 0 replies; 62+ messages in thread
From: David Miller @ 2007-03-07 19:33 UTC (permalink / raw)
  To: andi; +Cc: shemminger, rkuhn, dada1, jengelh, linux-kernel, netdev

From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 7 Mar 2007 13:12:46 +0100

> Well that still needs the ugly div64_64 function. At least my goal was to 
> eliminate that, not make it faster (I don't see any evidence this function 
> is particularly performance critical). You prefer to keep div64_64? 

I really have no issues with it.

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [PATCH] tcp_cubic: use 32 bit math
  2007-03-07  6:08                         ` Update to " Willy Tarreau
@ 2007-03-08  1:07                           ` Stephen Hemminger
  2007-03-08  2:55                             ` David Miller
  0 siblings, 1 reply; 62+ messages in thread
From: Stephen Hemminger @ 2007-03-08  1:07 UTC (permalink / raw)
  To: Willy Tarreau, David Miller
  Cc: rkuhn, andi, dada1, jengelh, linux-kernel, netdev

The basic calculation has to be done in 32 bits to avoid
doing 64 bit divide by 3. The value x is only 22bits max
so only need full 64 bits only for x^2.

Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>

---
 net/ipv4/tcp_cubic.c |    8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

--- net-2.6.22.orig/net/ipv4/tcp_cubic.c	2007-03-07 15:51:37.000000000 -0800
+++ net-2.6.22/net/ipv4/tcp_cubic.c	2007-03-07 17:06:02.000000000 -0800
@@ -96,7 +96,7 @@
  */
 static u32 cubic_root(u64 a)
 {
-	u64 x;
+	u32 x;
 
 	/* Initial estimate is based on:
 	 * cbrt(x) = exp(log(x) / 3)
@@ -104,9 +104,9 @@
 	x = 1u << (fls64(a)/3);
 
 	/* converges to 32 bits in 3 iterations */
-	x = (2 * x + div64_64(a, x*x)) / 3;
-	x = (2 * x + div64_64(a, x*x)) / 3;
-	x = (2 * x + div64_64(a, x*x)) / 3;
+	x = (2 * x + (u32)div64_64(a, (u64)x*(u64)x)) / 3;
+	x = (2 * x + (u32)div64_64(a, (u64)x*(u64)x)) / 3;
+	x = (2 * x + (u32)div64_64(a, (u64)x*(u64)x)) / 3;
 
 	return x;
 }

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH] tcp_cubic: use 32 bit math
  2007-03-08  1:07                           ` [PATCH] tcp_cubic: use 32 bit math Stephen Hemminger
@ 2007-03-08  2:55                             ` David Miller
  2007-03-08  3:10                               ` Stephen Hemminger
  0 siblings, 1 reply; 62+ messages in thread
From: David Miller @ 2007-03-08  2:55 UTC (permalink / raw)
  To: shemminger; +Cc: w, rkuhn, andi, dada1, jengelh, linux-kernel, netdev

From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Wed, 7 Mar 2007 17:07:31 -0800

> The basic calculation has to be done in 32 bits to avoid
> doing 64 bit divide by 3. The value x is only 22bits max
> so only need full 64 bits only for x^2.
> 
> Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>

Applied, thanks Stephen.

What about Willy Tarreau's supposedly even faster variant?
Or does this incorporate that set of improvements?

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH] tcp_cubic: use 32 bit math
  2007-03-08  2:55                             ` David Miller
@ 2007-03-08  3:10                               ` Stephen Hemminger
  2007-03-08  3:51                                 ` David Miller
  2007-03-08  4:16                                 ` [PATCH] tcp_cubic: use 32 bit math Willy Tarreau
  0 siblings, 2 replies; 62+ messages in thread
From: Stephen Hemminger @ 2007-03-08  3:10 UTC (permalink / raw)
  To: David Miller; +Cc: w, rkuhn, andi, dada1, jengelh, linux-kernel, netdev

David Miller wrote:
> From: Stephen Hemminger <shemminger@linux-foundation.org>
> Date: Wed, 7 Mar 2007 17:07:31 -0800
>
>   
>> The basic calculation has to be done in 32 bits to avoid
>> doing 64 bit divide by 3. The value x is only 22bits max
>> so only need full 64 bits only for x^2.
>>
>> Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>
>>     
>
> Applied, thanks Stephen.
>
> What about Willy Tarreau's supposedly even faster variant?
> Or does this incorporate that set of improvements?
>   
That's what this is:
    x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x*(uint64_t)x)) / 3;


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH] tcp_cubic: use 32 bit math
  2007-03-08  3:10                               ` Stephen Hemminger
@ 2007-03-08  3:51                                 ` David Miller
  2007-03-10 11:48                                   ` Willy Tarreau
  2007-03-08  4:16                                 ` [PATCH] tcp_cubic: use 32 bit math Willy Tarreau
  1 sibling, 1 reply; 62+ messages in thread
From: David Miller @ 2007-03-08  3:51 UTC (permalink / raw)
  To: shemminger; +Cc: w, rkuhn, andi, dada1, jengelh, linux-kernel, netdev

From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Wed, 07 Mar 2007 19:10:47 -0800

> David Miller wrote:
> > What about Willy Tarreau's supposedly even faster variant?
> > Or does this incorporate that set of improvements?
> >   
> That's what this is:
>     x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x*(uint64_t)x)) / 3;

Great, thanks for the clarification.

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH] tcp_cubic: use 32 bit math
  2007-03-08  3:10                               ` Stephen Hemminger
  2007-03-08  3:51                                 ` David Miller
@ 2007-03-08  4:16                                 ` Willy Tarreau
  1 sibling, 0 replies; 62+ messages in thread
From: Willy Tarreau @ 2007-03-08  4:16 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: David Miller, rkuhn, andi, dada1, jengelh, linux-kernel, netdev

On Wed, Mar 07, 2007 at 07:10:47PM -0800, Stephen Hemminger wrote:
> David Miller wrote:
> >From: Stephen Hemminger <shemminger@linux-foundation.org>
> >Date: Wed, 7 Mar 2007 17:07:31 -0800
> >
> >  
> >>The basic calculation has to be done in 32 bits to avoid
> >>doing 64 bit divide by 3. The value x is only 22bits max
> >>so only need full 64 bits only for x^2.
> >>
> >>Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>
> >>    
> >
> >Applied, thanks Stephen.
> >
> >What about Willy Tarreau's supposedly even faster variant?
> >Or does this incorporate that set of improvements?
> >  
> That's what this is:
>    x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x*(uint64_t)x)) / 3;

Confirmed, it's the same. BTW, has someone tested on a 64bit system if
it brings any difference ?

Willy


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: asm volatile [Was: [RFC] div64_64 support]
  2007-03-06 22:24                     ` Sami Farin
  2007-03-07  0:00                       ` Stephen Hemminger
  2007-03-07 16:11                       ` Chuck Ebbert
@ 2007-03-08 18:23                       ` Sami Farin
  2007-03-08 22:01                         ` asm volatile David Miller
  2 siblings, 1 reply; 62+ messages in thread
From: Sami Farin @ 2007-03-08 18:23 UTC (permalink / raw)
  To: linux-kernel, netdev

On Wed, Mar 07, 2007 at 00:24:35 +0200, Sami Farin wrote:
> On Tue, Mar 06, 2007 at 23:53:49 +0200, Sami Farin wrote:
> ...
> > And I found bug in gcc-4.1.2, it gave 0 for ncubic results
> > when doing 1000 loops test... gcc-4.0.3 works.
> 
> Found it.
> 
> --- cbrt-test.c~	2007-03-07 00:20:54.735248105 +0200
> +++ cbrt-test.c	2007-03-07 00:21:03.964864343 +0200
> @@ -209,7 +209,7 @@
>  
>  	__asm__("bsrl %1,%0\n\t"
>  		"cmovzl %2,%0"
> -		: "=&r" (r) : "rm" (x), "rm" (-1));
> +		: "=&r" (r) : "rm" (x), "rm" (-1) : "memory");
>  	return r+1;
>  }
>  
> Now Linux 2.6 does not have "memory" in fls, maybe it causes
> some gcc funnies some people are seeing.

It also works without "memory" if I do "__asm__ volatile".

Why some functions have volatile and some have not in include/asm-*/*.h ?

-- 

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: asm volatile
  2007-03-08 18:23                       ` asm volatile [Was: [RFC] div64_64 support] Sami Farin
@ 2007-03-08 22:01                         ` David Miller
  0 siblings, 0 replies; 62+ messages in thread
From: David Miller @ 2007-03-08 22:01 UTC (permalink / raw)
  To: 7atbggg02; +Cc: linux-kernel, netdev

From: Sami Farin <7atbggg02@sneakemail.com>
Date: Thu, 8 Mar 2007 20:23:57 +0200

> On Wed, Mar 07, 2007 at 00:24:35 +0200, Sami Farin wrote:
> > On Tue, Mar 06, 2007 at 23:53:49 +0200, Sami Farin wrote:
> > ...
> > > And I found bug in gcc-4.1.2, it gave 0 for ncubic results
> > > when doing 1000 loops test... gcc-4.0.3 works.
> > 
> > Found it.
> > 
> > --- cbrt-test.c~	2007-03-07 00:20:54.735248105 +0200
> > +++ cbrt-test.c	2007-03-07 00:21:03.964864343 +0200
> > @@ -209,7 +209,7 @@
> >  
> >  	__asm__("bsrl %1,%0\n\t"
> >  		"cmovzl %2,%0"
> > -		: "=&r" (r) : "rm" (x), "rm" (-1));
> > +		: "=&r" (r) : "rm" (x), "rm" (-1) : "memory");
> >  	return r+1;
> >  }
> >  
> > Now Linux 2.6 does not have "memory" in fls, maybe it causes
> > some gcc funnies some people are seeing.
> 
> It also works without "memory" if I do "__asm__ volatile".
> 
> Why some functions have volatile and some have not in include/asm-*/*.h ?

"volatile" is really only needed if there is some side effect
that cannot be expressed to gcc which makes ordering over
the asm wrt. other pieces of code important.

But in this case it should absolutely not be needed.  It's
simply computing an integer result from some inputs and
some values in memory.  GCC should see perfectly fine what
memory is read by the asm and therefore what ordering
constraints there are wrt. writes to the same memory location.

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH] tcp_cubic: use 32 bit math
  2007-03-08  3:51                                 ` David Miller
@ 2007-03-10 11:48                                   ` Willy Tarreau
  2007-03-12 21:11                                     ` Stephen Hemminger
  0 siblings, 1 reply; 62+ messages in thread
From: Willy Tarreau @ 2007-03-10 11:48 UTC (permalink / raw)
  To: David Miller
  Cc: shemminger, rkuhn, andi, dada1, jengelh, linux-kernel, netdev

On Wed, Mar 07, 2007 at 07:51:35PM -0800, David Miller wrote:
> From: Stephen Hemminger <shemminger@linux-foundation.org>
> Date: Wed, 07 Mar 2007 19:10:47 -0800
> 
> > David Miller wrote:
> > > What about Willy Tarreau's supposedly even faster variant?
> > > Or does this incorporate that set of improvements?
> > >   
> > That's what this is:
> >     x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x*(uint64_t)x)) / 3;
> 
> Great, thanks for the clarification.

Oh BTW, I have a newer version with a first approximation of the
cbrt() before the div64_64, which allows us to reduce from 3 div64
to only 2 div64. This results in a version which is twice as fast
as the initial one (ncubic), but with slightly less accuracy (0.286%
compared to 0.247). But I see that other functions such as hcbrt()
had a 1.5% avg error, so I think this is not dramatic.

Also, I managed to remove all other divides, to be kind with CPUs
having a slow divide instruction or no divide at all. Since we compute
on limited range (22 bits), we can multiply then shift right. It shows
me even slightly better time on pentium-m and athlon, with a slightly
higher avg error (0.297% compared to 0.286%), and slightly smaller
code.

I just have to clean experiments from my code to provide a patch.
David, Stephen, are you interested ?

$ ./bictcp
fls(0)=0, fls(1)=1, fls(256)=9
Calibrating
Function     clocks  mean(us) max(us)  std(us)  Avg error
bictcp          936     0.61    24.28     1.99 0.172%
ocubic          886     0.57    23.51     3.18 0.274%
ncubic          644     0.42    16.59     2.18 0.247%
ncubic32        444     0.29    11.47     1.50 0.247%
ncubic32_1      444     0.29    11.56     1.88 0.238%
ncubic32b3      337     0.22     8.67     0.88 0.286%
ncubic_ndiv3    329     0.21     8.46     0.69 0.297%
acbrt           707     0.46    18.05     0.80 0.275%
hcbrt           644     0.42    16.44     0.51 1.580%


Regards,
Willy


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH] tcp_cubic: use 32 bit math
  2007-03-10 11:48                                   ` Willy Tarreau
@ 2007-03-12 21:11                                     ` Stephen Hemminger
  2007-03-13 20:50                                       ` Willy Tarreau
  0 siblings, 1 reply; 62+ messages in thread
From: Stephen Hemminger @ 2007-03-12 21:11 UTC (permalink / raw)
  To: Willy Tarreau
  Cc: David Miller, rkuhn, andi, dada1, jengelh, linux-kernel, netdev

On Sat, 10 Mar 2007 12:48:26 +0100
Willy Tarreau <w@1wt.eu> wrote:

> On Wed, Mar 07, 2007 at 07:51:35PM -0800, David Miller wrote:
> > From: Stephen Hemminger <shemminger@linux-foundation.org>
> > Date: Wed, 07 Mar 2007 19:10:47 -0800
> > 
> > > David Miller wrote:
> > > > What about Willy Tarreau's supposedly even faster variant?
> > > > Or does this incorporate that set of improvements?
> > > >   
> > > That's what this is:
> > >     x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x*(uint64_t)x)) / 3;
> > 
> > Great, thanks for the clarification.
> 
> Oh BTW, I have a newer version with a first approximation of the
> cbrt() before the div64_64, which allows us to reduce from 3 div64
> to only 2 div64. This results in a version which is twice as fast
> as the initial one (ncubic), but with slightly less accuracy (0.286%
> compared to 0.247). But I see that other functions such as hcbrt()
> had a 1.5% avg error, so I think this is not dramatic.

Ignore my hcbrt() it was a less accurate version of andi's stuff.

> Also, I managed to remove all other divides, to be kind with CPUs
> having a slow divide instruction or no divide at all. Since we compute
> on limited range (22 bits), we can multiply then shift right. It shows
> me even slightly better time on pentium-m and athlon, with a slightly
> higher avg error (0.297% compared to 0.286%), and slightly smaller
> code.

What does the code look like?

 
> I just have to clean experiments from my code to provide a patch.
> David, Stephen, are you interested ?
> 
> $ ./bictcp
> fls(0)=0, fls(1)=1, fls(256)=9
> Calibrating
> Function     clocks  mean(us) max(us)  std(us)  Avg error
> bictcp          936     0.61    24.28     1.99 0.172%
> ocubic          886     0.57    23.51     3.18 0.274%
> ncubic          644     0.42    16.59     2.18 0.247%
> ncubic32        444     0.29    11.47     1.50 0.247%
> ncubic32_1      444     0.29    11.56     1.88 0.238%
> ncubic32b3      337     0.22     8.67     0.88 0.286%
> ncubic_ndiv3    329     0.21     8.46     0.69 0.297%
> acbrt           707     0.46    18.05     0.80 0.275%
> hcbrt           644     0.42    16.44     0.51 1.580%
> 
> 
> Regards,
> Willy
> 


-- 
Stephen Hemminger <shemminger@linux-foundation.org>

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH] tcp_cubic: use 32 bit math
  2007-03-12 21:11                                     ` Stephen Hemminger
@ 2007-03-13 20:50                                       ` Willy Tarreau
  2007-03-21 18:54                                         ` Stephen Hemminger
  0 siblings, 1 reply; 62+ messages in thread
From: Willy Tarreau @ 2007-03-13 20:50 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: David Miller, rkuhn, andi, dada1, jengelh, linux-kernel, netdev

Hi Stephen,

On Mon, Mar 12, 2007 at 02:11:56PM -0700, Stephen Hemminger wrote:
> > Oh BTW, I have a newer version with a first approximation of the
> > cbrt() before the div64_64, which allows us to reduce from 3 div64
> > to only 2 div64. This results in a version which is twice as fast
> > as the initial one (ncubic), but with slightly less accuracy (0.286%
> > compared to 0.247). But I see that other functions such as hcbrt()
> > had a 1.5% avg error, so I think this is not dramatic.
> 
> Ignore my hcbrt() it was a less accurate version of andi's stuff.

OK.

> > Also, I managed to remove all other divides, to be kind with CPUs
> > having a slow divide instruction or no divide at all. Since we compute
> > on limited range (22 bits), we can multiply then shift right. It shows
> > me even slightly better time on pentium-m and athlon, with a slightly
> > higher avg error (0.297% compared to 0.286%), and slightly smaller
> > code.
> 
> What does the code look like?

Well, I have cleaned it a little bit, there were more comments and ifdefs
than code ! I've appended it to the end of this mail.

I have changed it a bit, because I noticed that integer divide precision
was so coarse that there were other possibilities to play with the bits.

I have experimented with combinations of several methods :
  - replace integer divides with multiplies/shifts where possible.

  - compensation for divide imprecisions by adding/removing small
    values before/after them. Often, the integer result of 1/(x*(x-1))
    is closer to (float)1/(float)x^2 than 1/(x*x). This is because
    the divide always truncates the result.

  - use direct result lookup for small values. Small inputs give small
    outputs which have very few moving bits. Many different values fit
    in a 32bit integer, so we use a shift offset to lookup the value.
    I used this in an fls function I wrote a while ago, that I should
    also post because it is up to twice as fast as the kernel's.
    Sometimes it seems faster to lookup in from memory, sometimes it
    is faster from an immediate value. Maybe more visible differences
    would show up on RISC CPUs where loading 32 bits immediate needs
    two instructions. I don't know yet, I've not tested on my sparc
    yet.

  - use small lookup tables (64 bytes) with 6 bits inputs and at least
    as many on output. We only lookup the 6 MSB and return the 2-3 MSB
    of the result.

  - iterative search and manual refinment of the lookup tables for best
    accuracy. The avg error rate can easily be halved this way.

I have tried several function variants with 0, 1, 2 and 3 divides.
Several of them offer better accuracy over what we currently have, in
less cycles. Others offer faster results (up to 5 times) with slightly
less accuracy.

There is one function which is not to be used, but is just here for
comparison (ncubic_0div). It does no divide but has awful avg error.

But one which is interesting is the ncubic_tab0. It does not use any
divide at all, even not any div64. It shows a 0.6% avg error, which I'm
not sure is enough or not. It is 6.7 times faster than initial ncubic()
with less accuracy, and 4 times smaller. I suspect that it can differ
more on architectures which have no divide instruction.

If 0.6% avg error rate is too much, ncubic_tab1() uses one single div64
and is twice slower (still nearly 3 times faster than ncubic). It shows
0.195% avg error, which is better than initial ncubic. I think that it
is a good tradeoff.

If best accuracy is an absolute requirement, then I have a variation of
ncubic (ncubic_3div) which does 0.17% in 2/3 of the time (compared to
0.247%), and which is slightly smaller.

I have also added a "size" column, indicating approximative function
size, provided that the compiler does not reorder the code. On gcc 3.4,
it's OK, but 4.1 returns garbage. That does not matter, it's just a
rough estimate anyway.

Here are the results classed by speed :

/* Sample output on a Pentium-M 600 MHz :

Function          clocks mean(us)  max(us)  std(us) Avg err size
ncubic_tab0           79     0.66     7.20     1.04  0.613%  160
ncubic_0div           84     0.70     7.64     1.57  4.521%  192
ncubic_1div          178     1.48    16.27     1.81  0.443%  336
ncubic_tab1          179     1.49    16.34     1.85  0.195%  320
ncubic_ndiv3         263     2.18    24.04     3.59  0.250%  512
ncubic_2div          270     2.24    24.70     2.77  0.187%  512
ncubic32_1           359     2.98    32.81     3.59  0.238%  544
ncubic_3div          361     2.99    33.08     3.79  0.170%  656
ncubic32             364     3.02    33.29     3.51  0.247%  544
ncubic               529     4.39    48.39     4.92  0.247%  720
hcbrt                539     4.47    49.25     5.98  1.580%   96
ocubic               732     4.93    61.83     7.22  0.274%  320
acbrt                842     6.98    76.73     8.55  0.275%  192
bictcp              1032     6.95    86.30     9.04  0.172%  768

And now by avg error :

ncubic_3div          361     2.99    33.08     3.79  0.170%  656
bictcp              1032     6.95    86.30     9.04  0.172%  768
ncubic_2div          270     2.24    24.70     2.77  0.187%  512
ncubic_tab1          179     1.49    16.34     1.85  0.195%  320
ncubic32_1           359     2.98    32.81     3.59  0.238%  544
ncubic               529     4.39    48.39     4.92  0.247%  720
ncubic32             364     3.02    33.29     3.51  0.247%  544
ncubic_ndiv3         263     2.18    24.04     3.59  0.250%  512
ocubic               732     4.93    61.83     7.22  0.274%  320
acbrt                842     6.98    76.73     8.55  0.275%  192
ncubic_1div          178     1.48    16.27     1.81  0.443%  336
ncubic_tab0           79     0.66     7.20     1.04  0.613%  160
hcbrt                539     4.47    49.25     5.98  1.580%   96
ncubic_0div           84     0.70     7.64     1.57  4.521%  192


And here comes the code. I have tried to document it a bit, as much
as can be done on experimentation code. It is often easier to use
a pencil and paper to understand how the bits move.

Regards,
Willy



/*
Here is a better version of the benchmark code.
It has the original code used in 2.4 version of Cubic for comparison

-----------------------------------------------------------
*/
/* Test and measure perf of cube root algorithms.  */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>
#include <unistd.h>

#ifdef __x86_64

/* Read the CPU timestamp counter into val (x86-64: rdtsc fills
 * edx:eax, recombined into one 64-bit value).
 * NOTE(review): rdtsc is not a serializing instruction; good enough
 * for coarse benchmarking as done here. */
#define rdtscll(val) do { \
     unsigned int __a,__d; \
     asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \
     (val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \
} while(0)

/* Kernel-style do_div(): divides the 64-bit lvalue n in place by a
 * 32-bit base and evaluates to the 32-bit remainder.  On x86-64 the
 * native 64-bit divide is cheap, so plain C is used. */
# define do_div(n,base) ({					\
	uint32_t __base = (base);				\
	uint32_t __rem;						\
	__rem = ((uint64_t)(n)) % __base;			\
	(n) = ((uint64_t)(n)) / __base;				\
	__rem;							\
 })


/**
 * __ffs - find first bit in word.
 * @word: The word to search
 *
 * Returns the 0-based index of the least significant set bit,
 * using the 64-bit "bsfq" instruction.
 * Undefined if no bit exists, so code should check against 0 first.
 */
static __inline__ unsigned long __ffs(unsigned long word)
{
	__asm__("bsfq %1,%0"
		:"=r" (word)
		:"rm" (word));
	return word;
}

/*
 * __fls: find last bit set.
 * @word: The word to search
 *
 * Returns the 0-based index of the most significant set bit,
 * using the 64-bit "bsrq" instruction.
 * Undefined if no bit is set, so code should check against 0 first.
 */
static inline unsigned long __fls(unsigned long word)
{
	__asm__("bsrq %1,%0"
		:"=r" (word)
		:"rm" (word));
	return word;
}

/**
 * ffs - find first bit set
 * @x: the word to search
 *
 * This is defined the same way as
 * the libc and compiler builtin ffs routines, therefore
 * differs in spirit from the above ffz (man ffs).
 * Returns the 1-based index of the least significant set bit,
 * or 0 when x == 0.
 */
static __inline__ int ffs(int x)
{
	int r;

	/* "=&r" (earlyclobber) is required: bsfl writes %0 before the
	 * cmovzl source %2 is read, so %0 must not share a register with
	 * the -1 input.  The fls() below already uses "=&r". */
	__asm__("bsfl %1,%0\n\t"
		"cmovzl %2,%0" 
		: "=&r" (r) : "rm" (x), "r" (-1));
	return r+1;
}

/**
 * fls - find last bit set
 * @x: the word to search
 *
 * This is defined the same way as ffs.
 * Returns the 1-based index of the most significant set bit,
 * or 0 when x == 0.
 * The "=&r" earlyclobber is needed because bsrl writes %0 before
 * the cmovzl source operand is consumed.
 * NOTE(review): the thread discusses whether a "memory" clobber or
 * volatile is needed here under gcc-4.1.2 — confirm on the target
 * compiler.
 */
static inline int fls(int x)
{
	int r;

	__asm__("bsrl %1,%0\n\t"
		"cmovzl %2,%0"
		: "=&r" (r) : "rm" (x), "rm" (-1));
	return r+1;
}

/**
 * fls64 - find last bit set in 64 bit word
 * @x: the word to search
 *
 * Defined the same way as fls(): 1-based index of the most
 * significant set bit, or 0 when no bit is set.
 */
static inline int fls64(uint64_t x)
{
	return x ? (int)__fls(x) + 1 : 0;
}

/* Full 64/64-bit division; on 64-bit hardware this is simply the
 * native divide instruction. */
static inline uint64_t div64_64(uint64_t dividend, uint64_t divisor)
{
	uint64_t quotient = dividend / divisor;

	return quotient;
}

#elif __i386

/* Read the CPU timestamp counter into val (i386: rdtsc fills edx:eax,
 * which the "=A" constraint maps onto a 64-bit value directly). */
#define rdtscll(val) \
     __asm__ __volatile__("rdtsc" : "=A" (val))

/**
 * ffs - find first bit set
 * @x: the word to search
 *
 * This is defined the same way as
 * the libc and compiler builtin ffs routines, therefore
 * differs in spirit from the above ffz() (man ffs).
 * Returns the 1-based index of the least significant set bit,
 * or 0 when x == 0.
 * The jnz/movl sequence avoids cmov, so it also runs on pre-686 CPUs.
 */
static inline int ffs(int x)
{
	int r;

	__asm__("bsfl %1,%0\n\t"
		"jnz 1f\n\t"
		"movl $-1,%0\n"
		"1:" : "=r" (r) : "rm" (x));
	return r+1;
}

/**
 * fls - find last bit set
 * @x: the word to search
 *
 * This is defined the same way as ffs().
 * Returns the 1-based index of the most significant set bit,
 * or 0 when x == 0 (bsrl leaves ZF set, so %0 is loaded with -1).
 */
static inline int fls(int x)
{
	int r;

	__asm__("bsrl %1,%0\n\t"
		"jnz 1f\n\t"
		"movl $-1,%0\n"
		"1:" : "=r" (r) : "rm" (x));
	return r+1;
}

/* fls64 for 32-bit: check the high half first, fall back to the
 * low 32 bits.  Same 1-based convention as fls(). */
static inline int fls64(uint64_t x)
{
	uint32_t high = x >> 32;

	return high ? fls(high) + 32 : fls(x);
}


/* Kernel-style do_div() for i386: divides the 64-bit lvalue n in place
 * by a 32-bit base and evaluates to the 32-bit remainder.
 * The high word is divided first in C, leaving a remainder < base that
 * is fed as the high half into a single "divl", so the quotient always
 * fits and the instruction cannot fault on overflow.  The empty asm
 * statements just split/recombine the 64-bit value across edx:eax. */
#define do_div(n,base) ({ \
	unsigned long __upper, __low, __high, __mod, __base; \
	__base = (base); \
	asm("":"=a" (__low), "=d" (__high):"A" (n)); \
	__upper = __high; \
	if (__high) { \
		__upper = __high % (__base); \
		__high = __high / (__base); \
	} \
	asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), "1" (__upper)); \
	asm("":"=A" (n):"a" (__low),"d" (__high)); \
	__mod; \
})


/* 64bit divisor, dividend and result. dynamic precision */
static uint64_t div64_64(uint64_t dividend, uint64_t divisor)
{
	uint32_t d = divisor;

	/* When the divisor does not fit in 32 bits, drop its low bits and
	 * shift the dividend by the same amount: the quotient is then only
	 * approximate (divisor truncated), which is acceptable for the
	 * cbrt() estimates in this benchmark. */
	if (divisor > 0xffffffffULL) {
		unsigned int shift = fls(divisor >> 32);

		d = divisor >> shift;
		dividend >>= shift;
	}

	/* avoid 64 bit division if possible */
	if (dividend >> 32)
		do_div(dividend, d);
	else
		dividend = (uint32_t) dividend / d;

	return dividend;
}

/* this one only works when the result is below 32 bits */
static uint32_t div64_64_32(uint64_t dividend, uint64_t divisor)
{
	uint32_t div32 = divisor;

	/* Same reduction as div64_64(): approximate a >32-bit divisor by
	 * its top bits and scale the dividend accordingly. */
	if (divisor > 0xffffffffULL) {
		unsigned int drop = fls(divisor >> 32);

		div32 = divisor >> drop;
		dividend >>= drop;
	}

	/* Cheap 32-bit divide when the dividend fits, do_div() otherwise. */
	if (!(dividend >> 32))
		dividend = (uint32_t) dividend / div32;
	else
		do_div(dividend, div32);
	return dividend;
}
#endif

/* Andi Kleen's version: digit-by-digit integer cube root.  One result
 * bit is produced per 3 input bits, scanning from the top. */
uint32_t acbrt(uint64_t x)
{
	uint32_t root = 0;
	int shift;

	for (shift = 63; shift >= 0; shift -= 3) {
		uint64_t step, step_shifted;

		root *= 2;
		step = 3 * root * (root + 1) + 1;
		step_shifted = step << shift;
		/* only accept the bit when the shift did not overflow */
		if (x >= step_shifted && (step == (step_shifted >> shift))) {
			x -= step_shifted;
			root++;
		}
	}
	return root;
}
uint32_t end_acbrt() { }

/* My version of hacker's delight: same digit-by-digit idea as acbrt()
 * but starting at bit 60, so large roots lose a little accuracy. */
uint32_t hcbrt(uint64_t x)
{
	uint32_t y = 0;
	int s;

	for (s = 60; s >= 0; s -= 3) {
		uint64_t t;

		y += y;
		t = (uint64_t)(3*y*(y + 1) + 1) << s;
		if (x >= t) {
			x -= t;
			y++;
		}
	}
	return y;
}
uint32_t end_hcbrt() { }

/* calculate the cubic root of x using Newton-Raphson.
 * Iterates until two consecutive estimates differ by at most 1.
 * NOTE(review): assumes a > 0 — for a == 0 the seed is 1 and the next
 * step divides by x*x == 0; confirm callers never pass 0. */
static uint32_t ocubic(uint64_t a)
{
	uint32_t x, x1;

	/* Initial estimate is based on:
	 * cbrt(x) = exp(log(x) / 3)
	 */
	x = 1u << (fls64(a)/3);

	/*
	 * Iteration based on:
	 *                         2
	 * x    = ( 2 * x  +  a / x  ) / 3
	 *  k+1          k         k
	 */
	do {
		x1 = x;

		x = (2 * x + div64_64(a, (uint64_t)x * x)) / 3;
		/* x1 - x is unsigned and wraps; abs() sees it as int.
		 * Works while successive iterates stay close, which holds
		 * near convergence — NOTE(review): verify for the very
		 * first step on extreme inputs. */
	} while (abs(x1 - x) > 1);

	return x;
}
static uint32_t end_ocubic() { }

/* calculate the cubic root of x using Newton-Raphson */
static uint32_t ncubic(uint64_t a)
{
	uint64_t x;
	int iter;

	/* Seed with 2^(fls/3), from cbrt(x) = exp(log(x) / 3). */
	x = 1u << (fls64(a)/3);

	/* Three fixed Newton-Raphson steps converge to > 32 bits. */
	for (iter = 0; iter < 3; iter++)
		x = (2 * x + div64_64(a, x*x)) / 3;

	return x;
}
static uint32_t end_ncubic() { }

/* calculate the cubic root of x using Newton-Raphson.
 * Avg err ~= 0.247%
 */
static uint32_t ncubic32(uint64_t a)
{
	uint32_t x;
	int iter;

	/* Seed with 2^(fls/3), from cbrt(x) = exp(log(x) / 3). */
	x = 1u << (fls64(a)/3);

	/* Three iterations converge to > 32 bits.  32-bit maths suffices
	 * outside the divide: x ~= cbrt(a), so a/x^2 ~= cbrt(a), which is
	 * about 22 bits max. */
	for (iter = 0; iter < 3; iter++)
		x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x*(uint64_t)x)) / 3;

	return x;
}
static uint32_t end_ncubic32() { }

/* calculate the cubic root of x using Newton-Raphson - small refinement.
 * Avg err ~= 0.238%
 */
static uint32_t ncubic32_1(uint64_t a)
{
	uint32_t x;
	int iter;

	/* Seed with 2^((fls+1)/3): rounding the exponent up gives a
	 * slightly better starting point than ncubic32(). */
	x = 1u << ((fls64(a)+1)/3);

	/* Three iterations converge to > 32 bits; 32-bit maths is enough
	 * outside the divide since a/x^2 ~= cbrt(a) is about 22 bits max. */
	for (iter = 0; iter < 3; iter++)
		x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x*(uint64_t)x)) / 3;

	return x;
}
static uint32_t end_ncubic32_1() { }


int man_adj;

#define ALIGN64 __attribute__ ((aligned (8)))

/* calculate the cubic root of x using Newton-Raphson with less divides.
 * Avg err ~= 0.250%
 */
static uint32_t ncubic_ndiv3(uint64_t a)
{
	uint32_t x;
	uint32_t b;
	uint32_t y;

	/*
	 * For low values (between 2 and 63), we use a direct mapping of the
	 * input divided by 4 (between 0 and 15) to the output between 1 and 4.
	 * Those 4 values can be stored as two bits if we store the result
	 * minus 1, which constitute 32 bits for the 16 values.
	 * We use a uint32_t for this, which we shift right by a/4.
	 *
	 * a / 4   = 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00
	 * a_max   = 63 59 55 51 47 43 39 35 31 27 23 19 15 11 07 03
	 * a_min   = 60 56 52 48 44 40 36 32 28 24 20 16 12 08 04 00
	 * cbrt(a) =  4  4  4  4  4  3  3  3  3  3  3  3  2  2  2  1
	 * bits    = 11 11 11 11 11 10 10 10 10 10 10 10 01 01 01 00
	 *         = 0xFFEAAA54UL
	 */
	ALIGN64 static uint32_t cbrt_x_4m1 = 0xFFEAAA54UL;


	/* For higher values, we use an initial estimation based on this fact :
	 *   cbrt(x) = exp(log(x) / 3)
	 * and :
	 *   cbrt(x * y) = cbrt(x) * cbrt(y)
	 *
	 * So for a block of 3 input bits, we can get 1 output bit, and for
	 * 6 input bits, we get 2 output bits (3 in fact due to rounding up).
	 * So we have to operate on 3bit boundaries, and check the highest
	 * 6 bits to provide 2 to 3 bits on output.
	 *
	 * Let's consider n the value so that we have between 4 and 6 MSB
	 * between bits 3n and 3n+5. We will set the output bits between
	 * n and n+2.
	 *
	 * We have 64 possible values for the 6 MSB. But since a cbrt()
	 * output changes slower than its input, we can easily focus on
	 * the 4 MSB only. This means 16 values. Just like above for the
	 * small values, we can store 16 * 2 bits in a uint32_t.
	 *
	 * The little difference is that we are not seeking exact output
	 * result, but the closest value to the exact output, to improve
	 * convergence speed. To achieve this, we start with same values
	 * as above, and iteratively refine them by hand so that the average
	 * error reaches its minimum.
	 *
	 * Theoretical map value is   0xFFEAAA64UL.
	 * Experimental best value is 0xFFAFAA94UL.
	 *
	 * In order to find blocks of 3 bits aligned on 3n bits boundaries,
	 * we have to shift right by multiples of 3 bits. We want to avoid
	 * a costly divide, so we instead multiply by 84 and shift right by 8,
	 * as this returns the same values for all inputs up to 64.
	 *
	 * For the results, we still have to divide by 3 multiple times. We
	 * know the result as well as intermediate values are less than 2^22,
	 * so we can use the same principle with shifted arithmetics :
	 *
	 *    341/1024 < 1/3 < 342/1024
	 *
	 * Rounding errors on integer maths generally can be compensated for by
	 * small offset adjustments before divides. Some have been added after
	 * experimentations to provide better accuracy.
	 *
	 */
	ALIGN64 static uint32_t cbrt_4msb2lsb = 0xFFAFAA94UL; /* cbrt([0..63]/4)-1 */

	b = fls64(a);

	if (b <= 1)
		return b;

	else if (b < 7) {
		/* a in [0..63] */
		uint32_t bits;
		bits = (uint8_t)a >> 1;
		bits &= ~1;
		return 1 + ((cbrt_x_4m1 >> bits) & 0x03);
	}

	/* We will shift a right by 3n bits, to retrieve bits 3n..3n+5.
	 * We want (b - 1) / 3 for b between 7 and 64 so that we have a
	 * a bit shift count between 2 and 21 inclusive.
	 */
	b = (b * 84) >> 8;

	y = (a >> (b * 3 - 1)) << 1;
	x = 1 + ((cbrt_4msb2lsb >> y) & 3);
	x = (x << b) >> 1;

	/* x += 6; // provides slightly better accuracy (0.246% vs 0.250%) */

	/* Two Newton-Raphson steps; x*(x-1) compensates the truncation of
	 * the integer divide (see the notes in the covering mail). */
	x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x*(uint64_t)(x - 1)));
	x = ((x * 344) >> 10);  /* = x/2.977 ~= x/3 */

	x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x*(uint64_t)(x - 1)));
	x = ((x * 341) >> 10);  /* = x/3.003 ~= x/3 */
	return x;
}
static uint32_t end_ncubic_ndiv3() { }


/* We use this constant to extract the 3 highest bits of <a> and shift them
 * 2 bits left in order to provide a 4bit-aligned shift pointer to an uint32_t.
 */
#define VAL_3BIT_TO_SHIFT4(a,b) (((a) >> ((b) * 3)) << 2)


/* calculate the cubic root of x using Newton-Raphson in 3 divides after first
 * approximation.
 *
 * Avg err ~= 0.170%
 */
static uint32_t ncubic_3div(uint64_t a)
{
	uint32_t x;
	uint32_t b;
	uint32_t shift;

	/*
	 * For large values, 3 bits inputs are enough.
	 *
	 * We store 4*cbrt(8*x)-1 for x in [0..7]
	 *
	 *  x     |   cbrt      | 4*cbrt | int | int |
	 *  range |   range     | approx | val.|  -3 |
	 * -------+-------------+--------+-----+-----+
	 * 56..63 | 3.83 - 3.98 | 15.66  | 16  |  13 |
	 * 48..55 | 3.63 - 3.80 | 14.93  | 15  |  12 |
	 * 40..47 | 3.42 - 3.61 | 14.12  | 14  |  11 |
	 * 32..39 | 3.17 - 3.39 | 13.21  | 13  |  10 |
	 * 24..31 | 2.88 - 3.14 | 12.15  | 12  |   9 |
	 * 16..23 | 2.51 - 2.84 | 10.86  | 11  |   8 |
	 * 08..15 | 2.00 - 2.46 |  9.16  |  9  |   6 |
	 * 00..07 | 1.00 - 1.91 |  6.35  |  6  |   3 |
	 *
	 * We can store (4*cbrt(x)-3) in 4 bits for x in [0..63].
	 * So we use the following map : 0xDCBA9863UL
	 */
	ALIGN64 static uint32_t cbrt_3msb_4lsb = 0xDCBA9863UL;

	b = fls64(a);

	if (b <= 1)
		return b;
	if (b < 7) {
		/* a in [0..63] */
		uint32_t bits;
		bits = (uint8_t)a >> 1;
		bits &= ~1;
		return 1 + ((0xFFEAAA54UL >> bits) & 0x03);
	}

	/* We want (b - 1) / 3 for b between 7 and 64 so that we have a
	 * a bit shift count between 2 and 21 inclusive.
	 */
	b = (b * 84) >> 8;

	/* We want the highest 3bit block from 'a'  */
	x = ((0xDCBA9863UL >> VAL_3BIT_TO_SHIFT4(a, b)) & 0x0F) + 3;
	x = (x << b) >> 3;

	x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x*(uint64_t)x));
	x = ((x * 348) >> 10);  // = x/2.94. Gives best precision here.

	x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x*(uint64_t)x));
	x = ((x * 352) >> 10);  // = x/2.91. Gives best precision here

	x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x*(uint64_t)(x - 1)));
	x = ((x * 341) >> 10);  // = x/3.003 ~= x/3
	return x;
}
static uint32_t end_ncubic_3div() { }



/* calculate the cubic root of x using Newton-Raphson in 2 divides after first
 * approximation.
 *
 * Avg err ~= 0.187%
 */
static uint32_t ncubic_2div(uint64_t a)
{
	uint32_t x;
	uint32_t b;
	uint32_t shift;

	/*
	 * For large values, 3 bits inputs are enough.
	 *
	 * We store 4*cbrt(8*x)-1 for x in [0..7]
	 *
	 *  x     |   cbrt      | 4*cbrt | int | int |
	 *  range |   range     | approx | val.|  -3 |
	 * -------+-------------+--------+-----+-----+
	 * 56..63 | 3.83 - 3.98 | 15.66  | 16  |  13 |
	 * 48..55 | 3.63 - 3.80 | 14.93  | 15  |  12 |
	 * 40..47 | 3.42 - 3.61 | 14.12  | 14  |  11 |
	 * 32..39 | 3.17 - 3.39 | 13.21  | 13  |  10 |
	 * 24..31 | 2.88 - 3.14 | 12.15  | 12  |   9 |
	 * 16..23 | 2.51 - 2.84 | 10.86  | 11  |   8 |
	 * 08..15 | 2.00 - 2.46 |  9.16  |  9  |   6 |
	 * 00..07 | 1.00 - 1.91 |  6.35  |  6  |   3 |
	 *
	 * We can store (4*cbrt(x)-3) in 4 bits for x in [0..63].
	 * So we use the following map : 0xDCBA9863UL
	 */
	ALIGN64 static uint32_t cbrt_3msb_4lsb = 0xDCBA9863UL;

	b = fls64(a);

	if (b <= 1)
		return b;
	else if (b < 7) {
		/* a in [0..63] */
		uint32_t bits;
		bits = (uint8_t)a >> 1;
		bits &= ~1;
		return 1 + ((0xFFEAAA54UL >> bits) & 0x03);
	}

	/* We want (b - 1) / 3 for b between 7 and 64 so that we have a
	 * a bit shift count between 2 and 21 inclusive.
	 */
	b = (b * 84) >> 8;

	/* We want the highest 3bit block from 'a'  */
	shift = (a >> (b * 3)) << 2;      /* shift is 0..28 now. */

	x = ((/*cbrt_3msb_4lsb*/0xDCBA9863UL >> shift) & 0x0F) + 3;
	x = (x << b) >> 3;

	x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x*(uint64_t)x));
	x = ((x * 352) >> 10);  // = x/2.91. Gives best precision here.

	x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x*(uint64_t)(x - 1)));
	x = ((x * 341) >> 10);  // = x/3.003 ~= x/3
	return x;
}
static uint32_t end_ncubic_2div() { }


/* calculate the cubic root of x using Newton-Raphson in a single divide after
 * first approximation.
 *
 * Avg err ~= 0.443%
 */
static uint32_t ncubic_1div(uint64_t a)
{
	uint32_t x;
	uint32_t b;
	uint32_t shift;

	/*
	 * For large values, 3 bits inputs are enough.
	 *
	 * We store 4*cbrt(8*x)-1 for x in [0..7]
	 *
	 *  x     |   cbrt      | 4*cbrt | int | int |
	 *  range |   range     | approx | val.|  -3 |
	 * -------+-------------+--------+-----+-----+
	 * 56..63 | 3.83 - 3.98 | 15.66  | 16  |  13 |
	 * 48..55 | 3.63 - 3.80 | 14.93  | 15  |  12 |
	 * 40..47 | 3.42 - 3.61 | 14.12  | 14  |  11 |
	 * 32..39 | 3.17 - 3.39 | 13.21  | 13  |  10 |
	 * 24..31 | 2.88 - 3.14 | 12.15  | 12  |   9 |
	 * 16..23 | 2.51 - 2.84 | 10.86  | 11  |   8 |
	 * 08..15 | 2.00 - 2.46 |  9.16  |  9  |   6 |
	 * 00..07 | 1.00 - 1.91 |  6.35  |  6  |   3 |
	 *
	 * We can store (4*cbrt(x)-3) in 4 bits for x in [0..63].
	 * So we use the following map : 0xDCBA9863UL
	 */
	ALIGN64 static uint32_t cbrt_3msb_4lsb = 0xDCBA9863UL;

	b = fls64(a);

	if (b < 7) {
		if (b <= 1)
			return b;
		/* a in [0..63] */
		uint32_t bits;
		bits = (uint8_t)a >> 1;
		bits &= ~1;
		return 1 + ((0xFFEAAA54UL >> bits) & 0x03);
	}

	/* We want (b - 1) / 3 for b between 7 and 64 so that we have a
	 * a bit shift count between 2 and 21 inclusive.
	 */
	b = (b * 84) >> 8;

	/* We want the highest 3bit block from 'a'  */
	x = ((/*cbrt_3msb_4lsb*/0xDCBA9863UL >> VAL_3BIT_TO_SHIFT4(a, b)) & 0x0F) + 3;
	x = (x << b) >> 3;

	/* one divide is enough to get a value within 0.4% */
	x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x * (uint64_t)(x - 1)));
	x = ((x * 341) >> 10);
	return x;
}
static uint32_t end_ncubic_1div() { }


/* calculate the cubic root of x using only an approximation.
 * Avg err ~= 4.521%
 */
/* calculate the cubic root of x using only an approximation (no divide).
 * Avg err ~= 4.521%
 */
static uint32_t ncubic_0div(uint64_t a)
{
	uint32_t x;
	uint32_t b;

	/*
	 * For large values, 3 bits of input are enough.
	 *
	 * We store 4*cbrt(8*x)-1 for x in [0..7]
	 *
	 *  x     |   cbrt      | 4*cbrt | int | int |
	 *  range |   range     | approx | val.|  -3 |
	 * -------+-------------+--------+-----+-----+
	 * 56..63 | 3.83 - 3.98 | 15.66  | 16  |  13 |
	 * 48..55 | 3.63 - 3.80 | 14.93  | 15  |  12 |
	 * 40..47 | 3.42 - 3.61 | 14.12  | 14  |  11 |
	 * 32..39 | 3.17 - 3.39 | 13.21  | 13  |  10 |
	 * 24..31 | 2.88 - 3.14 | 12.15  | 12  |   9 |
	 * 16..23 | 2.51 - 2.84 | 10.86  | 11  |   8 |
	 * 08..15 | 2.00 - 2.46 |  9.16  |  9  |   6 |
	 * 00..07 | 1.00 - 1.91 |  6.35  |  6  |   3 |
	 *
	 * We can store (4*cbrt(x)-3) in 4 bits for x in [0..63],
	 * giving the immediate map 0xDCBA9863UL used below.
	 * (The previously unused static copy of this constant has been
	 * removed.)
	 */

	b = fls64(a);

	if (b < 7) {
		uint32_t bits;

		if (b <= 1)
			return b;	/* cbrt(0) = 0, cbrt(1) = 1 */
		/* a in [2..63]: 2-bit result packed in an immediate constant,
		 * indexed by an even shift count (2 bits per entry). */
		bits = (uint8_t)a >> 1;
		bits &= ~1;
		return 1 + ((0xFFEAAA54UL >> bits) & 0x03);
	}

	/* We want (b - 1) / 3 for b between 7 and 64 so that we have a
	 * bit shift count between 2 and 21 inclusive.
	 */
	b = (b * 84) >> 8;

	/* Approximation from the highest 3-bit block of 'a' only */
	x = ((0xDCBA9863UL >> VAL_3BIT_TO_SHIFT4(a, b)) & 0x0F) + 3;
	x = (x << b) >> 3;
	return x;
}
/* address marker: used only by DOTEST() (end_##x - x) to estimate the size
 * of the preceding function; never called, so its return value is unused. */
static uint32_t end_ncubic_0div() { }



/* calculate the cubic root of x using table lookups only.
 * Avg err ~= 0.613%
 */
/* calculate the cubic root of x using table lookups only.
 * Avg err ~= 0.613%
 */
static uint32_t ncubic_tab0(uint64_t a)
{
	/*
	 * cbrt(x) MSB values for x MSB values in [0..63].
	 * Precomputed then refined by hand - Willy Tarreau
	 *
	 * For x in [0..63],
	 *   v = cbrt(x << 18) - 1
	 *   cbrt(x) = (v[x] + 1) >> 6
	 */
	static uint8_t v[] = {
		/* 0x00 */    0,   63,   63,   63,  127,  127,  127,  127,
		/* 0x08 */  129,  135,  139,  143,  147,  152,  155,  160,
		/* 0x10 */  161,  165,  168,  172,  174,  177,  179,  182,
		/* 0x18 */  184,  188,  190,  193,  195,  197,  199,  202,
		/* 0x20 */  203,  205,  207,  209,  212,  213,  215,  217,
		/* 0x28 */  219,  221,  222,  224,  226,  227,  229,  231,
		/* 0x30 */  232,  234,  236,  237,  239,  240,  241,  243,
		/* 0x38 */  244,  246,  247,  249,  251,  252,  253,  255,
	};
	uint32_t msb;
	uint32_t idx;

	msb = fls64(a);

	/* small inputs: a itself indexes the table directly */
	if (msb < 7)
		return (v[(uint32_t)a] + 31) >> 6;

	/* msb becomes roughly (fls64(a) - 1) / 3 */
	msb = ((msb * 84) >> 8) - 1;

	/* the top bits of 'a' select the table entry; scale the result
	 * back up by the reduced bit count */
	idx = (a >> (msb * 3));
	return ((uint32_t)(v[idx] + 1) << msb) >> 6;
}
/* address marker: used only by DOTEST() (end_##x - x) to estimate the size
 * of the preceding function; never called, so its return value is unused. */
static uint32_t end_ncubic_tab0() { }


/* calculate the cubic root of x using a table lookup followed by one
 * Newton-Raphson iteration.
 * Avg err ~= 0.195%
 */
/* calculate the cubic root of x using a table lookup followed by one
 * Newton-Raphson iteration.
 * Avg err ~= 0.195%
 * (The unused local 'z' has been removed.)
 */
static uint32_t ncubic_tab1(uint64_t a)
{
	uint32_t x;
	uint32_t b;
	uint32_t shift;

	/*
	 * cbrt(x) MSB values for x MSB values in [0..63].
	 * Precomputed then refined by hand - Willy Tarreau
	 *
	 * For x in [0..63],
	 *   v = cbrt(x << 18) - 1
	 *   cbrt(x) = (v[x] + 10) >> 6
	 */
	static uint8_t v[] = {
		/* 0x00 */    0,   54,   54,   54,  118,  118,  118,  118,
		/* 0x08 */  123,  129,  134,  138,  143,  147,  151,  156,
		/* 0x10 */  157,  161,  164,  168,  170,  173,  176,  179,
		/* 0x18 */  181,  185,  187,  190,  192,  194,  197,  199,
		/* 0x20 */  200,  202,  204,  206,  209,  211,  213,  215,
		/* 0x28 */  217,  219,  221,  222,  224,  225,  227,  229,
		/* 0x30 */  231,  232,  234,  236,  237,  239,  240,  242,
		/* 0x38 */  244,  245,  246,  248,  250,  251,  252,  254,
	};

	b = fls64(a);
	if (b < 7) {
		/* a in [0..63]: direct table lookup */
		return ((uint32_t)v[(uint32_t)a] + 35) >> 6;
	}

	/* b becomes roughly (fls64(a) - 1) / 3 */
	b = ((b * 84) >> 8) - 1;

	/* the top bits of 'a' select the table entry */
	shift = (a >> (b * 3));

	x = ((uint32_t)(((uint32_t)v[shift] + 10) << b)) >> 6;

	/* one Newton-Raphson step is enough to get within 0.19%:
	 * x ~ (2*x + a/x^2) / 3.  Dividing by x*(x-1) compensates for
	 * integer-divide truncation; *341 >> 10 approximates /3.
	 */
	x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x * (uint64_t)(x - 1)));
	x = ((x * 341) >> 10);
	return x;
}
/* address marker: used only by DOTEST() (end_##x - x) to estimate the size
 * of the preceding function; never called, so its return value is unused. */
static uint32_t end_ncubic_tab1() { }



/* 65536 times the cubic root of 0,    1,     2,     3,      4,      5,      6,      7 */
static uint64_t bictcp_table[8] = {0, 65536, 82570, 94519, 104030, 112063, 119087, 125367};

/* calculate the cubic root of x.
 *
 * The idea: any x can be written as i * 8^j with i in [1..7], and then
 * cubic_root(x) = cubic_root(i) * 2^j.  The table above gives a fixed-point
 * cbrt for the reduced i, which bounds the answer; a binary search between
 * the bounds finds the exact integer result.
 * Avg err ~= 0.172%
 */
static uint32_t bictcp(uint64_t x)
{
	uint64_t scale, cube, target, lo, hi, mid, lo_diff, hi_diff;

	if (x == 0)
		return 0;

	target = x;

	/* reduce: x becomes i in [1..7], scale becomes 2^j */
	scale = 1;
	while (x >= 8) {
		x >>= 3;
		scale <<= 1;
	}

	/* lower and upper bounds from the fixed-point table */
	lo = (scale * bictcp_table[x]) >> 16;
	if (x == 7)
		hi = scale << 1;
	else
		hi = (scale * bictcp_table[x + 1] + 65535) >> 16;

	/* binary search for the exact integer cube root */
	while (lo < hi - 1) {
		mid = (lo + hi) >> 1;
		cube = mid * mid * mid;
		if (cube < target)
			lo = mid;
		else if (cube > target)
			hi = mid;
		else
			return mid;
	}

	/* neither bound is exact: return whichever cube is closer */
	cube = lo * lo * lo;
	lo_diff = (cube < target) ? target - cube : cube - target;
	cube = hi * hi * hi;
	hi_diff = (cube < target) ? target - cube : cube - target;

	return (lo_diff < hi_diff) ? lo : hi;
}
/* address marker: used only by DOTEST() (end_##x - x) to estimate the size
 * of the preceding function; never called, so its return value is unused. */
static uint32_t end_bictcp() { }


#define NCASES 1000
static uint64_t cases[NCASES];		/* benchmark inputs */
static double results[NCASES];		/* libm cbrt() reference value for each input */

static double ticks_per_usec;		/* TSC frequency, measured at startup */
static unsigned long long start, end;	/* rdtscll() sample points around each call */

/* Time 'func' over all test cases and print one result line:
 * name, mean clock count, mean/max/std call time in us, average relative
 * error against the libm cbrt() reference, and 'size' (a caller-supplied
 * rough code-size estimate).
 *
 * Fixes vs. the previous version:
 *  - std is now computed in us^2 consistently: sum_sq (ticks^2) is divided
 *    by ticks_per_usec^2 before subtracting mean^2.  The old code divided
 *    by ticks_per_usec only once, overstating the reported std.
 *  - the printed columns now follow the header order (mean, max, std);
 *    the old code printed std before max.
 */
static void dotest(const char *name, uint32_t (*func)(uint64_t), int size)
{
	int i;
	unsigned long long t, mx = 0, sum = 0, sum_sq = 0;
	double mean, std, err = 0;

	for (i = 0; i < NCASES; i++) {
		uint64_t x = cases[i];
		uint32_t v;

		rdtscll(start);
		v = (*func)(x);
		rdtscll(end);

		t = end - start;
		if (t > mx)
			mx = t;
		sum += t;
		sum_sq += t * t;

		/* accumulate |relative error|; results[i] is never 0
		 * because cases[0] is 1 (see main()) */
		err += fabs(((double) v - results[i]) / results[i]);
	}

	mean = (double) sum / ticks_per_usec / NCASES;
	std = sqrtl((double) sum_sq / (ticks_per_usec * ticks_per_usec) / NCASES
		    - mean * mean);

	printf("%-15s %8llu %8.2f %8.2f %8.2f  %.03f%% %4d\n", name,
	       (unsigned long long) sum / NCASES, mean,
	       (double) mx / ticks_per_usec, std,
	       err * 100. / NCASES, size);
}

/* Benchmark driver: calibrate the TSC, build the test-case set, compute
 * libm reference results, then time and score each cbrt implementation.
 */
int main(int argc, char **argv)
{
	uint64_t x;
	int i;

	/* estimate TSC ticks per microsecond by timing a 2-second sleep */
	printf("Calibrating\n");
	rdtscll(start);
	sleep(2);
	rdtscll(end);
	ticks_per_usec = (double) (end - start) / 2000000.;

	/* edge cases: every power of two below 2^63 */
	for (i = 0; i < 63; i++) 
		cases[i] = 1ull << i;
	/* all-ones patterns of decreasing width: 0xFFFF...F, 0x7FFF...F, ... */
	x = ~0;
	while (x != 0) {
		cases[i++] = x;
		x >>= 1;
	}
	/* all-ones patterns shifted left: 0xFFFF...E, 0xFFFF...C, ... */
	x = ~0;
	while (x != 0) {
		cases[i++] = x;
		x <<= 1;
	}

	/* fill the rest with pseudo-random products (~62-bit values) */
	while (i < NCASES)
		cases[i++] = (uint64_t) random()  * (uint64_t) random();

	/* double-precision reference values used for error measurement */
	for (i = 0; i < NCASES; i++) 
		results[i] = cbrt((double)cases[i]);

	printf("Function          clocks mean(us)  max(us)  std(us) Avg err size\n");
	
/* NOTE(review): end_##x - x subtracts pointers to two different functions,
 * which is formally undefined behavior; it only gives a meaningful size
 * estimate when the compiler emits functions in source order (works on
 * gcc 3.4, produces garbage on 4.1 per the accompanying discussion). */
#define DOTEST(x)	dotest(#x, x, end_##x-x)
	DOTEST(bictcp);
	DOTEST(ocubic);
	DOTEST(ncubic);
	DOTEST(ncubic32);
	DOTEST(ncubic32_1);
	DOTEST(ncubic_ndiv3);

	DOTEST(ncubic_3div);
	DOTEST(ncubic_2div);
	DOTEST(ncubic_1div);
	DOTEST(ncubic_0div);
	DOTEST(ncubic_tab1);
	DOTEST(ncubic_tab0);
	//for (man_adj = 0; man_adj < 16; man_adj++) {
	//	printf("%02d : ", man_adj);
	//	DOTEST(ncubic_tab1);
	//}
	DOTEST(acbrt);
	DOTEST(hcbrt);
	return 0;
}



^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH] tcp_cubic: use 32 bit math
  2007-03-13 20:50                                       ` Willy Tarreau
@ 2007-03-21 18:54                                         ` Stephen Hemminger
  2007-03-21 19:15                                           ` Willy Tarreau
  0 siblings, 1 reply; 62+ messages in thread
From: Stephen Hemminger @ 2007-03-21 18:54 UTC (permalink / raw)
  To: Willy Tarreau
  Cc: David Miller, rkuhn, andi, dada1, jengelh, linux-kernel, netdev

On Tue, 13 Mar 2007 21:50:20 +0100
Willy Tarreau <w@1wt.eu> wrote:

> Hi Stephen,
> 
> On Mon, Mar 12, 2007 at 02:11:56PM -0700, Stephen Hemminger wrote:
> > > Oh BTW, I have a newer version with a first approximation of the
> > > cbrt() before the div64_64, which allows us to reduce from 3 div64
> > > to only 2 div64. This results in a version which is twice as fast
> > > as the initial one (ncubic), but with slightly less accuracy (0.286%
> > > compared to 0.247). But I see that other functions such as hcbrt()
> > > had a 1.5% avg error, so I think this is not dramatic.
> > 
> > Ignore my hcbrt() it was a less accurate version of andi's stuff.
> 
> OK.
> 
> > > Also, I managed to remove all other divides, to be kind with CPUs
> > > having a slow divide instruction or no divide at all. Since we compute
> > > on limited range (22 bits), we can multiply then shift right. It shows
> > > me even slightly better time on pentium-m and athlon, with a slightly
> > > higher avg error (0.297% compared to 0.286%), and slightly smaller
> > > code.
> > 
> > What does the code look like?
> 
> Well, I have cleaned it a little bit, there were more comments and ifdefs
> than code ! I've appended it to the end of this mail.
> 
> I have changed it a bit, because I noticed that integer divide precision
> was so coarse that there were other possibilities to play with the bits.
> 
> I have experimented with combinations of several methods :
>   - replace integer divides with multiplies/shifts where possible.
> 
>   - compensation for divide imprecisions by adding/removing small
>     values before/after them. Often, the integer result of 1/(x*(x-1))
>     is closer to (float)1/(float)x^2 than 1/(x*x). This is because
>     the divide always truncates the result.
> 
>   - use direct result lookup for small values. Small inputs give small
>     outputs which have very few moving bits. Many different values fit
>     in a 32bit integer, so we use a shift offset to lookup the value.
>     I used this in an fls function I wrote a while ago, that I should
>     also post because it is up to twice as fast as the kernel's.
>     Sometimes it seems faster to lookup in from memory, sometimes it
>     is faster from an immediate value. Maybe more visible differences
>     would show up on RISC CPUs where loading 32 bits immediate needs
>     two instructions. I don't know yet, I've not tested on my sparc
>     yet.
> 
>   - use small lookup tables (64 bytes) with 6 bits inputs and at least
>     as many on output. We only lookup the 6 MSB and return the 2-3 MSB
>     of the result.
> 
>   - iterative search and manual refinement of the lookup tables for best
>     accuracy. The avg error rate can easily be halved this way.
> 
> I have duplicated and tried several functions with 0, 1, 2 and 3 divides.
> Several of them offer better accuracy over what we currently have, in
> less cycles. Others offer faster results (up to 5 times) with slightly
> less accuracy.
> 
> There is one function which is not to be used, but is just here for
> comparison (ncubic_0div). It does no divide but has awful avg error.
> 
> But one which is interesting is the ncubic_tab0. It does not use any
> divide at all, even not any div64. It shows a 0.6% avg error, which I'm
> not sure is enough or not. It is 6.7 times faster than initial ncubic()
> with less accuracy, and 4 times smaller. I suspect that it can differ
> more on architectures which have no divide instruction.
> 
> If 0.6% avg error rate is too much, ncubic_tab1() uses one single div64
> and is twice slower (still nearly 3 times faster than ncubic). It shows
> 0.195% avg error, which is better than initial ncubic. I think that it
> is a good tradeoff.
> 
> If best accuracy is an absolute requirement, then I have a variation of
> ncubic (ncubic_3div) which does 0.17% in 2/3 of the time (compared to
> 0.247%), and which is slightly smaller.
> 
> I have also added a "size" column, indicating approximative function
> size, provided that the compiler does not reorder the code. On gcc 3.4,
> it's OK, but 4.1 returns garbage. That does not matter, it's just a
> rough estimate anyway.
> 
> Here are the results classed by speed :
> 
> /* Sample output on a Pentium-M 600 MHz :
> 
> Function          clocks mean(us)  max(us)  std(us) Avg err size
> ncubic_tab0           79     0.66     7.20     1.04  0.613%  160
> ncubic_0div           84     0.70     7.64     1.57  4.521%  192
> ncubic_1div          178     1.48    16.27     1.81  0.443%  336
> ncubic_tab1          179     1.49    16.34     1.85  0.195%  320
> ncubic_ndiv3         263     2.18    24.04     3.59  0.250%  512
> ncubic_2div          270     2.24    24.70     2.77  0.187%  512
> ncubic32_1           359     2.98    32.81     3.59  0.238%  544
> ncubic_3div          361     2.99    33.08     3.79  0.170%  656
> ncubic32             364     3.02    33.29     3.51  0.247%  544
> ncubic               529     4.39    48.39     4.92  0.247%  720
> hcbrt                539     4.47    49.25     5.98  1.580%   96
> ocubic               732     4.93    61.83     7.22  0.274%  320
> acbrt                842     6.98    76.73     8.55  0.275%  192
> bictcp              1032     6.95    86.30     9.04  0.172%  768
> 
> And now by avg error :
> 
> ncubic_3div          361     2.99    33.08     3.79  0.170%  656
> bictcp              1032     6.95    86.30     9.04  0.172%  768
> ncubic_2div          270     2.24    24.70     2.77  0.187%  512
> ncubic_tab1          179     1.49    16.34     1.85  0.195%  320
> ncubic32_1           359     2.98    32.81     3.59  0.238%  544
> ncubic               529     4.39    48.39     4.92  0.247%  720
> ncubic32             364     3.02    33.29     3.51  0.247%  544
> ncubic_ndiv3         263     2.18    24.04     3.59  0.250%  512
> ocubic               732     4.93    61.83     7.22  0.274%  320
> acbrt                842     6.98    76.73     8.55  0.275%  192
> ncubic_1div          178     1.48    16.27     1.81  0.443%  336
> ncubic_tab0           79     0.66     7.20     1.04  0.613%  160
> hcbrt                539     4.47    49.25     5.98  1.580%   96
> ncubic_0div           84     0.70     7.64     1.57  4.521%  192
> 
> 
> And here comes the code. I have tried to document it a bit, as much
> as can be done on experimentation code. It is often easier to use
> a pencil and paper to understand how the bits move.
> 
> Regards,
> Willy
> 

The following version of div64_64 is faster because do_div already
optimized for the 32 bit case..

I get the following results on ULV Core Solo (ie slow current processor)
and the following on 64bit Core Duo. ncubic_tab1 seems like
the best (no additional error and about as fast)

ULV Core Solo

Function          clocks mean(us)  max(us)  std(us) Avg err size
ncubic_tab0          192    11.24    45.10    15.28  0.450% -2262
ncubic_0div          201    11.77    47.23    27.40  3.357% -2404
ncubic_1div          324    19.02    76.32    25.82  0.189% -2567
ncubic_tab1          326    19.13    76.73    23.71  0.043% -2059
ncubic_2div          456    26.72   108.92   493.16  0.028% -2790
ncubic_ndiv3         463    27.15   133.37  1889.39  0.104% -3344
ncubic32             549    32.18   130.59   508.97  0.041% -3794
ncubic32_1           574    33.66   138.32   548.48  0.029% -3604
ncubic_3div          581    34.04   140.24   608.55  0.018% -3050
ncubic               733    42.92   173.35   523.19  0.041%  299
ocubic              1046    61.25   283.68  3305.65  0.027% -2232
acbrt               1149    67.32   284.91  1941.55  0.029%  168
bictcp              1663    97.41   394.29   604.86  0.017%  628

Core 2 Duo

Function          clocks mean(us)  max(us)  std(us) Avg err size
ncubic_0div           74     0.03     1.60     0.07  3.357% -2101
ncubic_tab0           74     0.03     1.60     0.04  0.450% -2029
ncubic_1div          142     0.07     3.11     1.05  0.189% -2195
ncubic_tab1          144     0.07     3.18     1.02  0.043% -1638
ncubic_2div          216     0.10     4.74     1.07  0.028% -2326
ncubic_ndiv3         219     0.10     4.76     1.04  0.104% -2709
ncubic32             269     0.13     5.87     1.13  0.041% -1500
ncubic32_1           272     0.13     5.92     1.10  0.029% -2881
ncubic               273     0.13     5.96     1.13  0.041% -1763
ncubic_3div          290     0.14     6.32     1.01  0.018% -2499
acbrt                430     0.20     9.42     1.18  0.029%   77
ocubic               444     0.21     9.82     1.82  0.027% -1924
bictcp               549     0.26    12.06     1.68  0.017%  236



-- 
Stephen Hemminger <shemminger@linux-foundation.org>

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH] tcp_cubic: use 32 bit math
  2007-03-21 18:54                                         ` Stephen Hemminger
@ 2007-03-21 19:15                                           ` Willy Tarreau
  2007-03-21 19:58                                             ` Stephen Hemminger
  2007-03-21 20:15                                             ` [PATCH 1/2] div64_64 optimization Stephen Hemminger
  0 siblings, 2 replies; 62+ messages in thread
From: Willy Tarreau @ 2007-03-21 19:15 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: David Miller, rkuhn, andi, dada1, jengelh, linux-kernel, netdev

Hi Stephen,

On Wed, Mar 21, 2007 at 11:54:19AM -0700, Stephen Hemminger wrote:
> On Tue, 13 Mar 2007 21:50:20 +0100
> Willy Tarreau <w@1wt.eu> wrote:

[...] ( cut my boring part )

> > Here are the results classed by speed :
> > 
> > /* Sample output on a Pentium-M 600 MHz :
> > 
> > Function          clocks mean(us)  max(us)  std(us) Avg err size
> > ncubic_tab0           79     0.66     7.20     1.04  0.613%  160
> > ncubic_0div           84     0.70     7.64     1.57  4.521%  192
> > ncubic_1div          178     1.48    16.27     1.81  0.443%  336
> > ncubic_tab1          179     1.49    16.34     1.85  0.195%  320
> > ncubic_ndiv3         263     2.18    24.04     3.59  0.250%  512
> > ncubic_2div          270     2.24    24.70     2.77  0.187%  512
> > ncubic32_1           359     2.98    32.81     3.59  0.238%  544
> > ncubic_3div          361     2.99    33.08     3.79  0.170%  656
> > ncubic32             364     3.02    33.29     3.51  0.247%  544
> > ncubic               529     4.39    48.39     4.92  0.247%  720
> > hcbrt                539     4.47    49.25     5.98  1.580%   96
> > ocubic               732     4.93    61.83     7.22  0.274%  320
> > acbrt                842     6.98    76.73     8.55  0.275%  192
> > bictcp              1032     6.95    86.30     9.04  0.172%  768
> > 

[...]

> The following version of div64_64 is faster because do_div already
> optimized for the 32 bit case..

Cool, this is interesting because I first wanted to optimize it but did
not find how to start with this. You seem to get very good results. BTW,
you did not append your changes.

However, one thing I do not understand is why your avg error is about 1/3
below the original one. Was there a precision bug in the original div_64_64
or did you extend the values used in the test ?

Or perhaps you used -fast-math to build and the original cbrt() is less
precise in this case ?

> I get the following results on ULV Core Solo (ie slow current processor)
> and the following on 64bit Core Duo. ncubic_tab1 seems like
> the best (no additional error and about as fast)

OK. It was the one I preferred too unless tab0's avg error was acceptable.

> ULV Core Solo
> 
> Function          clocks mean(us)  max(us)  std(us) Avg err size
> ncubic_tab0          192    11.24    45.10    15.28  0.450% -2262
> ncubic_0div          201    11.77    47.23    27.40  3.357% -2404
> ncubic_1div          324    19.02    76.32    25.82  0.189% -2567
> ncubic_tab1          326    19.13    76.73    23.71  0.043% -2059
> ncubic_2div          456    26.72   108.92   493.16  0.028% -2790
> ncubic_ndiv3         463    27.15   133.37  1889.39  0.104% -3344
> ncubic32             549    32.18   130.59   508.97  0.041% -3794
> ncubic32_1           574    33.66   138.32   548.48  0.029% -3604
> ncubic_3div          581    34.04   140.24   608.55  0.018% -3050
> ncubic               733    42.92   173.35   523.19  0.041%  299
> ocubic              1046    61.25   283.68  3305.65  0.027% -2232
> acbrt               1149    67.32   284.91  1941.55  0.029%  168
> bictcp              1663    97.41   394.29   604.86  0.017%  628
> 
> Core 2 Duo
> 
> Function          clocks mean(us)  max(us)  std(us) Avg err size
> ncubic_0div           74     0.03     1.60     0.07  3.357% -2101
> ncubic_tab0           74     0.03     1.60     0.04  0.450% -2029
> ncubic_1div          142     0.07     3.11     1.05  0.189% -2195
> ncubic_tab1          144     0.07     3.18     1.02  0.043% -1638
> ncubic_2div          216     0.10     4.74     1.07  0.028% -2326
> ncubic_ndiv3         219     0.10     4.76     1.04  0.104% -2709
> ncubic32             269     0.13     5.87     1.13  0.041% -1500
> ncubic32_1           272     0.13     5.92     1.10  0.029% -2881
> ncubic               273     0.13     5.96     1.13  0.041% -1763
> ncubic_3div          290     0.14     6.32     1.01  0.018% -2499
> acbrt                430     0.20     9.42     1.18  0.029%   77
> ocubic               444     0.21     9.82     1.82  0.027% -1924
> bictcp               549     0.26    12.06     1.68  0.017%  236

Thanks,
Willy


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH] tcp_cubic: use 32 bit math
  2007-03-21 19:15                                           ` Willy Tarreau
@ 2007-03-21 19:58                                             ` Stephen Hemminger
  2007-03-21 20:15                                             ` [PATCH 1/2] div64_64 optimization Stephen Hemminger
  1 sibling, 0 replies; 62+ messages in thread
From: Stephen Hemminger @ 2007-03-21 19:58 UTC (permalink / raw)
  To: Willy Tarreau
  Cc: David Miller, rkuhn, andi, dada1, jengelh, linux-kernel, netdev

On Wed, 21 Mar 2007 20:15:37 +0100
Willy Tarreau <w@1wt.eu> wrote:

> Hi Stephen,
> 
> On Wed, Mar 21, 2007 at 11:54:19AM -0700, Stephen Hemminger wrote:
> > On Tue, 13 Mar 2007 21:50:20 +0100
> > Willy Tarreau <w@1wt.eu> wrote:
> 
> [...] ( cut my boring part )
> 
> > > Here are the results classed by speed :
> > > 
> > > /* Sample output on a Pentium-M 600 MHz :
> > > 
> > > Function          clocks mean(us)  max(us)  std(us) Avg err size
> > > ncubic_tab0           79     0.66     7.20     1.04  0.613%  160
> > > ncubic_0div           84     0.70     7.64     1.57  4.521%  192
> > > ncubic_1div          178     1.48    16.27     1.81  0.443%  336
> > > ncubic_tab1          179     1.49    16.34     1.85  0.195%  320
> > > ncubic_ndiv3         263     2.18    24.04     3.59  0.250%  512
> > > ncubic_2div          270     2.24    24.70     2.77  0.187%  512
> > > ncubic32_1           359     2.98    32.81     3.59  0.238%  544
> > > ncubic_3div          361     2.99    33.08     3.79  0.170%  656
> > > ncubic32             364     3.02    33.29     3.51  0.247%  544
> > > ncubic               529     4.39    48.39     4.92  0.247%  720
> > > hcbrt                539     4.47    49.25     5.98  1.580%   96
> > > ocubic               732     4.93    61.83     7.22  0.274%  320
> > > acbrt                842     6.98    76.73     8.55  0.275%  192
> > > bictcp              1032     6.95    86.30     9.04  0.172%  768
> > > 
> 
> [...]
> 
> > The following version of div64_64 is faster because do_div already
> > optimized for the 32 bit case..
>

/* 64bit divisor, dividend and result. dynamic precision */
static uint64_t div64_64(uint64_t dividend, uint64_t divisor)
{
	uint32_t high, d;

	/* top 32 bits of the divisor */
	high = divisor >> 32;
	if (high) {
		/* divisor does not fit in 32 bits: shift both operands
		 * right so the divisor does, trading precision for being
		 * able to use do_div() (the result is approximate) */
		unsigned int shift = fls(high);

		d = divisor >> shift;
		dividend >>= shift;
	} else
		d = divisor;

	/* do_div() divides the 64-bit dividend in place by the 32-bit d;
	 * its return value (the remainder) is intentionally discarded */
	do_div(dividend, d);

	return dividend;
}



> Cool, this is interesting because I first wanted to optimize it but did
> not find how to start with this. You seem to get very good results. BTW,
> you did not append your changes.
> 
> However, one thing I do not understand is why your avg error is about 1/3
> below the original one. Was there a precision bug in the original div_64_64
> or did you extend the values used in the test ?
> 
> Or perhaps you used -fast-math to build and the original cbrt() is less
> precise in this case ?

No, but I did use -mtune=pentiumm on the ULV
-- 
Stephen Hemminger <shemminger@linux-foundation.org>

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [PATCH 1/2] div64_64 optimization
  2007-03-21 19:15                                           ` Willy Tarreau
  2007-03-21 19:58                                             ` Stephen Hemminger
@ 2007-03-21 20:15                                             ` Stephen Hemminger
  2007-03-21 20:17                                               ` [PATCH 2/2] tcp: cubic optimization Stephen Hemminger
  2007-03-22 19:11                                               ` [PATCH 1/2] div64_64 optimization David Miller
  1 sibling, 2 replies; 62+ messages in thread
From: Stephen Hemminger @ 2007-03-21 20:15 UTC (permalink / raw)
  To: Willy Tarreau, David Miller
  Cc: rkuhn, andi, dada1, jengelh, linux-kernel, netdev

Minor optimization of div64_64.  do_div() already does optimization
for the case of 32 by 32 divide, so no need to do it here.

Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>

--- net-2.6.22.orig/lib/div64.c	2007-03-21 12:03:59.000000000 -0700
+++ net-2.6.22/lib/div64.c	2007-03-21 12:04:46.000000000 -0700
@@ -61,20 +61,18 @@
 /* 64bit divisor, dividend and result. dynamic precision */
 uint64_t div64_64(uint64_t dividend, uint64_t divisor)
 {
-	uint32_t d = divisor;
+	uint32_t high, d;
 
-	if (divisor > 0xffffffffULL) {
-		unsigned int shift = fls(divisor >> 32);
+	high = divisor >> 32;
+	if (high) {
+		unsigned int shift = fls(high);
 
 		d = divisor >> shift;
 		dividend >>= shift;
-	}
+	} else
+		d = divisor;
 
-	/* avoid 64 bit division if possible */
-	if (dividend >> 32)
-		do_div(dividend, d);
-	else
-		dividend = (uint32_t) dividend / d;
+	do_div(dividend, d);
 
 	return dividend;
 }

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [PATCH 2/2] tcp: cubic optimization
  2007-03-21 20:15                                             ` [PATCH 1/2] div64_64 optimization Stephen Hemminger
@ 2007-03-21 20:17                                               ` Stephen Hemminger
  2007-03-22 19:11                                                 ` David Miller
  2007-03-22 19:11                                               ` [PATCH 1/2] div64_64 optimization David Miller
  1 sibling, 1 reply; 62+ messages in thread
From: Stephen Hemminger @ 2007-03-21 20:17 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Willy Tarreau, David Miller, rkuhn, andi, dada1, jengelh,
	linux-kernel, netdev

Use willy's work in optimizing cube root by having table for small values.

Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>

--- net-2.6.22.orig/net/ipv4/tcp_cubic.c	2007-03-21 12:57:11.000000000 -0700
+++ net-2.6.22/net/ipv4/tcp_cubic.c	2007-03-21 13:04:59.000000000 -0700
@@ -91,23 +91,51 @@
 		tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
 }
 
-/*
- * calculate the cubic root of x using Newton-Raphson
+/* calculate the cubic root of x using a table lookup followed by one
+ * Newton-Raphson iteration.
+ * Avg err ~= 0.195%
  */
 static u32 cubic_root(u64 a)
 {
-	u32 x;
-
-	/* Initial estimate is based on:
-	 * cbrt(x) = exp(log(x) / 3)
+	u32 x, b, shift;
+	/*
+	 * cbrt(x) MSB values for x MSB values in [0..63].
+	 * Precomputed then refined by hand - Willy Tarreau
+	 *
+	 * For x in [0..63],
+	 *   v = cbrt(x << 18) - 1
+	 *   cbrt(x) = (v[x] + 10) >> 6
 	 */
-	x = 1u << (fls64(a)/3);
+	static const u8 v[] = {
+		/* 0x00 */    0,   54,   54,   54,  118,  118,  118,  118,
+		/* 0x08 */  123,  129,  134,  138,  143,  147,  151,  156,
+		/* 0x10 */  157,  161,  164,  168,  170,  173,  176,  179,
+		/* 0x18 */  181,  185,  187,  190,  192,  194,  197,  199,
+		/* 0x20 */  200,  202,  204,  206,  209,  211,  213,  215,
+		/* 0x28 */  217,  219,  221,  222,  224,  225,  227,  229,
+		/* 0x30 */  231,  232,  234,  236,  237,  239,  240,  242,
+		/* 0x38 */  244,  245,  246,  248,  250,  251,  252,  254,
+	};
+
+	b = fls64(a);
+	if (b < 7) {
+		/* a in [0..63] */
+		return ((u32)v[(u32)a] + 35) >> 6;
+	}
+
+	b = ((b * 84) >> 8) - 1;
+	shift = (a >> (b * 3));
 
-	/* converges to 32 bits in 3 iterations */
-	x = (2 * x + (u32)div64_64(a, (u64)x*(u64)x)) / 3;
-	x = (2 * x + (u32)div64_64(a, (u64)x*(u64)x)) / 3;
-	x = (2 * x + (u32)div64_64(a, (u64)x*(u64)x)) / 3;
+	x = ((u32)(((u32)v[shift] + 10) << b)) >> 6;
 
+	/*
+	 * Newton-Raphson iteration
+	 *                         2
+	 * x    = ( 2 * x  +  a / x  ) / 3
+	 *  k+1          k         k
+	 */
+	x = (2 * x + (u32)div64_64(a, (u64)x * (u64)(x - 1)));
+	x = ((x * 341) >> 10);
 	return x;
 }
 


-- 
Stephen Hemminger <shemminger@linux-foundation.org>

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 1/2] div64_64 optimization
  2007-03-21 20:15                                             ` [PATCH 1/2] div64_64 optimization Stephen Hemminger
  2007-03-21 20:17                                               ` [PATCH 2/2] tcp: cubic optimization Stephen Hemminger
@ 2007-03-22 19:11                                               ` David Miller
  1 sibling, 0 replies; 62+ messages in thread
From: David Miller @ 2007-03-22 19:11 UTC (permalink / raw)
  To: shemminger; +Cc: w, rkuhn, andi, dada1, jengelh, linux-kernel, netdev

From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Wed, 21 Mar 2007 13:15:32 -0700

> Minor optimization of div64_64.  do_div() already does optimization
> for the case of 32 by 32 divide, so no need to do it here.
> 
> Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>

Applied to net-2.6.22

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 2/2] tcp: cubic optimization
  2007-03-21 20:17                                               ` [PATCH 2/2] tcp: cubic optimization Stephen Hemminger
@ 2007-03-22 19:11                                                 ` David Miller
  0 siblings, 0 replies; 62+ messages in thread
From: David Miller @ 2007-03-22 19:11 UTC (permalink / raw)
  To: shemminger; +Cc: w, rkuhn, andi, dada1, jengelh, linux-kernel, netdev

From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Wed, 21 Mar 2007 13:17:21 -0700

> Use willy's work in optimizing cube root by having a table for small values.
> 
> Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>

Also applied to net-2.6.22, thanks Stephen.

^ permalink raw reply	[flat|nested] 62+ messages in thread

end of thread, other threads:[~2007-03-22 19:11 UTC | newest]

Thread overview: 62+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-02-24  1:05 [RFC] div64_64 support Stephen Hemminger
2007-02-24 16:19 ` Sami Farin
2007-02-26 19:28   ` Stephen Hemminger
2007-02-26 19:39     ` David Miller
2007-02-26 20:09 ` Jan Engelhardt
2007-02-26 21:28   ` Stephen Hemminger
2007-02-27  1:20     ` H. Peter Anvin
2007-02-27  3:45       ` Segher Boessenkool
2007-02-26 22:31   ` Stephen Hemminger
2007-02-26 23:02     ` Jan Engelhardt
2007-02-26 23:44       ` Stephen Hemminger
2007-02-27  0:05         ` Jan Engelhardt
2007-02-27  0:07           ` Stephen Hemminger
2007-02-27  0:14             ` Jan Engelhardt
2007-02-27  6:21     ` Dan Williams
2007-03-03  2:31     ` Andi Kleen
2007-03-05 23:57       ` Stephen Hemminger
2007-03-06  0:25         ` David Miller
2007-03-06 13:36           ` Andi Kleen
2007-03-06 14:04           ` [RFC] div64_64 support II Andi Kleen
2007-03-06 17:43             ` Dagfinn Ilmari Mannsåker
2007-03-06 18:25               ` David Miller
2007-03-06 18:48             ` H. Peter Anvin
2007-03-06 13:34         ` [RFC] div64_64 support Andi Kleen
2007-03-06 14:19           ` Eric Dumazet
2007-03-06 14:45             ` Andi Kleen
2007-03-06 15:10               ` Roland Kuhn
2007-03-06 18:29                 ` Stephen Hemminger
2007-03-06 19:48                   ` Andi Kleen
2007-03-06 20:04                     ` Stephen Hemminger
2007-03-06 21:53                   ` Sami Farin
2007-03-06 22:24                     ` Sami Farin
2007-03-07  0:00                       ` Stephen Hemminger
2007-03-07  0:05                         ` David Miller
2007-03-07  0:05                         ` Sami Farin
2007-03-07 16:11                       ` Chuck Ebbert
2007-03-07 18:32                         ` Sami Farin
2007-03-08 18:23                       ` asm volatile [Was: [RFC] div64_64 support] Sami Farin
2007-03-08 22:01                         ` asm volatile David Miller
2007-03-06 21:58                   ` [RFC] div64_64 support David Miller
2007-03-06 22:47                     ` [PATCH] tcp_cubic: faster cube root Stephen Hemminger
2007-03-06 22:58                       ` cube root benchmark code Stephen Hemminger
2007-03-07  6:08                         ` Update to " Willy Tarreau
2007-03-08  1:07                           ` [PATCH] tcp_cubic: use 32 bit math Stephen Hemminger
2007-03-08  2:55                             ` David Miller
2007-03-08  3:10                               ` Stephen Hemminger
2007-03-08  3:51                                 ` David Miller
2007-03-10 11:48                                   ` Willy Tarreau
2007-03-12 21:11                                     ` Stephen Hemminger
2007-03-13 20:50                                       ` Willy Tarreau
2007-03-21 18:54                                         ` Stephen Hemminger
2007-03-21 19:15                                           ` Willy Tarreau
2007-03-21 19:58                                             ` Stephen Hemminger
2007-03-21 20:15                                             ` [PATCH 1/2] div64_64 optimization Stephen Hemminger
2007-03-21 20:17                                               ` [PATCH 2/2] tcp: cubic optimization Stephen Hemminger
2007-03-22 19:11                                                 ` David Miller
2007-03-22 19:11                                               ` [PATCH 1/2] div64_64 optimization David Miller
2007-03-08  4:16                                 ` [PATCH] tcp_cubic: use 32 bit math Willy Tarreau
2007-03-07  4:20                       ` [PATCH] tcp_cubic: faster cube root David Miller
2007-03-07 12:12                         ` Andi Kleen
2007-03-07 19:33                           ` David Miller
2007-03-06 18:50               ` [RFC] div64_64 support H. Peter Anvin

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.