linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] Remove some divide instructions
@ 2004-10-27 16:14 Zachary Amsden
  2004-10-27 16:28 ` Linus Torvalds
  0 siblings, 1 reply; 13+ messages in thread
From: Zachary Amsden @ 2004-10-27 16:14 UTC (permalink / raw)
  To: linux-kernel, Linus Torvalds, george

[-- Attachment #1: Type: text/plain, Size: 560 bytes --]

I noticed several 64-bit divides for HZ/USER_HZ, and also the fact that 
HZ == USER_HZ on many architectures (or if you play with scaling it ;).  
Since do_div is macroized to optimized assembler on many platforms, we 
emit long divides for divide by one.  This could be extended further to 
recognize other power of two divides, but I don't think the complexity 
of the macros would be justified.  I also didn't feel it was worthwhile 
to optimize this for non-constant divides; if you feel otherwise, please 
extend.

Cheers,

Zachary Amsden
zach@vmware.com

[-- Attachment #2: div64.patch --]
[-- Type: text/plain, Size: 6213 bytes --]

diff -ru linux-2.6.10-rc1-nsz/include/asm-arm/div64.h linux-2.6.10-rc1/include/asm-arm/div64.h
--- linux-2.6.10-rc1-nsz/include/asm-arm/div64.h	2004-10-25 10:53:12.000000000 -0700
+++ linux-2.6.10-rc1/include/asm-arm/div64.h	2004-10-27 08:29:36.000000000 -0700
@@ -28,6 +28,7 @@
 #endif
 
 #define do_div(n,base)						\
+((__builtin_constant_p(base) && ((base) == 1)) ? 0 :		\
 ({								\
 	register unsigned int __base      asm("r4") = base;	\
 	register unsigned long long __n   asm("r0") = n;	\
@@ -43,6 +44,6 @@
 		: "ip", "lr", "cc");				\
 	n = __res;						\
 	__rem;							\
-})
+}))
 
 #endif
diff -ru linux-2.6.10-rc1-nsz/include/asm-generic/div64.h linux-2.6.10-rc1/include/asm-generic/div64.h
--- linux-2.6.10-rc1-nsz/include/asm-generic/div64.h	2003-12-17 18:59:30.000000000 -0800
+++ linux-2.6.10-rc1/include/asm-generic/div64.h	2004-10-27 08:26:01.000000000 -0700
@@ -22,13 +22,14 @@
 
 #if BITS_PER_LONG == 64
 
-# define do_div(n,base) ({					\
-	uint32_t __base = (base);				\
+# define do_div(n,base) (					\
+	(__builtin_constant_p(base) && ((base) == 1)) ? 0 : ({	\
 	uint32_t __rem;						\
+	uint32_t __base = (base);				\
 	__rem = ((uint64_t)(n)) % __base;			\
 	(n) = ((uint64_t)(n)) / __base;				\
 	__rem;							\
- })
+ }))
 
 #elif BITS_PER_LONG == 32
 
@@ -37,17 +38,18 @@
 /* The unnecessary pointer compare is there
  * to check for type safety (n must be 64bit)
  */
-# define do_div(n,base) ({				\
-	uint32_t __base = (base);			\
-	uint32_t __rem;					\
-	(void)(((typeof((n)) *)0) == ((uint64_t *)0));	\
-	if (likely(((n) >> 32) == 0)) {			\
-		__rem = (uint32_t)(n) % __base;		\
-		(n) = (uint32_t)(n) / __base;		\
-	} else 						\
-		__rem = __div64_32(&(n), __base);	\
-	__rem;						\
- })
+# define do_div(n,base) (					\
+	(__builtin_constant_p(base) && ((base) == 1)) ? 0 : ({	\
+	uint32_t __base = (base);				\
+	uint32_t __rem;						\
+	(void)(((typeof((n)) *)0) == ((uint64_t *)0));		\
+	if (likely(((n) >> 32) == 0)) {				\
+		__rem = (uint32_t)(n) % __base;			\
+		(n) = (uint32_t)(n) / __base;			\
+	} else 							\
+		__rem = __div64_32(&(n), __base);		\
+	__rem;							\
+ }))
 
 #else /* BITS_PER_LONG == ?? */
 
diff -ru linux-2.6.10-rc1-nsz/include/asm-i386/div64.h linux-2.6.10-rc1/include/asm-i386/div64.h
--- linux-2.6.10-rc1-nsz/include/asm-i386/div64.h	2003-12-17 18:57:59.000000000 -0800
+++ linux-2.6.10-rc1/include/asm-i386/div64.h	2004-10-27 08:31:45.000000000 -0700
@@ -13,7 +13,8 @@
  * This ends up being the most efficient "calling
  * convention" on x86.
  */
-#define do_div(n,base) ({ \
+#define do_div(n,base) ( \
+	(__builtin_constant_p(base) && ((base) == 1)) ? 0 : ({ \
 	unsigned long __upper, __low, __high, __mod, __base; \
 	__base = (base); \
 	asm("":"=a" (__low), "=d" (__high):"A" (n)); \
@@ -25,7 +26,7 @@
 	asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), "1" (__upper)); \
 	asm("":"=A" (n):"a" (__low),"d" (__high)); \
 	__mod; \
-})
+}))
 
 /*
  * (long)X = ((long long)divs) / (long)div
diff -ru linux-2.6.10-rc1-nsz/include/asm-m32r/div64.h linux-2.6.10-rc1/include/asm-m32r/div64.h
--- linux-2.6.10-rc1-nsz/include/asm-m32r/div64.h	2004-10-25 11:15:58.000000000 -0700
+++ linux-2.6.10-rc1/include/asm-m32r/div64.h	2004-10-27 08:21:53.000000000 -0700
@@ -12,6 +12,7 @@
  *  return value = n % base;
  */
 #define do_div(n, base)						\
+((__builtin_constant_p(base) && ((base) == 1)) ? 0 : 		\
 ({								\
 	unsigned long _res, _high, _mid, _low;			\
 								\
@@ -33,6 +34,6 @@
 		n = (_low / (unsigned long)(base));		\
 	}							\
 	_res;							\
-})
+}))
 
 #endif  /* _ASM_M32R_DIV64 */
diff -ru linux-2.6.10-rc1-nsz/include/asm-m68k/div64.h linux-2.6.10-rc1/include/asm-m68k/div64.h
--- linux-2.6.10-rc1-nsz/include/asm-m68k/div64.h	2003-12-17 18:59:37.000000000 -0800
+++ linux-2.6.10-rc1/include/asm-m68k/div64.h	2004-10-27 08:30:55.000000000 -0700
@@ -3,7 +3,8 @@
 
 /* n = n / base; return rem; */
 
-#define do_div(n, base) ({					\
+#define do_div(n, base) (					\
+(__builtin_constant_p(base) && ((base) == 1)) ? 0 : ({		\
 	union {							\
 		unsigned long n32[2];				\
 		unsigned long long n64;				\
@@ -21,6 +22,6 @@
 		: "d" (base), "1" (__upper), "0" (__n.n32[1]));	\
 	(n) = __n.n64;						\
 	__rem;							\
-})
+}))
 
 #endif /* _M68K_DIV64_H */
diff -ru linux-2.6.10-rc1-nsz/include/asm-mips/div64.h linux-2.6.10-rc1/include/asm-mips/div64.h
--- linux-2.6.10-rc1-nsz/include/asm-mips/div64.h	2003-12-17 18:59:06.000000000 -0800
+++ linux-2.6.10-rc1/include/asm-mips/div64.h	2004-10-27 08:25:29.000000000 -0700
@@ -51,7 +51,8 @@
 	(res) = __quot; \
 	__mod; })
 
-#define do_div(n, base) ({ \
+#define do_div(n, base) ( \
+	(__builtin_constant_p(base) && ((base) == 1)) ? 0 : ({ \
 	unsigned long long __quot; \
 	unsigned long __mod; \
 	unsigned long long __div; \
@@ -74,7 +75,7 @@
 	__quot = __high; \
 	__quot = __quot << 32 | __low; \
 	(n) = __quot; \
-	__mod; })
+	__mod; }))
 #endif /* (_MIPS_SZLONG == 32) */
 
 #if (_MIPS_SZLONG == 64)
@@ -104,7 +105,8 @@
  * Hey, we're already 64-bit, no
  * need to play games..
  */
-#define do_div(n, base) ({ \
+#define do_div(n, base) ( \
+	(__builtin_constant_p(base) && ((base) == 1)) ? 0 : ({ \
 	unsigned long __quot; \
 	unsigned int __mod; \
 	unsigned long __div; \
@@ -117,7 +119,7 @@
 	__quot = __div / __base; \
 	\
 	(n) = __quot; \
-	__mod; })
+	__mod; }))
 
 #endif /* (_MIPS_SZLONG == 64) */
 
diff -ru linux-2.6.10-rc1-nsz/include/asm-s390/div64.h linux-2.6.10-rc1/include/asm-s390/div64.h
--- linux-2.6.10-rc1-nsz/include/asm-s390/div64.h	2004-10-25 10:50:43.000000000 -0700
+++ linux-2.6.10-rc1/include/asm-s390/div64.h	2004-10-27 08:27:34.000000000 -0700
@@ -4,7 +4,8 @@
 #ifndef __s390x__
 
 /* for do_div "base" needs to be smaller than 2^31-1 */
-#define do_div(n, base) ({                                      \
+#define do_div(n, base) (					\
+	(__builtin_constant_p(base) && ((base) == 1)) ? 0 : ({	\
 	unsigned long long __n = (n);				\
 	unsigned long __r;					\
 								\
@@ -40,7 +41,7 @@
 	     : "d" (base), "m" (__n) : "0", "1", "2", "cc" );	\
 	(n) = (__n);						\
         __r;                                                    \
-})
+}))
 
 #else /* __s390x__ */
 #include <asm-generic/div64.h>

[-- Attachment #3: README.div64 --]
[-- Type: text/plain, Size: 484 bytes --]


div64.patch :

Get rid of 64-bit constant divide by one.  This appears to be a common case
for HZ == USER_HZ.  I tested the patch on older 2.6 kernels and was able to
produce some harmless warnings (statement has no effect), but it builds clean
for i386 with a 2.6.10 kernel.  I tested the generic asm inline by extracting
to gcc, but I have not tested any other platforms.

Doubtful, but if this breaks your build for some other platform, send me mail.

Zach Amsden
zach@vmware.com

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] Remove some divide instructions
  2004-10-27 16:14 [PATCH] Remove some divide instructions Zachary Amsden
@ 2004-10-27 16:28 ` Linus Torvalds
  2004-10-27 18:05   ` Zachary Amsden
                     ` (3 more replies)
  0 siblings, 4 replies; 13+ messages in thread
From: Linus Torvalds @ 2004-10-27 16:28 UTC (permalink / raw)
  To: Zachary Amsden; +Cc: linux-kernel, george



On Wed, 27 Oct 2004, Zachary Amsden wrote:
>
> I noticed several 64-bit divides for HZ/USER_HZ, and also the fact that 
> HZ == USER_HZ on many architectures (or if you play with scaling it ;).  
> Since do_div is macroized to optimized assembler on many platforms, we 
> emit long divides for divide by one.  This could be extended further to 
> recognize other power of two divides, but I don't think the complexity 
> of the macros would be justified.  I also didn't feel it was worthwhile 
> to optimize this for non-constant divides; if you feel otherwise, please 
> extend.

Can you test out the trivial extension, which is to allow any power-of-two
constant, and just leave it as a divide in those cases (and let the
compiler turn it into a 64-bit shift)?

It's very easy to test for powers of two: !((x) & ((x)-1)) does it for 
everything but zero, and zero in this case is an error anyway.

		Linus

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] Remove some divide instructions
  2004-10-27 16:28 ` Linus Torvalds
@ 2004-10-27 18:05   ` Zachary Amsden
  2004-10-27 20:16   ` Zachary Amsden
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 13+ messages in thread
From: Zachary Amsden @ 2004-10-27 18:05 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1844 bytes --]

Tested code generation (gcc (GCC) 3.2.2 20030222) and the optimization 
for powers of two works.  Reassigning base to a register causes the 
builtin_constant check to be dropped, but since these are constants 
anyway we can reference base without side effects.  I didn't extract n 
into a register to avoid side effects if it was not already done, since 
there are several possible places where the macro can generate side 
effects for (n) and even a comment to this effect.

This does the right thing for at least i386 and the generic code, and 
generates the expected shifts and masks for power of two divides.

I'm compiling and booting a kernel now to make sure it really works, but 
the assembler looks great.  Apologies if I broke any other 
architectures, but it's quite difficult to test them all, and the non-ws 
code change is minimal.

Zachary Amsden
zach@vmware.com

Linus Torvalds wrote:

>On Wed, 27 Oct 2004, Zachary Amsden wrote:
>  
>
>>I noticed several 64-bit divides for HZ/USER_HZ, and also the fact that 
>>HZ == USER_HZ on many architectures (or if you play with scaling it ;).  
>>Since do_div is macroized to optimized assembler on many platforms, we 
>>emit long divides for divide by one.  This could be extended further to 
>>recognize other power of two divides, but I don't think the complexity 
>>of the macros would be justified.  I also didn't feel it was worthwhile 
>>to optimize this for non-constant divides; if you feel otherwise, please 
>>extend.
>>    
>>
>
>Can you test out the trivial extension, which is to allow any power-of-two
>constant, and just leave it as a divide in those cases (and let the
>compiler turn it into a 64-bit shift)?
>
>It's very easy to test for powers of two: !((x) & ((x)-1)) does it for 
>everything but zero, and zero in this case is an error anyway.
>
>		Linus
>  
>


[-- Attachment #2: div64-2.patch --]
[-- Type: text/plain, Size: 13383 bytes --]

diff -ru linux-2.6.10-rc1/include/asm-arm/div64.h linux-2.6.10-rc1-nsz/include/asm-arm/div64.h
--- linux-2.6.10-rc1/include/asm-arm/div64.h	2004-10-27 10:41:40.000000000 -0700
+++ linux-2.6.10-rc1-nsz/include/asm-arm/div64.h	2004-10-27 10:24:25.000000000 -0700
@@ -27,22 +27,27 @@
 #define __xh "r1"
 #endif
 
-#define do_div(n,base)						\
-({								\
-	register unsigned int __base      asm("r4") = base;	\
-	register unsigned long long __n   asm("r0") = n;	\
-	register unsigned long long __res asm("r2");		\
-	register unsigned int __rem       asm(__xh);		\
-	asm(	__asmeq("%0", __xh)				\
-		__asmeq("%1", "r2")				\
-		__asmeq("%2", "r0")				\
-		__asmeq("%3", "r4")				\
-		"bl	__do_div64"				\
-		: "=r" (__rem), "=r" (__res)			\
-		: "r" (__n), "r" (__base)			\
-		: "ip", "lr", "cc");				\
-	n = __res;						\
-	__rem;							\
+#define do_div(n,base)							\
+({									\
+	register unsigned long long __n   asm("r0") = n;		\
+	register unsigned long long __res asm("r2");			\
+	register unsigned int __rem       asm(__xh);			\
+	if (__builtin_constant_p(base) && !((base) & ((base)-1))) {	\
+		__rem = __n % (base);					\
+		__res = __n / (base);					\
+	} else {							\
+		register unsigned int __base      asm("r4") = base;	\
+		asm(	__asmeq("%0", __xh)				\
+			__asmeq("%1", "r2")				\
+			__asmeq("%2", "r0")				\
+			__asmeq("%3", "r4")				\
+			"bl	__do_div64"				\
+			: "=r" (__rem), "=r" (__res)			\
+			: "r" (__n), "r" (__base)			\
+			: "ip", "lr", "cc");				\
+	}								\
+	n = __res;							\
+	__rem;								\
 })
 
 #endif
diff -ru linux-2.6.10-rc1/include/asm-generic/div64.h linux-2.6.10-rc1-nsz/include/asm-generic/div64.h
--- linux-2.6.10-rc1/include/asm-generic/div64.h	2004-10-27 10:41:40.000000000 -0700
+++ linux-2.6.10-rc1-nsz/include/asm-generic/div64.h	2004-10-27 10:24:06.000000000 -0700
@@ -22,12 +22,17 @@
 
 #if BITS_PER_LONG == 64
 
-# define do_div(n,base) ({					\
-	uint32_t __base = (base);				\
-	uint32_t __rem;						\
-	__rem = ((uint64_t)(n)) % __base;			\
-	(n) = ((uint64_t)(n)) / __base;				\
-	__rem;							\
+# define do_div(n,base) ({						\
+	uint32_t __rem;							\
+	if (__builtin_constant_p(base) && !((base) & ((base)-1))) {	\
+		__rem = ((uint64_t)(n)) % (base);			\
+		(n) = ((uint64_t)(n)) / (base);				\
+	} else {							\
+		uint32_t __base = (base);				\
+		__rem = ((uint64_t)(n)) % __base;			\
+		(n) = ((uint64_t)(n)) / __base;				\
+	}								\
+	__rem;								\
  })
 
 #elif BITS_PER_LONG == 32
@@ -37,16 +42,21 @@
 /* The unnecessary pointer compare is there
  * to check for type safety (n must be 64bit)
  */
-# define do_div(n,base) ({				\
-	uint32_t __base = (base);			\
-	uint32_t __rem;					\
-	(void)(((typeof((n)) *)0) == ((uint64_t *)0));	\
-	if (likely(((n) >> 32) == 0)) {			\
-		__rem = (uint32_t)(n) % __base;		\
-		(n) = (uint32_t)(n) / __base;		\
-	} else 						\
-		__rem = __div64_32(&(n), __base);	\
-	__rem;						\
+# define do_div(n,base) ({						\
+	uint32_t __rem;						 	\
+	(void)(((typeof((n)) *)0) == ((uint64_t *)0));			\
+	if (__builtin_constant_p(base) && !((base) & ((base)-1))) {     \
+		__rem = ((uint64_t)(n)) % (base);			\
+		(n) = ((uint64_t)(n)) / (base);				\
+	} else {							\
+		uint32_t __base = (base);				\
+		if (likely(((n) >> 32) == 0)) {				\
+			__rem = (uint32_t)(n) % __base;			\
+			(n) = (uint32_t)(n) / __base;			\
+		} else 							\
+			__rem = __div64_32(&(n), __base);		\
+	}								\
+	__rem;								\
  })
 
 #else /* BITS_PER_LONG == ?? */
diff -ru linux-2.6.10-rc1/include/asm-i386/div64.h linux-2.6.10-rc1-nsz/include/asm-i386/div64.h
--- linux-2.6.10-rc1/include/asm-i386/div64.h	2004-10-27 10:41:40.000000000 -0700
+++ linux-2.6.10-rc1-nsz/include/asm-i386/div64.h	2004-10-27 10:15:04.000000000 -0700
@@ -14,16 +14,23 @@
  * convention" on x86.
  */
 #define do_div(n,base) ({ \
-	unsigned long __upper, __low, __high, __mod, __base; \
-	__base = (base); \
-	asm("":"=a" (__low), "=d" (__high):"A" (n)); \
-	__upper = __high; \
-	if (__high) { \
-		__upper = __high % (__base); \
-		__high = __high / (__base); \
+	unsigned long __mod; \
+	if (__builtin_constant_p(base) && !((base) & ((base)-1))) { \
+		__mod = ((uint64_t)(n)) % base;	\
+		(n) = ((uint64_t)(n)) / base; \
+	} else { \
+		unsigned long __upper, __low, __high, __base; \
+		__base = (base); \
+		asm("":"=a" (__low), "=d" (__high):"A" (n)); \
+		__upper = __high; \
+		if (__high) { \
+			__upper = __high % (__base); \
+			__high = __high / (__base); \
+		} \
+		asm("divl %2":	"=a" (__low), "=d" (__mod) : \
+				"rm" (__base), "0" (__low), "1" (__upper)); \
+		asm("":"=A" (n):"a" (__low),"d" (__high)); \
 	} \
-	asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), "1" (__upper)); \
-	asm("":"=A" (n):"a" (__low),"d" (__high)); \
 	__mod; \
 })
 
diff -ru linux-2.6.10-rc1/include/asm-m32r/div64.h linux-2.6.10-rc1-nsz/include/asm-m32r/div64.h
--- linux-2.6.10-rc1/include/asm-m32r/div64.h	2004-10-27 10:41:40.000000000 -0700
+++ linux-2.6.10-rc1-nsz/include/asm-m32r/div64.h	2004-10-27 10:23:37.000000000 -0700
@@ -11,28 +11,33 @@
  *  n = n / base;
  *  return value = n % base;
  */
-#define do_div(n, base)						\
-({								\
-	unsigned long _res, _high, _mid, _low;			\
-								\
-	_low = (n) & 0xffffffffUL;				\
-	_high = (n) >> 32;					\
-	if (_high) {						\
-		_mid = (_high % (unsigned long)(base)) << 16;	\
-		_high = _high / (unsigned long)(base);		\
-		_mid += _low >> 16;				\
-		_low &= 0x0000ffffUL;				\
-		_low += (_mid % (unsigned long)(base)) << 16;	\
-		_mid = _mid / (unsigned long)(base);		\
-		_res = _low % (unsigned long)(base);		\
-		_low = _low / (unsigned long)(base);		\
-		n = _low + ((long long)_mid << 16) +		\
-			((long long)_high << 32);		\
-	} else {						\
-		_res = _low % (unsigned long)(base);		\
-		n = (_low / (unsigned long)(base));		\
-	}							\
-	_res;							\
+#define do_div(n, base)							\
+({									\
+	unsigned long _res;						\
+	if (__builtin_constant_p(base) && !((base) & ((base)-1))) {	\
+		_res = ((uint64_t)(n)) % (base);			\
+		(n) = ((uint64_t)(n)) / (base);				\
+	} else {							\
+		unsigned long _high, _mid, _low;			\
+		_low = (n) & 0xffffffffUL;				\
+		_high = (n) >> 32;					\
+		if (_high) {						\
+			_mid = (_high % (unsigned long)(base)) << 16;	\
+			_high = _high / (unsigned long)(base);		\
+			_mid += _low >> 16;				\
+			_low &= 0x0000ffffUL;				\
+			_low += (_mid % (unsigned long)(base)) << 16;	\
+			_mid = _mid / (unsigned long)(base);		\
+			_res = _low % (unsigned long)(base);		\
+			_low = _low / (unsigned long)(base);		\
+			n = _low + ((long long)_mid << 16) +		\
+				((long long)_high << 32);		\
+		} else {						\
+			_res = _low % (unsigned long)(base);		\
+			n = (_low / (unsigned long)(base));		\
+		}							\
+	}								\
+	_res;								\
 })
 
 #endif  /* _ASM_M32R_DIV64 */
diff -ru linux-2.6.10-rc1/include/asm-m68k/div64.h linux-2.6.10-rc1-nsz/include/asm-m68k/div64.h
--- linux-2.6.10-rc1/include/asm-m68k/div64.h	2004-10-27 10:41:40.000000000 -0700
+++ linux-2.6.10-rc1-nsz/include/asm-m68k/div64.h	2004-10-27 10:23:23.000000000 -0700
@@ -3,24 +3,30 @@
 
 /* n = n / base; return rem; */
 
-#define do_div(n, base) ({					\
-	union {							\
-		unsigned long n32[2];				\
-		unsigned long long n64;				\
-	} __n;							\
-	unsigned long __rem, __upper;				\
-								\
-	__n.n64 = (n);						\
-	if ((__upper = __n.n32[0])) {				\
-		asm ("divul.l %2,%1:%0"				\
-			: "=d" (__n.n32[0]), "=d" (__upper)	\
-			: "d" (base), "0" (__n.n32[0]));	\
-	}							\
-	asm ("divu.l %2,%1:%0"					\
-		: "=d" (__n.n32[1]), "=d" (__rem)		\
-		: "d" (base), "1" (__upper), "0" (__n.n32[1]));	\
-	(n) = __n.n64;						\
-	__rem;							\
-})
+#define do_div(n, base) ({						\
+	unsigned long __rem;						\
+	if (__builtin_constant_p(base) && !((base) & ((base)-1))) {	\
+		__rem = ((uint64_t)(n)) % (base);			\
+		(n) = ((uint64_t)(n)) / (base);				\
+	} else {							\
+		union {							\
+			unsigned long n32[2];				\
+			unsigned long long n64;				\
+		} __n;							\
+		unsigned long __upper;					\
+									\
+		__n.n64 = (n);						\
+		if ((__upper = __n.n32[0])) {				\
+			asm ("divul.l %2,%1:%0"				\
+				: "=d" (__n.n32[0]), "=d" (__upper)	\
+				: "d" (base), "0" (__n.n32[0]));	\
+		}							\
+		asm ("divu.l %2,%1:%0"					\
+			: "=d" (__n.n32[1]), "=d" (__rem)		\
+			: "d" (base), "1" (__upper), "0" (__n.n32[1]));	\
+		(n) = __n.n64;						\
+	}								\
+	__rem;								\
+})
 
 #endif /* _M68K_DIV64_H */
diff -ru linux-2.6.10-rc1/include/asm-mips/div64.h linux-2.6.10-rc1-nsz/include/asm-mips/div64.h
--- linux-2.6.10-rc1/include/asm-mips/div64.h	2004-10-27 10:41:40.000000000 -0700
+++ linux-2.6.10-rc1-nsz/include/asm-mips/div64.h	2004-10-27 10:34:01.000000000 -0700
@@ -52,27 +52,32 @@
 	__mod; })
 
 #define do_div(n, base) ({ \
+	unsigned long long __div; \
 	unsigned long long __quot; \
 	unsigned long __mod; \
-	unsigned long long __div; \
-	unsigned long __upper, __low, __high, __base; \
 	\
 	__div = (n); \
-	__base = (base); \
-	\
-	__high = __div >> 32; \
-	__low = __div; \
-	__upper = __high; \
-	\
-	if (__high) \
-		__asm__("divu	$0, %z2, %z3" \
-			: "=h" (__upper), "=l" (__high) \
-			: "Jr" (__high), "Jr" (__base)); \
-	\
-	__mod = do_div64_32(__low, __upper, __low, __base); \
-	\
-	__quot = __high; \
-	__quot = __quot << 32 | __low; \
+	if (__builtin_constant_p(base) && !((base) & ((base)-1))) { \
+		__mod = __div % (base); \
+		__quot = __div / (base); \
+	} else { \
+		unsigned long __upper, __low, __high, __base; \
+		\
+		__base = (base); \
+		__high = __div >> 32; \
+		__low = __div; \
+		__upper = __high; \
+		\
+		if (__high) \
+			__asm__("divu	$0, %z2, %z3" \
+				: "=h" (__upper), "=l" (__high) \
+				: "Jr" (__high), "Jr" (__base)); \
+		\
+		__mod = do_div64_32(__low, __upper, __low, __base); \
+		\
+		__quot = __high; \
+		__quot = __quot << 32 | __low; \
+	} \
 	(n) = __quot; \
 	__mod; })
 #endif /* (_MIPS_SZLONG == 32) */
@@ -104,18 +109,22 @@
  * Hey, we're already 64-bit, no
  * need to play games..
  */
-#define do_div(n, base) ({ \
+#define do_div(n, base) ({ \
+	unsigned long __div; \
 	unsigned long __quot; \
 	unsigned int __mod; \
-	unsigned long __div; \
-	unsigned int __base; \
 	\
 	__div = (n); \
-	__base = (base); \
-	\
-	__mod = __div % __base; \
-	__quot = __div / __base; \
-	\
+	if (__builtin_constant_p(base) && !((base) & ((base)-1))) { \
+		__mod = __div % (base); \
+		__quot = __div / (base); \
+	} else { \
+		unsigned int __base; \
+		\
+		__base = (base); \
+		__mod = __div % __base; \
+		__quot = __div / __base; \
+	} \
 	(n) = __quot; \
 	__mod; })
 
diff -ru linux-2.6.10-rc1/include/asm-s390/div64.h linux-2.6.10-rc1-nsz/include/asm-s390/div64.h
--- linux-2.6.10-rc1/include/asm-s390/div64.h	2004-10-27 10:42:32.000000000 -0700
+++ linux-2.6.10-rc1-nsz/include/asm-s390/div64.h	2004-10-27 10:43:06.000000000 -0700
@@ -4,42 +4,47 @@
 #ifndef __s390x__
 
 /* for do_div "base" needs to be smaller than 2^31-1 */
-#define do_div(n, base) (					\
-	unsigned long long __n = (n);				\
-	unsigned long __r;					\
-								\
-	asm ("   slr  0,0\n"					\
-	     "   l    1,%1\n"					\
-	     "   srdl 0,1\n"					\
-	     "   dr   0,%2\n"					\
-	     "   alr  1,1\n"					\
-	     "   alr  0,0\n"					\
-	     "   lhi  2,1\n"					\
-	     "   n    2,%1\n"					\
-	     "   alr  0,2\n"					\
-	     "   clr  0,%2\n"					\
-	     "   jl   0f\n"					\
-	     "   slr  0,%2\n"					\
-             "   ahi  1,1\n"					\
-	     "0: st   1,%1\n"					\
-	     "   l    1,4+%1\n"					\
-	     "   srdl 0,1\n"					\
-             "   dr   0,%2\n"					\
-	     "   alr  1,1\n"					\
-	     "   alr  0,0\n"					\
-	     "   lhi  2,1\n"					\
-	     "   n    2,4+%1\n"					\
-	     "   alr  0,2\n"					\
-	     "   clr  0,%2\n"					\
-             "   jl   1f\n"					\
-	     "   slr  0,%2\n"					\
-	     "   ahi  1,1\n"					\
-	     "1: st   1,4+%1\n"					\
-             "   lr   %0,0"					\
-	     : "=d" (__r), "=m" (__n)				\
-	     : "d" (base), "m" (__n) : "0", "1", "2", "cc" );	\
-	(n) = (__n);						\
-        __r;                                                    \
+#define do_div(n, base) ({						\
+	unsigned long long __n = (n);					\
+	unsigned long __r;						\
+									\
+	if (__builtin_constant_p(base) && !((base) & ((base)-1))) {	\
+		__r = __n % (base);					\
+		__n = __n / (base);					\
+	} else {							\
+		asm ("   slr  0,0\n"					\
+		     "   l    1,%1\n"					\
+		     "   srdl 0,1\n"					\
+		     "   dr   0,%2\n"					\
+		     "   alr  1,1\n"					\
+		     "   alr  0,0\n"					\
+		     "   lhi  2,1\n"					\
+		     "   n    2,%1\n"					\
+		     "   alr  0,2\n"					\
+		     "   clr  0,%2\n"					\
+		     "   jl   0f\n"					\
+		     "   slr  0,%2\n"					\
+		     "   ahi  1,1\n"					\
+		     "0: st   1,%1\n"					\
+		     "   l    1,4+%1\n"					\
+		     "   srdl 0,1\n"					\
+		     "   dr   0,%2\n"					\
+		     "   alr  1,1\n"					\
+		     "   alr  0,0\n"					\
+		     "   lhi  2,1\n"					\
+		     "   n    2,4+%1\n"					\
+		     "   alr  0,2\n"					\
+		     "   clr  0,%2\n"					\
+		     "   jl   1f\n"					\
+		     "   slr  0,%2\n"					\
+		     "   ahi  1,1\n"					\
+		     "1: st   1,4+%1\n"					\
+		     "   lr   %0,0"					\
+		     : "=d" (__r), "=m" (__n)				\
+		     : "d" (base), "m" (__n) : "0", "1", "2", "cc" );	\
+	}								\
+	(n) = (__n);							\
+	__r;								\
 })
 
 #else /* __s390x__ */

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] Remove some divide instructions
  2004-10-27 16:28 ` Linus Torvalds
  2004-10-27 18:05   ` Zachary Amsden
@ 2004-10-27 20:16   ` Zachary Amsden
  2004-10-27 21:24     ` Linus Torvalds
  2004-10-27 22:08   ` Thayne Harbaugh
  2004-10-27 22:14   ` Zachary Amsden
  3 siblings, 1 reply; 13+ messages in thread
From: Zachary Amsden @ 2004-10-27 20:16 UTC (permalink / raw)
  To: linux-kernel; +Cc: Linus Torvalds

[-- Attachment #1: Type: text/plain, Size: 1844 bytes --]

Apparently cc-ing linux-kernel is not good enough

Tested code generation (gcc (GCC) 3.2.2 20030222) and the optimization 
for powers of two works.  Reassigning base to a register causes the 
optimization to be dropped, but since these are constants anyway we can 
reference base without side effects.  I didn't extract n into a register 
to avoid side effects if it was not already done, since there are 
several possible places where the macro can generate side effects for 
(n) and even a comment to this effect.

This does the right thing for at least i386 and the generic code, and 
generates the expected shifts and masks for power of two divides.

Passes several tests (code looks sane, assembler looks sane, boots & 
runs fine) on i386.  Apologies if I inadvertently broke any other 
architectures, but it's quite difficult to test them all.

Zachary Amsden
zach@vmware.com

Linus Torvalds wrote:

>On Wed, 27 Oct 2004, Zachary Amsden wrote:
>  
>
>>I noticed several 64-bit divides for HZ/USER_HZ, and also the fact that 
>>HZ == USER_HZ on many architectures (or if you play with scaling it ;).  
>>Since do_div is macroized to optimized assembler on many platforms, we 
>>emit long divides for divide by one.  This could be extended further to 
>>recognize other power of two divides, but I don't think the complexity 
>>of the macros would be justified.  I also didn't feel it was worthwhile 
>>to optimize this for non-constant divides; if you feel otherwise, please 
>>extend.
>>    
>>
>
>Can you test out the trivial extension, which is to allow any power-of-two
>constant, and just leave it as a divide in those cases (and let the
>compiler turn it into a 64-bit shift)?
>
>It's very easy to test for powers of two: !((x) & ((x)-1)) does it for 
>everything but zero, and zero in this case is an error anyway.
>
>		Linus
>  
>


[-- Attachment #2: div64-2.patch --]
[-- Type: text/plain, Size: 13383 bytes --]

diff -ru linux-2.6.10-rc1/include/asm-arm/div64.h linux-2.6.10-rc1-nsz/include/asm-arm/div64.h
--- linux-2.6.10-rc1/include/asm-arm/div64.h	2004-10-27 10:41:40.000000000 -0700
+++ linux-2.6.10-rc1-nsz/include/asm-arm/div64.h	2004-10-27 10:24:25.000000000 -0700
@@ -27,22 +27,27 @@
 #define __xh "r1"
 #endif
 
-#define do_div(n,base)						\
-({								\
-	register unsigned int __base      asm("r4") = base;	\
-	register unsigned long long __n   asm("r0") = n;	\
-	register unsigned long long __res asm("r2");		\
-	register unsigned int __rem       asm(__xh);		\
-	asm(	__asmeq("%0", __xh)				\
-		__asmeq("%1", "r2")				\
-		__asmeq("%2", "r0")				\
-		__asmeq("%3", "r4")				\
-		"bl	__do_div64"				\
-		: "=r" (__rem), "=r" (__res)			\
-		: "r" (__n), "r" (__base)			\
-		: "ip", "lr", "cc");				\
-	n = __res;						\
-	__rem;							\
+#define do_div(n,base)							\
+({									\
+	register unsigned long long __n   asm("r0") = n;		\
+	register unsigned long long __res asm("r2");			\
+	register unsigned int __rem       asm(__xh);			\
+	if (__builtin_constant_p(base) && !((base) & ((base)-1))) {	\
+		__rem = __n % (base);					\
+		__res = __n / (base);					\
+	} else {							\
+		register unsigned int __base      asm("r4") = base;	\
+		asm(	__asmeq("%0", __xh)				\
+			__asmeq("%1", "r2")				\
+			__asmeq("%2", "r0")				\
+			__asmeq("%3", "r4")				\
+			"bl	__do_div64"				\
+			: "=r" (__rem), "=r" (__res)			\
+			: "r" (__n), "r" (__base)			\
+			: "ip", "lr", "cc");				\
+	}								\
+	n = __res;							\
+	__rem;								\
 })
 
 #endif
diff -ru linux-2.6.10-rc1/include/asm-generic/div64.h linux-2.6.10-rc1-nsz/include/asm-generic/div64.h
--- linux-2.6.10-rc1/include/asm-generic/div64.h	2004-10-27 10:41:40.000000000 -0700
+++ linux-2.6.10-rc1-nsz/include/asm-generic/div64.h	2004-10-27 10:24:06.000000000 -0700
@@ -22,12 +22,17 @@
 
 #if BITS_PER_LONG == 64
 
-# define do_div(n,base) ({					\
-	uint32_t __base = (base);				\
-	uint32_t __rem;						\
-	__rem = ((uint64_t)(n)) % __base;			\
-	(n) = ((uint64_t)(n)) / __base;				\
-	__rem;							\
+# define do_div(n,base) ({						\
+	uint32_t __rem;							\
+	if (__builtin_constant_p(base) && !((base) & ((base)-1))) {	\
+		__rem = ((uint64_t)(n)) % (base);			\
+		(n) = ((uint64_t)(n)) / (base);				\
+	} else {							\
+		uint32_t __base = (base);				\
+		__rem = ((uint64_t)(n)) % __base;			\
+		(n) = ((uint64_t)(n)) / __base;				\
+	}								\
+	__rem;								\
  })
 
 #elif BITS_PER_LONG == 32
@@ -37,16 +42,21 @@
 /* The unnecessary pointer compare is there
  * to check for type safety (n must be 64bit)
  */
-# define do_div(n,base) ({				\
-	uint32_t __base = (base);			\
-	uint32_t __rem;					\
-	(void)(((typeof((n)) *)0) == ((uint64_t *)0));	\
-	if (likely(((n) >> 32) == 0)) {			\
-		__rem = (uint32_t)(n) % __base;		\
-		(n) = (uint32_t)(n) / __base;		\
-	} else 						\
-		__rem = __div64_32(&(n), __base);	\
-	__rem;						\
+# define do_div(n,base) ({						\
+	uint32_t __rem;						 	\
+	(void)(((typeof((n)) *)0) == ((uint64_t *)0));			\
+	if (__builtin_constant_p(base) && !((base) & ((base)-1))) {     \
+		__rem = ((uint64_t)(n)) % (base);			\
+		(n) = ((uint64_t)(n)) / (base);				\
+	} else {							\
+		uint32_t __base = (base);				\
+		if (likely(((n) >> 32) == 0)) {				\
+			__rem = (uint32_t)(n) % __base;			\
+			(n) = (uint32_t)(n) / __base;			\
+		} else 							\
+			__rem = __div64_32(&(n), __base);		\
+	}								\
+	__rem;								\
  })
 
 #else /* BITS_PER_LONG == ?? */
diff -ru linux-2.6.10-rc1/include/asm-i386/div64.h linux-2.6.10-rc1-nsz/include/asm-i386/div64.h
--- linux-2.6.10-rc1/include/asm-i386/div64.h	2004-10-27 10:41:40.000000000 -0700
+++ linux-2.6.10-rc1-nsz/include/asm-i386/div64.h	2004-10-27 10:15:04.000000000 -0700
@@ -14,16 +14,23 @@
  * convention" on x86.
  */
 #define do_div(n,base) ({ \
-	unsigned long __upper, __low, __high, __mod, __base; \
-	__base = (base); \
-	asm("":"=a" (__low), "=d" (__high):"A" (n)); \
-	__upper = __high; \
-	if (__high) { \
-		__upper = __high % (__base); \
-		__high = __high / (__base); \
+	unsigned long __mod; \
+	if (__builtin_constant_p(base) && !((base) & ((base)-1))) { \
+		__mod = ((uint64_t)(n)) % base;	\
+		(n) = ((uint64_t)(n)) / base; \
+	} else { \
+		unsigned long __upper, __low, __high, __base; \
+		__base = (base); \
+		asm("":"=a" (__low), "=d" (__high):"A" (n)); \
+		__upper = __high; \
+		if (__high) { \
+			__upper = __high % (__base); \
+			__high = __high / (__base); \
+		} \
+		asm("divl %2":	"=a" (__low), "=d" (__mod) : \
+				"rm" (__base), "0" (__low), "1" (__upper)); \
+		asm("":"=A" (n):"a" (__low),"d" (__high)); \
 	} \
-	asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), "1" (__upper)); \
-	asm("":"=A" (n):"a" (__low),"d" (__high)); \
 	__mod; \
 })
 
diff -ru linux-2.6.10-rc1/include/asm-m32r/div64.h linux-2.6.10-rc1-nsz/include/asm-m32r/div64.h
--- linux-2.6.10-rc1/include/asm-m32r/div64.h	2004-10-27 10:41:40.000000000 -0700
+++ linux-2.6.10-rc1-nsz/include/asm-m32r/div64.h	2004-10-27 10:23:37.000000000 -0700
@@ -11,28 +11,33 @@
  *  n = n / base;
  *  return value = n % base;
  */
-#define do_div(n, base)						\
-({								\
-	unsigned long _res, _high, _mid, _low;			\
-								\
-	_low = (n) & 0xffffffffUL;				\
-	_high = (n) >> 32;					\
-	if (_high) {						\
-		_mid = (_high % (unsigned long)(base)) << 16;	\
-		_high = _high / (unsigned long)(base);		\
-		_mid += _low >> 16;				\
-		_low &= 0x0000ffffUL;				\
-		_low += (_mid % (unsigned long)(base)) << 16;	\
-		_mid = _mid / (unsigned long)(base);		\
-		_res = _low % (unsigned long)(base);		\
-		_low = _low / (unsigned long)(base);		\
-		n = _low + ((long long)_mid << 16) +		\
-			((long long)_high << 32);		\
-	} else {						\
-		_res = _low % (unsigned long)(base);		\
-		n = (_low / (unsigned long)(base));		\
-	}							\
-	_res;							\
+#define do_div(n, base)							\
+({									\
+	unsigned long _res;						\
+	if (__builtin_constant_p(base) && !((base) & ((base)-1))) {	\
+		_res = ((uint64_t)(n)) % (base);			\
+		(n) = ((uint64_t)(n)) / (base);				\
+	} else {							\
+		unsigned long _high, _mid, _low;			\
+		_low = (n) & 0xffffffffUL;				\
+		_high = (n) >> 32;					\
+		if (_high) {						\
+			_mid = (_high % (unsigned long)(base)) << 16;	\
+			_high = _high / (unsigned long)(base);		\
+			_mid += _low >> 16;				\
+			_low &= 0x0000ffffUL;				\
+			_low += (_mid % (unsigned long)(base)) << 16;	\
+			_mid = _mid / (unsigned long)(base);		\
+			_res = _low % (unsigned long)(base);		\
+			_low = _low / (unsigned long)(base);		\
+			n = _low + ((long long)_mid << 16) +		\
+				((long long)_high << 32);		\
+		} else {						\
+			_res = _low % (unsigned long)(base);		\
+			n = (_low / (unsigned long)(base));		\
+		}							\
+	}								\
+	_res;								\
 })
 
 #endif  /* _ASM_M32R_DIV64 */
diff -ru linux-2.6.10-rc1/include/asm-m68k/div64.h linux-2.6.10-rc1-nsz/include/asm-m68k/div64.h
--- linux-2.6.10-rc1/include/asm-m68k/div64.h	2004-10-27 10:41:40.000000000 -0700
+++ linux-2.6.10-rc1-nsz/include/asm-m68k/div64.h	2004-10-27 10:23:23.000000000 -0700
@@ -3,24 +3,30 @@
 
 /* n = n / base; return rem; */
 
-#define do_div(n, base) ({					\
-	union {							\
-		unsigned long n32[2];				\
-		unsigned long long n64;				\
-	} __n;							\
-	unsigned long __rem, __upper;				\
-								\
-	__n.n64 = (n);						\
-	if ((__upper = __n.n32[0])) {				\
-		asm ("divul.l %2,%1:%0"				\
-			: "=d" (__n.n32[0]), "=d" (__upper)	\
-			: "d" (base), "0" (__n.n32[0]));	\
-	}							\
-	asm ("divu.l %2,%1:%0"					\
-		: "=d" (__n.n32[1]), "=d" (__rem)		\
-		: "d" (base), "1" (__upper), "0" (__n.n32[1]));	\
-	(n) = __n.n64;						\
-	__rem;							\
-})
+#define do_div(n, base) ({						\
+	unsigned long __rem;						\
+	if (__builtin_constant_p(base) && !((base) & ((base)-1))) {	\
+		__rem = ((uint64_t)(n)) % (base);			\
+		(n) = ((uint64_t)(n)) / (base);				\
+	} else {							\
+		union {							\
+			unsigned long n32[2];				\
+			unsigned long long n64;				\
+		} __n;							\
+		unsigned long __upper;					\
+									\
+		__n.n64 = (n);						\
+		if ((__upper = __n.n32[0])) {				\
+			asm ("divul.l %2,%1:%0"				\
+				: "=d" (__n.n32[0]), "=d" (__upper)	\
+				: "d" (base), "0" (__n.n32[0]));	\
+		}							\
+		asm ("divu.l %2,%1:%0"					\
+			: "=d" (__n.n32[1]), "=d" (__rem)		\
+			: "d" (base), "1" (__upper), "0" (__n.n32[1]));	\
+		(n) = __n.n64;						\
+	}								\
+	__rem;								\
+})
 
 #endif /* _M68K_DIV64_H */
diff -ru linux-2.6.10-rc1/include/asm-mips/div64.h linux-2.6.10-rc1-nsz/include/asm-mips/div64.h
--- linux-2.6.10-rc1/include/asm-mips/div64.h	2004-10-27 10:41:40.000000000 -0700
+++ linux-2.6.10-rc1-nsz/include/asm-mips/div64.h	2004-10-27 10:34:01.000000000 -0700
@@ -52,27 +52,32 @@
 	__mod; })
 
 #define do_div(n, base) ({ \
+	unsigned long long __div; \
 	unsigned long long __quot; \
 	unsigned long __mod; \
-	unsigned long long __div; \
-	unsigned long __upper, __low, __high, __base; \
 	\
 	__div = (n); \
-	__base = (base); \
-	\
-	__high = __div >> 32; \
-	__low = __div; \
-	__upper = __high; \
-	\
-	if (__high) \
-		__asm__("divu	$0, %z2, %z3" \
-			: "=h" (__upper), "=l" (__high) \
-			: "Jr" (__high), "Jr" (__base)); \
-	\
-	__mod = do_div64_32(__low, __upper, __low, __base); \
-	\
-	__quot = __high; \
-	__quot = __quot << 32 | __low; \
+	if (__builtin_constant_p(base) && !((base) & ((base)-1))) { \
+		__mod = __div % (base); \
+		__quot = __div / (base); \
+	} else { \
+		unsigned long __upper, __low, __high, __base; \
+		\
+		__base = (base); \
+		__high = __div >> 32; \
+		__low = __div; \
+		__upper = __high; \
+		\
+		if (__high) \
+			__asm__("divu	$0, %z2, %z3" \
+				: "=h" (__upper), "=l" (__high) \
+				: "Jr" (__high), "Jr" (__base)); \
+		\
+		__mod = do_div64_32(__low, __upper, __low, __base); \
+		\
+		__quot = __high; \
+		__quot = __quot << 32 | __low; \
+	} \
 	(n) = __quot; \
 	__mod; })
 #endif /* (_MIPS_SZLONG == 32) */
@@ -104,18 +109,22 @@
  * Hey, we're already 64-bit, no
  * need to play games..
  */
-#define do_div(n, base) ({ \
+#define do_div(n, base) ({ \
+	unsigned long __div; \
 	unsigned long __quot; \
 	unsigned int __mod; \
-	unsigned long __div; \
-	unsigned int __base; \
 	\
 	__div = (n); \
-	__base = (base); \
-	\
-	__mod = __div % __base; \
-	__quot = __div / __base; \
-	\
+	if (__builtin_constant_p(base) && !((base) & ((base)-1))) { \
+		__mod = __div % (base); \
+		__quot = __div / (base); \
+	} else { \
+		unsigned int __base; \
+		\
+		__base = (base); \
+		__mod = __div % __base; \
+		__quot = __div / __base; \
+	} \
 	(n) = __quot; \
 	__mod; })
 
diff -ru linux-2.6.10-rc1/include/asm-s390/div64.h linux-2.6.10-rc1-nsz/include/asm-s390/div64.h
--- linux-2.6.10-rc1/include/asm-s390/div64.h	2004-10-27 10:42:32.000000000 -0700
+++ linux-2.6.10-rc1-nsz/include/asm-s390/div64.h	2004-10-27 10:43:06.000000000 -0700
@@ -4,42 +4,47 @@
 #ifndef __s390x__
 
 /* for do_div "base" needs to be smaller than 2^31-1 */
-#define do_div(n, base) ({					\
-	unsigned long long __n = (n);				\
-	unsigned long __r;					\
-								\
-	asm ("   slr  0,0\n"					\
-	     "   l    1,%1\n"					\
-	     "   srdl 0,1\n"					\
-	     "   dr   0,%2\n"					\
-	     "   alr  1,1\n"					\
-	     "   alr  0,0\n"					\
-	     "   lhi  2,1\n"					\
-	     "   n    2,%1\n"					\
-	     "   alr  0,2\n"					\
-	     "   clr  0,%2\n"					\
-	     "   jl   0f\n"					\
-	     "   slr  0,%2\n"					\
-             "   ahi  1,1\n"					\
-	     "0: st   1,%1\n"					\
-	     "   l    1,4+%1\n"					\
-	     "   srdl 0,1\n"					\
-             "   dr   0,%2\n"					\
-	     "   alr  1,1\n"					\
-	     "   alr  0,0\n"					\
-	     "   lhi  2,1\n"					\
-	     "   n    2,4+%1\n"					\
-	     "   alr  0,2\n"					\
-	     "   clr  0,%2\n"					\
-             "   jl   1f\n"					\
-	     "   slr  0,%2\n"					\
-	     "   ahi  1,1\n"					\
-	     "1: st   1,4+%1\n"					\
-             "   lr   %0,0"					\
-	     : "=d" (__r), "=m" (__n)				\
-	     : "d" (base), "m" (__n) : "0", "1", "2", "cc" );	\
-	(n) = (__n);						\
-        __r;                                                    \
+#define do_div(n, base) ({						\
+	unsigned long long __n = (n);					\
+	unsigned long __r;						\
+									\
+	if (__builtin_constant_p(base) && !((base) & ((base)-1))) {	\
+		__r = __n % (base);					\
+		__n = __n / (base);					\
+	} else {							\
+		asm ("   slr  0,0\n"					\
+		     "   l    1,%1\n"					\
+		     "   srdl 0,1\n"					\
+		     "   dr   0,%2\n"					\
+		     "   alr  1,1\n"					\
+		     "   alr  0,0\n"					\
+		     "   lhi  2,1\n"					\
+		     "   n    2,%1\n"					\
+		     "   alr  0,2\n"					\
+		     "   clr  0,%2\n"					\
+		     "   jl   0f\n"					\
+		     "   slr  0,%2\n"					\
+		     "   ahi  1,1\n"					\
+		     "0: st   1,%1\n"					\
+		     "   l    1,4+%1\n"					\
+		     "   srdl 0,1\n"					\
+		     "   dr   0,%2\n"					\
+		     "   alr  1,1\n"					\
+		     "   alr  0,0\n"					\
+		     "   lhi  2,1\n"					\
+		     "   n    2,4+%1\n"					\
+		     "   alr  0,2\n"					\
+		     "   clr  0,%2\n"					\
+		     "   jl   1f\n"					\
+		     "   slr  0,%2\n"					\
+		     "   ahi  1,1\n"					\
+		     "1: st   1,4+%1\n"					\
+		     "   lr   %0,0"					\
+		     : "=d" (__r), "=m" (__n)				\
+		     : "d" (base), "m" (__n) : "0", "1", "2", "cc" );	\
+	}								\
+	(n) = (__n);							\
+	__r;								\
 })
 
 #else /* __s390x__ */

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] Remove some divide instructions
  2004-10-27 20:16   ` Zachary Amsden
@ 2004-10-27 21:24     ` Linus Torvalds
  0 siblings, 0 replies; 13+ messages in thread
From: Linus Torvalds @ 2004-10-27 21:24 UTC (permalink / raw)
  To: Zachary Amsden; +Cc: linux-kernel



On Wed, 27 Oct 2004, Zachary Amsden wrote:
> 
> Passes several tests (code looks sane, assembler looks sane, boots & 
> runs fine) on i386.  Apologies if I unlikely broke any other 
> architectures, but it's quite difficult to test them all.

I'd suggest _only_ changing the i386 version.

For example, your asm-generic changes looks senseless, since they only
make the macros more complex, without actually changing anything. And the
other architectures may want to do other things, since right now at least
some of them use things like fixed hardware registers etc which is not
necessarily appropriate for the non-asm case...

That way you'd also only modify the architecture that you can actually 
verify..

		Linus

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] Remove some divide instructions
  2004-10-27 16:28 ` Linus Torvalds
  2004-10-27 18:05   ` Zachary Amsden
  2004-10-27 20:16   ` Zachary Amsden
@ 2004-10-27 22:08   ` Thayne Harbaugh
  2004-10-27 22:14   ` Zachary Amsden
  3 siblings, 0 replies; 13+ messages in thread
From: Thayne Harbaugh @ 2004-10-27 22:08 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Zachary Amsden, linux-kernel, george

[-- Attachment #1: Type: text/plain, Size: 385 bytes --]

On Wed, 2004-10-27 at 09:28 -0700, Linus Torvalds wrote:

> It's very easy to test for powers of two: !((x) & ((x)-1)) does it for 
> everything but zero, and zero in this case is an error anyway.

Of course 0 is not a power of two - nor a power of any other number
(although n**(-oo) where |n|>1 and n**(oo) where |n|<1 are close).

8)

-- 
Thayne Harbaugh
Linux Networx

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] Remove some divide instructions
  2004-10-27 16:28 ` Linus Torvalds
                     ` (2 preceding siblings ...)
  2004-10-27 22:08   ` Thayne Harbaugh
@ 2004-10-27 22:14   ` Zachary Amsden
  2004-10-28  0:11     ` Linus Torvalds
  3 siblings, 1 reply; 13+ messages in thread
From: Zachary Amsden @ 2004-10-27 22:14 UTC (permalink / raw)
  To: Linus Torvalds, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 2139 bytes --]

Apologies in advance for e-mail difficulties today - I can't reply 
directly to your message, so threading may be off.

>I'd suggest _only_ changing the i386 version.
>
>For example, your asm-generic changes looks senseless, since they only
>make the macros more complex, without actually changing anything. And the
>other architectures may want to do other things, since right now at least
>some of them use things like fixed hardware registers etc which is not
>necessarily appropriate for the non-asm case...
>
>That way you'd also only modify the architecture that you can actually 
>verify..
>
>		Linus
>

Backed out everything but i386 and generic.  For the generic version, I 
compiled and tested this code outside of the kernel.  Actually, I found 
that at least for my tool chain, the generic version

+# define do_div(n,base) ({                                             \
+       uint32_t __rem;                                                 \
+       if (__builtin_constant_p(base) && !((base) & ((base)-1))) {     \
+               __rem = ((uint64_t)(n)) % (base);                       \
+               (n) = ((uint64_t)(n)) / (base);                         \
+       } else {                                                        \
+               uint32_t __base = (base);                               \
+               __rem = ((uint64_t)(n)) % __base;                       \
+               (n) = ((uint64_t)(n)) / __base;                         \
+       }                                                               \
+       __rem;          

Does indeed generate different code for the constant case - without it, 
due to the assignment to __base, the shift / mask optimization does not 
take place.  Apparently the constant attribute and associated 
optimizations do not propagate through the assignment.  Other gccs may 
behave differently.   I also tried making __base const, to no avail.  If 
one were willing to ignore the potential macro side effects of the 
references to (base), the code could be the same, but I'm not the best 
judge of whether that is a good thing to do.

Zach
zach@vmware.com

[-- Attachment #2: div64-3.patch --]
[-- Type: text/plain, Size: 3225 bytes --]

diff -ru linux-2.6.10-rc1/include/asm-generic/div64.h linux-2.6.10-rc1-nsz/include/asm-generic/div64.h
--- linux-2.6.10-rc1/include/asm-generic/div64.h	2004-10-27 10:41:40.000000000 -0700
+++ linux-2.6.10-rc1-nsz/include/asm-generic/div64.h	2004-10-27 10:24:06.000000000 -0700
@@ -22,12 +22,17 @@
 
 #if BITS_PER_LONG == 64
 
-# define do_div(n,base) ({					\
-	uint32_t __base = (base);				\
-	uint32_t __rem;						\
-	__rem = ((uint64_t)(n)) % __base;			\
-	(n) = ((uint64_t)(n)) / __base;				\
-	__rem;							\
+# define do_div(n,base) ({						\
+	uint32_t __rem;							\
+	if (__builtin_constant_p(base) && !((base) & ((base)-1))) {	\
+		__rem = ((uint64_t)(n)) % (base);			\
+		(n) = ((uint64_t)(n)) / (base);				\
+	} else {							\
+		uint32_t __base = (base);				\
+		__rem = ((uint64_t)(n)) % __base;			\
+		(n) = ((uint64_t)(n)) / __base;				\
+	}								\
+	__rem;								\
  })
 
 #elif BITS_PER_LONG == 32
@@ -37,16 +42,21 @@
 /* The unnecessary pointer compare is there
  * to check for type safety (n must be 64bit)
  */
-# define do_div(n,base) ({				\
-	uint32_t __base = (base);			\
-	uint32_t __rem;					\
-	(void)(((typeof((n)) *)0) == ((uint64_t *)0));	\
-	if (likely(((n) >> 32) == 0)) {			\
-		__rem = (uint32_t)(n) % __base;		\
-		(n) = (uint32_t)(n) / __base;		\
-	} else 						\
-		__rem = __div64_32(&(n), __base);	\
-	__rem;						\
+# define do_div(n,base) ({						\
+	uint32_t __rem;						 	\
+	(void)(((typeof((n)) *)0) == ((uint64_t *)0));			\
+	if (__builtin_constant_p(base) && !((base) & ((base)-1))) {     \
+		__rem = ((uint64_t)(n)) % (base);			\
+		(n) = ((uint64_t)(n)) / (base);				\
+	} else {							\
+		uint32_t __base = (base);				\
+		if (likely(((n) >> 32) == 0)) {				\
+			__rem = (uint32_t)(n) % __base;			\
+			(n) = (uint32_t)(n) / __base;			\
+		} else 							\
+			__rem = __div64_32(&(n), __base);		\
+	}								\
+	__rem;								\
  })
 
 #else /* BITS_PER_LONG == ?? */
diff -ru linux-2.6.10-rc1/include/asm-i386/div64.h linux-2.6.10-rc1-nsz/include/asm-i386/div64.h
--- linux-2.6.10-rc1/include/asm-i386/div64.h	2004-10-27 10:41:40.000000000 -0700
+++ linux-2.6.10-rc1-nsz/include/asm-i386/div64.h	2004-10-27 10:15:04.000000000 -0700
@@ -14,16 +14,23 @@
  * convention" on x86.
  */
 #define do_div(n,base) ({ \
-	unsigned long __upper, __low, __high, __mod, __base; \
-	__base = (base); \
-	asm("":"=a" (__low), "=d" (__high):"A" (n)); \
-	__upper = __high; \
-	if (__high) { \
-		__upper = __high % (__base); \
-		__high = __high / (__base); \
+	unsigned long __mod; \
+	if (__builtin_constant_p(base) && !((base) & ((base)-1))) { \
+		__mod = ((uint64_t)(n)) % base;	\
+		(n) = ((uint64_t)(n)) / base; \
+	} else { \
+		unsigned long __upper, __low, __high, __base; \
+		__base = (base); \
+		asm("":"=a" (__low), "=d" (__high):"A" (n)); \
+		__upper = __high; \
+		if (__high) { \
+			__upper = __high % (__base); \
+			__high = __high / (__base); \
+		} \
+		asm("divl %2":	"=a" (__low), "=d" (__mod) : \
+				"rm" (__base), "0" (__low), "1" (__upper)); \
+		asm("":"=A" (n):"a" (__low),"d" (__high)); \
 	} \
-	asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), "1" (__upper)); \
-	asm("":"=A" (n):"a" (__low),"d" (__high)); \
 	__mod; \
 })
 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] Remove some divide instructions
  2004-10-27 22:14   ` Zachary Amsden
@ 2004-10-28  0:11     ` Linus Torvalds
  2004-10-28  0:47       ` Linus Torvalds
  2004-10-28  0:59       ` Maciej W. Rozycki
  0 siblings, 2 replies; 13+ messages in thread
From: Linus Torvalds @ 2004-10-28  0:11 UTC (permalink / raw)
  To: Zachary Amsden; +Cc: linux-kernel



On Wed, 27 Oct 2004, Zachary Amsden wrote:
> 
> Backed out everything but i386 and generic.  For the generic version, I 
> compiled and tested this code outside of the kernel.  Actually, I found 
> that at least for my tool chain, the generic version
> 
> +# define do_div(n,base) ({                                             \
> +       uint32_t __rem;                                                 \
> +       if (__builtin_constant_p(base) && !((base) & ((base)-1))) {     \
> +               __rem = ((uint64_t)(n)) % (base);                       \
> +               (n) = ((uint64_t)(n)) / (base);                         \
> +       } else {                                                        \
> +               uint32_t __base = (base);                               \
> +               __rem = ((uint64_t)(n)) % __base;                       \
> +               (n) = ((uint64_t)(n)) / __base;                         \
> +       }                                                               \
> +       __rem;          
> 
> Does indeed generate different code for the constant case - without it, 
> due to the assignment to __base, the shift / mask optimization does not 
> take place.

Oh, damn. That's a separate issue, apparently, and there you just use 
"__builtin_constant_p()" as a way to check that there are no side effects 
on "base".

Might as well drop the check for the value, since it doesn't matter.

Alternatively, we could just document the fact that neither "base" nor "n"
are normal arguments to a function. After all, we already _do_ change "n", 
and the strange calling convention is already documented as nothing but a 
sick hack to make architectures able to use inline assembly efficiently.

I could add a sparse check for "no side effects", if anybody cares (so 
that you could do

	__builtin_warning(
		!__builtin_nosideeffects(base),
		"expression has side effects");

in macros like these.. Sparse already has the logic internally..

		Linus

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] Remove some divide instructions
  2004-10-28  0:11     ` Linus Torvalds
@ 2004-10-28  0:47       ` Linus Torvalds
  2004-10-29  0:47         ` Zachary Amsden
  2004-10-28  0:59       ` Maciej W. Rozycki
  1 sibling, 1 reply; 13+ messages in thread
From: Linus Torvalds @ 2004-10-28  0:47 UTC (permalink / raw)
  To: Zachary Amsden; +Cc: linux-kernel



On Wed, 27 Oct 2004, Linus Torvalds wrote:
> 
> I could add a sparse check for "no side effects", if anybody cares (so 
> that you could do
> 
> 	__builtin_warning(
> 		!__builtin_nosideeffects(base),
> 		"expression has side effects");
> 
> in macros like these.. Sparse already has the logic internally..

Done. Except I called it __builtin_safe_p(). 

The kernel sources already know about "__builtin_warning()" (and 
pre-process it away on gcc), so if you have a new sparse setup (as of two 
minutes ago ;), you can use this thing to check that arguments to macros 
do not have side effects.

Useful? You be the judge. But it was just a couple of lines in sparse, and
doing so also made it obvious how to clean up __builtin_constant_p() a lot
at the same time by just re-organizing things a bit.

My inliner and statement simplificator isn't perfect, so inline functions
sadly are not considered constant (or safe) even if they _do_ end up
returning a constant value (or be safe internally).

		Linus

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] Remove some divide instructions
  2004-10-28  0:11     ` Linus Torvalds
  2004-10-28  0:47       ` Linus Torvalds
@ 2004-10-28  0:59       ` Maciej W. Rozycki
  1 sibling, 0 replies; 13+ messages in thread
From: Maciej W. Rozycki @ 2004-10-28  0:59 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Zachary Amsden, linux-kernel

On Wed, 27 Oct 2004, Linus Torvalds wrote:

> > Does indeed generate different code for the constant case - without it, 
> > due to the assignment to __base, the shift / mask optimization does not 
> > take place.
> 
> Oh, damn. That's a separate issue, apparently, and there you just use 
> "__builtin_constant_p()" as a way to check that there are no side effects 
> on "base".

 That has to be a deficiency in that specific version of compiler.  For me 
it just works as long as __base is const:

$ cat do_div.c
#include <stdint.h>

#define do_div(n, base) ({						\
	const uint32_t __base = (base);					\
	uint32_t __rem;							\
	__rem = ((uint64_t)(n)) % __base;				\
	(n) = ((uint64_t)(n)) / __base;					\
	__rem;								\
})

uint32_t div16(uint64_t *n)
{
	return do_div(*n, 16);
}
$ gcc -g -O2 -fomit-frame-pointer -c do_div.c
$ objdump -d do_div.o

do_div.o:     file format elf32-i386

Disassembly of section .text:

00000000 <div16>:
   0:	56                   	push   %esi
   1:	53                   	push   %ebx
   2:	8b 74 24 0c          	mov    0xc(%esp),%esi
   6:	8b 0e                	mov    (%esi),%ecx
   8:	8b 5e 04             	mov    0x4(%esi),%ebx
   b:	89 c8                	mov    %ecx,%eax
   d:	0f ac d9 04          	shrd   $0x4,%ebx,%ecx
  11:	c1 eb 04             	shr    $0x4,%ebx
  14:	89 0e                	mov    %ecx,(%esi)
  16:	89 5e 04             	mov    %ebx,0x4(%esi)
  19:	5b                   	pop    %ebx
  1a:	83 e0 0f             	and    $0xf,%eax
  1d:	5e                   	pop    %esi
  1e:	c3                   	ret    
$ gcc --version
gcc (GCC) 3.5.0 20040322 (experimental)
[...]

I guess anything not older is expected to work.

  Maciej

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] Remove some divide instructions
  2004-10-28  0:47       ` Linus Torvalds
@ 2004-10-29  0:47         ` Zachary Amsden
  2004-10-29  4:52           ` Linus Torvalds
  0 siblings, 1 reply; 13+ messages in thread
From: Zachary Amsden @ 2004-10-29  0:47 UTC (permalink / raw)
  To: Linus Torvalds, linux-kernel

Tried this.  I found a problem -- ide-cd.c calls a function to compute 
base, which was caught by sparse.  This is easy enough to workaround, 
but my adventures went further than expected..

I tested all powers of two and found gcc doesn't like to perform the 
optimization for 0x80000000 as a divisor.  Ok, worked around that.  Now 
Everything appears to work great, until I started using modules:

  MODPOST
*** Warning: "__udivdi3" [drivers/scsi/ipr.ko] undefined!
*** Warning: "__umoddi3" [drivers/scsi/ipr.ko] undefined!
*** Warning: "__udivdi3" [drivers/scsi/dpt_i2o.ko] undefined!
*** Warning: "__umoddi3" [drivers/scsi/dpt_i2o.ko] undefined!
*** Warning: "__udivdi3" [drivers/base/firmware_class.ko] undefined!
*** Warning: "__umoddi3" [drivers/base/firmware_class.ko] undefined!

That doesn't look good.  Trying to load the modules confirms that 
__uxxxdi3 is missing.  How did this happen?  If you look at do_div, it 
is clear that __udivdi3 should not be needed, since we will always hit 
the optimization path.  Nevertheless, gcc inserts an extraneous ".globl 
__udivdi3" in all output to the assembler from .c files which include 
asm/div64.h, even if the do_div function is never directly referenced 
and no instances of it appear in the assembler output (libcrc32c.c is 
the easiest to verify).  Apparently, this happens somewhere before the 
optimization phase, and the external reference never gets dropped after 
that.  Since div64.h is included from sched.h, that happens to be quite 
a few files.

#define do_div(n,base) ({ \
        unsigned long __mod; \
        if (unlikely(__builtin_constant_p(base) && !((base) & 
((base)-1)) && \
            (long)(base) > 0)) { \
                __mod = ((uint64_t)(n)) % ((unsigned long)(base));      \
                (n) = ((uint64_t)(n)) / ((unsigned long)(base)); \
        } else { \

The kernel proper is ok - since the optimization is done correctly and 
udivdi3 is never actually referenced, it gets dropped during the link 
phase.  Modules are not - the unresolved symbol stays there.

This leaves several options:

1) Forget the optimization altogether
2) Go back to the (base == 1) check
3) In the module post phase, strip extraneous unused external references 
from modules
4) Fixup module loading to work around the problem
5) Do an enumeration case for each possible constant divisor
6) Upgrade my silly old compiler and tools

This seems like a lot of work for a trivial optimization; for i386, 
perhaps #2 is the most appropriate - with a sufficiently new GCC, this 
optimization should be automatic for all architectures not hardcoding 
do_div as inline assembler.

Seems to have come full circle - the trivial extension turns out to have 
non-trivial side effects.  If only GCC were as easily extensible as 
sparse!  A __builtin_highest_one_bit() function would make it possible 
to use inline assembler without degenerating to individual cases for 
each bit.

Zachary Amsden
zach@vmware.com

Linus Torvalds wrote:

>On Wed, 27 Oct 2004, Linus Torvalds wrote:
>  
>
>>I could add a sparse check for "no side effects", if anybody cares (so 
>>that you could do
>>
>>	__builtin_warning(
>>		!__builtin_nosideeffects(base),
>>		"expression has side effects");
>>
>>in macros like these.. Sparse already has the logic internally..
>>    
>>
>
>Done. Except I called it __builtin_safe_p(). 
>
>The kernel sources already know about "__builtin_warning()" (and 
>pre-process it away on gcc), so if you have a new sparse setup (as of two 
>minutes ago ;), you can use this thing to check that arguments to macros 
>do not have side effects.
>
>Useful? You be the judge. But it was just a couple of lines in sparse, and
>doing so also made it obvious how to clean up __builtin_constant_p() a lot
>at the same time by just re-organizing things a bit.
>
>My inliner and statement simplificator isn't perfect, so inline functions
>sadly are not considered constant (or safe) even if they _do_ end up
>returning a constant value (or be safe internally).
>
>		Linus
>  
>


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] Remove some divide instructions
  2004-10-29  0:47         ` Zachary Amsden
@ 2004-10-29  4:52           ` Linus Torvalds
  2004-10-29 19:10             ` Geert Uytterhoeven
  0 siblings, 1 reply; 13+ messages in thread
From: Linus Torvalds @ 2004-10-29  4:52 UTC (permalink / raw)
  To: Zachary Amsden; +Cc: linux-kernel



On Thu, 28 Oct 2004, Zachary Amsden wrote:
>
> This leaves several options:
> 
> 1) Forget the optimization altogether
> 2) Go back to the (base == 1) check

Ok, I think I led you on a merry goose-chase, and the "base == 1" check 
was the only one worth bothering with after all. Sorry about that.

> This seems like a lot of work for a trivial optimization; for i386, 
> perhaps #2 is the most appropriate - with a sufficiently new GCC, this 
> optimization should be automatic for all architectures not hardcoding 
> do_div as inline assembler.

The do_div() optimization is a trivial one, and one that gcc should 
definitely have recognized. It's not like a compiler cannot see that you 
have a 64 / 32 divide, and realize that it's cheaper than a full 64 / 64 
divide.

But hey, even if the gcc people finally did that optimization today, it 
would take a few years before we didn't support old compilers any more, 
so.

> Seems to have come full circle - the trivial extension turns out to have 
> non-trivial side effects.  If only GCC were as easily extensible as 
> sparse!  A __builtin_highest_one_bit() function would make it possible 
> to use inline assembler without degenerating to individual cases for 
> each bit.

Yes. There are tons of places where we'd love to have a constant
compile-time "log2()" function. And yes, I could do it in sparse in about
ten more lines of code, but..

I guess you could do it with a lot of tests...  Something like this should
do it

	#define __constant_log2(y) ((y==1) ? 0 : \
				    (y==2) ? 1 : \
				    (y==4) ? 2 : \
				    (y==8) ? 3 : -1)	/* We could go on ... */

	#define do_div(x,y) ({					\
		unsigned long __mod;				\
		int __log2;					\
		if (__builtin_constant_p(y) && 			\
		    !((y) & ((y)-1)) &&				\
		    (__log2 = __constant_log2((y))) >= 0) {	\
			mod = x & ((y)-1);			\
			(x) >>= __log2;				\
		} else {					\
			.. inline asm case ..			\
		}						\
		__mod; })

which looks like it should work, but it's getting so ugly that I suspect I 
should be committed for even thinking about it.

(And no, I didn't test the above. It is all trivially optimizable by a 
compiler, and I can't see how gcc could _fail_ to get it right, but hey, I 
thought the previous thing would work too, so I'm clearly not competent to 
make that judgement... ;)

			Linus

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] Remove some divide instructions
  2004-10-29  4:52           ` Linus Torvalds
@ 2004-10-29 19:10             ` Geert Uytterhoeven
  0 siblings, 0 replies; 13+ messages in thread
From: Geert Uytterhoeven @ 2004-10-29 19:10 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Zachary Amsden, Linux Kernel Development

On Thu, 28 Oct 2004, Linus Torvalds wrote:
> On Thu, 28 Oct 2004, Zachary Amsden wrote:
> 	#define __constant_log2(y) ((y==1) ? 0 : \
> 				    (y==2) ? 1 : \
> 				    (y==4) ? 2 : \
> 				    (y==8) ? 3 : -1)	/* We could go on ... */
> 
> 	#define do_div(x,y) ({					\
> 		unsigned long __mod;				\
> 		int __log2;					\
> 		if (__builtin_constant_p(y) && 			\
> 		    !((y) & ((y)-1)) &&				\
> 		    (__log2 = __constant_log2((y))) >= 0) {	\
> 			mod = x & ((y)-1);			\
> 			(x) >>= __log2;				\
> 		} else {					\
> 			.. inline asm case ..			\
> 		}						\
> 		__mod; })
> 
> which looks like it should work, but it's getting so ugly that I suspect I 
> should be committed for even thinking about it.
> 
> (And no, I didn't test the above. It is all trivially optimizable by a 
> compiler, and I can't see how gcc could _fail_ to get it right, but hey, I 
> thought the previous thing would work too, so I'm clearly not competent to 
> make that judgement... ;)

Perhaps this is why you see in many header files code like this:

    static inline xxx_const_case(...) { ... }
    static inline xxx_nonconst_case(...) { ... }

#define xxx(...) __builtin_constant_p(...) ? xxx_const_case(...) \
					   : xxx_nonconst_case(...)

i.e. gcc is better in optimizing away calls to non-called functions?

Gr{oetje,eeting}s,

						Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
							    -- Linus Torvalds

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2004-10-29 19:43 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-10-27 16:14 [PATCH] Remove some divide instructions Zachary Amsden
2004-10-27 16:28 ` Linus Torvalds
2004-10-27 18:05   ` Zachary Amsden
2004-10-27 20:16   ` Zachary Amsden
2004-10-27 21:24     ` Linus Torvalds
2004-10-27 22:08   ` Thayne Harbaugh
2004-10-27 22:14   ` Zachary Amsden
2004-10-28  0:11     ` Linus Torvalds
2004-10-28  0:47       ` Linus Torvalds
2004-10-29  0:47         ` Zachary Amsden
2004-10-29  4:52           ` Linus Torvalds
2004-10-29 19:10             ` Geert Uytterhoeven
2004-10-28  0:59       ` Maciej W. Rozycki

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).