All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3 0/2] Optimise some IP checksum functions.
@ 2015-05-19 15:18 ` Christophe Leroy
  0 siblings, 0 replies; 16+ messages in thread
From: Christophe Leroy @ 2015-05-19 15:18 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, scottwood
  Cc: linux-kernel, linuxppc-dev, Joakim Tjernlund

This patchset provides a few optimisations related to IP checksum functions.

Christophe Leroy (2):
  powerpc: put csum_tcpudp_magic inline
  powerpc: add support for csum_add()

 arch/powerpc/include/asm/checksum.h | 37 ++++++++++++++++++++++++++++---------
 arch/powerpc/lib/checksum_32.S      | 16 ----------------
 arch/powerpc/lib/checksum_64.S      | 21 ---------------------
 3 files changed, 28 insertions(+), 46 deletions(-)

-- 
2.1.0


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v3 0/2] Optimise some IP checksum functions.
@ 2015-05-19 15:18 ` Christophe Leroy
  0 siblings, 0 replies; 16+ messages in thread
From: Christophe Leroy @ 2015-05-19 15:18 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, scottwood
  Cc: linuxppc-dev, linux-kernel

This patchset provides a few optimisations related to IP checksum functions.

Christophe Leroy (2):
  powerpc: put csum_tcpudp_magic inline
  powerpc: add support for csum_add()

 arch/powerpc/include/asm/checksum.h | 37 ++++++++++++++++++++++++++++---------
 arch/powerpc/lib/checksum_32.S      | 16 ----------------
 arch/powerpc/lib/checksum_64.S      | 21 ---------------------
 3 files changed, 28 insertions(+), 46 deletions(-)

-- 
2.1.0

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v3 1/2] powerpc: put csum_tcpudp_magic inline
  2015-05-19 15:18 ` Christophe Leroy
@ 2015-05-19 15:18   ` Christophe Leroy
  -1 siblings, 0 replies; 16+ messages in thread
From: Christophe Leroy @ 2015-05-19 15:18 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, scottwood
  Cc: linux-kernel, linuxppc-dev, Joakim Tjernlund

csum_tcpudp_magic() is only a few instructions, and does modify
really few registers. So it is not worth having it as a separate
function and suffer function branching and saving of volatile
registers.

This patch makes it inline by use of the already existing
csum_tcpudp_nofold() function.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
 arch/powerpc/include/asm/checksum.h | 21 ++++++++++++---------
 arch/powerpc/lib/checksum_32.S      | 16 ----------------
 arch/powerpc/lib/checksum_64.S      | 21 ---------------------
 3 files changed, 12 insertions(+), 46 deletions(-)

diff --git a/arch/powerpc/include/asm/checksum.h b/arch/powerpc/include/asm/checksum.h
index 8251a3b..5e43d2d 100644
--- a/arch/powerpc/include/asm/checksum.h
+++ b/arch/powerpc/include/asm/checksum.h
@@ -20,15 +20,6 @@
 extern __sum16 ip_fast_csum(const void *iph, unsigned int ihl);
 
 /*
- * computes the checksum of the TCP/UDP pseudo-header
- * returns a 16-bit checksum, already complemented
- */
-extern __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
-					unsigned short len,
-					unsigned short proto,
-					__wsum sum);
-
-/*
  * computes the checksum of a memory block at buff, length len,
  * and adds in "sum" (32-bit)
  *
@@ -127,6 +118,18 @@ static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
 #endif
 }
 
+/*
+ * computes the checksum of the TCP/UDP pseudo-header
+ * returns a 16-bit checksum, already complemented
+ */
+static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
+					unsigned short len,
+					unsigned short proto,
+					__wsum sum)
+{
+	return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
+}
+
 #endif
 #endif /* __KERNEL__ */
 #endif
diff --git a/arch/powerpc/lib/checksum_32.S b/arch/powerpc/lib/checksum_32.S
index e23a436..d6fab08 100644
--- a/arch/powerpc/lib/checksum_32.S
+++ b/arch/powerpc/lib/checksum_32.S
@@ -41,22 +41,6 @@ _GLOBAL(ip_fast_csum)
 	blr
 
 /*
- * Compute checksum of TCP or UDP pseudo-header:
- *   csum_tcpudp_magic(saddr, daddr, len, proto, sum)
- */	
-_GLOBAL(csum_tcpudp_magic)
-	rlwimi	r5,r6,16,0,15	/* put proto in upper half of len */
-	addc	r0,r3,r4	/* add 4 32-bit words together */
-	adde	r0,r0,r5
-	adde	r0,r0,r7
-	addze	r0,r0		/* add in final carry */
-	rlwinm	r3,r0,16,0,31	/* fold two halves together */
-	add	r3,r0,r3
-	not	r3,r3
-	srwi	r3,r3,16
-	blr
-
-/*
  * computes the checksum of a memory block at buff, length len,
  * and adds in "sum" (32-bit)
  *
diff --git a/arch/powerpc/lib/checksum_64.S b/arch/powerpc/lib/checksum_64.S
index 57a0720..f3ef354 100644
--- a/arch/powerpc/lib/checksum_64.S
+++ b/arch/powerpc/lib/checksum_64.S
@@ -45,27 +45,6 @@ _GLOBAL(ip_fast_csum)
 	blr
 
 /*
- * Compute checksum of TCP or UDP pseudo-header:
- *   csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum)
- * No real gain trying to do this specially for 64 bit, but
- * the 32 bit addition may spill into the upper bits of
- * the doubleword so we still must fold it down from 64.
- */	
-_GLOBAL(csum_tcpudp_magic)
-	rlwimi	r5,r6,16,0,15	/* put proto in upper half of len */
-	addc	r0,r3,r4	/* add 4 32-bit words together */
-	adde	r0,r0,r5
-	adde	r0,r0,r7
-        rldicl  r4,r0,32,0      /* fold 64 bit value */
-        add     r0,r4,r0
-        srdi    r0,r0,32
-	rlwinm	r3,r0,16,0,31	/* fold two halves together */
-	add	r3,r0,r3
-	not	r3,r3
-	srwi	r3,r3,16
-	blr
-
-/*
  * Computes the checksum of a memory block at buff, length len,
  * and adds in "sum" (32-bit).
  *
-- 
2.1.0


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v3 1/2] powerpc: put csum_tcpudp_magic inline
@ 2015-05-19 15:18   ` Christophe Leroy
  0 siblings, 0 replies; 16+ messages in thread
From: Christophe Leroy @ 2015-05-19 15:18 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, scottwood
  Cc: linuxppc-dev, linux-kernel

csum_tcpudp_magic() is only a few instructions, and does modify
really few registers. So it is not worth having it as a separate
function and suffer function branching and saving of volatile
registers.

This patch makes it inline by use of the already existing
csum_tcpudp_nofold() function.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
 arch/powerpc/include/asm/checksum.h | 21 ++++++++++++---------
 arch/powerpc/lib/checksum_32.S      | 16 ----------------
 arch/powerpc/lib/checksum_64.S      | 21 ---------------------
 3 files changed, 12 insertions(+), 46 deletions(-)

diff --git a/arch/powerpc/include/asm/checksum.h b/arch/powerpc/include/asm/checksum.h
index 8251a3b..5e43d2d 100644
--- a/arch/powerpc/include/asm/checksum.h
+++ b/arch/powerpc/include/asm/checksum.h
@@ -20,15 +20,6 @@
 extern __sum16 ip_fast_csum(const void *iph, unsigned int ihl);
 
 /*
- * computes the checksum of the TCP/UDP pseudo-header
- * returns a 16-bit checksum, already complemented
- */
-extern __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
-					unsigned short len,
-					unsigned short proto,
-					__wsum sum);
-
-/*
  * computes the checksum of a memory block at buff, length len,
  * and adds in "sum" (32-bit)
  *
@@ -127,6 +118,18 @@ static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
 #endif
 }
 
+/*
+ * computes the checksum of the TCP/UDP pseudo-header
+ * returns a 16-bit checksum, already complemented
+ */
+static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
+					unsigned short len,
+					unsigned short proto,
+					__wsum sum)
+{
+	return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
+}
+
 #endif
 #endif /* __KERNEL__ */
 #endif
diff --git a/arch/powerpc/lib/checksum_32.S b/arch/powerpc/lib/checksum_32.S
index e23a436..d6fab08 100644
--- a/arch/powerpc/lib/checksum_32.S
+++ b/arch/powerpc/lib/checksum_32.S
@@ -41,22 +41,6 @@ _GLOBAL(ip_fast_csum)
 	blr
 
 /*
- * Compute checksum of TCP or UDP pseudo-header:
- *   csum_tcpudp_magic(saddr, daddr, len, proto, sum)
- */	
-_GLOBAL(csum_tcpudp_magic)
-	rlwimi	r5,r6,16,0,15	/* put proto in upper half of len */
-	addc	r0,r3,r4	/* add 4 32-bit words together */
-	adde	r0,r0,r5
-	adde	r0,r0,r7
-	addze	r0,r0		/* add in final carry */
-	rlwinm	r3,r0,16,0,31	/* fold two halves together */
-	add	r3,r0,r3
-	not	r3,r3
-	srwi	r3,r3,16
-	blr
-
-/*
  * computes the checksum of a memory block at buff, length len,
  * and adds in "sum" (32-bit)
  *
diff --git a/arch/powerpc/lib/checksum_64.S b/arch/powerpc/lib/checksum_64.S
index 57a0720..f3ef354 100644
--- a/arch/powerpc/lib/checksum_64.S
+++ b/arch/powerpc/lib/checksum_64.S
@@ -45,27 +45,6 @@ _GLOBAL(ip_fast_csum)
 	blr
 
 /*
- * Compute checksum of TCP or UDP pseudo-header:
- *   csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum)
- * No real gain trying to do this specially for 64 bit, but
- * the 32 bit addition may spill into the upper bits of
- * the doubleword so we still must fold it down from 64.
- */	
-_GLOBAL(csum_tcpudp_magic)
-	rlwimi	r5,r6,16,0,15	/* put proto in upper half of len */
-	addc	r0,r3,r4	/* add 4 32-bit words together */
-	adde	r0,r0,r5
-	adde	r0,r0,r7
-        rldicl  r4,r0,32,0      /* fold 64 bit value */
-        add     r0,r4,r0
-        srdi    r0,r0,32
-	rlwinm	r3,r0,16,0,31	/* fold two halves together */
-	add	r3,r0,r3
-	not	r3,r3
-	srwi	r3,r3,16
-	blr
-
-/*
  * Computes the checksum of a memory block at buff, length len,
  * and adds in "sum" (32-bit).
  *
-- 
2.1.0

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v3 2/2] powerpc: add support for csum_add()
  2015-05-19 15:18 ` Christophe Leroy
@ 2015-05-19 15:18   ` Christophe Leroy
  -1 siblings, 0 replies; 16+ messages in thread
From: Christophe Leroy @ 2015-05-19 15:18 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, scottwood
  Cc: linux-kernel, linuxppc-dev, Joakim Tjernlund

The C version of csum_add() as defined in include/net/checksum.h gives the
following assembly in ppc32:
       0:       7c 04 1a 14     add     r0,r4,r3
       4:       7c 64 00 10     subfc   r3,r4,r0
       8:       7c 63 19 10     subfe   r3,r3,r3
       c:       7c 63 00 50     subf    r3,r3,r0
and the following in ppc64:
   0xc000000000001af8 <+0>:	add     r3,r3,r4
   0xc000000000001afc <+4>:	cmplw   cr7,r3,r4
   0xc000000000001b00 <+8>:	mfcr    r4
   0xc000000000001b04 <+12>:	rlwinm  r4,r4,29,31,31
   0xc000000000001b08 <+16>:	add     r3,r4,r3
   0xc000000000001b0c <+20>:	clrldi  r3,r3,32
   0xc000000000001b10 <+24>:	blr

include/net/checksum.h also offers the possibility to define an arch specific
function.
This patch provides a specific csum_add() inline function.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
 arch/powerpc/include/asm/checksum.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/arch/powerpc/include/asm/checksum.h b/arch/powerpc/include/asm/checksum.h
index 5e43d2d..e8d9ef4 100644
--- a/arch/powerpc/include/asm/checksum.h
+++ b/arch/powerpc/include/asm/checksum.h
@@ -130,6 +130,22 @@ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
 	return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
 }
 
+#define HAVE_ARCH_CSUM_ADD
+static inline __wsum csum_add(__wsum csum, __wsum addend)
+{
+#ifdef __powerpc64__
+	u64 res = (__force u64)csum;
+
+	res += (__force u64)addend;
+	return (__force __wsum)((u32)res + (res >> 32));
+#else
+	asm("addc %0,%0,%1;"
+	    "addze %0,%0;"
+	    : "+r" (csum) : "r" (addend));
+	return csum;
+#endif
+}
+
 #endif
 #endif /* __KERNEL__ */
 #endif
-- 
2.1.0


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v3 2/2] powerpc: add support for csum_add()
@ 2015-05-19 15:18   ` Christophe Leroy
  0 siblings, 0 replies; 16+ messages in thread
From: Christophe Leroy @ 2015-05-19 15:18 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, scottwood
  Cc: linuxppc-dev, linux-kernel

The C version of csum_add() as defined in include/net/checksum.h gives the
following assembly in ppc32:
       0:       7c 04 1a 14     add     r0,r4,r3
       4:       7c 64 00 10     subfc   r3,r4,r0
       8:       7c 63 19 10     subfe   r3,r3,r3
       c:       7c 63 00 50     subf    r3,r3,r0
and the following in ppc64:
   0xc000000000001af8 <+0>:	add     r3,r3,r4
   0xc000000000001afc <+4>:	cmplw   cr7,r3,r4
   0xc000000000001b00 <+8>:	mfcr    r4
   0xc000000000001b04 <+12>:	rlwinm  r4,r4,29,31,31
   0xc000000000001b08 <+16>:	add     r3,r4,r3
   0xc000000000001b0c <+20>:	clrldi  r3,r3,32
   0xc000000000001b10 <+24>:	blr

include/net/checksum.h also offers the possibility to define an arch specific
function.
This patch provides a specific csum_add() inline function.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
 arch/powerpc/include/asm/checksum.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/arch/powerpc/include/asm/checksum.h b/arch/powerpc/include/asm/checksum.h
index 5e43d2d..e8d9ef4 100644
--- a/arch/powerpc/include/asm/checksum.h
+++ b/arch/powerpc/include/asm/checksum.h
@@ -130,6 +130,22 @@ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
 	return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
 }
 
+#define HAVE_ARCH_CSUM_ADD
+static inline __wsum csum_add(__wsum csum, __wsum addend)
+{
+#ifdef __powerpc64__
+	u64 res = (__force u64)csum;
+
+	res += (__force u64)addend;
+	return (__force __wsum)((u32)res + (res >> 32));
+#else
+	asm("addc %0,%0,%1;"
+	    "addze %0,%0;"
+	    : "+r" (csum) : "r" (addend));
+	return csum;
+#endif
+}
+
 #endif
 #endif /* __KERNEL__ */
 #endif
-- 
2.1.0

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* RE: [PATCH v3 2/2] powerpc: add support for csum_add()
  2015-05-19 15:18   ` Christophe Leroy
@ 2015-05-22 15:57     ` David Laight
  -1 siblings, 0 replies; 16+ messages in thread
From: David Laight @ 2015-05-22 15:57 UTC (permalink / raw)
  To: 'Christophe Leroy',
	Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	scottwood
  Cc: linuxppc-dev, linux-kernel

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 1254 bytes --]

From: Linuxppc-dev Christophe Leroy
> Sent: 19 May 2015 16:19
...
> diff --git a/arch/powerpc/include/asm/checksum.h b/arch/powerpc/include/asm/checksum.h
> index 5e43d2d..e8d9ef4 100644
> --- a/arch/powerpc/include/asm/checksum.h
> +++ b/arch/powerpc/include/asm/checksum.h
> @@ -130,6 +130,22 @@ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
>  	return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
>  }
> 
> +#define HAVE_ARCH_CSUM_ADD
> +static inline __wsum csum_add(__wsum csum, __wsum addend)
> +{
> +#ifdef __powerpc64__
> +	u64 res = (__force u64)csum;
> +
> +	res += (__force u64)addend;
> +	return (__force __wsum)((u32)res + (res >> 32));
> +#else
> +	asm("addc %0,%0,%1;"
> +	    "addze %0,%0;"
> +	    : "+r" (csum) : "r" (addend));
> +	return csum;
> +#endif

I'd have thought it better to test for the cpu type where you want the
'asm' variant, and then fall back on the C version for all others.
I know (well suspect) there are only two cases here.

I'd also have thought that the 64bit C version above would be generally 'good'.

	David

ÿôèº{.nÇ+‰·Ÿ®‰­†+%ŠËÿ±éݶ\x17¥Šwÿº{.nÇ+‰·¥Š{±þG«éÿŠ{ayº\x1dʇڙë,j\a­¢f£¢·hšïêÿ‘êçz_è®\x03(­éšŽŠÝ¢j"ú\x1a¶^[m§ÿÿ¾\a«þG«éÿ¢¸?™¨è­Ú&£ø§~á¶iO•æ¬z·švØ^\x14\x04\x1a¶^[m§ÿÿÃ\fÿ¶ìÿ¢¸?–I¥

^ permalink raw reply	[flat|nested] 16+ messages in thread

* RE: [PATCH v3 2/2] powerpc: add support for csum_add()
@ 2015-05-22 15:57     ` David Laight
  0 siblings, 0 replies; 16+ messages in thread
From: David Laight @ 2015-05-22 15:57 UTC (permalink / raw)
  To: 'Christophe Leroy',
	Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	scottwood
  Cc: linuxppc-dev, linux-kernel

RnJvbTogTGludXhwcGMtZGV2IENocmlzdG9waGUgTGVyb3kNCj4gU2VudDogMTkgTWF5IDIwMTUg
MTY6MTkNCi4uLg0KPiBkaWZmIC0tZ2l0IGEvYXJjaC9wb3dlcnBjL2luY2x1ZGUvYXNtL2NoZWNr
c3VtLmggYi9hcmNoL3Bvd2VycGMvaW5jbHVkZS9hc20vY2hlY2tzdW0uaA0KPiBpbmRleCA1ZTQz
ZDJkLi5lOGQ5ZWY0IDEwMDY0NA0KPiAtLS0gYS9hcmNoL3Bvd2VycGMvaW5jbHVkZS9hc20vY2hl
Y2tzdW0uaA0KPiArKysgYi9hcmNoL3Bvd2VycGMvaW5jbHVkZS9hc20vY2hlY2tzdW0uaA0KPiBA
QCAtMTMwLDYgKzEzMCwyMiBAQCBzdGF0aWMgaW5saW5lIF9fc3VtMTYgY3N1bV90Y3B1ZHBfbWFn
aWMoX19iZTMyIHNhZGRyLCBfX2JlMzIgZGFkZHIsDQo+ICAJcmV0dXJuIGNzdW1fZm9sZChjc3Vt
X3RjcHVkcF9ub2ZvbGQoc2FkZHIsIGRhZGRyLCBsZW4sIHByb3RvLCBzdW0pKTsNCj4gIH0NCj4g
DQo+ICsjZGVmaW5lIEhBVkVfQVJDSF9DU1VNX0FERA0KPiArc3RhdGljIGlubGluZSBfX3dzdW0g
Y3N1bV9hZGQoX193c3VtIGNzdW0sIF9fd3N1bSBhZGRlbmQpDQo+ICt7DQo+ICsjaWZkZWYgX19w
b3dlcnBjNjRfXw0KPiArCXU2NCByZXMgPSAoX19mb3JjZSB1NjQpY3N1bTsNCj4gKw0KPiArCXJl
cyArPSAoX19mb3JjZSB1NjQpYWRkZW5kOw0KPiArCXJldHVybiAoX19mb3JjZSBfX3dzdW0pKCh1
MzIpcmVzICsgKHJlcyA+PiAzMikpOw0KPiArI2Vsc2UNCj4gKwlhc20oImFkZGMgJTAsJTAsJTE7
Ig0KPiArCSAgICAiYWRkemUgJTAsJTA7Ig0KPiArCSAgICA6ICIrciIgKGNzdW0pIDogInIiIChh
ZGRlbmQpKTsNCj4gKwlyZXR1cm4gY3N1bTsNCj4gKyNlbmRpZg0KDQpJJ2QgaGF2ZSB0aG91Z2h0
IGl0IGJldHRlciB0byB0ZXN0IGZvciB0aGUgY3B1IHR5cGUgd2hlcmUgeW91IHdhbnQgdGhlDQon
YXNtJyB2YXJpYW50LCBhbmQgdGhlbiBmYWxsIGJhY2sgb24gdGhlIEMgdmVyc2lvbiBmb3IgYWxs
IG90aGVycy4NCkkga25vdyAod2VsbCBzdXNwZWN0KSB0aGVyZSBhcmUgb25seSB0d28gY2FzZXMg
aGVyZS4NCg0KSSdkIGFsc28gaGF2ZSB0aG91Z2h0IHRoYXQgdGhlIDY0Yml0IEMgdmVyc2lvbiBh
Ym92ZSB3b3VsZCBiZSBnZW5lcmFsbHkgJ2dvb2QnLg0KDQoJRGF2aWQNCg0K

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v3 2/2] powerpc: add support for csum_add()
  2015-05-22 15:57     ` David Laight
  (?)
@ 2015-05-22 19:32     ` Scott Wood
  2015-05-22 21:39       ` Segher Boessenkool
  2015-05-26 13:57         ` David Laight
  -1 siblings, 2 replies; 16+ messages in thread
From: Scott Wood @ 2015-05-22 19:32 UTC (permalink / raw)
  To: David Laight
  Cc: 'Christophe Leroy',
	Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	linuxppc-dev, linux-kernel

On Fri, 2015-05-22 at 15:57 +0000, David Laight wrote:
> From: Linuxppc-dev Christophe Leroy
> > Sent: 19 May 2015 16:19
> ...
> > diff --git a/arch/powerpc/include/asm/checksum.h b/arch/powerpc/include/asm/checksum.h
> > index 5e43d2d..e8d9ef4 100644
> > --- a/arch/powerpc/include/asm/checksum.h
> > +++ b/arch/powerpc/include/asm/checksum.h
> > @@ -130,6 +130,22 @@ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
> >  	return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
> >  }
> > 
> > +#define HAVE_ARCH_CSUM_ADD
> > +static inline __wsum csum_add(__wsum csum, __wsum addend)
> > +{
> > +#ifdef __powerpc64__
> > +	u64 res = (__force u64)csum;
> > +
> > +	res += (__force u64)addend;
> > +	return (__force __wsum)((u32)res + (res >> 32));
> > +#else
> > +	asm("addc %0,%0,%1;"
> > +	    "addze %0,%0;"
> > +	    : "+r" (csum) : "r" (addend));
> > +	return csum;
> > +#endif
> 
> I'd have thought it better to test for the cpu type where you want the
> 'asm' variant, and then fall back on the C version for all others.
> I know (well suspect) there are only two cases here.

Usually it's more readable to see "if (x) ... else ..." than "if (!
x) ... else ..." and 64-bit is what has a symbol defined.

> I'd also have thought that the 64bit C version above would be generally 'good'.

It doesn't generate the addc/addze sequence.  At least with GCC 4.8.2,
it does something like:

	mr	tmp0, csum
	li	tmp1, 0
	li	tmp2, 0
	addc	tmp3, addend, tmp0
	adde	csum, tmp2, tmp1
	add	csum, csum, tmp3

-Scott



^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v3 2/2] powerpc: add support for csum_add()
  2015-05-22 19:32     ` Scott Wood
@ 2015-05-22 21:39       ` Segher Boessenkool
  2015-05-22 21:54         ` Scott Wood
  2015-05-26 13:57         ` David Laight
  1 sibling, 1 reply; 16+ messages in thread
From: Segher Boessenkool @ 2015-05-22 21:39 UTC (permalink / raw)
  To: Scott Wood; +Cc: David Laight, linux-kernel, Paul Mackerras, linuxppc-dev

On Fri, May 22, 2015 at 02:32:42PM -0500, Scott Wood wrote:
> > I'd also have thought that the 64bit C version above would be generally 'good'.
> 
> It doesn't generate the addc/addze sequence.  At least with GCC 4.8.2,
> it does something like:
> 
> 	mr	tmp0, csum
> 	li	tmp1, 0
> 	li	tmp2, 0
> 	addc	tmp3, addend, tmp0
> 	adde	csum, tmp2, tmp1
> 	add	csum, csum, tmp3

Right.  Don't expect older compilers to do sane things here.

All this begs a question...  If it is worth spending so much time
micro-optimising this, why not pick the low-hanging fruit first?
Having a 32-bit accumulator for ones' complement sums, on a 64-bit
system, is not such a great idea.


Segher

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v3 2/2] powerpc: add support for csum_add()
  2015-05-22 21:39       ` Segher Boessenkool
@ 2015-05-22 21:54         ` Scott Wood
  0 siblings, 0 replies; 16+ messages in thread
From: Scott Wood @ 2015-05-22 21:54 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: David Laight, linux-kernel, Paul Mackerras, linuxppc-dev,
	Christophe Leroy

On Fri, 2015-05-22 at 16:39 -0500, Segher Boessenkool wrote:
> On Fri, May 22, 2015 at 02:32:42PM -0500, Scott Wood wrote:
> > > I'd also have thought that the 64bit C version above would be generally 'good'.
> > 
> > It doesn't generate the addc/addze sequence.  At least with GCC 4.8.2,
> > it does something like:
> > 
> > 	mr	tmp0, csum
> > 	li	tmp1, 0
> > 	li	tmp2, 0
> > 	addc	tmp3, addend, tmp0
> > 	adde	csum, tmp2, tmp1
> > 	add	csum, csum, tmp3
> 
> Right.  Don't expect older compilers to do sane things here.
> 
> All this begs a question...  If it is worth spending so much time
> micro-optimising this, why not pick the low-hanging fruit first?
> Having a 32-bit accumulator for ones' complement sums, on a 64-bit
> system, is not such a great idea.

That would be a more intrusive change -- not (comparatively) low-hanging
fruit.  Plus, the person submitting these patches is focused on 32-bit.

-Scott



^ permalink raw reply	[flat|nested] 16+ messages in thread

* RE: [PATCH v3 2/2] powerpc: add support for csum_add()
  2015-05-22 19:32     ` Scott Wood
@ 2015-05-26 13:57         ` David Laight
  2015-05-26 13:57         ` David Laight
  1 sibling, 0 replies; 16+ messages in thread
From: David Laight @ 2015-05-26 13:57 UTC (permalink / raw)
  To: 'Scott Wood'
  Cc: 'Christophe Leroy',
	Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	linuxppc-dev, linux-kernel

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 554 bytes --]

From: Scott Wood ...
> > I'd also have thought that the 64bit C version above would be generally 'good'.
> 
> It doesn't generate the addc/addze sequence.  At least with GCC 4.8.2,
> it does something like:
> 
> 	mr	tmp0, csum
> 	li	tmp1, 0
> 	li	tmp2, 0
> 	addc	tmp3, addend, tmp0
> 	adde	csum, tmp2, tmp1
> 	add	csum, csum, tmp3

I was thinking of all 64bit targets, not 32bit ones.

	David

ÿôèº{.nÇ+‰·Ÿ®‰­†+%ŠËÿ±éݶ\x17¥Šwÿº{.nÇ+‰·¥Š{±þG«éÿŠ{ayº\x1dʇڙë,j\a­¢f£¢·hšïêÿ‘êçz_è®\x03(­éšŽŠÝ¢j"ú\x1a¶^[m§ÿÿ¾\a«þG«éÿ¢¸?™¨è­Ú&£ø§~á¶iO•æ¬z·švØ^\x14\x04\x1a¶^[m§ÿÿÃ\fÿ¶ìÿ¢¸?–I¥

^ permalink raw reply	[flat|nested] 16+ messages in thread

* RE: [PATCH v3 2/2] powerpc: add support for csum_add()
@ 2015-05-26 13:57         ` David Laight
  0 siblings, 0 replies; 16+ messages in thread
From: David Laight @ 2015-05-26 13:57 UTC (permalink / raw)
  To: 'Scott Wood'
  Cc: 'Christophe Leroy',
	Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	linuxppc-dev, linux-kernel

RnJvbTogU2NvdHQgV29vZCAuLi4NCj4gPiBJJ2QgYWxzbyBoYXZlIHRob3VnaHQgdGhhdCB0aGUg
NjRiaXQgQyB2ZXJzaW9uIGFib3ZlIHdvdWxkIGJlIGdlbmVyYWxseSAnZ29vZCcuDQo+IA0KPiBJ
dCBkb2Vzbid0IGdlbmVyYXRlIHRoZSBhZGRjL2FkZHplIHNlcXVlbmNlLiAgQXQgbGVhc3Qgd2l0
aCBHQ0MgNC44LjIsDQo+IGl0IGRvZXMgc29tZXRoaW5nIGxpa2U6DQo+IA0KPiAJbXIJdG1wMCwg
Y3N1bQ0KPiAJbGkJdG1wMSwgMA0KPiAJbGkJdG1wMiwgMA0KPiAJYWRkYwl0bXAzLCBhZGRlbmQs
IHRtcDANCj4gCWFkZGUJY3N1bSwgdG1wMiwgdG1wMQ0KPiAJYWRkCWNzdW0sIGNzdW0sIHRtcDMN
Cg0KSSB3YXMgdGhpbmtpbmcgb2YgYWxsIDY0Yml0IHRhcmdldHMsIG5vdCAzMmJpdCBvbmVzLg0K
DQoJRGF2aWQNCg0K

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v3 2/2] powerpc: add support for csum_add()
  2015-05-26 13:57         ` David Laight
  (?)
@ 2015-05-26 19:42         ` Scott Wood
  2015-05-27  8:41             ` David Laight
  -1 siblings, 1 reply; 16+ messages in thread
From: Scott Wood @ 2015-05-26 19:42 UTC (permalink / raw)
  To: David Laight
  Cc: 'Christophe Leroy',
	Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	linuxppc-dev, linux-kernel

On Tue, 2015-05-26 at 13:57 +0000, David Laight wrote:
> From: Scott Wood ...
> > > I'd also have thought that the 64bit C version above would be 
> > > generally 'good'.
> > 
> > It doesn't generate the addc/addze sequence.  At least with GCC 
> > 4.8.2,
> > it does something like:
> > 
> >     mr      tmp0, csum
> >     li      tmp1, 0
> >     li      tmp2, 0
> >     addc    tmp3, addend, tmp0
> >     adde    csum, tmp2, tmp1
> >     add     csum, csum, tmp3
> 
> I was thinking of all 64bit targets, not 32bit ones.

Oh, you mean move it out of arch/powerpc?  Sounds reasonable, but 
someone should probably check what the resulting code looks like on 
other common arches.  OTOH, if we're going to modify non-arch code, 
that might be a good opportunity to implement Segher's suggestion and 
move to a 64-bit accumulator.

-Scott


^ permalink raw reply	[flat|nested] 16+ messages in thread

* RE: [PATCH v3 2/2] powerpc: add support for csum_add()
  2015-05-26 19:42         ` Scott Wood
@ 2015-05-27  8:41             ` David Laight
  0 siblings, 0 replies; 16+ messages in thread
From: David Laight @ 2015-05-27  8:41 UTC (permalink / raw)
  To: 'Scott Wood'
  Cc: 'Christophe Leroy',
	Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	linuxppc-dev, linux-kernel

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 769 bytes --]

From: Scott Wood
> Sent: 26 May 2015 20:43
...
> > I was thinking of all 64bit targets, not 32bit ones.
> 
> Oh, you mean move it out of arch/powerpc?  Sounds reasonable, but
> someone should probably check what the resulting code looks like on
> other common arches.  OTOH, if we're going to modify non-arch code,
> that might be a good opportunity to implement Segher's suggestion and
> move to a 64-bit accumulator.

Or more likely: adding alternate 32bit words to different 64-bit
registers in order to reduce the dependency chains further.
I'm sure I've seen a version that does that somewhere.

	David

ÿôèº{.nÇ+‰·Ÿ®‰­†+%ŠËÿ±éݶ\x17¥Šwÿº{.nÇ+‰·¥Š{±þG«éÿŠ{ayº\x1dʇڙë,j\a­¢f£¢·hšïêÿ‘êçz_è®\x03(­éšŽŠÝ¢j"ú\x1a¶^[m§ÿÿ¾\a«þG«éÿ¢¸?™¨è­Ú&£ø§~á¶iO•æ¬z·švØ^\x14\x04\x1a¶^[m§ÿÿÃ\fÿ¶ìÿ¢¸?–I¥

^ permalink raw reply	[flat|nested] 16+ messages in thread

* RE: [PATCH v3 2/2] powerpc: add support for csum_add()
@ 2015-05-27  8:41             ` David Laight
  0 siblings, 0 replies; 16+ messages in thread
From: David Laight @ 2015-05-27  8:41 UTC (permalink / raw)
  To: 'Scott Wood'
  Cc: 'Christophe Leroy',
	Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	linuxppc-dev, linux-kernel

RnJvbTogU2NvdHQgV29vZA0KPiBTZW50OiAyNiBNYXkgMjAxNSAyMDo0Mw0KLi4uDQo+ID4gSSB3
YXMgdGhpbmtpbmcgb2YgYWxsIDY0Yml0IHRhcmdldHMsIG5vdCAzMmJpdCBvbmVzLg0KPiANCj4g
T2gsIHlvdSBtZWFuIG1vdmUgaXQgb3V0IG9mIGFyY2gvcG93ZXJwYz8gIFNvdW5kcyByZWFzb25h
YmxlLCBidXQNCj4gc29tZW9uZSBzaG91bGQgcHJvYmFibHkgY2hlY2sgd2hhdCB0aGUgcmVzdWx0
aW5nIGNvZGUgbG9va3MgbGlrZSBvbg0KPiBvdGhlciBjb21tb24gYXJjaGVzLiAgT1RPSCwgaWYg
d2UncmUgZ29pbmcgdG8gbW9kaWZ5IG5vbi1hcmNoIGNvZGUsDQo+IHRoYXQgbWlnaHQgYmUgYSBn
b29kIG9wcG9ydHVuaXR5IHRvIGltcGxlbWVudCBTZWdoZXIncyBzdWdnZXN0aW9uIGFuZA0KPiBt
b3ZlIHRvIGEgNjQtYml0IGFjY3VtdWxhdG9yLg0KDQpPciBtb3JlIGxpa2VseTogYWRkaW5nIGFs
dGVybmF0ZSAzMmJpdCB3b3JkcyB0byBkaWZmZXJlbnQgNjQtYml0DQpyZWdpc3RlcnMgaW4gb3Jk
ZXIgdG8gcmVkdWNlIHRoZSBkZXBlbmRlbmN5IGNoYWlucyBmdXJ0aGVyLg0KSSdtIHN1cmUgSSd2
ZSBzZWVuIGEgdmVyc2lvbiB0aGF0IGRvZXMgdGhhdCBzb21ld2hlcmUuDQoNCglEYXZpZA0KDQo=

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2015-05-27  8:42 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-05-19 15:18 [PATCH v3 0/2] Optimise some IP checksum functions Christophe Leroy
2015-05-19 15:18 ` Christophe Leroy
2015-05-19 15:18 ` [PATCH v3 1/2] powerpc: put csum_tcpudp_magic inline Christophe Leroy
2015-05-19 15:18   ` Christophe Leroy
2015-05-19 15:18 ` [PATCH v3 2/2] powerpc: add support for csum_add() Christophe Leroy
2015-05-19 15:18   ` Christophe Leroy
2015-05-22 15:57   ` David Laight
2015-05-22 15:57     ` David Laight
2015-05-22 19:32     ` Scott Wood
2015-05-22 21:39       ` Segher Boessenkool
2015-05-22 21:54         ` Scott Wood
2015-05-26 13:57       ` David Laight
2015-05-26 13:57         ` David Laight
2015-05-26 19:42         ` Scott Wood
2015-05-27  8:41           ` David Laight
2015-05-27  8:41             ` David Laight

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.