* [RFC] x86/csum: rewrite csum_partial()
@ 2021-11-11  6:53 Eric Dumazet
  2021-11-11  9:10 ` Peter Zijlstra
                   ` (2 more replies)
  0 siblings, 3 replies; 11+ messages in thread
From: Eric Dumazet @ 2021-11-11  6:53 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski
  Cc: netdev, Eric Dumazet, Eric Dumazet, x86, Alexander Duyck

From: Eric Dumazet <edumazet@google.com>

With more NICs supporting CHECKSUM_COMPLETE, and IPv6 being widely used,
csum_partial() is heavily used on small numbers of bytes,
and is consuming many cycles.

IPv6 header size for instance is 40 bytes.

Another thing to consider is that NET_IP_ALIGN is 0 on x86,
meaning that network headers in RX path are not word-aligned,
unless the driver forces this.

This means that csum_partial() fetches one u16
to 'align the buffer', then performs seven u64 additions
with carry in a loop, then a remaining u32, then a remaining u16.

With this new version, we perform 10 u32 adds with carry (for a
40-byte IPv6 header), to avoid the expensive 64->32 transformation.
Using 5 u64 adds plus one add32_with_carry() is more expensive.

Also note that this avoids any loop for buffers of less than ~60 bytes.

Tested on various CPUs; all of them show a big reduction in
csum_partial() cost (by 30 to 75%).

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Alexander Duyck <alexander.duyck@gmail.com>
---
 arch/x86/lib/csum-partial_64.c | 146 +++++++++++++++++----------------
 1 file changed, 77 insertions(+), 69 deletions(-)

diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index e7925d668b680269fb2442766deaf416dc42f9a1..f48fe0ec9663ff3afa1b5403f135407b8b0fde01 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -21,97 +21,105 @@ static inline unsigned short from32to16(unsigned a)
 }
 
 /*
- * Do a 64-bit checksum on an arbitrary memory area.
+ * Do a checksum on an arbitrary memory area.
  * Returns a 32bit checksum.
  *
  * This isn't as time critical as it used to be because many NICs
  * do hardware checksumming these days.
- * 
- * Things tried and found to not make it faster:
- * Manual Prefetching
- * Unrolling to an 128 bytes inner loop.
- * Using interleaving with more registers to break the carry chains.
+ *
+ * Still, with CHECKSUM_COMPLETE this is called to compute
+ * checksums on IPv6 headers (40 bytes) and other small parts.
  */
 static unsigned do_csum(const unsigned char *buff, unsigned len)
 {
-	unsigned odd, count;
-	unsigned long result = 0;
+	unsigned long dwords;
+	unsigned odd, result = 0;
 
-	if (unlikely(len == 0))
-		return result; 
 	odd = 1 & (unsigned long) buff;
 	if (unlikely(odd)) {
+		if (unlikely(len == 0))
+			return result;
 		result = *buff << 8;
 		len--;
 		buff++;
 	}
-	count = len >> 1;		/* nr of 16-bit words.. */
-	if (count) {
-		if (2 & (unsigned long) buff) {
-			result += *(unsigned short *)buff;
-			count--;
-			len -= 2;
-			buff += 2;
-		}
-		count >>= 1;		/* nr of 32-bit words.. */
-		if (count) {
-			unsigned long zero;
-			unsigned count64;
-			if (4 & (unsigned long) buff) {
-				result += *(unsigned int *) buff;
-				count--;
-				len -= 4;
-				buff += 4;
-			}
-			count >>= 1;	/* nr of 64-bit words.. */
+	if (unlikely(len >= 64)) {
+		unsigned long temp64 = result;
+		do {
+			asm("	addq 0*8(%[src]),%[res]\n"
+			    "	adcq 1*8(%[src]),%[res]\n"
+			    "	adcq 2*8(%[src]),%[res]\n"
+			    "	adcq 3*8(%[src]),%[res]\n"
+			    "	adcq 4*8(%[src]),%[res]\n"
+			    "	adcq 5*8(%[src]),%[res]\n"
+			    "	adcq 6*8(%[src]),%[res]\n"
+			    "	adcq 7*8(%[src]),%[res]\n"
+			    "	adcq $0,%[res]"
+			    : [res] "=r" (temp64)
+			    : [src] "r" (buff), "[res]" (temp64)
+			    : "memory");
+			buff += 64;
+			len -= 64;
+		} while (len >= 64);
+		result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
+	}
 
-			/* main loop using 64byte blocks */
-			zero = 0;
-			count64 = count >> 3;
-			while (count64) { 
-				asm("addq 0*8(%[src]),%[res]\n\t"
-				    "adcq 1*8(%[src]),%[res]\n\t"
-				    "adcq 2*8(%[src]),%[res]\n\t"
-				    "adcq 3*8(%[src]),%[res]\n\t"
-				    "adcq 4*8(%[src]),%[res]\n\t"
-				    "adcq 5*8(%[src]),%[res]\n\t"
-				    "adcq 6*8(%[src]),%[res]\n\t"
-				    "adcq 7*8(%[src]),%[res]\n\t"
-				    "adcq %[zero],%[res]"
-				    : [res] "=r" (result)
-				    : [src] "r" (buff), [zero] "r" (zero),
-				    "[res]" (result));
-				buff += 64;
-				count64--;
-			}
+	dwords = len >> 2;
+	if (dwords) { /* dwords is in [1..15] */
+		unsigned long dest;
 
-			/* last up to 7 8byte blocks */
-			count %= 8; 
-			while (count) { 
-				asm("addq %1,%0\n\t"
-				    "adcq %2,%0\n" 
-					    : "=r" (result)
-				    : "m" (*(unsigned long *)buff), 
-				    "r" (zero),  "0" (result));
-				--count; 
-				buff += 8;
-			}
-			result = add32_with_carry(result>>32,
-						  result&0xffffffff); 
+		/*
+		 * This implements an optimized version of
+		 * switch (dwords) {
+		 * case 15: res = add_with_carry(res, buf32[14]); fallthrough;
+		 * case 14: res = add_with_carry(res, buf32[13]); fallthrough;
+		 * case 13: res = add_with_carry(res, buf32[12]); fallthrough;
+		 * ...
+		 * case 3: res = add_with_carry(res, buf32[2]); fallthrough;
+		 * case 2: res = add_with_carry(res, buf32[1]); fallthrough;
+		 * case 1: res = add_with_carry(res, buf32[0]); fallthrough;
+		 * }
+		 *
+		 * "adcl 8byteoff(%reg1),%reg2" are using either 3 or 4 bytes.
+		 */
+		asm("	call 1f\n"
+		    "1:	pop %[dest]\n"
+		    "	lea (2f-1b)(%[dest],%[skip],4),%[dest]\n"
+		    "	clc\n"
+		    "	jmp *%[dest]\n               .align 4\n"
+		    "2:\n"
+		    "	adcl 14*4(%[src]),%[res]\n   .align 4\n"
+		    "	adcl 13*4(%[src]),%[res]\n   .align 4\n"
+		    "	adcl 12*4(%[src]),%[res]\n   .align 4\n"
+		    "	adcl 11*4(%[src]),%[res]\n   .align 4\n"
+		    "	adcl 10*4(%[src]),%[res]\n   .align 4\n"
+		    "	adcl 9*4(%[src]),%[res]\n   .align 4\n"
+		    "	adcl 8*4(%[src]),%[res]\n   .align 4\n"
+		    "	adcl 7*4(%[src]),%[res]\n   .align 4\n"
+		    "	adcl 6*4(%[src]),%[res]\n   .align 4\n"
+		    "	adcl 5*4(%[src]),%[res]\n   .align 4\n"
+		    "	adcl 4*4(%[src]),%[res]\n   .align 4\n"
+		    "	adcl 3*4(%[src]),%[res]\n   .align 4\n"
+		    "	adcl 2*4(%[src]),%[res]\n   .align 4\n"
+		    "	adcl 1*4(%[src]),%[res]\n   .align 4\n"
+		    "	adcl 0*4(%[src]),%[res]\n"
+		    "	adcl $0,%[res]"
+			: [res] "=r" (result), [dest] "=&r" (dest)
+			: [src] "r" (buff), "[res]" (result),
+			  [skip] "r" (dwords ^ 15)
+			: "memory");
+	}
 
-			if (len & 4) {
-				result += *(unsigned int *) buff;
-				buff += 4;
-			}
-		}
+	if (len & 3U) {
+		buff += len & ~3U;
+		result = from32to16(result);
 		if (len & 2) {
 			result += *(unsigned short *) buff;
 			buff += 2;
 		}
+		if (len & 1)
+			result += *buff;
 	}
-	if (len & 1)
-		result += *buff;
-	result = add32_with_carry(result>>32, result & 0xffffffff); 
 	if (unlikely(odd)) { 
 		result = from32to16(result);
 		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
-- 
2.34.0.rc0.344.g81b53c2807-goog



* Re: [RFC] x86/csum: rewrite csum_partial()
  2021-11-11  6:53 [RFC] x86/csum: rewrite csum_partial() Eric Dumazet
@ 2021-11-11  9:10 ` Peter Zijlstra
  2021-11-11  9:44   ` Peter Zijlstra
  2021-11-11 16:51 ` Alexander Duyck
  2021-11-14 13:07 ` David Laight
  2 siblings, 1 reply; 11+ messages in thread
From: Peter Zijlstra @ 2021-11-11  9:10 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S . Miller, Jakub Kicinski, netdev, Eric Dumazet, x86,
	Alexander Duyck

On Wed, Nov 10, 2021 at 10:53:22PM -0800, Eric Dumazet wrote:
> +		/*
> +		 * This implements an optimized version of
> +		 * switch (dwords) {
> +		 * case 15: res = add_with_carry(res, buf32[14]); fallthrough;
> +		 * case 14: res = add_with_carry(res, buf32[13]); fallthrough;
> +		 * case 13: res = add_with_carry(res, buf32[12]); fallthrough;
> +		 * ...
> +		 * case 3: res = add_with_carry(res, buf32[2]); fallthrough;
> +		 * case 2: res = add_with_carry(res, buf32[1]); fallthrough;
> +		 * case 1: res = add_with_carry(res, buf32[0]); fallthrough;
> +		 * }
> +		 *
> +		 * "adcl 8byteoff(%reg1),%reg2" are using either 3 or 4 bytes.
> +		 */
> +		asm("	call 1f\n"
> +		    "1:	pop %[dest]\n"

That's terrible. I think on x86_64 we can do: lea (%%rip), %[dest], not
sure what would be the best way on i386.

> +		    "	lea (2f-1b)(%[dest],%[skip],4),%[dest]\n"
> +		    "	clc\n"
> +		    "	jmp *%[dest]\n               .align 4\n"

That's an indirect branch, you can't do that these days. This would need
to use JMP_NOSPEC (except we don't have a !ASSEMBLER version of that).
But that would also completely and utterly destroy performance.

Also, objtool would complain about this if it hadn't tripped over that
first instruction:

 arch/x86/lib/csum-partial_64.o: warning: objtool: do_csum()+0x84: indirect jump found in RETPOLINE build

I'm not sure what the best way is to unroll loops without using computed
gotos/jump-tables though :/

> +		    "2:\n"
> +		    "	adcl 14*4(%[src]),%[res]\n   .align 4\n"
> +		    "	adcl 13*4(%[src]),%[res]\n   .align 4\n"
> +		    "	adcl 12*4(%[src]),%[res]\n   .align 4\n"
> +		    "	adcl 11*4(%[src]),%[res]\n   .align 4\n"
> +		    "	adcl 10*4(%[src]),%[res]\n   .align 4\n"
> +		    "	adcl 9*4(%[src]),%[res]\n   .align 4\n"
> +		    "	adcl 8*4(%[src]),%[res]\n   .align 4\n"
> +		    "	adcl 7*4(%[src]),%[res]\n   .align 4\n"
> +		    "	adcl 6*4(%[src]),%[res]\n   .align 4\n"
> +		    "	adcl 5*4(%[src]),%[res]\n   .align 4\n"
> +		    "	adcl 4*4(%[src]),%[res]\n   .align 4\n"
> +		    "	adcl 3*4(%[src]),%[res]\n   .align 4\n"
> +		    "	adcl 2*4(%[src]),%[res]\n   .align 4\n"
> +		    "	adcl 1*4(%[src]),%[res]\n   .align 4\n"
> +		    "	adcl 0*4(%[src]),%[res]\n"
> +		    "	adcl $0,%[res]"

If only the CPU would accept: REP ADCL (%%rsi), %[res]   :/

> +			: [res] "=r" (result), [dest] "=&r" (dest)
> +			: [src] "r" (buff), "[res]" (result),
> +			  [skip] "r" (dwords ^ 15)
> +			: "memory");
> +	}


* Re: [RFC] x86/csum: rewrite csum_partial()
  2021-11-11  9:10 ` Peter Zijlstra
@ 2021-11-11  9:44   ` Peter Zijlstra
  2021-11-11 16:02     ` Eric Dumazet
  0 siblings, 1 reply; 11+ messages in thread
From: Peter Zijlstra @ 2021-11-11  9:44 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S . Miller, Jakub Kicinski, netdev, Eric Dumazet, x86,
	Alexander Duyck

On Thu, Nov 11, 2021 at 10:10:19AM +0100, Peter Zijlstra wrote:
> On Wed, Nov 10, 2021 at 10:53:22PM -0800, Eric Dumazet wrote:
> > +		/*
> > +		 * This implements an optimized version of
> > +		 * switch (dwords) {
> > +		 * case 15: res = add_with_carry(res, buf32[14]); fallthrough;
> > +		 * case 14: res = add_with_carry(res, buf32[13]); fallthrough;
> > +		 * case 13: res = add_with_carry(res, buf32[12]); fallthrough;
> > +		 * ...
> > +		 * case 3: res = add_with_carry(res, buf32[2]); fallthrough;
> > +		 * case 2: res = add_with_carry(res, buf32[1]); fallthrough;
> > +		 * case 1: res = add_with_carry(res, buf32[0]); fallthrough;
> > +		 * }
> > +		 *
> > +		 * "adcl 8byteoff(%reg1),%reg2" are using either 3 or 4 bytes.
> > +		 */
> > +		asm("	call 1f\n"
> > +		    "1:	pop %[dest]\n"
> 
> That's terrible. I think on x86_64 we can do: lea (%%rip), %[dest], not
> sure what would be the best way on i386.
> 
> > +		    "	lea (2f-1b)(%[dest],%[skip],4),%[dest]\n"
> > +		    "	clc\n"
> > +		    "	jmp *%[dest]\n               .align 4\n"
> 
> That's an indirect branch, you can't do that these days. This would need
> to use JMP_NOSPEC (except we don't have a !ASSEMBLER version of that.
> But that would also completely and utterly destroy performance.
> 
> Also, objtool would complain about this if it hadn't tripped over that
> first instruction:
> 
>  arch/x86/lib/csum-partial_64.o: warning: objtool: do_csum()+0x84: indirect jump found in RETPOLINE build
> 
> I'm not sure what the best way is to unroll loops without using computed
> gotos/jump-tables though :/
> 
> > +		    "2:\n"
> > +		    "	adcl 14*4(%[src]),%[res]\n   .align 4\n"
> > +		    "	adcl 13*4(%[src]),%[res]\n   .align 4\n"
> > +		    "	adcl 12*4(%[src]),%[res]\n   .align 4\n"
> > +		    "	adcl 11*4(%[src]),%[res]\n   .align 4\n"
> > +		    "	adcl 10*4(%[src]),%[res]\n   .align 4\n"
> > +		    "	adcl 9*4(%[src]),%[res]\n   .align 4\n"
> > +		    "	adcl 8*4(%[src]),%[res]\n   .align 4\n"
> > +		    "	adcl 7*4(%[src]),%[res]\n   .align 4\n"
> > +		    "	adcl 6*4(%[src]),%[res]\n   .align 4\n"
> > +		    "	adcl 5*4(%[src]),%[res]\n   .align 4\n"
> > +		    "	adcl 4*4(%[src]),%[res]\n   .align 4\n"
> > +		    "	adcl 3*4(%[src]),%[res]\n   .align 4\n"
> > +		    "	adcl 2*4(%[src]),%[res]\n   .align 4\n"
> > +		    "	adcl 1*4(%[src]),%[res]\n   .align 4\n"
> > +		    "	adcl 0*4(%[src]),%[res]\n"
> > +		    "	adcl $0,%[res]"
> 
> If only the CPU would accept: REP ADCL (%%rsi), %[res]   :/
> 
> > +			: [res] "=r" (result), [dest] "=&r" (dest)
> > +			: [src] "r" (buff), "[res]" (result),
> > +			  [skip] "r" (dwords ^ 15)
> > +			: "memory");
> > +	}

This is the best I could come up with ...

--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -32,7 +32,6 @@ static inline unsigned short from32to16(
  */
 static unsigned do_csum(const unsigned char *buff, unsigned len)
 {
-	unsigned long dwords;
 	unsigned odd, result = 0;
 
 	odd = 1 & (unsigned long) buff;
@@ -64,50 +63,54 @@ static unsigned do_csum(const unsigned c
 		result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
 	}
 
-	dwords = len >> 2;
-	if (dwords) { /* dwords is in [1..15] */
-		unsigned long dest;
-
-		/*
-		 * This implements an optimized version of
-		 * switch (dwords) {
-		 * case 15: res = add_with_carry(res, buf32[14]); fallthrough;
-		 * case 14: res = add_with_carry(res, buf32[13]); fallthrough;
-		 * case 13: res = add_with_carry(res, buf32[12]); fallthrough;
-		 * ...
-		 * case 3: res = add_with_carry(res, buf32[2]); fallthrough;
-		 * case 2: res = add_with_carry(res, buf32[1]); fallthrough;
-		 * case 1: res = add_with_carry(res, buf32[0]); fallthrough;
-		 * }
-		 *
-		 * "adcl 8byteoff(%reg1),%reg2" are using either 3 or 4 bytes.
-		 */
-		asm("	call 1f\n"
-		    "1:	pop %[dest]\n"
-		    "	lea (2f-1b)(%[dest],%[skip],4),%[dest]\n"
-		    "	clc\n"
-		    "	jmp *%[dest]\n               .align 4\n"
-		    "2:\n"
-		    "	adcl 14*4(%[src]),%[res]\n   .align 4\n"
-		    "	adcl 13*4(%[src]),%[res]\n   .align 4\n"
-		    "	adcl 12*4(%[src]),%[res]\n   .align 4\n"
-		    "	adcl 11*4(%[src]),%[res]\n   .align 4\n"
-		    "	adcl 10*4(%[src]),%[res]\n   .align 4\n"
-		    "	adcl 9*4(%[src]),%[res]\n   .align 4\n"
-		    "	adcl 8*4(%[src]),%[res]\n   .align 4\n"
-		    "	adcl 7*4(%[src]),%[res]\n   .align 4\n"
-		    "	adcl 6*4(%[src]),%[res]\n   .align 4\n"
-		    "	adcl 5*4(%[src]),%[res]\n   .align 4\n"
-		    "	adcl 4*4(%[src]),%[res]\n   .align 4\n"
-		    "	adcl 3*4(%[src]),%[res]\n   .align 4\n"
-		    "	adcl 2*4(%[src]),%[res]\n   .align 4\n"
-		    "	adcl 1*4(%[src]),%[res]\n   .align 4\n"
-		    "	adcl 0*4(%[src]),%[res]\n"
-		    "	adcl $0,%[res]"
-			: [res] "=r" (result), [dest] "=&r" (dest)
-			: [src] "r" (buff), "[res]" (result),
-			  [skip] "r" (dwords ^ 15)
-			: "memory");
+	if (len >> 2) { /* dwords is in [1..15] */
+		if (len >= 32) {
+			asm("	addl 0*4(%[src]),%[res]\n"
+			    "	adcl 1*4(%[src]),%[res]\n"
+			    "	adcl 2*4(%[src]),%[res]\n"
+			    "	adcl 3*4(%[src]),%[res]\n"
+			    "	adcl 4*4(%[src]),%[res]\n"
+			    "	adcl 5*4(%[src]),%[res]\n"
+			    "	adcl 6*4(%[src]),%[res]\n"
+			    "	adcl 7*4(%[src]),%[res]\n"
+			    "	adcl $0,%[res]"
+			    : [res] "=r" (result)
+			    : [src] "r" (buff), "[res]" (result)
+			    : "memory");
+			buff += 32;
+			len -= 32;
+		}
+		if (len >= 16) {
+			asm("	addl 0*4(%[src]),%[res]\n"
+			    "	adcl 1*4(%[src]),%[res]\n"
+			    "	adcl 2*4(%[src]),%[res]\n"
+			    "	adcl 3*4(%[src]),%[res]\n"
+			    "	adcl $0,%[res]"
+			    : [res] "=r" (result)
+			    : [src] "r" (buff), "[res]" (result)
+			    : "memory");
+			buff += 16;
+			len -= 16;
+		}
+		if (len >= 8) {
+			asm("	addl 0*4(%[src]),%[res]\n"
+			    "	adcl 1*4(%[src]),%[res]\n"
+			    "	adcl $0,%[res]"
+			    : [res] "=r" (result)
+			    : [src] "r" (buff), "[res]" (result)
+			    : "memory");
+			buff += 8;
+			len -= 8;
+		}
+		if (len >= 4) {
+			asm("	addl 0*4(%[src]),%[res]\n"
+			    "	adcl $0,%[res]"
+			    : [res] "=r" (result)
+			    : [src] "r" (buff), "[res]" (result)
+			    : "memory");
+			buff += 4;
+			len -= 4;
+		}
 	}
 
 	if (len & 3U) {
@@ -120,7 +123,7 @@ static unsigned do_csum(const unsigned c
 		if (len & 1)
 			result += *buff;
 	}
-	if (unlikely(odd)) { 
+	if (unlikely(odd)) {
 		result = from32to16(result);
 		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
 	}


* Re: [RFC] x86/csum: rewrite csum_partial()
  2021-11-11  9:44   ` Peter Zijlstra
@ 2021-11-11 16:02     ` Eric Dumazet
  2021-11-11 16:52       ` Eric Dumazet
  0 siblings, 1 reply; 11+ messages in thread
From: Eric Dumazet @ 2021-11-11 16:02 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Eric Dumazet, David S . Miller, Jakub Kicinski, netdev, x86,
	Alexander Duyck

On Thu, Nov 11, 2021 at 1:44 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Thu, Nov 11, 2021 at 10:10:19AM +0100, Peter Zijlstra wrote:
> > On Wed, Nov 10, 2021 at 10:53:22PM -0800, Eric Dumazet wrote:
> > > +           /*
> > > +            * This implements an optimized version of
> > > +            * switch (dwords) {
> > > +            * case 15: res = add_with_carry(res, buf32[14]); fallthrough;
> > > +            * case 14: res = add_with_carry(res, buf32[13]); fallthrough;
> > > +            * case 13: res = add_with_carry(res, buf32[12]); fallthrough;
> > > +            * ...
> > > +            * case 3: res = add_with_carry(res, buf32[2]); fallthrough;
> > > +            * case 2: res = add_with_carry(res, buf32[1]); fallthrough;
> > > +            * case 1: res = add_with_carry(res, buf32[0]); fallthrough;
> > > +            * }
> > > +            *
> > > +            * "adcl 8byteoff(%reg1),%reg2" are using either 3 or 4 bytes.
> > > +            */
> > > +           asm("   call 1f\n"
> > > +               "1: pop %[dest]\n"
> >
> > That's terrible. I think on x86_64 we can do: lea (%%rip), %[dest], not
> > sure what would be the best way on i386.
> >
> > > +               "   lea (2f-1b)(%[dest],%[skip],4),%[dest]\n"
> > > +               "   clc\n"
> > > +               "   jmp *%[dest]\n               .align 4\n"
> >
> > That's an indirect branch, you can't do that these days. This would need
> > to use JMP_NOSPEC (except we don't have a !ASSEMBLER version of that.
> > But that would also completely and utterly destroy performance.
> >
> > Also, objtool would complain about this if it hadn't tripped over that
> > first instruction:
> >
> >  arch/x86/lib/csum-partial_64.o: warning: objtool: do_csum()+0x84: indirect jump found in RETPOLINE build
> >
> > I'm not sure what the best way is to unroll loops without using computed
> > gotos/jump-tables though :/
> >
> > > +               "2:\n"
> > > +               "   adcl 14*4(%[src]),%[res]\n   .align 4\n"
> > > +               "   adcl 13*4(%[src]),%[res]\n   .align 4\n"
> > > +               "   adcl 12*4(%[src]),%[res]\n   .align 4\n"
> > > +               "   adcl 11*4(%[src]),%[res]\n   .align 4\n"
> > > +               "   adcl 10*4(%[src]),%[res]\n   .align 4\n"
> > > +               "   adcl 9*4(%[src]),%[res]\n   .align 4\n"
> > > +               "   adcl 8*4(%[src]),%[res]\n   .align 4\n"
> > > +               "   adcl 7*4(%[src]),%[res]\n   .align 4\n"
> > > +               "   adcl 6*4(%[src]),%[res]\n   .align 4\n"
> > > +               "   adcl 5*4(%[src]),%[res]\n   .align 4\n"
> > > +               "   adcl 4*4(%[src]),%[res]\n   .align 4\n"
> > > +               "   adcl 3*4(%[src]),%[res]\n   .align 4\n"
> > > +               "   adcl 2*4(%[src]),%[res]\n   .align 4\n"
> > > +               "   adcl 1*4(%[src]),%[res]\n   .align 4\n"
> > > +               "   adcl 0*4(%[src]),%[res]\n"
> > > +               "   adcl $0,%[res]"
> >
> > If only the CPU would accept: REP ADCL (%%rsi), %[res]   :/
> >
> > > +                   : [res] "=r" (result), [dest] "=&r" (dest)
> > > +                   : [src] "r" (buff), "[res]" (result),
> > > +                     [skip] "r" (dwords ^ 15)
> > > +                   : "memory");
> > > +   }
>
> This is the best I could come up with ...
>
> --- a/arch/x86/lib/csum-partial_64.c
> +++ b/arch/x86/lib/csum-partial_64.c
> @@ -32,7 +32,6 @@ static inline unsigned short from32to16(
>   */
>  static unsigned do_csum(const unsigned char *buff, unsigned len)
>  {
> -       unsigned long dwords;
>         unsigned odd, result = 0;
>
>         odd = 1 & (unsigned long) buff;
> @@ -64,50 +63,54 @@ static unsigned do_csum(const unsigned c
>                 result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
>         }
>
> -       dwords = len >> 2;
> -       if (dwords) { /* dwords is in [1..15] */
> -               unsigned long dest;
> -
> -               /*
> -                * This implements an optimized version of
> -                * switch (dwords) {
> -                * case 15: res = add_with_carry(res, buf32[14]); fallthrough;
> -                * case 14: res = add_with_carry(res, buf32[13]); fallthrough;
> -                * case 13: res = add_with_carry(res, buf32[12]); fallthrough;
> -                * ...
> -                * case 3: res = add_with_carry(res, buf32[2]); fallthrough;
> -                * case 2: res = add_with_carry(res, buf32[1]); fallthrough;
> -                * case 1: res = add_with_carry(res, buf32[0]); fallthrough;
> -                * }
> -                *
> -                * "adcl 8byteoff(%reg1),%reg2" are using either 3 or 4 bytes.
> -                */
> -               asm("   call 1f\n"
> -                   "1: pop %[dest]\n"
> -                   "   lea (2f-1b)(%[dest],%[skip],4),%[dest]\n"
> -                   "   clc\n"
> -                   "   jmp *%[dest]\n               .align 4\n"
> -                   "2:\n"
> -                   "   adcl 14*4(%[src]),%[res]\n   .align 4\n"
> -                   "   adcl 13*4(%[src]),%[res]\n   .align 4\n"
> -                   "   adcl 12*4(%[src]),%[res]\n   .align 4\n"
> -                   "   adcl 11*4(%[src]),%[res]\n   .align 4\n"
> -                   "   adcl 10*4(%[src]),%[res]\n   .align 4\n"
> -                   "   adcl 9*4(%[src]),%[res]\n   .align 4\n"
> -                   "   adcl 8*4(%[src]),%[res]\n   .align 4\n"
> -                   "   adcl 7*4(%[src]),%[res]\n   .align 4\n"
> -                   "   adcl 6*4(%[src]),%[res]\n   .align 4\n"
> -                   "   adcl 5*4(%[src]),%[res]\n   .align 4\n"
> -                   "   adcl 4*4(%[src]),%[res]\n   .align 4\n"
> -                   "   adcl 3*4(%[src]),%[res]\n   .align 4\n"
> -                   "   adcl 2*4(%[src]),%[res]\n   .align 4\n"
> -                   "   adcl 1*4(%[src]),%[res]\n   .align 4\n"
> -                   "   adcl 0*4(%[src]),%[res]\n"
> -                   "   adcl $0,%[res]"
> -                       : [res] "=r" (result), [dest] "=&r" (dest)
> -                       : [src] "r" (buff), "[res]" (result),
> -                         [skip] "r" (dwords ^ 15)
> -                       : "memory");
> +       if (len >> 2) { /* dwords is in [1..15] */
> +               if (len >= 32) {
> +                       asm("   addl 0*4(%[src]),%[res]\n"
> +                           "   adcl 1*4(%[src]),%[res]\n"
> +                           "   adcl 2*4(%[src]),%[res]\n"
> +                           "   adcl 3*4(%[src]),%[res]\n"
> +                           "   adcl 4*4(%[src]),%[res]\n"
> +                           "   adcl 5*4(%[src]),%[res]\n"
> +                           "   adcl 6*4(%[src]),%[res]\n"
> +                           "   adcl 7*4(%[src]),%[res]\n"
> +                           "   adcl $0,%[res]"
> +                           : [res] "=r" (result)
> +                           : [src] "r" (buff), "[res]" (result)
> +                           : "memory");
> +                       buff += 32;
> +                       len -= 32;
> +               }
> +               if (len >= 16) {
> +                       asm("   addl 0*4(%[src]),%[res]\n"
> +                           "   adcl 1*4(%[src]),%[res]\n"
> +                           "   adcl 2*4(%[src]),%[res]\n"
> +                           "   adcl 3*4(%[src]),%[res]\n"
> +                           "   adcl $0,%[res]"
> +                           : [res] "=r" (result)
> +                           : [src] "r" (buff), "[res]" (result)
> +                           : "memory");
> +                       buff += 16;
> +                       len -= 16;
> +               }
> +               if (len >= 8) {
> +                       asm("   addl 0*4(%[src]),%[res]\n"
> +                           "   adcl 1*4(%[src]),%[res]\n"
> +                           "   adcl $0,%[res]"
> +                           : [res] "=r" (result)
> +                           : [src] "r" (buff), "[res]" (result)
> +                           : "memory");
> +                       buff += 8;
> +                       len -= 8;
> +               }
> +               if (len >= 4) {
> +                       asm("   addl 0*4(%[src]),%[res]\n"
> +                           "   adcl $0,%[res]"
> +                           : [res] "=r" (result)
> +                           : [src] "r" (buff), "[res]" (result)
> +                           : "memory");
> +                       buff += 4;
> +                       len -= 4;
> +               }
>         }
>
>         if (len & 3U) {
> @@ -120,7 +123,7 @@ static unsigned do_csum(const unsigned c
>                 if (len & 1)
>                         result += *buff;
>         }
> -       if (unlikely(odd)) {
> +       if (unlikely(odd)) {
>                 result = from32to16(result);
>                 result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
>         }

Thanks Peter !

This is more or less the first version I wrote. (I was testing
(len & 32), (len & 16), ... so as not to have to update len in these blocks.)

Then, I tried to add an inline version, a la ip_fast_csum() but for IPv6.
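A rough sketch of what I mean by that, just to illustrate (the name is
made up, it is not in the patch, and the caller would still need the
usual folding):

static inline unsigned ipv6_csum_40b(const void *hdr)
{
	u64 sum = 0;

	/* 40 bytes = 5 quadwords: straight-line add/adc chain, no loop. */
	asm("	addq 0*8(%[p]),%[s]\n"
	    "	adcq 1*8(%[p]),%[s]\n"
	    "	adcq 2*8(%[p]),%[s]\n"
	    "	adcq 3*8(%[p]),%[s]\n"
	    "	adcq 4*8(%[p]),%[s]\n"
	    "	adcq $0,%[s]"
	    : [s] "+r" (sum)
	    : [p] "r" (hdr)
	    : "memory");

	/* 64->32 fold, reusing the existing helper. */
	return add32_with_carry(sum >> 32, sum & 0xffffffff);
}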

Then I came up with the version I sent; for some reason my .config
temporarily had CONFIG_RETPOLINE disabled,
thanks for reminding me of this!

I also missed this warning anyway:
arch/x86/lib/csum-partial_64.o: warning: objtool: csum_partial()+0x2f:
unannotated intra-function call

I will spend a bit more time on this before sending a V2, thanks again !


* Re: [RFC] x86/csum: rewrite csum_partial()
  2021-11-11  6:53 [RFC] x86/csum: rewrite csum_partial() Eric Dumazet
  2021-11-11  9:10 ` Peter Zijlstra
@ 2021-11-11 16:51 ` Alexander Duyck
  2021-11-14 13:07 ` David Laight
  2 siblings, 0 replies; 11+ messages in thread
From: Alexander Duyck @ 2021-11-11 16:51 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S . Miller, Jakub Kicinski, netdev, Eric Dumazet,
	the arch/x86 maintainers

On Wed, Nov 10, 2021 at 10:53 PM Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> From: Eric Dumazet <edumazet@google.com>
>
> With more NIC supporting CHECKSUM_COMPLETE, and IPv6 being widely used.
> csum_partial() is heavily used with small amount of bytes,
> and is consuming many cycles.
>
> IPv6 header size for instance is 40 bytes.
>
> Another thing to consider is that NET_IP_ALIGN is 0 on x86,
> meaning that network headers in RX path are not word-aligned,
> unless the driver forces this.
>
> This means that csum_partial() fetches one u16
> to 'align the buffer', then perform seven u64 additions
> with carry in a loop, then a remaining u32, then a remaining u16.
>
> With this new version, we perform 10 u32 adds with carry, to
> avoid the expensive 64->32 transformation. Using 5 u64 adds
> plus one add32_with_carry() is more expensive.
>
> Also note that this avoids loops for less than ~60 bytes.
>
> Tested on various cpus, all of them show a big reduction in
> csum_partial() cost (by 30 to 75 %)
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Alexander Duyck <alexander.duyck@gmail.com>
> ---
>  arch/x86/lib/csum-partial_64.c | 146 +++++++++++++++++----------------
>  1 file changed, 77 insertions(+), 69 deletions(-)
>
> diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
> index e7925d668b680269fb2442766deaf416dc42f9a1..f48fe0ec9663ff3afa1b5403f135407b8b0fde01 100644
> --- a/arch/x86/lib/csum-partial_64.c
> +++ b/arch/x86/lib/csum-partial_64.c
> @@ -21,97 +21,105 @@ static inline unsigned short from32to16(unsigned a)
>  }
>
>  /*
> - * Do a 64-bit checksum on an arbitrary memory area.
> + * Do a checksum on an arbitrary memory area.
>   * Returns a 32bit checksum.
>   *
>   * This isn't as time critical as it used to be because many NICs
>   * do hardware checksumming these days.
> - *
> - * Things tried and found to not make it faster:
> - * Manual Prefetching
> - * Unrolling to an 128 bytes inner loop.
> - * Using interleaving with more registers to break the carry chains.
> + *
> + * Still, with CHECKSUM_COMPLETE this is called to compute
> + * checksums on IPv6 headers (40 bytes) and other small parts.
>   */
>  static unsigned do_csum(const unsigned char *buff, unsigned len)
>  {
> -       unsigned odd, count;
> -       unsigned long result = 0;
> +       unsigned long dwords;
> +       unsigned odd, result = 0;
>
> -       if (unlikely(len == 0))
> -               return result;
>         odd = 1 & (unsigned long) buff;
>         if (unlikely(odd)) {
> +               if (unlikely(len == 0))
> +                       return result;
>                 result = *buff << 8;
>                 len--;
>                 buff++;
>         }
> -       count = len >> 1;               /* nr of 16-bit words.. */
> -       if (count) {
> -               if (2 & (unsigned long) buff) {
> -                       result += *(unsigned short *)buff;
> -                       count--;
> -                       len -= 2;
> -                       buff += 2;
> -               }
> -               count >>= 1;            /* nr of 32-bit words.. */
> -               if (count) {
> -                       unsigned long zero;
> -                       unsigned count64;
> -                       if (4 & (unsigned long) buff) {
> -                               result += *(unsigned int *) buff;
> -                               count--;
> -                               len -= 4;
> -                               buff += 4;
> -                       }
> -                       count >>= 1;    /* nr of 64-bit words.. */

So for most cases getting rid of the alignment code should be fine.
However, as I recall, my main concern when dealing with something like
this was the case of a value straddling a page. For most network packets
that shouldn't happen, but it may be something we want to add some sort
of debug check for: if we are unaligned, the address straddles pages,
and debugging is enabled, we throw a warning at least.

Otherwise, as an alternative, we might consider just performing one 8B
read at the start, forcing the alignment by masking the address and the
data and dropping the unused bytes, in order to get us into 8B alignment
so that we don't trigger the page-straddling read; we can then rotate
the result later to align it to the start.

> +       if (unlikely(len >= 64)) {
> +               unsigned long temp64 = result;
> +               do {
> +                       asm("   addq 0*8(%[src]),%[res]\n"
> +                           "   adcq 1*8(%[src]),%[res]\n"
> +                           "   adcq 2*8(%[src]),%[res]\n"
> +                           "   adcq 3*8(%[src]),%[res]\n"
> +                           "   adcq 4*8(%[src]),%[res]\n"
> +                           "   adcq 5*8(%[src]),%[res]\n"
> +                           "   adcq 6*8(%[src]),%[res]\n"
> +                           "   adcq 7*8(%[src]),%[res]\n"
> +                           "   adcq $0,%[res]"
> +                           : [res] "=r" (temp64)
> +                           : [src] "r" (buff), "[res]" (temp64)
> +                           : "memory");
> +                       buff += 64;
> +                       len -= 64;
> +               } while (len >= 64);
> +               result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
> +       }
>
> -                       /* main loop using 64byte blocks */
> -                       zero = 0;
> -                       count64 = count >> 3;
> -                       while (count64) {
> -                               asm("addq 0*8(%[src]),%[res]\n\t"
> -                                   "adcq 1*8(%[src]),%[res]\n\t"
> -                                   "adcq 2*8(%[src]),%[res]\n\t"
> -                                   "adcq 3*8(%[src]),%[res]\n\t"
> -                                   "adcq 4*8(%[src]),%[res]\n\t"
> -                                   "adcq 5*8(%[src]),%[res]\n\t"
> -                                   "adcq 6*8(%[src]),%[res]\n\t"
> -                                   "adcq 7*8(%[src]),%[res]\n\t"
> -                                   "adcq %[zero],%[res]"
> -                                   : [res] "=r" (result)
> -                                   : [src] "r" (buff), [zero] "r" (zero),
> -                                   "[res]" (result));
> -                               buff += 64;
> -                               count64--;
> -                       }

So in terms of this main loop it is probably as good as it gets for
anything pre-Haswell, but I am wondering if we might be able to
improve upon this for Haswell and newer architectures that have more
read throughput since I think those parts would be serialized on the
carry flag instead of being limited to one read per cycle.

> +       dwords = len >> 2;
> +       if (dwords) { /* dwords is in [1..15] */
> +               unsigned long dest;
>
> -                       /* last up to 7 8byte blocks */
> -                       count %= 8;
> -                       while (count) {
> -                               asm("addq %1,%0\n\t"
> -                                   "adcq %2,%0\n"
> -                                           : "=r" (result)
> -                                   : "m" (*(unsigned long *)buff),
> -                                   "r" (zero),  "0" (result));
> -                               --count;
> -                               buff += 8;
> -                       }
> -                       result = add32_with_carry(result>>32,
> -                                                 result&0xffffffff);
> +               /*
> +                * This implements an optimized version of
> +                * switch (dwords) {
> +                * case 15: res = add_with_carry(res, buf32[14]); fallthrough;
> +                * case 14: res = add_with_carry(res, buf32[13]); fallthrough;
> +                * case 13: res = add_with_carry(res, buf32[12]); fallthrough;
> +                * ...
> +                * case 3: res = add_with_carry(res, buf32[2]); fallthrough;
> +                * case 2: res = add_with_carry(res, buf32[1]); fallthrough;
> +                * case 1: res = add_with_carry(res, buf32[0]); fallthrough;
> +                * }
> +                *
> +                * "adcl 8byteoff(%reg1),%reg2" are using either 3 or 4 bytes.
> +                */
> +               asm("   call 1f\n"
> +                   "1: pop %[dest]\n"
> +                   "   lea (2f-1b)(%[dest],%[skip],4),%[dest]\n"
> +                   "   clc\n"
> +                   "   jmp *%[dest]\n               .align 4\n"
> +                   "2:\n"
> +                   "   adcl 14*4(%[src]),%[res]\n   .align 4\n"
> +                   "   adcl 13*4(%[src]),%[res]\n   .align 4\n"
> +                   "   adcl 12*4(%[src]),%[res]\n   .align 4\n"
> +                   "   adcl 11*4(%[src]),%[res]\n   .align 4\n"
> +                   "   adcl 10*4(%[src]),%[res]\n   .align 4\n"
> +                   "   adcl 9*4(%[src]),%[res]\n   .align 4\n"
> +                   "   adcl 8*4(%[src]),%[res]\n   .align 4\n"
> +                   "   adcl 7*4(%[src]),%[res]\n   .align 4\n"
> +                   "   adcl 6*4(%[src]),%[res]\n   .align 4\n"
> +                   "   adcl 5*4(%[src]),%[res]\n   .align 4\n"
> +                   "   adcl 4*4(%[src]),%[res]\n   .align 4\n"
> +                   "   adcl 3*4(%[src]),%[res]\n   .align 4\n"
> +                   "   adcl 2*4(%[src]),%[res]\n   .align 4\n"
> +                   "   adcl 1*4(%[src]),%[res]\n   .align 4\n"
> +                   "   adcl 0*4(%[src]),%[res]\n"
> +                   "   adcl $0,%[res]"
> +                       : [res] "=r" (result), [dest] "=&r" (dest)
> +                       : [src] "r" (buff), "[res]" (result),
> +                         [skip] "r" (dwords ^ 15)
> +                       : "memory");
> +       }

I gave up on this after the whole Spectre/Meltdown thing because it
was an indirect jump. Packing it down to a tight loop processing
single QWORDs may be in our favor, as the CPU can just replay the
decoded instructions as many times as we need.
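Something along these lines is what I have in mind (sketch only;
'qwords' would be the non-zero number of remaining 8-byte words and
'temp64' the running sum):

	/* Carry stays live across iterations: lea, dec and jnz leave CF alone. */
	asm("	clc\n"
	    "1:	adcq (%[src]),%[res]\n"
	    "	lea 8(%[src]),%[src]\n"
	    "	dec %[cnt]\n"
	    "	jnz 1b\n"
	    "	adcq $0,%[res]"
	    : [res] "+r" (temp64), [src] "+r" (buff), [cnt] "+r" (qwords)
	    :
	    : "memory", "cc");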

>
> -                       if (len & 4) {
> -                               result += *(unsigned int *) buff;
> -                               buff += 4;
> -                       }
> -               }
> +       if (len & 3U) {
> +               buff += len & ~3U;
> +               result = from32to16(result);
>                 if (len & 2) {
>                         result += *(unsigned short *) buff;
>                         buff += 2;
>                 }
> +               if (len & 1)
> +                       result += *buff;
>         }
> -       if (len & 1)
> -               result += *buff;

This is another spot where I wonder if we can't get away with a single
8B read and then just mask the value based on the length remaining. I
would think it would save us a fair bit of time and a number of
cycles.
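As a sketch (assuming 'buff' is 8-byte aligned at this point, so the
load cannot cross a page even though it reads up to 7 bytes past the
data, and 'len' is the 1..7 remaining bytes; little-endian only):

static inline unsigned long csum_tail_word(const unsigned char *buff,
					   unsigned len)
{
	unsigned long word = *(const unsigned long *)buff;

	/* Keep only the low 'len' bytes; the rest is past the end. */
	return word & (~0UL >> (8 * (8 - len)));
}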

> -       result = add32_with_carry(result>>32, result & 0xffffffff);
>         if (unlikely(odd)) {
>                 result = from32to16(result);
>                 result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);

Rather than doing the fold and rotate if odd, would it maybe make sense
for us to just do a single 64b rotate based on the offset of the
original value? We could do that before the result is folded to a 32b
value, and it would likely only cost us one cycle instead of the several
that are being spent here doing the test for zero, the fold, and the
rotate.


* Re: [RFC] x86/csum: rewrite csum_partial()
  2021-11-11 16:02     ` Eric Dumazet
@ 2021-11-11 16:52       ` Eric Dumazet
  2021-11-11 18:17         ` Andrew Cooper
  0 siblings, 1 reply; 11+ messages in thread
From: Eric Dumazet @ 2021-11-11 16:52 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Eric Dumazet, David S . Miller, Jakub Kicinski, netdev, x86,
	Alexander Duyck

On Thu, Nov 11, 2021 at 8:02 AM Eric Dumazet <edumazet@google.com> wrote:
>

> Thanks Peter !
>
> This is more or less the first version I wrote. (I was doing tests for
> (len & 32), (len & 16) .. to not have to update len in these blocks.
>
> Then, I tried to add an inline version, a la ip_fast_csum() but for IPv6.
>
> Then I came up with the version I sent, for some reason my .config had
> temporarily disabled CONFIG_RETPOLINE,
> thanks for reminding me this !
>
> I also missed this warning anyway :
> arch/x86/lib/csum-partial_64.o: warning: objtool: csum_partial()+0x2f:
> unannotated intra-function call
>
> I will spend a bit more time on this before sending a V2, thanks again !

BTW, I could not understand why:

               result = add32_with_carry(result, *(u32 *)buff);

generates this code :

 123: 41 8b 09              mov    (%r9),%ecx
 126: 89 4d f8              mov    %ecx,-0x8(%rbp)
 129: 03 45 f8              add    -0x8(%rbp),%eax
 12c: 83 d0 00              adc    $0x0,%eax

Apparently add32_with_carry() forces the use of a temporary in memory

While
               asm("   addl 0*4(%[src]),%[res]\n"
                   "   adcl $0,%[res]\n"
                       : [res] "=r" (result)
                       : [src] "r" (buff), "[res]" (result)
                        : "memory");

gives

 120: 41 03 01              add    (%r9),%eax
 123: 83 d0 00              adc    $0x0,%eax


* Re: [RFC] x86/csum: rewrite csum_partial()
  2021-11-11 16:52       ` Eric Dumazet
@ 2021-11-11 18:17         ` Andrew Cooper
  2021-11-11 19:02           ` Eric Dumazet
  0 siblings, 1 reply; 11+ messages in thread
From: Andrew Cooper @ 2021-11-11 18:17 UTC (permalink / raw)
  To: Eric Dumazet, Peter Zijlstra
  Cc: Eric Dumazet, David S . Miller, Jakub Kicinski, netdev, x86,
	Alexander Duyck, Andrew Cooper, amc96

On 11/11/2021 16:52, Eric Dumazet wrote:
> On Thu, Nov 11, 2021 at 8:02 AM Eric Dumazet <edumazet@google.com> wrote:
>> Thanks Peter !
>>
>> This is more or less the first version I wrote. (I was doing tests for
>> (len & 32), (len & 16) .. to not have to update len in these blocks.
>>
>> Then, I tried to add an inline version, a la ip_fast_csum() but for IPv6.
>>
>> Then I came up with the version I sent, for some reason my .config had
>> temporarily disabled CONFIG_RETPOLINE,
>> thanks for reminding me this !
>>
>> I also missed this warning anyway :
>> arch/x86/lib/csum-partial_64.o: warning: objtool: csum_partial()+0x2f:
>> unannotated intra-function call
>>
>> I will spend a bit more time on this before sending a V2, thanks again !
> BTW, I could not understand why :
>
>                result = add32_with_carry(result, *(u32 *)buff);
>
> generates this code :
>
>  123: 41 8b 09              mov    (%r9),%ecx
>  126: 89 4d f8              mov    %ecx,-0x8(%rbp)
>  129: 03 45 f8              add    -0x8(%rbp),%eax
>  12c: 83 d0 00              adc    $0x0,%eax

Are you using Clang?  There is a long outstanding code generation bug
where an "rm" constraint is converted to "m" internally.

https://bugs.llvm.org/show_bug.cgi?id=47530

Even a stopgap of pretending "rm" means "r" would result in far better
code, 99% of the time.
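For reference, add32_with_carry() in arch/x86/include/asm/checksum_64.h
is (roughly, from memory) the following, and the "rm" input is exactly
the constraint the bug degrades to "m":

static inline unsigned add32_with_carry(unsigned a, unsigned b)
{
	asm("addl %2,%0\n\t"
	    "adcl $0,%0"
	    : "=r" (a)
	    : "0" (a), "rm" (b));
	return a;
}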

> Apparently add32_with_carry() forces the use of use of a temporary in memory
>
> While
>                asm("   addl 0*4(%[src]),%[res]\n"
>                    "   adcl $0,%[res]\n"
>                        : [res] "=r" (result)
>                        : [src] "r" (buff), "[res]" (result)
>                         : "memory");

Just as a minor note about the asm constraints here and elsewhere:

    : [res] "=r" (result)
    : "[res]" (result)

ought to be just [res] "+r" (result).  The result variable really is
read and written by the asm fragments.
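Applied to the fragment quoted above, that becomes (sketch):

	asm("	addl 0*4(%[src]),%[res]\n"
	    "	adcl $0,%[res]\n"
	    : [res] "+r" (result)
	    : [src] "r" (buff)
	    : "memory");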

~Andrew



* Re: [RFC] x86/csum: rewrite csum_partial()
  2021-11-11 18:17         ` Andrew Cooper
@ 2021-11-11 19:02           ` Eric Dumazet
  0 siblings, 0 replies; 11+ messages in thread
From: Eric Dumazet @ 2021-11-11 19:02 UTC (permalink / raw)
  To: Andrew Cooper
  Cc: Peter Zijlstra, Eric Dumazet, David S . Miller, Jakub Kicinski,
	netdev, x86, Alexander Duyck, amc96

On Thu, Nov 11, 2021 at 10:18 AM Andrew Cooper
<andrew.cooper3@citrix.com> wrote:
>
> On 11/11/2021 16:52, Eric Dumazet wrote:
> > On Thu, Nov 11, 2021 at 8:02 AM Eric Dumazet <edumazet@google.com> wrote:
> >> Thanks Peter !
> >>
> >> This is more or less the first version I wrote. (I was doing tests for
> >> (len & 32), (len & 16) .. to not have to update len in these blocks.
> >>
> >> Then, I tried to add an inline version, a la ip_fast_csum() but for IPv6.
> >>
> >> Then I came up with the version I sent, for some reason my .config had
> >> temporarily disabled CONFIG_RETPOLINE,
> >> thanks for reminding me this !
> >>
> >> I also missed this warning anyway :
> >> arch/x86/lib/csum-partial_64.o: warning: objtool: csum_partial()+0x2f:
> >> unannotated intra-function call
> >>
> >> I will spend a bit more time on this before sending a V2, thanks again !
> > BTW, I could not understand why :
> >
> >                result = add32_with_carry(result, *(u32 *)buff);
> >
> > generates this code :
> >
> >  123: 41 8b 09              mov    (%r9),%ecx
> >  126: 89 4d f8              mov    %ecx,-0x8(%rbp)
> >  129: 03 45 f8              add    -0x8(%rbp),%eax
> >  12c: 83 d0 00              adc    $0x0,%eax
>
> Are you using Clang?  There is a long outstanding code generation bug
> where an "rm" constraint is converted to "m" internally.
>

Yes, this is what I realized later. This is a clang bug.

> https://bugs.llvm.org/show_bug.cgi?id=47530
>
> Even a stopgap of pretending "rm" means "r" would result in far better
> code, 99% of the time.

Agreed

>
> > Apparently add32_with_carry() forces the use of use of a temporary in memory
> >
> > While
> >                asm("   addl 0*4(%[src]),%[res]\n"
> >                    "   adcl $0,%[res]\n"
> >                        : [res] "=r" (result)
> >                        : [src] "r" (buff), "[res]" (result)
> >                         : "memory");
>
> Just as a minor note about the asm constraints here and elsewhere
>
>     : [res] "=r" (result)
>     : "res" (result)
>
> ought to be just [res] "+r" (result).  The result variable really is
> read and written by the asm fragments.

Thanks for the tip !


* RE: [RFC] x86/csum: rewrite csum_partial()
  2021-11-11  6:53 [RFC] x86/csum: rewrite csum_partial() Eric Dumazet
  2021-11-11  9:10 ` Peter Zijlstra
  2021-11-11 16:51 ` Alexander Duyck
@ 2021-11-14 13:07 ` David Laight
  2021-11-14 14:12   ` David Laight
  2 siblings, 1 reply; 11+ messages in thread
From: David Laight @ 2021-11-14 13:07 UTC (permalink / raw)
  To: 'Eric Dumazet', David S . Miller, Jakub Kicinski
  Cc: netdev, Eric Dumazet, x86, Alexander Duyck

From: Eric Dumazet
> Sent: 11 November 2021 06:53
> 
> With more NIC supporting CHECKSUM_COMPLETE, and IPv6 being widely used.
> csum_partial() is heavily used with small amount of bytes,
> and is consuming many cycles.
> 
> IPv6 header size for instance is 40 bytes.
> 
> Another thing to consider is that NET_IP_ALIGN is 0 on x86,
> meaning that network headers in RX path are not word-aligned,
> unless the driver forces this.
> 
> This means that csum_partial() fetches one u16
> to 'align the buffer', then perform seven u64 additions
> with carry in a loop, then a remaining u32, then a remaining u16.
> 
> With this new version, we perform 10 u32 adds with carry, to
> avoid the expensive 64->32 transformation. Using 5 u64 adds
> plus one add32_with_carry() is more expensive.
> 
> Also note that this avoids loops for less than ~60 bytes.

I spent far too long looking at this code a while back :-)
I did post a patch - probably 10th May 2020.

Prior to Sandy Bridge, ADC always took two clocks.
Even on Sandy Bridge there is a two-clock delay for the sum;
only the carry flag is available earlier.
Broadwell (and I think all AMD CPUs) do ADC in 1 clock.
That latency can be avoided by adding to alternate registers.
There are also issues on some CPUs with the partial updates
to the flags register (DEC sets Z but not C) causing unwanted
register dependencies.

I think misaligned memory reads take an extra clock.
But more recent CPUs can do two memory reads per clock,
so unless the code is trying to beat 8 bytes/clock it
shouldn't have much effect.

The fastest loop I found (for large buffers) used:

+       asm(    "       bt    $4, %[len]\n"
+               "       jnc   10f\n"
+               "       add   (%[buff], %[len]), %[sum_0]\n"
+               "       adc   8(%[buff], %[len]), %[sum_1]\n"
+               "       lea   16(%[len]), %[len]\n"
+               "10:    jecxz 20f\n"
+               "       adc   (%[buff], %[len]), %[sum_0]\n"
+               "       adc   8(%[buff], %[len]), %[sum_1]\n"
+               "       lea   32(%[len]), %[len_tmp]\n"
+               "       adc   16(%[buff], %[len]), %[sum_0]\n"
+               "       adc   24(%[buff], %[len]), %[sum_1]\n"
+               "       mov   %[len_tmp], %[len]\n"
+               "       jmp   10b\n"
+               "20:    adc   %[sum_0], %[sum]\n"
+               "       adc   %[sum_1], %[sum]\n"
+               "       adc   $0, %[sum]\n"
+           : [sum] "+&r" (sum), [sum_0] "+&r" (sum_0), [sum_1] "+&r" (sum_1),
+               [len] "+&c" (len), [len_tmp] "=&r" (len_tmp)
+           : [buff] "r" (buff)
+           : "memory" );

The principle is that 'buff' points to the end of the buffer.
The 'length' (in %cx) is negative and is then increased until it hits zero.
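In C terms the shape is roughly this (a sketch that ignores the carry
handling; assumes len is a non-zero multiple of 8):

	const unsigned char *end = buff + len;
	long off = -(long)len;			/* negative, counts up to 0 */
	unsigned long sum = 0;

	do {
		sum += *(const unsigned long *)(end + off);
		off += 8;			/* the asm advances by 16 or 32 */
	} while (off);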

This runs at 8 bytes/clock on anything recent (and approaches it on Ivy Bridge).
Splitting the 'add 32' did make a slight improvement.
If you aren't worried (too much) about CPUs before Broadwell then IIRC
this loop gets close to 8 bytes/clock:

+               "10:    jecxz 20f\n"
+               "       adc   (%[buff], %[len]), %[sum]\n"
+               "       adc   8(%[buff], %[len]), %[sum]\n"
+               "       lea   16(%[len]), %[tmp]\n"
+               "       jmp   10b\n"
+               " 20:"

I also toyed with this loop:

            count = (lim + 7 - buf) & -64;
            buf += count;

            count = -count;
            asm(    "       xor   %[sum_odd], %[sum_odd]\n"    // Also clears carry and overflow
                    "10:    jrcxz 20f\n"
                    "       adcx    (%[buf], %[count]), %[sum]\n"
                    "       adox   8(%[buf], %[count]), %[sum_odd]\n"
                    "       adcx  16(%[buf], %[count]), %[sum]\n"
                    "       adox  24(%[buf], %[count]), %[sum_odd]\n"
                    "       adcx  32(%[buf], %[count]), %[sum]\n"
                    "       adox  40(%[buf], %[count]), %[sum_odd]\n"
                    "       adcx  48(%[buf], %[count]), %[sum]\n"
                    "       adox  56(%[buf], %[count]), %[sum_odd]\n"
                    "       lea   64(%[count]), %[count]\n"
                    "       jmp   10b\n"
                    "20:    adox  %[count], %[sum_odd]\n"  // [count] is zero
                    "       adcx  %[sum_odd], %[sum]\n"
                    "       adcx  %[count], %[sum]"
                : [sum] "=&r" (sum), [count] "=&c" (count), [sum_odd] "=&r" (sum_odd)
                : [buf] "r" (buf), "0" (sum), "1" (count)
                : "memory");
        }

My notes say it achieved 12 bytes/clock on an i7-7700.
However it is only really useful for long aligned buffers.

It is completely annoying that you can't use LOOP (dec %cx and jump nz)
because it is very slow on Intel CPUs (even ones that support ADOX).
JECXZ is fine.

It is also possible to reduce the checksum to 16 bits using:
	sum = (sum % 0xffff) ^ 0xffff;
I think this is faster if gcc uses a 'multiply by reciprocal'
but (in my tests) it sometimes used a divide.
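For comparison, the usual shift-and-add reduction is the sketch below;
note it only folds, it does not do the final '^ 0xffff' complement:

static inline unsigned fold_to_16(unsigned sum)
{
	sum = (sum & 0xffff) + (sum >> 16);	/* may leave a carry in bit 16 */
	sum = (sum & 0xffff) + (sum >> 16);	/* fold that carry back in */
	return sum;
}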

	David




* RE: [RFC] x86/csum: rewrite csum_partial()
  2021-11-14 13:07 ` David Laight
@ 2021-11-14 14:12   ` David Laight
  2021-11-15 10:23     ` David Laight
  0 siblings, 1 reply; 11+ messages in thread
From: David Laight @ 2021-11-14 14:12 UTC (permalink / raw)
  To: David Laight, 'Eric Dumazet', David S . Miller, Jakub Kicinski
  Cc: netdev, Eric Dumazet, x86, Alexander Duyck

From: David Laight
> Sent: 14 November 2021 13:07
> 
..
> If you aren't worried (too much) about cpu before Bradwell then IIRC
> this loop gets close to 8 bytes/clock:
> 
> +               "10:    jecxz 20f\n"
> +               "       adc   (%[buff], %[len]), %[sum]\n"
> +               "       adc   8(%[buff], %[len]), %[sum]\n"
> +               "       lea   16(%[len]), %[tmp]\n"
> +               "       jmp   10b\n"
> +               " 20:"

It is even possible a loop based on:
	10:	adc	(%[buff], %[len], 8), %sum
		inc	%[len]
		jnz	10b
will run at 8 bytes per clock on very recent Intel CPUs.
The 'adc' needs P06 and P23, the 'inc' P0156 and the
'jnz' P6 (predicted taken) (on Broadwell and probably later).
(The 'inc' and 'jnz' might also be fusable into a single P6 u-op.)

Using 'lea' instead of 'inc' constrains it to P15.
That might actually generate better scheduling since it
is guaranteed to 'miss' the 'adc'.

So if the right ports are selected it is possible to
execute all the instructions in parallel.

It certainly isn't necessary to unroll the loop to more
than two reads for Broadwell onwards.

	David



* RE: [RFC] x86/csum: rewrite csum_partial()
  2021-11-14 14:12   ` David Laight
@ 2021-11-15 10:23     ` David Laight
  0 siblings, 0 replies; 11+ messages in thread
From: David Laight @ 2021-11-15 10:23 UTC (permalink / raw)
  To: 'Eric Dumazet', David S . Miller, Jakub Kicinski
  Cc: netdev, Eric Dumazet, x86, Alexander Duyck

From: David Laight
> Sent: 14 November 2021 14:12
> ..
> > If you aren't worried (too much) about cpu before Broadwell then IIRC
> > this loop gets close to 8 bytes/clock:
> >
> > +               "10:    jecxz 20f\n"
> > +               "       adc   (%[buff], %[len]), %[sum]\n"
> > +               "       adc   8(%[buff], %[len]), %[sum]\n"
> > +               "       lea   16(%[len]), %[tmp]\n"
> > +               "       jmp   10b\n"
> > +               " 20:"
> 
> It is even possible a loop based on:
> 	10:	adc	(%[buff], %[len], 8), %sum
> 		inc	%[len]
> 		jnz	10b
> will run at 8 bytes per clock on very recent Intel cpu.

It doesn't on an i7-7700
(which I probably tested last year).

But the first loop does run twice as fast - and will only
be beaten by the adcx/adox loop.
So there is no need to unroll to more than 2 reads/loop.

For CPUs between Ivy Bridge and Broadwell you want to use
separate 'sum' registers to avoid the 2-clock latency
of the adc result.
That should beat the 4 bytes/clock of the current loop,
but it does need an extra unroll to get near 8 bytes/clock.

For older CPUs (Nehalem/Core 2) the 'jecxz' loop is about the
only way to 'loop carry' the carry flag without the
6-clock penalty for the partial flags register update.

	David



Thread overview: 11+ messages
2021-11-11  6:53 [RFC] x86/csum: rewrite csum_partial() Eric Dumazet
2021-11-11  9:10 ` Peter Zijlstra
2021-11-11  9:44   ` Peter Zijlstra
2021-11-11 16:02     ` Eric Dumazet
2021-11-11 16:52       ` Eric Dumazet
2021-11-11 18:17         ` Andrew Cooper
2021-11-11 19:02           ` Eric Dumazet
2021-11-11 16:51 ` Alexander Duyck
2021-11-14 13:07 ` David Laight
2021-11-14 14:12   ` David Laight
2021-11-15 10:23     ` David Laight
