* [PATCH v2] x86/csum: rewrite csum_partial()
@ 2021-11-12 16:19 Eric Dumazet
  2021-11-12 16:45 ` Peter Zijlstra
                   ` (4 more replies)
  0 siblings, 5 replies; 10+ messages in thread
From: Eric Dumazet @ 2021-11-12 16:19 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski
  Cc: netdev, Eric Dumazet, Eric Dumazet, x86, Alexander Duyck,
	Peter Zijlstra, Andrew Cooper

From: Eric Dumazet <edumazet@google.com>

With more NICs supporting CHECKSUM_COMPLETE, and IPv6 being widely used,
csum_partial() is heavily used with small numbers of bytes,
and is consuming many cycles.

IPv6 header size for instance is 40 bytes.

Another thing to consider is that NET_IP_ALIGN is 0 on x86,
meaning that network headers are not word-aligned, unless
the driver forces this.

This means that csum_partial() fetches one u16
to 'align the buffer', then performs three u64 additions
with carry in a loop, then a remaining u32, then a remaining u16.

With this new version, we perform a loop only for the 64-byte blocks,
then the remainder is bisected.
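
For illustration, a rough portable-C model of that strategy (this is not
the patch itself, which uses addq/adcq inline asm to keep a single carry
chain; names are made up, little-endian is assumed, and the odd-address
handling and final 64->32->16 folding are omitted):

	#include <stdint.h>
	#include <string.h>

	static uint64_t add_c(uint64_t sum, uint64_t v)
	{
		sum += v;
		return sum + (sum < v);		/* end-around carry, like adcq $0 */
	}

	static uint64_t csum_sketch(const unsigned char *p, size_t len, uint64_t sum)
	{
		while (len >= 64) {			/* main loop: 64-byte blocks */
			for (int i = 0; i < 8; i++) {
				uint64_t v;

				memcpy(&v, p + 8 * i, 8);
				sum = add_c(sum, v);
			}
			p += 64;
			len -= 64;
		}
		for (size_t chunk = 32; chunk >= 8; chunk /= 2) {
			if (len & chunk) {		/* bisected tail: 32, 16, 8 */
				for (size_t off = 0; off < chunk; off += 8) {
					uint64_t v;

					memcpy(&v, p + off, 8);
					sum = add_c(sum, v);
				}
				p += chunk;
			}
		}
		if (len & 7) {				/* final 1-7 bytes, zero-extended */
			uint64_t v = 0;

			memcpy(&v, p, len & 7);
			sum = add_c(sum, v);
		}
		return sum;
	}

	int main(void)
	{
		unsigned char hdr[40] = { 0x60 };	/* e.g. an IPv6 header */

		return csum_sketch(hdr, sizeof(hdr), 0) == 0x60 ? 0 : 1;
	}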

Tested on various CPUs; all of them show a big reduction in
csum_partial() cost (by 50 to 80%).

v3: - use "+r" (temp64) asm constraints (Andrew).
    - fold do_csum() into csum_partial(), as gcc does not inline it.
    - fix bug added in v2 for the "odd" case.
    - back to using adcq, as Andrew pointed out the clang bug that was
      adding a stall on my hosts.
      (separate patch to add32_with_carry() will follow)
    - use load_unaligned_zeropad() for final 1-7 bytes (Peter & Alexander).
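
As a worked example of the load_unaligned_zeropad() point above
(hypothetical byte values, little-endian): with 3 trailing bytes, the
double shift clears whatever was read past the end of the buffer.

	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		/* Suppose len & 7 == 3 and the single 8-byte load returned
		 * this value; bytes 0x44..0x88 lie past the end of the buffer. */
		uint64_t loaded = 0x8877665544332211ull;
		unsigned int shift = (8 - 3) * 8;		/* 40 bits */
		uint64_t trail = (loaded << shift) >> shift;

		assert(trail == 0x332211ull);	/* only the 3 valid bytes remain */
		return 0;
	}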

v2: - removed the hard-coded switch(), as it was not RETPOLINE aware.
    - removed the final add32_with_carry() that we were doing
      in csum_partial(), we can simply pass @sum to do_csum().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Alexander Duyck <alexander.duyck@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
---
 arch/x86/lib/csum-partial_64.c | 162 ++++++++++++++-------------------
 1 file changed, 67 insertions(+), 95 deletions(-)

diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index e7925d668b680269fb2442766deaf416dc42f9a1..5ec35626945b6db2f7f41c6d46d5e422810eac46 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -9,6 +9,7 @@
 #include <linux/compiler.h>
 #include <linux/export.h>
 #include <asm/checksum.h>
+#include <asm/word-at-a-time.h>
 
 static inline unsigned short from32to16(unsigned a) 
 {
@@ -21,120 +22,92 @@ static inline unsigned short from32to16(unsigned a)
 }
 
 /*
- * Do a 64-bit checksum on an arbitrary memory area.
+ * Do a checksum on an arbitrary memory area.
  * Returns a 32bit checksum.
  *
  * This isn't as time critical as it used to be because many NICs
  * do hardware checksumming these days.
- * 
- * Things tried and found to not make it faster:
- * Manual Prefetching
- * Unrolling to an 128 bytes inner loop.
- * Using interleaving with more registers to break the carry chains.
+ *
+ * Still, with CHECKSUM_COMPLETE this is called to compute
+ * checksums on IPv6 headers (40 bytes) and other small parts.
+ * it's best to have buff aligned on a 64-bit boundary
  */
-static unsigned do_csum(const unsigned char *buff, unsigned len)
+__wsum csum_partial(const void *buff, int len, __wsum sum)
 {
-	unsigned odd, count;
-	unsigned long result = 0;
+	u64 temp64 = (__force u64)sum;
+	unsigned odd, result;
 
-	if (unlikely(len == 0))
-		return result; 
 	odd = 1 & (unsigned long) buff;
 	if (unlikely(odd)) {
-		result = *buff << 8;
+		if (unlikely(len == 0))
+			return sum;
+		temp64 += (*(unsigned char *)buff << 8);
 		len--;
 		buff++;
 	}
-	count = len >> 1;		/* nr of 16-bit words.. */
-	if (count) {
-		if (2 & (unsigned long) buff) {
-			result += *(unsigned short *)buff;
-			count--;
-			len -= 2;
-			buff += 2;
-		}
-		count >>= 1;		/* nr of 32-bit words.. */
-		if (count) {
-			unsigned long zero;
-			unsigned count64;
-			if (4 & (unsigned long) buff) {
-				result += *(unsigned int *) buff;
-				count--;
-				len -= 4;
-				buff += 4;
-			}
-			count >>= 1;	/* nr of 64-bit words.. */
 
-			/* main loop using 64byte blocks */
-			zero = 0;
-			count64 = count >> 3;
-			while (count64) { 
-				asm("addq 0*8(%[src]),%[res]\n\t"
-				    "adcq 1*8(%[src]),%[res]\n\t"
-				    "adcq 2*8(%[src]),%[res]\n\t"
-				    "adcq 3*8(%[src]),%[res]\n\t"
-				    "adcq 4*8(%[src]),%[res]\n\t"
-				    "adcq 5*8(%[src]),%[res]\n\t"
-				    "adcq 6*8(%[src]),%[res]\n\t"
-				    "adcq 7*8(%[src]),%[res]\n\t"
-				    "adcq %[zero],%[res]"
-				    : [res] "=r" (result)
-				    : [src] "r" (buff), [zero] "r" (zero),
-				    "[res]" (result));
-				buff += 64;
-				count64--;
-			}
+	while (unlikely(len >= 64)) {
+		asm("addq 0*8(%[src]),%[res]\n\t"
+		    "adcq 1*8(%[src]),%[res]\n\t"
+		    "adcq 2*8(%[src]),%[res]\n\t"
+		    "adcq 3*8(%[src]),%[res]\n\t"
+		    "adcq 4*8(%[src]),%[res]\n\t"
+		    "adcq 5*8(%[src]),%[res]\n\t"
+		    "adcq 6*8(%[src]),%[res]\n\t"
+		    "adcq 7*8(%[src]),%[res]\n\t"
+		    "adcq $0,%[res]"
+		    : [res] "+r" (temp64)
+		    : [src] "r" (buff)
+		    : "memory");
+		buff += 64;
+		len -= 64;
+	}
+
+	if (len & 32) {
+		asm("addq 0*8(%[src]),%[res]\n\t"
+		    "adcq 1*8(%[src]),%[res]\n\t"
+		    "adcq 2*8(%[src]),%[res]\n\t"
+		    "adcq 3*8(%[src]),%[res]\n\t"
+		    "adcq $0,%[res]"
+			: [res] "+r" (temp64)
+			: [src] "r" (buff)
+			: "memory");
+		buff += 32;
+	}
+	if (len & 16) {
+		asm("addq 0*8(%[src]),%[res]\n\t"
+		    "adcq 1*8(%[src]),%[res]\n\t"
+		    "adcq $0,%[res]"
+			: [res] "+r" (temp64)
+			: [src] "r" (buff)
+			: "memory");
+		buff += 16;
+	}
+	if (len & 8) {
+		asm("addq 0*8(%[src]),%[res]\n\t"
+		    "adcq $0,%[res]"
+			: [res] "+r" (temp64)
+			: [src] "r" (buff)
+			: "memory");
+		buff += 8;
+	}
+	if (len & 7) {
+		unsigned int shift = (8 - (len & 7)) * 8;
+		unsigned long trail;
 
-			/* last up to 7 8byte blocks */
-			count %= 8; 
-			while (count) { 
-				asm("addq %1,%0\n\t"
-				    "adcq %2,%0\n" 
-					    : "=r" (result)
-				    : "m" (*(unsigned long *)buff), 
-				    "r" (zero),  "0" (result));
-				--count; 
-				buff += 8;
-			}
-			result = add32_with_carry(result>>32,
-						  result&0xffffffff); 
+		trail = (load_unaligned_zeropad(buff) << shift) >> shift;
 
-			if (len & 4) {
-				result += *(unsigned int *) buff;
-				buff += 4;
-			}
-		}
-		if (len & 2) {
-			result += *(unsigned short *) buff;
-			buff += 2;
-		}
+		asm("addq %[trail],%[res]\n\t"
+		    "adcq $0,%[res]"
+			: [res] "+r" (temp64)
+			: [trail] "r" (trail));
 	}
-	if (len & 1)
-		result += *buff;
-	result = add32_with_carry(result>>32, result & 0xffffffff); 
+	result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
 	if (unlikely(odd)) { 
 		result = from32to16(result);
 		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
 	}
-	return result;
-}
-
-/*
- * computes the checksum of a memory block at buff, length len,
- * and adds in "sum" (32-bit)
- *
- * returns a 32-bit number suitable for feeding into itself
- * or csum_tcpudp_magic
- *
- * this function must be called with even lengths, except
- * for the last fragment, which may be odd
- *
- * it's best to have buff aligned on a 64-bit boundary
- */
-__wsum csum_partial(const void *buff, int len, __wsum sum)
-{
-	return (__force __wsum)add32_with_carry(do_csum(buff, len),
-						(__force u32)sum);
+	return (__force __wsum)result;
 }
 EXPORT_SYMBOL(csum_partial);
 
@@ -147,4 +120,3 @@ __sum16 ip_compute_csum(const void *buff, int len)
 	return csum_fold(csum_partial(buff,len,0));
 }
 EXPORT_SYMBOL(ip_compute_csum);
-
-- 
2.34.0.rc1.387.gb447b232ab-goog



* Re: [PATCH v2] x86/csum: rewrite csum_partial()
  2021-11-12 16:19 [PATCH v2] x86/csum: rewrite csum_partial() Eric Dumazet
@ 2021-11-12 16:45 ` Peter Zijlstra
  2021-11-12 17:23   ` Eric Dumazet
  2021-11-13  1:13 ` Alexander Duyck
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 10+ messages in thread
From: Peter Zijlstra @ 2021-11-12 16:45 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S . Miller, Jakub Kicinski, netdev, Eric Dumazet, x86,
	Alexander Duyck, Andrew Cooper

On Fri, Nov 12, 2021 at 08:19:50AM -0800, Eric Dumazet wrote:
> From: Eric Dumazet <edumazet@google.com>
> 
> With more NIC supporting CHECKSUM_COMPLETE, and IPv6 being widely used.
> csum_partial() is heavily used with small amount of bytes,
> and is consuming many cycles.
> 
> IPv6 header size for instance is 40 bytes.
> 
> Another thing to consider is that NET_IP_ALIGN is 0 on x86,
> meaning that network headers are not word-aligned, unless
> the driver forces this.
> 
> This means that csum_partial() fetches one u16
> to 'align the buffer', then perform three u64 additions
> with carry in a loop, then a remaining u32, then a remaining u16.
> 
> With this new version, we perform a loop only for the 64 bytes blocks,
> then the remaining is bisected.
> 
> Tested on various cpus, all of them show a big reduction in
> csum_partial() cost (by 50 to 80 %)
> 
> v3: - use "+r" (temp64) asm constraints (Andrew).
>     - fold do_csum() in csum_partial(), as gcc does not inline it.
>     - fix bug added in v2 for the "odd" case.
>     - back using addcq, as Andrew pointed the clang bug that was adding
> 	  a stall on my hosts.
>       (separate patch to add32_with_carry() will follow)
>     - use load_unaligned_zeropad() for final 1-7 bytes (Peter & Alexander).
> 
> v2: - removed the hard-coded switch(), as it was not RETPOLINE aware.
>     - removed the final add32_with_carry() that we were doing
>       in csum_partial(), we can simply pass @sum to do_csum().
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>

Looks nice, happen to have shiny perf numbers to show how awesome it is?
:-)


* Re: [PATCH v2] x86/csum: rewrite csum_partial()
  2021-11-12 16:45 ` Peter Zijlstra
@ 2021-11-12 17:23   ` Eric Dumazet
  2021-11-12 17:36     ` Eric Dumazet
  2021-11-13  7:43     ` Peter Zijlstra
  0 siblings, 2 replies; 10+ messages in thread
From: Eric Dumazet @ 2021-11-12 17:23 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Eric Dumazet, David S . Miller, Jakub Kicinski, netdev, x86,
	Alexander Duyck, Andrew Cooper

On Fri, Nov 12, 2021 at 8:45 AM Peter Zijlstra <peterz@infradead.org> wrote:
>

>
> Looks nice, happen to have shiny perf numbers to show how awesome it it?
> :-)

On a networking load on Cascade Lake, line rate received on a single thread, I see
perf -e cycles:pp -C <cpu>

Before:
       4.16%  [kernel]       [k] csum_partial
After:
        0.83%  [kernel]       [k] csum_partial

If run in a loop 1,000,000 times,

Before:
        26,922,913      cycles                    # 3846130.429 GHz
        80,302,961      instructions              #    2.98  insn per cycle
        21,059,816      branches                  # 3008545142.857 M/sec
             2,896      branch-misses             #    0.01% of all branches
After:
        17,960,709      cycles                    # 3592141.800 GHz
        41,292,805      instructions              #    2.30  insn per cycle
        11,058,119      branches                  # 2211623800.000 M/sec
             2,997      branch-misses             #    0.03% of all branches

Thanks for your help!


* Re: [PATCH v2] x86/csum: rewrite csum_partial()
  2021-11-12 17:23   ` Eric Dumazet
@ 2021-11-12 17:36     ` Eric Dumazet
  2021-11-12 17:42       ` Eric Dumazet
  2021-11-13  7:43     ` Peter Zijlstra
  1 sibling, 1 reply; 10+ messages in thread
From: Eric Dumazet @ 2021-11-12 17:36 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Eric Dumazet, David S . Miller, Jakub Kicinski, netdev, x86,
	Alexander Duyck, Andrew Cooper

On Fri, Nov 12, 2021 at 9:23 AM Eric Dumazet <edumazet@google.com> wrote:
>
> On Fri, Nov 12, 2021 at 8:45 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
>
> >
> > Looks nice, happen to have shiny perf numbers to show how awesome it it?
> > :-)
>
> On a networking load on cascadlake, line rate received on a single thread, I see
> perf -e cycles:pp -C <cpu>
>
> Before:
>        4.16%  [kernel]       [k] csum_partial
> After:
>         0.83%  [kernel]       [k] csum_partial
>
> If run in a loop 1,000,000 times,
>

However, there must be an error in my patch: return values are not the
same on unaligned buffers.


* Re: [PATCH v2] x86/csum: rewrite csum_partial()
  2021-11-12 17:36     ` Eric Dumazet
@ 2021-11-12 17:42       ` Eric Dumazet
  0 siblings, 0 replies; 10+ messages in thread
From: Eric Dumazet @ 2021-11-12 17:42 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Eric Dumazet, David S . Miller, Jakub Kicinski, netdev, x86,
	Alexander Duyck, Andrew Cooper

On Fri, Nov 12, 2021 at 9:36 AM Eric Dumazet <edumazet@google.com> wrote:
>
> On Fri, Nov 12, 2021 at 9:23 AM Eric Dumazet <edumazet@google.com> wrote:
> >
> > On Fri, Nov 12, 2021 at 8:45 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> >
> > >
> > > Looks nice, happen to have shiny perf numbers to show how awesome it it?
> > > :-)
> >
> > On a networking load on cascadlake, line rate received on a single thread, I see
> > perf -e cycles:pp -C <cpu>
> >
> > Before:
> >        4.16%  [kernel]       [k] csum_partial
> > After:
> >         0.83%  [kernel]       [k] csum_partial
> >
> > If run in a loop 1,000,000 times,
> >
>
> However, there must be an error in my patch, return values are not the
> same on unaligned buffers.

Oh silly me, the 32-bit value is different, but the 16-bit csum is good,
sorry for the noise.
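
A tiny self-contained illustration of why that is expected (made-up
16-bit words; fold() mimics csum_fold() without the final complement):
different accumulation widths can give different 32-bit partial sums,
but they agree modulo 0xffff and therefore fold to the same 16-bit
checksum.

	#include <assert.h>
	#include <stdint.h>

	static uint16_t fold(uint32_t s)
	{
		s = (s & 0xffff) + (s >> 16);
		s = (s & 0xffff) + (s >> 16);
		return (uint16_t)s;
	}

	int main(void)
	{
		uint16_t w[4] = { 0xfff0, 0x1234, 0xfedc, 0x0008 };

		/* Accumulate as four 16-bit words. */
		uint32_t a = (uint32_t)w[0] + w[1] + w[2] + w[3];	/* 0x21108 */

		/* Accumulate as two 32-bit words. */
		uint64_t s64 = (uint64_t)(w[0] | (uint32_t)w[1] << 16) +
			       (w[2] | (uint32_t)w[3] << 16);
		uint32_t b = (uint32_t)s64 + (uint32_t)(s64 >> 32);	/* 0x123dfecc */

		assert(a != b);			/* 32-bit values differ ...       */
		assert(fold(a) == fold(b));	/* ... the 16-bit checksum agrees */
		return 0;
	}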


* Re: [PATCH v2] x86/csum: rewrite csum_partial()
  2021-11-12 16:19 [PATCH v2] x86/csum: rewrite csum_partial() Eric Dumazet
  2021-11-12 16:45 ` Peter Zijlstra
@ 2021-11-13  1:13 ` Alexander Duyck
  2021-11-17 14:00 ` [tip: x86/core] x86/csum: Rewrite/optimize csum_partial() tip-bot2 for Eric Dumazet
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 10+ messages in thread
From: Alexander Duyck @ 2021-11-13  1:13 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S . Miller, Jakub Kicinski, netdev, Eric Dumazet,
	the arch/x86 maintainers, Peter Zijlstra, Andrew Cooper

On Fri, Nov 12, 2021 at 8:19 AM Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> From: Eric Dumazet <edumazet@google.com>
>
> With more NIC supporting CHECKSUM_COMPLETE, and IPv6 being widely used.
> csum_partial() is heavily used with small amount of bytes,
> and is consuming many cycles.
>
> IPv6 header size for instance is 40 bytes.
>
> Another thing to consider is that NET_IP_ALIGN is 0 on x86,
> meaning that network headers are not word-aligned, unless
> the driver forces this.
>
> This means that csum_partial() fetches one u16
> to 'align the buffer', then perform three u64 additions
> with carry in a loop, then a remaining u32, then a remaining u16.
>
> With this new version, we perform a loop only for the 64 bytes blocks,
> then the remaining is bisected.
>
> Tested on various cpus, all of them show a big reduction in
> csum_partial() cost (by 50 to 80 %)
>
> v3: - use "+r" (temp64) asm constraints (Andrew).
>     - fold do_csum() in csum_partial(), as gcc does not inline it.
>     - fix bug added in v2 for the "odd" case.
>     - back using addcq, as Andrew pointed the clang bug that was adding
>           a stall on my hosts.
>       (separate patch to add32_with_carry() will follow)
>     - use load_unaligned_zeropad() for final 1-7 bytes (Peter & Alexander).
>
> v2: - removed the hard-coded switch(), as it was not RETPOLINE aware.
>     - removed the final add32_with_carry() that we were doing
>       in csum_partial(), we can simply pass @sum to do_csum().
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Alexander Duyck <alexander.duyck@gmail.com>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Andrew Cooper <andrew.cooper3@citrix.com>
> ---
>  arch/x86/lib/csum-partial_64.c | 162 ++++++++++++++-------------------
>  1 file changed, 67 insertions(+), 95 deletions(-)

Looks good to me.

Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>


* Re: [PATCH v2] x86/csum: rewrite csum_partial()
  2021-11-12 17:23   ` Eric Dumazet
  2021-11-12 17:36     ` Eric Dumazet
@ 2021-11-13  7:43     ` Peter Zijlstra
  1 sibling, 0 replies; 10+ messages in thread
From: Peter Zijlstra @ 2021-11-13  7:43 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Eric Dumazet, David S . Miller, Jakub Kicinski, netdev, x86,
	Alexander Duyck, Andrew Cooper

On Fri, Nov 12, 2021 at 09:23:39AM -0800, Eric Dumazet wrote:
> On Fri, Nov 12, 2021 at 8:45 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> 
> >
> > Looks nice, happen to have shiny perf numbers to show how awesome it it?
> > :-)
> 
> On a networking load on cascadlake, line rate received on a single thread, I see
> perf -e cycles:pp -C <cpu>
> 
> Before:
>        4.16%  [kernel]       [k] csum_partial
> After:
>         0.83%  [kernel]       [k] csum_partial
> 
> If run in a loop 1,000,000 times,
> 
> Before:
>        26,922,913      cycles                    # 3846130.429 GHz
>         80,302,961      instructions              #    2.98  insn per
> cycle
>         21,059,816      branches                  # 3008545142.857
> M/sec
>              2,896      branch-misses             #    0.01% of all
> branches
> After:
>         17,960,709      cycles                    # 3592141.800 GHz
>         41,292,805      instructions              #    2.30  insn per
> cycle
>         11,058,119      branches                  # 2211623800.000
> M/sec
>              2,997      branch-misses             #    0.03% of all
> branches
> 
> Thanks for your help !

I've added these numbers to the Changelog and will queue the patch in
x86/core once -rc1 happens.

Thanks!


* [tip: x86/core] x86/csum: Rewrite/optimize csum_partial()
  2021-11-12 16:19 [PATCH v2] x86/csum: rewrite csum_partial() Eric Dumazet
  2021-11-12 16:45 ` Peter Zijlstra
  2021-11-13  1:13 ` Alexander Duyck
@ 2021-11-17 14:00 ` tip-bot2 for Eric Dumazet
  2021-12-01 10:51 ` [PATCH v2] x86/csum: rewrite csum_partial() David Laight
  2021-12-10 11:05 ` [tip: x86/core] x86/csum: Rewrite/optimize csum_partial() tip-bot2 for Eric Dumazet
  4 siblings, 0 replies; 10+ messages in thread
From: tip-bot2 for Eric Dumazet @ 2021-11-17 14:00 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Eric Dumazet, Peter Zijlstra (Intel), Alexander Duyck, x86, linux-kernel

The following commit has been merged into the x86/core branch of tip:

Commit-ID:     d31c3c683ee668ba5d87c0730610442fd672525f
Gitweb:        https://git.kernel.org/tip/d31c3c683ee668ba5d87c0730610442fd672525f
Author:        Eric Dumazet <edumazet@google.com>
AuthorDate:    Fri, 12 Nov 2021 08:19:50 -08:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Wed, 17 Nov 2021 14:48:54 +01:00

x86/csum: Rewrite/optimize csum_partial()

With more NICs supporting CHECKSUM_COMPLETE, and IPv6 being widely
used, csum_partial() is heavily used with small numbers of bytes, and
is consuming many cycles.

IPv6 header size for instance is 40 bytes.

Another thing to consider is that NET_IP_ALIGN is 0 on x86, meaning
that network headers are not word-aligned, unless the driver forces
this.

This means that csum_partial() fetches one u16 to 'align the buffer',
then performs three u64 additions with carry in a loop, then a
remaining u32, then a remaining u16.

With this new version, we perform a loop only for the 64-byte blocks,
then the remainder is bisected.

Tested on various CPUs; all of them show a big reduction in
csum_partial() cost (by 50 to 80%).

Before:
	4.16%  [kernel]       [k] csum_partial
After:
	0.83%  [kernel]       [k] csum_partial

If run in a loop 1,000,000 times:

Before:
	26,922,913      cycles                    # 3846130.429 GHz
	80,302,961      instructions              #    2.98  insn per cycle
	21,059,816      branches                  # 3008545142.857 M/sec
	     2,896      branch-misses             #    0.01% of all branches
After:
	17,960,709      cycles                    # 3592141.800 GHz
	41,292,805      instructions              #    2.30  insn per cycle
	11,058,119      branches                  # 2211623800.000 M/sec
	     2,997      branch-misses             #    0.03% of all branches

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
Link: https://lore.kernel.org/r/20211112161950.528886-1-eric.dumazet@gmail.com
---
 arch/x86/lib/csum-partial_64.c | 162 +++++++++++++-------------------
 1 file changed, 67 insertions(+), 95 deletions(-)

diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index e7925d6..5ec3562 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -9,6 +9,7 @@
 #include <linux/compiler.h>
 #include <linux/export.h>
 #include <asm/checksum.h>
+#include <asm/word-at-a-time.h>
 
 static inline unsigned short from32to16(unsigned a) 
 {
@@ -21,120 +22,92 @@ static inline unsigned short from32to16(unsigned a)
 }
 
 /*
- * Do a 64-bit checksum on an arbitrary memory area.
+ * Do a checksum on an arbitrary memory area.
  * Returns a 32bit checksum.
  *
  * This isn't as time critical as it used to be because many NICs
  * do hardware checksumming these days.
- * 
- * Things tried and found to not make it faster:
- * Manual Prefetching
- * Unrolling to an 128 bytes inner loop.
- * Using interleaving with more registers to break the carry chains.
+ *
+ * Still, with CHECKSUM_COMPLETE this is called to compute
+ * checksums on IPv6 headers (40 bytes) and other small parts.
+ * it's best to have buff aligned on a 64-bit boundary
  */
-static unsigned do_csum(const unsigned char *buff, unsigned len)
+__wsum csum_partial(const void *buff, int len, __wsum sum)
 {
-	unsigned odd, count;
-	unsigned long result = 0;
+	u64 temp64 = (__force u64)sum;
+	unsigned odd, result;
 
-	if (unlikely(len == 0))
-		return result; 
 	odd = 1 & (unsigned long) buff;
 	if (unlikely(odd)) {
-		result = *buff << 8;
+		if (unlikely(len == 0))
+			return sum;
+		temp64 += (*(unsigned char *)buff << 8);
 		len--;
 		buff++;
 	}
-	count = len >> 1;		/* nr of 16-bit words.. */
-	if (count) {
-		if (2 & (unsigned long) buff) {
-			result += *(unsigned short *)buff;
-			count--;
-			len -= 2;
-			buff += 2;
-		}
-		count >>= 1;		/* nr of 32-bit words.. */
-		if (count) {
-			unsigned long zero;
-			unsigned count64;
-			if (4 & (unsigned long) buff) {
-				result += *(unsigned int *) buff;
-				count--;
-				len -= 4;
-				buff += 4;
-			}
-			count >>= 1;	/* nr of 64-bit words.. */
 
-			/* main loop using 64byte blocks */
-			zero = 0;
-			count64 = count >> 3;
-			while (count64) { 
-				asm("addq 0*8(%[src]),%[res]\n\t"
-				    "adcq 1*8(%[src]),%[res]\n\t"
-				    "adcq 2*8(%[src]),%[res]\n\t"
-				    "adcq 3*8(%[src]),%[res]\n\t"
-				    "adcq 4*8(%[src]),%[res]\n\t"
-				    "adcq 5*8(%[src]),%[res]\n\t"
-				    "adcq 6*8(%[src]),%[res]\n\t"
-				    "adcq 7*8(%[src]),%[res]\n\t"
-				    "adcq %[zero],%[res]"
-				    : [res] "=r" (result)
-				    : [src] "r" (buff), [zero] "r" (zero),
-				    "[res]" (result));
-				buff += 64;
-				count64--;
-			}
+	while (unlikely(len >= 64)) {
+		asm("addq 0*8(%[src]),%[res]\n\t"
+		    "adcq 1*8(%[src]),%[res]\n\t"
+		    "adcq 2*8(%[src]),%[res]\n\t"
+		    "adcq 3*8(%[src]),%[res]\n\t"
+		    "adcq 4*8(%[src]),%[res]\n\t"
+		    "adcq 5*8(%[src]),%[res]\n\t"
+		    "adcq 6*8(%[src]),%[res]\n\t"
+		    "adcq 7*8(%[src]),%[res]\n\t"
+		    "adcq $0,%[res]"
+		    : [res] "+r" (temp64)
+		    : [src] "r" (buff)
+		    : "memory");
+		buff += 64;
+		len -= 64;
+	}
+
+	if (len & 32) {
+		asm("addq 0*8(%[src]),%[res]\n\t"
+		    "adcq 1*8(%[src]),%[res]\n\t"
+		    "adcq 2*8(%[src]),%[res]\n\t"
+		    "adcq 3*8(%[src]),%[res]\n\t"
+		    "adcq $0,%[res]"
+			: [res] "+r" (temp64)
+			: [src] "r" (buff)
+			: "memory");
+		buff += 32;
+	}
+	if (len & 16) {
+		asm("addq 0*8(%[src]),%[res]\n\t"
+		    "adcq 1*8(%[src]),%[res]\n\t"
+		    "adcq $0,%[res]"
+			: [res] "+r" (temp64)
+			: [src] "r" (buff)
+			: "memory");
+		buff += 16;
+	}
+	if (len & 8) {
+		asm("addq 0*8(%[src]),%[res]\n\t"
+		    "adcq $0,%[res]"
+			: [res] "+r" (temp64)
+			: [src] "r" (buff)
+			: "memory");
+		buff += 8;
+	}
+	if (len & 7) {
+		unsigned int shift = (8 - (len & 7)) * 8;
+		unsigned long trail;
 
-			/* last up to 7 8byte blocks */
-			count %= 8; 
-			while (count) { 
-				asm("addq %1,%0\n\t"
-				    "adcq %2,%0\n" 
-					    : "=r" (result)
-				    : "m" (*(unsigned long *)buff), 
-				    "r" (zero),  "0" (result));
-				--count; 
-				buff += 8;
-			}
-			result = add32_with_carry(result>>32,
-						  result&0xffffffff); 
+		trail = (load_unaligned_zeropad(buff) << shift) >> shift;
 
-			if (len & 4) {
-				result += *(unsigned int *) buff;
-				buff += 4;
-			}
-		}
-		if (len & 2) {
-			result += *(unsigned short *) buff;
-			buff += 2;
-		}
+		asm("addq %[trail],%[res]\n\t"
+		    "adcq $0,%[res]"
+			: [res] "+r" (temp64)
+			: [trail] "r" (trail));
 	}
-	if (len & 1)
-		result += *buff;
-	result = add32_with_carry(result>>32, result & 0xffffffff); 
+	result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
 	if (unlikely(odd)) { 
 		result = from32to16(result);
 		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
 	}
-	return result;
-}
-
-/*
- * computes the checksum of a memory block at buff, length len,
- * and adds in "sum" (32-bit)
- *
- * returns a 32-bit number suitable for feeding into itself
- * or csum_tcpudp_magic
- *
- * this function must be called with even lengths, except
- * for the last fragment, which may be odd
- *
- * it's best to have buff aligned on a 64-bit boundary
- */
-__wsum csum_partial(const void *buff, int len, __wsum sum)
-{
-	return (__force __wsum)add32_with_carry(do_csum(buff, len),
-						(__force u32)sum);
+	return (__force __wsum)result;
 }
 EXPORT_SYMBOL(csum_partial);
 
@@ -147,4 +120,3 @@ __sum16 ip_compute_csum(const void *buff, int len)
 	return csum_fold(csum_partial(buff,len,0));
 }
 EXPORT_SYMBOL(ip_compute_csum);
-


* RE: [PATCH v2] x86/csum: rewrite csum_partial()
  2021-11-12 16:19 [PATCH v2] x86/csum: rewrite csum_partial() Eric Dumazet
                   ` (2 preceding siblings ...)
  2021-11-17 14:00 ` [tip: x86/core] x86/csum: Rewrite/optimize csum_partial() tip-bot2 for Eric Dumazet
@ 2021-12-01 10:51 ` David Laight
  2021-12-10 11:05 ` [tip: x86/core] x86/csum: Rewrite/optimize csum_partial() tip-bot2 for Eric Dumazet
  4 siblings, 0 replies; 10+ messages in thread
From: David Laight @ 2021-12-01 10:51 UTC (permalink / raw)
  To: 'Eric Dumazet', David S . Miller, Jakub Kicinski
  Cc: netdev, Eric Dumazet, x86, Alexander Duyck, Peter Zijlstra,
	Andrew Cooper

From: Eric Dumazet
> Sent: 12 November 2021 16:20
> 
> With more NIC supporting CHECKSUM_COMPLETE, and IPv6 being widely used.
> csum_partial() is heavily used with small amount of bytes,
> and is consuming many cycles.
> 
> IPv6 header size for instance is 40 bytes.
> 
> Another thing to consider is that NET_IP_ALIGN is 0 on x86,
> meaning that network headers are not word-aligned, unless
> the driver forces this.
> 
> This means that csum_partial() fetches one u16
> to 'align the buffer', then perform three u64 additions
> with carry in a loop, then a remaining u32, then a remaining u16.
> 
> With this new version, we perform a loop only for the 64 bytes blocks,
> then the remaining is bisected.
> 

I missed this going through, a couple of comments.
I've removed all the old lines from the patch to make it readable.

> +__wsum csum_partial(const void *buff, int len, __wsum sum)
>  {
> +	u64 temp64 = (__force u64)sum;
> +	unsigned odd, result;
> 
>  	odd = 1 & (unsigned long) buff;
>  	if (unlikely(odd)) {
> +		if (unlikely(len == 0))
> +			return sum;
> +		temp64 += (*(unsigned char *)buff << 8);
>  		len--;
>  		buff++;
>  	}

Do you need to special case an odd buffer address?
You are doing misaligned reads for other (more likely)
misaligned addresses so optimising for odd buffer addresses
is rather pointless.
If misaligned reads do cost an extra clock then it might
be worth detecting the more likely '4n+2' alignment of a receive
buffer and aligning that to 8n.

> 
> +	while (unlikely(len >= 64)) {
> +		asm("addq 0*8(%[src]),%[res]\n\t"
> +		    "adcq 1*8(%[src]),%[res]\n\t"
> +		    "adcq 2*8(%[src]),%[res]\n\t"
> +		    "adcq 3*8(%[src]),%[res]\n\t"
> +		    "adcq 4*8(%[src]),%[res]\n\t"
> +		    "adcq 5*8(%[src]),%[res]\n\t"
> +		    "adcq 6*8(%[src]),%[res]\n\t"
> +		    "adcq 7*8(%[src]),%[res]\n\t"
> +		    "adcq $0,%[res]"
> +		    : [res] "+r" (temp64)
> +		    : [src] "r" (buff)
> +		    : "memory");
> +		buff += 64;
> +		len -= 64;
> +	}
> +
> +	if (len & 32) {
> +		asm("addq 0*8(%[src]),%[res]\n\t"
> +		    "adcq 1*8(%[src]),%[res]\n\t"
> +		    "adcq 2*8(%[src]),%[res]\n\t"
> +		    "adcq 3*8(%[src]),%[res]\n\t"
> +		    "adcq $0,%[res]"
> +			: [res] "+r" (temp64)
> +			: [src] "r" (buff)
> +			: "memory");
> +		buff += 32;
> +	}
> +	if (len & 16) {
> +		asm("addq 0*8(%[src]),%[res]\n\t"
> +		    "adcq 1*8(%[src]),%[res]\n\t"
> +		    "adcq $0,%[res]"
> +			: [res] "+r" (temp64)
> +			: [src] "r" (buff)
> +			: "memory");
> +		buff += 16;
> +	}
> +	if (len & 8) {
> +		asm("addq 0*8(%[src]),%[res]\n\t"
> +		    "adcq $0,%[res]"
> +			: [res] "+r" (temp64)
> +			: [src] "r" (buff)
> +			: "memory");
> +		buff += 8;
> +	}

I suspect it is worth doing:
	switch (len & 24) {
	}
and separately coding the 24 byte case
to reduce the number of 'adc $0,%reg'.
Although writing the conditions by hand might be needed to get
the likely code first (whichever length it is).
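
Roughly something like this (purely an illustrative sketch, reusing the
temp64/buff locals and asm style of the patch above; the point is one
'adcq $0' per taken case instead of one per if ()):

	switch (len & 24) {
	case 24:
		asm("addq 0*8(%[src]),%[res]\n\t"
		    "adcq 1*8(%[src]),%[res]\n\t"
		    "adcq 2*8(%[src]),%[res]\n\t"
		    "adcq $0,%[res]"
		    : [res] "+r" (temp64) : [src] "r" (buff) : "memory");
		buff += 24;
		break;
	case 16:
		asm("addq 0*8(%[src]),%[res]\n\t"
		    "adcq 1*8(%[src]),%[res]\n\t"
		    "adcq $0,%[res]"
		    : [res] "+r" (temp64) : [src] "r" (buff) : "memory");
		buff += 16;
		break;
	case 8:
		asm("addq 0*8(%[src]),%[res]\n\t"
		    "adcq $0,%[res]"
		    : [res] "+r" (temp64) : [src] "r" (buff) : "memory");
		buff += 8;
		break;
	}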

> +	if (len & 7) {
> +		unsigned int shift = (8 - (len & 7)) * 8;
> +		unsigned long trail;
> 
> +		trail = (load_unaligned_zeropad(buff) << shift) >> shift;
> 
> +		asm("addq %[trail],%[res]\n\t"
> +		    "adcq $0,%[res]"
> +			: [res] "+r" (temp64)
> +			: [trail] "r" (trail));
>  	}

If you do the 'len & 7' test at the top, the 56-bit 'trail' value
can just be added to the 32-bit 'sum' input.
Just:
		temp64 += *(u64 *)(buff + len - 8) << shift;
would do - except it can fall off the beginning of a page :-(
Maybe:
		temp64 += load_unaligned_zeropad(buff + (len & ~7)) & (~0ull >> shift);
Generating the mask reduces the register dependency chain length.

Although I remember trying to do something like this and finding
it was actually slower than the old code.
The problem is likely to be the long register chain generating 'shift'
compared to the latency of multiple memory reads that you only get once.
So potentially a 'switch (len & 6)' followed by a final test for odd
length may in fact be better - who knows.

> +	result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
>  	if (unlikely(odd)) {
>  		result = from32to16(result);
>  		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
>  	}
> +	return (__force __wsum)result;
>  }

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)



* [tip: x86/core] x86/csum: Rewrite/optimize csum_partial()
  2021-11-12 16:19 [PATCH v2] x86/csum: rewrite csum_partial() Eric Dumazet
                   ` (3 preceding siblings ...)
  2021-12-01 10:51 ` [PATCH v2] x86/csum: rewrite csum_partial() David Laight
@ 2021-12-10 11:05 ` tip-bot2 for Eric Dumazet
  4 siblings, 0 replies; 10+ messages in thread
From: tip-bot2 for Eric Dumazet @ 2021-12-10 11:05 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Eric Dumazet, Peter Zijlstra (Intel),
	Borislav Petkov, Alexander Duyck, x86, linux-kernel

The following commit has been merged into the x86/core branch of tip:

Commit-ID:     3411506550b1f714a52b5db087666c08658d2698
Gitweb:        https://git.kernel.org/tip/3411506550b1f714a52b5db087666c08658d2698
Author:        Eric Dumazet <edumazet@google.com>
AuthorDate:    Fri, 12 Nov 2021 08:19:50 -08:00
Committer:     Borislav Petkov <bp@suse.de>
CommitterDate: Wed, 08 Dec 2021 11:26:09 +01:00

x86/csum: Rewrite/optimize csum_partial()

With more NICs supporting CHECKSUM_COMPLETE, and IPv6 being widely
used, csum_partial() is heavily used with small numbers of bytes, and is
consuming many cycles.

IPv6 header size, for instance, is 40 bytes.

Another thing to consider is that NET_IP_ALIGN is 0 on x86, meaning
that network headers are not word-aligned, unless the driver forces
this.

This means that csum_partial() fetches one u16 to 'align the buffer',
then performs three u64 additions with carry in a loop, then a
remaining u32, then a remaining u16.

With this new version, it performs a loop only for the 64-byte blocks,
then the remainder is bisected.

Testing on various CPUs, all of them show a big reduction in
csum_partial() cost (by 50 to 80 %)

Before:
	4.16%  [kernel]       [k] csum_partial
After:
	0.83%  [kernel]       [k] csum_partial

If run in a loop 1,000,000 times:

Before:
	26,922,913      cycles                    # 3846130.429 GHz
	80,302,961      instructions              #    2.98  insn per cycle
	21,059,816      branches                  # 3008545142.857 M/sec
	     2,896      branch-misses             #    0.01% of all branches
After:
	17,960,709      cycles                    # 3592141.800 GHz
	41,292,805      instructions              #    2.30  insn per cycle
	11,058,119      branches                  # 2211623800.000 M/sec
	     2,997      branch-misses             #    0.03% of all branches

 [ bp: Massage, merge in subsequent fixes into a single patch:
   - um compilation error due to missing load_unaligned_zeropad():
	- Reported-by: kernel test robot <lkp@intel.com>
	- Link: https://lkml.kernel.org/r/20211118175239.1525650-1-eric.dumazet@gmail.com
   - Fix initial seed for odd buffers
	- Reported-by: Noah Goldstein <goldstein.w.n@gmail.com>
	- Link: https://lkml.kernel.org/r/20211125141817.3541501-1-eric.dumazet@gmail.com
  ]

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
Link: https://lore.kernel.org/r/20211112161950.528886-1-eric.dumazet@gmail.com
---
 arch/x86/lib/csum-partial_64.c | 183 +++++++++++++++-----------------
 1 file changed, 91 insertions(+), 92 deletions(-)

diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index e7925d6..1f8a8f8 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -9,6 +9,7 @@
 #include <linux/compiler.h>
 #include <linux/export.h>
 #include <asm/checksum.h>
+#include <asm/word-at-a-time.h>
 
 static inline unsigned short from32to16(unsigned a) 
 {
@@ -21,120 +22,119 @@ static inline unsigned short from32to16(unsigned a)
 }
 
 /*
- * Do a 64-bit checksum on an arbitrary memory area.
+ * Do a checksum on an arbitrary memory area.
  * Returns a 32bit checksum.
  *
  * This isn't as time critical as it used to be because many NICs
  * do hardware checksumming these days.
- * 
- * Things tried and found to not make it faster:
- * Manual Prefetching
- * Unrolling to an 128 bytes inner loop.
- * Using interleaving with more registers to break the carry chains.
+ *
+ * Still, with CHECKSUM_COMPLETE this is called to compute
+ * checksums on IPv6 headers (40 bytes) and other small parts.
+ * it's best to have buff aligned on a 64-bit boundary
  */
-static unsigned do_csum(const unsigned char *buff, unsigned len)
+__wsum csum_partial(const void *buff, int len, __wsum sum)
 {
-	unsigned odd, count;
-	unsigned long result = 0;
+	u64 temp64 = (__force u64)sum;
+	unsigned odd, result;
 
-	if (unlikely(len == 0))
-		return result; 
 	odd = 1 & (unsigned long) buff;
 	if (unlikely(odd)) {
-		result = *buff << 8;
+		if (unlikely(len == 0))
+			return sum;
+		temp64 = ror32((__force u32)sum, 8);
+		temp64 += (*(unsigned char *)buff << 8);
 		len--;
 		buff++;
 	}
-	count = len >> 1;		/* nr of 16-bit words.. */
-	if (count) {
-		if (2 & (unsigned long) buff) {
-			result += *(unsigned short *)buff;
-			count--;
-			len -= 2;
-			buff += 2;
-		}
-		count >>= 1;		/* nr of 32-bit words.. */
-		if (count) {
-			unsigned long zero;
-			unsigned count64;
-			if (4 & (unsigned long) buff) {
-				result += *(unsigned int *) buff;
-				count--;
-				len -= 4;
-				buff += 4;
-			}
-			count >>= 1;	/* nr of 64-bit words.. */
 
-			/* main loop using 64byte blocks */
-			zero = 0;
-			count64 = count >> 3;
-			while (count64) { 
-				asm("addq 0*8(%[src]),%[res]\n\t"
-				    "adcq 1*8(%[src]),%[res]\n\t"
-				    "adcq 2*8(%[src]),%[res]\n\t"
-				    "adcq 3*8(%[src]),%[res]\n\t"
-				    "adcq 4*8(%[src]),%[res]\n\t"
-				    "adcq 5*8(%[src]),%[res]\n\t"
-				    "adcq 6*8(%[src]),%[res]\n\t"
-				    "adcq 7*8(%[src]),%[res]\n\t"
-				    "adcq %[zero],%[res]"
-				    : [res] "=r" (result)
-				    : [src] "r" (buff), [zero] "r" (zero),
-				    "[res]" (result));
-				buff += 64;
-				count64--;
-			}
+	while (unlikely(len >= 64)) {
+		asm("addq 0*8(%[src]),%[res]\n\t"
+		    "adcq 1*8(%[src]),%[res]\n\t"
+		    "adcq 2*8(%[src]),%[res]\n\t"
+		    "adcq 3*8(%[src]),%[res]\n\t"
+		    "adcq 4*8(%[src]),%[res]\n\t"
+		    "adcq 5*8(%[src]),%[res]\n\t"
+		    "adcq 6*8(%[src]),%[res]\n\t"
+		    "adcq 7*8(%[src]),%[res]\n\t"
+		    "adcq $0,%[res]"
+		    : [res] "+r" (temp64)
+		    : [src] "r" (buff)
+		    : "memory");
+		buff += 64;
+		len -= 64;
+	}
+
+	if (len & 32) {
+		asm("addq 0*8(%[src]),%[res]\n\t"
+		    "adcq 1*8(%[src]),%[res]\n\t"
+		    "adcq 2*8(%[src]),%[res]\n\t"
+		    "adcq 3*8(%[src]),%[res]\n\t"
+		    "adcq $0,%[res]"
+			: [res] "+r" (temp64)
+			: [src] "r" (buff)
+			: "memory");
+		buff += 32;
+	}
+	if (len & 16) {
+		asm("addq 0*8(%[src]),%[res]\n\t"
+		    "adcq 1*8(%[src]),%[res]\n\t"
+		    "adcq $0,%[res]"
+			: [res] "+r" (temp64)
+			: [src] "r" (buff)
+			: "memory");
+		buff += 16;
+	}
+	if (len & 8) {
+		asm("addq 0*8(%[src]),%[res]\n\t"
+		    "adcq $0,%[res]"
+			: [res] "+r" (temp64)
+			: [src] "r" (buff)
+			: "memory");
+		buff += 8;
+	}
+	if (len & 7) {
+#ifdef CONFIG_DCACHE_WORD_ACCESS
+		unsigned int shift = (8 - (len & 7)) * 8;
+		unsigned long trail;
 
-			/* last up to 7 8byte blocks */
-			count %= 8; 
-			while (count) { 
-				asm("addq %1,%0\n\t"
-				    "adcq %2,%0\n" 
-					    : "=r" (result)
-				    : "m" (*(unsigned long *)buff), 
-				    "r" (zero),  "0" (result));
-				--count; 
-				buff += 8;
-			}
-			result = add32_with_carry(result>>32,
-						  result&0xffffffff); 
+		trail = (load_unaligned_zeropad(buff) << shift) >> shift;
 
-			if (len & 4) {
-				result += *(unsigned int *) buff;
-				buff += 4;
-			}
+		asm("addq %[trail],%[res]\n\t"
+		    "adcq $0,%[res]"
+			: [res] "+r" (temp64)
+			: [trail] "r" (trail));
+#else
+		if (len & 4) {
+			asm("addq %[val],%[res]\n\t"
+			    "adcq $0,%[res]"
+				: [res] "+r" (temp64)
+				: [val] "r" ((u64)*(u32 *)buff)
+				: "memory");
+			buff += 4;
 		}
 		if (len & 2) {
-			result += *(unsigned short *) buff;
+			asm("addq %[val],%[res]\n\t"
+			    "adcq $0,%[res]"
+				: [res] "+r" (temp64)
+				: [val] "r" ((u64)*(u16 *)buff)
+				: "memory");
 			buff += 2;
 		}
+		if (len & 1) {
+			asm("addq %[val],%[res]\n\t"
+			    "adcq $0,%[res]"
+				: [res] "+r" (temp64)
+				: [val] "r" ((u64)*(u8 *)buff)
+				: "memory");
+		}
+#endif
 	}
-	if (len & 1)
-		result += *buff;
-	result = add32_with_carry(result>>32, result & 0xffffffff); 
-	if (unlikely(odd)) { 
+	result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
+	if (unlikely(odd)) {
 		result = from32to16(result);
 		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
 	}
-	return result;
-}
-
-/*
- * computes the checksum of a memory block at buff, length len,
- * and adds in "sum" (32-bit)
- *
- * returns a 32-bit number suitable for feeding into itself
- * or csum_tcpudp_magic
- *
- * this function must be called with even lengths, except
- * for the last fragment, which may be odd
- *
- * it's best to have buff aligned on a 64-bit boundary
- */
-__wsum csum_partial(const void *buff, int len, __wsum sum)
-{
-	return (__force __wsum)add32_with_carry(do_csum(buff, len),
-						(__force u32)sum);
+	return (__force __wsum)result;
 }
 EXPORT_SYMBOL(csum_partial);
 
@@ -147,4 +147,3 @@ __sum16 ip_compute_csum(const void *buff, int len)
 	return csum_fold(csum_partial(buff,len,0));
 }
 EXPORT_SYMBOL(ip_compute_csum);
-

