All of lore.kernel.org
 help / color / mirror / Atom feed
From: Eric Dumazet <eric.dumazet@gmail.com>
To: "David S . Miller" <davem@davemloft.net>,
	Jakub Kicinski <kuba@kernel.org>
Cc: netdev <netdev@vger.kernel.org>,
	Eric Dumazet <edumazet@google.com>,
	Eric Dumazet <eric.dumazet@gmail.com>,
	x86@kernel.org, Alexander Duyck <alexander.duyck@gmail.com>
Subject: [RFC] x86/csum: rewrite csum_partial()
Date: Wed, 10 Nov 2021 22:53:22 -0800	[thread overview]
Message-ID: <20211111065322.1261275-1-eric.dumazet@gmail.com> (raw)

From: Eric Dumazet <edumazet@google.com>

With more NICs supporting CHECKSUM_COMPLETE and IPv6 being widely used,
csum_partial() is heavily used with small amounts of bytes,
and is consuming many cycles.

IPv6 header size for instance is 40 bytes.

Another thing to consider is that NET_IP_ALIGN is 0 on x86,
meaning that network headers in RX path are not word-aligned,
unless the driver forces this.

This means that csum_partial() fetches one u16
to 'align the buffer', then performs seven u64 additions
with carry in a loop, then a remaining u32, then a remaining u16.

With this new version, we perform 10 u32 adds with carry, to
avoid the expensive 64->32 transformation. Using 5 u64 adds
plus one add32_with_carry() is more expensive.

Also note that this avoids loops for less than ~60 bytes.

Tested on various CPUs, all of them show a big reduction in
csum_partial() cost (by 30 to 75%).

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Alexander Duyck <alexander.duyck@gmail.com>
---
 arch/x86/lib/csum-partial_64.c | 146 +++++++++++++++++----------------
 1 file changed, 77 insertions(+), 69 deletions(-)

diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index e7925d668b680269fb2442766deaf416dc42f9a1..f48fe0ec9663ff3afa1b5403f135407b8b0fde01 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -21,97 +21,105 @@ static inline unsigned short from32to16(unsigned a)
 }
 
 /*
- * Do a 64-bit checksum on an arbitrary memory area.
+ * Do a checksum on an arbitrary memory area.
  * Returns a 32bit checksum.
  *
  * This isn't as time critical as it used to be because many NICs
  * do hardware checksumming these days.
- * 
- * Things tried and found to not make it faster:
- * Manual Prefetching
- * Unrolling to an 128 bytes inner loop.
- * Using interleaving with more registers to break the carry chains.
+ *
+ * Still, with CHECKSUM_COMPLETE this is called to compute
+ * checksums on IPv6 headers (40 bytes) and other small parts.
  */
 static unsigned do_csum(const unsigned char *buff, unsigned len)
 {
-	unsigned odd, count;
-	unsigned long result = 0;
+	unsigned long dwords;
+	unsigned odd, result = 0;
 
-	if (unlikely(len == 0))
-		return result; 
 	odd = 1 & (unsigned long) buff;
 	if (unlikely(odd)) {
+		if (unlikely(len == 0))
+			return result;
 		result = *buff << 8;
 		len--;
 		buff++;
 	}
-	count = len >> 1;		/* nr of 16-bit words.. */
-	if (count) {
-		if (2 & (unsigned long) buff) {
-			result += *(unsigned short *)buff;
-			count--;
-			len -= 2;
-			buff += 2;
-		}
-		count >>= 1;		/* nr of 32-bit words.. */
-		if (count) {
-			unsigned long zero;
-			unsigned count64;
-			if (4 & (unsigned long) buff) {
-				result += *(unsigned int *) buff;
-				count--;
-				len -= 4;
-				buff += 4;
-			}
-			count >>= 1;	/* nr of 64-bit words.. */
+	if (unlikely(len >= 64)) {
+		unsigned long temp64 = result;
+		do {
+			asm("	addq 0*8(%[src]),%[res]\n"
+			    "	adcq 1*8(%[src]),%[res]\n"
+			    "	adcq 2*8(%[src]),%[res]\n"
+			    "	adcq 3*8(%[src]),%[res]\n"
+			    "	adcq 4*8(%[src]),%[res]\n"
+			    "	adcq 5*8(%[src]),%[res]\n"
+			    "	adcq 6*8(%[src]),%[res]\n"
+			    "	adcq 7*8(%[src]),%[res]\n"
+			    "	adcq $0,%[res]"
+			    : [res] "=r" (temp64)
+			    : [src] "r" (buff), "[res]" (temp64)
+			    : "memory");
+			buff += 64;
+			len -= 64;
+		} while (len >= 64);
+		result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
+	}
 
-			/* main loop using 64byte blocks */
-			zero = 0;
-			count64 = count >> 3;
-			while (count64) { 
-				asm("addq 0*8(%[src]),%[res]\n\t"
-				    "adcq 1*8(%[src]),%[res]\n\t"
-				    "adcq 2*8(%[src]),%[res]\n\t"
-				    "adcq 3*8(%[src]),%[res]\n\t"
-				    "adcq 4*8(%[src]),%[res]\n\t"
-				    "adcq 5*8(%[src]),%[res]\n\t"
-				    "adcq 6*8(%[src]),%[res]\n\t"
-				    "adcq 7*8(%[src]),%[res]\n\t"
-				    "adcq %[zero],%[res]"
-				    : [res] "=r" (result)
-				    : [src] "r" (buff), [zero] "r" (zero),
-				    "[res]" (result));
-				buff += 64;
-				count64--;
-			}
+	dwords = len >> 2;
+	if (dwords) { /* dwords is in [1..15] */
+		unsigned long dest;
 
-			/* last up to 7 8byte blocks */
-			count %= 8; 
-			while (count) { 
-				asm("addq %1,%0\n\t"
-				    "adcq %2,%0\n" 
-					    : "=r" (result)
-				    : "m" (*(unsigned long *)buff), 
-				    "r" (zero),  "0" (result));
-				--count; 
-				buff += 8;
-			}
-			result = add32_with_carry(result>>32,
-						  result&0xffffffff); 
+		/*
+		 * This implements an optimized version of
+		 * switch (dwords) {
+		 * case 15: res = add_with_carry(res, buf32[14]); fallthrough;
+		 * case 14: res = add_with_carry(res, buf32[13]); fallthrough;
+		 * case 13: res = add_with_carry(res, buf32[12]); fallthrough;
+		 * ...
+		 * case 3: res = add_with_carry(res, buf32[2]); fallthrough;
+		 * case 2: res = add_with_carry(res, buf32[1]); fallthrough;
+		 * case 1: res = add_with_carry(res, buf32[0]); fallthrough;
+		 * }
+		 *
+		 * "adcl 8byteoff(%reg1),%reg2" are using either 3 or 4 bytes.
+		 */
+		asm("	call 1f\n"
+		    "1:	pop %[dest]\n"
+		    "	lea (2f-1b)(%[dest],%[skip],4),%[dest]\n"
+		    "	clc\n"
+		    "	jmp *%[dest]\n               .align 4\n"
+		    "2:\n"
+		    "	adcl 14*4(%[src]),%[res]\n   .align 4\n"
+		    "	adcl 13*4(%[src]),%[res]\n   .align 4\n"
+		    "	adcl 12*4(%[src]),%[res]\n   .align 4\n"
+		    "	adcl 11*4(%[src]),%[res]\n   .align 4\n"
+		    "	adcl 10*4(%[src]),%[res]\n   .align 4\n"
+		    "	adcl 9*4(%[src]),%[res]\n   .align 4\n"
+		    "	adcl 8*4(%[src]),%[res]\n   .align 4\n"
+		    "	adcl 7*4(%[src]),%[res]\n   .align 4\n"
+		    "	adcl 6*4(%[src]),%[res]\n   .align 4\n"
+		    "	adcl 5*4(%[src]),%[res]\n   .align 4\n"
+		    "	adcl 4*4(%[src]),%[res]\n   .align 4\n"
+		    "	adcl 3*4(%[src]),%[res]\n   .align 4\n"
+		    "	adcl 2*4(%[src]),%[res]\n   .align 4\n"
+		    "	adcl 1*4(%[src]),%[res]\n   .align 4\n"
+		    "	adcl 0*4(%[src]),%[res]\n"
+		    "	adcl $0,%[res]"
+			: [res] "=r" (result), [dest] "=&r" (dest)
+			: [src] "r" (buff), "[res]" (result),
+			  [skip] "r" (dwords ^ 15)
+			: "memory");
+	}
 
-			if (len & 4) {
-				result += *(unsigned int *) buff;
-				buff += 4;
-			}
-		}
+	if (len & 3U) {
+		buff += len & ~3U;
+		result = from32to16(result);
 		if (len & 2) {
 			result += *(unsigned short *) buff;
 			buff += 2;
 		}
+		if (len & 1)
+			result += *buff;
 	}
-	if (len & 1)
-		result += *buff;
-	result = add32_with_carry(result>>32, result & 0xffffffff); 
 	if (unlikely(odd)) { 
 		result = from32to16(result);
 		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
-- 
2.34.0.rc0.344.g81b53c2807-goog


             reply	other threads:[~2021-11-11  6:53 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-11-11  6:53 Eric Dumazet [this message]
2021-11-11  9:10 ` [RFC] x86/csum: rewrite csum_partial() Peter Zijlstra
2021-11-11  9:44   ` Peter Zijlstra
2021-11-11 16:02     ` Eric Dumazet
2021-11-11 16:52       ` Eric Dumazet
2021-11-11 18:17         ` Andrew Cooper
2021-11-11 19:02           ` Eric Dumazet
2021-11-11 16:51 ` Alexander Duyck
2021-11-14 13:07 ` David Laight
2021-11-14 14:12   ` David Laight
2021-11-15 10:23     ` David Laight

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20211111065322.1261275-1-eric.dumazet@gmail.com \
    --to=eric.dumazet@gmail.com \
    --cc=alexander.duyck@gmail.com \
    --cc=davem@davemloft.net \
    --cc=edumazet@google.com \
    --cc=kuba@kernel.org \
    --cc=netdev@vger.kernel.org \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.