Re: x86/csum: Remove unnecessary odd handling

From: Linus Torvalds <torvalds@linux-foundation.org>
To: David Laight <David.Laight@aculab.com>
Cc: Noah Goldstein <goldstein.w.n@gmail.com>,
	kernel test robot <lkp@intel.com>,
	 "x86@kernel.org" <x86@kernel.org>,
	 "oe-kbuild-all@lists.linux.dev" <oe-kbuild-all@lists.linux.dev>,
	 "linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	"edumazet@google.com" <edumazet@google.com>,
	 "tglx@linutronix.de" <tglx@linutronix.de>,
	"mingo@redhat.com" <mingo@redhat.com>,
	"bp@alien8.de" <bp@alien8.de>,
	 "dave.hansen@linux.intel.com" <dave.hansen@linux.intel.com>,
	"hpa@zytor.com" <hpa@zytor.com>
Subject: Re: x86/csum: Remove unnecessary odd handling
Date: Fri, 5 Jan 2024 10:05:44 -0800	[thread overview]
Message-ID: <CAHk-=wg8vssPVO68_qH_BHBCj6_DDawKQHBOgZh4gw5YFmpCKA@mail.gmail.com> (raw)
In-Reply-To: <5354eeec562345f6a1de84f0b2081b75@AcuMS.aculab.com>

[-- Attachment #1: Type: text/plain, Size: 450 bytes --]

On Fri, 5 Jan 2024 at 02:41, David Laight <David.Laight@aculab.com> wrote:
>
> Interesting, I'm pretty sure trying to get two blocks of
>  'adc' scheduled in parallel like that doesn't work.

You should check out the benchmark at

       https://github.com/fenrus75/csum_partial

and see if you can improve on it. I'm including the patch (on top of
that code by Arjan) to implement the actual current kernel version as
"New version".

         Linus

[-- Attachment #2: p --]
[-- Type: application/octet-stream, Size: 4840 bytes --]

From 6ff7f7a72a4855970b1621ac9724c44c393a6d44 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 5 Jan 2024 09:46:32 -0800
Subject: [PATCH] Add the current kernel version as "New version"

---
 Makefile       |   3 --
 csum_partial.c | 117 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 115 insertions(+), 5 deletions(-)

diff --git a/Makefile b/Makefile
index e4b1bb3..4e29f8a 100644
--- a/Makefile
+++ b/Makefile
@@ -17,6 +17,3 @@ chain2.svg: graphs/chain2.dot
 chain2a.svg: graphs/chain2a.dot
 	dot -Tsvg -O graphs/chain2a.dot  
 	mv graphs/chain2a.dot.svg chain2a.svg
-	
-	
-	
\ No newline at end of file
diff --git a/csum_partial.c b/csum_partial.c
index 4db0d97..ddf6acd 100644
--- a/csum_partial.c
+++ b/csum_partial.c
@@ -14,13 +14,28 @@
 #include <time.h>
 
 typedef uint32_t __wsum;
+typedef uint32_t __u32;
+typedef uint64_t __u64;
 typedef uint64_t u64;
 typedef uint32_t u32;
+# define likely(x) __builtin_expect(!!(x), 1)
 # define unlikely(x) __builtin_expect(!!(x), 0)
 
+#define __force	/* expands to nothing; lets the (__force type) casts in this file compile */
+
 #define LOOPCOUNT 102400
 #define PACKETSIZE 40
 
+/**
+ * ror64 - rotate a 64-bit value right
+ * @word: value to rotate
+ * @shift: number of bit positions to rotate by (only the low 6 bits are used)
+ */
+static inline __u64 ror64(__u64 word, unsigned int shift)
+{
+	return (word >> (shift & 63)) | (word << ((-shift) & 63)); /* masks keep both shift counts in 0..63 */
+}
+
 static inline unsigned long load_unaligned_zeropad(const void *addr)
 {
 	unsigned long ret, dummy;
@@ -484,7 +499,105 @@ static inline __wsum nulltest(const void *buff, int len, __wsum sum)
 {
 	return 2;
 }
+static inline __wsum csum_finalize_sum(u64 temp64)
+{	/* Fold the 64-bit sum to 32 bits: the top half of x + ror64(x, 32) is hi + lo + carry. */
+	return (__force __wsum)((temp64 + ror64(temp64, 32)) >> 32);
+}
 
+static inline unsigned long update_csum_40b(unsigned long sum, const unsigned long m[5])
+{	/* Adds the five words m[0]..m[4] into sum, carrying between adds; returns the new sum. */
+	asm("addq %1,%0\n\t"	/* sum += m[0] */
+	     "adcq %2,%0\n\t"	/* sum += m[1] + carry */
+	     "adcq %3,%0\n\t"	/* sum += m[2] + carry */
+	     "adcq %4,%0\n\t"	/* sum += m[3] + carry */
+	     "adcq %5,%0\n\t"	/* sum += m[4] + carry */
+	     "adcq $0,%0"	/* add the final carry back into sum */
+		:"+r" (sum)	/* %0: read and written */
+		:"m" (m[0]), "m" (m[1]), "m" (m[2]),
+		 "m" (m[3]), "m" (m[4]));
+	return sum;
+}
+
+/*
+ * Checksum @len bytes starting at @buff, with @sum as the initial value.
+ * Returns a 32-bit checksum.
+ *
+ * This isn't as time critical as it used to be because many NICs
+ * do hardware checksumming these days.
+ *
+ * Still, with CHECKSUM_COMPLETE this is called to compute
+ * checksums on IPv6 headers (40 bytes) and other small parts.
+ * It's best to have buff aligned on a 64-bit boundary.
+ */
+__wsum csum_partial_new(const void *buff, int len, __wsum sum)
+{
+	u64 temp64 = (__force u64)sum;	/* 64-bit accumulator seeded with the caller's sum */
+
+	/* Do two 40-byte chunks in parallel to get better ILP */
+	if (likely(len >= 80)) {
+		u64 temp64_2 = 0;	/* second accumulator, independent of temp64 */
+		do {
+			temp64 = update_csum_40b(temp64, buff);
+			temp64_2 = update_csum_40b(temp64_2, buff + 40);
+			buff += 80;
+			len -= 80;
+		} while (len >= 80);
+
+		asm("addq %1,%0\n\t"	/* merge the second accumulator into the first ... */
+		    "adcq $0,%0"	/* ... and add the resulting carry back in */
+		    :"+r" (temp64): "r" (temp64_2));
+	}
+
+	/*
+	 * len == 40 is the hot case due to IPv6 headers, so return
+	 * early for that exact case without checking the tail bytes.
+	 */
+	if (len >= 40) {
+		temp64 = update_csum_40b(temp64, buff);
+		len -= 40;
+		if (!len)
+			return csum_finalize_sum(temp64);
+		buff += 40;
+	}
+
+	if (len & 32) {	/* tail: four 8-byte words */
+		asm("addq 0*8(%[src]),%[res]\n\t"
+		    "adcq 1*8(%[src]),%[res]\n\t"
+		    "adcq 2*8(%[src]),%[res]\n\t"
+		    "adcq 3*8(%[src]),%[res]\n\t"
+		    "adcq $0,%[res]"
+		    : [res] "+r"(temp64)
+		    : [src] "r"(buff), "m"(*(const char(*)[32])buff)); /* "m": names the 32 bytes at buff as an input; unused in the template */
+		buff += 32;
+	}
+	if (len & 16) {	/* tail: two 8-byte words */
+		asm("addq 0*8(%[src]),%[res]\n\t"
+		    "adcq 1*8(%[src]),%[res]\n\t"
+		    "adcq $0,%[res]"
+		    : [res] "+r"(temp64)
+		    : [src] "r"(buff), "m"(*(const char(*)[16])buff));
+		buff += 16;
+	}
+	if (len & 8) {	/* tail: one 8-byte word */
+		asm("addq 0*8(%[src]),%[res]\n\t"
+		    "adcq $0,%[res]"
+		    : [res] "+r"(temp64)
+		    : [src] "r"(buff), "m"(*(const char(*)[8])buff));
+		buff += 8;
+	}
+	if (len & 7) {	/* tail: 1..7 leftover bytes */
+		unsigned int shift = (-len << 3) & 63;	/* == 64 - 8 * (len & 7) */
+		unsigned long trail;
+		/* NOTE(review): -len << 3 left-shifts a negative int for len > 0; undefined in ISO C, defined by GCC - confirm compilers. */
+		trail = (load_unaligned_zeropad(buff) << shift) >> shift;	/* clear the top `shift` bits of the loaded word */
+
+		asm("addq %[trail],%[res]\n\t"
+		    "adcq $0,%[res]"
+		    : [res] "+r"(temp64)
+		    : [trail] "r"(trail));
+	}
+	return csum_finalize_sum(temp64);
+}
 
 double cycles[64];
 int cyclecount[64];
@@ -612,6 +725,7 @@ int main(int argc, char **argv)
 
 	MEASURE(2,  csum_partial, "Upcoming linux kernel version");
 	MEASURE(4,  csum_specialized, "Specialized to size 40");
+	MEASURE(6,  csum_partial_new, "New version");
 	MEASURE(22, csum_partial_no_odd, "Odd-alignment handling removed");
 	MEASURE(24, csum_partial_dead_code, "Dead code elimination           ");
 	MEASURE(28, csum_partial_ACX, "ADX interleaved ");
@@ -619,7 +733,6 @@ int main(int argc, char **argv)
 	MEASURE(34, csum_partial_32bit, "32 bit train ");
 	MEASURE(36, csum_partial_zero_sum, "Assume zero input sum");
 
-
 	report();
 	}
-}
\ No newline at end of file
+}