linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Linus Torvalds <torvalds@linux-foundation.org>
To: David Laight <David.Laight@aculab.com>
Cc: Noah Goldstein <goldstein.w.n@gmail.com>,
	kernel test robot <lkp@intel.com>,
	 "x86@kernel.org" <x86@kernel.org>,
	 "oe-kbuild-all@lists.linux.dev" <oe-kbuild-all@lists.linux.dev>,
	 "linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	"edumazet@google.com" <edumazet@google.com>,
	 "tglx@linutronix.de" <tglx@linutronix.de>,
	"mingo@redhat.com" <mingo@redhat.com>,
	"bp@alien8.de" <bp@alien8.de>,
	 "dave.hansen@linux.intel.com" <dave.hansen@linux.intel.com>,
	"hpa@zytor.com" <hpa@zytor.com>
Subject: Re: x86/csum: Remove unnecessary odd handling
Date: Fri, 5 Jan 2024 10:05:44 -0800	[thread overview]
Message-ID: <CAHk-=wg8vssPVO68_qH_BHBCj6_DDawKQHBOgZh4gw5YFmpCKA@mail.gmail.com> (raw)
In-Reply-To: <5354eeec562345f6a1de84f0b2081b75@AcuMS.aculab.com>

[-- Attachment #1: Type: text/plain, Size: 450 bytes --]

On Fri, 5 Jan 2024 at 02:41, David Laight <David.Laight@aculab.com> wrote:
>
> Interesting, I'm pretty sure trying to get two blocks of
>  'adc' scheduled in parallel like that doesn't work.

You should check out the benchmark at

       https://github.com/fenrus75/csum_partial

and see if you can improve on it. I'm including the patch (on top of
that code by Arjan) to implement the actual current kernel version as
"New version".

         Linus

[-- Attachment #2: p --]
[-- Type: application/octet-stream, Size: 4840 bytes --]

From 6ff7f7a72a4855970b1621ac9724c44c393a6d44 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 5 Jan 2024 09:46:32 -0800
Subject: [PATCH] Add the current kernel version as "New version"

---
 Makefile       |   3 --
 csum_partial.c | 117 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 115 insertions(+), 5 deletions(-)

diff --git a/Makefile b/Makefile
index e4b1bb3..4e29f8a 100644
--- a/Makefile
+++ b/Makefile
@@ -17,6 +17,3 @@ chain2.svg: graphs/chain2.dot
 chain2a.svg: graphs/chain2a.dot
 	dot -Tsvg -O graphs/chain2a.dot  
 	mv graphs/chain2a.dot.svg chain2a.svg
-	
-	
-	
\ No newline at end of file
diff --git a/csum_partial.c b/csum_partial.c
index 4db0d97..ddf6acd 100644
--- a/csum_partial.c
+++ b/csum_partial.c
@@ -14,13 +14,28 @@
 #include <time.h>
 
 typedef uint32_t __wsum;
+typedef uint32_t __u32;
+typedef uint64_t __u64;
 typedef uint64_t u64;
 typedef uint32_t u32;
+# define likely(x) __builtin_expect(!!(x), 1)
 # define unlikely(x) __builtin_expect(!!(x), 0)
 
+#define __force
+
 #define LOOPCOUNT 102400
 #define PACKETSIZE 40
 
+/**
+ * ror64 - rotate a 64-bit value right
+ * @word: value to rotate
+ * @shift: number of bits to rotate by
+ */
+static inline __u64 ror64(__u64 word, unsigned int shift)
+{
+	return (word >> (shift & 63)) | (word << ((-shift) & 63));
+}
+
 static inline unsigned long load_unaligned_zeropad(const void *addr)
 {
 	unsigned long ret, dummy;
@@ -484,7 +499,105 @@ static inline __wsum nulltest(const void *buff, int len, __wsum sum)
 {
 	return 2;
 }
+static inline __wsum csum_finalize_sum(u64 temp64)
+{
+	return (__force __wsum)((temp64 + ror64(temp64, 32)) >> 32);
+}
 
+static inline unsigned long update_csum_40b(unsigned long sum, const unsigned long m[5])
+{
+	asm("addq %1,%0\n\t"
+	     "adcq %2,%0\n\t"
+	     "adcq %3,%0\n\t"
+	     "adcq %4,%0\n\t"
+	     "adcq %5,%0\n\t"
+	     "adcq $0,%0"
+		:"+r" (sum)
+		:"m" (m[0]), "m" (m[1]), "m" (m[2]),
+		 "m" (m[3]), "m" (m[4]));
+	return sum;
+}
+
+/*
+ * Do a checksum on an arbitrary memory area.
+ * Returns a 32bit checksum.
+ *
+ * This isn't as time critical as it used to be because many NICs
+ * do hardware checksumming these days.
+ *
+ * Still, with CHECKSUM_COMPLETE this is called to compute
+ * checksums on IPv6 headers (40 bytes) and other small parts.
+ * It's best to have buff aligned on a 64-bit boundary.
+ */
+__wsum csum_partial_new(const void *buff, int len, __wsum sum)
+{
+	u64 temp64 = (__force u64)sum;
+
+	/* Do two 40-byte chunks in parallel to get better ILP */
+	if (likely(len >= 80)) {
+		u64 temp64_2 = 0;
+		do {
+			temp64 = update_csum_40b(temp64, buff);
+			temp64_2 = update_csum_40b(temp64_2, buff + 40);
+			buff += 80;
+			len -= 80;
+		} while (len >= 80);
+
+		asm("addq %1,%0\n\t"
+		    "adcq $0,%0"
+		    :"+r" (temp64): "r" (temp64_2));
+	}
+
+	/*
+	 * len == 40 is the hot case due to IPv6 headers, so return
+	 * early for that exact case without checking the tail bytes.
+	 */
+	if (len >= 40) {
+		temp64 = update_csum_40b(temp64, buff);
+		len -= 40;
+		if (!len)
+			return csum_finalize_sum(temp64);
+		buff += 40;
+	}
+
+	if (len & 32) {
+		asm("addq 0*8(%[src]),%[res]\n\t"
+		    "adcq 1*8(%[src]),%[res]\n\t"
+		    "adcq 2*8(%[src]),%[res]\n\t"
+		    "adcq 3*8(%[src]),%[res]\n\t"
+		    "adcq $0,%[res]"
+		    : [res] "+r"(temp64)
+		    : [src] "r"(buff), "m"(*(const char(*)[32])buff));
+		buff += 32;
+	}
+	if (len & 16) {
+		asm("addq 0*8(%[src]),%[res]\n\t"
+		    "adcq 1*8(%[src]),%[res]\n\t"
+		    "adcq $0,%[res]"
+		    : [res] "+r"(temp64)
+		    : [src] "r"(buff), "m"(*(const char(*)[16])buff));
+		buff += 16;
+	}
+	if (len & 8) {
+		asm("addq 0*8(%[src]),%[res]\n\t"
+		    "adcq $0,%[res]"
+		    : [res] "+r"(temp64)
+		    : [src] "r"(buff), "m"(*(const char(*)[8])buff));
+		buff += 8;
+	}
+	if (len & 7) {
+		unsigned int shift = (-len << 3) & 63;
+		unsigned long trail;
+
+		trail = (load_unaligned_zeropad(buff) << shift) >> shift;
+
+		asm("addq %[trail],%[res]\n\t"
+		    "adcq $0,%[res]"
+		    : [res] "+r"(temp64)
+		    : [trail] "r"(trail));
+	}
+	return csum_finalize_sum(temp64);
+}
 
 double cycles[64];
 int cyclecount[64];
@@ -612,6 +725,7 @@ int main(int argc, char **argv)
 
 	MEASURE(2,  csum_partial, "Upcoming linux kernel version");
 	MEASURE(4,  csum_specialized, "Specialized to size 40");
+	MEASURE(6,  csum_partial_new, "New version");
 	MEASURE(22, csum_partial_no_odd, "Odd-alignment handling removed");
 	MEASURE(24, csum_partial_dead_code, "Dead code elimination           ");
 	MEASURE(28, csum_partial_ACX, "ADX interleaved ");
@@ -619,7 +733,6 @@ int main(int argc, char **argv)
 	MEASURE(34, csum_partial_32bit, "32 bit train ");
 	MEASURE(36, csum_partial_zero_sum, "Assume zero input sum");
 
-
 	report();
 	}
-}
\ No newline at end of file
+}

  parent reply	other threads:[~2024-01-05 18:06 UTC|newest]

Thread overview: 33+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <20230628020657.957880-1-goldstein.w.n@gmail.com>
2023-06-28  9:12 ` x86/csum: Remove unnecessary odd handling Borislav Petkov
2023-06-28 15:32   ` Noah Goldstein
2023-06-28 17:44     ` Linus Torvalds
2023-06-28 18:34       ` Noah Goldstein
2023-06-28 20:02         ` Linus Torvalds
2023-06-29 14:04   ` David Laight
2023-06-29 14:27   ` David Laight
2023-09-01 22:21 ` Noah Goldstein
2023-09-06 13:49   ` David Laight
2023-09-06 14:38   ` David Laight
2023-09-20 19:20     ` Noah Goldstein
2023-09-20 19:23 ` Noah Goldstein
2023-09-23  3:24   ` kernel test robot
2023-09-23 14:05     ` Noah Goldstein
2023-09-23 21:13       ` David Laight
2023-09-24 14:35         ` Noah Goldstein
2023-12-23 22:18           ` Noah Goldstein
2024-01-04 23:28             ` Noah Goldstein
2024-01-04 23:34               ` Dave Hansen
2024-01-04 23:36               ` Linus Torvalds
2024-01-05  0:33                 ` Linus Torvalds
2024-01-05 10:41                   ` David Laight
2024-01-05 16:12                     ` David Laight
2024-01-05 18:05                     ` Linus Torvalds [this message]
2024-01-05 23:52                       ` David Laight
2024-01-06  0:18                         ` Linus Torvalds
2024-01-06 10:26                           ` Eric Dumazet
2024-01-06 19:32                             ` Linus Torvalds
2024-01-07 12:11                             ` David Laight
2024-01-06 22:08                       ` David Laight
2024-01-07  1:09                         ` H. Peter Anvin
2024-01-07 11:44                           ` David Laight
2023-09-24 14:35 ` Noah Goldstein

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to='CAHk-=wg8vssPVO68_qH_BHBCj6_DDawKQHBOgZh4gw5YFmpCKA@mail.gmail.com' \
    --to=torvalds@linux-foundation.org \
    --cc=David.Laight@aculab.com \
    --cc=bp@alien8.de \
    --cc=dave.hansen@linux.intel.com \
    --cc=edumazet@google.com \
    --cc=goldstein.w.n@gmail.com \
    --cc=hpa@zytor.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=lkp@intel.com \
    --cc=mingo@redhat.com \
    --cc=oe-kbuild-all@lists.linux.dev \
    --cc=tglx@linutronix.de \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.
  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).