From: Robin Murphy <robin.murphy@arm.com>
To: will@kernel.org, catalin.marinas@arm.com
Cc: zhangshaokun@hisilicon.com, huanglingyan2@huawei.com,
	zhaoyuke@huawei.com, linux-arm-kernel@lists.infradead.org,
	ard.biesheuvel@linaro.org
Subject: [PATCH v5] arm64: Implement optimised checksum routine
Date: Wed, 15 Jan 2020 16:42:39 +0000
Message-ID: <44e00660247f8f39354ba8918071d6cff0d90612.1579106209.git.robin.murphy@arm.com>

Apparently there exist certain workloads which rely heavily on software
checksumming, for which the generic do_csum() implementation becomes a
significant bottleneck. Therefore let's give arm64 its own optimised
version - for ease of maintenance this foregoes assembly or intrinsics,
and is thus not actually arm64-specific, but does rely heavily on C
idioms that translate well to the A64 ISA and the typical load/store
capabilities of most ARMv8 CPU cores.

The resulting increase in checksum throughput scales nicely with buffer
size, tending towards 4x for a small in-order core (Cortex-A53), and up
to 6x or more for an aggressive big core (Ampere eMAG).

Signed-off-by: Robin Murphy <robin.murphy@arm.com>

---
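To illustrate the kind of C idiom the commit message is talking about, the
central accumulation step from csum.c below can be lifted into a standalone
userspace sketch (uint64_t standing in for the kernel's u64):

#include <stdint.h>

/*
 * Ones'-complement accumulation: doing the 64-bit add in a 128-bit
 * temporary exposes the carry-out as bit 64, so the end-around carry
 * is folded back in with a plain add and truncation rather than an
 * explicit overflow check on the 64-bit result.
 */
static uint64_t accumulate(uint64_t sum, uint64_t data)
{
	__uint128_t tmp = (__uint128_t)sum + data;

	return tmp + (tmp >> 64);
}

The main loop below applies the same end-around-carry idea to whole
__uint128_t loads via the rotate-and-add pattern
(tmp += (tmp >> 64) | (tmp << 64)), which is what the "don't dump the
carry flag into a GPR" comment refers to.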

I rigged up a simple userspace test to run the generic and new code for
various buffer lengths at aligned and unaligned offsets (shown in the tables
below as size@offset); data is average runtime in nanoseconds.
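
For anyone wanting to reproduce the numbers, a minimal harness along these
lines should do. This is a sketch of the general shape, not the exact code
used: do_csum_generic() and do_csum_new() are placeholder names for the two
implementations built into the test binary, and the iteration count and
fill pattern are arbitrary:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

/* Placeholders for the generic and new do_csum() implementations */
extern unsigned int do_csum_generic(const unsigned char *buff, int len);
extern unsigned int do_csum_new(const unsigned char *buff, int len);

static uint64_t bench_ns(unsigned int (*csum)(const unsigned char *, int),
			 const unsigned char *buf, int len, int iters)
{
	struct timespec t0, t1;
	volatile unsigned int sink = 0;
	int i;

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (i = 0; i < iters; i++)
		sink += csum(buf, len);
	clock_gettime(CLOCK_MONOTONIC, &t1);
	(void)sink;

	return ((uint64_t)(t1.tv_sec - t0.tv_sec) * 1000000000 +
		t1.tv_nsec - t0.tv_nsec) / iters;
}

int main(void)
{
	static unsigned char buf[8192 + 8];
	static const int sizes[] = { 4, 8, 16, 32, 64, 128, 256, 512,
				     1024, 2048, 4096, 8192 };
	unsigned int i, off;

	memset(buf, 0xa5, sizeof(buf));
	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		for (off = 0; off <= 3; off += 3)
			printf("%6d@%u:\t%llu\t%llu\n", sizes[i], off,
			       (unsigned long long)bench_ns(do_csum_generic,
						buf + off, sizes[i], 100000),
			       (unsigned long long)bench_ns(do_csum_new,
						buf + off, sizes[i], 100000));
	return 0;
}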

Ampere eMAG:

GCC 8.3.0
size		generic	new	speedup
     4@0:	8	8	100%
     4@3:	6	8	75%
     8@0:	9	8	112%
     8@3:	9	9	100%
    16@0:	12	9	133%
    16@3:	12	9	133%
    32@0:	18	10	180%
    32@3:	18	10	180%
    64@0:	31	13	238%
    64@3:	30	14	214%
   128@0:	55	20	275%
   128@3:	55	21	261%
   256@0:	105	28	375%
   256@3:	104	28	371%
   512@0:	203	44	461%
   512@3:	203	44	461%
  1024@0:	402	75	536%
  1024@3:	402	75	536%
  2048@0:	799	136	587%
  2048@3:	795	136	584%
  4096@0:	1588	259	613%
  4096@3:	1586	260	610%
  8192@0:	3178	508	625%
  8192@3:	3168	507	624%

Clang 8.0.0
size		generic	new	speedup
     4@0:	8	8	100%
     4@3:	5	8	62%
     8@0:	9	8	112%
     8@3:	9	8	112%
    16@0:	11	8	137%
    16@3:	12	12	100%
    32@0:	17	11	154%
    32@3:	17	13	130%
    64@0:	26	16	162%
    64@3:	26	18	144%
   128@0:	46	23	200%
   128@3:	46	25	184%
   256@0:	86	34	252%
   256@3:	86	36	238%
   512@0:	164	56	292%
   512@3:	165	58	284%
  1024@0:	322	101	318%
  1024@3:	322	102	315%
  2048@0:	638	190	335%
  2048@3:	638	191	334%
  4096@0:	1274	367	347%
  4096@3:	1274	369	345%
  8192@0:	2536	723	350%
  8192@3:	2539	724	350%

Arm Cortex-A53:

GCC 8.3.0
size		generic	new	speedup
     4@0:	40	38	105%
     4@3:	29	38	76%
     8@0:	47	38	123%
     8@3:	40	38	105%
    16@0:	55	38	144%
    16@3:	50	41	121%
    32@0:	76	43	176%
    32@3:	72	48	150%
    64@0:	134	58	231%
    64@3:	127	64	198%
   128@0:	219	87	251%
   128@3:	211	92	229%
   256@0:	388	129	300%
   256@3:	380	134	283%
   512@0:	725	214	338%
   512@3:	718	218	329%
  1024@0:	1400	392	357%
  1024@3:	1393	398	350%
  2048@0:	2751	730	376%
  2048@3:	2743	736	372%
  4096@0:	5451	1405	387%
  4096@3:	5444	1411	385%
  8192@0:	10854	2755	393%
  8192@3:	10846	2762	392%

Clang 8.0.0
size		generic	new	speedup
     4@0:	49	32	153%
     4@3:	31	32	96%
     8@0:	54	32	168%
     8@3:	48	36	133%
    16@0:	63	36	175%
    16@3:	56	47	119%
    32@0:	78	50	156%
    32@3:	73	56	130%
    64@0:	125	67	186%
    64@3:	116	72	161%
   128@0:	192	94	204%
   128@3:	183	99	184%
   256@0:	327	136	240%
   256@3:	319	141	226%
   512@0:	597	227	262%
   512@3:	589	226	260%
  1024@0:	1138	397	286%
  1024@3:	1129	404	279%
  2048@0:	2218	735	301%
  2048@3:	2209	741	298%
  4096@3:	4369	1417	308%
  8192@0:	8699	2761	315%
  8192@3:	8691	2767	314%
---
 arch/arm64/include/asm/checksum.h |   3 +
 arch/arm64/lib/Makefile           |   6 +-
 arch/arm64/lib/csum.c             | 123 ++++++++++++++++++++++++++++++
 3 files changed, 129 insertions(+), 3 deletions(-)
 create mode 100644 arch/arm64/lib/csum.c

diff --git a/arch/arm64/include/asm/checksum.h b/arch/arm64/include/asm/checksum.h
index d064a50deb5f..8d2a7de39744 100644
--- a/arch/arm64/include/asm/checksum.h
+++ b/arch/arm64/include/asm/checksum.h
@@ -35,6 +35,9 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
 }
 #define ip_fast_csum ip_fast_csum
 
+extern unsigned int do_csum(const unsigned char *buff, int len);
+#define do_csum do_csum
+
 #include <asm-generic/checksum.h>
 
 #endif	/* __ASM_CHECKSUM_H */
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index c21b936dc01d..2fc253466dbf 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -1,9 +1,9 @@
 # SPDX-License-Identifier: GPL-2.0
 lib-y		:= clear_user.o delay.o copy_from_user.o		\
 		   copy_to_user.o copy_in_user.o copy_page.o		\
-		   clear_page.o memchr.o memcpy.o memmove.o memset.o	\
-		   memcmp.o strcmp.o strncmp.o strlen.o strnlen.o	\
-		   strchr.o strrchr.o tishift.o
+		   clear_page.o csum.o memchr.o memcpy.o memmove.o	\
+		   memset.o memcmp.o strcmp.o strncmp.o strlen.o	\
+		   strnlen.o strchr.o strrchr.o tishift.o
 
 ifeq ($(CONFIG_KERNEL_MODE_NEON), y)
 obj-$(CONFIG_XOR_BLOCKS)	+= xor-neon.o
diff --git a/arch/arm64/lib/csum.c b/arch/arm64/lib/csum.c
new file mode 100644
index 000000000000..99cc11999756
--- /dev/null
+++ b/arch/arm64/lib/csum.c
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2019-2020 Arm Ltd.
+
+#include <linux/compiler.h>
+#include <linux/kasan-checks.h>
+#include <linux/kernel.h>
+
+#include <net/checksum.h>
+
+/* Looks dumb, but generates nice-ish code */
+static u64 accumulate(u64 sum, u64 data)
+{
+	__uint128_t tmp = (__uint128_t)sum + data;
+	return tmp + (tmp >> 64);
+}
+
+unsigned int do_csum(const unsigned char *buff, int len)
+{
+	unsigned int offset, shift, sum;
+	const u64 *ptr;
+	u64 data, sum64 = 0;
+
+	offset = (unsigned long)buff & 7;
+	/*
+	 * This is to all intents and purposes safe, since rounding down cannot
+	 * result in a different page or cache line being accessed, and @buff
+	 * should absolutely not be pointing to anything read-sensitive. We do,
+	 * however, have to be careful not to piss off KASAN, which means using
+	 * unchecked reads to accommodate the head and tail, for which we'll
+	 * compensate with an explicit check up-front.
+	 */
+	kasan_check_read(buff, len);
+	ptr = (u64 *)(buff - offset);
+	len = len + offset - 8;
+
+	/*
+	 * Head: zero out any excess leading bytes. Shifting back by the same
+	 * amount should be at least as fast as any other way of handling the
+	 * odd/even alignment, and means we can ignore it until the very end.
+	 */
+	shift = offset * 8;
+	data = READ_ONCE_NOCHECK(*ptr++);
+#ifdef __LITTLE_ENDIAN
+	data = (data >> shift) << shift;
+#else
+	data = (data << shift) >> shift;
+#endif
+
+	/*
+	 * Body: straightforward aligned loads from here on (the paired loads
+	 * underlying the quadword type still only need dword alignment). The
+	 * main loop strictly excludes the tail, so the second loop will always
+	 * run at least once.
+	 */
+	while (len > 64) {
+		__uint128_t tmp1, tmp2, tmp3, tmp4;
+
+		tmp1 = READ_ONCE_NOCHECK(*(__uint128_t *)ptr);
+		tmp2 = READ_ONCE_NOCHECK(*(__uint128_t *)(ptr + 2));
+		tmp3 = READ_ONCE_NOCHECK(*(__uint128_t *)(ptr + 4));
+		tmp4 = READ_ONCE_NOCHECK(*(__uint128_t *)(ptr + 6));
+
+		len -= 64;
+		ptr += 8;
+
+		/* This is the "don't dump the carry flag into a GPR" idiom */
+		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
+		tmp2 += (tmp2 >> 64) | (tmp2 << 64);
+		tmp3 += (tmp3 >> 64) | (tmp3 << 64);
+		tmp4 += (tmp4 >> 64) | (tmp4 << 64);
+		tmp1 = ((tmp1 >> 64) << 64) | (tmp2 >> 64);
+		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
+		tmp3 = ((tmp3 >> 64) << 64) | (tmp4 >> 64);
+		tmp3 += (tmp3 >> 64) | (tmp3 << 64);
+		tmp1 = ((tmp1 >> 64) << 64) | (tmp3 >> 64);
+		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
+		tmp1 = ((tmp1 >> 64) << 64) | sum64;
+		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
+		sum64 = tmp1 >> 64;
+	}
+	while (len > 8) {
+		__uint128_t tmp;
+
+		sum64 = accumulate(sum64, data);
+		tmp = READ_ONCE_NOCHECK(*(__uint128_t *)ptr);
+
+		len -= 16;
+		ptr += 2;
+
+#ifdef __LITTLE_ENDIAN
+		data = tmp >> 64;
+		sum64 = accumulate(sum64, tmp);
+#else
+		data = tmp;
+		sum64 = accumulate(sum64, tmp >> 64);
+#endif
+	}
+	if (len > 0) {
+		sum64 = accumulate(sum64, data);
+		data = READ_ONCE_NOCHECK(*ptr);
+		len -= 8;
+	}
+	/*
+	 * Tail: zero any over-read bytes similarly to the head, again
+	 * preserving odd/even alignment.
+	 */
+	shift = len * -8;
+#ifdef __LITTLE_ENDIAN
+	data = (data << shift) >> shift;
+#else
+	data = (data >> shift) << shift;
+#endif
+	sum64 = accumulate(sum64, data);
+
+	/* Finally, folding */
+	sum64 += (sum64 >> 32) | (sum64 << 32);
+	sum = sum64 >> 32;
+	sum += (sum >> 16) | (sum << 16);
+	if (offset & 1)
+		return (u16)swab32(sum);
+
+	return sum >> 16;
+}
-- 
2.23.0.dirty

