From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
To: linux-arm-kernel@lists.infradead.org
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>,
	netdev@vger.kernel.org, ilias.apalodimas@linaro.org,
	will.deacon@arm.com,
	"huanglingyan \(A\)" <huanglingyan2@huawei.com>,
	steve.capper@arm.com
Subject: [PATCH] arm64: do_csum: implement accelerated scalar version
Date: Tue, 19 Feb 2019 00:08:42 +0100
Message-ID: <20190218230842.11448-1-ard.biesheuvel@linaro.org>

It turns out that the IP checksumming code is still exercised often,
even though one might expect that modern NICs with checksum offload
have no use for it. However, as Lingyan points out, there are
combinations of features where the network stack may still fall back
to software checksumming, and so it makes sense to provide an
optimized implementation in software as well.

So provide an implementation of do_csum() in scalar assembler, which,
unlike C, gives direct access to the carry flag, making the code run
substantially faster. The routine uses overlapping 64-byte loads for
all input sizes greater than 64 bytes, in order to reduce the number
of branches and improve performance on cores with deep pipelines.
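
For comparison, portable C has to re-derive the carry out of every
64-bit addition with an extra compare, roughly as below (a sketch for
illustration only, not taken from the kernel sources; sum64 is a
made-up name):

  #include <stddef.h>
  #include <stdint.h>

  /* One's complement accumulation in portable C: each 64-bit add
   * needs a compare to recover the carry that adcs gets for free. */
  static uint64_t sum64(const uint64_t *p, size_t words)
  {
          uint64_t sum = 0;

          while (words--) {
                  uint64_t v = *p++;

                  sum += v;
                  if (sum < v)    /* carry out of the add */
                          sum++;  /* end-around carry */
          }
          return sum;
  }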

On Cortex-A57, this implementation is on par with Lingyan's NEON
implementation, and roughly 7x as fast as the generic C code.

Cc: "huanglingyan (A)" <huanglingyan2@huawei.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
Test code after the patch.

 arch/arm64/include/asm/checksum.h |   3 +
 arch/arm64/lib/Makefile           |   2 +-
 arch/arm64/lib/csum.S             | 127 ++++++++++++++++++++
 3 files changed, 131 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/checksum.h b/arch/arm64/include/asm/checksum.h
index 0b6f5a7d4027..e906b956c1fc 100644
--- a/arch/arm64/include/asm/checksum.h
+++ b/arch/arm64/include/asm/checksum.h
@@ -46,6 +46,9 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
 }
 #define ip_fast_csum ip_fast_csum
 
+extern unsigned int do_csum(const unsigned char *buff, int len);
+#define do_csum do_csum
+
 #include <asm-generic/checksum.h>
 
 #endif	/* __ASM_CHECKSUM_H */
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 5540a1638baf..a7606007a749 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -3,7 +3,7 @@ lib-y		:= clear_user.o delay.o copy_from_user.o		\
 		   copy_to_user.o copy_in_user.o copy_page.o		\
 		   clear_page.o memchr.o memcpy.o memmove.o memset.o	\
 		   memcmp.o strcmp.o strncmp.o strlen.o strnlen.o	\
-		   strchr.o strrchr.o tishift.o
+		   strchr.o strrchr.o tishift.o csum.o
 
 ifeq ($(CONFIG_KERNEL_MODE_NEON), y)
 obj-$(CONFIG_XOR_BLOCKS)	+= xor-neon.o
diff --git a/arch/arm64/lib/csum.S b/arch/arm64/lib/csum.S
new file mode 100644
index 000000000000..534e2ebdc426
--- /dev/null
+++ b/arch/arm64/lib/csum.S
@@ -0,0 +1,127 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ENTRY(do_csum)
+	adds		x2, xzr, xzr		// clear x2 and C flag
+
+	// 64 bytes at a time
+	lsr		x3, x1, #6
+	and		x1, x1, #63
+	cbz		x3, 1f
+
+	// Eight 64-bit adds per iteration
+0:	ldp		x4, x5, [x0], #64
+	ldp		x6, x7, [x0, #-48]
+	ldp		x8, x9, [x0, #-32]
+	ldp		x10, x11, [x0, #-16]
+	adcs		x2, x2, x4
+	sub		x3, x3, #1
+	adcs		x2, x2, x5
+	adcs		x2, x2, x6
+	adcs		x2, x2, x7
+	adcs		x2, x2, x8
+	adcs		x2, x2, x9
+	adcs		x2, x2, x10
+	adcs		x2, x2, x11
+	cbnz		x3, 0b
+	adc		x2, x2, xzr
+
+	cbz		x1, 7f
+	bic		x3, x1, #1
+	add		x12, x0, x1
+	add		x0, x0, x3
+	neg		x3, x3
+	add		x3, x3, #64
+	lsl		x3, x3, #3
+
+	// Handle remaining 63 bytes or less using an overlapping 64-byte load
+	// and a branchless code path to complete the calculation
+	ldp		x4, x5, [x0, #-64]
+	ldp		x6, x7, [x0, #-48]
+	ldp		x8, x9, [x0, #-32]
+	ldp		x10, x11, [x0, #-16]
+	ldrb		w12, [x12, #-1]
+
+	.irp		reg, x4, x5, x6, x7, x8, x9, x10, x11
+	cmp		x3, #64
+	csel		\reg, \reg, xzr, lt
+	ccmp		x3, xzr, #0, lt
+	csel		x13, x3, xzr, gt
+	sub		x3, x3, #64
+CPU_LE(	lsr		\reg, \reg, x13		)
+CPU_BE(	lsl		\reg, \reg, x13		)
+	.endr
+
+	adds		x2, x2, x4
+	adcs		x2, x2, x5
+	adcs		x2, x2, x6
+	adcs		x2, x2, x7
+	adcs		x2, x2, x8
+	adcs		x2, x2, x9
+	adcs		x2, x2, x10
+	adcs		x2, x2, x11
+	adc		x2, x2, xzr
+
+CPU_LE(	adds		x12, x2, x12		)
+CPU_BE(	adds		x12, x2, x12, lsl #8	)
+	adc		x12, x12, xzr
+	tst		x1, #1
+	csel		x2, x2, x12, eq
+
+7:	lsr		x1, x2, #32
+	adds		w2, w2, w1
+	adc		w2, w2, wzr
+
+	lsr		w1, w2, #16
+	uxth		w2, w2
+	add		w2, w2, w1
+
+	lsr		w1, w2, #16		// handle the carry by hand
+	add		w2, w2, w1
+
+	uxth		w0, w2
+	ret
+
+	// Handle 63 bytes or less
+1:	tbz		x1, #5, 2f
+	ldp		x4, x5, [x0], #32
+	ldp		x6, x7, [x0, #-16]
+	adds		x2, x2, x4
+	adcs		x2, x2, x5
+	adcs		x2, x2, x6
+	adcs		x2, x2, x7
+	adc		x2, x2, xzr
+
+2:	tbz		x1, #4, 3f
+	ldp		x4, x5, [x0], #16
+	adds		x2, x2, x4
+	adcs		x2, x2, x5
+	adc		x2, x2, xzr
+
+3:	tbz		x1, #3, 4f
+	ldr		x4, [x0], #8
+	adds		x2, x2, x4
+	adc		x2, x2, xzr
+
+4:	tbz		x1, #2, 5f
+	ldr		w4, [x0], #4
+	adds		x2, x2, x4
+	adc		x2, x2, xzr
+
+5:	tbz		x1, #1, 6f
+	ldrh		w4, [x0], #2
+	adds		x2, x2, x4
+	adc		x2, x2, xzr
+
+6:	tbz		x1, #0, 7b
+	ldrb		w4, [x0]
+CPU_LE(	adds		x2, x2, x4		)
+CPU_BE(	adds		x2, x2, x4, lsl #8	)
+	adc		x2, x2, xzr
+	b		7b
+ENDPROC(do_csum)
-- 
2.20.1
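
As an aside, the overlapping-load trick used for the tail (the
CPU_LE/CPU_BE .irp block above) can be pictured in C as follows; this
is a little-endian sketch for illustration only, with tail64 being a
made-up helper, whereas the actual code uses four overlapping 16-byte
load pairs with per-register conditional shifts:

  #include <stdint.h>
  #include <string.h>

  /* Return the last rem (1..8) bytes of the buffer as a 64-bit
   * value, using one overlapping 8-byte load ending at buf + len.
   * The bytes that were already summed are shifted out. Assumes
   * len >= 8 and a little-endian CPU. */
  static uint64_t tail64(const unsigned char *buf, int len, int rem)
  {
          uint64_t last;

          memcpy(&last, buf + len - 8, 8);  /* re-reads 8 - rem old bytes */
          return last >> (8 * (8 - rem));   /* keep only the new bytes */
  }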

  diff --git a/lib/checksum.c b/lib/checksum.c
  index d3ec93f9e5f3..7711f1186f71 100644
  --- a/lib/checksum.c
  +++ b/lib/checksum.c
  @@ -37,7 +37,7 @@
   
   #include <asm/byteorder.h>
   
  -#ifndef do_csum
  +#if 1 //ndef do_csum
   static inline unsigned short from32to16(unsigned int x)
   {
          /* add up 16-bit and 16-bit for 16+c bit */
  @@ -47,7 +47,7 @@ static inline unsigned short from32to16(unsigned int x)
          return x;
   }
   
  -static unsigned int do_csum(const unsigned char *buff, int len)
  +static unsigned int __do_csum(const unsigned char *buff, int len)
   {
          int odd;
          unsigned int result = 0;
  @@ -206,3 +206,23 @@ __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
   }
   EXPORT_SYMBOL(csum_tcpudp_nofold);
   #endif
  +
  +extern u8 crypto_ft_tab[];
  +
  +static int __init do_selftest(void)
  +{
  +       int i, j;
  +       u16 c1, c2;
  +
  +       for (i = 0; i < 1024; i++) {
  +               for (j = i + 1; j <= 1024; j++) {
  +                       c1 = __do_csum(crypto_ft_tab + i, j - i);
  +                       c2 = do_csum(crypto_ft_tab + i, j - i);
  +
  +                       if (c1 != c2)
  +                               pr_err("######### %d %d %x %x\n", i, j, c1, c2);
  +               }
  +       }
  +       return 0;
  +}
  +late_initcall(do_selftest);
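
For reference, the final 64-to-16 bit folding sequence at label 7 in
csum.S corresponds to the following C model (again just a sketch for
illustration; fold64 is a made-up name and not part of the patch):

  #include <stdint.h>

  /* Fold a 64-bit one's complement accumulator down to 16 bits,
   * mirroring the steps at label 7 in csum.S. */
  static uint16_t fold64(uint64_t sum)
  {
          uint32_t s = (uint32_t)sum + (uint32_t)(sum >> 32);

          if (s < (uint32_t)sum)         /* carry out of the 32-bit add */
                  s++;
          s = (s & 0xffff) + (s >> 16);  /* fold 32 bits to 17 */
          s = (s & 0xffff) + (s >> 16);  /* absorb the final carry */
          return (uint16_t)s;
  }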
