* [PATCH] arm64/lib: add accelerated do_csum for arm64
@ 2018-12-28 7:37 huhai
2018-12-28 14:46 ` kbuild test robot
2019-01-03 8:13 ` Christoph Hellwig
0 siblings, 2 replies; 3+ messages in thread
From: huhai @ 2018-12-28 7:37 UTC (permalink / raw)
To: will.deacon, catalin.marinas
Cc: linux-arm-kernel, linux-kernel, liuyun01, huhai
do_csum() in lib/checksum.c is too slow on arm64;
use assembly and a better algorithm to accelerate it.
Signed-off-by: huhai <huhai@kylinos.cn>
---
arch/arm64/include/asm/checksum.h | 3 +
arch/arm64/lib/Makefile | 2 +-
arch/arm64/lib/checksum.c | 144 ++++++++++++++++++++++++++++++
3 files changed, 148 insertions(+), 1 deletion(-)
create mode 100644 arch/arm64/lib/checksum.c
diff --git a/arch/arm64/include/asm/checksum.h b/arch/arm64/include/asm/checksum.h
index 0b6f5a7d4027..0d7b80fb300e 100644
--- a/arch/arm64/include/asm/checksum.h
+++ b/arch/arm64/include/asm/checksum.h
@@ -26,6 +26,9 @@ static inline __sum16 csum_fold(__wsum csum)
}
#define csum_fold csum_fold
+#define do_csum do_csum
+unsigned int do_csum(const unsigned char *buff, unsigned int len);
+
static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
{
__uint128_t tmp;
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 69ff9887f724..4134730a121b 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
-lib-y := clear_user.o delay.o copy_from_user.o \
+lib-y := checksum.o clear_user.o delay.o copy_from_user.o \
copy_to_user.o copy_in_user.o copy_page.o \
clear_page.o memchr.o memcpy.o memmove.o memset.o \
memcmp.o strcmp.o strncmp.o strlen.o strnlen.o \
diff --git a/arch/arm64/lib/checksum.c b/arch/arm64/lib/checksum.c
new file mode 100644
index 000000000000..6931ef13ef87
--- /dev/null
+++ b/arch/arm64/lib/checksum.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Network checksum routines that are better done in an
+ * architecture-specific manner due to speed.
+ *
+ * Copyright (C) 2018 huhai <huhai@kylinos.cn>
+ *
+ * Acknowledgements:
+ * This file is based on arch/x86/lib/csum-partial_64.c and
+ * arch/alpha/lib/checksum.c, which were written by Thomas Gleixner
+ * and Rick Gorton respectively.
+ */
+
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <asm/checksum.h>
+
+/*
+ * Fold a 64-bit partial checksum down to 16 bits by repeated
+ * end-around-carry addition (64 -> 33 -> ~17 -> 16 bits).
+ */
+static inline unsigned short from64to16(unsigned long x)
+{
+	/* Using extract instructions is a bit more efficient
+	 * than the original shift/bitmask version.
+	 */
+
+	union {
+		unsigned long ul;
+		unsigned int ui[2];
+		unsigned short us[4];
+	} in_v, tmp_v, out_v;
+
+	in_v.ul = x;
+	/* 32+32 -> at most 33 significant bits in tmp_v.ul. */
+	tmp_v.ul = (unsigned long) in_v.ui[0] + (unsigned long) in_v.ui[1];
+
+	/* Since the bits of tmp_v.us[3] are going to always be zero,
+	 * we don't have to bother to add that in.
+	 */
+	out_v.ul = (unsigned long) tmp_v.us[0] + (unsigned long) tmp_v.us[1]
+			+ (unsigned long) tmp_v.us[2];
+
+	/* Similarly, out_v.us[2] is always zero for the final add. */
+	return out_v.us[0] + out_v.us[1];
+}
+
+/*
+ * Do a 64-bit checksum on an arbitrary memory area.
+ * Returns a 16bit checksum.
+ */
+/*
+ * Do a 64-bit checksum on an arbitrary memory area.
+ * Returns a 16-bit folded checksum.
+ *
+ * The byte pointer is only cast to a wider type after the
+ * corresponding alignment has been established, so every wide
+ * load below is naturally aligned.
+ */
+unsigned int do_csum(const unsigned char *buff, unsigned len)
+{
+	unsigned odd, count;
+	unsigned long result = 0;
+
+	if (unlikely(len == 0))
+		return result;
+
+	/* Consume a leading odd byte; the final result is byte-swapped
+	 * back at the end to compensate for the rotation this causes.
+	 */
+	odd = 1 & (unsigned long) buff;
+	if (odd) {
+		result = *buff << 8;
+		len--;
+		buff++;
+	}
+	count = len >> 1;		/* nr of 16-bit words.. */
+	if (count) {
+		if (2 & (unsigned long) buff) {
+			result += *(unsigned short *)buff;
+			count--;
+			len -= 2;
+			buff += 2;
+		}
+		count >>= 1;		/* nr of 32-bit words.. */
+		if (count) {
+			unsigned long zero = 0;
+			unsigned long tmp;
+			unsigned count64;
+
+			if (4 & (unsigned long) buff) {
+				result += *(unsigned int *) buff;
+				count--;
+				len -= 4;
+				buff += 4;
+			}
+			count >>= 1;	/* nr of 64-bit words.. */
+
+			/* main loop using 64byte blocks */
+			count64 = count >> 3;
+			while (count64) {
+				/*
+				 * %1 (tmp) is written before it is read, so
+				 * it must be an early-clobber *output*
+				 * ("=&r"), not an input: the old input
+				 * constraint triggered a (correct)
+				 * maybe-uninitialized warning and let the
+				 * compiler share its register with a live
+				 * input.  The adds/adcs chain propagates
+				 * the carry across all eight loads; the
+				 * trailing adc folds the final carry back
+				 * into the sum.
+				 */
+				__asm__ __volatile__(
+				"ldr %x1, [%x2, #0]\n"
+				"adds %x0, %x0, %x1\n"
+				"ldr %x1, [%x2, #8]\n"
+				"adcs %x0, %x0, %x1\n"
+				"ldr %x1, [%x2, #16]\n"
+				"adcs %x0, %x0, %x1\n"
+				"ldr %x1, [%x2, #24]\n"
+				"adcs %x0, %x0, %x1\n"
+				"ldr %x1, [%x2, #32]\n"
+				"adcs %x0, %x0, %x1\n"
+				"ldr %x1, [%x2, #40]\n"
+				"adcs %x0, %x0, %x1\n"
+				"ldr %x1, [%x2, #48]\n"
+				"adcs %x0, %x0, %x1\n"
+				"ldr %x1, [%x2, #56]\n"
+				"adcs %x0, %x0, %x1\n"
+				"adc %x0, %x0, %x3\n"
+				: "=r" (result), "=&r" (tmp)
+				: "r" (buff), "r" (zero), "0" (result)
+				: "cc", "memory");
+
+				buff += 64;
+				count64--;
+			}
+
+			/* last up to 7 8byte blocks */
+			count %= 8;
+
+			while (count) {
+				/* add the word, then fold the carry back in */
+				__asm__ __volatile__(
+				"adds %x0, %x0, %x1\n"
+				"adc %x0, %x0, %x2\n"
+				: "=r" (result)
+				: "r" (*(unsigned long *)buff), "r" (zero),
+				  "0" (result)
+				: "cc", "memory");
+				--count;
+				buff += 8;
+			}
+			/* fold 64 -> 33 bits so the plain C tail adds
+			 * below cannot overflow an unsigned long
+			 */
+			result = (result & 0xffffffff) + (result >> 32);
+
+			if (len & 4) {
+				result += *(unsigned int *) buff;
+				buff += 4;
+			}
+		}
+		if (len & 2) {
+			result += *(unsigned short *) buff;
+			buff += 2;
+		}
+	}
+	if (len & 1)
+		result += *buff;
+
+	result = from64to16(result);
+
+	/* Undo the byte rotation introduced for odd start addresses. */
+	if (odd)
+		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
+
+	return result;
+}
--
2.20.1
^ permalink raw reply related [flat|nested] 3+ messages in thread
* Re: [PATCH] arm64/lib: add accelerated do_csum for arm64
2018-12-28 7:37 [PATCH] arm64/lib: add accelerated do_csum for arm64 huhai
@ 2018-12-28 14:46 ` kbuild test robot
2019-01-03 8:13 ` Christoph Hellwig
1 sibling, 0 replies; 3+ messages in thread
From: kbuild test robot @ 2018-12-28 14:46 UTC (permalink / raw)
To: huhai
Cc: kbuild-all, will.deacon, catalin.marinas, linux-arm-kernel,
linux-kernel, liuyun01, huhai
[-- Attachment #1: Type: text/plain, Size: 2663 bytes --]
Hi huhai,
Thank you for the patch! Perhaps something to improve:
[auto build test WARNING on arm64/for-next/core]
[also build test WARNING on v4.20 next-20181224]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
url: https://github.com/0day-ci/linux/commits/huhai/arm64-lib-add-accelerated-do_csum-for-arm64/20181228-155335
base: https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git for-next/core
config: arm64-allmodconfig (attached as .config)
compiler: aarch64-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
GCC_VERSION=7.2.0 make.cross ARCH=arm64
Note: it may well be a FALSE warning. FWIW you are at least aware of it now.
http://gcc.gnu.org/wiki/Better_Uninitialized_Warnings
All warnings (new ones prefixed by >>):
arch/arm64/lib/checksum.c: In function 'do_csum':
>> arch/arm64/lib/checksum.c:85:5: warning: 'tmp1' may be used uninitialized in this function [-Wmaybe-uninitialized]
__asm__ __volatile__(
^~~~~~~
vim +/tmp1 +85 arch/arm64/lib/checksum.c
41
42 /*
43 * Do a 64-bit checksum on an arbitrary memory area.
44 * Returns a 16bit checksum.
45 */
46 unsigned int do_csum(const unsigned char *buff, unsigned len)
47 {
48 unsigned odd, count;
49 unsigned long result = 0;
50
51 if (unlikely(len == 0))
52 return result;
53 odd = 1 & (unsigned long) buff;
54 if (odd) {
55 result = *buff << 8;
56 len--;
57 buff++;
58 }
59 count = len >> 1; /* nr of 16-bit words.. */
60 if (count) {
61 if (2 & (unsigned long) buff) {
62 result += *(unsigned short *)buff;
63 count--;
64 len -= 2;
65 buff += 2;
66 }
67 count >>= 1; /* nr of 32-bit words.. */
68 if (count) {
69 unsigned long zero;
70 unsigned long tmp1;
71 unsigned count64;
72
73 if (4 & (unsigned long) buff) {
74 result += *(unsigned int *) buff;
75 count--;
76 len -= 4;
77 buff += 4;
78 }
79 count >>= 1; /* nr of 64-bit words.. */
80
81 /* main loop using 64byte blocks */
82 zero = 0;
83 count64 = count >> 3;
84 while (count64) {
> 85 __asm__ __volatile__(
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 61869 bytes --]
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PATCH] arm64/lib: add accelerated do_csum for arm64
2018-12-28 7:37 [PATCH] arm64/lib: add accelerated do_csum for arm64 huhai
2018-12-28 14:46 ` kbuild test robot
@ 2019-01-03 8:13 ` Christoph Hellwig
1 sibling, 0 replies; 3+ messages in thread
From: Christoph Hellwig @ 2019-01-03 8:13 UTC (permalink / raw)
To: huhai
Cc: will.deacon, catalin.marinas, linux-arm-kernel, linux-kernel, liuyun01
> diff --git a/arch/arm64/lib/checksum.c b/arch/arm64/lib/checksum.c
> new file mode 100644
> index 000000000000..6931ef13ef87
> --- /dev/null
> +++ b/arch/arm64/lib/checksum.c
> @@ -0,0 +1,144 @@
> +/*
> + * arch/arm64/lib/checksum.c
> + *
No need to mention the file name. On the other hand it really should
have a SPDX tag, and preferably a copyright notice.
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2019-01-03 8:13 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-12-28 7:37 [PATCH] arm64/lib: add accelerated do_csum for arm64 huhai
2018-12-28 14:46 ` kbuild test robot
2019-01-03 8:13 ` Christoph Hellwig
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).