From: Lingyan Huang <huanglingyan2@huawei.com>
To: <linux-arm-kernel@lists.infradead.org>
Cc: Lingyan Huang <huanglingyan2@huawei.com>,
Ard Biesheuvel <ard.biesheuvel@linaro.org>,
Will Deacon <will.deacon@arm.com>,
Catalin Marinas <catalin.marinas@arm.com>
Subject: [PATCH v3] arm64: lib: accelerate do_csum with NEON instruction
Date: Sun, 6 Jan 2019 09:55:29 +0800 [thread overview]
Message-ID: <1546739729-17234-1-git-send-email-huanglingyan2@huawei.com> (raw)
Function do_csum() in lib/checksum.c is used to compute checksum,
which is turned out to be slowly and costs a lot of resources.
Let's use neon instructions to accelerate the checksum computation
for arm64.
------
V2 ==> V3:
only modify the arm64 codes instead of modifying headers
under asm-generic and code in lib/checksum.c.
------
------
V1 ==> V2:
Change NEON assembly code to NEON intrinsic code which is built
on top of arm_neon.h to avoid dropping into assembly.
------
Here is the comparison results of function ip_compute_csum() between
general do_csum() and neon instruction do_csum(). The test platform
is HUAWEI 1620 server with TAISHAN cores.
len(1000cycle) general(ns) do_csum_neon(ns)
64B: 58060 59460
128B: 82930 83930
256B: 132480 73570
512B: 230100 86230
1024B: 426600 98200
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Lingyan Huang <huanglingyan2@huawei.com>
---
arch/arm64/include/asm/checksum.h | 5 ++
arch/arm64/lib/Makefile | 8 +--
arch/arm64/lib/checksum.c | 26 ++++++++
arch/arm64/lib/do_csum.c | 136 ++++++++++++++++++++++++++++++++++++++
4 files changed, 171 insertions(+), 4 deletions(-)
create mode 100644 arch/arm64/lib/checksum.c
create mode 100644 arch/arm64/lib/do_csum.c
diff --git a/arch/arm64/include/asm/checksum.h b/arch/arm64/include/asm/checksum.h
index 0b6f5a7..7acd713 100644
--- a/arch/arm64/include/asm/checksum.h
+++ b/arch/arm64/include/asm/checksum.h
@@ -26,6 +26,10 @@ static inline __sum16 csum_fold(__wsum csum)
}
#define csum_fold csum_fold
+#define do_csum do_csum
+unsigned int do_csum(const unsigned char *buff, int len);
+extern unsigned int do_csum_arm(const unsigned char *buff, int len);
+
static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
{
__uint128_t tmp;
@@ -46,6 +50,7 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
}
#define ip_fast_csum ip_fast_csum
+
#include <asm-generic/checksum.h>
#endif /* __ASM_CHECKSUM_H */
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 5540a16..c0b5b8c 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -3,12 +3,12 @@ lib-y := clear_user.o delay.o copy_from_user.o \
copy_to_user.o copy_in_user.o copy_page.o \
clear_page.o memchr.o memcpy.o memmove.o memset.o \
memcmp.o strcmp.o strncmp.o strlen.o strnlen.o \
- strchr.o strrchr.o tishift.o
+ strchr.o strrchr.o tishift.o checksum.o
ifeq ($(CONFIG_KERNEL_MODE_NEON), y)
-obj-$(CONFIG_XOR_BLOCKS) += xor-neon.o
-CFLAGS_REMOVE_xor-neon.o += -mgeneral-regs-only
-CFLAGS_xor-neon.o += -ffreestanding
+obj-y += do_csum.o
+CFLAGS_REMOVE_do_csum.o += -mgeneral-regs-only
+CFLAGS_do_csum.o += -ffreestanding
endif
# Tell the compiler to treat all general purpose registers (with the
diff --git a/arch/arm64/lib/checksum.c b/arch/arm64/lib/checksum.c
new file mode 100644
index 0000000..15a31bb
--- /dev/null
+++ b/arch/arm64/lib/checksum.c
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *
+ * Authors: Lingyan Huang <huanglingyan2@huawei.com>
+ * Copyright (C) 2018 Hisilicon, Inc. All Rights Reserved.
+ *
+ * Generic C or neon implementation of do_csum operations.
+ * Choose faster neon instructions when NEON is supported.
+ *
+ */
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <asm/checksum.h>
+
+#define CSUM_NEON_THRESHOLD 128
+
+unsigned int do_csum(const unsigned char *buff, int len)
+{
+#ifdef CONFIG_KERNEL_MODE_NEON
+ if (len >= CSUM_NEON_THRESHOLD)
+ return do_csum_arm(buff, len);
+#endif /* CONFIG_KERNEL_MODE_NEON */
+#undef do_csum
+ return 0;
+}
diff --git a/arch/arm64/lib/do_csum.c b/arch/arm64/lib/do_csum.c
new file mode 100644
index 0000000..893583f
--- /dev/null
+++ b/arch/arm64/lib/do_csum.c
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Authors: Lingyan Huang <huanglingyan2@huawei.com>
+ * Copyright (C) 2018 Hisilicon, Inc. All Rights Reserved.
+ *
+ * Generic C or neon implementation of do_csum operations.
+ * Choose faster neon instructions when NEON is supported.
+ *
+ */
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <asm/checksum.h>
+#include <asm/byteorder.h>
+#include <asm/neon-intrinsics.h>
+
+#define CSUM_NEON_THRESHOLD 128
+#ifdef CONFIG_KERNEL_MODE_NEON
+static inline u32 from64to16(u64 x)
+{
+ /* add up 32-bit and 32-bit for 32+c bit */
+ x = (x & 0xffffffff) + (x >> 32);
+ /* add up carry.. */
+ x = (x & 0xffffffff) + (x >> 32);
+ /* add up 16-bit and 16-bit for 16+c bit */
+ x = ((u32)x & 0xffff) + ((u32)x >> 16);
+ /* add up carry.. */
+ x = ((u32)x & 0xffff) + ((u32)x >> 16);
+ return x;
+}
+
+unsigned int do_csum_neon(const unsigned char *buff, int len)
+{
+ unsigned int odd, count;
+ uint64_t result = 0;
+ unsigned int count64;
+ uint32x4_t vzero = (uint32x4_t){0, 0, 0, 0};
+
+ register uint32x4_t v0, v1, v2, v3;
+
+ if (unlikely(len <= 0))
+ return result;
+
+ odd = 1 & (unsigned long)buff;
+ if (unlikely(odd)) {
+ result = *buff;
+ len--;
+ buff++;
+ }
+
+ count = len >> 1;
+ if (count) {
+ if (2 & (unsigned long)buff) {
+ result += *(unsigned short *)buff;
+ count--;
+ len -= 2;
+ buff += 2;
+ }
+ count >>= 1; /* nr of 32-bit words.. */
+ if (count) {
+ if (4 & (unsigned long)buff) {
+ result += *(unsigned int *)buff;
+ count--;
+ len -= 4;
+ buff += 4;
+ }
+ count >>= 1; /* nr of 64-bit words.. */
+
+ v0 = vzero;
+ v1 = vzero;
+ v2 = vzero;
+ v3 = vzero;
+
+ count64 = count >> 3; /* compute 64 Byte circle */
+ while (count64) {
+ v0 = vpadalq_u16(v0,
+ vld1q_u16((uint16_t *)buff + 0));
+ v1 = vpadalq_u16(v1,
+ vld1q_u16((uint16_t *)buff + 8));
+ v2 = vpadalq_u16(v2,
+ vld1q_u16((uint16_t *)buff + 16));
+ v3 = vpadalq_u16(v3,
+ vld1q_u16((uint16_t *)buff + 24));
+ buff += 64;
+ count64--;
+ }
+ v0 = vaddq_u32(v0, v1);
+ v2 = vaddq_u32(v2, v3);
+ v0 = vaddq_u32(v0, v2);
+
+ count %= 8;
+ while (count >= 2) { /* compute 16 byte circle */
+ v0 = vpadalq_u16(v0,
+ vld1q_u16((uint16_t *)buff + 0));
+ buff += 16;
+ count -= 2;
+ }
+
+ result += vgetq_lane_u32(v0, 0);
+ result += vgetq_lane_u32(v0, 1);
+ result += vgetq_lane_u32(v0, 2);
+ result += vgetq_lane_u32(v0, 3);
+ if (count & 1) {
+ result += *(unsigned long long *)buff;
+ buff += 8;
+ }
+ if (len & 4) {
+ result += *(unsigned int *)buff;
+ buff += 4;
+ }
+ }
+ if (len & 2) {
+ result += *(unsigned short *)buff;
+ buff += 2;
+ }
+ }
+ if (len & 1)
+ result += *buff;
+ result = from64to16(result);
+ if (odd)
+ result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
+ return result;
+}
+#endif
+
+
+unsigned int do_csum_arm(const unsigned char *buff, int len)
+{
+ unsigned int res;
+
+ kernel_neon_begin();
+ res = do_csum_neon(buff, len);
+ kernel_neon_end();
+ return res;
+}
+
--
2.7.4
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
next reply other threads:[~2019-01-06 1:56 UTC|newest]
Thread overview: 21+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-01-06 1:55 Lingyan Huang [this message]
2019-01-06 8:26 ` [PATCH v3] arm64: lib: accelerate do_csum with NEON instruction Ard Biesheuvel
[not found] ` <9129b882-60f3-8046-0cb9-e0b2452a118d@huawei.com>
2019-01-08 13:54 ` Will Deacon
2019-01-09 2:03 ` huanglingyan (A)
2019-01-10 4:08 ` 胡海
2019-01-10 8:14 ` huanglingyan (A)
2019-01-16 16:46 ` Will Deacon
2019-01-18 1:07 ` huanglingyan (A)
2019-01-18 11:14 ` Ard Biesheuvel
2019-02-12 2:26 ` huanglingyan (A)
2019-02-12 7:07 ` Ard Biesheuvel
2019-02-13 8:42 ` huanglingyan (A)
2019-02-13 9:15 ` Ard Biesheuvel
2019-02-13 17:55 ` Ard Biesheuvel
2019-02-14 9:57 ` huanglingyan (A)
2019-02-18 8:49 ` huanglingyan (A)
2019-02-18 9:03 ` Ard Biesheuvel
2019-01-09 14:58 ` Dave Martin
2019-01-10 8:03 ` huanglingyan (A)
2019-01-10 13:53 ` Dave Martin
[not found] <1f065749-6676-6489-14ae-fdcfeeb3389c@huawei.com>
2019-01-07 6:11 ` huanglingyan (A)
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1546739729-17234-1-git-send-email-huanglingyan2@huawei.com \
--to=huanglingyan2@huawei.com \
--cc=ard.biesheuvel@linaro.org \
--cc=catalin.marinas@arm.com \
--cc=linux-arm-kernel@lists.infradead.org \
--cc=will.deacon@arm.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).