* [PATCH v2] arm64: lib: accelerate do_csum with NEON instruction
@ 2019-01-03 12:32 Lingyan Huang
2019-01-03 18:19 ` Ard Biesheuvel
0 siblings, 1 reply; 5+ messages in thread
From: Lingyan Huang @ 2019-01-03 12:32 UTC (permalink / raw)
To: linux-arm-kernel; +Cc: Lingyan Huang
Function do_csum() in lib/checksum.c is used to compute the checksum,
which turns out to be slow and to cost a lot of resources.
Let's use NEON instructions to accelerate the checksum computation
for arm64.
------
V1 ==> V2:
Replace the NEON assembly code with NEON intrinsics built
on top of arm_neon.h, to avoid dropping into assembly.
------
Here are the comparison results for ip_compute_csum() with the
generic do_csum() versus the NEON do_csum().
len (1000 calls) generic (ns) do_csum_neon (ns)
64B: 58060 59460
128B: 82930 83930
256B: 132480 73570
512B: 230100 86230
1024B: 426600 98200
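The benchmark harness is not part of this patch; below is a hypothetical
sketch of how such numbers could be collected. The module name, buffer
contents and the measured length are all assumptions, not code from the
submission.
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/ktime.h>
#include <net/checksum.h>

static int __init csum_bench_init(void)
{
	unsigned int len = 256;		/* buffer length under test */
	unsigned char *buf = kmalloc(len, GFP_KERNEL);
	volatile __sum16 sum;		/* volatile: keep the calls alive */
	u64 t0, t1;
	int i;

	if (!buf)
		return -ENOMEM;
	memset(buf, 0xa5, len);		/* arbitrary payload */

	t0 = ktime_get_ns();
	for (i = 0; i < 1000; i++)	/* the "1000 calls" column above */
		sum = ip_compute_csum(buf, len);
	t1 = ktime_get_ns();

	pr_info("len %uB: 1000 calls took %llu ns\n", len, t1 - t0);
	kfree(buf);
	return 0;
}

static void __exit csum_bench_exit(void)
{
}

module_init(csum_bench_init);
module_exit(csum_bench_exit);
MODULE_LICENSE("GPL");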
---
arch/arm64/lib/Makefile | 4 ++
arch/arm64/lib/checksum.c | 140 +++++++++++++++++++++++++++++++++++++++++
include/asm-generic/checksum.h | 1 +
lib/checksum.c | 8 ++-
4 files changed, 152 insertions(+), 1 deletion(-)
create mode 100644 arch/arm64/lib/checksum.c
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 5540a16..ec2fcd3 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -9,6 +9,10 @@ ifeq ($(CONFIG_KERNEL_MODE_NEON), y)
obj-$(CONFIG_XOR_BLOCKS) += xor-neon.o
CFLAGS_REMOVE_xor-neon.o += -mgeneral-regs-only
CFLAGS_xor-neon.o += -ffreestanding
+
+obj-y += checksum.o
+CFLAGS_REMOVE_checksum.o += -mgeneral-regs-only
+CFLAGS_checksum.o += -ffreestanding
endif
# Tell the compiler to treat all general purpose registers (with the
diff --git a/arch/arm64/lib/checksum.c b/arch/arm64/lib/checksum.c
new file mode 100644
index 0000000..48f4ead
--- /dev/null
+++ b/arch/arm64/lib/checksum.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * arch/arm64/lib/checksum.c
+ *
+ * Authors: Lingyan Huang <huanglingyan2@huawei.com>
+ * Copyright (C) 2018 Hisilicon, Inc. All Rights Reserved.
+ *
+ * Generic C or neon implementation of do_csum operations.
+ * Choose faster neon instructions when NEON is supported.
+ *
+ */
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <asm/checksum.h>
+#include <asm/byteorder.h>
+#include <asm/neon-intrinsics.h>
+
+#define CSUM_NEON_THRESHOLD 128
+#ifdef CONFIG_KERNEL_MODE_NEON
+static inline u32 from64to16(u64 x)
+{
+ /* add up 32-bit and 32-bit for 32+c bit */
+ x = (x & 0xffffffff) + (x >> 32);
+ /* add up carry.. */
+ x = (x & 0xffffffff) + (x >> 32);
+ /* add up 16-bit and 16-bit for 16+c bit */
+ x = ((u32)x & 0xffff) + ((u32)x >> 16);
+ /* add up carry.. */
+ x = ((u32)x & 0xffff) + ((u32)x >> 16);
+ return x;
+}
+
+unsigned int do_csum_neon(const unsigned char *buff, unsigned int len)
+{
+ unsigned int odd, count;
+ uint64_t result = 0;
+ unsigned int count64;
+ uint32x4_t vzero = (uint32x4_t){0, 0, 0, 0};
+
+ register uint32x4_t v0, v1, v2, v3;
+
+ if (unlikely(len <= 0))
+ return result;
+
+ odd = 1 & (unsigned long) buff;
+ if (unlikely(odd)) {
+ result = *buff;
+ len--;
+ buff++;
+ }
+
+ count = len >> 1;
+ if (count) {
+ if (2 & (unsigned long) buff) {
+ result += *(unsigned short *)buff;
+ count--;
+ len -= 2;
+ buff += 2;
+ }
+ count >>= 1; /* nr of 32-bit words.. */
+ if (count) {
+ if (4 & (unsigned long) buff) {
+ result += *(unsigned int *) buff;
+ count--;
+ len -= 4;
+ buff += 4;
+ }
+ count >>= 1; /* nr of 64-bit words.. */
+
+ v0 = vzero;
+ v1 = vzero;
+ v2 = vzero;
+ v3 = vzero;
+
+ count64 = count >> 3; /* nr of 64-byte blocks */
+ while (count64) {
+ v0 = vpadalq_u16(v0,
+ vld1q_u16((uint16_t *)buff + 0));
+ v1 = vpadalq_u16(v1,
+ vld1q_u16((uint16_t *)buff + 8));
+ v2 = vpadalq_u16(v2,
+ vld1q_u16((uint16_t *)buff + 16));
+ v3 = vpadalq_u16(v3,
+ vld1q_u16((uint16_t *)buff + 24));
+ buff += 64;
+ count64--;
+ }
+ v0 = vaddq_u32(v0, v1);
+ v2 = vaddq_u32(v2, v3);
+ v0 = vaddq_u32(v0, v2);
+
+ count %= 8;
+ while (count >= 2) { /* handle 16-byte chunks */
+ v0 = vpadalq_u16(v0,
+ vld1q_u16((uint16_t *)buff + 0));
+ buff += 16;
+ count -= 2;
+ }
+
+ result += vgetq_lane_u32(v0, 0);
+ result += vgetq_lane_u32(v0, 1);
+ result += vgetq_lane_u32(v0, 2);
+ result += vgetq_lane_u32(v0, 3);
+ if (count & 1) {
+ result += *(unsigned long long *) buff;
+ buff += 8;
+ }
+ if (len & 4) {
+ result += *(unsigned int *) buff;
+ buff += 4;
+ }
+ }
+ if (len & 2) {
+ result += *(unsigned short *) buff;
+ buff += 2;
+ }
+ }
+ if (len & 1)
+ result += *buff;
+ result = from64to16(result);
+ if (odd)
+ result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
+ return result;
+}
+#endif
+
+
+unsigned int do_csum(const unsigned char *buff, unsigned int len)
+{
+ if ((len >= CSUM_NEON_THRESHOLD) && may_use_simd()) {
+ unsigned int res;
+
+ kernel_neon_begin();
+ res = do_csum_neon(buff, len);
+ kernel_neon_end();
+ return res;
+ } else
+ return do_csum_generic(buff, len);
+}
diff --git a/include/asm-generic/checksum.h b/include/asm-generic/checksum.h
index 34785c0..041063c 100644
--- a/include/asm-generic/checksum.h
+++ b/include/asm-generic/checksum.h
@@ -33,6 +33,7 @@ extern __wsum csum_partial_copy(const void *src, void *dst, int len, __wsum sum)
*/
extern __wsum csum_partial_copy_from_user(const void __user *src, void *dst,
int len, __wsum sum, int *csum_err);
+extern unsigned int do_csum_generic(const unsigned char *buff, int len);
#ifndef csum_partial_copy_nocheck
#define csum_partial_copy_nocheck(src, dst, len, sum) \
diff --git a/lib/checksum.c b/lib/checksum.c
index d3ec93f..83392db 100644
--- a/lib/checksum.c
+++ b/lib/checksum.c
@@ -47,7 +47,7 @@ static inline unsigned short from32to16(unsigned int x)
return x;
}
-static unsigned int do_csum(const unsigned char *buff, int len)
+unsigned int do_csum_generic(const unsigned char *buff, int len)
{
int odd;
unsigned int result = 0;
@@ -100,6 +100,12 @@ static unsigned int do_csum(const unsigned char *buff, int len)
out:
return result;
}
+
+static unsigned int do_csum(const unsigned char *buff, int len)
+{
+ return do_csum_generic(buff, len);
+}
+
#endif
#ifndef ip_fast_csum
--
2.7.4
* Re: [PATCH v2] arm64: lib: accelerate do_csum with NEON instruction
2019-01-03 12:32 [PATCH v2] arm64: lib: accelerate do_csum with NEON instruction Lingyan Huang
@ 2019-01-03 18:19 ` Ard Biesheuvel
2019-01-05 2:20 ` huanglingyan (A)
0 siblings, 1 reply; 5+ messages in thread
From: Ard Biesheuvel @ 2019-01-03 18:19 UTC (permalink / raw)
To: Lingyan Huang; +Cc: linux-arm-kernel
On Thu, 3 Jan 2019 at 13:32, Lingyan Huang <huanglingyan2@huawei.com> wrote:
>
> Function do_csum() in lib/checksum.c is used to compute the checksum,
> which turns out to be slow and to cost a lot of resources.
> Let's use NEON instructions to accelerate the checksum computation
> for arm64.
>
> ------
> V1 ==> V2:
> Replace the NEON assembly code with NEON intrinsics built
> on top of arm_neon.h, to avoid dropping into assembly.
> ------
>
> Here are the comparison results for ip_compute_csum() with the
> generic do_csum() versus the NEON do_csum().
>
> len (1000 calls) generic (ns) do_csum_neon (ns)
> 64B: 58060 59460
> 128B: 82930 83930
> 256B: 132480 73570
> 512B: 230100 86230
> 1024B: 426600 98200
>
Very nice! Which CPU did you test this on?
>
> ---
> arch/arm64/lib/Makefile | 4 ++
> arch/arm64/lib/checksum.c | 140 +++++++++++++++++++++++++++++++++++++++++
> include/asm-generic/checksum.h | 1 +
> lib/checksum.c | 8 ++-
> 4 files changed, 152 insertions(+), 1 deletion(-)
> create mode 100644 arch/arm64/lib/checksum.c
>
> diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
> index 5540a16..ec2fcd3 100644
> --- a/arch/arm64/lib/Makefile
> +++ b/arch/arm64/lib/Makefile
> @@ -9,6 +9,10 @@ ifeq ($(CONFIG_KERNEL_MODE_NEON), y)
> obj-$(CONFIG_XOR_BLOCKS) += xor-neon.o
> CFLAGS_REMOVE_xor-neon.o += -mgeneral-regs-only
> CFLAGS_xor-neon.o += -ffreestanding
> +
> +obj-y += checksum.o
> +CFLAGS_REMOVE_checksum.o += -mgeneral-regs-only
> +CFLAGS_checksum.o += -ffreestanding
> endif
>
> # Tell the compiler to treat all general purpose registers (with the
> diff --git a/arch/arm64/lib/checksum.c b/arch/arm64/lib/checksum.c
> new file mode 100644
> index 0000000..48f4ead
> --- /dev/null
> +++ b/arch/arm64/lib/checksum.c
> @@ -0,0 +1,140 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * arch/arm64/lib/checksum.c
Drop this line
> + *
> + * Authors: Lingyan Huang <huanglingyan2@huawei.com>
> + * Copyright (C) 2018 Hisilicon, Inc. All Rights Reserved.
> + *
> + * Generic C or neon implementation of do_csum operations.
> + * Choose faster neon instructions when NEON is supported.
> + *
> + */
> +
> +#include <asm/neon.h>
> +#include <asm/simd.h>
> +#include <asm/checksum.h>
> +#include <asm/byteorder.h>
> +#include <asm/neon-intrinsics.h>
> +
> +#define CSUM_NEON_THRESHOLD 128
> +#ifdef CONFIG_KERNEL_MODE_NEON
> +static inline u32 from64to16(u64 x)
> +{
> + /* add up 32-bit and 32-bit for 32+c bit */
> + x = (x & 0xffffffff) + (x >> 32);
> + /* add up carry.. */
> + x = (x & 0xffffffff) + (x >> 32);
> + /* add up 16-bit and 16-bit for 16+c bit */
> + x = ((u32)x & 0xffff) + ((u32)x >> 16);
> + /* add up carry.. */
> + x = ((u32)x & 0xffff) + ((u32)x >> 16);
> + return x;
> +}
> +
> +unsigned int do_csum_neon(const unsigned char *buff, unsigned int len)
> +{
> + unsigned int odd, count;
> + uint64_t result = 0;
> + unsigned int count64;
> + uint32x4_t vzero = (uint32x4_t){0, 0, 0, 0};
> +
> + register uint32x4_t v0, v1, v2, v3;
> +
> + if (unlikely(len <= 0))
len is unsigned so '== 0' is sufficient
> + return result;
> +
> + odd = 1 & (unsigned long) buff;
No space after () cast please
> + if (unlikely(odd)) {
> + result = *buff;
> + len--;
> + buff++;
> + }
> +
> + count = len >> 1;
> + if (count) {
> + if (2 & (unsigned long) buff) {
> + result += *(unsigned short *)buff;
> + count--;
> + len -= 2;
> + buff += 2;
> + }
> + count >>= 1; /* nr of 32-bit words.. */
> + if (count) {
> + if (4 & (unsigned long) buff) {
> + result += *(unsigned int *) buff;
> + count--;
> + len -= 4;
> + buff += 4;
> + }
> + count >>= 1; /* nr of 64-bit words.. */
> +
> + v0 = vzero;
> + v1 = vzero;
> + v2 = vzero;
> + v3 = vzero;
> +
> + count64 = count >> 3; /* nr of 64-byte blocks */
> + while (count64) {
> + v0 = vpadalq_u16(v0,
> + vld1q_u16((uint16_t *)buff + 0));
> + v1 = vpadalq_u16(v1,
> + vld1q_u16((uint16_t *)buff + 8));
> + v2 = vpadalq_u16(v2,
> + vld1q_u16((uint16_t *)buff + 16));
> + v3 = vpadalq_u16(v3,
> + vld1q_u16((uint16_t *)buff + 24));
> + buff += 64;
> + count64--;
> + }
> + v0 = vaddq_u32(v0, v1);
> + v2 = vaddq_u32(v2, v3);
> + v0 = vaddq_u32(v0, v2);
> +
> + count %= 8;
> + while (count >= 2) { /* handle 16-byte chunks */
> + v0 = vpadalq_u16(v0,
> + vld1q_u16((uint16_t *)buff + 0));
> + buff += 16;
> + count -= 2;
> + }
> +
> + result += vgetq_lane_u32(v0, 0);
> + result += vgetq_lane_u32(v0, 1);
> + result += vgetq_lane_u32(v0, 2);
> + result += vgetq_lane_u32(v0, 3);
> + if (count & 1) {
> + result += *(unsigned long long *) buff;
> + buff += 8;
> + }
> + if (len & 4) {
> + result += *(unsigned int *) buff;
> + buff += 4;
> + }
> + }
> + if (len & 2) {
> + result += *(unsigned short *) buff;
> + buff += 2;
> + }
> + }
> + if (len & 1)
> + result += *buff;
> + result = from64to16(result);
> + if (odd)
> + result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
> + return result;
> +}
> +#endif
> +
> +
> +unsigned int do_csum(const unsigned char *buff, unsigned int len)
> +{
> + if ((len >= CSUM_NEON_THRESHOLD) && may_use_simd()) {
> + unsigned int res;
> +
> + kernel_neon_begin();
> + res = do_csum_neon(buff, len);
> + kernel_neon_end();
> + return res;
> + } else
> + return do_csum_generic(buff, len);
> +}
> diff --git a/include/asm-generic/checksum.h b/include/asm-generic/checksum.h
> index 34785c0..041063c 100644
> --- a/include/asm-generic/checksum.h
> +++ b/include/asm-generic/checksum.h
> @@ -33,6 +33,7 @@ extern __wsum csum_partial_copy(const void *src, void *dst, int len, __wsum sum)
> */
> extern __wsum csum_partial_copy_from_user(const void __user *src, void *dst,
> int len, __wsum sum, int *csum_err);
> +extern unsigned int do_csum_generic(const unsigned char *buff, int len);
>
Surely, we can find a better way to hook up this code than modifying
headers under asm-generic and code in lib/checksum.c.
Is arm64 the only arch that has an optimized checksum() function?
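For example (an untested sketch): the generic do_csum() in lib/checksum.c
is already compiled under an "#ifndef do_csum" guard, so defining that
macro in the arch header would let an arm64-private version take over
without touching asm-generic or lib at all:

/* arch/arm64/include/asm/checksum.h -- sketch only */
#define do_csum do_csum
unsigned int do_csum(const unsigned char *buff, int len);

The arm64 file would then need to carry its own scalar fallback for
short buffers and for !may_use_simd(), since the generic copy would no
longer be built into lib/checksum.c.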
> #ifndef csum_partial_copy_nocheck
> #define csum_partial_copy_nocheck(src, dst, len, sum) \
> diff --git a/lib/checksum.c b/lib/checksum.c
> index d3ec93f..83392db 100644
> --- a/lib/checksum.c
> +++ b/lib/checksum.c
> @@ -47,7 +47,7 @@ static inline unsigned short from32to16(unsigned int x)
> return x;
> }
>
> -static unsigned int do_csum(const unsigned char *buff, int len)
> +unsigned int do_csum_generic(const unsigned char *buff, int len)
> {
> int odd;
> unsigned int result = 0;
> @@ -100,6 +100,12 @@ static unsigned int do_csum(const unsigned char *buff, int len)
> out:
> return result;
> }
> +
> +static unsigned int do_csum(const unsigned char *buff, int len)
> +{
> + return do_csum_generic(buff, len);
> +}
> +
> #endif
>
> #ifndef ip_fast_csum
> --
> 2.7.4
>
>
* Re: [PATCH v2] arm64: lib: accelerate do_csum with NEON instruction
2019-01-03 18:19 ` Ard Biesheuvel
@ 2019-01-05 2:20 ` huanglingyan (A)
0 siblings, 0 replies; 5+ messages in thread
From: huanglingyan (A) @ 2019-01-05 2:20 UTC (permalink / raw)
To: Ard Biesheuvel; +Cc: linux-arm-kernel
On 2019/1/4 2:19, Ard Biesheuvel wrote:
> On Thu, 3 Jan 2019 at 13:32, Lingyan Huang <huanglingyan2@huawei.com> wrote:
>> Function do_csum() in lib/checksum.c is used to compute the checksum,
>> which turns out to be slow and to cost a lot of resources.
>> Let's use NEON instructions to accelerate the checksum computation
>> for arm64.
>>
>> ------
>> V1 ==> V2:
>> Replace the NEON assembly code with NEON intrinsics built
>> on top of arm_neon.h, to avoid dropping into assembly.
>> ------
>>
>> Here are the comparison results for ip_compute_csum() with the
>> generic do_csum() versus the NEON do_csum().
>>
>> len (1000 calls) generic (ns) do_csum_neon (ns)
>> 64B: 58060 59460
>> 128B: 82930 83930
>> 256B: 132480 73570
>> 512B: 230100 86230
>> 1024B: 426600 98200
>>
> Very nice! Which CPU did you test this on?
Thank you for your reply. The test platform is Huawei hip08.
>> ---
>> arch/arm64/lib/Makefile | 4 ++
>> arch/arm64/lib/checksum.c | 140 +++++++++++++++++++++++++++++++++++++++++
>> include/asm-generic/checksum.h | 1 +
>> lib/checksum.c | 8 ++-
>> 4 files changed, 152 insertions(+), 1 deletion(-)
>> create mode 100644 arch/arm64/lib/checksum.c
>>
>> diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
>> index 5540a16..ec2fcd3 100644
>> --- a/arch/arm64/lib/Makefile
>> +++ b/arch/arm64/lib/Makefile
>> @@ -9,6 +9,10 @@ ifeq ($(CONFIG_KERNEL_MODE_NEON), y)
>> obj-$(CONFIG_XOR_BLOCKS) += xor-neon.o
>> CFLAGS_REMOVE_xor-neon.o += -mgeneral-regs-only
>> CFLAGS_xor-neon.o += -ffreestanding
>> +
>> +obj-y += checksum.o
>> +CFLAGS_REMOVE_checksum.o += -mgeneral-regs-only
>> +CFLAGS_checksum.o += -ffreestanding
>> endif
>>
>> # Tell the compiler to treat all general purpose registers (with the
>> diff --git a/arch/arm64/lib/checksum.c b/arch/arm64/lib/checksum.c
>> new file mode 100644
>> index 0000000..48f4ead
>> --- /dev/null
>> +++ b/arch/arm64/lib/checksum.c
>> @@ -0,0 +1,140 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * arch/arm64/lib/checksum.c
> Drop this line
OK.
>> + *
>> + * Authors: Lingyan Huang <huanglingyan2@huawei.com>
>> + * Copyright (C) 2018 Hisilicon, Inc. All Rights Reserved.
>> + *
>> + * Generic C or neon implementation of do_csum operations.
>> + * Choose faster neon instructions when NEON is supported.
>> + *
>> + */
>> +
>> +#include <asm/neon.h>
>> +#include <asm/simd.h>
>> +#include <asm/checksum.h>
>> +#include <asm/byteorder.h>
>> +#include <asm/neon-intrinsics.h>
>> +
>> +#define CSUM_NEON_THRESHOLD 128
>> +#ifdef CONFIG_KERNEL_MODE_NEON
>> +static inline u32 from64to16(u64 x)
>> +{
>> + /* add up 32-bit and 32-bit for 32+c bit */
>> + x = (x & 0xffffffff) + (x >> 32);
>> + /* add up carry.. */
>> + x = (x & 0xffffffff) + (x >> 32);
>> + /* add up 16-bit and 16-bit for 16+c bit */
>> + x = ((u32)x & 0xffff) + ((u32)x >> 16);
>> + /* add up carry.. */
>> + x = ((u32)x & 0xffff) + ((u32)x >> 16);
>> + return x;
>> +}
>> +
>> +unsigned int do_csum_neon(const unsigned char *buff, unsigned int len)
>> +{
>> + unsigned int odd, count;
>> + uint64_t result = 0;
>> + unsigned int count64;
>> + uint32x4_t vzero = (uint32x4_t){0, 0, 0, 0};
>> +
>> + register uint32x4_t v0, v1, v2, v3;
>> +
>> + if (unlikely(len <= 0))
> len is unsigned so '== 0' is sufficient
OK
>> + return result;
>> +
>> + odd = 1 & (unsigned long) buff;
> No space after () cast please
All such occurrences have been fixed in the v3 patch.
>> + if (unlikely(odd)) {
>> + result = *buff;
>> + len--;
>> + buff++;
>> + }
>> +
>> + count = len >> 1;
>> + if (count) {
>> + if (2 & (unsigned long) buff) {
>> + result += *(unsigned short *)buff;
>> + count--;
>> + len -= 2;
>> + buff += 2;
>> + }
>> + count >>= 1; /* nr of 32-bit words.. */
>> + if (count) {
>> + if (4 & (unsigned long) buff) {
>> + result += *(unsigned int *) buff;
>> + count--;
>> + len -= 4;
>> + buff += 4;
>> + }
>> + count >>= 1; /* nr of 64-bit words.. */
>> +
>> + v0 = vzero;
>> + v1 = vzero;
>> + v2 = vzero;
>> + v3 = vzero;
>> +
>> + count64 = count >> 3; /* nr of 64-byte blocks */
>> + while (count64) {
>> + v0 = vpadalq_u16(v0,
>> + vld1q_u16((uint16_t *)buff + 0));
>> + v1 = vpadalq_u16(v1,
>> + vld1q_u16((uint16_t *)buff + 8));
>> + v2 = vpadalq_u16(v2,
>> + vld1q_u16((uint16_t *)buff + 16));
>> + v3 = vpadalq_u16(v3,
>> + vld1q_u16((uint16_t *)buff + 24));
>> + buff += 64;
>> + count64--;
>> + }
>> + v0 = vaddq_u32(v0, v1);
>> + v2 = vaddq_u32(v2, v3);
>> + v0 = vaddq_u32(v0, v2);
>> +
>> + count %= 8;
>> + while (count >= 2) { /* handle 16-byte chunks */
>> + v0 = vpadalq_u16(v0,
>> + vld1q_u16((uint16_t *)buff + 0));
>> + buff += 16;
>> + count -= 2;
>> + }
>> +
>> + result += vgetq_lane_u32(v0, 0);
>> + result += vgetq_lane_u32(v0, 1);
>> + result += vgetq_lane_u32(v0, 2);
>> + result += vgetq_lane_u32(v0, 3);
>> + if (count & 1) {
>> + result += *(unsigned long long *) buff;
>> + buff += 8;
>> + }
>> + if (len & 4) {
>> + result += *(unsigned int *) buff;
>> + buff += 4;
>> + }
>> + }
>> + if (len & 2) {
>> + result += *(unsigned short *) buff;
>> + buff += 2;
>> + }
>> + }
>> + if (len & 1)
>> + result += *buff;
>> + result = from64to16(result);
>> + if (odd)
>> + result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
>> + return result;
>> +}
>> +#endif
>> +
>> +
>> +unsigned int do_csum(const unsigned char *buff, unsigned int len)
>> +{
>> + if ((len >= CSUM_NEON_THRESHOLD) && may_use_simd()) {
>> + unsigned int res;
>> +
>> + kernel_neon_begin();
>> + res = do_csum_neon(buff, len);
>> + kernel_neon_end();
>> + return res;
>> + } else
>> + return do_csum_generic(buff, len);
>> +}
>> diff --git a/include/asm-generic/checksum.h b/include/asm-generic/checksum.h
>> index 34785c0..041063c 100644
>> --- a/include/asm-generic/checksum.h
>> +++ b/include/asm-generic/checksum.h
>> @@ -33,6 +33,7 @@ extern __wsum csum_partial_copy(const void *src, void *dst, int len, __wsum sum)
>> */
>> extern __wsum csum_partial_copy_from_user(const void __user *src, void *dst,
>> int len, __wsum sum, int *csum_err);
>> +extern unsigned int do_csum_generic(const unsigned char *buff, int len);
>>
> Surely, we can find a better way to hook up this code than modifying
> headers under asm-generic and code in lib/checksum.c.
>
> Is arm64 the only arch that has an optimized checksum() function?
arm64 is not the only arch. Your suggestion is reasonable, so I am
looking for a better way that only touches the arm64 code.
>> #ifndef csum_partial_copy_nocheck
>> #define csum_partial_copy_nocheck(src, dst, len, sum) \
>> diff --git a/lib/checksum.c b/lib/checksum.c
>> index d3ec93f..83392db 100644
>> --- a/lib/checksum.c
>> +++ b/lib/checksum.c
>> @@ -47,7 +47,7 @@ static inline unsigned short from32to16(unsigned int x)
>> return x;
>> }
>>
>> -static unsigned int do_csum(const unsigned char *buff, int len)
>> +unsigned int do_csum_generic(const unsigned char *buff, int len)
>> {
>> int odd;
>> unsigned int result = 0;
>> @@ -100,6 +100,12 @@ static unsigned int do_csum(const unsigned char *buff, int len)
>> out:
>> return result;
>> }
>> +
>> +static unsigned int do_csum(const unsigned char *buff, int len)
>> +{
>> + return do_csum_generic(buff, len);
>> +}
>> +
>> #endif
>>
>> #ifndef ip_fast_csum
>> --
>> 2.7.4
>>
>>
* Re: [PATCH v2] arm64: lib: accelerate do_csum with NEON instruction
2019-01-03 15:44 ` Will Deacon
@ 2019-01-05 2:27 ` huanglingyan (A)
0 siblings, 0 replies; 5+ messages in thread
From: huanglingyan (A) @ 2019-01-05 2:27 UTC (permalink / raw)
To: Will Deacon; +Cc: Catalin Marinas, linux-arm-kernel
On 2019/1/3 23:44, Will Deacon wrote:
> On Wed, Jan 02, 2019 at 04:31:00PM +0800, Lingyan Huang wrote:
>> Function do_csum() in lib/checksum.c is used to compute the checksum,
>> which turns out to be slow and to cost a lot of resources.
>> Let's use NEON instructions to accelerate the checksum computation
>> for arm64.
>>
>> ------
>> V1 ==> V2:
>> Replace the NEON assembly code with NEON intrinsics built
>> on top of arm_neon.h, to avoid dropping into assembly.
>> ------
>>
>> Here are the comparison results for ip_compute_csum() with the
>> generic do_csum() versus the NEON do_csum().
>>
>> len (1000 calls) generic (ns) do_csum_neon (ns)
>> 64B: 58060 59460
>> 128B: 82930 83930
>> 256B: 132480 73570
>> 512B: 230100 86230
>> 1024B: 426600 98200
>>
>>
>> Cc: Catalin Marinas <catalin.marinas@arm.com>
>> Cc: Will Deacon <will.deacon@arm.com>
>> Signed-off-by: Lingyan Huang <huanglingyan2@huawei.com>
>> ---
>> arch/arm64/lib/Makefile | 4 ++
>> arch/arm64/lib/checksum.c | 140 +++++++++++++++++++++++++++++++++++++++++
>> include/asm-generic/checksum.h | 1 +
>> lib/checksum.c | 8 ++-
>> 4 files changed, 152 insertions(+), 1 deletion(-)
>> create mode 100644 arch/arm64/lib/checksum.c
> I received five copies of this patch :/
>
> Can you send a v3 (just once) so it's clear which is the latest version,
> please?
>
> Will
>
> .
I'm sorry for the trouble I caused; it seems my mail server had some problems. The v3
version is on the way. Looking forward to your suggestions.
Apologies again for the inconvenience.
- Lingyan Huang
* Re: [PATCH v2] arm64: lib: accelerate do_csum with NEON instruction
[not found] <1546417860-30214-1-git-send-email-huanglingyan2@huawei.com>
@ 2019-01-03 15:44 ` Will Deacon
2019-01-05 2:27 ` huanglingyan (A)
0 siblings, 1 reply; 5+ messages in thread
From: Will Deacon @ 2019-01-03 15:44 UTC (permalink / raw)
To: Lingyan Huang; +Cc: Catalin Marinas, linux-arm-kernel
On Wed, Jan 02, 2019 at 04:31:00PM +0800, Lingyan Huang wrote:
> Function do_csum() in lib/checksum.c is used to compute the checksum,
> which turns out to be slow and to cost a lot of resources.
> Let's use NEON instructions to accelerate the checksum computation
> for arm64.
>
> ------
> V1 ==> V2:
> Replace the NEON assembly code with NEON intrinsics built
> on top of arm_neon.h, to avoid dropping into assembly.
> ------
>
> Here are the comparison results for ip_compute_csum() with the
> generic do_csum() versus the NEON do_csum().
>
> len (1000 calls) generic (ns) do_csum_neon (ns)
> 64B: 58060 59460
> 128B: 82930 83930
> 256B: 132480 73570
> 512B: 230100 86230
> 1024B: 426600 98200
>
>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will.deacon@arm.com>
> Signed-off-by: Lingyan Huang <huanglingyan2@huawei.com>
> ---
> arch/arm64/lib/Makefile | 4 ++
> arch/arm64/lib/checksum.c | 140 +++++++++++++++++++++++++++++++++++++++++
> include/asm-generic/checksum.h | 1 +
> lib/checksum.c | 8 ++-
> 4 files changed, 152 insertions(+), 1 deletion(-)
> create mode 100644 arch/arm64/lib/checksum.c
I received five copies of this patch :/
Can you send a v3 (just once) so it's clear which is the latest version,
please?
Will