* [PATCH v2] arm64: lib: accelerate do_csum with NEON instruction
@ 2019-01-03 12:32 Lingyan Huang
2019-01-03 18:19 ` Ard Biesheuvel
0 siblings, 1 reply; 5+ messages in thread
From: Lingyan Huang @ 2019-01-03 12:32 UTC (permalink / raw)
To: linux-arm-kernel; +Cc: Lingyan Huang
Function do_csum() in lib/checksum.c is used to compute the checksum,
which turns out to be slow and to cost a lot of resources.
Let's use NEON instructions to accelerate the checksum computation
for arm64.
------
V1 ==> V2:
Replace the NEON assembly code with NEON intrinsics built
on top of arm_neon.h, to avoid dropping into assembly.
------
Here are the comparison results for ip_compute_csum() with the
generic do_csum() versus the NEON do_csum().
len (1000 calls) generic (ns) do_csum_neon (ns)
64B: 58060 59460
128B: 82930 83930
256B: 132480 73570
512B: 230100 86230
1024B: 426600 98200
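The benchmark harness is not part of this patch; below is a hypothetical
sketch of how such numbers could be collected. The module name, buffer
contents and the measured length are all assumptions, not code from the
submission.
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/ktime.h>
#include <net/checksum.h>

static int __init csum_bench_init(void)
{
	unsigned int len = 256;		/* buffer length under test */
	unsigned char *buf = kmalloc(len, GFP_KERNEL);
	volatile __sum16 sum;		/* volatile: keep the calls alive */
	u64 t0, t1;
	int i;

	if (!buf)
		return -ENOMEM;
	memset(buf, 0xa5, len);		/* arbitrary payload */

	t0 = ktime_get_ns();
	for (i = 0; i < 1000; i++)	/* the "1000 calls" column above */
		sum = ip_compute_csum(buf, len);
	t1 = ktime_get_ns();

	pr_info("len %uB: 1000 calls took %llu ns\n", len, t1 - t0);
	kfree(buf);
	return 0;
}

static void __exit csum_bench_exit(void)
{
}

module_init(csum_bench_init);
module_exit(csum_bench_exit);
MODULE_LICENSE("GPL");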
---
arch/arm64/lib/Makefile | 4 ++
arch/arm64/lib/checksum.c | 140 +++++++++++++++++++++++++++++++++++++++++
include/asm-generic/checksum.h | 1 +
lib/checksum.c | 8 ++-
4 files changed, 152 insertions(+), 1 deletion(-)
create mode 100644 arch/arm64/lib/checksum.c
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 5540a16..ec2fcd3 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -9,6 +9,10 @@ ifeq ($(CONFIG_KERNEL_MODE_NEON), y)
obj-$(CONFIG_XOR_BLOCKS) += xor-neon.o
CFLAGS_REMOVE_xor-neon.o += -mgeneral-regs-only
CFLAGS_xor-neon.o += -ffreestanding
+
+obj-y += checksum.o
+CFLAGS_REMOVE_checksum.o += -mgeneral-regs-only
+CFLAGS_checksum.o += -ffreestanding
endif
# Tell the compiler to treat all general purpose registers (with the
diff --git a/arch/arm64/lib/checksum.c b/arch/arm64/lib/checksum.c
new file mode 100644
index 0000000..48f4ead
--- /dev/null
+++ b/arch/arm64/lib/checksum.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * arch/arm64/lib/checksum.c
+ *
+ * Authors: Lingyan Huang <huanglingyan2@huawei.com>
+ * Copyright (C) 2018 Hisilicon, Inc. All Rights Reserved.
+ *
+ * Generic C or neon implementation of do_csum operations.
+ * Choose faster neon instructions when NEON is supported.
+ *
+ */
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <asm/checksum.h>
+#include <asm/byteorder.h>
+#include <asm/neon-intrinsics.h>
+
+#define CSUM_NEON_THRESHOLD 128
+#ifdef CONFIG_KERNEL_MODE_NEON
+static inline u32 from64to16(u64 x)
+{
+ /* add up 32-bit and 32-bit for 32+c bit */
+ x = (x & 0xffffffff) + (x >> 32);
+ /* add up carry.. */
+ x = (x & 0xffffffff) + (x >> 32);
+ /* add up 16-bit and 16-bit for 16+c bit */
+ x = ((u32)x & 0xffff) + ((u32)x >> 16);
+ /* add up carry.. */
+ x = ((u32)x & 0xffff) + ((u32)x >> 16);
+ return x;
+}
+
+unsigned int do_csum_neon(const unsigned char *buff, unsigned int len)
+{
+ unsigned int odd, count;
+ uint64_t result = 0;
+ unsigned int count64;
+ uint32x4_t vzero = (uint32x4_t){0, 0, 0, 0};
+
+ register uint32x4_t v0, v1, v2, v3;
+
+ if (unlikely(len <= 0))
+ return result;
+
+ odd = 1 & (unsigned long) buff;
+ if (unlikely(odd)) {
+ result = *buff;
+ len--;
+ buff++;
+ }
+
+ count = len >> 1;
+ if (count) {
+ if (2 & (unsigned long) buff) {
+ result += *(unsigned short *)buff;
+ count--;
+ len -= 2;
+ buff += 2;
+ }
+ count >>= 1; /* nr of 32-bit words.. */
+ if (count) {
+ if (4 & (unsigned long) buff) {
+ result += *(unsigned int *) buff;
+ count--;
+ len -= 4;
+ buff += 4;
+ }
+ count >>= 1; /* nr of 64-bit words.. */
+
+ v0 = vzero;
+ v1 = vzero;
+ v2 = vzero;
+ v3 = vzero;
+
+ count64 = count >> 3; /* nr of 64-byte blocks */
+ while (count64) {
+ v0 = vpadalq_u16(v0,
+ vld1q_u16((uint16_t *)buff + 0));
+ v1 = vpadalq_u16(v1,
+ vld1q_u16((uint16_t *)buff + 8));
+ v2 = vpadalq_u16(v2,
+ vld1q_u16((uint16_t *)buff + 16));
+ v3 = vpadalq_u16(v3,
+ vld1q_u16((uint16_t *)buff + 24));
+ buff += 64;
+ count64--;
+ }
+ v0 = vaddq_u32(v0, v1);
+ v2 = vaddq_u32(v2, v3);
+ v0 = vaddq_u32(v0, v2);
+
+ count %= 8;
+ while (count >= 2) { /* handle 16-byte chunks */
+ v0 = vpadalq_u16(v0,
+ vld1q_u16((uint16_t *)buff + 0));
+ buff += 16;
+ count -= 2;
+ }
+
+ result += vgetq_lane_u32(v0, 0);
+ result += vgetq_lane_u32(v0, 1);
+ result += vgetq_lane_u32(v0, 2);
+ result += vgetq_lane_u32(v0, 3);
+ if (count & 1) {
+ result += *(unsigned long long *) buff;
+ buff += 8;
+ }
+ if (len & 4) {
+ result += *(unsigned int *) buff;
+ buff += 4;
+ }
+ }
+ if (len & 2) {
+ result += *(unsigned short *) buff;
+ buff += 2;
+ }
+ }
+ if (len & 1)
+ result += *buff;
+ result = from64to16(result);
+ if (odd)
+ result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
+ return result;
+}
+#endif
+
+
+unsigned int do_csum(const unsigned char *buff, unsigned int len)
+{
+ if ((len >= CSUM_NEON_THRESHOLD) && may_use_simd()) {
+ unsigned int res;
+
+ kernel_neon_begin();
+ res = do_csum_neon(buff, len);
+ kernel_neon_end();
+ return res;
+ } else
+ return do_csum_generic(buff, len);
+}
diff --git a/include/asm-generic/checksum.h b/include/asm-generic/checksum.h
index 34785c0..041063c 100644
--- a/include/asm-generic/checksum.h
+++ b/include/asm-generic/checksum.h
@@ -33,6 +33,7 @@ extern __wsum csum_partial_copy(const void *src, void *dst, int len, __wsum sum)
*/
extern __wsum csum_partial_copy_from_user(const void __user *src, void *dst,
int len, __wsum sum, int *csum_err);
+extern unsigned int do_csum_generic(const unsigned char *buff, int len);
#ifndef csum_partial_copy_nocheck
#define csum_partial_copy_nocheck(src, dst, len, sum) \
diff --git a/lib/checksum.c b/lib/checksum.c
index d3ec93f..83392db 100644
--- a/lib/checksum.c
+++ b/lib/checksum.c
@@ -47,7 +47,7 @@ static inline unsigned short from32to16(unsigned int x)
return x;
}
-static unsigned int do_csum(const unsigned char *buff, int len)
+unsigned int do_csum_generic(const unsigned char *buff, int len)
{
int odd;
unsigned int result = 0;
@@ -100,6 +100,12 @@ static unsigned int do_csum(const unsigned char *buff, int len)
out:
return result;
}
+
+static unsigned int do_csum(const unsigned char *buff, int len)
+{
+ return do_csum_generic(buff, len);
+}
+
#endif
#ifndef ip_fast_csum
--
2.7.4
* Re: [PATCH v2] arm64: lib: accelerate do_csum with NEON instruction
2019-01-03 12:32 [PATCH v2] arm64: lib: accelerate do_csum with NEON instruction Lingyan Huang
@ 2019-01-03 18:19 ` Ard Biesheuvel
2019-01-05 2:20 ` huanglingyan (A)
0 siblings, 1 reply; 5+ messages in thread
From: Ard Biesheuvel @ 2019-01-03 18:19 UTC (permalink / raw)
To: Lingyan Huang; +Cc: linux-arm-kernel
On Thu, 3 Jan 2019 at 13:32, Lingyan Huang <huanglingyan2@huawei.com> wrote:
>
> Function do_csum() in lib/checksum.c is used to compute the checksum,
> which turns out to be slow and to cost a lot of resources.
> Let's use NEON instructions to accelerate the checksum computation
> for arm64.
>
> ------
> V1 ==> V2:
> Replace the NEON assembly code with NEON intrinsics built
> on top of arm_neon.h, to avoid dropping into assembly.
> ------
>
> Here are the comparison results for ip_compute_csum() with the
> generic do_csum() versus the NEON do_csum().
>
> len (1000 calls) generic (ns) do_csum_neon (ns)
> 64B: 58060 59460
> 128B: 82930 83930
> 256B: 132480 73570
> 512B: 230100 86230
> 1024B: 426600 98200
>
Very nice! Which CPU did you test this on?
>
> ---
> arch/arm64/lib/Makefile | 4 ++
> arch/arm64/lib/checksum.c | 140 +++++++++++++++++++++++++++++++++++++++++
> include/asm-generic/checksum.h | 1 +
> lib/checksum.c | 8 ++-
> 4 files changed, 152 insertions(+), 1 deletion(-)
> create mode 100644 arch/arm64/lib/checksum.c
>
> diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
> index 5540a16..ec2fcd3 100644
> --- a/arch/arm64/lib/Makefile
> +++ b/arch/arm64/lib/Makefile
> @@ -9,6 +9,10 @@ ifeq ($(CONFIG_KERNEL_MODE_NEON), y)
> obj-$(CONFIG_XOR_BLOCKS) += xor-neon.o
> CFLAGS_REMOVE_xor-neon.o += -mgeneral-regs-only
> CFLAGS_xor-neon.o += -ffreestanding
> +
> +obj-y += checksum.o
> +CFLAGS_REMOVE_checksum.o += -mgeneral-regs-only
> +CFLAGS_checksum.o += -ffreestanding
> endif
>
> # Tell the compiler to treat all general purpose registers (with the
> diff --git a/arch/arm64/lib/checksum.c b/arch/arm64/lib/checksum.c
> new file mode 100644
> index 0000000..48f4ead
> --- /dev/null
> +++ b/arch/arm64/lib/checksum.c
> @@ -0,0 +1,140 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * arch/arm64/lib/checksum.c
Drop this line
> + *
> + * Authors: Lingyan Huang <huanglingyan2@huawei.com>
> + * Copyright (C) 2018 Hisilicon, Inc. All Rights Reserved.
> + *
> + * Generic C or neon implementation of do_csum operations.
> + * Choose faster neon instructions when NEON is supported.
> + *
> + */
> +
> +#include <asm/neon.h>
> +#include <asm/simd.h>
> +#include <asm/checksum.h>
> +#include <asm/byteorder.h>
> +#include <asm/neon-intrinsics.h>
> +
> +#define CSUM_NEON_THRESHOLD 128
> +#ifdef CONFIG_KERNEL_MODE_NEON
> +static inline u32 from64to16(u64 x)
> +{
> + /* add up 32-bit and 32-bit for 32+c bit */
> + x = (x & 0xffffffff) + (x >> 32);
> + /* add up carry.. */
> + x = (x & 0xffffffff) + (x >> 32);
> + /* add up 16-bit and 16-bit for 16+c bit */
> + x = ((u32)x & 0xffff) + ((u32)x >> 16);
> + /* add up carry.. */
> + x = ((u32)x & 0xffff) + ((u32)x >> 16);
> + return x;
> +}
> +
> +unsigned int do_csum_neon(const unsigned char *buff, unsigned int len)
> +{
> + unsigned int odd, count;
> + uint64_t result = 0;
> + unsigned int count64;
> + uint32x4_t vzero = (uint32x4_t){0, 0, 0, 0};
> +
> + register uint32x4_t v0, v1, v2, v3;
> +
> + if (unlikely(len <= 0))
len is unsigned so '== 0' is sufficient
> + return result;
> +
> + odd = 1 & (unsigned long) buff;
No space after () cast please
> + if (unlikely(odd)) {
> + result = *buff;
> + len--;
> + buff++;
> + }
> +
> + count = len >> 1;
> + if (count) {
> + if (2 & (unsigned long) buff) {
> + result += *(unsigned short *)buff;
> + count--;
> + len -= 2;
> + buff += 2;
> + }
> + count >>= 1; /* nr of 32-bit words.. */
> + if (count) {
> + if (4 & (unsigned long) buff) {
> + result += *(unsigned int *) buff;
> + count--;
> + len -= 4;
> + buff += 4;
> + }
> + count >>= 1; /* nr of 64-bit words.. */
> +
> + v0 = vzero;
> + v1 = vzero;
> + v2 = vzero;
> + v3 = vzero;
> +
> + count64 = count >> 3; /* nr of 64-byte blocks */
> + while (count64) {
> + v0 = vpadalq_u16(v0,
> + vld1q_u16((uint16_t *)buff + 0));
> + v1 = vpadalq_u16(v1,
> + vld1q_u16((uint16_t *)buff + 8));
> + v2 = vpadalq_u16(v2,
> + vld1q_u16((uint16_t *)buff + 16));
> + v3 = vpadalq_u16(v3,
> + vld1q_u16((uint16_t *)buff + 24));
> + buff += 64;
> + count64--;
> + }
> + v0 = vaddq_u32(v0, v1);
> + v2 = vaddq_u32(v2, v3);
> + v0 = vaddq_u32(v0, v2);
> +
> + count %= 8;
> + while (count >= 2) { /* handle 16-byte chunks */
> + v0 = vpadalq_u16(v0,
> + vld1q_u16((uint16_t *)buff + 0));
> + buff += 16;
> + count -= 2;
> + }
> +
> + result += vgetq_lane_u32(v0, 0);
> + result += vgetq_lane_u32(v0, 1);
> + result += vgetq_lane_u32(v0, 2);
> + result += vgetq_lane_u32(v0, 3);
> + if (count & 1) {
> + result += *(unsigned long long *) buff;
> + buff += 8;
> + }
> + if (len & 4) {
> + result += *(unsigned int *) buff;
> + buff += 4;
> + }
> + }
> + if (len & 2) {
> + result += *(unsigned short *) buff;
> + buff += 2;
> + }
> + }
> + if (len & 1)
> + result += *buff;
> + result = from64to16(result);
> + if (odd)
> + result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
> + return result;
> +}
> +#endif
> +
> +
> +unsigned int do_csum(const unsigned char *buff, unsigned int len)
> +{
> + if ((len >= CSUM_NEON_THRESHOLD) && may_use_simd()) {
> + unsigned int res;
> +
> + kernel_neon_begin();
> + res = do_csum_neon(buff, len);
> + kernel_neon_end();
> + return res;
> + } else
> + return do_csum_generic(buff, len);
> +}
> diff --git a/include/asm-generic/checksum.h b/include/asm-generic/checksum.h
> index 34785c0..041063c 100644
> --- a/include/asm-generic/checksum.h
> +++ b/include/asm-generic/checksum.h
> @@ -33,6 +33,7 @@ extern __wsum csum_partial_copy(const void *src, void *dst, int len, __wsum sum)
> */
> extern __wsum csum_partial_copy_from_user(const void __user *src, void *dst,
> int len, __wsum sum, int *csum_err);
> +extern unsigned int do_csum_generic(const unsigned char *buff, int len);
>
Surely, we can find a better way to hook up this code than modifying
headers under asm-generic and code in lib/checksum.c.
Is arm64 the only arch that has an optimized checksum() function?
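For example (an untested sketch): the generic do_csum() in lib/checksum.c
is already compiled under an "#ifndef do_csum" guard, so defining that
macro in the arch header would let an arm64-private version take over
without touching asm-generic or lib at all:

/* arch/arm64/include/asm/checksum.h -- sketch only */
#define do_csum do_csum
unsigned int do_csum(const unsigned char *buff, int len);

The arm64 file would then need to carry its own scalar fallback for
short buffers and for !may_use_simd(), since the generic copy would no
longer be built into lib/checksum.c.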
> #ifndef csum_partial_copy_nocheck
> #define csum_partial_copy_nocheck(src, dst, len, sum) \
> diff --git a/lib/checksum.c b/lib/checksum.c
> index d3ec93f..83392db 100644
> --- a/lib/checksum.c
> +++ b/lib/checksum.c
> @@ -47,7 +47,7 @@ static inline unsigned short from32to16(unsigned int x)
> return x;
> }
>
> -static unsigned int do_csum(const unsigned char *buff, int len)
> +unsigned int do_csum_generic(const unsigned char *buff, int len)
> {
> int odd;
> unsigned int result = 0;
> @@ -100,6 +100,12 @@ static unsigned int do_csum(const unsigned char *buff, int len)
> out:
> return result;
> }
> +
> +static unsigned int do_csum(const unsigned char *buff, int len)
> +{
> + return do_csum_generic(buff, len);
> +}
> +
> #endif
>
> #ifndef ip_fast_csum
> --
> 2.7.4
>
>
* Re: [PATCH v2] arm64: lib: accelerate do_csum with NEON instruction
2019-01-03 18:19 ` Ard Biesheuvel
@ 2019-01-05 2:20 ` huanglingyan (A)
0 siblings, 0 replies; 5+ messages in thread
From: huanglingyan (A) @ 2019-01-05 2:20 UTC (permalink / raw)
To: Ard Biesheuvel; +Cc: linux-arm-kernel
On 2019/1/4 2:19, Ard Biesheuvel wrote:
> On Thu, 3 Jan 2019 at 13:32, Lingyan Huang <huanglingyan2@huawei.com> wrote:
>> Function do_csum() in lib/checksum.c is used to compute the checksum,
>> which turns out to be slow and to cost a lot of resources.
>> Let's use NEON instructions to accelerate the checksum computation
>> for arm64.
>>
>> ------
>> V1 ==> V2:
>> Replace the NEON assembly code with NEON intrinsics built
>> on top of arm_neon.h, to avoid dropping into assembly.
>> ------
>>
>> Here are the comparison results for ip_compute_csum() with the
>> generic do_csum() versus the NEON do_csum().
>>
>> len (1000 calls) generic (ns) do_csum_neon (ns)
>> 64B: 58060 59460
>> 128B: 82930 83930
>> 256B: 132480 73570
>> 512B: 230100 86230
>> 1024B: 426600 98200
>>
> Very nice! Which CPU did you test this on?
Thank you for your reply. The test platform is Huawei hip08.
>> ---
>> arch/arm64/lib/Makefile | 4 ++
>> arch/arm64/lib/checksum.c | 140 +++++++++++++++++++++++++++++++++++++++++
>> include/asm-generic/checksum.h | 1 +
>> lib/checksum.c | 8 ++-
>> 4 files changed, 152 insertions(+), 1 deletion(-)
>> create mode 100644 arch/arm64/lib/checksum.c
>>
>> diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
>> index 5540a16..ec2fcd3 100644
>> --- a/arch/arm64/lib/Makefile
>> +++ b/arch/arm64/lib/Makefile
>> @@ -9,6 +9,10 @@ ifeq ($(CONFIG_KERNEL_MODE_NEON), y)
>> obj-$(CONFIG_XOR_BLOCKS) += xor-neon.o
>> CFLAGS_REMOVE_xor-neon.o += -mgeneral-regs-only
>> CFLAGS_xor-neon.o += -ffreestanding
>> +
>> +obj-y += checksum.o
>> +CFLAGS_REMOVE_checksum.o += -mgeneral-regs-only
>> +CFLAGS_checksum.o += -ffreestanding
>> endif
>>
>> # Tell the compiler to treat all general purpose registers (with the
>> diff --git a/arch/arm64/lib/checksum.c b/arch/arm64/lib/checksum.c
>> new file mode 100644
>> index 0000000..48f4ead
>> --- /dev/null
>> +++ b/arch/arm64/lib/checksum.c
>> @@ -0,0 +1,140 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * arch/arm64/lib/checksum.c
> Drop this line
OK.
>> + *
>> + * Authors: Lingyan Huang <huanglingyan2@huawei.com>
>> + * Copyright (C) 2018 Hisilicon, Inc. All Rights Reserved.
>> + *
>> + * Generic C or neon implementation of do_csum operations.
>> + * Choose faster neon instructions when NEON is supported.
>> + *
>> + */
>> +
>> +#include <asm/neon.h>
>> +#include <asm/simd.h>
>> +#include <asm/checksum.h>
>> +#include <asm/byteorder.h>
>> +#include <asm/neon-intrinsics.h>
>> +
>> +#define CSUM_NEON_THRESHOLD 128
>> +#ifdef CONFIG_KERNEL_MODE_NEON
>> +static inline u32 from64to16(u64 x)
>> +{
>> + /* add up 32-bit and 32-bit for 32+c bit */
>> + x = (x & 0xffffffff) + (x >> 32);
>> + /* add up carry.. */
>> + x = (x & 0xffffffff) + (x >> 32);
>> + /* add up 16-bit and 16-bit for 16+c bit */
>> + x = ((u32)x & 0xffff) + ((u32)x >> 16);
>> + /* add up carry.. */
>> + x = ((u32)x & 0xffff) + ((u32)x >> 16);
>> + return x;
>> +}
>> +
>> +unsigned int do_csum_neon(const unsigned char *buff, unsigned int len)
>> +{
>> + unsigned int odd, count;
>> + uint64_t result = 0;
>> + unsigned int count64;
>> + uint32x4_t vzero = (uint32x4_t){0, 0, 0, 0};
>> +
>> + register uint32x4_t v0, v1, v2, v3;
>> +
>> + if (unlikely(len <= 0))
> len is unsigned so '== 0' is sufficient
OK
>> + return result;
>> +
>> + odd = 1 & (unsigned long) buff;
> No space after () cast please
All such occurrences have been fixed in the v3 patch.
>> + if (unlikely(odd)) {
>> + result = *buff;
>> + len--;
>> + buff++;
>> + }
>> +
>> + count = len >> 1;
>> + if (count) {
>> + if (2 & (unsigned long) buff) {
>> + result += *(unsigned short *)buff;
>> + count--;
>> + len -= 2;
>> + buff += 2;
>> + }
>> + count >>= 1; /* nr of 32-bit words.. */
>> + if (count) {
>> + if (4 & (unsigned long) buff) {
>> + result += *(unsigned int *) buff;
>> + count--;
>> + len -= 4;
>> + buff += 4;
>> + }
>> + count >>= 1; /* nr of 64-bit words.. */
>> +
>> + v0 = vzero;
>> + v1 = vzero;
>> + v2 = vzero;
>> + v3 = vzero;
>> +
>> + count64 = count >> 3; /* nr of 64-byte blocks */
>> + while (count64) {
>> + v0 = vpadalq_u16(v0,
>> + vld1q_u16((uint16_t *)buff + 0));
>> + v1 = vpadalq_u16(v1,
>> + vld1q_u16((uint16_t *)buff + 8));
>> + v2 = vpadalq_u16(v2,
>> + vld1q_u16((uint16_t *)buff + 16));
>> + v3 = vpadalq_u16(v3,
>> + vld1q_u16((uint16_t *)buff + 24));
>> + buff += 64;
>> + count64--;
>> + }
>> + v0 = vaddq_u32(v0, v1);
>> + v2 = vaddq_u32(v2, v3);
>> + v0 = vaddq_u32(v0, v2);
>> +
>> + count %= 8;
>> + while (count >= 2) { /* handle 16-byte chunks */
>> + v0 = vpadalq_u16(v0,
>> + vld1q_u16((uint16_t *)buff + 0));
>> + buff += 16;
>> + count -= 2;
>> + }
>> +
>> + result += vgetq_lane_u32(v0, 0);
>> + result += vgetq_lane_u32(v0, 1);
>> + result += vgetq_lane_u32(v0, 2);
>> + result += vgetq_lane_u32(v0, 3);
>> + if (count & 1) {
>> + result += *(unsigned long long *) buff;
>> + buff += 8;
>> + }
>> + if (len & 4) {
>> + result += *(unsigned int *) buff;
>> + buff += 4;
>> + }
>> + }
>> + if (len & 2) {
>> + result += *(unsigned short *) buff;
>> + buff += 2;
>> + }
>> + }
>> + if (len & 1)
>> + result += *buff;
>> + result = from64to16(result);
>> + if (odd)
>> + result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
>> + return result;
>> +}
>> +#endif
>> +
>> +
>> +unsigned int do_csum(const unsigned char *buff, unsigned int len)
>> +{
>> + if ((len >= CSUM_NEON_THRESHOLD) && may_use_simd()) {
>> + unsigned int res;
>> +
>> + kernel_neon_begin();
>> + res = do_csum_neon(buff, len);
>> + kernel_neon_end();
>> + return res;
>> + } else
>> + return do_csum_generic(buff, len);
>> +}
>> diff --git a/include/asm-generic/checksum.h b/include/asm-generic/checksum.h
>> index 34785c0..041063c 100644
>> --- a/include/asm-generic/checksum.h
>> +++ b/include/asm-generic/checksum.h
>> @@ -33,6 +33,7 @@ extern __wsum csum_partial_copy(const void *src, void *dst, int len, __wsum sum)
>> */
>> extern __wsum csum_partial_copy_from_user(const void __user *src, void *dst,
>> int len, __wsum sum, int *csum_err);
>> +extern unsigned int do_csum_generic(const unsigned char *buff, int len);
>>
> Surely, we can find a better way to hook up this code than modifying
> headers under asm-generic and code in lib/checksum.c.
>
> Is arm64 the only arch that has an optimized checksum() function?
arm64 is not the only arch. Your suggestion is reasonable, so I am
looking for a better way that only touches the arm64 code.
>> #ifndef csum_partial_copy_nocheck
>> #define csum_partial_copy_nocheck(src, dst, len, sum) \
>> diff --git a/lib/checksum.c b/lib/checksum.c
>> index d3ec93f..83392db 100644
>> --- a/lib/checksum.c
>> +++ b/lib/checksum.c
>> @@ -47,7 +47,7 @@ static inline unsigned short from32to16(unsigned int x)
>> return x;
>> }
>>
>> -static unsigned int do_csum(const unsigned char *buff, int len)
>> +unsigned int do_csum_generic(const unsigned char *buff, int len)
>> {
>> int odd;
>> unsigned int result = 0;
>> @@ -100,6 +100,12 @@ static unsigned int do_csum(const unsigned char *buff, int len)
>> out:
>> return result;
>> }
>> +
>> +static unsigned int do_csum(const unsigned char *buff, int len)
>> +{
>> + return do_csum_generic(buff, len);
>> +}
>> +
>> #endif
>>
>> #ifndef ip_fast_csum
>> --
>> 2.7.4
>>
>>
* Re: [PATCH v2] arm64: lib: accelerate do_csum with NEON instruction
2019-01-03 15:44 ` Will Deacon
@ 2019-01-05 2:27 ` huanglingyan (A)
0 siblings, 0 replies; 5+ messages in thread
From: huanglingyan (A) @ 2019-01-05 2:27 UTC (permalink / raw)
To: Will Deacon; +Cc: Catalin Marinas, linux-arm-kernel
On 2019/1/3 23:44, Will Deacon wrote:
> On Wed, Jan 02, 2019 at 04:31:00PM +0800, Lingyan Huang wrote:
>> Function do_csum() in lib/checksum.c is used to compute the checksum,
>> which turns out to be slow and to cost a lot of resources.
>> Let's use NEON instructions to accelerate the checksum computation
>> for arm64.
>>
>> ------
>> V1 ==> V2:
>> Replace the NEON assembly code with NEON intrinsics built
>> on top of arm_neon.h, to avoid dropping into assembly.
>> ------
>>
>> Here are the comparison results for ip_compute_csum() with the
>> generic do_csum() versus the NEON do_csum().
>>
>> len (1000 calls) generic (ns) do_csum_neon (ns)
>> 64B: 58060 59460
>> 128B: 82930 83930
>> 256B: 132480 73570
>> 512B: 230100 86230
>> 1024B: 426600 98200
>>
>>
>> Cc: Catalin Marinas <catalin.marinas@arm.com>
>> Cc: Will Deacon <will.deacon@arm.com>
>> Signed-off-by: Lingyan Huang <huanglingyan2@huawei.com>
>> ---
>> arch/arm64/lib/Makefile | 4 ++
>> arch/arm64/lib/checksum.c | 140 +++++++++++++++++++++++++++++++++++++++++
>> include/asm-generic/checksum.h | 1 +
>> lib/checksum.c | 8 ++-
>> 4 files changed, 152 insertions(+), 1 deletion(-)
>> create mode 100644 arch/arm64/lib/checksum.c
> I received five copies of this patch :/
>
> Can you send a v3 (just once) so it's clear which is the latest version,
> please?
>
> Will
>
> .
I'm sorry for the trouble I caused; it seems my mail server had some problems. The v3
version is on the way. Looking forward to your suggestions.
Apologies again for the inconvenience.
- Lingyan Huang
* Re: [PATCH v2] arm64: lib: accelerate do_csum with NEON instruction
[not found] <1546417860-30214-1-git-send-email-huanglingyan2@huawei.com>
@ 2019-01-03 15:44 ` Will Deacon
2019-01-05 2:27 ` huanglingyan (A)
0 siblings, 1 reply; 5+ messages in thread
From: Will Deacon @ 2019-01-03 15:44 UTC (permalink / raw)
To: Lingyan Huang; +Cc: Catalin Marinas, linux-arm-kernel
On Wed, Jan 02, 2019 at 04:31:00PM +0800, Lingyan Huang wrote:
> Function do_csum() in lib/checksum.c is used to compute the checksum,
> which turns out to be slow and to cost a lot of resources.
> Let's use NEON instructions to accelerate the checksum computation
> for arm64.
>
> ------
> V1 ==> V2:
> Replace the NEON assembly code with NEON intrinsics built
> on top of arm_neon.h, to avoid dropping into assembly.
> ------
>
> Here are the comparison results for ip_compute_csum() with the
> generic do_csum() versus the NEON do_csum().
>
> len (1000 calls) generic (ns) do_csum_neon (ns)
> 64B: 58060 59460
> 128B: 82930 83930
> 256B: 132480 73570
> 512B: 230100 86230
> 1024B: 426600 98200
>
>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will.deacon@arm.com>
> Signed-off-by: Lingyan Huang <huanglingyan2@huawei.com>
> ---
> arch/arm64/lib/Makefile | 4 ++
> arch/arm64/lib/checksum.c | 140 +++++++++++++++++++++++++++++++++++++++++
> include/asm-generic/checksum.h | 1 +
> lib/checksum.c | 8 ++-
> 4 files changed, 152 insertions(+), 1 deletion(-)
> create mode 100644 arch/arm64/lib/checksum.c
I received five copies of this patch :/
Can you send a v3 (just once) so it's clear which is the latest version,
please?
Will