Re: [PATCH v3] arm64: lib: accelerate do_csum with NEON instruction

From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
To: Lingyan Huang <huanglingyan2@huawei.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>,
	Will Deacon <will.deacon@arm.com>,
	linux-arm-kernel <linux-arm-kernel@lists.infradead.org>
Subject: Re: [PATCH v3] arm64: lib: accelerate do_csum with NEON instruction
Date: Sun, 6 Jan 2019 09:26:39 +0100	[thread overview]
Message-ID: <CAKv+Gu_wsK1UNXSp=5Hvd7XCHKC3cVkYjTYHhvqM4Xt09A9iKg@mail.gmail.com> (raw)
In-Reply-To: <1546739729-17234-1-git-send-email-huanglingyan2@huawei.com>

On Sun, 6 Jan 2019 at 02:56, Lingyan Huang <huanglingyan2@huawei.com> wrote:
>
> Function do_csum() in lib/checksum.c is used to compute checksum,
> which is turned out to be slowly and costs a lot of resources.
> Let's use neon instructions to accelerate the checksum computation
> for arm64.
>
> ------
> V2 ==> V3:
>     only modify the arm64 codes instead of modifying headers
>     under asm-generic and code in lib/checksum.c.
> ------
> ------
> V1 ==> V2:
>     Change NEON assembly code to NEON intrinsic code which is built
>     on top of arm_neon.h to avoid dropping into assembly.
> ------
>

Please put the changelog between the '---' line below and the diffstat, so that it does not end up in the commit log.

> Here is the comparison results of function ip_compute_csum() between
> general do_csum() and neon instruction do_csum(). The test platform
> is HUAWEI 1620 server with TAISHAN cores.
>
>         len(1000cycle)      general(ns)        do_csum_neon(ns)
>           64B:                58060                 59460
>          128B:                82930                 83930
>          256B:               132480                 73570
>          512B:               230100                 86230
>         1024B:               426600                 98200
>

Again, very nice performance. How did you test for correctness?

>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will.deacon@arm.com>
> Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> Signed-off-by: Lingyan Huang <huanglingyan2@huawei.com>
> ---
>  arch/arm64/include/asm/checksum.h |   5 ++
>  arch/arm64/lib/Makefile           |   8 +--
>  arch/arm64/lib/checksum.c         |  26 ++++++++
>  arch/arm64/lib/do_csum.c          | 136 ++++++++++++++++++++++++++++++++++++++
>  4 files changed, 171 insertions(+), 4 deletions(-)
>  create mode 100644 arch/arm64/lib/checksum.c
>  create mode 100644 arch/arm64/lib/do_csum.c
>
> diff --git a/arch/arm64/include/asm/checksum.h b/arch/arm64/include/asm/checksum.h
> index 0b6f5a7..7acd713 100644
> --- a/arch/arm64/include/asm/checksum.h
> +++ b/arch/arm64/include/asm/checksum.h
> @@ -26,6 +26,10 @@ static inline __sum16 csum_fold(__wsum csum)
>  }
>  #define csum_fold csum_fold
>
> +#define do_csum do_csum
> +unsigned int do_csum(const unsigned char *buff, int len);
> +extern unsigned int do_csum_arm(const unsigned char *buff, int len);
> +
>  static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
>  {
>         __uint128_t tmp;
> @@ -46,6 +50,7 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
>  }
>  #define ip_fast_csum ip_fast_csum
>
> +

Drop this whitespace-only change

>  #include <asm-generic/checksum.h>
>
>  #endif /* __ASM_CHECKSUM_H */
> diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
> index 5540a16..c0b5b8c 100644
> --- a/arch/arm64/lib/Makefile
> +++ b/arch/arm64/lib/Makefile
> @@ -3,12 +3,12 @@ lib-y         := clear_user.o delay.o copy_from_user.o                \
>                    copy_to_user.o copy_in_user.o copy_page.o            \
>                    clear_page.o memchr.o memcpy.o memmove.o memset.o    \
>                    memcmp.o strcmp.o strncmp.o strlen.o strnlen.o       \
> -                  strchr.o strrchr.o tishift.o
> +                  strchr.o strrchr.o tishift.o checksum.o
>
>  ifeq ($(CONFIG_KERNEL_MODE_NEON), y)
> -obj-$(CONFIG_XOR_BLOCKS)       += xor-neon.o
> -CFLAGS_REMOVE_xor-neon.o       += -mgeneral-regs-only
> -CFLAGS_xor-neon.o              += -ffreestanding
> +obj-y    += do_csum.o

Please align the indentation with the other entries.

> +CFLAGS_REMOVE_do_csum.o        += -mgeneral-regs-only
> +CFLAGS_do_csum.o               += -ffreestanding
>  endif
>
>  # Tell the compiler to treat all general purpose registers (with the
> diff --git a/arch/arm64/lib/checksum.c b/arch/arm64/lib/checksum.c
> new file mode 100644
> index 0000000..15a31bb
> --- /dev/null
> +++ b/arch/arm64/lib/checksum.c
> @@ -0,0 +1,26 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + *
> + * Authors: Lingyan Huang <huanglingyan2@huawei.com>
> + * Copyright (C) 2018 Hisilicon, Inc. All Rights Reserved.
> + *
> + * Generic C or neon implementation of do_csum operations.
> + * Choose faster neon instructions when NEON is supported.
> + *
> + */
> +
> +#include <asm/neon.h>
> +#include <asm/simd.h>
> +#include <asm/checksum.h>
> +
> +#define CSUM_NEON_THRESHOLD 128
> +
> +unsigned int do_csum(const unsigned char *buff, int len)
> +{
> +#ifdef CONFIG_KERNEL_MODE_NEON
> +       if (len >= CSUM_NEON_THRESHOLD)

Please change this into

if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
    len >= CSUM_NEON_THRESHOLD &&
    may_use_simd()) {
        kernel_neon_begin();
        res = do_csum_neon(buff, len);
        kernel_neon_end();
}

and drop the intermediate do_csum_arm()

> +               return do_csum_arm(buff, len);
> +#endif  /* CONFIG_KERNEL_MODE_NEON */

No else? What happens if len < CSUM_NEON_THRESHOLD?

> +#undef do_csum

Can we drop this?

> +       return 0;
> +}
> diff --git a/arch/arm64/lib/do_csum.c b/arch/arm64/lib/do_csum.c
> new file mode 100644
> index 0000000..893583f
> --- /dev/null
> +++ b/arch/arm64/lib/do_csum.c
> @@ -0,0 +1,136 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Authors: Lingyan Huang <huanglingyan2@huawei.com>
> + * Copyright (C) 2018 Hisilicon, Inc. All Rights Reserved.
> + *
> + * Generic C or neon implementation of do_csum operations.
> + * Choose faster neon instructions when NEON is supported.
> + *
> + */
> +
> +#include <asm/neon.h>
> +#include <asm/simd.h>
> +#include <asm/checksum.h>
> +#include <asm/byteorder.h>
> +#include <asm/neon-intrinsics.h>
> +
> +#define CSUM_NEON_THRESHOLD 128

Drop this - it is not used in this file

> +#ifdef CONFIG_KERNEL_MODE_NEON

This file is only built if KERNEL_MODE_NEON=y so the #ifdef can be dropped

> +static inline u32 from64to16(u64 x)
> +{
> +       /* add up 32-bit and 32-bit for 32+c bit */
> +       x = (x & 0xffffffff) + (x >> 32);
> +       /* add up carry.. */
> +       x = (x & 0xffffffff) + (x >> 32);
> +       /* add up 16-bit and 16-bit for 16+c bit */
> +       x = ((u32)x & 0xffff) + ((u32)x >> 16);
> +       /* add up carry.. */
> +       x = ((u32)x & 0xffff) + ((u32)x >> 16);
> +       return x;
> +}
> +
> +unsigned int do_csum_neon(const unsigned char *buff, int len)
> +{
> +       unsigned int odd, count;
> +       uint64_t result = 0;
> +       unsigned int count64;
> +       uint32x4_t vzero = (uint32x4_t){0, 0, 0, 0};
> +
> +       register uint32x4_t v0, v1, v2, v3;
> +
> +       if (unlikely(len <= 0))
> +               return result;
> +
> +       odd = 1 & (unsigned long)buff;
> +       if (unlikely(odd)) {
> +               result = *buff;
> +               len--;
> +               buff++;
> +       }
> +
> +       count = len >> 1;
> +       if (count) {
> +               if (2 & (unsigned long)buff) {
> +                       result += *(unsigned short *)buff;
> +                       count--;
> +                       len -= 2;
> +                       buff += 2;
> +               }
> +               count >>= 1;            /* nr of 32-bit words.. */
> +               if (count) {
> +                       if (4 & (unsigned long)buff) {
> +                               result += *(unsigned int *)buff;
> +                               count--;
> +                               len -= 4;
> +                               buff += 4;
> +                       }
> +                       count >>= 1;    /* nr of 64-bit words.. */
> +
> +                       v0 = vzero;
> +                       v1 = vzero;
> +                       v2 = vzero;
> +                       v3 = vzero;
> +
> +                       count64 = count >> 3;  /* compute 64 Byte circle */
> +                       while (count64) {
> +                               v0 = vpadalq_u16(v0,
> +                                       vld1q_u16((uint16_t *)buff + 0));
> +                               v1 = vpadalq_u16(v1,
> +                                       vld1q_u16((uint16_t *)buff + 8));
> +                               v2 = vpadalq_u16(v2,
> +                                       vld1q_u16((uint16_t *)buff + 16));
> +                               v3 = vpadalq_u16(v3,
> +                                       vld1q_u16((uint16_t *)buff + 24));
> +                               buff += 64;
> +                               count64--;
> +                       }
> +                       v0 = vaddq_u32(v0, v1);
> +                       v2 = vaddq_u32(v2, v3);
> +                       v0 = vaddq_u32(v0, v2);
> +
> +                       count %= 8;
> +                       while (count >= 2) { /* compute 16 byte circle */
> +                               v0 = vpadalq_u16(v0,
> +                                       vld1q_u16((uint16_t *)buff + 0));
> +                               buff += 16;
> +                               count -= 2;
> +                       }
> +
> +                       result += vgetq_lane_u32(v0, 0);
> +                       result += vgetq_lane_u32(v0, 1);
> +                       result += vgetq_lane_u32(v0, 2);
> +                       result += vgetq_lane_u32(v0, 3);
> +                       if (count & 1) {
> +                               result += *(unsigned long long *)buff;
> +                               buff += 8;
> +                       }
> +                       if (len & 4) {
> +                               result += *(unsigned int *)buff;
> +                               buff += 4;
> +                       }
> +               }
> +               if (len & 2) {
> +                       result += *(unsigned short *)buff;
> +                       buff += 2;
> +               }
> +       }
> +       if (len & 1)
> +               result += *buff;
> +       result = from64to16(result);
> +       if (odd)
> +               result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
> +       return result;
> +}
> +#endif
> +
> +
> +unsigned int do_csum_arm(const unsigned char *buff, int len)
> +{
> +       unsigned int res;
> +
> +       kernel_neon_begin();
> +       res = do_csum_neon(buff, len);
> +       kernel_neon_end();
> +       return res;
> +}

As I said above, please drop this intermediate function and fold the
logic into do_csum()

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel