* [PATCH] arch/arm: optimization for memcpy on AArch64
@ 2017-11-27  7:49 Herbert Guan
  2017-11-29 12:31 ` Jerin Jacob
                   ` (4 more replies)
  0 siblings, 5 replies; 27+ messages in thread
From: Herbert Guan @ 2017-11-27  7:49 UTC (permalink / raw)
  To: jerin.jacob, jianbo.liu, dev; +Cc: Herbert Guan

This patch provides an option to do rte_memcpy() using the
'restrict' qualifier, which can induce GCC to do optimizations by
using more efficient instructions, providing some performance gain
over memcpy() on some AArch64 platforms/environments.

The memory copy performance differs across different AArch64
platforms. In addition, a more recent glibc (e.g. 2.23 or later)
can provide better memcpy() performance compared to old glibc
versions. It is always suggested to use a more recent glibc if
possible, from which the entire system can benefit. If for some
reason an old glibc has to be used, this patch is provided as an
alternative.

This implementation can improve memory copy on some AArch64
platforms when an old glibc (e.g. 2.19, 2.17...) is being used.
It is disabled by default and needs "RTE_ARCH_ARM64_MEMCPY"
defined to activate it. It does not always provide better
performance than memcpy(), so users need to run the DPDK unit
test "memcpy_perf_autotest" and customize the parameters in the
"customization section" in rte_memcpy_64.h for best performance.

Compiler version will also impact the rte_memcpy() performance.
It has been observed on some platforms that, with the same code,
a GCC 7.2.0 compiled binary can provide better performance than a
GCC 4.8.5 compiled one. It is suggested to use GCC 5.4.0 or later.

Signed-off-by: Herbert Guan <herbert.guan@arm.com>
---
 .../common/include/arch/arm/rte_memcpy_64.h        | 193 +++++++++++++++++++++
 1 file changed, 193 insertions(+)

diff --git a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
index b80d8ba..1f42b3c 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
@@ -42,6 +42,197 @@
 
 #include "generic/rte_memcpy.h"
 
+#ifdef RTE_ARCH_ARM64_MEMCPY
+#include <rte_common.h>
+#include <rte_branch_prediction.h>
+
+/*******************************************************************************
+ * The memory copy performance differs on different AArch64 micro-architectures.
+ * And the most recent glibc (e.g. 2.23 or later) can provide a better memcpy()
+ * performance compared to old glibc versions. It's always suggested to use a
+ * more recent glibc if possible, from which the entire system can get benefit.
+ *
+ * This implementation improves memory copy on some aarch64 micro-architectures,
+ * when an old glibc (e.g. 2.19, 2.17...) is being used. It is disabled by
+ * default and needs "RTE_ARCH_ARM64_MEMCPY" defined to activate. It's not
+ * always providing better performance than memcpy() so users need to run unit
+ * test "memcpy_perf_autotest" and customize parameters in customization section
+ * below for best performance.
+ *
+ * Compiler version will also impact the rte_memcpy() performance. It's observed
+ * on some platforms and with the same code, GCC 7.2.0 compiled binaries can
+ * provide better performance than GCC 4.8.5 compiled binaries.
+ ******************************************************************************/
+
+/**************************************
+ * Beginning of customization section
+ **************************************/
+#define ALIGNMENT_MASK 0x0F
+#ifndef RTE_ARCH_ARM64_MEMCPY_STRICT_ALIGN
+// Only src unalignment will be treated as unaligned copy
+#define IS_UNALIGNED_COPY(dst, src) ((uintptr_t)(dst) & ALIGNMENT_MASK)
+#else
+// Both dst and src unalignment will be treated as unaligned copy
+#define IS_UNALIGNED_COPY(dst, src) \
+		(((uintptr_t)(dst) | (uintptr_t)(src)) & ALIGNMENT_MASK)
+#endif
+
+
+// If copy size is larger than threshold, memcpy() will be used.
+// Run "memcpy_perf_autotest" to determine the proper threshold.
+#define ALIGNED_THRESHOLD       ((size_t)(0xffffffff))
+#define UNALIGNED_THRESHOLD     ((size_t)(0xffffffff))
+
+
+/**************************************
+ * End of customization section
+ **************************************/
+#ifdef RTE_TOOLCHAIN_GCC
+#if (GCC_VERSION < 50400)
+#warning "The GCC version is quite old, which may result in sub-optimal \
+performance of the compiled code. It is suggested that at least GCC 5.4.0 \
+be used."
+#endif
+#endif
+
+static inline void __attribute__ ((__always_inline__))
+rte_mov16(uint8_t *restrict dst, const uint8_t *restrict src)
+{
+	__int128 * restrict dst128 = (__int128 * restrict)dst;
+	const __int128 * restrict src128 = (const __int128 * restrict)src;
+	*dst128 = *src128;
+}
+
+static inline void __attribute__ ((__always_inline__))
+rte_mov32(uint8_t *restrict dst, const uint8_t *restrict src)
+{
+	__int128 * restrict dst128 = (__int128 * restrict)dst;
+	const __int128 * restrict src128 = (const __int128 * restrict)src;
+	dst128[0] = src128[0];
+	dst128[1] = src128[1];
+}
+
+static inline void __attribute__ ((__always_inline__))
+rte_mov48(uint8_t *restrict dst, const uint8_t *restrict src)
+{
+	__int128 * restrict dst128 = (__int128 * restrict)dst;
+	const __int128 * restrict src128 = (const __int128 * restrict)src;
+	dst128[0] = src128[0];
+	dst128[1] = src128[1];
+	dst128[2] = src128[2];
+}
+
+static inline void __attribute__ ((__always_inline__))
+rte_mov64(uint8_t *restrict dst, const uint8_t *restrict src)
+{
+	__int128 * restrict dst128 = (__int128 * restrict)dst;
+	const __int128 * restrict src128 = (const __int128 * restrict)src;
+	dst128[0] = src128[0];
+	dst128[1] = src128[1];
+	dst128[2] = src128[2];
+	dst128[3] = src128[3];
+}
+
+static inline void __attribute__ ((__always_inline__))
+rte_mov128(uint8_t *restrict dst, const uint8_t *restrict src)
+{
+	rte_mov64(dst, src);
+	rte_mov64(dst + 64, src + 64);
+}
+
+static inline void __attribute__ ((__always_inline__))
+rte_mov256(uint8_t *restrict dst, const uint8_t *restrict src)
+{
+	rte_mov128(dst, src);
+	rte_mov128(dst + 128, src + 128);
+}
+
+static inline void __attribute__ ((__always_inline__))
+rte_memcpy_lt16(uint8_t *restrict dst, const uint8_t *restrict src, size_t n)
+{
+	if (n & 0x08) {
+		/* copy 8 ~ 15 bytes */
+		*(uint64_t *)dst = *(const uint64_t *)src;
+		*(uint64_t *)(dst - 8 + n) = *(const uint64_t *)(src - 8 + n);
+	} else if (n & 0x04) {
+		/* copy 4 ~ 7 bytes */
+		*(uint32_t *)dst = *(const uint32_t *)src;
+		*(uint32_t *)(dst - 4 + n) = *(const uint32_t *)(src - 4 + n);
+	} else if (n & 0x02) {
+		/* copy 2 ~ 3 bytes */
+		*(uint16_t *)dst = *(const uint16_t *)src;
+		*(uint16_t *)(dst - 2 + n) = *(const uint16_t *)(src - 2 + n);
+	} else if (n & 0x01) {
+		/* copy 1 byte */
+		*dst = *src;
+	}
+}
+
+static inline void __attribute__ ((__always_inline__))
+rte_memcpy_ge16_lt64
+(uint8_t *restrict dst, const uint8_t *restrict src, size_t n)
+{
+	if (n == 16) {
+		rte_mov16(dst, src);
+	} else if (n <= 32) {
+		rte_mov16(dst, src);
+		rte_mov16(dst - 16 + n, src - 16 + n);
+	} else if (n <= 48) {
+		rte_mov32(dst, src);
+		rte_mov16(dst - 16 + n, src - 16 + n);
+	} else {
+		rte_mov48(dst, src);
+		rte_mov16(dst - 16 + n, src - 16 + n);
+	}
+}
+
+static inline void __attribute__ ((__always_inline__))
+rte_memcpy_ge64(uint8_t *restrict dst, const uint8_t *restrict src, size_t n)
+{
+	do {
+		rte_mov64(dst, src);
+		src += 64;
+		dst += 64;
+		n -= 64;
+	} while (likely(n >= 64));
+
+	if (likely(n)) {
+		if (n > 48)
+			rte_mov64(dst - 64 + n, src - 64 + n);
+		else if (n > 32)
+			rte_mov48(dst - 48 + n, src - 48 + n);
+		else if (n > 16)
+			rte_mov32(dst - 32 + n, src - 32 + n);
+		else
+			rte_mov16(dst - 16 + n, src - 16 + n);
+	}
+}
+
+static inline void *__attribute__ ((__always_inline__))
+rte_memcpy(void *restrict dst, const void *restrict src, size_t n)
+{
+	if (n < 16) {
+		rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
+		return dst;
+	}
+	if (n < 64) {
+		rte_memcpy_ge16_lt64((uint8_t *)dst, (const uint8_t *)src, n);
+		return dst;
+	}
+	__builtin_prefetch(src, 0, 0);
+	__builtin_prefetch(dst, 1, 0);
+	if (likely(
+		  (!IS_UNALIGNED_COPY(dst, src) && n <= ALIGNED_THRESHOLD)
+		   || (IS_UNALIGNED_COPY(dst, src) && n <= UNALIGNED_THRESHOLD)
+		  )) {
+		rte_memcpy_ge64((uint8_t *)dst, (const uint8_t *)src, n);
+		return dst;
+	} else
+		return memcpy(dst, src, n);
+}
+
+
+#else
 static inline void
 rte_mov16(uint8_t *dst, const uint8_t *src)
 {
@@ -80,6 +271,8 @@
 
 #define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
 
+#endif
+
 #ifdef __cplusplus
 }
 #endif
-- 
1.8.3.1


* Re: [PATCH] arch/arm: optimization for memcpy on AArch64
  2017-11-27  7:49 [PATCH] arch/arm: optimization for memcpy on AArch64 Herbert Guan
@ 2017-11-29 12:31 ` Jerin Jacob
  2017-12-03 12:37   ` Herbert Guan
  2017-12-02  7:33 ` Pavan Nikhilesh Bhagavatula
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 27+ messages in thread
From: Jerin Jacob @ 2017-11-29 12:31 UTC (permalink / raw)
  To: Herbert Guan; +Cc: jianbo.liu, dev

-----Original Message-----
> Date: Mon, 27 Nov 2017 15:49:45 +0800
> From: Herbert Guan <herbert.guan@arm.com>
> To: jerin.jacob@caviumnetworks.com, jianbo.liu@arm.com, dev@dpdk.org
> CC: Herbert Guan <herbert.guan@arm.com>
> Subject: [PATCH] arch/arm: optimization for memcpy on AArch64
> X-Mailer: git-send-email 1.8.3.1
> +
> +/**************************************
> + * Beginning of customization section
> + **************************************/
> +#define ALIGNMENT_MASK 0x0F
> +#ifndef RTE_ARCH_ARM64_MEMCPY_STRICT_ALIGN
> +// Only src unalignment will be treaed as unaligned copy

C++ style comments. They may generate checkpatch errors.

> +#define IS_UNALIGNED_COPY(dst, src) ((uintptr_t)(dst) & ALIGNMENT_MASK)
> +#else
> +// Both dst and src unalignment will be treated as unaligned copy
> +#define IS_UNALIGNED_COPY(dst, src) \
> +		(((uintptr_t)(dst) | (uintptr_t)(src)) & ALIGNMENT_MASK)
> +#endif
> +
> +
> +// If copy size is larger than threshold, memcpy() will be used.
> +// Run "memcpy_perf_autotest" to determine the proper threshold.
> +#define ALIGNED_THRESHOLD       ((size_t)(0xffffffff))
> +#define UNALIGNED_THRESHOLD     ((size_t)(0xffffffff))

Do you see any case where this threshold is useful?

> +
> +static inline void *__attribute__ ((__always_inline__))
> +rte_memcpy(void *restrict dst, const void *restrict src, size_t n)
> +{
> +	if (n < 16) {
> +		rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
> +		return dst;
> +	}
> +	if (n < 64) {
> +		rte_memcpy_ge16_lt64((uint8_t *)dst, (const uint8_t *)src, n);
> +		return dst;
> +	}

Unfortunately we have 128B cache line arm64 implementations too. Could you
please take care of that based on RTE_CACHE_LINE_SIZE?

> +	__builtin_prefetch(src, 0, 0);
> +	__builtin_prefetch(dst, 1, 0);

See the above point, and please use the DPDK equivalents: rte_prefetch*().
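
For illustration, a minimal sketch of that substitution (rte_prefetch_non_temporal() is the read-side helper from rte_prefetch.h; DPDK has no write-prefetch helper at this point, so that side is shown staying on the builtin, and the wrapper name below is made up for the example):

	#include <rte_prefetch.h>

	/* Sketch: prefetch both buffers before entering the large-copy loop. */
	static inline void
	prefetch_copy_buffers(const void *src, void *dst)
	{
		/* read side: DPDK non-temporal prefetch helper */
		rte_prefetch_non_temporal(src);
		/* write side: no DPDK equivalent yet, keep the GCC builtin */
		__builtin_prefetch(dst, 1, 0);
	}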


* Re: [PATCH] arch/arm: optimization for memcpy on AArch64
  2017-11-27  7:49 [PATCH] arch/arm: optimization for memcpy on AArch64 Herbert Guan
  2017-11-29 12:31 ` Jerin Jacob
@ 2017-12-02  7:33 ` Pavan Nikhilesh Bhagavatula
  2017-12-03 12:38   ` Herbert Guan
  2017-12-05  6:02 ` [PATCH v2] " Herbert Guan
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 27+ messages in thread
From: Pavan Nikhilesh Bhagavatula @ 2017-12-02  7:33 UTC (permalink / raw)
  To: Herbert Guan, jianbo.liu; +Cc: dev

On Mon, Nov 27, 2017 at 03:49:45PM +0800, Herbert Guan wrote:
> This patch provides an option to do rte_memcpy() using 'restrict'
> qualifier, which can induce GCC to do optimizations by using more
> efficient instructions, providing some performance gain over memcpy()
> on some AArch64 platforms/enviroments.
>
> The memory copy performance differs between different AArch64
> platforms. And a more recent glibc (e.g. 2.23 or later)
> can provide a better memcpy() performance compared to old glibc
> versions. It's always suggested to use a more recent glibc if
> possible, from which the entire system can get benefit. If for some
> reason an old glibc has to be used, this patch is provided for an
> alternative.
>
> This implementation can improve memory copy on some AArch64
> platforms, when an old glibc (e.g. 2.19, 2.17...) is being used.
> It is disabled by default and needs "RTE_ARCH_ARM64_MEMCPY"
> defined to activate. It's not always proving better performance
> than memcpy() so users need to run DPDK unit test
> "memcpy_perf_autotest" and customize parameters in "customization
> section" in rte_memcpy_64.h for best performance.
>
> Compiler version will also impact the rte_memcpy() performance.
> It's observed on some platforms and with the same code, GCC 7.2.0
> compiled binary can provide better performance than GCC 4.8.5. It's
> suggested to use GCC 5.4.0 or later.
>
> Signed-off-by: Herbert Guan <herbert.guan@arm.com>
> ---
>  .../common/include/arch/arm/rte_memcpy_64.h        | 193 +++++++++++++++++++++
>  1 file changed, 193 insertions(+)
>
> diff --git a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> index b80d8ba..1f42b3c 100644
> --- a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> +++ b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> @@ -42,6 +42,197 @@
>
>  #include "generic/rte_memcpy.h"
>
> +#ifdef RTE_ARCH_ARM64_MEMCPY

There is an existing flag for arm32 to enable NEON-based memcpy,
RTE_ARCH_ARM_NEON_MEMCPY; we could reuse that here as restrict does the same.

> +#include <rte_common.h>
> +#include <rte_branch_prediction.h>
> +
> +/*******************************************************************************
> + * The memory copy performance differs on different AArch64 micro-architectures.
> + * And the most recent glibc (e.g. 2.23 or later) can provide a better memcpy()
> + * performance compared to old glibc versions. It's always suggested to use a
> + * more recent glibc if possible, from which the entire system can get benefit.
> + *
> + * This implementation improves memory copy on some aarch64 micro-architectures,
> + * when an old glibc (e.g. 2.19, 2.17...) is being used. It is disabled by
> + * default and needs "RTE_ARCH_ARM64_MEMCPY" defined to activate. It's not
> + * always providing better performance than memcpy() so users need to run unit
> + * test "memcpy_perf_autotest" and customize parameters in customization section
> + * below for best performance.
> + *
> + * Compiler version will also impact the rte_memcpy() performance. It's observed
> + * on some platforms and with the same code, GCC 7.2.0 compiled binaries can
> + * provide better performance than GCC 4.8.5 compiled binaries.
> + ******************************************************************************/
> +
> +/**************************************
> + * Beginning of customization section
> + **************************************/
> +#define ALIGNMENT_MASK 0x0F
> +#ifndef RTE_ARCH_ARM64_MEMCPY_STRICT_ALIGN
> +// Only src unalignment will be treaed as unaligned copy
> +#define IS_UNALIGNED_COPY(dst, src) ((uintptr_t)(dst) & ALIGNMENT_MASK)

We can use the existing `rte_is_aligned` function instead.

> +#else
> +// Both dst and src unalignment will be treated as unaligned copy
> +#define IS_UNALIGNED_COPY(dst, src) \
> +		(((uintptr_t)(dst) | (uintptr_t)(src)) & ALIGNMENT_MASK)
> +#endif
> +
> +
> +// If copy size is larger than threshold, memcpy() will be used.
> +// Run "memcpy_perf_autotest" to determine the proper threshold.
> +#define ALIGNED_THRESHOLD       ((size_t)(0xffffffff))
> +#define UNALIGNED_THRESHOLD     ((size_t)(0xffffffff))
> +
> +
> +/**************************************
> + * End of customization section
> + **************************************/
> +#ifdef RTE_TOOLCHAIN_GCC
> +#if (GCC_VERSION < 50400)
> +#warning "The GCC version is quite old, which may result in sub-optimal \
> +performance of the compiled code. It is suggested that at least GCC 5.4.0 \
> +be used."
> +#endif
> +#endif
> +
> +static inline void __attribute__ ((__always_inline__))
use __rte_always_inline instead.
> +rte_mov16(uint8_t *restrict dst, const uint8_t *restrict src)
> +{
> +	__int128 * restrict dst128 = (__int128 * restrict)dst;
> +	const __int128 * restrict src128 = (const __int128 * restrict)src;
> +	*dst128 = *src128;
> +}
> +
> +static inline void __attribute__ ((__always_inline__))
> +rte_mov64(uint8_t *restrict dst, const uint8_t *restrict src)
> +{
> +	__int128 * restrict dst128 = (__int128 * restrict)dst;

ISO C does not support ‘__int128’; please use '__int128_t' or '__uint128_t'.

> +	const __int128 * restrict src128 = (const __int128 * restrict)src;
> +	dst128[0] = src128[0];
> +	dst128[1] = src128[1];
> +	dst128[2] = src128[2];
> +	dst128[3] = src128[3];
> +}
> +
<snip>

Would doing this still benefit if size is a compile-time constant? i.e. when
__builtin_constant_p(n) is true.

> +
> +static inline void *__attribute__ ((__always_inline__))
> +rte_memcpy(void *restrict dst, const void *restrict src, size_t n)
> +{
> +	if (n < 16) {
> +		rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
> +		return dst;
> +	}
> +	if (n < 64) {
> +		rte_memcpy_ge16_lt64((uint8_t *)dst, (const uint8_t *)src, n);
> +		return dst;
> +	}
> +	__builtin_prefetch(src, 0, 0);
> +	__builtin_prefetch(dst, 1, 0);
> +	if (likely(
> +		  (!IS_UNALIGNED_COPY(dst, src) && n <= ALIGNED_THRESHOLD)
> +		   || (IS_UNALIGNED_COPY(dst, src) && n <= UNALIGNED_THRESHOLD)
> +		  )) {
> +		rte_memcpy_ge64((uint8_t *)dst, (const uint8_t *)src, n);
> +		return dst;
> +	} else
> +		return memcpy(dst, src, n);
> +}
> +
> +
> +#else
>  static inline void
>  rte_mov16(uint8_t *dst, const uint8_t *src)
>  {
> @@ -80,6 +271,8 @@
>
>  #define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
>
> +#endif
> +
>  #ifdef __cplusplus
>  }
>  #endif
> --
> 1.8.3.1
>
Regards,
Pavan.


* Re: [PATCH] arch/arm: optimization for memcpy on AArch64
  2017-11-29 12:31 ` Jerin Jacob
@ 2017-12-03 12:37   ` Herbert Guan
  2017-12-15  4:06     ` Jerin Jacob
  0 siblings, 1 reply; 27+ messages in thread
From: Herbert Guan @ 2017-12-03 12:37 UTC (permalink / raw)
  To: Jerin Jacob; +Cc: Jianbo Liu, dev

Jerin,

Thanks a lot for your review and comments.  Please find my comments below inline.

Best regards,
Herbert

> -----Original Message-----
> From: Jerin Jacob [mailto:jerin.jacob@caviumnetworks.com]
> Sent: Wednesday, November 29, 2017 20:32
> To: Herbert Guan <Herbert.Guan@arm.com>
> Cc: Jianbo Liu <Jianbo.Liu@arm.com>; dev@dpdk.org
> Subject: Re: [PATCH] arch/arm: optimization for memcpy on AArch64
>
> -----Original Message-----
> > Date: Mon, 27 Nov 2017 15:49:45 +0800
> > From: Herbert Guan <herbert.guan@arm.com>
> > To: jerin.jacob@caviumnetworks.com, jianbo.liu@arm.com, dev@dpdk.org
> > CC: Herbert Guan <herbert.guan@arm.com>
> > Subject: [PATCH] arch/arm: optimization for memcpy on AArch64
> > X-Mailer: git-send-email 1.8.3.1
> > +
> > +/**************************************
> > + * Beginning of customization section
> > +**************************************/
> > +#define ALIGNMENT_MASK 0x0F
> > +#ifndef RTE_ARCH_ARM64_MEMCPY_STRICT_ALIGN
> > +// Only src unalignment will be treaed as unaligned copy
>
> C++ style comments. It may generate check patch errors.

I'll change it to use C style comments in version 2.

>
> > +#define IS_UNALIGNED_COPY(dst, src) ((uintptr_t)(dst) &
> > +ALIGNMENT_MASK) #else // Both dst and src unalignment will be treated
> > +as unaligned copy #define IS_UNALIGNED_COPY(dst, src) \
> > +(((uintptr_t)(dst) | (uintptr_t)(src)) & ALIGNMENT_MASK)
> #endif
> > +
> > +
> > +// If copy size is larger than threshold, memcpy() will be used.
> > +// Run "memcpy_perf_autotest" to determine the proper threshold.
> > +#define ALIGNED_THRESHOLD       ((size_t)(0xffffffff))
> > +#define UNALIGNED_THRESHOLD     ((size_t)(0xffffffff))
>
> Do you see any case where this threshold is useful.

Yes, on some platforms, and/or with some glibc versions, the glibc memcpy() has better performance for larger sizes (e.g., >512, >4096...).  So developers should run the unit test to find the best threshold.  The default value of 0xffffffff should be replaced with the evaluated values.
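
For illustration, a sketch of what the customization section might look like after such an evaluation (the 4096 and 512 values below are hypothetical placeholders, to be replaced with the numbers measured by memcpy_perf_autotest):

	/* Hypothetical tuned values; above these sizes glibc memcpy() wins. */
	#define ALIGNED_THRESHOLD       ((size_t)(4096))
	#define UNALIGNED_THRESHOLD     ((size_t)(512))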

>
> > +
> > +static inline void *__attribute__ ((__always_inline__))
> > +rte_memcpy(void *restrict dst, const void *restrict src, size_t n)
> > +{
> > +if (n < 16) {
> > +rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
> > +return dst;
> > +}
> > +if (n < 64) {
> > +rte_memcpy_ge16_lt64((uint8_t *)dst, (const uint8_t *)src,
> n);
> > +return dst;
> > +}
>
> Unfortunately we have 128B cache arm64 implementation too. Could you
> please take care that based on RTE_CACHE_LINE_SIZE
>

Here the value of '64' is not the cache line size.  But since prefetch itself costs some cycles, it is not worthwhile to do prefetch for small-size (e.g. < 64 bytes) copies.  Per my tests, prefetching for small-size copies will actually lower the performance.

On the other hand, I can only find one 128B cache line AArch64 machine here.  And there do exist some specific optimizations for this machine.  Not sure if they'll be beneficial for other 128B cache machines or not.  I prefer not to put them in this patch but in a later standalone patch specific to 128B cache machines.
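
For illustration, a minimal sketch of how a cache-line-size dependent main copy loop could look (hypothetical, not part of this patch; it reuses the rte_mov128() helper defined above, and the caller would have to guarantee n >= 128 and handle the tail the same way rte_memcpy_ge64() handles its remainder):

	#if RTE_CACHE_LINE_SIZE == 128
		do {
			rte_mov128(dst, src);
			src += 128;
			dst += 128;
			n -= 128;
		} while (likely(n >= 128));
	#endif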

> > +__builtin_prefetch(src, 0, 0);  // rte_prefetch_non_temporal(src);
> > +__builtin_prefetch(dst, 1, 0);  //  * unchanged *
>
> See above point and Please use DPDK equivalents. rte_prefetch*()

I can use "rte_prefetch_non_temporal()" for the read prefetch.  However, there is no DPDK equivalent for the write prefetch.  Would you suggest that we add such an API to DPDK?
BTW, the current DPDK rte_prefetch*() functions use ASM instructions.  It might be better to use __builtin_prefetch(src, 0, 0/1/2/3) for better compatibility with future AArch64 architectures.
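
As a side note on the builtin itself, its signature is __builtin_prefetch(addr, rw, locality): rw is 0 for a read prefetch and 1 for a write prefetch, and locality ranges from 0 (non-temporal) to 3 (keep in all cache levels), e.g.:

	__builtin_prefetch(src, 0, 0);  /* read prefetch, non-temporal   */
	__builtin_prefetch(dst, 1, 0);  /* write prefetch, non-temporal  */
	__builtin_prefetch(dst, 1, 3);  /* write prefetch, high locality */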


* Re: [PATCH] arch/arm: optimization for memcpy on AArch64
  2017-12-02  7:33 ` Pavan Nikhilesh Bhagavatula
@ 2017-12-03 12:38   ` Herbert Guan
  2017-12-03 14:20     ` Pavan Nikhilesh Bhagavatula
  0 siblings, 1 reply; 27+ messages in thread
From: Herbert Guan @ 2017-12-03 12:38 UTC (permalink / raw)
  To: Pavan Nikhilesh Bhagavatula, Jianbo Liu; +Cc: dev

Pavan,

Thanks for review and comments.  Please find my comments inline below.

Best regards,
Herbert

> -----Original Message-----
> From: Pavan Nikhilesh Bhagavatula
> [mailto:pbhagavatula@caviumnetworks.com]
> Sent: Saturday, December 2, 2017 15:33
> To: Herbert Guan <Herbert.Guan@arm.com>; Jianbo Liu
> <Jianbo.Liu@arm.com>
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH] arch/arm: optimization for memcpy on
> AArch64
>
> On Mon, Nov 27, 2017 at 03:49:45PM +0800, Herbert Guan wrote:
> > This patch provides an option to do rte_memcpy() using 'restrict'
> > qualifier, which can induce GCC to do optimizations by using more
> > efficient instructions, providing some performance gain over memcpy()
> > on some AArch64 platforms/enviroments.
> >
> > The memory copy performance differs between different AArch64
> > platforms. And a more recent glibc (e.g. 2.23 or later) can provide a
> > better memcpy() performance compared to old glibc versions. It's
> > always suggested to use a more recent glibc if possible, from which
> > the entire system can get benefit. If for some reason an old glibc has
> > to be used, this patch is provided for an alternative.
> >
> > This implementation can improve memory copy on some AArch64
> platforms,
> > when an old glibc (e.g. 2.19, 2.17...) is being used.
> > It is disabled by default and needs "RTE_ARCH_ARM64_MEMCPY"
> > defined to activate. It's not always proving better performance than
> > memcpy() so users need to run DPDK unit test "memcpy_perf_autotest"
> > and customize parameters in "customization section" in rte_memcpy_64.h
> > for best performance.
> >
> > Compiler version will also impact the rte_memcpy() performance.
> > It's observed on some platforms and with the same code, GCC 7.2.0
> > compiled binary can provide better performance than GCC 4.8.5. It's
> > suggested to use GCC 5.4.0 or later.
> >
> > Signed-off-by: Herbert Guan <herbert.guan@arm.com>
> > ---
> >  .../common/include/arch/arm/rte_memcpy_64.h        | 193
> +++++++++++++++++++++
> >  1 file changed, 193 insertions(+)
> >
> > diff --git a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> > b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> > index b80d8ba..1f42b3c 100644
> > --- a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> > +++ b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> > @@ -42,6 +42,197 @@
> >
> >  #include "generic/rte_memcpy.h"
> >
> > +#ifdef RTE_ARCH_ARM64_MEMCPY
>
> There is an existing flag for arm32 to enable neon based memcpy
> RTE_ARCH_ARM_NEON_MEMCPY we could reuse that here as restrict does
> the same.
>
This implementation does not actually use ARM NEON instructions, so the existing flag does not describe the option exactly.  It would be good if the existing flag were "RTE_ARCH_ARM_MEMCPY", but unfortunately it might be too late now to get the flags aligned.

> > +#include <rte_common.h>
> > +#include <rte_branch_prediction.h>
> > +
> >
> +/*********************************************************
> ***********
> > +***********
> > + * The memory copy performance differs on different AArch64 micro-
> architectures.
> > + * And the most recent glibc (e.g. 2.23 or later) can provide a
> > +better memcpy()
> > + * performance compared to old glibc versions. It's always suggested
> > +to use a
> > + * more recent glibc if possible, from which the entire system can get
> benefit.
> > + *
> > + * This implementation improves memory copy on some aarch64
> > +micro-architectures,
> > + * when an old glibc (e.g. 2.19, 2.17...) is being used. It is
> > +disabled by
> > + * default and needs "RTE_ARCH_ARM64_MEMCPY" defined to activate.
> > +It's not
> > + * always providing better performance than memcpy() so users need to
> > +run unit
> > + * test "memcpy_perf_autotest" and customize parameters in
> > +customization section
> > + * below for best performance.
> > + *
> > + * Compiler version will also impact the rte_memcpy() performance.
> > +It's observed
> > + * on some platforms and with the same code, GCC 7.2.0 compiled
> > +binaries can
> > + * provide better performance than GCC 4.8.5 compiled binaries.
> > +
> >
> +*********************************************************
> ************
> > +*********/
> > +
> > +/**************************************
> > + * Beginning of customization section
> > +**************************************/
> > +#define ALIGNMENT_MASK 0x0F
> > +#ifndef RTE_ARCH_ARM64_MEMCPY_STRICT_ALIGN
> > +// Only src unalignment will be treaed as unaligned copy #define
> > +IS_UNALIGNED_COPY(dst, src) ((uintptr_t)(dst) & ALIGNMENT_MASK)
>
> We can use existing `rte_is_aligned` function instead.

The existing 'rte_is_aligned()' inline function is defined in a relatively complex way, and more instructions are generated for it (using GCC 7.2.0):

0000000000000000 <align_check_rte>:   // using rte_is_aligned()
   0:  91003c01   add   x1, x0, #0xf
   4:  927cec21   and   x1, x1, #0xfffffffffffffff0
   8:  eb01001f   cmp   x0, x1
   c:  1a9f07e0   cset  w0, ne  // ne = any
  10:  d65f03c0   ret
  14:  d503201f   nop

0000000000000018 <align_check_simp>:   // using above expression
  18:  12000c00   and   w0, w0, #0xf
  1c:  d65f03c0   ret

So to get better performance, it is better to use the simpler logic.


>
> > +#else
> > +// Both dst and src unalignment will be treated as unaligned copy
> > +#define IS_UNALIGNED_COPY(dst, src) \
> > +(((uintptr_t)(dst) | (uintptr_t)(src)) & ALIGNMENT_MASK)
> #endif
> > +
> > +
> > +// If copy size is larger than threshold, memcpy() will be used.
> > +// Run "memcpy_perf_autotest" to determine the proper threshold.
> > +#define ALIGNED_THRESHOLD       ((size_t)(0xffffffff))
> > +#define UNALIGNED_THRESHOLD     ((size_t)(0xffffffff))
> > +
> > +
> > +/**************************************
> > + * End of customization section
> > + **************************************/
> > +#ifdef RTE_TOOLCHAIN_GCC
> > +#if (GCC_VERSION < 50400)
> > +#warning "The GCC version is quite old, which may result in
> > +sub-optimal \ performance of the compiled code. It is suggested that
> > +at least GCC 5.4.0 \ be used."
> > +#endif
> > +#endif
> > +
> > +static inline void __attribute__ ((__always_inline__))
> use __rte_always_inline instead.
> > +rte_mov16(uint8_t *restrict dst, const uint8_t *restrict src) {
> > +__int128 * restrict dst128 = (__int128 * restrict)dst;
> > +const __int128 * restrict src128 = (const __int128 * restrict)src;
> > +*dst128 = *src128;
> > +}
> > +
> > +static inline void __attribute__ ((__always_inline__))
> > +rte_mov64(uint8_t *restrict dst, const uint8_t *restrict src) {
> > +__int128 * restrict dst128 = (__int128 * restrict)dst;
>
> ISO C does not support ‘__int128’ please use '__int128_t' or '__uint128_t'.

Very good point.  Thanks for this reminding and I'll update to use '__uint128_t' in the next version.

>
> > +const __int128 * restrict src128 = (const __int128 * restrict)src;
> > +dst128[0] = src128[0];
> > +dst128[1] = src128[1];
> > +dst128[2] = src128[2];
> > +dst128[3] = src128[3];
> > +}
> > +
> <snip>
>
> Would doing this still benifit if size is compile time constant? i.e. when
> __builtin_constant_p(n) is true.
>
Yes, a performance margin is observed on some tested platforms even when the size is a compile-time constant.

> > +
> > +static inline void *__attribute__ ((__always_inline__))
> > +rte_memcpy(void *restrict dst, const void *restrict src, size_t n) {
> > +if (n < 16) {
> > +rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
> > +return dst;
> > +}
> > +if (n < 64) {
> > +rte_memcpy_ge16_lt64((uint8_t *)dst, (const uint8_t *)src,
> n);
> > +return dst;
> > +}
> > +__builtin_prefetch(src, 0, 0);
> > +__builtin_prefetch(dst, 1, 0);
> > +if (likely(
> > +  (!IS_UNALIGNED_COPY(dst, src) && n <=
> ALIGNED_THRESHOLD)
> > +   || (IS_UNALIGNED_COPY(dst, src) && n <=
> UNALIGNED_THRESHOLD)
> > +  )) {
> > +rte_memcpy_ge64((uint8_t *)dst, (const uint8_t *)src, n);
> > +return dst;
> > +} else
> > +return memcpy(dst, src, n);
> > +}
> > +
> > +
> > +#else
> >  static inline void
> >  rte_mov16(uint8_t *dst, const uint8_t *src)  { @@ -80,6 +271,8 @@
> >
> >  #define rte_memcpy(d, s, n)memcpy((d), (s), (n))
> >
> > +#endif
> > +
> >  #ifdef __cplusplus
> >  }
> >  #endif
> > --
> > 1.8.3.1
> >
> Regards,
> Pavan.

* Re: [PATCH] arch/arm: optimization for memcpy on AArch64
  2017-12-03 12:38   ` Herbert Guan
@ 2017-12-03 14:20     ` Pavan Nikhilesh Bhagavatula
  2017-12-04  7:14       ` Herbert Guan
  0 siblings, 1 reply; 27+ messages in thread
From: Pavan Nikhilesh Bhagavatula @ 2017-12-03 14:20 UTC (permalink / raw)
  To: Herbert Guan, jianbo.liu; +Cc: dev

On Sun, Dec 03, 2017 at 12:38:35PM +0000, Herbert Guan wrote:
> Pavan,
>
> Thanks for review and comments.  Please find my comments inline below.
>
> Best regards,
> Herbert
>
<snip>
> > There is an existing flag for arm32 to enable neon based memcpy
> > RTE_ARCH_ARM_NEON_MEMCPY we could reuse that here as restrict does
> > the same.
> >
> This implementation is actually not using ARM NEON instructions so the existing flag is not describing the option exactly.  It'll be good if the existing flag is "RTE_ARCH_ARM_MEMCPY" but unfortunately it might be too late now to get the flags aligned.
>

Correct me if I'm wrong, but doesn't restrict tell the compiler to do SIMD
optimization?
Anyway, can we put RTE_ARCH_ARM64_MEMCPY into config/common_base as
CONFIG_RTE_ARCH_ARM64_MEMCPY=n so that it would be easier to enable/disable?

> > > +#include <rte_common.h>
> > > +#include <rte_branch_prediction.h>
> > > +
> > >
> > +/*********************************************************
> > ***********
> > > +***********
> > > + * The memory copy performance differs on different AArch64 micro-
> > architectures.
> > > + * And the most recent glibc (e.g. 2.23 or later) can provide a
> > > +better memcpy()
> > > + * performance compared to old glibc versions. It's always suggested
> > > +to use a
> > > + * more recent glibc if possible, from which the entire system can get
> > benefit.
> > > + *
> > > + * This implementation improves memory copy on some aarch64
> > > +micro-architectures,
> > > + * when an old glibc (e.g. 2.19, 2.17...) is being used. It is
> > > +disabled by
> > > + * default and needs "RTE_ARCH_ARM64_MEMCPY" defined to activate.
> > > +It's not
> > > + * always providing better performance than memcpy() so users need to
> > > +run unit
> > > + * test "memcpy_perf_autotest" and customize parameters in
> > > +customization section
> > > + * below for best performance.
> > > + *
> > > + * Compiler version will also impact the rte_memcpy() performance.
> > > +It's observed
> > > + * on some platforms and with the same code, GCC 7.2.0 compiled
> > > +binaries can
> > > + * provide better performance than GCC 4.8.5 compiled binaries.
> > > +
> > >
> > +*********************************************************
> > ************
> > > +*********/
> > > +
> > > +/**************************************
> > > + * Beginning of customization section
> > > +**************************************/
> > > +#define ALIGNMENT_MASK 0x0F
> > > +#ifndef RTE_ARCH_ARM64_MEMCPY_STRICT_ALIGN
> > > +// Only src unalignment will be treaed as unaligned copy #define
> > > +IS_UNALIGNED_COPY(dst, src) ((uintptr_t)(dst) & ALIGNMENT_MASK)
> >
> > We can use existing `rte_is_aligned` function instead.
>
> The exising 'rte_is_aligned()' inline function is defined in a relatively complex way, and there will be more instructions generated (using GCC 7.2.0):
>
> 0000000000000000 <align_check_rte>:   // using rte_is_aligned()
>    0:  91003c01   add   x1, x0, #0xf
>    4:  927cec21   and   x1, x1, #0xfffffffffffffff0
>    8:  eb01001f   cmp   x0, x1
>    c:  1a9f07e0   cset  w0, ne  // ne = any
>   10:  d65f03c0   ret
>   14:  d503201f   nop
>
> 0000000000000018 <align_check_simp>:   // using above expression
>   18:  12000c00   and   w0, w0, #0xf
>   1c:  d65f03c0   ret
>
> So to get better performance, it's better to use the simple logic.

Agreed, I have noticed that too; maybe we could change rte_is_aligned to be
simpler (not in this patch).
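
For illustration, a hypothetical sketch of such a simplified check (not part of this patch; the macro name is made up here and it assumes the alignment is a power of two):

	#define RTE_PTR_IS_ALIGNED_SIMPLE(p, align) \
		(!((uintptr_t)(p) & ((uintptr_t)(align) - 1)))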

<snip>
> > Would doing this still benifit if size is compile time constant? i.e. when
> > __builtin_constant_p(n) is true.
> >
> Yes, performance margin is observed if size is compile time constant on some tested platforms.
>

Sorry, I didn't get you, but which is better? If size is a compile-time constant,
is using libc memcpy better, or is going with the restrict implementation better?

If the former, then we could do what the 32-bit rte_memcpy is using, i.e.

#define rte_memcpy(dst, src, n)              \
        __extension__ ({                     \
        (__builtin_constant_p(n)) ?          \
        memcpy((dst), (src), (n)) :          \
        rte_memcpy_func((dst), (src), (n)); })

Regards,
Pavan.


* Re: [PATCH] arch/arm: optimization for memcpy on AArch64
  2017-12-03 14:20     ` Pavan Nikhilesh Bhagavatula
@ 2017-12-04  7:14       ` Herbert Guan
  0 siblings, 0 replies; 27+ messages in thread
From: Herbert Guan @ 2017-12-04  7:14 UTC (permalink / raw)
  To: Pavan Nikhilesh Bhagavatula, Jianbo Liu; +Cc: dev



> -----Original Message-----
> From: Pavan Nikhilesh Bhagavatula
> [mailto:pbhagavatula@caviumnetworks.com]
> Sent: Sunday, December 3, 2017 22:21
> To: Herbert Guan <Herbert.Guan@arm.com>; Jianbo Liu
> <Jianbo.Liu@arm.com>
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH] arch/arm: optimization for memcpy on
> AArch64
>
> On Sun, Dec 03, 2017 at 12:38:35PM +0000, Herbert Guan wrote:
> > Pavan,
> >
> > Thanks for review and comments.  Please find my comments inline below.
> >
> > Best regards,
> > Herbert
> >
> <snip>
> > > There is an existing flag for arm32 to enable neon based memcpy
> > > RTE_ARCH_ARM_NEON_MEMCPY we could reuse that here as restrict
> does
> > > the same.
> > >
> > This implementation is actually not using ARM NEON instructions so the
> existing flag is not describing the option exactly.  It'll be good if the existing
> flag is "RTE_ARCH_ARM_MEMCPY" but unfortunately it might be too late
> now to get the flags aligned.
> >
>
> Correct me if I'm wrong but doesn't restrict tell the compiler to do SIMD
> optimization?
> Anyway can we put RTE_ARCH_ARM64_MEMCPY into config/common_base
> as CONFIG_RTE_ARCH_ARM64_MEMCPY=n so that it would be easier to
> enable/disable.
>

The result of using 'restrict' is that the compiler generates code with ldp/stp instructions.  These instructions actually belong to the "data transfer instructions" group, though they load/store a pair of registers.  'ld1/st1' are the SIMD (NEON) instructions.

I can add CONFIG_RTE_ARCH_ARM64_MEMCPY=n  into common_armv8a_linuxapp in the new version as you've suggested.
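
For illustration, enabling it would then be a one-line change in the target config file, e.g. (a usage sketch, assuming the flag name used in this thread):

	# config/common_armv8a_linuxapp
	CONFIG_RTE_ARCH_ARM64_MEMCPY=y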

> > > > +#include <rte_common.h>
> > > > +#include <rte_branch_prediction.h>
> > > > +
> > > >
> > >
> +/*********************************************************
> > > ***********
> > > > +***********
> > > > + * The memory copy performance differs on different AArch64
> > > > +micro-
> > > architectures.
> > > > + * And the most recent glibc (e.g. 2.23 or later) can provide a
> > > > +better memcpy()
> > > > + * performance compared to old glibc versions. It's always
> > > > +suggested to use a
> > > > + * more recent glibc if possible, from which the entire system
> > > > +can get
> > > benefit.
> > > > + *
> > > > + * This implementation improves memory copy on some aarch64
> > > > +micro-architectures,
> > > > + * when an old glibc (e.g. 2.19, 2.17...) is being used. It is
> > > > +disabled by
> > > > + * default and needs "RTE_ARCH_ARM64_MEMCPY" defined to
> activate.
> > > > +It's not
> > > > + * always providing better performance than memcpy() so users
> > > > +need to run unit
> > > > + * test "memcpy_perf_autotest" and customize parameters in
> > > > +customization section
> > > > + * below for best performance.
> > > > + *
> > > > + * Compiler version will also impact the rte_memcpy() performance.
> > > > +It's observed
> > > > + * on some platforms and with the same code, GCC 7.2.0 compiled
> > > > +binaries can
> > > > + * provide better performance than GCC 4.8.5 compiled binaries.
> > > > +
> > > >
> > >
> +*********************************************************
> > > ************
> > > > +*********/
> > > > +
> > > > +/**************************************
> > > > + * Beginning of customization section
> > > > +**************************************/
> > > > +#define ALIGNMENT_MASK 0x0F
> > > > +#ifndef RTE_ARCH_ARM64_MEMCPY_STRICT_ALIGN
> > > > +// Only src unalignment will be treaed as unaligned copy #define
> > > > +IS_UNALIGNED_COPY(dst, src) ((uintptr_t)(dst) & ALIGNMENT_MASK)
> > >
> > > We can use existing `rte_is_aligned` function instead.
> >
> > The exising 'rte_is_aligned()' inline function is defined in a relatively
> complex way, and there will be more instructions generated (using GCC
> 7.2.0):
> >
> > 0000000000000000 <align_check_rte>:   // using rte_is_aligned()
> >    0:  91003c01   add   x1, x0, #0xf
> >    4:  927cec21   and   x1, x1, #0xfffffffffffffff0
> >    8:  eb01001f   cmp   x0, x1
> >    c:  1a9f07e0   cset  w0, ne  // ne = any
> >   10:  d65f03c0   ret
> >   14:  d503201f   nop
> >
> > 0000000000000018 <align_check_simp>:   // using above expression
> >   18:  12000c00   and   w0, w0, #0xf
> >   1c:  d65f03c0   ret
> >
> > So to get better performance, it's better to use the simple logic.
>
> Agreed, I have noticed that too maybe we could change rte_is_aligned to be
> simpler (Not in this patch).
>
> <snip>
> > > Would doing this still benifit if size is compile time constant?
> > > i.e. when
> > > __builtin_constant_p(n) is true.
> > >
> > Yes, performance margin is observed if size is compile time constant on
> some tested platforms.
> >
>
> Sorry I didn't get you but which is better? If size is compile time constant is
> using libc memcpy is better or going with restrict implementation better.
>
> If the former then we could do what 32bit rte_memcpy is using i.e.
>
> #define rte_memcpy(dst, src, n)              \
>         __extension__ ({                     \
>         (__builtin_constant_p(n)) ?          \
>         memcpy((dst), (src), (n)) :          \
>         rte_memcpy_func((dst), (src), (n)); })
>
Per my tests, the results usually point in the same direction.  That means if the variable-size case shows improved performance, then hopefully the compile-time-constant case will be improved as well, and vice versa; only the percentage might be different.  So in this patch, the property of the size parameter (variable or compile-time constant) is not checked.

> Regards,
> Pavan.

Thanks,
Herbert

* [PATCH v2] arch/arm: optimization for memcpy on AArch64
  2017-11-27  7:49 [PATCH] arch/arm: optimization for memcpy on AArch64 Herbert Guan
  2017-11-29 12:31 ` Jerin Jacob
  2017-12-02  7:33 ` Pavan Nikhilesh Bhagavatula
@ 2017-12-05  6:02 ` Herbert Guan
  2017-12-15  3:41   ` Jerin Jacob
  2017-12-18  2:54 ` [PATCH v3] " Herbert Guan
  2018-01-04 10:20 ` [PATCH v5] " Herbert Guan
  4 siblings, 1 reply; 27+ messages in thread
From: Herbert Guan @ 2017-12-05  6:02 UTC (permalink / raw)
  To: dev; +Cc: jerin.jacob, pbhagavatula, jianbo.liu, Herbert Guan

This patch provides an option to do rte_memcpy() using the
'restrict' qualifier, which can induce GCC to do optimizations by
using more efficient instructions, providing some performance gain
over memcpy() on some AArch64 platforms/environments.

The memory copy performance differs across different AArch64
platforms. In addition, a more recent glibc (e.g. 2.23 or later)
can provide better memcpy() performance compared to old glibc
versions. It is always suggested to use a more recent glibc if
possible, from which the entire system can benefit. If for some
reason an old glibc has to be used, this patch is provided as an
alternative.

This implementation can improve memory copy on some AArch64
platforms when an old glibc (e.g. 2.19, 2.17...) is being used.
It is disabled by default and needs "RTE_ARCH_ARM64_MEMCPY"
defined to activate it. It does not always provide better
performance than memcpy(), so users need to run the DPDK unit
test "memcpy_perf_autotest" and customize the parameters in the
"customization section" in rte_memcpy_64.h for best performance.

Compiler version will also impact the rte_memcpy() performance.
It has been observed on some platforms that, with the same code,
a GCC 7.2.0 compiled binary can provide better performance than a
GCC 4.8.5 compiled one. It is suggested to use GCC 5.4.0 or later.

Signed-off-by: Herbert Guan <herbert.guan@arm.com>
---
 config/common_armv8a_linuxapp                      |   6 +
 .../common/include/arch/arm/rte_memcpy_64.h        | 195 +++++++++++++++++++++
 2 files changed, 201 insertions(+)

diff --git a/config/common_armv8a_linuxapp b/config/common_armv8a_linuxapp
index 6732d1e..158ce00 100644
--- a/config/common_armv8a_linuxapp
+++ b/config/common_armv8a_linuxapp
@@ -44,6 +44,12 @@ CONFIG_RTE_FORCE_INTRINSICS=y
 # to address minimum DMA alignment across all arm64 implementations.
 CONFIG_RTE_CACHE_LINE_SIZE=128
 
+# Accelerate rte_memcpy.  Be sure to run the unit test to determine the
+# best threshold in code.  Refer to notes in source file
+# (lib/librte_eam/common/include/arch/arm/rte_memcpy_64.h) for more
+# info.
+CONFIG_RTE_ARCH_ARM64_MEMCPY=n
+
 CONFIG_RTE_LIBRTE_FM10K_PMD=n
 CONFIG_RTE_LIBRTE_SFC_EFX_PMD=n
 CONFIG_RTE_LIBRTE_AVP_PMD=n
diff --git a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
index b80d8ba..a6ad286 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
@@ -42,6 +42,199 @@
 
 #include "generic/rte_memcpy.h"
 
+#ifdef RTE_ARCH_ARM64_MEMCPY
+#include <rte_common.h>
+#include <rte_branch_prediction.h>
+
+/*******************************************************************************
+ * The memory copy performance differs on different AArch64 micro-architectures.
+ * And the most recent glibc (e.g. 2.23 or later) can provide a better memcpy()
+ * performance compared to old glibc versions. It's always suggested to use a
+ * more recent glibc if possible, from which the entire system can get benefit.
+ *
+ * This implementation improves memory copy on some aarch64 micro-architectures,
+ * when an old glibc (e.g. 2.19, 2.17...) is being used. It is disabled by
+ * default and needs "RTE_ARCH_ARM64_MEMCPY" defined to activate. It's not
+ * always providing better performance than memcpy() so users need to run unit
+ * test "memcpy_perf_autotest" and customize parameters in customization section
+ * below for best performance.
+ *
+ * Compiler version will also impact the rte_memcpy() performance. It's observed
+ * on some platforms and with the same code, GCC 7.2.0 compiled binaries can
+ * provide better performance than GCC 4.8.5 compiled binaries.
+ ******************************************************************************/
+
+/**************************************
+ * Beginning of customization section
+ **************************************/
+#define ALIGNMENT_MASK 0x0F
+#ifndef RTE_ARCH_ARM64_MEMCPY_STRICT_ALIGN
+/* Only src unalignment will be treated as unaligned copy */
+#define IS_UNALIGNED_COPY(dst, src) ((uintptr_t)(dst) & ALIGNMENT_MASK)
+#else
+/* Both dst and src unalignment will be treated as unaligned copy */
+#define IS_UNALIGNED_COPY(dst, src) \
+		(((uintptr_t)(dst) | (uintptr_t)(src)) & ALIGNMENT_MASK)
+#endif
+
+
+/*
+ * If copy size is larger than threshold, memcpy() will be used.
+ * Run "memcpy_perf_autotest" to determine the proper threshold.
+ */
+#define ALIGNED_THRESHOLD       ((size_t)(0xffffffff))
+#define UNALIGNED_THRESHOLD     ((size_t)(0xffffffff))
+
+
+/**************************************
+ * End of customization section
+ **************************************/
+#ifdef RTE_TOOLCHAIN_GCC
+#if (GCC_VERSION < 50400)
+#warning "The GCC version is quite old, which may result in sub-optimal \
+performance of the compiled code. It is suggested that at least GCC 5.4.0 \
+be used."
+#endif
+#endif
+
+static inline void __attribute__ ((__always_inline__))
+rte_mov16(uint8_t *restrict dst, const uint8_t *restrict src)
+{
+	__uint128_t *restrict dst128 = (__uint128_t *restrict)dst;
+	const __uint128_t *restrict src128 = (const __uint128_t *restrict)src;
+	*dst128 = *src128;
+}
+
+static inline void __attribute__ ((__always_inline__))
+rte_mov32(uint8_t *restrict dst, const uint8_t *restrict src)
+{
+	__uint128_t *restrict dst128 = (__uint128_t *restrict)dst;
+	const __uint128_t *restrict src128 = (const __uint128_t *restrict)src;
+	dst128[0] = src128[0];
+	dst128[1] = src128[1];
+}
+
+static inline void __attribute__ ((__always_inline__))
+rte_mov48(uint8_t *restrict dst, const uint8_t *restrict src)
+{
+	__uint128_t *restrict dst128 = (__uint128_t *restrict)dst;
+	const __uint128_t *restrict src128 = (const __uint128_t *restrict)src;
+	dst128[0] = src128[0];
+	dst128[1] = src128[1];
+	dst128[2] = src128[2];
+}
+
+static inline void __attribute__ ((__always_inline__))
+rte_mov64(uint8_t *restrict dst, const uint8_t *restrict src)
+{
+	__uint128_t *restrict dst128 = (__uint128_t *restrict)dst;
+	const __uint128_t *restrict src128 = (const __uint128_t *restrict)src;
+	dst128[0] = src128[0];
+	dst128[1] = src128[1];
+	dst128[2] = src128[2];
+	dst128[3] = src128[3];
+}
+
+static inline void __attribute__ ((__always_inline__))
+rte_mov128(uint8_t *restrict dst, const uint8_t *restrict src)
+{
+	rte_mov64(dst, src);
+	rte_mov64(dst + 64, src + 64);
+}
+
+static inline void __attribute__ ((__always_inline__))
+rte_mov256(uint8_t *restrict dst, const uint8_t *restrict src)
+{
+	rte_mov128(dst, src);
+	rte_mov128(dst + 128, src + 128);
+}
+
+static inline void __attribute__ ((__always_inline__))
+rte_memcpy_lt16(uint8_t *restrict dst, const uint8_t *restrict src, size_t n)
+{
+	if (n & 0x08) {
+		/* copy 8 ~ 15 bytes */
+		*(uint64_t *)dst = *(const uint64_t *)src;
+		*(uint64_t *)(dst - 8 + n) = *(const uint64_t *)(src - 8 + n);
+	} else if (n & 0x04) {
+		/* copy 4 ~ 7 bytes */
+		*(uint32_t *)dst = *(const uint32_t *)src;
+		*(uint32_t *)(dst - 4 + n) = *(const uint32_t *)(src - 4 + n);
+	} else if (n & 0x02) {
+		/* copy 2 ~ 3 bytes */
+		*(uint16_t *)dst = *(const uint16_t *)src;
+		*(uint16_t *)(dst - 2 + n) = *(const uint16_t *)(src - 2 + n);
+	} else if (n & 0x01) {
+		/* copy 1 byte */
+		*dst = *src;
+	}
+}
+
+static inline void __attribute__ ((__always_inline__))
+rte_memcpy_ge16_lt64
+(uint8_t *restrict dst, const uint8_t *restrict src, size_t n)
+{
+	if (n == 16) {
+		rte_mov16(dst, src);
+	} else if (n <= 32) {
+		rte_mov16(dst, src);
+		rte_mov16(dst - 16 + n, src - 16 + n);
+	} else if (n <= 48) {
+		rte_mov32(dst, src);
+		rte_mov16(dst - 16 + n, src - 16 + n);
+	} else {
+		rte_mov48(dst, src);
+		rte_mov16(dst - 16 + n, src - 16 + n);
+	}
+}
+
+static inline void __attribute__ ((__always_inline__))
+rte_memcpy_ge64(uint8_t *restrict dst, const uint8_t *restrict src, size_t n)
+{
+	do {
+		rte_mov64(dst, src);
+		src += 64;
+		dst += 64;
+		n -= 64;
+	} while (likely(n >= 64));
+
+	if (likely(n)) {
+		if (n > 48)
+			rte_mov64(dst - 64 + n, src - 64 + n);
+		else if (n > 32)
+			rte_mov48(dst - 48 + n, src - 48 + n);
+		else if (n > 16)
+			rte_mov32(dst - 32 + n, src - 32 + n);
+		else
+			rte_mov16(dst - 16 + n, src - 16 + n);
+	}
+}
+
+static inline void *__attribute__ ((__always_inline__))
+rte_memcpy(void *restrict dst, const void *restrict src, size_t n)
+{
+	if (n < 16) {
+		rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
+		return dst;
+	}
+	if (n < 64) {
+		rte_memcpy_ge16_lt64((uint8_t *)dst, (const uint8_t *)src, n);
+		return dst;
+	}
+	__builtin_prefetch(src, 0, 0);
+	__builtin_prefetch(dst, 1, 0);
+	if (likely(
+		  (!IS_UNALIGNED_COPY(dst, src) && n <= ALIGNED_THRESHOLD)
+		   || (IS_UNALIGNED_COPY(dst, src) && n <= UNALIGNED_THRESHOLD)
+		  )) {
+		rte_memcpy_ge64((uint8_t *)dst, (const uint8_t *)src, n);
+		return dst;
+	} else
+		return memcpy(dst, src, n);
+}
+
+
+#else
 static inline void
 rte_mov16(uint8_t *dst, const uint8_t *src)
 {
@@ -80,6 +273,8 @@
 
 #define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
 
+#endif
+
 #ifdef __cplusplus
 }
 #endif
-- 
1.8.3.1


* Re: [PATCH v2] arch/arm: optimization for memcpy on AArch64
  2017-12-05  6:02 ` [PATCH v2] " Herbert Guan
@ 2017-12-15  3:41   ` Jerin Jacob
  0 siblings, 0 replies; 27+ messages in thread
From: Jerin Jacob @ 2017-12-15  3:41 UTC (permalink / raw)
  To: Herbert Guan; +Cc: dev, pbhagavatula, jianbo.liu

-----Original Message-----
> Date: Tue, 5 Dec 2017 14:02:03 +0800
> From: Herbert Guan <herbert.guan@arm.com>
> To: dev@dpdk.org
> CC: jerin.jacob@caviumnetworks.com, pbhagavatula@caviumnetworks.com,
>  jianbo.liu@arm.com, Herbert Guan <herbert.guan@arm.com>
> Subject: [PATCH v2] arch/arm: optimization for memcpy on AArch64
> X-Mailer: git-send-email 1.8.3.1
> 
> This patch provides an option to do rte_memcpy() using 'restrict'
> qualifier, which can induce GCC to do optimizations by using more
> efficient instructions, providing some performance gain over memcpy()
> on some AArch64 platforms/enviroments.
> 
> The memory copy performance differs between different AArch64
> platforms. And a more recent glibc (e.g. 2.23 or later)
> can provide a better memcpy() performance compared to old glibc
> versions. It's always suggested to use a more recent glibc if
> possible, from which the entire system can get benefit. If for some
> reason an old glibc has to be used, this patch is provided for an
> alternative.
> 
> This implementation can improve memory copy on some AArch64
> platforms, when an old glibc (e.g. 2.19, 2.17...) is being used.
> It is disabled by default and needs "RTE_ARCH_ARM64_MEMCPY"
> defined to activate. It's not always proving better performance
> than memcpy() so users need to run DPDK unit test
> "memcpy_perf_autotest" and customize parameters in "customization
> section" in rte_memcpy_64.h for best performance.
> 
> Compiler version will also impact the rte_memcpy() performance.
> It's observed on some platforms and with the same code, GCC 7.2.0
> compiled binary can provide better performance than GCC 4.8.5. It's
> suggested to use GCC 5.4.0 or later.

Description looks good.

> 
> Signed-off-by: Herbert Guan <herbert.guan@arm.com>
> ---
>  config/common_armv8a_linuxapp                      |   6 +
>  .../common/include/arch/arm/rte_memcpy_64.h        | 195 +++++++++++++++++++++
>  2 files changed, 201 insertions(+)
> 
> diff --git a/config/common_armv8a_linuxapp b/config/common_armv8a_linuxapp
> index 6732d1e..158ce00 100644
> --- a/config/common_armv8a_linuxapp
> +++ b/config/common_armv8a_linuxapp
> @@ -44,6 +44,12 @@ CONFIG_RTE_FORCE_INTRINSICS=y
>  # to address minimum DMA alignment across all arm64 implementations.
>  CONFIG_RTE_CACHE_LINE_SIZE=128
>  
> +# Accelarate rte_memcpy.  Be sure to run unit test to determine the
> +# best threshold in code.  Refer to notes in source file
> +# (lib/librte_eam/common/include/arch/arm/rte_memcpy_64.h) for more

s/librte_eam/librte_eal

> +# info.
> +CONFIG_RTE_ARCH_ARM64_MEMCPY=n
> +
>  CONFIG_RTE_LIBRTE_FM10K_PMD=n
>  CONFIG_RTE_LIBRTE_SFC_EFX_PMD=n
>  CONFIG_RTE_LIBRTE_AVP_PMD=n
> diff --git a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> index b80d8ba..a6ad286 100644
> --- a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> +++ b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> @@ -42,6 +42,199 @@
>  
>  #include "generic/rte_memcpy.h"
>  
> +#ifdef RTE_ARCH_ARM64_MEMCPY
> +#include <rte_common.h>
> +#include <rte_branch_prediction.h>
> +
> +/*******************************************************************************

Please remove the "*******************************". Standard C comments don't have that.

> + * The memory copy performance differs on different AArch64 micro-architectures.
> + * And the most recent glibc (e.g. 2.23 or later) can provide a better memcpy()
> + * performance compared to old glibc versions. It's always suggested to use a
> + * more recent glibc if possible, from which the entire system can get benefit.
> + *
> + * This implementation improves memory copy on some aarch64 micro-architectures,
> + * when an old glibc (e.g. 2.19, 2.17...) is being used. It is disabled by
> + * default and needs "RTE_ARCH_ARM64_MEMCPY" defined to activate. It's not
> + * always providing better performance than memcpy() so users need to run unit
> + * test "memcpy_perf_autotest" and customize parameters in customization section
> + * below for best performance.
> + *
> + * Compiler version will also impact the rte_memcpy() performance. It's observed
> + * on some platforms and with the same code, GCC 7.2.0 compiled binaries can
> + * provide better performance than GCC 4.8.5 compiled binaries.
> + ******************************************************************************/
> +
> +/**************************************
> + * Beginning of customization section
> + **************************************/
> +#define ALIGNMENT_MASK 0x0F
> +#ifndef RTE_ARCH_ARM64_MEMCPY_STRICT_ALIGN
> +/* Only src unalignment will be treaed as unaligned copy */
> +#define IS_UNALIGNED_COPY(dst, src) ((uintptr_t)(dst) & ALIGNMENT_MASK)
> +#else
> +/* Both dst and src unalignment will be treated as unaligned copy */
> +#define IS_UNALIGNED_COPY(dst, src) \
> +		(((uintptr_t)(dst) | (uintptr_t)(src)) & ALIGNMENT_MASK)
> +#endif
> +
> +
> +/*
> + * If copy size is larger than threshold, memcpy() will be used.
> + * Run "memcpy_perf_autotest" to determine the proper threshold.
> + */
> +#define ALIGNED_THRESHOLD       ((size_t)(0xffffffff))
> +#define UNALIGNED_THRESHOLD     ((size_t)(0xffffffff))
> +
> +
> +/**************************************
> + * End of customization section
> + **************************************/
> +#ifdef RTE_TOOLCHAIN_GCC
> +#if (GCC_VERSION < 50400)
> +#warning "The GCC version is quite old, which may result in sub-optimal \
> +performance of the compiled code. It is suggested that at least GCC 5.4.0 \
> +be used."
> +#endif
> +#endif
> +
> +static inline void __attribute__ ((__always_inline__))
> +rte_mov16(uint8_t *restrict dst, const uint8_t *restrict src)
> +{
> +	__uint128_t *restrict dst128 = (__uint128_t *restrict)dst;
> +	const __uint128_t *restrict src128 = (const __uint128_t *restrict)src;
> +	*dst128 = *src128;
> +}
> +
> +static inline void __attribute__ ((__always_inline__))
> +rte_mov32(uint8_t *restrict dst, const uint8_t *restrict src)
> +{
> +	__uint128_t *restrict dst128 = (__uint128_t *restrict)dst;
> +	const __uint128_t *restrict src128 = (const __uint128_t *restrict)src;
> +	dst128[0] = src128[0];
> +	dst128[1] = src128[1];
> +}
> +
> +static inline void __attribute__ ((__always_inline__))
> +rte_mov48(uint8_t *restrict dst, const uint8_t *restrict src)
> +{
> +	__uint128_t *restrict dst128 = (__uint128_t *restrict)dst;
> +	const __uint128_t *restrict src128 = (const __uint128_t *restrict)src;
> +	dst128[0] = src128[0];
> +	dst128[1] = src128[1];
> +	dst128[2] = src128[2];
> +}
> +
> +static inline void __attribute__ ((__always_inline__))
> +rte_mov64(uint8_t *restrict dst, const uint8_t *restrict src)
> +{
> +	__uint128_t *restrict dst128 = (__uint128_t *restrict)dst;
> +	const __uint128_t *restrict src128 = (const __uint128_t *restrict)src;
> +	dst128[0] = src128[0];
> +	dst128[1] = src128[1];
> +	dst128[2] = src128[2];
> +	dst128[3] = src128[3];
> +}
> +
> +static inline void __attribute__ ((__always_inline__))
> +rte_mov128(uint8_t *restrict dst, const uint8_t *restrict src)
> +{
> +	rte_mov64(dst, src);
> +	rte_mov64(dst + 64, src + 64);
> +}
> +
> +static inline void __attribute__ ((__always_inline__))
> +rte_mov256(uint8_t *restrict dst, const uint8_t *restrict src)
> +{
> +	rte_mov128(dst, src);
> +	rte_mov128(dst + 128, src + 128);
> +}
> +
> +static inline void __attribute__ ((__always_inline__))
> +rte_memcpy_lt16(uint8_t *restrict dst, const uint8_t *restrict src, size_t n)
> +{
> +	if (n & 0x08) {
> +		/* copy 8 ~ 15 bytes */
> +		*(uint64_t *)dst = *(const uint64_t *)src;
> +		*(uint64_t *)(dst - 8 + n) = *(const uint64_t *)(src - 8 + n);
> +	} else if (n & 0x04) {
> +		/* copy 4 ~ 7 bytes */
> +		*(uint32_t *)dst = *(const uint32_t *)src;
> +		*(uint32_t *)(dst - 4 + n) = *(const uint32_t *)(src - 4 + n);
> +	} else if (n & 0x02) {
> +		/* copy 2 ~ 3 bytes */
> +		*(uint16_t *)dst = *(const uint16_t *)src;
> +		*(uint16_t *)(dst - 2 + n) = *(const uint16_t *)(src - 2 + n);
> +	} else if (n & 0x01) {
> +		/* copy 1 byte */
> +		*dst = *src;
> +	}
> +}
> +
> +static inline void __attribute__ ((__always_inline__))
> +rte_memcpy_ge16_lt64
> +(uint8_t *restrict dst, const uint8_t *restrict src, size_t n)
> +{
> +	if (n == 16) {
> +		rte_mov16(dst, src);
> +	} else if (n <= 32) {
> +		rte_mov16(dst, src);
> +		rte_mov16(dst - 16 + n, src - 16 + n);
> +	} else if (n <= 48) {
> +		rte_mov32(dst, src);
> +		rte_mov16(dst - 16 + n, src - 16 + n);
> +	} else {
> +		rte_mov48(dst, src);
> +		rte_mov16(dst - 16 + n, src - 16 + n);
> +	}
> +}
> +
> +static inline void __attribute__ ((__always_inline__))
> +rte_memcpy_ge64(uint8_t *restrict dst, const uint8_t *restrict src, size_t n)
> +{
> +	do {
> +		rte_mov64(dst, src);
> +		src += 64;
> +		dst += 64;
> +		n -= 64;
> +	} while (likely(n >= 64));
> +
> +	if (likely(n)) {
> +		if (n > 48)
> +			rte_mov64(dst - 64 + n, src - 64 + n);
> +		else if (n > 32)
> +			rte_mov48(dst - 48 + n, src - 48 + n);
> +		else if (n > 16)
> +			rte_mov32(dst - 32 + n, src - 32 + n);
> +		else
> +			rte_mov16(dst - 16 + n, src - 16 + n);
> +	}
> +}
> +
> +static inline void *__attribute__ ((__always_inline__))
> +rte_memcpy(void *restrict dst, const void *restrict src, size_t n)
> +{
> +	if (n < 16) {
> +		rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
> +		return dst;
> +	}
> +	if (n < 64) {
> +		rte_memcpy_ge16_lt64((uint8_t *)dst, (const uint8_t *)src, n);
> +		return dst;
> +	}

I have a comment here; I will reply to the original thread.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] arch/arm: optimization for memcpy on AArch64
  2017-12-03 12:37   ` Herbert Guan
@ 2017-12-15  4:06     ` Jerin Jacob
  2017-12-18  2:51       ` Herbert Guan
  0 siblings, 1 reply; 27+ messages in thread
From: Jerin Jacob @ 2017-12-15  4:06 UTC (permalink / raw)
  To: Herbert Guan; +Cc: Jianbo Liu, dev

-----Original Message-----
> Date: Sun, 3 Dec 2017 12:37:30 +0000
> From: Herbert Guan <Herbert.Guan@arm.com>
> To: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> CC: Jianbo Liu <Jianbo.Liu@arm.com>, "dev@dpdk.org" <dev@dpdk.org>
> Subject: RE: [PATCH] arch/arm: optimization for memcpy on AArch64
> 
> Jerin,

Hi Herbert,

> 
> Thanks a lot for your review and comments.  Please find my comments below inline.
> 
> Best regards,
> Herbert
> 
> > -----Original Message-----
> > From: Jerin Jacob [mailto:jerin.jacob@caviumnetworks.com]
> > Sent: Wednesday, November 29, 2017 20:32
> > To: Herbert Guan <Herbert.Guan@arm.com>
> > Cc: Jianbo Liu <Jianbo.Liu@arm.com>; dev@dpdk.org
> > Subject: Re: [PATCH] arch/arm: optimization for memcpy on AArch64
> >
> > -----Original Message-----
> > > Date: Mon, 27 Nov 2017 15:49:45 +0800
> > > From: Herbert Guan <herbert.guan@arm.com>
> > > To: jerin.jacob@caviumnetworks.com, jianbo.liu@arm.com, dev@dpdk.org
> > > CC: Herbert Guan <herbert.guan@arm.com>
> > > Subject: [PATCH] arch/arm: optimization for memcpy on AArch64
> > > X-Mailer: git-send-email 1.8.3.1
> > > +
> > > +/**************************************
> > > + * Beginning of customization section
> > > +**************************************/
> > > +#define ALIGNMENT_MASK 0x0F
> > > +#ifndef RTE_ARCH_ARM64_MEMCPY_STRICT_ALIGN
> > > +// Only src unalignment will be treaed as unaligned copy
> >
> > C++ style comments. It may generate check patch errors.
> 
> I'll change it to use C style comment in the version 2.
> 
> >
> > > +#define IS_UNALIGNED_COPY(dst, src) ((uintptr_t)(dst) &
> > > +ALIGNMENT_MASK) #else // Both dst and src unalignment will be treated
> > > +as unaligned copy #define IS_UNALIGNED_COPY(dst, src) \
> > > +(((uintptr_t)(dst) | (uintptr_t)(src)) & ALIGNMENT_MASK)
> > #endif
> > > +
> > > +
> > > +// If copy size is larger than threshold, memcpy() will be used.
> > > +// Run "memcpy_perf_autotest" to determine the proper threshold.
> > > +#define ALIGNED_THRESHOLD       ((size_t)(0xffffffff))
> > > +#define UNALIGNED_THRESHOLD     ((size_t)(0xffffffff))
> >
> > Do you see any case where this threshold is useful.
> 
> Yes, on some platforms, and/or with some glibc versions, the glibc memcpy has better performance for larger sizes (e.g., >512, >4096...).  So developers should run the unit test to find the best threshold.  The default value of 0xffffffff should be replaced with evaluated values.

OK
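
Purely as an illustration of that tuning step (the values below are hypothetical, taken from nothing in the patch -- they would come from running memcpy_perf_autotest on the target):

/* Example values after profiling one target with memcpy_perf_autotest:
 * copies larger than the threshold fall back to glibc memcpy(). */
#define ALIGNED_THRESHOLD       ((size_t)(4096))
#define UNALIGNED_THRESHOLD     ((size_t)(512))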

> 
> >
> > > +
> > > +static inline void *__attribute__ ((__always_inline__))i

use __rte_always_inline

> > > +rte_memcpy(void *restrict dst, const void *restrict src, size_t n)
> > > +{
> > > +if (n < 16) {
> > > +rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
> > > +return dst;
> > > +}
> > > +if (n < 64) {
> > > +rte_memcpy_ge16_lt64((uint8_t *)dst, (const uint8_t *)src,
> > n);
> > > +return dst;
> > > +}
> >
> > Unfortunately we have 128B cache arm64 implementation too. Could you
> > please take care that based on RTE_CACHE_LINE_SIZE
> >
> 
> Here the value of '64' is not the cache line size.  But since the prefetch itself will cost some cycles, it's not worthwhile to prefetch for small (e.g. < 64 bytes) copies.  Per my tests, prefetching for small copies will actually lower the performance.

But
I think '64' is a function of the cache line size, i.e. is there any reason why we haven't used an rte_memcpy_ge16_lt128()/rte_memcpy_ge128() pair instead of the rte_memcpy_ge16_lt64()/rte_memcpy_ge64() pair?
I think, if you can add one more conditional compilation to choose between rte_memcpy_ge16_lt128()/rte_memcpy_ge128() and rte_memcpy_ge16_lt64()/rte_memcpy_ge64(), it
will address all the arm64 variants supported in current DPDK.

> 
> On the other hand, I can only find one 128B cache line aarch64 machine here.  And there do exist some specific optimizations for this machine.  I'm not sure if they'll be beneficial for other 128B cache machines or not.  I prefer not to put them in this patch but in a later standalone patch specific to 128B cache machines.
> 
> > > +__builtin_prefetch(src, 0, 0);  // rte_prefetch_non_temporal(src);
> > > +__builtin_prefetch(dst, 1, 0);  //  * unchanged *

# Why is __builtin_prefetch used only once? Why not invoke it in the rte_memcpy_ge64 loop?
# Does it make sense to prefetch src + 64/128 * n?

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] arch/arm: optimization for memcpy on AArch64
  2017-12-15  4:06     ` Jerin Jacob
@ 2017-12-18  2:51       ` Herbert Guan
  2017-12-18  4:17         ` Jerin Jacob
  0 siblings, 1 reply; 27+ messages in thread
From: Herbert Guan @ 2017-12-18  2:51 UTC (permalink / raw)
  To: Jerin Jacob; +Cc: Jianbo Liu, dev

Hi Jerin,

> -----Original Message-----
> From: Jerin Jacob [mailto:jerin.jacob@caviumnetworks.com]
> Sent: Friday, December 15, 2017 12:06
> To: Herbert Guan <Herbert.Guan@arm.com>
> Cc: Jianbo Liu <Jianbo.Liu@arm.com>; dev@dpdk.org
> Subject: Re: [PATCH] arch/arm: optimization for memcpy on AArch64
>
> -----Original Message-----
> > Date: Sun, 3 Dec 2017 12:37:30 +0000
> > From: Herbert Guan <Herbert.Guan@arm.com>
> > To: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> > CC: Jianbo Liu <Jianbo.Liu@arm.com>, "dev@dpdk.org" <dev@dpdk.org>
> > Subject: RE: [PATCH] arch/arm: optimization for memcpy on AArch64
> >
> > Jerin,
>
> Hi Herbert,
>
> >
> > Thanks a lot for your review and comments.  Please find my comments
> below inline.
> >
> > Best regards,
> > Herbert
> >
> > > -----Original Message-----
> > > From: Jerin Jacob [mailto:jerin.jacob@caviumnetworks.com]
> > > Sent: Wednesday, November 29, 2017 20:32
> > > To: Herbert Guan <Herbert.Guan@arm.com>
> > > Cc: Jianbo Liu <Jianbo.Liu@arm.com>; dev@dpdk.org
> > > Subject: Re: [PATCH] arch/arm: optimization for memcpy on AArch64
> > >
> > > -----Original Message-----
> > > > Date: Mon, 27 Nov 2017 15:49:45 +0800
> > > > From: Herbert Guan <herbert.guan@arm.com>
> > > > To: jerin.jacob@caviumnetworks.com, jianbo.liu@arm.com,
> dev@dpdk.org
> > > > CC: Herbert Guan <herbert.guan@arm.com>
> > > > Subject: [PATCH] arch/arm: optimization for memcpy on AArch64
> > > > X-Mailer: git-send-email 1.8.3.1
> > > > +
> > > > +/**************************************
> > > > + * Beginning of customization section
> > > > +**************************************/
> > > > +#define ALIGNMENT_MASK 0x0F
> > > > +#ifndef RTE_ARCH_ARM64_MEMCPY_STRICT_ALIGN
> > > > +// Only src unalignment will be treaed as unaligned copy
> > >
> > > C++ style comments. It may generate check patch errors.
> >
> > I'll change it to use C style comment in the version 2.
> >
> > >
> > > > +#define IS_UNALIGNED_COPY(dst, src) ((uintptr_t)(dst) &
> > > > +ALIGNMENT_MASK) #else // Both dst and src unalignment will be
> treated
> > > > +as unaligned copy #define IS_UNALIGNED_COPY(dst, src) \
> > > > +(((uintptr_t)(dst) | (uintptr_t)(src)) & ALIGNMENT_MASK)
> > > #endif
> > > > +
> > > > +
> > > > +// If copy size is larger than threshold, memcpy() will be used.
> > > > +// Run "memcpy_perf_autotest" to determine the proper threshold.
> > > > +#define ALIGNED_THRESHOLD       ((size_t)(0xffffffff))
> > > > +#define UNALIGNED_THRESHOLD     ((size_t)(0xffffffff))
> > >
> > > Do you see any case where this threshold is useful.
> >
> > Yes, on some platforms, and/or with some glibc version,  the glibc memcpy
> has better performance in larger size (e.g., >512, >4096...).  So developers
> should run unit test to find the best threshold.  The default value of 0xffffffff
> should be modified with evaluated values.
>
> OK
>
> >
> > >
> > > > +
> > > > +static inline void *__attribute__ ((__always_inline__))i
>
> use __rte_always_inline

Applied in V3 patch.

>
> > > > +rte_memcpy(void *restrict dst, const void *restrict src, size_t n)
> > > > +{
> > > > +if (n < 16) {
> > > > +rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
> > > > +return dst;
> > > > +}
> > > > +if (n < 64) {
> > > > +rte_memcpy_ge16_lt64((uint8_t *)dst, (const uint8_t *)src,
> > > n);
> > > > +return dst;
> > > > +}
> > >
> > > Unfortunately we have 128B cache arm64 implementation too. Could you
> > > please take care that based on RTE_CACHE_LINE_SIZE
> > >
> >
> > Here the value of '64' is not the cache line size.  But for the reason that
> prefetch itself will cost some cycles, it's not worthwhile to do prefetch for
> small size (e.g. < 64 bytes) copy.  Per my test, prefetching for small size copy
> will actually lower the performance.
>
> But
> I think, '64' is a function of cache size. ie. Any reason why we haven't used
> rte_memcpy_ge16_lt128()/rte_memcpy_ge128 pair instead of
> rte_memcpy_ge16_lt64//rte_memcpy_ge64 pair?
> I think, if you can add one more conditional compilation to choose between
> rte_memcpy_ge16_lt128()/rte_memcpy_ge128 vs
> rte_memcpy_ge16_lt64//rte_memcpy_ge64,
> will address the all arm64 variants supported in current DPDK.
>

The logic for 128B cache is implemented as you've suggested, and has been added in V3 patch.

> >
> > In the other hand, I can only find one 128B cache line aarch64 machine here.
> And it do exist some specific optimization for this machine.  Not sure if it'll be
> beneficial for other 128B cache machines or not.  I prefer not to put it in this
> patch but in a later standalone specific patch for 128B cache machines.
> >
> > > > +__builtin_prefetch(src, 0, 0);  // rte_prefetch_non_temporal(src);
> > > > +__builtin_prefetch(dst, 1, 0);  //  * unchanged *
>
> # Why only once __builtin_prefetch used? Why not invoke in
> rte_memcpy_ge64 loop
> # Does it make sense to prefetch src + 64/128 * n

Prefetch is only necessary once at the beginning.  The CPU will do automatic incremental prefetch once the continuous memory access starts.  It's not necessary to prefetch in the loop.  In fact, doing it in the loop will break the CPU's HW prefetcher and degrade the performance.
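
A minimal sketch of that pattern (essentially the patch's rte_memcpy_ge64() loop with the prefetch hints hoisted in front of it, as rte_memcpy() does); illustrative only:

__builtin_prefetch(src, 0, 0);	/* one read hint, non-temporal */
__builtin_prefetch(dst, 1, 0);	/* one write hint, non-temporal */
do {
	/* no further prefetch inside the loop; the HW prefetcher takes over
	 * once the streaming access pattern is established */
	rte_mov64(dst, src);
	src += 64;
	dst += 64;
	n -= 64;
} while (n >= 64);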
IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH v3] arch/arm: optimization for memcpy on AArch64
  2017-11-27  7:49 [PATCH] arch/arm: optimization for memcpy on AArch64 Herbert Guan
                   ` (2 preceding siblings ...)
  2017-12-05  6:02 ` [PATCH v2] " Herbert Guan
@ 2017-12-18  2:54 ` Herbert Guan
  2017-12-18  7:43   ` Jerin Jacob
  2017-12-21  5:33   ` [PATCH v4] " Herbert Guan
  2018-01-04 10:20 ` [PATCH v5] " Herbert Guan
  4 siblings, 2 replies; 27+ messages in thread
From: Herbert Guan @ 2017-12-18  2:54 UTC (permalink / raw)
  To: dev, jerin.jacob; +Cc: Herbert Guan

This patch provides an option to do rte_memcpy() using 'restrict'
qualifier, which can induce GCC to do optimizations by using more
efficient instructions, providing some performance gain over memcpy()
on some AArch64 platforms/environments.

The memory copy performance differs between different AArch64
platforms. And a more recent glibc (e.g. 2.23 or later)
can provide a better memcpy() performance compared to old glibc
versions. It's always suggested to use a more recent glibc if
possible, from which the entire system can get benefit. If for some
reason an old glibc has to be used, this patch is provided for an
alternative.

This implementation can improve memory copy on some AArch64
platforms, when an old glibc (e.g. 2.19, 2.17...) is being used.
It is disabled by default and needs "RTE_ARCH_ARM64_MEMCPY"
defined to activate. It's not always providing better performance
than memcpy() so users need to run DPDK unit test
"memcpy_perf_autotest" and customize parameters in "customization
section" in rte_memcpy_64.h for best performance.

Compiler version will also impact the rte_memcpy() performance.
It's observed on some platforms and with the same code, GCC 7.2.0
compiled binary can provide better performance than GCC 4.8.5. It's
suggested to use GCC 5.4.0 or later.

Signed-off-by: Herbert Guan <herbert.guan@arm.com>
---
 config/common_armv8a_linuxapp                      |   6 +
 .../common/include/arch/arm/rte_memcpy_64.h        | 292 +++++++++++++++++++++
 2 files changed, 298 insertions(+)

diff --git a/config/common_armv8a_linuxapp b/config/common_armv8a_linuxapp
index 6732d1e..8f0cbed 100644
--- a/config/common_armv8a_linuxapp
+++ b/config/common_armv8a_linuxapp
@@ -44,6 +44,12 @@ CONFIG_RTE_FORCE_INTRINSICS=y
 # to address minimum DMA alignment across all arm64 implementations.
 CONFIG_RTE_CACHE_LINE_SIZE=128
 
+# Accelarate rte_memcpy.  Be sure to run unit test to determine the
+# best threshold in code.  Refer to notes in source file
+# (lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h) for more
+# info.
+CONFIG_RTE_ARCH_ARM64_MEMCPY=n
+
 CONFIG_RTE_LIBRTE_FM10K_PMD=n
 CONFIG_RTE_LIBRTE_SFC_EFX_PMD=n
 CONFIG_RTE_LIBRTE_AVP_PMD=n
diff --git a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
index b80d8ba..1ea275d 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
@@ -42,6 +42,296 @@
 
 #include "generic/rte_memcpy.h"
 
+#ifdef RTE_ARCH_ARM64_MEMCPY
+#include <rte_common.h>
+#include <rte_branch_prediction.h>
+
+/*
+ * The memory copy performance differs on different AArch64 micro-architectures.
+ * And the most recent glibc (e.g. 2.23 or later) can provide a better memcpy()
+ * performance compared to old glibc versions. It's always suggested to use a
+ * more recent glibc if possible, from which the entire system can get benefit.
+ *
+ * This implementation improves memory copy on some aarch64 micro-architectures,
+ * when an old glibc (e.g. 2.19, 2.17...) is being used. It is disabled by
+ * default and needs "RTE_ARCH_ARM64_MEMCPY" defined to activate. It's not
+ * always providing better performance than memcpy() so users need to run unit
+ * test "memcpy_perf_autotest" and customize parameters in customization section
+ * below for best performance.
+ *
+ * Compiler version will also impact the rte_memcpy() performance. It's observed
+ * on some platforms and with the same code, GCC 7.2.0 compiled binaries can
+ * provide better performance than GCC 4.8.5 compiled binaries.
+ */
+
+/**************************************
+ * Beginning of customization section
+ **************************************/
+#define ALIGNMENT_MASK 0x0F
+#ifndef RTE_ARCH_ARM64_MEMCPY_STRICT_ALIGN
+/* Only src unalignment will be treaed as unaligned copy */
+#define IS_UNALIGNED_COPY(dst, src) ((uintptr_t)(dst) & ALIGNMENT_MASK)
+#else
+/* Both dst and src unalignment will be treated as unaligned copy */
+#define IS_UNALIGNED_COPY(dst, src) \
+		(((uintptr_t)(dst) | (uintptr_t)(src)) & ALIGNMENT_MASK)
+#endif
+
+
+/*
+ * If copy size is larger than threshold, memcpy() will be used.
+ * Run "memcpy_perf_autotest" to determine the proper threshold.
+ */
+#define ALIGNED_THRESHOLD       ((size_t)(0xffffffff))
+#define UNALIGNED_THRESHOLD     ((size_t)(0xffffffff))
+
+/**************************************
+ * End of customization section
+ **************************************/
+#ifdef RTE_TOOLCHAIN_GCC
+#if (GCC_VERSION < 50400)
+#warning "The GCC version is quite old, which may result in sub-optimal \
+performance of the compiled code. It is suggested that at least GCC 5.4.0 \
+be used."
+#endif
+#endif
+
+static __rte_always_inline void
+rte_mov16(uint8_t *restrict dst, const uint8_t *restrict src)
+{
+	__uint128_t *restrict dst128 = (__uint128_t *restrict)dst;
+	const __uint128_t *restrict src128 = (const __uint128_t *restrict)src;
+	*dst128 = *src128;
+}
+
+static __rte_always_inline void
+rte_mov32(uint8_t *restrict dst, const uint8_t *restrict src)
+{
+	__uint128_t *restrict dst128 = (__uint128_t *restrict)dst;
+	const __uint128_t *restrict src128 = (const __uint128_t *restrict)src;
+	dst128[0] = src128[0];
+	dst128[1] = src128[1];
+}
+
+static __rte_always_inline void
+rte_mov48(uint8_t *restrict dst, const uint8_t *restrict src)
+{
+	__uint128_t *restrict dst128 = (__uint128_t *restrict)dst;
+	const __uint128_t *restrict src128 = (const __uint128_t *restrict)src;
+	dst128[0] = src128[0];
+	dst128[1] = src128[1];
+	dst128[2] = src128[2];
+}
+
+static __rte_always_inline void
+rte_mov64(uint8_t *restrict dst, const uint8_t *restrict src)
+{
+	__uint128_t *restrict dst128 = (__uint128_t *restrict)dst;
+	const __uint128_t *restrict src128 = (const __uint128_t *restrict)src;
+	dst128[0] = src128[0];
+	dst128[1] = src128[1];
+	dst128[2] = src128[2];
+	dst128[3] = src128[3];
+}
+
+static __rte_always_inline void
+rte_mov128(uint8_t *restrict dst, const uint8_t *restrict src)
+{
+	/*
+	 * GCC-generated code does not have good performance in the 128-byte
+	 * case on some platforms.  Using ASM code can avoid such a downgrade.
+	 */
+	register uint64_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8,
+			tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+	__asm__  __volatile__ (
+		"ldp %2, %3, [%0]\n\t"
+		"ldp %4, %5, [%0, 16]\n\t"
+		"ldp %6, %7, [%0, 32]\n\t"
+		"ldp %8, %9, [%0, 48]\n\t"
+		"ldp %10, %11, [%0, 64]\n\t"
+		"ldp %12, %13, [%0, 80]\n\t"
+		"ldp %14, %15, [%0, 96]\n\t"
+		"ldp %16, %17, [%0, 112]\n\t"
+		"stp %2, %3, [%1]\n\t"
+		"stp %4, %5, [%1, 16]\n\t"
+		"stp %6, %7, [%1, 32]\n\t"
+		"stp %8, %9, [%1, 48]\n\t"
+		"stp %10, %11, [%1, 64]\n\t"
+		"stp %12, %13, [%1, 80]\n\t"
+		"stp %14, %15, [%1, 96]\n\t"
+		"stp %16, %17, [%1, 112]\n\t"
+		: "+r" (src), "+r" (dst),
+		  "=&r" (tmp1), "=&r" (tmp2), "=&r" (tmp3), "=&r" (tmp4),
+		  "=&r" (tmp5), "=&r" (tmp6), "=&r" (tmp7), "=&r" (tmp8),
+		  "=&r" (tmp9), "=&r" (tmp10), "=&r" (tmp11), "=&r" (tmp12),
+		  "=&r" (tmp13), "=&r" (tmp14), "=&r" (tmp15), "=&r" (tmp16)
+		:
+		: "memory");
+}
+
+static __rte_always_inline void
+rte_mov256(uint8_t *restrict dst, const uint8_t *restrict src)
+{
+	rte_mov128(dst, src);
+	rte_mov128(dst + 128, src + 128);
+}
+
+static __rte_always_inline void
+rte_memcpy_lt16(uint8_t *restrict dst, const uint8_t *restrict src, size_t n)
+{
+	if (n & 0x08) {
+		/* copy 8 ~ 15 bytes */
+		*(uint64_t *)dst = *(const uint64_t *)src;
+		*(uint64_t *)(dst - 8 + n) = *(const uint64_t *)(src - 8 + n);
+	} else if (n & 0x04) {
+		/* copy 4 ~ 7 bytes */
+		*(uint32_t *)dst = *(const uint32_t *)src;
+		*(uint32_t *)(dst - 4 + n) = *(const uint32_t *)(src - 4 + n);
+	} else if (n & 0x02) {
+		/* copy 2 ~ 3 bytes */
+		*(uint16_t *)dst = *(const uint16_t *)src;
+		*(uint16_t *)(dst - 2 + n) = *(const uint16_t *)(src - 2 + n);
+	} else if (n & 0x01) {
+		/* copy 1 byte */
+		*dst = *src;
+	}
+}
+
+
+#if RTE_CACHE_LINE_SIZE >= 128
+static __rte_always_inline void
+rte_memcpy_ge16_lt128
+(uint8_t *restrict dst, const uint8_t *restrict src, size_t n)
+{
+	if (n < 64) {
+		if (n == 16) {
+			rte_mov16(dst, src);
+		} else if (n <= 32) {
+			rte_mov16(dst, src);
+			rte_mov16(dst - 16 + n, src - 16 + n);
+		} else if (n <= 48) {
+			rte_mov32(dst, src);
+			rte_mov16(dst - 16 + n, src - 16 + n);
+		} else {
+			rte_mov48(dst, src);
+			rte_mov16(dst - 16 + n, src - 16 + n);
+		}
+	} else {
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		if (n > 48 + 64)
+			rte_mov64(dst - 64 + n, src - 64 + n);
+		else if (n > 32 + 64)
+			rte_mov48(dst - 48 + n, src - 48 + n);
+		else if (n > 16 + 64)
+			rte_mov32(dst - 32 + n, src - 32 + n);
+		else if (n > 64)
+			rte_mov16(dst - 16 + n, src - 16 + n);
+	}
+}
+
+static __rte_always_inline void
+rte_memcpy_ge128(uint8_t *restrict dst, const uint8_t *restrict src, size_t n)
+{
+	do {
+		rte_mov128(dst, src);
+		src += 128;
+		dst += 128;
+		n -= 128;
+	} while (likely(n >= 128));
+
+	if (likely(n)) {
+		if (n <= 16)
+			rte_mov16(dst - 16 + n, src - 16 + n);
+		else if (n <= 32)
+			rte_mov32(dst - 32 + n, src - 32 + n);
+		else if (n <= 48)
+			rte_mov48(dst - 48 + n, src - 48 + n);
+		else if (n <= 64)
+			rte_mov64(dst - 64 + n, src - 64 + n);
+		else
+			rte_memcpy_ge16_lt128(dst, src, n);
+	}
+}
+
+#else
+static __rte_always_inline void
+rte_memcpy_ge16_lt64
+(uint8_t *restrict dst, const uint8_t *restrict src, size_t n)
+{
+	if (n == 16) {
+		rte_mov16(dst, src);
+	} else if (n <= 32) {
+		rte_mov16(dst, src);
+		rte_mov16(dst - 16 + n, src - 16 + n);
+	} else if (n <= 48) {
+		rte_mov32(dst, src);
+		rte_mov16(dst - 16 + n, src - 16 + n);
+	} else {
+		rte_mov48(dst, src);
+		rte_mov16(dst - 16 + n, src - 16 + n);
+	}
+}
+
+static __rte_always_inline void
+rte_memcpy_ge64(uint8_t *restrict dst, const uint8_t *restrict src, size_t n)
+{
+	do {
+		rte_mov64(dst, src);
+		src += 64;
+		dst += 64;
+		n -= 64;
+	} while (likely(n >= 64));
+
+	if (likely(n)) {
+		if (n <= 16)
+			rte_mov16(dst - 16 + n, src - 16 + n);
+		else if (n <= 32)
+			rte_mov32(dst - 32 + n, src - 32 + n);
+		else if (n <= 48)
+			rte_mov48(dst - 48 + n, src - 48 + n);
+		else
+			rte_mov64(dst - 64 + n, src - 64 + n);
+	}
+}
+
+#endif
+
+static __rte_always_inline void *
+rte_memcpy(void *restrict dst, const void *restrict src, size_t n)
+{
+	if (n < 16) {
+		rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
+		return dst;
+	}
+#if RTE_CACHE_LINE_SIZE >= 128
+	if (n < 128) {
+		rte_memcpy_ge16_lt128((uint8_t *)dst, (const uint8_t *)src, n);
+		return dst;
+	}
+#else
+	if (n < 64) {
+		rte_memcpy_ge16_lt64((uint8_t *)dst, (const uint8_t *)src, n);
+		return dst;
+	}
+#endif
+	__builtin_prefetch(src, 0, 0);
+	__builtin_prefetch(dst, 1, 0);
+	if (likely(
+		  (!IS_UNALIGNED_COPY(dst, src) && n <= ALIGNED_THRESHOLD)
+		   || (IS_UNALIGNED_COPY(dst, src) && n <= UNALIGNED_THRESHOLD)
+		  )) {
+#if RTE_CACHE_LINE_SIZE >= 128
+		rte_memcpy_ge128((uint8_t *)dst, (const uint8_t *)src, n);
+#else
+		rte_memcpy_ge64((uint8_t *)dst, (const uint8_t *)src, n);
+#endif
+		return dst;
+	} else
+		return memcpy(dst, src, n);
+}
+
+
+#else
 static inline void
 rte_mov16(uint8_t *dst, const uint8_t *src)
 {
@@ -80,6 +370,8 @@
 
 #define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
 
+#endif
+
 #ifdef __cplusplus
 }
 #endif
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [PATCH] arch/arm: optimization for memcpy on AArch64
  2017-12-18  2:51       ` Herbert Guan
@ 2017-12-18  4:17         ` Jerin Jacob
  0 siblings, 0 replies; 27+ messages in thread
From: Jerin Jacob @ 2017-12-18  4:17 UTC (permalink / raw)
  To: Herbert Guan; +Cc: Jianbo Liu, dev

-----Original Message-----
> Date: Mon, 18 Dec 2017 02:51:19 +0000
> From: Herbert Guan <Herbert.Guan@arm.com>
> To: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> CC: Jianbo Liu <Jianbo.Liu@arm.com>, "dev@dpdk.org" <dev@dpdk.org>
> Subject: RE: [PATCH] arch/arm: optimization for memcpy on AArch64
> 
> Hi Jerin,

Hi Herbert,

> > >
> > > Here the value of '64' is not the cache line size.  But for the reason that
> > prefetch itself will cost some cycles, it's not worthwhile to do prefetch for
> > small size (e.g. < 64 bytes) copy.  Per my test, prefetching for small size copy
> > will actually lower the performance.
> >
> > But
> > I think, '64' is a function of cache size. ie. Any reason why we haven't used
> > rte_memcpy_ge16_lt128()/rte_memcpy_ge128 pair instead of
> > rte_memcpy_ge16_lt64//rte_memcpy_ge64 pair?
> > I think, if you can add one more conditional compilation to choose between
> > rte_memcpy_ge16_lt128()/rte_memcpy_ge128 vs
> > rte_memcpy_ge16_lt64//rte_memcpy_ge64,
> > will address the all arm64 variants supported in current DPDK.
> >
> 
> The logic for 128B cache is implemented as you've suggested, and has been added in V3 patch.
> 
> > >
> > > In the other hand, I can only find one 128B cache line aarch64 machine here.
> > And it do exist some specific optimization for this machine.  Not sure if it'll be
> > beneficial for other 128B cache machines or not.  I prefer not to put it in this
> > patch but in a later standalone specific patch for 128B cache machines.
> > >
> > > > > +__builtin_prefetch(src, 0, 0);  // rte_prefetch_non_temporal(src);
> > > > > +__builtin_prefetch(dst, 1, 0);  //  * unchanged *
> >
> > # Why only once __builtin_prefetch used? Why not invoke in
> > rte_memcpy_ge64 loop
> > # Does it make sense to prefetch src + 64/128 * n
> 
> Prefetch is only necessary once at the beginning.  The CPU will do automatic incremental prefetch once the continuous memory access starts.  It's not necessary to prefetch in the loop.  In fact, doing it in the loop will break the CPU's HW prefetcher and degrade the performance.

Yes. But the aarch64 specification does not mandate that all implementations have a HW prefetch
mechanism (i.e. it is IMPLEMENTATION DEFINED).
I think you have provided a good start for the memcpy implementation and we
can fine-tune it later based on different micro-architectures.
Your v3 looks good.


> IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.

Please remove such notices from the public mailing list.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v3] arch/arm: optimization for memcpy on AArch64
  2017-12-18  2:54 ` [PATCH v3] " Herbert Guan
@ 2017-12-18  7:43   ` Jerin Jacob
  2017-12-19  5:33     ` Herbert Guan
  2017-12-21  5:33   ` [PATCH v4] " Herbert Guan
  1 sibling, 1 reply; 27+ messages in thread
From: Jerin Jacob @ 2017-12-18  7:43 UTC (permalink / raw)
  To: Herbert Guan; +Cc: dev

-----Original Message-----
> Date: Mon, 18 Dec 2017 10:54:24 +0800
> From: Herbert Guan <herbert.guan@arm.com>
> To: dev@dpdk.org, jerin.jacob@caviumnetworks.com
> CC: Herbert Guan <herbert.guan@arm.com>
> Subject: [PATCH v3] arch/arm: optimization for memcpy on AArch64
> X-Mailer: git-send-email 1.8.3.1
> 
> Signed-off-by: Herbert Guan <herbert.guan@arm.com>
> ---
>  config/common_armv8a_linuxapp                      |   6 +
>  .../common/include/arch/arm/rte_memcpy_64.h        | 292 +++++++++++++++++++++
>  2 files changed, 298 insertions(+)
> 
> diff --git a/config/common_armv8a_linuxapp b/config/common_armv8a_linuxapp
> index 6732d1e..8f0cbed 100644
> --- a/config/common_armv8a_linuxapp
> +++ b/config/common_armv8a_linuxapp
> @@ -44,6 +44,12 @@ CONFIG_RTE_FORCE_INTRINSICS=y
>  # to address minimum DMA alignment across all arm64 implementations.
>  CONFIG_RTE_CACHE_LINE_SIZE=128
>  
> +# Accelarate rte_memcpy.  Be sure to run unit test to determine the

Additional space before "Be". Rather than just mentioning the unit test, mention
the absolute test case name (memcpy_perf_autotest).

> +# best threshold in code.  Refer to notes in source file

Additional space before "Refer"

> +# (lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h) for more
> +# info.
> +CONFIG_RTE_ARCH_ARM64_MEMCPY=n
> +
>  CONFIG_RTE_LIBRTE_FM10K_PMD=n
>  CONFIG_RTE_LIBRTE_SFC_EFX_PMD=n
>  CONFIG_RTE_LIBRTE_AVP_PMD=n
> diff --git a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> index b80d8ba..1ea275d 100644
> --- a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> +++ b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> @@ -42,6 +42,296 @@
>  
>  #include "generic/rte_memcpy.h"
>  
> +#ifdef RTE_ARCH_ARM64_MEMCPY

See the comment below at "(GCC_VERSION < 50400)" check

> +#include <rte_common.h>
> +#include <rte_branch_prediction.h>
> +
> +/*
> + * The memory copy performance differs on different AArch64 micro-architectures.
> + * And the most recent glibc (e.g. 2.23 or later) can provide a better memcpy()
> + * performance compared to old glibc versions. It's always suggested to use a
> + * more recent glibc if possible, from which the entire system can get benefit.
> + *
> + * This implementation improves memory copy on some aarch64 micro-architectures,
> + * when an old glibc (e.g. 2.19, 2.17...) is being used. It is disabled by
> + * default and needs "RTE_ARCH_ARM64_MEMCPY" defined to activate. It's not
> + * always providing better performance than memcpy() so users need to run unit
> + * test "memcpy_perf_autotest" and customize parameters in customization section
> + * below for best performance.
> + *
> + * Compiler version will also impact the rte_memcpy() performance. It's observed
> + * on some platforms and with the same code, GCC 7.2.0 compiled binaries can
> + * provide better performance than GCC 4.8.5 compiled binaries.
> + */
> +
> +/**************************************
> + * Beginning of customization section
> + **************************************/
> +#define ALIGNMENT_MASK 0x0F

This symbol will be included in the public rte_memcpy.h for arm64 DPDK builds.
Please use an RTE_ prefix to avoid multiple definitions (RTE_ARCH_ARM64_ALIGN_MASK? or any shorter name).

> +#ifndef RTE_ARCH_ARM64_MEMCPY_STRICT_ALIGN
> +/* Only src unalignment will be treaed as unaligned copy */
> +#define IS_UNALIGNED_COPY(dst, src) ((uintptr_t)(dst) & ALIGNMENT_MASK)
> +#else
> +/* Both dst and src unalignment will be treated as unaligned copy */
> +#define IS_UNALIGNED_COPY(dst, src) \
> +		(((uintptr_t)(dst) | (uintptr_t)(src)) & ALIGNMENT_MASK)
> +#endif
> +
> +
> +/*
> + * If copy size is larger than threshold, memcpy() will be used.
> + * Run "memcpy_perf_autotest" to determine the proper threshold.
> + */
> +#define ALIGNED_THRESHOLD       ((size_t)(0xffffffff))
> +#define UNALIGNED_THRESHOLD     ((size_t)(0xffffffff))

Same as above comment.

> +
> +/**************************************
> + * End of customization section
> + **************************************/
> +#ifdef RTE_TOOLCHAIN_GCC
> +#if (GCC_VERSION < 50400)
> +#warning "The GCC version is quite old, which may result in sub-optimal \
> +performance of the compiled code. It is suggested that at least GCC 5.4.0 \
> +be used."

Even though it is a warning, based on where this file gets included it will generate an error (see below).
How about selecting the optimized memcpy only when RTE_ARCH_ARM64_MEMCPY is set && (GCC_VERSION >= 50400)?

  CC eal_common_options.o
In file included from
/home/jerin/dpdk.org/build/include/rte_memcpy.h:37:0,from
/home/jerin/dpdk.org/lib/librte_eal/common/eal_common_options.c:53:
/home/jerin/dpdk.org/build/include/rte_memcpy_64.h:93:2: error: #warning
                                                        ^^^^^^^^
"The GCC version is quite old, which may result in sub-optimal
performance of the compiled code. It is suggested that at least GCC
5.4.0 be used." [-Werror=cpp]
                ^^^^^^^^^^^^^^
 #warning "The GCC version is quite old, which may result in sub-optimal
\
  ^
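
A rough sketch of the kind of guard meant above (illustrative only, not wording from any posted patch):

/* Only enable the optimized implementation when the compiler is recent
 * enough; otherwise fall back to the default memcpy() mapping instead of
 * emitting #warning, which -Werror=cpp turns into a hard error. */
#if defined(RTE_ARCH_ARM64_MEMCPY) && \
	(!defined(RTE_TOOLCHAIN_GCC) || (GCC_VERSION >= 50400))
/* ... optimized rte_memcpy() and helpers ... */
#else
#define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
#endif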


> +#endif
> +#endif
> +
> +
> +#if RTE_CACHE_LINE_SIZE >= 128

We can remove this conditional compilation check, i.e. it can get compiled for both cases,
but it will be used only when RTE_CACHE_LINE_SIZE >= 128.

> +static __rte_always_inline void
> +rte_memcpy_ge16_lt128
> +(uint8_t *restrict dst, const uint8_t *restrict src, size_t n)
> +{
> +	if (n < 64) {
> +		if (n == 16) {
> +			rte_mov16(dst, src);
> +		} else if (n <= 32) {
> +			rte_mov16(dst, src);
> +			rte_mov16(dst - 16 + n, src - 16 + n);
> +		} else if (n <= 48) {
> +			rte_mov32(dst, src);
> +			rte_mov16(dst - 16 + n, src - 16 + n);
> +		} else {
> +			rte_mov48(dst, src);
> +			rte_mov16(dst - 16 + n, src - 16 + n);
> +		}
> +	} else {
> +		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> +		if (n > 48 + 64)
> +			rte_mov64(dst - 64 + n, src - 64 + n);
> +		else if (n > 32 + 64)
> +			rte_mov48(dst - 48 + n, src - 48 + n);
> +		else if (n > 16 + 64)
> +			rte_mov32(dst - 32 + n, src - 32 + n);
> +		else if (n > 64)
> +			rte_mov16(dst - 16 + n, src - 16 + n);
> +	}
> +}
> +
> +
> +#else

Same as above comment.

> +static __rte_always_inline void
> +rte_memcpy_ge16_lt64
> +(uint8_t *restrict dst, const uint8_t *restrict src, size_t n)
> +{
> +	if (n == 16) {
> +		rte_mov16(dst, src);
> +	} else if (n <= 32) {
> +		rte_mov16(dst, src);
> +		rte_mov16(dst - 16 + n, src - 16 + n);
> +	} else if (n <= 48) {
> +		rte_mov32(dst, src);
> +		rte_mov16(dst - 16 + n, src - 16 + n);
> +	} else {
> +		rte_mov48(dst, src);
> +		rte_mov16(dst - 16 + n, src - 16 + n);
> +	}
> +}
> +
> +
> +static __rte_always_inline void *
> +rte_memcpy(void *restrict dst, const void *restrict src, size_t n)
> +{
> +	if (n < 16) {
> +		rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
> +		return dst;
> +	}
> +#if RTE_CACHE_LINE_SIZE >= 128
> +	if (n < 128) {
> +		rte_memcpy_ge16_lt128((uint8_t *)dst, (const uint8_t *)src, n);
> +		return dst;
> +	}
> +#else
> +	if (n < 64) {
> +		rte_memcpy_ge16_lt64((uint8_t *)dst, (const uint8_t *)src, n);
> +		return dst;
> +	}
> +#endif
> +	__builtin_prefetch(src, 0, 0);
> +	__builtin_prefetch(dst, 1, 0);
> +	if (likely(
> +		  (!IS_UNALIGNED_COPY(dst, src) && n <= ALIGNED_THRESHOLD)
> +		   || (IS_UNALIGNED_COPY(dst, src) && n <= UNALIGNED_THRESHOLD)
> +		  )) {
> +#if RTE_CACHE_LINE_SIZE >= 128
> +		rte_memcpy_ge128((uint8_t *)dst, (const uint8_t *)src, n);
> +#else
> +		rte_memcpy_ge64((uint8_t *)dst, (const uint8_t *)src, n);
> +#endif

Can we remove this #ifdef clutter (we have two of them in the same function)?

I suggest removing this clutter by having separate routines, i.e.:
1)
#if RTE_CACHE_LINE_SIZE >= 128
rte_memcpy(void *restrict dst, const void *restrict src, size_t n)
{
}
#else
rte_memcpy(void *restrict dst, const void *restrict src, size_t n)
{
}
#endif

2) Have a separate inline function to resolve the following logic and use it
in both variants.

	if (likely(
		  (!IS_UNALIGNED_COPY(dst, src) && n <= ALIGNED_THRESHOLD)
		   || (IS_UNALIGNED_COPY(dst, src) && n <= UNALIGNED_THRESHOLD)
		  )) {
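
Something like the below could hold that logic (the helper name is only an example, not from any posted patch):

static __rte_always_inline int
rte_memcpy_within_threshold(const void *dst, const void *src, size_t n)
{
	/* stay on the optimized path only while the copy size is within the
	 * tuned threshold for its alignment class */
	if (!IS_UNALIGNED_COPY(dst, src))
		return n <= ALIGNED_THRESHOLD;
	return n <= UNALIGNED_THRESHOLD;
}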

With above changes:
Acked-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v3] arch/arm: optimization for memcpy on AArch64
  2017-12-18  7:43   ` Jerin Jacob
@ 2017-12-19  5:33     ` Herbert Guan
  2017-12-19  7:24       ` Jerin Jacob
  0 siblings, 1 reply; 27+ messages in thread
From: Herbert Guan @ 2017-12-19  5:33 UTC (permalink / raw)
  To: Jerin Jacob; +Cc: dev, nd

Jerin,

Thanks for the review and comments.  Please find my feedback below inline.

> -----Original Message-----
> From: Jerin Jacob [mailto:jerin.jacob@caviumnetworks.com]
> Sent: Monday, December 18, 2017 15:44
> To: Herbert Guan <Herbert.Guan@arm.com>
> Cc: dev@dpdk.org
> Subject: Re: [PATCH v3] arch/arm: optimization for memcpy on AArch64
> 
> -----Original Message-----
> > Date: Mon, 18 Dec 2017 10:54:24 +0800
> > From: Herbert Guan <herbert.guan@arm.com>
> > To: dev@dpdk.org, jerin.jacob@caviumnetworks.com
> > CC: Herbert Guan <herbert.guan@arm.com>
> > Subject: [PATCH v3] arch/arm: optimization for memcpy on AArch64
> > X-Mailer: git-send-email 1.8.3.1
> >
> > Signed-off-by: Herbert Guan <herbert.guan@arm.com>
> > ---
> >  config/common_armv8a_linuxapp                      |   6 +
> >  .../common/include/arch/arm/rte_memcpy_64.h        | 292
> +++++++++++++++++++++
> >  2 files changed, 298 insertions(+)
> >
> > diff --git a/config/common_armv8a_linuxapp
> > b/config/common_armv8a_linuxapp index 6732d1e..8f0cbed 100644
> > --- a/config/common_armv8a_linuxapp
> > +++ b/config/common_armv8a_linuxapp
> > @@ -44,6 +44,12 @@ CONFIG_RTE_FORCE_INTRINSICS=y  # to address
> minimum
> > DMA alignment across all arm64 implementations.
> >  CONFIG_RTE_CACHE_LINE_SIZE=128
> >
> > +# Accelarate rte_memcpy.  Be sure to run unit test to determine the
> 
> Additional space before "Be". Rather than just mentioning the unit test,
> mention the absolute test case name(memcpy_perf_autotest)
> 
> > +# best threshold in code.  Refer to notes in source file
> 
> Additional space before "Refer"

Fixed in new version.

> 
> > +# (lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h) for more
> #
> > +info.
> > +CONFIG_RTE_ARCH_ARM64_MEMCPY=n
> > +
> >  CONFIG_RTE_LIBRTE_FM10K_PMD=n
> >  CONFIG_RTE_LIBRTE_SFC_EFX_PMD=n
> >  CONFIG_RTE_LIBRTE_AVP_PMD=n
> > diff --git a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> > b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> > index b80d8ba..1ea275d 100644
> > --- a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> > +++ b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> > @@ -42,6 +42,296 @@
> >
> >  #include "generic/rte_memcpy.h"
> >
> > +#ifdef RTE_ARCH_ARM64_MEMCPY
> 
> See the comment below at "(GCC_VERSION < 50400)" check
> 
> > +#include <rte_common.h>
> > +#include <rte_branch_prediction.h>
> > +
> > +/*
> > + * The memory copy performance differs on different AArch64 micro-
> architectures.
> > + * And the most recent glibc (e.g. 2.23 or later) can provide a
> > +better memcpy()
> > + * performance compared to old glibc versions. It's always suggested
> > +to use a
> > + * more recent glibc if possible, from which the entire system can get
> benefit.
> > + *
> > + * This implementation improves memory copy on some aarch64
> > +micro-architectures,
> > + * when an old glibc (e.g. 2.19, 2.17...) is being used. It is
> > +disabled by
> > + * default and needs "RTE_ARCH_ARM64_MEMCPY" defined to activate.
> > +It's not
> > + * always providing better performance than memcpy() so users need to
> > +run unit
> > + * test "memcpy_perf_autotest" and customize parameters in
> > +customization section
> > + * below for best performance.
> > + *
> > + * Compiler version will also impact the rte_memcpy() performance.
> > +It's observed
> > + * on some platforms and with the same code, GCC 7.2.0 compiled
> > +binaries can
> > + * provide better performance than GCC 4.8.5 compiled binaries.
> > + */
> > +
> > +/**************************************
> > + * Beginning of customization section
> > +**************************************/
> > +#define ALIGNMENT_MASK 0x0F
> 
> This symbol will be included in public rte_memcpy.h version for arm64 DPDK
> build.
> Please use RTE_ prefix to avoid multi
> definition.(RTE_ARCH_ARM64_ALIGN_MASK ? or any shorter name)
> 
Changed to RTE_AARCH64_ALIGN_MASK in new version.

> > +#ifndef RTE_ARCH_ARM64_MEMCPY_STRICT_ALIGN
> > +/* Only src unalignment will be treaed as unaligned copy */ #define
> > +IS_UNALIGNED_COPY(dst, src) ((uintptr_t)(dst) & ALIGNMENT_MASK)
> #else
> > +/* Both dst and src unalignment will be treated as unaligned copy */
> > +#define IS_UNALIGNED_COPY(dst, src) \
> > +		(((uintptr_t)(dst) | (uintptr_t)(src)) & ALIGNMENT_MASK)
> #endif
> > +
> > +
> > +/*
> > + * If copy size is larger than threshold, memcpy() will be used.
> > + * Run "memcpy_perf_autotest" to determine the proper threshold.
> > + */
> > +#define ALIGNED_THRESHOLD       ((size_t)(0xffffffff))
> > +#define UNALIGNED_THRESHOLD     ((size_t)(0xffffffff))
> 
> Same as above comment.
Added RTE_AARCH64_ prefix in new version.

> 
> > +
> > +/**************************************
> > + * End of customization section
> > + **************************************/
> > +#ifdef RTE_TOOLCHAIN_GCC
> > +#if (GCC_VERSION < 50400)
> > +#warning "The GCC version is quite old, which may result in sub-optimal \
> > +performance of the compiled code. It is suggested that at least GCC 5.4.0 \
> > +be used."
> 
> Even though it is warning, based on where this file get included it will
> generate error(see below)
> How about, selecting optimized memcpy when RTE_ARCH_ARM64_MEMCPY
> && if (GCC_VERSION >= 50400) ?
> 
Fully understood.  However, I'm not inclined to make it 'silent'.  GCC 4.x is just
quite old and may not produce the best optimized code -- not only for the DPDK app.
We can provide another option, RTE_AARCH64_SKIP_GCC_VERSION_CHECK, to allow
skipping the GCC version check.  What do you think?
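
Something along these lines (RTE_AARCH64_SKIP_GCC_VERSION_CHECK is only a proposal here, it is not in any posted patch):

#ifdef RTE_TOOLCHAIN_GCC
#if (GCC_VERSION < 50400) && !defined(RTE_AARCH64_SKIP_GCC_VERSION_CHECK)
#warning "The GCC version is quite old, which may result in sub-optimal \
performance of the compiled code. It is suggested that at least GCC 5.4.0 \
be used."
#endif
#endif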

>   CC eal_common_options.o
> In file included from
> /home/jerin/dpdk.org/build/include/rte_memcpy.h:37:0,from
> /home/jerin/dpdk.org/lib/librte_eal/common/eal_common_options.c:53:
> /home/jerin/dpdk.org/build/include/rte_memcpy_64.h:93:2: error:
> #warning
>                                                         ^^^^^^^^
> "The GCC version is quite old, which may result in sub-optimal
> performance of the compiled code. It is suggested that at least GCC
> 5.4.0 be used." [-Werror=cpp]
>                 ^^^^^^^^^^^^^^
>  #warning "The GCC version is quite old, which may result in sub-optimal
> \
>   ^
> 
> 
> > +#endif
> > +#endif
> > +
> > +
> > +#if RTE_CACHE_LINE_SIZE >= 128
> 
> We can remove this conditional compilation check. ie. It can get compiled for
> both cases,
> But it will be used only when RTE_CACHE_LINE_SIZE >= 128
> 
OK, it'll be removed in the new version.

> > +static __rte_always_inline void
> > +rte_memcpy_ge16_lt128
> > +(uint8_t *restrict dst, const uint8_t *restrict src, size_t n)
> > +{
> > +	if (n < 64) {
> > +		if (n == 16) {
> > +			rte_mov16(dst, src);
> > +		} else if (n <= 32) {
> > +			rte_mov16(dst, src);
> > +			rte_mov16(dst - 16 + n, src - 16 + n);
> > +		} else if (n <= 48) {
> > +			rte_mov32(dst, src);
> > +			rte_mov16(dst - 16 + n, src - 16 + n);
> > +		} else {
> > +			rte_mov48(dst, src);
> > +			rte_mov16(dst - 16 + n, src - 16 + n);
> > +		}
> > +	} else {
> > +		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> > +		if (n > 48 + 64)
> > +			rte_mov64(dst - 64 + n, src - 64 + n);
> > +		else if (n > 32 + 64)
> > +			rte_mov48(dst - 48 + n, src - 48 + n);
> > +		else if (n > 16 + 64)
> > +			rte_mov32(dst - 32 + n, src - 32 + n);
> > +		else if (n > 64)
> > +			rte_mov16(dst - 16 + n, src - 16 + n);
> > +	}
> > +}
> > +
> > +
> > +#else
> 
> Same as above comment.
> 
> > +static __rte_always_inline void
> > +rte_memcpy_ge16_lt64
> > +(uint8_t *restrict dst, const uint8_t *restrict src, size_t n)
> > +{
> > +	if (n == 16) {
> > +		rte_mov16(dst, src);
> > +	} else if (n <= 32) {
> > +		rte_mov16(dst, src);
> > +		rte_mov16(dst - 16 + n, src - 16 + n);
> > +	} else if (n <= 48) {
> > +		rte_mov32(dst, src);
> > +		rte_mov16(dst - 16 + n, src - 16 + n);
> > +	} else {
> > +		rte_mov48(dst, src);
> > +		rte_mov16(dst - 16 + n, src - 16 + n);
> > +	}
> > +}
> > +
> > +
> > +static __rte_always_inline void *
> > +rte_memcpy(void *restrict dst, const void *restrict src, size_t n)
> > +{
> > +	if (n < 16) {
> > +		rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
> > +		return dst;
> > +	}
> > +#if RTE_CACHE_LINE_SIZE >= 128
> > +	if (n < 128) {
> > +		rte_memcpy_ge16_lt128((uint8_t *)dst, (const uint8_t *)src,
> n);
> > +		return dst;
> > +	}
> > +#else
> > +	if (n < 64) {
> > +		rte_memcpy_ge16_lt64((uint8_t *)dst, (const uint8_t *)src,
> n);
> > +		return dst;
> > +	}
> > +#endif
> > +	__builtin_prefetch(src, 0, 0);
> > +	__builtin_prefetch(dst, 1, 0);
> > +	if (likely(
> > +		  (!IS_UNALIGNED_COPY(dst, src) && n <=
> ALIGNED_THRESHOLD)
> > +		   || (IS_UNALIGNED_COPY(dst, src) && n <=
> UNALIGNED_THRESHOLD)
> > +		  )) {
> > +#if RTE_CACHE_LINE_SIZE >= 128
> > +		rte_memcpy_ge128((uint8_t *)dst, (const uint8_t *)src, n);
> > +#else
> > +		rte_memcpy_ge64((uint8_t *)dst, (const uint8_t *)src, n);
> > +#endif
> 
> Can we remove this #ifdef clutter(We have two of them in a same function)?
> 
> I suggest to remove this clutter by having the separate routine. ie.
> 1)
> #if RTE_CACHE_LINE_SIZE >= 128
> rte_memcpy(void *restrict dst, const void *restrict src, size_t n)
> {
> }
> #else
> rte_memcpy(void *restrict dst, const void *restrict src, size_t n)
> {
> }
> #endif
> 
> 2) Have separate inline function to resolve following logic and used it
> in both variants.
> 
> 	if (likely(
> 		  (!IS_UNALIGNED_COPY(dst, src) && n <=
> ALIGNED_THRESHOLD)
> 		   || (IS_UNALIGNED_COPY(dst, src) && n <=
> UNALIGNED_THRESHOLD)
> 		  )) {
> 

Implemented as suggested in new version.

> With above changes:
> Acked-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>

Thanks,
Herbert Guan

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v3] arch/arm: optimization for memcpy on AArch64
  2017-12-19  5:33     ` Herbert Guan
@ 2017-12-19  7:24       ` Jerin Jacob
  0 siblings, 0 replies; 27+ messages in thread
From: Jerin Jacob @ 2017-12-19  7:24 UTC (permalink / raw)
  To: Herbert Guan; +Cc: dev, nd

-----Original Message-----
> Date: Tue, 19 Dec 2017 05:33:19 +0000
> From: Herbert Guan <Herbert.Guan@arm.com>
> To: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> CC: "dev@dpdk.org" <dev@dpdk.org>, nd <nd@arm.com>
> Subject: RE: [PATCH v3] arch/arm: optimization for memcpy on AArch64
> 
> Jerin,
> 
> Thanks for review and comments.  Please find my feedbacks below inline.
> 
> > -----Original Message-----
> > From: Jerin Jacob [mailto:jerin.jacob@caviumnetworks.com]
> > Sent: Monday, December 18, 2017 15:44
> > To: Herbert Guan <Herbert.Guan@arm.com>
> > Cc: dev@dpdk.org
> > Subject: Re: [PATCH v3] arch/arm: optimization for memcpy on AArch64
> > 
> > -----Original Message-----
> > > Date: Mon, 18 Dec 2017 10:54:24 +0800
> > > From: Herbert Guan <herbert.guan@arm.com>
> > > To: dev@dpdk.org, jerin.jacob@caviumnetworks.com
> > > CC: Herbert Guan <herbert.guan@arm.com>
> > > Subject: [PATCH v3] arch/arm: optimization for memcpy on AArch64
> > > X-Mailer: git-send-email 1.8.3.1
> > >
> > > Signed-off-by: Herbert Guan <herbert.guan@arm.com>
> > > ---
> > >  config/common_armv8a_linuxapp                      |   6 +
> > >  .../common/include/arch/arm/rte_memcpy_64.h        | 292
> > +++++++++++++++++++++
> > >  2 files changed, 298 insertions(+)
> > >
> > > diff --git a/config/common_armv8a_linuxapp
> > > b/config/common_armv8a_linuxapp index 6732d1e..8f0cbed 100644
> > > --- a/config/common_armv8a_linuxapp
> > > +++ b/config/common_armv8a_linuxapp
> > > @@ -44,6 +44,12 @@ CONFIG_RTE_FORCE_INTRINSICS=y  # to address
> > minimum
> > > DMA alignment across all arm64 implementations.
> > >  CONFIG_RTE_CACHE_LINE_SIZE=128
> > >
> > > +# Accelarate rte_memcpy.  Be sure to run unit test to determine the
> > 
> > Additional space before "Be". Rather than just mentioning the unit test,
> > mention the absolute test case name(memcpy_perf_autotest)
> > 
> > > +# best threshold in code.  Refer to notes in source file
> > 
> > Additional space before "Refer"
> 
> Fixed in new version.
> 
> > 
> > > +# (lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h) for more
> > #
> > > +info.
> > > +CONFIG_RTE_ARCH_ARM64_MEMCPY=n
> > > +
> > >  CONFIG_RTE_LIBRTE_FM10K_PMD=n
> > >  CONFIG_RTE_LIBRTE_SFC_EFX_PMD=n
> > >  CONFIG_RTE_LIBRTE_AVP_PMD=n
> > > diff --git a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> > > b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> > > index b80d8ba..1ea275d 100644
> > > --- a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> > > +++ b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> > > @@ -42,6 +42,296 @@
> > >
> > >  #include "generic/rte_memcpy.h"
> > >
> > > +#ifdef RTE_ARCH_ARM64_MEMCPY
> > 
> > See the comment below at "(GCC_VERSION < 50400)" check
> > 
> > > +#include <rte_common.h>
> > > +#include <rte_branch_prediction.h>
> > > +
> > > +/*
> > > + * The memory copy performance differs on different AArch64 micro-
> > architectures.
> > > + * And the most recent glibc (e.g. 2.23 or later) can provide a
> > > +better memcpy()
> > > + * performance compared to old glibc versions. It's always suggested
> > > +to use a
> > > + * more recent glibc if possible, from which the entire system can get
> > benefit.
> > > + *
> > > + * This implementation improves memory copy on some aarch64
> > > +micro-architectures,
> > > + * when an old glibc (e.g. 2.19, 2.17...) is being used. It is
> > > +disabled by
> > > + * default and needs "RTE_ARCH_ARM64_MEMCPY" defined to activate.
> > > +It's not
> > > + * always providing better performance than memcpy() so users need to
> > > +run unit
> > > + * test "memcpy_perf_autotest" and customize parameters in
> > > +customization section
> > > + * below for best performance.
> > > + *
> > > + * Compiler version will also impact the rte_memcpy() performance.
> > > +It's observed
> > > + * on some platforms and with the same code, GCC 7.2.0 compiled
> > > +binaries can
> > > + * provide better performance than GCC 4.8.5 compiled binaries.
> > > + */
> > > +
> > > +/**************************************
> > > + * Beginning of customization section
> > > +**************************************/
> > > +#define ALIGNMENT_MASK 0x0F
> > 
> > This symbol will be included in public rte_memcpy.h version for arm64 DPDK
> > build.
> > Please use RTE_ prefix to avoid multi
> > definition.(RTE_ARCH_ARM64_ALIGN_MASK ? or any shorter name)
> > 
> Changed to RTE_AARCH64_ALIGN_MASK in new version.

Since it is something to do with memcpy and arm64, I prefer
RTE_ARM64_MEMCPY_ALIGN_MASK.


> 
> > > +#ifndef RTE_ARCH_ARM64_MEMCPY_STRICT_ALIGN
> > > +/* Only src unalignment will be treaed as unaligned copy */ #define
> > > +IS_UNALIGNED_COPY(dst, src) ((uintptr_t)(dst) & ALIGNMENT_MASK)
> > #else
> > > +/* Both dst and src unalignment will be treated as unaligned copy */
> > > +#define IS_UNALIGNED_COPY(dst, src) \
> > > +		(((uintptr_t)(dst) | (uintptr_t)(src)) & ALIGNMENT_MASK)
> > #endif
> > > +
> > > +
> > > +/*
> > > + * If copy size is larger than threshold, memcpy() will be used.
> > > + * Run "memcpy_perf_autotest" to determine the proper threshold.
> > > + */
> > > +#define ALIGNED_THRESHOLD       ((size_t)(0xffffffff))
> > > +#define UNALIGNED_THRESHOLD     ((size_t)(0xffffffff))
> > 
> > Same as above comment.
> Added RTE_AARCH64_ prefix in new version.

Same as above.

> 
> > 
> > > +
> > > +/**************************************
> > > + * End of customization section
> > > + **************************************/
> > > +#ifdef RTE_TOOLCHAIN_GCC
> > > +#if (GCC_VERSION < 50400)
> > > +#warning "The GCC version is quite old, which may result in sub-optimal \
> > > +performance of the compiled code. It is suggested that at least GCC 5.4.0 \
> > > +be used."
> > 
> > Even though it is warning, based on where this file get included it will
> > generate error(see below)
> > How about, selecting optimized memcpy when RTE_ARCH_ARM64_MEMCPY
> > && if (GCC_VERSION >= 50400) ?
> > 
> Fully understand that.  While I'm not tending to make it 'silent'.  GCC 4.x is just
> quite old and may not provide best optimized code -- not only for DPDK app.  
> We can provide another option RTE_AARCH64_SKIP_GCC_VERSION_CHECK to allow
> skipping the GCC version check.  How do you think?

I prefer to reduce the options. But no strong opinion on this, as
the RTE_ARCH_ARM64_MEMCPY option is disabled by default (i.e. no risk).

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH v4] arch/arm: optimization for memcpy on AArch64
  2017-12-18  2:54 ` [PATCH v3] " Herbert Guan
  2017-12-18  7:43   ` Jerin Jacob
@ 2017-12-21  5:33   ` Herbert Guan
  2018-01-03 13:35     ` Jerin Jacob
  1 sibling, 1 reply; 27+ messages in thread
From: Herbert Guan @ 2017-12-21  5:33 UTC (permalink / raw)
  To: dev, jerin.jacob; +Cc: Herbert Guan

This patch provides an option to do rte_memcpy() using 'restrict'
qualifier, which can induce GCC to do optimizations by using more
efficient instructions, providing some performance gain over memcpy()
on some AArch64 platforms/environments.

The memory copy performance differs between different AArch64
platforms. And a more recent glibc (e.g. 2.23 or later)
can provide a better memcpy() performance compared to old glibc
versions. It's always suggested to use a more recent glibc if
possible, from which the entire system can get benefit. If for some
reason an old glibc has to be used, this patch is provided for an
alternative.

This implementation can improve memory copy on some AArch64
platforms, when an old glibc (e.g. 2.19, 2.17...) is being used.
It is disabled by default and needs "RTE_ARCH_ARM64_MEMCPY"
defined to activate. It's not always providing better performance
than memcpy(), so users need to run the DPDK unit test
"memcpy_perf_autotest" and customize parameters in "customization
section" in rte_memcpy_64.h for best performance.

Compiler version will also impact the rte_memcpy() performance.
It's observed on some platforms and with the same code, GCC 7.2.0
compiled binary can provide better performance than GCC 4.8.5. It's
suggested to use GCC 5.4.0 or later.

Signed-off-by: Herbert Guan <herbert.guan@arm.com>
---
 config/common_armv8a_linuxapp                      |   6 +
 .../common/include/arch/arm/rte_memcpy_64.h        | 287 +++++++++++++++++++++
 2 files changed, 293 insertions(+)

diff --git a/config/common_armv8a_linuxapp b/config/common_armv8a_linuxapp
index 6732d1e..8f0cbed 100644
--- a/config/common_armv8a_linuxapp
+++ b/config/common_armv8a_linuxapp
@@ -44,6 +44,12 @@ CONFIG_RTE_FORCE_INTRINSICS=y
 # to address minimum DMA alignment across all arm64 implementations.
 CONFIG_RTE_CACHE_LINE_SIZE=128
 
+# Accelerate rte_memcpy.  Be sure to run unit test to determine the
+# best threshold in code.  Refer to notes in source file
+# (lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h) for more
+# info.
+CONFIG_RTE_ARCH_ARM64_MEMCPY=n
+
 CONFIG_RTE_LIBRTE_FM10K_PMD=n
 CONFIG_RTE_LIBRTE_SFC_EFX_PMD=n
 CONFIG_RTE_LIBRTE_AVP_PMD=n
diff --git a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
index b80d8ba..b269f34 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
@@ -42,6 +42,291 @@
 
 #include "generic/rte_memcpy.h"
 
+#ifdef RTE_ARCH_ARM64_MEMCPY
+#include <rte_common.h>
+#include <rte_branch_prediction.h>
+
+/*
+ * The memory copy performance differs on different AArch64 micro-architectures.
+ * And the most recent glibc (e.g. 2.23 or later) can provide a better memcpy()
+ * performance compared to old glibc versions. It's always suggested to use a
+ * more recent glibc if possible, from which the entire system can get benefit.
+ *
+ * This implementation improves memory copy on some aarch64 micro-architectures,
+ * when an old glibc (e.g. 2.19, 2.17...) is being used. It is disabled by
+ * default and needs "RTE_ARCH_ARM64_MEMCPY" defined to activate. It's not
+ * always providing better performance than memcpy() so users need to run unit
+ * test "memcpy_perf_autotest" and customize parameters in customization section
+ * below for best performance.
+ *
+ * Compiler version will also impact the rte_memcpy() performance. It's observed
+ * on some platforms and with the same code, GCC 7.2.0 compiled binaries can
+ * provide better performance than GCC 4.8.5 compiled binaries.
+ */
+
+/**************************************
+ * Beginning of customization section
+ **************************************/
+#define RTE_ARM64_MEMCPY_ALIGN_MASK 0x0F
+#ifndef RTE_ARCH_ARM64_MEMCPY_STRICT_ALIGN
+/* Only src unalignment will be treaed as unaligned copy */
+#define IS_UNALIGNED_COPY(dst, src) \
+	((uintptr_t)(dst) & RTE_ARM64_MEMCPY_ALIGN_MASK)
+#else
+/* Both dst and src unalignment will be treated as unaligned copy */
+#define IS_UNALIGNED_COPY(dst, src) \
+	(((uintptr_t)(dst) | (uintptr_t)(src)) & RTE_ARM64_MEMCPY_ALIGN_MASK)
+#endif
+
+
+/*
+ * If copy size is larger than threshold, memcpy() will be used.
+ * Run "memcpy_perf_autotest" to determine the proper threshold.
+ */
+#define RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD       ((size_t)(0xffffffff))
+#define RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD     ((size_t)(0xffffffff))
+
+/*
+ * The logic of USE_RTE_MEMCPY() can also be modified to best fit platform.
+ */
+#define USE_RTE_MEMCPY(dst, src, n) \
+((!IS_UNALIGNED_COPY(dst, src) && n <= RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD) \
+|| (IS_UNALIGNED_COPY(dst, src) && n <= RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD))
+
+
+/**************************************
+ * End of customization section
+ **************************************/
+#if defined(RTE_TOOLCHAIN_GCC) && !defined(RTE_AARCH64_SKIP_GCC_VERSION_CHECK)
+#if (GCC_VERSION < 50400)
+#warning "The GCC version is quite old, which may result in sub-optimal \
+performance of the compiled code. It is suggested that at least GCC 5.4.0 \
+be used."
+#endif
+#endif
+
+static __rte_always_inline void rte_mov16(uint8_t *dst, const uint8_t *src)
+{
+	__uint128_t *dst128 = (__uint128_t *)dst;
+	const __uint128_t *src128 = (const __uint128_t *)src;
+	*dst128 = *src128;
+}
+
+static __rte_always_inline void rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	__uint128_t *dst128 = (__uint128_t *)dst;
+	const __uint128_t *src128 = (const __uint128_t *)src;
+	const __uint128_t x0 = src128[0], x1 = src128[1];
+	dst128[0] = x0;
+	dst128[1] = x1;
+}
+
+static __rte_always_inline void rte_mov48(uint8_t *dst, const uint8_t *src)
+{
+	__uint128_t *dst128 = (__uint128_t *)dst;
+	const __uint128_t *src128 = (const __uint128_t *)src;
+	const __uint128_t x0 = src128[0], x1 = src128[1], x2 = src128[2];
+	dst128[0] = x0;
+	dst128[1] = x1;
+	dst128[2] = x2;
+}
+
+static __rte_always_inline void rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	__uint128_t *dst128 = (__uint128_t *)dst;
+	const __uint128_t *src128 = (const __uint128_t *)src;
+	const __uint128_t
+		x0 = src128[0], x1 = src128[1], x2 = src128[2], x3 = src128[3];
+	dst128[0] = x0;
+	dst128[1] = x1;
+	dst128[2] = x2;
+	dst128[3] = x3;
+}
+
+static __rte_always_inline void rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	__uint128_t *dst128 = (__uint128_t *)dst;
+	const __uint128_t *src128 = (const __uint128_t *)src;
+	/* Keep below declaration & copy sequence for optimized instructions */
+	const __uint128_t
+		x0 = src128[0], x1 = src128[1], x2 = src128[2], x3 = src128[3];
+	dst128[0] = x0;
+	__uint128_t x4 = src128[4];
+	dst128[1] = x1;
+	__uint128_t x5 = src128[5];
+	dst128[2] = x2;
+	__uint128_t x6 = src128[6];
+	dst128[3] = x3;
+	__uint128_t x7 = src128[7];
+	dst128[4] = x4;
+	dst128[5] = x5;
+	dst128[6] = x6;
+	dst128[7] = x7;
+}
+
+static __rte_always_inline void rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+	rte_mov128(dst, src);
+	rte_mov128(dst + 128, src + 128);
+}
+
+static __rte_always_inline void
+rte_memcpy_lt16(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	if (n & 0x08) {
+		/* copy 8 ~ 15 bytes */
+		*(uint64_t *)dst = *(const uint64_t *)src;
+		*(uint64_t *)(dst - 8 + n) = *(const uint64_t *)(src - 8 + n);
+	} else if (n & 0x04) {
+		/* copy 4 ~ 7 bytes */
+		*(uint32_t *)dst = *(const uint32_t *)src;
+		*(uint32_t *)(dst - 4 + n) = *(const uint32_t *)(src - 4 + n);
+	} else if (n & 0x02) {
+		/* copy 2 ~ 3 bytes */
+		*(uint16_t *)dst = *(const uint16_t *)src;
+		*(uint16_t *)(dst - 2 + n) = *(const uint16_t *)(src - 2 + n);
+	} else if (n & 0x01) {
+		/* copy 1 byte */
+		*dst = *src;
+	}
+}
+
+static __rte_always_inline void
+rte_memcpy_ge16_lt128(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	if (n < 64) {
+		if (n == 16) {
+			rte_mov16(dst, src);
+		} else if (n <= 32) {
+			rte_mov16(dst, src);
+			rte_mov16(dst - 16 + n, src - 16 + n);
+		} else if (n <= 48) {
+			rte_mov32(dst, src);
+			rte_mov16(dst - 16 + n, src - 16 + n);
+		} else {
+			rte_mov48(dst, src);
+			rte_mov16(dst - 16 + n, src - 16 + n);
+		}
+	} else {
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		if (n > 48 + 64)
+			rte_mov64(dst - 64 + n, src - 64 + n);
+		else if (n > 32 + 64)
+			rte_mov48(dst - 48 + n, src - 48 + n);
+		else if (n > 16 + 64)
+			rte_mov32(dst - 32 + n, src - 32 + n);
+		else if (n > 64)
+			rte_mov16(dst - 16 + n, src - 16 + n);
+	}
+}
+
+static __rte_always_inline void
+rte_memcpy_ge128(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	do {
+		rte_mov128(dst, src);
+		src += 128;
+		dst += 128;
+		n -= 128;
+	} while (likely(n >= 128));
+
+	if (likely(n)) {
+		if (n <= 16)
+			rte_mov16(dst - 16 + n, src - 16 + n);
+		else if (n <= 32)
+			rte_mov32(dst - 32 + n, src - 32 + n);
+		else if (n <= 48)
+			rte_mov48(dst - 48 + n, src - 48 + n);
+		else if (n <= 64)
+			rte_mov64(dst - 64 + n, src - 64 + n);
+		else
+			rte_memcpy_ge16_lt128(dst, src, n);
+	}
+}
+
+static __rte_always_inline void
+rte_memcpy_ge16_lt64(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	if (n == 16) {
+		rte_mov16(dst, src);
+	} else if (n <= 32) {
+		rte_mov16(dst, src);
+		rte_mov16(dst - 16 + n, src - 16 + n);
+	} else if (n <= 48) {
+		rte_mov32(dst, src);
+		rte_mov16(dst - 16 + n, src - 16 + n);
+	} else {
+		rte_mov48(dst, src);
+		rte_mov16(dst - 16 + n, src - 16 + n);
+	}
+}
+
+static __rte_always_inline void
+rte_memcpy_ge64(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	do {
+		rte_mov64(dst, src);
+		src += 64;
+		dst += 64;
+		n -= 64;
+	} while (likely(n >= 64));
+
+	if (likely(n)) {
+		if (n <= 16)
+			rte_mov16(dst - 16 + n, src - 16 + n);
+		else if (n <= 32)
+			rte_mov32(dst - 32 + n, src - 32 + n);
+		else if (n <= 48)
+			rte_mov48(dst - 48 + n, src - 48 + n);
+		else
+			rte_mov64(dst - 64 + n, src - 64 + n);
+	}
+}
+
+#if RTE_CACHE_LINE_SIZE >= 128
+static __rte_always_inline void *
+rte_memcpy(void *dst, const void *src, size_t n)
+{
+	if (n < 16) {
+		rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
+		return dst;
+	}
+	if (n < 128) {
+		rte_memcpy_ge16_lt128((uint8_t *)dst, (const uint8_t *)src, n);
+		return dst;
+	}
+	__builtin_prefetch(src, 0, 0);
+	__builtin_prefetch(dst, 1, 0);
+	if (likely(USE_RTE_MEMCPY(dst, src, n))) {
+		rte_memcpy_ge128((uint8_t *)dst, (const uint8_t *)src, n);
+		return dst;
+	} else
+		return memcpy(dst, src, n);
+}
+
+#else
+static __rte_always_inline void *
+rte_memcpy(void *dst, const void *src, size_t n)
+{
+	if (n < 16) {
+		rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
+		return dst;
+	}
+	if (n < 64) {
+		rte_memcpy_ge16_lt64((uint8_t *)dst, (const uint8_t *)src, n);
+		return dst;
+	}
+	__builtin_prefetch(src, 0, 0);
+	__builtin_prefetch(dst, 1, 0);
+	if (likely(USE_RTE_MEMCPY(dst, src, n))) {
+		rte_memcpy_ge64((uint8_t *)dst, (const uint8_t *)src, n);
+		return dst;
+	} else
+		return memcpy(dst, src, n);
+}
+#endif /* RTE_CACHE_LINE_SIZE >= 128 */
+
+#else
 static inline void
 rte_mov16(uint8_t *dst, const uint8_t *src)
 {
@@ -80,6 +365,8 @@
 
 #define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
 
+#endif /* RTE_ARCH_ARM64_MEMCPY */
+
 #ifdef __cplusplus
 }
 #endif
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [PATCH v4] arch/arm: optimization for memcpy on AArch64
  2017-12-21  5:33   ` [PATCH v4] " Herbert Guan
@ 2018-01-03 13:35     ` Jerin Jacob
  2018-01-04 10:23       ` Herbert Guan
  0 siblings, 1 reply; 27+ messages in thread
From: Jerin Jacob @ 2018-01-03 13:35 UTC (permalink / raw)
  To: Herbert Guan; +Cc: dev

-----Original Message-----
> Date: Thu, 21 Dec 2017 13:33:47 +0800
> From: Herbert Guan <herbert.guan@arm.com>
> To: dev@dpdk.org, jerin.jacob@caviumnetworks.com
> CC: Herbert Guan <herbert.guan@arm.com>
> Subject: [PATCH v4] arch/arm: optimization for memcpy on AArch64
> X-Mailer: git-send-email 1.8.3.1
> 
> This patch provides an option to do rte_memcpy() using 'restrict'
> qualifier, which can induce GCC to do optimizations by using more
> efficient instructions, providing some performance gain over memcpy()
> on some AArch64 platforms/environments.
> 
> The memory copy performance differs between different AArch64
> platforms. And a more recent glibc (e.g. 2.23 or later)
> can provide a better memcpy() performance compared to old glibc
> versions. It's always suggested to use a more recent glibc if
> possible, from which the entire system can get benefit. If for some
> reason an old glibc has to be used, this patch is provided for an
> alternative.
> 
> This implementation can improve memory copy on some AArch64
> platforms, when an old glibc (e.g. 2.19, 2.17...) is being used.
> It is disabled by default and needs "RTE_ARCH_ARM64_MEMCPY"
> defined to activate. It's not always providing better performance
> than memcpy(), so users need to run the DPDK unit test
> "memcpy_perf_autotest" and customize parameters in "customization
> section" in rte_memcpy_64.h for best performance.
> 
> Compiler version will also impact the rte_memcpy() performance.
> It's observed on some platforms and with the same code, GCC 7.2.0
> compiled binary can provide better performance than GCC 4.8.5. It's
> suggested to use GCC 5.4.0 or later.
> 
> Signed-off-by: Herbert Guan <herbert.guan@arm.com>

Looks good. Please find inline requests for some minor changes.
Feel free to add my Acked-by with those changes.


> ---
>  config/common_armv8a_linuxapp                      |   6 +
>  .../common/include/arch/arm/rte_memcpy_64.h        | 287 +++++++++++++++++++++
>  2 files changed, 293 insertions(+)
> 
> diff --git a/config/common_armv8a_linuxapp b/config/common_armv8a_linuxapp
> index 6732d1e..8f0cbed 100644
> --- a/config/common_armv8a_linuxapp
> +++ b/config/common_armv8a_linuxapp
> @@ -44,6 +44,12 @@ CONFIG_RTE_FORCE_INTRINSICS=y
>  # to address minimum DMA alignment across all arm64 implementations.
>  CONFIG_RTE_CACHE_LINE_SIZE=128
>  
> +# Accelerate rte_memcpy.  Be sure to run unit test to determine the
> +# best threshold in code.  Refer to notes in source file
> +# (lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h) for more
> +# info.
> +CONFIG_RTE_ARCH_ARM64_MEMCPY=n
> +
>  CONFIG_RTE_LIBRTE_FM10K_PMD=n
>  CONFIG_RTE_LIBRTE_SFC_EFX_PMD=n
>  CONFIG_RTE_LIBRTE_AVP_PMD=n
> diff --git a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> index b80d8ba..b269f34 100644
> --- a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> +++ b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> @@ -42,6 +42,291 @@
>  
>  #include "generic/rte_memcpy.h"
>  
> +#ifdef RTE_ARCH_ARM64_MEMCPY
> +#include <rte_common.h>
> +#include <rte_branch_prediction.h>
> +
> +/*
> + * The memory copy performance differs on different AArch64 micro-architectures.
> + * And the most recent glibc (e.g. 2.23 or later) can provide a better memcpy()
> + * performance compared to old glibc versions. It's always suggested to use a
> + * more recent glibc if possible, from which the entire system can get benefit.
> + *
> + * This implementation improves memory copy on some aarch64 micro-architectures,
> + * when an old glibc (e.g. 2.19, 2.17...) is being used. It is disabled by
> + * default and needs "RTE_ARCH_ARM64_MEMCPY" defined to activate. It's not
> + * always providing better performance than memcpy() so users need to run unit
> + * test "memcpy_perf_autotest" and customize parameters in customization section
> + * below for best performance.
> + *
> + * Compiler version will also impact the rte_memcpy() performance. It's observed
> + * on some platforms and with the same code, GCC 7.2.0 compiled binaries can
> + * provide better performance than GCC 4.8.5 compiled binaries.
> + */
> +
> +/**************************************
> + * Beginning of customization section
> + **************************************/
> +#define RTE_ARM64_MEMCPY_ALIGN_MASK 0x0F
> +#ifndef RTE_ARCH_ARM64_MEMCPY_STRICT_ALIGN
> +/* Only src unalignment will be treaed as unaligned copy */
> +#define IS_UNALIGNED_COPY(dst, src) \

Better to change it to RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY, as it is
defined in a public DPDK header file.


> +	((uintptr_t)(dst) & RTE_ARM64_MEMCPY_ALIGN_MASK)
> +#else
> +/* Both dst and src unalignment will be treated as unaligned copy */
> +#define IS_UNALIGNED_COPY(dst, src) \
> +	(((uintptr_t)(dst) | (uintptr_t)(src)) & RTE_ARM64_MEMCPY_ALIGN_MASK)

Same as above

> +#endif
> +
> +
> +/*
> + * If copy size is larger than threshold, memcpy() will be used.
> + * Run "memcpy_perf_autotest" to determine the proper threshold.
> + */
> +#define RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD       ((size_t)(0xffffffff))
> +#define RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD     ((size_t)(0xffffffff))
> +
> +/*
> + * The logic of USE_RTE_MEMCPY() can also be modified to best fit platform.
> + */
> +#define USE_RTE_MEMCPY(dst, src, n) \
> +((!IS_UNALIGNED_COPY(dst, src) && n <= RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD) \
> +|| (IS_UNALIGNED_COPY(dst, src) && n <= RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD))
> +
> +
> +/**************************************
> + * End of customization section
> + **************************************/
> +#if defined(RTE_TOOLCHAIN_GCC) && !defined(RTE_AARCH64_SKIP_GCC_VERSION_CHECK)

To maintain consistency
s/RTE_AARCH64_SKIP_GCC_VERSION_CHECK/RTE_ARM64_MEMCPY_SKIP_GCC_VERSION_CHECK

> +#if (GCC_VERSION < 50400)
> +#warning "The GCC version is quite old, which may result in sub-optimal \
> +performance of the compiled code. It is suggested that at least GCC 5.4.0 \
> +be used."
> +#endif
> +#endif
> +
> +static __rte_always_inline void rte_mov16(uint8_t *dst, const uint8_t *src)

static __rte_always_inline
void rte_mov16(uint8_t *dst, const uint8_t *src)

> +{
> +	__uint128_t *dst128 = (__uint128_t *)dst;
> +	const __uint128_t *src128 = (const __uint128_t *)src;
> +	*dst128 = *src128;
> +}
> +
> +static __rte_always_inline void rte_mov32(uint8_t *dst, const uint8_t *src)

See above

> +{
> +	__uint128_t *dst128 = (__uint128_t *)dst;
> +	const __uint128_t *src128 = (const __uint128_t *)src;
> +	const __uint128_t x0 = src128[0], x1 = src128[1];
> +	dst128[0] = x0;
> +	dst128[1] = x1;
> +}
> +
> +static __rte_always_inline void rte_mov48(uint8_t *dst, const uint8_t *src)
> +{

See above

> +	__uint128_t *dst128 = (__uint128_t *)dst;
> +	const __uint128_t *src128 = (const __uint128_t *)src;
> +	const __uint128_t x0 = src128[0], x1 = src128[1], x2 = src128[2];
> +	dst128[0] = x0;
> +	dst128[1] = x1;
> +	dst128[2] = x2;
> +}
> +
> +static __rte_always_inline void rte_mov64(uint8_t *dst, const uint8_t *src)
> +{

See above

> +	__uint128_t *dst128 = (__uint128_t *)dst;
> +	const __uint128_t *src128 = (const __uint128_t *)src;
> +	const __uint128_t
> +		x0 = src128[0], x1 = src128[1], x2 = src128[2], x3 = src128[3];
> +	dst128[0] = x0;
> +	dst128[1] = x1;
> +	dst128[2] = x2;
> +	dst128[3] = x3;
> +}
> +
> +static __rte_always_inline void rte_mov128(uint8_t *dst, const uint8_t *src)
> +{

See above

> +	__uint128_t *dst128 = (__uint128_t *)dst;
> +	const __uint128_t *src128 = (const __uint128_t *)src;
> +	/* Keep below declaration & copy sequence for optimized instructions */
> +	const __uint128_t
> +		x0 = src128[0], x1 = src128[1], x2 = src128[2], x3 = src128[3];
> +	dst128[0] = x0;
> +	__uint128_t x4 = src128[4];
> +	dst128[1] = x1;
> +	__uint128_t x5 = src128[5];
> +	dst128[2] = x2;
> +	__uint128_t x6 = src128[6];
> +	dst128[3] = x3;
> +	__uint128_t x7 = src128[7];
> +	dst128[4] = x4;
> +	dst128[5] = x5;
> +	dst128[6] = x6;
> +	dst128[7] = x7;
> +}
> +
> +static __rte_always_inline void rte_mov256(uint8_t *dst, const uint8_t *src)
> +{

See above

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH v5] arch/arm: optimization for memcpy on AArch64
  2017-11-27  7:49 [PATCH] arch/arm: optimization for memcpy on AArch64 Herbert Guan
                   ` (3 preceding siblings ...)
  2017-12-18  2:54 ` [PATCH v3] " Herbert Guan
@ 2018-01-04 10:20 ` Herbert Guan
  2018-01-12 17:03   ` Thomas Monjalon
  2018-01-19  6:10   ` [PATCH v6] arch/arm: optimization for memcpy on ARM64 Herbert Guan
  4 siblings, 2 replies; 27+ messages in thread
From: Herbert Guan @ 2018-01-04 10:20 UTC (permalink / raw)
  To: dev, jerin.jacob; +Cc: Herbert Guan

This patch provides an option to do rte_memcpy() using 'restrict'
qualifier, which can induce GCC to do optimizations by using more
efficient instructions, providing some performance gain over memcpy()
on some AArch64 platforms/environments.

The memory copy performance differs between different AArch64
platforms. And a more recent glibc (e.g. 2.23 or later)
can provide a better memcpy() performance compared to old glibc
versions. It's always suggested to use a more recent glibc if
possible, from which the entire system can get benefit. If for some
reason an old glibc has to be used, this patch is provided for an
alternative.

This implementation can improve memory copy on some AArch64
platforms, when an old glibc (e.g. 2.19, 2.17...) is being used.
It is disabled by default and needs "RTE_ARCH_ARM64_MEMCPY"
defined to activate. It's not always providing better performance
than memcpy(), so users need to run the DPDK unit test
"memcpy_perf_autotest" and customize parameters in "customization
section" in rte_memcpy_64.h for best performance.

Compiler version will also impact the rte_memcpy() performance.
It's observed on some platforms and with the same code, GCC 7.2.0
compiled binary can provide better performance than GCC 4.8.5. It's
suggested to use GCC 5.4.0 or later.

Signed-off-by: Herbert Guan <herbert.guan@arm.com>
Acked-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
---
 config/common_armv8a_linuxapp                      |   6 +
 .../common/include/arch/arm/rte_memcpy_64.h        | 296 +++++++++++++++++++++
 2 files changed, 302 insertions(+)

diff --git a/config/common_armv8a_linuxapp b/config/common_armv8a_linuxapp
index 6732d1e..8f0cbed 100644
--- a/config/common_armv8a_linuxapp
+++ b/config/common_armv8a_linuxapp
@@ -44,6 +44,12 @@ CONFIG_RTE_FORCE_INTRINSICS=y
 # to address minimum DMA alignment across all arm64 implementations.
 CONFIG_RTE_CACHE_LINE_SIZE=128
 
+# Accelerate rte_memcpy.  Be sure to run unit test to determine the
+# best threshold in code.  Refer to notes in source file
+# (lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h) for more
+# info.
+CONFIG_RTE_ARCH_ARM64_MEMCPY=n
+
 CONFIG_RTE_LIBRTE_FM10K_PMD=n
 CONFIG_RTE_LIBRTE_SFC_EFX_PMD=n
 CONFIG_RTE_LIBRTE_AVP_PMD=n
diff --git a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
index b80d8ba..786c9cc 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
@@ -42,6 +42,300 @@
 
 #include "generic/rte_memcpy.h"
 
+#ifdef RTE_ARCH_ARM64_MEMCPY
+#include <rte_common.h>
+#include <rte_branch_prediction.h>
+
+/*
+ * The memory copy performance differs on different AArch64 micro-architectures.
+ * And the most recent glibc (e.g. 2.23 or later) can provide a better memcpy()
+ * performance compared to old glibc versions. It's always suggested to use a
+ * more recent glibc if possible, from which the entire system can get benefit.
+ *
+ * This implementation improves memory copy on some aarch64 micro-architectures,
+ * when an old glibc (e.g. 2.19, 2.17...) is being used. It is disabled by
+ * default and needs "RTE_ARCH_ARM64_MEMCPY" defined to activate. It's not
+ * always providing better performance than memcpy() so users need to run unit
+ * test "memcpy_perf_autotest" and customize parameters in customization section
+ * below for best performance.
+ *
+ * Compiler version will also impact the rte_memcpy() performance. It's observed
+ * on some platforms and with the same code, GCC 7.2.0 compiled binaries can
+ * provide better performance than GCC 4.8.5 compiled binaries.
+ */
+
+/**************************************
+ * Beginning of customization section
+ **************************************/
+#define RTE_ARM64_MEMCPY_ALIGN_MASK 0x0F
+#ifndef RTE_ARCH_ARM64_MEMCPY_STRICT_ALIGN
+/* Only src unalignment will be treaed as unaligned copy */
+#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
+	((uintptr_t)(dst) & RTE_ARM64_MEMCPY_ALIGN_MASK)
+#else
+/* Both dst and src unalignment will be treated as unaligned copy */
+#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
+	(((uintptr_t)(dst) | (uintptr_t)(src)) & RTE_ARM64_MEMCPY_ALIGN_MASK)
+#endif
+
+
+/*
+ * If copy size is larger than threshold, memcpy() will be used.
+ * Run "memcpy_perf_autotest" to determine the proper threshold.
+ */
+#define RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD       ((size_t)(0xffffffff))
+#define RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD     ((size_t)(0xffffffff))
+
+/*
+ * The logic of USE_RTE_MEMCPY() can also be modified to best fit platform.
+ */
+#define USE_RTE_MEMCPY(dst, src, n) \
+((!RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \
+n <= RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD) \
+|| (RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \
+n <= RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD))
+
+/**************************************
+ * End of customization section
+ **************************************/
+
+
+#if defined(RTE_TOOLCHAIN_GCC) && !defined(RTE_ARM64_MEMCPY_SKIP_GCC_VER_CHECK)
+#if (GCC_VERSION < 50400)
+#warning "The GCC version is quite old, which may result in sub-optimal \
+performance of the compiled code. It is suggested that at least GCC 5.4.0 \
+be used."
+#endif
+#endif
+
+static __rte_always_inline
+void rte_mov16(uint8_t *dst, const uint8_t *src)
+{
+	__uint128_t *dst128 = (__uint128_t *)dst;
+	const __uint128_t *src128 = (const __uint128_t *)src;
+	*dst128 = *src128;
+}
+
+static __rte_always_inline
+void rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	__uint128_t *dst128 = (__uint128_t *)dst;
+	const __uint128_t *src128 = (const __uint128_t *)src;
+	const __uint128_t x0 = src128[0], x1 = src128[1];
+	dst128[0] = x0;
+	dst128[1] = x1;
+}
+
+static __rte_always_inline
+void rte_mov48(uint8_t *dst, const uint8_t *src)
+{
+	__uint128_t *dst128 = (__uint128_t *)dst;
+	const __uint128_t *src128 = (const __uint128_t *)src;
+	const __uint128_t x0 = src128[0], x1 = src128[1], x2 = src128[2];
+	dst128[0] = x0;
+	dst128[1] = x1;
+	dst128[2] = x2;
+}
+
+static __rte_always_inline
+void rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	__uint128_t *dst128 = (__uint128_t *)dst;
+	const __uint128_t *src128 = (const __uint128_t *)src;
+	const __uint128_t
+		x0 = src128[0], x1 = src128[1], x2 = src128[2], x3 = src128[3];
+	dst128[0] = x0;
+	dst128[1] = x1;
+	dst128[2] = x2;
+	dst128[3] = x3;
+}
+
+static __rte_always_inline
+void rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	__uint128_t *dst128 = (__uint128_t *)dst;
+	const __uint128_t *src128 = (const __uint128_t *)src;
+	/* Keep below declaration & copy sequence for optimized instructions */
+	const __uint128_t
+		x0 = src128[0], x1 = src128[1], x2 = src128[2], x3 = src128[3];
+	dst128[0] = x0;
+	__uint128_t x4 = src128[4];
+	dst128[1] = x1;
+	__uint128_t x5 = src128[5];
+	dst128[2] = x2;
+	__uint128_t x6 = src128[6];
+	dst128[3] = x3;
+	__uint128_t x7 = src128[7];
+	dst128[4] = x4;
+	dst128[5] = x5;
+	dst128[6] = x6;
+	dst128[7] = x7;
+}
+
+static __rte_always_inline
+void rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+	rte_mov128(dst, src);
+	rte_mov128(dst + 128, src + 128);
+}
+
+static __rte_always_inline void
+rte_memcpy_lt16(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	if (n & 0x08) {
+		/* copy 8 ~ 15 bytes */
+		*(uint64_t *)dst = *(const uint64_t *)src;
+		*(uint64_t *)(dst - 8 + n) = *(const uint64_t *)(src - 8 + n);
+	} else if (n & 0x04) {
+		/* copy 4 ~ 7 bytes */
+		*(uint32_t *)dst = *(const uint32_t *)src;
+		*(uint32_t *)(dst - 4 + n) = *(const uint32_t *)(src - 4 + n);
+	} else if (n & 0x02) {
+		/* copy 2 ~ 3 bytes */
+		*(uint16_t *)dst = *(const uint16_t *)src;
+		*(uint16_t *)(dst - 2 + n) = *(const uint16_t *)(src - 2 + n);
+	} else if (n & 0x01) {
+		/* copy 1 byte */
+		*dst = *src;
+	}
+}
+
+static __rte_always_inline
+void rte_memcpy_ge16_lt128(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	if (n < 64) {
+		if (n == 16) {
+			rte_mov16(dst, src);
+		} else if (n <= 32) {
+			rte_mov16(dst, src);
+			rte_mov16(dst - 16 + n, src - 16 + n);
+		} else if (n <= 48) {
+			rte_mov32(dst, src);
+			rte_mov16(dst - 16 + n, src - 16 + n);
+		} else {
+			rte_mov48(dst, src);
+			rte_mov16(dst - 16 + n, src - 16 + n);
+		}
+	} else {
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		if (n > 48 + 64)
+			rte_mov64(dst - 64 + n, src - 64 + n);
+		else if (n > 32 + 64)
+			rte_mov48(dst - 48 + n, src - 48 + n);
+		else if (n > 16 + 64)
+			rte_mov32(dst - 32 + n, src - 32 + n);
+		else if (n > 64)
+			rte_mov16(dst - 16 + n, src - 16 + n);
+	}
+}
+
+static __rte_always_inline
+void rte_memcpy_ge128(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	do {
+		rte_mov128(dst, src);
+		src += 128;
+		dst += 128;
+		n -= 128;
+	} while (likely(n >= 128));
+
+	if (likely(n)) {
+		if (n <= 16)
+			rte_mov16(dst - 16 + n, src - 16 + n);
+		else if (n <= 32)
+			rte_mov32(dst - 32 + n, src - 32 + n);
+		else if (n <= 48)
+			rte_mov48(dst - 48 + n, src - 48 + n);
+		else if (n <= 64)
+			rte_mov64(dst - 64 + n, src - 64 + n);
+		else
+			rte_memcpy_ge16_lt128(dst, src, n);
+	}
+}
+
+static __rte_always_inline
+void rte_memcpy_ge16_lt64(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	if (n == 16) {
+		rte_mov16(dst, src);
+	} else if (n <= 32) {
+		rte_mov16(dst, src);
+		rte_mov16(dst - 16 + n, src - 16 + n);
+	} else if (n <= 48) {
+		rte_mov32(dst, src);
+		rte_mov16(dst - 16 + n, src - 16 + n);
+	} else {
+		rte_mov48(dst, src);
+		rte_mov16(dst - 16 + n, src - 16 + n);
+	}
+}
+
+static __rte_always_inline
+void rte_memcpy_ge64(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	do {
+		rte_mov64(dst, src);
+		src += 64;
+		dst += 64;
+		n -= 64;
+	} while (likely(n >= 64));
+
+	if (likely(n)) {
+		if (n <= 16)
+			rte_mov16(dst - 16 + n, src - 16 + n);
+		else if (n <= 32)
+			rte_mov32(dst - 32 + n, src - 32 + n);
+		else if (n <= 48)
+			rte_mov48(dst - 48 + n, src - 48 + n);
+		else
+			rte_mov64(dst - 64 + n, src - 64 + n);
+	}
+}
+
+#if RTE_CACHE_LINE_SIZE >= 128
+static __rte_always_inline
+void *rte_memcpy(void *dst, const void *src, size_t n)
+{
+	if (n < 16) {
+		rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
+		return dst;
+	}
+	if (n < 128) {
+		rte_memcpy_ge16_lt128((uint8_t *)dst, (const uint8_t *)src, n);
+		return dst;
+	}
+	__builtin_prefetch(src, 0, 0);
+	__builtin_prefetch(dst, 1, 0);
+	if (likely(USE_RTE_MEMCPY(dst, src, n))) {
+		rte_memcpy_ge128((uint8_t *)dst, (const uint8_t *)src, n);
+		return dst;
+	} else
+		return memcpy(dst, src, n);
+}
+
+#else
+static __rte_always_inline
+void *rte_memcpy(void *dst, const void *src, size_t n)
+{
+	if (n < 16) {
+		rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
+		return dst;
+	}
+	if (n < 64) {
+		rte_memcpy_ge16_lt64((uint8_t *)dst, (const uint8_t *)src, n);
+		return dst;
+	}
+	__builtin_prefetch(src, 0, 0);
+	__builtin_prefetch(dst, 1, 0);
+	if (likely(USE_RTE_MEMCPY(dst, src, n))) {
+		rte_memcpy_ge64((uint8_t *)dst, (const uint8_t *)src, n);
+		return dst;
+	} else
+		return memcpy(dst, src, n);
+}
+#endif /* RTE_CACHE_LINE_SIZE >= 128 */
+
+#else
 static inline void
 rte_mov16(uint8_t *dst, const uint8_t *src)
 {
@@ -80,6 +374,8 @@
 
 #define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
 
+#endif /* RTE_ARCH_ARM64_MEMCPY */
+
 #ifdef __cplusplus
 }
 #endif
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [PATCH v4] arch/arm: optimization for memcpy on AArch64
  2018-01-03 13:35     ` Jerin Jacob
@ 2018-01-04 10:23       ` Herbert Guan
  0 siblings, 0 replies; 27+ messages in thread
From: Herbert Guan @ 2018-01-04 10:23 UTC (permalink / raw)
  To: Jerin Jacob; +Cc: dev, nd

Thanks for the review and comments, Jerin.  A new version has been sent out for review with your comments applied and your Acked-by added.

Best regards,
Herbert

> -----Original Message-----
> From: Jerin Jacob [mailto:jerin.jacob@caviumnetworks.com]
> Sent: Wednesday, January 3, 2018 21:35
> To: Herbert Guan <Herbert.Guan@arm.com>
> Cc: dev@dpdk.org
> Subject: Re: [PATCH v4] arch/arm: optimization for memcpy on AArch64
> 
> -----Original Message-----
> > Date: Thu, 21 Dec 2017 13:33:47 +0800
> > From: Herbert Guan <herbert.guan@arm.com>
> > To: dev@dpdk.org, jerin.jacob@caviumnetworks.com
> > CC: Herbert Guan <herbert.guan@arm.com>
> > Subject: [PATCH v4] arch/arm: optimization for memcpy on AArch64
> > X-Mailer: git-send-email 1.8.3.1
> >
> > This patch provides an option to do rte_memcpy() using 'restrict'
> > qualifier, which can induce GCC to do optimizations by using more
> > efficient instructions, providing some performance gain over memcpy()
> > on some AArch64 platforms/environments.
> >
> > The memory copy performance differs between different AArch64
> > platforms. And a more recent glibc (e.g. 2.23 or later)
> > can provide a better memcpy() performance compared to old glibc
> > versions. It's always suggested to use a more recent glibc if
> > possible, from which the entire system can get benefit. If for some
> > reason an old glibc has to be used, this patch is provided for an
> > alternative.
> >
> > This implementation can improve memory copy on some AArch64
> > platforms, when an old glibc (e.g. 2.19, 2.17...) is being used.
> > It is disabled by default and needs "RTE_ARCH_ARM64_MEMCPY"
> > defined to activate. It's not always providing better performance
> > than memcpy(), so users need to run the DPDK unit test
> > "memcpy_perf_autotest" and customize parameters in "customization
> > section" in rte_memcpy_64.h for best performance.
> >
> > Compiler version will also impact the rte_memcpy() performance.
> > It's observed on some platforms and with the same code, GCC 7.2.0
> > compiled binary can provide better performance than GCC 4.8.5. It's
> > suggested to use GCC 5.4.0 or later.
> >
> > Signed-off-by: Herbert Guan <herbert.guan@arm.com>
> 
> Looks good. Please find inline requests for some minor changes.
> Feel free to add my Acked-by with those changes.
> 
> 
> > ---
> >  config/common_armv8a_linuxapp                      |   6 +
> >  .../common/include/arch/arm/rte_memcpy_64.h        | 287
> +++++++++++++++++++++
> >  2 files changed, 293 insertions(+)
> >
> > diff --git a/config/common_armv8a_linuxapp
> b/config/common_armv8a_linuxapp
> > index 6732d1e..8f0cbed 100644
> > --- a/config/common_armv8a_linuxapp
> > +++ b/config/common_armv8a_linuxapp
> > @@ -44,6 +44,12 @@ CONFIG_RTE_FORCE_INTRINSICS=y
> >  # to address minimum DMA alignment across all arm64 implementations.
> >  CONFIG_RTE_CACHE_LINE_SIZE=128
> >
> > +# Accelerate rte_memcpy.  Be sure to run unit test to determine the
> > +# best threshold in code.  Refer to notes in source file
> > +# (lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h) for more
> > +# info.
> > +CONFIG_RTE_ARCH_ARM64_MEMCPY=n
> > +
> >  CONFIG_RTE_LIBRTE_FM10K_PMD=n
> >  CONFIG_RTE_LIBRTE_SFC_EFX_PMD=n
> >  CONFIG_RTE_LIBRTE_AVP_PMD=n
> > diff --git a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> > index b80d8ba..b269f34 100644
> > --- a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> > +++ b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> > @@ -42,6 +42,291 @@
> >
> >  #include "generic/rte_memcpy.h"
> >
> > +#ifdef RTE_ARCH_ARM64_MEMCPY
> > +#include <rte_common.h>
> > +#include <rte_branch_prediction.h>
> > +
> > +/*
> > + * The memory copy performance differs on different AArch64 micro-
> architectures.
> > + * And the most recent glibc (e.g. 2.23 or later) can provide a better
> memcpy()
> > + * performance compared to old glibc versions. It's always suggested to
> use a
> > + * more recent glibc if possible, from which the entire system can get
> benefit.
> > + *
> > + * This implementation improves memory copy on some aarch64 micro-
> architectures,
> > + * when an old glibc (e.g. 2.19, 2.17...) is being used. It is disabled by
> > + * default and needs "RTE_ARCH_ARM64_MEMCPY" defined to activate.
> It's not
> > + * always providing better performance than memcpy() so users need to
> run unit
> > + * test "memcpy_perf_autotest" and customize parameters in
> customization section
> > + * below for best performance.
> > + *
> > + * Compiler version will also impact the rte_memcpy() performance. It's
> observed
> > + * on some platforms and with the same code, GCC 7.2.0 compiled
> binaries can
> > + * provide better performance than GCC 4.8.5 compiled binaries.
> > + */
> > +
> > +/**************************************
> > + * Beginning of customization section
> > + **************************************/
> > +#define RTE_ARM64_MEMCPY_ALIGN_MASK 0x0F
> > +#ifndef RTE_ARCH_ARM64_MEMCPY_STRICT_ALIGN
> > +/* Only src unalignment will be treaed as unaligned copy */
> > +#define IS_UNALIGNED_COPY(dst, src) \
> 
> Better to change it to RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY, as it is
> defined in a public DPDK header file.
> 
> 
> > +	((uintptr_t)(dst) & RTE_ARM64_MEMCPY_ALIGN_MASK)
> > +#else
> > +/* Both dst and src unalignment will be treated as unaligned copy */
> > +#define IS_UNALIGNED_COPY(dst, src) \
> > +	(((uintptr_t)(dst) | (uintptr_t)(src)) &
> RTE_ARM64_MEMCPY_ALIGN_MASK)
> 
> Same as above
> 
> > +#endif
> > +
> > +
> > +/*
> > + * If copy size is larger than threshold, memcpy() will be used.
> > + * Run "memcpy_perf_autotest" to determine the proper threshold.
> > + */
> > +#define RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD
> ((size_t)(0xffffffff))
> > +#define RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD
> ((size_t)(0xffffffff))
> > +
> > +/*
> > + * The logic of USE_RTE_MEMCPY() can also be modified to best fit
> platform.
> > + */
> > +#define USE_RTE_MEMCPY(dst, src, n) \
> > +((!IS_UNALIGNED_COPY(dst, src) && n <=
> RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD) \
> > +|| (IS_UNALIGNED_COPY(dst, src) && n <=
> RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD))
> > +
> > +
> > +/**************************************
> > + * End of customization section
> > + **************************************/
> > +#if defined(RTE_TOOLCHAIN_GCC)
> && !defined(RTE_AARCH64_SKIP_GCC_VERSION_CHECK)
> 
> To maintain consistency
> s/RTE_AARCH64_SKIP_GCC_VERSION_CHECK/RTE_ARM64_MEMCPY_SKIP_GCC_VERSION_CHECK
> 
> > +#if (GCC_VERSION < 50400)
> > +#warning "The GCC version is quite old, which may result in sub-optimal \
> > +performance of the compiled code. It is suggested that at least GCC 5.4.0 \
> > +be used."
> > +#endif
> > +#endif
> > +
> > +static __rte_always_inline void rte_mov16(uint8_t *dst, const uint8_t
> *src)
> 
> static __rte_always_inline
> void rte_mov16(uint8_t *dst, const uint8_t *src)
> 
> > +{
> > +	__uint128_t *dst128 = (__uint128_t *)dst;
> > +	const __uint128_t *src128 = (const __uint128_t *)src;
> > +	*dst128 = *src128;
> > +}
> > +
> > +static __rte_always_inline void rte_mov32(uint8_t *dst, const uint8_t
> *src)
> 
> See above
> 
> > +{
> > +	__uint128_t *dst128 = (__uint128_t *)dst;
> > +	const __uint128_t *src128 = (const __uint128_t *)src;
> > +	const __uint128_t x0 = src128[0], x1 = src128[1];
> > +	dst128[0] = x0;
> > +	dst128[1] = x1;
> > +}
> > +
> > +static __rte_always_inline void rte_mov48(uint8_t *dst, const uint8_t
> *src)
> > +{
> 
> See above
> 
> > +	__uint128_t *dst128 = (__uint128_t *)dst;
> > +	const __uint128_t *src128 = (const __uint128_t *)src;
> > +	const __uint128_t x0 = src128[0], x1 = src128[1], x2 = src128[2];
> > +	dst128[0] = x0;
> > +	dst128[1] = x1;
> > +	dst128[2] = x2;
> > +}
> > +
> > +static __rte_always_inline void rte_mov64(uint8_t *dst, const uint8_t
> *src)
> > +{
> 
> See above
> 
> > +	__uint128_t *dst128 = (__uint128_t *)dst;
> > +	const __uint128_t *src128 = (const __uint128_t *)src;
> > +	const __uint128_t
> > +		x0 = src128[0], x1 = src128[1], x2 = src128[2], x3 = src128[3];
> > +	dst128[0] = x0;
> > +	dst128[1] = x1;
> > +	dst128[2] = x2;
> > +	dst128[3] = x3;
> > +}
> > +
> > +static __rte_always_inline void rte_mov128(uint8_t *dst, const uint8_t
> *src)
> > +{
> 
> See above
> 
> > +	__uint128_t *dst128 = (__uint128_t *)dst;
> > +	const __uint128_t *src128 = (const __uint128_t *)src;
> > +	/* Keep below declaration & copy sequence for optimized
> instructions */
> > +	const __uint128_t
> > +		x0 = src128[0], x1 = src128[1], x2 = src128[2], x3 = src128[3];
> > +	dst128[0] = x0;
> > +	__uint128_t x4 = src128[4];
> > +	dst128[1] = x1;
> > +	__uint128_t x5 = src128[5];
> > +	dst128[2] = x2;
> > +	__uint128_t x6 = src128[6];
> > +	dst128[3] = x3;
> > +	__uint128_t x7 = src128[7];
> > +	dst128[4] = x4;
> > +	dst128[5] = x5;
> > +	dst128[6] = x6;
> > +	dst128[7] = x7;
> > +}
> > +
> > +static __rte_always_inline void rte_mov256(uint8_t *dst, const uint8_t
> *src)
> > +{
> 
> See above

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v5] arch/arm: optimization for memcpy on AArch64
  2018-01-04 10:20 ` [PATCH v5] " Herbert Guan
@ 2018-01-12 17:03   ` Thomas Monjalon
  2018-01-15 10:57     ` Herbert Guan
  2018-01-19  6:10   ` [PATCH v6] arch/arm: optimization for memcpy on ARM64 Herbert Guan
  1 sibling, 1 reply; 27+ messages in thread
From: Thomas Monjalon @ 2018-01-12 17:03 UTC (permalink / raw)
  To: Herbert Guan; +Cc: dev, jerin.jacob

Hi,

All the code is using ARM64, but the title suggests AArch64.
What is the difference between AArch64 and ARM64 (and ARMv8)?

04/01/2018 11:20, Herbert Guan:
> +/**************************************
> + * Beginning of customization section
> + **************************************/
> +#define RTE_ARM64_MEMCPY_ALIGN_MASK 0x0F
> +#ifndef RTE_ARCH_ARM64_MEMCPY_STRICT_ALIGN
> +/* Only src unalignment will be treaed as unaligned copy */

typo: treaed

> +#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
> +	((uintptr_t)(dst) & RTE_ARM64_MEMCPY_ALIGN_MASK)
> +#else
> +/* Both dst and src unalignment will be treated as unaligned copy */
> +#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
> +	(((uintptr_t)(dst) | (uintptr_t)(src)) & RTE_ARM64_MEMCPY_ALIGN_MASK)
> +#endif
> +
> +
> +/*
> + * If copy size is larger than threshold, memcpy() will be used.
> + * Run "memcpy_perf_autotest" to determine the proper threshold.
> + */
> +#define RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD       ((size_t)(0xffffffff))
> +#define RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD     ((size_t)(0xffffffff))
> +
> +/*
> + * The logic of USE_RTE_MEMCPY() can also be modified to best fit platform.
> + */
> +#define USE_RTE_MEMCPY(dst, src, n) \
> +((!RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \
> +n <= RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD) \
> +|| (RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \
> +n <= RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD))
> +
> +/**************************************
> + * End of customization section
> + **************************************/

Modifying the code to adjust to the platform is not easy for deployment.
Can we move some customization variables into the configuration file?
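For example, something along these lines in config/common_armv8a_linuxapp
(the option names and values below are only an illustration, to be tuned per
platform):

  CONFIG_RTE_ARCH_ARM64_MEMCPY=n
  #CONFIG_RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD=2048
  #CONFIG_RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD=512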

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v5] arch/arm: optimization for memcpy on AArch64
  2018-01-12 17:03   ` Thomas Monjalon
@ 2018-01-15 10:57     ` Herbert Guan
  2018-01-15 11:37       ` Thomas Monjalon
  0 siblings, 1 reply; 27+ messages in thread
From: Herbert Guan @ 2018-01-15 10:57 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: dev, jerin.jacob, nd

Hi Thomas,


> -----Original Message-----
> From: Thomas Monjalon [mailto:thomas@monjalon.net]
> Sent: Saturday, January 13, 2018 1:04
> To: Herbert Guan <Herbert.Guan@arm.com>
> Cc: dev@dpdk.org; jerin.jacob@caviumnetworks.com
> Subject: Re: [dpdk-dev] [PATCH v5] arch/arm: optimization for memcpy on
> AArch64
> 
> Hi,
> 
> All the code is using ARM64, but the title suggests AArch64.
> What is the difference between AArch64 and ARM64 (and ARMv8)?

AArch64 and ARM64 refer to the same thing.  AArch64 is the 64-bit architecture introduced with ARMv8-A, but the Linux kernel community calls it ARM64.  As for DPDK, most existing compile flags use ARM64, so this patch keeps the ARM64 naming in the newly added compile options.

> 
> 04/01/2018 11:20, Herbert Guan:
> > +/**************************************
> > + * Beginning of customization section
> > +**************************************/
> > +#define RTE_ARM64_MEMCPY_ALIGN_MASK 0x0F #ifndef
> > +RTE_ARCH_ARM64_MEMCPY_STRICT_ALIGN
> > +/* Only src unalignment will be treaed as unaligned copy */
> 
> typo: treaed

It should be 'treated'.  Will correct it in the next version.

> 
> > +#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
> > +	((uintptr_t)(dst) & RTE_ARM64_MEMCPY_ALIGN_MASK) #else
> > +/* Both dst and src unalignment will be treated as unaligned copy */
> > +#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
> > +	(((uintptr_t)(dst) | (uintptr_t)(src)) &
> > +RTE_ARM64_MEMCPY_ALIGN_MASK) #endif
> > +
> > +
> > +/*
> > + * If copy size is larger than threshold, memcpy() will be used.
> > + * Run "memcpy_perf_autotest" to determine the proper threshold.
> > + */
> > +#define RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD
> ((size_t)(0xffffffff))
> > +#define RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD
> ((size_t)(0xffffffff))
> > +
> > +/*
> > + * The logic of USE_RTE_MEMCPY() can also be modified to best fit
> platform.
> > + */
> > +#define USE_RTE_MEMCPY(dst, src, n) \
> > +((!RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \ n <=
> > +RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD) \
> > +|| (RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \
> > +n <= RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD))
> > +
> > +/**************************************
> > + * End of customization section
> > + **************************************/
> 
> Modifying the code to adjust to the platform is not easy for deployment.
> Can we move some customization variables into the configuration file?

RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD and RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD are the two parameters that can be configured at build time.  They can be set to the best values for the target platform.  Usually it's not necessary to change the expression; the comment added in the code is just a hint that this code piece can be modified.
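To give a concrete (purely illustrative) example: if
RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD were set to 2048, an aligned copy at or
below that size would take the inline copy path, while a larger aligned copy
would fall back to memcpy():

  #include <stdint.h>
  #include <rte_common.h>
  #include <rte_memcpy.h>

  static void copy_example(void)
  {
          /* 16-byte aligned buffers, so the copy is treated as aligned */
          static uint8_t dst[4096] __rte_aligned(16);
          static uint8_t src[4096] __rte_aligned(16);

          rte_memcpy(dst, src, 1500); /* n <= threshold: inline copy path */
          rte_memcpy(dst, src, 4096); /* n >  threshold: falls back to memcpy() */
  }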


Best regards,
Herbert

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v5] arch/arm: optimization for memcpy on AArch64
  2018-01-15 10:57     ` Herbert Guan
@ 2018-01-15 11:37       ` Thomas Monjalon
  2018-01-18 23:54         ` Thomas Monjalon
  0 siblings, 1 reply; 27+ messages in thread
From: Thomas Monjalon @ 2018-01-15 11:37 UTC (permalink / raw)
  To: Herbert Guan; +Cc: dev, jerin.jacob, nd

15/01/2018 11:57, Herbert Guan:
> Hi Thomas,
> 
> From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > Hi,
> > 
> > All the code is using ARM64, but the title suggests AArch64.
> > What is the difference between AArch64 and ARM64 (and ARMv8)?
> 
> AArch64 and ARM64 refer to the same thing.  AArch64 is the 64-bit architecture introduced with ARMv8-A, but the Linux kernel community calls it ARM64.  As for DPDK, most existing compile flags use ARM64, so this patch keeps the ARM64 naming in the newly added compile options.

So please let's continue to call it ARM64.

> > 04/01/2018 11:20, Herbert Guan:
> > > +#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
> > > +	((uintptr_t)(dst) & RTE_ARM64_MEMCPY_ALIGN_MASK) #else
> > > +/* Both dst and src unalignment will be treated as unaligned copy */
> > > +#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
> > > +	(((uintptr_t)(dst) | (uintptr_t)(src)) &
> > > +RTE_ARM64_MEMCPY_ALIGN_MASK) #endif
> > > +
> > > +
> > > +/*
> > > + * If copy size is larger than threshold, memcpy() will be used.
> > > + * Run "memcpy_perf_autotest" to determine the proper threshold.
> > > + */
> > > +#define RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD
> > ((size_t)(0xffffffff))
> > > +#define RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD
> > ((size_t)(0xffffffff))
> > > +
> > > +/*
> > > + * The logic of USE_RTE_MEMCPY() can also be modified to best fit
> > platform.
> > > + */
> > > +#define USE_RTE_MEMCPY(dst, src, n) \
> > > +((!RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \ n <=
> > > +RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD) \
> > > +|| (RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \
> > > +n <= RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD))
> > > +
> > > +/**************************************
> > > + * End of customization section
> > > + **************************************/
> > 
> > Modifying the code to adjust to the platform is not easy for deployment.
> > Can we move some customization variables into the configuration file?
> 
> > RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD and RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD are the two parameters that can be configured at build time.  They can be set to the best values for the target platform.  Usually it's not necessary to change the expression; the comment added in the code is just a hint that this code piece can be modified.

The build time configuration must be set in the config file
(config/common_armv8a_linuxapp).
v6 please?

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v5] arch/arm: optimization for memcpy on AArch64
  2018-01-15 11:37       ` Thomas Monjalon
@ 2018-01-18 23:54         ` Thomas Monjalon
  2018-01-19  6:16           ` Re: " Herbert Guan
  0 siblings, 1 reply; 27+ messages in thread
From: Thomas Monjalon @ 2018-01-18 23:54 UTC (permalink / raw)
  To: Herbert Guan; +Cc: dev, jerin.jacob, nd

Ping
Are we targeting to integrate this optimization into DPDK 18.02?
I am expecting a v6, thanks.

15/01/2018 12:37, Thomas Monjalon:
> 15/01/2018 11:57, Herbert Guan:
> > Hi Thomas,
> > 
> > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > > Hi,
> > > 
> > > All the code is using ARM64, but the title suggests AArch64.
> > > What is the difference between AArch64 and ARM64 (and ARMv8)?
> > 
> > AArch64 and ARM64 refer to the same thing.  AArch64 is the 64-bit architecture introduced with ARMv8-A, but the Linux kernel community calls it ARM64.  As for DPDK, most existing compile flags use ARM64, so this patch keeps the ARM64 naming in the newly added compile options.
> 
> So please let's continue to call it ARM64.
> 
> > > 04/01/2018 11:20, Herbert Guan:
> > > > +#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
> > > > +	((uintptr_t)(dst) & RTE_ARM64_MEMCPY_ALIGN_MASK) #else
> > > > +/* Both dst and src unalignment will be treated as unaligned copy */
> > > > +#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
> > > > +	(((uintptr_t)(dst) | (uintptr_t)(src)) &
> > > > +RTE_ARM64_MEMCPY_ALIGN_MASK) #endif
> > > > +
> > > > +
> > > > +/*
> > > > + * If copy size is larger than threshold, memcpy() will be used.
> > > > + * Run "memcpy_perf_autotest" to determine the proper threshold.
> > > > + */
> > > > +#define RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD
> > > ((size_t)(0xffffffff))
> > > > +#define RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD
> > > ((size_t)(0xffffffff))
> > > > +
> > > > +/*
> > > > + * The logic of USE_RTE_MEMCPY() can also be modified to best fit
> > > platform.
> > > > + */
> > > > +#define USE_RTE_MEMCPY(dst, src, n) \
> > > > +((!RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \ n <=
> > > > +RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD) \
> > > > +|| (RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \
> > > > +n <= RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD))
> > > > +
> > > > +/**************************************
> > > > + * End of customization section
> > > > + **************************************/
> > > 
> > > Modifying the code to adjust to the platform is not easy for deployment.
> > > Can we move some customization variables into the configuration file?
> > 
> > RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD and RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD are the two parameters that can be configured at build time.  They can be set to the best values for the target platform.  Usually it's not necessary to change the expression; the comment added in the code is just a hint that this code piece can be modified.
> 
> The build time configuration must be set in the config file
> (config/common_armv8a_linuxapp).
> v6 please?
> 

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH v6] arch/arm: optimization for memcpy on ARM64
  2018-01-04 10:20 ` [PATCH v5] " Herbert Guan
  2018-01-12 17:03   ` Thomas Monjalon
@ 2018-01-19  6:10   ` Herbert Guan
  2018-01-20 16:21     ` Thomas Monjalon
  1 sibling, 1 reply; 27+ messages in thread
From: Herbert Guan @ 2018-01-19  6:10 UTC (permalink / raw)
  To: dev, thomas, jerin.jacob; +Cc: Herbert Guan

This patch provides an option to do rte_memcpy() using 'restrict'
qualifier, which can induce GCC to do optimizations by using more
efficient instructions, providing some performance gain over memcpy()
on some ARM64 platforms/environments.

The memory copy performance differs between different ARM64
platforms. And a more recent glibc (e.g. 2.23 or later)
can provide a better memcpy() performance compared to old glibc
versions. It's always suggested to use a more recent glibc if
possible, from which the entire system can get benefit. If for some
reason an old glibc has to be used, this patch is provided for an
alternative.

This implementation can improve memory copy on some ARM64
platforms, when an old glibc (e.g. 2.19, 2.17...) is being used.
It is disabled by default and needs "RTE_ARCH_ARM64_MEMCPY"
defined to activate. It's not always providing better performance
than memcpy(), so users need to run the DPDK unit test
"memcpy_perf_autotest" and customize parameters in "customization
section" in rte_memcpy_64.h for best performance.

Compiler version will also impact the rte_memcpy() performance.
It's observed on some platforms and with the same code, GCC 7.2.0
compiled binary can provide better performance than GCC 4.8.5. It's
suggested to use GCC 5.4.0 or later.

Signed-off-by: Herbert Guan <herbert.guan@arm.com>
Acked-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
---
 config/common_armv8a_linuxapp                      |  12 +
 .../common/include/arch/arm/rte_memcpy_64.h        | 313 +++++++++++++++++++++
 2 files changed, 325 insertions(+)

diff --git a/config/common_armv8a_linuxapp b/config/common_armv8a_linuxapp
index 781c854..790e716 100644
--- a/config/common_armv8a_linuxapp
+++ b/config/common_armv8a_linuxapp
@@ -17,6 +17,18 @@ CONFIG_RTE_FORCE_INTRINSICS=y
 # to address minimum DMA alignment across all arm64 implementations.
 CONFIG_RTE_CACHE_LINE_SIZE=128
 
+# Accelerate rte_memcpy. Be sure to run unit test (memcpy_perf_autotest)
+# to determine the best threshold in code. Refer to notes in source file
+# (lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h) for more info.
+CONFIG_RTE_ARCH_ARM64_MEMCPY=n
+#CONFIG_RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD=2048
+#CONFIG_RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD=512
+# Leave below RTE_ARM64_MEMCPY_xxx options commented out, unless there're
+# strong reasons.
+#CONFIG_RTE_ARM64_MEMCPY_SKIP_GCC_VER_CHECK=n
+#CONFIG_RTE_ARM64_MEMCPY_ALIGN_MASK=0xF
+#CONFIG_RTE_ARM64_MEMCPY_STRICT_ALIGN=n
+
 CONFIG_RTE_LIBRTE_FM10K_PMD=n
 CONFIG_RTE_LIBRTE_SFC_EFX_PMD=n
 CONFIG_RTE_LIBRTE_AVP_PMD=n
diff --git a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
index f408aaa..beb97a7 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
@@ -14,6 +14,317 @@
 
 #include "generic/rte_memcpy.h"
 
+#ifdef RTE_ARCH_ARM64_MEMCPY
+#include <rte_common.h>
+#include <rte_branch_prediction.h>
+
+/*
+ * The memory copy performance differs on different AArch64 micro-architectures.
+ * And the most recent glibc (e.g. 2.23 or later) can provide a better memcpy()
+ * performance compared to old glibc versions. It's always suggested to use a
+ * more recent glibc if possible, from which the entire system can get benefit.
+ *
+ * This implementation improves memory copy on some aarch64 micro-architectures,
+ * when an old glibc (e.g. 2.19, 2.17...) is being used. It is disabled by
+ * default and needs "RTE_ARCH_ARM64_MEMCPY" defined to activate. It's not
+ * always providing better performance than memcpy() so users need to run unit
+ * test "memcpy_perf_autotest" and customize parameters in customization section
+ * below for best performance.
+ *
+ * Compiler version will also impact the rte_memcpy() performance. It's observed
+ * on some platforms and with the same code, GCC 7.2.0 compiled binaries can
+ * provide better performance than GCC 4.8.5 compiled binaries.
+ */
+
+/**************************************
+ * Beginning of customization section
+ **************************************/
+#ifndef RTE_ARM64_MEMCPY_ALIGN_MASK
+#define RTE_ARM64_MEMCPY_ALIGN_MASK ((RTE_CACHE_LINE_SIZE >> 3) - 1)
+#endif
+
+#ifndef RTE_ARM64_MEMCPY_STRICT_ALIGN
+/* Only src unalignment will be treated as unaligned copy */
+#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
+	((uintptr_t)(src) & RTE_ARM64_MEMCPY_ALIGN_MASK)
+#else
+/* Both dst and src unalignment will be treated as unaligned copy */
+#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
+	(((uintptr_t)(dst) | (uintptr_t)(src)) & RTE_ARM64_MEMCPY_ALIGN_MASK)
+#endif
+
+
+/*
+ * If copy size is larger than threshold, memcpy() will be used.
+ * Run "memcpy_perf_autotest" to determine the proper threshold.
+ */
+#ifdef RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD
+#define USE_ALIGNED_RTE_MEMCPY(dst, src, n) \
+(!RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \
+n <= (size_t)RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD)
+#else
+#define USE_ALIGNED_RTE_MEMCPY(dst, src, n) \
+(!RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src))
+#endif
+#ifdef RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD
+#define USE_UNALIGNED_RTE_MEMCPY(dst, src, n) \
+(RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \
+n <= (size_t)RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD)
+#else
+#define USE_UNALIGNED_RTE_MEMCPY(dst, src, n) \
+(RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src))
+#endif
+/*
+ * The logic of USE_RTE_MEMCPY() can also be modified to best fit the platform.
+ */
+#if defined(RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD) \
+|| defined(RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD)
+#define USE_RTE_MEMCPY(dst, src, n) \
+(USE_ALIGNED_RTE_MEMCPY(dst, src, n) || USE_UNALIGNED_RTE_MEMCPY(dst, src, n))
+#else
+#define USE_RTE_MEMCPY(dst, src, n) (1)
+#endif
+/**************************************
+ * End of customization section
+ **************************************/
+
+
+#if defined(RTE_TOOLCHAIN_GCC) && !defined(RTE_ARM64_MEMCPY_SKIP_GCC_VER_CHECK)
+#if (GCC_VERSION < 50400)
+#warning "The GCC version is quite old, which may result in sub-optimal \
+performance of the compiled code. It is suggested that at least GCC 5.4.0 \
+be used."
+#endif
+#endif
+
+static __rte_always_inline
+void rte_mov16(uint8_t *dst, const uint8_t *src)
+{
+	__uint128_t *dst128 = (__uint128_t *)dst;
+	const __uint128_t *src128 = (const __uint128_t *)src;
+	*dst128 = *src128;
+}
+
+static __rte_always_inline
+void rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	__uint128_t *dst128 = (__uint128_t *)dst;
+	const __uint128_t *src128 = (const __uint128_t *)src;
+	const __uint128_t x0 = src128[0], x1 = src128[1];
+	dst128[0] = x0;
+	dst128[1] = x1;
+}
+
+static __rte_always_inline
+void rte_mov48(uint8_t *dst, const uint8_t *src)
+{
+	__uint128_t *dst128 = (__uint128_t *)dst;
+	const __uint128_t *src128 = (const __uint128_t *)src;
+	const __uint128_t x0 = src128[0], x1 = src128[1], x2 = src128[2];
+	dst128[0] = x0;
+	dst128[1] = x1;
+	dst128[2] = x2;
+}
+
+static __rte_always_inline
+void rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	__uint128_t *dst128 = (__uint128_t *)dst;
+	const __uint128_t *src128 = (const __uint128_t *)src;
+	const __uint128_t
+		x0 = src128[0], x1 = src128[1], x2 = src128[2], x3 = src128[3];
+	dst128[0] = x0;
+	dst128[1] = x1;
+	dst128[2] = x2;
+	dst128[3] = x3;
+}
+
+static __rte_always_inline
+void rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	__uint128_t *dst128 = (__uint128_t *)dst;
+	const __uint128_t *src128 = (const __uint128_t *)src;
+	/* Keep below declaration & copy sequence for optimized instructions */
+	const __uint128_t
+		x0 = src128[0], x1 = src128[1], x2 = src128[2], x3 = src128[3];
+	dst128[0] = x0;
+	__uint128_t x4 = src128[4];
+	dst128[1] = x1;
+	__uint128_t x5 = src128[5];
+	dst128[2] = x2;
+	__uint128_t x6 = src128[6];
+	dst128[3] = x3;
+	__uint128_t x7 = src128[7];
+	dst128[4] = x4;
+	dst128[5] = x5;
+	dst128[6] = x6;
+	dst128[7] = x7;
+}
+
+static __rte_always_inline
+void rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+	rte_mov128(dst, src);
+	rte_mov128(dst + 128, src + 128);
+}
+
+static __rte_always_inline void
+rte_memcpy_lt16(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	if (n & 0x08) {
+		/* copy 8 ~ 15 bytes */
+		*(uint64_t *)dst = *(const uint64_t *)src;
+		*(uint64_t *)(dst - 8 + n) = *(const uint64_t *)(src - 8 + n);
+	} else if (n & 0x04) {
+		/* copy 4 ~ 7 bytes */
+		*(uint32_t *)dst = *(const uint32_t *)src;
+		*(uint32_t *)(dst - 4 + n) = *(const uint32_t *)(src - 4 + n);
+	} else if (n & 0x02) {
+		/* copy 2 ~ 3 bytes */
+		*(uint16_t *)dst = *(const uint16_t *)src;
+		*(uint16_t *)(dst - 2 + n) = *(const uint16_t *)(src - 2 + n);
+	} else if (n & 0x01) {
+		/* copy 1 byte */
+		*dst = *src;
+	}
+}
+
+static __rte_always_inline
+void rte_memcpy_ge16_lt128(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	if (n < 64) {
+		if (n == 16) {
+			rte_mov16(dst, src);
+		} else if (n <= 32) {
+			rte_mov16(dst, src);
+			rte_mov16(dst - 16 + n, src - 16 + n);
+		} else if (n <= 48) {
+			rte_mov32(dst, src);
+			rte_mov16(dst - 16 + n, src - 16 + n);
+		} else {
+			rte_mov48(dst, src);
+			rte_mov16(dst - 16 + n, src - 16 + n);
+		}
+	} else {
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		if (n > 48 + 64)
+			rte_mov64(dst - 64 + n, src - 64 + n);
+		else if (n > 32 + 64)
+			rte_mov48(dst - 48 + n, src - 48 + n);
+		else if (n > 16 + 64)
+			rte_mov32(dst - 32 + n, src - 32 + n);
+		else if (n > 64)
+			rte_mov16(dst - 16 + n, src - 16 + n);
+	}
+}
+
+static __rte_always_inline
+void rte_memcpy_ge128(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	do {
+		rte_mov128(dst, src);
+		src += 128;
+		dst += 128;
+		n -= 128;
+	} while (likely(n >= 128));
+
+	if (likely(n)) {
+		if (n <= 16)
+			rte_mov16(dst - 16 + n, src - 16 + n);
+		else if (n <= 32)
+			rte_mov32(dst - 32 + n, src - 32 + n);
+		else if (n <= 48)
+			rte_mov48(dst - 48 + n, src - 48 + n);
+		else if (n <= 64)
+			rte_mov64(dst - 64 + n, src - 64 + n);
+		else
+			rte_memcpy_ge16_lt128(dst, src, n);
+	}
+}
+
+static __rte_always_inline
+void rte_memcpy_ge16_lt64(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	if (n == 16) {
+		rte_mov16(dst, src);
+	} else if (n <= 32) {
+		rte_mov16(dst, src);
+		rte_mov16(dst - 16 + n, src - 16 + n);
+	} else if (n <= 48) {
+		rte_mov32(dst, src);
+		rte_mov16(dst - 16 + n, src - 16 + n);
+	} else {
+		rte_mov48(dst, src);
+		rte_mov16(dst - 16 + n, src - 16 + n);
+	}
+}
+
+static __rte_always_inline
+void rte_memcpy_ge64(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	do {
+		rte_mov64(dst, src);
+		src += 64;
+		dst += 64;
+		n -= 64;
+	} while (likely(n >= 64));
+
+	if (likely(n)) {
+		if (n <= 16)
+			rte_mov16(dst - 16 + n, src - 16 + n);
+		else if (n <= 32)
+			rte_mov32(dst - 32 + n, src - 32 + n);
+		else if (n <= 48)
+			rte_mov48(dst - 48 + n, src - 48 + n);
+		else
+			rte_mov64(dst - 64 + n, src - 64 + n);
+	}
+}
+
+#if RTE_CACHE_LINE_SIZE >= 128
+static __rte_always_inline
+void *rte_memcpy(void *dst, const void *src, size_t n)
+{
+	if (n < 16) {
+		rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
+		return dst;
+	}
+	if (n < 128) {
+		rte_memcpy_ge16_lt128((uint8_t *)dst, (const uint8_t *)src, n);
+		return dst;
+	}
+	__builtin_prefetch(src, 0, 0);
+	__builtin_prefetch(dst, 1, 0);
+	if (likely(USE_RTE_MEMCPY(dst, src, n))) {
+		rte_memcpy_ge128((uint8_t *)dst, (const uint8_t *)src, n);
+		return dst;
+	} else
+		return memcpy(dst, src, n);
+}
+
+#else
+static __rte_always_inline
+void *rte_memcpy(void *dst, const void *src, size_t n)
+{
+	if (n < 16) {
+		rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
+		return dst;
+	}
+	if (n < 64) {
+		rte_memcpy_ge16_lt64((uint8_t *)dst, (const uint8_t *)src, n);
+		return dst;
+	}
+	__builtin_prefetch(src, 0, 0);
+	__builtin_prefetch(dst, 1, 0);
+	if (likely(USE_RTE_MEMCPY(dst, src, n))) {
+		rte_memcpy_ge64((uint8_t *)dst, (const uint8_t *)src, n);
+		return dst;
+	} else
+		return memcpy(dst, src, n);
+}
+#endif /* RTE_CACHE_LINE_SIZE >= 128 */
+
+#else
 static inline void
 rte_mov16(uint8_t *dst, const uint8_t *src)
 {
@@ -52,6 +363,8 @@
 
 #define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
 
+#endif /* RTE_ARCH_ARM64_MEMCPY */
+
 #ifdef __cplusplus
 }
 #endif
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 27+ messages in thread
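
For readers evaluating the patch, the sketch below is a minimal, hypothetical caller; the function and buffer names are illustrative only. With CONFIG_RTE_ARCH_ARM64_MEMCPY=y, rte_memcpy() remains a drop-in replacement for memcpy() and the size/alignment dispatch shown in the diff is applied internally; with the option left at its default (n), the header maps rte_memcpy() straight to memcpy().

#include <string.h>
#include <stdint.h>
#include <rte_memcpy.h>

/* Hypothetical helper: copy a burst of equally sized packet buffers.
 * Each call goes through the inline ARM64 path when the option is
 * enabled, or through libc memcpy() otherwise.
 */
static void
copy_burst(uint8_t *dst, const uint8_t *src, size_t pkt_len,
	   unsigned int nb_pkts)
{
	unsigned int i;

	for (i = 0; i < nb_pkts; i++)
		rte_memcpy(dst + i * pkt_len, src + i * pkt_len, pkt_len);
}

Before settling on threshold values, build once with the option enabled, run the DPDK test application and issue "memcpy_perf_autotest" at the RTE>> prompt, then compare the numbers against a build that uses plain memcpy().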

* Re: [PATCH v5] arch/arm: optimization for memcpy on AArch64
  2018-01-18 23:54         ` Thomas Monjalon
@ 2018-01-19  6:16           ` Herbert Guan
  0 siblings, 0 replies; 27+ messages in thread
From: Herbert Guan @ 2018-01-19  6:16 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: dev, jerin.jacob, nd

Yes, this is the target.  I was in a 3-day meeting this week, and had limited time to update the patch.  

A new patch v6 was sent out just now.  It was actually sent twice -- I forgot to add the version info and "--in-reply-to" in the first one.  Please just ignore that one; sorry for the disturbance.


Best regards,
Herbert

-----Original Message-----
From: Thomas Monjalon [mailto:thomas@monjalon.net]
Sent: January 19, 2018 7:54
To: Herbert Guan <Herbert.Guan@arm.com>
Cc: dev@dpdk.org; jerin.jacob@caviumnetworks.com; nd <nd@arm.com>
Subject: Re: [dpdk-dev] [PATCH v5] arch/arm: optimization for memcpy on AArch64

Ping
Are we targeting integration of this optimization in DPDK 18.02?
I am expecting a v6, thanks.

15/01/2018 12:37, Thomas Monjalon:
> 15/01/2018 11:57, Herbert Guan:
> > Hi Thomas,
> > 
> > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > > Hi,
> > > 
> > > All the code is using ARM64, but the title suggests AArch64.
> > > What is the difference between AArch64 and ARM64 (and ARMv8)?
> > 
> > AArch64 and ARM64 refer to the same thing.  AArch64 refers to the 64-bit architecture introduced with ARMv8-A, but the Linux kernel community calls it ARM64.  As for DPDK, most existing compile flags use ARM64, so this patch keeps the ARM64 naming in the newly added compile options.
> 
> So please let's continue to call it ARM64.
> 
> > > 04/01/2018 11:20, Herbert Guan:
> > > > +#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
> > > > +	((uintptr_t)(dst) & RTE_ARM64_MEMCPY_ALIGN_MASK)
> > > > +#else
> > > > +/* Both dst and src unalignment will be treated as unaligned copy */
> > > > +#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
> > > > +	(((uintptr_t)(dst) | (uintptr_t)(src)) & RTE_ARM64_MEMCPY_ALIGN_MASK)
> > > > +#endif
> > > > +
> > > > +
> > > > +/*
> > > > + * If copy size is larger than threshold, memcpy() will be used.
> > > > + * Run "memcpy_perf_autotest" to determine the proper threshold.
> > > > + */
> > > > +#define RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD ((size_t)(0xffffffff))
> > > > +#define RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD ((size_t)(0xffffffff))
> > > > +
> > > > +/*
> > > > + * The logic of USE_RTE_MEMCPY() can also be modified to best fit platform.
> > > > + */
> > > > +#define USE_RTE_MEMCPY(dst, src, n) \
> > > > +((!RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \
> > > > +n <= RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD) \
> > > > +|| (RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \
> > > > +n <= RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD))
> > > > +
> > > > +/**************************************
> > > > + * End of customization section
> > > > + **************************************/
> > > 
> > > Modifying the code to adjust to the platform is not easy for deployment.
> > > Can we move some customization variables inside the configuration file?
> > 
> > RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD and RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD are the two parameters that can be configured at build time.  They can be set to the best values for the target platform.  Usually it's not necessary to change the expression; the comment added in the code is just a hint that this code piece can be modified.
> 
> The build time configuration must be set in the config file 
> (config/common_armv8a_linuxapp).
> v6 please?
> 
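
As applied in v6 (see the config/common_armv8a_linuxapp hunk earlier in this thread), these thresholds did move into the configuration file. A tuned build for a particular platform might, for example, end up with entries along the lines below; the numbers are placeholders taken from the commented-out defaults in the diff, and only memcpy_perf_autotest results on the target hardware can justify real values:

CONFIG_RTE_ARCH_ARM64_MEMCPY=y
CONFIG_RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD=2048
CONFIG_RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD=512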




^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v6] arch/arm: optimization for memcpy on ARM64
  2018-01-19  6:10   ` [PATCH v6] arch/arm: optimization for memcpy on ARM64 Herbert Guan
@ 2018-01-20 16:21     ` Thomas Monjalon
  0 siblings, 0 replies; 27+ messages in thread
From: Thomas Monjalon @ 2018-01-20 16:21 UTC (permalink / raw)
  To: Herbert Guan; +Cc: dev, jerin.jacob

19/01/2018 07:10, Herbert Guan:
> This patch provides an option to do rte_memcpy() using 'restrict'
> qualifier, which can induce GCC to do optimizations by using more
> efficient instructions, providing some performance gain over memcpy()
> on some ARM64 platforms/environments.
> 
> The memory copy performance differs between different ARM64
> platforms. And a more recent glibc (e.g. 2.23 or later)
> can provide a better memcpy() performance compared to old glibc
> versions. It's always suggested to use a more recent glibc if
> possible, from which the entire system can get benefit. If for some
> reason an old glibc has to be used, this patch is provided for an
> alternative.
> 
> This implementation can improve memory copy on some ARM64
> platforms, when an old glibc (e.g. 2.19, 2.17...) is being used.
> It is disabled by default and needs "RTE_ARCH_ARM64_MEMCPY"
> defined to activate. It's not always providing better performance
> than memcpy() so users need to run DPDK unit test
> "memcpy_perf_autotest" and customize parameters in "customization
> section" in rte_memcpy_64.h for best performance.
> 
> Compiler version will also impact the rte_memcpy() performance.
> It's observed on some platforms and with the same code, GCC 7.2.0
> compiled binary can provide better performance than GCC 4.8.5. It's
> suggested to use GCC 5.4.0 or later.
> 
> Signed-off-by: Herbert Guan <herbert.guan@arm.com>
> Acked-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>

Applied, thanks

^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2018-01-20 16:21 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-11-27  7:49 [PATCH] arch/arm: optimization for memcpy on AArch64 Herbert Guan
2017-11-29 12:31 ` Jerin Jacob
2017-12-03 12:37   ` Herbert Guan
2017-12-15  4:06     ` Jerin Jacob
2017-12-18  2:51       ` Herbert Guan
2017-12-18  4:17         ` Jerin Jacob
2017-12-02  7:33 ` Pavan Nikhilesh Bhagavatula
2017-12-03 12:38   ` Herbert Guan
2017-12-03 14:20     ` Pavan Nikhilesh Bhagavatula
2017-12-04  7:14       ` Herbert Guan
2017-12-05  6:02 ` [PATCH v2] " Herbert Guan
2017-12-15  3:41   ` Jerin Jacob
2017-12-18  2:54 ` [PATCH v3] " Herbert Guan
2017-12-18  7:43   ` Jerin Jacob
2017-12-19  5:33     ` Herbert Guan
2017-12-19  7:24       ` Jerin Jacob
2017-12-21  5:33   ` [PATCH v4] " Herbert Guan
2018-01-03 13:35     ` Jerin Jacob
2018-01-04 10:23       ` Herbert Guan
2018-01-04 10:20 ` [PATCH v5] " Herbert Guan
2018-01-12 17:03   ` Thomas Monjalon
2018-01-15 10:57     ` Herbert Guan
2018-01-15 11:37       ` Thomas Monjalon
2018-01-18 23:54         ` Thomas Monjalon
2018-01-19  6:16           ` Re: " Herbert Guan
2018-01-19  6:10   ` [PATCH v6] arch/arm: optimization for memcpy on ARM64 Herbert Guan
2018-01-20 16:21     ` Thomas Monjalon
