All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/3] dynamic linking support
@ 2017-08-25  2:06 Xiaoyun Li
  2017-08-25  2:06 ` [PATCH 1/3] eal/x86: run-time dispatch over memcpy Xiaoyun Li
                   ` (3 more replies)
  0 siblings, 4 replies; 22+ messages in thread
From: Xiaoyun Li @ 2017-08-25  2:06 UTC (permalink / raw)
  To: bruce.richardson; +Cc: dev, wenzhuo.lu, zhihong.wang, qi.z.zhang, Xiaoyun Li

This patchset dynamically selects functions at run-time based on CPU flags
that the current machine supports. This patchset modifies memcpy, the memcpy
perf test and x86 EFD, using function pointers and binding them at constructor time.

Xiaoyun Li (3):
  eal/x86: run-time dispatch over memcpy
  app/test: run-time dispatch over memcpy perf test
  efd: run-time dispatch over x86 EFD functions

 .../common/include/arch/x86/rte_memcpy.h           | 305 ++++++++++++---------
 lib/librte_efd/rte_efd_x86.h                       |  35 ++-
 mk/machine/native/rte.vars.mk                      |   2 +
 test/test/test_memcpy_perf.c                       |  36 ++-
 4 files changed, 236 insertions(+), 142 deletions(-)

-- 
2.7.4

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH 1/3] eal/x86: run-time dispatch over memcpy
  2017-08-25  2:06 [PATCH 0/3] dynamic linking support Xiaoyun Li
@ 2017-08-25  2:06 ` Xiaoyun Li
  2017-08-30 14:56   ` Ananyev, Konstantin
  2017-08-30 18:00   ` Stephen Hemminger
  2017-08-25  2:06 ` [PATCH 2/3] app/test: run-time dispatch over memcpy perf test Xiaoyun Li
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 22+ messages in thread
From: Xiaoyun Li @ 2017-08-25  2:06 UTC (permalink / raw)
  To: bruce.richardson; +Cc: dev, wenzhuo.lu, zhihong.wang, qi.z.zhang, Xiaoyun Li

This patch dynamically selects functions of memcpy at run-time based
on CPU flags that the current machine supports. This patch uses function
pointers which are bound to the corresponding functions at constructor time.
To make AVX512 instructions pass compilation, enable the switch in the
makefile.

Signed-off-by: Xiaoyun Li <xiaoyun.li@intel.com>
---
 .../common/include/arch/x86/rte_memcpy.h           | 305 ++++++++++++---------
 mk/machine/native/rte.vars.mk                      |   2 +
 2 files changed, 181 insertions(+), 126 deletions(-)

diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
index 74c280c..f68ebd2 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
@@ -45,11 +45,37 @@
 #include <string.h>
 #include <rte_vect.h>
 #include <rte_common.h>
+#include <rte_cpuflags.h>
+#include <rte_log.h>
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+/*
+ * Select SSE/AVX memory copy method as default one.
+ */
+
+static uint16_t alignment_mask = 0x0F;
+
+typedef void (*rte_mov16_t)(uint8_t *dst, const uint8_t *src);
+typedef void (*rte_mov32_t)(uint8_t *dst, const uint8_t *src);
+typedef void (*rte_mov64_t)(uint8_t *dst, const uint8_t *src);
+typedef void (*rte_mov128_t)(uint8_t *dst, const uint8_t *src);
+typedef void (*rte_mov256_t)(uint8_t *dst, const uint8_t *src);
+typedef void (*rte_mov128blocks_t)(uint8_t *dst, const uint8_t *src, size_t n);
+typedef void (*rte_mov512blocks_t)(uint8_t *dst, const uint8_t *src, size_t n);
+typedef void * (*rte_memcpy_generic_t)(void *dst, const void *src, size_t n);
+
+static rte_mov16_t rte_mov16;
+static rte_mov32_t rte_mov32;
+static rte_mov64_t rte_mov64;
+static rte_mov128_t rte_mov128;
+static rte_mov256_t rte_mov256;
+static rte_mov128blocks_t rte_mov128blocks;
+static rte_mov512blocks_t rte_mov512blocks;
+static rte_memcpy_generic_t rte_memcpy_generic;
+
 /**
  * Copy bytes from one location to another. The locations must not overlap.
  *
@@ -68,10 +94,6 @@ extern "C" {
 static __rte_always_inline void *
 rte_memcpy(void *dst, const void *src, size_t n);
 
-#ifdef RTE_MACHINE_CPUFLAG_AVX512F
-
-#define ALIGNMENT_MASK 0x3F
-
 /**
  * AVX512 implementation below
  */
@@ -81,7 +103,7 @@ rte_memcpy(void *dst, const void *src, size_t n);
  * locations should not overlap.
  */
 static inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
+rte_mov16_AVX512F(uint8_t *dst, const uint8_t *src)
 {
 	__m128i xmm0;
 
@@ -94,7 +116,7 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
  * locations should not overlap.
  */
 static inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
+rte_mov32_AVX512F(uint8_t *dst, const uint8_t *src)
 {
 	__m256i ymm0;
 
@@ -107,7 +129,7 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
  * locations should not overlap.
  */
 static inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
+rte_mov64_AVX512F(uint8_t *dst, const uint8_t *src)
 {
 	__m512i zmm0;
 
@@ -120,10 +142,10 @@ rte_mov64(uint8_t *dst, const uint8_t *src)
  * locations should not overlap.
  */
 static inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
+rte_mov128_AVX512F(uint8_t *dst, const uint8_t *src)
 {
-	rte_mov64(dst + 0 * 64, src + 0 * 64);
-	rte_mov64(dst + 1 * 64, src + 1 * 64);
+	(*rte_mov64)(dst + 0 * 64, src + 0 * 64);
+	(*rte_mov64)(dst + 1 * 64, src + 1 * 64);
 }
 
 /**
@@ -131,12 +153,12 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
  * locations should not overlap.
  */
 static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
+rte_mov256_AVX512F(uint8_t *dst, const uint8_t *src)
 {
-	rte_mov64(dst + 0 * 64, src + 0 * 64);
-	rte_mov64(dst + 1 * 64, src + 1 * 64);
-	rte_mov64(dst + 2 * 64, src + 2 * 64);
-	rte_mov64(dst + 3 * 64, src + 3 * 64);
+	(*rte_mov64)(dst + 0 * 64, src + 0 * 64);
+	(*rte_mov64)(dst + 1 * 64, src + 1 * 64);
+	(*rte_mov64)(dst + 2 * 64, src + 2 * 64);
+	(*rte_mov64)(dst + 3 * 64, src + 3 * 64);
 }
 
 /**
@@ -144,7 +166,7 @@ rte_mov256(uint8_t *dst, const uint8_t *src)
  * locations should not overlap.
  */
 static inline void
-rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+rte_mov128blocks_AVX512F(uint8_t *dst, const uint8_t *src, size_t n)
 {
 	__m512i zmm0, zmm1;
 
@@ -164,7 +186,7 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
  * locations should not overlap.
  */
 static inline void
-rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
+rte_mov512blocks_AVX512F(uint8_t *dst, const uint8_t *src, size_t n)
 {
 	__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
 
@@ -192,7 +214,7 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 }
 
 static inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_AVX512F(void *dst, const void *src, size_t n)
 {
 	uintptr_t dstu = (uintptr_t)dst;
 	uintptr_t srcu = (uintptr_t)src;
@@ -228,39 +250,39 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
 	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n,
+		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov16)((uint8_t *)dst - 16 + n,
 				  (const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
+		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov32)((uint8_t *)dst - 32 + n,
 				  (const uint8_t *)src - 32 + n);
 		return ret;
 	}
 	if (n <= 512) {
 		if (n >= 256) {
 			n -= 256;
-			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
+			(*rte_mov256)((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 256;
 			dst = (uint8_t *)dst + 256;
 		}
 		if (n >= 128) {
 			n -= 128;
-			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
+			(*rte_mov128)((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 128;
 			dst = (uint8_t *)dst + 128;
 		}
 COPY_BLOCK_128_BACK63:
 		if (n > 64) {
-			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-			rte_mov64((uint8_t *)dst - 64 + n,
+			(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
+			(*rte_mov64)((uint8_t *)dst - 64 + n,
 					  (const uint8_t *)src - 64 + n);
 			return ret;
 		}
 		if (n > 0)
-			rte_mov64((uint8_t *)dst - 64 + n,
+			(*rte_mov64)((uint8_t *)dst - 64 + n,
 					  (const uint8_t *)src - 64 + n);
 		return ret;
 	}
@@ -272,7 +294,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	if (dstofss > 0) {
 		dstofss = 64 - dstofss;
 		n -= dstofss;
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
 		src = (const uint8_t *)src + dstofss;
 		dst = (uint8_t *)dst + dstofss;
 	}
@@ -282,7 +304,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Use copy block function for better instruction order control,
 	 * which is important when load is unaligned.
 	 */
-	rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
+	(*rte_mov512blocks)((uint8_t *)dst, (const uint8_t *)src, n);
 	bits = n;
 	n = n & 511;
 	bits -= n;
@@ -295,7 +317,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * which is important when load is unaligned.
 	 */
 	if (n >= 128) {
-		rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
+		(*rte_mov128blocks)((uint8_t *)dst, (const uint8_t *)src, n);
 		bits = n;
 		n = n & 127;
 		bits -= n;
@@ -309,10 +331,6 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	goto COPY_BLOCK_128_BACK63;
 }
 
-#elif defined RTE_MACHINE_CPUFLAG_AVX2
-
-#define ALIGNMENT_MASK 0x1F
-
 /**
  * AVX2 implementation below
  */
@@ -322,7 +340,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
  * locations should not overlap.
  */
 static inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
+rte_mov16_AVX2(uint8_t *dst, const uint8_t *src)
 {
 	__m128i xmm0;
 
@@ -335,7 +353,7 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
  * locations should not overlap.
  */
 static inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
+rte_mov32_AVX2(uint8_t *dst, const uint8_t *src)
 {
 	__m256i ymm0;
 
@@ -348,10 +366,10 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
  * locations should not overlap.
  */
 static inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
+rte_mov64_AVX2(uint8_t *dst, const uint8_t *src)
 {
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
+	(*rte_mov32)((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
+	(*rte_mov32)((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
 }
 
 /**
@@ -359,12 +377,12 @@ rte_mov64(uint8_t *dst, const uint8_t *src)
  * locations should not overlap.
  */
 static inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
+rte_mov128_AVX2(uint8_t *dst, const uint8_t *src)
 {
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
-	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
+	(*rte_mov32)((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
+	(*rte_mov32)((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
+	(*rte_mov32)((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
+	(*rte_mov32)((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
 }
 
 /**
@@ -372,7 +390,7 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
  * locations should not overlap.
  */
 static inline void
-rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+rte_mov128blocks_AVX2(uint8_t *dst, const uint8_t *src, size_t n)
 {
 	__m256i ymm0, ymm1, ymm2, ymm3;
 
@@ -392,7 +410,7 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 }
 
 static inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_AVX2(void *dst, const void *src, size_t n)
 {
 	uintptr_t dstu = (uintptr_t)dst;
 	uintptr_t srcu = (uintptr_t)src;
@@ -429,46 +447,46 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Fast way when copy size doesn't exceed 256 bytes
 	 */
 	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n,
+		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov16)((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 48) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
-		rte_mov16((uint8_t *)dst - 16 + n,
+		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov16)((uint8_t *)dst + 16, (const uint8_t *)src + 16);
+		(*rte_mov16)((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
+		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov32)((uint8_t *)dst - 32 + n,
 				(const uint8_t *)src - 32 + n);
 		return ret;
 	}
 	if (n <= 256) {
 		if (n >= 128) {
 			n -= 128;
-			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
+			(*rte_mov128)((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 128;
 			dst = (uint8_t *)dst + 128;
 		}
 COPY_BLOCK_128_BACK31:
 		if (n >= 64) {
 			n -= 64;
-			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+			(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 64;
 			dst = (uint8_t *)dst + 64;
 		}
 		if (n > 32) {
-			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-			rte_mov32((uint8_t *)dst - 32 + n,
+			(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
+			(*rte_mov32)((uint8_t *)dst - 32 + n,
 					(const uint8_t *)src - 32 + n);
 			return ret;
 		}
 		if (n > 0) {
-			rte_mov32((uint8_t *)dst - 32 + n,
+			(*rte_mov32)((uint8_t *)dst - 32 + n,
 					(const uint8_t *)src - 32 + n);
 		}
 		return ret;
@@ -481,7 +499,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	if (dstofss > 0) {
 		dstofss = 32 - dstofss;
 		n -= dstofss;
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
 		src = (const uint8_t *)src + dstofss;
 		dst = (uint8_t *)dst + dstofss;
 	}
@@ -489,7 +507,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	/**
 	 * Copy 128-byte blocks
 	 */
-	rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
+	(*rte_mov128blocks)((uint8_t *)dst, (const uint8_t *)src, n);
 	bits = n;
 	n = n & 127;
 	bits -= n;
@@ -502,10 +520,6 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	goto COPY_BLOCK_128_BACK31;
 }
 
-#else /* RTE_MACHINE_CPUFLAG */
-
-#define ALIGNMENT_MASK 0x0F
-
 /**
  * SSE & AVX implementation below
  */
@@ -515,7 +529,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
  * locations should not overlap.
  */
 static inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
+rte_mov16_DEFAULT(uint8_t *dst, const uint8_t *src)
 {
 	__m128i xmm0;
 
@@ -528,10 +542,10 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
  * locations should not overlap.
  */
 static inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
+rte_mov32_DEFAULT(uint8_t *dst, const uint8_t *src)
 {
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	(*rte_mov16)((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	(*rte_mov16)((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
 }
 
 /**
@@ -539,12 +553,12 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
  * locations should not overlap.
  */
 static inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
+rte_mov64_DEFAULT(uint8_t *dst, const uint8_t *src)
 {
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
+	(*rte_mov16)((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	(*rte_mov16)((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	(*rte_mov16)((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+	(*rte_mov16)((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
 }
 
 /**
@@ -552,16 +566,16 @@ rte_mov64(uint8_t *dst, const uint8_t *src)
  * locations should not overlap.
  */
 static inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
+rte_mov128_DEFAULT(uint8_t *dst, const uint8_t *src)
 {
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
-	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
-	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
-	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
+	(*rte_mov16)((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	(*rte_mov16)((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	(*rte_mov16)((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+	(*rte_mov16)((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
+	(*rte_mov16)((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
+	(*rte_mov16)((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
+	(*rte_mov16)((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
+	(*rte_mov16)((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
 }
 
 /**
@@ -569,24 +583,24 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
  * locations should not overlap.
  */
 static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
+rte_mov256_DEFAULT(uint8_t *dst, const uint8_t *src)
 {
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
-	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
-	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
-	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
-	rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
-	rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
-	rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
-	rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
-	rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
-	rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
-	rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
-	rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
+	(*rte_mov16)((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	(*rte_mov16)((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	(*rte_mov16)((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+	(*rte_mov16)((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
+	(*rte_mov16)((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
+	(*rte_mov16)((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
+	(*rte_mov16)((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
+	(*rte_mov16)((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
+	(*rte_mov16)((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
+	(*rte_mov16)((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
+	(*rte_mov16)((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
+	(*rte_mov16)((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
+	(*rte_mov16)((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
+	(*rte_mov16)((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
+	(*rte_mov16)((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
+	(*rte_mov16)((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
 }
 
 /**
@@ -684,7 +698,7 @@ __extension__ ({                                                      \
 })
 
 static inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_DEFAULT(void *dst, const void *src, size_t n)
 {
 	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
 	uintptr_t dstu = (uintptr_t)dst;
@@ -722,19 +736,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
 	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov16)((uint8_t *)dst - 16 + n,
+				(const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 48) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov16)((uint8_t *)dst - 16 + n,
+				(const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov16)((uint8_t *)dst + 32, (const uint8_t *)src + 32);
+		(*rte_mov16)((uint8_t *)dst - 16 + n,
+				(const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 128) {
@@ -743,39 +760,42 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	if (n <= 512) {
 		if (n >= 256) {
 			n -= 256;
-			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
-			rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
+			(*rte_mov128)((uint8_t *)dst, (const uint8_t *)src);
+			(*rte_mov128)((uint8_t *)dst + 128,
+					(const uint8_t *)src + 128);
 			src = (const uint8_t *)src + 256;
 			dst = (uint8_t *)dst + 256;
 		}
 COPY_BLOCK_255_BACK15:
 		if (n >= 128) {
 			n -= 128;
-			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
+			(*rte_mov128)((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 128;
 			dst = (uint8_t *)dst + 128;
 		}
 COPY_BLOCK_128_BACK15:
 		if (n >= 64) {
 			n -= 64;
-			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+			(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 64;
 			dst = (uint8_t *)dst + 64;
 		}
 COPY_BLOCK_64_BACK15:
 		if (n >= 32) {
 			n -= 32;
-			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+			(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 32;
 			dst = (uint8_t *)dst + 32;
 		}
 		if (n > 16) {
-			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+			(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
+			(*rte_mov16)((uint8_t *)dst - 16 + n,
+					(const uint8_t *)src - 16 + n);
 			return ret;
 		}
 		if (n > 0) {
-			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+			(*rte_mov16)((uint8_t *)dst - 16 + n,
+					(const uint8_t *)src - 16 + n);
 		}
 		return ret;
 	}
@@ -790,7 +810,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	if (dstofss > 0) {
 		dstofss = 16 - dstofss + 16;
 		n -= dstofss;
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
 		src = (const uint8_t *)src + dstofss;
 		dst = (uint8_t *)dst + dstofss;
 	}
@@ -804,7 +824,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 		 * Copy 256-byte blocks
 		 */
 		for (; n >= 256; n -= 256) {
-			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
+			(*rte_mov256)((uint8_t *)dst, (const uint8_t *)src);
 			dst = (uint8_t *)dst + 256;
 			src = (const uint8_t *)src + 256;
 		}
@@ -826,7 +846,40 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	goto COPY_BLOCK_64_BACK15;
 }
 
-#endif /* RTE_MACHINE_CPUFLAG */
+static void __attribute__((constructor))
+rte_memcpy_init(void)
+{
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F)) {
+		alignment_mask = 0x3F;
+		rte_mov16 = rte_mov16_AVX512F;
+		rte_mov32 = rte_mov32_AVX512F;
+		rte_mov64 = rte_mov64_AVX512F;
+		rte_mov128 = rte_mov128_AVX512F;
+		rte_mov256 = rte_mov256_AVX512F;
+		rte_mov128blocks = rte_mov128blocks_AVX512F;
+		rte_mov512blocks = rte_mov512blocks_AVX512F;
+		rte_memcpy_generic = rte_memcpy_generic_AVX512F;
+		RTE_LOG(INFO, EAL, "AVX512 implementation of memcpy() is using!\n");
+	} else if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2)) {
+		alignment_mask = 0x1F;
+		rte_mov16 = rte_mov16_AVX2;
+		rte_mov32 = rte_mov32_AVX2;
+		rte_mov64 = rte_mov64_AVX2;
+		rte_mov128 = rte_mov128_AVX2;
+		rte_mov128blocks = rte_mov128blocks_AVX2;
+		rte_memcpy_generic = rte_memcpy_generic_AVX2;
+		RTE_LOG(INFO, EAL, "AVX2 implementation of memcpy() is using!\n");
+	} else {
+		alignment_mask = 0x0F;
+		rte_mov16 = rte_mov16_DEFAULT;
+		rte_mov32 = rte_mov32_DEFAULT;
+		rte_mov64 = rte_mov64_DEFAULT;
+		rte_mov128 = rte_mov128_DEFAULT;
+		rte_mov256 = rte_mov256_DEFAULT;
+		rte_memcpy_generic = rte_memcpy_generic_DEFAULT;
+		RTE_LOG(INFO, EAL, "Default SSE/AVX implementation of memcpy() is using!\n");
+	}
+}
 
 static inline void *
 rte_memcpy_aligned(void *dst, const void *src, size_t n)
@@ -858,8 +911,8 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 
 	/* Copy 16 <= size <= 32 bytes */
 	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n,
+		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov16)((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 
 		return ret;
@@ -867,8 +920,8 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 
 	/* Copy 32 < size <= 64 bytes */
 	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
+		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov32)((uint8_t *)dst - 32 + n,
 				(const uint8_t *)src - 32 + n);
 
 		return ret;
@@ -876,13 +929,13 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 
 	/* Copy 64 bytes blocks */
 	for (; n >= 64; n -= 64) {
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
 		dst = (uint8_t *)dst + 64;
 		src = (const uint8_t *)src + 64;
 	}
 
 	/* Copy whatever left */
-	rte_mov64((uint8_t *)dst - 64 + n,
+	(*rte_mov64)((uint8_t *)dst - 64 + n,
 			(const uint8_t *)src - 64 + n);
 
 	return ret;
@@ -891,10 +944,10 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 static inline void *
 rte_memcpy(void *dst, const void *src, size_t n)
 {
-	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
+	if (!(((uintptr_t)dst | (uintptr_t)src) & alignment_mask))
 		return rte_memcpy_aligned(dst, src, n);
 	else
-		return rte_memcpy_generic(dst, src, n);
+		return (*rte_memcpy_generic)(dst, src, n);
 }
 
 #ifdef __cplusplus
diff --git a/mk/machine/native/rte.vars.mk b/mk/machine/native/rte.vars.mk
index f7d98d0..cdcf6c6 100644
--- a/mk/machine/native/rte.vars.mk
+++ b/mk/machine/native/rte.vars.mk
@@ -65,3 +65,5 @@ SSE42_SUPPORT=$(shell $(CC) -march=native -dM -E - </dev/null | grep SSE4_2)
 ifeq ($(SSE42_SUPPORT),)
     MACHINE_CFLAGS = -march=corei7
 endif
+
+MACHINE_CFLAGS += -mavx512f
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 2/3] app/test: run-time dispatch over memcpy perf test
  2017-08-25  2:06 [PATCH 0/3] dynamic linking support Xiaoyun Li
  2017-08-25  2:06 ` [PATCH 1/3] eal/x86: run-time dispatch over memcpy Xiaoyun Li
@ 2017-08-25  2:06 ` Xiaoyun Li
  2017-08-25  2:06 ` [PATCH 3/3] efd: run-time dispatch over x86 EFD functions Xiaoyun Li
  2017-09-01  8:56 ` [PATCH v2 0/3] dynamic linking support Xiaoyun Li
  3 siblings, 0 replies; 22+ messages in thread
From: Xiaoyun Li @ 2017-08-25  2:06 UTC (permalink / raw)
  To: bruce.richardson; +Cc: dev, wenzhuo.lu, zhihong.wang, qi.z.zhang, Xiaoyun Li

This patch modifies the assignment of the alignment unit from build-time
to run-time based on CPU flags that the machine supports.

Signed-off-by: Xiaoyun Li <xiaoyun.li@intel.com>
---
 test/test/test_memcpy_perf.c | 36 +++++++++++++++++++++++-------------
 1 file changed, 23 insertions(+), 13 deletions(-)

diff --git a/test/test/test_memcpy_perf.c b/test/test/test_memcpy_perf.c
index ff3aaaa..68132e6 100644
--- a/test/test/test_memcpy_perf.c
+++ b/test/test/test_memcpy_perf.c
@@ -79,13 +79,7 @@ static size_t buf_sizes[TEST_VALUE_RANGE];
 #define TEST_BATCH_SIZE         100
 
 /* Data is aligned on this many bytes (power of 2) */
-#ifdef RTE_MACHINE_CPUFLAG_AVX512F
-#define ALIGNMENT_UNIT          64
-#elif defined RTE_MACHINE_CPUFLAG_AVX2
-#define ALIGNMENT_UNIT          32
-#else /* RTE_MACHINE_CPUFLAG */
-#define ALIGNMENT_UNIT          16
-#endif /* RTE_MACHINE_CPUFLAG */
+static uint8_t alignment_unit = 16;
 
 /*
  * Pointers used in performance tests. The two large buffers are for uncached
@@ -101,19 +95,34 @@ init_buffers(void)
 {
 	unsigned i;
 
-	large_buf_read = rte_malloc("memcpy", LARGE_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F))
+		alignment_unit = 64;
+	else if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2))
+		alignment_unit = 32;
+	else
+		alignment_unit = 16;
+
+	large_buf_read = rte_malloc("memcpy",
+				    LARGE_BUFFER_SIZE + alignment_unit,
+				    alignment_unit);
 	if (large_buf_read == NULL)
 		goto error_large_buf_read;
 
-	large_buf_write = rte_malloc("memcpy", LARGE_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
+	large_buf_write = rte_malloc("memcpy",
+				     LARGE_BUFFER_SIZE + alignment_unit,
+				     alignment_unit);
 	if (large_buf_write == NULL)
 		goto error_large_buf_write;
 
-	small_buf_read = rte_malloc("memcpy", SMALL_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
+	small_buf_read = rte_malloc("memcpy",
+				    SMALL_BUFFER_SIZE + alignment_unit,
+				    alignment_unit);
 	if (small_buf_read == NULL)
 		goto error_small_buf_read;
 
-	small_buf_write = rte_malloc("memcpy", SMALL_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
+	small_buf_write = rte_malloc("memcpy",
+				     SMALL_BUFFER_SIZE + alignment_unit,
+				     alignment_unit);
 	if (small_buf_write == NULL)
 		goto error_small_buf_write;
 
@@ -153,7 +162,7 @@ static inline size_t
 get_rand_offset(size_t uoffset)
 {
 	return ((rte_rand() % (LARGE_BUFFER_SIZE - SMALL_BUFFER_SIZE)) &
-			~(ALIGNMENT_UNIT - 1)) + uoffset;
+			~(alignment_unit - 1)) + uoffset;
 }
 
 /* Fill in source and destination addresses. */
@@ -321,7 +330,8 @@ perf_test(void)
 		   "(bytes)        (ticks)        (ticks)        (ticks)        (ticks)\n"
 		   "------- -------------- -------------- -------------- --------------");
 
-	printf("\n========================== %2dB aligned ============================", ALIGNMENT_UNIT);
+	printf("\n========================= %2dB aligned ============================",
+		alignment_unit);
 	/* Do aligned tests where size is a variable */
 	perf_test_variable_aligned();
 	printf("\n------- -------------- -------------- -------------- --------------");
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 3/3] efd: run-time dispatch over x86 EFD functions
  2017-08-25  2:06 [PATCH 0/3] dynamic linking support Xiaoyun Li
  2017-08-25  2:06 ` [PATCH 1/3] eal/x86: run-time dispatch over memcpy Xiaoyun Li
  2017-08-25  2:06 ` [PATCH 2/3] app/test: run-time dispatch over memcpy perf test Xiaoyun Li
@ 2017-08-25  2:06 ` Xiaoyun Li
  2017-09-01  8:56 ` [PATCH v2 0/3] dynamic linking support Xiaoyun Li
  3 siblings, 0 replies; 22+ messages in thread
From: Xiaoyun Li @ 2017-08-25  2:06 UTC (permalink / raw)
  To: bruce.richardson; +Cc: dev, wenzhuo.lu, zhihong.wang, qi.z.zhang, Xiaoyun Li

This patch dynamically selects x86 EFD functions at run-time.
This patch uses a function pointer and binds it to the corresponding
function based on CPU flags at constructor time.

Signed-off-by: Xiaoyun Li <xiaoyun.li@intel.com>
---
 lib/librte_efd/rte_efd_x86.h | 35 ++++++++++++++++++++++++++++++++---
 1 file changed, 32 insertions(+), 3 deletions(-)

diff --git a/lib/librte_efd/rte_efd_x86.h b/lib/librte_efd/rte_efd_x86.h
index 34f37d7..9b632bb 100644
--- a/lib/librte_efd/rte_efd_x86.h
+++ b/lib/librte_efd/rte_efd_x86.h
@@ -43,12 +43,28 @@
 #define EFD_LOAD_SI128(val) _mm_lddqu_si128(val)
 #endif
 
+typedef efd_value_t
+(*efd_lookup_internal_avx2_t)(const efd_hashfunc_t *group_hash_idx,
+		const efd_lookuptbl_t *group_lookup_table,
+		const uint32_t hash_val_a, const uint32_t hash_val_b);
+
+static efd_lookup_internal_avx2_t efd_lookup_internal_avx2_ptr;
+
 static inline efd_value_t
 efd_lookup_internal_avx2(const efd_hashfunc_t *group_hash_idx,
 		const efd_lookuptbl_t *group_lookup_table,
 		const uint32_t hash_val_a, const uint32_t hash_val_b)
 {
-#ifdef RTE_MACHINE_CPUFLAG_AVX2
+	return (*efd_lookup_internal_avx2_ptr)(group_hash_idx,
+					       group_lookup_table,
+					       hash_val_a, hash_val_b);
+}
+
+static inline efd_value_t
+efd_lookup_internal_avx2_AVX2(const efd_hashfunc_t *group_hash_idx,
+		const efd_lookuptbl_t *group_lookup_table,
+		const uint32_t hash_val_a, const uint32_t hash_val_b)
+{
 	efd_value_t value = 0;
 	uint32_t i = 0;
 	__m256i vhash_val_a = _mm256_set1_epi32(hash_val_a);
@@ -74,13 +90,26 @@ efd_lookup_internal_avx2(const efd_hashfunc_t *group_hash_idx,
 	}
 
 	return value;
-#else
+}
+
+static inline efd_value_t
+efd_lookup_internal_avx2_DEFAULT(const efd_hashfunc_t *group_hash_idx,
+		const efd_lookuptbl_t *group_lookup_table,
+		const uint32_t hash_val_a, const uint32_t hash_val_b)
+{
 	RTE_SET_USED(group_hash_idx);
 	RTE_SET_USED(group_lookup_table);
 	RTE_SET_USED(hash_val_a);
 	RTE_SET_USED(hash_val_b);
 	/* Return dummy value, only to avoid compilation breakage */
 	return 0;
-#endif
+}
 
+static void __attribute__((constructor))
+rte_efd_x86_init(void)
+{
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2))
+		efd_lookup_internal_avx2_ptr = efd_lookup_internal_avx2_AVX2;
+	else
+		efd_lookup_internal_avx2_ptr = efd_lookup_internal_avx2_DEFAULT;
 }
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH 1/3] eal/x86: run-time dispatch over memcpy
  2017-08-25  2:06 ` [PATCH 1/3] eal/x86: run-time dispatch over memcpy Xiaoyun Li
@ 2017-08-30 14:56   ` Ananyev, Konstantin
  2017-08-30 17:51     ` Bruce Richardson
  2017-08-30 18:00   ` Stephen Hemminger
  1 sibling, 1 reply; 22+ messages in thread
From: Ananyev, Konstantin @ 2017-08-30 14:56 UTC (permalink / raw)
  To: Li, Xiaoyun, Richardson, Bruce
  Cc: dev, Lu, Wenzhuo, Wang, Zhihong, Zhang, Qi Z, Li, Xiaoyun



> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Xiaoyun Li
> Sent: Friday, August 25, 2017 3:06 AM
> To: Richardson, Bruce <bruce.richardson@intel.com>
> Cc: dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Wang, Zhihong <zhihong.wang@intel.com>; Zhang, Qi Z
> <qi.z.zhang@intel.com>; Li, Xiaoyun <xiaoyun.li@intel.com>
> Subject: [dpdk-dev] [PATCH 1/3] eal/x86: run-time dispatch over memcpy
> 
> This patch dynamically selects functions of memcpy at run-time based
> on CPU flags that the current machine supports. This patch uses function
> pointers which are bound to the relevant functions at constructor time.
> To make AVX512 instructions pass compilation, enable the switch in
> makefile.

It seems quite an overhead to add extra function call for each 16B movement...
Wouldn't it be better to have one func_ptr per implementation, i.e:
rte_memcpy_sse(), rte_memcpy_avx2(), rte_memcpy_avx512(), etc.?
Konstantin

> 
> Signed-off-by: Xiaoyun Li <xiaoyun.li@intel.com>
> ---
>  .../common/include/arch/x86/rte_memcpy.h           | 305 ++++++++++++---------
>  mk/machine/native/rte.vars.mk                      |   2 +
>  2 files changed, 181 insertions(+), 126 deletions(-)
> 
> diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> index 74c280c..f68ebd2 100644
> --- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> +++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> @@ -45,11 +45,37 @@
>  #include <string.h>
>  #include <rte_vect.h>
>  #include <rte_common.h>
> +#include <rte_cpuflags.h>
> +#include <rte_log.h>
> 
>  #ifdef __cplusplus
>  extern "C" {
>  #endif
> 
> +/*
> + * Select SSE/AVX memory copy method as default one.
> + */
> +
> +static uint16_t alignment_mask = 0x0F;
> +
> +typedef void (*rte_mov16_t)(uint8_t *dst, const uint8_t *src);
> +typedef void (*rte_mov32_t)(uint8_t *dst, const uint8_t *src);
> +typedef void (*rte_mov64_t)(uint8_t *dst, const uint8_t *src);
> +typedef void (*rte_mov128_t)(uint8_t *dst, const uint8_t *src);
> +typedef void (*rte_mov256_t)(uint8_t *dst, const uint8_t *src);
> +typedef void (*rte_mov128blocks_t)(uint8_t *dst, const uint8_t *src, size_t n);
> +typedef void (*rte_mov512blocks_t)(uint8_t *dst, const uint8_t *src, size_t n);
> +typedef void * (*rte_memcpy_generic_t)(void *dst, const void *src, size_t n);
> +
> +static rte_mov16_t rte_mov16;
> +static rte_mov32_t rte_mov32;
> +static rte_mov64_t rte_mov64;
> +static rte_mov128_t rte_mov128;
> +static rte_mov256_t rte_mov256;
> +static rte_mov128blocks_t rte_mov128blocks;
> +static rte_mov512blocks_t rte_mov512blocks;
> +static rte_memcpy_generic_t rte_memcpy_generic;
> +
>  /**
>   * Copy bytes from one location to another. The locations must not overlap.
>   *
> @@ -68,10 +94,6 @@ extern "C" {
>  static __rte_always_inline void *
>  rte_memcpy(void *dst, const void *src, size_t n);
> 
> -#ifdef RTE_MACHINE_CPUFLAG_AVX512F
> -
> -#define ALIGNMENT_MASK 0x3F
> -
>  /**
>   * AVX512 implementation below
>   */
> @@ -81,7 +103,7 @@ rte_memcpy(void *dst, const void *src, size_t n);
>   * locations should not overlap.
>   */
>  static inline void
> -rte_mov16(uint8_t *dst, const uint8_t *src)
> +rte_mov16_AVX512F(uint8_t *dst, const uint8_t *src)
>  {
>  	__m128i xmm0;
> 
> @@ -94,7 +116,7 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
>   * locations should not overlap.
>   */
>  static inline void
> -rte_mov32(uint8_t *dst, const uint8_t *src)
> +rte_mov32_AVX512F(uint8_t *dst, const uint8_t *src)
>  {
>  	__m256i ymm0;
> 
> @@ -107,7 +129,7 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
>   * locations should not overlap.
>   */
>  static inline void
> -rte_mov64(uint8_t *dst, const uint8_t *src)
> +rte_mov64_AVX512F(uint8_t *dst, const uint8_t *src)
>  {
>  	__m512i zmm0;
> 
> @@ -120,10 +142,10 @@ rte_mov64(uint8_t *dst, const uint8_t *src)
>   * locations should not overlap.
>   */
>  static inline void
> -rte_mov128(uint8_t *dst, const uint8_t *src)
> +rte_mov128_AVX512F(uint8_t *dst, const uint8_t *src)
>  {
> -	rte_mov64(dst + 0 * 64, src + 0 * 64);
> -	rte_mov64(dst + 1 * 64, src + 1 * 64);
> +	(*rte_mov64)(dst + 0 * 64, src + 0 * 64);
> +	(*rte_mov64)(dst + 1 * 64, src + 1 * 64);
>  }
> 
>  /**
> @@ -131,12 +153,12 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
>   * locations should not overlap.
>   */
>  static inline void
> -rte_mov256(uint8_t *dst, const uint8_t *src)
> +rte_mov256_AVX512F(uint8_t *dst, const uint8_t *src)
>  {
> -	rte_mov64(dst + 0 * 64, src + 0 * 64);
> -	rte_mov64(dst + 1 * 64, src + 1 * 64);
> -	rte_mov64(dst + 2 * 64, src + 2 * 64);
> -	rte_mov64(dst + 3 * 64, src + 3 * 64);
> +	(*rte_mov64)(dst + 0 * 64, src + 0 * 64);
> +	(*rte_mov64)(dst + 1 * 64, src + 1 * 64);
> +	(*rte_mov64)(dst + 2 * 64, src + 2 * 64);
> +	(*rte_mov64)(dst + 3 * 64, src + 3 * 64);
>  }
> 
>  /**
> @@ -144,7 +166,7 @@ rte_mov256(uint8_t *dst, const uint8_t *src)
>   * locations should not overlap.
>   */
>  static inline void
> -rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
> +rte_mov128blocks_AVX512F(uint8_t *dst, const uint8_t *src, size_t n)
>  {
>  	__m512i zmm0, zmm1;
> 
> @@ -164,7 +186,7 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
>   * locations should not overlap.
>   */
>  static inline void
> -rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
> +rte_mov512blocks_AVX512F(uint8_t *dst, const uint8_t *src, size_t n)
>  {
>  	__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
> 
> @@ -192,7 +214,7 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
>  }
> 
>  static inline void *
> -rte_memcpy_generic(void *dst, const void *src, size_t n)
> +rte_memcpy_generic_AVX512F(void *dst, const void *src, size_t n)
>  {
>  	uintptr_t dstu = (uintptr_t)dst;
>  	uintptr_t srcu = (uintptr_t)src;
> @@ -228,39 +250,39 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	 * Fast way when copy size doesn't exceed 512 bytes
>  	 */
>  	if (n <= 32) {
> -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst - 16 + n,
> +		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov16)((uint8_t *)dst - 16 + n,
>  				  (const uint8_t *)src - 16 + n);
>  		return ret;
>  	}
>  	if (n <= 64) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov32((uint8_t *)dst - 32 + n,
> +		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov32)((uint8_t *)dst - 32 + n,
>  				  (const uint8_t *)src - 32 + n);
>  		return ret;
>  	}
>  	if (n <= 512) {
>  		if (n >= 256) {
>  			n -= 256;
> -			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
> +			(*rte_mov256)((uint8_t *)dst, (const uint8_t *)src);
>  			src = (const uint8_t *)src + 256;
>  			dst = (uint8_t *)dst + 256;
>  		}
>  		if (n >= 128) {
>  			n -= 128;
> -			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
> +			(*rte_mov128)((uint8_t *)dst, (const uint8_t *)src);
>  			src = (const uint8_t *)src + 128;
>  			dst = (uint8_t *)dst + 128;
>  		}
>  COPY_BLOCK_128_BACK63:
>  		if (n > 64) {
> -			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> -			rte_mov64((uint8_t *)dst - 64 + n,
> +			(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
> +			(*rte_mov64)((uint8_t *)dst - 64 + n,
>  					  (const uint8_t *)src - 64 + n);
>  			return ret;
>  		}
>  		if (n > 0)
> -			rte_mov64((uint8_t *)dst - 64 + n,
> +			(*rte_mov64)((uint8_t *)dst - 64 + n,
>  					  (const uint8_t *)src - 64 + n);
>  		return ret;
>  	}
> @@ -272,7 +294,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	if (dstofss > 0) {
>  		dstofss = 64 - dstofss;
>  		n -= dstofss;
> -		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
>  		src = (const uint8_t *)src + dstofss;
>  		dst = (uint8_t *)dst + dstofss;
>  	}
> @@ -282,7 +304,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	 * Use copy block function for better instruction order control,
>  	 * which is important when load is unaligned.
>  	 */
> -	rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
> +	(*rte_mov512blocks)((uint8_t *)dst, (const uint8_t *)src, n);
>  	bits = n;
>  	n = n & 511;
>  	bits -= n;
> @@ -295,7 +317,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	 * which is important when load is unaligned.
>  	 */
>  	if (n >= 128) {
> -		rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
> +		(*rte_mov128blocks)((uint8_t *)dst, (const uint8_t *)src, n);
>  		bits = n;
>  		n = n & 127;
>  		bits -= n;
> @@ -309,10 +331,6 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	goto COPY_BLOCK_128_BACK63;
>  }
> 
> -#elif defined RTE_MACHINE_CPUFLAG_AVX2
> -
> -#define ALIGNMENT_MASK 0x1F
> -
>  /**
>   * AVX2 implementation below
>   */
> @@ -322,7 +340,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>   * locations should not overlap.
>   */
>  static inline void
> -rte_mov16(uint8_t *dst, const uint8_t *src)
> +rte_mov16_AVX2(uint8_t *dst, const uint8_t *src)
>  {
>  	__m128i xmm0;
> 
> @@ -335,7 +353,7 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
>   * locations should not overlap.
>   */
>  static inline void
> -rte_mov32(uint8_t *dst, const uint8_t *src)
> +rte_mov32_AVX2(uint8_t *dst, const uint8_t *src)
>  {
>  	__m256i ymm0;
> 
> @@ -348,10 +366,10 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
>   * locations should not overlap.
>   */
>  static inline void
> -rte_mov64(uint8_t *dst, const uint8_t *src)
> +rte_mov64_AVX2(uint8_t *dst, const uint8_t *src)
>  {
> -	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> -	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> +	(*rte_mov32)((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> +	(*rte_mov32)((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
>  }
> 
>  /**
> @@ -359,12 +377,12 @@ rte_mov64(uint8_t *dst, const uint8_t *src)
>   * locations should not overlap.
>   */
>  static inline void
> -rte_mov128(uint8_t *dst, const uint8_t *src)
> +rte_mov128_AVX2(uint8_t *dst, const uint8_t *src)
>  {
> -	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> -	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> -	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
> -	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
> +	(*rte_mov32)((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> +	(*rte_mov32)((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> +	(*rte_mov32)((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
> +	(*rte_mov32)((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
>  }
> 
>  /**
> @@ -372,7 +390,7 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
>   * locations should not overlap.
>   */
>  static inline void
> -rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
> +rte_mov128blocks_AVX2(uint8_t *dst, const uint8_t *src, size_t n)
>  {
>  	__m256i ymm0, ymm1, ymm2, ymm3;
> 
> @@ -392,7 +410,7 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
>  }
> 
>  static inline void *
> -rte_memcpy_generic(void *dst, const void *src, size_t n)
> +rte_memcpy_generic_AVX2(void *dst, const void *src, size_t n)
>  {
>  	uintptr_t dstu = (uintptr_t)dst;
>  	uintptr_t srcu = (uintptr_t)src;
> @@ -429,46 +447,46 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	 * Fast way when copy size doesn't exceed 256 bytes
>  	 */
>  	if (n <= 32) {
> -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst - 16 + n,
> +		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov16)((uint8_t *)dst - 16 + n,
>  				(const uint8_t *)src - 16 + n);
>  		return ret;
>  	}
>  	if (n <= 48) {
> -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
> -		rte_mov16((uint8_t *)dst - 16 + n,
> +		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov16)((uint8_t *)dst + 16, (const uint8_t *)src + 16);
> +		(*rte_mov16)((uint8_t *)dst - 16 + n,
>  				(const uint8_t *)src - 16 + n);
>  		return ret;
>  	}
>  	if (n <= 64) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov32((uint8_t *)dst - 32 + n,
> +		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov32)((uint8_t *)dst - 32 + n,
>  				(const uint8_t *)src - 32 + n);
>  		return ret;
>  	}
>  	if (n <= 256) {
>  		if (n >= 128) {
>  			n -= 128;
> -			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
> +			(*rte_mov128)((uint8_t *)dst, (const uint8_t *)src);
>  			src = (const uint8_t *)src + 128;
>  			dst = (uint8_t *)dst + 128;
>  		}
>  COPY_BLOCK_128_BACK31:
>  		if (n >= 64) {
>  			n -= 64;
> -			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> +			(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
>  			src = (const uint8_t *)src + 64;
>  			dst = (uint8_t *)dst + 64;
>  		}
>  		if (n > 32) {
> -			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -			rte_mov32((uint8_t *)dst - 32 + n,
> +			(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
> +			(*rte_mov32)((uint8_t *)dst - 32 + n,
>  					(const uint8_t *)src - 32 + n);
>  			return ret;
>  		}
>  		if (n > 0) {
> -			rte_mov32((uint8_t *)dst - 32 + n,
> +			(*rte_mov32)((uint8_t *)dst - 32 + n,
>  					(const uint8_t *)src - 32 + n);
>  		}
>  		return ret;
> @@ -481,7 +499,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	if (dstofss > 0) {
>  		dstofss = 32 - dstofss;
>  		n -= dstofss;
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
>  		src = (const uint8_t *)src + dstofss;
>  		dst = (uint8_t *)dst + dstofss;
>  	}
> @@ -489,7 +507,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	/**
>  	 * Copy 128-byte blocks
>  	 */
> -	rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
> +	(*rte_mov128blocks)((uint8_t *)dst, (const uint8_t *)src, n);
>  	bits = n;
>  	n = n & 127;
>  	bits -= n;
> @@ -502,10 +520,6 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	goto COPY_BLOCK_128_BACK31;
>  }
> 
> -#else /* RTE_MACHINE_CPUFLAG */
> -
> -#define ALIGNMENT_MASK 0x0F
> -
>  /**
>   * SSE & AVX implementation below
>   */
> @@ -515,7 +529,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>   * locations should not overlap.
>   */
>  static inline void
> -rte_mov16(uint8_t *dst, const uint8_t *src)
> +rte_mov16_DEFAULT(uint8_t *dst, const uint8_t *src)
>  {
>  	__m128i xmm0;
> 
> @@ -528,10 +542,10 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
>   * locations should not overlap.
>   */
>  static inline void
> -rte_mov32(uint8_t *dst, const uint8_t *src)
> +rte_mov32_DEFAULT(uint8_t *dst, const uint8_t *src)
>  {
> -	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> -	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
>  }
> 
>  /**
> @@ -539,12 +553,12 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
>   * locations should not overlap.
>   */
>  static inline void
> -rte_mov64(uint8_t *dst, const uint8_t *src)
> +rte_mov64_DEFAULT(uint8_t *dst, const uint8_t *src)
>  {
> -	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> -	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> -	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> -	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
>  }
> 
>  /**
> @@ -552,16 +566,16 @@ rte_mov64(uint8_t *dst, const uint8_t *src)
>   * locations should not overlap.
>   */
>  static inline void
> -rte_mov128(uint8_t *dst, const uint8_t *src)
> +rte_mov128_DEFAULT(uint8_t *dst, const uint8_t *src)
>  {
> -	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> -	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> -	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> -	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> -	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
> -	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
> -	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
> -	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
>  }
> 
>  /**
> @@ -569,24 +583,24 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
>   * locations should not overlap.
>   */
>  static inline void
> -rte_mov256(uint8_t *dst, const uint8_t *src)
> +rte_mov256_DEFAULT(uint8_t *dst, const uint8_t *src)
>  {
> -	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> -	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> -	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> -	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> -	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
> -	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
> -	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
> -	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
> -	rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
> -	rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
> -	rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
> -	rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
> -	rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
> -	rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
> -	rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
> -	rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
>  }
> 
>  /**
> @@ -684,7 +698,7 @@ __extension__ ({                                                      \
>  })
> 
>  static inline void *
> -rte_memcpy_generic(void *dst, const void *src, size_t n)
> +rte_memcpy_generic_DEFAULT(void *dst, const void *src, size_t n)
>  {
>  	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
>  	uintptr_t dstu = (uintptr_t)dst;
> @@ -722,19 +736,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	 * Fast way when copy size doesn't exceed 512 bytes
>  	 */
>  	if (n <= 32) {
> -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> +		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov16)((uint8_t *)dst - 16 + n,
> +				(const uint8_t *)src - 16 + n);
>  		return ret;
>  	}
>  	if (n <= 48) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> +		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov16)((uint8_t *)dst - 16 + n,
> +				(const uint8_t *)src - 16 + n);
>  		return ret;
>  	}
>  	if (n <= 64) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
> -		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> +		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov16)((uint8_t *)dst + 32, (const uint8_t *)src + 32);
> +		(*rte_mov16)((uint8_t *)dst - 16 + n,
> +				(const uint8_t *)src - 16 + n);
>  		return ret;
>  	}
>  	if (n <= 128) {
> @@ -743,39 +760,42 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	if (n <= 512) {
>  		if (n >= 256) {
>  			n -= 256;
> -			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
> -			rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
> +			(*rte_mov128)((uint8_t *)dst, (const uint8_t *)src);
> +			(*rte_mov128)((uint8_t *)dst + 128,
> +					(const uint8_t *)src + 128);
>  			src = (const uint8_t *)src + 256;
>  			dst = (uint8_t *)dst + 256;
>  		}
>  COPY_BLOCK_255_BACK15:
>  		if (n >= 128) {
>  			n -= 128;
> -			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
> +			(*rte_mov128)((uint8_t *)dst, (const uint8_t *)src);
>  			src = (const uint8_t *)src + 128;
>  			dst = (uint8_t *)dst + 128;
>  		}
>  COPY_BLOCK_128_BACK15:
>  		if (n >= 64) {
>  			n -= 64;
> -			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> +			(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
>  			src = (const uint8_t *)src + 64;
>  			dst = (uint8_t *)dst + 64;
>  		}
>  COPY_BLOCK_64_BACK15:
>  		if (n >= 32) {
>  			n -= 32;
> -			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +			(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
>  			src = (const uint8_t *)src + 32;
>  			dst = (uint8_t *)dst + 32;
>  		}
>  		if (n > 16) {
> -			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> +			(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
> +			(*rte_mov16)((uint8_t *)dst - 16 + n,
> +					(const uint8_t *)src - 16 + n);
>  			return ret;
>  		}
>  		if (n > 0) {
> -			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> +			(*rte_mov16)((uint8_t *)dst - 16 + n,
> +					(const uint8_t *)src - 16 + n);
>  		}
>  		return ret;
>  	}
> @@ -790,7 +810,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	if (dstofss > 0) {
>  		dstofss = 16 - dstofss + 16;
>  		n -= dstofss;
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
>  		src = (const uint8_t *)src + dstofss;
>  		dst = (uint8_t *)dst + dstofss;
>  	}
> @@ -804,7 +824,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  		 * Copy 256-byte blocks
>  		 */
>  		for (; n >= 256; n -= 256) {
> -			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
> +			(*rte_mov256)((uint8_t *)dst, (const uint8_t *)src);
>  			dst = (uint8_t *)dst + 256;
>  			src = (const uint8_t *)src + 256;
>  		}
> @@ -826,7 +846,40 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	goto COPY_BLOCK_64_BACK15;
>  }
> 
> -#endif /* RTE_MACHINE_CPUFLAG */
> +static void __attribute__((constructor))
> +rte_memcpy_init(void)
> +{
> +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F)) {
> +		alignment_mask = 0x3F;
> +		rte_mov16 = rte_mov16_AVX512F;
> +		rte_mov32 = rte_mov32_AVX512F;
> +		rte_mov64 = rte_mov64_AVX512F;
> +		rte_mov128 = rte_mov128_AVX512F;
> +		rte_mov256 = rte_mov256_AVX512F;
> +		rte_mov128blocks = rte_mov128blocks_AVX512F;
> +		rte_mov512blocks = rte_mov512blocks_AVX512F;
> +		rte_memcpy_generic = rte_memcpy_generic_AVX512F;
> +		RTE_LOG(INFO, EAL, "AVX512 implementation of memcpy() is using!\n");
> +	} else if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2)) {
> +		alignment_mask = 0x1F;
> +		rte_mov16 = rte_mov16_AVX2;
> +		rte_mov32 = rte_mov32_AVX2;
> +		rte_mov64 = rte_mov64_AVX2;
> +		rte_mov128 = rte_mov128_AVX2;
> +		rte_mov128blocks = rte_mov128blocks_AVX2;
> +		rte_memcpy_generic = rte_memcpy_generic_AVX2;
> +		RTE_LOG(INFO, EAL, "AVX2 implementation of memcpy() is using!\n");
> +	} else {
> +		alignment_mask = 0x0F;
> +		rte_mov16 = rte_mov16_DEFAULT;
> +		rte_mov32 = rte_mov32_DEFAULT;
> +		rte_mov64 = rte_mov64_DEFAULT;
> +		rte_mov128 = rte_mov128_DEFAULT;
> +		rte_mov256 = rte_mov256_DEFAULT;
> +		rte_memcpy_generic = rte_memcpy_generic_DEFAULT;
> +		RTE_LOG(INFO, EAL, "Default SSE/AVX implementation of memcpy() is using!\n");
> +	}
> +}
> 
>  static inline void *
>  rte_memcpy_aligned(void *dst, const void *src, size_t n)
> @@ -858,8 +911,8 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
> 
>  	/* Copy 16 <= size <= 32 bytes */
>  	if (n <= 32) {
> -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst - 16 + n,
> +		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov16)((uint8_t *)dst - 16 + n,
>  				(const uint8_t *)src - 16 + n);
> 
>  		return ret;
> @@ -867,8 +920,8 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
> 
>  	/* Copy 32 < size <= 64 bytes */
>  	if (n <= 64) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov32((uint8_t *)dst - 32 + n,
> +		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov32)((uint8_t *)dst - 32 + n,
>  				(const uint8_t *)src - 32 + n);
> 
>  		return ret;
> @@ -876,13 +929,13 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
> 
>  	/* Copy 64 bytes blocks */
>  	for (; n >= 64; n -= 64) {
> -		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
>  		dst = (uint8_t *)dst + 64;
>  		src = (const uint8_t *)src + 64;
>  	}
> 
>  	/* Copy whatever left */
> -	rte_mov64((uint8_t *)dst - 64 + n,
> +	(*rte_mov64)((uint8_t *)dst - 64 + n,
>  			(const uint8_t *)src - 64 + n);
> 
>  	return ret;
> @@ -891,10 +944,10 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
>  static inline void *
>  rte_memcpy(void *dst, const void *src, size_t n)
>  {
> -	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
> +	if (!(((uintptr_t)dst | (uintptr_t)src) & alignment_mask))
>  		return rte_memcpy_aligned(dst, src, n);
>  	else
> -		return rte_memcpy_generic(dst, src, n);
> +		return (*rte_memcpy_generic)(dst, src, n);
>  }
> 
>  #ifdef __cplusplus
> diff --git a/mk/machine/native/rte.vars.mk b/mk/machine/native/rte.vars.mk
> index f7d98d0..cdcf6c6 100644
> --- a/mk/machine/native/rte.vars.mk
> +++ b/mk/machine/native/rte.vars.mk
> @@ -65,3 +65,5 @@ SSE42_SUPPORT=$(shell $(CC) -march=native -dM -E - </dev/null | grep SSE4_2)
>  ifeq ($(SSE42_SUPPORT),)
>      MACHINE_CFLAGS = -march=corei7
>  endif
> +
> +MACHINE_CFLAGS += -mavx512f
> --
> 2.7.4

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 1/3] eal/x86: run-time dispatch over memcpy
  2017-08-30 14:56   ` Ananyev, Konstantin
@ 2017-08-30 17:51     ` Bruce Richardson
  2017-08-31  1:21       ` Lu, Wenzhuo
  0 siblings, 1 reply; 22+ messages in thread
From: Bruce Richardson @ 2017-08-30 17:51 UTC (permalink / raw)
  To: Ananyev, Konstantin
  Cc: Li, Xiaoyun, dev, Lu, Wenzhuo, Wang, Zhihong, Zhang, Qi Z

On Wed, Aug 30, 2017 at 03:56:35PM +0100, Ananyev, Konstantin wrote:
> 
> 
> > -----Original Message-----
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Xiaoyun Li
> > Sent: Friday, August 25, 2017 3:06 AM
> > To: Richardson, Bruce <bruce.richardson@intel.com>
> > Cc: dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Wang, Zhihong <zhihong.wang@intel.com>; Zhang, Qi Z
> > <qi.z.zhang@intel.com>; Li, Xiaoyun <xiaoyun.li@intel.com>
> > Subject: [dpdk-dev] [PATCH 1/3] eal/x86: run-time dispatch over memcpy
> > 
> > This patch dynamically selects functions of memcpy at run-time based
> > on CPU flags that the current machine supports. This patch uses function
> > pointers which are bound to the relevant functions at constructor time.
> > To make AVX512 instructions pass compilation, enable the switch in
> > makefile.
> 
> It seems quite an overhead to add extra function call for each 16B movement...
> Wouldn't it be better to have one func_ptr per implementation, i.e:
> rte_memcpy_sse(), rte_memcpy_avx2(), rte_memcpy_avx512(), etc.?
> Konstantin
> 
+1 to this.

Also, how big of a benefit is there for this implementation over
standard libc memcpy (in a reasonably bleeding edge distro like e.g.
Fedora 26)?

/Bruce

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 1/3] eal/x86: run-time dispatch over memcpy
  2017-08-25  2:06 ` [PATCH 1/3] eal/x86: run-time dispatch over memcpy Xiaoyun Li
  2017-08-30 14:56   ` Ananyev, Konstantin
@ 2017-08-30 18:00   ` Stephen Hemminger
  2017-08-31  1:23     ` Lu, Wenzhuo
  1 sibling, 1 reply; 22+ messages in thread
From: Stephen Hemminger @ 2017-08-30 18:00 UTC (permalink / raw)
  To: Xiaoyun Li; +Cc: bruce.richardson, dev, wenzhuo.lu, zhihong.wang, qi.z.zhang

On Fri, 25 Aug 2017 10:06:11 +0800
Xiaoyun Li <xiaoyun.li@intel.com> wrote:

> This patch dynamically selects functions of memcpy at run-time based
> on CPU flags that current machine supports. This patch uses function
> pointers which are bind to the relative functions at constrctor time.
> To make AVX512 instructions pass compilation, enable the switch in
> makefile.
> 
> Signed-off-by: Xiaoyun Li <xiaoyun.li@intel.com>

Recent versions of GCC also have better ways to handle this.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 1/3] eal/x86: run-time dispatch over memcpy
  2017-08-30 17:51     ` Bruce Richardson
@ 2017-08-31  1:21       ` Lu, Wenzhuo
  0 siblings, 0 replies; 22+ messages in thread
From: Lu, Wenzhuo @ 2017-08-31  1:21 UTC (permalink / raw)
  To: Richardson, Bruce, Ananyev, Konstantin, Gonzalez Monroy, Sergio
  Cc: Li, Xiaoyun, dev, Wang, Zhihong, Zhang, Qi Z

Hi Bruce,

> -----Original Message-----
> From: Richardson, Bruce
> Sent: Thursday, August 31, 2017 1:52 AM
> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Cc: Li, Xiaoyun <xiaoyun.li@intel.com>; dev@dpdk.org; Lu, Wenzhuo
> <wenzhuo.lu@intel.com>; Wang, Zhihong <zhihong.wang@intel.com>; Zhang,
> Qi Z <qi.z.zhang@intel.com>
> Subject: Re: [dpdk-dev] [PATCH 1/3] eal/x86: run-time dispatch over memcpy
> 
> On Wed, Aug 30, 2017 at 03:56:35PM +0100, Ananyev, Konstantin wrote:
> >
> >
> > > -----Original Message-----
> > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Xiaoyun Li
> > > Sent: Friday, August 25, 2017 3:06 AM
> > > To: Richardson, Bruce <bruce.richardson@intel.com>
> > > Cc: dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Wang,
> Zhihong
> > > <zhihong.wang@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Li,
> > > Xiaoyun <xiaoyun.li@intel.com>
> > > Subject: [dpdk-dev] [PATCH 1/3] eal/x86: run-time dispatch over
> > > memcpy
> > >
> > > This patch dynamically selects functions of memcpy at run-time based
> > > on CPU flags that current machine supports. This patch uses function
> > > pointers which are bind to the relative functions at constrctor time.
> > > To make AVX512 instructions pass compilation, enable the switch in
> > > makefile.
> >
> > It seems quite an overhead to add extra function call for each 16B
> movement...
> > Wouldn't it be better to have one func_ptr per implementation, i.e:
> > rte_memcpy_sse(), rte_memcpy_avx2(), rte_memcpy_avx512(), etc.?
> > Konstantin
> >
> +1 to this.
> 
> Also, how big of a benefit is there for this implementation over standard libc
> memcpy (in a reasonably bleeding edge distro like e.g.
> Fedora 26)?
This patch is not an optimization. It is only meant to make the code easier to use. So, the benefit is just the same as before.
I'm also curious about the benefit. I suppose it's better than standard libc. If not, maybe we should just use standard libc, and this patch would not be valuable.
+ Sergio, the maintainer of this module for more suggestion. Thanks.

> 
> /Bruce

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 1/3] eal/x86: run-time dispatch over memcpy
  2017-08-30 18:00   ` Stephen Hemminger
@ 2017-08-31  1:23     ` Lu, Wenzhuo
  2017-08-31  5:05       ` Stephen Hemminger
  0 siblings, 1 reply; 22+ messages in thread
From: Lu, Wenzhuo @ 2017-08-31  1:23 UTC (permalink / raw)
  To: Stephen Hemminger, Li, Xiaoyun
  Cc: Richardson, Bruce, dev, Wang, Zhihong, Zhang, Qi Z

Hi Stephen,


> -----Original Message-----
> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> Sent: Thursday, August 31, 2017 2:01 AM
> To: Li, Xiaoyun <xiaoyun.li@intel.com>
> Cc: Richardson, Bruce <bruce.richardson@intel.com>; dev@dpdk.org; Lu,
> Wenzhuo <wenzhuo.lu@intel.com>; Wang, Zhihong
> <zhihong.wang@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>
> Subject: Re: [dpdk-dev] [PATCH 1/3] eal/x86: run-time dispatch over memcpy
> 
> On Fri, 25 Aug 2017 10:06:11 +0800
> Xiaoyun Li <xiaoyun.li@intel.com> wrote:
> 
> > This patch dynamically selects functions of memcpy at run-time based
> > on CPU flags that current machine supports. This patch uses function
> > pointers which are bind to the relative functions at constrctor time.
> > To make AVX512 instructions pass compilation, enable the switch in
> > makefile.
> >
> > Signed-off-by: Xiaoyun Li <xiaoyun.li@intel.com>
> 
> Recent versions of GCC also have better ways to handle this.
I think the assumption of using the instructions is that we believe we can do better than the compiler. If it turns out not, maybe we need to change the instructions back to C. But it's another story.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 1/3] eal/x86: run-time dispatch over memcpy
  2017-08-31  1:23     ` Lu, Wenzhuo
@ 2017-08-31  5:05       ` Stephen Hemminger
  2017-08-31  5:24         ` Li, Xiaoyun
  0 siblings, 1 reply; 22+ messages in thread
From: Stephen Hemminger @ 2017-08-31  5:05 UTC (permalink / raw)
  To: Lu, Wenzhuo
  Cc: Wang, Zhihong, dev, Zhang, Qi Z, Li, Xiaoyun, Richardson, Bruce

I was referring to gcc multiversion

https://gcc.gnu.org/wiki/FunctionMultiVersioning


On Aug 30, 2017 6:24 PM, "Lu, Wenzhuo" <wenzhuo.lu@intel.com> wrote:

> Hi Stephen,
>
>
> > -----Original Message-----
> > From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> > Sent: Thursday, August 31, 2017 2:01 AM
> > To: Li, Xiaoyun <xiaoyun.li@intel.com>
> > Cc: Richardson, Bruce <bruce.richardson@intel.com>; dev@dpdk.org; Lu,
> > Wenzhuo <wenzhuo.lu@intel.com>; Wang, Zhihong
> > <zhihong.wang@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>
> > Subject: Re: [dpdk-dev] [PATCH 1/3] eal/x86: run-time dispatch over
> memcpy
> >
> > On Fri, 25 Aug 2017 10:06:11 +0800
> > Xiaoyun Li <xiaoyun.li@intel.com> wrote:
> >
> > > This patch dynamically selects functions of memcpy at run-time based
> > > on CPU flags that current machine supports. This patch uses function
> > > pointers which are bind to the relative functions at constrctor time.
> > > To make AVX512 instructions pass compilation, enable the switch in
> > > makefile.
> > >
> > > Signed-off-by: Xiaoyun Li <xiaoyun.li@intel.com>
> >
> > Recent versions of GCC also have better ways to handle this.
> I think the assumption of using the instructions is that we believe we can
> do better than the compiler. If it turns out not, maybe we need to change
> the instructions back to C. But it's another story.
>
>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 1/3] eal/x86: run-time dispatch over memcpy
  2017-08-31  5:05       ` Stephen Hemminger
@ 2017-08-31  5:24         ` Li, Xiaoyun
  0 siblings, 0 replies; 22+ messages in thread
From: Li, Xiaoyun @ 2017-08-31  5:24 UTC (permalink / raw)
  To: Stephen Hemminger, Lu, Wenzhuo
  Cc: Wang, Zhihong, dev, Zhang, Qi Z, Richardson, Bruce

About gcc FMV, I tried it several days ago.
But that approach — using the same function name with different target attributes — only works in C++.
And then I tried GCC6 since it is said that GCC6 would support both C and C++.
But it doesn’t work.

However, using different function names with target attributes does work.
And a function with the AVX512 attribute means that function would be compiled with AVX512.
So I added an attribute to each function and deleted –mavx512 from the makefile. But I haven’t sent the patch.
Because there are some compilation issues.

Before, the AVX512 code would be compiled only if both the compiler and the CPU support AVX512 and users want it, via the macro RTE_MACHINE_CPUFLAG_AVX512.
Now, we hope to compile all of them and choose one at runtime based on the CPU.
But only gcc 4.9 and above, and the newest clang, support AVX512.
So I am thinking of adding a macro switch in mk which will determine whether the compiler supports AVX512 and whether users hope to use AVX512. (CPU support is not needed at build time because it will be determined at run-time.)
Only if the compiler supports AVX512 and users hope to use it would the AVX512 code be compiled.


Best Regards,
Xiaoyun Li



From: Stephen Hemminger [mailto:stephen@networkplumber.org]
Sent: Thursday, August 31, 2017 13:06
To: Lu, Wenzhuo <wenzhuo.lu@intel.com>
Cc: Wang, Zhihong <zhihong.wang@intel.com>; dev@dpdk.org; Zhang, Qi Z <qi.z.zhang@intel.com>; Li, Xiaoyun <xiaoyun.li@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>
Subject: RE: [dpdk-dev] [PATCH 1/3] eal/x86: run-time dispatch over memcpy

I was referring to gcc multiversion

https://gcc.gnu.org/wiki/FunctionMultiVersioning


On Aug 30, 2017 6:24 PM, "Lu, Wenzhuo" <wenzhuo.lu@intel.com<mailto:wenzhuo.lu@intel.com>> wrote:
Hi Stephen,


> -----Original Message-----
> From: Stephen Hemminger [mailto:stephen@networkplumber.org<mailto:stephen@networkplumber.org>]
> Sent: Thursday, August 31, 2017 2:01 AM
> To: Li, Xiaoyun <xiaoyun.li@intel.com<mailto:xiaoyun.li@intel.com>>
> Cc: Richardson, Bruce <bruce.richardson@intel.com<mailto:bruce.richardson@intel.com>>; dev@dpdk.org<mailto:dev@dpdk.org>; Lu,
> Wenzhuo <wenzhuo.lu@intel.com<mailto:wenzhuo.lu@intel.com>>; Wang, Zhihong
> <zhihong.wang@intel.com<mailto:zhihong.wang@intel.com>>; Zhang, Qi Z <qi.z.zhang@intel.com<mailto:qi.z.zhang@intel.com>>
> Subject: Re: [dpdk-dev] [PATCH 1/3] eal/x86: run-time dispatch over memcpy
>
> On Fri, 25 Aug 2017 10:06:11 +0800
> Xiaoyun Li <xiaoyun.li@intel.com<mailto:xiaoyun.li@intel.com>> wrote:
>
> > This patch dynamically selects functions of memcpy at run-time based
> > on CPU flags that current machine supports. This patch uses function
> > pointers which are bind to the relative functions at constrctor time.
> > To make AVX512 instructions pass compilation, enable the switch in
> > makefile.
> >
> > Signed-off-by: Xiaoyun Li <xiaoyun.li@intel.com<mailto:xiaoyun.li@intel.com>>
>
> Recent versions of GCC also have better ways to handle this.
I think the assumption of using the instructions is that we believe we can do better than the compiler. If it turns out not, maybe we need to change the instructions back to C. But it's another story.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH v2 0/3] dynamic linking support
  2017-08-25  2:06 [PATCH 0/3] dynamic linking support Xiaoyun Li
                   ` (2 preceding siblings ...)
  2017-08-25  2:06 ` [PATCH 3/3] efd: run-time dispatch over x86 EFD functions Xiaoyun Li
@ 2017-09-01  8:56 ` Xiaoyun Li
  2017-09-01  8:57   ` [PATCH v2 1/3] eal/x86: run-time dispatch over memcpy Xiaoyun Li
                     ` (2 more replies)
  3 siblings, 3 replies; 22+ messages in thread
From: Xiaoyun Li @ 2017-09-01  8:56 UTC (permalink / raw)
  To: bruce.richardson; +Cc: dev, zhihong.wang, qi.z.zhang, wenzhuo.lu, Xiaoyun Li

This patchset dynamically selects functions at run-time based on CPU flags
that the current machine supports. This patchset modifies memcpy, the memcpy perf
test, and the x86 EFD, using function pointers and binding them at constructor time.
Then, in a cloud environment, users can compile once for a minimum target
such as 'haswell' (not 'native'), run on different platforms (equal to or above
haswell), and get ISA optimization based on the running CPU.

Xiaoyun Li (3):
  eal/x86: run-time dispatch over memcpy
  app/test: run-time dispatch over memcpy perf test
  efd: run-time dispatch over x86 EFD functions

 .../common/include/arch/x86/rte_memcpy.h           | 343 +++++++++++++--------
 lib/librte_efd/rte_efd_x86.h                       |  41 ++-
 mk/rte.cpuflags.mk                                 |  14 +
 test/test/test_memcpy_perf.c                       |  40 ++-
 4 files changed, 296 insertions(+), 142 deletions(-)

-- 
2.7.4

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH v2 1/3] eal/x86: run-time dispatch over memcpy
  2017-09-01  8:56 ` [PATCH v2 0/3] dynamic linking support Xiaoyun Li
@ 2017-09-01  8:57   ` Xiaoyun Li
  2017-09-01  9:16     ` Ananyev, Konstantin
  2017-09-01 15:34     ` Stephen Hemminger
  2017-09-01  8:57   ` [PATCH v2 2/3] app/test: run-time dispatch over memcpy perf test Xiaoyun Li
  2017-09-01  8:57   ` [PATCH v2 3/3] efd: run-time dispatch over x86 EFD functions Xiaoyun Li
  2 siblings, 2 replies; 22+ messages in thread
From: Xiaoyun Li @ 2017-09-01  8:57 UTC (permalink / raw)
  To: bruce.richardson; +Cc: dev, zhihong.wang, qi.z.zhang, wenzhuo.lu, Xiaoyun Li

This patch dynamically selects memcpy functions at run-time based
on the CPU flags that the current machine supports. This patch uses function
pointers which are bound to the relevant functions at constructor time.
In addition, the AVX512 instruction set would be compiled only if users
configure it enabled and the compiler supports it.

Signed-off-by: Xiaoyun Li <xiaoyun.li@intel.com>
---
v2 
* use gcc function multi-versioning to avoid compilation issues.
* add macros for AVX512 and AVX2. Only if users enable AVX512 and the
compiler supports it would the AVX512 code be compiled. Only if the
compiler supports AVX2 would the AVX2 code be compiled.

 .../common/include/arch/x86/rte_memcpy.h           | 343 +++++++++++++--------
 mk/rte.cpuflags.mk                                 |  14 +
 2 files changed, 231 insertions(+), 126 deletions(-)

diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
index 74c280c..abba6ad 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
@@ -45,11 +45,45 @@
 #include <string.h>
 #include <rte_vect.h>
 #include <rte_common.h>
+#include <rte_cpuflags.h>
+#include <rte_log.h>
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+/*
+ * Select SSE/AVX memory copy method as default one.
+ */
+
+static uint16_t alignment_mask = 0x0F;
+
+typedef void (*rte_mov16_t)(uint8_t *dst, const uint8_t *src);
+typedef void (*rte_mov32_t)(uint8_t *dst, const uint8_t *src);
+typedef void (*rte_mov64_t)(uint8_t *dst, const uint8_t *src);
+typedef void (*rte_mov128_t)(uint8_t *dst, const uint8_t *src);
+typedef void (*rte_mov256_t)(uint8_t *dst, const uint8_t *src);
+#ifdef CC_SUPPORT_AVX2
+typedef void (*rte_mov128blocks_t)(uint8_t *dst, const uint8_t *src, size_t n);
+#endif
+#ifdef CC_SUPPORT_AVX512
+typedef void (*rte_mov512blocks_t)(uint8_t *dst, const uint8_t *src, size_t n);
+#endif
+typedef void * (*rte_memcpy_generic_t)(void *dst, const void *src, size_t n);
+
+static rte_mov16_t rte_mov16;
+static rte_mov32_t rte_mov32;
+static rte_mov64_t rte_mov64;
+static rte_mov128_t rte_mov128;
+static rte_mov256_t rte_mov256;
+#ifdef CC_SUPPORT_AVX2
+static rte_mov128blocks_t rte_mov128blocks;
+#endif
+#ifdef CC_SUPPORT_AVX512
+static rte_mov512blocks_t rte_mov512blocks;
+#endif
+static rte_memcpy_generic_t rte_memcpy_generic;
+
 /**
  * Copy bytes from one location to another. The locations must not overlap.
  *
@@ -68,10 +102,6 @@ extern "C" {
 static __rte_always_inline void *
 rte_memcpy(void *dst, const void *src, size_t n);
 
-#ifdef RTE_MACHINE_CPUFLAG_AVX512F
-
-#define ALIGNMENT_MASK 0x3F
-
 /**
  * AVX512 implementation below
  */
@@ -80,8 +110,10 @@ rte_memcpy(void *dst, const void *src, size_t n);
  * Copy 16 bytes from one location to another,
  * locations should not overlap.
  */
+#ifdef CC_SUPPORT_AVX512
+__attribute__((target("avx512f")))
 static inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
+rte_mov16_AVX512F(uint8_t *dst, const uint8_t *src)
 {
 	__m128i xmm0;
 
@@ -93,8 +125,9 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
  * Copy 32 bytes from one location to another,
  * locations should not overlap.
  */
+__attribute__((target("avx512f")))
 static inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
+rte_mov32_AVX512F(uint8_t *dst, const uint8_t *src)
 {
 	__m256i ymm0;
 
@@ -106,8 +139,9 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
  * Copy 64 bytes from one location to another,
  * locations should not overlap.
  */
+__attribute__((target("avx512f")))
 static inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
+rte_mov64_AVX512F(uint8_t *dst, const uint8_t *src)
 {
 	__m512i zmm0;
 
@@ -119,32 +153,35 @@ rte_mov64(uint8_t *dst, const uint8_t *src)
  * Copy 128 bytes from one location to another,
  * locations should not overlap.
  */
+__attribute__((target("avx512f")))
 static inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
+rte_mov128_AVX512F(uint8_t *dst, const uint8_t *src)
 {
-	rte_mov64(dst + 0 * 64, src + 0 * 64);
-	rte_mov64(dst + 1 * 64, src + 1 * 64);
+	(*rte_mov64)(dst + 0 * 64, src + 0 * 64);
+	(*rte_mov64)(dst + 1 * 64, src + 1 * 64);
 }
 
 /**
  * Copy 256 bytes from one location to another,
  * locations should not overlap.
  */
+__attribute__((target("avx512f")))
 static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
+rte_mov256_AVX512F(uint8_t *dst, const uint8_t *src)
 {
-	rte_mov64(dst + 0 * 64, src + 0 * 64);
-	rte_mov64(dst + 1 * 64, src + 1 * 64);
-	rte_mov64(dst + 2 * 64, src + 2 * 64);
-	rte_mov64(dst + 3 * 64, src + 3 * 64);
+	(*rte_mov64)(dst + 0 * 64, src + 0 * 64);
+	(*rte_mov64)(dst + 1 * 64, src + 1 * 64);
+	(*rte_mov64)(dst + 2 * 64, src + 2 * 64);
+	(*rte_mov64)(dst + 3 * 64, src + 3 * 64);
 }
 
 /**
  * Copy 128-byte blocks from one location to another,
  * locations should not overlap.
  */
+__attribute__((target("avx512f")))
 static inline void
-rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+rte_mov128blocks_AVX512F(uint8_t *dst, const uint8_t *src, size_t n)
 {
 	__m512i zmm0, zmm1;
 
@@ -163,8 +200,9 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
  * Copy 512-byte blocks from one location to another,
  * locations should not overlap.
  */
+__attribute__((target("avx512f")))
 static inline void
-rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
+rte_mov512blocks_AVX512F(uint8_t *dst, const uint8_t *src, size_t n)
 {
 	__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
 
@@ -191,8 +229,9 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 	}
 }
 
+__attribute__((target("avx512f")))
 static inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_AVX512F(void *dst, const void *src, size_t n)
 {
 	uintptr_t dstu = (uintptr_t)dst;
 	uintptr_t srcu = (uintptr_t)src;
@@ -228,39 +267,39 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
 	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n,
+		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov16)((uint8_t *)dst - 16 + n,
 				  (const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
+		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov32)((uint8_t *)dst - 32 + n,
 				  (const uint8_t *)src - 32 + n);
 		return ret;
 	}
 	if (n <= 512) {
 		if (n >= 256) {
 			n -= 256;
-			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
+			(*rte_mov256)((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 256;
 			dst = (uint8_t *)dst + 256;
 		}
 		if (n >= 128) {
 			n -= 128;
-			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
+			(*rte_mov128)((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 128;
 			dst = (uint8_t *)dst + 128;
 		}
 COPY_BLOCK_128_BACK63:
 		if (n > 64) {
-			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-			rte_mov64((uint8_t *)dst - 64 + n,
+			(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
+			(*rte_mov64)((uint8_t *)dst - 64 + n,
 					  (const uint8_t *)src - 64 + n);
 			return ret;
 		}
 		if (n > 0)
-			rte_mov64((uint8_t *)dst - 64 + n,
+			(*rte_mov64)((uint8_t *)dst - 64 + n,
 					  (const uint8_t *)src - 64 + n);
 		return ret;
 	}
@@ -272,7 +311,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	if (dstofss > 0) {
 		dstofss = 64 - dstofss;
 		n -= dstofss;
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
 		src = (const uint8_t *)src + dstofss;
 		dst = (uint8_t *)dst + dstofss;
 	}
@@ -282,7 +321,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Use copy block function for better instruction order control,
 	 * which is important when load is unaligned.
 	 */
-	rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
+	(*rte_mov512blocks)((uint8_t *)dst, (const uint8_t *)src, n);
 	bits = n;
 	n = n & 511;
 	bits -= n;
@@ -295,7 +334,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * which is important when load is unaligned.
 	 */
 	if (n >= 128) {
-		rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
+		(*rte_mov128blocks)((uint8_t *)dst, (const uint8_t *)src, n);
 		bits = n;
 		n = n & 127;
 		bits -= n;
@@ -308,10 +347,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 */
 	goto COPY_BLOCK_128_BACK63;
 }
-
-#elif defined RTE_MACHINE_CPUFLAG_AVX2
-
-#define ALIGNMENT_MASK 0x1F
+#endif
 
 /**
  * AVX2 implementation below
@@ -321,8 +357,10 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
  * Copy 16 bytes from one location to another,
  * locations should not overlap.
  */
+#ifdef CC_SUPPORT_AVX2
+__attribute__((target("avx2")))
 static inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
+rte_mov16_AVX2(uint8_t *dst, const uint8_t *src)
 {
 	__m128i xmm0;
 
@@ -334,8 +372,9 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
  * Copy 32 bytes from one location to another,
  * locations should not overlap.
  */
+__attribute__((target("avx2")))
 static inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
+rte_mov32_AVX2(uint8_t *dst, const uint8_t *src)
 {
 	__m256i ymm0;
 
@@ -347,32 +386,35 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
  * Copy 64 bytes from one location to another,
  * locations should not overlap.
  */
+__attribute__((target("avx2")))
 static inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
+rte_mov64_AVX2(uint8_t *dst, const uint8_t *src)
 {
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
+	(*rte_mov32)((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
+	(*rte_mov32)((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
 }
 
 /**
  * Copy 128 bytes from one location to another,
  * locations should not overlap.
  */
+__attribute__((target("avx2")))
 static inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
+rte_mov128_AVX2(uint8_t *dst, const uint8_t *src)
 {
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
-	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
+	(*rte_mov32)((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
+	(*rte_mov32)((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
+	(*rte_mov32)((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
+	(*rte_mov32)((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
 }
 
 /**
  * Copy 128-byte blocks from one location to another,
  * locations should not overlap.
  */
+__attribute__((target("avx2")))
 static inline void
-rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+rte_mov128blocks_AVX2(uint8_t *dst, const uint8_t *src, size_t n)
 {
 	__m256i ymm0, ymm1, ymm2, ymm3;
 
@@ -391,8 +433,9 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 	}
 }
 
+__attribute__((target("avx2")))
 static inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_AVX2(void *dst, const void *src, size_t n)
 {
 	uintptr_t dstu = (uintptr_t)dst;
 	uintptr_t srcu = (uintptr_t)src;
@@ -429,46 +472,46 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Fast way when copy size doesn't exceed 256 bytes
 	 */
 	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n,
+		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov16)((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 48) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
-		rte_mov16((uint8_t *)dst - 16 + n,
+		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov16)((uint8_t *)dst + 16, (const uint8_t *)src + 16);
+		(*rte_mov16)((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
+		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov32)((uint8_t *)dst - 32 + n,
 				(const uint8_t *)src - 32 + n);
 		return ret;
 	}
 	if (n <= 256) {
 		if (n >= 128) {
 			n -= 128;
-			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
+			(*rte_mov128)((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 128;
 			dst = (uint8_t *)dst + 128;
 		}
 COPY_BLOCK_128_BACK31:
 		if (n >= 64) {
 			n -= 64;
-			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+			(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 64;
 			dst = (uint8_t *)dst + 64;
 		}
 		if (n > 32) {
-			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-			rte_mov32((uint8_t *)dst - 32 + n,
+			(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
+			(*rte_mov32)((uint8_t *)dst - 32 + n,
 					(const uint8_t *)src - 32 + n);
 			return ret;
 		}
 		if (n > 0) {
-			rte_mov32((uint8_t *)dst - 32 + n,
+			(*rte_mov32)((uint8_t *)dst - 32 + n,
 					(const uint8_t *)src - 32 + n);
 		}
 		return ret;
@@ -481,7 +524,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	if (dstofss > 0) {
 		dstofss = 32 - dstofss;
 		n -= dstofss;
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
 		src = (const uint8_t *)src + dstofss;
 		dst = (uint8_t *)dst + dstofss;
 	}
@@ -489,7 +532,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	/**
 	 * Copy 128-byte blocks
 	 */
-	rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
+	(*rte_mov128blocks)((uint8_t *)dst, (const uint8_t *)src, n);
 	bits = n;
 	n = n & 127;
 	bits -= n;
@@ -501,10 +544,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 */
 	goto COPY_BLOCK_128_BACK31;
 }
-
-#else /* RTE_MACHINE_CPUFLAG */
-
-#define ALIGNMENT_MASK 0x0F
+#endif
 
 /**
  * SSE & AVX implementation below
@@ -514,8 +554,9 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
  * Copy 16 bytes from one location to another,
  * locations should not overlap.
  */
+__attribute__((target("default")))
 static inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
+rte_mov16_DEFAULT(uint8_t *dst, const uint8_t *src)
 {
 	__m128i xmm0;
 
@@ -527,66 +568,70 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
  * Copy 32 bytes from one location to another,
  * locations should not overlap.
  */
+__attribute__((target("default")))
 static inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
+rte_mov32_DEFAULT(uint8_t *dst, const uint8_t *src)
 {
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	(*rte_mov16)((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	(*rte_mov16)((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
 }
 
 /**
  * Copy 64 bytes from one location to another,
  * locations should not overlap.
  */
+__attribute__((target("default")))
 static inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
+rte_mov64_DEFAULT(uint8_t *dst, const uint8_t *src)
 {
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
+	(*rte_mov16)((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	(*rte_mov16)((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	(*rte_mov16)((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+	(*rte_mov16)((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
 }
 
 /**
  * Copy 128 bytes from one location to another,
  * locations should not overlap.
  */
+__attribute__((target("default")))
 static inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
+rte_mov128_DEFAULT(uint8_t *dst, const uint8_t *src)
 {
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
-	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
-	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
-	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
+	(*rte_mov16)((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	(*rte_mov16)((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	(*rte_mov16)((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+	(*rte_mov16)((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
+	(*rte_mov16)((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
+	(*rte_mov16)((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
+	(*rte_mov16)((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
+	(*rte_mov16)((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
 }
 
 /**
  * Copy 256 bytes from one location to another,
  * locations should not overlap.
  */
+__attribute__((target("default")))
 static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
+rte_mov256_DEFAULT(uint8_t *dst, const uint8_t *src)
 {
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
-	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
-	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
-	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
-	rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
-	rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
-	rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
-	rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
-	rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
-	rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
-	rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
-	rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
+	(*rte_mov16)((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	(*rte_mov16)((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	(*rte_mov16)((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+	(*rte_mov16)((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
+	(*rte_mov16)((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
+	(*rte_mov16)((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
+	(*rte_mov16)((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
+	(*rte_mov16)((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
+	(*rte_mov16)((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
+	(*rte_mov16)((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
+	(*rte_mov16)((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
+	(*rte_mov16)((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
+	(*rte_mov16)((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
+	(*rte_mov16)((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
+	(*rte_mov16)((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
+	(*rte_mov16)((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
 }
 
 /**
@@ -683,8 +728,9 @@ __extension__ ({                                                      \
     }                                                                 \
 })
 
+__attribute__((target("default")))
 static inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
+rte_memcpy_generic_DEFAULT(void *dst, const void *src, size_t n)
 {
 	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
 	uintptr_t dstu = (uintptr_t)dst;
@@ -722,19 +768,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
 	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov16)((uint8_t *)dst - 16 + n,
+				(const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 48) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov16)((uint8_t *)dst - 16 + n,
+				(const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov16)((uint8_t *)dst + 32, (const uint8_t *)src + 32);
+		(*rte_mov16)((uint8_t *)dst - 16 + n,
+				(const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 128) {
@@ -743,39 +792,42 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	if (n <= 512) {
 		if (n >= 256) {
 			n -= 256;
-			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
-			rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
+			(*rte_mov128)((uint8_t *)dst, (const uint8_t *)src);
+			(*rte_mov128)((uint8_t *)dst + 128,
+					(const uint8_t *)src + 128);
 			src = (const uint8_t *)src + 256;
 			dst = (uint8_t *)dst + 256;
 		}
 COPY_BLOCK_255_BACK15:
 		if (n >= 128) {
 			n -= 128;
-			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
+			(*rte_mov128)((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 128;
 			dst = (uint8_t *)dst + 128;
 		}
 COPY_BLOCK_128_BACK15:
 		if (n >= 64) {
 			n -= 64;
-			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+			(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 64;
 			dst = (uint8_t *)dst + 64;
 		}
 COPY_BLOCK_64_BACK15:
 		if (n >= 32) {
 			n -= 32;
-			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+			(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 32;
 			dst = (uint8_t *)dst + 32;
 		}
 		if (n > 16) {
-			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+			(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
+			(*rte_mov16)((uint8_t *)dst - 16 + n,
+					(const uint8_t *)src - 16 + n);
 			return ret;
 		}
 		if (n > 0) {
-			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+			(*rte_mov16)((uint8_t *)dst - 16 + n,
+					(const uint8_t *)src - 16 + n);
 		}
 		return ret;
 	}
@@ -790,7 +842,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	if (dstofss > 0) {
 		dstofss = 16 - dstofss + 16;
 		n -= dstofss;
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
 		src = (const uint8_t *)src + dstofss;
 		dst = (uint8_t *)dst + dstofss;
 	}
@@ -804,7 +856,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 		 * Copy 256-byte blocks
 		 */
 		for (; n >= 256; n -= 256) {
-			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
+			(*rte_mov256)((uint8_t *)dst, (const uint8_t *)src);
 			dst = (uint8_t *)dst + 256;
 			src = (const uint8_t *)src + 256;
 		}
@@ -826,7 +878,46 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	goto COPY_BLOCK_64_BACK15;
 }
 
-#endif /* RTE_MACHINE_CPUFLAG */
+static void __attribute__((constructor))
+rte_memcpy_init(void)
+{
+#ifdef CC_SUPPORT_AVX512
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F)) {
+		alignment_mask = 0x3F;
+		rte_mov16 = rte_mov16_AVX512F;
+		rte_mov32 = rte_mov32_AVX512F;
+		rte_mov64 = rte_mov64_AVX512F;
+		rte_mov128 = rte_mov128_AVX512F;
+		rte_mov256 = rte_mov256_AVX512F;
+		rte_mov128blocks = rte_mov128blocks_AVX512F;
+		rte_mov512blocks = rte_mov512blocks_AVX512F;
+		rte_memcpy_generic = rte_memcpy_generic_AVX512F;
+		RTE_LOG(INFO, EAL, "AVX512 implementation of memcpy() is using!\n");
+	} else
+#endif
+#ifdef CC_SUPPORT_AVX2
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2)) {
+		alignment_mask = 0x1F;
+		rte_mov16 = rte_mov16_AVX2;
+		rte_mov32 = rte_mov32_AVX2;
+		rte_mov64 = rte_mov64_AVX2;
+		rte_mov128 = rte_mov128_AVX2;
+		rte_mov128blocks = rte_mov128blocks_AVX2;
+		rte_memcpy_generic = rte_memcpy_generic_AVX2;
+		RTE_LOG(INFO, EAL, "AVX2 implementation of memcpy() is using!\n");
+	} else
+#endif
+	{
+		alignment_mask = 0x0F;
+		rte_mov16 = rte_mov16_DEFAULT;
+		rte_mov32 = rte_mov32_DEFAULT;
+		rte_mov64 = rte_mov64_DEFAULT;
+		rte_mov128 = rte_mov128_DEFAULT;
+		rte_mov256 = rte_mov256_DEFAULT;
+		rte_memcpy_generic = rte_memcpy_generic_DEFAULT;
+		RTE_LOG(INFO, EAL, "Default SSE/AVX implementation of memcpy() is using!\n");
+	}
+}
 
 static inline void *
 rte_memcpy_aligned(void *dst, const void *src, size_t n)
@@ -858,8 +949,8 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 
 	/* Copy 16 <= size <= 32 bytes */
 	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n,
+		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov16)((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 
 		return ret;
@@ -867,8 +958,8 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 
 	/* Copy 32 < size <= 64 bytes */
 	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
+		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov32)((uint8_t *)dst - 32 + n,
 				(const uint8_t *)src - 32 + n);
 
 		return ret;
@@ -876,13 +967,13 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 
 	/* Copy 64 bytes blocks */
 	for (; n >= 64; n -= 64) {
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
 		dst = (uint8_t *)dst + 64;
 		src = (const uint8_t *)src + 64;
 	}
 
 	/* Copy whatever left */
-	rte_mov64((uint8_t *)dst - 64 + n,
+	(*rte_mov64)((uint8_t *)dst - 64 + n,
 			(const uint8_t *)src - 64 + n);
 
 	return ret;
@@ -891,10 +982,10 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 static inline void *
 rte_memcpy(void *dst, const void *src, size_t n)
 {
-	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
+	if (!(((uintptr_t)dst | (uintptr_t)src) & alignment_mask))
 		return rte_memcpy_aligned(dst, src, n);
 	else
-		return rte_memcpy_generic(dst, src, n);
+		return (*rte_memcpy_generic)(dst, src, n);
 }
 
 #ifdef __cplusplus
diff --git a/mk/rte.cpuflags.mk b/mk/rte.cpuflags.mk
index a813c91..92399ec 100644
--- a/mk/rte.cpuflags.mk
+++ b/mk/rte.cpuflags.mk
@@ -141,3 +141,17 @@ space:= $(empty) $(empty)
 CPUFLAGSTMP1 := $(addprefix RTE_CPUFLAG_,$(CPUFLAGS))
 CPUFLAGSTMP2 := $(subst $(space),$(comma),$(CPUFLAGSTMP1))
 CPUFLAGS_LIST := -DRTE_COMPILE_TIME_CPUFLAGS=$(CPUFLAGSTMP2)
+
+# Check if the compiler supports AVX512.
+CC_SUPPORT_AVX512 := $(shell $(CC) -march=skylake-avx512 -dM -E - < /dev/null 2>&1 | grep -q AVX512 && echo 1)
+ifeq ($(CC_SUPPORT_AVX512),1)
+ifeq ($(CONFIG_RTE_ENABLE_AVX512),y)
+MACHINE_CFLAGS += -DCC_SUPPORT_AVX512
+endif
+endif
+
+# Check if the compiler supports AVX2.
+CC_SUPPORT_AVX2 := $(shell $(CC) -march=core-avx2 -dM -E - < /dev/null 2>&1 | grep -q AVX2 && echo 1)
+ifeq ($(CC_SUPPORT_AVX2),1)
+MACHINE_CFLAGS += -DCC_SUPPORT_AVX2
+endif
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH v2 2/3] app/test: run-time dispatch over memcpy perf test
  2017-09-01  8:56 ` [PATCH v2 0/3] dynamic linking support Xiaoyun Li
  2017-09-01  8:57   ` [PATCH v2 1/3] eal/x86: run-time dispatch over memcpy Xiaoyun Li
@ 2017-09-01  8:57   ` Xiaoyun Li
  2017-09-01  8:57   ` [PATCH v2 3/3] efd: run-time dispatch over x86 EFD functions Xiaoyun Li
  2 siblings, 0 replies; 22+ messages in thread
From: Xiaoyun Li @ 2017-09-01  8:57 UTC (permalink / raw)
  To: bruce.richardson; +Cc: dev, zhihong.wang, qi.z.zhang, wenzhuo.lu, Xiaoyun Li

This patch moves the assignment of the alignment unit from build time
to run time, based on the CPU flags that the machine supports.

Signed-off-by: Xiaoyun Li <xiaoyun.li@intel.com>
---
 test/test/test_memcpy_perf.c | 40 +++++++++++++++++++++++++++-------------
 1 file changed, 27 insertions(+), 13 deletions(-)

diff --git a/test/test/test_memcpy_perf.c b/test/test/test_memcpy_perf.c
index ff3aaaa..33def3b 100644
--- a/test/test/test_memcpy_perf.c
+++ b/test/test/test_memcpy_perf.c
@@ -79,13 +79,7 @@ static size_t buf_sizes[TEST_VALUE_RANGE];
 #define TEST_BATCH_SIZE         100
 
 /* Data is aligned on this many bytes (power of 2) */
-#ifdef RTE_MACHINE_CPUFLAG_AVX512F
-#define ALIGNMENT_UNIT          64
-#elif defined RTE_MACHINE_CPUFLAG_AVX2
-#define ALIGNMENT_UNIT          32
-#else /* RTE_MACHINE_CPUFLAG */
-#define ALIGNMENT_UNIT          16
-#endif /* RTE_MACHINE_CPUFLAG */
+static uint8_t alignment_unit = 16;
 
 /*
  * Pointers used in performance tests. The two large buffers are for uncached
@@ -100,20 +94,39 @@ static int
 init_buffers(void)
 {
 	unsigned i;
+#ifdef CC_SUPPORT_AVX512
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F))
+		alignment_unit = 64;
+	else
+#endif
+#ifdef CC_SUPPORT_AVX2
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2))
+		alignment_unit = 32;
+	else
+#endif
+		alignment_unit = 16;
 
-	large_buf_read = rte_malloc("memcpy", LARGE_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
+	large_buf_read = rte_malloc("memcpy",
+				    LARGE_BUFFER_SIZE + alignment_unit,
+				    alignment_unit);
 	if (large_buf_read == NULL)
 		goto error_large_buf_read;
 
-	large_buf_write = rte_malloc("memcpy", LARGE_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
+	large_buf_write = rte_malloc("memcpy",
+				     LARGE_BUFFER_SIZE + alignment_unit,
+				     alignment_unit);
 	if (large_buf_write == NULL)
 		goto error_large_buf_write;
 
-	small_buf_read = rte_malloc("memcpy", SMALL_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
+	small_buf_read = rte_malloc("memcpy",
+				    SMALL_BUFFER_SIZE + alignment_unit,
+				    alignment_unit);
 	if (small_buf_read == NULL)
 		goto error_small_buf_read;
 
-	small_buf_write = rte_malloc("memcpy", SMALL_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
+	small_buf_write = rte_malloc("memcpy",
+				     SMALL_BUFFER_SIZE + alignment_unit,
+				     alignment_unit);
 	if (small_buf_write == NULL)
 		goto error_small_buf_write;
 
@@ -153,7 +166,7 @@ static inline size_t
 get_rand_offset(size_t uoffset)
 {
 	return ((rte_rand() % (LARGE_BUFFER_SIZE - SMALL_BUFFER_SIZE)) &
-			~(ALIGNMENT_UNIT - 1)) + uoffset;
+			~(alignment_unit - 1)) + uoffset;
 }
 
 /* Fill in source and destination addresses. */
@@ -321,7 +334,8 @@ perf_test(void)
 		   "(bytes)        (ticks)        (ticks)        (ticks)        (ticks)\n"
 		   "------- -------------- -------------- -------------- --------------");
 
-	printf("\n========================== %2dB aligned ============================", ALIGNMENT_UNIT);
+	printf("\n========================= %2dB aligned ============================",
+		alignment_unit);
 	/* Do aligned tests where size is a variable */
 	perf_test_variable_aligned();
 	printf("\n------- -------------- -------------- -------------- --------------");
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH v2 3/3] efd: run-time dispatch over x86 EFD functions
  2017-09-01  8:56 ` [PATCH v2 0/3] dynamic linking support Xiaoyun Li
  2017-09-01  8:57   ` [PATCH v2 1/3] eal/x86: run-time dispatch over memcpy Xiaoyun Li
  2017-09-01  8:57   ` [PATCH v2 2/3] app/test: run-time dispatch over memcpy perf test Xiaoyun Li
@ 2017-09-01  8:57   ` Xiaoyun Li
  2 siblings, 0 replies; 22+ messages in thread
From: Xiaoyun Li @ 2017-09-01  8:57 UTC (permalink / raw)
  To: bruce.richardson; +Cc: dev, zhihong.wang, qi.z.zhang, wenzhuo.lu, Xiaoyun Li

This patch dynamically selects x86 EFD functions at run-time.
This patch uses a function pointer and binds it to the corresponding
function based on CPU flags at constructor time.

Signed-off-by: Xiaoyun Li <xiaoyun.li@intel.com>
---
 lib/librte_efd/rte_efd_x86.h | 41 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 38 insertions(+), 3 deletions(-)

diff --git a/lib/librte_efd/rte_efd_x86.h b/lib/librte_efd/rte_efd_x86.h
index 34f37d7..93b6743 100644
--- a/lib/librte_efd/rte_efd_x86.h
+++ b/lib/librte_efd/rte_efd_x86.h
@@ -43,12 +43,29 @@
 #define EFD_LOAD_SI128(val) _mm_lddqu_si128(val)
 #endif
 
+typedef efd_value_t
+(*efd_lookup_internal_avx2_t)(const efd_hashfunc_t *group_hash_idx,
+		const efd_lookuptbl_t *group_lookup_table,
+		const uint32_t hash_val_a, const uint32_t hash_val_b);
+
+static efd_lookup_internal_avx2_t efd_lookup_internal_avx2_ptr;
+
 static inline efd_value_t
 efd_lookup_internal_avx2(const efd_hashfunc_t *group_hash_idx,
 		const efd_lookuptbl_t *group_lookup_table,
 		const uint32_t hash_val_a, const uint32_t hash_val_b)
 {
-#ifdef RTE_MACHINE_CPUFLAG_AVX2
+	return (*efd_lookup_internal_avx2_ptr)(group_hash_idx,
+					       group_lookup_table,
+					       hash_val_a, hash_val_b);
+}
+
+#ifdef CC_SUPPORT_AVX2
+static inline efd_value_t
+efd_lookup_internal_avx2_AVX2(const efd_hashfunc_t *group_hash_idx,
+		const efd_lookuptbl_t *group_lookup_table,
+		const uint32_t hash_val_a, const uint32_t hash_val_b)
+{
 	efd_value_t value = 0;
 	uint32_t i = 0;
 	__m256i vhash_val_a = _mm256_set1_epi32(hash_val_a);
@@ -74,13 +91,31 @@ efd_lookup_internal_avx2(const efd_hashfunc_t *group_hash_idx,
 	}
 
 	return value;
-#else
+}
+#endif
+
+static inline efd_value_t
+efd_lookup_internal_avx2_DEFAULT(const efd_hashfunc_t *group_hash_idx,
+		const efd_lookuptbl_t *group_lookup_table,
+		const uint32_t hash_val_a, const uint32_t hash_val_b)
+{
 	RTE_SET_USED(group_hash_idx);
 	RTE_SET_USED(group_lookup_table);
 	RTE_SET_USED(hash_val_a);
 	RTE_SET_USED(hash_val_b);
 	/* Return dummy value, only to avoid compilation breakage */
 	return 0;
-#endif
+}
 
+static void __attribute__((constructor))
+rte_efd_x86_init(void)
+{
+#ifdef CC_SUPPORT_AVX2
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2))
+		efd_lookup_internal_avx2_ptr = efd_lookup_internal_avx2_AVX2;
+	else
+		efd_lookup_internal_avx2_ptr = efd_lookup_internal_avx2_DEFAULT;
+#else
+	efd_lookup_internal_avx2_ptr = efd_lookup_internal_avx2_DEFAULT;
+#endif
 }
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH v2 1/3] eal/x86: run-time dispatch over memcpy
  2017-09-01  8:57   ` [PATCH v2 1/3] eal/x86: run-time dispatch over memcpy Xiaoyun Li
@ 2017-09-01  9:16     ` Ananyev, Konstantin
  2017-09-01  9:28       ` Li, Xiaoyun
  2017-09-01 15:34     ` Stephen Hemminger
  1 sibling, 1 reply; 22+ messages in thread
From: Ananyev, Konstantin @ 2017-09-01  9:16 UTC (permalink / raw)
  To: Li, Xiaoyun, Richardson, Bruce
  Cc: dev, Wang, Zhihong, Zhang, Qi Z, Lu, Wenzhuo, Li, Xiaoyun



> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Xiaoyun Li
> Sent: Friday, September 1, 2017 9:57 AM
> To: Richardson, Bruce <bruce.richardson@intel.com>
> Cc: dev@dpdk.org; Wang, Zhihong <zhihong.wang@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Lu, Wenzhuo
> <wenzhuo.lu@intel.com>; Li, Xiaoyun <xiaoyun.li@intel.com>
> Subject: [dpdk-dev] [PATCH v2 1/3] eal/x86: run-time dispatch over memcpy
> 
> This patch dynamically selects functions of memcpy at run-time based
> on CPU flags that current machine supports. This patch uses function
> pointers which are bound to the corresponding functions at constructor time.
> In addition, AVX512 instructions set would be compiled only if users
> config it enabled and the compiler supports it.


I'll ask the same question again:
It seems quite an overhead to add an extra function call for each 16(32/64)B movement.
Wouldn't it be better to have one func_ptr per implementation, i.e:
rte_memcpy_sse(), rte_memcpy_avx2(), rte_memcpy_avx512(), etc.?
BTW, what is the performance diff between old and new versions?
Konstantin

> 
> Signed-off-by: Xiaoyun Li <xiaoyun.li@intel.com>
> ---
> v2
> * use gcc function multi-versioning to avoid compilation issue.
> * add macros for AVX512 and AVX2. Only if users enable AVX512 and the
> compiler supports it, the AVX512 codes would be compiled. Only if the
> compiler supports AVX2, the AVX2 codes would be compiled.
> 
>  .../common/include/arch/x86/rte_memcpy.h           | 343 +++++++++++++--------
>  mk/rte.cpuflags.mk                                 |  14 +
>  2 files changed, 231 insertions(+), 126 deletions(-)
> 
> diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> index 74c280c..abba6ad 100644
> --- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> +++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> @@ -45,11 +45,45 @@
>  #include <string.h>
>  #include <rte_vect.h>
>  #include <rte_common.h>
> +#include <rte_cpuflags.h>
> +#include <rte_log.h>
> 
>  #ifdef __cplusplus
>  extern "C" {
>  #endif
> 
> +/*
> + * Select SSE/AVX memory copy method as default one.
> + */
> +
> +static uint16_t alignment_mask = 0x0F;
> +
> +typedef void (*rte_mov16_t)(uint8_t *dst, const uint8_t *src);
> +typedef void (*rte_mov32_t)(uint8_t *dst, const uint8_t *src);
> +typedef void (*rte_mov64_t)(uint8_t *dst, const uint8_t *src);
> +typedef void (*rte_mov128_t)(uint8_t *dst, const uint8_t *src);
> +typedef void (*rte_mov256_t)(uint8_t *dst, const uint8_t *src);
> +#ifdef CC_SUPPORT_AVX2
> +typedef void (*rte_mov128blocks_t)(uint8_t *dst, const uint8_t *src, size_t n);
> +#endif
> +#ifdef CC_SUPPORT_AVX512
> +typedef void (*rte_mov512blocks_t)(uint8_t *dst, const uint8_t *src, size_t n);
> +#endif
> +typedef void * (*rte_memcpy_generic_t)(void *dst, const void *src, size_t n);
> +
> +static rte_mov16_t rte_mov16;
> +static rte_mov32_t rte_mov32;
> +static rte_mov64_t rte_mov64;
> +static rte_mov128_t rte_mov128;
> +static rte_mov256_t rte_mov256;
> +#ifdef CC_SUPPORT_AVX2
> +static rte_mov128blocks_t rte_mov128blocks;
> +#endif
> +#ifdef CC_SUPPORT_AVX512
> +static rte_mov512blocks_t rte_mov512blocks;
> +#endif
> +static rte_memcpy_generic_t rte_memcpy_generic;
> +
>  /**
>   * Copy bytes from one location to another. The locations must not overlap.
>   *
> @@ -68,10 +102,6 @@ extern "C" {
>  static __rte_always_inline void *
>  rte_memcpy(void *dst, const void *src, size_t n);
> 
> -#ifdef RTE_MACHINE_CPUFLAG_AVX512F
> -
> -#define ALIGNMENT_MASK 0x3F
> -
>  /**
>   * AVX512 implementation below
>   */
> @@ -80,8 +110,10 @@ rte_memcpy(void *dst, const void *src, size_t n);
>   * Copy 16 bytes from one location to another,
>   * locations should not overlap.
>   */
> +#ifdef CC_SUPPORT_AVX512
> +__attribute__((target("avx512f")))
>  static inline void
> -rte_mov16(uint8_t *dst, const uint8_t *src)
> +rte_mov16_AVX512F(uint8_t *dst, const uint8_t *src)
>  {
>  	__m128i xmm0;
> 
> @@ -93,8 +125,9 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
>   * Copy 32 bytes from one location to another,
>   * locations should not overlap.
>   */
> +__attribute__((target("avx512f")))
>  static inline void
> -rte_mov32(uint8_t *dst, const uint8_t *src)
> +rte_mov32_AVX512F(uint8_t *dst, const uint8_t *src)
>  {
>  	__m256i ymm0;
> 
> @@ -106,8 +139,9 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
>   * Copy 64 bytes from one location to another,
>   * locations should not overlap.
>   */
> +__attribute__((target("avx512f")))
>  static inline void
> -rte_mov64(uint8_t *dst, const uint8_t *src)
> +rte_mov64_AVX512F(uint8_t *dst, const uint8_t *src)
>  {
>  	__m512i zmm0;
> 
> @@ -119,32 +153,35 @@ rte_mov64(uint8_t *dst, const uint8_t *src)
>   * Copy 128 bytes from one location to another,
>   * locations should not overlap.
>   */
> +__attribute__((target("avx512f")))
>  static inline void
> -rte_mov128(uint8_t *dst, const uint8_t *src)
> +rte_mov128_AVX512F(uint8_t *dst, const uint8_t *src)
>  {
> -	rte_mov64(dst + 0 * 64, src + 0 * 64);
> -	rte_mov64(dst + 1 * 64, src + 1 * 64);
> +	(*rte_mov64)(dst + 0 * 64, src + 0 * 64);
> +	(*rte_mov64)(dst + 1 * 64, src + 1 * 64);
>  }
> 
>  /**
>   * Copy 256 bytes from one location to another,
>   * locations should not overlap.
>   */
> +__attribute__((target("avx512f")))
>  static inline void
> -rte_mov256(uint8_t *dst, const uint8_t *src)
> +rte_mov256_AVX512F(uint8_t *dst, const uint8_t *src)
>  {
> -	rte_mov64(dst + 0 * 64, src + 0 * 64);
> -	rte_mov64(dst + 1 * 64, src + 1 * 64);
> -	rte_mov64(dst + 2 * 64, src + 2 * 64);
> -	rte_mov64(dst + 3 * 64, src + 3 * 64);
> +	(*rte_mov64)(dst + 0 * 64, src + 0 * 64);
> +	(*rte_mov64)(dst + 1 * 64, src + 1 * 64);
> +	(*rte_mov64)(dst + 2 * 64, src + 2 * 64);
> +	(*rte_mov64)(dst + 3 * 64, src + 3 * 64);
>  }
> 
>  /**
>   * Copy 128-byte blocks from one location to another,
>   * locations should not overlap.
>   */
> +__attribute__((target("avx512f")))
>  static inline void
> -rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
> +rte_mov128blocks_AVX512F(uint8_t *dst, const uint8_t *src, size_t n)
>  {
>  	__m512i zmm0, zmm1;
> 
> @@ -163,8 +200,9 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
>   * Copy 512-byte blocks from one location to another,
>   * locations should not overlap.
>   */
> +__attribute__((target("avx512f")))
>  static inline void
> -rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
> +rte_mov512blocks_AVX512F(uint8_t *dst, const uint8_t *src, size_t n)
>  {
>  	__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
> 
> @@ -191,8 +229,9 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
>  	}
>  }
> 
> +__attribute__((target("avx512f")))
>  static inline void *
> -rte_memcpy_generic(void *dst, const void *src, size_t n)
> +rte_memcpy_generic_AVX512F(void *dst, const void *src, size_t n)
>  {
>  	uintptr_t dstu = (uintptr_t)dst;
>  	uintptr_t srcu = (uintptr_t)src;
> @@ -228,39 +267,39 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	 * Fast way when copy size doesn't exceed 512 bytes
>  	 */
>  	if (n <= 32) {
> -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst - 16 + n,
> +		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov16)((uint8_t *)dst - 16 + n,
>  				  (const uint8_t *)src - 16 + n);
>  		return ret;
>  	}
>  	if (n <= 64) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov32((uint8_t *)dst - 32 + n,
> +		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov32)((uint8_t *)dst - 32 + n,
>  				  (const uint8_t *)src - 32 + n);
>  		return ret;
>  	}
>  	if (n <= 512) {
>  		if (n >= 256) {
>  			n -= 256;
> -			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
> +			(*rte_mov256)((uint8_t *)dst, (const uint8_t *)src);
>  			src = (const uint8_t *)src + 256;
>  			dst = (uint8_t *)dst + 256;
>  		}
>  		if (n >= 128) {
>  			n -= 128;
> -			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
> +			(*rte_mov128)((uint8_t *)dst, (const uint8_t *)src);
>  			src = (const uint8_t *)src + 128;
>  			dst = (uint8_t *)dst + 128;
>  		}
>  COPY_BLOCK_128_BACK63:
>  		if (n > 64) {
> -			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> -			rte_mov64((uint8_t *)dst - 64 + n,
> +			(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
> +			(*rte_mov64)((uint8_t *)dst - 64 + n,
>  					  (const uint8_t *)src - 64 + n);
>  			return ret;
>  		}
>  		if (n > 0)
> -			rte_mov64((uint8_t *)dst - 64 + n,
> +			(*rte_mov64)((uint8_t *)dst - 64 + n,
>  					  (const uint8_t *)src - 64 + n);
>  		return ret;
>  	}
> @@ -272,7 +311,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	if (dstofss > 0) {
>  		dstofss = 64 - dstofss;
>  		n -= dstofss;
> -		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
>  		src = (const uint8_t *)src + dstofss;
>  		dst = (uint8_t *)dst + dstofss;
>  	}
> @@ -282,7 +321,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	 * Use copy block function for better instruction order control,
>  	 * which is important when load is unaligned.
>  	 */
> -	rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
> +	(*rte_mov512blocks)((uint8_t *)dst, (const uint8_t *)src, n);
>  	bits = n;
>  	n = n & 511;
>  	bits -= n;
> @@ -295,7 +334,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	 * which is important when load is unaligned.
>  	 */
>  	if (n >= 128) {
> -		rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
> +		(*rte_mov128blocks)((uint8_t *)dst, (const uint8_t *)src, n);
>  		bits = n;
>  		n = n & 127;
>  		bits -= n;
> @@ -308,10 +347,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	 */
>  	goto COPY_BLOCK_128_BACK63;
>  }
> -
> -#elif defined RTE_MACHINE_CPUFLAG_AVX2
> -
> -#define ALIGNMENT_MASK 0x1F
> +#endif
> 
>  /**
>   * AVX2 implementation below
> @@ -321,8 +357,10 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>   * Copy 16 bytes from one location to another,
>   * locations should not overlap.
>   */
> +#ifdef CC_SUPPORT_AVX2
> +__attribute__((target("avx2")))
>  static inline void
> -rte_mov16(uint8_t *dst, const uint8_t *src)
> +rte_mov16_AVX2(uint8_t *dst, const uint8_t *src)
>  {
>  	__m128i xmm0;
> 
> @@ -334,8 +372,9 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
>   * Copy 32 bytes from one location to another,
>   * locations should not overlap.
>   */
> +__attribute__((target("avx2")))
>  static inline void
> -rte_mov32(uint8_t *dst, const uint8_t *src)
> +rte_mov32_AVX2(uint8_t *dst, const uint8_t *src)
>  {
>  	__m256i ymm0;
> 
> @@ -347,32 +386,35 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
>   * Copy 64 bytes from one location to another,
>   * locations should not overlap.
>   */
> +__attribute__((target("avx2")))
>  static inline void
> -rte_mov64(uint8_t *dst, const uint8_t *src)
> +rte_mov64_AVX2(uint8_t *dst, const uint8_t *src)
>  {
> -	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> -	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> +	(*rte_mov32)((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> +	(*rte_mov32)((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
>  }
> 
>  /**
>   * Copy 128 bytes from one location to another,
>   * locations should not overlap.
>   */
> +__attribute__((target("avx2")))
>  static inline void
> -rte_mov128(uint8_t *dst, const uint8_t *src)
> +rte_mov128_AVX2(uint8_t *dst, const uint8_t *src)
>  {
> -	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> -	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> -	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
> -	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
> +	(*rte_mov32)((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> +	(*rte_mov32)((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> +	(*rte_mov32)((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
> +	(*rte_mov32)((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
>  }
> 
>  /**
>   * Copy 128-byte blocks from one location to another,
>   * locations should not overlap.
>   */
> +__attribute__((target("avx2")))
>  static inline void
> -rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
> +rte_mov128blocks_AVX2(uint8_t *dst, const uint8_t *src, size_t n)
>  {
>  	__m256i ymm0, ymm1, ymm2, ymm3;
> 
> @@ -391,8 +433,9 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
>  	}
>  }
> 
> +__attribute__((target("avx2")))
>  static inline void *
> -rte_memcpy_generic(void *dst, const void *src, size_t n)
> +rte_memcpy_generic_AVX2(void *dst, const void *src, size_t n)
>  {
>  	uintptr_t dstu = (uintptr_t)dst;
>  	uintptr_t srcu = (uintptr_t)src;
> @@ -429,46 +472,46 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	 * Fast way when copy size doesn't exceed 256 bytes
>  	 */
>  	if (n <= 32) {
> -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst - 16 + n,
> +		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov16)((uint8_t *)dst - 16 + n,
>  				(const uint8_t *)src - 16 + n);
>  		return ret;
>  	}
>  	if (n <= 48) {
> -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
> -		rte_mov16((uint8_t *)dst - 16 + n,
> +		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov16)((uint8_t *)dst + 16, (const uint8_t *)src + 16);
> +		(*rte_mov16)((uint8_t *)dst - 16 + n,
>  				(const uint8_t *)src - 16 + n);
>  		return ret;
>  	}
>  	if (n <= 64) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov32((uint8_t *)dst - 32 + n,
> +		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov32)((uint8_t *)dst - 32 + n,
>  				(const uint8_t *)src - 32 + n);
>  		return ret;
>  	}
>  	if (n <= 256) {
>  		if (n >= 128) {
>  			n -= 128;
> -			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
> +			(*rte_mov128)((uint8_t *)dst, (const uint8_t *)src);
>  			src = (const uint8_t *)src + 128;
>  			dst = (uint8_t *)dst + 128;
>  		}
>  COPY_BLOCK_128_BACK31:
>  		if (n >= 64) {
>  			n -= 64;
> -			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> +			(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
>  			src = (const uint8_t *)src + 64;
>  			dst = (uint8_t *)dst + 64;
>  		}
>  		if (n > 32) {
> -			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -			rte_mov32((uint8_t *)dst - 32 + n,
> +			(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
> +			(*rte_mov32)((uint8_t *)dst - 32 + n,
>  					(const uint8_t *)src - 32 + n);
>  			return ret;
>  		}
>  		if (n > 0) {
> -			rte_mov32((uint8_t *)dst - 32 + n,
> +			(*rte_mov32)((uint8_t *)dst - 32 + n,
>  					(const uint8_t *)src - 32 + n);
>  		}
>  		return ret;
> @@ -481,7 +524,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	if (dstofss > 0) {
>  		dstofss = 32 - dstofss;
>  		n -= dstofss;
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
>  		src = (const uint8_t *)src + dstofss;
>  		dst = (uint8_t *)dst + dstofss;
>  	}
> @@ -489,7 +532,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	/**
>  	 * Copy 128-byte blocks
>  	 */
> -	rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
> +	(*rte_mov128blocks)((uint8_t *)dst, (const uint8_t *)src, n);
>  	bits = n;
>  	n = n & 127;
>  	bits -= n;
> @@ -501,10 +544,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	 */
>  	goto COPY_BLOCK_128_BACK31;
>  }
> -
> -#else /* RTE_MACHINE_CPUFLAG */
> -
> -#define ALIGNMENT_MASK 0x0F
> +#endif
> 
>  /**
>   * SSE & AVX implementation below
> @@ -514,8 +554,9 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>   * Copy 16 bytes from one location to another,
>   * locations should not overlap.
>   */
> +__attribute__((target("default")))
>  static inline void
> -rte_mov16(uint8_t *dst, const uint8_t *src)
> +rte_mov16_DEFAULT(uint8_t *dst, const uint8_t *src)
>  {
>  	__m128i xmm0;
> 
> @@ -527,66 +568,70 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
>   * Copy 32 bytes from one location to another,
>   * locations should not overlap.
>   */
> +__attribute__((target("default")))
>  static inline void
> -rte_mov32(uint8_t *dst, const uint8_t *src)
> +rte_mov32_DEFAULT(uint8_t *dst, const uint8_t *src)
>  {
> -	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> -	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
>  }
> 
>  /**
>   * Copy 64 bytes from one location to another,
>   * locations should not overlap.
>   */
> +__attribute__((target("default")))
>  static inline void
> -rte_mov64(uint8_t *dst, const uint8_t *src)
> +rte_mov64_DEFAULT(uint8_t *dst, const uint8_t *src)
>  {
> -	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> -	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> -	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> -	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
>  }
> 
>  /**
>   * Copy 128 bytes from one location to another,
>   * locations should not overlap.
>   */
> +__attribute__((target("default")))
>  static inline void
> -rte_mov128(uint8_t *dst, const uint8_t *src)
> +rte_mov128_DEFAULT(uint8_t *dst, const uint8_t *src)
>  {
> -	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> -	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> -	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> -	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> -	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
> -	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
> -	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
> -	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
>  }
> 
>  /**
>   * Copy 256 bytes from one location to another,
>   * locations should not overlap.
>   */
> +__attribute__((target("default")))
>  static inline void
> -rte_mov256(uint8_t *dst, const uint8_t *src)
> +rte_mov256_DEFAULT(uint8_t *dst, const uint8_t *src)
>  {
> -	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> -	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> -	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> -	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> -	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
> -	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
> -	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
> -	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
> -	rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
> -	rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
> -	rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
> -	rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
> -	rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
> -	rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
> -	rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
> -	rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
>  }
> 
>  /**
> @@ -683,8 +728,9 @@ __extension__ ({                                                      \
>      }                                                                 \
>  })
> 
> +__attribute__((target("default")))
>  static inline void *
> -rte_memcpy_generic(void *dst, const void *src, size_t n)
> +rte_memcpy_generic_DEFAULT(void *dst, const void *src, size_t n)
>  {
>  	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
>  	uintptr_t dstu = (uintptr_t)dst;
> @@ -722,19 +768,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	 * Fast way when copy size doesn't exceed 512 bytes
>  	 */
>  	if (n <= 32) {
> -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> +		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov16)((uint8_t *)dst - 16 + n,
> +				(const uint8_t *)src - 16 + n);
>  		return ret;
>  	}
>  	if (n <= 48) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> +		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov16)((uint8_t *)dst - 16 + n,
> +				(const uint8_t *)src - 16 + n);
>  		return ret;
>  	}
>  	if (n <= 64) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
> -		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> +		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov16)((uint8_t *)dst + 32, (const uint8_t *)src + 32);
> +		(*rte_mov16)((uint8_t *)dst - 16 + n,
> +				(const uint8_t *)src - 16 + n);
>  		return ret;
>  	}
>  	if (n <= 128) {
> @@ -743,39 +792,42 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	if (n <= 512) {
>  		if (n >= 256) {
>  			n -= 256;
> -			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
> -			rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
> +			(*rte_mov128)((uint8_t *)dst, (const uint8_t *)src);
> +			(*rte_mov128)((uint8_t *)dst + 128,
> +					(const uint8_t *)src + 128);
>  			src = (const uint8_t *)src + 256;
>  			dst = (uint8_t *)dst + 256;
>  		}
>  COPY_BLOCK_255_BACK15:
>  		if (n >= 128) {
>  			n -= 128;
> -			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
> +			(*rte_mov128)((uint8_t *)dst, (const uint8_t *)src);
>  			src = (const uint8_t *)src + 128;
>  			dst = (uint8_t *)dst + 128;
>  		}
>  COPY_BLOCK_128_BACK15:
>  		if (n >= 64) {
>  			n -= 64;
> -			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> +			(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
>  			src = (const uint8_t *)src + 64;
>  			dst = (uint8_t *)dst + 64;
>  		}
>  COPY_BLOCK_64_BACK15:
>  		if (n >= 32) {
>  			n -= 32;
> -			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +			(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
>  			src = (const uint8_t *)src + 32;
>  			dst = (uint8_t *)dst + 32;
>  		}
>  		if (n > 16) {
> -			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> +			(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
> +			(*rte_mov16)((uint8_t *)dst - 16 + n,
> +					(const uint8_t *)src - 16 + n);
>  			return ret;
>  		}
>  		if (n > 0) {
> -			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> +			(*rte_mov16)((uint8_t *)dst - 16 + n,
> +					(const uint8_t *)src - 16 + n);
>  		}
>  		return ret;
>  	}
> @@ -790,7 +842,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	if (dstofss > 0) {
>  		dstofss = 16 - dstofss + 16;
>  		n -= dstofss;
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
>  		src = (const uint8_t *)src + dstofss;
>  		dst = (uint8_t *)dst + dstofss;
>  	}
> @@ -804,7 +856,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  		 * Copy 256-byte blocks
>  		 */
>  		for (; n >= 256; n -= 256) {
> -			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
> +			(*rte_mov256)((uint8_t *)dst, (const uint8_t *)src);
>  			dst = (uint8_t *)dst + 256;
>  			src = (const uint8_t *)src + 256;
>  		}
> @@ -826,7 +878,46 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	goto COPY_BLOCK_64_BACK15;
>  }
> 
> -#endif /* RTE_MACHINE_CPUFLAG */
> +static void __attribute__((constructor))
> +rte_memcpy_init(void)
> +{
> +#ifdef CC_SUPPORT_AVX512
> +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F)) {
> +		alignment_mask = 0x3F;
> +		rte_mov16 = rte_mov16_AVX512F;
> +		rte_mov32 = rte_mov32_AVX512F;
> +		rte_mov64 = rte_mov64_AVX512F;
> +		rte_mov128 = rte_mov128_AVX512F;
> +		rte_mov256 = rte_mov256_AVX512F;
> +		rte_mov128blocks = rte_mov128blocks_AVX512F;
> +		rte_mov512blocks = rte_mov512blocks_AVX512F;
> +		rte_memcpy_generic = rte_memcpy_generic_AVX512F;
> +		RTE_LOG(INFO, EAL, "AVX512 implementation of memcpy() is using!\n");
> +	} else
> +#endif
> +#ifdef CC_SUPPORT_AVX2
> +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2)) {
> +		alignment_mask = 0x1F;
> +		rte_mov16 = rte_mov16_AVX2;
> +		rte_mov32 = rte_mov32_AVX2;
> +		rte_mov64 = rte_mov64_AVX2;
> +		rte_mov128 = rte_mov128_AVX2;
> +		rte_mov128blocks = rte_mov128blocks_AVX2;
> +		rte_memcpy_generic = rte_memcpy_generic_AVX2;
> +		RTE_LOG(INFO, EAL, "AVX2 implementation of memcpy() is using!\n");
> +	} else
> +#endif
> +	{
> +		alignment_mask = 0x0F;
> +		rte_mov16 = rte_mov16_DEFAULT;
> +		rte_mov32 = rte_mov32_DEFAULT;
> +		rte_mov64 = rte_mov64_DEFAULT;
> +		rte_mov128 = rte_mov128_DEFAULT;
> +		rte_mov256 = rte_mov256_DEFAULT;
> +		rte_memcpy_generic = rte_memcpy_generic_DEFAULT;
> +		RTE_LOG(INFO, EAL, "Default SSE/AVX implementation of memcpy() is using!\n");
> +	}
> +}
> 
>  static inline void *
>  rte_memcpy_aligned(void *dst, const void *src, size_t n)
> @@ -858,8 +949,8 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
> 
>  	/* Copy 16 <= size <= 32 bytes */
>  	if (n <= 32) {
> -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst - 16 + n,
> +		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov16)((uint8_t *)dst - 16 + n,
>  				(const uint8_t *)src - 16 + n);
> 
>  		return ret;
> @@ -867,8 +958,8 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
> 
>  	/* Copy 32 < size <= 64 bytes */
>  	if (n <= 64) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov32((uint8_t *)dst - 32 + n,
> +		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov32)((uint8_t *)dst - 32 + n,
>  				(const uint8_t *)src - 32 + n);
> 
>  		return ret;
> @@ -876,13 +967,13 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
> 
>  	/* Copy 64 bytes blocks */
>  	for (; n >= 64; n -= 64) {
> -		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
>  		dst = (uint8_t *)dst + 64;
>  		src = (const uint8_t *)src + 64;
>  	}
> 
>  	/* Copy whatever left */
> -	rte_mov64((uint8_t *)dst - 64 + n,
> +	(*rte_mov64)((uint8_t *)dst - 64 + n,
>  			(const uint8_t *)src - 64 + n);
> 
>  	return ret;
> @@ -891,10 +982,10 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
>  static inline void *
>  rte_memcpy(void *dst, const void *src, size_t n)
>  {
> -	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
> +	if (!(((uintptr_t)dst | (uintptr_t)src) & alignment_mask))
>  		return rte_memcpy_aligned(dst, src, n);
>  	else
> -		return rte_memcpy_generic(dst, src, n);
> +		return (*rte_memcpy_generic)(dst, src, n);
>  }
> 
>  #ifdef __cplusplus
> diff --git a/mk/rte.cpuflags.mk b/mk/rte.cpuflags.mk
> index a813c91..92399ec 100644
> --- a/mk/rte.cpuflags.mk
> +++ b/mk/rte.cpuflags.mk
> @@ -141,3 +141,17 @@ space:= $(empty) $(empty)
>  CPUFLAGSTMP1 := $(addprefix RTE_CPUFLAG_,$(CPUFLAGS))
>  CPUFLAGSTMP2 := $(subst $(space),$(comma),$(CPUFLAGSTMP1))
>  CPUFLAGS_LIST := -DRTE_COMPILE_TIME_CPUFLAGS=$(CPUFLAGSTMP2)
> +
> +# Check if the compiler supports AVX512.
> +CC_SUPPORT_AVX512 := $(shell $(CC) -march=skylake-avx512 -dM -E - < /dev/null 2>&1 | grep -q AVX512 && echo 1)
> +ifeq ($(CC_SUPPORT_AVX512),1)
> +ifeq ($(CONFIG_RTE_ENABLE_AVX512),y)
> +MACHINE_CFLAGS += -DCC_SUPPORT_AVX512
> +endif
> +endif
> +
> +# Check if the compiler supports AVX2.
> +CC_SUPPORT_AVX2 := $(shell $(CC) -march=core-avx2 -dM -E - < /dev/null 2>&1 | grep -q AVX2 && echo 1)
> +ifeq ($(CC_SUPPORT_AVX2),1)
> +MACHINE_CFLAGS += -DCC_SUPPORT_AVX2
> +endif
> --
> 2.7.4

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2 1/3] eal/x86: run-time dispatch over memcpy
  2017-09-01  9:16     ` Ananyev, Konstantin
@ 2017-09-01  9:28       ` Li, Xiaoyun
  2017-09-01 10:38         ` Ananyev, Konstantin
  0 siblings, 1 reply; 22+ messages in thread
From: Li, Xiaoyun @ 2017-09-01  9:28 UTC (permalink / raw)
  To: Ananyev, Konstantin, Richardson, Bruce
  Cc: dev, Wang, Zhihong, Zhang, Qi Z, Lu, Wenzhuo

Hi,
I sent an email just now.
The original code has so many functions because other places (currently nowhere else) might need functions like rte_mov16.
Then they could use them directly. So I am not sure whether I need to modify it.

The performance hasn't been tested yet, but it should be the same as before.
Previously, if the CPU and compiler supported AVX512 and users enabled it, the AVX512 code would be compiled and run.
I just deleted the macro and made the selection happen at run-time (constructor time) instead of at build time.


Best Regards,
Xiaoyun Li




-----Original Message-----
From: Ananyev, Konstantin 
Sent: Friday, September 1, 2017 17:17
To: Li, Xiaoyun <xiaoyun.li@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>
Cc: dev@dpdk.org; Wang, Zhihong <zhihong.wang@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Li, Xiaoyun <xiaoyun.li@intel.com>
Subject: RE: [dpdk-dev] [PATCH v2 1/3] eal/x86: run-time dispatch over memcpy



> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Xiaoyun Li
> Sent: Friday, September 1, 2017 9:57 AM
> To: Richardson, Bruce <bruce.richardson@intel.com>
> Cc: dev@dpdk.org; Wang, Zhihong <zhihong.wang@intel.com>; Zhang, Qi Z 
> <qi.z.zhang@intel.com>; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Li, 
> Xiaoyun <xiaoyun.li@intel.com>
> Subject: [dpdk-dev] [PATCH v2 1/3] eal/x86: run-time dispatch over 
> memcpy
> 
> This patch dynamically selects memcpy functions at run-time based
> on the CPU flags that the current machine supports. This patch uses
> function pointers that are bound to the appropriate functions at
> constructor time.
> In addition, the AVX512 instruction set is compiled only if users
> enable it in the config and the compiler supports it.


I'll ask the same question again:
It seems quite an overhead to add extra function call for each 16(32/64)B movement.
Wouldn't it be better to have one func_ptr per implementation, i.e:
rte_memcpy_sse(), rte_memcpy_avx2(), rte_memcpy_avx512(), etc.?
BTW, what is the performance diff between old and new versions?
Konstantin

> 
> Signed-off-by: Xiaoyun Li <xiaoyun.li@intel.com>
> ---
> v2
> * use gcc function multi-versioning to avoid compilation issue.
> * add macros for AVX512 and AVX2. Only if users enable AVX512 and the 
> compiler supports it, the AVX512 codes would be compiled. Only if the 
> compiler supports AVX2, the AVX2 codes would be compiled.
> 
>  .../common/include/arch/x86/rte_memcpy.h           | 343 +++++++++++++--------
>  mk/rte.cpuflags.mk                                 |  14 +
>  2 files changed, 231 insertions(+), 126 deletions(-)
> 
> diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h 
> b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> index 74c280c..abba6ad 100644
> --- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> +++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> @@ -45,11 +45,45 @@
>  #include <string.h>
>  #include <rte_vect.h>
>  #include <rte_common.h>
> +#include <rte_cpuflags.h>
> +#include <rte_log.h>
> 
>  #ifdef __cplusplus
>  extern "C" {
>  #endif
> 
> +/*
> + * Select SSE/AVX memory copy method as default one.
> + */
> +
> +static uint16_t alignment_mask = 0x0F;
> +
> +typedef void (*rte_mov16_t)(uint8_t *dst, const uint8_t *src); 
> +typedef void (*rte_mov32_t)(uint8_t *dst, const uint8_t *src); 
> +typedef void (*rte_mov64_t)(uint8_t *dst, const uint8_t *src); 
> +typedef void (*rte_mov128_t)(uint8_t *dst, const uint8_t *src); 
> +typedef void (*rte_mov256_t)(uint8_t *dst, const uint8_t *src); 
> +#ifdef CC_SUPPORT_AVX2 typedef void (*rte_mov128blocks_t)(uint8_t 
> +*dst, const uint8_t *src, size_t n); #endif #ifdef CC_SUPPORT_AVX512 
> +typedef void (*rte_mov512blocks_t)(uint8_t *dst, const uint8_t *src, 
> +size_t n); #endif typedef void * (*rte_memcpy_generic_t)(void *dst, 
> +const void *src, size_t n);
> +
> +static rte_mov16_t rte_mov16;
> +static rte_mov32_t rte_mov32;
> +static rte_mov64_t rte_mov64;
> +static rte_mov128_t rte_mov128;
> +static rte_mov256_t rte_mov256;
> +#ifdef CC_SUPPORT_AVX2
> +static rte_mov128blocks_t rte_mov128blocks; #endif #ifdef 
> +CC_SUPPORT_AVX512 static rte_mov512blocks_t rte_mov512blocks; #endif 
> +static rte_memcpy_generic_t rte_memcpy_generic;
> +
>  /**
>   * Copy bytes from one location to another. The locations must not overlap.
>   *
> @@ -68,10 +102,6 @@ extern "C" {
>  static __rte_always_inline void *
>  rte_memcpy(void *dst, const void *src, size_t n);
> 
> -#ifdef RTE_MACHINE_CPUFLAG_AVX512F
> -
> -#define ALIGNMENT_MASK 0x3F
> -
>  /**
>   * AVX512 implementation below
>   */
> @@ -80,8 +110,10 @@ rte_memcpy(void *dst, const void *src, size_t n);
>   * Copy 16 bytes from one location to another,
>   * locations should not overlap.
>   */
> +#ifdef CC_SUPPORT_AVX512
> +__attribute__((target("avx512f")))
>  static inline void
> -rte_mov16(uint8_t *dst, const uint8_t *src)
> +rte_mov16_AVX512F(uint8_t *dst, const uint8_t *src)
>  {
>  	__m128i xmm0;
> 
> @@ -93,8 +125,9 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
>   * Copy 32 bytes from one location to another,
>   * locations should not overlap.
>   */
> +__attribute__((target("avx512f")))
>  static inline void
> -rte_mov32(uint8_t *dst, const uint8_t *src)
> +rte_mov32_AVX512F(uint8_t *dst, const uint8_t *src)
>  {
>  	__m256i ymm0;
> 
> @@ -106,8 +139,9 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
>   * Copy 64 bytes from one location to another,
>   * locations should not overlap.
>   */
> +__attribute__((target("avx512f")))
>  static inline void
> -rte_mov64(uint8_t *dst, const uint8_t *src)
> +rte_mov64_AVX512F(uint8_t *dst, const uint8_t *src)
>  {
>  	__m512i zmm0;
> 
> @@ -119,32 +153,35 @@ rte_mov64(uint8_t *dst, const uint8_t *src)
>   * Copy 128 bytes from one location to another,
>   * locations should not overlap.
>   */
> +__attribute__((target("avx512f")))
>  static inline void
> -rte_mov128(uint8_t *dst, const uint8_t *src)
> +rte_mov128_AVX512F(uint8_t *dst, const uint8_t *src)
>  {
> -	rte_mov64(dst + 0 * 64, src + 0 * 64);
> -	rte_mov64(dst + 1 * 64, src + 1 * 64);
> +	(*rte_mov64)(dst + 0 * 64, src + 0 * 64);
> +	(*rte_mov64)(dst + 1 * 64, src + 1 * 64);
>  }
> 
>  /**
>   * Copy 256 bytes from one location to another,
>   * locations should not overlap.
>   */
> +__attribute__((target("avx512f")))
>  static inline void
> -rte_mov256(uint8_t *dst, const uint8_t *src)
> +rte_mov256_AVX512F(uint8_t *dst, const uint8_t *src)
>  {
> -	rte_mov64(dst + 0 * 64, src + 0 * 64);
> -	rte_mov64(dst + 1 * 64, src + 1 * 64);
> -	rte_mov64(dst + 2 * 64, src + 2 * 64);
> -	rte_mov64(dst + 3 * 64, src + 3 * 64);
> +	(*rte_mov64)(dst + 0 * 64, src + 0 * 64);
> +	(*rte_mov64)(dst + 1 * 64, src + 1 * 64);
> +	(*rte_mov64)(dst + 2 * 64, src + 2 * 64);
> +	(*rte_mov64)(dst + 3 * 64, src + 3 * 64);
>  }
> 
>  /**
>   * Copy 128-byte blocks from one location to another,
>   * locations should not overlap.
>   */
> +__attribute__((target("avx512f")))
>  static inline void
> -rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
> +rte_mov128blocks_AVX512F(uint8_t *dst, const uint8_t *src, size_t n)
>  {
>  	__m512i zmm0, zmm1;
> 
> @@ -163,8 +200,9 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
>   * Copy 512-byte blocks from one location to another,
>   * locations should not overlap.
>   */
> +__attribute__((target("avx512f")))
>  static inline void
> -rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
> +rte_mov512blocks_AVX512F(uint8_t *dst, const uint8_t *src, size_t n)
>  {
>  	__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
> 
> @@ -191,8 +229,9 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
>  	}
>  }
> 
> +__attribute__((target("avx512f")))
>  static inline void *
> -rte_memcpy_generic(void *dst, const void *src, size_t n)
> +rte_memcpy_generic_AVX512F(void *dst, const void *src, size_t n)
>  {
>  	uintptr_t dstu = (uintptr_t)dst;
>  	uintptr_t srcu = (uintptr_t)src;
> @@ -228,39 +267,39 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	 * Fast way when copy size doesn't exceed 512 bytes
>  	 */
>  	if (n <= 32) {
> -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst - 16 + n,
> +		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov16)((uint8_t *)dst - 16 + n,
>  				  (const uint8_t *)src - 16 + n);
>  		return ret;
>  	}
>  	if (n <= 64) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov32((uint8_t *)dst - 32 + n,
> +		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov32)((uint8_t *)dst - 32 + n,
>  				  (const uint8_t *)src - 32 + n);
>  		return ret;
>  	}
>  	if (n <= 512) {
>  		if (n >= 256) {
>  			n -= 256;
> -			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
> +			(*rte_mov256)((uint8_t *)dst, (const uint8_t *)src);
>  			src = (const uint8_t *)src + 256;
>  			dst = (uint8_t *)dst + 256;
>  		}
>  		if (n >= 128) {
>  			n -= 128;
> -			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
> +			(*rte_mov128)((uint8_t *)dst, (const uint8_t *)src);
>  			src = (const uint8_t *)src + 128;
>  			dst = (uint8_t *)dst + 128;
>  		}
>  COPY_BLOCK_128_BACK63:
>  		if (n > 64) {
> -			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> -			rte_mov64((uint8_t *)dst - 64 + n,
> +			(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
> +			(*rte_mov64)((uint8_t *)dst - 64 + n,
>  					  (const uint8_t *)src - 64 + n);
>  			return ret;
>  		}
>  		if (n > 0)
> -			rte_mov64((uint8_t *)dst - 64 + n,
> +			(*rte_mov64)((uint8_t *)dst - 64 + n,
>  					  (const uint8_t *)src - 64 + n);
>  		return ret;
>  	}
> @@ -272,7 +311,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	if (dstofss > 0) {
>  		dstofss = 64 - dstofss;
>  		n -= dstofss;
> -		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
>  		src = (const uint8_t *)src + dstofss;
>  		dst = (uint8_t *)dst + dstofss;
>  	}
> @@ -282,7 +321,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	 * Use copy block function for better instruction order control,
>  	 * which is important when load is unaligned.
>  	 */
> -	rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
> +	(*rte_mov512blocks)((uint8_t *)dst, (const uint8_t *)src, n);
>  	bits = n;
>  	n = n & 511;
>  	bits -= n;
> @@ -295,7 +334,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	 * which is important when load is unaligned.
>  	 */
>  	if (n >= 128) {
> -		rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
> +		(*rte_mov128blocks)((uint8_t *)dst, (const uint8_t *)src, n);
>  		bits = n;
>  		n = n & 127;
>  		bits -= n;
> @@ -308,10 +347,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	 */
>  	goto COPY_BLOCK_128_BACK63;
>  }
> -
> -#elif defined RTE_MACHINE_CPUFLAG_AVX2
> -
> -#define ALIGNMENT_MASK 0x1F
> +#endif
> 
>  /**
>   * AVX2 implementation below
> @@ -321,8 +357,10 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>   * Copy 16 bytes from one location to another,
>   * locations should not overlap.
>   */
> +#ifdef CC_SUPPORT_AVX2
> +__attribute__((target("avx2")))
>  static inline void
> -rte_mov16(uint8_t *dst, const uint8_t *src)
> +rte_mov16_AVX2(uint8_t *dst, const uint8_t *src)
>  {
>  	__m128i xmm0;
> 
> @@ -334,8 +372,9 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
>   * Copy 32 bytes from one location to another,
>   * locations should not overlap.
>   */
> +__attribute__((target("avx2")))
>  static inline void
> -rte_mov32(uint8_t *dst, const uint8_t *src)
> +rte_mov32_AVX2(uint8_t *dst, const uint8_t *src)
>  {
>  	__m256i ymm0;
> 
> @@ -347,32 +386,35 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
>   * Copy 64 bytes from one location to another,
>   * locations should not overlap.
>   */
> +__attribute__((target("avx2")))
>  static inline void
> -rte_mov64(uint8_t *dst, const uint8_t *src)
> +rte_mov64_AVX2(uint8_t *dst, const uint8_t *src)
>  {
> -	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> -	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> +	(*rte_mov32)((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> +	(*rte_mov32)((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
>  }
> 
>  /**
>   * Copy 128 bytes from one location to another,
>   * locations should not overlap.
>   */
> +__attribute__((target("avx2")))
>  static inline void
> -rte_mov128(uint8_t *dst, const uint8_t *src)
> +rte_mov128_AVX2(uint8_t *dst, const uint8_t *src)
>  {
> -	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> -	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> -	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
> -	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
> +	(*rte_mov32)((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> +	(*rte_mov32)((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> +	(*rte_mov32)((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
> +	(*rte_mov32)((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
>  }
> 
>  /**
>   * Copy 128-byte blocks from one location to another,
>   * locations should not overlap.
>   */
> +__attribute__((target("avx2")))
>  static inline void
> -rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
> +rte_mov128blocks_AVX2(uint8_t *dst, const uint8_t *src, size_t n)
>  {
>  	__m256i ymm0, ymm1, ymm2, ymm3;
> 
> @@ -391,8 +433,9 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
>  	}
>  }
> 
> +__attribute__((target("avx2")))
>  static inline void *
> -rte_memcpy_generic(void *dst, const void *src, size_t n)
> +rte_memcpy_generic_AVX2(void *dst, const void *src, size_t n)
>  {
>  	uintptr_t dstu = (uintptr_t)dst;
>  	uintptr_t srcu = (uintptr_t)src;
> @@ -429,46 +472,46 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	 * Fast way when copy size doesn't exceed 256 bytes
>  	 */
>  	if (n <= 32) {
> -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst - 16 + n,
> +		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov16)((uint8_t *)dst - 16 + n,
>  				(const uint8_t *)src - 16 + n);
>  		return ret;
>  	}
>  	if (n <= 48) {
> -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
> -		rte_mov16((uint8_t *)dst - 16 + n,
> +		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov16)((uint8_t *)dst + 16, (const uint8_t *)src + 16);
> +		(*rte_mov16)((uint8_t *)dst - 16 + n,
>  				(const uint8_t *)src - 16 + n);
>  		return ret;
>  	}
>  	if (n <= 64) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov32((uint8_t *)dst - 32 + n,
> +		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov32)((uint8_t *)dst - 32 + n,
>  				(const uint8_t *)src - 32 + n);
>  		return ret;
>  	}
>  	if (n <= 256) {
>  		if (n >= 128) {
>  			n -= 128;
> -			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
> +			(*rte_mov128)((uint8_t *)dst, (const uint8_t *)src);
>  			src = (const uint8_t *)src + 128;
>  			dst = (uint8_t *)dst + 128;
>  		}
>  COPY_BLOCK_128_BACK31:
>  		if (n >= 64) {
>  			n -= 64;
> -			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> +			(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
>  			src = (const uint8_t *)src + 64;
>  			dst = (uint8_t *)dst + 64;
>  		}
>  		if (n > 32) {
> -			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -			rte_mov32((uint8_t *)dst - 32 + n,
> +			(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
> +			(*rte_mov32)((uint8_t *)dst - 32 + n,
>  					(const uint8_t *)src - 32 + n);
>  			return ret;
>  		}
>  		if (n > 0) {
> -			rte_mov32((uint8_t *)dst - 32 + n,
> +			(*rte_mov32)((uint8_t *)dst - 32 + n,
>  					(const uint8_t *)src - 32 + n);
>  		}
>  		return ret;
> @@ -481,7 +524,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	if (dstofss > 0) {
>  		dstofss = 32 - dstofss;
>  		n -= dstofss;
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
>  		src = (const uint8_t *)src + dstofss;
>  		dst = (uint8_t *)dst + dstofss;
>  	}
> @@ -489,7 +532,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	/**
>  	 * Copy 128-byte blocks
>  	 */
> -	rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
> +	(*rte_mov128blocks)((uint8_t *)dst, (const uint8_t *)src, n);
>  	bits = n;
>  	n = n & 127;
>  	bits -= n;
> @@ -501,10 +544,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	 */
>  	goto COPY_BLOCK_128_BACK31;
>  }
> -
> -#else /* RTE_MACHINE_CPUFLAG */
> -
> -#define ALIGNMENT_MASK 0x0F
> +#endif
> 
>  /**
>   * SSE & AVX implementation below
> @@ -514,8 +554,9 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>   * Copy 16 bytes from one location to another,
>   * locations should not overlap.
>   */
> +__attribute__((target("default")))
>  static inline void
> -rte_mov16(uint8_t *dst, const uint8_t *src)
> +rte_mov16_DEFAULT(uint8_t *dst, const uint8_t *src)
>  {
>  	__m128i xmm0;
> 
> @@ -527,66 +568,70 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
>   * Copy 32 bytes from one location to another,
>   * locations should not overlap.
>   */
> +__attribute__((target("default")))
>  static inline void
> -rte_mov32(uint8_t *dst, const uint8_t *src)
> +rte_mov32_DEFAULT(uint8_t *dst, const uint8_t *src)
>  {
> -	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> -	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
>  }
> 
>  /**
>   * Copy 64 bytes from one location to another,
>   * locations should not overlap.
>   */
> +__attribute__((target("default")))
>  static inline void
> -rte_mov64(uint8_t *dst, const uint8_t *src)
> +rte_mov64_DEFAULT(uint8_t *dst, const uint8_t *src)
>  {
> -	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> -	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> -	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> -	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
>  }
> 
>  /**
>   * Copy 128 bytes from one location to another,
>   * locations should not overlap.
>   */
> +__attribute__((target("default")))
>  static inline void
> -rte_mov128(uint8_t *dst, const uint8_t *src)
> +rte_mov128_DEFAULT(uint8_t *dst, const uint8_t *src)
>  {
> -	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> -	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> -	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> -	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> -	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
> -	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
> -	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
> -	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
>  }
> 
>  /**
>   * Copy 256 bytes from one location to another,
>   * locations should not overlap.
>   */
> +__attribute__((target("default")))
>  static inline void
> -rte_mov256(uint8_t *dst, const uint8_t *src)
> +rte_mov256_DEFAULT(uint8_t *dst, const uint8_t *src)
>  {
> -	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> -	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> -	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> -	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> -	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
> -	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
> -	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
> -	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
> -	rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
> -	rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
> -	rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
> -	rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
> -	rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
> -	rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
> -	rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
> -	rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
> +	(*rte_mov16)((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
>  }
> 
>  /**
> @@ -683,8 +728,9 @@ __extension__ ({                                                      \
>      }                                                                 \
>  })
> 
> +__attribute__((target("default")))
>  static inline void *
> -rte_memcpy_generic(void *dst, const void *src, size_t n)
> +rte_memcpy_generic_DEFAULT(void *dst, const void *src, size_t n)
>  {
>  	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
>  	uintptr_t dstu = (uintptr_t)dst;
> @@ -722,19 +768,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	 * Fast way when copy size doesn't exceed 512 bytes
>  	 */
>  	if (n <= 32) {
> -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> +		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov16)((uint8_t *)dst - 16 + n,
> +				(const uint8_t *)src - 16 + n);
>  		return ret;
>  	}
>  	if (n <= 48) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> +		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov16)((uint8_t *)dst - 16 + n,
> +				(const uint8_t *)src - 16 + n);
>  		return ret;
>  	}
>  	if (n <= 64) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
> -		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> +		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov16)((uint8_t *)dst + 32, (const uint8_t *)src + 32);
> +		(*rte_mov16)((uint8_t *)dst - 16 + n,
> +				(const uint8_t *)src - 16 + n);
>  		return ret;
>  	}
>  	if (n <= 128) {
> @@ -743,39 +792,42 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	if (n <= 512) {
>  		if (n >= 256) {
>  			n -= 256;
> -			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
> -			rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
> +			(*rte_mov128)((uint8_t *)dst, (const uint8_t *)src);
> +			(*rte_mov128)((uint8_t *)dst + 128,
> +					(const uint8_t *)src + 128);
>  			src = (const uint8_t *)src + 256;
>  			dst = (uint8_t *)dst + 256;
>  		}
>  COPY_BLOCK_255_BACK15:
>  		if (n >= 128) {
>  			n -= 128;
> -			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
> +			(*rte_mov128)((uint8_t *)dst, (const uint8_t *)src);
>  			src = (const uint8_t *)src + 128;
>  			dst = (uint8_t *)dst + 128;
>  		}
>  COPY_BLOCK_128_BACK15:
>  		if (n >= 64) {
>  			n -= 64;
> -			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> +			(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
>  			src = (const uint8_t *)src + 64;
>  			dst = (uint8_t *)dst + 64;
>  		}
>  COPY_BLOCK_64_BACK15:
>  		if (n >= 32) {
>  			n -= 32;
> -			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +			(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
>  			src = (const uint8_t *)src + 32;
>  			dst = (uint8_t *)dst + 32;
>  		}
>  		if (n > 16) {
> -			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> +			(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
> +			(*rte_mov16)((uint8_t *)dst - 16 + n,
> +					(const uint8_t *)src - 16 + n);
>  			return ret;
>  		}
>  		if (n > 0) {
> -			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> +			(*rte_mov16)((uint8_t *)dst - 16 + n,
> +					(const uint8_t *)src - 16 + n);
>  		}
>  		return ret;
>  	}
> @@ -790,7 +842,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	if (dstofss > 0) {
>  		dstofss = 16 - dstofss + 16;
>  		n -= dstofss;
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
>  		src = (const uint8_t *)src + dstofss;
>  		dst = (uint8_t *)dst + dstofss;
>  	}
> @@ -804,7 +856,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  		 * Copy 256-byte blocks
>  		 */
>  		for (; n >= 256; n -= 256) {
> -			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
> +			(*rte_mov256)((uint8_t *)dst, (const uint8_t *)src);
>  			dst = (uint8_t *)dst + 256;
>  			src = (const uint8_t *)src + 256;
>  		}
> @@ -826,7 +878,46 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	goto COPY_BLOCK_64_BACK15;
>  }
> 
> -#endif /* RTE_MACHINE_CPUFLAG */
> +static void __attribute__((constructor))
> +rte_memcpy_init(void)
> +{
> +#ifdef CC_SUPPORT_AVX512
> +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F)) {
> +		alignment_mask = 0x3F;
> +		rte_mov16 = rte_mov16_AVX512F;
> +		rte_mov32 = rte_mov32_AVX512F;
> +		rte_mov64 = rte_mov64_AVX512F;
> +		rte_mov128 = rte_mov128_AVX512F;
> +		rte_mov256 = rte_mov256_AVX512F;
> +		rte_mov128blocks = rte_mov128blocks_AVX512F;
> +		rte_mov512blocks = rte_mov512blocks_AVX512F;
> +		rte_memcpy_generic = rte_memcpy_generic_AVX512F;
> +		RTE_LOG(INFO, EAL, "AVX512 implementation of memcpy() is using!\n");
> +	} else
> +#endif
> +#ifdef CC_SUPPORT_AVX2
> +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2)) {
> +		alignment_mask = 0x1F;
> +		rte_mov16 = rte_mov16_AVX2;
> +		rte_mov32 = rte_mov32_AVX2;
> +		rte_mov64 = rte_mov64_AVX2;
> +		rte_mov128 = rte_mov128_AVX2;
> +		rte_mov128blocks = rte_mov128blocks_AVX2;
> +		rte_memcpy_generic = rte_memcpy_generic_AVX2;
> +		RTE_LOG(INFO, EAL, "AVX2 implementation of memcpy() is using!\n");
> +	} else
> +#endif
> +	{
> +		alignment_mask = 0x0F;
> +		rte_mov16 = rte_mov16_DEFAULT;
> +		rte_mov32 = rte_mov32_DEFAULT;
> +		rte_mov64 = rte_mov64_DEFAULT;
> +		rte_mov128 = rte_mov128_DEFAULT;
> +		rte_mov256 = rte_mov256_DEFAULT;
> +		rte_memcpy_generic = rte_memcpy_generic_DEFAULT;
> +		RTE_LOG(INFO, EAL, "Default SSE/AVX implementation of memcpy() is using!\n");
> +	}
> +}
> 
>  static inline void *
>  rte_memcpy_aligned(void *dst, const void *src, size_t n)
> @@ -858,8 +949,8 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
> 
>  	/* Copy 16 <= size <= 32 bytes */
>  	if (n <= 32) {
> -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst - 16 + n,
> +		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov16)((uint8_t *)dst - 16 + n,
>  				(const uint8_t *)src - 16 + n);
> 
>  		return ret;
> @@ -867,8 +958,8 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
> 
>  	/* Copy 32 < size <= 64 bytes */
>  	if (n <= 64) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov32((uint8_t *)dst - 32 + n,
> +		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov32)((uint8_t *)dst - 32 + n,
>  				(const uint8_t *)src - 32 + n);
> 
>  		return ret;
> @@ -876,13 +967,13 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
> 
>  	/* Copy 64 bytes blocks */
>  	for (; n >= 64; n -= 64) {
> -		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> +		(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
>  		dst = (uint8_t *)dst + 64;
>  		src = (const uint8_t *)src + 64;
>  	}
> 
>  	/* Copy whatever left */
> -	rte_mov64((uint8_t *)dst - 64 + n,
> +	(*rte_mov64)((uint8_t *)dst - 64 + n,
>  			(const uint8_t *)src - 64 + n);
> 
>  	return ret;
> @@ -891,10 +982,10 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
>  static inline void *
>  rte_memcpy(void *dst, const void *src, size_t n)
>  {
> -	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
> +	if (!(((uintptr_t)dst | (uintptr_t)src) & alignment_mask))
>  		return rte_memcpy_aligned(dst, src, n);
>  	else
> -		return rte_memcpy_generic(dst, src, n);
> +		return (*rte_memcpy_generic)(dst, src, n);
>  }
> 
>  #ifdef __cplusplus
> diff --git a/mk/rte.cpuflags.mk b/mk/rte.cpuflags.mk
> index a813c91..92399ec 100644
> --- a/mk/rte.cpuflags.mk
> +++ b/mk/rte.cpuflags.mk
> @@ -141,3 +141,17 @@ space:= $(empty) $(empty)
>  CPUFLAGSTMP1 := $(addprefix RTE_CPUFLAG_,$(CPUFLAGS))
>  CPUFLAGSTMP2 := $(subst $(space),$(comma),$(CPUFLAGSTMP1))
>  CPUFLAGS_LIST := -DRTE_COMPILE_TIME_CPUFLAGS=$(CPUFLAGSTMP2)
> +
> +# Check if the compiler supports AVX512.
> +CC_SUPPORT_AVX512 := $(shell $(CC) -march=skylake-avx512 -dM -E - < /dev/null 2>&1 | grep -q AVX512 && echo 1)
> +ifeq ($(CC_SUPPORT_AVX512),1)
> +ifeq ($(CONFIG_RTE_ENABLE_AVX512),y)
> +MACHINE_CFLAGS += -DCC_SUPPORT_AVX512
> +endif
> +endif
> +
> +# Check if the compiler supports AVX2.
> +CC_SUPPORT_AVX2 := $(shell $(CC) -march=core-avx2 -dM -E - < /dev/null 2>&1 | grep -q AVX2 && echo 1)
> +ifeq ($(CC_SUPPORT_AVX2),1)
> +MACHINE_CFLAGS += -DCC_SUPPORT_AVX2
> +endif
> --
> 2.7.4

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2 1/3] eal/x86: run-time dispatch over memcpy
  2017-09-01  9:28       ` Li, Xiaoyun
@ 2017-09-01 10:38         ` Ananyev, Konstantin
  2017-09-04  1:41           ` Li, Xiaoyun
  0 siblings, 1 reply; 22+ messages in thread
From: Ananyev, Konstantin @ 2017-09-01 10:38 UTC (permalink / raw)
  To: Li, Xiaoyun, Richardson, Bruce
  Cc: dev, Wang, Zhihong, Zhang, Qi Z, Lu, Wenzhuo



> -----Original Message-----
> From: Li, Xiaoyun
> Sent: Friday, September 1, 2017 10:29 AM
> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>
> Cc: dev@dpdk.org; Wang, Zhihong <zhihong.wang@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Lu, Wenzhuo
> <wenzhuo.lu@intel.com>
> Subject: RE: [dpdk-dev] [PATCH v2 1/3] eal/x86: run-time dispatch over memcpy
> 
> Hi
> I send an email just now.
> The original code has so many functions because other places (right now nowhere else) might need functions like rte_mov16.

Which place exactly?
As I understand it, right now no one is using them, and they are supposed to be sort of private ones, correct?
So why do you want to make them public?

> Then they can directly use it. So I am not sure if I need modify it.
> 
> The performance hasn't been tested, but it should be the same as before.

Hmm, what makes you think that?
Let say to move 64B you are calling rte_mov16() 4 times.
Obviously each function call has an overhead -
prepare parameter registers, save/restore scratch registers, actual call/jmp instruction, ret, etc.
All that costs cycles.
Did you do any performance measurements here?
Konstantin 

> Before, if cpu and compiler supports AVX512 and users enable it, the AVX512 codes would be compiled and runed.
> I just deleted the macro and made the decision at run-time (constructor time), not at build time.
> 
> 
> Best Regards,
> Xiaoyun Li
> 
> 
> 
> 
> -----Original Message-----
> From: Ananyev, Konstantin
> Sent: Friday, September 1, 2017 17:17
> To: Li, Xiaoyun <xiaoyun.li@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>
> Cc: dev@dpdk.org; Wang, Zhihong <zhihong.wang@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Lu, Wenzhuo
> <wenzhuo.lu@intel.com>; Li, Xiaoyun <xiaoyun.li@intel.com>
> Subject: RE: [dpdk-dev] [PATCH v2 1/3] eal/x86: run-time dispatch over memcpy
> 
> 
> 
> > -----Original Message-----
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Xiaoyun Li
> > Sent: Friday, September 1, 2017 9:57 AM
> > To: Richardson, Bruce <bruce.richardson@intel.com>
> > Cc: dev@dpdk.org; Wang, Zhihong <zhihong.wang@intel.com>; Zhang, Qi Z
> > <qi.z.zhang@intel.com>; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Li,
> > Xiaoyun <xiaoyun.li@intel.com>
> > Subject: [dpdk-dev] [PATCH v2 1/3] eal/x86: run-time dispatch over
> > memcpy
> >
> > This patch dynamically selects functions of memcpy at run-time based
> > on CPU flags that current machine supports. This patch uses function
> > pointers which are bound to the corresponding functions at constructor time.
> > In addition, the AVX512 instruction set would be compiled only if users
> > enable it in the config and the compiler supports it.
> 
> 
> I'll ask the same question again:
> It seems quite an overhead to add extra function call for each 16(32/64)B movement.
> Wouldn't it be better to have one func_ptr per implementation, i.e:
> rte_memcpy_sse(), rte_memcpy_avx2(), rte_memcpy_avx512(), etc.?
> BTW, what is the performance diff between old and new versions?
> Konstantin
> 
> >
> > Signed-off-by: Xiaoyun Li <xiaoyun.li@intel.com>
> > ---
> > v2
> > * use gcc function multi-versioning to avoid compilation issue.
> > * add macros for AVX512 and AVX2. Only if users enable AVX512 and the
> > compiler supports it, the AVX512 codes would be compiled. Only if the
> > compiler supports AVX2, the AVX2 codes would be compiled.
> >
> >  .../common/include/arch/x86/rte_memcpy.h           | 343 +++++++++++++--------
> >  mk/rte.cpuflags.mk                                 |  14 +
> >  2 files changed, 231 insertions(+), 126 deletions(-)
> >
> > diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> > b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> > index 74c280c..abba6ad 100644
> > --- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> > +++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> > @@ -45,11 +45,45 @@
> >  #include <string.h>
> >  #include <rte_vect.h>
> >  #include <rte_common.h>
> > +#include <rte_cpuflags.h>
> > +#include <rte_log.h>
> >
> >  #ifdef __cplusplus
> >  extern "C" {
> >  #endif
> >
> > +/*
> > + * Select SSE/AVX memory copy method as default one.
> > + */
> > +
> > +static uint16_t alignment_mask = 0x0F;
> > +
> > +typedef void (*rte_mov16_t)(uint8_t *dst, const uint8_t *src);
> > +typedef void (*rte_mov32_t)(uint8_t *dst, const uint8_t *src);
> > +typedef void (*rte_mov64_t)(uint8_t *dst, const uint8_t *src);
> > +typedef void (*rte_mov128_t)(uint8_t *dst, const uint8_t *src);
> > +typedef void (*rte_mov256_t)(uint8_t *dst, const uint8_t *src);
> > +#ifdef CC_SUPPORT_AVX2 typedef void (*rte_mov128blocks_t)(uint8_t
> > +*dst, const uint8_t *src, size_t n); #endif #ifdef CC_SUPPORT_AVX512
> > +typedef void (*rte_mov512blocks_t)(uint8_t *dst, const uint8_t *src,
> > +size_t n); #endif typedef void * (*rte_memcpy_generic_t)(void *dst,
> > +const void *src, size_t n);
> > +
> > +static rte_mov16_t rte_mov16;
> > +static rte_mov32_t rte_mov32;
> > +static rte_mov64_t rte_mov64;
> > +static rte_mov128_t rte_mov128;
> > +static rte_mov256_t rte_mov256;
> > +#ifdef CC_SUPPORT_AVX2
> > +static rte_mov128blocks_t rte_mov128blocks; #endif #ifdef
> > +CC_SUPPORT_AVX512 static rte_mov512blocks_t rte_mov512blocks; #endif
> > +static rte_memcpy_generic_t rte_memcpy_generic;
> > +
> >  /**
> >   * Copy bytes from one location to another. The locations must not overlap.
> >   *
> > @@ -68,10 +102,6 @@ extern "C" {
> >  static __rte_always_inline void *
> >  rte_memcpy(void *dst, const void *src, size_t n);
> >
> > -#ifdef RTE_MACHINE_CPUFLAG_AVX512F
> > -
> > -#define ALIGNMENT_MASK 0x3F
> > -
> >  /**
> >   * AVX512 implementation below
> >   */
> > @@ -80,8 +110,10 @@ rte_memcpy(void *dst, const void *src, size_t n);
> >   * Copy 16 bytes from one location to another,
> >   * locations should not overlap.
> >   */
> > +#ifdef CC_SUPPORT_AVX512
> > +__attribute__((target("avx512f")))
> >  static inline void
> > -rte_mov16(uint8_t *dst, const uint8_t *src)
> > +rte_mov16_AVX512F(uint8_t *dst, const uint8_t *src)
> >  {
> >  	__m128i xmm0;
> >
> > @@ -93,8 +125,9 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
> >   * Copy 32 bytes from one location to another,
> >   * locations should not overlap.
> >   */
> > +__attribute__((target("avx512f")))
> >  static inline void
> > -rte_mov32(uint8_t *dst, const uint8_t *src)
> > +rte_mov32_AVX512F(uint8_t *dst, const uint8_t *src)
> >  {
> >  	__m256i ymm0;
> >
> > @@ -106,8 +139,9 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
> >   * Copy 64 bytes from one location to another,
> >   * locations should not overlap.
> >   */
> > +__attribute__((target("avx512f")))
> >  static inline void
> > -rte_mov64(uint8_t *dst, const uint8_t *src)
> > +rte_mov64_AVX512F(uint8_t *dst, const uint8_t *src)
> >  {
> >  	__m512i zmm0;
> >
> > @@ -119,32 +153,35 @@ rte_mov64(uint8_t *dst, const uint8_t *src)
> >   * Copy 128 bytes from one location to another,
> >   * locations should not overlap.
> >   */
> > +__attribute__((target("avx512f")))
> >  static inline void
> > -rte_mov128(uint8_t *dst, const uint8_t *src)
> > +rte_mov128_AVX512F(uint8_t *dst, const uint8_t *src)
> >  {
> > -	rte_mov64(dst + 0 * 64, src + 0 * 64);
> > -	rte_mov64(dst + 1 * 64, src + 1 * 64);
> > +	(*rte_mov64)(dst + 0 * 64, src + 0 * 64);
> > +	(*rte_mov64)(dst + 1 * 64, src + 1 * 64);
> >  }
> >
> >  /**
> >   * Copy 256 bytes from one location to another,
> >   * locations should not overlap.
> >   */
> > +__attribute__((target("avx512f")))
> >  static inline void
> > -rte_mov256(uint8_t *dst, const uint8_t *src)
> > +rte_mov256_AVX512F(uint8_t *dst, const uint8_t *src)
> >  {
> > -	rte_mov64(dst + 0 * 64, src + 0 * 64);
> > -	rte_mov64(dst + 1 * 64, src + 1 * 64);
> > -	rte_mov64(dst + 2 * 64, src + 2 * 64);
> > -	rte_mov64(dst + 3 * 64, src + 3 * 64);
> > +	(*rte_mov64)(dst + 0 * 64, src + 0 * 64);
> > +	(*rte_mov64)(dst + 1 * 64, src + 1 * 64);
> > +	(*rte_mov64)(dst + 2 * 64, src + 2 * 64);
> > +	(*rte_mov64)(dst + 3 * 64, src + 3 * 64);
> >  }
> >
> >  /**
> >   * Copy 128-byte blocks from one location to another,
> >   * locations should not overlap.
> >   */
> > +__attribute__((target("avx512f")))
> >  static inline void
> > -rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
> > +rte_mov128blocks_AVX512F(uint8_t *dst, const uint8_t *src, size_t n)
> >  {
> >  	__m512i zmm0, zmm1;
> >
> > @@ -163,8 +200,9 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
> >   * Copy 512-byte blocks from one location to another,
> >   * locations should not overlap.
> >   */
> > +__attribute__((target("avx512f")))
> >  static inline void
> > -rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
> > +rte_mov512blocks_AVX512F(uint8_t *dst, const uint8_t *src, size_t n)
> >  {
> >  	__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
> >
> > @@ -191,8 +229,9 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
> >  	}
> >  }
> >
> > +__attribute__((target("avx512f")))
> >  static inline void *
> > -rte_memcpy_generic(void *dst, const void *src, size_t n)
> > +rte_memcpy_generic_AVX512F(void *dst, const void *src, size_t n)
> >  {
> >  	uintptr_t dstu = (uintptr_t)dst;
> >  	uintptr_t srcu = (uintptr_t)src;
> > @@ -228,39 +267,39 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> >  	 * Fast way when copy size doesn't exceed 512 bytes
> >  	 */
> >  	if (n <= 32) {
> > -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > -		rte_mov16((uint8_t *)dst - 16 + n,
> > +		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
> > +		(*rte_mov16)((uint8_t *)dst - 16 + n,
> >  				  (const uint8_t *)src - 16 + n);
> >  		return ret;
> >  	}
> >  	if (n <= 64) {
> > -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -		rte_mov32((uint8_t *)dst - 32 + n,
> > +		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
> > +		(*rte_mov32)((uint8_t *)dst - 32 + n,
> >  				  (const uint8_t *)src - 32 + n);
> >  		return ret;
> >  	}
> >  	if (n <= 512) {
> >  		if (n >= 256) {
> >  			n -= 256;
> > -			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
> > +			(*rte_mov256)((uint8_t *)dst, (const uint8_t *)src);
> >  			src = (const uint8_t *)src + 256;
> >  			dst = (uint8_t *)dst + 256;
> >  		}
> >  		if (n >= 128) {
> >  			n -= 128;
> > -			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
> > +			(*rte_mov128)((uint8_t *)dst, (const uint8_t *)src);
> >  			src = (const uint8_t *)src + 128;
> >  			dst = (uint8_t *)dst + 128;
> >  		}
> >  COPY_BLOCK_128_BACK63:
> >  		if (n > 64) {
> > -			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> > -			rte_mov64((uint8_t *)dst - 64 + n,
> > +			(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
> > +			(*rte_mov64)((uint8_t *)dst - 64 + n,
> >  					  (const uint8_t *)src - 64 + n);
> >  			return ret;
> >  		}
> >  		if (n > 0)
> > -			rte_mov64((uint8_t *)dst - 64 + n,
> > +			(*rte_mov64)((uint8_t *)dst - 64 + n,
> >  					  (const uint8_t *)src - 64 + n);
> >  		return ret;
> >  	}
> > @@ -272,7 +311,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> >  	if (dstofss > 0) {
> >  		dstofss = 64 - dstofss;
> >  		n -= dstofss;
> > -		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> > +		(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
> >  		src = (const uint8_t *)src + dstofss;
> >  		dst = (uint8_t *)dst + dstofss;
> >  	}
> > @@ -282,7 +321,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> >  	 * Use copy block function for better instruction order control,
> >  	 * which is important when load is unaligned.
> >  	 */
> > -	rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
> > +	(*rte_mov512blocks)((uint8_t *)dst, (const uint8_t *)src, n);
> >  	bits = n;
> >  	n = n & 511;
> >  	bits -= n;
> > @@ -295,7 +334,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> >  	 * which is important when load is unaligned.
> >  	 */
> >  	if (n >= 128) {
> > -		rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
> > +		(*rte_mov128blocks)((uint8_t *)dst, (const uint8_t *)src, n);
> >  		bits = n;
> >  		n = n & 127;
> >  		bits -= n;
> > @@ -308,10 +347,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> >  	 */
> >  	goto COPY_BLOCK_128_BACK63;
> >  }
> > -
> > -#elif defined RTE_MACHINE_CPUFLAG_AVX2
> > -
> > -#define ALIGNMENT_MASK 0x1F
> > +#endif
> >
> >  /**
> >   * AVX2 implementation below
> > @@ -321,8 +357,10 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> >   * Copy 16 bytes from one location to another,
> >   * locations should not overlap.
> >   */
> > +#ifdef CC_SUPPORT_AVX2
> > +__attribute__((target("avx2")))
> >  static inline void
> > -rte_mov16(uint8_t *dst, const uint8_t *src)
> > +rte_mov16_AVX2(uint8_t *dst, const uint8_t *src)
> >  {
> >  	__m128i xmm0;
> >
> > @@ -334,8 +372,9 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
> >   * Copy 32 bytes from one location to another,
> >   * locations should not overlap.
> >   */
> > +__attribute__((target("avx2")))
> >  static inline void
> > -rte_mov32(uint8_t *dst, const uint8_t *src)
> > +rte_mov32_AVX2(uint8_t *dst, const uint8_t *src)
> >  {
> >  	__m256i ymm0;
> >
> > @@ -347,32 +386,35 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
> >   * Copy 64 bytes from one location to another,
> >   * locations should not overlap.
> >   */
> > +__attribute__((target("avx2")))
> >  static inline void
> > -rte_mov64(uint8_t *dst, const uint8_t *src)
> > +rte_mov64_AVX2(uint8_t *dst, const uint8_t *src)
> >  {
> > -	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> > -	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> > +	(*rte_mov32)((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> > +	(*rte_mov32)((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> >  }
> >
> >  /**
> >   * Copy 128 bytes from one location to another,
> >   * locations should not overlap.
> >   */
> > +__attribute__((target("avx2")))
> >  static inline void
> > -rte_mov128(uint8_t *dst, const uint8_t *src)
> > +rte_mov128_AVX2(uint8_t *dst, const uint8_t *src)
> >  {
> > -	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> > -	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> > -	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
> > -	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
> > +	(*rte_mov32)((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> > +	(*rte_mov32)((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> > +	(*rte_mov32)((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
> > +	(*rte_mov32)((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
> >  }
> >
> >  /**
> >   * Copy 128-byte blocks from one location to another,
> >   * locations should not overlap.
> >   */
> > +__attribute__((target("avx2")))
> >  static inline void
> > -rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
> > +rte_mov128blocks_AVX2(uint8_t *dst, const uint8_t *src, size_t n)
> >  {
> >  	__m256i ymm0, ymm1, ymm2, ymm3;
> >
> > @@ -391,8 +433,9 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
> >  	}
> >  }
> >
> > +__attribute__((target("avx2")))
> >  static inline void *
> > -rte_memcpy_generic(void *dst, const void *src, size_t n)
> > +rte_memcpy_generic_AVX2(void *dst, const void *src, size_t n)
> >  {
> >  	uintptr_t dstu = (uintptr_t)dst;
> >  	uintptr_t srcu = (uintptr_t)src;
> > @@ -429,46 +472,46 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> >  	 * Fast way when copy size doesn't exceed 256 bytes
> >  	 */
> >  	if (n <= 32) {
> > -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > -		rte_mov16((uint8_t *)dst - 16 + n,
> > +		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
> > +		(*rte_mov16)((uint8_t *)dst - 16 + n,
> >  				(const uint8_t *)src - 16 + n);
> >  		return ret;
> >  	}
> >  	if (n <= 48) {
> > -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > -		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
> > -		rte_mov16((uint8_t *)dst - 16 + n,
> > +		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
> > +		(*rte_mov16)((uint8_t *)dst + 16, (const uint8_t *)src + 16);
> > +		(*rte_mov16)((uint8_t *)dst - 16 + n,
> >  				(const uint8_t *)src - 16 + n);
> >  		return ret;
> >  	}
> >  	if (n <= 64) {
> > -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -		rte_mov32((uint8_t *)dst - 32 + n,
> > +		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
> > +		(*rte_mov32)((uint8_t *)dst - 32 + n,
> >  				(const uint8_t *)src - 32 + n);
> >  		return ret;
> >  	}
> >  	if (n <= 256) {
> >  		if (n >= 128) {
> >  			n -= 128;
> > -			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
> > +			(*rte_mov128)((uint8_t *)dst, (const uint8_t *)src);
> >  			src = (const uint8_t *)src + 128;
> >  			dst = (uint8_t *)dst + 128;
> >  		}
> >  COPY_BLOCK_128_BACK31:
> >  		if (n >= 64) {
> >  			n -= 64;
> > -			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> > +			(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
> >  			src = (const uint8_t *)src + 64;
> >  			dst = (uint8_t *)dst + 64;
> >  		}
> >  		if (n > 32) {
> > -			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -			rte_mov32((uint8_t *)dst - 32 + n,
> > +			(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
> > +			(*rte_mov32)((uint8_t *)dst - 32 + n,
> >  					(const uint8_t *)src - 32 + n);
> >  			return ret;
> >  		}
> >  		if (n > 0) {
> > -			rte_mov32((uint8_t *)dst - 32 + n,
> > +			(*rte_mov32)((uint8_t *)dst - 32 + n,
> >  					(const uint8_t *)src - 32 + n);
> >  		}
> >  		return ret;
> > @@ -481,7 +524,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> >  	if (dstofss > 0) {
> >  		dstofss = 32 - dstofss;
> >  		n -= dstofss;
> > -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > +		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
> >  		src = (const uint8_t *)src + dstofss;
> >  		dst = (uint8_t *)dst + dstofss;
> >  	}
> > @@ -489,7 +532,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> >  	/**
> >  	 * Copy 128-byte blocks
> >  	 */
> > -	rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
> > +	(*rte_mov128blocks)((uint8_t *)dst, (const uint8_t *)src, n);
> >  	bits = n;
> >  	n = n & 127;
> >  	bits -= n;
> > @@ -501,10 +544,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> >  	 */
> >  	goto COPY_BLOCK_128_BACK31;
> >  }
> > -
> > -#else /* RTE_MACHINE_CPUFLAG */
> > -
> > -#define ALIGNMENT_MASK 0x0F
> > +#endif
> >
> >  /**
> >   * SSE & AVX implementation below
> > @@ -514,8 +554,9 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> >   * Copy 16 bytes from one location to another,
> >   * locations should not overlap.
> >   */
> > +__attribute__((target("default")))
> >  static inline void
> > -rte_mov16(uint8_t *dst, const uint8_t *src)
> > +rte_mov16_DEFAULT(uint8_t *dst, const uint8_t *src)
> >  {
> >  	__m128i xmm0;
> >
> > @@ -527,66 +568,70 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
> >   * Copy 32 bytes from one location to another,
> >   * locations should not overlap.
> >   */
> > +__attribute__((target("default")))
> >  static inline void
> > -rte_mov32(uint8_t *dst, const uint8_t *src)
> > +rte_mov32_DEFAULT(uint8_t *dst, const uint8_t *src)
> >  {
> > -	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> > -	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> >  }
> >
> >  /**
> >   * Copy 64 bytes from one location to another,
> >   * locations should not overlap.
> >   */
> > +__attribute__((target("default")))
> >  static inline void
> > -rte_mov64(uint8_t *dst, const uint8_t *src)
> > +rte_mov64_DEFAULT(uint8_t *dst, const uint8_t *src)
> >  {
> > -	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> > -	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> > -	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> > -	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> >  }
> >
> >  /**
> >   * Copy 128 bytes from one location to another,
> >   * locations should not overlap.
> >   */
> > +__attribute__((target("default")))
> >  static inline void
> > -rte_mov128(uint8_t *dst, const uint8_t *src)
> > +rte_mov128_DEFAULT(uint8_t *dst, const uint8_t *src)
> >  {
> > -	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> > -	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> > -	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> > -	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> > -	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
> > -	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
> > -	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
> > -	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
> >  }
> >
> >  /**
> >   * Copy 256 bytes from one location to another,
> >   * locations should not overlap.
> >   */
> > +__attribute__((target("default")))
> >  static inline void
> > -rte_mov256(uint8_t *dst, const uint8_t *src)
> > +rte_mov256_DEFAULT(uint8_t *dst, const uint8_t *src)
> >  {
> > -	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> > -	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> > -	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> > -	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> > -	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
> > -	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
> > -	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
> > -	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
> > -	rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
> > -	rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
> > -	rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
> > -	rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
> > -	rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
> > -	rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
> > -	rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
> > -	rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
> >  }
> >
> >  /**
> > @@ -683,8 +728,9 @@ __extension__ ({                                                      \
> >      }                                                                 \
> >  })
> >
> > +__attribute__((target("default")))
> >  static inline void *
> > -rte_memcpy_generic(void *dst, const void *src, size_t n)
> > +rte_memcpy_generic_DEFAULT(void *dst, const void *src, size_t n)
> >  {
> >  	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
> >  	uintptr_t dstu = (uintptr_t)dst;
> > @@ -722,19 +768,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> >  	 * Fast way when copy size doesn't exceed 512 bytes
> >  	 */
> >  	if (n <= 32) {
> > -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > -		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> > +		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
> > +		(*rte_mov16)((uint8_t *)dst - 16 + n,
> > +				(const uint8_t *)src - 16 + n);
> >  		return ret;
> >  	}
> >  	if (n <= 48) {
> > -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> > +		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
> > +		(*rte_mov16)((uint8_t *)dst - 16 + n,
> > +				(const uint8_t *)src - 16 + n);
> >  		return ret;
> >  	}
> >  	if (n <= 64) {
> > -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
> > -		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> > +		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
> > +		(*rte_mov16)((uint8_t *)dst + 32, (const uint8_t *)src + 32);
> > +		(*rte_mov16)((uint8_t *)dst - 16 + n,
> > +				(const uint8_t *)src - 16 + n);
> >  		return ret;
> >  	}
> >  	if (n <= 128) {
> > @@ -743,39 +792,42 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> >  	if (n <= 512) {
> >  		if (n >= 256) {
> >  			n -= 256;
> > -			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
> > -			rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
> > +			(*rte_mov128)((uint8_t *)dst, (const uint8_t *)src);
> > +			(*rte_mov128)((uint8_t *)dst + 128,
> > +					(const uint8_t *)src + 128);
> >  			src = (const uint8_t *)src + 256;
> >  			dst = (uint8_t *)dst + 256;
> >  		}
> >  COPY_BLOCK_255_BACK15:
> >  		if (n >= 128) {
> >  			n -= 128;
> > -			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
> > +			(*rte_mov128)((uint8_t *)dst, (const uint8_t *)src);
> >  			src = (const uint8_t *)src + 128;
> >  			dst = (uint8_t *)dst + 128;
> >  		}
> >  COPY_BLOCK_128_BACK15:
> >  		if (n >= 64) {
> >  			n -= 64;
> > -			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> > +			(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
> >  			src = (const uint8_t *)src + 64;
> >  			dst = (uint8_t *)dst + 64;
> >  		}
> >  COPY_BLOCK_64_BACK15:
> >  		if (n >= 32) {
> >  			n -= 32;
> > -			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > +			(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
> >  			src = (const uint8_t *)src + 32;
> >  			dst = (uint8_t *)dst + 32;
> >  		}
> >  		if (n > 16) {
> > -			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > -			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> > +			(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
> > +			(*rte_mov16)((uint8_t *)dst - 16 + n,
> > +					(const uint8_t *)src - 16 + n);
> >  			return ret;
> >  		}
> >  		if (n > 0) {
> > -			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> > +			(*rte_mov16)((uint8_t *)dst - 16 + n,
> > +					(const uint8_t *)src - 16 + n);
> >  		}
> >  		return ret;
> >  	}
> > @@ -790,7 +842,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> >  	if (dstofss > 0) {
> >  		dstofss = 16 - dstofss + 16;
> >  		n -= dstofss;
> > -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > +		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
> >  		src = (const uint8_t *)src + dstofss;
> >  		dst = (uint8_t *)dst + dstofss;
> >  	}
> > @@ -804,7 +856,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> >  		 * Copy 256-byte blocks
> >  		 */
> >  		for (; n >= 256; n -= 256) {
> > -			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
> > +			(*rte_mov256)((uint8_t *)dst, (const uint8_t *)src);
> >  			dst = (uint8_t *)dst + 256;
> >  			src = (const uint8_t *)src + 256;
> >  		}
> > @@ -826,7 +878,46 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> >  	goto COPY_BLOCK_64_BACK15;
> >  }
> >
> > -#endif /* RTE_MACHINE_CPUFLAG */
> > +static void __attribute__((constructor))
> > +rte_memcpy_init(void)
> > +{
> > +#ifdef CC_SUPPORT_AVX512
> > +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F)) {
> > +		alignment_mask = 0x3F;
> > +		rte_mov16 = rte_mov16_AVX512F;
> > +		rte_mov32 = rte_mov32_AVX512F;
> > +		rte_mov64 = rte_mov64_AVX512F;
> > +		rte_mov128 = rte_mov128_AVX512F;
> > +		rte_mov256 = rte_mov256_AVX512F;
> > +		rte_mov128blocks = rte_mov128blocks_AVX512F;
> > +		rte_mov512blocks = rte_mov512blocks_AVX512F;
> > +		rte_memcpy_generic = rte_memcpy_generic_AVX512F;
> > +		RTE_LOG(INFO, EAL, "AVX512 implementation of memcpy() is using!\n");
> > +	} else
> > +#endif
> > +#ifdef CC_SUPPORT_AVX2
> > +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2)) {
> > +		alignment_mask = 0x1F;
> > +		rte_mov16 = rte_mov16_AVX2;
> > +		rte_mov32 = rte_mov32_AVX2;
> > +		rte_mov64 = rte_mov64_AVX2;
> > +		rte_mov128 = rte_mov128_AVX2;
> > +		rte_mov128blocks = rte_mov128blocks_AVX2;
> > +		rte_memcpy_generic = rte_memcpy_generic_AVX2;
> > +		RTE_LOG(INFO, EAL, "AVX2 implementation of memcpy() is using!\n");
> > +	} else
> > +#endif
> > +	{
> > +		alignment_mask = 0x0F;
> > +		rte_mov16 = rte_mov16_DEFAULT;
> > +		rte_mov32 = rte_mov32_DEFAULT;
> > +		rte_mov64 = rte_mov64_DEFAULT;
> > +		rte_mov128 = rte_mov128_DEFAULT;
> > +		rte_mov256 = rte_mov256_DEFAULT;
> > +		rte_memcpy_generic = rte_memcpy_generic_DEFAULT;
> > +		RTE_LOG(INFO, EAL, "Default SSE/AVX implementation of memcpy() is using!\n");
> > +	}
> > +}
> >
> >  static inline void *
> >  rte_memcpy_aligned(void *dst, const void *src, size_t n)
> > @@ -858,8 +949,8 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
> >
> >  	/* Copy 16 <= size <= 32 bytes */
> >  	if (n <= 32) {
> > -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > -		rte_mov16((uint8_t *)dst - 16 + n,
> > +		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
> > +		(*rte_mov16)((uint8_t *)dst - 16 + n,
> >  				(const uint8_t *)src - 16 + n);
> >
> >  		return ret;
> > @@ -867,8 +958,8 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
> >
> >  	/* Copy 32 < size <= 64 bytes */
> >  	if (n <= 64) {
> > -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -		rte_mov32((uint8_t *)dst - 32 + n,
> > +		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
> > +		(*rte_mov32)((uint8_t *)dst - 32 + n,
> >  				(const uint8_t *)src - 32 + n);
> >
> >  		return ret;
> > @@ -876,13 +967,13 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
> >
> >  	/* Copy 64 bytes blocks */
> >  	for (; n >= 64; n -= 64) {
> > -		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> > +		(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
> >  		dst = (uint8_t *)dst + 64;
> >  		src = (const uint8_t *)src + 64;
> >  	}
> >
> >  	/* Copy whatever left */
> > -	rte_mov64((uint8_t *)dst - 64 + n,
> > +	(*rte_mov64)((uint8_t *)dst - 64 + n,
> >  			(const uint8_t *)src - 64 + n);
> >
> >  	return ret;
> > @@ -891,10 +982,10 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
> >  static inline void *
> >  rte_memcpy(void *dst, const void *src, size_t n)
> >  {
> > -	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
> > +	if (!(((uintptr_t)dst | (uintptr_t)src) & alignment_mask))
> >  		return rte_memcpy_aligned(dst, src, n);
> >  	else
> > -		return rte_memcpy_generic(dst, src, n);
> > +		return (*rte_memcpy_generic)(dst, src, n);
> >  }
> >
> >  #ifdef __cplusplus
> > diff --git a/mk/rte.cpuflags.mk b/mk/rte.cpuflags.mk
> > index a813c91..92399ec 100644
> > --- a/mk/rte.cpuflags.mk
> > +++ b/mk/rte.cpuflags.mk
> > @@ -141,3 +141,17 @@ space:= $(empty) $(empty)
> >  CPUFLAGSTMP1 := $(addprefix RTE_CPUFLAG_,$(CPUFLAGS))
> >  CPUFLAGSTMP2 := $(subst $(space),$(comma),$(CPUFLAGSTMP1))
> >  CPUFLAGS_LIST := -DRTE_COMPILE_TIME_CPUFLAGS=$(CPUFLAGSTMP2)
> > +
> > +# Check if the compiler supports AVX512.
> > +CC_SUPPORT_AVX512 := $(shell $(CC) -march=skylake-avx512 -dM -E - < /dev/null 2>&1 | grep -q AVX512 && echo 1)
> > +ifeq ($(CC_SUPPORT_AVX512),1)
> > +ifeq ($(CONFIG_RTE_ENABLE_AVX512),y)
> > +MACHINE_CFLAGS += -DCC_SUPPORT_AVX512
> > +endif
> > +endif
> > +
> > +# Check if the compiler supports AVX2.
> > +CC_SUPPORT_AVX2 := $(shell $(CC) -march=core-avx2 -dM -E - < /dev/null 2>&1 | grep -q AVX2 && echo 1)
> > +ifeq ($(CC_SUPPORT_AVX2),1)
> > +MACHINE_CFLAGS += -DCC_SUPPORT_AVX2
> > +endif
> > --
> > 2.7.4

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2 1/3] eal/x86: run-time dispatch over memcpy
  2017-09-01  8:57   ` [PATCH v2 1/3] eal/x86: run-time dispatch over memcpy Xiaoyun Li
  2017-09-01  9:16     ` Ananyev, Konstantin
@ 2017-09-01 15:34     ` Stephen Hemminger
  1 sibling, 0 replies; 22+ messages in thread
From: Stephen Hemminger @ 2017-09-01 15:34 UTC (permalink / raw)
  To: Xiaoyun Li; +Cc: bruce.richardson, dev, zhihong.wang, qi.z.zhang, wenzhuo.lu

On Fri,  1 Sep 2017 16:57:00 +0800
Xiaoyun Li <xiaoyun.li@intel.com> wrote:

> This patch dynamically selects functions of memcpy at run-time based
> on CPU flags that current machine supports. This patch uses function
> pointers which are bind to the relative functions at constrctor time.
> In addition, AVX512 instructions set would be compiled only if users
> config it enabled and the compiler supports it.
> 
> Signed-off-by: Xiaoyun Li <xiaoyun.li@intel.com>

Maybe a less intrusive way would be to build multiple shared libraries,
one for each CPU type, then do dynamic binding at startup.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2 1/3] eal/x86: run-time dispatch over memcpy
  2017-09-01 10:38         ` Ananyev, Konstantin
@ 2017-09-04  1:41           ` Li, Xiaoyun
       [not found]             ` <B9E724F4CB7543449049E7AE7669D82F44216E@SHSMSX101.ccr.corp.intel.com>
  0 siblings, 1 reply; 22+ messages in thread
From: Li, Xiaoyun @ 2017-09-04  1:41 UTC (permalink / raw)
  To: Ananyev, Konstantin, Richardson, Bruce
  Cc: dev, Wang, Zhihong, Zhang, Qi Z, Lu, Wenzhuo

I didn't consider that the original code is inline, would be replaced at compilation time and have no function calls.
Anyway, I'll do a perf test this week. And if it drops a lot, I will change to code like rte_memcpy_AVX512 (fewer function calls) and test perf again.


Best Regards,
Xiaoyun Li




-----Original Message-----
From: Ananyev, Konstantin 
Sent: Friday, September 1, 2017 18:39
To: Li, Xiaoyun <xiaoyun.li@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>
Cc: dev@dpdk.org; Wang, Zhihong <zhihong.wang@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Lu, Wenzhuo <wenzhuo.lu@intel.com>
Subject: RE: [dpdk-dev] [PATCH v2 1/3] eal/x86: run-time dispatch over memcpy



> -----Original Message-----
> From: Li, Xiaoyun
> Sent: Friday, September 1, 2017 10:29 AM
> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; Richardson, 
> Bruce <bruce.richardson@intel.com>
> Cc: dev@dpdk.org; Wang, Zhihong <zhihong.wang@intel.com>; Zhang, Qi Z 
> <qi.z.zhang@intel.com>; Lu, Wenzhuo <wenzhuo.lu@intel.com>
> Subject: RE: [dpdk-dev] [PATCH v2 1/3] eal/x86: run-time dispatch over 
> memcpy
> 
> Hi
> I send an email just now.
> The original codes have so many functions because maybe other places (right now nowhere else) would need functions like rte_mov16.

Which place exactly?
As I understand right now no one using it, and they supposed to be sort of private ones, correct?
So why do you want to make them public?

> Then they can directly use it. So I am not sure if I need modify it.
> 
> The performance haven't been tested. But it should be the same as before.

Hmm, what makes you think that?
Let say to move 64B you are calling rte_mov16() 4 times.
Obviously each function call has an overhead - prepare parameter registers, save/restore scratch registers, actual call/jmp instruction, ret, etc.
All that costs cycles.
Did you do any performance measurements here?
Konstantin 

> Before, if cpu and compiler supports AVX512 and users enable it, the AVX512 codes would be compiled and runed.
> I just delete the macro and make it decide at run-time (constructor time) not build time.
> 
> 
> Best Regards,
> Xiaoyun Li
> 
> 
> 
> 
> -----Original Message-----
> From: Ananyev, Konstantin
> Sent: Friday, September 1, 2017 17:17
> To: Li, Xiaoyun <xiaoyun.li@intel.com>; Richardson, Bruce 
> <bruce.richardson@intel.com>
> Cc: dev@dpdk.org; Wang, Zhihong <zhihong.wang@intel.com>; Zhang, Qi Z 
> <qi.z.zhang@intel.com>; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Li, 
> Xiaoyun <xiaoyun.li@intel.com>
> Subject: RE: [dpdk-dev] [PATCH v2 1/3] eal/x86: run-time dispatch over 
> memcpy
> 
> 
> 
> > -----Original Message-----
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Xiaoyun Li
> > Sent: Friday, September 1, 2017 9:57 AM
> > To: Richardson, Bruce <bruce.richardson@intel.com>
> > Cc: dev@dpdk.org; Wang, Zhihong <zhihong.wang@intel.com>; Zhang, Qi 
> > Z <qi.z.zhang@intel.com>; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Li, 
> > Xiaoyun <xiaoyun.li@intel.com>
> > Subject: [dpdk-dev] [PATCH v2 1/3] eal/x86: run-time dispatch over 
> > memcpy
> >
> > This patch dynamically selects functions of memcpy at run-time based 
> > on CPU flags that current machine supports. This patch uses function 
> > pointers which are bind to the relative functions at constrctor time.
> > In addition, AVX512 instructions set would be compiled only if users 
> > config it enabled and the compiler supports it.
> 
> 
> I'll ask the same question again:
> It seems quite an overhead to add extra function call for each 16(32/64)B movement.
> Wouldn't it be better to have one func_ptr per implementation, i.e:
> rte_memcpy_sse(), rte_memcpy_avx2(), rte_memcpy_avx512(), etc.?
> BTW, what is the performance diff between old and new versions?
> Konstantin
> 
> >
> > Signed-off-by: Xiaoyun Li <xiaoyun.li@intel.com>
> > ---
> > v2
> > * use gcc function multi-versioning to avoid compilation issue.
> > * add macros for AVX512 and AVX2. Only if users enable AVX512 and 
> > the compiler supports it, the AVX512 codes would be compiled. Only 
> > if the compiler supports AVX2, the AVX2 codes would be compiled.
> >
> >  .../common/include/arch/x86/rte_memcpy.h           | 343 +++++++++++++--------
> >  mk/rte.cpuflags.mk                                 |  14 +
> >  2 files changed, 231 insertions(+), 126 deletions(-)
> >
> > diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> > b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> > index 74c280c..abba6ad 100644
> > --- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> > +++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> > @@ -45,11 +45,45 @@
> >  #include <string.h>
> >  #include <rte_vect.h>
> >  #include <rte_common.h>
> > +#include <rte_cpuflags.h>
> > +#include <rte_log.h>
> >
> >  #ifdef __cplusplus
> >  extern "C" {
> >  #endif
> >
> > +/*
> > + * Select SSE/AVX memory copy method as default one.
> > + */
> > +
> > +static uint16_t alignment_mask = 0x0F;
> > +
> > +typedef void (*rte_mov16_t)(uint8_t *dst, const uint8_t *src); 
> > +typedef void (*rte_mov32_t)(uint8_t *dst, const uint8_t *src); 
> > +typedef void (*rte_mov64_t)(uint8_t *dst, const uint8_t *src); 
> > +typedef void (*rte_mov128_t)(uint8_t *dst, const uint8_t *src); 
> > +typedef void (*rte_mov256_t)(uint8_t *dst, const uint8_t *src); 
> > +#ifdef CC_SUPPORT_AVX2 typedef void (*rte_mov128blocks_t)(uint8_t 
> > +*dst, const uint8_t *src, size_t n); #endif #ifdef 
> > +CC_SUPPORT_AVX512 typedef void (*rte_mov512blocks_t)(uint8_t *dst, 
> > +const uint8_t *src, size_t n); #endif typedef void * 
> > +(*rte_memcpy_generic_t)(void *dst, const void *src, size_t n);
> > +
> > +static rte_mov16_t rte_mov16;
> > +static rte_mov32_t rte_mov32;
> > +static rte_mov64_t rte_mov64;
> > +static rte_mov128_t rte_mov128;
> > +static rte_mov256_t rte_mov256;
> > +#ifdef CC_SUPPORT_AVX2
> > +static rte_mov128blocks_t rte_mov128blocks; #endif #ifdef
> > +CC_SUPPORT_AVX512 static rte_mov512blocks_t rte_mov512blocks; 
> > +#endif static rte_memcpy_generic_t rte_memcpy_generic;
> > +
> >  /**
> >   * Copy bytes from one location to another. The locations must not overlap.
> >   *
> > @@ -68,10 +102,6 @@ extern "C" {
> >  static __rte_always_inline void *
> >  rte_memcpy(void *dst, const void *src, size_t n);
> >
> > -#ifdef RTE_MACHINE_CPUFLAG_AVX512F
> > -
> > -#define ALIGNMENT_MASK 0x3F
> > -
> >  /**
> >   * AVX512 implementation below
> >   */
> > @@ -80,8 +110,10 @@ rte_memcpy(void *dst, const void *src, size_t n);
> >   * Copy 16 bytes from one location to another,
> >   * locations should not overlap.
> >   */
> > +#ifdef CC_SUPPORT_AVX512
> > +__attribute__((target("avx512f")))
> >  static inline void
> > -rte_mov16(uint8_t *dst, const uint8_t *src)
> > +rte_mov16_AVX512F(uint8_t *dst, const uint8_t *src)
> >  {
> >  	__m128i xmm0;
> >
> > @@ -93,8 +125,9 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
> >   * Copy 32 bytes from one location to another,
> >   * locations should not overlap.
> >   */
> > +__attribute__((target("avx512f")))
> >  static inline void
> > -rte_mov32(uint8_t *dst, const uint8_t *src)
> > +rte_mov32_AVX512F(uint8_t *dst, const uint8_t *src)
> >  {
> >  	__m256i ymm0;
> >
> > @@ -106,8 +139,9 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
> >   * Copy 64 bytes from one location to another,
> >   * locations should not overlap.
> >   */
> > +__attribute__((target("avx512f")))
> >  static inline void
> > -rte_mov64(uint8_t *dst, const uint8_t *src)
> > +rte_mov64_AVX512F(uint8_t *dst, const uint8_t *src)
> >  {
> >  	__m512i zmm0;
> >
> > @@ -119,32 +153,35 @@ rte_mov64(uint8_t *dst, const uint8_t *src)
> >   * Copy 128 bytes from one location to another,
> >   * locations should not overlap.
> >   */
> > +__attribute__((target("avx512f")))
> >  static inline void
> > -rte_mov128(uint8_t *dst, const uint8_t *src)
> > +rte_mov128_AVX512F(uint8_t *dst, const uint8_t *src)
> >  {
> > -	rte_mov64(dst + 0 * 64, src + 0 * 64);
> > -	rte_mov64(dst + 1 * 64, src + 1 * 64);
> > +	(*rte_mov64)(dst + 0 * 64, src + 0 * 64);
> > +	(*rte_mov64)(dst + 1 * 64, src + 1 * 64);
> >  }
> >
> >  /**
> >   * Copy 256 bytes from one location to another,
> >   * locations should not overlap.
> >   */
> > +__attribute__((target("avx512f")))
> >  static inline void
> > -rte_mov256(uint8_t *dst, const uint8_t *src)
> > +rte_mov256_AVX512F(uint8_t *dst, const uint8_t *src)
> >  {
> > -	rte_mov64(dst + 0 * 64, src + 0 * 64);
> > -	rte_mov64(dst + 1 * 64, src + 1 * 64);
> > -	rte_mov64(dst + 2 * 64, src + 2 * 64);
> > -	rte_mov64(dst + 3 * 64, src + 3 * 64);
> > +	(*rte_mov64)(dst + 0 * 64, src + 0 * 64);
> > +	(*rte_mov64)(dst + 1 * 64, src + 1 * 64);
> > +	(*rte_mov64)(dst + 2 * 64, src + 2 * 64);
> > +	(*rte_mov64)(dst + 3 * 64, src + 3 * 64);
> >  }
> >
> >  /**
> >   * Copy 128-byte blocks from one location to another,
> >   * locations should not overlap.
> >   */
> > +__attribute__((target("avx512f")))
> >  static inline void
> > -rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
> > +rte_mov128blocks_AVX512F(uint8_t *dst, const uint8_t *src, size_t 
> > +n)
> >  {
> >  	__m512i zmm0, zmm1;
> >
> > @@ -163,8 +200,9 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
> >   * Copy 512-byte blocks from one location to another,
> >   * locations should not overlap.
> >   */
> > +__attribute__((target("avx512f")))
> >  static inline void
> > -rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
> > +rte_mov512blocks_AVX512F(uint8_t *dst, const uint8_t *src, size_t 
> > +n)
> >  {
> >  	__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
> >
> > @@ -191,8 +229,9 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
> >  	}
> >  }
> >
> > +__attribute__((target("avx512f")))
> >  static inline void *
> > -rte_memcpy_generic(void *dst, const void *src, size_t n)
> > +rte_memcpy_generic_AVX512F(void *dst, const void *src, size_t n)
> >  {
> >  	uintptr_t dstu = (uintptr_t)dst;
> >  	uintptr_t srcu = (uintptr_t)src;
> > @@ -228,39 +267,39 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> >  	 * Fast way when copy size doesn't exceed 512 bytes
> >  	 */
> >  	if (n <= 32) {
> > -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > -		rte_mov16((uint8_t *)dst - 16 + n,
> > +		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
> > +		(*rte_mov16)((uint8_t *)dst - 16 + n,
> >  				  (const uint8_t *)src - 16 + n);
> >  		return ret;
> >  	}
> >  	if (n <= 64) {
> > -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -		rte_mov32((uint8_t *)dst - 32 + n,
> > +		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
> > +		(*rte_mov32)((uint8_t *)dst - 32 + n,
> >  				  (const uint8_t *)src - 32 + n);
> >  		return ret;
> >  	}
> >  	if (n <= 512) {
> >  		if (n >= 256) {
> >  			n -= 256;
> > -			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
> > +			(*rte_mov256)((uint8_t *)dst, (const uint8_t *)src);
> >  			src = (const uint8_t *)src + 256;
> >  			dst = (uint8_t *)dst + 256;
> >  		}
> >  		if (n >= 128) {
> >  			n -= 128;
> > -			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
> > +			(*rte_mov128)((uint8_t *)dst, (const uint8_t *)src);
> >  			src = (const uint8_t *)src + 128;
> >  			dst = (uint8_t *)dst + 128;
> >  		}
> >  COPY_BLOCK_128_BACK63:
> >  		if (n > 64) {
> > -			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> > -			rte_mov64((uint8_t *)dst - 64 + n,
> > +			(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
> > +			(*rte_mov64)((uint8_t *)dst - 64 + n,
> >  					  (const uint8_t *)src - 64 + n);
> >  			return ret;
> >  		}
> >  		if (n > 0)
> > -			rte_mov64((uint8_t *)dst - 64 + n,
> > +			(*rte_mov64)((uint8_t *)dst - 64 + n,
> >  					  (const uint8_t *)src - 64 + n);
> >  		return ret;
> >  	}
> > @@ -272,7 +311,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> >  	if (dstofss > 0) {
> >  		dstofss = 64 - dstofss;
> >  		n -= dstofss;
> > -		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> > +		(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
> >  		src = (const uint8_t *)src + dstofss;
> >  		dst = (uint8_t *)dst + dstofss;
> >  	}
> > @@ -282,7 +321,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> >  	 * Use copy block function for better instruction order control,
> >  	 * which is important when load is unaligned.
> >  	 */
> > -	rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
> > +	(*rte_mov512blocks)((uint8_t *)dst, (const uint8_t *)src, n);
> >  	bits = n;
> >  	n = n & 511;
> >  	bits -= n;
> > @@ -295,7 +334,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> >  	 * which is important when load is unaligned.
> >  	 */
> >  	if (n >= 128) {
> > -		rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
> > +		(*rte_mov128blocks)((uint8_t *)dst, (const uint8_t *)src, n);
> >  		bits = n;
> >  		n = n & 127;
> >  		bits -= n;
> > @@ -308,10 +347,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> >  	 */
> >  	goto COPY_BLOCK_128_BACK63;
> >  }
> > -
> > -#elif defined RTE_MACHINE_CPUFLAG_AVX2
> > -
> > -#define ALIGNMENT_MASK 0x1F
> > +#endif
> >
> >  /**
> >   * AVX2 implementation below
> > @@ -321,8 +357,10 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> >   * Copy 16 bytes from one location to another,
> >   * locations should not overlap.
> >   */
> > +#ifdef CC_SUPPORT_AVX2
> > +__attribute__((target("avx2")))
> >  static inline void
> > -rte_mov16(uint8_t *dst, const uint8_t *src)
> > +rte_mov16_AVX2(uint8_t *dst, const uint8_t *src)
> >  {
> >  	__m128i xmm0;
> >
> > @@ -334,8 +372,9 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
> >   * Copy 32 bytes from one location to another,
> >   * locations should not overlap.
> >   */
> > +__attribute__((target("avx2")))
> >  static inline void
> > -rte_mov32(uint8_t *dst, const uint8_t *src)
> > +rte_mov32_AVX2(uint8_t *dst, const uint8_t *src)
> >  {
> >  	__m256i ymm0;
> >
> > @@ -347,32 +386,35 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
> >   * Copy 64 bytes from one location to another,
> >   * locations should not overlap.
> >   */
> > +__attribute__((target("avx2")))
> >  static inline void
> > -rte_mov64(uint8_t *dst, const uint8_t *src)
> > +rte_mov64_AVX2(uint8_t *dst, const uint8_t *src)
> >  {
> > -	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> > -	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> > +	(*rte_mov32)((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> > +	(*rte_mov32)((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 
> > +32);
> >  }
> >
> >  /**
> >   * Copy 128 bytes from one location to another,
> >   * locations should not overlap.
> >   */
> > +__attribute__((target("avx2")))
> >  static inline void
> > -rte_mov128(uint8_t *dst, const uint8_t *src)
> > +rte_mov128_AVX2(uint8_t *dst, const uint8_t *src)
> >  {
> > -	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> > -	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> > -	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
> > -	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
> > +	(*rte_mov32)((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> > +	(*rte_mov32)((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> > +	(*rte_mov32)((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
> > +	(*rte_mov32)((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 
> > +32);
> >  }
> >
> >  /**
> >   * Copy 128-byte blocks from one location to another,
> >   * locations should not overlap.
> >   */
> > +__attribute__((target("avx2")))
> >  static inline void
> > -rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
> > +rte_mov128blocks_AVX2(uint8_t *dst, const uint8_t *src, size_t n)
> >  {
> >  	__m256i ymm0, ymm1, ymm2, ymm3;
> >
> > @@ -391,8 +433,9 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
> >  	}
> >  }
> >
> > +__attribute__((target("avx2")))
> >  static inline void *
> > -rte_memcpy_generic(void *dst, const void *src, size_t n)
> > +rte_memcpy_generic_AVX2(void *dst, const void *src, size_t n)
> >  {
> >  	uintptr_t dstu = (uintptr_t)dst;
> >  	uintptr_t srcu = (uintptr_t)src;
> > @@ -429,46 +472,46 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> >  	 * Fast way when copy size doesn't exceed 256 bytes
> >  	 */
> >  	if (n <= 32) {
> > -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > -		rte_mov16((uint8_t *)dst - 16 + n,
> > +		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
> > +		(*rte_mov16)((uint8_t *)dst - 16 + n,
> >  				(const uint8_t *)src - 16 + n);
> >  		return ret;
> >  	}
> >  	if (n <= 48) {
> > -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > -		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
> > -		rte_mov16((uint8_t *)dst - 16 + n,
> > +		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
> > +		(*rte_mov16)((uint8_t *)dst + 16, (const uint8_t *)src + 16);
> > +		(*rte_mov16)((uint8_t *)dst - 16 + n,
> >  				(const uint8_t *)src - 16 + n);
> >  		return ret;
> >  	}
> >  	if (n <= 64) {
> > -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -		rte_mov32((uint8_t *)dst - 32 + n,
> > +		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
> > +		(*rte_mov32)((uint8_t *)dst - 32 + n,
> >  				(const uint8_t *)src - 32 + n);
> >  		return ret;
> >  	}
> >  	if (n <= 256) {
> >  		if (n >= 128) {
> >  			n -= 128;
> > -			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
> > +			(*rte_mov128)((uint8_t *)dst, (const uint8_t *)src);
> >  			src = (const uint8_t *)src + 128;
> >  			dst = (uint8_t *)dst + 128;
> >  		}
> >  COPY_BLOCK_128_BACK31:
> >  		if (n >= 64) {
> >  			n -= 64;
> > -			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> > +			(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
> >  			src = (const uint8_t *)src + 64;
> >  			dst = (uint8_t *)dst + 64;
> >  		}
> >  		if (n > 32) {
> > -			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -			rte_mov32((uint8_t *)dst - 32 + n,
> > +			(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
> > +			(*rte_mov32)((uint8_t *)dst - 32 + n,
> >  					(const uint8_t *)src - 32 + n);
> >  			return ret;
> >  		}
> >  		if (n > 0) {
> > -			rte_mov32((uint8_t *)dst - 32 + n,
> > +			(*rte_mov32)((uint8_t *)dst - 32 + n,
> >  					(const uint8_t *)src - 32 + n);
> >  		}
> >  		return ret;
> > @@ -481,7 +524,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> >  	if (dstofss > 0) {
> >  		dstofss = 32 - dstofss;
> >  		n -= dstofss;
> > -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > +		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
> >  		src = (const uint8_t *)src + dstofss;
> >  		dst = (uint8_t *)dst + dstofss;
> >  	}
> > @@ -489,7 +532,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> >  	/**
> >  	 * Copy 128-byte blocks
> >  	 */
> > -	rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
> > +	(*rte_mov128blocks)((uint8_t *)dst, (const uint8_t *)src, n);
> >  	bits = n;
> >  	n = n & 127;
> >  	bits -= n;
> > @@ -501,10 +544,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> >  	 */
> >  	goto COPY_BLOCK_128_BACK31;
> >  }
> > -
> > -#else /* RTE_MACHINE_CPUFLAG */
> > -
> > -#define ALIGNMENT_MASK 0x0F
> > +#endif
> >
> >  /**
> >   * SSE & AVX implementation below
> > @@ -514,8 +554,9 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> >   * Copy 16 bytes from one location to another,
> >   * locations should not overlap.
> >   */
> > +__attribute__((target("default")))
> >  static inline void
> > -rte_mov16(uint8_t *dst, const uint8_t *src)
> > +rte_mov16_DEFAULT(uint8_t *dst, const uint8_t *src)
> >  {
> >  	__m128i xmm0;
> >
> > @@ -527,66 +568,70 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
> >   * Copy 32 bytes from one location to another,
> >   * locations should not overlap.
> >   */
> > +__attribute__((target("default")))
> >  static inline void
> > -rte_mov32(uint8_t *dst, const uint8_t *src)
> > +rte_mov32_DEFAULT(uint8_t *dst, const uint8_t *src)
> >  {
> > -	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> > -	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 
> > +16);
> >  }
> >
> >  /**
> >   * Copy 64 bytes from one location to another,
> >   * locations should not overlap.
> >   */
> > +__attribute__((target("default")))
> >  static inline void
> > -rte_mov64(uint8_t *dst, const uint8_t *src)
> > +rte_mov64_DEFAULT(uint8_t *dst, const uint8_t *src)
> >  {
> > -	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> > -	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> > -	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> > -	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 
> > +16);
> >  }
> >
> >  /**
> >   * Copy 128 bytes from one location to another,
> >   * locations should not overlap.
> >   */
> > +__attribute__((target("default")))
> >  static inline void
> > -rte_mov128(uint8_t *dst, const uint8_t *src)
> > +rte_mov128_DEFAULT(uint8_t *dst, const uint8_t *src)
> >  {
> > -	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> > -	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> > -	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> > -	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> > -	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
> > -	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
> > -	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
> > -	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 
> > +16);
> >  }
> >
> >  /**
> >   * Copy 256 bytes from one location to another,
> >   * locations should not overlap.
> >   */
> > +__attribute__((target("default")))
> >  static inline void
> > -rte_mov256(uint8_t *dst, const uint8_t *src)
> > +rte_mov256_DEFAULT(uint8_t *dst, const uint8_t *src)
> >  {
> > -	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> > -	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> > -	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> > -	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> > -	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
> > -	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
> > -	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
> > -	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
> > -	rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
> > -	rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
> > -	rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
> > -	rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
> > -	rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
> > -	rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
> > -	rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
> > -	rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
> > +	(*rte_mov16)((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 
> > +16);
> >  }
> >
> >  /**
> > @@ -683,8 +728,9 @@ __extension__ ({                                                      \
> >      }                                                                 \
> >  })
> >
> > +__attribute__((target("default")))
> >  static inline void *
> > -rte_memcpy_generic(void *dst, const void *src, size_t n)
> > +rte_memcpy_generic_DEFAULT(void *dst, const void *src, size_t n)
> >  {
> >  	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
> >  	uintptr_t dstu = (uintptr_t)dst;
> > @@ -722,19 +768,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> >  	 * Fast way when copy size doesn't exceed 512 bytes
> >  	 */
> >  	if (n <= 32) {
> > -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > -		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> > +		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
> > +		(*rte_mov16)((uint8_t *)dst - 16 + n,
> > +				(const uint8_t *)src - 16 + n);
> >  		return ret;
> >  	}
> >  	if (n <= 48) {
> > -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> > +		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
> > +		(*rte_mov16)((uint8_t *)dst - 16 + n,
> > +				(const uint8_t *)src - 16 + n);
> >  		return ret;
> >  	}
> >  	if (n <= 64) {
> > -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
> > -		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> > +		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
> > +		(*rte_mov16)((uint8_t *)dst + 32, (const uint8_t *)src + 32);
> > +		(*rte_mov16)((uint8_t *)dst - 16 + n,
> > +				(const uint8_t *)src - 16 + n);
> >  		return ret;
> >  	}
> >  	if (n <= 128) {
> > @@ -743,39 +792,42 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> >  	if (n <= 512) {
> >  		if (n >= 256) {
> >  			n -= 256;
> > -			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
> > -			rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
> > +			(*rte_mov128)((uint8_t *)dst, (const uint8_t *)src);
> > +			(*rte_mov128)((uint8_t *)dst + 128,
> > +					(const uint8_t *)src + 128);
> >  			src = (const uint8_t *)src + 256;
> >  			dst = (uint8_t *)dst + 256;
> >  		}
> >  COPY_BLOCK_255_BACK15:
> >  		if (n >= 128) {
> >  			n -= 128;
> > -			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
> > +			(*rte_mov128)((uint8_t *)dst, (const uint8_t *)src);
> >  			src = (const uint8_t *)src + 128;
> >  			dst = (uint8_t *)dst + 128;
> >  		}
> >  COPY_BLOCK_128_BACK15:
> >  		if (n >= 64) {
> >  			n -= 64;
> > -			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> > +			(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
> >  			src = (const uint8_t *)src + 64;
> >  			dst = (uint8_t *)dst + 64;
> >  		}
> >  COPY_BLOCK_64_BACK15:
> >  		if (n >= 32) {
> >  			n -= 32;
> > -			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > +			(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
> >  			src = (const uint8_t *)src + 32;
> >  			dst = (uint8_t *)dst + 32;
> >  		}
> >  		if (n > 16) {
> > -			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > -			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> > +			(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
> > +			(*rte_mov16)((uint8_t *)dst - 16 + n,
> > +					(const uint8_t *)src - 16 + n);
> >  			return ret;
> >  		}
> >  		if (n > 0) {
> > -			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> > +			(*rte_mov16)((uint8_t *)dst - 16 + n,
> > +					(const uint8_t *)src - 16 + n);
> >  		}
> >  		return ret;
> >  	}
> > @@ -790,7 +842,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> >  	if (dstofss > 0) {
> >  		dstofss = 16 - dstofss + 16;
> >  		n -= dstofss;
> > -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > +		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
> >  		src = (const uint8_t *)src + dstofss;
> >  		dst = (uint8_t *)dst + dstofss;
> >  	}
> > @@ -804,7 +856,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> >  		 * Copy 256-byte blocks
> >  		 */
> >  		for (; n >= 256; n -= 256) {
> > -			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
> > +			(*rte_mov256)((uint8_t *)dst, (const uint8_t *)src);
> >  			dst = (uint8_t *)dst + 256;
> >  			src = (const uint8_t *)src + 256;
> >  		}
> > @@ -826,7 +878,46 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> >  	goto COPY_BLOCK_64_BACK15;
> >  }
> >
> > -#endif /* RTE_MACHINE_CPUFLAG */
> > +static void __attribute__((constructor))
> > +rte_memcpy_init(void)
> > +{
> > +#ifdef CC_SUPPORT_AVX512
> > +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F)) {
> > +		alignment_mask = 0x3F;
> > +		rte_mov16 = rte_mov16_AVX512F;
> > +		rte_mov32 = rte_mov32_AVX512F;
> > +		rte_mov64 = rte_mov64_AVX512F;
> > +		rte_mov128 = rte_mov128_AVX512F;
> > +		rte_mov256 = rte_mov256_AVX512F;
> > +		rte_mov128blocks = rte_mov128blocks_AVX512F;
> > +		rte_mov512blocks = rte_mov512blocks_AVX512F;
> > +		rte_memcpy_generic = rte_memcpy_generic_AVX512F;
> > +		RTE_LOG(INFO, EAL, "AVX512 implementation of memcpy() is using!\n");
> > +	} else
> > +#endif
> > +#ifdef CC_SUPPORT_AVX2
> > +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2)) {
> > +		alignment_mask = 0x1F;
> > +		rte_mov16 = rte_mov16_AVX2;
> > +		rte_mov32 = rte_mov32_AVX2;
> > +		rte_mov64 = rte_mov64_AVX2;
> > +		rte_mov128 = rte_mov128_AVX2;
> > +		rte_mov128blocks = rte_mov128blocks_AVX2;
> > +		rte_memcpy_generic = rte_memcpy_generic_AVX2;
> > +		RTE_LOG(INFO, EAL, "AVX2 implementation of memcpy() is using!\n");
> > +	} else
> > +#endif
> > +	{
> > +		alignment_mask = 0x0F;
> > +		rte_mov16 = rte_mov16_DEFAULT;
> > +		rte_mov32 = rte_mov32_DEFAULT;
> > +		rte_mov64 = rte_mov64_DEFAULT;
> > +		rte_mov128 = rte_mov128_DEFAULT;
> > +		rte_mov256 = rte_mov256_DEFAULT;
> > +		rte_memcpy_generic = rte_memcpy_generic_DEFAULT;
> > +		RTE_LOG(INFO, EAL, "Default SSE/AVX implementation of memcpy() is using!\n");
> > +	}
> > +}
> >
> >  static inline void *
> >  rte_memcpy_aligned(void *dst, const void *src, size_t n) @@ -858,8 
> > +949,8 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
> >
> >  	/* Copy 16 <= size <= 32 bytes */
> >  	if (n <= 32) {
> > -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> > -		rte_mov16((uint8_t *)dst - 16 + n,
> > +		(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);
> > +		(*rte_mov16)((uint8_t *)dst - 16 + n,
> >  				(const uint8_t *)src - 16 + n);
> >
> >  		return ret;
> > @@ -867,8 +958,8 @@ rte_memcpy_aligned(void *dst, const void *src, 
> > size_t n)
> >
> >  	/* Copy 32 < size <= 64 bytes */
> >  	if (n <= 64) {
> > -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > -		rte_mov32((uint8_t *)dst - 32 + n,
> > +		(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);
> > +		(*rte_mov32)((uint8_t *)dst - 32 + n,
> >  				(const uint8_t *)src - 32 + n);
> >
> >  		return ret;
> > @@ -876,13 +967,13 @@ rte_memcpy_aligned(void *dst, const void *src, 
> > size_t n)
> >
> >  	/* Copy 64 bytes blocks */
> >  	for (; n >= 64; n -= 64) {
> > -		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> > +		(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);
> >  		dst = (uint8_t *)dst + 64;
> >  		src = (const uint8_t *)src + 64;
> >  	}
> >
> >  	/* Copy whatever left */
> > -	rte_mov64((uint8_t *)dst - 64 + n,
> > +	(*rte_mov64)((uint8_t *)dst - 64 + n,
> >  			(const uint8_t *)src - 64 + n);
> >
> >  	return ret;
> > @@ -891,10 +982,10 @@ rte_memcpy_aligned(void *dst, const void *src, 
> > size_t n)  static inline void *  rte_memcpy(void *dst, const void 
> > *src, size_t n)  {
> > -	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
> > +	if (!(((uintptr_t)dst | (uintptr_t)src) & alignment_mask))
> >  		return rte_memcpy_aligned(dst, src, n);
> >  	else
> > -		return rte_memcpy_generic(dst, src, n);
> > +		return (*rte_memcpy_generic)(dst, src, n);
> >  }
> >
> >  #ifdef __cplusplus
> > diff --git a/mk/rte.cpuflags.mk b/mk/rte.cpuflags.mk index 
> > a813c91..92399ec 100644
> > --- a/mk/rte.cpuflags.mk
> > +++ b/mk/rte.cpuflags.mk
> > @@ -141,3 +141,17 @@ space:= $(empty) $(empty)
> >  CPUFLAGSTMP1 := $(addprefix RTE_CPUFLAG_,$(CPUFLAGS))
> >  CPUFLAGSTMP2 := $(subst $(space),$(comma),$(CPUFLAGSTMP1))
> >  CPUFLAGS_LIST := -DRTE_COMPILE_TIME_CPUFLAGS=$(CPUFLAGSTMP2)
> > +
> > +# Check if the compiler supports AVX512.
> > +CC_SUPPORT_AVX512 := $(shell $(CC) -march=skylake-avx512 -dM -E - < 
> > +/dev/null 2>&1 | grep -q AVX512 && echo 1) ifeq 
> > +($(CC_SUPPORT_AVX512),1) ifeq ($(CONFIG_RTE_ENABLE_AVX512),y) 
> > +MACHINE_CFLAGS += -DCC_SUPPORT_AVX512 endif endif
> > +
> > +# Check if the compiler supports AVX2.
> > +CC_SUPPORT_AVX2 := $(shell $(CC) -march=core-avx2 -dM -E - < 
> > +/dev/null 2>&1 | grep -q AVX2 && echo 1) ifeq 
> > +($(CC_SUPPORT_AVX2),1) MACHINE_CFLAGS += -DCC_SUPPORT_AVX2 endif
> > --
> > 2.7.4

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2 1/3] eal/x86: run-time dispatch over memcpy
       [not found]               ` <B9E724F4CB7543449049E7AE7669D82F442FE6@SHSMSX101.ccr.corp.intel.com>
@ 2017-09-12  2:27                 ` Li, Xiaoyun
  2017-09-20  6:57                   ` Li, Xiaoyun
  0 siblings, 1 reply; 22+ messages in thread
From: Li, Xiaoyun @ 2017-09-12  2:27 UTC (permalink / raw)
  To: Wang, Liang-min, Richardson, Bruce, Ananyev, Konstantin
  Cc: Zhang, Qi Z, Lu, Wenzhuo, Zhang, Helin, pierre, dev

Hi ALL

After investigating, most DPDK codes are already run-time dispatching. Only rte_memcpy chooses the ISA at build-time.

To modify memcpy, there are two ways. The first one is function pointers and another is function multi-versioning in GCC.

But memcpy has been greatly optimized and benefits from being fully inlined. If it is changed to run-time dispatching via function pointers, the perf will drop a lot, especially when the copy size is small.

And function multi-versioning in GCC only works for C++. Although it is said that GCC 6 can support C, in my trial it did not.



The attachment is the perf results of memcpy with and without my patch and original DPDK codes but without inline.

It's just for comparison, so right now, I only tested on Broadwell, using AVX2.

The results are from running test/test/test_memcpy_perf.c.

(C = compile-time constant)

/* Do aligned tests where size is a variable */

/* Do aligned tests where size is a compile-time constant */

/* Do unaligned tests where size is a variable */

/* Do unaligned tests where size is a compile-time constant */



4-7 means dpdk costs time 4 and glibc costs time 7

For size smaller than 128 bytes. This patch's perf is bad and even worse than glibc.

When size grows, the perf is better than glibc but worse than original dpdk.

And when grows above about 1024 bytes, it performs similarly to original dpdk.

Furthermore, if delete inline in original dpdk, the perf are similar to the perf with patch.

Different situations(4 types, such as cache to cache) perform differently but the trend is the same (size grows, perf grows).



So if dynamic dispatch is needed, some perf must be sacrificed and the code must be compiled for the minimum target (e.g. compile for target AVX, run on AVX, AVX2, AVX512F).



Thus, I think this feature shouldn't be delivered in this release.



Best Regards,

Xiaoyun Li

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH v2 1/3] eal/x86: run-time dispatch over memcpy
  2017-09-12  2:27                 ` Li, Xiaoyun
@ 2017-09-20  6:57                   ` Li, Xiaoyun
  0 siblings, 0 replies; 22+ messages in thread
From: Li, Xiaoyun @ 2017-09-20  6:57 UTC (permalink / raw)
  To: Wang, Liang-min, Richardson, Bruce, Ananyev, Konstantin
  Cc: Zhang, Qi Z, Lu, Wenzhuo, Zhang, Helin, pierre, dev

Hi all
After further investigating, we have found some benefits with the patchset.
So the plan is to add a config parameter CONFIG_RTE_ENABLE_RUNTIME_DISPATCH.
By default, the value is "n" and would use current memcpy codes.
Only if users configure it to "y" will it use the run-time dispatch code (without inlining).


Best Regards,
Xiaoyun Li




> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Li, Xiaoyun
> Sent: Tuesday, September 12, 2017 10:27
> To: Wang, Liang-min <liang-min.wang@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; Ananyev, Konstantin
> <konstantin.ananyev@intel.com>
> Cc: Zhang, Qi Z <qi.z.zhang@intel.com>; Lu, Wenzhuo
> <wenzhuo.lu@intel.com>; Zhang, Helin <helin.zhang@intel.com>;
> pierre@emutex.com; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v2 1/3] eal/x86: run-time dispatch over
> memcpy
> 
> Hi ALL
> 
> After investigating, most DPDK codes are already run-time dispatching. Only
> rte_memcpy chooses the ISA at build-time.
> 
> To modify memcpy, there are two ways. The first one is function pointers
> and another is function multi-versioning in GCC.
> 
> But memcpy has been greatly optimized and gets benefit from total inline. If
> changing it to run-time dispatching via function pointers, the perf will drop a
> lot especially when copy size is small.
> 
> And function multi-versioning in GCC only works for C++. Even if it is said that
> GCC6 can support C, but in fact it does not support C in my trial.
> 
> 
> 
> The attachment is the perf results of memcpy with and without my patch and
> original DPDK codes but without inline.
> 
> It's just for comparison, so right now, I only tested on Broadwell, using AVX2.
> 
> The results are from running test/test/test_memcpy_perf.c.
> 
> (C = compile-time constant)
> 
> /* Do aligned tests where size is a variable */
> 
> /* Do aligned tests where size is a compile-time constant */
> 
> /* Do unaligned tests where size is a variable */
> 
> /* Do unaligned tests where size is a compile-time constant */
> 
> 
> 
> 4-7 means dpdk costs time 4 and glibc costs time 7
> 
> For size smaller than 128 bytes. This patch's perf is bad and even worse than
> glibc.
> 
> When size grows, the perf is better than glibc but worse than original dpdk.
> 
> And when grows above about 1024 bytes, it performs similarly to original
> dpdk.
> 
> Furthermore, if delete inline in original dpdk, the perf are similar to the perf
> with patch.
> 
> Different situations(4 types, such as cache to cache) perform differently but
> the trend is the same (size grows, perf grows).
> 
> 
> 
> So if needs dynamic, needs sacrifices some perf and needs to compile for the
> minimum target (e.g. compile for target avx, run on avx, avx2, avx512f).
> 
> 
> 
> Thus, I think this feature shouldn't be delivered in this release.
> 
> 
> 
> Best Regards,
> 
> Xiaoyun Li

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2017-09-20  6:57 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-08-25  2:06 [PATCH 0/3] dynamic linking support Xiaoyun Li
2017-08-25  2:06 ` [PATCH 1/3] eal/x86: run-time dispatch over memcpy Xiaoyun Li
2017-08-30 14:56   ` Ananyev, Konstantin
2017-08-30 17:51     ` Bruce Richardson
2017-08-31  1:21       ` Lu, Wenzhuo
2017-08-30 18:00   ` Stephen Hemminger
2017-08-31  1:23     ` Lu, Wenzhuo
2017-08-31  5:05       ` Stephen Hemminger
2017-08-31  5:24         ` Li, Xiaoyun
2017-08-25  2:06 ` [PATCH 2/3] app/test: run-time dispatch over memcpy perf test Xiaoyun Li
2017-08-25  2:06 ` [PATCH 3/3] efd: run-time dispatch over x86 EFD functions Xiaoyun Li
2017-09-01  8:56 ` [PATCH v2 0/3] dynamic linking support Xiaoyun Li
2017-09-01  8:57   ` [PATCH v2 1/3] eal/x86: run-time dispatch over memcpy Xiaoyun Li
2017-09-01  9:16     ` Ananyev, Konstantin
2017-09-01  9:28       ` Li, Xiaoyun
2017-09-01 10:38         ` Ananyev, Konstantin
2017-09-04  1:41           ` Li, Xiaoyun
     [not found]             ` <B9E724F4CB7543449049E7AE7669D82F44216E@SHSMSX101.ccr.corp.intel.com>
     [not found]               ` <B9E724F4CB7543449049E7AE7669D82F442FE6@SHSMSX101.ccr.corp.intel.com>
2017-09-12  2:27                 ` Li, Xiaoyun
2017-09-20  6:57                   ` Li, Xiaoyun
2017-09-01 15:34     ` Stephen Hemminger
2017-09-01  8:57   ` [PATCH v2 2/3] app/test: run-time dispatch over memcpy perf test Xiaoyun Li
2017-09-01  8:57   ` [PATCH v2 3/3] efd: run-time dispatch over x86 EFD functions Xiaoyun Li

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.