All of lore.kernel.org
 help / color / mirror / Atom feed
* [igt-dev] [PATCH igt] lib: Provide an accelerated routine for readback from WC
@ 2018-02-27 21:50 Chris Wilson
  2018-02-27 21:53 ` Chris Wilson
                   ` (9 more replies)
  0 siblings, 10 replies; 13+ messages in thread
From: Chris Wilson @ 2018-02-27 21:50 UTC (permalink / raw)
  To: igt-dev

Reading from WC is awfully slow as each access is uncached and so
performed synchronously, stalling for the memory load. x86 did introduce
some new instructions in SSE 4.1 to provide a small internal buffer to
accelerate reading back a cacheline at a time from uncached memory, for
this purpose.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 lib/igt_fb.c  |  3 ++-
 lib/igt_x86.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 lib/igt_x86.h |  2 ++
 3 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/lib/igt_fb.c b/lib/igt_fb.c
index ecd73053..7404ba7c 100644
--- a/lib/igt_fb.c
+++ b/lib/igt_fb.c
@@ -32,6 +32,7 @@
 #include "drmtest.h"
 #include "igt_fb.h"
 #include "igt_kms.h"
+#include "igt_x86.h"
 #include "ioctl_wrappers.h"
 #include "intel_chipset.h"
 
@@ -1340,7 +1341,7 @@ static void convert_nv12_to_rgb24(struct igt_fb *fb, struct fb_convert_blit_uplo
 	 * it's faster to copy the whole BO to a temporary buffer and convert
 	 * from there.
 	 */
-	memcpy(buf, blit->linear.map, blit->linear.size);
+	igt_memcpy_from_wc(buf, blit->linear.map, blit->linear.size);
 	y = &buf[blit->linear.offsets[0]];
 	uv = &buf[blit->linear.offsets[1]];
 
diff --git a/lib/igt_x86.c b/lib/igt_x86.c
index 0ed3c6f1..b7b57284 100644
--- a/lib/igt_x86.c
+++ b/lib/igt_x86.c
@@ -36,7 +36,10 @@
 #endif
 
 #include "igt_x86.h"
+
+#include <stdint.h>
 #include <stdio.h>
+#include <string.h>
 
 /**
  * SECTION:igt_x86
@@ -174,3 +177,46 @@ char *igt_x86_features_to_string(unsigned features, char *line)
 	return ret;
 }
 #endif
+
+#if defined(__x86_64__) && !defined(__clang__)
+#define MOVNT 512
+
+#pragma GCC push_options
+#pragma GCC target("sse4.1")
+
+#include <smmintrin.h>
+void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len)
+{
+	if (igt_x86_features() & SSE4_1 && ((uintptr_t)src & 15) == 0) {
+		while (len >= 64) {
+			__m128i *S = (__m128i *)src;
+			__m128i *D = (__m128i *)dst;
+			__m128i tmp[4];
+
+			tmp[0] = _mm_stream_load_si128(S + 0);
+			tmp[1] = _mm_stream_load_si128(S + 1);
+			tmp[2] = _mm_stream_load_si128(S + 2);
+			tmp[3] = _mm_stream_load_si128(S + 3);
+
+			_mm_storeu_si128(D + 0, tmp[0]);
+			_mm_storeu_si128(D + 1, tmp[1]);
+			_mm_storeu_si128(D + 2, tmp[2]);
+			_mm_storeu_si128(D + 3, tmp[3]);
+
+			src += 64;
+			dst += 64;
+			len -= 64;
+		}
+	}
+
+	memcpy(dst, src, len);
+}
+
+#pragma GCC pop_options
+
+#else
+void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len)
+{
+	memcpy(dst, src, len);
+}
+#endif
diff --git a/lib/igt_x86.h b/lib/igt_x86.h
index 27b7f0fd..d4f8c343 100644
--- a/lib/igt_x86.h
+++ b/lib/igt_x86.h
@@ -55,4 +55,6 @@ static inline char *igt_x86_features_to_string(unsigned features, char *line)
 }
 #endif
 
+void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len);
+
 #endif /* IGT_X86_H */
-- 
2.16.2

_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [igt-dev] [PATCH igt] lib: Provide an accelerated routine for readback from WC
  2018-02-27 21:50 [igt-dev] [PATCH igt] lib: Provide an accelerated routine for readback from WC Chris Wilson
@ 2018-02-27 21:53 ` Chris Wilson
  2018-02-27 22:17 ` [igt-dev] [PATCH igt v2] " Chris Wilson
                   ` (8 subsequent siblings)
  9 siblings, 0 replies; 13+ messages in thread
From: Chris Wilson @ 2018-02-27 21:53 UTC (permalink / raw)
  To: igt-dev

Quoting Chris Wilson (2018-02-27 21:50:40)
> Reading from WC is awfully slow as each access is uncached and so
> performed synchronously, stalling for the memory load. x86 did introduce
> some new instructions in SSE 4.1 to provide a small internal buffer to
> accelerate reading back a cacheline at a time from uncached memory, for
> this purpose.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  lib/igt_fb.c  |  3 ++-
>  lib/igt_x86.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
>  lib/igt_x86.h |  2 ++
>  3 files changed, 50 insertions(+), 1 deletion(-)
> 
> diff --git a/lib/igt_fb.c b/lib/igt_fb.c
> index ecd73053..7404ba7c 100644
> --- a/lib/igt_fb.c
> +++ b/lib/igt_fb.c
> @@ -32,6 +32,7 @@
>  #include "drmtest.h"
>  #include "igt_fb.h"
>  #include "igt_kms.h"
> +#include "igt_x86.h"
>  #include "ioctl_wrappers.h"
>  #include "intel_chipset.h"
>  
> @@ -1340,7 +1341,7 @@ static void convert_nv12_to_rgb24(struct igt_fb *fb, struct fb_convert_blit_uplo
>          * it's faster to copy the whole BO to a temporary buffer and convert
>          * from there.
>          */
> -       memcpy(buf, blit->linear.map, blit->linear.size);
> +       igt_memcpy_from_wc(buf, blit->linear.map, blit->linear.size);
>         y = &buf[blit->linear.offsets[0]];
>         uv = &buf[blit->linear.offsets[1]];
>  
> diff --git a/lib/igt_x86.c b/lib/igt_x86.c
> index 0ed3c6f1..b7b57284 100644
> --- a/lib/igt_x86.c
> +++ b/lib/igt_x86.c
> @@ -36,7 +36,10 @@
>  #endif
>  
>  #include "igt_x86.h"
> +
> +#include <stdint.h>
>  #include <stdio.h>
> +#include <string.h>
>  
>  /**
>   * SECTION:igt_x86
> @@ -174,3 +177,46 @@ char *igt_x86_features_to_string(unsigned features, char *line)
>         return ret;
>  }
>  #endif
> +
> +#if defined(__x86_64__) && !defined(__clang__)
> +#define MOVNT 512
> +
> +#pragma GCC push_options
> +#pragma GCC target("sse4.1")

Add
#pragma GCC diagnostic ignored "-Wpointer-arith"
for peace and quiet.
-Chris
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [igt-dev] [PATCH igt v2] lib: Provide an accelerated routine for readback from WC
  2018-02-27 21:50 [igt-dev] [PATCH igt] lib: Provide an accelerated routine for readback from WC Chris Wilson
  2018-02-27 21:53 ` Chris Wilson
@ 2018-02-27 22:17 ` Chris Wilson
  2018-02-27 22:20 ` [igt-dev] [PATCH igt v3] " Chris Wilson
                   ` (7 subsequent siblings)
  9 siblings, 0 replies; 13+ messages in thread
From: Chris Wilson @ 2018-02-27 22:17 UTC (permalink / raw)
  To: igt-dev

Reading from WC is awfully slow as each access is uncached and so
performed synchronously, stalling for the memory load. x86 did introduce
some new instructions in SSE 4.1 to provide a small internal buffer to
accelerate reading back a cacheline at a time from uncached memory, for
this purpose.

v2: Don't be lazy and handle misalignment.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 lib/igt_fb.c  |  3 +-
 lib/igt_x86.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/igt_x86.h |  2 ++
 3 files changed, 103 insertions(+), 1 deletion(-)

diff --git a/lib/igt_fb.c b/lib/igt_fb.c
index ecd73053..7404ba7c 100644
--- a/lib/igt_fb.c
+++ b/lib/igt_fb.c
@@ -32,6 +32,7 @@
 #include "drmtest.h"
 #include "igt_fb.h"
 #include "igt_kms.h"
+#include "igt_x86.h"
 #include "ioctl_wrappers.h"
 #include "intel_chipset.h"
 
@@ -1340,7 +1341,7 @@ static void convert_nv12_to_rgb24(struct igt_fb *fb, struct fb_convert_blit_uplo
 	 * it's faster to copy the whole BO to a temporary buffer and convert
 	 * from there.
 	 */
-	memcpy(buf, blit->linear.map, blit->linear.size);
+	igt_memcpy_from_wc(buf, blit->linear.map, blit->linear.size);
 	y = &buf[blit->linear.offsets[0]];
 	uv = &buf[blit->linear.offsets[1]];
 
diff --git a/lib/igt_x86.c b/lib/igt_x86.c
index 0ed3c6f1..e15034da 100644
--- a/lib/igt_x86.c
+++ b/lib/igt_x86.c
@@ -36,7 +36,10 @@
 #endif
 
 #include "igt_x86.h"
+
+#include <stdint.h>
 #include <stdio.h>
+#include <string.h>
 
 /**
  * SECTION:igt_x86
@@ -174,3 +177,99 @@ char *igt_x86_features_to_string(unsigned features, char *line)
 	return ret;
 }
 #endif
+
+#if defined(__x86_64__) && !defined(__clang__)
+#define MOVNT 512
+
+#pragma GCC push_options
+#pragma GCC target("sse4.1")
+#pragma GCC diagnostic ignored "-Wpointer-arith"
+
+#define min(x, y) ({                            \
+	typeof(x) _min1 = (x);                  \
+	typeof(y) _min2 = (y);                  \
+	(void) (&_min1 == &_min2);              \
+	_min1 < _min2 ? _min1 : _min2;		\
+})
+
+#include <smmintrin.h>
+static void memcpy_from_wc_sse41(void *dst, const void *src, unsigned long len)
+{
+	if ((uintptr_t)src & 15) {
+		char buf[16];
+		__m128i *S = (__m128i *)((uintptr_t)src & ~15);
+		unsigned long misalign = (uintptr_t)src & 15;
+		unsigned long copy = min(len, 16 - misalign);
+
+		_mm_storeu_si128((__m128i *)buf,
+				 _mm_stream_load_si128(S));
+
+		memcpy(dst, buf + misalign, copy);
+
+		dst += copy;
+		src += copy;
+		len -= copy;
+	}
+
+	while (len >= 64) {
+		__m128i *S = (__m128i *)src;
+		__m128i *D = (__m128i *)dst;
+		__m128i tmp[4];
+
+		tmp[0] = _mm_stream_load_si128(S + 0);
+		tmp[1] = _mm_stream_load_si128(S + 1);
+		tmp[2] = _mm_stream_load_si128(S + 2);
+		tmp[3] = _mm_stream_load_si128(S + 3);
+
+		_mm_storeu_si128(D + 0, tmp[0]);
+		_mm_storeu_si128(D + 1, tmp[1]);
+		_mm_storeu_si128(D + 2, tmp[2]);
+		_mm_storeu_si128(D + 3, tmp[3]);
+
+		src += 64;
+		dst += 64;
+		len -= 64;
+	}
+
+	while (len >= 16) {
+		_mm_storeu_si128((__m128i *)dst,
+				 _mm_stream_load_si128((__m128i *)src));
+
+		src += 16;
+		dst += 16;
+		len -= 16;
+	}
+
+	if (len) {
+		char buf[16];
+
+		_mm_storeu_si128((__m128i *)buf,
+				 _mm_stream_load_si128((__m128i *)src));
+		memcpy(dst, buf, len);
+	}
+}
+
+static void memcpy_from_wc(void *dst, const void *src, unsigned long len)
+{
+	memcpy(dst, src, len);
+}
+
+static void (*resolve_memcpy_from_wc(void))(void *, const void *, unsigned long)
+{
+	if (igt_x86_features() & SSE4_1)
+		return memcpy_from_wc_sse41;
+
+	return memcpy_from_wc;
+}
+
+void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len)
+	__attribute__((ifunc("resolve_memcpy_from_wc")));
+
+#pragma GCC pop_options
+
+#else
+void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len)
+{
+	memcpy(dst, src, len);
+}
+#endif
diff --git a/lib/igt_x86.h b/lib/igt_x86.h
index 27b7f0fd..d4f8c343 100644
--- a/lib/igt_x86.h
+++ b/lib/igt_x86.h
@@ -55,4 +55,6 @@ static inline char *igt_x86_features_to_string(unsigned features, char *line)
 }
 #endif
 
+void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len);
+
 #endif /* IGT_X86_H */
-- 
2.16.2

_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [igt-dev] [PATCH igt v3] lib: Provide an accelerated routine for readback from WC
  2018-02-27 21:50 [igt-dev] [PATCH igt] lib: Provide an accelerated routine for readback from WC Chris Wilson
  2018-02-27 21:53 ` Chris Wilson
  2018-02-27 22:17 ` [igt-dev] [PATCH igt v2] " Chris Wilson
@ 2018-02-27 22:20 ` Chris Wilson
  2018-02-27 22:42 ` [igt-dev] [PATCH igt v4] " Chris Wilson
                   ` (6 subsequent siblings)
  9 siblings, 0 replies; 13+ messages in thread
From: Chris Wilson @ 2018-02-27 22:20 UTC (permalink / raw)
  To: igt-dev

Reading from WC is awfully slow as each access is uncached and so
performed synchronously, stalling for the memory load. x86 did introduce
some new instructions in SSE 4.1 to provide a small internal buffer to
accelerate reading back a cacheline at a time from uncached memory, for
this purpose.

v2: Don't be lazy and handle misalignment.
v3: Switch out of sse41 before emitting the generic memcpy routine

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 lib/igt_fb.c  |  3 +-
 lib/igt_x86.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/igt_x86.h |  2 ++
 3 files changed, 103 insertions(+), 1 deletion(-)

diff --git a/lib/igt_fb.c b/lib/igt_fb.c
index ecd73053..7404ba7c 100644
--- a/lib/igt_fb.c
+++ b/lib/igt_fb.c
@@ -32,6 +32,7 @@
 #include "drmtest.h"
 #include "igt_fb.h"
 #include "igt_kms.h"
+#include "igt_x86.h"
 #include "ioctl_wrappers.h"
 #include "intel_chipset.h"
 
@@ -1340,7 +1341,7 @@ static void convert_nv12_to_rgb24(struct igt_fb *fb, struct fb_convert_blit_uplo
 	 * it's faster to copy the whole BO to a temporary buffer and convert
 	 * from there.
 	 */
-	memcpy(buf, blit->linear.map, blit->linear.size);
+	igt_memcpy_from_wc(buf, blit->linear.map, blit->linear.size);
 	y = &buf[blit->linear.offsets[0]];
 	uv = &buf[blit->linear.offsets[1]];
 
diff --git a/lib/igt_x86.c b/lib/igt_x86.c
index 0ed3c6f1..eba4c898 100644
--- a/lib/igt_x86.c
+++ b/lib/igt_x86.c
@@ -36,7 +36,10 @@
 #endif
 
 #include "igt_x86.h"
+
+#include <stdint.h>
 #include <stdio.h>
+#include <string.h>
 
 /**
  * SECTION:igt_x86
@@ -174,3 +177,99 @@ char *igt_x86_features_to_string(unsigned features, char *line)
 	return ret;
 }
 #endif
+
+#if defined(__x86_64__) && !defined(__clang__)
+#define MOVNT 512
+
+#pragma GCC push_options
+#pragma GCC target("sse4.1")
+#pragma GCC diagnostic ignored "-Wpointer-arith"
+
+#define min(x, y) ({                            \
+	typeof(x) _min1 = (x);                  \
+	typeof(y) _min2 = (y);                  \
+	(void) (&_min1 == &_min2);              \
+	_min1 < _min2 ? _min1 : _min2;		\
+})
+
+#include <smmintrin.h>
+static void memcpy_from_wc_sse41(void *dst, const void *src, unsigned long len)
+{
+	if ((uintptr_t)src & 15) {
+		char buf[16];
+		__m128i *S = (__m128i *)((uintptr_t)src & ~15);
+		unsigned long misalign = (uintptr_t)src & 15;
+		unsigned long copy = min(len, 16 - misalign);
+
+		_mm_storeu_si128((__m128i *)buf,
+				 _mm_stream_load_si128(S));
+
+		memcpy(dst, buf + misalign, copy);
+
+		dst += copy;
+		src += copy;
+		len -= copy;
+	}
+
+	while (len >= 64) {
+		__m128i *S = (__m128i *)src;
+		__m128i *D = (__m128i *)dst;
+		__m128i tmp[4];
+
+		tmp[0] = _mm_stream_load_si128(S + 0);
+		tmp[1] = _mm_stream_load_si128(S + 1);
+		tmp[2] = _mm_stream_load_si128(S + 2);
+		tmp[3] = _mm_stream_load_si128(S + 3);
+
+		_mm_storeu_si128(D + 0, tmp[0]);
+		_mm_storeu_si128(D + 1, tmp[1]);
+		_mm_storeu_si128(D + 2, tmp[2]);
+		_mm_storeu_si128(D + 3, tmp[3]);
+
+		src += 64;
+		dst += 64;
+		len -= 64;
+	}
+
+	while (len >= 16) {
+		_mm_storeu_si128((__m128i *)dst,
+				 _mm_stream_load_si128((__m128i *)src));
+
+		src += 16;
+		dst += 16;
+		len -= 16;
+	}
+
+	if (len) {
+		char buf[16];
+
+		_mm_storeu_si128((__m128i *)buf,
+				 _mm_stream_load_si128((__m128i *)src));
+		memcpy(dst, buf, len);
+	}
+}
+
+#pragma GCC pop_options
+
+static void memcpy_from_wc(void *dst, const void *src, unsigned long len)
+{
+	memcpy(dst, src, len);
+}
+
+static void (*resolve_memcpy_from_wc(void))(void *, const void *, unsigned long)
+{
+	if (igt_x86_features() & SSE4_1)
+		return memcpy_from_wc_sse41;
+
+	return memcpy_from_wc;
+}
+
+void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len)
+	__attribute__((ifunc("resolve_memcpy_from_wc")));
+
+#else
+void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len)
+{
+	memcpy(dst, src, len);
+}
+#endif
diff --git a/lib/igt_x86.h b/lib/igt_x86.h
index 27b7f0fd..d4f8c343 100644
--- a/lib/igt_x86.h
+++ b/lib/igt_x86.h
@@ -55,4 +55,6 @@ static inline char *igt_x86_features_to_string(unsigned features, char *line)
 }
 #endif
 
+void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len);
+
 #endif /* IGT_X86_H */
-- 
2.16.2

_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [igt-dev] [PATCH igt v4] lib: Provide an accelerated routine for readback from WC
  2018-02-27 21:50 [igt-dev] [PATCH igt] lib: Provide an accelerated routine for readback from WC Chris Wilson
                   ` (2 preceding siblings ...)
  2018-02-27 22:20 ` [igt-dev] [PATCH igt v3] " Chris Wilson
@ 2018-02-27 22:42 ` Chris Wilson
  2018-02-27 23:29 ` [igt-dev] [PATCH igt] " Eric Anholt
                   ` (5 subsequent siblings)
  9 siblings, 0 replies; 13+ messages in thread
From: Chris Wilson @ 2018-02-27 22:42 UTC (permalink / raw)
  To: igt-dev

Reading from WC is awfully slow as each access is uncached and so
performed synchronously, stalling for the memory load. x86 did introduce
some new instructions in SSE 4.1 to provide a small internal buffer to
accelerate reading back a cacheline at a time from uncached memory, for
this purpose.

v2: Don't be lazy and handle misalignment.
v3: Switch out of sse41 before emitting the generic memcpy routine
v4: Replace the open-coded WC copy routines in the tests with igt_memcpy_from_wc

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 lib/igt_fb.c                   |  3 +-
 lib/igt_x86.c                  | 98 ++++++++++++++++++++++++++++++++++++++++++
 lib/igt_x86.h                  |  2 +
 tests/gem_fence_thrash.c       | 63 +--------------------------
 tests/gem_mmap_gtt.c           | 37 +---------------
 tests/gem_tiled_pread_pwrite.c | 37 +---------------
 6 files changed, 106 insertions(+), 134 deletions(-)

diff --git a/lib/igt_fb.c b/lib/igt_fb.c
index ecd73053..7404ba7c 100644
--- a/lib/igt_fb.c
+++ b/lib/igt_fb.c
@@ -32,6 +32,7 @@
 #include "drmtest.h"
 #include "igt_fb.h"
 #include "igt_kms.h"
+#include "igt_x86.h"
 #include "ioctl_wrappers.h"
 #include "intel_chipset.h"
 
@@ -1340,7 +1341,7 @@ static void convert_nv12_to_rgb24(struct igt_fb *fb, struct fb_convert_blit_uplo
 	 * it's faster to copy the whole BO to a temporary buffer and convert
 	 * from there.
 	 */
-	memcpy(buf, blit->linear.map, blit->linear.size);
+	igt_memcpy_from_wc(buf, blit->linear.map, blit->linear.size);
 	y = &buf[blit->linear.offsets[0]];
 	uv = &buf[blit->linear.offsets[1]];
 
diff --git a/lib/igt_x86.c b/lib/igt_x86.c
index 0ed3c6f1..a3033c22 100644
--- a/lib/igt_x86.c
+++ b/lib/igt_x86.c
@@ -36,7 +36,10 @@
 #endif
 
 #include "igt_x86.h"
+
+#include <stdint.h>
 #include <stdio.h>
+#include <string.h>
 
 /**
  * SECTION:igt_x86
@@ -174,3 +177,98 @@ char *igt_x86_features_to_string(unsigned features, char *line)
 	return ret;
 }
 #endif
+
+#if defined(__x86_64__) && !defined(__clang__)
+#define MOVNT 512
+
+#pragma GCC push_options
+#pragma GCC target("sse4.1")
+#pragma GCC diagnostic ignored "-Wpointer-arith"
+
+#define min(x, y) ({                            \
+	typeof(x) _min1 = (x);                  \
+	typeof(y) _min2 = (y);                  \
+	(void) (&_min1 == &_min2);              \
+	_min1 < _min2 ? _min1 : _min2;		\
+})
+
+#include <smmintrin.h>
+static void memcpy_from_wc_sse41(void *dst, const void *src, unsigned long len)
+{
+	char buf[16];
+
+	if ((uintptr_t)src & 15) {
+		__m128i *S = (__m128i *)((uintptr_t)src & ~15);
+		unsigned long misalign = (uintptr_t)src & 15;
+		unsigned long copy = min(len, 16 - misalign);
+
+		_mm_storeu_si128((__m128i *)buf,
+				 _mm_stream_load_si128(S));
+
+		memcpy(dst, buf + misalign, copy);
+
+		dst += copy;
+		src += copy;
+		len -= copy;
+	}
+
+	while (len >= 64) {
+		__m128i *S = (__m128i *)src;
+		__m128i *D = (__m128i *)dst;
+		__m128i tmp[4];
+
+		tmp[0] = _mm_stream_load_si128(S + 0);
+		tmp[1] = _mm_stream_load_si128(S + 1);
+		tmp[2] = _mm_stream_load_si128(S + 2);
+		tmp[3] = _mm_stream_load_si128(S + 3);
+
+		_mm_storeu_si128(D + 0, tmp[0]);
+		_mm_storeu_si128(D + 1, tmp[1]);
+		_mm_storeu_si128(D + 2, tmp[2]);
+		_mm_storeu_si128(D + 3, tmp[3]);
+
+		src += 64;
+		dst += 64;
+		len -= 64;
+	}
+
+	while (len >= 16) {
+		_mm_storeu_si128((__m128i *)dst,
+				 _mm_stream_load_si128((__m128i *)src));
+
+		src += 16;
+		dst += 16;
+		len -= 16;
+	}
+
+	if (len) {
+		_mm_storeu_si128((__m128i *)buf,
+				 _mm_stream_load_si128((__m128i *)src));
+		memcpy(dst, buf, len);
+	}
+}
+
+#pragma GCC pop_options
+
+static void memcpy_from_wc(void *dst, const void *src, unsigned long len)
+{
+	memcpy(dst, src, len);
+}
+
+static void (*resolve_memcpy_from_wc(void))(void *, const void *, unsigned long)
+{
+	if (igt_x86_features() & SSE4_1)
+		return memcpy_from_wc_sse41;
+
+	return memcpy_from_wc;
+}
+
+void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len)
+	__attribute__((ifunc("resolve_memcpy_from_wc")));
+
+#else
+void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len)
+{
+	memcpy(dst, src, len);
+}
+#endif
diff --git a/lib/igt_x86.h b/lib/igt_x86.h
index 27b7f0fd..d4f8c343 100644
--- a/lib/igt_x86.h
+++ b/lib/igt_x86.h
@@ -55,4 +55,6 @@ static inline char *igt_x86_features_to_string(unsigned features, char *line)
 }
 #endif
 
+void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len);
+
 #endif /* IGT_X86_H */
diff --git a/tests/gem_fence_thrash.c b/tests/gem_fence_thrash.c
index c8ff961d..2d7fb2ff 100644
--- a/tests/gem_fence_thrash.c
+++ b/tests/gem_fence_thrash.c
@@ -107,75 +107,16 @@ bo_copy (void *_arg)
 	return NULL;
 }
 
-#if defined(__x86_64__) && !defined(__clang__)
-
-#pragma GCC push_options
-#pragma GCC target("sse4.1")
-
-#include <smmintrin.h>
-
-#define MOVNT 512
-
-__attribute__((noinline))
-static void copy_wc_page(void *dst, void *src)
-{
-	if (igt_x86_features() & SSE4_1) {
-		__m128i *S = (__m128i *)src;
-		__m128i *D = (__m128i *)dst;
-
-		for (int i = 0; i < PAGE_SIZE/CACHELINE; i++) {
-			__m128i tmp[4];
-
-			tmp[0] = _mm_stream_load_si128(S++);
-			tmp[1] = _mm_stream_load_si128(S++);
-			tmp[2] = _mm_stream_load_si128(S++);
-			tmp[3] = _mm_stream_load_si128(S++);
-
-			_mm_store_si128(D++, tmp[0]);
-			_mm_store_si128(D++, tmp[1]);
-			_mm_store_si128(D++, tmp[2]);
-			_mm_store_si128(D++, tmp[3]);
-		}
-	} else
-		memcpy(dst, src, PAGE_SIZE);
-}
-
-static void copy_wc_cacheline(void *dst, void *src)
-{
-	if (igt_x86_features() & SSE4_1) {
-		__m128i *S = (__m128i *)src;
-		__m128i *D = (__m128i *)dst;
-		__m128i tmp[4];
-
-		tmp[0] = _mm_stream_load_si128(S++);
-		tmp[1] = _mm_stream_load_si128(S++);
-		tmp[2] = _mm_stream_load_si128(S++);
-		tmp[3] = _mm_stream_load_si128(S++);
-
-		_mm_store_si128(D++, tmp[0]);
-		_mm_store_si128(D++, tmp[1]);
-		_mm_store_si128(D++, tmp[2]);
-		_mm_store_si128(D++, tmp[3]);
-	} else
-		memcpy(dst, src, CACHELINE);
-}
-
-#pragma GCC pop_options
-
-#else
-
 static void copy_wc_page(void *dst, const void *src)
 {
-	memcpy(dst, src, PAGE_SIZE);
+	igt_memcpy_from_wc(dst, src, PAGE_SIZE);
 }
 
 static void copy_wc_cacheline(void *dst, const void *src)
 {
-	memcpy(dst, src, CACHELINE);
+	igt_memcpy_from_wc(dst, src, CACHELINE);
 }
 
-#endif
-
 static void
 _bo_write_verify(struct test *t)
 {
diff --git a/tests/gem_mmap_gtt.c b/tests/gem_mmap_gtt.c
index 0f598125..6a332b25 100644
--- a/tests/gem_mmap_gtt.c
+++ b/tests/gem_mmap_gtt.c
@@ -529,45 +529,10 @@ test_huge_bo(int fd, int huge, int tiling)
 	munmap(linear_pattern, PAGE_SIZE);
 }
 
-#if defined(__x86_64__) && !defined(__clang__)
-#define MOVNT 512
-
-#pragma GCC push_options
-#pragma GCC target("sse4.1")
-
-#include <smmintrin.h>
-__attribute__((noinline))
-static void copy_wc_page(void *dst, void *src)
-{
-	if (igt_x86_features() & SSE4_1) {
-		__m128i *S = (__m128i *)src;
-		__m128i *D = (__m128i *)dst;
-
-		for (int i = 0; i < PAGE_SIZE/64; i++) {
-			__m128i tmp[4];
-
-			tmp[0] = _mm_stream_load_si128(S++);
-			tmp[1] = _mm_stream_load_si128(S++);
-			tmp[2] = _mm_stream_load_si128(S++);
-			tmp[3] = _mm_stream_load_si128(S++);
-
-			_mm_store_si128(D++, tmp[0]);
-			_mm_store_si128(D++, tmp[1]);
-			_mm_store_si128(D++, tmp[2]);
-			_mm_store_si128(D++, tmp[3]);
-		}
-	} else
-		memcpy(dst, src, PAGE_SIZE);
-}
-
-#pragma GCC pop_options
-
-#else
 static void copy_wc_page(void *dst, const void *src)
 {
-	memcpy(dst, src, PAGE_SIZE);
+	igt_memcpy_from_wc(dst, src, PAGE_SIZE);
 }
-#endif
 
 static unsigned int tile_row_size(int tiling, unsigned int stride)
 {
diff --git a/tests/gem_tiled_pread_pwrite.c b/tests/gem_tiled_pread_pwrite.c
index 7b5577fd..313daa38 100644
--- a/tests/gem_tiled_pread_pwrite.c
+++ b/tests/gem_tiled_pread_pwrite.c
@@ -100,45 +100,10 @@ create_bo(int fd)
 	return handle;
 }
 
-#if defined(__x86_64__) && !defined(__clang__)
-#define MOVNT 512
-
-#pragma GCC push_options
-#pragma GCC target("sse4.1")
-
-#include <smmintrin.h>
-__attribute__((noinline))
-static void copy_wc_page(void *dst, void *src)
-{
-	if (igt_x86_features() & SSE4_1) {
-		__m128i *S = (__m128i *)src;
-		__m128i *D = (__m128i *)dst;
-
-		for (int i = 0; i < PAGE_SIZE/64; i++) {
-			__m128i tmp[4];
-
-			tmp[0] = _mm_stream_load_si128(S++);
-			tmp[1] = _mm_stream_load_si128(S++);
-			tmp[2] = _mm_stream_load_si128(S++);
-			tmp[3] = _mm_stream_load_si128(S++);
-
-			_mm_store_si128(D++, tmp[0]);
-			_mm_store_si128(D++, tmp[1]);
-			_mm_store_si128(D++, tmp[2]);
-			_mm_store_si128(D++, tmp[3]);
-		}
-	} else
-		memcpy(dst, src, PAGE_SIZE);
-}
-
-#pragma GCC pop_options
-
-#else
 static void copy_wc_page(void *dst, const void *src)
 {
-	memcpy(dst, src, PAGE_SIZE);
+	igt_memcpy_from_wc(dst, src, PAGE_SIZE);
 }
-#endif
 
 igt_simple_main
 {
-- 
2.16.2

_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [igt-dev] [PATCH igt] lib: Provide an accelerated routine for readback from WC
  2018-02-27 21:50 [igt-dev] [PATCH igt] lib: Provide an accelerated routine for readback from WC Chris Wilson
                   ` (3 preceding siblings ...)
  2018-02-27 22:42 ` [igt-dev] [PATCH igt v4] " Chris Wilson
@ 2018-02-27 23:29 ` Eric Anholt
  2018-02-27 23:44 ` [igt-dev] ✓ Fi.CI.BAT: success for lib: Provide an accelerated routine for readback from WC (rev4) Patchwork
                   ` (4 subsequent siblings)
  9 siblings, 0 replies; 13+ messages in thread
From: Eric Anholt @ 2018-02-27 23:29 UTC (permalink / raw)
  To: Chris Wilson, igt-dev


[-- Attachment #1.1: Type: text/plain, Size: 465 bytes --]

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Reading from WC is awfully slow as each access is uncached and so
> performed synchronously, stalling for the memory load. x86 did introduce
> some new instructions in SSE 4.1 to provide a small internal buffer to
> accelerate reading back a cacheline at a time from uncached memory, for
> this purpose.

I think without a _mm_mfence() before the movntdqas, you can get stale
results from movntdqa's little cache.

[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [igt-dev] ✓ Fi.CI.BAT: success for lib: Provide an accelerated routine for readback from WC (rev4)
  2018-02-27 21:50 [igt-dev] [PATCH igt] lib: Provide an accelerated routine for readback from WC Chris Wilson
                   ` (4 preceding siblings ...)
  2018-02-27 23:29 ` [igt-dev] [PATCH igt] " Eric Anholt
@ 2018-02-27 23:44 ` Patchwork
  2018-02-28  1:04 ` [igt-dev] ✗ Fi.CI.IGT: failure " Patchwork
                   ` (3 subsequent siblings)
  9 siblings, 0 replies; 13+ messages in thread
From: Patchwork @ 2018-02-27 23:44 UTC (permalink / raw)
  To: Chris Wilson; +Cc: igt-dev

== Series Details ==

Series: lib: Provide an accelerated routine for readback from WC (rev4)
URL   : https://patchwork.freedesktop.org/series/39070/
State : success

== Summary ==

IGT patchset tested on top of latest successful build
77411862d239916b40e218eeb6519b8f31fc7f1d meson: Get rid of lib_headers

with latest DRM-Tip kernel build CI_DRM_3842
af8578c6d438 drm-tip: 2018y-02m-27d-20h-28m-22s UTC integration manifest

No testlist changes.

---- Known issues:

Test debugfs_test:
        Subgroup read_all_entries:
                incomplete -> PASS       (fi-snb-2520m) fdo#103713
Test gem_mmap_gtt:
        Subgroup basic-small-bo-tiledx:
                fail       -> PASS       (fi-gdg-551) fdo#102575
Test kms_pipe_crc_basic:
        Subgroup hang-read-crc-pipe-c:
                pass       -> FAIL       (fi-skl-guc) fdo#103191
        Subgroup suspend-read-crc-pipe-c:
                notrun     -> INCOMPLETE (fi-bxt-dsi) fdo#103927

fdo#103713 https://bugs.freedesktop.org/show_bug.cgi?id=103713
fdo#102575 https://bugs.freedesktop.org/show_bug.cgi?id=102575
fdo#103191 https://bugs.freedesktop.org/show_bug.cgi?id=103191
fdo#103927 https://bugs.freedesktop.org/show_bug.cgi?id=103927

fi-bdw-5557u     total:288  pass:267  dwarn:0   dfail:0   fail:0   skip:21  time:419s
fi-bdw-gvtdvm    total:288  pass:264  dwarn:0   dfail:0   fail:0   skip:24  time:424s
fi-blb-e6850     total:288  pass:223  dwarn:1   dfail:0   fail:0   skip:64  time:374s
fi-bsw-n3050     total:288  pass:242  dwarn:0   dfail:0   fail:0   skip:46  time:491s
fi-bwr-2160      total:288  pass:183  dwarn:0   dfail:0   fail:0   skip:105 time:280s
fi-bxt-dsi       total:246  pass:219  dwarn:0   dfail:0   fail:0   skip:26 
fi-bxt-j4205     total:288  pass:259  dwarn:0   dfail:0   fail:0   skip:29  time:483s
fi-byt-j1900     total:288  pass:253  dwarn:0   dfail:0   fail:0   skip:35  time:469s
fi-byt-n2820     total:288  pass:249  dwarn:0   dfail:0   fail:0   skip:39  time:458s
fi-cfl-8700k     total:288  pass:260  dwarn:0   dfail:0   fail:0   skip:28  time:397s
fi-cfl-s2        total:288  pass:262  dwarn:0   dfail:0   fail:0   skip:26  time:563s
fi-cnl-y3        total:288  pass:262  dwarn:0   dfail:0   fail:0   skip:26  time:564s
fi-elk-e7500     total:288  pass:229  dwarn:0   dfail:0   fail:0   skip:59  time:420s
fi-gdg-551       total:288  pass:180  dwarn:0   dfail:0   fail:0   skip:108 time:290s
fi-glk-1         total:288  pass:260  dwarn:0   dfail:0   fail:0   skip:28  time:506s
fi-hsw-4770      total:288  pass:261  dwarn:0   dfail:0   fail:0   skip:27  time:390s
fi-ilk-650       total:288  pass:228  dwarn:0   dfail:0   fail:0   skip:60  time:409s
fi-ivb-3520m     total:288  pass:259  dwarn:0   dfail:0   fail:0   skip:29  time:449s
fi-ivb-3770      total:288  pass:255  dwarn:0   dfail:0   fail:0   skip:33  time:409s
fi-kbl-7500u     total:288  pass:263  dwarn:1   dfail:0   fail:0   skip:24  time:459s
fi-kbl-7560u     total:288  pass:269  dwarn:0   dfail:0   fail:0   skip:19  time:488s
fi-kbl-7567u     total:288  pass:268  dwarn:0   dfail:0   fail:0   skip:20  time:448s
fi-kbl-r         total:288  pass:261  dwarn:0   dfail:0   fail:0   skip:27  time:498s
fi-pnv-d510      total:288  pass:222  dwarn:1   dfail:0   fail:0   skip:65  time:579s
fi-skl-6260u     total:288  pass:268  dwarn:0   dfail:0   fail:0   skip:20  time:427s
fi-skl-6600u     total:288  pass:261  dwarn:0   dfail:0   fail:0   skip:27  time:498s
fi-skl-6700hq    total:288  pass:262  dwarn:0   dfail:0   fail:0   skip:26  time:518s
fi-skl-6700k2    total:288  pass:264  dwarn:0   dfail:0   fail:0   skip:24  time:486s
fi-skl-6770hq    total:288  pass:268  dwarn:0   dfail:0   fail:0   skip:20  time:470s
fi-skl-guc       total:288  pass:259  dwarn:0   dfail:0   fail:1   skip:28  time:407s
fi-skl-gvtdvm    total:288  pass:265  dwarn:0   dfail:0   fail:0   skip:23  time:433s
fi-snb-2520m     total:288  pass:248  dwarn:0   dfail:0   fail:0   skip:40  time:518s
fi-snb-2600      total:288  pass:248  dwarn:0   dfail:0   fail:0   skip:40  time:405s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1016/issues.html
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [igt-dev] ✗ Fi.CI.IGT: failure for lib: Provide an accelerated routine for readback from WC (rev4)
  2018-02-27 21:50 [igt-dev] [PATCH igt] lib: Provide an accelerated routine for readback from WC Chris Wilson
                   ` (5 preceding siblings ...)
  2018-02-27 23:44 ` [igt-dev] ✓ Fi.CI.BAT: success for lib: Provide an accelerated routine for readback from WC (rev4) Patchwork
@ 2018-02-28  1:04 ` Patchwork
  2018-02-28  9:00 ` [igt-dev] [PATCH igt v6] lib: Provide an accelerated routine for readback from WC Chris Wilson
                   ` (2 subsequent siblings)
  9 siblings, 0 replies; 13+ messages in thread
From: Patchwork @ 2018-02-28  1:04 UTC (permalink / raw)
  To: Chris Wilson; +Cc: igt-dev

== Series Details ==

Series: lib: Provide an accelerated routine for readback from WC (rev4)
URL   : https://patchwork.freedesktop.org/series/39070/
State : failure

== Summary ==

---- Possible new issues:

Test perf_pmu:
        Subgroup busy-check-all-vcs0:
                pass       -> FAIL       (shard-snb)

---- Known issues:

Test gem_eio:
        Subgroup in-flight:
                pass       -> INCOMPLETE (shard-apl) fdo#104945
Test kms_flip:
        Subgroup 2x-dpms-vs-vblank-race-interruptible:
                fail       -> PASS       (shard-hsw) fdo#103060
        Subgroup 2x-flip-vs-expired-vblank:
                fail       -> PASS       (shard-hsw) fdo#102887
        Subgroup plain-flip-fb-recreate:
                fail       -> PASS       (shard-hsw) fdo#100368
Test kms_frontbuffer_tracking:
        Subgroup fbc-1p-offscren-pri-indfb-draw-mmap-gtt:
                skip       -> PASS       (shard-snb) fdo#101623
Test kms_plane:
        Subgroup plane-panning-bottom-right-suspend-pipe-b-planes:
                pass       -> SKIP       (shard-hsw) fdo#103540
Test kms_rotation_crc:
        Subgroup primary-rotation-180:
                pass       -> FAIL       (shard-snb) fdo#103925 +1

fdo#104945 https://bugs.freedesktop.org/show_bug.cgi?id=104945
fdo#103060 https://bugs.freedesktop.org/show_bug.cgi?id=103060
fdo#102887 https://bugs.freedesktop.org/show_bug.cgi?id=102887
fdo#100368 https://bugs.freedesktop.org/show_bug.cgi?id=100368
fdo#101623 https://bugs.freedesktop.org/show_bug.cgi?id=101623
fdo#103540 https://bugs.freedesktop.org/show_bug.cgi?id=103540
fdo#103925 https://bugs.freedesktop.org/show_bug.cgi?id=103925

shard-apl        total:3433 pass:1804 dwarn:1   dfail:0   fail:7   skip:1620 time:11765s
shard-hsw        total:3460 pass:1766 dwarn:1   dfail:0   fail:1   skip:1691 time:11675s
shard-snb        total:3460 pass:1357 dwarn:1   dfail:0   fail:3   skip:2099 time:6620s
Blacklisted hosts:
shard-kbl        total:3455 pass:1939 dwarn:3   dfail:0   fail:7   skip:1505 time:9287s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1016/shards.html
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [igt-dev] [PATCH igt v6] lib: Provide an accelerated routine for readback from WC
  2018-02-27 21:50 [igt-dev] [PATCH igt] lib: Provide an accelerated routine for readback from WC Chris Wilson
                   ` (6 preceding siblings ...)
  2018-02-28  1:04 ` [igt-dev] ✗ Fi.CI.IGT: failure " Patchwork
@ 2018-02-28  9:00 ` Chris Wilson
  2018-02-28 17:12   ` Ville Syrjälä
  2018-02-28  9:31 ` [igt-dev] ✓ Fi.CI.BAT: success for lib: Provide an accelerated routine for readback from WC (rev5) Patchwork
  2018-02-28 10:16 ` [igt-dev] ✓ Fi.CI.IGT: " Patchwork
  9 siblings, 1 reply; 13+ messages in thread
From: Chris Wilson @ 2018-02-28  9:00 UTC (permalink / raw)
  To: igt-dev

Reading from WC is awfully slow as each access is uncached and so
performed synchronously, stalling for the memory load. For exactly this
purpose, SSE 4.1 introduced streaming loads on x86, which use a small
internal buffer to accelerate reading back a cacheline at a time from
uncached memory.

v2: Don't be lazy and handle misalignment.
v3: Switch out of sse41 before emitting the generic memcpy routine
v4: Replace the open-coded memcpy_from_wc copies in the tests
v5: Always flush the internal buffer before use (Eric)
v6: Assume bulk moves, so check for dst alignment.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Eric Anholt <eric@anholt.net>
---
 lib/igt_fb.c                   |   3 +-
 lib/igt_x86.c                  | 124 +++++++++++++++++++++++++++++++++++++++++
 lib/igt_x86.h                  |   2 +
 tests/gem_fence_thrash.c       |  63 +--------------------
 tests/gem_mmap_gtt.c           |  37 +-----------
 tests/gem_tiled_pread_pwrite.c |  37 +-----------
 6 files changed, 132 insertions(+), 134 deletions(-)

diff --git a/lib/igt_fb.c b/lib/igt_fb.c
index ecd73053..7404ba7c 100644
--- a/lib/igt_fb.c
+++ b/lib/igt_fb.c
@@ -32,6 +32,7 @@
 #include "drmtest.h"
 #include "igt_fb.h"
 #include "igt_kms.h"
+#include "igt_x86.h"
 #include "ioctl_wrappers.h"
 #include "intel_chipset.h"
 
@@ -1340,7 +1341,7 @@ static void convert_nv12_to_rgb24(struct igt_fb *fb, struct fb_convert_blit_uplo
 	 * it's faster to copy the whole BO to a temporary buffer and convert
 	 * from there.
 	 */
-	memcpy(buf, blit->linear.map, blit->linear.size);
+	igt_memcpy_from_wc(buf, blit->linear.map, blit->linear.size);
 	y = &buf[blit->linear.offsets[0]];
 	uv = &buf[blit->linear.offsets[1]];
 
diff --git a/lib/igt_x86.c b/lib/igt_x86.c
index 0ed3c6f1..54539456 100644
--- a/lib/igt_x86.c
+++ b/lib/igt_x86.c
@@ -36,7 +36,10 @@
 #endif
 
 #include "igt_x86.h"
+
+#include <stdint.h>
 #include <stdio.h>
+#include <string.h>
 
 /**
  * SECTION:igt_x86
@@ -174,3 +177,124 @@ char *igt_x86_features_to_string(unsigned features, char *line)
 	return ret;
 }
 #endif
+
+#if defined(__x86_64__) && !defined(__clang__)
+#define MOVNT 512
+
+#pragma GCC push_options
+#pragma GCC target("sse4.1")
+#pragma GCC diagnostic ignored "-Wpointer-arith"
+
+#define min(x, y) ({                            \
+	typeof(x) _min1 = (x);                  \
+	typeof(y) _min2 = (y);                  \
+	(void) (&_min1 == &_min2);              \
+	_min1 < _min2 ? _min1 : _min2;		\
+})
+
+#include <smmintrin.h>
+static void memcpy_from_wc_sse41(void *dst, const void *src, unsigned long len)
+{
+	char buf[16];
+
+	/* Flush the internal buffer of potential stale gfx data */
+	__builtin_ia32_mfence();
+
+	if ((uintptr_t)src & 15) {
+		__m128i *S = (__m128i *)((uintptr_t)src & ~15);
+		unsigned long misalign = (uintptr_t)src & 15;
+		unsigned long copy = min(len, 16 - misalign);
+
+		_mm_storeu_si128((__m128i *)buf,
+				 _mm_stream_load_si128(S));
+
+		memcpy(dst, buf + misalign, copy);
+
+		dst += copy;
+		src += copy;
+		len -= copy;
+	}
+
+	/* We assume we are doing bulk transfers, so prefer aligned moves */
+	if (((uintptr_t)dst & 15) == 0) {
+		while (len >= 64) {
+			__m128i *S = (__m128i *)src;
+			__m128i *D = (__m128i *)dst;
+			__m128i tmp[4];
+
+			tmp[0] = _mm_stream_load_si128(S + 0);
+			tmp[1] = _mm_stream_load_si128(S + 1);
+			tmp[2] = _mm_stream_load_si128(S + 2);
+			tmp[3] = _mm_stream_load_si128(S + 3);
+
+			_mm_store_si128(D + 0, tmp[0]);
+			_mm_store_si128(D + 1, tmp[1]);
+			_mm_store_si128(D + 2, tmp[2]);
+			_mm_store_si128(D + 3, tmp[3]);
+
+			src += 64;
+			dst += 64;
+			len -= 64;
+		}
+	} else {
+		while (len >= 64) {
+			__m128i *S = (__m128i *)src;
+			__m128i *D = (__m128i *)dst;
+			__m128i tmp[4];
+
+			tmp[0] = _mm_stream_load_si128(S + 0);
+			tmp[1] = _mm_stream_load_si128(S + 1);
+			tmp[2] = _mm_stream_load_si128(S + 2);
+			tmp[3] = _mm_stream_load_si128(S + 3);
+
+			_mm_storeu_si128(D + 0, tmp[0]);
+			_mm_storeu_si128(D + 1, tmp[1]);
+			_mm_storeu_si128(D + 2, tmp[2]);
+			_mm_storeu_si128(D + 3, tmp[3]);
+
+			src += 64;
+			dst += 64;
+			len -= 64;
+		}
+	}
+
+	while (len >= 16) {
+		_mm_storeu_si128((__m128i *)dst,
+				 _mm_stream_load_si128((__m128i *)src));
+
+		src += 16;
+		dst += 16;
+		len -= 16;
+	}
+
+	if (len) {
+		_mm_storeu_si128((__m128i *)buf,
+				 _mm_stream_load_si128((__m128i *)src));
+		memcpy(dst, buf, len);
+	}
+}
+
+#pragma GCC pop_options
+
+static void memcpy_from_wc(void *dst, const void *src, unsigned long len)
+{
+	memcpy(dst, src, len);
+}
+
+static void (*resolve_memcpy_from_wc(void))(void *, const void *, unsigned long)
+{
+	if (igt_x86_features() & SSE4_1)
+		return memcpy_from_wc_sse41;
+
+	return memcpy_from_wc;
+}
+
+void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len)
+	__attribute__((ifunc("resolve_memcpy_from_wc")));
+
+#else
+void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len)
+{
+	memcpy(dst, src, len);
+}
+#endif
diff --git a/lib/igt_x86.h b/lib/igt_x86.h
index 27b7f0fd..d4f8c343 100644
--- a/lib/igt_x86.h
+++ b/lib/igt_x86.h
@@ -55,4 +55,6 @@ static inline char *igt_x86_features_to_string(unsigned features, char *line)
 }
 #endif
 
+void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len);
+
 #endif /* IGT_X86_H */
diff --git a/tests/gem_fence_thrash.c b/tests/gem_fence_thrash.c
index c8ff961d..2d7fb2ff 100644
--- a/tests/gem_fence_thrash.c
+++ b/tests/gem_fence_thrash.c
@@ -107,75 +107,16 @@ bo_copy (void *_arg)
 	return NULL;
 }
 
-#if defined(__x86_64__) && !defined(__clang__)
-
-#pragma GCC push_options
-#pragma GCC target("sse4.1")
-
-#include <smmintrin.h>
-
-#define MOVNT 512
-
-__attribute__((noinline))
-static void copy_wc_page(void *dst, void *src)
-{
-	if (igt_x86_features() & SSE4_1) {
-		__m128i *S = (__m128i *)src;
-		__m128i *D = (__m128i *)dst;
-
-		for (int i = 0; i < PAGE_SIZE/CACHELINE; i++) {
-			__m128i tmp[4];
-
-			tmp[0] = _mm_stream_load_si128(S++);
-			tmp[1] = _mm_stream_load_si128(S++);
-			tmp[2] = _mm_stream_load_si128(S++);
-			tmp[3] = _mm_stream_load_si128(S++);
-
-			_mm_store_si128(D++, tmp[0]);
-			_mm_store_si128(D++, tmp[1]);
-			_mm_store_si128(D++, tmp[2]);
-			_mm_store_si128(D++, tmp[3]);
-		}
-	} else
-		memcpy(dst, src, PAGE_SIZE);
-}
-
-static void copy_wc_cacheline(void *dst, void *src)
-{
-	if (igt_x86_features() & SSE4_1) {
-		__m128i *S = (__m128i *)src;
-		__m128i *D = (__m128i *)dst;
-		__m128i tmp[4];
-
-		tmp[0] = _mm_stream_load_si128(S++);
-		tmp[1] = _mm_stream_load_si128(S++);
-		tmp[2] = _mm_stream_load_si128(S++);
-		tmp[3] = _mm_stream_load_si128(S++);
-
-		_mm_store_si128(D++, tmp[0]);
-		_mm_store_si128(D++, tmp[1]);
-		_mm_store_si128(D++, tmp[2]);
-		_mm_store_si128(D++, tmp[3]);
-	} else
-		memcpy(dst, src, CACHELINE);
-}
-
-#pragma GCC pop_options
-
-#else
-
 static void copy_wc_page(void *dst, const void *src)
 {
-	memcpy(dst, src, PAGE_SIZE);
+	igt_memcpy_from_wc(dst, src, PAGE_SIZE);
 }
 
 static void copy_wc_cacheline(void *dst, const void *src)
 {
-	memcpy(dst, src, CACHELINE);
+	igt_memcpy_from_wc(dst, src, CACHELINE);
 }
 
-#endif
-
 static void
 _bo_write_verify(struct test *t)
 {
diff --git a/tests/gem_mmap_gtt.c b/tests/gem_mmap_gtt.c
index 0f598125..6a332b25 100644
--- a/tests/gem_mmap_gtt.c
+++ b/tests/gem_mmap_gtt.c
@@ -529,45 +529,10 @@ test_huge_bo(int fd, int huge, int tiling)
 	munmap(linear_pattern, PAGE_SIZE);
 }
 
-#if defined(__x86_64__) && !defined(__clang__)
-#define MOVNT 512
-
-#pragma GCC push_options
-#pragma GCC target("sse4.1")
-
-#include <smmintrin.h>
-__attribute__((noinline))
-static void copy_wc_page(void *dst, void *src)
-{
-	if (igt_x86_features() & SSE4_1) {
-		__m128i *S = (__m128i *)src;
-		__m128i *D = (__m128i *)dst;
-
-		for (int i = 0; i < PAGE_SIZE/64; i++) {
-			__m128i tmp[4];
-
-			tmp[0] = _mm_stream_load_si128(S++);
-			tmp[1] = _mm_stream_load_si128(S++);
-			tmp[2] = _mm_stream_load_si128(S++);
-			tmp[3] = _mm_stream_load_si128(S++);
-
-			_mm_store_si128(D++, tmp[0]);
-			_mm_store_si128(D++, tmp[1]);
-			_mm_store_si128(D++, tmp[2]);
-			_mm_store_si128(D++, tmp[3]);
-		}
-	} else
-		memcpy(dst, src, PAGE_SIZE);
-}
-
-#pragma GCC pop_options
-
-#else
 static void copy_wc_page(void *dst, const void *src)
 {
-	memcpy(dst, src, PAGE_SIZE);
+	igt_memcpy_from_wc(dst, src, PAGE_SIZE);
 }
-#endif
 
 static unsigned int tile_row_size(int tiling, unsigned int stride)
 {
diff --git a/tests/gem_tiled_pread_pwrite.c b/tests/gem_tiled_pread_pwrite.c
index 7b5577fd..313daa38 100644
--- a/tests/gem_tiled_pread_pwrite.c
+++ b/tests/gem_tiled_pread_pwrite.c
@@ -100,45 +100,10 @@ create_bo(int fd)
 	return handle;
 }
 
-#if defined(__x86_64__) && !defined(__clang__)
-#define MOVNT 512
-
-#pragma GCC push_options
-#pragma GCC target("sse4.1")
-
-#include <smmintrin.h>
-__attribute__((noinline))
-static void copy_wc_page(void *dst, void *src)
-{
-	if (igt_x86_features() & SSE4_1) {
-		__m128i *S = (__m128i *)src;
-		__m128i *D = (__m128i *)dst;
-
-		for (int i = 0; i < PAGE_SIZE/64; i++) {
-			__m128i tmp[4];
-
-			tmp[0] = _mm_stream_load_si128(S++);
-			tmp[1] = _mm_stream_load_si128(S++);
-			tmp[2] = _mm_stream_load_si128(S++);
-			tmp[3] = _mm_stream_load_si128(S++);
-
-			_mm_store_si128(D++, tmp[0]);
-			_mm_store_si128(D++, tmp[1]);
-			_mm_store_si128(D++, tmp[2]);
-			_mm_store_si128(D++, tmp[3]);
-		}
-	} else
-		memcpy(dst, src, PAGE_SIZE);
-}
-
-#pragma GCC pop_options
-
-#else
 static void copy_wc_page(void *dst, const void *src)
 {
-	memcpy(dst, src, PAGE_SIZE);
+	igt_memcpy_from_wc(dst, src, PAGE_SIZE);
 }
-#endif
 
 igt_simple_main
 {
-- 
2.16.2

_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [igt-dev] ✓ Fi.CI.BAT: success for lib: Provide an accelerated routine for readback from WC (rev5)
  2018-02-27 21:50 [igt-dev] [PATCH igt] lib: Provide an accelerated routine for readback from WC Chris Wilson
                   ` (7 preceding siblings ...)
  2018-02-28  9:00 ` [igt-dev] [PATCH igt v6] lib: Provide an accelerated routine for readback from WC Chris Wilson
@ 2018-02-28  9:31 ` Patchwork
  2018-02-28 10:16 ` [igt-dev] ✓ Fi.CI.IGT: " Patchwork
  9 siblings, 0 replies; 13+ messages in thread
From: Patchwork @ 2018-02-28  9:31 UTC (permalink / raw)
  To: Chris Wilson; +Cc: igt-dev

== Series Details ==

Series: lib: Provide an accelerated routine for readback from WC (rev5)
URL   : https://patchwork.freedesktop.org/series/39070/
State : success

== Summary ==

IGT patchset tested on top of latest successful build
77411862d239916b40e218eeb6519b8f31fc7f1d meson: Get rid of lib_headers

with latest DRM-Tip kernel build CI_DRM_3844
5d22ee63b365 drm-tip: 2018y-02m-28d-00h-25m-02s UTC integration manifest

No testlist changes.

---- Known issues:

Test gem_exec_suspend:
        Subgroup basic-s3:
                dmesg-warn -> PASS       (fi-skl-6700k2) fdo#104108
Test kms_pipe_crc_basic:
        Subgroup suspend-read-crc-pipe-b:
                pass       -> INCOMPLETE (fi-snb-2520m) fdo#103713
Test prime_vgem:
        Subgroup basic-fence-flip:
                pass       -> FAIL       (fi-ivb-3770) fdo#104008

fdo#104108 https://bugs.freedesktop.org/show_bug.cgi?id=104108
fdo#103713 https://bugs.freedesktop.org/show_bug.cgi?id=103713
fdo#104008 https://bugs.freedesktop.org/show_bug.cgi?id=104008

fi-bdw-5557u     total:288  pass:267  dwarn:0   dfail:0   fail:0   skip:21  time:419s
fi-bdw-gvtdvm    total:288  pass:264  dwarn:0   dfail:0   fail:0   skip:24  time:428s
fi-blb-e6850     total:288  pass:223  dwarn:1   dfail:0   fail:0   skip:64  time:374s
fi-bsw-n3050     total:288  pass:242  dwarn:0   dfail:0   fail:0   skip:46  time:491s
fi-bwr-2160      total:288  pass:183  dwarn:0   dfail:0   fail:0   skip:105 time:278s
fi-bxt-j4205     total:288  pass:259  dwarn:0   dfail:0   fail:0   skip:29  time:480s
fi-byt-j1900     total:288  pass:253  dwarn:0   dfail:0   fail:0   skip:35  time:474s
fi-byt-n2820     total:288  pass:249  dwarn:0   dfail:0   fail:0   skip:39  time:461s
fi-cfl-8700k     total:288  pass:260  dwarn:0   dfail:0   fail:0   skip:28  time:390s
fi-cfl-s2        total:288  pass:262  dwarn:0   dfail:0   fail:0   skip:26  time:566s
fi-cnl-y3        total:288  pass:262  dwarn:0   dfail:0   fail:0   skip:26  time:573s
fi-elk-e7500     total:288  pass:229  dwarn:0   dfail:0   fail:0   skip:59  time:412s
fi-gdg-551       total:288  pass:179  dwarn:0   dfail:0   fail:1   skip:108 time:293s
fi-glk-1         total:288  pass:260  dwarn:0   dfail:0   fail:0   skip:28  time:511s
fi-hsw-4770      total:288  pass:261  dwarn:0   dfail:0   fail:0   skip:27  time:391s
fi-ilk-650       total:288  pass:228  dwarn:0   dfail:0   fail:0   skip:60  time:416s
fi-ivb-3520m     total:288  pass:259  dwarn:0   dfail:0   fail:0   skip:29  time:458s
fi-ivb-3770      total:288  pass:254  dwarn:0   dfail:0   fail:1   skip:33  time:411s
fi-kbl-7500u     total:288  pass:263  dwarn:1   dfail:0   fail:0   skip:24  time:452s
fi-kbl-7560u     total:288  pass:269  dwarn:0   dfail:0   fail:0   skip:19  time:488s
fi-kbl-7567u     total:288  pass:268  dwarn:0   dfail:0   fail:0   skip:20  time:448s
fi-kbl-r         total:288  pass:261  dwarn:0   dfail:0   fail:0   skip:27  time:492s
fi-pnv-d510      total:288  pass:222  dwarn:1   dfail:0   fail:0   skip:65  time:589s
fi-skl-6260u     total:288  pass:268  dwarn:0   dfail:0   fail:0   skip:20  time:433s
fi-skl-6600u     total:288  pass:261  dwarn:0   dfail:0   fail:0   skip:27  time:503s
fi-skl-6700hq    total:288  pass:262  dwarn:0   dfail:0   fail:0   skip:26  time:520s
fi-skl-6700k2    total:288  pass:264  dwarn:0   dfail:0   fail:0   skip:24  time:483s
fi-skl-6770hq    total:288  pass:268  dwarn:0   dfail:0   fail:0   skip:20  time:473s
fi-skl-guc       total:288  pass:260  dwarn:0   dfail:0   fail:0   skip:28  time:408s
fi-skl-gvtdvm    total:288  pass:265  dwarn:0   dfail:0   fail:0   skip:23  time:439s
fi-snb-2520m     total:245  pass:211  dwarn:0   dfail:0   fail:0   skip:33 
fi-snb-2600      total:288  pass:248  dwarn:0   dfail:0   fail:0   skip:40  time:396s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1017/issues.html
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [igt-dev] ✓ Fi.CI.IGT: success for lib: Provide an accelerated routine for readback from WC (rev5)
  2018-02-27 21:50 [igt-dev] [PATCH igt] lib: Provide an accelerated routine for readback from WC Chris Wilson
                   ` (8 preceding siblings ...)
  2018-02-28  9:31 ` [igt-dev] ✓ Fi.CI.BAT: success for lib: Provide an accelerated routine for readback from WC (rev5) Patchwork
@ 2018-02-28 10:16 ` Patchwork
  9 siblings, 0 replies; 13+ messages in thread
From: Patchwork @ 2018-02-28 10:16 UTC (permalink / raw)
  To: Chris Wilson; +Cc: igt-dev

== Series Details ==

Series: lib: Provide an accelerated routine for readback from WC (rev5)
URL   : https://patchwork.freedesktop.org/series/39070/
State : success

== Summary ==

---- Known issues:

Test drv_suspend:
        Subgroup debugfs-reader:
                pass       -> SKIP       (shard-snb) fdo#102365
Test gem_eio:
        Subgroup in-flight-external:
                incomplete -> PASS       (shard-apl) fdo#104945
Test kms_chv_cursor_fail:
        Subgroup pipe-b-128x128-top-edge:
                dmesg-warn -> PASS       (shard-snb) fdo#105185
Test kms_flip:
        Subgroup 2x-plain-flip-fb-recreate-interruptible:
                pass       -> FAIL       (shard-hsw) fdo#100368
        Subgroup modeset-vs-vblank-race-interruptible:
                pass       -> FAIL       (shard-hsw) fdo#103060
Test kms_mmap_write_crc:
                fail       -> PASS       (shard-apl) fdo#103286
Test kms_setmode:
        Subgroup basic:
                pass       -> FAIL       (shard-hsw) fdo#99912
Test kms_vblank:
        Subgroup pipe-b-ts-continuation-dpms-suspend:
                incomplete -> PASS       (shard-hsw) fdo#105054
Test pm_rpm:
        Subgroup system-suspend:
                incomplete -> PASS       (shard-hsw) fdo#103375

fdo#102365 https://bugs.freedesktop.org/show_bug.cgi?id=102365
fdo#104945 https://bugs.freedesktop.org/show_bug.cgi?id=104945
fdo#105185 https://bugs.freedesktop.org/show_bug.cgi?id=105185
fdo#100368 https://bugs.freedesktop.org/show_bug.cgi?id=100368
fdo#103060 https://bugs.freedesktop.org/show_bug.cgi?id=103060
fdo#103286 https://bugs.freedesktop.org/show_bug.cgi?id=103286
fdo#99912 https://bugs.freedesktop.org/show_bug.cgi?id=99912
fdo#105054 https://bugs.freedesktop.org/show_bug.cgi?id=105054
fdo#103375 https://bugs.freedesktop.org/show_bug.cgi?id=103375

shard-apl        total:3460 pass:1820 dwarn:1   dfail:0   fail:7   skip:1632 time:12065s
shard-hsw        total:3460 pass:1765 dwarn:1   dfail:0   fail:3   skip:1690 time:11668s
shard-snb        total:3460 pass:1358 dwarn:1   dfail:0   fail:1   skip:2100 time:6549s
Blacklisted hosts:
shard-kbl        total:3446 pass:1934 dwarn:1   dfail:0   fail:7   skip:1503 time:9219s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1017/shards.html
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [igt-dev] [PATCH igt v6] lib: Provide an accelerated routine for readback from WC
  2018-02-28  9:00 ` [igt-dev] [PATCH igt v6] lib: Provide an accelerated routine for readback from WC Chris Wilson
@ 2018-02-28 17:12   ` Ville Syrjälä
  2018-03-01  8:43     ` Chris Wilson
  0 siblings, 1 reply; 13+ messages in thread
From: Ville Syrjälä @ 2018-02-28 17:12 UTC (permalink / raw)
  To: Chris Wilson; +Cc: igt-dev

On Wed, Feb 28, 2018 at 09:00:16AM +0000, Chris Wilson wrote:
> Reading from WC is awfully slow as each access is uncached and so
> performed synchronously, stalling for the memory load. x86 did introduce
> some new instructions in SSE 4.1 to provide a small internal buffer to
> accelerate reading back a cacheline at a time from uncached memory, for
> this purpose.
> 
> v2: Don't be lazy and handle misalignment.
> v3: Switch out of sse41 before emitting the generic memcpy routine
> v4: Replace opencoded memcpy_from_wc
> v5: Always flush the internal buffer before use (Eric)
> v6: Assume bulk moves, so check for dst alignment.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Eric Anholt <eric@anholt.net>
> ---
>  lib/igt_fb.c                   |   3 +-
>  lib/igt_x86.c                  | 124 +++++++++++++++++++++++++++++++++++++++++
>  lib/igt_x86.h                  |   2 +
>  tests/gem_fence_thrash.c       |  63 +--------------------
>  tests/gem_mmap_gtt.c           |  37 +-----------
>  tests/gem_tiled_pread_pwrite.c |  37 +-----------
>  6 files changed, 132 insertions(+), 134 deletions(-)
> 
> diff --git a/lib/igt_fb.c b/lib/igt_fb.c
> index ecd73053..7404ba7c 100644
> --- a/lib/igt_fb.c
> +++ b/lib/igt_fb.c
> @@ -32,6 +32,7 @@
>  #include "drmtest.h"
>  #include "igt_fb.h"
>  #include "igt_kms.h"
> +#include "igt_x86.h"
>  #include "ioctl_wrappers.h"
>  #include "intel_chipset.h"
>  
> @@ -1340,7 +1341,7 @@ static void convert_nv12_to_rgb24(struct igt_fb *fb, struct fb_convert_blit_uplo
>  	 * it's faster to copy the whole BO to a temporary buffer and convert
>  	 * from there.
>  	 */
> -	memcpy(buf, blit->linear.map, blit->linear.size);
> +	igt_memcpy_from_wc(buf, blit->linear.map, blit->linear.size);
>  	y = &buf[blit->linear.offsets[0]];
>  	uv = &buf[blit->linear.offsets[1]];
>  
> diff --git a/lib/igt_x86.c b/lib/igt_x86.c
> index 0ed3c6f1..54539456 100644
> --- a/lib/igt_x86.c
> +++ b/lib/igt_x86.c
> @@ -36,7 +36,10 @@
>  #endif
>  
>  #include "igt_x86.h"
> +
> +#include <stdint.h>
>  #include <stdio.h>
> +#include <string.h>
>  
>  /**
>   * SECTION:igt_x86
> @@ -174,3 +177,124 @@ char *igt_x86_features_to_string(unsigned features, char *line)
>  	return ret;
>  }
>  #endif
> +
> +#if defined(__x86_64__) && !defined(__clang__)
> +#define MOVNT 512

What's this MOVNT define?

> +
> +#pragma GCC push_options
> +#pragma GCC target("sse4.1")
> +#pragma GCC diagnostic ignored "-Wpointer-arith"
> +
> +#define min(x, y) ({                            \
> +	typeof(x) _min1 = (x);                  \
> +	typeof(y) _min2 = (y);                  \
> +	(void) (&_min1 == &_min2);              \
> +	_min1 < _min2 ? _min1 : _min2;		\
> +})

igt_aux.h has this already I believe.

> +
> +#include <smmintrin.h>
> +static void memcpy_from_wc_sse41(void *dst, const void *src, unsigned long len)
> +{
> +	char buf[16];
> +
> +	/* Flush the internal buffer of potential stale gfx data */
> +	__builtin_ia32_mfence();

Isn't there a _mm_mfence()?

Apart from those everything looks all right to me.
Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com>

> +
> +	if ((uintptr_t)src & 15) {
> +		__m128i *S = (__m128i *)((uintptr_t)src & ~15);
> +		unsigned long misalign = (uintptr_t)src & 15;
> +		unsigned long copy = min(len, 16 - misalign);
> +
> +		_mm_storeu_si128((__m128i *)buf,
> +				 _mm_stream_load_si128(S));
> +
> +		memcpy(dst, buf + misalign, copy);
> +
> +		dst += copy;
> +		src += copy;
> +		len -= copy;
> +	}
> +
> +	/* We assume we are doing bulk transfers, so prefer aligned moves */
> +	if (((uintptr_t)dst & 15) == 0) {
> +		while (len >= 64) {
> +			__m128i *S = (__m128i *)src;
> +			__m128i *D = (__m128i *)dst;
> +			__m128i tmp[4];
> +
> +			tmp[0] = _mm_stream_load_si128(S + 0);
> +			tmp[1] = _mm_stream_load_si128(S + 1);
> +			tmp[2] = _mm_stream_load_si128(S + 2);
> +			tmp[3] = _mm_stream_load_si128(S + 3);
> +
> +			_mm_store_si128(D + 0, tmp[0]);
> +			_mm_store_si128(D + 1, tmp[1]);
> +			_mm_store_si128(D + 2, tmp[2]);
> +			_mm_store_si128(D + 3, tmp[3]);
> +
> +			src += 64;
> +			dst += 64;
> +			len -= 64;
> +		}
> +	} else {
> +		while (len >= 64) {
> +			__m128i *S = (__m128i *)src;
> +			__m128i *D = (__m128i *)dst;
> +			__m128i tmp[4];
> +
> +			tmp[0] = _mm_stream_load_si128(S + 0);
> +			tmp[1] = _mm_stream_load_si128(S + 1);
> +			tmp[2] = _mm_stream_load_si128(S + 2);
> +			tmp[3] = _mm_stream_load_si128(S + 3);
> +
> +			_mm_storeu_si128(D + 0, tmp[0]);
> +			_mm_storeu_si128(D + 1, tmp[1]);
> +			_mm_storeu_si128(D + 2, tmp[2]);
> +			_mm_storeu_si128(D + 3, tmp[3]);
> +
> +			src += 64;
> +			dst += 64;
> +			len -= 64;
> +		}
> +	}
> +
> +	while (len >= 16) {
> +		_mm_storeu_si128((__m128i *)dst,
> +				 _mm_stream_load_si128((__m128i *)src));
> +
> +		src += 16;
> +		dst += 16;
> +		len -= 16;
> +	}
> +
> +	if (len) {
> +		_mm_storeu_si128((__m128i *)buf,
> +				 _mm_stream_load_si128((__m128i *)src));
> +		memcpy(dst, buf, len);
> +	}
> +}
> +
> +#pragma GCC pop_options
> +
> +static void memcpy_from_wc(void *dst, const void *src, unsigned long len)
> +{
> +	memcpy(dst, src, len);
> +}
> +
> +static void (*resolve_memcpy_from_wc(void))(void *, const void *, unsigned long)
> +{
> +	if (igt_x86_features() & SSE4_1)
> +		return memcpy_from_wc_sse41;
> +
> +	return memcpy_from_wc;
> +}
> +
> +void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len)
> +	__attribute__((ifunc("resolve_memcpy_from_wc")));
> +
> +#else
> +void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len)
> +{
> +	memcpy(dst, src, len);
> +}
> +#endif
> diff --git a/lib/igt_x86.h b/lib/igt_x86.h
> index 27b7f0fd..d4f8c343 100644
> --- a/lib/igt_x86.h
> +++ b/lib/igt_x86.h
> @@ -55,4 +55,6 @@ static inline char *igt_x86_features_to_string(unsigned features, char *line)
>  }
>  #endif
>  
> +void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len);
> +
>  #endif /* IGT_X86_H */
> diff --git a/tests/gem_fence_thrash.c b/tests/gem_fence_thrash.c
> index c8ff961d..2d7fb2ff 100644
> --- a/tests/gem_fence_thrash.c
> +++ b/tests/gem_fence_thrash.c
> @@ -107,75 +107,16 @@ bo_copy (void *_arg)
>  	return NULL;
>  }
>  
> -#if defined(__x86_64__) && !defined(__clang__)
> -
> -#pragma GCC push_options
> -#pragma GCC target("sse4.1")
> -
> -#include <smmintrin.h>
> -
> -#define MOVNT 512
> -
> -__attribute__((noinline))
> -static void copy_wc_page(void *dst, void *src)
> -{
> -	if (igt_x86_features() & SSE4_1) {
> -		__m128i *S = (__m128i *)src;
> -		__m128i *D = (__m128i *)dst;
> -
> -		for (int i = 0; i < PAGE_SIZE/CACHELINE; i++) {
> -			__m128i tmp[4];
> -
> -			tmp[0] = _mm_stream_load_si128(S++);
> -			tmp[1] = _mm_stream_load_si128(S++);
> -			tmp[2] = _mm_stream_load_si128(S++);
> -			tmp[3] = _mm_stream_load_si128(S++);
> -
> -			_mm_store_si128(D++, tmp[0]);
> -			_mm_store_si128(D++, tmp[1]);
> -			_mm_store_si128(D++, tmp[2]);
> -			_mm_store_si128(D++, tmp[3]);
> -		}
> -	} else
> -		memcpy(dst, src, PAGE_SIZE);
> -}
> -
> -static void copy_wc_cacheline(void *dst, void *src)
> -{
> -	if (igt_x86_features() & SSE4_1) {
> -		__m128i *S = (__m128i *)src;
> -		__m128i *D = (__m128i *)dst;
> -		__m128i tmp[4];
> -
> -		tmp[0] = _mm_stream_load_si128(S++);
> -		tmp[1] = _mm_stream_load_si128(S++);
> -		tmp[2] = _mm_stream_load_si128(S++);
> -		tmp[3] = _mm_stream_load_si128(S++);
> -
> -		_mm_store_si128(D++, tmp[0]);
> -		_mm_store_si128(D++, tmp[1]);
> -		_mm_store_si128(D++, tmp[2]);
> -		_mm_store_si128(D++, tmp[3]);
> -	} else
> -		memcpy(dst, src, CACHELINE);
> -}
> -
> -#pragma GCC pop_options
> -
> -#else
> -
>  static void copy_wc_page(void *dst, const void *src)
>  {
> -	memcpy(dst, src, PAGE_SIZE);
> +	igt_memcpy_from_wc(dst, src, PAGE_SIZE);
>  }
>  
>  static void copy_wc_cacheline(void *dst, const void *src)
>  {
> -	memcpy(dst, src, CACHELINE);
> +	igt_memcpy_from_wc(dst, src, CACHELINE);
>  }
>  
> -#endif
> -
>  static void
>  _bo_write_verify(struct test *t)
>  {
> diff --git a/tests/gem_mmap_gtt.c b/tests/gem_mmap_gtt.c
> index 0f598125..6a332b25 100644
> --- a/tests/gem_mmap_gtt.c
> +++ b/tests/gem_mmap_gtt.c
> @@ -529,45 +529,10 @@ test_huge_bo(int fd, int huge, int tiling)
>  	munmap(linear_pattern, PAGE_SIZE);
>  }
>  
> -#if defined(__x86_64__) && !defined(__clang__)
> -#define MOVNT 512
> -
> -#pragma GCC push_options
> -#pragma GCC target("sse4.1")
> -
> -#include <smmintrin.h>
> -__attribute__((noinline))
> -static void copy_wc_page(void *dst, void *src)
> -{
> -	if (igt_x86_features() & SSE4_1) {
> -		__m128i *S = (__m128i *)src;
> -		__m128i *D = (__m128i *)dst;
> -
> -		for (int i = 0; i < PAGE_SIZE/64; i++) {
> -			__m128i tmp[4];
> -
> -			tmp[0] = _mm_stream_load_si128(S++);
> -			tmp[1] = _mm_stream_load_si128(S++);
> -			tmp[2] = _mm_stream_load_si128(S++);
> -			tmp[3] = _mm_stream_load_si128(S++);
> -
> -			_mm_store_si128(D++, tmp[0]);
> -			_mm_store_si128(D++, tmp[1]);
> -			_mm_store_si128(D++, tmp[2]);
> -			_mm_store_si128(D++, tmp[3]);
> -		}
> -	} else
> -		memcpy(dst, src, PAGE_SIZE);
> -}
> -
> -#pragma GCC pop_options
> -
> -#else
>  static void copy_wc_page(void *dst, const void *src)
>  {
> -	memcpy(dst, src, PAGE_SIZE);
> +	igt_memcpy_from_wc(dst, src, PAGE_SIZE);
>  }
> -#endif
>  
>  static unsigned int tile_row_size(int tiling, unsigned int stride)
>  {
> diff --git a/tests/gem_tiled_pread_pwrite.c b/tests/gem_tiled_pread_pwrite.c
> index 7b5577fd..313daa38 100644
> --- a/tests/gem_tiled_pread_pwrite.c
> +++ b/tests/gem_tiled_pread_pwrite.c
> @@ -100,45 +100,10 @@ create_bo(int fd)
>  	return handle;
>  }
>  
> -#if defined(__x86_64__) && !defined(__clang__)
> -#define MOVNT 512
> -
> -#pragma GCC push_options
> -#pragma GCC target("sse4.1")
> -
> -#include <smmintrin.h>
> -__attribute__((noinline))
> -static void copy_wc_page(void *dst, void *src)
> -{
> -	if (igt_x86_features() & SSE4_1) {
> -		__m128i *S = (__m128i *)src;
> -		__m128i *D = (__m128i *)dst;
> -
> -		for (int i = 0; i < PAGE_SIZE/64; i++) {
> -			__m128i tmp[4];
> -
> -			tmp[0] = _mm_stream_load_si128(S++);
> -			tmp[1] = _mm_stream_load_si128(S++);
> -			tmp[2] = _mm_stream_load_si128(S++);
> -			tmp[3] = _mm_stream_load_si128(S++);
> -
> -			_mm_store_si128(D++, tmp[0]);
> -			_mm_store_si128(D++, tmp[1]);
> -			_mm_store_si128(D++, tmp[2]);
> -			_mm_store_si128(D++, tmp[3]);
> -		}
> -	} else
> -		memcpy(dst, src, PAGE_SIZE);
> -}
> -
> -#pragma GCC pop_options
> -
> -#else
>  static void copy_wc_page(void *dst, const void *src)
>  {
> -	memcpy(dst, src, PAGE_SIZE);
> +	igt_memcpy_from_wc(dst, src, PAGE_SIZE);
>  }
> -#endif
>  
>  igt_simple_main
>  {
> -- 
> 2.16.2
> 
> _______________________________________________
> igt-dev mailing list
> igt-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/igt-dev

-- 
Ville Syrjälä
Intel OTC
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [igt-dev] [PATCH igt v6] lib: Provide an accelerated routine for readback from WC
  2018-02-28 17:12   ` Ville Syrjälä
@ 2018-03-01  8:43     ` Chris Wilson
  0 siblings, 0 replies; 13+ messages in thread
From: Chris Wilson @ 2018-03-01  8:43 UTC (permalink / raw)
  To: Ville Syrjälä; +Cc: igt-dev

Quoting Ville Syrjälä (2018-02-28 17:12:44)
> On Wed, Feb 28, 2018 at 09:00:16AM +0000, Chris Wilson wrote:
> > +#if defined(__x86_64__) && !defined(__clang__)
> > +#define MOVNT 512
> 
> What's this MOVNT define?

I can't remember. I presume I was thinking about hooking it up to a
detection bit. But 512? Definitely copied that from somewhere.

> > +#pragma GCC push_options
> > +#pragma GCC target("sse4.1")
> > +#pragma GCC diagnostic ignored "-Wpointer-arith"
> > +
> > +#define min(x, y) ({                            \
> > +     typeof(x) _min1 = (x);                  \
> > +     typeof(y) _min2 = (y);                  \
> > +     (void) (&_min1 == &_min2);              \
> > +     _min1 < _min2 ? _min1 : _min2;          \
> > +})
> 
> igt_aux.h has this already I believe.

Missed it, thanks. I just wasn't expecting it or MIN to be undefined.

> > +#include <smmintrin.h>
> > +static void memcpy_from_wc_sse41(void *dst, const void *src, unsigned long len)
> > +{
> > +     char buf[16];
> > +
> > +     /* Flush the internal buffer of potential stale gfx data */
> > +     __builtin_ia32_mfence();
> 
> Isn't there a _mm_mfence()?

If you remember there's only one '_', yes, apparently there is.

> Apart from those everything looks all right to me.
> Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com>

Thanks,
-Chris
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2018-03-01  8:44 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-02-27 21:50 [igt-dev] [PATCH igt] lib: Provide an accelerated routine for readback from WC Chris Wilson
2018-02-27 21:53 ` Chris Wilson
2018-02-27 22:17 ` [igt-dev] [PATCH igt v2] " Chris Wilson
2018-02-27 22:20 ` [igt-dev] [PATCH igt v3] " Chris Wilson
2018-02-27 22:42 ` [igt-dev] [PATCH igt v4] " Chris Wilson
2018-02-27 23:29 ` [igt-dev] [PATCH igt] " Eric Anholt
2018-02-27 23:44 ` [igt-dev] ✓ Fi.CI.BAT: success for lib: Provide an accelerated routine for readback from WC (rev4) Patchwork
2018-02-28  1:04 ` [igt-dev] ✗ Fi.CI.IGT: failure " Patchwork
2018-02-28  9:00 ` [igt-dev] [PATCH igt v6] lib: Provide an accelerated routine for readback from WC Chris Wilson
2018-02-28 17:12   ` Ville Syrjälä
2018-03-01  8:43     ` Chris Wilson
2018-02-28  9:31 ` [igt-dev] ✓ Fi.CI.BAT: success for lib: Provide an accelerated routine for readback from WC (rev5) Patchwork
2018-02-28 10:16 ` [igt-dev] ✓ Fi.CI.IGT: " Patchwork

This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.