All of lore.kernel.org
 help / color / mirror / Atom feed
From: Chris Wilson <chris@chris-wilson.co.uk>
To: igt-dev@lists.freedesktop.org
Subject: [igt-dev] [PATCH igt v6] lib: Provide an accelerated routine for readback from WC
Date: Wed, 28 Feb 2018 09:00:16 +0000	[thread overview]
Message-ID: <20180228090016.4589-1-chris@chris-wilson.co.uk> (raw)
In-Reply-To: <20180227215040.11297-1-chris@chris-wilson.co.uk>

Reading from write-combining (WC) memory is awfully slow, as each access
is uncached and so performed synchronously, stalling for the memory
load. To address this, SSE4.1 on x86 introduced a streaming load
(MOVNTDQA, i.e. _mm_stream_load_si128) that uses a small internal buffer
to accelerate reading back a cacheline at a time from uncached memory.

v2: Don't be lazy and handle misalignment.
v3: Switch out of sse41 before emitting the generic memcpy routine
v4: Replace opencoded memcpy_from_wc
v5: Always flush the internal buffer before use (Eric)
v6: Assume bulk moves, so check for dst alignment.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Eric Anholt <eric@anholt.net>
---
 lib/igt_fb.c                   |   3 +-
 lib/igt_x86.c                  | 124 +++++++++++++++++++++++++++++++++++++++++
 lib/igt_x86.h                  |   2 +
 tests/gem_fence_thrash.c       |  63 +--------------------
 tests/gem_mmap_gtt.c           |  37 +-----------
 tests/gem_tiled_pread_pwrite.c |  37 +-----------
 6 files changed, 132 insertions(+), 134 deletions(-)

diff --git a/lib/igt_fb.c b/lib/igt_fb.c
index ecd73053..7404ba7c 100644
--- a/lib/igt_fb.c
+++ b/lib/igt_fb.c
@@ -32,6 +32,7 @@
 #include "drmtest.h"
 #include "igt_fb.h"
 #include "igt_kms.h"
+#include "igt_x86.h"
 #include "ioctl_wrappers.h"
 #include "intel_chipset.h"
 
@@ -1340,7 +1341,7 @@ static void convert_nv12_to_rgb24(struct igt_fb *fb, struct fb_convert_blit_uplo
 	 * it's faster to copy the whole BO to a temporary buffer and convert
 	 * from there.
 	 */
-	memcpy(buf, blit->linear.map, blit->linear.size);
+	igt_memcpy_from_wc(buf, blit->linear.map, blit->linear.size);
 	y = &buf[blit->linear.offsets[0]];
 	uv = &buf[blit->linear.offsets[1]];
 
diff --git a/lib/igt_x86.c b/lib/igt_x86.c
index 0ed3c6f1..54539456 100644
--- a/lib/igt_x86.c
+++ b/lib/igt_x86.c
@@ -36,7 +36,10 @@
 #endif
 
 #include "igt_x86.h"
+
+#include <stdint.h>
 #include <stdio.h>
+#include <string.h>
 
 /**
  * SECTION:igt_x86
@@ -174,3 +177,124 @@ char *igt_x86_features_to_string(unsigned features, char *line)
 	return ret;
 }
 #endif
+
+#if defined(__x86_64__) && !defined(__clang__)
+#define MOVNT 512
+
+#pragma GCC push_options
+#pragma GCC target("sse4.1")
+#pragma GCC diagnostic ignored "-Wpointer-arith"
+
+#define min(x, y) ({                            \
+	typeof(x) _min1 = (x);                  \
+	typeof(y) _min2 = (y);                  \
+	(void) (&_min1 == &_min2);              \
+	_min1 < _min2 ? _min1 : _min2;		\
+})
+
+#include <smmintrin.h>
+static void memcpy_from_wc_sse41(void *dst, const void *src, unsigned long len)
+{
+	char buf[16];
+
+	/* Flush the internal buffer of potential stale gfx data */
+	__builtin_ia32_mfence();
+
+	if ((uintptr_t)src & 15) {
+		__m128i *S = (__m128i *)((uintptr_t)src & ~15);
+		unsigned long misalign = (uintptr_t)src & 15;
+		unsigned long copy = min(len, 16 - misalign);
+
+		_mm_storeu_si128((__m128i *)buf,
+				 _mm_stream_load_si128(S));
+
+		memcpy(dst, buf + misalign, copy);
+
+		dst += copy;
+		src += copy;
+		len -= copy;
+	}
+
+	/* We assume we are doing bulk transfers, so prefer aligned moves */
+	if (((uintptr_t)dst & 15) == 0) {
+		while (len >= 64) {
+			__m128i *S = (__m128i *)src;
+			__m128i *D = (__m128i *)dst;
+			__m128i tmp[4];
+
+			tmp[0] = _mm_stream_load_si128(S + 0);
+			tmp[1] = _mm_stream_load_si128(S + 1);
+			tmp[2] = _mm_stream_load_si128(S + 2);
+			tmp[3] = _mm_stream_load_si128(S + 3);
+
+			_mm_store_si128(D + 0, tmp[0]);
+			_mm_store_si128(D + 1, tmp[1]);
+			_mm_store_si128(D + 2, tmp[2]);
+			_mm_store_si128(D + 3, tmp[3]);
+
+			src += 64;
+			dst += 64;
+			len -= 64;
+		}
+	} else {
+		while (len >= 64) {
+			__m128i *S = (__m128i *)src;
+			__m128i *D = (__m128i *)dst;
+			__m128i tmp[4];
+
+			tmp[0] = _mm_stream_load_si128(S + 0);
+			tmp[1] = _mm_stream_load_si128(S + 1);
+			tmp[2] = _mm_stream_load_si128(S + 2);
+			tmp[3] = _mm_stream_load_si128(S + 3);
+
+			_mm_storeu_si128(D + 0, tmp[0]);
+			_mm_storeu_si128(D + 1, tmp[1]);
+			_mm_storeu_si128(D + 2, tmp[2]);
+			_mm_storeu_si128(D + 3, tmp[3]);
+
+			src += 64;
+			dst += 64;
+			len -= 64;
+		}
+	}
+
+	while (len >= 16) {
+		_mm_storeu_si128((__m128i *)dst,
+				 _mm_stream_load_si128((__m128i *)src));
+
+		src += 16;
+		dst += 16;
+		len -= 16;
+	}
+
+	if (len) {
+		_mm_storeu_si128((__m128i *)buf,
+				 _mm_stream_load_si128((__m128i *)src));
+		memcpy(dst, buf, len);
+	}
+}
+
+#pragma GCC pop_options
+
+static void memcpy_from_wc(void *dst, const void *src, unsigned long len)
+{
+	memcpy(dst, src, len);
+}
+
+static void (*resolve_memcpy_from_wc(void))(void *, const void *, unsigned long)
+{
+	if (igt_x86_features() & SSE4_1)
+		return memcpy_from_wc_sse41;
+
+	return memcpy_from_wc;
+}
+
+void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len)
+	__attribute__((ifunc("resolve_memcpy_from_wc")));
+
+#else
+void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len)
+{
+	memcpy(dst, src, len);
+}
+#endif
diff --git a/lib/igt_x86.h b/lib/igt_x86.h
index 27b7f0fd..d4f8c343 100644
--- a/lib/igt_x86.h
+++ b/lib/igt_x86.h
@@ -55,4 +55,6 @@ static inline char *igt_x86_features_to_string(unsigned features, char *line)
 }
 #endif
 
+void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len);
+
 #endif /* IGT_X86_H */
diff --git a/tests/gem_fence_thrash.c b/tests/gem_fence_thrash.c
index c8ff961d..2d7fb2ff 100644
--- a/tests/gem_fence_thrash.c
+++ b/tests/gem_fence_thrash.c
@@ -107,75 +107,16 @@ bo_copy (void *_arg)
 	return NULL;
 }
 
-#if defined(__x86_64__) && !defined(__clang__)
-
-#pragma GCC push_options
-#pragma GCC target("sse4.1")
-
-#include <smmintrin.h>
-
-#define MOVNT 512
-
-__attribute__((noinline))
-static void copy_wc_page(void *dst, void *src)
-{
-	if (igt_x86_features() & SSE4_1) {
-		__m128i *S = (__m128i *)src;
-		__m128i *D = (__m128i *)dst;
-
-		for (int i = 0; i < PAGE_SIZE/CACHELINE; i++) {
-			__m128i tmp[4];
-
-			tmp[0] = _mm_stream_load_si128(S++);
-			tmp[1] = _mm_stream_load_si128(S++);
-			tmp[2] = _mm_stream_load_si128(S++);
-			tmp[3] = _mm_stream_load_si128(S++);
-
-			_mm_store_si128(D++, tmp[0]);
-			_mm_store_si128(D++, tmp[1]);
-			_mm_store_si128(D++, tmp[2]);
-			_mm_store_si128(D++, tmp[3]);
-		}
-	} else
-		memcpy(dst, src, PAGE_SIZE);
-}
-
-static void copy_wc_cacheline(void *dst, void *src)
-{
-	if (igt_x86_features() & SSE4_1) {
-		__m128i *S = (__m128i *)src;
-		__m128i *D = (__m128i *)dst;
-		__m128i tmp[4];
-
-		tmp[0] = _mm_stream_load_si128(S++);
-		tmp[1] = _mm_stream_load_si128(S++);
-		tmp[2] = _mm_stream_load_si128(S++);
-		tmp[3] = _mm_stream_load_si128(S++);
-
-		_mm_store_si128(D++, tmp[0]);
-		_mm_store_si128(D++, tmp[1]);
-		_mm_store_si128(D++, tmp[2]);
-		_mm_store_si128(D++, tmp[3]);
-	} else
-		memcpy(dst, src, CACHELINE);
-}
-
-#pragma GCC pop_options
-
-#else
-
 static void copy_wc_page(void *dst, const void *src)
 {
-	memcpy(dst, src, PAGE_SIZE);
+	igt_memcpy_from_wc(dst, src, PAGE_SIZE);
 }
 
 static void copy_wc_cacheline(void *dst, const void *src)
 {
-	memcpy(dst, src, CACHELINE);
+	igt_memcpy_from_wc(dst, src, CACHELINE);
 }
 
-#endif
-
 static void
 _bo_write_verify(struct test *t)
 {
diff --git a/tests/gem_mmap_gtt.c b/tests/gem_mmap_gtt.c
index 0f598125..6a332b25 100644
--- a/tests/gem_mmap_gtt.c
+++ b/tests/gem_mmap_gtt.c
@@ -529,45 +529,10 @@ test_huge_bo(int fd, int huge, int tiling)
 	munmap(linear_pattern, PAGE_SIZE);
 }
 
-#if defined(__x86_64__) && !defined(__clang__)
-#define MOVNT 512
-
-#pragma GCC push_options
-#pragma GCC target("sse4.1")
-
-#include <smmintrin.h>
-__attribute__((noinline))
-static void copy_wc_page(void *dst, void *src)
-{
-	if (igt_x86_features() & SSE4_1) {
-		__m128i *S = (__m128i *)src;
-		__m128i *D = (__m128i *)dst;
-
-		for (int i = 0; i < PAGE_SIZE/64; i++) {
-			__m128i tmp[4];
-
-			tmp[0] = _mm_stream_load_si128(S++);
-			tmp[1] = _mm_stream_load_si128(S++);
-			tmp[2] = _mm_stream_load_si128(S++);
-			tmp[3] = _mm_stream_load_si128(S++);
-
-			_mm_store_si128(D++, tmp[0]);
-			_mm_store_si128(D++, tmp[1]);
-			_mm_store_si128(D++, tmp[2]);
-			_mm_store_si128(D++, tmp[3]);
-		}
-	} else
-		memcpy(dst, src, PAGE_SIZE);
-}
-
-#pragma GCC pop_options
-
-#else
 static void copy_wc_page(void *dst, const void *src)
 {
-	memcpy(dst, src, PAGE_SIZE);
+	igt_memcpy_from_wc(dst, src, PAGE_SIZE);
 }
-#endif
 
 static unsigned int tile_row_size(int tiling, unsigned int stride)
 {
diff --git a/tests/gem_tiled_pread_pwrite.c b/tests/gem_tiled_pread_pwrite.c
index 7b5577fd..313daa38 100644
--- a/tests/gem_tiled_pread_pwrite.c
+++ b/tests/gem_tiled_pread_pwrite.c
@@ -100,45 +100,10 @@ create_bo(int fd)
 	return handle;
 }
 
-#if defined(__x86_64__) && !defined(__clang__)
-#define MOVNT 512
-
-#pragma GCC push_options
-#pragma GCC target("sse4.1")
-
-#include <smmintrin.h>
-__attribute__((noinline))
-static void copy_wc_page(void *dst, void *src)
-{
-	if (igt_x86_features() & SSE4_1) {
-		__m128i *S = (__m128i *)src;
-		__m128i *D = (__m128i *)dst;
-
-		for (int i = 0; i < PAGE_SIZE/64; i++) {
-			__m128i tmp[4];
-
-			tmp[0] = _mm_stream_load_si128(S++);
-			tmp[1] = _mm_stream_load_si128(S++);
-			tmp[2] = _mm_stream_load_si128(S++);
-			tmp[3] = _mm_stream_load_si128(S++);
-
-			_mm_store_si128(D++, tmp[0]);
-			_mm_store_si128(D++, tmp[1]);
-			_mm_store_si128(D++, tmp[2]);
-			_mm_store_si128(D++, tmp[3]);
-		}
-	} else
-		memcpy(dst, src, PAGE_SIZE);
-}
-
-#pragma GCC pop_options
-
-#else
 static void copy_wc_page(void *dst, const void *src)
 {
-	memcpy(dst, src, PAGE_SIZE);
+	igt_memcpy_from_wc(dst, src, PAGE_SIZE);
 }
-#endif
 
 igt_simple_main
 {
-- 
2.16.2

_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

  parent reply	other threads:[~2018-02-28  9:00 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-02-27 21:50 [igt-dev] [PATCH igt] lib: Provide an accelerated routine for readback from WC Chris Wilson
2018-02-27 21:53 ` Chris Wilson
2018-02-27 22:17 ` [igt-dev] [PATCH igt v2] " Chris Wilson
2018-02-27 22:20 ` [igt-dev] [PATCH igt v3] " Chris Wilson
2018-02-27 22:42 ` [igt-dev] [PATCH igt v4] " Chris Wilson
2018-02-27 23:29 ` [igt-dev] [PATCH igt] " Eric Anholt
2018-02-27 23:44 ` [igt-dev] ✓ Fi.CI.BAT: success for lib: Provide an accelerated routine for readback from WC (rev4) Patchwork
2018-02-28  1:04 ` [igt-dev] ✗ Fi.CI.IGT: failure " Patchwork
2018-02-28  9:00 ` Chris Wilson [this message]
2018-02-28 17:12   ` [igt-dev] [PATCH igt v6] lib: Provide an accelerated routine for readback from WC Ville Syrjälä
2018-03-01  8:43     ` Chris Wilson
2018-02-28  9:31 ` [igt-dev] ✓ Fi.CI.BAT: success for lib: Provide an accelerated routine for readback from WC (rev5) Patchwork
2018-02-28 10:16 ` [igt-dev] ✓ Fi.CI.IGT: " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180228090016.4589-1-chris@chris-wilson.co.uk \
    --to=chris@chris-wilson.co.uk \
    --cc=igt-dev@lists.freedesktop.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.