All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Ville Syrjälä" <ville.syrjala@linux.intel.com>
To: Chris Wilson <chris@chris-wilson.co.uk>
Cc: igt-dev@lists.freedesktop.org
Subject: Re: [igt-dev] [PATCH igt v6] lib: Provide an accelerated routine for readback from WC
Date: Wed, 28 Feb 2018 19:12:44 +0200	[thread overview]
Message-ID: <20180228171244.GO5453@intel.com> (raw)
In-Reply-To: <20180228090016.4589-1-chris@chris-wilson.co.uk>

On Wed, Feb 28, 2018 at 09:00:16AM +0000, Chris Wilson wrote:
> Reading from WC is awfully slow as each access is uncached and so
> performed synchronously, stalling for the memory load. x86 did introduce
> some new instructions in SSE 4.1 to provide a small internal buffer to
> accelerate reading back a cacheline at a time from uncached memory, for
> this purpose.
> 
> v2: Don't be lazy and handle misalignment.
> v3: Switch out of sse41 before emitting the generic memcpy routine
> v4: Replace opencoded memcpy_from_wc
> v5: Always flush the internal buffer before use (Eric)
> v6: Assume bulk moves, so check for dst alignment.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Eric Anholt <eric@anholt.net>
> ---
>  lib/igt_fb.c                   |   3 +-
>  lib/igt_x86.c                  | 124 +++++++++++++++++++++++++++++++++++++++++
>  lib/igt_x86.h                  |   2 +
>  tests/gem_fence_thrash.c       |  63 +--------------------
>  tests/gem_mmap_gtt.c           |  37 +-----------
>  tests/gem_tiled_pread_pwrite.c |  37 +-----------
>  6 files changed, 132 insertions(+), 134 deletions(-)
> 
> diff --git a/lib/igt_fb.c b/lib/igt_fb.c
> index ecd73053..7404ba7c 100644
> --- a/lib/igt_fb.c
> +++ b/lib/igt_fb.c
> @@ -32,6 +32,7 @@
>  #include "drmtest.h"
>  #include "igt_fb.h"
>  #include "igt_kms.h"
> +#include "igt_x86.h"
>  #include "ioctl_wrappers.h"
>  #include "intel_chipset.h"
>  
> @@ -1340,7 +1341,7 @@ static void convert_nv12_to_rgb24(struct igt_fb *fb, struct fb_convert_blit_uplo
>  	 * it's faster to copy the whole BO to a temporary buffer and convert
>  	 * from there.
>  	 */
> -	memcpy(buf, blit->linear.map, blit->linear.size);
> +	igt_memcpy_from_wc(buf, blit->linear.map, blit->linear.size);
>  	y = &buf[blit->linear.offsets[0]];
>  	uv = &buf[blit->linear.offsets[1]];
>  
> diff --git a/lib/igt_x86.c b/lib/igt_x86.c
> index 0ed3c6f1..54539456 100644
> --- a/lib/igt_x86.c
> +++ b/lib/igt_x86.c
> @@ -36,7 +36,10 @@
>  #endif
>  
>  #include "igt_x86.h"
> +
> +#include <stdint.h>
>  #include <stdio.h>
> +#include <string.h>
>  
>  /**
>   * SECTION:igt_x86
> @@ -174,3 +177,124 @@ char *igt_x86_features_to_string(unsigned features, char *line)
>  	return ret;
>  }
>  #endif
> +
> +#if defined(__x86_64__) && !defined(__clang__)
> +#define MOVNT 512

What's this MOVNT define?

> +
> +#pragma GCC push_options
> +#pragma GCC target("sse4.1")
> +#pragma GCC diagnostic ignored "-Wpointer-arith"
> +
> +#define min(x, y) ({                            \
> +	typeof(x) _min1 = (x);                  \
> +	typeof(y) _min2 = (y);                  \
> +	(void) (&_min1 == &_min2);              \
> +	_min1 < _min2 ? _min1 : _min2;		\
> +})

igt_aux.h has this already I believe.

> +
> +#include <smmintrin.h>
> +static void memcpy_from_wc_sse41(void *dst, const void *src, unsigned long len)
> +{
> +	char buf[16];
> +
> +	/* Flush the internal buffer of potential stale gfx data */
> +	__builtin_ia32_mfence();

Isn't there a _mm_mfence()?

Apart from those everything looks all right to me.
Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com>

> +
> +	if ((uintptr_t)src & 15) {
> +		__m128i *S = (__m128i *)((uintptr_t)src & ~15);
> +		unsigned long misalign = (uintptr_t)src & 15;
> +		unsigned long copy = min(len, 16 - misalign);
> +
> +		_mm_storeu_si128((__m128i *)buf,
> +				 _mm_stream_load_si128(S));
> +
> +		memcpy(dst, buf + misalign, copy);
> +
> +		dst += copy;
> +		src += copy;
> +		len -= copy;
> +	}
> +
> +	/* We assume we are doing bulk transfers, so prefer aligned moves */
> +	if (((uintptr_t)dst & 15) == 0) {
> +		while (len >= 64) {
> +			__m128i *S = (__m128i *)src;
> +			__m128i *D = (__m128i *)dst;
> +			__m128i tmp[4];
> +
> +			tmp[0] = _mm_stream_load_si128(S + 0);
> +			tmp[1] = _mm_stream_load_si128(S + 1);
> +			tmp[2] = _mm_stream_load_si128(S + 2);
> +			tmp[3] = _mm_stream_load_si128(S + 3);
> +
> +			_mm_store_si128(D + 0, tmp[0]);
> +			_mm_store_si128(D + 1, tmp[1]);
> +			_mm_store_si128(D + 2, tmp[2]);
> +			_mm_store_si128(D + 3, tmp[3]);
> +
> +			src += 64;
> +			dst += 64;
> +			len -= 64;
> +		}
> +	} else {
> +		while (len >= 64) {
> +			__m128i *S = (__m128i *)src;
> +			__m128i *D = (__m128i *)dst;
> +			__m128i tmp[4];
> +
> +			tmp[0] = _mm_stream_load_si128(S + 0);
> +			tmp[1] = _mm_stream_load_si128(S + 1);
> +			tmp[2] = _mm_stream_load_si128(S + 2);
> +			tmp[3] = _mm_stream_load_si128(S + 3);
> +
> +			_mm_storeu_si128(D + 0, tmp[0]);
> +			_mm_storeu_si128(D + 1, tmp[1]);
> +			_mm_storeu_si128(D + 2, tmp[2]);
> +			_mm_storeu_si128(D + 3, tmp[3]);
> +
> +			src += 64;
> +			dst += 64;
> +			len -= 64;
> +		}
> +	}
> +
> +	while (len >= 16) {
> +		_mm_storeu_si128((__m128i *)dst,
> +				 _mm_stream_load_si128((__m128i *)src));
> +
> +		src += 16;
> +		dst += 16;
> +		len -= 16;
> +	}
> +
> +	if (len) {
> +		_mm_storeu_si128((__m128i *)buf,
> +				 _mm_stream_load_si128((__m128i *)src));
> +		memcpy(dst, buf, len);
> +	}
> +}
> +
> +#pragma GCC pop_options
> +
> +static void memcpy_from_wc(void *dst, const void *src, unsigned long len)
> +{
> +	memcpy(dst, src, len);
> +}
> +
> +static void (*resolve_memcpy_from_wc(void))(void *, const void *, unsigned long)
> +{
> +	if (igt_x86_features() & SSE4_1)
> +		return memcpy_from_wc_sse41;
> +
> +	return memcpy_from_wc;
> +}
> +
> +void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len)
> +	__attribute__((ifunc("resolve_memcpy_from_wc")));
> +
> +#else
> +void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len)
> +{
> +	memcpy(dst, src, len);
> +}
> +#endif
> diff --git a/lib/igt_x86.h b/lib/igt_x86.h
> index 27b7f0fd..d4f8c343 100644
> --- a/lib/igt_x86.h
> +++ b/lib/igt_x86.h
> @@ -55,4 +55,6 @@ static inline char *igt_x86_features_to_string(unsigned features, char *line)
>  }
>  #endif
>  
> +void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len);
> +
>  #endif /* IGT_X86_H */
> diff --git a/tests/gem_fence_thrash.c b/tests/gem_fence_thrash.c
> index c8ff961d..2d7fb2ff 100644
> --- a/tests/gem_fence_thrash.c
> +++ b/tests/gem_fence_thrash.c
> @@ -107,75 +107,16 @@ bo_copy (void *_arg)
>  	return NULL;
>  }
>  
> -#if defined(__x86_64__) && !defined(__clang__)
> -
> -#pragma GCC push_options
> -#pragma GCC target("sse4.1")
> -
> -#include <smmintrin.h>
> -
> -#define MOVNT 512
> -
> -__attribute__((noinline))
> -static void copy_wc_page(void *dst, void *src)
> -{
> -	if (igt_x86_features() & SSE4_1) {
> -		__m128i *S = (__m128i *)src;
> -		__m128i *D = (__m128i *)dst;
> -
> -		for (int i = 0; i < PAGE_SIZE/CACHELINE; i++) {
> -			__m128i tmp[4];
> -
> -			tmp[0] = _mm_stream_load_si128(S++);
> -			tmp[1] = _mm_stream_load_si128(S++);
> -			tmp[2] = _mm_stream_load_si128(S++);
> -			tmp[3] = _mm_stream_load_si128(S++);
> -
> -			_mm_store_si128(D++, tmp[0]);
> -			_mm_store_si128(D++, tmp[1]);
> -			_mm_store_si128(D++, tmp[2]);
> -			_mm_store_si128(D++, tmp[3]);
> -		}
> -	} else
> -		memcpy(dst, src, PAGE_SIZE);
> -}
> -
> -static void copy_wc_cacheline(void *dst, void *src)
> -{
> -	if (igt_x86_features() & SSE4_1) {
> -		__m128i *S = (__m128i *)src;
> -		__m128i *D = (__m128i *)dst;
> -		__m128i tmp[4];
> -
> -		tmp[0] = _mm_stream_load_si128(S++);
> -		tmp[1] = _mm_stream_load_si128(S++);
> -		tmp[2] = _mm_stream_load_si128(S++);
> -		tmp[3] = _mm_stream_load_si128(S++);
> -
> -		_mm_store_si128(D++, tmp[0]);
> -		_mm_store_si128(D++, tmp[1]);
> -		_mm_store_si128(D++, tmp[2]);
> -		_mm_store_si128(D++, tmp[3]);
> -	} else
> -		memcpy(dst, src, CACHELINE);
> -}
> -
> -#pragma GCC pop_options
> -
> -#else
> -
>  static void copy_wc_page(void *dst, const void *src)
>  {
> -	memcpy(dst, src, PAGE_SIZE);
> +	igt_memcpy_from_wc(dst, src, PAGE_SIZE);
>  }
>  
>  static void copy_wc_cacheline(void *dst, const void *src)
>  {
> -	memcpy(dst, src, CACHELINE);
> +	igt_memcpy_from_wc(dst, src, CACHELINE);
>  }
>  
> -#endif
> -
>  static void
>  _bo_write_verify(struct test *t)
>  {
> diff --git a/tests/gem_mmap_gtt.c b/tests/gem_mmap_gtt.c
> index 0f598125..6a332b25 100644
> --- a/tests/gem_mmap_gtt.c
> +++ b/tests/gem_mmap_gtt.c
> @@ -529,45 +529,10 @@ test_huge_bo(int fd, int huge, int tiling)
>  	munmap(linear_pattern, PAGE_SIZE);
>  }
>  
> -#if defined(__x86_64__) && !defined(__clang__)
> -#define MOVNT 512
> -
> -#pragma GCC push_options
> -#pragma GCC target("sse4.1")
> -
> -#include <smmintrin.h>
> -__attribute__((noinline))
> -static void copy_wc_page(void *dst, void *src)
> -{
> -	if (igt_x86_features() & SSE4_1) {
> -		__m128i *S = (__m128i *)src;
> -		__m128i *D = (__m128i *)dst;
> -
> -		for (int i = 0; i < PAGE_SIZE/64; i++) {
> -			__m128i tmp[4];
> -
> -			tmp[0] = _mm_stream_load_si128(S++);
> -			tmp[1] = _mm_stream_load_si128(S++);
> -			tmp[2] = _mm_stream_load_si128(S++);
> -			tmp[3] = _mm_stream_load_si128(S++);
> -
> -			_mm_store_si128(D++, tmp[0]);
> -			_mm_store_si128(D++, tmp[1]);
> -			_mm_store_si128(D++, tmp[2]);
> -			_mm_store_si128(D++, tmp[3]);
> -		}
> -	} else
> -		memcpy(dst, src, PAGE_SIZE);
> -}
> -
> -#pragma GCC pop_options
> -
> -#else
>  static void copy_wc_page(void *dst, const void *src)
>  {
> -	memcpy(dst, src, PAGE_SIZE);
> +	igt_memcpy_from_wc(dst, src, PAGE_SIZE);
>  }
> -#endif
>  
>  static unsigned int tile_row_size(int tiling, unsigned int stride)
>  {
> diff --git a/tests/gem_tiled_pread_pwrite.c b/tests/gem_tiled_pread_pwrite.c
> index 7b5577fd..313daa38 100644
> --- a/tests/gem_tiled_pread_pwrite.c
> +++ b/tests/gem_tiled_pread_pwrite.c
> @@ -100,45 +100,10 @@ create_bo(int fd)
>  	return handle;
>  }
>  
> -#if defined(__x86_64__) && !defined(__clang__)
> -#define MOVNT 512
> -
> -#pragma GCC push_options
> -#pragma GCC target("sse4.1")
> -
> -#include <smmintrin.h>
> -__attribute__((noinline))
> -static void copy_wc_page(void *dst, void *src)
> -{
> -	if (igt_x86_features() & SSE4_1) {
> -		__m128i *S = (__m128i *)src;
> -		__m128i *D = (__m128i *)dst;
> -
> -		for (int i = 0; i < PAGE_SIZE/64; i++) {
> -			__m128i tmp[4];
> -
> -			tmp[0] = _mm_stream_load_si128(S++);
> -			tmp[1] = _mm_stream_load_si128(S++);
> -			tmp[2] = _mm_stream_load_si128(S++);
> -			tmp[3] = _mm_stream_load_si128(S++);
> -
> -			_mm_store_si128(D++, tmp[0]);
> -			_mm_store_si128(D++, tmp[1]);
> -			_mm_store_si128(D++, tmp[2]);
> -			_mm_store_si128(D++, tmp[3]);
> -		}
> -	} else
> -		memcpy(dst, src, PAGE_SIZE);
> -}
> -
> -#pragma GCC pop_options
> -
> -#else
>  static void copy_wc_page(void *dst, const void *src)
>  {
> -	memcpy(dst, src, PAGE_SIZE);
> +	igt_memcpy_from_wc(dst, src, PAGE_SIZE);
>  }
> -#endif
>  
>  igt_simple_main
>  {
> -- 
> 2.16.2
> 
> _______________________________________________
> igt-dev mailing list
> igt-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/igt-dev

-- 
Ville Syrjälä
Intel OTC
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

  reply	other threads:[~2018-02-28 17:12 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-02-27 21:50 [igt-dev] [PATCH igt] lib: Provide an accelerated routine for readback from WC Chris Wilson
2018-02-27 21:53 ` Chris Wilson
2018-02-27 22:17 ` [igt-dev] [PATCH igt v2] " Chris Wilson
2018-02-27 22:20 ` [igt-dev] [PATCH igt v3] " Chris Wilson
2018-02-27 22:42 ` [igt-dev] [PATCH igt v4] " Chris Wilson
2018-02-27 23:29 ` [igt-dev] [PATCH igt] " Eric Anholt
2018-02-27 23:44 ` [igt-dev] ✓ Fi.CI.BAT: success for lib: Provide an accelerated routine for readback from WC (rev4) Patchwork
2018-02-28  1:04 ` [igt-dev] ✗ Fi.CI.IGT: failure " Patchwork
2018-02-28  9:00 ` [igt-dev] [PATCH igt v6] lib: Provide an accelerated routine for readback from WC Chris Wilson
2018-02-28 17:12   ` Ville Syrjälä [this message]
2018-03-01  8:43     ` Chris Wilson
2018-02-28  9:31 ` [igt-dev] ✓ Fi.CI.BAT: success for lib: Provide an accelerated routine for readback from WC (rev5) Patchwork
2018-02-28 10:16 ` [igt-dev] ✓ Fi.CI.IGT: " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180228171244.GO5453@intel.com \
    --to=ville.syrjala@linux.intel.com \
    --cc=chris@chris-wilson.co.uk \
    --cc=igt-dev@lists.freedesktop.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.