* [PATCH] drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory
@ 2016-07-16 12:07 Chris Wilson
  2016-07-16 12:32 ` ✗ Ro.CI.BAT: failure for " Patchwork
                   ` (6 more replies)
  0 siblings, 7 replies; 27+ messages in thread
From: Chris Wilson @ 2016-07-16 12:07 UTC (permalink / raw)
  To: intel-gfx; +Cc: Akash Goel, Mika Kuoppala

This patch provides the infrastructure for performing a 16-byte aligned
read from WC (or even uncached) memory using non-temporal instructions
introduced with sse4.1. Using movntdqa we can bypass the CPU caches and
read directly from memory, ignoring the page attributes set on the
CPU PTE, i.e. negating the impact of an otherwise UC access. Copying
using movntqda from WC is almost as fast as reading from WB memory,
modulo the possibility of both hitting the CPU cache or leaving the data
in the CPU cache for the next consumer.

This will be used in later patches to accelerate accessing WC memory.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Akash Goel <akash.goel@intel.com>
Cc: Damien Lespiau <damien.lespiau@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/Makefile      |  6 ++++
 drivers/gpu/drm/i915/i915_drv.c    |  2 ++
 drivers/gpu/drm/i915/i915_drv.h    |  3 ++
 drivers/gpu/drm/i915/i915_memcpy.c | 64 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 75 insertions(+)
 create mode 100644 drivers/gpu/drm/i915/i915_memcpy.c

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 75318ebb8d25..15ecd7383d85 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -4,11 +4,17 @@
 
 subdir-ccflags-$(CONFIG_DRM_I915_WERROR) := -Werror
 
+ifeq ($(call as-instr,movntdqa (%eax)$(comma)%xmm0,y),y)
+	subdir-ccflags-y += -DCONFIG_AS_MOVNTDQA
+	subdir-ccflags-y += -mpreferred-stack-boundary=4
+endif
+
 # Please keep these build lists sorted!
 
 # core driver code
 i915-y := i915_drv.o \
 	  i915_irq.o \
+	  i915_memcpy.o \
 	  i915_params.o \
 	  i915_pci.o \
           i915_suspend.o \
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index c5b7b8e0678a..16612542496a 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -848,6 +848,8 @@ static int i915_driver_init_early(struct drm_i915_private *dev_priv,
 	mutex_init(&dev_priv->wm.wm_mutex);
 	mutex_init(&dev_priv->pps_mutex);
 
+	i915_memcpy_init_early(dev_priv);
+
 	ret = i915_workqueues_init(dev_priv);
 	if (ret < 0)
 		return ret;
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 27d9b2c374b3..fe58ee78de98 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -4070,4 +4070,7 @@ static inline bool __i915_request_irq_complete(struct drm_i915_gem_request *req)
 	return false;
 }
 
+void i915_memcpy_init_early(struct drm_i915_private *dev_priv);
+void i915_memcpy_from_wc(void *dst, const void *src, unsigned long len);
+
 #endif
diff --git a/drivers/gpu/drm/i915/i915_memcpy.c b/drivers/gpu/drm/i915/i915_memcpy.c
new file mode 100644
index 000000000000..09c24164f2b6
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_memcpy.c
@@ -0,0 +1,64 @@
+#include "i915_drv.h"
+
+DEFINE_STATIC_KEY_FALSE(has_movntqa);
+
+#ifdef CONFIG_AS_MOVNTDQA
+static void __movntqda(void *dst, const void *src, unsigned long len)
+{
+	len >>= 4;
+	while (len >= 4) {
+		__asm__ __volatile__(
+		"movntdqa (%0), %%xmm0\n"
+		"movntdqa 8(%0), %%xmm1\n"
+		"movntdqa 16(%0), %%xmm2\n"
+		"movntdqa 24(%0), %%xmm3\n"
+		"movaps %%xmm0, (%1)\n"
+		"movaps %%xmm1, 8(%1)\n"
+		"movaps %%xmm2, 16(%1)\n"
+		"movaps %%xmm3, 24(%1)\n"
+		: : "r" (src), "r" (dst) : "memory");
+		src += 64;
+		dst += 64;
+		len -= 4;
+	}
+	while (len--) {
+		__asm__ __volatile__(
+		"movntdqa (%0), %%xmm0\n"
+		"movaps %%xmm0, (%1)\n"
+		: : "r" (src), "r" (dst) : "memory");
+		src += 16;
+		dst += 16;
+	}
+}
+#endif
+
+/**
+ * i915_memcpy_from_wc: perform an accelerated *aligned* read from WC
+ * @dst: destination pointer
+ * @src: source pointer
+ * @len: how many bytes to copy
+ *
+ * i915_memcpy_from_wc copies @len bytes from @src to @dst using
+ * non-temporal instructions where available. Note that all arguments
+ * (@src, @dst) must be aligned to 16 bytes and @len must be a multiple
+ * of 16.
+ */
+void i915_memcpy_from_wc(void *dst, const void *src, unsigned long len)
+{
+	GEM_BUG_ON((unsigned long)dst & 15);
+	GEM_BUG_ON((unsigned long)src & 15);
+	GEM_BUG_ON((unsigned long)len & 15);
+
+#ifdef CONFIG_AS_MOVNTDQA
+	if (static_branch_likely(&has_movntqa))
+		 __movntqda(dst, src, len);
+	else
+#endif
+		memcpy(dst, src, len);
+}
+
+void i915_memcpy_init_early(struct drm_i915_private *dev_priv)
+{
+	if (static_cpu_has(X86_FEATURE_XMM4_1))
+		static_branch_enable(&has_movntqa);
+}
-- 
2.8.1
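
For context, a caller of this v1 interface might look like the sketch
below (illustrative only: obj_map_wc() and the staging buffer are
assumptions for the example, not part of the patch; both pointers and
the length must honour the 16-byte contract):

	/* dst: a 16-byte aligned staging buffer in ordinary WB memory */
	u32 *staging = kmalloc(PAGE_SIZE, GFP_KERNEL);
	/* src: assumed to be a 16-byte aligned WC mapping of a GPU object */
	void *vaddr = obj_map_wc(obj);

	i915_memcpy_from_wc(staging, vaddr, PAGE_SIZE);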


* ✗ Ro.CI.BAT: failure for drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory
  2016-07-16 12:07 [PATCH] drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory Chris Wilson
@ 2016-07-16 12:32 ` Patchwork
  2016-07-16 12:33 ` [PATCH] " Chris Wilson
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 27+ messages in thread
From: Patchwork @ 2016-07-16 12:32 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory
URL   : https://patchwork.freedesktop.org/series/9949/
State : failure

== Summary ==

Series 9949v1 drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory
http://patchwork.freedesktop.org/api/1.0/series/9949/revisions/1/mbox

Test drv_module_reload_basic:
                pass       -> SKIP       (ro-hsw-i3-4010u)
Test gem_sync:
        Subgroup basic-store-each:
                dmesg-fail -> PASS       (ro-bdw-i7-5600u)
Test kms_cursor_legacy:
        Subgroup basic-cursor-vs-flip:
                pass       -> FAIL       (ro-ilk1-i5-650)

fi-hsw-i7-4770k  total:242  pass:210  dwarn:0   dfail:0   fail:8   skip:24 
fi-kbl-qkkr      total:242  pass:175  dwarn:26  dfail:2   fail:8   skip:31 
fi-skl-i5-6260u  total:242  pass:219  dwarn:0   dfail:0   fail:7   skip:16 
fi-skl-i7-6700k  total:242  pass:205  dwarn:0   dfail:0   fail:7   skip:30 
fi-snb-i7-2600   total:242  pass:190  dwarn:0   dfail:0   fail:8   skip:44 
ro-bdw-i5-5250u  total:242  pass:214  dwarn:4   dfail:0   fail:7   skip:17 
ro-bdw-i7-5557U  total:242  pass:215  dwarn:1   dfail:0   fail:7   skip:19 
ro-bdw-i7-5600u  total:242  pass:200  dwarn:0   dfail:0   fail:7   skip:35 
ro-bsw-n3050     total:218  pass:173  dwarn:0   dfail:0   fail:2   skip:42 
ro-byt-n2820     total:242  pass:191  dwarn:0   dfail:0   fail:9   skip:42 
ro-hsw-i3-4010u  total:242  pass:205  dwarn:0   dfail:0   fail:8   skip:29 
ro-hsw-i7-4770r  total:242  pass:206  dwarn:0   dfail:0   fail:8   skip:28 
ro-ilk-i7-620lm  total:242  pass:166  dwarn:0   dfail:0   fail:9   skip:67 
ro-ilk1-i5-650   total:237  pass:165  dwarn:0   dfail:0   fail:10  skip:62 
ro-ivb-i7-3770   total:242  pass:197  dwarn:0   dfail:0   fail:8   skip:37 
ro-skl3-i5-6260u total:242  pass:218  dwarn:1   dfail:0   fail:7   skip:16 
ro-snb-i7-2620M  total:242  pass:188  dwarn:0   dfail:0   fail:9   skip:45 
fi-bsw-n3050 failed to connect after reboot

Results at /archive/results/CI_IGT_test/RO_Patchwork_1508/

2fe098f drm-intel-nightly: 2016y-07m-15d-19h-42m-01s UTC integration manifest
66ab838 drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory


* Re: [PATCH] drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory
  2016-07-16 12:07 [PATCH] drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory Chris Wilson
  2016-07-16 12:32 ` ✗ Ro.CI.BAT: failure for " Patchwork
@ 2016-07-16 12:33 ` Chris Wilson
  2016-07-16 12:45 ` [PATCH v2] " Chris Wilson
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 27+ messages in thread
From: Chris Wilson @ 2016-07-16 12:33 UTC (permalink / raw)
  To: intel-gfx; +Cc: Akash Goel, Mika Kuoppala

On Sat, Jul 16, 2016 at 01:07:35PM +0100, Chris Wilson wrote:
> +void i915_memcpy_from_wc(void *dst, const void *src, unsigned long len)
> +{
> +	GEM_BUG_ON((unsigned long)dst & 15);
> +	GEM_BUG_ON((unsigned long)src & 15);
> +	GEM_BUG_ON((unsigned long)len & 15);
> +
> +#ifdef CONFIG_AS_MOVNTDQA
> +	if (static_branch_likely(&has_movntqa))
> +		 __movntqda(dst, src, len);
> +	else
> +#endif
> +		memcpy(dst, src, len);

Ok, actually using this in a patch, the automatic fixup to a plain
memcpy from WC is bad behaviour; instead I want to punt the fallback
to the caller (i.e. return false;).
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
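
To make that concrete, the caller-side pattern that returning false
enables might look like the following (a hypothetical sketch; the choice
of slow path belongs to the caller, not to the helper):

	if (!i915_memcpy_from_wc(dst, vaddr, size)) {
		/* no sse4.1, or a length not a multiple of 16: slow path */
		memcpy(dst, vaddr, size);
	}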

* [PATCH v2] drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory
  2016-07-16 12:07 [PATCH] drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory Chris Wilson
  2016-07-16 12:32 ` ✗ Ro.CI.BAT: failure for " Patchwork
  2016-07-16 12:33 ` [PATCH] " Chris Wilson
@ 2016-07-16 12:45 ` Chris Wilson
  2016-07-16 12:53   ` [PATCH v3] " Chris Wilson
  2016-07-16 13:12 ` ✗ Ro.CI.BAT: failure for drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory (rev2) Patchwork
                   ` (3 subsequent siblings)
  6 siblings, 1 reply; 27+ messages in thread
From: Chris Wilson @ 2016-07-16 12:45 UTC (permalink / raw)
  To: intel-gfx; +Cc: Akash Goel, Mika Kuoppala

This patch provides the infrastructure for performing a 16-byte aligned
read from WC (or even uncached) memory using non-temporal instructions
introduced with sse4.1. Using movntdqa we can bypass the CPU caches and
read directly from memory, ignoring the page attributes set on the
CPU PTE, i.e. negating the impact of an otherwise UC access. Copying
using movntqda from WC is almost as fast as reading from WB memory,
modulo the possibility of both hitting the CPU cache or leaving the data
in the CPU cache for the next consumer.

This will be used in later patches to accelerate accessing WC memory.

v2: Report whether the accelerated copy is successful/possible.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Akash Goel <akash.goel@intel.com>
Cc: Damien Lespiau <damien.lespiau@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/Makefile      |  6 +++
 drivers/gpu/drm/i915/i915_drv.c    |  2 +
 drivers/gpu/drm/i915/i915_drv.h    |  3 ++
 drivers/gpu/drm/i915/i915_memcpy.c | 75 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 86 insertions(+)
 create mode 100644 drivers/gpu/drm/i915/i915_memcpy.c

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 75318ebb8d25..15ecd7383d85 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -4,11 +4,17 @@
 
 subdir-ccflags-$(CONFIG_DRM_I915_WERROR) := -Werror
 
+ifeq ($(call as-instr,movntdqa (%eax)$(comma)%xmm0,y),y)
+	subdir-ccflags-y += -DCONFIG_AS_MOVNTDQA
+	subdir-ccflags-y += -mpreferred-stack-boundary=4
+endif
+
 # Please keep these build lists sorted!
 
 # core driver code
 i915-y := i915_drv.o \
 	  i915_irq.o \
+	  i915_memcpy.o \
 	  i915_params.o \
 	  i915_pci.o \
           i915_suspend.o \
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index c5b7b8e0678a..16612542496a 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -848,6 +848,8 @@ static int i915_driver_init_early(struct drm_i915_private *dev_priv,
 	mutex_init(&dev_priv->wm.wm_mutex);
 	mutex_init(&dev_priv->pps_mutex);
 
+	i915_memcpy_init_early(dev_priv);
+
 	ret = i915_workqueues_init(dev_priv);
 	if (ret < 0)
 		return ret;
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 27d9b2c374b3..3c266e7866ba 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -4070,4 +4070,7 @@ static inline bool __i915_request_irq_complete(struct drm_i915_gem_request *req)
 	return false;
 }
 
+void i915_memcpy_init_early(struct drm_i915_private *dev_priv);
+bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len);
+
 #endif
diff --git a/drivers/gpu/drm/i915/i915_memcpy.c b/drivers/gpu/drm/i915/i915_memcpy.c
new file mode 100644
index 000000000000..8370cdef15f6
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_memcpy.c
@@ -0,0 +1,75 @@
+#include "i915_drv.h"
+
+DEFINE_STATIC_KEY_FALSE(has_movntqa);
+
+#ifdef CONFIG_AS_MOVNTDQA
+static void __movntqda(void *dst, const void *src, unsigned long len)
+{
+	len >>= 4;
+	while (len >= 4) {
+		__asm__ __volatile__(
+		"movntdqa (%0), %%xmm0\n"
+		"movntdqa 8(%0), %%xmm1\n"
+		"movntdqa 16(%0), %%xmm2\n"
+		"movntdqa 24(%0), %%xmm3\n"
+		"movaps %%xmm0, (%1)\n"
+		"movaps %%xmm1, 8(%1)\n"
+		"movaps %%xmm2, 16(%1)\n"
+		"movaps %%xmm3, 24(%1)\n"
+		: : "r" (src), "r" (dst) : "memory");
+		src += 64;
+		dst += 64;
+		len -= 4;
+	}
+	while (len--) {
+		__asm__ __volatile__(
+		"movntdqa (%0), %%xmm0\n"
+		"movaps %%xmm0, (%1)\n"
+		: : "r" (src), "r" (dst) : "memory");
+		src += 16;
+		dst += 16;
+	}
+}
+#endif
+
+/**
+ * i915_memcpy_from_wc: perform an accelerated *aligned* read from WC
+ * @dst: destination pointer
+ * @src: source pointer
+ * @len: how many bytes to copy
+ *
+ * i915_memcpy_from_wc copies @len bytes from @src to @dst using
+ * non-temporal instructions where available. Note that all arguments
+ * (@src, @dst) must be aligned to 16 bytes and @len must be a multiple
+ * of 16.
+ *
+ * To test whether accelerated reads from WC are supported, use
+ * i915_memcpy_from_wc(NULL, NULL, 0);
+ *
+ * Returns true if the copy was successful, false if the preconditions
+ * are not met.
+ */
+bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len)
+{
+	GEM_BUG_ON((unsigned long)dst & 15);
+	GEM_BUG_ON((unsigned long)src & 15);
+
+	if (len & 15)
+		return false;
+
+#ifdef CONFIG_AS_MOVNTDQA
+	if (static_branch_likely(&has_movntqa)) {
+		if (len)
+			__movntqda(dst, src, len);
+		 return true;
+	}
+#endif
+
+	return false;
+}
+
+void i915_memcpy_init_early(struct drm_i915_private *dev_priv)
+{
+	if (static_cpu_has(X86_FEATURE_XMM4_1))
+		static_branch_enable(&has_movntqa);
+}
-- 
2.8.1
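
The zero-length call mentioned in the kerneldoc above doubles as a
feature probe, so a caller can decide up front which read-back strategy
to use (a minimal sketch; use_wc_readback is an illustrative flag, not
from the patch):

	/* true only when movntdqa-accelerated WC reads are available */
	if (i915_memcpy_from_wc(NULL, NULL, 0))
		use_wc_readback = true;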


* [PATCH v3] drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory
  2016-07-16 12:45 ` [PATCH v2] " Chris Wilson
@ 2016-07-16 12:53   ` Chris Wilson
  2016-07-16 15:44     ` [PATCH v5] drm/i915: Use SSE4.1 movntdqa " Chris Wilson
  0 siblings, 1 reply; 27+ messages in thread
From: Chris Wilson @ 2016-07-16 12:53 UTC (permalink / raw)
  To: intel-gfx; +Cc: Akash Goel, Mika Kuoppala

This patch provides the infrastructure for performing a 16-byte aligned
read from WC (or even uncached) memory using non-temporal instructions
introduced with sse4.1. Using movntdqa we can bypass the CPU caches and
read directly from memory, ignoring the page attributes set on the
CPU PTE, i.e. negating the impact of an otherwise UC access. Copying
using movntqda from WC is almost as fast as reading from WB memory,
modulo the possibility of both hitting the CPU cache or leaving the data
in the CPU cache for the next consumer.

This will be used in later patches to accelerate accessing WC memory.

v2: Report whether the accelerated copy is successful/possible.
v3: Function alignment override was only necessary when using the
function target("sse4.1") - which is not necessary for emitting movntdqa
from __asm__.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Akash Goel <akash.goel@intel.com>
Cc: Damien Lespiau <damien.lespiau@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/Makefile      |  3 ++
 drivers/gpu/drm/i915/i915_drv.c    |  2 +
 drivers/gpu/drm/i915/i915_drv.h    |  3 ++
 drivers/gpu/drm/i915/i915_memcpy.c | 75 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 83 insertions(+)
 create mode 100644 drivers/gpu/drm/i915/i915_memcpy.c

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 75318ebb8d25..a53853daa998 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -3,12 +3,15 @@
 # Direct Rendering Infrastructure (DRI) in XFree86 4.1.0 and higher.
 
 subdir-ccflags-$(CONFIG_DRM_I915_WERROR) := -Werror
+subdir-ccflags-y += \
+	$(call as-instr,movntdqa (%eax)$(comma)%xmm0,-DCONFIG_AS_MOVNTDQA)
 
 # Please keep these build lists sorted!
 
 # core driver code
 i915-y := i915_drv.o \
 	  i915_irq.o \
+	  i915_memcpy.o \
 	  i915_params.o \
 	  i915_pci.o \
           i915_suspend.o \
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index c5b7b8e0678a..16612542496a 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -848,6 +848,8 @@ static int i915_driver_init_early(struct drm_i915_private *dev_priv,
 	mutex_init(&dev_priv->wm.wm_mutex);
 	mutex_init(&dev_priv->pps_mutex);
 
+	i915_memcpy_init_early(dev_priv);
+
 	ret = i915_workqueues_init(dev_priv);
 	if (ret < 0)
 		return ret;
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 27d9b2c374b3..3c266e7866ba 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -4070,4 +4070,7 @@ static inline bool __i915_request_irq_complete(struct drm_i915_gem_request *req)
 	return false;
 }
 
+void i915_memcpy_init_early(struct drm_i915_private *dev_priv);
+bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len);
+
 #endif
diff --git a/drivers/gpu/drm/i915/i915_memcpy.c b/drivers/gpu/drm/i915/i915_memcpy.c
new file mode 100644
index 000000000000..8370cdef15f6
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_memcpy.c
@@ -0,0 +1,75 @@
+#include "i915_drv.h"
+
+DEFINE_STATIC_KEY_FALSE(has_movntqa);
+
+#ifdef CONFIG_AS_MOVNTDQA
+static void __movntqda(void *dst, const void *src, unsigned long len)
+{
+	len >>= 4;
+	while (len >= 4) {
+		__asm__ __volatile__(
+		"movntdqa (%0), %%xmm0\n"
+		"movntdqa 8(%0), %%xmm1\n"
+		"movntdqa 16(%0), %%xmm2\n"
+		"movntdqa 24(%0), %%xmm3\n"
+		"movaps %%xmm0, (%1)\n"
+		"movaps %%xmm1, 8(%1)\n"
+		"movaps %%xmm2, 16(%1)\n"
+		"movaps %%xmm3, 24(%1)\n"
+		: : "r" (src), "r" (dst) : "memory");
+		src += 64;
+		dst += 64;
+		len -= 4;
+	}
+	while (len--) {
+		__asm__ __volatile__(
+		"movntdqa (%0), %%xmm0\n"
+		"movaps %%xmm0, (%1)\n"
+		: : "r" (src), "r" (dst) : "memory");
+		src += 16;
+		dst += 16;
+	}
+}
+#endif
+
+/**
+ * i915_memcpy_from_wc: perform an accelerated *aligned* read from WC
+ * @dst: destination pointer
+ * @src: source pointer
+ * @len: how many bytes to copy
+ *
+ * i915_memcpy_from_wc copies @len bytes from @src to @dst using
+ * non-temporal instructions where available. Note that all arguments
+ * (@src, @dst) must be aligned to 16 bytes and @len must be a multiple
+ * of 16.
+ *
+ * To test whether accelerated reads from WC are supported, use
+ * i915_memcpy_from_wc(NULL, NULL, 0);
+ *
+ * Returns true if the copy was successful, false if the preconditions
+ * are not met.
+ */
+bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len)
+{
+	GEM_BUG_ON((unsigned long)dst & 15);
+	GEM_BUG_ON((unsigned long)src & 15);
+
+	if (len & 15)
+		return false;
+
+#ifdef CONFIG_AS_MOVNTDQA
+	if (static_branch_likely(&has_movntqa)) {
+		if (len)
+			__movntqda(dst, src, len);
+		 return true;
+	}
+#endif
+
+	return false;
+}
+
+void i915_memcpy_init_early(struct drm_i915_private *dev_priv)
+{
+	if (static_cpu_has(X86_FEATURE_XMM4_1))
+		static_branch_enable(&has_movntqa);
+}
-- 
2.8.1
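
For comparison, the function-attribute alternative that the v3 changelog
refers to would have looked roughly like the userspace-style sketch below
using the SSE4.1 intrinsics (an assumption for illustration; the patch
instead emits movntdqa directly from __asm__, which is why neither the
attribute nor the stack-alignment override is needed):

	#include <smmintrin.h>

	__attribute__((target("sse4.1")))
	static void copy_ntdqa(void *dst, const void *src, unsigned long len)
	{
		__m128i *d = dst;
		const __m128i *s = src;

		/* _mm_stream_load_si128() wraps the movntdqa instruction */
		for (len >>= 4; len; len--)
			_mm_store_si128(d++, _mm_stream_load_si128((__m128i *)s++));
	}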


* ✗ Ro.CI.BAT: failure for drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory (rev2)
  2016-07-16 12:07 [PATCH] drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory Chris Wilson
                   ` (2 preceding siblings ...)
  2016-07-16 12:45 ` [PATCH v2] " Chris Wilson
@ 2016-07-16 13:12 ` Patchwork
  2016-07-16 13:34 ` ✓ Ro.CI.BAT: success for drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory (rev3) Patchwork
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 27+ messages in thread
From: Patchwork @ 2016-07-16 13:12 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory (rev2)
URL   : https://patchwork.freedesktop.org/series/9949/
State : failure

== Summary ==

Series 9949v2 drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory
http://patchwork.freedesktop.org/api/1.0/series/9949/revisions/2/mbox

Test gem_sync:
        Subgroup basic-store-each:
                dmesg-fail -> PASS       (ro-bdw-i7-5600u)
Test kms_pipe_crc_basic:
        Subgroup suspend-read-crc-pipe-a:
                pass       -> INCOMPLETE (fi-skl-i7-6700k)

fi-bsw-n3050     total:242  pass:188  dwarn:0   dfail:0   fail:7   skip:47 
fi-hsw-i7-4770k  total:242  pass:210  dwarn:0   dfail:0   fail:8   skip:24 
fi-kbl-qkkr      total:242  pass:175  dwarn:28  dfail:1   fail:7   skip:31 
fi-skl-i5-6260u  total:242  pass:219  dwarn:0   dfail:0   fail:7   skip:16 
fi-skl-i7-6700k  total:196  pass:171  dwarn:0   dfail:0   fail:0   skip:24 
fi-snb-i7-2600   total:242  pass:190  dwarn:0   dfail:0   fail:8   skip:44 
ro-bdw-i5-5250u  total:242  pass:214  dwarn:4   dfail:0   fail:7   skip:17 
ro-bdw-i7-5557U  total:242  pass:214  dwarn:1   dfail:0   fail:7   skip:20 
ro-bdw-i7-5600u  total:242  pass:200  dwarn:0   dfail:0   fail:7   skip:35 
ro-bsw-n3050     total:218  pass:173  dwarn:0   dfail:0   fail:2   skip:42 
ro-byt-n2820     total:242  pass:191  dwarn:0   dfail:0   fail:9   skip:42 
ro-hsw-i3-4010u  total:242  pass:206  dwarn:0   dfail:0   fail:8   skip:28 
ro-hsw-i7-4770r  total:242  pass:206  dwarn:0   dfail:0   fail:8   skip:28 
ro-ilk-i7-620lm  total:242  pass:166  dwarn:0   dfail:0   fail:9   skip:67 
ro-ilk1-i5-650   total:237  pass:166  dwarn:0   dfail:0   fail:9   skip:62 
ro-ivb-i7-3770   total:242  pass:197  dwarn:0   dfail:0   fail:8   skip:37 
ro-skl3-i5-6260u total:242  pass:218  dwarn:1   dfail:0   fail:7   skip:16 
ro-snb-i7-2620M  total:242  pass:188  dwarn:0   dfail:0   fail:9   skip:45 

Results at /archive/results/CI_IGT_test/RO_Patchwork_1509/

2fe098f drm-intel-nightly: 2016y-07m-15d-19h-42m-01s UTC integration manifest
7fb37fd drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory


* ✓ Ro.CI.BAT: success for drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory (rev3)
  2016-07-16 12:07 [PATCH] drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory Chris Wilson
                   ` (3 preceding siblings ...)
  2016-07-16 13:12 ` ✗ Ro.CI.BAT: failure for drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory (rev2) Patchwork
@ 2016-07-16 13:34 ` Patchwork
  2016-07-16 16:12 ` ✗ Ro.CI.BAT: failure for drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory (rev4) Patchwork
  2016-07-18 10:59 ` ✗ Ro.CI.BAT: failure for drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory (rev5) Patchwork
  6 siblings, 0 replies; 27+ messages in thread
From: Patchwork @ 2016-07-16 13:34 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory (rev3)
URL   : https://patchwork.freedesktop.org/series/9949/
State : success

== Summary ==

Series 9949v3 drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory
http://patchwork.freedesktop.org/api/1.0/series/9949/revisions/3/mbox

Test gem_sync:
        Subgroup basic-store-each:
                dmesg-fail -> PASS       (ro-bdw-i7-5600u)

fi-hsw-i7-4770k  total:242  pass:210  dwarn:0   dfail:0   fail:8   skip:24 
fi-kbl-qkkr      total:242  pass:176  dwarn:27  dfail:1   fail:8   skip:30 
fi-skl-i5-6260u  total:242  pass:219  dwarn:0   dfail:0   fail:7   skip:16 
fi-skl-i7-6700k  total:242  pass:205  dwarn:0   dfail:0   fail:7   skip:30 
fi-snb-i7-2600   total:242  pass:190  dwarn:0   dfail:0   fail:8   skip:44 
ro-bdw-i5-5250u  total:242  pass:214  dwarn:4   dfail:0   fail:7   skip:17 
ro-bdw-i7-5557U  total:242  pass:215  dwarn:0   dfail:0   fail:7   skip:20 
ro-bdw-i7-5600u  total:242  pass:200  dwarn:0   dfail:0   fail:7   skip:35 
ro-bsw-n3050     total:218  pass:173  dwarn:0   dfail:0   fail:2   skip:42 
ro-byt-n2820     total:242  pass:191  dwarn:0   dfail:0   fail:9   skip:42 
ro-hsw-i3-4010u  total:242  pass:206  dwarn:0   dfail:0   fail:8   skip:28 
ro-hsw-i7-4770r  total:242  pass:206  dwarn:0   dfail:0   fail:8   skip:28 
ro-ilk-i7-620lm  total:242  pass:166  dwarn:0   dfail:0   fail:9   skip:67 
ro-ilk1-i5-650   total:237  pass:166  dwarn:0   dfail:0   fail:9   skip:62 
ro-ivb-i7-3770   total:242  pass:197  dwarn:0   dfail:0   fail:8   skip:37 
ro-skl3-i5-6260u total:242  pass:218  dwarn:1   dfail:0   fail:7   skip:16 
ro-snb-i7-2620M  total:242  pass:188  dwarn:0   dfail:0   fail:9   skip:45 
fi-bsw-n3050 failed to connect after reboot

Results at /archive/results/CI_IGT_test/RO_Patchwork_1510/

2fe098f drm-intel-nightly: 2016y-07m-15d-19h-42m-01s UTC integration manifest
0d5efe3 drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory


* [PATCH v5] drm/i915: Use SSE4.1 movntdqa to accelerate reads from WC memory
  2016-07-16 12:53   ` [PATCH v3] " Chris Wilson
@ 2016-07-16 15:44     ` Chris Wilson
  2016-07-17  3:21       ` Matt Turner
  2016-07-18  9:31       ` Tvrtko Ursulin
  0 siblings, 2 replies; 27+ messages in thread
From: Chris Wilson @ 2016-07-16 15:44 UTC (permalink / raw)
  To: intel-gfx; +Cc: Akash Goel, Mika Kuoppala

This patch provides the infrastructure for performing a 16-byte aligned
read from WC memory using non-temporal instructions introduced with sse4.1.
Using movntdqa we can bypass the CPU caches and read directly from memory,
ignoring the page attributes set on the CPU PTE, i.e. negating the impact
of an otherwise UC access. Copying using movntqda from WC is almost as
fast as reading from WB memory, modulo the possibility of both hitting
the CPU cache or leaving the data in the CPU cache for the next consumer.
(The CPU cache itself may be flushed for the region of the movntdqa, and
on later access the movntdqa reads from a separate internal buffer for
the cacheline.) The write back to memory is, however, cached.

This will be used in later patches to accelerate accessing WC memory.

v2: Report whether the accelerated copy is successful/possible.
v3: Function alignment override was only necessary when using the
function target("sse4.1") - which is not necessary for emitting movntdqa
from __asm__.
v4: Improve notes on CPU cache behaviour vs non-temporal stores.
v5: Fix byte offsets for unrolled moves.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Akash Goel <akash.goel@intel.com>
Cc: Damien Lespiau <damien.lespiau@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/Makefile      |  3 ++
 drivers/gpu/drm/i915/i915_drv.c    |  2 +
 drivers/gpu/drm/i915/i915_drv.h    |  3 ++
 drivers/gpu/drm/i915/i915_memcpy.c | 75 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 83 insertions(+)
 create mode 100644 drivers/gpu/drm/i915/i915_memcpy.c

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 75318ebb8d25..a53853daa998 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -3,12 +3,15 @@
 # Direct Rendering Infrastructure (DRI) in XFree86 4.1.0 and higher.
 
 subdir-ccflags-$(CONFIG_DRM_I915_WERROR) := -Werror
+subdir-ccflags-y += \
+	$(call as-instr,movntdqa (%eax)$(comma)%xmm0,-DCONFIG_AS_MOVNTDQA)
 
 # Please keep these build lists sorted!
 
 # core driver code
 i915-y := i915_drv.o \
 	  i915_irq.o \
+	  i915_memcpy.o \
 	  i915_params.o \
 	  i915_pci.o \
           i915_suspend.o \
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index c5b7b8e0678a..16612542496a 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -848,6 +848,8 @@ static int i915_driver_init_early(struct drm_i915_private *dev_priv,
 	mutex_init(&dev_priv->wm.wm_mutex);
 	mutex_init(&dev_priv->pps_mutex);
 
+	i915_memcpy_init_early(dev_priv);
+
 	ret = i915_workqueues_init(dev_priv);
 	if (ret < 0)
 		return ret;
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 27d9b2c374b3..3c266e7866ba 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -4070,4 +4070,7 @@ static inline bool __i915_request_irq_complete(struct drm_i915_gem_request *req)
 	return false;
 }
 
+void i915_memcpy_init_early(struct drm_i915_private *dev_priv);
+bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len);
+
 #endif
diff --git a/drivers/gpu/drm/i915/i915_memcpy.c b/drivers/gpu/drm/i915/i915_memcpy.c
new file mode 100644
index 000000000000..4ed4d3bb2f3e
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_memcpy.c
@@ -0,0 +1,75 @@
+#include "i915_drv.h"
+
+DEFINE_STATIC_KEY_FALSE(has_movntqa);
+
+#ifdef CONFIG_AS_MOVNTDQA
+static void __movntqda(void *dst, const void *src, unsigned long len)
+{
+	len >>= 4;
+	while (len >= 4) {
+		__asm__ __volatile__(
+		"movntdqa (%0), %%xmm0\n"
+		"movntdqa 16(%0), %%xmm1\n"
+		"movntdqa 32(%0), %%xmm2\n"
+		"movntdqa 48(%0), %%xmm3\n"
+		"movaps %%xmm0, (%1)\n"
+		"movaps %%xmm1, 16(%1)\n"
+		"movaps %%xmm2, 32(%1)\n"
+		"movaps %%xmm3, 48(%1)\n"
+		: : "r" (src), "r" (dst) : "memory");
+		src += 64;
+		dst += 64;
+		len -= 4;
+	}
+	while (len--) {
+		__asm__ __volatile__(
+		"movntdqa (%0), %%xmm0\n"
+		"movaps %%xmm0, (%1)\n"
+		: : "r" (src), "r" (dst) : "memory");
+		src += 16;
+		dst += 16;
+	}
+}
+#endif
+
+/**
+ * i915_memcpy_from_wc: perform an accelerated *aligned* read from WC
+ * @dst: destination pointer
+ * @src: source pointer
+ * @len: how many bytes to copy
+ *
+ * i915_memcpy_from_wc copies @len bytes from @src to @dst using
+ * non-temporal instructions where available. Note that all arguments
+ * (@src, @dst) must be aligned to 16 bytes and @len must be a multiple
+ * of 16.
+ *
+ * To test whether accelerated reads from WC are supported, use
+ * i915_memcpy_from_wc(NULL, NULL, 0);
+ *
+ * Returns true if the copy was successful, false if the preconditions
+ * are not met.
+ */
+bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len)
+{
+	GEM_BUG_ON((unsigned long)dst & 15);
+	GEM_BUG_ON((unsigned long)src & 15);
+
+	if (unlikely(len & 15))
+		return false;
+
+#ifdef CONFIG_AS_MOVNTDQA
+	if (static_branch_likely(&has_movntqa)) {
+		if (len)
+			__movntqda(dst, src, len);
+		return true;
+	}
+#endif
+
+	return false;
+}
+
+void i915_memcpy_init_early(struct drm_i915_private *dev_priv)
+{
+	if (static_cpu_has(X86_FEATURE_XMM4_1))
+		static_branch_enable(&has_movntqa);
+}
-- 
2.8.1
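
Note the corrected offsets in the unrolled loop above: each xmm register
carries 16 bytes, so one iteration moves 64 bytes at offsets 0/16/32/48,
whereas the 0/8/16/24 offsets of v1-v4 overlapped and broke the 16-byte
alignment that movntdqa/movaps require. A plain C equivalent of a single
unrolled iteration, for illustration:

	memcpy(dst +  0, src +  0, 16);	/* xmm0 */
	memcpy(dst + 16, src + 16, 16);	/* xmm1 */
	memcpy(dst + 32, src + 32, 16);	/* xmm2 */
	memcpy(dst + 48, src + 48, 16);	/* xmm3 */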


* ✗ Ro.CI.BAT: failure for drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory (rev4)
  2016-07-16 12:07 [PATCH] drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory Chris Wilson
                   ` (4 preceding siblings ...)
  2016-07-16 13:34 ` ✓ Ro.CI.BAT: success for drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory (rev3) Patchwork
@ 2016-07-16 16:12 ` Patchwork
  2016-07-18 10:59 ` ✗ Ro.CI.BAT: failure for drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory (rev5) Patchwork
  6 siblings, 0 replies; 27+ messages in thread
From: Patchwork @ 2016-07-16 16:12 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory (rev4)
URL   : https://patchwork.freedesktop.org/series/9949/
State : failure

== Summary ==

Series 9949v4 drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory
http://patchwork.freedesktop.org/api/1.0/series/9949/revisions/4/mbox

Test drv_module_reload_basic:
                pass       -> SKIP       (ro-hsw-i3-4010u)
Test gem_sync:
        Subgroup basic-store-each:
                dmesg-fail -> PASS       (ro-bdw-i7-5600u)
Test kms_pipe_crc_basic:
        Subgroup suspend-read-crc-pipe-a:
                pass       -> INCOMPLETE (fi-skl-i7-6700k)

fi-bsw-n3050     total:242  pass:188  dwarn:0   dfail:0   fail:7   skip:47 
fi-hsw-i7-4770k  total:242  pass:210  dwarn:0   dfail:0   fail:8   skip:24 
fi-kbl-qkkr      total:242  pass:175  dwarn:28  dfail:0   fail:8   skip:31 
fi-skl-i5-6260u  total:242  pass:219  dwarn:0   dfail:0   fail:7   skip:16 
fi-skl-i7-6700k  total:196  pass:171  dwarn:0   dfail:0   fail:0   skip:24 
fi-snb-i7-2600   total:242  pass:190  dwarn:0   dfail:0   fail:8   skip:44 
ro-bdw-i5-5250u  total:242  pass:214  dwarn:4   dfail:0   fail:7   skip:17 
ro-bdw-i7-5557U  total:242  pass:214  dwarn:1   dfail:0   fail:7   skip:20 
ro-bdw-i7-5600u  total:242  pass:200  dwarn:0   dfail:0   fail:7   skip:35 
ro-bsw-n3050     total:218  pass:173  dwarn:0   dfail:0   fail:2   skip:42 
ro-byt-n2820     total:242  pass:191  dwarn:0   dfail:0   fail:9   skip:42 
ro-hsw-i3-4010u  total:242  pass:205  dwarn:0   dfail:0   fail:8   skip:29 
ro-hsw-i7-4770r  total:242  pass:206  dwarn:0   dfail:0   fail:8   skip:28 
ro-ilk-i7-620lm  total:242  pass:166  dwarn:0   dfail:0   fail:9   skip:67 
ro-ilk1-i5-650   total:237  pass:166  dwarn:0   dfail:0   fail:9   skip:62 
ro-ivb-i7-3770   total:242  pass:197  dwarn:0   dfail:0   fail:8   skip:37 
ro-skl3-i5-6260u total:242  pass:218  dwarn:1   dfail:0   fail:7   skip:16 
ro-snb-i7-2620M  total:242  pass:188  dwarn:0   dfail:0   fail:9   skip:45 

Results at /archive/results/CI_IGT_test/RO_Patchwork_1511/

2fe098f drm-intel-nightly: 2016y-07m-15d-19h-42m-01s UTC integration manifest
102060b drm/i915: Use SSE4.1 movntdqa to accelerate reads from WC memory

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v5] drm/i915: Use SSE4.1 movntdqa to accelerate reads from WC memory
  2016-07-16 15:44     ` [PATCH v5] drm/i915: Use SSE4.1 movntdqa " Chris Wilson
@ 2016-07-17  3:21       ` Matt Turner
  2016-07-17  8:12         ` Chris Wilson
  2016-07-18  9:31       ` Tvrtko Ursulin
  1 sibling, 1 reply; 27+ messages in thread
From: Matt Turner @ 2016-07-17  3:21 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx, Akash Goel, Mika Kuoppala

On Sat, Jul 16, 2016 at 8:44 AM, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> This patch provides the infrastructure for performing a 16-byte aligned
> read from WC memory using non-temporal instructions introduced with sse4.1.
> Using movntdqa we can bypass the CPU caches and read directly from memory,
> ignoring the page attributes set on the CPU PTE, i.e. negating the impact
> of an otherwise UC access. Copying using movntqda from WC is almost as
> fast as reading from WB memory, modulo the possibility of both hitting
> the CPU cache or leaving the data in the CPU cache for the next consumer.
> (The CPU cache itself may be flushed for the region of the movntdqa, and
> on later access the movntdqa reads from a separate internal buffer for
> the cacheline.) The write back to memory is, however, cached.
>
> This will be used in later patches to accelerate accessing WC memory.
>
> v2: Report whether the accelerated copy is successful/possible.
> v3: Function alignment override was only necessary when using the
> function target("sse4.1") - which is not necessary for emitting movntdqa
> from __asm__.
> v4: Improve notes on CPU cache behaviour vs non-temporal stores.
> v5: Fix byte offsets for unrolled moves.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Akash Goel <akash.goel@intel.com>
> Cc: Damien Lespiau <damien.lespiau@intel.com>
> Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>  drivers/gpu/drm/i915/Makefile      |  3 ++
>  drivers/gpu/drm/i915/i915_drv.c    |  2 +
>  drivers/gpu/drm/i915/i915_drv.h    |  3 ++
>  drivers/gpu/drm/i915/i915_memcpy.c | 75 ++++++++++++++++++++++++++++++++++++++
>  4 files changed, 83 insertions(+)
>  create mode 100644 drivers/gpu/drm/i915/i915_memcpy.c
>
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index 75318ebb8d25..a53853daa998 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -3,12 +3,15 @@
>  # Direct Rendering Infrastructure (DRI) in XFree86 4.1.0 and higher.
>
>  subdir-ccflags-$(CONFIG_DRM_I915_WERROR) := -Werror
> +subdir-ccflags-y += \
> +       $(call as-instr,movntdqa (%eax)$(comma)%xmm0,-DCONFIG_AS_MOVNTDQA)
>
>  # Please keep these build lists sorted!
>
>  # core driver code
>  i915-y := i915_drv.o \
>           i915_irq.o \
> +         i915_memcpy.o \
>           i915_params.o \
>           i915_pci.o \
>            i915_suspend.o \
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index c5b7b8e0678a..16612542496a 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -848,6 +848,8 @@ static int i915_driver_init_early(struct drm_i915_private *dev_priv,
>         mutex_init(&dev_priv->wm.wm_mutex);
>         mutex_init(&dev_priv->pps_mutex);
>
> +       i915_memcpy_init_early(dev_priv);
> +
>         ret = i915_workqueues_init(dev_priv);
>         if (ret < 0)
>                 return ret;
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 27d9b2c374b3..3c266e7866ba 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -4070,4 +4070,7 @@ static inline bool __i915_request_irq_complete(struct drm_i915_gem_request *req)
>         return false;
>  }
>
> +void i915_memcpy_init_early(struct drm_i915_private *dev_priv);
> +bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len);
> +
>  #endif
> diff --git a/drivers/gpu/drm/i915/i915_memcpy.c b/drivers/gpu/drm/i915/i915_memcpy.c
> new file mode 100644
> index 000000000000..4ed4d3bb2f3e
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/i915_memcpy.c
> @@ -0,0 +1,75 @@
> +#include "i915_drv.h"
> +
> +DEFINE_STATIC_KEY_FALSE(has_movntqa);
> +
> +#ifdef CONFIG_AS_MOVNTDQA
> +static void __movntqda(void *dst, const void *src, unsigned long len)

Many times, you've spelled this movntqda but the instruction mnemonic
is movntdqa (as seen below). I'm not sure if I'm missing something, or
if this typo has somehow slipped in so many times, including in the
commit message.

> +{
> +       len >>= 4;
> +       while (len >= 4) {
> +               __asm__ __volatile__(
> +               "movntdqa (%0), %%xmm0\n"
> +               "movntdqa 16(%0), %%xmm1\n"
> +               "movntdqa 32(%0), %%xmm2\n"
> +               "movntdqa 48(%0), %%xmm3\n"
> +               "movaps %%xmm0, (%1)\n"
> +               "movaps %%xmm1, 16(%1)\n"
> +               "movaps %%xmm2, 32(%1)\n"
> +               "movaps %%xmm3, 48(%1)\n"
> +               : : "r" (src), "r" (dst) : "memory");
> +               src += 64;
> +               dst += 64;
> +               len -= 4;
> +       }
> +       while (len--) {
> +               __asm__ __volatile__(
> +               "movntdqa (%0), %%xmm0\n"
> +               "movaps %%xmm0, (%1)\n"
> +               : : "r" (src), "r" (dst) : "memory");
> +               src += 16;
> +               dst += 16;
> +       }
> +}
> +#endif
> +
> +/**
> + * i915_memcpy_from_wc: perform an accelerated *aligned* read from WC
> + * @dst: destination pointer
> + * @src: source pointer
> + * @len: how many bytes to copy
> + *
> + * i915_memcpy_from_wc copies @len bytes from @src to @dst using
> + * non-temporal instructions where available. Note that all arguments
> + * (@src, @dst) must be aligned to 16 bytes and @len must be a multiple
> + * of 16.
> + *
> + * To test whether accelerated reads from WC are supported, use
> + * i915_memcpy_from_wc(NULL, NULL, 0);
> + *
> + * Returns true if the copy was successful, false if the preconditions
> + * are not met.
> + */
> +bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len)
> +{
> +       GEM_BUG_ON((unsigned long)dst & 15);
> +       GEM_BUG_ON((unsigned long)src & 15);
> +
> +       if (unlikely(len & 15))
> +               return false;
> +
> +#ifdef CONFIG_AS_MOVNTDQA
> +       if (static_branch_likely(&has_movntqa)) {
> +               if (len)
> +                       __movntqda(dst, src, len);
> +               return true;
> +       }
> +#endif
> +
> +       return false;
> +}
> +
> +void i915_memcpy_init_early(struct drm_i915_private *dev_priv)
> +{
> +       if (static_cpu_has(X86_FEATURE_XMM4_1))
> +               static_branch_enable(&has_movntqa);
> +}
> --
> 2.8.1
>

* Re: [PATCH v5] drm/i915: Use SSE4.1 movntdqa to accelerate reads from WC memory
  2016-07-17  3:21       ` Matt Turner
@ 2016-07-17  8:12         ` Chris Wilson
  0 siblings, 0 replies; 27+ messages in thread
From: Chris Wilson @ 2016-07-17  8:12 UTC (permalink / raw)
  To: Matt Turner; +Cc: intel-gfx, Akash Goel, Mika Kuoppala

On Sat, Jul 16, 2016 at 08:21:59PM -0700, Matt Turner wrote:
> On Sat, Jul 16, 2016 at 8:44 AM, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > +#ifdef CONFIG_AS_MOVNTDQA
> > +static void __movntqda(void *dst, const void *src, unsigned long len)
> 
> Many times, you've spelled this movntqda but the instruction mnemonic
> is movntdqa (as seen below). I'm not sure if I'm missing something, or
> if this typo has somehow slipped in so many times, including in the
> commit message.

I know, my fingers just do not accept that it should be ntdqa. And
because the beginning/end of the word is correct, the transposition in
the middle is invisible when skimming over it.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [PATCH v5] drm/i915: Use SSE4.1 movntdqa to accelerate reads from WC memory
  2016-07-16 15:44     ` [PATCH v5] drm/i915: Use SSE4.1 movntdqa " Chris Wilson
  2016-07-17  3:21       ` Matt Turner
@ 2016-07-18  9:31       ` Tvrtko Ursulin
  2016-07-18 10:01         ` Chris Wilson
  1 sibling, 1 reply; 27+ messages in thread
From: Tvrtko Ursulin @ 2016-07-18  9:31 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: Akash Goel, Mika Kuoppala


On 16/07/16 16:44, Chris Wilson wrote:
> This patch provides the infrastructure for performing a 16-byte aligned
> read from WC memory using non-temporal instructions introduced with sse4.1.
> Using movntdqa we can bypass the CPU caches and read directly from memory,
> ignoring the page attributes set on the CPU PTE, i.e. negating the impact
> of an otherwise UC access. Copying using movntqda from WC is almost as
> fast as reading from WB memory, modulo the possibility of both hitting
> the CPU cache or leaving the data in the CPU cache for the next consumer.
> (The CPU cache itself may be flushed for the region of the movntdqa, and
> on later access the movntdqa reads from a separate internal buffer for
> the cacheline.) The write back to memory is, however, cached.
>
> This will be used in later patches to accelerate accessing WC memory.
>
> v2: Report whether the accelerated copy is successful/possible.
> v3: Function alignment override was only necessary when using the
> function target("sse4.1") - which is not necessary for emitting movntdqa
> from __asm__.
> v4: Improve notes on CPU cache behaviour vs non-temporal stores.
> v5: Fix byte offsets for unrolled moves.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Akash Goel <akash.goel@intel.com>
> Cc: Damien Lespiau <damien.lespiau@intel.com>
> Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>   drivers/gpu/drm/i915/Makefile      |  3 ++
>   drivers/gpu/drm/i915/i915_drv.c    |  2 +
>   drivers/gpu/drm/i915/i915_drv.h    |  3 ++
>   drivers/gpu/drm/i915/i915_memcpy.c | 75 ++++++++++++++++++++++++++++++++++++++
>   4 files changed, 83 insertions(+)
>   create mode 100644 drivers/gpu/drm/i915/i915_memcpy.c
>
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index 75318ebb8d25..a53853daa998 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -3,12 +3,15 @@
>   # Direct Rendering Infrastructure (DRI) in XFree86 4.1.0 and higher.
>
>   subdir-ccflags-$(CONFIG_DRM_I915_WERROR) := -Werror
> +subdir-ccflags-y += \
> +	$(call as-instr,movntdqa (%eax)$(comma)%xmm0,-DCONFIG_AS_MOVNTDQA)
>
>   # Please keep these build lists sorted!
>
>   # core driver code
>   i915-y := i915_drv.o \
>   	  i915_irq.o \
> +	  i915_memcpy.o \
>   	  i915_params.o \
>   	  i915_pci.o \
>             i915_suspend.o \
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index c5b7b8e0678a..16612542496a 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -848,6 +848,8 @@ static int i915_driver_init_early(struct drm_i915_private *dev_priv,
>   	mutex_init(&dev_priv->wm.wm_mutex);
>   	mutex_init(&dev_priv->pps_mutex);
>
> +	i915_memcpy_init_early(dev_priv);
> +
>   	ret = i915_workqueues_init(dev_priv);
>   	if (ret < 0)
>   		return ret;
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 27d9b2c374b3..3c266e7866ba 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -4070,4 +4070,7 @@ static inline bool __i915_request_irq_complete(struct drm_i915_gem_request *req)
>   	return false;
>   }
>
> +void i915_memcpy_init_early(struct drm_i915_private *dev_priv);
> +bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len);
> +
>   #endif
> diff --git a/drivers/gpu/drm/i915/i915_memcpy.c b/drivers/gpu/drm/i915/i915_memcpy.c
> new file mode 100644
> index 000000000000..4ed4d3bb2f3e
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/i915_memcpy.c
> @@ -0,0 +1,75 @@
> +#include "i915_drv.h"
> +
> +DEFINE_STATIC_KEY_FALSE(has_movntqa);
> +
> +#ifdef CONFIG_AS_MOVNTDQA
> +static void __movntqda(void *dst, const void *src, unsigned long len)
> +{
> +	len >>= 4;
> +	while (len >= 4) {
> +		__asm__ __volatile__(
> +		"movntdqa (%0), %%xmm0\n"
> +		"movntdqa 16(%0), %%xmm1\n"
> +		"movntdqa 32(%0), %%xmm2\n"
> +		"movntdqa 48(%0), %%xmm3\n"
> +		"movaps %%xmm0, (%1)\n"
> +		"movaps %%xmm1, 16(%1)\n"
> +		"movaps %%xmm2, 32(%1)\n"
> +		"movaps %%xmm3, 48(%1)\n"
> +		: : "r" (src), "r" (dst) : "memory");
> +		src += 64;
> +		dst += 64;
> +		len -= 4;
> +	}
> +	while (len--) {
> +		__asm__ __volatile__(
> +		"movntdqa (%0), %%xmm0\n"
> +		"movaps %%xmm0, (%1)\n"
> +		: : "r" (src), "r" (dst) : "memory");
> +		src += 16;
> +		dst += 16;
> +	}
> +}
> +#endif

Is it okay nowadays to just use these registers in the kernel?

Many years ago when I last looked into this, use of the FPU and MMX
registers was discouraged and needed an explicit kernel_fpu_begin/end
around the block, since they were not saved/restored by the kernel and
using them without that would corrupt the userspace context.

Perhaps these new registers are different, or things have generally
changed since then?

Regards,

Tvrtko

> +
> +/**
> + * i915_memcpy_from_wc: perform an accelerated *aligned* read from WC
> + * @dst: destination pointer
> + * @src: source pointer
> + * @len: how many bytes to copy
> + *
> + * i915_memcpy_from_wc copies @len bytes from @src to @dst using
> + * non-temporal instructions where available. Note that all arguments
> + * (@src, @dst) must be aligned to 16 bytes and @len must be a multiple
> + * of 16.
> + *
> + * To test whether accelerated reads from WC are supported, use
> + * i915_memcpy_from_wc(NULL, NULL, 0);
> + *
> + * Returns true if the copy was successful, false if the preconditions
> + * are not met.
> + */
> +bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len)
> +{
> +	GEM_BUG_ON((unsigned long)dst & 15);
> +	GEM_BUG_ON((unsigned long)src & 15);
> +
> +	if (unlikely(len & 15))
> +		return false;
> +
> +#ifdef CONFIG_AS_MOVNTDQA
> +	if (static_branch_likely(&has_movntqa)) {
> +		if (len)
> +			__movntqda(dst, src, len);
> +		return true;
> +	}
> +#endif
> +
> +	return false;
> +}
> +
> +void i915_memcpy_init_early(struct drm_i915_private *dev_priv)
> +{
> +	if (static_cpu_has(X86_FEATURE_XMM4_1))
> +		static_branch_enable(&has_movntqa);
> +}
>

* Re: [PATCH v5] drm/i915: Use SSE4.1 movntdqa to accelerate reads from WC memory
  2016-07-18  9:31       ` Tvrtko Ursulin
@ 2016-07-18 10:01         ` Chris Wilson
  2016-07-18 10:07           ` [PATCH] " Chris Wilson
  0 siblings, 1 reply; 27+ messages in thread
From: Chris Wilson @ 2016-07-18 10:01 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx, Akash Goel, Mika Kuoppala

On Mon, Jul 18, 2016 at 10:31:05AM +0100, Tvrtko Ursulin wrote:
> On 16/07/16 16:44, Chris Wilson wrote:
> Is it okay nowadays to just use these registers in the kernel?
> 
> Many years ago when I last looked into this FPU and MMX registers
> were discouraged against and needed explicit kernel_gpu_begin/end
> around the block. Since they were not saved/restored by the kernel
> and doing otherwise would mess up the userspace context.
>
> Perhaps these new registers are different or the things have
> generally changed since then?

Hmm, I thought kernel_fpu_begin() was explicitly to handle the lack of
context save/restore of the FPU registers (a different register block
from the xmm registers). However, I missed that the existing sse2/avx2
code also uses it.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
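
For reference, the convention the v6 patch below adopts is the standard
<asm/fpu/api.h> pairing: kernel_fpu_begin() saves the user FPU/SSE state
and disables preemption, after which scribbling on the xmm registers is
safe until kernel_fpu_end() restores that state:

	kernel_fpu_begin();
	/* movntdqa/movaps sequences may clobber %xmm0-%xmm3 here */
	kernel_fpu_end();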

* [PATCH] drm/i915: Use SSE4.1 movntdqa to accelerate reads from WC memory
  2016-07-18 10:01         ` Chris Wilson
@ 2016-07-18 10:07           ` Chris Wilson
  2016-07-18 10:29             ` Chris Wilson
                               ` (2 more replies)
  0 siblings, 3 replies; 27+ messages in thread
From: Chris Wilson @ 2016-07-18 10:07 UTC (permalink / raw)
  To: intel-gfx; +Cc: Akash Goel, Mika Kuoppala

This patch provides the infrastructure for performing a 16-byte aligned
read from WC memory using non-temporal instructions introduced with sse4.1.
Using movntdqa we can bypass the CPU caches and read directly from memory,
ignoring the page attributes set on the CPU PTE, i.e. negating the impact
of an otherwise UC access. Copying using movntdqa from WC is almost as
fast as reading from WB memory, modulo the possibility of both hitting
the CPU cache or leaving the data in the CPU cache for the next consumer.
(The CPU cache itself may be flushed for the region of the movntdqa, and
on later access the movntdqa reads from a separate internal buffer for
the cacheline.) The write back to memory is, however, cached.

This will be used in later patches to accelerate accessing WC memory.

v2: Report whether the accelerated copy is successful/possible.
v3: Function alignment override was only necessary when using the
function target("sse4.1") - which is not necessary for emitting movntdqa
from __asm__.
v4: Improve notes on CPU cache behaviour vs non-temporal stores.
v5: Fix byte offsets for unrolled moves.
v6: Find all remaining typos of movntqda, use kernel_fpu_begin.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Akash Goel <akash.goel@intel.com>
Cc: Damien Lespiau <damien.lespiau@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/Makefile      |   3 ++
 drivers/gpu/drm/i915/i915_drv.c    |   2 +
 drivers/gpu/drm/i915/i915_drv.h    |   3 ++
 drivers/gpu/drm/i915/i915_memcpy.c | 101 +++++++++++++++++++++++++++++++++++++
 4 files changed, 109 insertions(+)
 create mode 100644 drivers/gpu/drm/i915/i915_memcpy.c

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index dda724f04445..3412413408c0 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -3,12 +3,15 @@
 # Direct Rendering Infrastructure (DRI) in XFree86 4.1.0 and higher.
 
 subdir-ccflags-$(CONFIG_DRM_I915_WERROR) := -Werror
+subdir-ccflags-y += \
+	$(call as-instr,movntdqa (%eax)$(comma)%xmm0,-DCONFIG_AS_MOVNTDQA)
 
 # Please keep these build lists sorted!
 
 # core driver code
 i915-y := i915_drv.o \
 	  i915_irq.o \
+	  i915_memcpy.o \
 	  i915_params.o \
 	  i915_pci.o \
           i915_suspend.o \
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 640e3c34d3cd..f1034eb6853b 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -827,6 +827,8 @@ static int i915_driver_init_early(struct drm_i915_private *dev_priv,
 	mutex_init(&dev_priv->wm.wm_mutex);
 	mutex_init(&dev_priv->pps_mutex);
 
+	i915_memcpy_init_early(dev_priv);
+
 	ret = i915_workqueues_init(dev_priv);
 	if (ret < 0)
 		return ret;
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 86dcde564de0..f8ae3f3e3af4 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3916,4 +3916,7 @@ static inline bool __i915_request_irq_complete(struct drm_i915_gem_request *req)
 	return false;
 }
 
+void i915_memcpy_init_early(struct drm_i915_private *dev_priv);
+bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len);
+
 #endif
diff --git a/drivers/gpu/drm/i915/i915_memcpy.c b/drivers/gpu/drm/i915/i915_memcpy.c
new file mode 100644
index 000000000000..50fc5799255f
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_memcpy.c
@@ -0,0 +1,101 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <asm/fpu/api.h>
+
+#include "i915_drv.h"
+
+DEFINE_STATIC_KEY_FALSE(has_movntdqa);
+
+#ifdef CONFIG_AS_MOVNTDQA
+static void __memcpy_ntdqa(void *dst, const void *src, unsigned long len)
+{
+	kernel_fpu_begin();
+
+	len >>= 4;
+	while (len >= 4) {
+		asm("movntdqa   (%0), %%xmm0\n"
+		    "movntdqa 16(%0), %%xmm1\n"
+		    "movntdqa 32(%0), %%xmm2\n"
+		    "movntdqa 48(%0), %%xmm3\n"
+		    "movaps %%xmm0,   (%1)\n"
+		    "movaps %%xmm1, 16(%1)\n"
+		    "movaps %%xmm2, 32(%1)\n"
+		    "movaps %%xmm3, 48(%1)\n"
+		    :: "r" (src), "r" (dst) : "memory");
+		src += 64;
+		dst += 64;
+		len -= 4;
+	}
+	while (len--) {
+		asm("movntdqa (%0), %%xmm0\n"
+		    "movaps %%xmm0, (%1)\n"
+		    :: "r" (src), "r" (dst) : "memory");
+		src += 16;
+		dst += 16;
+	}
+
+	kernel_fpu_end();
+}
+#endif
+
+/**
+ * i915_memcpy_from_wc: perform an accelerated *aligned* read from WC
+ * @dst: destination pointer
+ * @src: source pointer
+ * @len: how many bytes to copy
+ *
+ * i915_memcpy_from_wc copies @len bytes from @src to @dst using
+ * non-temporal instructions where available. Note that all arguments
+ * (@src, @dst) must be aligned to 16 bytes and @len must be a multiple
+ * of 16.
+ *
+ * To test whether accelerated reads from WC are supported, use
+ * i915_memcpy_from_wc(NULL, NULL, 0);
+ *
+ * Returns true if the copy was successful, false if the preconditions
+ * are not met.
+ */
+bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len)
+{
+	if (unlikely(((unsigned long)dst | (unsigned long)src | len) & 15))
+		return false;
+
+#ifdef CONFIG_AS_MOVNTDQA
+	if (static_branch_likely(&has_movntdqa)) {
+		if (len)
+			__memcpy_ntdqa(dst, src, len);
+		return true;
+	}
+#endif
+
+	return false;
+}
+
+void i915_memcpy_init_early(struct drm_i915_private *dev_priv)
+{
+	if (static_cpu_has(X86_FEATURE_XMM4_1))
+		static_branch_enable(&has_movntdqa);
+}
-- 
2.8.1


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [PATCH] drm/i915: Use SSE4.1 movntdqa to accelerate reads from WC memory
  2016-07-18 10:07           ` [PATCH] " Chris Wilson
@ 2016-07-18 10:29             ` Chris Wilson
  2016-07-18 11:15             ` Tvrtko Ursulin
  2016-07-19  6:50             ` Daniel Vetter
  2 siblings, 0 replies; 27+ messages in thread
From: Chris Wilson @ 2016-07-18 10:29 UTC (permalink / raw)
  To: intel-gfx; +Cc: Akash Goel, Mika Kuoppala

On Mon, Jul 18, 2016 at 11:07:14AM +0100, Chris Wilson wrote:
> This patch provides the infrastructure for performing a 16-byte aligned
> read from WC memory using non-temporal instructions introduced with sse4.1.
> Using movntdqa we can bypass the CPU caches and read directly from memory
> and ignoring the page attributes set on the CPU PTE i.e. negating the
> impact of an otherwise UC access. Copying using movntqda from WC is almost

                                                  ARGH.
						  -Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 27+ messages in thread

* ✗ Ro.CI.BAT: failure for drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory (rev5)
  2016-07-16 12:07 [PATCH] drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory Chris Wilson
                   ` (5 preceding siblings ...)
  2016-07-16 16:12 ` ✗ Ro.CI.BAT: failure for drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory (rev4) Patchwork
@ 2016-07-18 10:59 ` Patchwork
  6 siblings, 0 replies; 27+ messages in thread
From: Patchwork @ 2016-07-18 10:59 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory (rev5)
URL   : https://patchwork.freedesktop.org/series/9949/
State : failure

== Summary ==

Series 9949v5 drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory
http://patchwork.freedesktop.org/api/1.0/series/9949/revisions/5/mbox

Test gem_sync:
        Subgroup basic-store-each:
                pass       -> DMESG-FAIL (ro-bdw-i7-5600u)

fi-hsw-i7-4770k  total:242  pass:210  dwarn:0   dfail:0   fail:12  skip:20 
fi-kbl-qkkr      total:242  pass:176  dwarn:29  dfail:0   fail:10  skip:27 
fi-skl-i5-6260u  total:242  pass:219  dwarn:0   dfail:0   fail:11  skip:12 
fi-snb-i7-2600   total:242  pass:190  dwarn:0   dfail:0   fail:12  skip:40 
ro-bdw-i5-5250u  total:243  pass:214  dwarn:4   dfail:0   fail:12  skip:13 
ro-bdw-i7-5600u  total:243  pass:199  dwarn:0   dfail:1   fail:11  skip:32 
ro-bsw-n3050     total:243  pass:173  dwarn:0   dfail:0   fail:3   skip:42 
ro-byt-n2820     total:243  pass:191  dwarn:0   dfail:0   fail:14  skip:38 
ro-hsw-i3-4010u  total:243  pass:206  dwarn:0   dfail:0   fail:13  skip:24 
ro-hsw-i7-4770r  total:243  pass:206  dwarn:0   dfail:0   fail:13  skip:24 
ro-ilk-i7-620lm  total:243  pass:166  dwarn:0   dfail:0   fail:14  skip:63 
ro-ilk1-i5-650   total:238  pass:166  dwarn:0   dfail:0   fail:14  skip:58 
ro-ivb-i7-3770   total:243  pass:197  dwarn:0   dfail:0   fail:13  skip:33 
ro-skl3-i5-6260u total:243  pass:218  dwarn:1   dfail:0   fail:12  skip:12 
ro-snb-i7-2620M  total:243  pass:188  dwarn:0   dfail:0   fail:13  skip:42 
fi-bsw-n3050 failed to connect after reboot
ro-bdw-i7-5557U failed to connect after reboot

Results at /archive/results/CI_IGT_test/RO_Patchwork_1515/

74bd981 drm-intel-nightly: 2016y-07m-18d-10h-05m-42s UTC integration manifest
9acc3fd drm/i915: Use SSE4.1 movntdqa to accelerate reads from WC memory


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] drm/i915: Use SSE4.1 movntdqa to accelerate reads from WC memory
  2016-07-18 10:07           ` [PATCH] " Chris Wilson
  2016-07-18 10:29             ` Chris Wilson
@ 2016-07-18 11:15             ` Tvrtko Ursulin
  2016-07-18 11:35               ` Chris Wilson
  2016-07-19  6:50             ` Daniel Vetter
  2 siblings, 1 reply; 27+ messages in thread
From: Tvrtko Ursulin @ 2016-07-18 11:15 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: Akash Goel, Mika Kuoppala


On 18/07/16 11:07, Chris Wilson wrote:
> This patch provides the infrastructure for performing a 16-byte aligned
> read from WC memory using non-temporal instructions introduced with sse4.1.
> Using movntdqa we can bypass the CPU caches and read directly from memory
> and ignoring the page attributes set on the CPU PTE i.e. negating the
> impact of an otherwise UC access. Copying using movntqda from WC is almost
> as fast as reading from WB memory, modulo the possibility of both hitting
> the CPU cache or leaving the data in the CPU cache for the next consumer.
> (The CPU cache itself may be flushed for the region of the movntdqa and on
> later access the movntdqa reads from a separate internal buffer for the
> cacheline.) The write back to the memory is however cached.
>
> This will be used in later patches to accelerate accessing WC memory.
>
> v2: Report whether the accelerated copy is successful/possible.
> v3: Function alignment override was only necessary when using the
> function target("sse4.1") - which is not necessary for emitting movntdqa
> from __asm__.
> v4: Improve notes on CPU cache behaviour vs non-temporal stores.
> v5: Fix byte offsets for unrolled moves.
> v6: Find all remaining typos of movntqda, use kernel_fpu_begin.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Akash Goel <akash.goel@intel.com>
> Cc: Damien Lespiau <damien.lespiau@intel.com>
> Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>   drivers/gpu/drm/i915/Makefile      |   3 ++
>   drivers/gpu/drm/i915/i915_drv.c    |   2 +
>   drivers/gpu/drm/i915/i915_drv.h    |   3 ++
>   drivers/gpu/drm/i915/i915_memcpy.c | 101 +++++++++++++++++++++++++++++++++++++
>   4 files changed, 109 insertions(+)
>   create mode 100644 drivers/gpu/drm/i915/i915_memcpy.c
>
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index dda724f04445..3412413408c0 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -3,12 +3,15 @@
>   # Direct Rendering Infrastructure (DRI) in XFree86 4.1.0 and higher.
>
>   subdir-ccflags-$(CONFIG_DRM_I915_WERROR) := -Werror
> +subdir-ccflags-y += \
> +	$(call as-instr,movntdqa (%eax)$(comma)%xmm0,-DCONFIG_AS_MOVNTDQA)
>
>   # Please keep these build lists sorted!
>
>   # core driver code
>   i915-y := i915_drv.o \
>   	  i915_irq.o \
> +	  i915_memcpy.o \
>   	  i915_params.o \
>   	  i915_pci.o \
>             i915_suspend.o \
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index 640e3c34d3cd..f1034eb6853b 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -827,6 +827,8 @@ static int i915_driver_init_early(struct drm_i915_private *dev_priv,
>   	mutex_init(&dev_priv->wm.wm_mutex);
>   	mutex_init(&dev_priv->pps_mutex);
>
> +	i915_memcpy_init_early(dev_priv);
> +
>   	ret = i915_workqueues_init(dev_priv);
>   	if (ret < 0)
>   		return ret;
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 86dcde564de0..f8ae3f3e3af4 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -3916,4 +3916,7 @@ static inline bool __i915_request_irq_complete(struct drm_i915_gem_request *req)
>   	return false;
>   }
>
> +void i915_memcpy_init_early(struct drm_i915_private *dev_priv);
> +bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len);
> +
>   #endif
> diff --git a/drivers/gpu/drm/i915/i915_memcpy.c b/drivers/gpu/drm/i915/i915_memcpy.c
> new file mode 100644
> index 000000000000..50fc5799255f
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/i915_memcpy.c
> @@ -0,0 +1,101 @@
> +/*
> + * Copyright © 2016 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + *
> + */
> +
> +#include <linux/kernel.h>
> +#include <asm/fpu/api.h>
> +
> +#include "i915_drv.h"
> +
> +DEFINE_STATIC_KEY_FALSE(has_movntdqa);
> +
> +#ifdef CONFIG_AS_MOVNTDQA
> +static void __memcpy_ntdqa(void *dst, const void *src, unsigned long len)
> +{
> +	kernel_fpu_begin();
> +
> +	len >>= 4;
> +	while (len >= 4) {
> +		asm("movntdqa   (%0), %%xmm0\n"
> +		    "movntdqa 16(%0), %%xmm1\n"
> +		    "movntdqa 32(%0), %%xmm2\n"
> +		    "movntdqa 48(%0), %%xmm3\n"
> +		    "movaps %%xmm0,   (%1)\n"
> +		    "movaps %%xmm1, 16(%1)\n"
> +		    "movaps %%xmm2, 32(%1)\n"
> +		    "movaps %%xmm3, 48(%1)\n"
> +		    :: "r" (src), "r" (dst) : "memory");
> +		src += 64;
> +		dst += 64;
> +		len -= 4;
> +	}
> +	while (len--) {
> +		asm("movntdqa (%0), %%xmm0\n"
> +		    "movaps %%xmm0, (%1)\n"
> +		    :: "r" (src), "r" (dst) : "memory");
> +		src += 16;
> +		dst += 16;

I am not sure about this, but looking at the raid6 for example, it has a 
lot more annotations in cases like this.

It seems to be telling the compiler which memory ranges each
instruction accesses, and also uses "asm volatile" - whether or not that
is really needed I don't know.

For example:
                 asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d]));

And:
                 asm volatile("movdqa %%xmm4,%0" : "=m" (q[d]));

Each one is telling the compiler the instruction is either reading or 
writing respectively from a certain memory address.

You don't have any of that, and don't even specify anything as an output
parameter, so I am not sure if your code is safe.

> +	}
> +
> +	kernel_fpu_end();
> +}
> +#endif
> +
> +/**
> + * i915_memcpy_from_wc: perform an accelerated *aligned* read from WC
> + * @dst: destination pointer
> + * @src: source pointer
> + * @len: how many bytes to copy
> + *
> + * i915_memcpy_from_wc copies @len bytes from @src to @dst using
> + * non-temporal instructions where available. Note that all arguments
> + * (@src, @dst) must be aligned to 16 bytes and @len must be a multiple
> + * of 16.
> + *
> + * To test whether accelerated reads from WC are supported, use
> + * i915_memcpy_from_wc(NULL, NULL, 0);
> + *
> + * Returns true if the copy was successful, false if the preconditions
> + * are not met.
> + */
> +bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len)
> +{
> +	if (unlikely(((unsigned long)dst | (unsigned long)src | len) & 15))
> +		return false;
> +
> +#ifdef CONFIG_AS_MOVNTDQA
> +	if (static_branch_likely(&has_movntdqa)) {
> +		if (len)
> +			__memcpy_ntdqa(dst, src, len);
> +		return true;
> +	}
> +#endif
> +
> +	return false;
> +}
> +
> +void i915_memcpy_init_early(struct drm_i915_private *dev_priv)
> +{
> +	if (static_cpu_has(X86_FEATURE_XMM4_1))
> +		static_branch_enable(&has_movntdqa);
> +}
>

I was not familiar with static key stuff and the only thing I can notice 
is that it is used very little throughout the kernel. On the other hand 
I haven't found any references in the documentation that it should be 
used sparingly or something.

But the general question would be - is it worth it here? Static branches 
should be really efficient in the off case, correct? And we don't really 
care about the performance of the off case here. So would it be just as 
good to use a normal branch?

On the other hand, maybe there is no special reason why these are not
used more, so maybe you are completely fine to use them...

Regards,

Tvrtko




^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] drm/i915: Use SSE4.1 movntdqa to accelerate reads from WC memory
  2016-07-18 11:15             ` Tvrtko Ursulin
@ 2016-07-18 11:35               ` Chris Wilson
  2016-07-18 11:57                 ` Dave Gordon
  2016-07-18 12:07                 ` Tvrtko Ursulin
  0 siblings, 2 replies; 27+ messages in thread
From: Chris Wilson @ 2016-07-18 11:35 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx, Akash Goel, Mika Kuoppala

On Mon, Jul 18, 2016 at 12:15:32PM +0100, Tvrtko Ursulin wrote:
> I am not sure about this, but looking at the raid6 for example, it
> has a lot more annotations in cases like this.
> 
> It seems to be telling the compiler which memory ranges each
> instruction accesses, and also uses "asm volatile" - whether or not
> that is really needed I don't know.
> 
> For example:
>                 asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d]));
>
> And:
>                 asm volatile("movdqa %%xmm4,%0" : "=m" (q[d]));
> 
> Each one is telling the compiler the instruction is either reading
> or writing respectively from a certain memory address.
> 
> You don't have any of that, and don't even specify anything as an
> output parameter, so I am not sure if your code is safe.

The asm is correct. We do not modify either of the two pointers which we
pass in via register inputs, but the memory behind them - hence the memory
clobber.

> >+void i915_memcpy_init_early(struct drm_i915_private *dev_priv)
> >+{
> >+	if (static_cpu_has(X86_FEATURE_XMM4_1))
> >+		static_branch_enable(&has_movntdqa);
> >+}
> >
> 
> I was not familiar with static key stuff and the only thing I can
> notice is that it is used very little throughout the kernel. On the
> other hand I haven't found any references in the documentation that
> it should be used sparingly or something.
> 
> But the general question would be - is it worth it here? Static
> branches should be really efficient in the off case, correct? And we
> don't really care about the performance of the off case here. So
> would it be just as good to use a normal branch?

It's not the cost of the branch, but the static_cpu_has() in comparison
to a small copy.
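
To illustrate with a minimal sketch of the pattern (not the full patch;
the helper names here are invented):

	#include <linux/jump_label.h>
	#include <asm/cpufeature.h>

	DEFINE_STATIC_KEY_FALSE(has_fast_copy);	/* hypothetical key */

	void fast_copy_init(void)	/* run once, early in driver init */
	{
		/* Enabling the key patches every branch site in place. */
		if (static_cpu_has(X86_FEATURE_XMM4_1))
			static_branch_enable(&has_fast_copy);
	}

	bool fast_copy_possible(void)
	{
		/*
		 * Compiles to a single nop (or short jmp) once patched -
		 * no flag load and no feature test on every call, which
		 * matters when the copy itself is only a few cachelines.
		 */
		return static_branch_likely(&has_fast_copy);
	}
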
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] drm/i915: Use SSE4.1 movntdqa to accelerate reads from WC memory
  2016-07-18 11:35               ` Chris Wilson
@ 2016-07-18 11:57                 ` Dave Gordon
  2016-07-18 12:56                   ` Tvrtko Ursulin
  2016-07-18 12:07                 ` Tvrtko Ursulin
  1 sibling, 1 reply; 27+ messages in thread
From: Dave Gordon @ 2016-07-18 11:57 UTC (permalink / raw)
  To: Chris Wilson, Tvrtko Ursulin, intel-gfx, Akash Goel, Mika Kuoppala

On 18/07/16 12:35, Chris Wilson wrote:
> On Mon, Jul 18, 2016 at 12:15:32PM +0100, Tvrtko Ursulin wrote:
>> I am not sure about this, but looking at the raid6 for example, it
>> has a lot more annotations in cases like this.
>>
>> It seems to be telling the compiler which memory ranges each
>> instruction accesses, and also uses "asm volatile" - whether or not
>> that is really needed I don't know.
>>
>> For example:
>>                  asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d]));
>>
>> And:
>>                  asm volatile("movdqa %%xmm4,%0" : "=m" (q[d]));
>>
>> Each one is telling the compiler the instruction is either reading
>> or writing respectively from a certain memory address.
>>
>> You don't have any of that, and don't even specify anything as an
>> output parameter, so I am not sure if your code is safe.
>
> The asm is correct. We do not modify either of the two pointers which we
> pass in via register inputs, but the memory behind them - hence the memory
> clobber.

This is a choice of how much we let the compiler decide about 
addressing, and how much we tell it about what the asm code really does. 
The examples above get the compiler to generate *any* suitable 
addressing mode for each specific location involved in the transfers, so 
the compiler knows a lot about what's happening and can track where each 
datum comes from and goes to.

OTOH Chris' code

+        asm("movntdqa   (%0), %%xmm0\n"
+            "movntdqa 16(%0), %%xmm1\n"
+            "movntdqa 32(%0), %%xmm2\n"
+            "movntdqa 48(%0), %%xmm3\n"
+            "movaps %%xmm0,   (%1)\n"
+            "movaps %%xmm1, 16(%1)\n"
+            "movaps %%xmm2, 32(%1)\n"
+            "movaps %%xmm3, 48(%1)\n"
+            :: "r" (src), "r" (dst) : "memory");

- doesn't need "volatile" because asm statements that have no output 
operands are implicitly volatile.

- makes the compiler give us the source and destination *addresses* in a 
register each; beyond that, it doesn't know what we're doing with them, 
so the third ("clobbers") parameter has to say "memory" i.e. treat *all* 
memory contents as unknown after this.

[[From GCC docs: The "memory" clobber tells the compiler that the 
assembly code performs memory reads or writes to items other than those 
listed in the input and output operands (for example, accessing the 
memory pointed to by one of the input parameters). To ensure memory 
contains correct values, GCC may need to flush specific register values 
to memory before executing the asm. Further, the compiler does not 
assume that any values read from memory before an asm remain unchanged 
after that asm; it reloads them as needed. Using the "memory" clobber 
effectively forms a read/write memory barrier for the compiler.]]

BTW, should we not tell it we've *also* clobbered %xmm[0-3]?

So they're both correct, just taking different approaches. I don't know 
which would give the best performance for this specific case.
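
To make the contrast concrete, here is a single 16-byte move written
both ways (a sketch only - the function names are invented, and I've
added the xmm clobber in both):

	/* Style 1: opaque pointer inputs plus the big-hammer clobber. */
	static void copy16_clobber(void *dst, const void *src)
	{
		asm("movntdqa (%0), %%xmm0\n"
		    "movaps %%xmm0, (%1)\n"
		    :: "r" (src), "r" (dst)
		    : "xmm0", "memory");	/* all memory may have changed */
	}

	/* Style 2: precise "m"/"=m" operands - the compiler knows exactly
	 * which 16 bytes are read and written, and picks the addressing. */
	static void copy16_operands(unsigned char (*dst)[16],
				    const unsigned char (*src)[16])
	{
		asm("movntdqa %1, %%xmm0\n"
		    "movaps %%xmm0, %0\n"
		    : "=m" (*dst)
		    : "m" (*src)
		    : "xmm0");
	}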

.Dave.


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] drm/i915: Use SSE4.1 movntdqa to accelerate reads from WC memory
  2016-07-18 11:35               ` Chris Wilson
  2016-07-18 11:57                 ` Dave Gordon
@ 2016-07-18 12:07                 ` Tvrtko Ursulin
  1 sibling, 0 replies; 27+ messages in thread
From: Tvrtko Ursulin @ 2016-07-18 12:07 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx, Akash Goel, Mika Kuoppala


On 18/07/16 12:35, Chris Wilson wrote:
> On Mon, Jul 18, 2016 at 12:15:32PM +0100, Tvrtko Ursulin wrote:
>> I am not sure about this, but looking at the raid6 for example, it
>> has a lot more annotations in cases like this.
>>
>> It seems to be telling the compiler which memory ranges each
>> instruction accesses, and also uses "asm volatile" - whether or not
>> that is really needed I don't know.
>>
>> For example:
>>                  asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d]));
>>
>> And:
>>                  asm volatile("movdqa %%xmm4,%0" : "=m" (q[d]));
>>
>> Each one is telling the compiler the instruction is either reading
>> or writing respectively from a certain memory address.
>>
>> You don't have any of that, and don't even specify anything as an
>> output parameter, so I am not sure if your code is safe.
>
> The asm is correct. We do not modify either of the two pointers which we
> pass in via register inputs, but the memory behind them - hence the memory
> clobber.

So you are saying memory clobber is like a big hammer telling the 
compiler you are modifying some memory and it is not allowed to assume 
or remove anything?

There would be no benefit in being more specific, like the RAID code does?

>>> +void i915_memcpy_init_early(struct drm_i915_private *dev_priv)
>>> +{
>>> +	if (static_cpu_has(X86_FEATURE_XMM4_1))
>>> +		static_branch_enable(&has_movntdqa);
>>> +}
>>>
>>
>> I was not familiar with static key stuff and the only thing I can
>> notice is that it is used very little throughout the kernel. On the
>> other hand I haven't found any references in the documentation that
>> it should be used sparingly or something.
>>
>> But the general question would be - is it worth it here? Static
>> branches should be really efficient in the off case, correct? And we
>> don't really care about the performance of the off case here. So
>> would it be just as good to use a normal branch?
>
> It's not the cost of the branch, but the static_cpu_has() in comparison
> to a small copy.

Could cache it in dev_priv. Not saying you should, just asking about
the pros and cons of the two approaches vs the amount of code. Well, not
even the amount of code, just the fact that static keys seem to be used
so little, so I am wondering why.
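
For comparison, the cached-flag variant would look something like this
(sketch only - the struct and field are invented stand-ins for dev_priv,
reusing __memcpy_ntdqa from the patch):

	struct copy_caps {
		bool has_movntdqa;	/* filled in once from CPUID */
	};

	static bool try_accel_copy(struct copy_caps *caps, void *dst,
				   const void *src, unsigned long len)
	{
		/* An ordinary load + test per call, vs a patched nop. */
		if (caps->has_movntdqa) {
			__memcpy_ntdqa(dst, src, len);
			return true;
		}
		return false;
	}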

Regards,

Tvrtko

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] drm/i915: Use SSE4.1 movntdqa to accelerate reads from WC memory
  2016-07-18 11:57                 ` Dave Gordon
@ 2016-07-18 12:56                   ` Tvrtko Ursulin
  2016-07-18 13:46                     ` Tvrtko Ursulin
  2016-07-18 13:48                     ` Dave Gordon
  0 siblings, 2 replies; 27+ messages in thread
From: Tvrtko Ursulin @ 2016-07-18 12:56 UTC (permalink / raw)
  To: Dave Gordon, Chris Wilson, intel-gfx, Akash Goel, Mika Kuoppala


On 18/07/16 12:57, Dave Gordon wrote:
> On 18/07/16 12:35, Chris Wilson wrote:
>> On Mon, Jul 18, 2016 at 12:15:32PM +0100, Tvrtko Ursulin wrote:
>>> I am not sure about this, but looking at the raid6 for example, it
>>> has a lot more annotations in cases like this.
>>>
>>> It seems to be telling the compiler which memory ranges each
>>> instruction accesses, and also uses "asm volatile" - whether or not
>>> that is really needed I don't know.
>>>
>>> For example:
>>>                  asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d]));
>>>
>>> And:
>>>                  asm volatile("movdqa %%xmm4,%0" : "=m" (q[d]));
>>>
>>> Each one is telling the compiler the instruction is either reading
>>> or writing respectively from a certain memory address.
>>>
>>> You don't have any of that, and don't even specify anything as an
>>> output parameter, so I am not sure if your code is safe.
>>
>> The asm is correct. We do not modify either of the two pointers which we
>> pass in via register inputs, but the memory behind them - hence the 
>> memory
>> clobber.
> 
> This is a choice of how much we let the compiler decide about 
> addressing, and how much we tell it about what the asm code really does. 
> The examples above get the compiler to generate *any* suitable 
> addressing mode for each specific location involved in the transfers, so 
> the compiler knows a lot about what's happening and can track where each 
> datum comes from and goes to.
> 
> OTOH Chris' code
> 
> +        asm("movntdqa   (%0), %%xmm0\n"
> +            "movntdqa 16(%0), %%xmm1\n"
> +            "movntdqa 32(%0), %%xmm2\n"
> +            "movntdqa 48(%0), %%xmm3\n"
> +            "movaps %%xmm0,   (%1)\n"
> +            "movaps %%xmm1, 16(%1)\n"
> +            "movaps %%xmm2, 32(%1)\n"
> +            "movaps %%xmm3, 48(%1)\n"
> +            :: "r" (src), "r" (dst) : "memory");
> 
> - doesn't need "volatile" because asm statements that have no output 
> operands are implicitly volatile.
> 
> - makes the compiler give us the source and destination *addresses* in a 
> register each; beyond that, it doesn't know what we're doing with them, 
> so the third ("clobbers") parameter has to say "memory" i.e. treat *all* 
> memory contents as unknown after this.
> 
> [[From GCC docs: The "memory" clobber tells the compiler that the 
> assembly code performs memory reads or writes to items other than those 
> listed in the input and output operands (for example, accessing the 
> memory pointed to by one of the input parameters). To ensure memory 
> contains correct values, GCC may need to flush specific register values 
> to memory before executing the asm. Further, the compiler does not 
> assume that any values read from memory before an asm remain unchanged 
> after that asm; it reloads them as needed. Using the "memory" clobber 
> effectively forms a read/write memory barrier for the compiler.]]
> 
> BTW, should we not tell it we've *also* clobbered %xmm[0-3]?
> 
> So they're both correct, just taking different approaches. I don't know 
> which would give the best performance for this specific case.

Cool, learn something new every day. :)

I've tried writing it as:

struct qw2 {
	u64	q[2];
} __attribute__((packed));

static void __memcpy_ntdqa(struct qw2 *dst, const struct qw2 *src, unsigned long len)
{
	kernel_fpu_begin();

	len >>= 4;
	while (len >= 4) {
		asm("movntdqa   (%0), %%xmm0" :: "r" (src), "m" (src[0]));
		asm("movntdqa 16(%0), %%xmm1" :: "r" (src), "m" (src[1]));
		asm("movntdqa 32(%0), %%xmm2" :: "r" (src), "m" (src[2]));
		asm("movntdqa 48(%0), %%xmm3" :: "r" (src), "m" (src[3]));
		asm("movaps %%xmm0,   (%1)" : "=m" (dst[0]) : "r" (dst));
		asm("movaps %%xmm1, 16(%1)" : "=m" (dst[1]) : "r" (dst));
		asm("movaps %%xmm2, 32(%1)" : "=m" (dst[2]) : "r" (dst));
		asm("movaps %%xmm3, 48(%1)" : "=m" (dst[3]) : "r" (dst));
		src += 4;
		dst += 4;
		len -= 4;
	}
	while (len--) {
		asm("movntdqa (%0), %%xmm0" :: "r" (src), "m" (src[0]));
		asm("movaps %%xmm0, (%1)" : "=m" (dst[0]) : "r" (dst));
		src++;
		dst++;
	}

	kernel_fpu_end();
}

That appears to allow GCC to interleave SSE and normal instructions,
presumably that means it is trying to utilize the execution units better?

I wonder if it makes a difference in speed?


Old code main loop assembly looks like:

  58:   66 0f 38 2a 00          movntdqa (%rax),%xmm0
  5d:   66 0f 38 2a 48 10       movntdqa 0x10(%rax),%xmm1
  63:   66 0f 38 2a 50 20       movntdqa 0x20(%rax),%xmm2
  69:   66 0f 38 2a 58 30       movntdqa 0x30(%rax),%xmm3
  6f:   0f 29 01                movaps %xmm0,(%rcx)
  72:   0f 29 49 10             movaps %xmm1,0x10(%rcx)
  76:   0f 29 51 20             movaps %xmm2,0x20(%rcx)
  7a:   0f 29 59 30             movaps %xmm3,0x30(%rcx)
  7e:   49 83 e8 04             sub    $0x4,%r8
  82:   48 83 c0 40             add    $0x40,%rax
  86:   48 83 c1 40             add    $0x40,%rcx
  8a:   49 83 f8 03             cmp    $0x3,%r8
  8e:   77 c8                   ja     58 <i915_memcpy_from_wc+0x58>

While the above version generates:

  58:   66 0f 38 2a 00          movntdqa (%rax),%xmm0
  5d:   66 0f 38 2a 48 10       movntdqa 0x10(%rax),%xmm1
  63:   66 0f 38 2a 50 20       movntdqa 0x20(%rax),%xmm2
  69:   66 0f 38 2a 58 30       movntdqa 0x30(%rax),%xmm3
  6f:   49 83 e8 04             sub    $0x4,%r8
  73:   48 83 c0 40             add    $0x40,%rax
  77:   0f 29 01                movaps %xmm0,(%rcx)
  7a:   0f 29 49 10             movaps %xmm1,0x10(%rcx)
  7e:   0f 29 51 20             movaps %xmm2,0x20(%rcx)
  82:   0f 29 59 30             movaps %xmm3,0x30(%rcx)
  86:   48 83 c1 40             add    $0x40,%rcx
  8a:   49 83 f8 03             cmp    $0x3,%r8
  8e:   77 c8                   ja     58 <i915_memcpy_from_wc+0x58>

Interestingly, in both cases GCC does some (to my mind) futile
shuffling around between the two loops. Instead of just
carrying on with src, dst and len as they are, it goes
to use a different register set for the second loop:

So this reshuffling:

  90:   48 8d 42 fc             lea    -0x4(%rdx),%rax
  94:   83 e2 03                and    $0x3,%edx
  97:   48 c1 e8 02             shr    $0x2,%rax
  9b:   48 83 c0 01             add    $0x1,%rax
  9f:   48 c1 e0 06             shl    $0x6,%rax
  a3:   48 01 c6                add    %rax,%rsi
  a6:   48 01 c7                add    %rax,%rdi
  a9:   48 8d 42 ff             lea    -0x1(%rdx),%rax
  ad:   48 85 d2                test   %rdx,%rdx
  b0:   74 1a                   je     cc <i915_memcpy_from_wc+0xcc>

And then the second loop:

  b2:   66 0f 38 2a 06          movntdqa (%rsi),%xmm0
  b7:   48 83 e8 01             sub    $0x1,%rax
  bb:   48 83 c6 10             add    $0x10,%rsi
  bf:   0f 29 07                movaps %xmm0,(%rdi)
  c2:   48 83 c7 10             add    $0x10,%rdi
  c6:   48 83 f8 ff             cmp    $0xffffffffffffffff,%rax
  ca:   75 e6                   jne    b2 <i915_memcpy_from_wc+0xb2>

Any thoughts on this?

Regards,

Tvrtko

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] drm/i915: Use SSE4.1 movntdqa to accelerate reads from WC memory
  2016-07-18 12:56                   ` Tvrtko Ursulin
@ 2016-07-18 13:46                     ` Tvrtko Ursulin
  2016-07-18 15:06                       ` Tvrtko Ursulin
  2016-07-18 13:48                     ` Dave Gordon
  1 sibling, 1 reply; 27+ messages in thread
From: Tvrtko Ursulin @ 2016-07-18 13:46 UTC (permalink / raw)
  To: Dave Gordon, Chris Wilson, intel-gfx, Akash Goel, Mika Kuoppala


On 18/07/16 13:56, Tvrtko Ursulin wrote:
> 
> On 18/07/16 12:57, Dave Gordon wrote:
>> On 18/07/16 12:35, Chris Wilson wrote:
>>> On Mon, Jul 18, 2016 at 12:15:32PM +0100, Tvrtko Ursulin wrote:
>>>> I am not sure about this, but looking at the raid6 for example, it
>>>> has a lot more annotations in cases like this.
>>>>
>>>> It seems to be telling the compiler which memory ranges each
>>>> instruction accesses, and also uses "asm volatile" - whether or not
>>>> that is really needed I don't know.
>>>>
>>>> For example:
>>>>                   asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d]));
>>>>
>>>> And:
>>>>                   asm volatile("movdqa %%xmm4,%0" : "=m" (q[d]));
>>>>
>>>> Each one is telling the compiler the instruction is either reading
>>>> or writing respectively from a certain memory address.
>>>>
>>>> You don't have any of that, and don't even specify anything as an
>>>> output parameter, so I am not sure if your code is safe.
>>>
>>> The asm is correct. We do not modify either of the two pointers which we
>>> pass in via register inputs, but the memory behind them - hence the
>>> memory
>>> clobber.
>>
>> This is a choice of how much we let the compiler decide about
>> addressing, and how much we tell it about what the asm code really does.
>> The examples above get the compiler to generate *any* suitable
>> addressing mode for each specific location involved in the transfers, so
>> the compiler knows a lot about what's happening and can track where each
>> datum comes from and goes to.
>>
>> OTOH Chris' code
>>
>> +        asm("movntdqa   (%0), %%xmm0\n"
>> +            "movntdqa 16(%0), %%xmm1\n"
>> +            "movntdqa 32(%0), %%xmm2\n"
>> +            "movntdqa 48(%0), %%xmm3\n"
>> +            "movaps %%xmm0,   (%1)\n"
>> +            "movaps %%xmm1, 16(%1)\n"
>> +            "movaps %%xmm2, 32(%1)\n"
>> +            "movaps %%xmm3, 48(%1)\n"
>> +            :: "r" (src), "r" (dst) : "memory");
>>
>> - doesn't need "volatile" because asm statements that have no output
>> operands are implicitly volatile.
>>
>> - makes the compiler give us the source and destination *addresses* in a
>> register each; beyond that, it doesn't know what we're doing with them,
>> so the third ("clobbers") parameter has to say "memory" i.e. treat *all*
>> memory contents as unknown after this.
>>
>> [[From GCC docs: The "memory" clobber tells the compiler that the
>> assembly code performs memory reads or writes to items other than those
>> listed in the input and output operands (for example, accessing the
>> memory pointed to by one of the input parameters). To ensure memory
>> contains correct values, GCC may need to flush specific register values
>> to memory before executing the asm. Further, the compiler does not
>> assume that any values read from memory before an asm remain unchanged
>> after that asm; it reloads them as needed. Using the "memory" clobber
>> effectively forms a read/write memory barrier for the compiler.]]
>>
>> BTW, should we not tell it we've *also* clobbered %xmm[0-3]?
>>
>> So they're both correct, just taking different approaches. I don't know
>> which would give the best performance for this specific case.
> 
> Cool, learn something new every day. :)
> 
> I've tried writing it as:
> 
> struct qw2 {
> 	u64	q[2];
> } __attribute__((packed));
> 
> static void __memcpy_ntdqa(struct qw2 *dst, const struct qw2 *src, unsigned long len)
> {
> 	kernel_fpu_begin();
> 
> 	len >>= 4;
> 	while (len >= 4) {
> 		asm("movntdqa   (%0), %%xmm0" :: "r" (src), "m" (src[0]));
> 		asm("movntdqa 16(%0), %%xmm1" :: "r" (src), "m" (src[1]));
> 		asm("movntdqa 32(%0), %%xmm2" :: "r" (src), "m" (src[2]));
> 		asm("movntdqa 48(%0), %%xmm3" :: "r" (src), "m" (src[3]));
> 		asm("movaps %%xmm0,   (%1)" : "=m" (dst[0]) : "r" (dst));
> 		asm("movaps %%xmm1, 16(%1)" : "=m" (dst[1]) : "r" (dst));
> 		asm("movaps %%xmm2, 32(%1)" : "=m" (dst[2]) : "r" (dst));
> 		asm("movaps %%xmm3, 48(%1)" : "=m" (dst[3]) : "r" (dst));
> 		src += 4;
> 		dst += 4;
> 		len -= 4;
> 	}
> 	while (len--) {
> 		asm("movntdqa (%0), %%xmm0" :: "r" (src), "m" (src[0]));
> 		asm("movaps %%xmm0, (%1)" : "=m" (dst[0]) : "r" (dst));
> 		src++;
> 		dst++;
> 	}
> 
> 	kernel_fpu_end();
> }
> 
> That appears to allow GCC to interleave SSE and normal instructions,
> presumably that means it is trying to utilize the execution units better?
> 
> I wonder if it makes a difference in speed?
> 
> 
> Old code main loop assembly looks like:
> 
>    58:   66 0f 38 2a 00          movntdqa (%rax),%xmm0
>    5d:   66 0f 38 2a 48 10       movntdqa 0x10(%rax),%xmm1
>    63:   66 0f 38 2a 50 20       movntdqa 0x20(%rax),%xmm2
>    69:   66 0f 38 2a 58 30       movntdqa 0x30(%rax),%xmm3
>    6f:   0f 29 01                movaps %xmm0,(%rcx)
>    72:   0f 29 49 10             movaps %xmm1,0x10(%rcx)
>    76:   0f 29 51 20             movaps %xmm2,0x20(%rcx)
>    7a:   0f 29 59 30             movaps %xmm3,0x30(%rcx)
>    7e:   49 83 e8 04             sub    $0x4,%r8
>    82:   48 83 c0 40             add    $0x40,%rax
>    86:   48 83 c1 40             add    $0x40,%rcx
>    8a:   49 83 f8 03             cmp    $0x3,%r8
>    8e:   77 c8                   ja     58 <i915_memcpy_from_wc+0x58>
> 
> While the above version generates:
> 
>    58:   66 0f 38 2a 00          movntdqa (%rax),%xmm0
>    5d:   66 0f 38 2a 48 10       movntdqa 0x10(%rax),%xmm1
>    63:   66 0f 38 2a 50 20       movntdqa 0x20(%rax),%xmm2
>    69:   66 0f 38 2a 58 30       movntdqa 0x30(%rax),%xmm3
>    6f:   49 83 e8 04             sub    $0x4,%r8
>    73:   48 83 c0 40             add    $0x40,%rax
>    77:   0f 29 01                movaps %xmm0,(%rcx)
>    7a:   0f 29 49 10             movaps %xmm1,0x10(%rcx)
>    7e:   0f 29 51 20             movaps %xmm2,0x20(%rcx)
>    82:   0f 29 59 30             movaps %xmm3,0x30(%rcx)
>    86:   48 83 c1 40             add    $0x40,%rcx
>    8a:   49 83 f8 03             cmp    $0x3,%r8
>    8e:   77 c8                   ja     58 <i915_memcpy_from_wc+0x58>
> 
> Interestingly, in both cases GCC does some (to my mind) futile
> shuffling around between the two loops. Instead of just
> carrying on with src, dst and len as they are, it goes
> to use a different register set for the second loop:
> 
> So this reshuffling:
> 
>    90:   48 8d 42 fc             lea    -0x4(%rdx),%rax
>    94:   83 e2 03                and    $0x3,%edx
>    97:   48 c1 e8 02             shr    $0x2,%rax
>    9b:   48 83 c0 01             add    $0x1,%rax
>    9f:   48 c1 e0 06             shl    $0x6,%rax
>    a3:   48 01 c6                add    %rax,%rsi
>    a6:   48 01 c7                add    %rax,%rdi
>    a9:   48 8d 42 ff             lea    -0x1(%rdx),%rax
>    ad:   48 85 d2                test   %rdx,%rdx
>    b0:   74 1a                   je     cc <i915_memcpy_from_wc+0xcc>
> 
> And then the second loop:
> 
>    b2:   66 0f 38 2a 06          movntdqa (%rsi),%xmm0
>    b7:   48 83 e8 01             sub    $0x1,%rax
>    bb:   48 83 c6 10             add    $0x10,%rsi
>    bf:   0f 29 07                movaps %xmm0,(%rdi)
>    c2:   48 83 c7 10             add    $0x10,%rdi
>    c6:   48 83 f8 ff             cmp    $0xffffffffffffffff,%rax
>    ca:   75 e6                   jne    b2 <i915_memcpy_from_wc+0xb2>
> 
> Any thoughts on this?

This version generates the smallest code:

static void __memcpy_ntdqa(struct qw2 *dst, const struct qw2 *src, unsigned long len)
{
	unsigned long l4;

	kernel_fpu_begin();

	l4 = len / 4;
	while (l4) {
		asm("movntdqa   (%0), %%xmm0" :: "r" (src), "m" (src[0]));
		asm("movntdqa 16(%0), %%xmm1" :: "r" (src), "m" (src[1]));
		asm("movntdqa 32(%0), %%xmm2" :: "r" (src), "m" (src[2]));
		asm("movntdqa 48(%0), %%xmm3" :: "r" (src), "m" (src[3]));
		asm("movaps %%xmm0,   (%1)" : "=m" (dst[0]) : "r" (dst));
		asm("movaps %%xmm1, 16(%1)" : "=m" (dst[1]) : "r" (dst));
		asm("movaps %%xmm2, 32(%1)" : "=m" (dst[2]) : "r" (dst));
		asm("movaps %%xmm3, 48(%1)" : "=m" (dst[3]) : "r" (dst));
		src += 4;
		dst += 4;
		l4--;
	}

	len %= 4;
	while (len) {
		asm("movntdqa (%0), %%xmm0" :: "r" (src), "m" (src[0]));
		asm("movaps %%xmm0, (%1)" : "=m" (dst[0]) : "r" (dst));
		src++;
		dst++;
		len--;
	}

	kernel_fpu_end();
}

Although I still haven't figured out a way to convince it to use
the same registers for src and dest between the two loops.
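
One possible trick, purely as an untested sketch: GCC's local register
variables can pin the pointers. The register is only guaranteed to be
honoured while the variable appears as an asm operand, so this is
fragile (len counts 16-byte chunks, as above):

	static void __memcpy_ntdqa_pinned(struct qw2 *dst,
					  const struct qw2 *src,
					  unsigned long len)
	{
		/* Pin both pointers so the two loops share %rsi/%rdi. */
		register const struct qw2 *s asm("rsi") = src;
		register struct qw2 *d asm("rdi") = dst;
		unsigned long l4;

		kernel_fpu_begin();

		for (l4 = len / 4; l4; l4--, s += 4, d += 4) {
			asm("movntdqa   (%0), %%xmm0" :: "r" (s), "m" (s[0]));
			asm("movntdqa 16(%0), %%xmm1" :: "r" (s), "m" (s[1]));
			asm("movntdqa 32(%0), %%xmm2" :: "r" (s), "m" (s[2]));
			asm("movntdqa 48(%0), %%xmm3" :: "r" (s), "m" (s[3]));
			asm("movaps %%xmm0,   (%1)" : "=m" (d[0]) : "r" (d));
			asm("movaps %%xmm1, 16(%1)" : "=m" (d[1]) : "r" (d));
			asm("movaps %%xmm2, 32(%1)" : "=m" (d[2]) : "r" (d));
			asm("movaps %%xmm3, 48(%1)" : "=m" (d[3]) : "r" (d));
		}

		for (len %= 4; len; len--, s++, d++) {
			asm("movntdqa (%0), %%xmm0" :: "r" (s), "m" (s[0]));
			asm("movaps %%xmm0, (%1)" : "=m" (d[0]) : "r" (d));
		}

		kernel_fpu_end();
	}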

Regards,

Tvrtko

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] drm/i915: Use SSE4.1 movntdqa to accelerate reads from WC memory
  2016-07-18 12:56                   ` Tvrtko Ursulin
  2016-07-18 13:46                     ` Tvrtko Ursulin
@ 2016-07-18 13:48                     ` Dave Gordon
  1 sibling, 0 replies; 27+ messages in thread
From: Dave Gordon @ 2016-07-18 13:48 UTC (permalink / raw)
  To: Tvrtko Ursulin, Chris Wilson, intel-gfx, Akash Goel, Mika Kuoppala

On 18/07/16 13:56, Tvrtko Ursulin wrote:
>
> On 18/07/16 12:57, Dave Gordon wrote:
>> On 18/07/16 12:35, Chris Wilson wrote:
>>> On Mon, Jul 18, 2016 at 12:15:32PM +0100, Tvrtko Ursulin wrote:
>>>> I am not sure about this, but looking at the raid6 for example, it
>>>> has a lot more annotations in cases like this.
>>>>
>>>> It seems to be telling the compiler which memory ranges each
>>>> instruction accesses, and also uses "asm volatile" - whether or not
>>>> that is really needed I don't know.
>>>>
>>>> For example:
>>>>                   asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d]));
>>>>
>>>> And:
>>>>                   asm volatile("movdqa %%xmm4,%0" : "=m" (q[d]));
>>>>
>>>> Each one is telling the compiler the instruction is either reading
>>>> or writing respectively from a certain memory address.
>>>>
>>>> You don't have any of that, and don't even specify anything as an
>>>> output parameter, so I am not sure if your code is safe.
>>>
>>> The asm is correct. We do not modify either of the two pointers which we
>>> pass in via register inputs, but the memory behind them - hence the
>>> memory
>>> clobber.
>>
>> This is a choice of how much we let the compiler decide about
>> addressing, and how much we tell it about what the asm code really does.
>> The examples above get the compiler to generate *any* suitable
>> addressing mode for each specific location involved in the transfers, so
>> the compiler knows a lot about what's happening and can track where each
>> datum comes from and goes to.
>>
>> OTOH Chris' code
>>
>> +        asm("movntdqa   (%0), %%xmm0\n"
>> +            "movntdqa 16(%0), %%xmm1\n"
>> +            "movntdqa 32(%0), %%xmm2\n"
>> +            "movntdqa 48(%0), %%xmm3\n"
>> +            "movaps %%xmm0,   (%1)\n"
>> +            "movaps %%xmm1, 16(%1)\n"
>> +            "movaps %%xmm2, 32(%1)\n"
>> +            "movaps %%xmm3, 48(%1)\n"
>> +            :: "r" (src), "r" (dst) : "memory");
>>
>> - doesn't need "volatile" because asm statements that have no output
>> operands are implicitly volatile.
>>
>> - makes the compiler give us the source and destination *addresses* in a
>> register each; beyond that, it doesn't know what we're doing with them,
>> so the third ("clobbers") parameter has to say "memory" i.e. treat *all*
>> memory contents as unknown after this.
>>
>> [[From GCC docs: The "memory" clobber tells the compiler that the
>> assembly code performs memory reads or writes to items other than those
>> listed in the input and output operands (for example, accessing the
>> memory pointed to by one of the input parameters). To ensure memory
>> contains correct values, GCC may need to flush specific register values
>> to memory before executing the asm. Further, the compiler does not
>> assume that any values read from memory before an asm remain unchanged
>> after that asm; it reloads them as needed. Using the "memory" clobber
>> effectively forms a read/write memory barrier for the compiler.]]
>>
>> BTW, should we not tell it we've *also* clobbered %xmm[0-3]?
>>
>> So they're both correct, just taking different approaches. I don't know
>> which would give the best performance for this specific case.
>
> Cool, learn something new every day. :)
>
> I've tried writing it as:
>
> struct qw2 {
> 	u64	q[2];
> } __attribute__((packed));
>
> static void __memcpy_ntdqa(struct qw2 *dst, const struct qw2 *src, unsigned long len)
> {
> 	kernel_fpu_begin();
>
> 	len >>= 4;
> 	while (len >= 4) {
> 		asm("movntdqa   (%0), %%xmm0" :: "r" (src), "m" (src[0]));
> 		asm("movntdqa 16(%0), %%xmm1" :: "r" (src), "m" (src[1]));

Couldn't this be just:
		asm("movntdqa %1, %%xmm1" :: "r" (src), "m" (src[1]));
thus letting the compiler supply the offset?

Does the compiler know about %xmm* registers? If so, you could maybe get
it to choose which one to use in each instruction, or you should at least
tell it which ones are being clobbered.
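
For what it's worth, GCC does know about them both ways: you can name
the register in the clobber list, or use the "x" constraint and let the
compiler allocate and track an SSE register itself. A sketch (the type
and function name are invented; target("sse4.1") because the kernel
otherwise builds with SSE disabled):

	typedef float xmm_t __attribute__((vector_size(16)));

	/* One 16-byte chunk; GCC picks the xmm register via "x". */
	__attribute__((target("sse4.1")))
	static void copy16_x(unsigned char (*dst)[16],
			     const unsigned char (*src)[16])
	{
		xmm_t tmp;

		asm("movntdqa %1, %0" : "=x" (tmp) : "m" (*src));
		asm("movaps   %1, %0" : "=m" (*dst) : "x" (tmp));
	}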

> 		asm("movntdqa 32(%0), %%xmm2" :: "r" (src), "m" (src[2]));
> 		asm("movntdqa 48(%0), %%xmm3" :: "r" (src), "m" (src[3]));
> 		asm("movaps %%xmm0,   (%1)" : "=m" (dst[0]) : "r" (dst));
> 		asm("movaps %%xmm1, 16(%1)" : "=m" (dst[1]) : "r" (dst));
> 		asm("movaps %%xmm2, 32(%1)" : "=m" (dst[2]) : "r" (dst));
> 		asm("movaps %%xmm3, 48(%1)" : "=m" (dst[3]) : "r" (dst));
> 		src += 4;
> 		dst += 4;
> 		len -= 4;
> 	}
> 	while (len--) {
> 		asm("movntdqa (%0), %%xmm0" :: "r" (src), "m" (src[0]));
> 		asm("movaps %%xmm0, (%1)" : "=m" (dst[0]) : "r" (dst));
> 		src++;
> 		dst++;
> 	}
>
> 	kernel_fpu_end();
> }
>
> That appears to allow GCC to interleave SSE and normal instructions,
> presumably that means it is trying to utilize the execution units better?
>
> I wonder if it makes a difference in speed?
>
> Old code main loop assembly looks like:
>
>    58:   66 0f 38 2a 00          movntdqa (%rax),%xmm0
>    5d:   66 0f 38 2a 48 10       movntdqa 0x10(%rax),%xmm1
>    63:   66 0f 38 2a 50 20       movntdqa 0x20(%rax),%xmm2
>    69:   66 0f 38 2a 58 30       movntdqa 0x30(%rax),%xmm3
>    6f:   0f 29 01                movaps %xmm0,(%rcx)
>    72:   0f 29 49 10             movaps %xmm1,0x10(%rcx)
>    76:   0f 29 51 20             movaps %xmm2,0x20(%rcx)
>    7a:   0f 29 59 30             movaps %xmm3,0x30(%rcx)
>    7e:   49 83 e8 04             sub    $0x4,%r8
>    82:   48 83 c0 40             add    $0x40,%rax
>    86:   48 83 c1 40             add    $0x40,%rcx
>    8a:   49 83 f8 03             cmp    $0x3,%r8
>    8e:   77 c8                   ja     58 <i915_memcpy_from_wc+0x58>
>
> While the above version generates:
>
>    58:   66 0f 38 2a 00          movntdqa (%rax),%xmm0
>    5d:   66 0f 38 2a 48 10       movntdqa 0x10(%rax),%xmm1
>    63:   66 0f 38 2a 50 20       movntdqa 0x20(%rax),%xmm2
>    69:   66 0f 38 2a 58 30       movntdqa 0x30(%rax),%xmm3
>    6f:   49 83 e8 04             sub    $0x4,%r8
>    73:   48 83 c0 40             add    $0x40,%rax
>    77:   0f 29 01                movaps %xmm0,(%rcx)
>    7a:   0f 29 49 10             movaps %xmm1,0x10(%rcx)
>    7e:   0f 29 51 20             movaps %xmm2,0x20(%rcx)
>    82:   0f 29 59 30             movaps %xmm3,0x30(%rcx)
>    86:   48 83 c1 40             add    $0x40,%rcx
>    8a:   49 83 f8 03             cmp    $0x3,%r8
>    8e:   77 c8                   ja     58 <i915_memcpy_from_wc+0x58>

Perhaps marginally better as it separates the load and store more, but I 
don't think a couple of integer operations on registers are actually 
going to make any difference.

> Interestingly, in both cases GCC does some (to my mind) futile
> shuffling around between the two loops. Instead of just
> carrying on with src, dst and len as they are, it goes
> to use a different register set for the second loop:
>
> So this reshuffling:
>
>    90:   48 8d 42 fc             lea    -0x4(%rdx),%rax
>    94:   83 e2 03                and    $0x3,%edx
>    97:   48 c1 e8 02             shr    $0x2,%rax
>    9b:   48 83 c0 01             add    $0x1,%rax
>    9f:   48 c1 e0 06             shl    $0x6,%rax
>    a3:   48 01 c6                add    %rax,%rsi
>    a6:   48 01 c7                add    %rax,%rdi
>    a9:   48 8d 42 ff             lea    -0x1(%rdx),%rax
>    ad:   48 85 d2                test   %rdx,%rdx
>    b0:   74 1a                   je     cc <i915_memcpy_from_wc+0xcc>

Strange ... more investigation required!

.Dave.

> And then the second loop:
>
>    b2:   66 0f 38 2a 06          movntdqa (%rsi),%xmm0
>    b7:   48 83 e8 01             sub    $0x1,%rax
>    bb:   48 83 c6 10             add    $0x10,%rsi
>    bf:   0f 29 07                movaps %xmm0,(%rdi)
>    c2:   48 83 c7 10             add    $0x10,%rdi
>    c6:   48 83 f8 ff             cmp    $0xffffffffffffffff,%rax
>    ca:   75 e6                   jne    b2 <i915_memcpy_from_wc+0xb2>
>
> Any thoughts on this?
>
> Regards,
> Tvrtko



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] drm/i915: Use SSE4.1 movntdqa to accelerate reads from WC memory
  2016-07-18 13:46                     ` Tvrtko Ursulin
@ 2016-07-18 15:06                       ` Tvrtko Ursulin
  2016-07-18 16:05                         ` Dave Gordon
  2016-07-19 10:26                         ` Tvrtko Ursulin
  0 siblings, 2 replies; 27+ messages in thread
From: Tvrtko Ursulin @ 2016-07-18 15:06 UTC (permalink / raw)
  To: Dave Gordon, Chris Wilson, intel-gfx, Akash Goel, Mika Kuoppala


On 18/07/16 14:46, Tvrtko Ursulin wrote:

[snip]

> This version generates the smallest code:
> 
> static void __memcpy_ntdqa(struct qw2 *dst, const struct qw2 *src, unsigned long len)
> {
> 	unsigned long l4;
> 
> 	kernel_fpu_begin();
> 
> 	l4 = len / 4;
> 	while (l4) {
> 		asm("movntdqa   (%0), %%xmm0" :: "r" (src), "m" (src[0]));
> 		asm("movntdqa 16(%0), %%xmm1" :: "r" (src), "m" (src[1]));
> 		asm("movntdqa 32(%0), %%xmm2" :: "r" (src), "m" (src[2]));
> 		asm("movntdqa 48(%0), %%xmm3" :: "r" (src), "m" (src[3]));
> 		asm("movaps %%xmm0,   (%1)" : "=m" (dst[0]) : "r" (dst));
> 		asm("movaps %%xmm1, 16(%1)" : "=m" (dst[1]) : "r" (dst));
> 		asm("movaps %%xmm2, 32(%1)" : "=m" (dst[2]) : "r" (dst));
> 		asm("movaps %%xmm3, 48(%1)" : "=m" (dst[3]) : "r" (dst));
> 		src += 4;
> 		dst += 4;
> 		l4--;
> 	}
> 
> 	len %= 4;
> 	while (len) {
> 		asm("movntdqa (%0), %%xmm0" :: "r" (src), "m" (src[0]));
> 		asm("movaps %%xmm0, (%1)" : "=m" (dst[0]) : "r" (dst));
> 		src++;
> 		dst++;
> 		len--;
> 	}
> 
> 	kernel_fpu_end();
> }
> 
> Although I still haven't figured out a way to convince it to use
> the same registers for src and dest between the two loops.

I remembered one famous interview question, along the lines of "what
is the code below doing" (Duff's device). Translated to this example,
which relies on len being non-zero - the caller guarantees that:

static void __memcpy_ntdqa(struct qw2 *dst, const struct qw2 *src, unsigned long len)
{
	unsigned long n;

	kernel_fpu_begin();

	n = (len + 3) / 4;
	switch (len % 4) {
	case 0: do { asm("movntdqa %1, %%xmm0\n"
			  "movaps %%xmm0, %0\n" : "=m" (*dst): "m" (*src));
		     src++; dst++;
	case 3:	     asm("movntdqa %1, %%xmm1\n"
			  "movaps %%xmm1, %0\n" : "=m" (*dst): "m" (*src));
		     src++; dst++;
	case 2:	     asm("movntdqa %1, %%xmm2\n"
			  "movaps %%xmm2, %0\n" : "=m" (*dst): "m" (*src));
		     src++; dst++;
	case 1:	     asm("movntdqa %1, %%xmm3\n"
			  "movaps %%xmm3, %0\n" : "=m" (*dst): "m" (*src));
		     src++; dst++;
		} while (--n > 0);
	}

	kernel_fpu_end();
}

:D 

No idea if loads/stores can run async in this case.

Regards,

Tvrtko

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] drm/i915: Use SSE4.1 movntdqa to accelerate reads from WC memory
  2016-07-18 15:06                       ` Tvrtko Ursulin
@ 2016-07-18 16:05                         ` Dave Gordon
  2016-07-19 10:26                         ` Tvrtko Ursulin
  1 sibling, 0 replies; 27+ messages in thread
From: Dave Gordon @ 2016-07-18 16:05 UTC (permalink / raw)
  To: Tvrtko Ursulin, Chris Wilson, intel-gfx, Akash Goel, Mika Kuoppala

On 18/07/16 16:06, Tvrtko Ursulin wrote:
>
> On 18/07/16 14:46, Tvrtko Ursulin wrote:
>
> [snip]
>
>> This version generates the smallest code:
>>
>> static void __memcpy_ntdqa(struct qw2 *dst, const struct qw2 *src, unsigned long len)
>> {
>> 	unsigned long l4;
>>
>> 	kernel_fpu_begin();
>>
>> 	l4 = len / 4;
>> 	while (l4) {
>> 		asm("movntdqa   (%0), %%xmm0" :: "r" (src), "m" (src[0]));
>> 		asm("movntdqa 16(%0), %%xmm1" :: "r" (src), "m" (src[1]));
>> 		asm("movntdqa 32(%0), %%xmm2" :: "r" (src), "m" (src[2]));
>> 		asm("movntdqa 48(%0), %%xmm3" :: "r" (src), "m" (src[3]));
>> 		asm("movaps %%xmm0,   (%1)" : "=m" (dst[0]) : "r" (dst));
>> 		asm("movaps %%xmm1, 16(%1)" : "=m" (dst[1]) : "r" (dst));
>> 		asm("movaps %%xmm2, 32(%1)" : "=m" (dst[2]) : "r" (dst));
>> 		asm("movaps %%xmm3, 48(%1)" : "=m" (dst[3]) : "r" (dst));
>> 		src += 4;
>> 		dst += 4;
>> 		l4--;
>> 	}
>>
>> 	len %= 4;
>> 	while (len) {
>> 		asm("movntdqa (%0), %%xmm0" :: "r" (src), "m" (src[0]));
>> 		asm("movaps %%xmm0, (%1)" : "=m" (dst[0]) : "r" (dst));
>> 		src++;
>> 		dst++;
>> 		len--;
>> 	}
>>
>> 	kernel_fpu_end();
>> }
>>
>> Although I still haven't figured out a way to convince it to use
>> the same registers for src and dest between the two loops.
>
> I remembered one famous interview question, along the lines of, "what
> is the code below doing". Translated to this example:
>
> static void __memcpy_ntdqa(struct qw2 *dst, const struct qw2 *src, unsigned long len)
> {
> 	unsigned long n;
>
> 	kernel_fpu_begin();
>
> 	n = (len + 3) / 4;
> 	switch (len % 4) {
> 	case 0: do { asm("movntdqa %1, %%xmm0\n"
> 			  "movaps %%xmm0, %0\n" : "=m" (*dst): "m" (*src));
> 		     src++; dst++;
> 	case 3:	     asm("movntdqa %1, %%xmm1\n"
> 			  "movaps %%xmm1, %0\n" : "=m" (*dst): "m" (*src));
> 		     src++; dst++;
> 	case 2:	     asm("movntdqa %1, %%xmm2\n"
> 			  "movaps %%xmm2, %0\n" : "=m" (*dst): "m" (*src));
> 		     src++; dst++;
> 	case 1:	     asm("movntdqa %1, %%xmm3\n"
> 			  "movaps %%xmm3, %0\n" : "=m" (*dst): "m" (*src));
> 		     src++; dst++;
> 		} while (--n > 0);
> 	}
>
> 	kernel_fpu_end();
> }
>
> :D
>
> No idea if loads/stores can run async in this case.
>
> Regards,
> Tvrtko

Here's yet another variant, just to document other ways of writing it:

#include "asm/fpu/api.h"

/* This is the datatype of an xmm register */
typedef double xmmd_t __attribute__ ((vector_size (16)));

__attribute__((target("sse4.1")))
void __memcpy_ntdqa(xmmd_t *dst, const xmmd_t *src, unsigned long len)
{
	xmmd_t tmp0, tmp1, tmp2, tmp3;
	unsigned long l64;

	kernel_fpu_begin();

	/* Whole 64-byte blocks as 4*16 bytes */
	for (l64 = len/64; l64--; ) {
		asm("movntdqa %1, %0" : "=x" (tmp0) : "m" (*src++));
		asm("movntdqa %1, %0" : "=x" (tmp1) : "m" (*src++));
		asm("movntdqa %1, %0" : "=x" (tmp2) : "m" (*src++));
		asm("movntdqa %1, %0" : "=x" (tmp3) : "m" (*src++));
		asm("movaps   %1, %0" : "=m" (*dst++) : "x" (tmp0));
		asm("movaps   %1, %0" : "=m" (*dst++) : "x" (tmp1));
		asm("movaps   %1, %0" : "=m" (*dst++) : "x" (tmp2));
		asm("movaps   %1, %0" : "=m" (*dst++) : "x" (tmp3));
	}

	/* Remaining up-to-3 16-byte chunks */
	for (len &= 63, len >>= 4; len--; ) {
		asm("movntdqa %1, %0" : "=x" (tmp0) : "m" (*src++));
		asm("movaps   %1, %0" : "=m" (*dst++) : "x" (tmp0));
	}

	kernel_fpu_end();
}

I wondered whether we could get GCC to unroll the loops automatically 
i.e. just write the one loop and say we wanted it unrolled four times, 
leaving the compiler to deal with the remainder; but I didn't find a way 
to specify "unroll 4 times" as opposed to just "unroll this some".

.Dave.

* Re: [PATCH] drm/i915: Use SSE4.1 movntdqa to accelerate reads from WC memory
  2016-07-18 10:07           ` [PATCH] " Chris Wilson
  2016-07-18 10:29             ` Chris Wilson
  2016-07-18 11:15             ` Tvrtko Ursulin
@ 2016-07-19  6:50             ` Daniel Vetter
  2 siblings, 0 replies; 27+ messages in thread
From: Daniel Vetter @ 2016-07-19  6:50 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx, Akash Goel, Mika Kuoppala

On Mon, Jul 18, 2016 at 11:07:14AM +0100, Chris Wilson wrote:
> This patch provides the infrastructure for performing a 16-byte aligned
> read from WC memory using non-temporal instructions introduced with sse4.1.
> Using movntdqa we can bypass the CPU caches and read directly from memory,
> ignoring the page attributes set on the CPU PTE, i.e. negating the
> impact of an otherwise UC access. Copying using movntdqa from WC is almost
> as fast as reading from WB memory, modulo the possibility of both hitting
> the CPU cache or leaving the data in the CPU cache for the next consumer.
> (The CPU cache itself may be flushed for the region of the movntdqa, and on
> a later access the movntdqa reads from a separate internal buffer for the
> cacheline.) The write back to memory is, however, cached.
> 
> This will be used in later patches to accelerate accessing WC memory.
> 
> v2: Report whether the accelerated copy is successful/possible.
> v3: Function alignment override was only necessary when using the
> function target("sse4.1") - which is not necessary for emitting movntdqa
> from __asm__.
> v4: Improve notes on CPU cache behaviour vs non-temporal stores.
> v5: Fix byte offsets for unrolled moves.
> v6: Find all remaining typos of movntqda, use kernel_fpu_begin.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Akash Goel <akash.goel@intel.com>
> Cc: Damien Lespiau <damien.lespiau@intel.com>
> Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Cc: x86 arch folks in case they're interested in this too?
-Daniel

> ---
>  drivers/gpu/drm/i915/Makefile      |   3 ++
>  drivers/gpu/drm/i915/i915_drv.c    |   2 +
>  drivers/gpu/drm/i915/i915_drv.h    |   3 ++
>  drivers/gpu/drm/i915/i915_memcpy.c | 101 +++++++++++++++++++++++++++++++++++++
>  4 files changed, 109 insertions(+)
>  create mode 100644 drivers/gpu/drm/i915/i915_memcpy.c
> 
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index dda724f04445..3412413408c0 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -3,12 +3,15 @@
>  # Direct Rendering Infrastructure (DRI) in XFree86 4.1.0 and higher.
>  
>  subdir-ccflags-$(CONFIG_DRM_I915_WERROR) := -Werror
> +subdir-ccflags-y += \
> +	$(call as-instr,movntdqa (%eax)$(comma)%xmm0,-DCONFIG_AS_MOVNTDQA)
>  
>  # Please keep these build lists sorted!
>  
>  # core driver code
>  i915-y := i915_drv.o \
>  	  i915_irq.o \
> +	  i915_memcpy.o \
>  	  i915_params.o \
>  	  i915_pci.o \
>            i915_suspend.o \
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index 640e3c34d3cd..f1034eb6853b 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -827,6 +827,8 @@ static int i915_driver_init_early(struct drm_i915_private *dev_priv,
>  	mutex_init(&dev_priv->wm.wm_mutex);
>  	mutex_init(&dev_priv->pps_mutex);
>  
> +	i915_memcpy_init_early(dev_priv);
> +
>  	ret = i915_workqueues_init(dev_priv);
>  	if (ret < 0)
>  		return ret;
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 86dcde564de0..f8ae3f3e3af4 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -3916,4 +3916,7 @@ static inline bool __i915_request_irq_complete(struct drm_i915_gem_request *req)
>  	return false;
>  }
>  
> +void i915_memcpy_init_early(struct drm_i915_private *dev_priv);
> +bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len);
> +
>  #endif
> diff --git a/drivers/gpu/drm/i915/i915_memcpy.c b/drivers/gpu/drm/i915/i915_memcpy.c
> new file mode 100644
> index 000000000000..50fc5799255f
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/i915_memcpy.c
> @@ -0,0 +1,101 @@
> +/*
> + * Copyright © 2016 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + *
> + */
> +
> +#include <linux/kernel.h>
> +#include <asm/fpu/api.h>
> +
> +#include "i915_drv.h"
> +
> +DEFINE_STATIC_KEY_FALSE(has_movntdqa);
> +
> +#ifdef CONFIG_AS_MOVNTDQA
> +static void __memcpy_ntdqa(void *dst, const void *src, unsigned long len)
> +{
> +	kernel_fpu_begin();
> +
> +	len >>= 4;
> +	while (len >= 4) {
> +		asm("movntdqa   (%0), %%xmm0\n"
> +		    "movntdqa 16(%0), %%xmm1\n"
> +		    "movntdqa 32(%0), %%xmm2\n"
> +		    "movntdqa 48(%0), %%xmm3\n"
> +		    "movaps %%xmm0,   (%1)\n"
> +		    "movaps %%xmm1, 16(%1)\n"
> +		    "movaps %%xmm2, 32(%1)\n"
> +		    "movaps %%xmm3, 48(%1)\n"
> +		    :: "r" (src), "r" (dst) : "memory");
> +		src += 64;
> +		dst += 64;
> +		len -= 4;
> +	}
> +	while (len--) {
> +		asm("movntdqa (%0), %%xmm0\n"
> +		    "movaps %%xmm0, (%1)\n"
> +		    :: "r" (src), "r" (dst) : "memory");
> +		src += 16;
> +		dst += 16;
> +	}
> +
> +	kernel_fpu_end();
> +}
> +#endif
> +
> +/**
> + * i915_memcpy_from_wc: perform an accelerated *aligned* read from WC
> + * @dst: destination pointer
> + * @src: source pointer
> + * @len: how many bytes to copy
> + *
> + * i915_memcpy_from_wc copies @len bytes from @src to @dst using
> + * non-temporal instructions where available. Note that all arguments
> + * (@src, @dst) must be aligned to 16 bytes and @len must be a multiple
> + * of 16.
> + *
> + * To test whether accelerated reads from WC are supported, use
> + * i915_memcpy_from_wc(NULL, NULL, 0);
> + *
> + * Returns true if the copy was successful, false if the preconditions
> + * are not met.
> + */
> +bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len)
> +{
> +	if (unlikely(((unsigned long)dst | (unsigned long)src | len) & 15))
> +		return false;
> +
> +#ifdef CONFIG_AS_MOVNTDQA
> +	if (static_branch_likely(&has_movntdqa)) {
> +		if (len)
> +			__memcpy_ntdqa(dst, src, len);
> +		return true;
> +	}
> +#endif
> +
> +	return false;
> +}
> +
> +void i915_memcpy_init_early(struct drm_i915_private *dev_priv)
> +{
> +	if (static_cpu_has(X86_FEATURE_XMM4_1))
> +		static_branch_enable(&has_movntdqa);
> +}
> -- 
> 2.8.1
> 

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
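
For illustration, the calling pattern described by the kernel-doc above
looks like this (a sketch; the memcpy() fallback is hypothetical, not
part of the patch):

	/* Probe once: with dst/src NULL and len 0, the call only
	 * reports whether the accelerated path exists. */
	bool have_wc_read = i915_memcpy_from_wc(NULL, NULL, 0);

	/* Per copy: the helper returns false on unaligned arguments
	 * or missing SSE4.1, so keep a plain fallback around. */
	if (!i915_memcpy_from_wc(dst, src, len))
		memcpy(dst, src, len);	/* slow UC/WC read */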

* Re: [PATCH] drm/i915: Use SSE4.1 movntdqa to accelerate reads from WC memory
  2016-07-18 15:06                       ` Tvrtko Ursulin
  2016-07-18 16:05                         ` Dave Gordon
@ 2016-07-19 10:26                         ` Tvrtko Ursulin
  1 sibling, 0 replies; 27+ messages in thread
From: Tvrtko Ursulin @ 2016-07-19 10:26 UTC (permalink / raw)
  To: Dave Gordon, Chris Wilson, intel-gfx, Akash Goel, Mika Kuoppala


On 18/07/16 16:06, Tvrtko Ursulin wrote:
> On 18/07/16 14:46, Tvrtko Ursulin wrote:
>
> [snip]
>
>> This version generates the smallest code:
>>
>> static void __memcpy_ntdqa(struct qw2 *dst, const struct qw2 *src, unsigned long len)
>> {
>> 	unsigned long l4;
>>
>> 	kernel_fpu_begin();
>>
>> 	l4 = len / 4;
>> 	while (l4) {
>> 		asm("movntdqa   (%0), %%xmm0" :: "r" (src), "m" (src[0]));
>> 		asm("movntdqa 16(%0), %%xmm1" :: "r" (src), "m" (src[1]));
>> 		asm("movntdqa 32(%0), %%xmm2" :: "r" (src), "m" (src[2]));
>> 		asm("movntdqa 48(%0), %%xmm3" :: "r" (src), "m" (src[3]));
>> 		asm("movaps %%xmm0,   (%1)" : "=m" (dst[0]) : "r" (dst));
>> 		asm("movaps %%xmm1, 16(%1)" : "=m" (dst[1]) : "r" (dst));
>> 		asm("movaps %%xmm2, 32(%1)" : "=m" (dst[2]) : "r" (dst));
>> 		asm("movaps %%xmm3, 48(%1)" : "=m" (dst[3]) : "r" (dst));
>> 		src += 4;
>> 		dst += 4;
>> 		l4--;
>> 	}
>>
>> 	len %= 4;
>> 	while (len) {
>> 		asm("movntdqa (%0), %%xmm0" :: "r" (src), "m" (src[0]));
>> 		asm("movaps %%xmm0, (%1)" : "=m" (dst[0]) : "r" (dst));
>> 		src++;
>> 		dst++;
>> 		len--;
>> 	}
>>
>> 	kernel_fpu_end();
>> }
>>
>> Although I still haven't figured out a way to convince it to use
>> the same registers for src and dest between the two loops.
>
> I remembered one famous interview question, along the lines of, "what
> is the code below doing". Translated to this example:
>
> static void __memcpy_ntdqa(struct qw2 *dst, const struct qw2 *src, unsigned long len)
> {
> 	unsigned long n;
>
> 	kernel_fpu_begin();
>

Bugfix here (len arrives in bytes, while the loop below counts 16-byte chunks):

+	len /= 16;

> 	n = (len + 3) / 4;
> 	switch (len % 4) {
> 	case 0: do { asm("movntdqa %1, %%xmm0\n"
> 			  "movaps %%xmm0, %0\n" : "=m" (*dst): "m" (*src));
> 		     src++; dst++;
> 	case 3:	     asm("movntdqa %1, %%xmm1\n"
> 			  "movaps %%xmm1, %0\n" : "=m" (*dst): "m" (*src));
> 		     src++; dst++;
> 	case 2:	     asm("movntdqa %1, %%xmm2\n"
> 			  "movaps %%xmm2, %0\n" : "=m" (*dst): "m" (*src));
> 		     src++; dst++;
> 	case 1:	     asm("movntdqa %1, %%xmm3\n"
> 			  "movaps %%xmm3, %0\n" : "=m" (*dst): "m" (*src));
> 		     src++; dst++;
> 		} while (--n > 0);
> 	}
>
> 	kernel_fpu_end();
> }
>
> :D
>
> No idea if loads/stores can run async in this case.

It seems to be just as fast as the unrolled loop. And it generates the 
smallest code. :)

I would also suggest a "likely(len)" in the original patch, since 
zero-length copies are not expected; that also manages to shrink the 
code a bit. Something like this (a sketch against the v6 hunk above):
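
	if (static_branch_likely(&has_movntdqa)) {
		if (likely(len))	/* zero-length copies unexpected */
			__memcpy_ntdqa(dst, src, len);
		return true;
	}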

Regards,

Tvrtko

end of thread

Thread overview: 27+ messages
2016-07-16 12:07 [PATCH] drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory Chris Wilson
2016-07-16 12:32 ` ✗ Ro.CI.BAT: failure for " Patchwork
2016-07-16 12:33 ` [PATCH] " Chris Wilson
2016-07-16 12:45 ` [PATCH v2] " Chris Wilson
2016-07-16 12:53   ` [PATCH v3] " Chris Wilson
2016-07-16 15:44     ` [PATCH v5] drm/i915: Use SSE4.1 movntdqa " Chris Wilson
2016-07-17  3:21       ` Matt Turner
2016-07-17  8:12         ` Chris Wilson
2016-07-18  9:31       ` Tvrtko Ursulin
2016-07-18 10:01         ` Chris Wilson
2016-07-18 10:07           ` [PATCH] " Chris Wilson
2016-07-18 10:29             ` Chris Wilson
2016-07-18 11:15             ` Tvrtko Ursulin
2016-07-18 11:35               ` Chris Wilson
2016-07-18 11:57                 ` Dave Gordon
2016-07-18 12:56                   ` Tvrtko Ursulin
2016-07-18 13:46                     ` Tvrtko Ursulin
2016-07-18 15:06                       ` Tvrtko Ursulin
2016-07-18 16:05                         ` Dave Gordon
2016-07-19 10:26                         ` Tvrtko Ursulin
2016-07-18 13:48                     ` Dave Gordon
2016-07-18 12:07                 ` Tvrtko Ursulin
2016-07-19  6:50             ` Daniel Vetter
2016-07-16 13:12 ` ✗ Ro.CI.BAT: failure for drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory (rev2) Patchwork
2016-07-16 13:34 ` ✓ Ro.CI.BAT: success for drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory (rev3) Patchwork
2016-07-16 16:12 ` ✗ Ro.CI.BAT: failure for drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory (rev4) Patchwork
2016-07-18 10:59 ` ✗ Ro.CI.BAT: failure for drm/i915: Use SSE4.1 movntqda to accelerate reads from WC memory (rev5) Patchwork
