All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 1/2] drm/i915/gen9: Clean up MOCS table definitions
@ 2016-04-26 12:44 Imre Deak
  2016-04-26 12:44 ` [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config Imre Deak
  2016-04-26 16:55 ` ✗ Fi.CI.BAT: failure for series starting with [v2,1/2] drm/i915/gen9: Clean up MOCS table definitions Patchwork
  0 siblings, 2 replies; 32+ messages in thread
From: Imre Deak @ 2016-04-26 12:44 UTC (permalink / raw)
  To: intel-gfx

Use named struct initializers for clarity. Also fix the target cache
definition to reflect its role in GEN9 onwards. On GEN8 a TC value of 0
meant ELLC but on GEN9+ it means the TC and LRU controls are taken from
the PTE.

No functional change, igt/gem_mocs_settings still passing after this
change.

v2: (Chris)
- Add back the hexa literals for the entries.
  Add note that igt/gem_mocs_settings still passes.

CC: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Imre Deak <imre.deak@intel.com>
---
 drivers/gpu/drm/i915/intel_mocs.c | 88 +++++++++++++++++++++++++++------------
 1 file changed, 61 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_mocs.c b/drivers/gpu/drm/i915/intel_mocs.c
index 23b8545..f391ad6 100644
--- a/drivers/gpu/drm/i915/intel_mocs.c
+++ b/drivers/gpu/drm/i915/intel_mocs.c
@@ -66,9 +66,10 @@ struct drm_i915_mocs_table {
 #define L3_WB			3
 
 /* Target cache */
-#define ELLC			0
-#define LLC			1
-#define LLC_ELLC		2
+#define LE_TC_PAGETABLE		0
+#define LE_TC_LLC		1
+#define LE_TC_LLC_ELLC		2
+#define LE_TC_LLC_ELLC_ALT	3
 
 /*
  * MOCS tables
@@ -96,34 +97,67 @@ struct drm_i915_mocs_table {
  *       end.
  */
 static const struct drm_i915_mocs_entry skylake_mocs_table[] = {
-	/* { 0x00000009, 0x0010 } */
-	{ (LE_CACHEABILITY(LE_UC) | LE_TGT_CACHE(LLC_ELLC) | LE_LRUM(0) |
-	   LE_AOM(0) | LE_RSC(0) | LE_SCC(0) | LE_PFM(0) | LE_SCF(0)),
-	  (L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC)) },
-	/* { 0x00000038, 0x0030 } */
-	{ (LE_CACHEABILITY(LE_PAGETABLE) | LE_TGT_CACHE(LLC_ELLC) | LE_LRUM(3) |
-	   LE_AOM(0) | LE_RSC(0) | LE_SCC(0) | LE_PFM(0) | LE_SCF(0)),
-	  (L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB)) },
-	/* { 0x0000003b, 0x0030 } */
-	{ (LE_CACHEABILITY(LE_WB) | LE_TGT_CACHE(LLC_ELLC) | LE_LRUM(3) |
-	   LE_AOM(0) | LE_RSC(0) | LE_SCC(0) | LE_PFM(0) | LE_SCF(0)),
-	  (L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB)) }
+	{ /* 0x00000009 */
+	  .control_value = LE_CACHEABILITY(LE_UC) |
+			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
+			   LE_LRUM(0) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
+			   LE_PFM(0) | LE_SCF(0),
+
+	  /* 0x0010 */
+	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
+	},
+	{
+	  /* 0x00000038 */
+	  .control_value = LE_CACHEABILITY(LE_PAGETABLE) |
+			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
+			   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
+			   LE_PFM(0) | LE_SCF(0),
+	  /* 0x0030 */
+	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
+	},
+	{
+	  /* 0x0000003b */
+	  .control_value = LE_CACHEABILITY(LE_WB) |
+			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
+			   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
+			   LE_PFM(0) | LE_SCF(0),
+	  /* 0x0030 */
+	  .l3cc_value =   L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
+	},
 };
 
 /* NOTE: the LE_TGT_CACHE is not used on Broxton */
 static const struct drm_i915_mocs_entry broxton_mocs_table[] = {
-	/* { 0x00000009, 0x0010 } */
-	{ (LE_CACHEABILITY(LE_UC) | LE_TGT_CACHE(LLC_ELLC) | LE_LRUM(0) |
-	   LE_AOM(0) | LE_RSC(0) | LE_SCC(0) | LE_PFM(0) | LE_SCF(0)),
-	  (L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC)) },
-	/* { 0x00000038, 0x0030 } */
-	{ (LE_CACHEABILITY(LE_PAGETABLE) | LE_TGT_CACHE(LLC_ELLC) | LE_LRUM(3) |
-	   LE_AOM(0) | LE_RSC(0) | LE_SCC(0) | LE_PFM(0) | LE_SCF(0)),
-	  (L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB)) },
-	/* { 0x0000003b, 0x0030 } */
-	{ (LE_CACHEABILITY(LE_WB) | LE_TGT_CACHE(LLC_ELLC) | LE_LRUM(3) |
-	   LE_AOM(0) | LE_RSC(0) | LE_SCC(0) | LE_PFM(0) | LE_SCF(0)),
-	  (L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB)) }
+	{
+	  /* 0x00000009 */
+	  .control_value = LE_CACHEABILITY(LE_UC) |
+			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
+			   LE_LRUM(0) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
+			   LE_PFM(0) | LE_SCF(0),
+
+	  /* 0x0010 */
+	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
+	},
+	{
+	  /* 0x00000038 */
+	  .control_value = LE_CACHEABILITY(LE_PAGETABLE) |
+			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
+			   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
+			   LE_PFM(0) | LE_SCF(0),
+
+	  /* 0x0030 */
+	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
+	},
+	{
+	  /* 0x0000003b */
+	  .control_value = LE_CACHEABILITY(LE_WB) |
+			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
+			   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
+			   LE_PFM(0) | LE_SCF(0),
+
+	  /* 0x0030 */
+	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
+	},
 };
 
 /**
-- 
2.5.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config
  2016-04-26 12:44 [PATCH v2 1/2] drm/i915/gen9: Clean up MOCS table definitions Imre Deak
@ 2016-04-26 12:44 ` Imre Deak
  2016-04-26 12:57   ` Chris Wilson
  2016-04-26 13:12   ` Chris Wilson
  2016-04-26 16:55 ` ✗ Fi.CI.BAT: failure for series starting with [v2,1/2] drm/i915/gen9: Clean up MOCS table definitions Patchwork
  1 sibling, 2 replies; 32+ messages in thread
From: Imre Deak @ 2016-04-26 12:44 UTC (permalink / raw)
  To: intel-gfx; +Cc: Eero Tamminen, Valtteri Rantala, Michael T Frederick

Setting a write-back cache policy in the MOCS entry definition also
implies snooping, which has a considerable overhead. This is
unexpected for a few reasons:
- From user-space's point of view since it didn't want a coherent
  surface (it didn't set the buffer as such via the set caching IOCTL).
- There is a separate MOCS entry field for snooping (which we never
  set).
- This MOCS table is about caching in (e)LLC and there is no (e)LLC on
  BXT. There is a separate table for L3 cache control.

Considering the above the current behavior of snooping looks like an
unintentional side-effect of the WB setting. Changing it to be PTE
based cacheability gets rid of the snooping without any ill-effects.
For a coherent surface the application would use a separate MOCS entry
(at index 1) and call the set caching IOCTL to setup the PTE entries
for the corresponding buffer to be snooped.

This resulted in 70% improvement in synthetic texturing benchmarks.

Kudos to Valtteri Rantala, Eero Tamminen and Michael T Frederick and
Ville who helped to narrow the source of problem to the kernel and to
the snooping behaviour in particular.

With a follow-up change to adjust the 3rd entry value
igt/gem_mocs_settings is passing after this change.

v2:
- Rebase on v2 of patch 1/2.

CC: Valtteri Rantala <valtteri.rantala@intel.com>
CC: Eero Tamminen <eero.t.tamminen@intel.com>
CC: Michael T Frederick <michael.t.frederick@intel.com>
CC: Ville Syrjälä <ville.syrjala@linux.intel.com>
CC: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Imre Deak <imre.deak@intel.com>
---
 drivers/gpu/drm/i915/intel_mocs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_mocs.c b/drivers/gpu/drm/i915/intel_mocs.c
index f391ad6..d38f65e 100644
--- a/drivers/gpu/drm/i915/intel_mocs.c
+++ b/drivers/gpu/drm/i915/intel_mocs.c
@@ -149,8 +149,8 @@ static const struct drm_i915_mocs_entry broxton_mocs_table[] = {
 	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
 	},
 	{
-	  /* 0x0000003b */
-	  .control_value = LE_CACHEABILITY(LE_WB) |
+	  /* 0x00000038 */
+	  .control_value = LE_CACHEABILITY(LE_PAGETABLE) |
 			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
 			   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
 			   LE_PFM(0) | LE_SCF(0),
-- 
2.5.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 32+ messages in thread

* Re: [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config
  2016-04-26 12:44 ` [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config Imre Deak
@ 2016-04-26 12:57   ` Chris Wilson
  2016-04-26 13:17     ` Imre Deak
  2016-04-26 13:12   ` Chris Wilson
  1 sibling, 1 reply; 32+ messages in thread
From: Chris Wilson @ 2016-04-26 12:57 UTC (permalink / raw)
  To: Imre Deak; +Cc: intel-gfx, Valtteri Rantala, Eero Tamminen, Michael T Frederick

On Tue, Apr 26, 2016 at 03:44:22PM +0300, Imre Deak wrote:
> Setting a write-back cache policy in the MOCS entry definition also
> implies snooping, which has a considerable overhead. This is
> unexpected for a few reasons:

If it is snooping, then I don't see why it is undesirable to have it
available in a mocs setting. If it is bogus and the bit is undefined,
then by all means remove it.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config
  2016-04-26 12:44 ` [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config Imre Deak
  2016-04-26 12:57   ` Chris Wilson
@ 2016-04-26 13:12   ` Chris Wilson
  1 sibling, 0 replies; 32+ messages in thread
From: Chris Wilson @ 2016-04-26 13:12 UTC (permalink / raw)
  To: Imre Deak; +Cc: intel-gfx, Valtteri Rantala, Eero Tamminen, Michael T Frederick

On Tue, Apr 26, 2016 at 03:44:22PM +0300, Imre Deak wrote:
> Setting a write-back cache policy in the MOCS entry definition also
> implies snooping, which has a considerable overhead. This is
> unexpected for a few reasons:
> - From user-space's point of view since it didn't want a coherent
>   surface (it didn't set the buffer as such via the set caching IOCTL).

To put it into context, this is the point of MOCS. Userspace can
configure its buffer access without having to tell the kernel to do a
synchronous (i.e. stalling and is slow) update of the PTE.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config
  2016-04-26 12:57   ` Chris Wilson
@ 2016-04-26 13:17     ` Imre Deak
  2016-04-26 13:23       ` Chris Wilson
  0 siblings, 1 reply; 32+ messages in thread
From: Imre Deak @ 2016-04-26 13:17 UTC (permalink / raw)
  To: Chris Wilson
  Cc: intel-gfx, Valtteri Rantala, Eero Tamminen, Michael T Frederick

On ti, 2016-04-26 at 13:57 +0100, Chris Wilson wrote:
> On Tue, Apr 26, 2016 at 03:44:22PM +0300, Imre Deak wrote:
> > Setting a write-back cache policy in the MOCS entry definition also
> > implies snooping, which has a considerable overhead. This is
> > unexpected for a few reasons:
> 
> If it is snooping, then I don't see why it is undesirable to have it
> available in a mocs setting. If it is bogus and the bit is undefined,
> then by all means remove it.

None of these entries are used alone for coherent surfaces. For that
the application would have to use entry index#1 or #2 _and_ call the
set caching IOCTL to set the corresponding buffer to be cached. The
problem is that without setting the buffer to be cacheable the
expectation is that we won't be snooping and incur the corresponding
overhead. This is what this patch addresses.

The bit is also bogus, if we wanted snooping via MOCS we'd use the
dedicated HW flag for that.

If we wanted to have a snooping MOCS entry we should add that
separately (as a forth entry), but we'd need this change as a fix for
current users.

--Imre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config
  2016-04-26 13:17     ` Imre Deak
@ 2016-04-26 13:23       ` Chris Wilson
  2016-04-26 13:43         ` Imre Deak
  2016-04-26 14:26         ` Eero Tamminen
  0 siblings, 2 replies; 32+ messages in thread
From: Chris Wilson @ 2016-04-26 13:23 UTC (permalink / raw)
  To: Imre Deak; +Cc: intel-gfx, Valtteri Rantala, Eero Tamminen, Michael T Frederick

On Tue, Apr 26, 2016 at 04:17:55PM +0300, Imre Deak wrote:
> On ti, 2016-04-26 at 13:57 +0100, Chris Wilson wrote:
> > On Tue, Apr 26, 2016 at 03:44:22PM +0300, Imre Deak wrote:
> > > Setting a write-back cache policy in the MOCS entry definition also
> > > implies snooping, which has a considerable overhead. This is
> > > unexpected for a few reasons:
> > 
> > If it is snooping, then I don't see why it is undesirable to have it
> > available in a mocs setting. If it is bogus and the bit is undefined,
> > then by all means remove it.
> 
> None of these entries are used alone for coherent surfaces. For that
> the application would have to use entry index#1 or #2 _and_ call the
> set caching IOCTL to set the corresponding buffer to be cached.

No, the application doesn't. There are sufficent interfaces exposed that
userspace can bypass the kernel if it so desired.

> The
> problem is that without setting the buffer to be cacheable the
> expectation is that we won't be snooping and incur the corresponding
> overhead. This is what this patch addresses.

Not true.

> The bit is also bogus, if we wanted snooping via MOCS we'd use the
> dedicated HW flag for that.

But you keep saying this bit *enables* snooping. So either it does or it
doesn't.

> If we wanted to have a snooping MOCS entry we should add that
> separately (as a forth entry), but we'd need this change as a fix for
> current users.

The current users who are getting what they request but don't know what
they were requesting?
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config
  2016-04-26 13:23       ` Chris Wilson
@ 2016-04-26 13:43         ` Imre Deak
  2016-04-26 13:58           ` Chris Wilson
  2016-04-26 14:26         ` Eero Tamminen
  1 sibling, 1 reply; 32+ messages in thread
From: Imre Deak @ 2016-04-26 13:43 UTC (permalink / raw)
  To: Chris Wilson
  Cc: intel-gfx, Valtteri Rantala, Eero Tamminen, Michael T Frederick

On ti, 2016-04-26 at 14:23 +0100, Chris Wilson wrote:
> On Tue, Apr 26, 2016 at 04:17:55PM +0300, Imre Deak wrote:
> > On ti, 2016-04-26 at 13:57 +0100, Chris Wilson wrote:
> > > On Tue, Apr 26, 2016 at 03:44:22PM +0300, Imre Deak wrote:
> > > > Setting a write-back cache policy in the MOCS entry definition
> > > > also
> > > > implies snooping, which has a considerable overhead. This is
> > > > unexpected for a few reasons:
> > > 
> > > If it is snooping, then I don't see why it is undesirable to have
> > > it
> > > available in a mocs setting. If it is bogus and the bit is
> > > undefined,
> > > then by all means remove it.
> > 
> > None of these entries are used alone for coherent surfaces. For
> > that
> > the application would have to use entry index#1 or #2 _and_ call
> > the
> > set caching IOCTL to set the corresponding buffer to be cached.
> 
> No, the application doesn't. There are sufficent interfaces exposed
> that
> userspace can bypass the kernel if it so desired.
> 
> > The
> > problem is that without setting the buffer to be cacheable the
> > expectation is that we won't be snooping and incur the
> > corresponding
> > overhead. This is what this patch addresses.
> 
> Not true.

This is what we get running basic tests with current Mesa and UFO
drivers. They use entry #2 and do not expect snooping and the incurred
overhead. If they wanted a coherent surface they would have to call the
set caching IOCTL at least on VLV/CHV since on those platforms you
don't have a snooping flag in the MOCS, you have to set up the PTE
accordingly.

> > The bit is also bogus, if we wanted snooping via MOCS we'd use the
> > dedicated HW flag for that.
> 
> But you keep saying this bit *enables* snooping. So either it does or
> it doesn't.

It enables snooping but that's just a side effect. WB certainly doesn't
make sense for BXT since this WB flag controls the cacheability in
(e)LLC which doesn't exist on BXT. We'd use the 'snooping' HW MOCS flag
for this purpose.

> > If we wanted to have a snooping MOCS entry we should add that
> > separately (as a forth entry), but we'd need this change as a fix
> > for
> > current users.
> 
> The current users who are getting what they request but don't know
> what they were requesting?

They were requesting a non-coherent surface via MOCS entry #2.

--Imre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config
  2016-04-26 13:43         ` Imre Deak
@ 2016-04-26 13:58           ` Chris Wilson
  0 siblings, 0 replies; 32+ messages in thread
From: Chris Wilson @ 2016-04-26 13:58 UTC (permalink / raw)
  To: Imre Deak; +Cc: intel-gfx, Valtteri Rantala, Eero Tamminen, Michael T Frederick

On Tue, Apr 26, 2016 at 04:43:10PM +0300, Imre Deak wrote:
> On ti, 2016-04-26 at 14:23 +0100, Chris Wilson wrote:
> > On Tue, Apr 26, 2016 at 04:17:55PM +0300, Imre Deak wrote:
> > > On ti, 2016-04-26 at 13:57 +0100, Chris Wilson wrote:
> > > > On Tue, Apr 26, 2016 at 03:44:22PM +0300, Imre Deak wrote:
> > > > > Setting a write-back cache policy in the MOCS entry definition
> > > > > also
> > > > > implies snooping, which has a considerable overhead. This is
> > > > > unexpected for a few reasons:
> > > > 
> > > > If it is snooping, then I don't see why it is undesirable to have
> > > > it
> > > > available in a mocs setting. If it is bogus and the bit is
> > > > undefined,
> > > > then by all means remove it.
> > > 
> > > None of these entries are used alone for coherent surfaces. For
> > > that
> > > the application would have to use entry index#1 or #2 _and_ call
> > > the
> > > set caching IOCTL to set the corresponding buffer to be cached.
> > 
> > No, the application doesn't. There are sufficent interfaces exposed
> > that
> > userspace can bypass the kernel if it so desired.
> > 
> > > The
> > > problem is that without setting the buffer to be cacheable the
> > > expectation is that we won't be snooping and incur the
> > > corresponding
> > > overhead. This is what this patch addresses.
> > 
> > Not true.
> 
> This is what we get running basic tests with current Mesa and UFO
> drivers. They use entry #2 and do not expect snooping and the incurred
> overhead. If they wanted a coherent surface they would have to call the
> set caching IOCTL at least on VLV/CHV since on those platforms you
> don't have a snooping flag in the MOCS, you have to set up the PTE
> accordingly.

That doesn't imply anything about a new interface for bxt. This feature
allows userspace to mix snooping, cached, and uncached access to the
same surface within the same batch. And userspace is able to read that
information coherently without having to tell the kernel its
set-caching.

> > > The bit is also bogus, if we wanted snooping via MOCS we'd use the
> > > dedicated HW flag for that.
> > 
> > But you keep saying this bit *enables* snooping. So either it does or
> > it doesn't.
> 
> It enables snooping but that's just a side effect. WB certainly doesn't
> make sense for BXT since this WB flag controls the cacheability in
> (e)LLC which doesn't exist on BXT. We'd use the 'snooping' HW MOCS flag
> for this purpose.
> 
> > > If we wanted to have a snooping MOCS entry we should add that
> > > separately (as a forth entry), but we'd need this change as a fix
> > > for
> > > current users.
> > 
> > The current users who are getting what they request but don't know
> > what they were requesting?
> 
> They were requesting a non-coherent surface via MOCS entry #2.

But the kernel was advertising a coherent surface via MOCS entry #2. Is
not the code just cut'n'paste from LLC. It's not like mesa has a good
history of supporting !llc at all.

If the patch is make #2 == #0, then seriously there is no point as we
already advertise that mocs setting in #0. I'd rather have the entry
fixed to work as advertised.

At the moment your argument is just that userspace is shooting itself in
the foot.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config
  2016-04-26 13:23       ` Chris Wilson
  2016-04-26 13:43         ` Imre Deak
@ 2016-04-26 14:26         ` Eero Tamminen
  2016-04-26 14:30           ` Daniel Vetter
  2016-04-26 14:42           ` Chris Wilson
  1 sibling, 2 replies; 32+ messages in thread
From: Eero Tamminen @ 2016-04-26 14:26 UTC (permalink / raw)
  To: Chris Wilson, Imre Deak, intel-gfx, Valtteri Rantala,
	Michael T Frederick, Ville Syrjälä

Hi,

On 26.04.2016 16:23, Chris Wilson wrote:
> On Tue, Apr 26, 2016 at 04:17:55PM +0300, Imre Deak wrote:
>> On ti, 2016-04-26 at 13:57 +0100, Chris Wilson wrote:
>>> On Tue, Apr 26, 2016 at 03:44:22PM +0300, Imre Deak wrote:
>>>> Setting a write-back cache policy in the MOCS entry definition also
>>>> implies snooping, which has a considerable overhead. This is
>>>> unexpected for a few reasons:
>>>
>>> If it is snooping, then I don't see why it is undesirable to have it
>>> available in a mocs setting. If it is bogus and the bit is undefined,
>>> then by all means remove it.
>>
>> None of these entries are used alone for coherent surfaces. For that
>> the application would have to use entry index#1 or #2 _and_ call the
>> set caching IOCTL to set the corresponding buffer to be cached.
>
> No, the application doesn't. There are sufficent interfaces exposed that
> userspace can bypass the kernel if it so desired.
>
>> The
>> problem is that without setting the buffer to be cacheable the
>> expectation is that we won't be snooping and incur the corresponding
>> overhead. This is what this patch addresses.
>
> Not true.
>
>> The bit is also bogus, if we wanted snooping via MOCS we'd use the
>> dedicated HW flag for that.
>
> But you keep saying this bit *enables* snooping. So either it does or it
> doesn't.
>
>> If we wanted to have a snooping MOCS entry we should add that
>> separately (as a forth entry), but we'd need this change as a fix for
>> current users.
>
> The current users who are getting what they request but don't know what
> they were requesting?

What this kernel ABI (index entry #2) has been agreed & documented to 
provide?

I thought this entry is supposed to replace the writeback LLC/eLLC cache 
MOCS setting Mesa is using on (e.g. BDW) to speed up accesses to a 
memory area which it knows always to be accessed so that it can be cached.

If app runs on HW where LLC/eLLC is missing, giving the app extra 
slowdown instead of potential speedup sounds like failed HW abstraction. :-)


	- Eero

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config
  2016-04-26 14:26         ` Eero Tamminen
@ 2016-04-26 14:30           ` Daniel Vetter
  2016-04-26 17:18             ` Eero Tamminen
  2016-04-26 17:57             ` Ville Syrjälä
  2016-04-26 14:42           ` Chris Wilson
  1 sibling, 2 replies; 32+ messages in thread
From: Daniel Vetter @ 2016-04-26 14:30 UTC (permalink / raw)
  To: Eero Tamminen; +Cc: Michael T Frederick, Valtteri Rantala, intel-gfx

On Tue, Apr 26, 2016 at 05:26:43PM +0300, Eero Tamminen wrote:
> Hi,
> 
> On 26.04.2016 16:23, Chris Wilson wrote:
> >On Tue, Apr 26, 2016 at 04:17:55PM +0300, Imre Deak wrote:
> >>On ti, 2016-04-26 at 13:57 +0100, Chris Wilson wrote:
> >>>On Tue, Apr 26, 2016 at 03:44:22PM +0300, Imre Deak wrote:
> >>>>Setting a write-back cache policy in the MOCS entry definition also
> >>>>implies snooping, which has a considerable overhead. This is
> >>>>unexpected for a few reasons:
> >>>
> >>>If it is snooping, then I don't see why it is undesirable to have it
> >>>available in a mocs setting. If it is bogus and the bit is undefined,
> >>>then by all means remove it.
> >>
> >>None of these entries are used alone for coherent surfaces. For that
> >>the application would have to use entry index#1 or #2 _and_ call the
> >>set caching IOCTL to set the corresponding buffer to be cached.
> >
> >No, the application doesn't. There are sufficent interfaces exposed that
> >userspace can bypass the kernel if it so desired.
> >
> >>The
> >>problem is that without setting the buffer to be cacheable the
> >>expectation is that we won't be snooping and incur the corresponding
> >>overhead. This is what this patch addresses.
> >
> >Not true.
> >
> >>The bit is also bogus, if we wanted snooping via MOCS we'd use the
> >>dedicated HW flag for that.
> >
> >But you keep saying this bit *enables* snooping. So either it does or it
> >doesn't.
> >
> >>If we wanted to have a snooping MOCS entry we should add that
> >>separately (as a forth entry), but we'd need this change as a fix for
> >>current users.
> >
> >The current users who are getting what they request but don't know what
> >they were requesting?
> 
> What this kernel ABI (index entry #2) has been agreed & documented to
> provide?
> 
> I thought this entry is supposed to replace the writeback LLC/eLLC cache
> MOCS setting Mesa is using on (e.g. BDW) to speed up accesses to a memory
> area which it knows always to be accessed so that it can be cached.
> 
> If app runs on HW where LLC/eLLC is missing, giving the app extra slowdown
> instead of potential speedup sounds like failed HW abstraction. :-)

Well mesa needs to know llc vs. !llc anyway to not totally suck, and
defining entry #2 as "coherent, always" makes sense. I thought entry 0 was
the reaonable default aka pte passthrough and hence managed by kernel?

If mesa asks for nonsense, the kernel is happy to oblige.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config
  2016-04-26 14:26         ` Eero Tamminen
  2016-04-26 14:30           ` Daniel Vetter
@ 2016-04-26 14:42           ` Chris Wilson
  2016-04-26 16:01             ` Imre Deak
  1 sibling, 1 reply; 32+ messages in thread
From: Chris Wilson @ 2016-04-26 14:42 UTC (permalink / raw)
  To: Eero Tamminen; +Cc: Valtteri Rantala, intel-gfx, Michael T Frederick

On Tue, Apr 26, 2016 at 05:26:43PM +0300, Eero Tamminen wrote:
> Hi,
> 
> On 26.04.2016 16:23, Chris Wilson wrote:
> >On Tue, Apr 26, 2016 at 04:17:55PM +0300, Imre Deak wrote:
> >>On ti, 2016-04-26 at 13:57 +0100, Chris Wilson wrote:
> >>>On Tue, Apr 26, 2016 at 03:44:22PM +0300, Imre Deak wrote:
> >>>>Setting a write-back cache policy in the MOCS entry definition also
> >>>>implies snooping, which has a considerable overhead. This is
> >>>>unexpected for a few reasons:
> >>>
> >>>If it is snooping, then I don't see why it is undesirable to have it
> >>>available in a mocs setting. If it is bogus and the bit is undefined,
> >>>then by all means remove it.
> >>
> >>None of these entries are used alone for coherent surfaces. For that
> >>the application would have to use entry index#1 or #2 _and_ call the
> >>set caching IOCTL to set the corresponding buffer to be cached.
> >
> >No, the application doesn't. There are sufficent interfaces exposed that
> >userspace can bypass the kernel if it so desired.
> >
> >>The
> >>problem is that without setting the buffer to be cacheable the
> >>expectation is that we won't be snooping and incur the corresponding
> >>overhead. This is what this patch addresses.
> >
> >Not true.
> >
> >>The bit is also bogus, if we wanted snooping via MOCS we'd use the
> >>dedicated HW flag for that.
> >
> >But you keep saying this bit *enables* snooping. So either it does or it
> >doesn't.
> >
> >>If we wanted to have a snooping MOCS entry we should add that
> >>separately (as a forth entry), but we'd need this change as a fix for
> >>current users.
> >
> >The current users who are getting what they request but don't know what
> >they were requesting?
> 
> What this kernel ABI (index entry #2) has been agreed & documented
> to provide?

The ABI is what we agree makes sense between hardware / kernel /
userspace, and then we stick to it.

> I thought this entry is supposed to replace the writeback LLC/eLLC
> cache MOCS setting Mesa is using on (e.g. BDW) to speed up accesses
> to a memory area which it knows always to be accessed so that it can
> be cached.

Which it does... Only it speeds us writeback from the CPU, not the GPU. ;)

The confusion seems to be in mistaking !llc for llc. We have to come to
some agreement on whether it makes sense having multiple entries for the
same follows-PTE mocs or whether it makes more sense to expose the hw
capabilties.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config
  2016-04-26 14:42           ` Chris Wilson
@ 2016-04-26 16:01             ` Imre Deak
  2016-04-28  8:17               ` Daniel Vetter
  0 siblings, 1 reply; 32+ messages in thread
From: Imre Deak @ 2016-04-26 16:01 UTC (permalink / raw)
  To: Chris Wilson, Eero Tamminen
  Cc: intel-gfx, Valtteri Rantala, Michael T Frederick

On ti, 2016-04-26 at 15:42 +0100, Chris Wilson wrote:
> On Tue, Apr 26, 2016 at 05:26:43PM +0300, Eero Tamminen wrote:
> > Hi,
> > 
> > On 26.04.2016 16:23, Chris Wilson wrote:
> > > On Tue, Apr 26, 2016 at 04:17:55PM +0300, Imre Deak wrote:
> > > > On ti, 2016-04-26 at 13:57 +0100, Chris Wilson wrote:
> > > > > On Tue, Apr 26, 2016 at 03:44:22PM +0300, Imre Deak wrote:
> > > > > > Setting a write-back cache policy in the MOCS entry
> > > > > > definition also
> > > > > > implies snooping, which has a considerable overhead. This
> > > > > > is
> > > > > > unexpected for a few reasons:
> > > > > 
> > > > > If it is snooping, then I don't see why it is undesirable to
> > > > > have it
> > > > > available in a mocs setting. If it is bogus and the bit is
> > > > > undefined,
> > > > > then by all means remove it.
> > > > 
> > > > None of these entries are used alone for coherent surfaces. For
> > > > that
> > > > the application would have to use entry index#1 or #2 _and_
> > > > call the
> > > > set caching IOCTL to set the corresponding buffer to be cached.
> > > 
> > > No, the application doesn't. There are sufficent interfaces
> > > exposed that
> > > userspace can bypass the kernel if it so desired.
> > > 
> > > > The
> > > > problem is that without setting the buffer to be cacheable the
> > > > expectation is that we won't be snooping and incur the
> > > > corresponding
> > > > overhead. This is what this patch addresses.
> > > 
> > > Not true.
> > > 
> > > > The bit is also bogus, if we wanted snooping via MOCS we'd use
> > > > the
> > > > dedicated HW flag for that.
> > > 
> > > But you keep saying this bit *enables* snooping. So either it
> > > does or it
> > > doesn't.
> > > 
> > > > If we wanted to have a snooping MOCS entry we should add that
> > > > separately (as a forth entry), but we'd need this change as a
> > > > fix for
> > > > current users.
> > > 
> > > The current users who are getting what they request but don't
> > > know what
> > > they were requesting?
> > 
> > What this kernel ABI (index entry #2) has been agreed & documented
> > to provide?
> 
> The ABI is what we agree makes sense between hardware / kernel /
> userspace, and then we stick to it.
> 
> > I thought this entry is supposed to replace the writeback LLC/eLLC
> > cache MOCS setting Mesa is using on (e.g. BDW) to speed up accesses
> > to a memory area which it knows always to be accessed so that it
> > can
> > be cached.
> 
> Which it does... Only it speeds us writeback from the CPU, not the
> GPU. ;)
> 
> The confusion seems to be in mistaking !llc for llc. We have to come
> to
> some agreement on whether it makes sense having multiple entries for
> the
> same follows-PTE mocs or whether it makes more sense to expose the hw
> capabilties.

Note that we can't just say to Mesa to use index #0 on BXT instead of
index #2, since index #0 turns off caching in GPU L3, so we'd have to
also redefine that to be L3 cached. And I don't know what turning on L3
caching for index #0 would break, I would rather avoid doing that. Imo
defining the entries the following way would allow us to use the same
indexes on GEN9 regardless of it being SKL or BXT:
#0: uncached (neither in L3 nor in (e)LLC)
#1: PTE passthrough
#2: cached everywhere (L3 + (e)LLC if it exists), non-coherent
#3: cached everywhere, coherent

I'm not sure if there is even a user for coherent surfaces atm, so then
we could delay adding #3 until it's really needed.

--Imre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 32+ messages in thread

* ✗ Fi.CI.BAT: failure for series starting with [v2,1/2] drm/i915/gen9: Clean up MOCS table definitions
  2016-04-26 12:44 [PATCH v2 1/2] drm/i915/gen9: Clean up MOCS table definitions Imre Deak
  2016-04-26 12:44 ` [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config Imre Deak
@ 2016-04-26 16:55 ` Patchwork
  1 sibling, 0 replies; 32+ messages in thread
From: Patchwork @ 2016-04-26 16:55 UTC (permalink / raw)
  To: Imre Deak; +Cc: intel-gfx

== Series Details ==

Series: series starting with [v2,1/2] drm/i915/gen9: Clean up MOCS table definitions
URL   : https://patchwork.freedesktop.org/series/6332/
State : failure

== Summary ==

Series 6332v1 Series without cover letter
http://patchwork.freedesktop.org/api/1.0/series/6332/revisions/1/mbox/

Test drv_module_reload_basic:
                pass       -> INCOMPLETE (skl-nuci5)
Test gem_ringfill:
        Subgroup basic-default-interruptible:
                pass       -> INCOMPLETE (bsw-nuc-2)
Test kms_pipe_crc_basic:
        Subgroup hang-read-crc-pipe-b:
                incomplete -> PASS       (snb-dellxps)

bdw-nuci7        total:200  pass:188  dwarn:0   dfail:0   fail:0   skip:12 
bdw-ultra        total:200  pass:175  dwarn:0   dfail:0   fail:0   skip:25 
bsw-nuc-2        total:4    pass:3    dwarn:0   dfail:0   fail:0   skip:0  
byt-nuc          total:199  pass:158  dwarn:0   dfail:0   fail:0   skip:41 
hsw-brixbox      total:200  pass:174  dwarn:0   dfail:0   fail:0   skip:26 
hsw-gt2          total:200  pass:178  dwarn:0   dfail:0   fail:1   skip:21 
ivb-t430s        total:200  pass:169  dwarn:0   dfail:0   fail:0   skip:31 
skl-i7k-2        total:200  pass:173  dwarn:0   dfail:0   fail:0   skip:27 
skl-nuci5        total:167  pass:159  dwarn:0   dfail:0   fail:0   skip:7  
snb-dellxps      total:200  pass:158  dwarn:0   dfail:0   fail:0   skip:42 
snb-x220t        total:200  pass:158  dwarn:0   dfail:0   fail:1   skip:41 

Results at /archive/results/CI_IGT_test/Patchwork_2074/

e005db1cb2c60d18abe837ac683d8993ea77b239 drm-intel-nightly: 2016y-04m-26d-12h-51m-57s UTC integration manifest
721ac06 drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config
db5f276 drm/i915/gen9: Clean up MOCS table definitions

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config
  2016-04-26 14:30           ` Daniel Vetter
@ 2016-04-26 17:18             ` Eero Tamminen
  2016-04-26 17:25               ` Frederick, Michael T
  2016-04-26 17:57             ` Ville Syrjälä
  1 sibling, 1 reply; 32+ messages in thread
From: Eero Tamminen @ 2016-04-26 17:18 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: Michael T Frederick, Valtteri Rantala, intel-gfx

Hi,

On 26.04.2016 17:30, Daniel Vetter wrote:
> On Tue, Apr 26, 2016 at 05:26:43PM +0300, Eero Tamminen wrote:
[...]
>> What this kernel ABI (index entry #2) has been agreed & documented to
>> provide?
>>
>> I thought this entry is supposed to replace the writeback LLC/eLLC cache
>> MOCS setting Mesa is using on (e.g. BDW) to speed up accesses to a memory
>> area which it knows always to be accessed so that it can be cached.
>>
>> If app runs on HW where LLC/eLLC is missing, giving the app extra slowdown
>> instead of potential speedup sounds like failed HW abstraction. :-)
>
> Well mesa needs to know llc vs. !llc anyway to not totally suck,

What do you think it should do with that information?

I assume you to mean, that Mesa needs to know the *amount* of LLC and 
change its behavior based on that amount, not just whether it's present.

In that case Mesa does, and has always totally "sucked".  Mesa on 
earlier GEN(s) cached everything that can be cached, and I assume it to 
try to do that with GEN9 too.


However, based on our MOCS testing on BDW, that actually gives the best 
overall perf results.  On average it doesn't give much, but it was 
better than any straightforward (buffer size/type) heuristics for making 
something not to be cached in effort to utilize LLC "better".

It seemed that LLC is too small to have meaningful generic heuristics 
for normal 3D workloads (or they need to be very complex, something 
needing months of testing & iteration, or be per application, not generic).

eLLC could be a different matter as it's large enough that one can put 
e.g. color/depth buffer there.

Skip cache setting for LLC may also be useful, if it works (as it in a 
sense extends the cache size), and render compression can also change 
things.  Problem with RBC is that it makes assumptions about memory 
areas usage even less reliable as you don't know how well the content 
compresses.


> and defining entry #2 as "coherent, always" makes sense. I thought entry 0 was
> the reaonable default aka pte passthrough and hence managed by kernel?
>
> If mesa asks for nonsense, the kernel is happy to oblige.


	- Eero

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config
  2016-04-26 17:18             ` Eero Tamminen
@ 2016-04-26 17:25               ` Frederick, Michael T
  2016-04-27 13:25                 ` Eero Tamminen
  0 siblings, 1 reply; 32+ messages in thread
From: Frederick, Michael T @ 2016-04-26 17:25 UTC (permalink / raw)
  To: Tamminen, Eero T, Daniel Vetter; +Cc: Rantala, Valtteri, intel-gfx

Sorry I'm not tracking all the MOCs discussions.  I just want to indicate what the coherency means in SoC for BXT.

GTI sets the non-inclusive bit on the IDI interface based on how it treats the memory.  In BXT case where there is no uncore cache, "non-inclusive" just indicates snoop or not.  BXT has a snoop filter in order to make the latency of snooping GT from a core roughly similar to snooping another core.

For BXT:
If GTI sets non-inclusive=0 (i.e. coherent): transaction looks up in the SF and the SA snoops the cores.  The potential impact here is that for high BW coherent traffic, the SF will become the BW limiter of the system and cap BW at 33% * 34GBps. For writes like WCILFs snoops to cores must be resolved before SA requests WR data from GT.  For reads the common case should have no impact because snoop latency is generally much less than memory data latency.  In general snoop latency for a core is relatively small, but there is also the prospect that a core could be down (e.g. ratio change) or loaded w/ snooping.
If GTI sets non-inclusive=1 (i.e. non-coherent): transaction takes the SF bypass and the SA does not snoop the cores.  This is best for high-BW since it removes the SF bottleneck and doesn't require core interaction.

Thanks, Mike


-----Original Message-----
From: Tamminen, Eero T 
Sent: Tuesday, April 26, 2016 10:19 AM
To: Daniel Vetter <daniel@ffwll.ch>
Cc: Chris Wilson <chris@chris-wilson.co.uk>; Deak, Imre <imre.deak@intel.com>; intel-gfx@lists.freedesktop.org; Rantala, Valtteri <valtteri.rantala@intel.com>; Frederick, Michael T <michael.t.frederick@intel.com>; Ville Syrjälä <ville.syrjala@linux.intel.com>
Subject: Re: [Intel-gfx] [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config

Hi,

On 26.04.2016 17:30, Daniel Vetter wrote:
> On Tue, Apr 26, 2016 at 05:26:43PM +0300, Eero Tamminen wrote:
[...]
>> What this kernel ABI (index entry #2) has been agreed & documented to 
>> provide?
>>
>> I thought this entry is supposed to replace the writeback LLC/eLLC 
>> cache MOCS setting Mesa is using on (e.g. BDW) to speed up accesses 
>> to a memory area which it knows always to be accessed so that it can be cached.
>>
>> If app runs on HW where LLC/eLLC is missing, giving the app extra 
>> slowdown instead of potential speedup sounds like failed HW 
>> abstraction. :-)
>
> Well mesa needs to know llc vs. !llc anyway to not totally suck,

What do you think it should do with that information?

I assume you to mean, that Mesa needs to know the *amount* of LLC and change its behavior based on that amount, not just whether it's present.

In that case Mesa does, and has always totally "sucked".  Mesa on earlier GEN(s) cached everything that can be cached, and I assume it to try to do that with GEN9 too.


However, based on our MOCS testing on BDW, that actually gives the best overall perf results.  On average it doesn't give much, but it was better than any straightforward (buffer size/type) heuristics for making something not to be cached in effort to utilize LLC "better".

It seemed that LLC is too small to have meaningful generic heuristics for normal 3D workloads (or they need to be very complex, something needing months of testing & iteration, or be per application, not generic).

eLLC could be a different matter as it's large enough that one can put e.g. color/depth buffer there.

Skip cache setting for LLC may also be useful, if it works (as it in a sense extends the cache size), and render compression can also change things.  Problem with RBC is that it makes assumptions about memory areas usage even less reliable as you don't know how well the content compresses.


> and defining entry #2 as "coherent, always" makes sense. I thought 
> entry 0 was the reaonable default aka pte passthrough and hence managed by kernel?
>
> If mesa asks for nonsense, the kernel is happy to oblige.


	- Eero

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config
  2016-04-26 14:30           ` Daniel Vetter
  2016-04-26 17:18             ` Eero Tamminen
@ 2016-04-26 17:57             ` Ville Syrjälä
  2016-04-28  8:13               ` Daniel Vetter
  1 sibling, 1 reply; 32+ messages in thread
From: Ville Syrjälä @ 2016-04-26 17:57 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: Eero Tamminen, Michael T Frederick, Valtteri Rantala, intel-gfx

On Tue, Apr 26, 2016 at 04:30:05PM +0200, Daniel Vetter wrote:
> On Tue, Apr 26, 2016 at 05:26:43PM +0300, Eero Tamminen wrote:
> > Hi,
> > 
> > On 26.04.2016 16:23, Chris Wilson wrote:
> > >On Tue, Apr 26, 2016 at 04:17:55PM +0300, Imre Deak wrote:
> > >>On ti, 2016-04-26 at 13:57 +0100, Chris Wilson wrote:
> > >>>On Tue, Apr 26, 2016 at 03:44:22PM +0300, Imre Deak wrote:
> > >>>>Setting a write-back cache policy in the MOCS entry definition also
> > >>>>implies snooping, which has a considerable overhead. This is
> > >>>>unexpected for a few reasons:
> > >>>
> > >>>If it is snooping, then I don't see why it is undesirable to have it
> > >>>available in a mocs setting. If it is bogus and the bit is undefined,
> > >>>then by all means remove it.
> > >>
> > >>None of these entries are used alone for coherent surfaces. For that
> > >>the application would have to use entry index#1 or #2 _and_ call the
> > >>set caching IOCTL to set the corresponding buffer to be cached.
> > >
> > >No, the application doesn't. There are sufficent interfaces exposed that
> > >userspace can bypass the kernel if it so desired.
> > >
> > >>The
> > >>problem is that without setting the buffer to be cacheable the
> > >>expectation is that we won't be snooping and incur the corresponding
> > >>overhead. This is what this patch addresses.
> > >
> > >Not true.
> > >
> > >>The bit is also bogus, if we wanted snooping via MOCS we'd use the
> > >>dedicated HW flag for that.
> > >
> > >But you keep saying this bit *enables* snooping. So either it does or it
> > >doesn't.
> > >
> > >>If we wanted to have a snooping MOCS entry we should add that
> > >>separately (as a forth entry), but we'd need this change as a fix for
> > >>current users.
> > >
> > >The current users who are getting what they request but don't know what
> > >they were requesting?
> > 
> > What this kernel ABI (index entry #2) has been agreed & documented to
> > provide?
> > 
> > I thought this entry is supposed to replace the writeback LLC/eLLC cache
> > MOCS setting Mesa is using on (e.g. BDW) to speed up accesses to a memory
> > area which it knows always to be accessed so that it can be cached.
> > 
> > If app runs on HW where LLC/eLLC is missing, giving the app extra slowdown
> > instead of potential speedup sounds like failed HW abstraction. :-)
> 
> Well mesa needs to know llc vs. !llc anyway to not totally suck, and
> defining entry #2 as "coherent, always" makes sense. I thought entry 0 was
> the reaonable default aka pte passthrough and hence managed by kernel?

Nope, we fscked that up somewhat, and entry 1 is the PTE one :( So if
userspace forgets to set MOCS on gen9 it won't get the behaviour it
would have gotten on previous gens.

> 
> If mesa asks for nonsense, the kernel is happy to oblige.

We never really defined what entry 2 actually means: coherent or sane
performance. Mesa has, rightfully IMO, made the assumption that it means
the latter since we never set out to define any MOCS entries with
coherency in mind. And seeing that it's already out in the wild, I think
it's better to respect it. If we change it now we would just make it
more painful for people when they get their hands on the hardware.

I think what we should do is define what the MOCS indexes mean in some
uapi header. Then there would be no ambiguity.

-- 
Ville Syrjälä
Intel OTC
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config
  2016-04-26 17:25               ` Frederick, Michael T
@ 2016-04-27 13:25                 ` Eero Tamminen
  2016-04-27 14:53                   ` Chris Wilson
  0 siblings, 1 reply; 32+ messages in thread
From: Eero Tamminen @ 2016-04-27 13:25 UTC (permalink / raw)
  To: Frederick, Michael T, Daniel Vetter; +Cc: Rantala, Valtteri, intel-gfx

Hi,

On 26.04.2016 20:25, Frederick, Michael T wrote:
> Sorry I'm not tracking all the MOCs discussions.  I just want to indicate what the coherency means in SoC for BXT.
>
> GTI sets the non-inclusive bit on the IDI interface based on how it treats the memory.  In BXT case where there is no uncore cache, "non-inclusive" just indicates snoop or not.  BXT has a snoop filter in order to make the latency of snooping GT from a core roughly similar to snooping another core.
>
> For BXT:
> If GTI sets non-inclusive=0 (i.e. coherent): transaction looks up in the SF and the SA snoops the cores.  The potential impact here is that for high BW coherent traffic, the SF will become the BW limiter of the system and cap BW at 33% * 34GBps. For writes like WCILFs snoops to cores must be resolved before SA requests WR data from GT.  For reads the common case should have no impact because snoop latency is generally much less than memory data latency.  In general snoop latency for a core is relatively small, but there is also the prospect that a core could be down (e.g. ratio change) or loaded w/ snooping.
> If GTI sets non-inclusive=1 (i.e. non-coherent): transaction takes the SF bypass and the SA does not snoop the cores.  This is best for high-BW since it removes the SF bottleneck and doesn't require core interaction.

Thanks for the explanation!

AFAIK:

* In regards to 3D driver operations, CPU side doesn't modify the buffer 
contents while GPU is working on them.  CPU side sets up the buffers 
(textures, VBOs, batches etc), and then (after a flush) GPU is asked to 
act on them.

* For things like texture streaming, the driver either internally 
synchronizes the data or creates a new copy of it whenever application 
tells that data is updated.  There's always some kind of "upload" 
involved (GL API needs it as non-integrated GPU's don't share memory 
with CPU).

While it's possible that there's a case where snooping would be needed, 
I cannot think of any myself.

Daniel, Chris, did you have some concrete example in mind where 3D 
driver would require CPU to snoop GPU?


	- Eero

Disclaimer: I'm not a 3D driver developer myself.

>
> Thanks, Mike
>
>
> -----Original Message-----
> From: Tamminen, Eero T
> Sent: Tuesday, April 26, 2016 10:19 AM
> To: Daniel Vetter <daniel@ffwll.ch>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>; Deak, Imre <imre.deak@intel.com>; intel-gfx@lists.freedesktop.org; Rantala, Valtteri <valtteri.rantala@intel.com>; Frederick, Michael T <michael.t.frederick@intel.com>; Ville Syrjälä <ville.syrjala@linux.intel.com>
> Subject: Re: [Intel-gfx] [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config
>
> Hi,
>
> On 26.04.2016 17:30, Daniel Vetter wrote:
>> On Tue, Apr 26, 2016 at 05:26:43PM +0300, Eero Tamminen wrote:
> [...]
>>> What this kernel ABI (index entry #2) has been agreed & documented to
>>> provide?
>>>
>>> I thought this entry is supposed to replace the writeback LLC/eLLC
>>> cache MOCS setting Mesa is using on (e.g. BDW) to speed up accesses
>>> to a memory area which it knows always to be accessed so that it can be cached.
>>>
>>> If app runs on HW where LLC/eLLC is missing, giving the app extra
>>> slowdown instead of potential speedup sounds like failed HW
>>> abstraction. :-)
>>
>> Well mesa needs to know llc vs. !llc anyway to not totally suck,
>
> What do you think it should do with that information?
>
> I assume you to mean, that Mesa needs to know the *amount* of LLC and change its behavior based on that amount, not just whether it's present.
>
> In that case Mesa does, and has always totally "sucked".  Mesa on earlier GEN(s) cached everything that can be cached, and I assume it to try to do that with GEN9 too.
>
>
> However, based on our MOCS testing on BDW, that actually gives the best overall perf results.  On average it doesn't give much, but it was better than any straightforward (buffer size/type) heuristics for making something not to be cached in effort to utilize LLC "better".
>
> It seemed that LLC is too small to have meaningful generic heuristics for normal 3D workloads (or they need to be very complex, something needing months of testing & iteration, or be per application, not generic).
>
> eLLC could be a different matter as it's large enough that one can put e.g. color/depth buffer there.
>
> Skip cache setting for LLC may also be useful, if it works (as it in a sense extends the cache size), and render compression can also change things.  Problem with RBC is that it makes assumptions about memory areas usage even less reliable as you don't know how well the content compresses.
>
>
>> and defining entry #2 as "coherent, always" makes sense. I thought
>> entry 0 was the reaonable default aka pte passthrough and hence managed by kernel?
>>
>> If mesa asks for nonsense, the kernel is happy to oblige.
>
>
> 	- Eero
>

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config
  2016-04-27 13:25                 ` Eero Tamminen
@ 2016-04-27 14:53                   ` Chris Wilson
  2016-04-27 18:42                     ` Dave Gordon
  2016-04-29  8:01                     ` Eero Tamminen
  0 siblings, 2 replies; 32+ messages in thread
From: Chris Wilson @ 2016-04-27 14:53 UTC (permalink / raw)
  To: Eero Tamminen; +Cc: intel-gfx, Frederick, Michael T, Rantala, Valtteri

On Wed, Apr 27, 2016 at 04:25:09PM +0300, Eero Tamminen wrote:
> Hi,
> 
> On 26.04.2016 20:25, Frederick, Michael T wrote:
> >Sorry I'm not tracking all the MOCs discussions.  I just want to indicate what the coherency means in SoC for BXT.
> >
> >GTI sets the non-inclusive bit on the IDI interface based on how it treats the memory.  In BXT case where there is no uncore cache, "non-inclusive" just indicates snoop or not.  BXT has a snoop filter in order to make the latency of snooping GT from a core roughly similar to snooping another core.
> >
> >For BXT:
> >If GTI sets non-inclusive=0 (i.e. coherent): transaction looks up in the SF and the SA snoops the cores.  The potential impact here is that for high BW coherent traffic, the SF will become the BW limiter of the system and cap BW at 33% * 34GBps. For writes like WCILFs snoops to cores must be resolved before SA requests WR data from GT.  For reads the common case should have no impact because snoop latency is generally much less than memory data latency.  In general snoop latency for a core is relatively small, but there is also the prospect that a core could be down (e.g. ratio change) or loaded w/ snooping.
> >If GTI sets non-inclusive=1 (i.e. non-coherent): transaction takes the SF bypass and the SA does not snoop the cores.  This is best for high-BW since it removes the SF bottleneck and doesn't require core interaction.
> 
> Thanks for the explanation!
> 
> AFAIK:
> 
> * In regards to 3D driver operations, CPU side doesn't modify the
> buffer contents while GPU is working on them.  CPU side sets up the
> buffers (textures, VBOs, batches etc), and then (after a flush) GPU
> is asked to act on them.
> 
> * For things like texture streaming, the driver either internally
> synchronizes the data or creates a new copy of it whenever
> application tells that data is updated.  There's always some kind of
> "upload" involved (GL API needs it as non-integrated GPU's don't
> share memory with CPU).
> 
> While it's possible that there's a case where snooping would be
> needed, I cannot think of any myself.
> 
> Daniel, Chris, did you have some concrete example in mind where 3D
> driver would require CPU to snoop GPU?

Not mesa, but X can do concurrent rendering to a Pixmap whilst also
rendering from other parts of that Pixmap into a GPU side buffer and
presentation/compositing thereof. X uses snooping both ways (from client
memory to GPU and from GPU to client memory) as well as mixed rendering.

Mesa should be using snooping for both SubTexImage and GetTexImage. On
the SubTexImage path you can use the sampler to do format conversions
that even including the sync overhead for correctness when using client
memory avoid the awful format conversion code in mesa. Using the GPU to
write into client memory and avoiding WC reads is approximately an
order of magnitude (8x) faster than the current code mesa uses.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config
  2016-04-27 14:53                   ` Chris Wilson
@ 2016-04-27 18:42                     ` Dave Gordon
  2016-04-29  8:01                     ` Eero Tamminen
  1 sibling, 0 replies; 32+ messages in thread
From: Dave Gordon @ 2016-04-27 18:42 UTC (permalink / raw)
  To: intel-gfx

On 27/04/16 15:53, Chris Wilson wrote:
> On Wed, Apr 27, 2016 at 04:25:09PM +0300, Eero Tamminen wrote:
>> Hi,
>>
>> On 26.04.2016 20:25, Frederick, Michael T wrote:
>>> Sorry I'm not tracking all the MOCs discussions.  I just want to indicate what the coherency means in SoC for BXT.
>>>
>>> GTI sets the non-inclusive bit on the IDI interface based on how it treats the memory.  In BXT case where there is no uncore cache, "non-inclusive" just indicates snoop or not.  BXT has a snoop filter in order to make the latency of snooping GT from a core roughly similar to snooping another core.
>>>
>>> For BXT:
>>> If GTI sets non-inclusive=0 (i.e. coherent): transaction looks up in the SF and the SA snoops the cores.  The potential impact here is that for high BW coherent traffic, the SF will become the BW limiter of the system and cap BW at 33% * 34GBps. For writes like WCILFs snoops to cores must be resolved before SA requests WR data from GT.  For reads the common case should have no impact because snoop latency is generally much less than memory data latency.  In general snoop latency for a core is relatively small, but there is also the prospect that a core could be down (e.g. ratio change) or loaded w/ snooping.
>>> If GTI sets non-inclusive=1 (i.e. non-coherent): transaction takes the SF bypass and the SA does not snoop the cores.  This is best for high-BW since it removes the SF bottleneck and doesn't require core interaction.
>>
>> Thanks for the explanation!
>>
>> AFAIK:
>>
>> * In regards to 3D driver operations, CPU side doesn't modify the
>> buffer contents while GPU is working on them.  CPU side sets up the
>> buffers (textures, VBOs, batches etc), and then (after a flush) GPU
>> is asked to act on them.
>>
>> * For things like texture streaming, the driver either internally
>> synchronizes the data or creates a new copy of it whenever
>> application tells that data is updated.  There's always some kind of
>> "upload" involved (GL API needs it as non-integrated GPU's don't
>> share memory with CPU).
>>
>> While it's possible that there's a case where snooping would be
>> needed, I cannot think of any myself.
>>
>> Daniel, Chris, did you have some concrete example in mind where 3D
>> driver would require CPU to snoop GPU?
>
> Not mesa, but X can do concurrent rendering to a Pixmap whilst also
> rendering from other parts of that Pixmap into a GPU side buffer and
> presentation/compositing thereof. X uses snooping both ways (from client
> memory to GPU and from GPU to client memory) as well as mixed rendering.
>
> Mesa should be using snooping for both SubTexImage and GetTexImage. On
> the SubTexImage path you can use the sampler to do format conversions
> that even including the sync overhead for correctness when using client
> memory avoid the awful format conversion code in mesa. Using the GPU to
> write into client memory and avoiding WC reads is approximately an
> order of magnitude (8x) faster than the current code mesa uses.
> -Chris

Presumably its useful for the CPU to snoop the h/w status page(s), and 
maybe the ring-context part of a context image (so that TAIL updates are 
coherent), but OTOH snooping the rest of the context image might add 
overhead, and AFAIK we don't normally read (or write) any of that after 
setup. So maybe we don't want vmap-whole-object after all?

.Dave.

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config
  2016-04-26 17:57             ` Ville Syrjälä
@ 2016-04-28  8:13               ` Daniel Vetter
  2016-04-28 10:48                 ` Ville Syrjälä
  0 siblings, 1 reply; 32+ messages in thread
From: Daniel Vetter @ 2016-04-28  8:13 UTC (permalink / raw)
  To: Ville Syrjälä
  Cc: Eero Tamminen, Michael T Frederick, Valtteri Rantala, intel-gfx

On Tue, Apr 26, 2016 at 08:57:51PM +0300, Ville Syrjälä wrote:
> On Tue, Apr 26, 2016 at 04:30:05PM +0200, Daniel Vetter wrote:
> > On Tue, Apr 26, 2016 at 05:26:43PM +0300, Eero Tamminen wrote:
> > > Hi,
> > > 
> > > On 26.04.2016 16:23, Chris Wilson wrote:
> > > >On Tue, Apr 26, 2016 at 04:17:55PM +0300, Imre Deak wrote:
> > > >>On ti, 2016-04-26 at 13:57 +0100, Chris Wilson wrote:
> > > >>>On Tue, Apr 26, 2016 at 03:44:22PM +0300, Imre Deak wrote:
> > > >>>>Setting a write-back cache policy in the MOCS entry definition also
> > > >>>>implies snooping, which has a considerable overhead. This is
> > > >>>>unexpected for a few reasons:
> > > >>>
> > > >>>If it is snooping, then I don't see why it is undesirable to have it
> > > >>>available in a mocs setting. If it is bogus and the bit is undefined,
> > > >>>then by all means remove it.
> > > >>
> > > >>None of these entries are used alone for coherent surfaces. For that
> > > >>the application would have to use entry index#1 or #2 _and_ call the
> > > >>set caching IOCTL to set the corresponding buffer to be cached.
> > > >
> > > >No, the application doesn't. There are sufficent interfaces exposed that
> > > >userspace can bypass the kernel if it so desired.
> > > >
> > > >>The
> > > >>problem is that without setting the buffer to be cacheable the
> > > >>expectation is that we won't be snooping and incur the corresponding
> > > >>overhead. This is what this patch addresses.
> > > >
> > > >Not true.
> > > >
> > > >>The bit is also bogus, if we wanted snooping via MOCS we'd use the
> > > >>dedicated HW flag for that.
> > > >
> > > >But you keep saying this bit *enables* snooping. So either it does or it
> > > >doesn't.
> > > >
> > > >>If we wanted to have a snooping MOCS entry we should add that
> > > >>separately (as a forth entry), but we'd need this change as a fix for
> > > >>current users.
> > > >
> > > >The current users who are getting what they request but don't know what
> > > >they were requesting?
> > > 
> > > What this kernel ABI (index entry #2) has been agreed & documented to
> > > provide?
> > > 
> > > I thought this entry is supposed to replace the writeback LLC/eLLC cache
> > > MOCS setting Mesa is using on (e.g. BDW) to speed up accesses to a memory
> > > area which it knows always to be accessed so that it can be cached.
> > > 
> > > If app runs on HW where LLC/eLLC is missing, giving the app extra slowdown
> > > instead of potential speedup sounds like failed HW abstraction. :-)
> > 
> > Well mesa needs to know llc vs. !llc anyway to not totally suck, and
> > defining entry #2 as "coherent, always" makes sense. I thought entry 0 was
> > the reaonable default aka pte passthrough and hence managed by kernel?
> 
> Nope, we fscked that up somewhat, and entry 1 is the PTE one :( So if
> userspace forgets to set MOCS on gen9 it won't get the behaviour it
> would have gotten on previous gens.

How do we manage to pass the various set_caching vs. rendercpy test then
on skl? Or do we just not, and everyone is still lalala?

Sounds like something to fix either way.

> > If mesa asks for nonsense, the kernel is happy to oblige.
> 
> We never really defined what entry 2 actually means: coherent or sane
> performance. Mesa has, rightfully IMO, made the assumption that it means
> the latter since we never set out to define any MOCS entries with
> coherency in mind. And seeing that it's already out in the wild, I think
> it's better to respect it. If we change it now we would just make it
> more painful for people when they get their hands on the hardware.
> 
> I think what we should do is define what the MOCS indexes mean in some
> uapi header. Then there would be no ambiguity.

Problem with encoding this is that sooner or later we're playing a game of
whack-a-mole where kernel second-guesses mesa to support old versions, and
mesa tries to work around kernel assumptions that no longer fit reworked
code. "It's the easiest solution for existing binaries" is imo very poor
justification for ABI, especially when it's just about performance.
Fundamentally mesa still works correctly, just a bit slow.

I don't want to merge a random hack just because. And it sounds like
someone needs to fix up the MOCS story on gen9 overall anyway.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config
  2016-04-26 16:01             ` Imre Deak
@ 2016-04-28  8:17               ` Daniel Vetter
  2016-04-28  8:38                 ` Imre Deak
  0 siblings, 1 reply; 32+ messages in thread
From: Daniel Vetter @ 2016-04-28  8:17 UTC (permalink / raw)
  To: Imre Deak; +Cc: Eero Tamminen, Valtteri Rantala, intel-gfx, Michael T Frederick

On Tue, Apr 26, 2016 at 07:01:06PM +0300, Imre Deak wrote:
> On ti, 2016-04-26 at 15:42 +0100, Chris Wilson wrote:
> > On Tue, Apr 26, 2016 at 05:26:43PM +0300, Eero Tamminen wrote:
> > > Hi,
> > > 
> > > On 26.04.2016 16:23, Chris Wilson wrote:
> > > > On Tue, Apr 26, 2016 at 04:17:55PM +0300, Imre Deak wrote:
> > > > > On ti, 2016-04-26 at 13:57 +0100, Chris Wilson wrote:
> > > > > > On Tue, Apr 26, 2016 at 03:44:22PM +0300, Imre Deak wrote:
> > > > > > > Setting a write-back cache policy in the MOCS entry
> > > > > > > definition also
> > > > > > > implies snooping, which has a considerable overhead. This
> > > > > > > is
> > > > > > > unexpected for a few reasons:
> > > > > > 
> > > > > > If it is snooping, then I don't see why it is undesirable to
> > > > > > have it
> > > > > > available in a mocs setting. If it is bogus and the bit is
> > > > > > undefined,
> > > > > > then by all means remove it.
> > > > > 
> > > > > None of these entries are used alone for coherent surfaces. For
> > > > > that
> > > > > the application would have to use entry index#1 or #2 _and_
> > > > > call the
> > > > > set caching IOCTL to set the corresponding buffer to be cached.
> > > > 
> > > > No, the application doesn't. There are sufficent interfaces
> > > > exposed that
> > > > userspace can bypass the kernel if it so desired.
> > > > 
> > > > > The
> > > > > problem is that without setting the buffer to be cacheable the
> > > > > expectation is that we won't be snooping and incur the
> > > > > corresponding
> > > > > overhead. This is what this patch addresses.
> > > > 
> > > > Not true.
> > > > 
> > > > > The bit is also bogus, if we wanted snooping via MOCS we'd use
> > > > > the
> > > > > dedicated HW flag for that.
> > > > 
> > > > But you keep saying this bit *enables* snooping. So either it
> > > > does or it
> > > > doesn't.
> > > > 
> > > > > If we wanted to have a snooping MOCS entry we should add that
> > > > > separately (as a forth entry), but we'd need this change as a
> > > > > fix for
> > > > > current users.
> > > > 
> > > > The current users who are getting what they request but don't
> > > > know what
> > > > they were requesting?
> > > 
> > > What this kernel ABI (index entry #2) has been agreed & documented
> > > to provide?
> > 
> > The ABI is what we agree makes sense between hardware / kernel /
> > userspace, and then we stick to it.
> > 
> > > I thought this entry is supposed to replace the writeback LLC/eLLC
> > > cache MOCS setting Mesa is using on (e.g. BDW) to speed up accesses
> > > to a memory area which it knows always to be accessed so that it
> > > can
> > > be cached.
> > 
> > Which it does... Only it speeds us writeback from the CPU, not the
> > GPU. ;)
> > 
> > The confusion seems to be in mistaking !llc for llc. We have to come
> > to
> > some agreement on whether it makes sense having multiple entries for
> > the
> > same follows-PTE mocs or whether it makes more sense to expose the hw
> > capabilties.
> 
> Note that we can't just say to Mesa to use index #0 on BXT instead of
> index #2, since index #0 turns off caching in GPU L3, so we'd have to
> also redefine that to be L3 cached. And I don't know what turning on L3
> caching for index #0 would break, I would rather avoid doing that. Imo
> defining the entries the following way would allow us to use the same
> indexes on GEN9 regardless of it being SKL or BXT:
> #0: uncached (neither in L3 nor in (e)LLC)
> #1: PTE passthrough

So our rendercpy in igt does set MOCS entry 1. Or how exactly do all the
set_caching vs. rendercpy tests we currently have pass? Just not?

Also, you're guaranateeing that opencl/libva don't screw this up either?

It's UABI, and we've botched it. Smells like we need to have a lot more
solid story and make sure we get this right this time around. There's also
the "how does dma-buf mmap support fit in" question.

Oh and nope, none of the relevant testcases are in BAT at all.

> #2: cached everywhere (L3 + (e)LLC if it exists), non-coherent
> #3: cached everywhere, coherent
> 
> I'm not sure if there is even a user for coherent surfaces atm, so then
> we could delay adding #3 until it's really needed.

The problem with the above is that the various access paths on SoC
(without LLC) and big core (with llc) are massively different. You need
lots of different cases in upload/download paths for max speed anyway.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config
  2016-04-28  8:17               ` Daniel Vetter
@ 2016-04-28  8:38                 ` Imre Deak
  2016-04-28 14:48                   ` Daniel Vetter
  0 siblings, 1 reply; 32+ messages in thread
From: Imre Deak @ 2016-04-28  8:38 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: Eero Tamminen, Valtteri Rantala, intel-gfx, Michael T Frederick

On to, 2016-04-28 at 10:17 +0200, Daniel Vetter wrote:
> On Tue, Apr 26, 2016 at 07:01:06PM +0300, Imre Deak wrote:
> > On ti, 2016-04-26 at 15:42 +0100, Chris Wilson wrote:
> > > On Tue, Apr 26, 2016 at 05:26:43PM +0300, Eero Tamminen wrote:
> > > > Hi,
> > > > 
> > > > On 26.04.2016 16:23, Chris Wilson wrote:
> > > > > On Tue, Apr 26, 2016 at 04:17:55PM +0300, Imre Deak wrote:
> > > > > > On ti, 2016-04-26 at 13:57 +0100, Chris Wilson wrote:
> > > > > > > On Tue, Apr 26, 2016 at 03:44:22PM +0300, Imre Deak wrote:
> > > > > > > > Setting a write-back cache policy in the MOCS entry
> > > > > > > > definition also
> > > > > > > > implies snooping, which has a considerable overhead. This
> > > > > > > > is
> > > > > > > > unexpected for a few reasons:
> > > > > > > 
> > > > > > > If it is snooping, then I don't see why it is undesirable to
> > > > > > > have it
> > > > > > > available in a mocs setting. If it is bogus and the bit is
> > > > > > > undefined,
> > > > > > > then by all means remove it.
> > > > > > 
> > > > > > None of these entries are used alone for coherent surfaces. For
> > > > > > that
> > > > > > the application would have to use entry index#1 or #2 _and_
> > > > > > call the
> > > > > > set caching IOCTL to set the corresponding buffer to be cached.
> > > > > 
> > > > > No, the application doesn't. There are sufficent interfaces
> > > > > exposed that
> > > > > userspace can bypass the kernel if it so desired.
> > > > > 
> > > > > > The
> > > > > > problem is that without setting the buffer to be cacheable the
> > > > > > expectation is that we won't be snooping and incur the
> > > > > > corresponding
> > > > > > overhead. This is what this patch addresses.
> > > > > 
> > > > > Not true.
> > > > > 
> > > > > > The bit is also bogus, if we wanted snooping via MOCS we'd use
> > > > > > the
> > > > > > dedicated HW flag for that.
> > > > > 
> > > > > But you keep saying this bit *enables* snooping. So either it
> > > > > does or it
> > > > > doesn't.
> > > > > 
> > > > > > If we wanted to have a snooping MOCS entry we should add that
> > > > > > separately (as a forth entry), but we'd need this change as a
> > > > > > fix for
> > > > > > current users.
> > > > > 
> > > > > The current users who are getting what they request but don't
> > > > > know what
> > > > > they were requesting?
> > > > 
> > > > What this kernel ABI (index entry #2) has been agreed & documented
> > > > to provide?
> > > 
> > > The ABI is what we agree makes sense between hardware / kernel /
> > > userspace, and then we stick to it.
> > > 
> > > > I thought this entry is supposed to replace the writeback LLC/eLLC
> > > > cache MOCS setting Mesa is using on (e.g. BDW) to speed up accesses
> > > > to a memory area which it knows always to be accessed so that it
> > > > can
> > > > be cached.
> > > 
> > > Which it does... Only it speeds us writeback from the CPU, not the
> > > GPU. ;)
> > > 
> > > The confusion seems to be in mistaking !llc for llc. We have to come
> > > to
> > > some agreement on whether it makes sense having multiple entries for
> > > the
> > > same follows-PTE mocs or whether it makes more sense to expose the hw
> > > capabilties.
> > 
> > Note that we can't just say to Mesa to use index #0 on BXT instead of
> > index #2, since index #0 turns off caching in GPU L3, so we'd have to
> > also redefine that to be L3 cached. And I don't know what turning on L3
> > caching for index #0 would break, I would rather avoid doing that. Imo
> > defining the entries the following way would allow us to use the same
> > indexes on GEN9 regardless of it being SKL or BXT:
> > #0: uncached (neither in L3 nor in (e)LLC)
> > #1: PTE passthrough
> 
> So our rendercpy in igt does set MOCS entry 1. Or how exactly do all the
> set_caching vs. rendercpy tests we currently have pass? Just not?

We don't have tests for coherent surfaces. The current rendercpy test
just uses uncached buffers, so they are flushed before checking the
result. I could add a new subtest to rendercpy to test with a
cached/coherent surface.

> Also, you're guaranateeing that opencl/libva don't screw this up either?

If they don't set the given buffer to be cached via the set_caching
IOCTL (as a consequence making them coherent) they are already screwed
on CHV. If they call the IOCTL they are fine on BXT too.

> It's UABI, and we've botched it. Smells like we need to have a lot more
> solid story and make sure we get this right this time around. There's also
> the "how does dma-buf mmap support fit in" question.
> 
> Oh and nope, none of the relevant testcases are in BAT at all.
> 
> > #2: cached everywhere (L3 + (e)LLC if it exists), non-coherent
> > #3: cached everywhere, coherent
> > 
> > I'm not sure if there is even a user for coherent surfaces atm, so then
> > we could delay adding #3 until it's really needed.
> 
> The problem with the above is that the various access paths on SoC
> (without LLC) and big core (with llc) are massively different. You need
> lots of different cases in upload/download paths for max speed anyway.

The current assumption everywhere in user space is the above three
entries with their definition above as far as I know. There are no
users that would use a MOCS entry to get a coherent surface, we could
add that as a 4th entry once a user arises.

--Imre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config
  2016-04-28  8:13               ` Daniel Vetter
@ 2016-04-28 10:48                 ` Ville Syrjälä
  2016-04-28 14:44                   ` Daniel Vetter
  0 siblings, 1 reply; 32+ messages in thread
From: Ville Syrjälä @ 2016-04-28 10:48 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: Eero Tamminen, Michael T Frederick, Valtteri Rantala, intel-gfx

On Thu, Apr 28, 2016 at 10:13:37AM +0200, Daniel Vetter wrote:
> On Tue, Apr 26, 2016 at 08:57:51PM +0300, Ville Syrjälä wrote:
> > On Tue, Apr 26, 2016 at 04:30:05PM +0200, Daniel Vetter wrote:
> > > On Tue, Apr 26, 2016 at 05:26:43PM +0300, Eero Tamminen wrote:
> > > > Hi,
> > > > 
> > > > On 26.04.2016 16:23, Chris Wilson wrote:
> > > > >On Tue, Apr 26, 2016 at 04:17:55PM +0300, Imre Deak wrote:
> > > > >>On ti, 2016-04-26 at 13:57 +0100, Chris Wilson wrote:
> > > > >>>On Tue, Apr 26, 2016 at 03:44:22PM +0300, Imre Deak wrote:
> > > > >>>>Setting a write-back cache policy in the MOCS entry definition also
> > > > >>>>implies snooping, which has a considerable overhead. This is
> > > > >>>>unexpected for a few reasons:
> > > > >>>
> > > > >>>If it is snooping, then I don't see why it is undesirable to have it
> > > > >>>available in a mocs setting. If it is bogus and the bit is undefined,
> > > > >>>then by all means remove it.
> > > > >>
> > > > >>None of these entries are used alone for coherent surfaces. For that
> > > > >>the application would have to use entry index#1 or #2 _and_ call the
> > > > >>set caching IOCTL to set the corresponding buffer to be cached.
> > > > >
> > > > >No, the application doesn't. There are sufficent interfaces exposed that
> > > > >userspace can bypass the kernel if it so desired.
> > > > >
> > > > >>The
> > > > >>problem is that without setting the buffer to be cacheable the
> > > > >>expectation is that we won't be snooping and incur the corresponding
> > > > >>overhead. This is what this patch addresses.
> > > > >
> > > > >Not true.
> > > > >
> > > > >>The bit is also bogus, if we wanted snooping via MOCS we'd use the
> > > > >>dedicated HW flag for that.
> > > > >
> > > > >But you keep saying this bit *enables* snooping. So either it does or it
> > > > >doesn't.
> > > > >
> > > > >>If we wanted to have a snooping MOCS entry we should add that
> > > > >>separately (as a forth entry), but we'd need this change as a fix for
> > > > >>current users.
> > > > >
> > > > >The current users who are getting what they request but don't know what
> > > > >they were requesting?
> > > > 
> > > > What this kernel ABI (index entry #2) has been agreed & documented to
> > > > provide?
> > > > 
> > > > I thought this entry is supposed to replace the writeback LLC/eLLC cache
> > > > MOCS setting Mesa is using on (e.g. BDW) to speed up accesses to a memory
> > > > area which it knows always to be accessed so that it can be cached.
> > > > 
> > > > If app runs on HW where LLC/eLLC is missing, giving the app extra slowdown
> > > > instead of potential speedup sounds like failed HW abstraction. :-)
> > > 
> > > Well mesa needs to know llc vs. !llc anyway to not totally suck, and
> > > defining entry #2 as "coherent, always" makes sense. I thought entry 0 was
> > > the reaonable default aka pte passthrough and hence managed by kernel?
> > 
> > Nope, we fscked that up somewhat, and entry 1 is the PTE one :( So if
> > userspace forgets to set MOCS on gen9 it won't get the behaviour it
> > would have gotten on previous gens.
> 
> How do we manage to pass the various set_caching vs. rendercpy test then
> on skl? Or do we just not, and everyone is still lalala?

Why wouldn't it pass? It's all coherent anyway. Although apparently the
docs have some confusing comments that things wouldn't be coherent unless
you enable LLC caching, but maybe that's just about coherent L3 or
something?

> 
> Sounds like something to fix either way.
> 
> > > If mesa asks for nonsense, the kernel is happy to oblige.
> > 
> > We never really defined what entry 2 actually means: coherent or sane
> > performance. Mesa has, rightfully IMO, made the assumption that it means
> > the latter since we never set out to define any MOCS entries with
> > coherency in mind. And seeing that it's already out in the wild, I think
> > it's better to respect it. If we change it now we would just make it
> > more painful for people when they get their hands on the hardware.
> > 
> > I think what we should do is define what the MOCS indexes mean in some
> > uapi header. Then there would be no ambiguity.
> 
> Problem with encoding this is that sooner or later we're playing a game of
> whack-a-mole where kernel second-guesses mesa to support old versions, and
> mesa tries to work around kernel assumptions that no longer fit reworked
> code. "It's the easiest solution for existing binaries" is imo very poor
> justification for ABI, especially when it's just about performance.
> Fundamentally mesa still works correctly, just a bit slow.
> 
> I don't want to merge a random hack just because.

I don't see it as any kind of hack. That's the ABI we agreed on pretty
much in my opinion. No one had any concerns about coherency when the
current MOCS entries were allocated IIRC.

> And it sounds like
> someone needs to fix up the MOCS story on gen9 overall anyway.
> -Daniel
> -- 
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch

-- 
Ville Syrjälä
Intel OTC
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config
  2016-04-28 10:48                 ` Ville Syrjälä
@ 2016-04-28 14:44                   ` Daniel Vetter
  2016-04-28 17:21                     ` Ville Syrjälä
  0 siblings, 1 reply; 32+ messages in thread
From: Daniel Vetter @ 2016-04-28 14:44 UTC (permalink / raw)
  To: Ville Syrjälä
  Cc: Eero Tamminen, Michael T Frederick, Valtteri Rantala, intel-gfx

On Thu, Apr 28, 2016 at 01:48:27PM +0300, Ville Syrjälä wrote:
> On Thu, Apr 28, 2016 at 10:13:37AM +0200, Daniel Vetter wrote:
> > On Tue, Apr 26, 2016 at 08:57:51PM +0300, Ville Syrjälä wrote:
> > > On Tue, Apr 26, 2016 at 04:30:05PM +0200, Daniel Vetter wrote:
> > > > On Tue, Apr 26, 2016 at 05:26:43PM +0300, Eero Tamminen wrote:
> > > > > Hi,
> > > > > 
> > > > > On 26.04.2016 16:23, Chris Wilson wrote:
> > > > > >On Tue, Apr 26, 2016 at 04:17:55PM +0300, Imre Deak wrote:
> > > > > >>On ti, 2016-04-26 at 13:57 +0100, Chris Wilson wrote:
> > > > > >>>On Tue, Apr 26, 2016 at 03:44:22PM +0300, Imre Deak wrote:
> > > > > >>>>Setting a write-back cache policy in the MOCS entry definition also
> > > > > >>>>implies snooping, which has a considerable overhead. This is
> > > > > >>>>unexpected for a few reasons:
> > > > > >>>
> > > > > >>>If it is snooping, then I don't see why it is undesirable to have it
> > > > > >>>available in a mocs setting. If it is bogus and the bit is undefined,
> > > > > >>>then by all means remove it.
> > > > > >>
> > > > > >>None of these entries are used alone for coherent surfaces. For that
> > > > > >>the application would have to use entry index#1 or #2 _and_ call the
> > > > > >>set caching IOCTL to set the corresponding buffer to be cached.
> > > > > >
> > > > > >No, the application doesn't. There are sufficent interfaces exposed that
> > > > > >userspace can bypass the kernel if it so desired.
> > > > > >
> > > > > >>The
> > > > > >>problem is that without setting the buffer to be cacheable the
> > > > > >>expectation is that we won't be snooping and incur the corresponding
> > > > > >>overhead. This is what this patch addresses.
> > > > > >
> > > > > >Not true.
> > > > > >
> > > > > >>The bit is also bogus, if we wanted snooping via MOCS we'd use the
> > > > > >>dedicated HW flag for that.
> > > > > >
> > > > > >But you keep saying this bit *enables* snooping. So either it does or it
> > > > > >doesn't.
> > > > > >
> > > > > >>If we wanted to have a snooping MOCS entry we should add that
> > > > > >>separately (as a forth entry), but we'd need this change as a fix for
> > > > > >>current users.
> > > > > >
> > > > > >The current users who are getting what they request but don't know what
> > > > > >they were requesting?
> > > > > 
> > > > > What this kernel ABI (index entry #2) has been agreed & documented to
> > > > > provide?
> > > > > 
> > > > > I thought this entry is supposed to replace the writeback LLC/eLLC cache
> > > > > MOCS setting Mesa is using on (e.g. BDW) to speed up accesses to a memory
> > > > > area which it knows always to be accessed so that it can be cached.
> > > > > 
> > > > > If app runs on HW where LLC/eLLC is missing, giving the app extra slowdown
> > > > > instead of potential speedup sounds like failed HW abstraction. :-)
> > > > 
> > > > Well mesa needs to know llc vs. !llc anyway to not totally suck, and
> > > > defining entry #2 as "coherent, always" makes sense. I thought entry 0 was
> > > > the reaonable default aka pte passthrough and hence managed by kernel?
> > > 
> > > Nope, we fscked that up somewhat, and entry 1 is the PTE one :( So if
> > > userspace forgets to set MOCS on gen9 it won't get the behaviour it
> > > would have gotten on previous gens.
> > 
> > How do we manage to pass the various set_caching vs. rendercpy test then
> > on skl? Or do we just not, and everyone is still lalala?
> 
> Why wouldn't it pass? It's all coherent anyway. Although apparently the
> docs have some confusing comments that things wouldn't be coherent unless
> you enable LLC caching, but maybe that's just about coherent L3 or
> something?

We have at least one that uses cache control by the kernel and makes sure
(using crcs) that the stuff actually lands. If we accidentally do coherent
access instead of wc cache dirt might end up in cpu caches. At least it
seems pretty fishy.

> > Sounds like something to fix either way.
> > 
> > > > If mesa asks for nonsense, the kernel is happy to oblige.
> > > 
> > > We never really defined what entry 2 actually means: coherent or sane
> > > performance. Mesa has, rightfully IMO, made the assumption that it means
> > > the latter since we never set out to define any MOCS entries with
> > > coherency in mind. And seeing that it's already out in the wild, I think
> > > it's better to respect it. If we change it now we would just make it
> > > more painful for people when they get their hands on the hardware.
> > > 
> > > I think what we should do is define what the MOCS indexes mean in some
> > > uapi header. Then there would be no ambiguity.
> > 
> > Problem with encoding this is that sooner or later we're playing a game of
> > whack-a-mole where kernel second-guesses mesa to support old versions, and
> > mesa tries to work around kernel assumptions that no longer fit reworked
> > code. "It's the easiest solution for existing binaries" is imo very poor
> > justification for ABI, especially when it's just about performance.
> > Fundamentally mesa still works correctly, just a bit slow.
> > 
> > I don't want to merge a random hack just because.
> 
> I don't see it as any kind of hack. That's the ABI we agreed on pretty
> much in my opinion. No one had any concerns about coherency when the
> current MOCS entries were allocated IIRC.

Hm, not being concerned about coherency when discussing MOCS sounds like a
pretty big review oversight.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config
  2016-04-28  8:38                 ` Imre Deak
@ 2016-04-28 14:48                   ` Daniel Vetter
  2016-04-28 17:15                     ` Imre Deak
  2016-04-28 17:25                     ` Ville Syrjälä
  0 siblings, 2 replies; 32+ messages in thread
From: Daniel Vetter @ 2016-04-28 14:48 UTC (permalink / raw)
  To: Imre Deak; +Cc: Eero Tamminen, Michael T Frederick, Valtteri Rantala, intel-gfx

On Thu, Apr 28, 2016 at 11:38:55AM +0300, Imre Deak wrote:
> On to, 2016-04-28 at 10:17 +0200, Daniel Vetter wrote:
> > On Tue, Apr 26, 2016 at 07:01:06PM +0300, Imre Deak wrote:
> > > On ti, 2016-04-26 at 15:42 +0100, Chris Wilson wrote:
> > > > On Tue, Apr 26, 2016 at 05:26:43PM +0300, Eero Tamminen wrote:
> > > > > Hi,
> > > > > 
> > > > > On 26.04.2016 16:23, Chris Wilson wrote:
> > > > > > On Tue, Apr 26, 2016 at 04:17:55PM +0300, Imre Deak wrote:
> > > > > > > On ti, 2016-04-26 at 13:57 +0100, Chris Wilson wrote:
> > > > > > > > On Tue, Apr 26, 2016 at 03:44:22PM +0300, Imre Deak wrote:
> > > > > > > > > Setting a write-back cache policy in the MOCS entry
> > > > > > > > > definition also
> > > > > > > > > implies snooping, which has a considerable overhead. This
> > > > > > > > > is
> > > > > > > > > unexpected for a few reasons:
> > > > > > > > 
> > > > > > > > If it is snooping, then I don't see why it is undesirable to
> > > > > > > > have it
> > > > > > > > available in a mocs setting. If it is bogus and the bit is
> > > > > > > > undefined,
> > > > > > > > then by all means remove it.
> > > > > > > 
> > > > > > > None of these entries are used alone for coherent surfaces. For
> > > > > > > that
> > > > > > > the application would have to use entry index#1 or #2 _and_
> > > > > > > call the
> > > > > > > set caching IOCTL to set the corresponding buffer to be cached.
> > > > > > 
> > > > > > No, the application doesn't. There are sufficent interfaces
> > > > > > exposed that
> > > > > > userspace can bypass the kernel if it so desired.
> > > > > > 
> > > > > > > The
> > > > > > > problem is that without setting the buffer to be cacheable the
> > > > > > > expectation is that we won't be snooping and incur the
> > > > > > > corresponding
> > > > > > > overhead. This is what this patch addresses.
> > > > > > 
> > > > > > Not true.
> > > > > > 
> > > > > > > The bit is also bogus, if we wanted snooping via MOCS we'd use
> > > > > > > the
> > > > > > > dedicated HW flag for that.
> > > > > > 
> > > > > > But you keep saying this bit *enables* snooping. So either it
> > > > > > does or it
> > > > > > doesn't.
> > > > > > 
> > > > > > > If we wanted to have a snooping MOCS entry we should add that
> > > > > > > separately (as a forth entry), but we'd need this change as a
> > > > > > > fix for
> > > > > > > current users.
> > > > > > 
> > > > > > The current users who are getting what they request but don't
> > > > > > know what
> > > > > > they were requesting?
> > > > > 
> > > > > What this kernel ABI (index entry #2) has been agreed & documented
> > > > > to provide?
> > > > 
> > > > The ABI is what we agree makes sense between hardware / kernel /
> > > > userspace, and then we stick to it.
> > > > 
> > > > > I thought this entry is supposed to replace the writeback LLC/eLLC
> > > > > cache MOCS setting Mesa is using on (e.g. BDW) to speed up accesses
> > > > > to a memory area which it knows always to be accessed so that it
> > > > > can
> > > > > be cached.
> > > > 
> > > > Which it does... Only it speeds us writeback from the CPU, not the
> > > > GPU. ;)
> > > > 
> > > > The confusion seems to be in mistaking !llc for llc. We have to come
> > > > to
> > > > some agreement on whether it makes sense having multiple entries for
> > > > the
> > > > same follows-PTE mocs or whether it makes more sense to expose the hw
> > > > capabilties.
> > > 
> > > Note that we can't just say to Mesa to use index #0 on BXT instead of
> > > index #2, since index #0 turns off caching in GPU L3, so we'd have to
> > > also redefine that to be L3 cached. And I don't know what turning on L3
> > > caching for index #0 would break, I would rather avoid doing that. Imo
> > > defining the entries the following way would allow us to use the same
> > > indexes on GEN9 regardless of it being SKL or BXT:
> > > #0: uncached (neither in L3 nor in (e)LLC)
> > > #1: PTE passthrough
> > 
> > So our rendercpy in igt does set MOCS entry 1. Or how exactly do all the
> > set_caching vs. rendercpy tests we currently have pass? Just not?
> 
> We don't have tests for coherent surfaces. The current rendercpy test
> just uses uncached buffers, so they are flushed before checking the
> result. I could add a new subtest to rendercpy to test with a
> cached/coherent surface.
> 
> > Also, you're guaranateeing that opencl/libva don't screw this up either?
> 
> If they don't set the given buffer to be cached via the set_caching
> IOCTL (as a consequence making them coherent) they are already screwed
> on CHV. If they call the IOCTL they are fine on BXT too.

We do implicit set_caching when displaying something to something
coherent. To make that work userspace should use the "use PTE" mode by
default, except when they really know what they're doing. That's also the
mode that's supposed to give you the most reasonable performance. But
somehow that mode ended up in MOCS entry 1, so pretty much guaranteed
userspace will get it wrong. Mesa just hit a perf snag, but might as well
have been visual corruption. I think it'd be a lot safe to make "use PTE"
entry 0.
-Daniel

> > It's UABI, and we've botched it. Smells like we need to have a lot more
> > solid story and make sure we get this right this time around. There's also
> > the "how does dma-buf mmap support fit in" question.
> > 
> > Oh and nope, none of the relevant testcases are in BAT at all.
> > 
> > > #2: cached everywhere (L3 + (e)LLC if it exists), non-coherent
> > > #3: cached everywhere, coherent
> > > 
> > > I'm not sure if there is even a user for coherent surfaces atm, so then
> > > we could delay adding #3 until it's really needed.
> > 
> > The problem with the above is that the various access paths on SoC
> > (without LLC) and big core (with llc) are massively different. You need
> > lots of different cases in upload/download paths for max speed anyway.
> 
> The current assumption everywhere in user space is the above three
> entries with their definition above as far as I know. There are no
> users that would use a MOCS entry to get a coherent surface, we could
> add that as a 4th entry once a user arises.
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config
  2016-04-28 14:48                   ` Daniel Vetter
@ 2016-04-28 17:15                     ` Imre Deak
  2016-05-02  8:28                       ` Daniel Vetter
  2016-04-28 17:25                     ` Ville Syrjälä
  1 sibling, 1 reply; 32+ messages in thread
From: Imre Deak @ 2016-04-28 17:15 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: Eero Tamminen, Valtteri Rantala, intel-gfx, Michael T Frederick

On to, 2016-04-28 at 16:48 +0200, Daniel Vetter wrote:
> On Thu, Apr 28, 2016 at 11:38:55AM +0300, Imre Deak wrote:
> > On to, 2016-04-28 at 10:17 +0200, Daniel Vetter wrote:
> > > On Tue, Apr 26, 2016 at 07:01:06PM +0300, Imre Deak wrote:
> > > > On ti, 2016-04-26 at 15:42 +0100, Chris Wilson wrote:
> > > > > On Tue, Apr 26, 2016 at 05:26:43PM +0300, Eero Tamminen
> > > > > wrote:
> > > > > > Hi,
> > > > > > 
> > > > > > On 26.04.2016 16:23, Chris Wilson wrote:
> > > > > > > On Tue, Apr 26, 2016 at 04:17:55PM +0300, Imre Deak
> > > > > > > wrote:
> > > > > > > > On ti, 2016-04-26 at 13:57 +0100, Chris Wilson wrote:
> > > > > > > > > On Tue, Apr 26, 2016 at 03:44:22PM +0300, Imre Deak
> > > > > > > > > wrote:
> > > > > > > > > > Setting a write-back cache policy in the MOCS entry
> > > > > > > > > > definition also
> > > > > > > > > > implies snooping, which has a considerable
> > > > > > > > > > overhead. This
> > > > > > > > > > is
> > > > > > > > > > unexpected for a few reasons:
> > > > > > > > > 
> > > > > > > > > If it is snooping, then I don't see why it is
> > > > > > > > > undesirable to
> > > > > > > > > have it
> > > > > > > > > available in a mocs setting. If it is bogus and the
> > > > > > > > > bit is
> > > > > > > > > undefined,
> > > > > > > > > then by all means remove it.
> > > > > > > > 
> > > > > > > > None of these entries are used alone for coherent
> > > > > > > > surfaces. For
> > > > > > > > that
> > > > > > > > the application would have to use entry index#1 or #2
> > > > > > > > _and_
> > > > > > > > call the
> > > > > > > > set caching IOCTL to set the corresponding buffer to be
> > > > > > > > cached.
> > > > > > > 
> > > > > > > No, the application doesn't. There are sufficent
> > > > > > > interfaces
> > > > > > > exposed that
> > > > > > > userspace can bypass the kernel if it so desired.
> > > > > > > 
> > > > > > > > The
> > > > > > > > problem is that without setting the buffer to be
> > > > > > > > cacheable the
> > > > > > > > expectation is that we won't be snooping and incur the
> > > > > > > > corresponding
> > > > > > > > overhead. This is what this patch addresses.
> > > > > > > 
> > > > > > > Not true.
> > > > > > > 
> > > > > > > > The bit is also bogus, if we wanted snooping via MOCS
> > > > > > > > we'd use
> > > > > > > > the
> > > > > > > > dedicated HW flag for that.
> > > > > > > 
> > > > > > > But you keep saying this bit *enables* snooping. So
> > > > > > > either it
> > > > > > > does or it
> > > > > > > doesn't.
> > > > > > > 
> > > > > > > > If we wanted to have a snooping MOCS entry we should
> > > > > > > > add that
> > > > > > > > separately (as a forth entry), but we'd need this
> > > > > > > > change as a
> > > > > > > > fix for
> > > > > > > > current users.
> > > > > > > 
> > > > > > > The current users who are getting what they request but
> > > > > > > don't
> > > > > > > know what
> > > > > > > they were requesting?
> > > > > > 
> > > > > > What this kernel ABI (index entry #2) has been agreed &
> > > > > > documented
> > > > > > to provide?
> > > > > 
> > > > > The ABI is what we agree makes sense between hardware /
> > > > > kernel /
> > > > > userspace, and then we stick to it.
> > > > > 
> > > > > > I thought this entry is supposed to replace the writeback
> > > > > > LLC/eLLC
> > > > > > cache MOCS setting Mesa is using on (e.g. BDW) to speed up
> > > > > > accesses
> > > > > > to a memory area which it knows always to be accessed so
> > > > > > that it
> > > > > > can
> > > > > > be cached.
> > > > > 
> > > > > Which it does... Only it speeds us writeback from the CPU,
> > > > > not the
> > > > > GPU. ;)
> > > > > 
> > > > > The confusion seems to be in mistaking !llc for llc. We have
> > > > > to come
> > > > > to
> > > > > some agreement on whether it makes sense having multiple
> > > > > entries for
> > > > > the
> > > > > same follows-PTE mocs or whether it makes more sense to
> > > > > expose the hw
> > > > > capabilties.
> > > > 
> > > > Note that we can't just say to Mesa to use index #0 on BXT
> > > > instead of
> > > > index #2, since index #0 turns off caching in GPU L3, so we'd
> > > > have to
> > > > also redefine that to be L3 cached. And I don't know what
> > > > turning on L3
> > > > caching for index #0 would break, I would rather avoid doing
> > > > that. Imo
> > > > defining the entries the following way would allow us to use
> > > > the same
> > > > indexes on GEN9 regardless of it being SKL or BXT:
> > > > #0: uncached (neither in L3 nor in (e)LLC)
> > > > #1: PTE passthrough
> > > 
> > > So our rendercpy in igt does set MOCS entry 1. Or how exactly do
> > > all the
> > > set_caching vs. rendercpy tests we currently have pass? Just not?
> > 
> > We don't have tests for coherent surfaces. The current rendercpy
> > test
> > just uses uncached buffers, so they are flushed before checking the
> > result. I could add a new subtest to rendercpy to test with a
> > cached/coherent surface.
> > 
> > > Also, you're guaranateeing that opencl/libva don't screw this up
> > > either?
> > 
> > If they don't set the given buffer to be cached via the set_caching
> > IOCTL (as a consequence making them coherent) they are already
> > screwed
> > on CHV. If they call the IOCTL they are fine on BXT too.
> 
> We do implicit set_caching when displaying something to something
> coherent. To make that work userspace should use the "use PTE" mode by
> default, except when they really know what they're doing.
> That's also the mode that's supposed to give you the most reasonable
> performance. But somehow that mode ended up in MOCS entry 1, so pretty much
> guaranteed userspace will get it wrong. Mesa just hit a perf snag, but
> might as well have been visual corruption. I think it'd be a lot safe to
> make "use PTE" entry 0.

Mesa uses entries 1 and 2. If something else like opencl or libva (or
even Mesa for that matter) uses index 0 for PTE pass-through that's a
bug on its own. I don't know if this is the case, but it's a separate
issue from what I'm trying to fix here.

This isn't about a case where a PTE pass-through entry needs to be
provided, but about the case where a cached but non-coherent one is
needed. Mesa assumes this to be entry 2 and I don't see why we couldn't
make sure that this assumption holds. Note that this entry on BXT could
be both a PTE pass-through one as in this patch or LLC-UC.

--Imre

> > > It's UABI, and we've botched it. Smells like we need to have a
> > > lot more
> > > solid story and make sure we get this right this time around.
> > > There's also
> > > the "how does dma-buf mmap support fit in" question.
> > > 
> > > Oh and nope, none of the relevant testcases are in BAT at all.
> > > 
> > > > #2: cached everywhere (L3 + (e)LLC if it exists), non-coherent
> > > > #3: cached everywhere, coherent
> > > > 
> > > > I'm not sure if there is even a user for coherent surfaces atm,
> > > > so then
> > > > we could delay adding #3 until it's really needed.
> > > 
> > > The problem with the above is that the various access paths on
> > > SoC
> > > (without LLC) and big core (with llc) are massively different.
> > > You need
> > > lots of different cases in upload/download paths for max speed
> > > anyway.
> > 
> > The current assumption everywhere in user space is the above three
> > entries with their definition above as far as I know. There are no
> > users that would use a MOCS entry to get a coherent surface, we
> > could
> > add that as a 4th entry once a user arises.
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config
  2016-04-28 14:44                   ` Daniel Vetter
@ 2016-04-28 17:21                     ` Ville Syrjälä
  0 siblings, 0 replies; 32+ messages in thread
From: Ville Syrjälä @ 2016-04-28 17:21 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: Eero Tamminen, Michael T Frederick, Valtteri Rantala, intel-gfx

On Thu, Apr 28, 2016 at 04:44:00PM +0200, Daniel Vetter wrote:
> On Thu, Apr 28, 2016 at 01:48:27PM +0300, Ville Syrjälä wrote:
> > On Thu, Apr 28, 2016 at 10:13:37AM +0200, Daniel Vetter wrote:
> > > On Tue, Apr 26, 2016 at 08:57:51PM +0300, Ville Syrjälä wrote:
> > > > On Tue, Apr 26, 2016 at 04:30:05PM +0200, Daniel Vetter wrote:
> > > > > On Tue, Apr 26, 2016 at 05:26:43PM +0300, Eero Tamminen wrote:
> > > > > > Hi,
> > > > > > 
> > > > > > On 26.04.2016 16:23, Chris Wilson wrote:
> > > > > > >On Tue, Apr 26, 2016 at 04:17:55PM +0300, Imre Deak wrote:
> > > > > > >>On ti, 2016-04-26 at 13:57 +0100, Chris Wilson wrote:
> > > > > > >>>On Tue, Apr 26, 2016 at 03:44:22PM +0300, Imre Deak wrote:
> > > > > > >>>>Setting a write-back cache policy in the MOCS entry definition also
> > > > > > >>>>implies snooping, which has a considerable overhead. This is
> > > > > > >>>>unexpected for a few reasons:
> > > > > > >>>
> > > > > > >>>If it is snooping, then I don't see why it is undesirable to have it
> > > > > > >>>available in a mocs setting. If it is bogus and the bit is undefined,
> > > > > > >>>then by all means remove it.
> > > > > > >>
> > > > > > >>None of these entries are used alone for coherent surfaces. For that
> > > > > > >>the application would have to use entry index#1 or #2 _and_ call the
> > > > > > >>set caching IOCTL to set the corresponding buffer to be cached.
> > > > > > >
> > > > > > >No, the application doesn't. There are sufficent interfaces exposed that
> > > > > > >userspace can bypass the kernel if it so desired.
> > > > > > >
> > > > > > >>The
> > > > > > >>problem is that without setting the buffer to be cacheable the
> > > > > > >>expectation is that we won't be snooping and incur the corresponding
> > > > > > >>overhead. This is what this patch addresses.
> > > > > > >
> > > > > > >Not true.
> > > > > > >
> > > > > > >>The bit is also bogus, if we wanted snooping via MOCS we'd use the
> > > > > > >>dedicated HW flag for that.
> > > > > > >
> > > > > > >But you keep saying this bit *enables* snooping. So either it does or it
> > > > > > >doesn't.
> > > > > > >
> > > > > > >>If we wanted to have a snooping MOCS entry we should add that
> > > > > > >>separately (as a forth entry), but we'd need this change as a fix for
> > > > > > >>current users.
> > > > > > >
> > > > > > >The current users who are getting what they request but don't know what
> > > > > > >they were requesting?
> > > > > > 
> > > > > > What this kernel ABI (index entry #2) has been agreed & documented to
> > > > > > provide?
> > > > > > 
> > > > > > I thought this entry is supposed to replace the writeback LLC/eLLC cache
> > > > > > MOCS setting Mesa is using on (e.g. BDW) to speed up accesses to a memory
> > > > > > area which it knows always to be accessed so that it can be cached.
> > > > > > 
> > > > > > If app runs on HW where LLC/eLLC is missing, giving the app extra slowdown
> > > > > > instead of potential speedup sounds like failed HW abstraction. :-)
> > > > > 
> > > > > Well mesa needs to know llc vs. !llc anyway to not totally suck, and
> > > > > defining entry #2 as "coherent, always" makes sense. I thought entry 0 was
> > > > > the reaonable default aka pte passthrough and hence managed by kernel?
> > > > 
> > > > Nope, we fscked that up somewhat, and entry 1 is the PTE one :( So if
> > > > userspace forgets to set MOCS on gen9 it won't get the behaviour it
> > > > would have gotten on previous gens.
> > > 
> > > How do we manage to pass the various set_caching vs. rendercpy test then
> > > on skl? Or do we just not, and everyone is still lalala?
> > 
> > Why wouldn't it pass? It's all coherent anyway. Although apparently the
> > docs have some confusing comments that things wouldn't be coherent unless
> > you enable LLC caching, but maybe that's just about coherent L3 or
> > something?
> 
> We have at least one that uses cache control by the kernel and makes sure
> (using crcs) that the stuff actually lands. If we accidentally do coherent
> access instead of wc cache dirt might end up in cpu caches. At least it
> seems pretty fishy.

I don't recall a test dealing with rendercopy + set_caching + display.

> 
> > > Sounds like something to fix either way.
> > > 
> > > > > If mesa asks for nonsense, the kernel is happy to oblige.
> > > > 
> > > > We never really defined what entry 2 actually means: coherent or sane
> > > > performance. Mesa has, rightfully IMO, made the assumption that it means
> > > > the latter since we never set out to define any MOCS entries with
> > > > coherency in mind. And seeing that it's already out in the wild, I think
> > > > it's better to respect it. If we change it now we would just make it
> > > > more painful for people when they get their hands on the hardware.
> > > > 
> > > > I think what we should do is define what the MOCS indexes mean in some
> > > > uapi header. Then there would be no ambiguity.
> > > 
> > > Problem with encoding this is that sooner or later we're playing a game of
> > > whack-a-mole where kernel second-guesses mesa to support old versions, and
> > > mesa tries to work around kernel assumptions that no longer fit reworked
> > > code. "It's the easiest solution for existing binaries" is imo very poor
> > > justification for ABI, especially when it's just about performance.
> > > Fundamentally mesa still works correctly, just a bit slow.
> > > 
> > > I don't want to merge a random hack just because.
> > 
> > I don't see it as any kind of hack. That's the ABI we agreed on pretty
> > much in my opinion. No one had any concerns about coherency when the
> > current MOCS entries were allocated IIRC.
> 
> Hm, not being concerned about coherency when discussing MOCS sounds like a
> pretty big review oversight.

Perhaps. Actually I can't really remeber if such arguments were raised or
not. But at least I don't remember anything of the sort. My impressions is
that the whole thing dragged on for so long that everyone was just happy
to get something usable in.

Anyways, I think it's silly to keep arguing about this since we have no
actual user for any coherent MOCS entries.

-- 
Ville Syrjälä
Intel OTC
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config
  2016-04-28 14:48                   ` Daniel Vetter
  2016-04-28 17:15                     ` Imre Deak
@ 2016-04-28 17:25                     ` Ville Syrjälä
  1 sibling, 0 replies; 32+ messages in thread
From: Ville Syrjälä @ 2016-04-28 17:25 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: Michael T Frederick, intel-gfx, Eero Tamminen, Valtteri Rantala

On Thu, Apr 28, 2016 at 04:48:37PM +0200, Daniel Vetter wrote:
> On Thu, Apr 28, 2016 at 11:38:55AM +0300, Imre Deak wrote:
> > On to, 2016-04-28 at 10:17 +0200, Daniel Vetter wrote:
> > > On Tue, Apr 26, 2016 at 07:01:06PM +0300, Imre Deak wrote:
> > > > On ti, 2016-04-26 at 15:42 +0100, Chris Wilson wrote:
> > > > > On Tue, Apr 26, 2016 at 05:26:43PM +0300, Eero Tamminen wrote:
> > > > > > Hi,
> > > > > > 
> > > > > > On 26.04.2016 16:23, Chris Wilson wrote:
> > > > > > > On Tue, Apr 26, 2016 at 04:17:55PM +0300, Imre Deak wrote:
> > > > > > > > On ti, 2016-04-26 at 13:57 +0100, Chris Wilson wrote:
> > > > > > > > > On Tue, Apr 26, 2016 at 03:44:22PM +0300, Imre Deak wrote:
> > > > > > > > > > Setting a write-back cache policy in the MOCS entry
> > > > > > > > > > definition also
> > > > > > > > > > implies snooping, which has a considerable overhead. This
> > > > > > > > > > is
> > > > > > > > > > unexpected for a few reasons:
> > > > > > > > > 
> > > > > > > > > If it is snooping, then I don't see why it is undesirable to
> > > > > > > > > have it
> > > > > > > > > available in a mocs setting. If it is bogus and the bit is
> > > > > > > > > undefined,
> > > > > > > > > then by all means remove it.
> > > > > > > > 
> > > > > > > > None of these entries are used alone for coherent surfaces. For
> > > > > > > > that
> > > > > > > > the application would have to use entry index#1 or #2 _and_
> > > > > > > > call the
> > > > > > > > set caching IOCTL to set the corresponding buffer to be cached.
> > > > > > > 
> > > > > > > No, the application doesn't. There are sufficent interfaces
> > > > > > > exposed that
> > > > > > > userspace can bypass the kernel if it so desired.
> > > > > > > 
> > > > > > > > The
> > > > > > > > problem is that without setting the buffer to be cacheable the
> > > > > > > > expectation is that we won't be snooping and incur the
> > > > > > > > corresponding
> > > > > > > > overhead. This is what this patch addresses.
> > > > > > > 
> > > > > > > Not true.
> > > > > > > 
> > > > > > > > The bit is also bogus, if we wanted snooping via MOCS we'd use
> > > > > > > > the
> > > > > > > > dedicated HW flag for that.
> > > > > > > 
> > > > > > > But you keep saying this bit *enables* snooping. So either it
> > > > > > > does or it
> > > > > > > doesn't.
> > > > > > > 
> > > > > > > > If we wanted to have a snooping MOCS entry we should add that
> > > > > > > > separately (as a forth entry), but we'd need this change as a
> > > > > > > > fix for
> > > > > > > > current users.
> > > > > > > 
> > > > > > > The current users who are getting what they request but don't
> > > > > > > know what
> > > > > > > they were requesting?
> > > > > > 
> > > > > > What this kernel ABI (index entry #2) has been agreed & documented
> > > > > > to provide?
> > > > > 
> > > > > The ABI is what we agree makes sense between hardware / kernel /
> > > > > userspace, and then we stick to it.
> > > > > 
> > > > > > I thought this entry is supposed to replace the writeback LLC/eLLC
> > > > > > cache MOCS setting Mesa is using on (e.g. BDW) to speed up accesses
> > > > > > to a memory area which it knows always to be accessed so that it
> > > > > > can
> > > > > > be cached.
> > > > > 
> > > > > Which it does... Only it speeds us writeback from the CPU, not the
> > > > > GPU. ;)
> > > > > 
> > > > > The confusion seems to be in mistaking !llc for llc. We have to come
> > > > > to
> > > > > some agreement on whether it makes sense having multiple entries for
> > > > > the
> > > > > same follows-PTE mocs or whether it makes more sense to expose the hw
> > > > > capabilties.
> > > > 
> > > > Note that we can't just say to Mesa to use index #0 on BXT instead of
> > > > index #2, since index #0 turns off caching in GPU L3, so we'd have to
> > > > also redefine that to be L3 cached. And I don't know what turning on L3
> > > > caching for index #0 would break, I would rather avoid doing that. Imo
> > > > defining the entries the following way would allow us to use the same
> > > > indexes on GEN9 regardless of it being SKL or BXT:
> > > > #0: uncached (neither in L3 nor in (e)LLC)
> > > > #1: PTE passthrough
> > > 
> > > So our rendercpy in igt does set MOCS entry 1. Or how exactly do all the
> > > set_caching vs. rendercpy tests we currently have pass? Just not?
> > 
> > We don't have tests for coherent surfaces. The current rendercpy test
> > just uses uncached buffers, so they are flushed before checking the
> > result. I could add a new subtest to rendercpy to test with a
> > cached/coherent surface.
> > 
> > > Also, you're guaranateeing that opencl/libva don't screw this up either?
> > 
> > If they don't set the given buffer to be cached via the set_caching
> > IOCTL (as a consequence making them coherent) they are already screwed
> > on CHV. If they call the IOCTL they are fine on BXT too.
> 
> We do implicit set_caching when displaying something to something
> coherent. To make that work userspace should use the "use PTE" mode by
> default, except when they really know what they're doing. That's also the
> mode that's supposed to give you the most reasonable performance. But
> somehow that mode ended up in MOCS entry 1, so pretty much guaranteed
> userspace will get it wrong. Mesa just hit a perf snag, but might as well
> have been visual corruption. I think it'd be a lot safe to make "use PTE"
> entry 0.

If no one so far uses MOCS entry 0, we could redefine it safely. Mesa
doesn't use it (at least on purpose, it might by accident though).
I have no idea about anyone else.

-- 
Ville Syrjälä
Intel OTC
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config
  2016-04-27 14:53                   ` Chris Wilson
  2016-04-27 18:42                     ` Dave Gordon
@ 2016-04-29  8:01                     ` Eero Tamminen
  1 sibling, 0 replies; 32+ messages in thread
From: Eero Tamminen @ 2016-04-29  8:01 UTC (permalink / raw)
  To: Chris Wilson, Frederick, Michael T, Daniel Vetter, Deak, Imre,
	intel-gfx, Rantala, Valtteri, Ville Syrjälä

Hi,

On 27.04.2016 17:53, Chris Wilson wrote:
> On Wed, Apr 27, 2016 at 04:25:09PM +0300, Eero Tamminen wrote:
[...]
>> Daniel, Chris, did you have some concrete example in mind where 3D
>> driver would require CPU to snoop GPU?
>
> Not mesa, but X can do concurrent rendering to a Pixmap whilst also
> rendering from other parts of that Pixmap into a GPU side buffer and
> presentation/compositing thereof. X uses snooping both ways (from client
> memory to GPU and from GPU to client memory) as well as mixed rendering.

Is that something your "sna/gen9: Quick and dirty implementation" for X 
DDX does & does it expect index #2 to be coherent:
https://cgit.freedesktop.org/xorg/driver/xf86-video-intel/commit/?id=4e172a38e1707465c189c56bdb7ee4bdaf54c9d4
?

<aside>
While it on SKL improves the trivial GpuTest Triangle case by 50% and 
some more realistic cases up to ~20%, it regresses many other cases, up 
to 25%.

Martin bisected that while ago, but I'm not sure whether he's mailed you 
about it yet.  We don't know what the difference was on BXT, as we 
didn't HW for testing it.
</aside>


> Mesa should be using snooping for both SubTexImage and GetTexImage. On
> the SubTexImage path you can use the sampler to do format conversions
> that even including the sync overhead for correctness when using client
> memory avoid the awful format conversion code in mesa. Using the GPU to
> write into client memory and avoiding WC reads is approximately an
> order of magnitude (8x) faster than the current code mesa uses.

How did you arrive at the 8x speedup?  Did you calculate it (how?) or do 
you have a test that shows this speedup?

Disabling snooping on BXT increased the GPU read memory bandwidth by 
*>70%* in Imre's tests.


	- Eero

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config
  2016-04-28 17:15                     ` Imre Deak
@ 2016-05-02  8:28                       ` Daniel Vetter
  2016-05-02 11:18                         ` Ville Syrjälä
  2016-05-02 13:50                         ` Imre Deak
  0 siblings, 2 replies; 32+ messages in thread
From: Daniel Vetter @ 2016-05-02  8:28 UTC (permalink / raw)
  To: Imre Deak; +Cc: Eero Tamminen, Michael T Frederick, Valtteri Rantala, intel-gfx

On Thu, Apr 28, 2016 at 08:15:24PM +0300, Imre Deak wrote:
> On to, 2016-04-28 at 16:48 +0200, Daniel Vetter wrote:
> > On Thu, Apr 28, 2016 at 11:38:55AM +0300, Imre Deak wrote:
> > > On to, 2016-04-28 at 10:17 +0200, Daniel Vetter wrote:
> > > > Also, you're guaranateeing that opencl/libva don't screw this up
> > > > either?
> > > 
> > > If they don't set the given buffer to be cached via the set_caching
> > > IOCTL (as a consequence making them coherent) they are already
> > > screwed
> > > on CHV. If they call the IOCTL they are fine on BXT too.
> > 
> > We do implicit set_caching when displaying something to something
> > coherent. To make that work userspace should use the "use PTE" mode by
> > default, except when they really know what they're doing.
> > That's also the mode that's supposed to give you the most reasonable
> > performance. But somehow that mode ended up in MOCS entry 1, so pretty much
> > guaranteed userspace will get it wrong. Mesa just hit a perf snag, but
> > might as well have been visual corruption. I think it'd be a lot safe to
> > make "use PTE" entry 0.
> 
> Mesa uses entries 1 and 2. If something else like opencl or libva (or
> even Mesa for that matter) uses index 0 for PTE pass-through that's a
> bug on its own. I don't know if this is the case, but it's a separate
> issue from what I'm trying to fix here.
> 
> This isn't about a case where a PTE pass-through entry needs to be
> provided, but about the case where a cached but non-coherent one is
> needed. Mesa assumes this to be entry 2 and I don't see why we couldn't
> make sure that this assumption holds. Note that this entry on BXT could
> be both a PTE pass-through one as in this patch or LLC-UC.

Yeah my comment about entry #0 was is a different track of discussion.
Should still fix it up while we clarify what entries 1&2 really mean.

When defining entries as "cached" please make triple sure what exactly you
mean by that. Since eLLC, LLC and on-gpu L3$ are all different caches, in
different parts of the coherency fabric. And L3$ has functional relevance
since if that's not set compute features fall apart.

So maybe a better definition would be "L3$ cached (useful for
compute)+performance optimized otherwise for general use+might be
incoherent depending upon platform" for entry 2. That would make sense and
covers it all, but imo yours a bit too simple (assuming my understanding
of cache architecture on gen9 is accurate, they change it all the bloody
time). Note that e.g. on older platforms we could enable L3$ either in the
PTE, or in MOCS settings in the batch itself, which is why the "useful for
compute" only started to become relevant for gen9. And why we needed the
kernel MOCS patch really.

Simililarly we'd need to come up with something accurate for #1 and #0
(and it sounds like they both need to mean PTE default per kernel). Hooray
for wasting one entry ;-)
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config
  2016-05-02  8:28                       ` Daniel Vetter
@ 2016-05-02 11:18                         ` Ville Syrjälä
  2016-05-02 13:50                         ` Imre Deak
  1 sibling, 0 replies; 32+ messages in thread
From: Ville Syrjälä @ 2016-05-02 11:18 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: Michael T Frederick, intel-gfx, Eero Tamminen, Valtteri Rantala

On Mon, May 02, 2016 at 10:28:50AM +0200, Daniel Vetter wrote:
> On Thu, Apr 28, 2016 at 08:15:24PM +0300, Imre Deak wrote:
> > On to, 2016-04-28 at 16:48 +0200, Daniel Vetter wrote:
> > > On Thu, Apr 28, 2016 at 11:38:55AM +0300, Imre Deak wrote:
> > > > On to, 2016-04-28 at 10:17 +0200, Daniel Vetter wrote:
> > > > > Also, you're guaranateeing that opencl/libva don't screw this up
> > > > > either?
> > > > 
> > > > If they don't set the given buffer to be cached via the set_caching
> > > > IOCTL (as a consequence making them coherent) they are already
> > > > screwed
> > > > on CHV. If they call the IOCTL they are fine on BXT too.
> > > 
> > > We do implicit set_caching when displaying something to something
> > > coherent. To make that work userspace should use the "use PTE" mode by
> > > default, except when they really know what they're doing.
> > > That's also the mode that's supposed to give you the most reasonable
> > > performance. But somehow that mode ended up in MOCS entry 1, so pretty much
> > > guaranteed userspace will get it wrong. Mesa just hit a perf snag, but
> > > might as well have been visual corruption. I think it'd be a lot safe to
> > > make "use PTE" entry 0.
> > 
> > Mesa uses entries 1 and 2. If something else like opencl or libva (or
> > even Mesa for that matter) uses index 0 for PTE pass-through that's a
> > bug on its own. I don't know if this is the case, but it's a separate
> > issue from what I'm trying to fix here.
> > 
> > This isn't about a case where a PTE pass-through entry needs to be
> > provided, but about the case where a cached but non-coherent one is
> > needed. Mesa assumes this to be entry 2 and I don't see why we couldn't
> > make sure that this assumption holds. Note that this entry on BXT could
> > be both a PTE pass-through one as in this patch or LLC-UC.
> 
> Yeah my comment about entry #0 was is a different track of discussion.
> Should still fix it up while we clarify what entries 1&2 really mean.
> 
> When defining entries as "cached" please make triple sure what exactly you
> mean by that. Since eLLC, LLC and on-gpu L3$ are all different caches, in
> different parts of the coherency fabric. And L3$ has functional relevance
> since if that's not set compute features fall apart.
> 
> So maybe a better definition would be "L3$ cached (useful for
> compute)+performance optimized otherwise for general use+might be
> incoherent depending upon platform" for entry 2. That would make sense and
> covers it all, but imo yours a bit too simple (assuming my understanding
> of cache architecture on gen9 is accurate, they change it all the bloody
> time). Note that e.g. on older platforms we could enable L3$ either in the
> PTE, or in MOCS settings in the batch itself, which is why the "useful for
> compute" only started to become relevant for gen9. And why we needed the
> kernel MOCS patch really.

L3 control via PTE died with IVB. Since then it's been MOCS only.

-- 
Ville Syrjälä
Intel OTC
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config
  2016-05-02  8:28                       ` Daniel Vetter
  2016-05-02 11:18                         ` Ville Syrjälä
@ 2016-05-02 13:50                         ` Imre Deak
  1 sibling, 0 replies; 32+ messages in thread
From: Imre Deak @ 2016-05-02 13:50 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: Eero Tamminen, Valtteri Rantala, intel-gfx, Michael T Frederick

On ma, 2016-05-02 at 10:28 +0200, Daniel Vetter wrote:
> On Thu, Apr 28, 2016 at 08:15:24PM +0300, Imre Deak wrote:
> > On to, 2016-04-28 at 16:48 +0200, Daniel Vetter wrote:
> > > On Thu, Apr 28, 2016 at 11:38:55AM +0300, Imre Deak wrote:
> > > > On to, 2016-04-28 at 10:17 +0200, Daniel Vetter wrote:
> > > > > Also, you're guaranateeing that opencl/libva don't screw this up
> > > > > either?
> > > > 
> > > > If they don't set the given buffer to be cached via the set_caching
> > > > IOCTL (as a consequence making them coherent) they are already
> > > > screwed
> > > > on CHV. If they call the IOCTL they are fine on BXT too.
> > > 
> > > We do implicit set_caching when displaying something to something
> > > coherent. To make that work userspace should use the "use PTE" mode by
> > > default, except when they really know what they're doing.
> > > That's also the mode that's supposed to give you the most reasonable
> > > performance. But somehow that mode ended up in MOCS entry 1, so pretty much
> > > guaranteed userspace will get it wrong. Mesa just hit a perf snag, but
> > > might as well have been visual corruption. I think it'd be a lot safe to
> > > make "use PTE" entry 0.
> > 
> > Mesa uses entries 1 and 2. If something else like opencl or libva (or
> > even Mesa for that matter) uses index 0 for PTE pass-through that's a
> > bug on its own. I don't know if this is the case, but it's a separate
> > issue from what I'm trying to fix here.
> > 
> > This isn't about a case where a PTE pass-through entry needs to be
> > provided, but about the case where a cached but non-coherent one is
> > needed. Mesa assumes this to be entry 2 and I don't see why we couldn't
> > make sure that this assumption holds. Note that this entry on BXT could
> > be both a PTE pass-through one as in this patch or LLC-UC.
> 
> Yeah my comment about entry #0 was is a different track of discussion.
> Should still fix it up while we clarify what entries 1&2 really mean.
> 
> When defining entries as "cached" please make triple sure what exactly you
> mean by that. Since eLLC, LLC and on-gpu L3$ are all different caches, in
> different parts of the coherency fabric. And L3$ has functional relevance
> since if that's not set compute features fall apart.

Based on the current MOCS table entry definitions:

index#0 - Not cached in either of GPU-L3, LLC or eLLC. Not coherent with CPU.

index#1 - Cached in GPU-L3. LLC/eLLC cacheability controlled by the kernel (via PTE). Coherency with CPU controlled by the kernel (via PTE).

index#2 - Cached in GPU-L3. Cached in both LLC and eLLC on platforms where these caches exist. Not coherent with CPU.

> So maybe a better definition would be "L3$ cached (useful for
> compute)+performance optimized otherwise for general use+might be
> incoherent depending upon platform" for entry 2.

Do you mean not to cache it in LLC? That would be a change on SKL/KBL
where entry 2 has LLC/eLLC cacheing enabled atm. I think we should keep
that enabled.

> That would make sense and
> covers it all, but imo yours a bit too simple (assuming my understanding
> of cache architecture on gen9 is accurate, they change it all the bloody
> time).

Well, my definition was "cached in all caches available to the GPU in
the given platform and not coherent with the CPU". This also covers it
all, but we can make it more explicit as in the description of index#2
above. We can also come up with a higher level description like
"optimized for speed" as you suggest, although I think performance
would depend anyway on the mapped object type/size. In any case we
should make it clear in the definition what's the significance of
coherency vs. non-coherency (i.e. GPU access overhead).

> Note that e.g. on older platforms we could enable L3$ either in the
> PTE, or in MOCS settings in the batch itself, which is why the "useful for
> compute" only started to become relevant for gen9. And why we needed the
> kernel MOCS patch really.
> 
> Simililarly we'd need to come up with something accurate for #1 and #0
> (and it sounds like they both need to mean PTE default per kernel). Hooray
> for wasting one entry ;-)

Yes I guess entry 0 could and should be redefined (but as a follow-up).

--Imre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 32+ messages in thread

end of thread, other threads:[~2016-05-02 13:50 UTC | newest]

Thread overview: 32+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-04-26 12:44 [PATCH v2 1/2] drm/i915/gen9: Clean up MOCS table definitions Imre Deak
2016-04-26 12:44 ` [PATCH v2 2/2] drm/i915/bxt: Fix inadvertent CPU snooping due to incorrect MOCS config Imre Deak
2016-04-26 12:57   ` Chris Wilson
2016-04-26 13:17     ` Imre Deak
2016-04-26 13:23       ` Chris Wilson
2016-04-26 13:43         ` Imre Deak
2016-04-26 13:58           ` Chris Wilson
2016-04-26 14:26         ` Eero Tamminen
2016-04-26 14:30           ` Daniel Vetter
2016-04-26 17:18             ` Eero Tamminen
2016-04-26 17:25               ` Frederick, Michael T
2016-04-27 13:25                 ` Eero Tamminen
2016-04-27 14:53                   ` Chris Wilson
2016-04-27 18:42                     ` Dave Gordon
2016-04-29  8:01                     ` Eero Tamminen
2016-04-26 17:57             ` Ville Syrjälä
2016-04-28  8:13               ` Daniel Vetter
2016-04-28 10:48                 ` Ville Syrjälä
2016-04-28 14:44                   ` Daniel Vetter
2016-04-28 17:21                     ` Ville Syrjälä
2016-04-26 14:42           ` Chris Wilson
2016-04-26 16:01             ` Imre Deak
2016-04-28  8:17               ` Daniel Vetter
2016-04-28  8:38                 ` Imre Deak
2016-04-28 14:48                   ` Daniel Vetter
2016-04-28 17:15                     ` Imre Deak
2016-05-02  8:28                       ` Daniel Vetter
2016-05-02 11:18                         ` Ville Syrjälä
2016-05-02 13:50                         ` Imre Deak
2016-04-28 17:25                     ` Ville Syrjälä
2016-04-26 13:12   ` Chris Wilson
2016-04-26 16:55 ` ✗ Fi.CI.BAT: failure for series starting with [v2,1/2] drm/i915/gen9: Clean up MOCS table definitions Patchwork

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.