All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] drm/i915: Stop engines around GPU reset preparations
@ 2018-03-02 11:33 Chris Wilson
  2018-03-02 11:33 ` [PATCH 2/2] drm/i915: Suspend submission tasklets around wedging Chris Wilson
                   ` (3 more replies)
  0 siblings, 4 replies; 10+ messages in thread
From: Chris Wilson @ 2018-03-02 11:33 UTC (permalink / raw)
  To: intel-gfx

As we make preparations to reset the GPU state, we assume that the GPU
is hung and will not advance. Make this assumption more explicit by
setting the STOP_RING bit on the engines as part of our early reset
preparations.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Michel Thierry <michel.thierry@intel.com>
---
See https://intel-gfx-ci.01.org/tree/drm-tip/kasan_15/fi-bdw-5557u/pstore22-1519879816_Panic_3.log
for a bizarre error that kasan-farm keeps on tripping over. Maybe related
to this?
---
 drivers/gpu/drm/i915/i915_drv.c     |  3 +++
 drivers/gpu/drm/i915/i915_drv.h     | 10 ++++++++--
 drivers/gpu/drm/i915/intel_uncore.c | 33 +++++++++++++++++++++++++++++++++
 3 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index aaa861b51024..925f5722d077 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1908,6 +1908,8 @@ void i915_reset(struct drm_i915_private *i915, unsigned int flags)
 	error->reset_count++;
 
 	disable_irq(i915->drm.irq);
+	intel_gpu_reset_prepare(i915, ALL_ENGINES);
+
 	ret = i915_gem_reset_prepare(i915);
 	if (ret) {
 		dev_err(i915->drm.dev, "GPU recovery failed\n");
@@ -1969,6 +1971,7 @@ void i915_reset(struct drm_i915_private *i915, unsigned int flags)
 
 finish:
 	i915_gem_reset_finish(i915);
+	intel_gpu_reset_finish(i915, ALL_ENGINES);
 	enable_irq(i915->drm.irq);
 
 wakeup:
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 10c9e5e619ab..b95e675e0834 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2957,8 +2957,14 @@ extern const struct dev_pm_ops i915_pm_ops;
 extern int i915_driver_load(struct pci_dev *pdev,
 			    const struct pci_device_id *ent);
 extern void i915_driver_unload(struct drm_device *dev);
-extern int intel_gpu_reset(struct drm_i915_private *dev_priv, u32 engine_mask);
-extern bool intel_has_gpu_reset(struct drm_i915_private *dev_priv);
+
+bool intel_has_gpu_reset(struct drm_i915_private *dev_priv);
+
+void intel_gpu_reset_prepare(struct drm_i915_private *dev_priv,
+			     unsigned engine_mask);
+int intel_gpu_reset(struct drm_i915_private *dev_priv, u32 engine_mask);
+void intel_gpu_reset_finish(struct drm_i915_private *dev_priv,
+			    unsigned engine_mask);
 
 #define I915_RESET_QUIET BIT(0)
 extern void i915_reset(struct drm_i915_private *i915, unsigned int flags);
diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index 5ae9a62712ca..7186fe4d2ba9 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -1899,6 +1899,29 @@ static reset_func intel_get_gpu_reset(struct drm_i915_private *dev_priv)
 		return NULL;
 }
 
+static void i915_engines_set_mode(struct drm_i915_private *dev_priv,
+				  unsigned engine_mask,
+				  u32 mode)
+{
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+
+	if (INTEL_GEN(dev_priv) < 3)
+		return;
+
+	for_each_engine_masked(engine, dev_priv, engine_mask, id)
+		I915_WRITE_FW(RING_MI_MODE(engine->mmio_base), mode);
+}
+
+void intel_gpu_reset_prepare(struct drm_i915_private *dev_priv,
+			     unsigned engine_mask)
+{
+	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
+
+	i915_engines_set_mode(dev_priv, engine_mask,
+			      _MASKED_BIT_ENABLE(STOP_RING));
+}
+
 int intel_gpu_reset(struct drm_i915_private *dev_priv, unsigned engine_mask)
 {
 	reset_func reset = intel_get_gpu_reset(dev_priv);
@@ -1939,6 +1962,16 @@ int intel_gpu_reset(struct drm_i915_private *dev_priv, unsigned engine_mask)
 	return ret;
 }
 
+void intel_gpu_reset_finish(struct drm_i915_private *dev_priv,
+			    unsigned engine_mask)
+{
+	/* Clear the STOP_RING bit as the reset may not have occurred */
+	i915_engines_set_mode(dev_priv, engine_mask,
+			      _MASKED_BIT_DISABLE(STOP_RING));
+
+	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
+}
+
 bool intel_has_gpu_reset(struct drm_i915_private *dev_priv)
 {
 	return intel_get_gpu_reset(dev_priv) != NULL;
-- 
2.16.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 2/2] drm/i915: Suspend submission tasklets around wedging
  2018-03-02 11:33 [PATCH 1/2] drm/i915: Stop engines around GPU reset preparations Chris Wilson
@ 2018-03-02 11:33 ` Chris Wilson
  2018-03-02 12:55   ` Chris Wilson
  2018-03-02 13:39   ` Mika Kuoppala
  2018-03-02 11:50 ` [PATCH 1/2] drm/i915: Stop engines around GPU reset preparations Mika Kuoppala
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 10+ messages in thread
From: Chris Wilson @ 2018-03-02 11:33 UTC (permalink / raw)
  To: intel-gfx

After staring hard at sequences like

[   28.199013]  systemd-1       2..s. 26062228us : execlists_submission_tasklet: rcs0 cs-irq head=0 [0?], tail=1 [1?]
[   28.199095]  systemd-1       2..s. 26062229us : execlists_submission_tasklet: rcs0 csb[1]: status=0x00000018:0x00000000, active=0x1
[   28.199177]  systemd-1       2..s. 26062230us : execlists_submission_tasklet: rcs0 out[0]: ctx=0.1, seqno=3, prio=-1024
[   28.199258]  systemd-1       2..s. 26062231us : execlists_submission_tasklet: rcs0 completed ctx=0
[   28.199340]  gem_eio-829     1..s1 26066853us : execlists_submission_tasklet: rcs0 in[0]:  ctx=1.1, seqno=1, prio=0
[   28.199421]   <idle>-0       2..s. 26066863us : execlists_submission_tasklet: rcs0 cs-irq head=1 [1?], tail=2 [2?]
[   28.199503]   <idle>-0       2..s. 26066865us : execlists_submission_tasklet: rcs0 csb[2]: status=0x00000001:0x00000000, active=0x1
[   28.199585]  gem_eio-829     1..s1 26067077us : execlists_submission_tasklet: rcs0 in[1]:  ctx=3.1, seqno=2, prio=0
[   28.199667]  gem_eio-829     1..s1 26067078us : execlists_submission_tasklet: rcs0 in[0]:  ctx=1.2, seqno=1, prio=0
[   28.199749]   <idle>-0       2..s. 26067084us : execlists_submission_tasklet: rcs0 cs-irq head=2 [2?], tail=3 [3?]
[   28.199830]   <idle>-0       2..s. 26067085us : execlists_submission_tasklet: rcs0 csb[3]: status=0x00008002:0x00000001, active=0x1
[   28.199912]   <idle>-0       2..s. 26067086us : execlists_submission_tasklet: rcs0 out[0]: ctx=1.2, seqno=1, prio=0
[   28.199994]  gem_eio-829     2..s. 28246084us : execlists_submission_tasklet: rcs0 cs-irq head=3 [3?], tail=4 [4?]
[   28.200096]  gem_eio-829     2..s. 28246088us : execlists_submission_tasklet: rcs0 csb[4]: status=0x00000014:0x00000001, active=0x5
[   28.200178]  gem_eio-829     2..s. 28246089us : execlists_submission_tasklet: rcs0 out[0]: ctx=0.0, seqno=0, prio=0
[   28.200260]  gem_eio-829     2..s. 28246127us : execlists_submission_tasklet: execlists_submission_tasklet:886 GEM_BUG_ON(buf[2 * head + 1] != port->context_id)

the conclusion is that the only place where the ports are reset to zero
is engine->cancel_requests, called during i915_gem_set_wedged().

The race is horrible as it results from calling set-wedged on active HW
(the GPU reset failed) and as such we need to be careful as the HW state
changes beneath us. Fortunately, it's the same scary conditions as
affect normal reset, so we can reuse the same machinery to disable state
tracking as we clobber it.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104945
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Michel Thierry <michel.thierry@intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c  | 6 +++++-
 drivers/gpu/drm/i915/intel_lrc.c | 5 +++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index c29b1a1cbe96..dcdcc09240b9 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3212,8 +3212,10 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
 	 * rolling the global seqno forward (since this would complete requests
 	 * for which we haven't set the fence error to EIO yet).
 	 */
-	for_each_engine(engine, i915, id)
+	for_each_engine(engine, i915, id) {
+		i915_gem_reset_prepare_engine(engine);
 		engine->submit_request = nop_submit_request;
+	}
 
 	/*
 	 * Make sure no one is running the old callback before we proceed with
@@ -3255,6 +3257,8 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
 		intel_engine_init_global_seqno(engine,
 					       intel_engine_last_submit(engine));
 		spin_unlock_irqrestore(&engine->timeline->lock, flags);
+
+		i915_gem_reset_finish_engine(engine);
 	}
 
 	wake_up_all(&i915->gpu_error.reset_queue);
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 14288743909f..c1a3636e94fc 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -687,6 +687,8 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
 	struct rb_node *rb;
 	unsigned long flags;
 
+	GEM_TRACE("%s\n", engine->name);
+
 	spin_lock_irqsave(&engine->timeline->lock, flags);
 
 	/* Cancel the requests on the HW and clear the ELSP tracker. */
@@ -733,6 +735,9 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
 	 */
 	clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
 
+	/* Mark all CS interrupts as complete */
+	execlists->active = 0;
+
 	spin_unlock_irqrestore(&engine->timeline->lock, flags);
 }
 
-- 
2.16.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/2] drm/i915: Stop engines around GPU reset preparations
  2018-03-02 11:33 [PATCH 1/2] drm/i915: Stop engines around GPU reset preparations Chris Wilson
  2018-03-02 11:33 ` [PATCH 2/2] drm/i915: Suspend submission tasklets around wedging Chris Wilson
@ 2018-03-02 11:50 ` Mika Kuoppala
  2018-03-02 12:00   ` Chris Wilson
  2018-03-02 12:06 ` ✓ Fi.CI.BAT: success for series starting with [1/2] " Patchwork
  2018-03-02 13:31 ` ✗ Fi.CI.IGT: failure " Patchwork
  3 siblings, 1 reply; 10+ messages in thread
From: Mika Kuoppala @ 2018-03-02 11:50 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> As we make preparations to reset the GPU state, we assume that the GPU
> is hung and will not advance. Make this assumption more explicit by
> setting the STOP_RING bit on the engines as part of our early reset
> preparations.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Michel Thierry <michel.thierry@intel.com>
> ---
> See https://intel-gfx-ci.01.org/tree/drm-tip/kasan_15/fi-bdw-5557u/pstore22-1519879816_Panic_3.log
> for a bizarre error that kasan-farm keeps on trying over. Maybe related
> to this?
> ---
>  drivers/gpu/drm/i915/i915_drv.c     |  3 +++
>  drivers/gpu/drm/i915/i915_drv.h     | 10 ++++++++--
>  drivers/gpu/drm/i915/intel_uncore.c | 33 +++++++++++++++++++++++++++++++++
>  3 files changed, 44 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index aaa861b51024..925f5722d077 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -1908,6 +1908,8 @@ void i915_reset(struct drm_i915_private *i915, unsigned int flags)
>  	error->reset_count++;
>  
>  	disable_irq(i915->drm.irq);
> +	intel_gpu_reset_prepare(i915, ALL_ENGINES);
> +
>  	ret = i915_gem_reset_prepare(i915);
>  	if (ret) {
>  		dev_err(i915->drm.dev, "GPU recovery failed\n");
> @@ -1969,6 +1971,7 @@ void i915_reset(struct drm_i915_private *i915, unsigned int flags)
>  
>  finish:
>  	i915_gem_reset_finish(i915);
> +	intel_gpu_reset_finish(i915, ALL_ENGINES);
>  	enable_irq(i915->drm.irq);
>  
>  wakeup:
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 10c9e5e619ab..b95e675e0834 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2957,8 +2957,14 @@ extern const struct dev_pm_ops i915_pm_ops;
>  extern int i915_driver_load(struct pci_dev *pdev,
>  			    const struct pci_device_id *ent);
>  extern void i915_driver_unload(struct drm_device *dev);
> -extern int intel_gpu_reset(struct drm_i915_private *dev_priv, u32 engine_mask);
> -extern bool intel_has_gpu_reset(struct drm_i915_private *dev_priv);
> +
> +bool intel_has_gpu_reset(struct drm_i915_private *dev_priv);
> +
> +void intel_gpu_reset_prepare(struct drm_i915_private *dev_priv,
> +			     unsigned engine_mask);
> +int intel_gpu_reset(struct drm_i915_private *dev_priv, u32 engine_mask);
> +void intel_gpu_reset_finish(struct drm_i915_private *dev_priv,
> +			    unsigned engine_mask);
>  
>  #define I915_RESET_QUIET BIT(0)
>  extern void i915_reset(struct drm_i915_private *i915, unsigned int flags);
> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
> index 5ae9a62712ca..7186fe4d2ba9 100644
> --- a/drivers/gpu/drm/i915/intel_uncore.c
> +++ b/drivers/gpu/drm/i915/intel_uncore.c
> @@ -1899,6 +1899,29 @@ static reset_func intel_get_gpu_reset(struct drm_i915_private *dev_priv)
>  		return NULL;
>  }
>  
> +static void i915_engines_set_mode(struct drm_i915_private *dev_priv,
> +				  unsigned engine_mask,
> +				  u32 mode)
> +{
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +
> +	if (INTEL_GEN(dev_priv) < 3)
> +		return;
> +
> +	for_each_engine_masked(engine, dev_priv, engine_mask, id)
> +		I915_WRITE_FW(RING_MI_MODE(engine->mmio_base), mode);

Is there a reason not to use gen3_stop_engine at this level?

-Mika

> +}
> +
> +void intel_gpu_reset_prepare(struct drm_i915_private *dev_priv,
> +			     unsigned engine_mask)
> +{
> +	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
> +
> +	i915_engines_set_mode(dev_priv, engine_mask,
> +			      _MASKED_BIT_ENABLE(STOP_RING));
> +}
> +
>  int intel_gpu_reset(struct drm_i915_private *dev_priv, unsigned engine_mask)
>  {
>  	reset_func reset = intel_get_gpu_reset(dev_priv);
> @@ -1939,6 +1962,16 @@ int intel_gpu_reset(struct drm_i915_private *dev_priv, unsigned engine_mask)
>  	return ret;
>  }
>  
> +void intel_gpu_reset_finish(struct drm_i915_private *dev_priv,
> +			    unsigned engine_mask)
> +{
> +	/* Clear the STOP_RING bit as the reset may not have occurred */
> +	i915_engines_set_mode(dev_priv, engine_mask,
> +			      _MASKED_BIT_DISABLE(STOP_RING));
> +
> +	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
> +}
> +
>  bool intel_has_gpu_reset(struct drm_i915_private *dev_priv)
>  {
>  	return intel_get_gpu_reset(dev_priv) != NULL;
> -- 
> 2.16.2
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/2] drm/i915: Stop engines around GPU reset preparations
  2018-03-02 11:50 ` [PATCH 1/2] drm/i915: Stop engines around GPU reset preparations Mika Kuoppala
@ 2018-03-02 12:00   ` Chris Wilson
  2018-03-02 12:17     ` Mika Kuoppala
  0 siblings, 1 reply; 10+ messages in thread
From: Chris Wilson @ 2018-03-02 12:00 UTC (permalink / raw)
  To: Mika Kuoppala, intel-gfx

Quoting Mika Kuoppala (2018-03-02 11:50:32)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> > +static void i915_engines_set_mode(struct drm_i915_private *dev_priv,
> > +                               unsigned engine_mask,
> > +                               u32 mode)
> > +{
> > +     struct intel_engine_cs *engine;
> > +     enum intel_engine_id id;
> > +
> > +     if (INTEL_GEN(dev_priv) < 3)
> > +             return;
> > +
> > +     for_each_engine_masked(engine, dev_priv, engine_mask, id)
> > +             I915_WRITE_FW(RING_MI_MODE(engine->mmio_base), mode);
> 
> Is there reason to not use gen3_stop_engine in this level?

It clears HEAD/TAIL, so undoing it in the case of no reset is a bit more
tricky.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* ✓ Fi.CI.BAT: success for series starting with [1/2] drm/i915: Stop engines around GPU reset preparations
  2018-03-02 11:33 [PATCH 1/2] drm/i915: Stop engines around GPU reset preparations Chris Wilson
  2018-03-02 11:33 ` [PATCH 2/2] drm/i915: Suspend submission tasklets around wedging Chris Wilson
  2018-03-02 11:50 ` [PATCH 1/2] drm/i915: Stop engines around GPU reset preparations Mika Kuoppala
@ 2018-03-02 12:06 ` Patchwork
  2018-03-02 13:31 ` ✗ Fi.CI.IGT: failure " Patchwork
  3 siblings, 0 replies; 10+ messages in thread
From: Patchwork @ 2018-03-02 12:06 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/2] drm/i915: Stop engines around GPU reset preparations
URL   : https://patchwork.freedesktop.org/series/39270/
State : success

== Summary ==

Series 39270v1 series starting with [1/2] drm/i915: Stop engines around GPU reset preparations
https://patchwork.freedesktop.org/api/1.0/series/39270/revisions/1/mbox/

---- Known issues:

Test kms_pipe_crc_basic:
        Subgroup suspend-read-crc-pipe-b:
                incomplete -> PASS       (fi-snb-2520m) fdo#103713
Test prime_vgem:
        Subgroup basic-fence-flip:
                fail       -> PASS       (fi-byt-n2820) fdo#104008

fdo#103713 https://bugs.freedesktop.org/show_bug.cgi?id=103713
fdo#104008 https://bugs.freedesktop.org/show_bug.cgi?id=104008

fi-bdw-5557u     total:288  pass:267  dwarn:0   dfail:0   fail:0   skip:21  time:411s
fi-bdw-gvtdvm    total:288  pass:264  dwarn:0   dfail:0   fail:0   skip:24  time:423s
fi-blb-e6850     total:288  pass:223  dwarn:1   dfail:0   fail:0   skip:64  time:371s
fi-bsw-n3050     total:288  pass:242  dwarn:0   dfail:0   fail:0   skip:46  time:479s
fi-bwr-2160      total:288  pass:183  dwarn:0   dfail:0   fail:0   skip:105 time:278s
fi-bxt-dsi       total:288  pass:258  dwarn:0   dfail:0   fail:0   skip:30  time:476s
fi-bxt-j4205     total:288  pass:259  dwarn:0   dfail:0   fail:0   skip:29  time:481s
fi-byt-j1900     total:288  pass:253  dwarn:0   dfail:0   fail:0   skip:35  time:463s
fi-byt-n2820     total:288  pass:249  dwarn:0   dfail:0   fail:0   skip:39  time:455s
fi-cfl-8700k     total:288  pass:260  dwarn:0   dfail:0   fail:0   skip:28  time:393s
fi-cfl-s2        total:288  pass:262  dwarn:0   dfail:0   fail:0   skip:26  time:564s
fi-cfl-u         total:288  pass:262  dwarn:0   dfail:0   fail:0   skip:26  time:494s
fi-elk-e7500     total:288  pass:229  dwarn:0   dfail:0   fail:0   skip:59  time:413s
fi-gdg-551       total:288  pass:180  dwarn:0   dfail:0   fail:0   skip:108 time:288s
fi-glk-1         total:288  pass:260  dwarn:0   dfail:0   fail:0   skip:28  time:507s
fi-hsw-4770      total:288  pass:261  dwarn:0   dfail:0   fail:0   skip:27  time:391s
fi-ilk-650       total:288  pass:228  dwarn:0   dfail:0   fail:0   skip:60  time:407s
fi-ivb-3520m     total:288  pass:259  dwarn:0   dfail:0   fail:0   skip:29  time:445s
fi-ivb-3770      total:288  pass:255  dwarn:0   dfail:0   fail:0   skip:33  time:410s
fi-kbl-7500u     total:288  pass:263  dwarn:1   dfail:0   fail:0   skip:24  time:449s
fi-kbl-7560u     total:288  pass:269  dwarn:0   dfail:0   fail:0   skip:19  time:491s
fi-kbl-7567u     total:288  pass:268  dwarn:0   dfail:0   fail:0   skip:20  time:450s
fi-kbl-r         total:288  pass:261  dwarn:0   dfail:0   fail:0   skip:27  time:491s
fi-pnv-d510      total:288  pass:222  dwarn:1   dfail:0   fail:0   skip:65  time:586s
fi-skl-6260u     total:288  pass:268  dwarn:0   dfail:0   fail:0   skip:20  time:424s
fi-skl-6600u     total:288  pass:261  dwarn:0   dfail:0   fail:0   skip:27  time:500s
fi-skl-6700hq    total:288  pass:262  dwarn:0   dfail:0   fail:0   skip:26  time:518s
fi-skl-6700k2    total:288  pass:264  dwarn:0   dfail:0   fail:0   skip:24  time:488s
fi-skl-6770hq    total:288  pass:268  dwarn:0   dfail:0   fail:0   skip:20  time:467s
fi-skl-guc       total:288  pass:260  dwarn:0   dfail:0   fail:0   skip:28  time:403s
fi-skl-gvtdvm    total:288  pass:265  dwarn:0   dfail:0   fail:0   skip:23  time:429s
fi-snb-2520m     total:288  pass:248  dwarn:0   dfail:0   fail:0   skip:40  time:512s
fi-snb-2600      total:288  pass:248  dwarn:0   dfail:0   fail:0   skip:40  time:388s
fi-cnl-y3 failed to collect. IGT log at Patchwork_8210/fi-cnl-y3/run0.log

b2e10fd5e8b2cd72b0e1eba46c1221dc3d4b70bc drm-tip: 2018y-03m-02d-09h-36m-59s UTC integration manifest
f2dac0f529e2 drm/i915: Suspend submission tasklets around wedging
1e1a9eccd43f drm/i915: Stop engines around GPU reset preparations

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_8210/issues.html
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/2] drm/i915: Stop engines around GPU reset preparations
  2018-03-02 12:00   ` Chris Wilson
@ 2018-03-02 12:17     ` Mika Kuoppala
  2018-03-02 12:31       ` Chris Wilson
  0 siblings, 1 reply; 10+ messages in thread
From: Mika Kuoppala @ 2018-03-02 12:17 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Quoting Mika Kuoppala (2018-03-02 11:50:32)
>> Chris Wilson <chris@chris-wilson.co.uk> writes:
>> > +static void i915_engines_set_mode(struct drm_i915_private *dev_priv,
>> > +                               unsigned engine_mask,
>> > +                               u32 mode)
>> > +{
>> > +     struct intel_engine_cs *engine;
>> > +     enum intel_engine_id id;
>> > +
>> > +     if (INTEL_GEN(dev_priv) < 3)
>> > +             return;
>> > +
>> > +     for_each_engine_masked(engine, dev_priv, engine_mask, id)
>> > +             I915_WRITE_FW(RING_MI_MODE(engine->mmio_base), mode);
>> 
>> Is there reason to not use gen3_stop_engine in this level?
>
> It clears HEAD/TAIL, so undoing it in the case of no reset is a bit more
> tricky.

With this we now have 3 different flavours of stopping an engine.

I would like to see the early reset preparation call engine->stop(),
which would be a unified way of bringing an engine to a halt, and limit
any further restoration of state if we can't really manage to reset it,
leaving it as stopped and dormant as we possibly can.

Then only on successful reset and restoration of init state, we would
have an engine->start().

But as this does stop the engine early on it is an improvement,
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/2] drm/i915: Stop engines around GPU reset preparations
  2018-03-02 12:17     ` Mika Kuoppala
@ 2018-03-02 12:31       ` Chris Wilson
  0 siblings, 0 replies; 10+ messages in thread
From: Chris Wilson @ 2018-03-02 12:31 UTC (permalink / raw)
  To: Mika Kuoppala, intel-gfx

Quoting Mika Kuoppala (2018-03-02 12:17:19)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > Quoting Mika Kuoppala (2018-03-02 11:50:32)
> >> Chris Wilson <chris@chris-wilson.co.uk> writes:
> >> > +static void i915_engines_set_mode(struct drm_i915_private *dev_priv,
> >> > +                               unsigned engine_mask,
> >> > +                               u32 mode)
> >> > +{
> >> > +     struct intel_engine_cs *engine;
> >> > +     enum intel_engine_id id;
> >> > +
> >> > +     if (INTEL_GEN(dev_priv) < 3)
> >> > +             return;
> >> > +
> >> > +     for_each_engine_masked(engine, dev_priv, engine_mask, id)
> >> > +             I915_WRITE_FW(RING_MI_MODE(engine->mmio_base), mode);
> >> 
> >> Is there reason to not use gen3_stop_engine in this level?
> >
> > It clears HEAD/TAIL, so undoing it in the case of no reset is a bit more
> > tricky.
> 
> With this we now have 3 different flavours of stopping an engine.
> 
> I would like to see early on prepare reset to call engine->stop(),
> which would be unified way of bring engine to halt. And limit
> any further restoration of state if we can't really manage to reset it,
> leaving it as stopped and dormant as we possibly could get it.
> 
> Then only on successful reset and restoration of init state, we would
> have an engine->start().

Wire it up, and see how it looks. We do

	engine->stop # reset(early) or suspend
	engine->reset # reset or suspend
	engine->cancel # reset or wedge
	engine->init # reset or resume
	engine->start # reset(late) or resume

I agree adopting that scheme should help, if we can nail the code into
individual steps without repetition. (If we find we repeat ourselves,
the above scheme doesn't fit :)
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] drm/i915: Suspend submission tasklets around wedging
  2018-03-02 11:33 ` [PATCH 2/2] drm/i915: Suspend submission tasklets around wedging Chris Wilson
@ 2018-03-02 12:55   ` Chris Wilson
  2018-03-02 13:39   ` Mika Kuoppala
  1 sibling, 0 replies; 10+ messages in thread
From: Chris Wilson @ 2018-03-02 12:55 UTC (permalink / raw)
  To: intel-gfx

Quoting Chris Wilson (2018-03-02 11:33:24)
> After staring hard at sequences like
> 
> [   28.199013]  systemd-1       2..s. 26062228us : execlists_submission_tasklet: rcs0 cs-irq head=0 [0?], tail=1 [1?]
> [   28.199095]  systemd-1       2..s. 26062229us : execlists_submission_tasklet: rcs0 csb[1]: status=0x00000018:0x00000000, active=0x1
> [   28.199177]  systemd-1       2..s. 26062230us : execlists_submission_tasklet: rcs0 out[0]: ctx=0.1, seqno=3, prio=-1024
> [   28.199258]  systemd-1       2..s. 26062231us : execlists_submission_tasklet: rcs0 completed ctx=0
> [   28.199340]  gem_eio-829     1..s1 26066853us : execlists_submission_tasklet: rcs0 in[0]:  ctx=1.1, seqno=1, prio=0
> [   28.199421]   <idle>-0       2..s. 26066863us : execlists_submission_tasklet: rcs0 cs-irq head=1 [1?], tail=2 [2?]
> [   28.199503]   <idle>-0       2..s. 26066865us : execlists_submission_tasklet: rcs0 csb[2]: status=0x00000001:0x00000000, active=0x1
> [   28.199585]  gem_eio-829     1..s1 26067077us : execlists_submission_tasklet: rcs0 in[1]:  ctx=3.1, seqno=2, prio=0
> [   28.199667]  gem_eio-829     1..s1 26067078us : execlists_submission_tasklet: rcs0 in[0]:  ctx=1.2, seqno=1, prio=0
> [   28.199749]   <idle>-0       2..s. 26067084us : execlists_submission_tasklet: rcs0 cs-irq head=2 [2?], tail=3 [3?]
> [   28.199830]   <idle>-0       2..s. 26067085us : execlists_submission_tasklet: rcs0 csb[3]: status=0x00008002:0x00000001, active=0x1
> [   28.199912]   <idle>-0       2..s. 26067086us : execlists_submission_tasklet: rcs0 out[0]: ctx=1.2, seqno=1, prio=0
> [   28.199994]  gem_eio-829     2..s. 28246084us : execlists_submission_tasklet: rcs0 cs-irq head=3 [3?], tail=4 [4?]
> [   28.200096]  gem_eio-829     2..s. 28246088us : execlists_submission_tasklet: rcs0 csb[4]: status=0x00000014:0x00000001, active=0x5
> [   28.200178]  gem_eio-829     2..s. 28246089us : execlists_submission_tasklet: rcs0 out[0]: ctx=0.0, seqno=0, prio=0
> [   28.200260]  gem_eio-829     2..s. 28246127us : execlists_submission_tasklet: execlists_submission_tasklet:886 GEM_BUG_ON(buf[2 * head + 1] != port->context_id)
> 
> the conclusion is that the only place where the ports are reset to zero,
> is from engine->cancel_requests called during i915_gem_set_wedged().
> 
> The race is horrible as it results from calling set-wedged on active HW
> (the GPU reset failed) and as such we need to be careful as the HW state
> changes beneath us. Fortunately, it's the same scary conditions as
> affect normal reset, so we can reuse the same machinery to disable state
> tracking as we clobber it.
> 

Fixes: af7a8ffad9c5 ("drm/i915: Use rcu instead of stop_machine in set_wedged")

> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104945
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Michel Thierry <michel.thierry@intel.com>
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* ✗ Fi.CI.IGT: failure for series starting with [1/2] drm/i915: Stop engines around GPU reset preparations
  2018-03-02 11:33 [PATCH 1/2] drm/i915: Stop engines around GPU reset preparations Chris Wilson
                   ` (2 preceding siblings ...)
  2018-03-02 12:06 ` ✓ Fi.CI.BAT: success for series starting with [1/2] " Patchwork
@ 2018-03-02 13:31 ` Patchwork
  3 siblings, 0 replies; 10+ messages in thread
From: Patchwork @ 2018-03-02 13:31 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/2] drm/i915: Stop engines around GPU reset preparations
URL   : https://patchwork.freedesktop.org/series/39270/
State : failure

== Summary ==

---- Possible new issues:

Test kms_frontbuffer_tracking:
        Subgroup fbc-1p-primscrn-spr-indfb-draw-pwrite:
                pass       -> FAIL       (shard-apl)

---- Known issues:

Test drv_selftest:
        Subgroup live_gtt:
                pass       -> INCOMPLETE (shard-apl) fdo#103927
Test gem_eio:
        Subgroup in-flight-contexts:
                incomplete -> PASS       (shard-apl) fdo#104945
Test kms_chv_cursor_fail:
        Subgroup pipe-b-256x256-bottom-edge:
                dmesg-warn -> PASS       (shard-snb) fdo#105185 +2
Test kms_cursor_crc:
        Subgroup cursor-64x64-suspend:
                pass       -> INCOMPLETE (shard-hsw) fdo#103540
Test kms_flip:
        Subgroup modeset-vs-vblank-race-interruptible:
                pass       -> FAIL       (shard-hsw) fdo#103060

fdo#103927 https://bugs.freedesktop.org/show_bug.cgi?id=103927
fdo#104945 https://bugs.freedesktop.org/show_bug.cgi?id=104945
fdo#105185 https://bugs.freedesktop.org/show_bug.cgi?id=105185
fdo#103540 https://bugs.freedesktop.org/show_bug.cgi?id=103540
fdo#103060 https://bugs.freedesktop.org/show_bug.cgi?id=103060

shard-apl        total:3441 pass:1798 dwarn:1   dfail:0   fail:8   skip:1632 time:12107s
shard-hsw        total:3422 pass:1748 dwarn:1   dfail:0   fail:2   skip:1669 time:11536s
shard-snb        total:3463 pass:1360 dwarn:2   dfail:0   fail:1   skip:2100 time:6926s
Blacklisted hosts:
shard-kbl        total:3445 pass:1926 dwarn:1   dfail:0   fail:8   skip:1509 time:9796s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_8210/shards.html
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] drm/i915: Suspend submission tasklets around wedging
  2018-03-02 11:33 ` [PATCH 2/2] drm/i915: Suspend submission tasklets around wedging Chris Wilson
  2018-03-02 12:55   ` Chris Wilson
@ 2018-03-02 13:39   ` Mika Kuoppala
  1 sibling, 0 replies; 10+ messages in thread
From: Mika Kuoppala @ 2018-03-02 13:39 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> After staring hard at sequences like
>
> [   28.199013]  systemd-1       2..s. 26062228us : execlists_submission_tasklet: rcs0 cs-irq head=0 [0?], tail=1 [1?]
> [   28.199095]  systemd-1       2..s. 26062229us : execlists_submission_tasklet: rcs0 csb[1]: status=0x00000018:0x00000000, active=0x1
> [   28.199177]  systemd-1       2..s. 26062230us : execlists_submission_tasklet: rcs0 out[0]: ctx=0.1, seqno=3, prio=-1024
> [   28.199258]  systemd-1       2..s. 26062231us : execlists_submission_tasklet: rcs0 completed ctx=0
> [   28.199340]  gem_eio-829     1..s1 26066853us : execlists_submission_tasklet: rcs0 in[0]:  ctx=1.1, seqno=1, prio=0
> [   28.199421]   <idle>-0       2..s. 26066863us : execlists_submission_tasklet: rcs0 cs-irq head=1 [1?], tail=2 [2?]
> [   28.199503]   <idle>-0       2..s. 26066865us : execlists_submission_tasklet: rcs0 csb[2]: status=0x00000001:0x00000000, active=0x1
> [   28.199585]  gem_eio-829     1..s1 26067077us : execlists_submission_tasklet: rcs0 in[1]:  ctx=3.1, seqno=2, prio=0
> [   28.199667]  gem_eio-829     1..s1 26067078us : execlists_submission_tasklet: rcs0 in[0]:  ctx=1.2, seqno=1, prio=0
> [   28.199749]   <idle>-0       2..s. 26067084us : execlists_submission_tasklet: rcs0 cs-irq head=2 [2?], tail=3 [3?]
> [   28.199830]   <idle>-0       2..s. 26067085us : execlists_submission_tasklet: rcs0 csb[3]: status=0x00008002:0x00000001, active=0x1
> [   28.199912]   <idle>-0       2..s. 26067086us : execlists_submission_tasklet: rcs0 out[0]: ctx=1.2, seqno=1, prio=0
> [   28.199994]  gem_eio-829     2..s. 28246084us : execlists_submission_tasklet: rcs0 cs-irq head=3 [3?], tail=4 [4?]
> [   28.200096]  gem_eio-829     2..s. 28246088us : execlists_submission_tasklet: rcs0 csb[4]: status=0x00000014:0x00000001, active=0x5
> [   28.200178]  gem_eio-829     2..s. 28246089us : execlists_submission_tasklet: rcs0 out[0]: ctx=0.0, seqno=0, prio=0
> [   28.200260]  gem_eio-829     2..s. 28246127us : execlists_submission_tasklet: execlists_submission_tasklet:886 GEM_BUG_ON(buf[2 * head + 1] != port->context_id)
>
> the conclusion is that the only place where the ports are reset to zero
> is engine->cancel_requests, called during i915_gem_set_wedged().
>
> The race is horrible as it results from calling set-wedged on active HW
> (the GPU reset failed) and as such we need to be careful as the HW state
> changes beneath us. Fortunately, these are the same scary conditions that
> affect a normal reset, so we can reuse the same machinery to disable state
> tracking as we clobber it.
>
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104945
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Michel Thierry <michel.thierry@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem.c  | 6 +++++-
>  drivers/gpu/drm/i915/intel_lrc.c | 5 +++++
>  2 files changed, 10 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index c29b1a1cbe96..dcdcc09240b9 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3212,8 +3212,10 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
>  	 * rolling the global seqno forward (since this would complete requests
>  	 * for which we haven't set the fence error to EIO yet).
>  	 */
> -	for_each_engine(engine, i915, id)
> +	for_each_engine(engine, i915, id) {
> +		i915_gem_reset_prepare_engine(engine);
>  		engine->submit_request = nop_submit_request;
> +	}
>  
>  	/*
>  	 * Make sure no one is running the old callback before we proceed with
> @@ -3255,6 +3257,8 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
>  		intel_engine_init_global_seqno(engine,
>  					       intel_engine_last_submit(engine));
>  		spin_unlock_irqrestore(&engine->timeline->lock, flags);
> +
> +		i915_gem_reset_finish_engine(engine);
>  	}
>  
>  	wake_up_all(&i915->gpu_error.reset_queue);
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 14288743909f..c1a3636e94fc 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -687,6 +687,8 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
>  	struct rb_node *rb;
>  	unsigned long flags;
>  
> +	GEM_TRACE("%s\n", engine->name);
> +
>  	spin_lock_irqsave(&engine->timeline->lock, flags);
>  
>  	/* Cancel the requests on the HW and clear the ELSP tracker. */
> @@ -733,6 +735,9 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
>  	 */
>  	clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
>  
> +	/* Mark all CS interrupts as complete */
> +	execlists->active = 0;

With the followup patch to handle the other irq state manipulation inside
the timeline lock, although it feels a little like borrowing a lock, I am content.

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

> +
>  	spin_unlock_irqrestore(&engine->timeline->lock, flags);
>  }
>  
> -- 
> 2.16.2
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2018-03-02 13:43 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-03-02 11:33 [PATCH 1/2] drm/i915: Stop engines around GPU reset preparations Chris Wilson
2018-03-02 11:33 ` [PATCH 2/2] drm/i915: Suspend submission tasklets around wedging Chris Wilson
2018-03-02 12:55   ` Chris Wilson
2018-03-02 13:39   ` Mika Kuoppala
2018-03-02 11:50 ` [PATCH 1/2] drm/i915: Stop engines around GPU reset preparations Mika Kuoppala
2018-03-02 12:00   ` Chris Wilson
2018-03-02 12:17     ` Mika Kuoppala
2018-03-02 12:31       ` Chris Wilson
2018-03-02 12:06 ` ✓ Fi.CI.BAT: success for series starting with [1/2] " Patchwork
2018-03-02 13:31 ` ✗ Fi.CI.IGT: failure " Patchwork

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.