All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/i915: Inspect subunit states on hangcheck
@ 2015-12-01 12:17 Mika Kuoppala
  2015-12-01 12:31 ` Chris Wilson
                   ` (3 more replies)
  0 siblings, 4 replies; 9+ messages in thread
From: Mika Kuoppala @ 2015-12-01 12:17 UTC (permalink / raw)
  To: intel-gfx; +Cc: miku

If head seems stuck and engine in question is rcs,
inspect subunit state transitions before deciding that
this really is a hang instead of limited progress.

References: https://bugs.freedesktop.org/show_bug.cgi?id=93029
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Dave Gordon <david.s.gordon@intel.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_irq.c         | 49 +++++++++++++++++++++++++++++----
 drivers/gpu/drm/i915/intel_ringbuffer.h |  1 +
 2 files changed, 45 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index e88d692..e6ae54f 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2913,13 +2913,31 @@ static void semaphore_clear_deadlocks(struct drm_i915_private *dev_priv)
 		ring->hangcheck.deadlock = 0;
 }
 
-static enum intel_ring_hangcheck_action
-ring_stuck(struct intel_engine_cs *ring, u64 acthd)
+static bool subunits_stuck(struct intel_engine_cs *ring)
 {
-	struct drm_device *dev = ring->dev;
-	struct drm_i915_private *dev_priv = dev->dev_private;
-	u32 tmp;
+	int i;
+	u32 instdone[I915_NUM_INSTDONE_REG];
+	bool stuck;
+
+	if (ring->id != RCS)
+		return true;
+
+	i915_get_extra_instdone(ring->dev, instdone);
 
+	stuck = true;
+	for (i = 0; i < I915_NUM_INSTDONE_REG; i++) {
+		if (instdone[i] != ring->hangcheck.instdone[i])
+			stuck = false;
+
+		ring->hangcheck.instdone[i] = instdone[i];
+	}
+
+	return stuck;
+}
+
+static enum intel_ring_hangcheck_action
+head_stuck(struct intel_engine_cs *ring, u64 acthd)
+{
 	if (acthd != ring->hangcheck.acthd) {
 		if (acthd > ring->hangcheck.max_acthd) {
 			ring->hangcheck.max_acthd = acthd;
@@ -2929,6 +2947,24 @@ ring_stuck(struct intel_engine_cs *ring, u64 acthd)
 		return HANGCHECK_ACTIVE_LOOP;
 	}
 
+	if (!subunits_stuck(ring))
+		return HANGCHECK_ACTIVE_LOOP;
+
+	return HANGCHECK_HUNG;
+}
+
+static enum intel_ring_hangcheck_action
+ring_stuck(struct intel_engine_cs *ring, u64 acthd)
+{
+	struct drm_device *dev = ring->dev;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	enum intel_ring_hangcheck_action ha;
+	u32 tmp;
+
+	ha = head_stuck(ring, acthd);
+	if (ha != HANGCHECK_HUNG)
+		return ha;
+
 	if (IS_GEN2(dev))
 		return HANGCHECK_HUNG;
 
@@ -3064,6 +3100,9 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 				ring->hangcheck.score--;
 
 			ring->hangcheck.acthd = ring->hangcheck.max_acthd = 0;
+
+			memset(ring->hangcheck.instdone, 0,
+			       sizeof(ring->hangcheck.instdone));
 		}
 
 		ring->hangcheck.seqno = seqno;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 5d1eb20..b8fe92e 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -93,6 +93,7 @@ struct intel_ring_hangcheck {
 	int score;
 	enum intel_ring_hangcheck_action action;
 	int deadlock;
+	u32 instdone[I915_NUM_INSTDONE_REG];
 };
 
 struct intel_ringbuffer {
-- 
2.5.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/i915: Inspect subunit states on hangcheck
  2015-12-01 12:17 [PATCH] drm/i915: Inspect subunit states on hangcheck Mika Kuoppala
@ 2015-12-01 12:31 ` Chris Wilson
  2015-12-01 12:56 ` Arun Siluvery
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 9+ messages in thread
From: Chris Wilson @ 2015-12-01 12:31 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx, miku

On Tue, Dec 01, 2015 at 02:17:25PM +0200, Mika Kuoppala wrote:
> If head seems stuck and engine in question is rcs,
> inspect subunit state transitions before deciding that
> this really is a hang instead of limited progress.
> 
> References: https://bugs.freedesktop.org/show_bug.cgi?id=93029
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Dave Gordon <david.s.gordon@intel.com>
> Cc: Daniel Vetter <daniel@ffwll.ch>
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/i915: Inspect subunit states on hangcheck
  2015-12-01 12:17 [PATCH] drm/i915: Inspect subunit states on hangcheck Mika Kuoppala
  2015-12-01 12:31 ` Chris Wilson
@ 2015-12-01 12:56 ` Arun Siluvery
  2015-12-01 12:58 ` Mika Kuoppala
  2015-12-01 15:56 ` Mika Kuoppala
  3 siblings, 0 replies; 9+ messages in thread
From: Arun Siluvery @ 2015-12-01 12:56 UTC (permalink / raw)
  To: Mika Kuoppala, intel-gfx; +Cc: miku

On 01/12/2015 12:17, Mika Kuoppala wrote:
> If head seems stuck and engine in question is rcs,
> inspect subunit state transitions before deciding that
> this really is a hang instead of limited progress.
>
> References: https://bugs.freedesktop.org/show_bug.cgi?id=93029
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Dave Gordon <david.s.gordon@intel.com>
> Cc: Daniel Vetter <daniel@ffwll.ch>
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_irq.c         | 49 +++++++++++++++++++++++++++++----
>   drivers/gpu/drm/i915/intel_ringbuffer.h |  1 +
>   2 files changed, 45 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index e88d692..e6ae54f 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2913,13 +2913,31 @@ static void semaphore_clear_deadlocks(struct drm_i915_private *dev_priv)
>   		ring->hangcheck.deadlock = 0;
>   }
>
> -static enum intel_ring_hangcheck_action
> -ring_stuck(struct intel_engine_cs *ring, u64 acthd)
> +static bool subunits_stuck(struct intel_engine_cs *ring)
>   {
> -	struct drm_device *dev = ring->dev;
> -	struct drm_i915_private *dev_priv = dev->dev_private;
> -	u32 tmp;
> +	int i;
> +	u32 instdone[I915_NUM_INSTDONE_REG];
> +	bool stuck;
> +
> +	if (ring->id != RCS)
> +		return true;
> +
> +	i915_get_extra_instdone(ring->dev, instdone);
>
> +	stuck = true;
> +	for (i = 0; i < I915_NUM_INSTDONE_REG; i++) {
> +		if (instdone[i] != ring->hangcheck.instdone[i])
> +			stuck = false;

This may not be completely reliable. Tomas Elf in his TDR tests observed 
that instdone kept changing even when CS is hung and in a stable state.

regards
Arun

> +
> +		ring->hangcheck.instdone[i] = instdone[i];
> +	}
> +
> +	return stuck;
> +}
> +
> +static enum intel_ring_hangcheck_action
> +head_stuck(struct intel_engine_cs *ring, u64 acthd)
> +{
>   	if (acthd != ring->hangcheck.acthd) {
>   		if (acthd > ring->hangcheck.max_acthd) {
>   			ring->hangcheck.max_acthd = acthd;
> @@ -2929,6 +2947,24 @@ ring_stuck(struct intel_engine_cs *ring, u64 acthd)
>   		return HANGCHECK_ACTIVE_LOOP;
>   	}
>
> +	if (!subunits_stuck(ring))
> +		return HANGCHECK_ACTIVE_LOOP;
> +
> +	return HANGCHECK_HUNG;
> +}
> +
> +static enum intel_ring_hangcheck_action
> +ring_stuck(struct intel_engine_cs *ring, u64 acthd)
> +{
> +	struct drm_device *dev = ring->dev;
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	enum intel_ring_hangcheck_action ha;
> +	u32 tmp;
> +
> +	ha = head_stuck(ring, acthd);
> +	if (ha != HANGCHECK_HUNG)
> +		return ha;
> +
>   	if (IS_GEN2(dev))
>   		return HANGCHECK_HUNG;
>
> @@ -3064,6 +3100,9 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
>   				ring->hangcheck.score--;
>
>   			ring->hangcheck.acthd = ring->hangcheck.max_acthd = 0;
> +
> +			memset(ring->hangcheck.instdone, 0,
> +			       sizeof(ring->hangcheck.instdone));
>   		}
>
>   		ring->hangcheck.seqno = seqno;
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 5d1eb20..b8fe92e 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -93,6 +93,7 @@ struct intel_ring_hangcheck {
>   	int score;
>   	enum intel_ring_hangcheck_action action;
>   	int deadlock;
> +	u32 instdone[I915_NUM_INSTDONE_REG];
>   };
>
>   struct intel_ringbuffer {
>

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/i915: Inspect subunit states on hangcheck
  2015-12-01 12:17 [PATCH] drm/i915: Inspect subunit states on hangcheck Mika Kuoppala
  2015-12-01 12:31 ` Chris Wilson
  2015-12-01 12:56 ` Arun Siluvery
@ 2015-12-01 12:58 ` Mika Kuoppala
  2015-12-01 15:56 ` Mika Kuoppala
  3 siblings, 0 replies; 9+ messages in thread
From: Mika Kuoppala @ 2015-12-01 12:58 UTC (permalink / raw)
  To: intel-gfx; +Cc: miku

Mika Kuoppala <mika.kuoppala@linux.intel.com> writes:

> If head seems stuck and engine in question is rcs,
> inspect subunit state transitions before deciding that
> this really is a hang instead of limited progress.
>

One thing to add into this description is that
now as we always have one extra hangcheck step even
in true stuck cases, this makes the hang declaration
one tick longer, 7.5s.

If this becomes a problem, we can sample the instdone
state on first check converge back to old behaviour.

One real, but hard to achive, improvement in determinism
would be to normalize the amount of 'work' each gen does in
a given tick. This way the shader progressions would be
more or less equal and same amount of shader work would
cause same hangcheck behaviour regardless of actual
speed of computation.

-Mika

> References: https://bugs.freedesktop.org/show_bug.cgi?id=93029
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Dave Gordon <david.s.gordon@intel.com>
> Cc: Daniel Vetter <daniel@ffwll.ch>
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_irq.c         | 49 +++++++++++++++++++++++++++++----
>  drivers/gpu/drm/i915/intel_ringbuffer.h |  1 +
>  2 files changed, 45 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index e88d692..e6ae54f 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2913,13 +2913,31 @@ static void semaphore_clear_deadlocks(struct drm_i915_private *dev_priv)
>  		ring->hangcheck.deadlock = 0;
>  }
>  
> -static enum intel_ring_hangcheck_action
> -ring_stuck(struct intel_engine_cs *ring, u64 acthd)
> +static bool subunits_stuck(struct intel_engine_cs *ring)
>  {
> -	struct drm_device *dev = ring->dev;
> -	struct drm_i915_private *dev_priv = dev->dev_private;
> -	u32 tmp;
> +	int i;
> +	u32 instdone[I915_NUM_INSTDONE_REG];
> +	bool stuck;
> +
> +	if (ring->id != RCS)
> +		return true;
> +
> +	i915_get_extra_instdone(ring->dev, instdone);
>  
> +	stuck = true;
> +	for (i = 0; i < I915_NUM_INSTDONE_REG; i++) {
> +		if (instdone[i] != ring->hangcheck.instdone[i])
> +			stuck = false;
> +
> +		ring->hangcheck.instdone[i] = instdone[i];
> +	}
> +
> +	return stuck;
> +}
> +
> +static enum intel_ring_hangcheck_action
> +head_stuck(struct intel_engine_cs *ring, u64 acthd)
> +{
>  	if (acthd != ring->hangcheck.acthd) {
>  		if (acthd > ring->hangcheck.max_acthd) {
>  			ring->hangcheck.max_acthd = acthd;
> @@ -2929,6 +2947,24 @@ ring_stuck(struct intel_engine_cs *ring, u64 acthd)
>  		return HANGCHECK_ACTIVE_LOOP;
>  	}
>  
> +	if (!subunits_stuck(ring))
> +		return HANGCHECK_ACTIVE_LOOP;
> +
> +	return HANGCHECK_HUNG;
> +}
> +
> +static enum intel_ring_hangcheck_action
> +ring_stuck(struct intel_engine_cs *ring, u64 acthd)
> +{
> +	struct drm_device *dev = ring->dev;
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	enum intel_ring_hangcheck_action ha;
> +	u32 tmp;
> +
> +	ha = head_stuck(ring, acthd);
> +	if (ha != HANGCHECK_HUNG)
> +		return ha;
> +
>  	if (IS_GEN2(dev))
>  		return HANGCHECK_HUNG;
>  
> @@ -3064,6 +3100,9 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
>  				ring->hangcheck.score--;
>  
>  			ring->hangcheck.acthd = ring->hangcheck.max_acthd = 0;
> +
> +			memset(ring->hangcheck.instdone, 0,
> +			       sizeof(ring->hangcheck.instdone));
>  		}
>  
>  		ring->hangcheck.seqno = seqno;
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 5d1eb20..b8fe92e 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -93,6 +93,7 @@ struct intel_ring_hangcheck {
>  	int score;
>  	enum intel_ring_hangcheck_action action;
>  	int deadlock;
> +	u32 instdone[I915_NUM_INSTDONE_REG];
>  };
>  
>  struct intel_ringbuffer {
> -- 
> 2.5.0
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH] drm/i915: Inspect subunit states on hangcheck
  2015-12-01 12:17 [PATCH] drm/i915: Inspect subunit states on hangcheck Mika Kuoppala
                   ` (2 preceding siblings ...)
  2015-12-01 12:58 ` Mika Kuoppala
@ 2015-12-01 15:56 ` Mika Kuoppala
  2015-12-10 13:54   ` Chris Wilson
  3 siblings, 1 reply; 9+ messages in thread
From: Mika Kuoppala @ 2015-12-01 15:56 UTC (permalink / raw)
  To: intel-gfx; +Cc: miku

If head seems stuck and engine in question is rcs,
inspect subunit state transitions from undone to done,
before deciding that this really is a hang instead of limited
progress. Only account the transitions of subunits from
undone to done once, to prevent unstable subunit states
to keep us falsely active.

As this adds one extra steps to hangcheck heuristics,
before hang is declared, it adds 1500ms to to detect hang
for render ring to a total of 7500ms. We could sample
the subunit states on first head stuck condition but
decide not to do so only in order to mimic old behaviour. This
way the check order of promotion from seqno > atchd > instdone
is consistently done.

v2: Deal with unstable done states (Arun)
    Clear instdone progress on head and seqno movement (Chris)
    Report raw and accumulated instdone's in in debugfs (Chris)
    Return HANGCHECK_ACTIVE on undone->done

References: https://bugs.freedesktop.org/show_bug.cgi?id=93029
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Dave Gordon <david.s.gordon@intel.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Arun Siluvery <arun.siluvery@linux.intel.com>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c     | 20 ++++++++++-
 drivers/gpu/drm/i915/i915_irq.c         | 62 ++++++++++++++++++++++++++++++---
 drivers/gpu/drm/i915/intel_ringbuffer.h |  1 +
 3 files changed, 77 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index bfd57fb..4255f1d 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1332,7 +1332,8 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 	struct intel_engine_cs *ring;
 	u64 acthd[I915_NUM_RINGS];
 	u32 seqno[I915_NUM_RINGS];
-	int i;
+	u32 instdone[I915_NUM_INSTDONE_REG];
+	int i, j;
 
 	if (!i915.enable_hangcheck) {
 		seq_printf(m, "Hangcheck disabled\n");
@@ -1346,6 +1347,8 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 		acthd[i] = intel_ring_get_active_head(ring);
 	}
 
+	i915_get_extra_instdone(dev, instdone);
+
 	intel_runtime_pm_put(dev_priv);
 
 	if (delayed_work_pending(&dev_priv->gpu_error.hangcheck_work)) {
@@ -1366,6 +1369,21 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 			   (long long)ring->hangcheck.max_acthd);
 		seq_printf(m, "\tscore = %d\n", ring->hangcheck.score);
 		seq_printf(m, "\taction = %d\n", ring->hangcheck.action);
+
+		if (ring->id == RCS) {
+			seq_puts(m, "\tinstdone read =");
+
+			for (j = 0; j < I915_NUM_INSTDONE_REG; j++)
+				seq_printf(m, " 0x%08x", instdone[j]);
+
+			seq_puts(m, "\n\tinstdone accu =");
+
+			for (j = 0; j < I915_NUM_INSTDONE_REG; j++)
+				seq_printf(m, " 0x%08x",
+					   ring->hangcheck.instdone[j]);
+
+			seq_puts(m, "\n");
+		}
 	}
 
 	return 0;
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index e88d692..87254c8 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2913,14 +2913,44 @@ static void semaphore_clear_deadlocks(struct drm_i915_private *dev_priv)
 		ring->hangcheck.deadlock = 0;
 }
 
-static enum intel_ring_hangcheck_action
-ring_stuck(struct intel_engine_cs *ring, u64 acthd)
+static bool subunits_stuck(struct intel_engine_cs *ring)
 {
-	struct drm_device *dev = ring->dev;
-	struct drm_i915_private *dev_priv = dev->dev_private;
-	u32 tmp;
+	u32 instdone[I915_NUM_INSTDONE_REG];
+	bool stuck;
+	int i;
+
+	if (ring->id != RCS)
+		return true;
+
+	i915_get_extra_instdone(ring->dev, instdone);
 
+	/* There might be unstable subunit states even when
+	 * actual head is not moving. Filter out the unstable ones by
+	 * accumulating the undone -> done transitions and only
+	 * consider those as progress.
+	 */
+	stuck = true;
+	for (i = 0; i < I915_NUM_INSTDONE_REG; i++) {
+		const u32 tmp = instdone[i] | ring->hangcheck.instdone[i];
+
+		if (tmp != ring->hangcheck.instdone[i])
+			stuck = false;
+
+		ring->hangcheck.instdone[i] |= tmp;
+	}
+
+	return stuck;
+}
+
+static enum intel_ring_hangcheck_action
+head_stuck(struct intel_engine_cs *ring, u64 acthd)
+{
 	if (acthd != ring->hangcheck.acthd) {
+
+		/* Clear subunit states on head movement */
+		memset(ring->hangcheck.instdone, 0,
+		       sizeof(ring->hangcheck.instdone));
+
 		if (acthd > ring->hangcheck.max_acthd) {
 			ring->hangcheck.max_acthd = acthd;
 			return HANGCHECK_ACTIVE;
@@ -2929,6 +2959,24 @@ ring_stuck(struct intel_engine_cs *ring, u64 acthd)
 		return HANGCHECK_ACTIVE_LOOP;
 	}
 
+	if (!subunits_stuck(ring))
+		return HANGCHECK_ACTIVE;
+
+	return HANGCHECK_HUNG;
+}
+
+static enum intel_ring_hangcheck_action
+ring_stuck(struct intel_engine_cs *ring, u64 acthd)
+{
+	struct drm_device *dev = ring->dev;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	enum intel_ring_hangcheck_action ha;
+	u32 tmp;
+
+	ha = head_stuck(ring, acthd);
+	if (ha != HANGCHECK_HUNG)
+		return ha;
+
 	if (IS_GEN2(dev))
 		return HANGCHECK_HUNG;
 
@@ -3063,7 +3111,11 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 			if (ring->hangcheck.score > 0)
 				ring->hangcheck.score--;
 
+			/* Clear head and subunit states on seqno movement */
 			ring->hangcheck.acthd = ring->hangcheck.max_acthd = 0;
+
+			memset(ring->hangcheck.instdone, 0,
+			       sizeof(ring->hangcheck.instdone));
 		}
 
 		ring->hangcheck.seqno = seqno;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 5d1eb20..b8fe92e 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -93,6 +93,7 @@ struct intel_ring_hangcheck {
 	int score;
 	enum intel_ring_hangcheck_action action;
 	int deadlock;
+	u32 instdone[I915_NUM_INSTDONE_REG];
 };
 
 struct intel_ringbuffer {
-- 
2.5.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/i915: Inspect subunit states on hangcheck
  2015-12-01 15:56 ` Mika Kuoppala
@ 2015-12-10 13:54   ` Chris Wilson
  2015-12-10 16:30     ` Mika Kuoppala
  2016-01-08 14:54     ` Mika Kuoppala
  0 siblings, 2 replies; 9+ messages in thread
From: Chris Wilson @ 2015-12-10 13:54 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx, miku

On Tue, Dec 01, 2015 at 05:56:12PM +0200, Mika Kuoppala wrote:
> If head seems stuck and engine in question is rcs,
> inspect subunit state transitions from undone to done,
> before deciding that this really is a hang instead of limited
> progress. Only account the transitions of subunits from
> undone to done once, to prevent unstable subunit states
> to keep us falsely active.
> 
> As this adds one extra steps to hangcheck heuristics,
> before hang is declared, it adds 1500ms to to detect hang
> for render ring to a total of 7500ms. We could sample
> the subunit states on first head stuck condition but
> decide not to do so only in order to mimic old behaviour. This
> way the check order of promotion from seqno > atchd > instdone
> is consistently done.
> 
> v2: Deal with unstable done states (Arun)
>     Clear instdone progress on head and seqno movement (Chris)
>     Report raw and accumulated instdone's in in debugfs (Chris)
>     Return HANGCHECK_ACTIVE on undone->done
> 
> References: https://bugs.freedesktop.org/show_bug.cgi?id=93029
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Dave Gordon <david.s.gordon@intel.com>
> Cc: Daniel Vetter <daniel@ffwll.ch>
> Cc: Arun Siluvery <arun.siluvery@linux.intel.com>
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>

I feel slightly dubious in discarding the 1->0 transitions (as it just
means that a shared function that was previously idle is now in use
again), but if they truly do fluctuate randomly? then accumulating
should mean we eventually escape.

Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/i915: Inspect subunit states on hangcheck
  2015-12-10 13:54   ` Chris Wilson
@ 2015-12-10 16:30     ` Mika Kuoppala
  2016-01-08 14:54     ` Mika Kuoppala
  1 sibling, 0 replies; 9+ messages in thread
From: Mika Kuoppala @ 2015-12-10 16:30 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx, miku

Chris Wilson <chris@chris-wilson.co.uk> writes:

> On Tue, Dec 01, 2015 at 05:56:12PM +0200, Mika Kuoppala wrote:
>> If head seems stuck and engine in question is rcs,
>> inspect subunit state transitions from undone to done,
>> before deciding that this really is a hang instead of limited
>> progress. Only account the transitions of subunits from
>> undone to done once, to prevent unstable subunit states
>> to keep us falsely active.
>> 
>> As this adds one extra steps to hangcheck heuristics,
>> before hang is declared, it adds 1500ms to to detect hang
>> for render ring to a total of 7500ms. We could sample
>> the subunit states on first head stuck condition but
>> decide not to do so only in order to mimic old behaviour. This
>> way the check order of promotion from seqno > atchd > instdone
>> is consistently done.
>> 
>> v2: Deal with unstable done states (Arun)
>>     Clear instdone progress on head and seqno movement (Chris)
>>     Report raw and accumulated instdone's in in debugfs (Chris)
>>     Return HANGCHECK_ACTIVE on undone->done
>> 
>> References: https://bugs.freedesktop.org/show_bug.cgi?id=93029
>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>> Cc: Dave Gordon <david.s.gordon@intel.com>
>> Cc: Daniel Vetter <daniel@ffwll.ch>
>> Cc: Arun Siluvery <arun.siluvery@linux.intel.com>
>> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
>
> I feel slightly dubious in discarding the 1->0 transitions (as it just
> means that a shared function that was previously idle is now in use
> again), but if they truly do fluctuate randomly? then accumulating
> should mean we eventually escape.

Atleast with a tight bb start loop inside one batch one 
bit fluctulated.

We could improve further by having maximum amount of
transitions per bit?

>
> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
> -Chris

Thanks,
-Mika

>
> -- 
> Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/i915: Inspect subunit states on hangcheck
  2015-12-10 13:54   ` Chris Wilson
  2015-12-10 16:30     ` Mika Kuoppala
@ 2016-01-08 14:54     ` Mika Kuoppala
  2016-01-08 15:10       ` Chris Wilson
  1 sibling, 1 reply; 9+ messages in thread
From: Mika Kuoppala @ 2016-01-08 14:54 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx, miku

Chris Wilson <chris@chris-wilson.co.uk> writes:

> On Tue, Dec 01, 2015 at 05:56:12PM +0200, Mika Kuoppala wrote:
>> If head seems stuck and engine in question is rcs,
>> inspect subunit state transitions from undone to done,
>> before deciding that this really is a hang instead of limited
>> progress. Only account the transitions of subunits from
>> undone to done once, to prevent unstable subunit states
>> to keep us falsely active.
>> 
>> As this adds one extra steps to hangcheck heuristics,
>> before hang is declared, it adds 1500ms to to detect hang
>> for render ring to a total of 7500ms. We could sample
>> the subunit states on first head stuck condition but
>> decide not to do so only in order to mimic old behaviour. This
>> way the check order of promotion from seqno > atchd > instdone
>> is consistently done.
>> 
>> v2: Deal with unstable done states (Arun)
>>     Clear instdone progress on head and seqno movement (Chris)
>>     Report raw and accumulated instdone's in in debugfs (Chris)
>>     Return HANGCHECK_ACTIVE on undone->done
>> 
>> References: https://bugs.freedesktop.org/show_bug.cgi?id=93029
>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>> Cc: Dave Gordon <david.s.gordon@intel.com>
>> Cc: Daniel Vetter <daniel@ffwll.ch>
>> Cc: Arun Siluvery <arun.siluvery@linux.intel.com>
>> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
>
> I feel slightly dubious in discarding the 1->0 transitions (as it just
> means that a shared function that was previously idle is now in use
> again), but if they truly do fluctuate randomly? then accumulating
> should mean we eventually escape.
>
> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>

Queued for -next, thanks for the review. 

-Mika
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/i915: Inspect subunit states on hangcheck
  2016-01-08 14:54     ` Mika Kuoppala
@ 2016-01-08 15:10       ` Chris Wilson
  0 siblings, 0 replies; 9+ messages in thread
From: Chris Wilson @ 2016-01-08 15:10 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx, miku

On Fri, Jan 08, 2016 at 04:54:19PM +0200, Mika Kuoppala wrote:
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > On Tue, Dec 01, 2015 at 05:56:12PM +0200, Mika Kuoppala wrote:
> >> If head seems stuck and engine in question is rcs,
> >> inspect subunit state transitions from undone to done,
> >> before deciding that this really is a hang instead of limited
> >> progress. Only account the transitions of subunits from
> >> undone to done once, to prevent unstable subunit states
> >> to keep us falsely active.
> >> 
> >> As this adds one extra steps to hangcheck heuristics,
> >> before hang is declared, it adds 1500ms to to detect hang
> >> for render ring to a total of 7500ms. We could sample
> >> the subunit states on first head stuck condition but
> >> decide not to do so only in order to mimic old behaviour. This
> >> way the check order of promotion from seqno > atchd > instdone
> >> is consistently done.
> >> 
> >> v2: Deal with unstable done states (Arun)
> >>     Clear instdone progress on head and seqno movement (Chris)
> >>     Report raw and accumulated instdone's in in debugfs (Chris)
> >>     Return HANGCHECK_ACTIVE on undone->done
> >> 
> >> References: https://bugs.freedesktop.org/show_bug.cgi?id=93029
> >> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> >> Cc: Dave Gordon <david.s.gordon@intel.com>
> >> Cc: Daniel Vetter <daniel@ffwll.ch>
> >> Cc: Arun Siluvery <arun.siluvery@linux.intel.com>
> >> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> >
> > I feel slightly dubious in discarding the 1->0 transitions (as it just
> > means that a shared function that was previously idle is now in use
> > again), but if they truly do fluctuate randomly? then accumulating
> > should mean we eventually escape.
> >
> > Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
> 
> Queued for -next, thanks for the review. 

Hmm, you just reminded me that we have a problem with HEAD running wild
now as we only detect a loop when it goes past 1<<48 (and we only
increment the score when we loop).

Something like:
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index b2ef2d0c211b..4fe28a0301f2 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2949,21 +2949,15 @@ static enum intel_engine_hangcheck_action
 head_stuck(struct intel_engine_cs *ring, u64 acthd)
 {
        if (acthd != ring->hangcheck.acthd) {
-
                /* Clear subunit states on head movement */
                memset(ring->hangcheck.instdone, 0,
                       sizeof(ring->hangcheck.instdone));
 
-               if (acthd > ring->hangcheck.max_acthd) {
-                       ring->hangcheck.max_acthd = acthd;
-                       return HANGCHECK_ACTIVE;
-               }
-
                return HANGCHECK_ACTIVE_LOOP;
        }
 
        if (!subunits_stuck(ring))
-               return HANGCHECK_ACTIVE;
+               return HANGCHECK_ACTIVE_LOOP;
 
        return HANGCHECK_HUNG;
 }
@@ -3117,7 +3111,9 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
                         * attempts across multiple batches.
                         */
                        if (ring->hangcheck.score > 0)
-                               ring->hangcheck.score--;
+                               ring->hangcheck.score -= HUNG
+                       if (ring->hangcheck.score < 0)
+                               ring->hangcheck.score = 0;
 
                        /* Clear head and subunit states on seqno movement */
                        ring->hangcheck.acthd = ring->hangcheck.max_acthd = 0;

-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2016-01-08 15:11 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-12-01 12:17 [PATCH] drm/i915: Inspect subunit states on hangcheck Mika Kuoppala
2015-12-01 12:31 ` Chris Wilson
2015-12-01 12:56 ` Arun Siluvery
2015-12-01 12:58 ` Mika Kuoppala
2015-12-01 15:56 ` Mika Kuoppala
2015-12-10 13:54   ` Chris Wilson
2015-12-10 16:30     ` Mika Kuoppala
2016-01-08 14:54     ` Mika Kuoppala
2016-01-08 15:10       ` Chris Wilson

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.