All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/5] drm/i915: pass seqno to i915_hangcheck_ring_idle
@ 2013-05-13 13:32 Mika Kuoppala
  2013-05-13 13:32 ` [PATCH 2/5] drm/i915: track ring progression using seqnos Mika Kuoppala
                   ` (3 more replies)
  0 siblings, 4 replies; 13+ messages in thread
From: Mika Kuoppala @ 2013-05-13 13:32 UTC (permalink / raw)
  To: intel-gfx; +Cc: ben, miku

In preparation for next commit, pass seqno as a parameter
to i915_hangcheck_ring_idle as it will be used inside
i915_hangcheck_elapsed.

Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_irq.c |   11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 879c4cc..0e5c9b0 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2274,11 +2274,11 @@ ring_last_seqno(struct intel_ring_buffer *ring)
 			  struct drm_i915_gem_request, list)->seqno;
 }
 
-static bool i915_hangcheck_ring_idle(struct intel_ring_buffer *ring, bool *err)
+static bool i915_hangcheck_ring_idle(struct intel_ring_buffer *ring,
+				     u32 ring_seqno, bool *err)
 {
 	if (list_empty(&ring->request_list) ||
-	    i915_seqno_passed(ring->get_seqno(ring, false),
-			      ring_last_seqno(ring))) {
+	    i915_seqno_passed(ring_seqno, ring_last_seqno(ring))) {
 		/* Issue a wake-up to catch stuck h/w. */
 		if (waitqueue_active(&ring->irq_queue)) {
 			DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
@@ -2395,7 +2395,10 @@ void i915_hangcheck_elapsed(unsigned long data)
 	memset(acthd, 0, sizeof(acthd));
 	idle = true;
 	for_each_ring(ring, dev_priv, i) {
-	    idle &= i915_hangcheck_ring_idle(ring, &err);
+		u32 seqno;
+
+		seqno = ring->get_seqno(ring, false);
+		idle &= i915_hangcheck_ring_idle(ring, seqno, &err);
 	    acthd[i] = intel_ring_get_active_head(ring);
 	}
 
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 2/5] drm/i915: track ring progression using seqnos
  2013-05-13 13:32 [PATCH 1/5] drm/i915: pass seqno to i915_hangcheck_ring_idle Mika Kuoppala
@ 2013-05-13 13:32 ` Mika Kuoppala
  2013-05-17 17:40   ` Ben Widawsky
  2013-05-13 13:32 ` [PATCH 3/5] drm/i915: introduce i915_hangcheck_ring_hung Mika Kuoppala
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 13+ messages in thread
From: Mika Kuoppala @ 2013-05-13 13:32 UTC (permalink / raw)
  To: intel-gfx; +Cc: ben, miku

Instead of relying in acthd, track ring seqno progression
to detect if ring has hung.

v2: put hangcheck stuff inside struct (Chris Wilson)

Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h         |    2 --
 drivers/gpu/drm/i915/i915_irq.c         |   30 +++++++++++++-----------------
 drivers/gpu/drm/i915/intel_ringbuffer.h |    6 ++++++
 3 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 14817de..db7cda9 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -834,8 +834,6 @@ struct i915_gpu_error {
 #define DRM_I915_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_I915_HANGCHECK_PERIOD)
 	struct timer_list hangcheck_timer;
 	int hangcheck_count;
-	uint32_t last_acthd[I915_NUM_RINGS];
-	uint32_t prev_instdone[I915_NUM_INSTDONE_REG];
 
 	/* For reset and error_state handling. */
 	spinlock_t lock;
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 0e5c9b0..004ad34 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2384,22 +2384,19 @@ void i915_hangcheck_elapsed(unsigned long data)
 {
 	struct drm_device *dev = (struct drm_device *)data;
 	drm_i915_private_t *dev_priv = dev->dev_private;
-	uint32_t acthd[I915_NUM_RINGS], instdone[I915_NUM_INSTDONE_REG];
 	struct intel_ring_buffer *ring;
 	bool err = false, idle;
 	int i;
+	u32 seqno[I915_NUM_RINGS];
+	bool work_done;
 
 	if (!i915_enable_hangcheck)
 		return;
 
-	memset(acthd, 0, sizeof(acthd));
 	idle = true;
 	for_each_ring(ring, dev_priv, i) {
-		u32 seqno;
-
-		seqno = ring->get_seqno(ring, false);
-		idle &= i915_hangcheck_ring_idle(ring, seqno, &err);
-	    acthd[i] = intel_ring_get_active_head(ring);
+		seqno[i] = ring->get_seqno(ring, false);
+		idle &= i915_hangcheck_ring_idle(ring, seqno[i], &err);
 	}
 
 	/* If all work is done then ACTHD clearly hasn't advanced. */
@@ -2415,20 +2412,19 @@ void i915_hangcheck_elapsed(unsigned long data)
 		return;
 	}
 
-	i915_get_extra_instdone(dev, instdone);
-	if (memcmp(dev_priv->gpu_error.last_acthd, acthd,
-		   sizeof(acthd)) == 0 &&
-	    memcmp(dev_priv->gpu_error.prev_instdone, instdone,
-		   sizeof(instdone)) == 0) {
+	work_done = false;
+	for_each_ring(ring, dev_priv, i) {
+		if (ring->hangcheck.seqno != seqno[i]) {
+			work_done = true;
+			ring->hangcheck.seqno = seqno[i];
+		}
+	}
+
+	if (!work_done) {
 		if (i915_hangcheck_hung(dev))
 			return;
 	} else {
 		dev_priv->gpu_error.hangcheck_count = 0;
-
-		memcpy(dev_priv->gpu_error.last_acthd, acthd,
-		       sizeof(acthd));
-		memcpy(dev_priv->gpu_error.prev_instdone, instdone,
-		       sizeof(instdone));
 	}
 
 repeat:
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index dac1614..ef374a8 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -37,6 +37,10 @@ struct  intel_hw_status_page {
 #define I915_READ_SYNC_0(ring) I915_READ(RING_SYNC_0((ring)->mmio_base))
 #define I915_READ_SYNC_1(ring) I915_READ(RING_SYNC_1((ring)->mmio_base))
 
+struct intel_ring_hangcheck {
+	u32 seqno;
+};
+
 struct  intel_ring_buffer {
 	const char	*name;
 	enum intel_ring_id {
@@ -137,6 +141,8 @@ struct  intel_ring_buffer {
 	struct i915_hw_context *default_context;
 	struct i915_hw_context *last_context;
 
+	struct intel_ring_hangcheck hangcheck;
+
 	void *private;
 };
 
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 3/5] drm/i915: introduce i915_hangcheck_ring_hung
  2013-05-13 13:32 [PATCH 1/5] drm/i915: pass seqno to i915_hangcheck_ring_idle Mika Kuoppala
  2013-05-13 13:32 ` [PATCH 2/5] drm/i915: track ring progression using seqnos Mika Kuoppala
@ 2013-05-13 13:32 ` Mika Kuoppala
  2013-05-24 18:26   ` Ben Widawsky
  2013-05-13 13:32 ` [PATCH 4/5] drm/i915: detect hang using per ring hangcheck_score Mika Kuoppala
  2013-05-13 13:32 ` [PATCH 5/5] drm/i915: remove i915_hangcheck_hung Mika Kuoppala
  3 siblings, 1 reply; 13+ messages in thread
From: Mika Kuoppala @ 2013-05-13 13:32 UTC (permalink / raw)
  To: intel-gfx; +Cc: ben, miku

In preparation to track per ring progress in hangcheck,
add i915_hangcheck_ring_hung.

v2: omit dev parameter (Ben Widawsky)

Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_irq.c |   29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 004ad34..6efe3b0 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2345,28 +2345,33 @@ static bool kick_ring(struct intel_ring_buffer *ring)
 	return false;
 }
 
+static bool i915_hangcheck_ring_hung(struct intel_ring_buffer *ring)
+{
+	if (IS_GEN2(ring->dev))
+		return false;
+
+	/* Is the chip hanging on a WAIT_FOR_EVENT?
+	 * If so we can simply poke the RB_WAIT bit
+	 * and break the hang. This should work on
+	 * all but the second generation chipsets.
+	 */
+	return !kick_ring(ring);
+}
+
 static bool i915_hangcheck_hung(struct drm_device *dev)
 {
 	drm_i915_private_t *dev_priv = dev->dev_private;
 
 	if (dev_priv->gpu_error.hangcheck_count++ > 1) {
 		bool hung = true;
+		struct intel_ring_buffer *ring;
+		int i;
 
 		DRM_ERROR("Hangcheck timer elapsed... GPU hung\n");
 		i915_handle_error(dev, true);
 
-		if (!IS_GEN2(dev)) {
-			struct intel_ring_buffer *ring;
-			int i;
-
-			/* Is the chip hanging on a WAIT_FOR_EVENT?
-			 * If so we can simply poke the RB_WAIT bit
-			 * and break the hang. This should work on
-			 * all but the second generation chipsets.
-			 */
-			for_each_ring(ring, dev_priv, i)
-				hung &= !kick_ring(ring);
-		}
+		for_each_ring(ring, dev_priv, i)
+			hung &= i915_hangcheck_ring_hung(ring);
 
 		return hung;
 	}
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 4/5] drm/i915: detect hang using per ring hangcheck_score
  2013-05-13 13:32 [PATCH 1/5] drm/i915: pass seqno to i915_hangcheck_ring_idle Mika Kuoppala
  2013-05-13 13:32 ` [PATCH 2/5] drm/i915: track ring progression using seqnos Mika Kuoppala
  2013-05-13 13:32 ` [PATCH 3/5] drm/i915: introduce i915_hangcheck_ring_hung Mika Kuoppala
@ 2013-05-13 13:32 ` Mika Kuoppala
  2013-05-27  1:34   ` Ben Widawsky
  2013-05-13 13:32 ` [PATCH 5/5] drm/i915: remove i915_hangcheck_hung Mika Kuoppala
  3 siblings, 1 reply; 13+ messages in thread
From: Mika Kuoppala @ 2013-05-13 13:32 UTC (permalink / raw)
  To: intel-gfx; +Cc: ben, miku

Keep track of ring seqno progress and if there are no
progress detected, declare hang. Use actual head (acthd)
to distinguish between ring stuck and batchbuffer looping
situation. Stuck ring will be kicked to trigger progress.

v2: use atchd to detect stuck ring from loop (Ben Widawsky)

v3: Use acthd to check when ring needs kicking.
Declare hang on third time in order to give time for
kick_ring to take effect.

Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_irq.c         |   80 ++++++++++++++++++-------------
 drivers/gpu/drm/i915/intel_ringbuffer.h |    2 +
 2 files changed, 49 insertions(+), 33 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 6efe3b0..5dde61f 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -681,7 +681,6 @@ static void notify_ring(struct drm_device *dev,
 
 	wake_up_all(&ring->irq_queue);
 	if (i915_enable_hangcheck) {
-		dev_priv->gpu_error.hangcheck_count = 0;
 		mod_timer(&dev_priv->gpu_error.hangcheck_timer,
 			  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
 	}
@@ -2381,61 +2380,76 @@ static bool i915_hangcheck_hung(struct drm_device *dev)
 
 /**
  * This is called when the chip hasn't reported back with completed
- * batchbuffers in a long time. The first time this is called we simply record
- * ACTHD. If ACTHD hasn't changed by the time the hangcheck timer elapses
- * again, we assume the chip is wedged and try to fix it.
+ * batchbuffers in a long time. We keep track per ring seqno progress and
+ * if there are no progress, hangcheck score for that ring is increased.
+ * Further, acthd is inspected to see if the ring is stuck. On stuck case
+ * we kick the ring. If we see no progress on three subsequent calls
+ * we assume chip is wedged and try to fix it by resetting the chip.
  */
 void i915_hangcheck_elapsed(unsigned long data)
 {
 	struct drm_device *dev = (struct drm_device *)data;
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	struct intel_ring_buffer *ring;
-	bool err = false, idle;
 	int i;
-	u32 seqno[I915_NUM_RINGS];
-	bool work_done;
+	int busy_count = 0, rings_hung = 0;
+	bool stuck[I915_NUM_RINGS];
 
 	if (!i915_enable_hangcheck)
 		return;
 
-	idle = true;
 	for_each_ring(ring, dev_priv, i) {
-		seqno[i] = ring->get_seqno(ring, false);
-		idle &= i915_hangcheck_ring_idle(ring, seqno[i], &err);
-	}
+		u32 seqno, acthd;
+		bool idle, err = false;
+
+		seqno = ring->get_seqno(ring, false);
+		acthd = intel_ring_get_active_head(ring);
+		idle = i915_hangcheck_ring_idle(ring, seqno, &err);
+		stuck[i] = ring->hangcheck.acthd == acthd;
+
+		if (idle) {
+			if (err)
+				ring->hangcheck.score += 2;
+			else
+				ring->hangcheck.score = 0;
+		} else {
+			busy_count++;
 
-	/* If all work is done then ACTHD clearly hasn't advanced. */
-	if (idle) {
-		if (err) {
-			if (i915_hangcheck_hung(dev))
-				return;
+			if (ring->hangcheck.seqno == seqno) {
+				ring->hangcheck.score++;
 
-			goto repeat;
+				/* Kick ring if stuck*/
+				if (stuck[i])
+					i915_hangcheck_ring_hung(ring);
+			} else {
+				ring->hangcheck.score = 0;
+			}
 		}
 
-		dev_priv->gpu_error.hangcheck_count = 0;
-		return;
+		ring->hangcheck.seqno = seqno;
+		ring->hangcheck.acthd = acthd;
 	}
 
-	work_done = false;
 	for_each_ring(ring, dev_priv, i) {
-		if (ring->hangcheck.seqno != seqno[i]) {
-			work_done = true;
-			ring->hangcheck.seqno = seqno[i];
+		if (ring->hangcheck.score > 2) {
+			rings_hung++;
+			DRM_ERROR("%s: %s on %s 0x%x\n", ring->name,
+				  stuck[i] ? "stuck" : "no progress",
+				  stuck[i] ? "addr" : "seqno",
+				  stuck[i] ? ring->hangcheck.acthd & HEAD_ADDR :
+				  ring->hangcheck.seqno);
 		}
 	}
 
-	if (!work_done) {
-		if (i915_hangcheck_hung(dev))
-			return;
-	} else {
-		dev_priv->gpu_error.hangcheck_count = 0;
-	}
+	if (rings_hung)
+		return i915_handle_error(dev, true);
 
-repeat:
-	/* Reset timer case chip hangs without another request being added */
-	mod_timer(&dev_priv->gpu_error.hangcheck_timer,
-		  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
+	if (busy_count)
+		/* Reset timer case chip hangs without another request
+		 * being added */
+		mod_timer(&dev_priv->gpu_error.hangcheck_timer,
+			  round_jiffies_up(jiffies +
+					   DRM_I915_HANGCHECK_JIFFIES));
 }
 
 /* drm_dma.h hooks
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index ef374a8..5886667 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -39,6 +39,8 @@ struct  intel_hw_status_page {
 
 struct intel_ring_hangcheck {
 	u32 seqno;
+	u32 acthd;
+	int score;
 };
 
 struct  intel_ring_buffer {
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 5/5] drm/i915: remove i915_hangcheck_hung
  2013-05-13 13:32 [PATCH 1/5] drm/i915: pass seqno to i915_hangcheck_ring_idle Mika Kuoppala
                   ` (2 preceding siblings ...)
  2013-05-13 13:32 ` [PATCH 4/5] drm/i915: detect hang using per ring hangcheck_score Mika Kuoppala
@ 2013-05-13 13:32 ` Mika Kuoppala
  2013-05-29 17:48   ` Ben Widawsky
  3 siblings, 1 reply; 13+ messages in thread
From: Mika Kuoppala @ 2013-05-13 13:32 UTC (permalink / raw)
  To: intel-gfx; +Cc: ben, miku

Rework of per ring hangcheck made this obsolete.

Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h |    1 -
 drivers/gpu/drm/i915/i915_irq.c |   21 ---------------------
 2 files changed, 22 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index db7cda9..3161d33 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -833,7 +833,6 @@ struct i915_gpu_error {
 #define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
 #define DRM_I915_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_I915_HANGCHECK_PERIOD)
 	struct timer_list hangcheck_timer;
-	int hangcheck_count;
 
 	/* For reset and error_state handling. */
 	spinlock_t lock;
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 5dde61f..f204d83 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2357,27 +2357,6 @@ static bool i915_hangcheck_ring_hung(struct intel_ring_buffer *ring)
 	return !kick_ring(ring);
 }
 
-static bool i915_hangcheck_hung(struct drm_device *dev)
-{
-	drm_i915_private_t *dev_priv = dev->dev_private;
-
-	if (dev_priv->gpu_error.hangcheck_count++ > 1) {
-		bool hung = true;
-		struct intel_ring_buffer *ring;
-		int i;
-
-		DRM_ERROR("Hangcheck timer elapsed... GPU hung\n");
-		i915_handle_error(dev, true);
-
-		for_each_ring(ring, dev_priv, i)
-			hung &= i915_hangcheck_ring_hung(ring);
-
-		return hung;
-	}
-
-	return false;
-}
-
 /**
  * This is called when the chip hasn't reported back with completed
  * batchbuffers in a long time. We keep track per ring seqno progress and
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/5] drm/i915: track ring progression using seqnos
  2013-05-13 13:32 ` [PATCH 2/5] drm/i915: track ring progression using seqnos Mika Kuoppala
@ 2013-05-17 17:40   ` Ben Widawsky
  2013-05-24 14:16     ` [PATCH 02/14] " Mika Kuoppala
  0 siblings, 1 reply; 13+ messages in thread
From: Ben Widawsky @ 2013-05-17 17:40 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx, miku

On Mon, May 13, 2013 at 04:32:10PM +0300, Mika Kuoppala wrote:
> Instead of relying in acthd, track ring seqno progression
> to detect if ring has hung.
> 
> v2: put hangcheck stuff inside struct (Chris Wilson)
> 
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h         |    2 --
>  drivers/gpu/drm/i915/i915_irq.c         |   30 +++++++++++++-----------------
>  drivers/gpu/drm/i915/intel_ringbuffer.h |    6 ++++++
>  3 files changed, 19 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 14817de..db7cda9 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -834,8 +834,6 @@ struct i915_gpu_error {
>  #define DRM_I915_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_I915_HANGCHECK_PERIOD)
>  	struct timer_list hangcheck_timer;
>  	int hangcheck_count;
> -	uint32_t last_acthd[I915_NUM_RINGS];
> -	uint32_t prev_instdone[I915_NUM_INSTDONE_REG];
>  
>  	/* For reset and error_state handling. */
>  	spinlock_t lock;
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 0e5c9b0..004ad34 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2384,22 +2384,19 @@ void i915_hangcheck_elapsed(unsigned long data)
>  {
>  	struct drm_device *dev = (struct drm_device *)data;
>  	drm_i915_private_t *dev_priv = dev->dev_private;
> -	uint32_t acthd[I915_NUM_RINGS], instdone[I915_NUM_INSTDONE_REG];
>  	struct intel_ring_buffer *ring;
>  	bool err = false, idle;
>  	int i;
> +	u32 seqno[I915_NUM_RINGS];
> +	bool work_done;
>  
>  	if (!i915_enable_hangcheck)
>  		return;
>  
> -	memset(acthd, 0, sizeof(acthd));
>  	idle = true;
>  	for_each_ring(ring, dev_priv, i) {
> -		u32 seqno;
> -
> -		seqno = ring->get_seqno(ring, false);
> -		idle &= i915_hangcheck_ring_idle(ring, seqno, &err);
> -	    acthd[i] = intel_ring_get_active_head(ring);
> +		seqno[i] = ring->get_seqno(ring, false);
> +		idle &= i915_hangcheck_ring_idle(ring, seqno[i], &err);
>  	}
>  
>  	/* If all work is done then ACTHD clearly hasn't advanced. */
> @@ -2415,20 +2412,19 @@ void i915_hangcheck_elapsed(unsigned long data)
>  		return;
>  	}
>  
> -	i915_get_extra_instdone(dev, instdone);
> -	if (memcmp(dev_priv->gpu_error.last_acthd, acthd,
> -		   sizeof(acthd)) == 0 &&
> -	    memcmp(dev_priv->gpu_error.prev_instdone, instdone,
> -		   sizeof(instdone)) == 0) {
> +	work_done = false;
> +	for_each_ring(ring, dev_priv, i) {
> +		if (ring->hangcheck.seqno != seqno[i]) {
> +			work_done = true;
> +			ring->hangcheck.seqno = seqno[i];
> +		}
> +	}
> +
> +	if (!work_done) {
>  		if (i915_hangcheck_hung(dev))
>  			return;
>  	} else {
>  		dev_priv->gpu_error.hangcheck_count = 0;
> -
> -		memcpy(dev_priv->gpu_error.last_acthd, acthd,
> -		       sizeof(acthd));
> -		memcpy(dev_priv->gpu_error.prev_instdone, instdone,
> -		       sizeof(instdone));
>  	}
>  
>  repeat:
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index dac1614..ef374a8 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -37,6 +37,10 @@ struct  intel_hw_status_page {
>  #define I915_READ_SYNC_0(ring) I915_READ(RING_SYNC_0((ring)->mmio_base))
>  #define I915_READ_SYNC_1(ring) I915_READ(RING_SYNC_1((ring)->mmio_base))
>  
> +struct intel_ring_hangcheck {
> +	u32 seqno;
> +};
> +

Shouldn't you initialize this thing in i915_gem_init_seqno()?

>  struct  intel_ring_buffer {
>  	const char	*name;
>  	enum intel_ring_id {
> @@ -137,6 +141,8 @@ struct  intel_ring_buffer {
>  	struct i915_hw_context *default_context;
>  	struct i915_hw_context *last_context;
>  
> +	struct intel_ring_hangcheck hangcheck;
> +
>  	void *private;
>  };
>  
> -- 
> 1.7.9.5
> 

-- 
Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH 02/14] drm/i915: track ring progression using seqnos
  2013-05-17 17:40   ` Ben Widawsky
@ 2013-05-24 14:16     ` Mika Kuoppala
  0 siblings, 0 replies; 13+ messages in thread
From: Mika Kuoppala @ 2013-05-24 14:16 UTC (permalink / raw)
  To: intel-gfx; +Cc: ben

Instead of relying in acthd, track ring seqno progression
to detect if ring has hung.

v2: put hangcheck stuff inside struct (Chris Wilson)

v3: initialize hangcheck.seqno (Ben Widawsky)

Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h         |    2 --
 drivers/gpu/drm/i915/i915_irq.c         |   30 +++++++++++++-----------------
 drivers/gpu/drm/i915/intel_ringbuffer.c |    1 +
 drivers/gpu/drm/i915/intel_ringbuffer.h |    6 ++++++
 4 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index f6419f4..c226f7e 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -841,8 +841,6 @@ struct i915_gpu_error {
 #define DRM_I915_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_I915_HANGCHECK_PERIOD)
 	struct timer_list hangcheck_timer;
 	int hangcheck_count;
-	uint32_t last_acthd[I915_NUM_RINGS];
-	uint32_t prev_instdone[I915_NUM_INSTDONE_REG];
 
 	/* For reset and error_state handling. */
 	spinlock_t lock;
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 0e5c9b0..004ad34 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2384,22 +2384,19 @@ void i915_hangcheck_elapsed(unsigned long data)
 {
 	struct drm_device *dev = (struct drm_device *)data;
 	drm_i915_private_t *dev_priv = dev->dev_private;
-	uint32_t acthd[I915_NUM_RINGS], instdone[I915_NUM_INSTDONE_REG];
 	struct intel_ring_buffer *ring;
 	bool err = false, idle;
 	int i;
+	u32 seqno[I915_NUM_RINGS];
+	bool work_done;
 
 	if (!i915_enable_hangcheck)
 		return;
 
-	memset(acthd, 0, sizeof(acthd));
 	idle = true;
 	for_each_ring(ring, dev_priv, i) {
-		u32 seqno;
-
-		seqno = ring->get_seqno(ring, false);
-		idle &= i915_hangcheck_ring_idle(ring, seqno, &err);
-	    acthd[i] = intel_ring_get_active_head(ring);
+		seqno[i] = ring->get_seqno(ring, false);
+		idle &= i915_hangcheck_ring_idle(ring, seqno[i], &err);
 	}
 
 	/* If all work is done then ACTHD clearly hasn't advanced. */
@@ -2415,20 +2412,19 @@ void i915_hangcheck_elapsed(unsigned long data)
 		return;
 	}
 
-	i915_get_extra_instdone(dev, instdone);
-	if (memcmp(dev_priv->gpu_error.last_acthd, acthd,
-		   sizeof(acthd)) == 0 &&
-	    memcmp(dev_priv->gpu_error.prev_instdone, instdone,
-		   sizeof(instdone)) == 0) {
+	work_done = false;
+	for_each_ring(ring, dev_priv, i) {
+		if (ring->hangcheck.seqno != seqno[i]) {
+			work_done = true;
+			ring->hangcheck.seqno = seqno[i];
+		}
+	}
+
+	if (!work_done) {
 		if (i915_hangcheck_hung(dev))
 			return;
 	} else {
 		dev_priv->gpu_error.hangcheck_count = 0;
-
-		memcpy(dev_priv->gpu_error.last_acthd, acthd,
-		       sizeof(acthd));
-		memcpy(dev_priv->gpu_error.prev_instdone, instdone,
-		       sizeof(instdone));
 	}
 
 repeat:
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 3d2c236..5698fae 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -1502,6 +1502,7 @@ void intel_ring_init_seqno(struct intel_ring_buffer *ring, u32 seqno)
 	}
 
 	ring->set_seqno(ring, seqno);
+	ring->hangcheck.seqno = seqno;
 }
 
 void intel_ring_advance(struct intel_ring_buffer *ring)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index dac1614..ef374a8 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -37,6 +37,10 @@ struct  intel_hw_status_page {
 #define I915_READ_SYNC_0(ring) I915_READ(RING_SYNC_0((ring)->mmio_base))
 #define I915_READ_SYNC_1(ring) I915_READ(RING_SYNC_1((ring)->mmio_base))
 
+struct intel_ring_hangcheck {
+	u32 seqno;
+};
+
 struct  intel_ring_buffer {
 	const char	*name;
 	enum intel_ring_id {
@@ -137,6 +141,8 @@ struct  intel_ring_buffer {
 	struct i915_hw_context *default_context;
 	struct i915_hw_context *last_context;
 
+	struct intel_ring_hangcheck hangcheck;
+
 	void *private;
 };
 
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH 3/5] drm/i915: introduce i915_hangcheck_ring_hung
  2013-05-13 13:32 ` [PATCH 3/5] drm/i915: introduce i915_hangcheck_ring_hung Mika Kuoppala
@ 2013-05-24 18:26   ` Ben Widawsky
  2013-05-25 11:26     ` Daniel Vetter
  0 siblings, 1 reply; 13+ messages in thread
From: Ben Widawsky @ 2013-05-24 18:26 UTC (permalink / raw)
  To: Mika Kuoppala, Daniel Vetter; +Cc: intel-gfx, miku

On Mon, May 13, 2013 at 04:32:11PM +0300, Mika Kuoppala wrote:
> In preparation to track per ring progress in hangcheck,
> add i915_hangcheck_ring_hung.
> 
> v2: omit dev parameter (Ben Widawsky)
> 
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>

With the updated patch 2, patches 1-3 are:
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>

I'm on vacation, so I won't get to 4 & 5 for a few days.

> ---
>  drivers/gpu/drm/i915/i915_irq.c |   29 +++++++++++++++++------------
>  1 file changed, 17 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 004ad34..6efe3b0 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2345,28 +2345,33 @@ static bool kick_ring(struct intel_ring_buffer *ring)
>  	return false;
>  }
>  
> +static bool i915_hangcheck_ring_hung(struct intel_ring_buffer *ring)
> +{
> +	if (IS_GEN2(ring->dev))
> +		return false;
> +
> +	/* Is the chip hanging on a WAIT_FOR_EVENT?
> +	 * If so we can simply poke the RB_WAIT bit
> +	 * and break the hang. This should work on
> +	 * all but the second generation chipsets.
> +	 */
> +	return !kick_ring(ring);
> +}
> +
>  static bool i915_hangcheck_hung(struct drm_device *dev)
>  {
>  	drm_i915_private_t *dev_priv = dev->dev_private;
>  
>  	if (dev_priv->gpu_error.hangcheck_count++ > 1) {
>  		bool hung = true;
> +		struct intel_ring_buffer *ring;
> +		int i;
>  
>  		DRM_ERROR("Hangcheck timer elapsed... GPU hung\n");
>  		i915_handle_error(dev, true);
>  
> -		if (!IS_GEN2(dev)) {
> -			struct intel_ring_buffer *ring;
> -			int i;
> -
> -			/* Is the chip hanging on a WAIT_FOR_EVENT?
> -			 * If so we can simply poke the RB_WAIT bit
> -			 * and break the hang. This should work on
> -			 * all but the second generation chipsets.
> -			 */
> -			for_each_ring(ring, dev_priv, i)
> -				hung &= !kick_ring(ring);
> -		}
> +		for_each_ring(ring, dev_priv, i)
> +			hung &= i915_hangcheck_ring_hung(ring);
>  
>  		return hung;
>  	}
> -- 
> 1.7.9.5
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 3/5] drm/i915: introduce i915_hangcheck_ring_hung
  2013-05-24 18:26   ` Ben Widawsky
@ 2013-05-25 11:26     ` Daniel Vetter
  0 siblings, 0 replies; 13+ messages in thread
From: Daniel Vetter @ 2013-05-25 11:26 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: intel-gfx, miku

On Fri, May 24, 2013 at 11:26:42AM -0700, Ben Widawsky wrote:
> On Mon, May 13, 2013 at 04:32:11PM +0300, Mika Kuoppala wrote:
> > In preparation to track per ring progress in hangcheck,
> > add i915_hangcheck_ring_hung.
> > 
> > v2: omit dev parameter (Ben Widawsky)
> > 
> > Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> 
> With the updated patch 2, patches 1-3 are:
> Reviewed-by: Ben Widawsky <ben@bwidawsk.net>

Merged the same to dinq, thanks.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 4/5] drm/i915: detect hang using per ring hangcheck_score
  2013-05-13 13:32 ` [PATCH 4/5] drm/i915: detect hang using per ring hangcheck_score Mika Kuoppala
@ 2013-05-27  1:34   ` Ben Widawsky
  2013-05-27 17:42     ` Mika Kuoppala
  2013-05-30  6:04     ` [PATCH] " Mika Kuoppala
  0 siblings, 2 replies; 13+ messages in thread
From: Ben Widawsky @ 2013-05-27  1:34 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx, miku

On Mon, May 13, 2013 at 04:32:12PM +0300, Mika Kuoppala wrote:
> Keep track of ring seqno progress and if there are no
> progress detected, declare hang. Use actual head (acthd)
> to distinguish between ring stuck and batchbuffer looping
> situation. Stuck ring will be kicked to trigger progress.
> 
> v2: use atchd to detect stuck ring from loop (Ben Widawsky)
> 
> v3: Use acthd to check when ring needs kicking.
> Declare hang on third time in order to give time for
> kick_ring to take effect.
> 
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
>
>
>
> ---
>  drivers/gpu/drm/i915/i915_irq.c         |   80 ++++++++++++++++++-------------
>  drivers/gpu/drm/i915/intel_ringbuffer.h |    2 +
>  2 files changed, 49 insertions(+), 33 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 6efe3b0..5dde61f 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -681,7 +681,6 @@ static void notify_ring(struct drm_device *dev,
>  
>  	wake_up_all(&ring->irq_queue);
>  	if (i915_enable_hangcheck) {
> -		dev_priv->gpu_error.hangcheck_count = 0;
>  		mod_timer(&dev_priv->gpu_error.hangcheck_timer,
>  			  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
>  	}
> @@ -2381,61 +2380,76 @@ static bool i915_hangcheck_hung(struct drm_device *dev)
>  
>  /**
>   * This is called when the chip hasn't reported back with completed
> - * batchbuffers in a long time. The first time this is called we simply record
> - * ACTHD. If ACTHD hasn't changed by the time the hangcheck timer elapses
> - * again, we assume the chip is wedged and try to fix it.
> + * batchbuffers in a long time. We keep track per ring seqno progress and
> + * if there are no progress, hangcheck score for that ring is increased.
> + * Further, acthd is inspected to see if the ring is stuck. On stuck case
> + * we kick the ring. If we see no progress on three subsequent calls
> + * we assume chip is wedged and try to fix it by resetting the chip.
>   */
>  void i915_hangcheck_elapsed(unsigned long data)
>  {
>  	struct drm_device *dev = (struct drm_device *)data;
>  	drm_i915_private_t *dev_priv = dev->dev_private;
>  	struct intel_ring_buffer *ring;
> -	bool err = false, idle;
>  	int i;
> -	u32 seqno[I915_NUM_RINGS];
> -	bool work_done;
> +	int busy_count = 0, rings_hung = 0;
> +	bool stuck[I915_NUM_RINGS];
>  
>  	if (!i915_enable_hangcheck)
>  		return;
>  
> -	idle = true;
>  	for_each_ring(ring, dev_priv, i) {
> -		seqno[i] = ring->get_seqno(ring, false);
> -		idle &= i915_hangcheck_ring_idle(ring, seqno[i], &err);
> -	}
> +		u32 seqno, acthd;
> +		bool idle, err = false;
> +
> +		seqno = ring->get_seqno(ring, false);
> +		acthd = intel_ring_get_active_head(ring);
> +		idle = i915_hangcheck_ring_idle(ring, seqno, &err);
> +		stuck[i] = ring->hangcheck.acthd == acthd;
> +
> +		if (idle) {
> +			if (err)
> +				ring->hangcheck.score += 2;
> +			else
> +				ring->hangcheck.score = 0;
> +		} else {
> +			busy_count++;
>  
> -	/* If all work is done then ACTHD clearly hasn't advanced. */
> -	if (idle) {
> -		if (err) {
> -			if (i915_hangcheck_hung(dev))
> -				return;
> +			if (ring->hangcheck.seqno == seqno) {
> +				ring->hangcheck.score++;
>  
> -			goto repeat;
> +				/* Kick ring if stuck*/
> +				if (stuck[i])
> +					i915_hangcheck_ring_hung(ring);
> +			} else {
> +				ring->hangcheck.score = 0;
> +			}
>  		}
>  
> -		dev_priv->gpu_error.hangcheck_count = 0;
> -		return;
> +		ring->hangcheck.seqno = seqno;
> +		ring->hangcheck.acthd = acthd;
>  	}
>  
> -	work_done = false;
>  	for_each_ring(ring, dev_priv, i) {
> -		if (ring->hangcheck.seqno != seqno[i]) {
> -			work_done = true;
> -			ring->hangcheck.seqno = seqno[i];
> +		if (ring->hangcheck.score > 2) {
> +			rings_hung++;
> +			DRM_ERROR("%s: %s on %s 0x%x\n", ring->name,
> +				  stuck[i] ? "stuck" : "no progress",
> +				  stuck[i] ? "addr" : "seqno",
> +				  stuck[i] ? ring->hangcheck.acthd & HEAD_ADDR :
> +				  ring->hangcheck.seqno);
>  		}
>  	}
>  
> -	if (!work_done) {
> -		if (i915_hangcheck_hung(dev))
> -			return;
> -	} else {
> -		dev_priv->gpu_error.hangcheck_count = 0;
> -	}
> +	if (rings_hung)
> +		return i915_handle_error(dev, true);
>  
> -repeat:
> -	/* Reset timer case chip hangs without another request being added */
> -	mod_timer(&dev_priv->gpu_error.hangcheck_timer,
> -		  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
> +	if (busy_count)
> +		/* Reset timer case chip hangs without another request
> +		 * being added */
> +		mod_timer(&dev_priv->gpu_error.hangcheck_timer,
> +			  round_jiffies_up(jiffies +
> +					   DRM_I915_HANGCHECK_JIFFIES));
>  }
>  
>  /* drm_dma.h hooks
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index ef374a8..5886667 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -39,6 +39,8 @@ struct  intel_hw_status_page {
>  
>  struct intel_ring_hangcheck {
>  	u32 seqno;
> +	u32 acthd;
> +	int score;
>  };
>  
>  struct  intel_ring_buffer {
>

Maybe I'm just stating the functional changes of the patch, but in case
they were unintended here is what I see as potential issues:

1. If ring B is waiting on ring A via semaphore, and ring A is making
   progress, albeit slowly - the hangcheck will fire. The check will
   determine that A is moving, however ring B will appear hung because
   the ACTHD doesn't move. I honestly can't say if that's actually a
   realistic problem to hit it probably implies the timeout value is too
   low.

2. There's also another corner case on the kick. If the seqno = 2
   (though not stuck), and on the 3rd hangcheck, the ring is stuck, and
   we try to kick it... we don't actually try to find out if the kick
   helped.

#2 doesn't worry me much, just wanted to point it out. The question then
is, did I make #1 up? If not, what should we do?

-- 
Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 4/5] drm/i915: detect hang using per ring hangcheck_score
  2013-05-27  1:34   ` Ben Widawsky
@ 2013-05-27 17:42     ` Mika Kuoppala
  2013-05-30  6:04     ` [PATCH] " Mika Kuoppala
  1 sibling, 0 replies; 13+ messages in thread
From: Mika Kuoppala @ 2013-05-27 17:42 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: intel-gfx, miku

Ben Widawsky <ben@bwidawsk.net> writes:

> On Mon, May 13, 2013 at 04:32:12PM +0300, Mika Kuoppala wrote:
>> Keep track of ring seqno progress and if there are no
>> progress detected, declare hang. Use actual head (acthd)
>> to distinguish between ring stuck and batchbuffer looping
>> situation. Stuck ring will be kicked to trigger progress.
>> 
>> v2: use atchd to detect stuck ring from loop (Ben Widawsky)
>> 
>> v3: Use acthd to check when ring needs kicking.
>> Declare hang on third time in order to give time for
>> kick_ring to take effect.
>> 
>> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
>>
>>
>>
>> ---
>>  drivers/gpu/drm/i915/i915_irq.c         |   80 ++++++++++++++++++-------------
>>  drivers/gpu/drm/i915/intel_ringbuffer.h |    2 +
>>  2 files changed, 49 insertions(+), 33 deletions(-)
>> 
>> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
>> index 6efe3b0..5dde61f 100644
>> --- a/drivers/gpu/drm/i915/i915_irq.c
>> +++ b/drivers/gpu/drm/i915/i915_irq.c
>> @@ -681,7 +681,6 @@ static void notify_ring(struct drm_device *dev,
>>  
>>  	wake_up_all(&ring->irq_queue);
>>  	if (i915_enable_hangcheck) {
>> -		dev_priv->gpu_error.hangcheck_count = 0;
>>  		mod_timer(&dev_priv->gpu_error.hangcheck_timer,
>>  			  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
>>  	}
>> @@ -2381,61 +2380,76 @@ static bool i915_hangcheck_hung(struct drm_device *dev)
>>  
>>  /**
>>   * This is called when the chip hasn't reported back with completed
>> - * batchbuffers in a long time. The first time this is called we simply record
>> - * ACTHD. If ACTHD hasn't changed by the time the hangcheck timer elapses
>> - * again, we assume the chip is wedged and try to fix it.
>> + * batchbuffers in a long time. We keep track per ring seqno progress and
>> + * if there are no progress, hangcheck score for that ring is increased.
>> + * Further, acthd is inspected to see if the ring is stuck. On stuck case
>> + * we kick the ring. If we see no progress on three subsequent calls
>> + * we assume chip is wedged and try to fix it by resetting the chip.
>>   */
>>  void i915_hangcheck_elapsed(unsigned long data)
>>  {
>>  	struct drm_device *dev = (struct drm_device *)data;
>>  	drm_i915_private_t *dev_priv = dev->dev_private;
>>  	struct intel_ring_buffer *ring;
>> -	bool err = false, idle;
>>  	int i;
>> -	u32 seqno[I915_NUM_RINGS];
>> -	bool work_done;
>> +	int busy_count = 0, rings_hung = 0;
>> +	bool stuck[I915_NUM_RINGS];
>>  
>>  	if (!i915_enable_hangcheck)
>>  		return;
>>  
>> -	idle = true;
>>  	for_each_ring(ring, dev_priv, i) {
>> -		seqno[i] = ring->get_seqno(ring, false);
>> -		idle &= i915_hangcheck_ring_idle(ring, seqno[i], &err);
>> -	}
>> +		u32 seqno, acthd;
>> +		bool idle, err = false;
>> +
>> +		seqno = ring->get_seqno(ring, false);
>> +		acthd = intel_ring_get_active_head(ring);
>> +		idle = i915_hangcheck_ring_idle(ring, seqno, &err);
>> +		stuck[i] = ring->hangcheck.acthd == acthd;
>> +
>> +		if (idle) {
>> +			if (err)
>> +				ring->hangcheck.score += 2;
>> +			else
>> +				ring->hangcheck.score = 0;
>> +		} else {
>> +			busy_count++;
>>  
>> -	/* If all work is done then ACTHD clearly hasn't advanced. */
>> -	if (idle) {
>> -		if (err) {
>> -			if (i915_hangcheck_hung(dev))
>> -				return;
>> +			if (ring->hangcheck.seqno == seqno) {
>> +				ring->hangcheck.score++;
>>  
>> -			goto repeat;
>> +				/* Kick ring if stuck*/
>> +				if (stuck[i])
>> +					i915_hangcheck_ring_hung(ring);
>> +			} else {
>> +				ring->hangcheck.score = 0;
>> +			}
>>  		}
>>  
>> -		dev_priv->gpu_error.hangcheck_count = 0;
>> -		return;
>> +		ring->hangcheck.seqno = seqno;
>> +		ring->hangcheck.acthd = acthd;
>>  	}
>>  
>> -	work_done = false;
>>  	for_each_ring(ring, dev_priv, i) {
>> -		if (ring->hangcheck.seqno != seqno[i]) {
>> -			work_done = true;
>> -			ring->hangcheck.seqno = seqno[i];
>> +		if (ring->hangcheck.score > 2) {
>> +			rings_hung++;
>> +			DRM_ERROR("%s: %s on %s 0x%x\n", ring->name,
>> +				  stuck[i] ? "stuck" : "no progress",
>> +				  stuck[i] ? "addr" : "seqno",
>> +				  stuck[i] ? ring->hangcheck.acthd & HEAD_ADDR :
>> +				  ring->hangcheck.seqno);
>>  		}
>>  	}
>>  
>> -	if (!work_done) {
>> -		if (i915_hangcheck_hung(dev))
>> -			return;
>> -	} else {
>> -		dev_priv->gpu_error.hangcheck_count = 0;
>> -	}
>> +	if (rings_hung)
>> +		return i915_handle_error(dev, true);
>>  
>> -repeat:
>> -	/* Reset timer case chip hangs without another request being added */
>> -	mod_timer(&dev_priv->gpu_error.hangcheck_timer,
>> -		  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
>> +	if (busy_count)
>> +		/* Reset timer case chip hangs without another request
>> +		 * being added */
>> +		mod_timer(&dev_priv->gpu_error.hangcheck_timer,
>> +			  round_jiffies_up(jiffies +
>> +					   DRM_I915_HANGCHECK_JIFFIES));
>>  }
>>  
>>  /* drm_dma.h hooks
>> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
>> index ef374a8..5886667 100644
>> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
>> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
>> @@ -39,6 +39,8 @@ struct  intel_hw_status_page {
>>  
>>  struct intel_ring_hangcheck {
>>  	u32 seqno;
>> +	u32 acthd;
>> +	int score;
>>  };
>>  
>>  struct  intel_ring_buffer {
>>
>
> Maybe I'm just stating the functional changes of the patch, but in case
> they were unintended here is what I see as potential issues:
>
> 1. If ring B is waiting on ring A via semaphore, and ring A is making
>    progress, albeit slowly - the hangcheck will fire. The check will
>    determine that A is moving, however ring B will appear hung because
>    the ACTHD doesn't move. I honestly can't say if that's actually a
>    realistic problem to hit it probably implies the timeout value is too
>    low.
>
> 2. There's also another corner case on the kick. If the seqno = 2
>    (though not stuck), and on the 3rd hangcheck, the ring is stuck, and
>    we try to kick it... we don't actually try to find out if the kick
>    helped.
>
> #2 doesn't worry me much, just wanted to point it out. The question then
> is, did I make #1 up? If not, what should we do?

#1:
The concern is real. If any single batch takes more than 4.5seconds
to complete, hang will be declared, no matter what is the underlying
reason: loop or hang. This fact should be included in the commit
message. I will update the commit message.

We could do more than error message triaging with acthd, but I wanted
to declare hang on the basis on 'no real progress has been made in
4.5seconds'. Early in the RFC series on these patches, the
hangcheck.score was incremented in different quantities depending on
if it was a wait or nonwait on ring, but it made hang declaration
time non constant and was dropped.

Also related: ring which was waiting will be marked as such and avoid
the investigation when guilty batchbuffer will tracked on postmortem
analysis

#2:
I guess you meant if hangcheck.score == 2. Yes, at first run
we store the seqno if no progress is made, second run we kick it and
third and final run we kick it and declare hang. Extra kick yes, but
second kick should have already resolved situation if it was resolvable.

Thanks,
--Mika

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 5/5] drm/i915: remove i915_hangcheck_hung
  2013-05-13 13:32 ` [PATCH 5/5] drm/i915: remove i915_hangcheck_hung Mika Kuoppala
@ 2013-05-29 17:48   ` Ben Widawsky
  0 siblings, 0 replies; 13+ messages in thread
From: Ben Widawsky @ 2013-05-29 17:48 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx, miku

On Mon, May 13, 2013 at 04:32:13PM +0300, Mika Kuoppala wrote:
> Rework of per ring hangcheck made this obsolete.
> 
>
>
Probably could have put this in the last patch. Assuming you add the
comment on #4, and Daniel/Chris are okay with the proposed functional
change, 4 & 5 are:
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>

> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h |    1 -
>  drivers/gpu/drm/i915/i915_irq.c |   21 ---------------------
>  2 files changed, 22 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index db7cda9..3161d33 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -833,7 +833,6 @@ struct i915_gpu_error {
>  #define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
>  #define DRM_I915_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_I915_HANGCHECK_PERIOD)
>  	struct timer_list hangcheck_timer;
> -	int hangcheck_count;
>  
>  	/* For reset and error_state handling. */
>  	spinlock_t lock;
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 5dde61f..f204d83 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2357,27 +2357,6 @@ static bool i915_hangcheck_ring_hung(struct intel_ring_buffer *ring)
>  	return !kick_ring(ring);
>  }
>  
> -static bool i915_hangcheck_hung(struct drm_device *dev)
> -{
> -	drm_i915_private_t *dev_priv = dev->dev_private;
> -
> -	if (dev_priv->gpu_error.hangcheck_count++ > 1) {
> -		bool hung = true;
> -		struct intel_ring_buffer *ring;
> -		int i;
> -
> -		DRM_ERROR("Hangcheck timer elapsed... GPU hung\n");
> -		i915_handle_error(dev, true);
> -
> -		for_each_ring(ring, dev_priv, i)
> -			hung &= i915_hangcheck_ring_hung(ring);
> -
> -		return hung;
> -	}
> -
> -	return false;
> -}
> -
>  /**
>   * This is called when the chip hasn't reported back with completed
>   * batchbuffers in a long time. We keep track per ring seqno progress and
> -- 
> 1.7.9.5
> 

-- 
Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH] drm/i915: detect hang using per ring hangcheck_score
  2013-05-27  1:34   ` Ben Widawsky
  2013-05-27 17:42     ` Mika Kuoppala
@ 2013-05-30  6:04     ` Mika Kuoppala
  1 sibling, 0 replies; 13+ messages in thread
From: Mika Kuoppala @ 2013-05-30  6:04 UTC (permalink / raw)
  To: intel-gfx; +Cc: miku

Keep track of ring seqno progress and if there are no
progress detected, declare hang. Use actual head (acthd)
to distinguish between ring stuck and batchbuffer looping
situation. Stuck ring will be kicked to trigger progress.

This commit adds a hard limit for batchbuffer completion time.
If batchbuffer completion time is more than 4.5 seconds,
the gpu will be declared hung.

v2: use atchd to detect stuck ring from loop (Ben Widawsky)

v3: Use acthd to check when ring needs kicking.
Declare hang on third time in order to give time for
kick_ring to take effect.

v4: Update commit msg

Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_irq.c         |   80 ++++++++++++++++++-------------
 drivers/gpu/drm/i915/intel_ringbuffer.h |    2 +
 2 files changed, 49 insertions(+), 33 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 557acd3..c4492bf 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -683,7 +683,6 @@ static void notify_ring(struct drm_device *dev,
 
 	wake_up_all(&ring->irq_queue);
 	if (i915_enable_hangcheck) {
-		dev_priv->gpu_error.hangcheck_count = 0;
 		mod_timer(&dev_priv->gpu_error.hangcheck_timer,
 			  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
 	}
@@ -2385,61 +2384,76 @@ static bool i915_hangcheck_hung(struct drm_device *dev)
 
 /**
  * This is called when the chip hasn't reported back with completed
- * batchbuffers in a long time. The first time this is called we simply record
- * ACTHD. If ACTHD hasn't changed by the time the hangcheck timer elapses
- * again, we assume the chip is wedged and try to fix it.
+ * batchbuffers in a long time. We keep track per ring seqno progress and
+ * if there are no progress, hangcheck score for that ring is increased.
+ * Further, acthd is inspected to see if the ring is stuck. On stuck case
+ * we kick the ring. If we see no progress on three subsequent calls
+ * we assume chip is wedged and try to fix it by resetting the chip.
  */
 void i915_hangcheck_elapsed(unsigned long data)
 {
 	struct drm_device *dev = (struct drm_device *)data;
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	struct intel_ring_buffer *ring;
-	bool err = false, idle;
 	int i;
-	u32 seqno[I915_NUM_RINGS];
-	bool work_done;
+	int busy_count = 0, rings_hung = 0;
+	bool stuck[I915_NUM_RINGS];
 
 	if (!i915_enable_hangcheck)
 		return;
 
-	idle = true;
 	for_each_ring(ring, dev_priv, i) {
-		seqno[i] = ring->get_seqno(ring, false);
-		idle &= i915_hangcheck_ring_idle(ring, seqno[i], &err);
-	}
+		u32 seqno, acthd;
+		bool idle, err = false;
+
+		seqno = ring->get_seqno(ring, false);
+		acthd = intel_ring_get_active_head(ring);
+		idle = i915_hangcheck_ring_idle(ring, seqno, &err);
+		stuck[i] = ring->hangcheck.acthd == acthd;
+
+		if (idle) {
+			if (err)
+				ring->hangcheck.score += 2;
+			else
+				ring->hangcheck.score = 0;
+		} else {
+			busy_count++;
 
-	/* If all work is done then ACTHD clearly hasn't advanced. */
-	if (idle) {
-		if (err) {
-			if (i915_hangcheck_hung(dev))
-				return;
+			if (ring->hangcheck.seqno == seqno) {
+				ring->hangcheck.score++;
 
-			goto repeat;
+				/* Kick ring if stuck*/
+				if (stuck[i])
+					i915_hangcheck_ring_hung(ring);
+			} else {
+				ring->hangcheck.score = 0;
+			}
 		}
 
-		dev_priv->gpu_error.hangcheck_count = 0;
-		return;
+		ring->hangcheck.seqno = seqno;
+		ring->hangcheck.acthd = acthd;
 	}
 
-	work_done = false;
 	for_each_ring(ring, dev_priv, i) {
-		if (ring->hangcheck.seqno != seqno[i]) {
-			work_done = true;
-			ring->hangcheck.seqno = seqno[i];
+		if (ring->hangcheck.score > 2) {
+			rings_hung++;
+			DRM_ERROR("%s: %s on %s 0x%x\n", ring->name,
+				  stuck[i] ? "stuck" : "no progress",
+				  stuck[i] ? "addr" : "seqno",
+				  stuck[i] ? ring->hangcheck.acthd & HEAD_ADDR :
+				  ring->hangcheck.seqno);
 		}
 	}
 
-	if (!work_done) {
-		if (i915_hangcheck_hung(dev))
-			return;
-	} else {
-		dev_priv->gpu_error.hangcheck_count = 0;
-	}
+	if (rings_hung)
+		return i915_handle_error(dev, true);
 
-repeat:
-	/* Reset timer case chip hangs without another request being added */
-	mod_timer(&dev_priv->gpu_error.hangcheck_timer,
-		  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
+	if (busy_count)
+		/* Reset timer case chip hangs without another request
+		 * being added */
+		mod_timer(&dev_priv->gpu_error.hangcheck_timer,
+			  round_jiffies_up(jiffies +
+					   DRM_I915_HANGCHECK_JIFFIES));
 }
 
 /* drm_dma.h hooks
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index ef374a8..5886667 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -39,6 +39,8 @@ struct  intel_hw_status_page {
 
 struct intel_ring_hangcheck {
 	u32 seqno;
+	u32 acthd;
+	int score;
 };
 
 struct  intel_ring_buffer {
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2013-05-30  6:05 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-05-13 13:32 [PATCH 1/5] drm/i915: pass seqno to i915_hangcheck_ring_idle Mika Kuoppala
2013-05-13 13:32 ` [PATCH 2/5] drm/i915: track ring progression using seqnos Mika Kuoppala
2013-05-17 17:40   ` Ben Widawsky
2013-05-24 14:16     ` [PATCH 02/14] " Mika Kuoppala
2013-05-13 13:32 ` [PATCH 3/5] drm/i915: introduce i915_hangcheck_ring_hung Mika Kuoppala
2013-05-24 18:26   ` Ben Widawsky
2013-05-25 11:26     ` Daniel Vetter
2013-05-13 13:32 ` [PATCH 4/5] drm/i915: detect hang using per ring hangcheck_score Mika Kuoppala
2013-05-27  1:34   ` Ben Widawsky
2013-05-27 17:42     ` Mika Kuoppala
2013-05-30  6:04     ` [PATCH] " Mika Kuoppala
2013-05-13 13:32 ` [PATCH 5/5] drm/i915: remove i915_hangcheck_hung Mika Kuoppala
2013-05-29 17:48   ` Ben Widawsky

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.