All of lore.kernel.org
 help / color / mirror / Atom feed
From: Mika Kuoppala <mika.kuoppala@linux.intel.com>
To: intel-gfx@lists.freedesktop.org
Cc: ben@bwidawsk.net, miku@iki.fi
Subject: [PATCH 4/5] drm/i915: detect hang using per ring hangcheck_score
Date: Mon, 13 May 2013 16:32:12 +0300	[thread overview]
Message-ID: <1368451933-32571-4-git-send-email-mika.kuoppala@intel.com> (raw)
In-Reply-To: <1368451933-32571-1-git-send-email-mika.kuoppala@intel.com>

Keep track of ring seqno progress and if there are no
progress detected, declare hang. Use actual head (acthd)
to distinguish between ring stuck and batchbuffer looping
situation. Stuck ring will be kicked to trigger progress.

v2: use atchd to detect stuck ring from loop (Ben Widawsky)

v3: Use acthd to check when ring needs kicking.
Declare hang on third time in order to give time for
kick_ring to take effect.

Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_irq.c         |   80 ++++++++++++++++++-------------
 drivers/gpu/drm/i915/intel_ringbuffer.h |    2 +
 2 files changed, 49 insertions(+), 33 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 6efe3b0..5dde61f 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -681,7 +681,6 @@ static void notify_ring(struct drm_device *dev,
 
 	wake_up_all(&ring->irq_queue);
 	if (i915_enable_hangcheck) {
-		dev_priv->gpu_error.hangcheck_count = 0;
 		mod_timer(&dev_priv->gpu_error.hangcheck_timer,
 			  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
 	}
@@ -2381,61 +2380,76 @@ static bool i915_hangcheck_hung(struct drm_device *dev)
 
 /**
  * This is called when the chip hasn't reported back with completed
- * batchbuffers in a long time. The first time this is called we simply record
- * ACTHD. If ACTHD hasn't changed by the time the hangcheck timer elapses
- * again, we assume the chip is wedged and try to fix it.
+ * batchbuffers in a long time. We keep track per ring seqno progress and
+ * if there are no progress, hangcheck score for that ring is increased.
+ * Further, acthd is inspected to see if the ring is stuck. On stuck case
+ * we kick the ring. If we see no progress on three subsequent calls
+ * we assume chip is wedged and try to fix it by resetting the chip.
  */
 void i915_hangcheck_elapsed(unsigned long data)
 {
 	struct drm_device *dev = (struct drm_device *)data;
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	struct intel_ring_buffer *ring;
-	bool err = false, idle;
 	int i;
-	u32 seqno[I915_NUM_RINGS];
-	bool work_done;
+	int busy_count = 0, rings_hung = 0;
+	bool stuck[I915_NUM_RINGS];
 
 	if (!i915_enable_hangcheck)
 		return;
 
-	idle = true;
 	for_each_ring(ring, dev_priv, i) {
-		seqno[i] = ring->get_seqno(ring, false);
-		idle &= i915_hangcheck_ring_idle(ring, seqno[i], &err);
-	}
+		u32 seqno, acthd;
+		bool idle, err = false;
+
+		seqno = ring->get_seqno(ring, false);
+		acthd = intel_ring_get_active_head(ring);
+		idle = i915_hangcheck_ring_idle(ring, seqno, &err);
+		stuck[i] = ring->hangcheck.acthd == acthd;
+
+		if (idle) {
+			if (err)
+				ring->hangcheck.score += 2;
+			else
+				ring->hangcheck.score = 0;
+		} else {
+			busy_count++;
 
-	/* If all work is done then ACTHD clearly hasn't advanced. */
-	if (idle) {
-		if (err) {
-			if (i915_hangcheck_hung(dev))
-				return;
+			if (ring->hangcheck.seqno == seqno) {
+				ring->hangcheck.score++;
 
-			goto repeat;
+				/* Kick ring if stuck*/
+				if (stuck[i])
+					i915_hangcheck_ring_hung(ring);
+			} else {
+				ring->hangcheck.score = 0;
+			}
 		}
 
-		dev_priv->gpu_error.hangcheck_count = 0;
-		return;
+		ring->hangcheck.seqno = seqno;
+		ring->hangcheck.acthd = acthd;
 	}
 
-	work_done = false;
 	for_each_ring(ring, dev_priv, i) {
-		if (ring->hangcheck.seqno != seqno[i]) {
-			work_done = true;
-			ring->hangcheck.seqno = seqno[i];
+		if (ring->hangcheck.score > 2) {
+			rings_hung++;
+			DRM_ERROR("%s: %s on %s 0x%x\n", ring->name,
+				  stuck[i] ? "stuck" : "no progress",
+				  stuck[i] ? "addr" : "seqno",
+				  stuck[i] ? ring->hangcheck.acthd & HEAD_ADDR :
+				  ring->hangcheck.seqno);
 		}
 	}
 
-	if (!work_done) {
-		if (i915_hangcheck_hung(dev))
-			return;
-	} else {
-		dev_priv->gpu_error.hangcheck_count = 0;
-	}
+	if (rings_hung)
+		return i915_handle_error(dev, true);
 
-repeat:
-	/* Reset timer case chip hangs without another request being added */
-	mod_timer(&dev_priv->gpu_error.hangcheck_timer,
-		  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
+	if (busy_count)
+		/* Reset timer case chip hangs without another request
+		 * being added */
+		mod_timer(&dev_priv->gpu_error.hangcheck_timer,
+			  round_jiffies_up(jiffies +
+					   DRM_I915_HANGCHECK_JIFFIES));
 }
 
 /* drm_dma.h hooks
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index ef374a8..5886667 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -39,6 +39,8 @@ struct  intel_hw_status_page {
 
 struct intel_ring_hangcheck {
 	u32 seqno;
+	u32 acthd;
+	int score;
 };
 
 struct  intel_ring_buffer {
-- 
1.7.9.5

  parent reply	other threads:[~2013-05-13 13:32 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-05-13 13:32 [PATCH 1/5] drm/i915: pass seqno to i915_hangcheck_ring_idle Mika Kuoppala
2013-05-13 13:32 ` [PATCH 2/5] drm/i915: track ring progression using seqnos Mika Kuoppala
2013-05-17 17:40   ` Ben Widawsky
2013-05-24 14:16     ` [PATCH 02/14] " Mika Kuoppala
2013-05-13 13:32 ` [PATCH 3/5] drm/i915: introduce i915_hangcheck_ring_hung Mika Kuoppala
2013-05-24 18:26   ` Ben Widawsky
2013-05-25 11:26     ` Daniel Vetter
2013-05-13 13:32 ` Mika Kuoppala [this message]
2013-05-27  1:34   ` [PATCH 4/5] drm/i915: detect hang using per ring hangcheck_score Ben Widawsky
2013-05-27 17:42     ` Mika Kuoppala
2013-05-30  6:04     ` [PATCH] " Mika Kuoppala
2013-05-13 13:32 ` [PATCH 5/5] drm/i915: remove i915_hangcheck_hung Mika Kuoppala
2013-05-29 17:48   ` Ben Widawsky

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1368451933-32571-4-git-send-email-mika.kuoppala@intel.com \
    --to=mika.kuoppala@linux.intel.com \
    --cc=ben@bwidawsk.net \
    --cc=intel-gfx@lists.freedesktop.org \
    --cc=miku@iki.fi \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.