All of lore.kernel.org
 help / color / mirror / Atom feed
From: Mika Kuoppala <mika.kuoppala@linux.intel.com>
To: intel-gfx@lists.freedesktop.org
Subject: [RFC] [PATCH 1/7] drm/i915: detect infinite loops in hang check
Date: Mon,  4 Feb 2013 16:04:37 +0200	[thread overview]
Message-ID: <1359986683-29788-2-git-send-email-mika.kuoppala@intel.com> (raw)
In-Reply-To: <1359986683-29788-1-git-send-email-mika.kuoppala@intel.com>

If there was a batch chaining loop or an infinite loop in the batchbuffer,
we didn't detect it, because acthd and instdone kept changing in those
cases and so a hang was never declared.

To detect ring hangs, including infinite loops, keep track of ring
seqno progression.

Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h         |    3 -
 drivers/gpu/drm/i915/i915_irq.c         |  127 +++++++++++++++----------------
 drivers/gpu/drm/i915/intel_ringbuffer.h |    4 +
 3 files changed, 67 insertions(+), 67 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 984523d..4daab74 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -791,9 +791,6 @@ struct i915_gpu_error {
 #define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
 #define DRM_I915_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_I915_HANGCHECK_PERIOD)
 	struct timer_list hangcheck_timer;
-	int hangcheck_count;
-	uint32_t last_acthd[I915_NUM_RINGS];
-	uint32_t prev_instdone[I915_NUM_INSTDONE_REG];
 
 	/* For reset and error_state handling. */
 	spinlock_t lock;
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 5648d84..d0a9e21 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -356,7 +356,7 @@ static void notify_ring(struct drm_device *dev,
 
 	wake_up_all(&ring->irq_queue);
 	if (i915_enable_hangcheck) {
-		dev_priv->gpu_error.hangcheck_count = 0;
+		ring->hangcheck_count = 0;
 		mod_timer(&dev_priv->gpu_error.hangcheck_timer,
 			  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
 	}
@@ -1727,11 +1727,11 @@ ring_last_seqno(struct intel_ring_buffer *ring)
 			  struct drm_i915_gem_request, list)->seqno;
 }
 
-static bool i915_hangcheck_ring_idle(struct intel_ring_buffer *ring, bool *err)
+static bool i915_hangcheck_ring_idle(struct intel_ring_buffer *ring,
+				     u32 ring_seqno, bool *err)
 {
 	if (list_empty(&ring->request_list) ||
-	    i915_seqno_passed(ring->get_seqno(ring, false),
-			      ring_last_seqno(ring))) {
+	    i915_seqno_passed(ring_seqno, ring_last_seqno(ring))) {
 		/* Issue a wake-up to catch stuck h/w. */
 		if (waitqueue_active(&ring->irq_queue)) {
 			DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
@@ -1749,39 +1749,32 @@ static bool kick_ring(struct intel_ring_buffer *ring)
 	struct drm_device *dev = ring->dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	u32 tmp = I915_READ_CTL(ring);
+
+	ring->hangcheck_waiting = false;
+
 	if (tmp & RING_WAIT) {
 		DRM_ERROR("Kicking stuck wait on %s\n",
 			  ring->name);
 		I915_WRITE_CTL(ring, tmp);
-		return true;
+		ring->hangcheck_waiting = true;
 	}
-	return false;
-}
-
-static bool i915_hangcheck_hung(struct drm_device *dev)
-{
-	drm_i915_private_t *dev_priv = dev->dev_private;
 
-	if (dev_priv->gpu_error.hangcheck_count++ > 1) {
-		bool hung = true;
+	if ((INTEL_INFO(dev)->gen >= 6) && (tmp & RING_WAIT_SEMAPHORE))
+		ring->hangcheck_waiting = true;
 
-		DRM_ERROR("Hangcheck timer elapsed... GPU hung\n");
-		i915_handle_error(dev, true);
-
-		if (!IS_GEN2(dev)) {
-			struct intel_ring_buffer *ring;
-			int i;
-
-			/* Is the chip hanging on a WAIT_FOR_EVENT?
-			 * If so we can simply poke the RB_WAIT bit
-			 * and break the hang. This should work on
-			 * all but the second generation chipsets.
-			 */
-			for_each_ring(ring, dev_priv, i)
-				hung &= !kick_ring(ring);
-		}
+	return ring->hangcheck_waiting;
+}
 
-		return hung;
+static bool i915_hangcheck_ring_hung(struct drm_device *dev,
+				     struct intel_ring_buffer *ring)
+{
+	if (!IS_GEN2(dev)) {
+		/* Is the chip hanging on a WAIT_FOR_EVENT?
+		 * If so we can simply poke the RB_WAIT bit
+		 * and break the hang. This should work on
+		 * all but the second generation chipsets.
+		 */
+		return !kick_ring(ring);
 	}
 
 	return false;
@@ -1789,62 +1782,68 @@ static bool i915_hangcheck_hung(struct drm_device *dev)
 
 /**
  * This is called when the chip hasn't reported back with completed
- * batchbuffers in a long time. The first time this is called we simply record
- * ACTHD. If ACTHD hasn't changed by the time the hangcheck timer elapses
- * again, we assume the chip is wedged and try to fix it.
+ * batchbuffers in a long time. We record the current seqno for each ring and
+ * in subsequent calls we check if requests have been processed by each ring.
+ * If there is no progress on a specific ring, we declare it as hung.
  */
 void i915_hangcheck_elapsed(unsigned long data)
 {
 	struct drm_device *dev = (struct drm_device *)data;
 	drm_i915_private_t *dev_priv = dev->dev_private;
-	uint32_t acthd[I915_NUM_RINGS], instdone[I915_NUM_INSTDONE_REG];
 	struct intel_ring_buffer *ring;
-	bool err = false, idle;
+	bool ring_hung;
 	int i;
+	int busy_count = 0;
 
 	if (!i915_enable_hangcheck)
 		return;
 
-	memset(acthd, 0, sizeof(acthd));
-	idle = true;
 	for_each_ring(ring, dev_priv, i) {
-	    idle &= i915_hangcheck_ring_idle(ring, &err);
-	    acthd[i] = intel_ring_get_active_head(ring);
-	}
+		bool err = false, idle;
+		u32 seqno;
 
-	/* If all work is done then ACTHD clearly hasn't advanced. */
-	if (idle) {
-		if (err) {
-			if (i915_hangcheck_hung(dev))
-				return;
+		seqno = ring->get_seqno(ring, false);
+		idle = i915_hangcheck_ring_idle(ring, seqno, &err);
+
+		if (idle) {
+			if (err)
+				ring->hangcheck_count++;
+			else
+				ring->hangcheck_count = 0;
+		} else {
+			busy_count++;
 
-			goto repeat;
+			if (ring->hangcheck_seqno == seqno) {
+				ring->hangcheck_count++;
+
+				/* If the ring is not waiting, raise the
+				 * hung score */
+				if (i915_hangcheck_ring_hung(dev, ring))
+					ring->hangcheck_count++;
+			} else {
+				ring->hangcheck_count = 0;
+			}
 		}
 
-		dev_priv->gpu_error.hangcheck_count = 0;
-		return;
+		ring->hangcheck_seqno = seqno;
 	}
 
-	i915_get_extra_instdone(dev, instdone);
-	if (memcmp(dev_priv->gpu_error.last_acthd, acthd,
-		   sizeof(acthd)) == 0 &&
-	    memcmp(dev_priv->gpu_error.prev_instdone, instdone,
-		   sizeof(instdone)) == 0) {
-		if (i915_hangcheck_hung(dev))
-			return;
-	} else {
-		dev_priv->gpu_error.hangcheck_count = 0;
-
-		memcpy(dev_priv->gpu_error.last_acthd, acthd,
-		       sizeof(acthd));
-		memcpy(dev_priv->gpu_error.prev_instdone, instdone,
-		       sizeof(instdone));
+	ring_hung = false;
+	for_each_ring(ring, dev_priv, i) {
+		if (ring->hangcheck_count > 2) {
+			ring_hung = true;
+			DRM_ERROR("%s seems hung\n", ring->name);
+		}
 	}
 
-repeat:
+	if (ring_hung)
+		return i915_handle_error(dev, true);
+
 	/* Reset timer case chip hangs without another request being added */
-	mod_timer(&dev_priv->gpu_error.hangcheck_timer,
-		  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
+	if  (busy_count)
+		mod_timer(&dev_priv->gpu_error.hangcheck_timer,
+			  round_jiffies_up(jiffies +
+					   DRM_I915_HANGCHECK_JIFFIES));
 }
 
 /* drm_dma.h hooks
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index d66208c..7257252 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -137,6 +137,10 @@ struct  intel_ring_buffer {
 	struct i915_hw_context *default_context;
 	struct drm_i915_gem_object *last_context_obj;
 
+	int hangcheck_count;
+	u32 hangcheck_seqno;
+	bool hangcheck_waiting;
+
 	void *private;
 };
 
-- 
1.7.9.5

  reply	other threads:[~2013-02-04 14:02 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-02-04 14:04 [RFC] [PATCH 0/7] arb robustness enablers Mika Kuoppala
2013-02-04 14:04 ` Mika Kuoppala [this message]
2013-02-15  6:14   ` [RFC] [PATCH 1/7] drm/i915: detect infinite loops in hang check Ben Widawsky
2013-02-15  9:49     ` Daniel Vetter
2013-02-15 14:48     ` Mika Kuoppala
2013-02-04 14:04 ` [RFC] [PATCH 2/7] drm/i915: add struct i915_reset_stats Mika Kuoppala
2013-02-15  6:21   ` Ben Widawsky
2013-02-04 14:04 ` [RFC] [PATCH 3/7] drm/i915: add reset_status for hw_contexts Mika Kuoppala
2013-02-04 14:04 ` [RFC] [PATCH 4/7] drm/i915: add i915_get_reset_status_ioctl Mika Kuoppala
2013-02-05 10:47   ` [PATCH 4/7] drm/i915: add i915_get_reset_stats_ioctl Mika Kuoppala
2013-02-04 14:04 ` [RFC] [PATCH 5/7] drm/i915: add batch object and context to i915_add_request() Mika Kuoppala
2013-02-04 14:04 ` [RFC] [PATCH 6/7] drm/i915: reference count for i915_hw_contexts Mika Kuoppala
2013-02-15  5:55   ` Ben Widawsky
2013-02-04 14:04 ` [RFC] [PATCH 7/7] drm/i915: find guilty batch buffer on ring resets Mika Kuoppala
2013-02-07 14:11   ` Ville Syrjälä
2013-02-15 14:12     ` Mika Kuoppala
2013-02-15 14:55       ` Ville Syrjälä
2013-02-08 10:35 ` [RFC] [PATCH 0/7] arb robustness enablers Daniel Vetter

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1359986683-29788-2-git-send-email-mika.kuoppala@intel.com \
    --to=mika.kuoppala@linux.intel.com \
    --cc=intel-gfx@lists.freedesktop.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.