All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] drm/i915: Only slightly increment hangcheck score if we succesfully kick a ring
@ 2013-06-06  9:45 Chris Wilson
  2013-06-06  9:45 ` [PATCH 2/2] drm/i915: Don't count semaphore waits towards a stuck ring Chris Wilson
                   ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: Chris Wilson @ 2013-06-06  9:45 UTC (permalink / raw)
  To: intel-gfx; +Cc: Ben Widawsky

After kicking a ring, it should be free to make progress again and so
should not be accused of being stuck until hangcheck fires once more. In
order to catch a denial-of-service within a batch or across multiple
batches, we still do increment the hangcheck score - just not as
severely so that it takes multiple kicks to fail.

This should address part of Ben's justified criticism of

commit 05407ff889ceebe383aa5907219f86582ef96b72
Author: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Date:   Thu May 30 09:04:29 2013 +0300

    drm/i915: detect hang using per ring hangcheck_score

"There's also another corner case on the kick. If the seqno = 2
(though not stuck), and on the 3rd hangcheck, the ring is stuck, and
we try to kick it... we don't actually try to find out if the kick
helped."

v2: Make sure we catch DoS attempts with batches full of invalid WAITs.

References: https://bugs.freedesktop.org/show_bug.cgi?id=65394
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Ben Widawsky <ben@bwidawsk.net>
---
 drivers/gpu/drm/i915/i915_irq.c |  105 ++++++++++++++++++++-------------------
 1 file changed, 53 insertions(+), 52 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 85694d7..2d1890d 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2324,21 +2324,11 @@ ring_last_seqno(struct intel_ring_buffer *ring)
 			  struct drm_i915_gem_request, list)->seqno;
 }
 
-static bool i915_hangcheck_ring_idle(struct intel_ring_buffer *ring,
-				     u32 ring_seqno, bool *err)
-{
-	if (list_empty(&ring->request_list) ||
-	    i915_seqno_passed(ring_seqno, ring_last_seqno(ring))) {
-		/* Issue a wake-up to catch stuck h/w. */
-		if (waitqueue_active(&ring->irq_queue)) {
-			DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
-				  ring->name);
-			wake_up_all(&ring->irq_queue);
-			*err = true;
-		}
-		return true;
-	}
-	return false;
+static bool
+ring_idle(struct intel_ring_buffer *ring, u32 seqno)
+{
+	return (list_empty(&ring->request_list) ||
+		i915_seqno_passed(seqno, ring_last_seqno(ring)));
 }
 
 static bool semaphore_passed(struct intel_ring_buffer *ring)
@@ -2372,16 +2362,26 @@ static bool semaphore_passed(struct intel_ring_buffer *ring)
 				 ioread32(ring->virtual_start+acthd+4)+1);
 }
 
-static bool kick_ring(struct intel_ring_buffer *ring)
+static bool ring_hung(struct intel_ring_buffer *ring)
 {
 	struct drm_device *dev = ring->dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
-	u32 tmp = I915_READ_CTL(ring);
+	u32 tmp;
+
+	if (IS_GEN2(dev))
+		return true;
+
+	/* Is the chip hanging on a WAIT_FOR_EVENT?
+	 * If so we can simply poke the RB_WAIT bit
+	 * and break the hang. This should work on
+	 * all but the second generation chipsets.
+	 */
+	tmp = I915_READ_CTL(ring);
 	if (tmp & RING_WAIT) {
 		DRM_ERROR("Kicking stuck wait on %s\n",
 			  ring->name);
 		I915_WRITE_CTL(ring, tmp);
-		return true;
+		return false;
 	}
 
 	if (INTEL_INFO(dev)->gen >= 6 &&
@@ -2390,22 +2390,10 @@ static bool kick_ring(struct intel_ring_buffer *ring)
 		DRM_ERROR("Kicking stuck semaphore on %s\n",
 			  ring->name);
 		I915_WRITE_CTL(ring, tmp);
-		return true;
-	}
-	return false;
-}
-
-static bool i915_hangcheck_ring_hung(struct intel_ring_buffer *ring)
-{
-	if (IS_GEN2(ring->dev))
 		return false;
+	}
 
-	/* Is the chip hanging on a WAIT_FOR_EVENT?
-	 * If so we can simply poke the RB_WAIT bit
-	 * and break the hang. This should work on
-	 * all but the second generation chipsets.
-	 */
-	return !kick_ring(ring);
+	return true;
 }
 
 /**
@@ -2423,37 +2411,50 @@ void i915_hangcheck_elapsed(unsigned long data)
 	struct intel_ring_buffer *ring;
 	int i;
 	int busy_count = 0, rings_hung = 0;
-	bool stuck[I915_NUM_RINGS];
+	bool stuck[I915_NUM_RINGS] = { 0 };
+#define KICK 5
+#define HUNG 20
+#define FIRE 30
 
 	if (!i915_enable_hangcheck)
 		return;
 
 	for_each_ring(ring, dev_priv, i) {
 		u32 seqno, acthd;
-		bool idle, err = false;
 
 		seqno = ring->get_seqno(ring, false);
 		acthd = intel_ring_get_active_head(ring);
-		idle = i915_hangcheck_ring_idle(ring, seqno, &err);
-		stuck[i] = ring->hangcheck.acthd == acthd;
-
-		if (idle) {
-			if (err)
-				ring->hangcheck.score += 2;
-			else
-				ring->hangcheck.score = 0;
-		} else {
-			busy_count++;
 
-			if (ring->hangcheck.seqno == seqno) {
-				ring->hangcheck.score++;
-
-				/* Kick ring if stuck*/
-				if (stuck[i])
-					i915_hangcheck_ring_hung(ring);
+		if (ring->hangcheck.seqno == seqno) {
+			if (ring_idle(ring, seqno)) {
+				if (waitqueue_active(&ring->irq_queue)) {
+					/* Issue a wake-up to catch stuck h/w. */
+					DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
+						  ring->name);
+					wake_up_all(&ring->irq_queue);
+					ring->hangcheck.score += HUNG;
+					busy_count++;
+				}
 			} else {
-				ring->hangcheck.score = 0;
+				stuck[i] = ring->hangcheck.acthd == acthd;
+				if (stuck[i]) {
+					/* Every time we kick the ring, add a
+					 * small increment to the hangcheck
+					 * score so that we can catch a
+					 * batch that is repeatedly kicked.
+					 */
+					ring->hangcheck.score += KICK;
+					if (ring_hung(ring))
+						ring->hangcheck.score += HUNG;
+				}
+				busy_count++;
 			}
+		} else {
+			/* Gradually reduce the count so that we catch DoS
+			 * attempts across multiple batches.
+			 */
+			if (ring->hangcheck.score > 0)
+				ring->hangcheck.score--;
 		}
 
 		ring->hangcheck.seqno = seqno;
@@ -2461,7 +2462,7 @@ void i915_hangcheck_elapsed(unsigned long data)
 	}
 
 	for_each_ring(ring, dev_priv, i) {
-		if (ring->hangcheck.score > 2) {
+		if (ring->hangcheck.score > FIRE) {
 			rings_hung++;
 			DRM_ERROR("%s: %s on %s 0x%x\n", ring->name,
 				  stuck[i] ? "stuck" : "no progress",
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 2/2] drm/i915: Don't count semaphore waits towards a stuck ring
  2013-06-06  9:45 [PATCH 1/2] drm/i915: Only slightly increment hangcheck score if we succesfully kick a ring Chris Wilson
@ 2013-06-06  9:45 ` Chris Wilson
  2013-06-07  8:55   ` [PATCH] " Chris Wilson
  2013-06-06 14:41 ` [PATCH 1/2] drm/i915: Only slightly increment hangcheck score if we succesfully kick a ring Mika Kuoppala
  2013-06-07  3:58 ` Ben Widawsky
  2 siblings, 1 reply; 8+ messages in thread
From: Chris Wilson @ 2013-06-06  9:45 UTC (permalink / raw)
  To: intel-gfx; +Cc: Ben Widawsky

If we detect a ring is in a valid wait for another, just let it be.
Eventually it will either begin to progress again, or the entire system
will come grinding to a halt and then hangcheck will fire as soon as the
deadlock is detected.

This error was foretold by Ben in
commit 05407ff889ceebe383aa5907219f86582ef96b72
Author: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Date:   Thu May 30 09:04:29 2013 +0300

    drm/i915: detect hang using per ring hangcheck_score

"If ring B is waiting on ring A via semaphore, and ring A is making
progress, albeit slowly - the hangcheck will fire. The check will
determine that A is moving, however ring B will appear hung because
the ACTHD doesn't move. I honestly can't say if that's actually a
realistic problem to hit it probably implies the timeout value is too
low."

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=65394
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Ben Widawsky <ben@bwidawsk.net>
---
 drivers/gpu/drm/i915/i915_irq.c         |   66 +++++++++++++++++++++++--------
 drivers/gpu/drm/i915/intel_ringbuffer.h |    1 +
 2 files changed, 51 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 2d1890d..948eb9f 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2331,21 +2331,21 @@ ring_idle(struct intel_ring_buffer *ring, u32 seqno)
 		i915_seqno_passed(seqno, ring_last_seqno(ring)));
 }
 
-static bool semaphore_passed(struct intel_ring_buffer *ring)
+static struct intel_ring_buffer *
+semaphore_waits_for(struct intel_ring_buffer *ring, u32 *seqno)
 {
 	struct drm_i915_private *dev_priv = ring->dev->dev_private;
-	u32 acthd = intel_ring_get_active_head(ring) & HEAD_ADDR;
-	struct intel_ring_buffer *signaller;
-	u32 cmd, ipehr, acthd_min;
+	u32 cmd, ipehr, acthd, acthd_min;
 
 	ipehr = I915_READ(RING_IPEHR(ring->mmio_base));
 	if ((ipehr & ~(0x3 << 16)) !=
 	    (MI_SEMAPHORE_MBOX | MI_SEMAPHORE_COMPARE | MI_SEMAPHORE_REGISTER))
-		return false;
+		return NULL;
 
 	/* ACTHD is likely pointing to the dword after the actual command,
 	 * so scan backwards until we find the MBOX.
 	 */
+	acthd = intel_ring_get_active_head(ring) & HEAD_ADDR;
 	acthd_min = max((int)acthd - 3 * 4, 0);
 	do {
 		cmd = ioread32(ring->virtual_start + acthd);
@@ -2354,12 +2354,40 @@ static bool semaphore_passed(struct intel_ring_buffer *ring)
 
 		acthd -= 4;
 		if (acthd < acthd_min)
-			return false;
+			return NULL;
 	} while (1);
 
-	signaller = &dev_priv->ring[(ring->id + (((ipehr >> 17) & 1) + 1)) % 3];
-	return i915_seqno_passed(signaller->get_seqno(signaller, false),
-				 ioread32(ring->virtual_start+acthd+4)+1);
+	*seqno = ioread32(ring->virtual_start+acthd+4)+1;
+	return &dev_priv->ring[(ring->id + (((ipehr >> 17) & 1) + 1)) % 3];
+}
+
+static int semaphore_passed(struct intel_ring_buffer *ring)
+{
+	struct drm_i915_private *dev_priv = ring->dev->dev_private;
+	struct intel_ring_buffer *signaller;
+	u32 seqno, ctl;
+
+	ring->hangcheck.deadlock = true;
+
+	signaller = semaphore_waits_for(ring, &seqno);
+	if (signaller == NULL || signaller->hangcheck.deadlock)
+		return -1;
+
+	/* cursory check for an unkickable deadlock */
+	ctl = I915_READ_CTL(signaller);
+	if (ctl & RING_WAIT_SEMAPHORE && semaphore_passed(signaller) < 0)
+		return -1;
+
+	return i915_seqno_passed(signaller->get_seqno(signaller, false), seqno);
+}
+
+static void semaphore_clear_deadlocks(struct drm_i915_private *dev_priv)
+{
+	struct intel_ring_buffer *ring;
+	int i;
+
+	for_each_ring(ring, dev_priv, i)
+		ring->hangcheck.deadlock = false;
 }
 
 static bool ring_hung(struct intel_ring_buffer *ring)
@@ -2384,13 +2412,17 @@ static bool ring_hung(struct intel_ring_buffer *ring)
 		return false;
 	}
 
-	if (INTEL_INFO(dev)->gen >= 6 &&
-	    tmp & RING_WAIT_SEMAPHORE &&
-	    semaphore_passed(ring)) {
-		DRM_ERROR("Kicking stuck semaphore on %s\n",
-			  ring->name);
-		I915_WRITE_CTL(ring, tmp);
-		return false;
+	if (INTEL_INFO(dev)->gen >= 6 && tmp & RING_WAIT_SEMAPHORE) {
+		switch (semaphore_passed(ring)) {
+		default:
+			return true;
+		case 1:
+			DRM_ERROR("Kicking stuck semaphore on %s\n",
+				  ring->name);
+			I915_WRITE_CTL(ring, tmp);
+		case 0:
+			return false;
+		}
 	}
 
 	return true;
@@ -2422,6 +2454,8 @@ void i915_hangcheck_elapsed(unsigned long data)
 	for_each_ring(ring, dev_priv, i) {
 		u32 seqno, acthd;
 
+		semaphore_clear_deadlocks(dev_priv);
+
 		seqno = ring->get_seqno(ring, false);
 		acthd = intel_ring_get_active_head(ring);
 
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index efc403d..a3e9610 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -38,6 +38,7 @@ struct  intel_hw_status_page {
 #define I915_READ_SYNC_1(ring) I915_READ(RING_SYNC_1((ring)->mmio_base))
 
 struct intel_ring_hangcheck {
+	bool deadlock;
 	u32 seqno;
 	u32 acthd;
 	int score;
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/2] drm/i915: Only slightly increment hangcheck score if we succesfully kick a ring
  2013-06-06  9:45 [PATCH 1/2] drm/i915: Only slightly increment hangcheck score if we succesfully kick a ring Chris Wilson
  2013-06-06  9:45 ` [PATCH 2/2] drm/i915: Don't count semaphore waits towards a stuck ring Chris Wilson
@ 2013-06-06 14:41 ` Mika Kuoppala
  2013-06-07  3:58 ` Ben Widawsky
  2 siblings, 0 replies; 8+ messages in thread
From: Mika Kuoppala @ 2013-06-06 14:41 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: Ben Widawsky

Chris Wilson <chris@chris-wilson.co.uk> writes:

> After kicking a ring, it should be free to make progress again and so
> should not be accused of being stuck until hangcheck fires once more. In
> order to catch a denial-of-service within a batch or across multiple
> batches, we still do increment the hangcheck score - just not as
> severely so that it takes multiple kicks to fail.
>
> This should address part of Ben's justified criticism of
>
> commit 05407ff889ceebe383aa5907219f86582ef96b72
> Author: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Date:   Thu May 30 09:04:29 2013 +0300
>
>     drm/i915: detect hang using per ring hangcheck_score
>
> "There's also another corner case on the kick. If the seqno = 2
> (though not stuck), and on the 3rd hangcheck, the ring is stuck, and
> we try to kick it... we don't actually try to find out if the kick
> helped."
>
> v2: Make sure we catch DoS attempts with batches full of invalid WAITs.
>
> References: https://bugs.freedesktop.org/show_bug.cgi?id=65394
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Ben Widawsky <ben@bwidawsk.net>

Hopefully Ben will also take a look as he saw exactly what
was coming.

Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/2] drm/i915: Only slightly increment hangcheck score if we succesfully kick a ring
  2013-06-06  9:45 [PATCH 1/2] drm/i915: Only slightly increment hangcheck score if we succesfully kick a ring Chris Wilson
  2013-06-06  9:45 ` [PATCH 2/2] drm/i915: Don't count semaphore waits towards a stuck ring Chris Wilson
  2013-06-06 14:41 ` [PATCH 1/2] drm/i915: Only slightly increment hangcheck score if we succesfully kick a ring Mika Kuoppala
@ 2013-06-07  3:58 ` Ben Widawsky
  2013-06-07  7:33   ` Chris Wilson
  2 siblings, 1 reply; 8+ messages in thread
From: Ben Widawsky @ 2013-06-07  3:58 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

On Thu, Jun 06, 2013 at 10:45:42AM +0100, Chris Wilson wrote:
> After kicking a ring, it should be free to make progress again and so
> should not be accused of being stuck until hangcheck fires once more. In
> order to catch a denial-of-service within a batch or across multiple
> batches, we still do increment the hangcheck score - just not as
> severely so that it takes multiple kicks to fail.
> 
> This should address part of Ben's justified criticism of
> 
> commit 05407ff889ceebe383aa5907219f86582ef96b72
> Author: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Date:   Thu May 30 09:04:29 2013 +0300
> 
>     drm/i915: detect hang using per ring hangcheck_score
> 
> "There's also another corner case on the kick. If the seqno = 2
> (though not stuck), and on the 3rd hangcheck, the ring is stuck, and
> we try to kick it... we don't actually try to find out if the kick
> helped."
> 
> v2: Make sure we catch DoS attempts with batches full of invalid WAITs.
> 
> References: https://bugs.freedesktop.org/show_bug.cgi?id=65394
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Ben Widawsky <ben@bwidawsk.net>
> ---
>  drivers/gpu/drm/i915/i915_irq.c |  105 ++++++++++++++++++++-------------------
>  1 file changed, 53 insertions(+), 52 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 85694d7..2d1890d 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2324,21 +2324,11 @@ ring_last_seqno(struct intel_ring_buffer *ring)
>  			  struct drm_i915_gem_request, list)->seqno;
>  }
>  
> -static bool i915_hangcheck_ring_idle(struct intel_ring_buffer *ring,
> -				     u32 ring_seqno, bool *err)
> -{
> -	if (list_empty(&ring->request_list) ||
> -	    i915_seqno_passed(ring_seqno, ring_last_seqno(ring))) {
> -		/* Issue a wake-up to catch stuck h/w. */
> -		if (waitqueue_active(&ring->irq_queue)) {
> -			DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
> -				  ring->name);
> -			wake_up_all(&ring->irq_queue);
> -			*err = true;
> -		}
> -		return true;
> -	}
> -	return false;
> +static bool
> +ring_idle(struct intel_ring_buffer *ring, u32 seqno)
> +{
> +	return (list_empty(&ring->request_list) ||
> +		i915_seqno_passed(seqno, ring_last_seqno(ring)));
>  }
>  
>  static bool semaphore_passed(struct intel_ring_buffer *ring)
> @@ -2372,16 +2362,26 @@ static bool semaphore_passed(struct intel_ring_buffer *ring)
>  				 ioread32(ring->virtual_start+acthd+4)+1);
>  }
>  
> -static bool kick_ring(struct intel_ring_buffer *ring)
> +static bool ring_hung(struct intel_ring_buffer *ring)
>  {
>  	struct drm_device *dev = ring->dev;
>  	struct drm_i915_private *dev_priv = dev->dev_private;
> -	u32 tmp = I915_READ_CTL(ring);
> +	u32 tmp;
> +
> +	if (IS_GEN2(dev))
> +		return true;
> +
> +	/* Is the chip hanging on a WAIT_FOR_EVENT?
> +	 * If so we can simply poke the RB_WAIT bit
> +	 * and break the hang. This should work on
> +	 * all but the second generation chipsets.
> +	 */
> +	tmp = I915_READ_CTL(ring);
>  	if (tmp & RING_WAIT) {
>  		DRM_ERROR("Kicking stuck wait on %s\n",
>  			  ring->name);
>  		I915_WRITE_CTL(ring, tmp);
> -		return true;
> +		return false;
>  	}
>  
>  	if (INTEL_INFO(dev)->gen >= 6 &&
> @@ -2390,22 +2390,10 @@ static bool kick_ring(struct intel_ring_buffer *ring)
>  		DRM_ERROR("Kicking stuck semaphore on %s\n",
>  			  ring->name);
>  		I915_WRITE_CTL(ring, tmp);
> -		return true;
> -	}
> -	return false;
> -}
> -
> -static bool i915_hangcheck_ring_hung(struct intel_ring_buffer *ring)
> -{
> -	if (IS_GEN2(ring->dev))
>  		return false;
> +	}
>  
> -	/* Is the chip hanging on a WAIT_FOR_EVENT?
> -	 * If so we can simply poke the RB_WAIT bit
> -	 * and break the hang. This should work on
> -	 * all but the second generation chipsets.
> -	 */
> -	return !kick_ring(ring);
> +	return true;
>  }
>  
>  /**
> @@ -2423,37 +2411,50 @@ void i915_hangcheck_elapsed(unsigned long data)
>  	struct intel_ring_buffer *ring;
>  	int i;
>  	int busy_count = 0, rings_hung = 0;
> -	bool stuck[I915_NUM_RINGS];
> +	bool stuck[I915_NUM_RINGS] = { 0 };
> +#define KICK 5
> +#define HUNG 20
> +#define FIRE 30
>  
>  	if (!i915_enable_hangcheck)
>  		return;
>  
>  	for_each_ring(ring, dev_priv, i) {
>  		u32 seqno, acthd;
> -		bool idle, err = false;
>  
>  		seqno = ring->get_seqno(ring, false);
>  		acthd = intel_ring_get_active_head(ring);
> -		idle = i915_hangcheck_ring_idle(ring, seqno, &err);
> -		stuck[i] = ring->hangcheck.acthd == acthd;
> -
> -		if (idle) {
> -			if (err)
> -				ring->hangcheck.score += 2;
> -			else
> -				ring->hangcheck.score = 0;
> -		} else {
> -			busy_count++;
>  
> -			if (ring->hangcheck.seqno == seqno) {
> -				ring->hangcheck.score++;
> -
> -				/* Kick ring if stuck*/
> -				if (stuck[i])
> -					i915_hangcheck_ring_hung(ring);
> +		if (ring->hangcheck.seqno == seqno) {
> +			if (ring_idle(ring, seqno)) {
> +				if (waitqueue_active(&ring->irq_queue)) {
> +					/* Issue a wake-up to catch stuck h/w. */
> +					DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
> +						  ring->name);
> +					wake_up_all(&ring->irq_queue);
> +					ring->hangcheck.score += HUNG;
> +					busy_count++;
> +				}

Correct me if I'm wrong, but we shouldn't be able to get here if the
waitqueue isn't active, so perhaps a WARN for the else?

>  			} else {
> -				ring->hangcheck.score = 0;
> +				stuck[i] = ring->hangcheck.acthd == acthd;
> +				if (stuck[i]) {
> +					/* Every time we kick the ring, add a
> +					 * small increment to the hangcheck
> +					 * score so that we can catch a
> +					 * batch that is repeatedly kicked.
> +					 */
> +					ring->hangcheck.score += KICK;
> +					if (ring_hung(ring))
> +						ring->hangcheck.score += HUNG;
> +				}
> +				busy_count++;
>  			}
> +		} else {
> +			/* Gradually reduce the count so that we catch DoS
> +			 * attempts across multiple batches.
> +			 */
> +			if (ring->hangcheck.score > 0)
> +				ring->hangcheck.score--;
>  		}

This else feels weird to me. I think we'd want to decrease at least by a
multiple of KICK, or HUNG if seqno is actually moving. I'm also not
certain I understand why we're worried about DoS. Did that conversation
happen somewhere I can't find?

>  
>  		ring->hangcheck.seqno = seqno;
> @@ -2461,7 +2462,7 @@ void i915_hangcheck_elapsed(unsigned long data)
>  	}
>  
>  	for_each_ring(ring, dev_priv, i) {
> -		if (ring->hangcheck.score > 2) {
> +		if (ring->hangcheck.score > FIRE) {
>  			rings_hung++;
>  			DRM_ERROR("%s: %s on %s 0x%x\n", ring->name,
>  				  stuck[i] ? "stuck" : "no progress",
> -- 
> 1.7.10.4
> 

-- 
Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/2] drm/i915: Only slightly increment hangcheck score if we succesfully kick a ring
  2013-06-07  3:58 ` Ben Widawsky
@ 2013-06-07  7:33   ` Chris Wilson
  0 siblings, 0 replies; 8+ messages in thread
From: Chris Wilson @ 2013-06-07  7:33 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: intel-gfx

On Thu, Jun 06, 2013 at 08:58:10PM -0700, Ben Widawsky wrote:
> On Thu, Jun 06, 2013 at 10:45:42AM +0100, Chris Wilson wrote:
> > +		if (ring->hangcheck.seqno == seqno) {
> > +			if (ring_idle(ring, seqno)) {
> > +				if (waitqueue_active(&ring->irq_queue)) {
> > +					/* Issue a wake-up to catch stuck h/w. */
> > +					DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
> > +						  ring->name);
> > +					wake_up_all(&ring->irq_queue);
> > +					ring->hangcheck.score += HUNG;
> > +					busy_count++;
> > +				}
> 
> Correct me if I'm wrong, but we shouldn't be able to get here if the
> waitqueue isn't active, so perhaps a WARN for the else?

No, we will inspect unused rings whilst other rings are running. So we
expect to find rings that never advance the seqno and have no requests
pending.
 
> >  			} else {
> > -				ring->hangcheck.score = 0;
> > +				stuck[i] = ring->hangcheck.acthd == acthd;
> > +				if (stuck[i]) {
> > +					/* Every time we kick the ring, add a
> > +					 * small increment to the hangcheck
> > +					 * score so that we can catch a
> > +					 * batch that is repeatedly kicked.
> > +					 */
> > +					ring->hangcheck.score += KICK;
> > +					if (ring_hung(ring))
> > +						ring->hangcheck.score += HUNG;
> > +				}
> > +				busy_count++;
> >  			}
> > +		} else {
> > +			/* Gradually reduce the count so that we catch DoS
> > +			 * attempts across multiple batches.
> > +			 */
> > +			if (ring->hangcheck.score > 0)
> > +				ring->hangcheck.score--;
> >  		}
> 
> This else feels weird to me. I think we'd want to decrease at least by a
> multiple of KICK, or HUNG if seqno is actually moving. I'm also not
> certain I understand why we're worried about DoS. Did that conversation
> happen somewhere I can't find?

Mika said that the reason why he incremented hangcheck score whilst
kicking rings was in case someone submitted a batch with multiple
invalid WAITs. It is easy to extend that attack to an invalid WAIT per
batch and DoS the machine, if we forget about kicking after each batch.

I actually would prefer not to charge the ring at all unless we do kick
it (i.e. so we do not accidentally declare a machine hung whilst busy
but waiting on semapores). Time to respin again.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH] drm/i915: Don't count semaphore waits towards a stuck ring
  2013-06-06  9:45 ` [PATCH 2/2] drm/i915: Don't count semaphore waits towards a stuck ring Chris Wilson
@ 2013-06-07  8:55   ` Chris Wilson
  2013-06-10  1:12     ` Ben Widawsky
  0 siblings, 1 reply; 8+ messages in thread
From: Chris Wilson @ 2013-06-07  8:55 UTC (permalink / raw)
  To: intel-gfx; +Cc: Ben Widawsky

If we detect a ring is in a valid wait for another, just let it be.
Eventually it will either begin to progress again, or the entire system
will come grinding to a halt and then hangcheck will fire as soon as the
deadlock is detected.

This error was foretold by Ben in
commit 05407ff889ceebe383aa5907219f86582ef96b72
Author: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Date:   Thu May 30 09:04:29 2013 +0300

    drm/i915: detect hang using per ring hangcheck_score

"If ring B is waiting on ring A via semaphore, and ring A is making
progress, albeit slowly - the hangcheck will fire. The check will
determine that A is moving, however ring B will appear hung because
the ACTHD doesn't move. I honestly can't say if that's actually a
realistic problem to hit it probably implies the timeout value is too
low."

v2: Make sure we don't even incur the KICK cost whilst waiting.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=65394
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Ben Widawsky <ben@bwidawsk.net>
---
 drivers/gpu/drm/i915/i915_irq.c         |   95 ++++++++++++++++++++++---------
 drivers/gpu/drm/i915/intel_ringbuffer.h |    1 +
 2 files changed, 70 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 2d1890d..05f8f75 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2331,21 +2331,21 @@ ring_idle(struct intel_ring_buffer *ring, u32 seqno)
 		i915_seqno_passed(seqno, ring_last_seqno(ring)));
 }
 
-static bool semaphore_passed(struct intel_ring_buffer *ring)
+static struct intel_ring_buffer *
+semaphore_waits_for(struct intel_ring_buffer *ring, u32 *seqno)
 {
 	struct drm_i915_private *dev_priv = ring->dev->dev_private;
-	u32 acthd = intel_ring_get_active_head(ring) & HEAD_ADDR;
-	struct intel_ring_buffer *signaller;
-	u32 cmd, ipehr, acthd_min;
+	u32 cmd, ipehr, acthd, acthd_min;
 
 	ipehr = I915_READ(RING_IPEHR(ring->mmio_base));
 	if ((ipehr & ~(0x3 << 16)) !=
 	    (MI_SEMAPHORE_MBOX | MI_SEMAPHORE_COMPARE | MI_SEMAPHORE_REGISTER))
-		return false;
+		return NULL;
 
 	/* ACTHD is likely pointing to the dword after the actual command,
 	 * so scan backwards until we find the MBOX.
 	 */
+	acthd = intel_ring_get_active_head(ring) & HEAD_ADDR;
 	acthd_min = max((int)acthd - 3 * 4, 0);
 	do {
 		cmd = ioread32(ring->virtual_start + acthd);
@@ -2354,22 +2354,53 @@ static bool semaphore_passed(struct intel_ring_buffer *ring)
 
 		acthd -= 4;
 		if (acthd < acthd_min)
-			return false;
+			return NULL;
 	} while (1);
 
-	signaller = &dev_priv->ring[(ring->id + (((ipehr >> 17) & 1) + 1)) % 3];
-	return i915_seqno_passed(signaller->get_seqno(signaller, false),
-				 ioread32(ring->virtual_start+acthd+4)+1);
+	*seqno = ioread32(ring->virtual_start+acthd+4)+1;
+	return &dev_priv->ring[(ring->id + (((ipehr >> 17) & 1) + 1)) % 3];
+}
+
+static int semaphore_passed(struct intel_ring_buffer *ring)
+{
+	struct drm_i915_private *dev_priv = ring->dev->dev_private;
+	struct intel_ring_buffer *signaller;
+	u32 seqno, ctl;
+
+	ring->hangcheck.deadlock = true;
+
+	signaller = semaphore_waits_for(ring, &seqno);
+	if (signaller == NULL || signaller->hangcheck.deadlock)
+		return -1;
+
+	/* cursory check for an unkickable deadlock */
+	ctl = I915_READ_CTL(signaller);
+	if (ctl & RING_WAIT_SEMAPHORE && semaphore_passed(signaller) < 0)
+		return -1;
+
+	return i915_seqno_passed(signaller->get_seqno(signaller, false), seqno);
 }
 
-static bool ring_hung(struct intel_ring_buffer *ring)
+static void semaphore_clear_deadlocks(struct drm_i915_private *dev_priv)
+{
+	struct intel_ring_buffer *ring;
+	int i;
+
+	for_each_ring(ring, dev_priv, i)
+		ring->hangcheck.deadlock = false;
+}
+
+static enum { pass, kick, hung } ring_stuck(struct intel_ring_buffer *ring, u32 acthd)
 {
 	struct drm_device *dev = ring->dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	u32 tmp;
 
+	if (ring->hangcheck.acthd != acthd)
+		return pass;
+
 	if (IS_GEN2(dev))
-		return true;
+		return hung;
 
 	/* Is the chip hanging on a WAIT_FOR_EVENT?
 	 * If so we can simply poke the RB_WAIT bit
@@ -2381,19 +2412,24 @@ static bool ring_hung(struct intel_ring_buffer *ring)
 		DRM_ERROR("Kicking stuck wait on %s\n",
 			  ring->name);
 		I915_WRITE_CTL(ring, tmp);
-		return false;
-	}
-
-	if (INTEL_INFO(dev)->gen >= 6 &&
-	    tmp & RING_WAIT_SEMAPHORE &&
-	    semaphore_passed(ring)) {
-		DRM_ERROR("Kicking stuck semaphore on %s\n",
-			  ring->name);
-		I915_WRITE_CTL(ring, tmp);
-		return false;
+		return kick;
+	}
+
+	if (INTEL_INFO(dev)->gen >= 6 && tmp & RING_WAIT_SEMAPHORE) {
+		switch (semaphore_passed(ring)) {
+		default:
+			return hung;
+		case 1:
+			DRM_ERROR("Kicking stuck semaphore on %s\n",
+				  ring->name);
+			I915_WRITE_CTL(ring, tmp);
+			return kick;
+		case 0:
+			return pass;
+		}
 	}
 
-	return true;
+	return hung;
 }
 
 /**
@@ -2422,6 +2458,8 @@ void i915_hangcheck_elapsed(unsigned long data)
 	for_each_ring(ring, dev_priv, i) {
 		u32 seqno, acthd;
 
+		semaphore_clear_deadlocks(dev_priv);
+
 		seqno = ring->get_seqno(ring, false);
 		acthd = intel_ring_get_active_head(ring);
 
@@ -2436,16 +2474,21 @@ void i915_hangcheck_elapsed(unsigned long data)
 					busy_count++;
 				}
 			} else {
-				stuck[i] = ring->hangcheck.acthd == acthd;
-				if (stuck[i]) {
+				switch (ring_stuck(ring, acthd)) {
+				case pass:
+					break;
+				case kick:
 					/* Every time we kick the ring, add a
 					 * small increment to the hangcheck
 					 * score so that we can catch a
 					 * batch that is repeatedly kicked.
 					 */
 					ring->hangcheck.score += KICK;
-					if (ring_hung(ring))
-						ring->hangcheck.score += HUNG;
+					break;
+				case hung:
+					ring->hangcheck.score += HUNG;
+					stuck[i] = true;
+					break;
 				}
 				busy_count++;
 			}
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index efc403d..a3e9610 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -38,6 +38,7 @@ struct  intel_hw_status_page {
 #define I915_READ_SYNC_1(ring) I915_READ(RING_SYNC_1((ring)->mmio_base))
 
 struct intel_ring_hangcheck {
+	bool deadlock;
 	u32 seqno;
 	u32 acthd;
 	int score;
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH] drm/i915: Don't count semaphore waits towards a stuck ring
  2013-06-07  8:55   ` [PATCH] " Chris Wilson
@ 2013-06-10  1:12     ` Ben Widawsky
  0 siblings, 0 replies; 8+ messages in thread
From: Ben Widawsky @ 2013-06-10  1:12 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

On Fri, Jun 07, 2013 at 09:55:51AM +0100, Chris Wilson wrote:
> If we detect a ring is in a valid wait for another, just let it be.
> Eventually it will either begin to progress again, or the entire system
> will come grinding to a halt and then hangcheck will fire as soon as the
> deadlock is detected.
> 
> This error was foretold by Ben in
> commit 05407ff889ceebe383aa5907219f86582ef96b72
> Author: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Date:   Thu May 30 09:04:29 2013 +0300
> 
>     drm/i915: detect hang using per ring hangcheck_score
> 
> "If ring B is waiting on ring A via semaphore, and ring A is making
> progress, albeit slowly - the hangcheck will fire. The check will
> determine that A is moving, however ring B will appear hung because
> the ACTHD doesn't move. I honestly can't say if that's actually a
> realistic problem to hit it probably implies the timeout value is too
> low."
> 
> v2: Make sure we don't even incur the KICK cost whilst waiting.
> 
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=65394
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Ben Widawsky <ben@bwidawsk.net>
> ---
>  drivers/gpu/drm/i915/i915_irq.c         |   95 ++++++++++++++++++++++---------
>  drivers/gpu/drm/i915/intel_ringbuffer.h |    1 +
>  2 files changed, 70 insertions(+), 26 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 2d1890d..05f8f75 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2331,21 +2331,21 @@ ring_idle(struct intel_ring_buffer *ring, u32 seqno)
>  		i915_seqno_passed(seqno, ring_last_seqno(ring)));
>  }
>  
> -static bool semaphore_passed(struct intel_ring_buffer *ring)
> +static struct intel_ring_buffer *
> +semaphore_waits_for(struct intel_ring_buffer *ring, u32 *seqno)
>  {
>  	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> -	u32 acthd = intel_ring_get_active_head(ring) & HEAD_ADDR;
> -	struct intel_ring_buffer *signaller;
> -	u32 cmd, ipehr, acthd_min;
> +	u32 cmd, ipehr, acthd, acthd_min;
>  
>  	ipehr = I915_READ(RING_IPEHR(ring->mmio_base));
>  	if ((ipehr & ~(0x3 << 16)) !=
>  	    (MI_SEMAPHORE_MBOX | MI_SEMAPHORE_COMPARE | MI_SEMAPHORE_REGISTER))
> -		return false;
> +		return NULL;
>  
>  	/* ACTHD is likely pointing to the dword after the actual command,
>  	 * so scan backwards until we find the MBOX.
>  	 */
> +	acthd = intel_ring_get_active_head(ring) & HEAD_ADDR;
>  	acthd_min = max((int)acthd - 3 * 4, 0);
>  	do {
>  		cmd = ioread32(ring->virtual_start + acthd);
> @@ -2354,22 +2354,53 @@ static bool semaphore_passed(struct intel_ring_buffer *ring)
>  
>  		acthd -= 4;
>  		if (acthd < acthd_min)
> -			return false;
> +			return NULL;
>  	} while (1);
>  
> -	signaller = &dev_priv->ring[(ring->id + (((ipehr >> 17) & 1) + 1)) % 3];
> -	return i915_seqno_passed(signaller->get_seqno(signaller, false),
> -				 ioread32(ring->virtual_start+acthd+4)+1);
> +	*seqno = ioread32(ring->virtual_start+acthd+4)+1;
> +	return &dev_priv->ring[(ring->id + (((ipehr >> 17) & 1) + 1)) % 3];
> +}
> +
>
I hated these since they were introduced... does this work still with
VECS? Seems at the very least %3 should be NUM_RINGS?
>
> +static int semaphore_passed(struct intel_ring_buffer *ring)
> +{
> +	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> +	struct intel_ring_buffer *signaller;
> +	u32 seqno, ctl;
> +
> +	ring->hangcheck.deadlock = true;
> +
> +	signaller = semaphore_waits_for(ring, &seqno);
> +	if (signaller == NULL || signaller->hangcheck.deadlock)
> +		return -1;
> +
> +	/* cursory check for an unkickable deadlock */
> +	ctl = I915_READ_CTL(signaller);
> +	if (ctl & RING_WAIT_SEMAPHORE && semaphore_passed(signaller) < 0)
> +		return -1;
> +
> +	return i915_seqno_passed(signaller->get_seqno(signaller, false), seqno);
>  }
>  
> -static bool ring_hung(struct intel_ring_buffer *ring)
> +static void semaphore_clear_deadlocks(struct drm_i915_private *dev_priv)
> +{
> +	struct intel_ring_buffer *ring;
> +	int i;
> +
> +	for_each_ring(ring, dev_priv, i)
> +		ring->hangcheck.deadlock = false;
> +}
> +
> +static enum { pass, kick, hung } ring_stuck(struct intel_ring_buffer *ring, u32 acthd)
>  {
>  	struct drm_device *dev = ring->dev;
>  	struct drm_i915_private *dev_priv = dev->dev_private;
>  	u32 tmp;
>  
> +	if (ring->hangcheck.acthd != acthd)
> +		return pass;
> +
>  	if (IS_GEN2(dev))
> -		return true;
> +		return hung;
>  
>  	/* Is the chip hanging on a WAIT_FOR_EVENT?
>  	 * If so we can simply poke the RB_WAIT bit
> @@ -2381,19 +2412,24 @@ static bool ring_hung(struct intel_ring_buffer *ring)
>  		DRM_ERROR("Kicking stuck wait on %s\n",
>  			  ring->name);
>  		I915_WRITE_CTL(ring, tmp);
> -		return false;
> -	}
> -
> -	if (INTEL_INFO(dev)->gen >= 6 &&
> -	    tmp & RING_WAIT_SEMAPHORE &&
> -	    semaphore_passed(ring)) {
> -		DRM_ERROR("Kicking stuck semaphore on %s\n",
> -			  ring->name);
> -		I915_WRITE_CTL(ring, tmp);
> -		return false;
> +		return kick;
> +	}
> +
> +	if (INTEL_INFO(dev)->gen >= 6 && tmp & RING_WAIT_SEMAPHORE) {
> +		switch (semaphore_passed(ring)) {
> +		default:
> +			return hung;
> +		case 1:
> +			DRM_ERROR("Kicking stuck semaphore on %s\n",
> +				  ring->name);
> +			I915_WRITE_CTL(ring, tmp);
> +			return kick;
> +		case 0:
> +			return pass;
> +		}
>  	}
>  
> -	return true;
> +	return hung;
>  }
>  
>  /**
> @@ -2422,6 +2458,8 @@ void i915_hangcheck_elapsed(unsigned long data)
>  	for_each_ring(ring, dev_priv, i) {
>  		u32 seqno, acthd;
>  
> +		semaphore_clear_deadlocks(dev_priv);
> +
>  		seqno = ring->get_seqno(ring, false);
>  		acthd = intel_ring_get_active_head(ring);
>  
> @@ -2436,16 +2474,21 @@ void i915_hangcheck_elapsed(unsigned long data)
>  					busy_count++;
>  				}
>  			} else {
> -				stuck[i] = ring->hangcheck.acthd == acthd;
> -				if (stuck[i]) {
> +				switch (ring_stuck(ring, acthd)) {
> +				case pass:
> +					break;
> +				case kick:
>  					/* Every time we kick the ring, add a
>  					 * small increment to the hangcheck
>  					 * score so that we can catch a
>  					 * batch that is repeatedly kicked.
>  					 */
>  					ring->hangcheck.score += KICK;
> -					if (ring_hung(ring))
> -						ring->hangcheck.score += HUNG;
> +					break;
> +				case hung:
> +					ring->hangcheck.score += HUNG;
> +					stuck[i] = true;
> +					break;
>  				}
>  				busy_count++;
>  			}
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index efc403d..a3e9610 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -38,6 +38,7 @@ struct  intel_hw_status_page {
>  #define I915_READ_SYNC_1(ring) I915_READ(RING_SYNC_1((ring)->mmio_base))
>  
>  struct intel_ring_hangcheck {
> +	bool deadlock;
>  	u32 seqno;
>  	u32 acthd;
>  	int score;
> -- 
> 1.7.10.4
> 
>

I'm going to give this another look tomorrow if Daniel doesn't merge it
before then. The patch is a bit hard for me to read, but the idea seems
like how I'd want it to behave.

-- 
Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH 1/2] drm/i915: Only slightly increment hangcheck score if we succesfully kick a ring
@ 2013-06-10  7:49 Chris Wilson
  0 siblings, 0 replies; 8+ messages in thread
From: Chris Wilson @ 2013-06-10  7:49 UTC (permalink / raw)
  To: intel-gfx; +Cc: Ben Widawsky

After kicking a ring, it should be free to make progress again and so
should not be accused of being stuck until hangcheck fires once more. In
order to catch a denial-of-service within a batch or across multiple
batches, we still do increment the hangcheck score - just not as
severely so that it takes multiple kicks to fail.

This should address part of Ben's justified criticism of

commit 05407ff889ceebe383aa5907219f86582ef96b72
Author: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Date:   Thu May 30 09:04:29 2013 +0300

    drm/i915: detect hang using per ring hangcheck_score

"There's also another corner case on the kick. If the seqno = 2
(though not stuck), and on the 3rd hangcheck, the ring is stuck, and
we try to kick it... we don't actually try to find out if the kick
helped."

v2: Make sure we catch DoS attempts with batches full of invalid WAITs.
v3: Preserve the ability to detect loops by always charging the ring
    if it is busy on the same request.

References: https://bugs.freedesktop.org/show_bug.cgi?id=65394
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Ben Widawsky <ben@bwidawsk.net>
---
 drivers/gpu/drm/i915/i915_irq.c |  109 ++++++++++++++++++++-------------------
 1 file changed, 57 insertions(+), 52 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index dcb5209..ffa6305 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2324,21 +2324,11 @@ ring_last_seqno(struct intel_ring_buffer *ring)
 			  struct drm_i915_gem_request, list)->seqno;
 }
 
-static bool i915_hangcheck_ring_idle(struct intel_ring_buffer *ring,
-				     u32 ring_seqno, bool *err)
-{
-	if (list_empty(&ring->request_list) ||
-	    i915_seqno_passed(ring_seqno, ring_last_seqno(ring))) {
-		/* Issue a wake-up to catch stuck h/w. */
-		if (waitqueue_active(&ring->irq_queue)) {
-			DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
-				  ring->name);
-			wake_up_all(&ring->irq_queue);
-			*err = true;
-		}
-		return true;
-	}
-	return false;
+static bool
+ring_idle(struct intel_ring_buffer *ring, u32 seqno)
+{
+	return (list_empty(&ring->request_list) ||
+		i915_seqno_passed(seqno, ring_last_seqno(ring)));
 }
 
 static bool semaphore_passed(struct intel_ring_buffer *ring)
@@ -2372,16 +2362,26 @@ static bool semaphore_passed(struct intel_ring_buffer *ring)
 				 ioread32(ring->virtual_start+acthd+4)+1);
 }
 
-static bool kick_ring(struct intel_ring_buffer *ring)
+static bool ring_hung(struct intel_ring_buffer *ring)
 {
 	struct drm_device *dev = ring->dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
-	u32 tmp = I915_READ_CTL(ring);
+	u32 tmp;
+
+	if (IS_GEN2(dev))
+		return true;
+
+	/* Is the chip hanging on a WAIT_FOR_EVENT?
+	 * If so we can simply poke the RB_WAIT bit
+	 * and break the hang. This should work on
+	 * all but the second generation chipsets.
+	 */
+	tmp = I915_READ_CTL(ring);
 	if (tmp & RING_WAIT) {
 		DRM_ERROR("Kicking stuck wait on %s\n",
 			  ring->name);
 		I915_WRITE_CTL(ring, tmp);
-		return true;
+		return false;
 	}
 
 	if (INTEL_INFO(dev)->gen >= 6 &&
@@ -2390,22 +2390,10 @@ static bool kick_ring(struct intel_ring_buffer *ring)
 		DRM_ERROR("Kicking stuck semaphore on %s\n",
 			  ring->name);
 		I915_WRITE_CTL(ring, tmp);
-		return true;
-	}
-	return false;
-}
-
-static bool i915_hangcheck_ring_hung(struct intel_ring_buffer *ring)
-{
-	if (IS_GEN2(ring->dev))
 		return false;
+	}
 
-	/* Is the chip hanging on a WAIT_FOR_EVENT?
-	 * If so we can simply poke the RB_WAIT bit
-	 * and break the hang. This should work on
-	 * all but the second generation chipsets.
-	 */
-	return !kick_ring(ring);
+	return true;
 }
 
 /**
@@ -2423,37 +2411,54 @@ void i915_hangcheck_elapsed(unsigned long data)
 	struct intel_ring_buffer *ring;
 	int i;
 	int busy_count = 0, rings_hung = 0;
-	bool stuck[I915_NUM_RINGS];
+	bool stuck[I915_NUM_RINGS] = { 0 };
+#define BUSY 1
+#define KICK 5
+#define HUNG 20
+#define FIRE 30
 
 	if (!i915_enable_hangcheck)
 		return;
 
 	for_each_ring(ring, dev_priv, i) {
 		u32 seqno, acthd;
-		bool idle, err = false;
 
 		seqno = ring->get_seqno(ring, false);
 		acthd = intel_ring_get_active_head(ring);
-		idle = i915_hangcheck_ring_idle(ring, seqno, &err);
-		stuck[i] = ring->hangcheck.acthd == acthd;
-
-		if (idle) {
-			if (err)
-				ring->hangcheck.score += 2;
-			else
-				ring->hangcheck.score = 0;
-		} else {
-			busy_count++;
 
-			if (ring->hangcheck.seqno == seqno) {
-				ring->hangcheck.score++;
-
-				/* Kick ring if stuck*/
-				if (stuck[i])
-					i915_hangcheck_ring_hung(ring);
+		if (ring->hangcheck.seqno == seqno) {
+			if (ring_idle(ring, seqno)) {
+				if (waitqueue_active(&ring->irq_queue)) {
+					/* Issue a wake-up to catch stuck h/w. */
+					DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
+						  ring->name);
+					wake_up_all(&ring->irq_queue);
+					ring->hangcheck.score += HUNG;
+					busy_count++;
+				}
 			} else {
-				ring->hangcheck.score = 0;
+				int score;
+
+				stuck[i] = ring->hangcheck.acthd == acthd;
+				if (stuck[i]) {
+					/* Every time we kick the ring, add a
+					 * small increment to the hangcheck
+					 * score so that we can catch a
+					 * batch that is repeatedly kicked.
+					 */
+					score = ring_hung(ring) ? HUNG : KICK;
+				} else
+					score = BUSY;
+
+				ring->hangcheck.score += score;
+				busy_count++;
 			}
+		} else {
+			/* Gradually reduce the count so that we catch DoS
+			 * attempts across multiple batches.
+			 */
+			if (ring->hangcheck.score > 0)
+				ring->hangcheck.score--;
 		}
 
 		ring->hangcheck.seqno = seqno;
@@ -2461,7 +2466,7 @@ void i915_hangcheck_elapsed(unsigned long data)
 	}
 
 	for_each_ring(ring, dev_priv, i) {
-		if (ring->hangcheck.score > 2) {
+		if (ring->hangcheck.score > FIRE) {
 			rings_hung++;
 			DRM_ERROR("%s: %s on %s 0x%x\n", ring->name,
 				  stuck[i] ? "stuck" : "no progress",
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2013-06-10  7:50 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-06-06  9:45 [PATCH 1/2] drm/i915: Only slightly increment hangcheck score if we succesfully kick a ring Chris Wilson
2013-06-06  9:45 ` [PATCH 2/2] drm/i915: Don't count semaphore waits towards a stuck ring Chris Wilson
2013-06-07  8:55   ` [PATCH] " Chris Wilson
2013-06-10  1:12     ` Ben Widawsky
2013-06-06 14:41 ` [PATCH 1/2] drm/i915: Only slightly increment hangcheck score if we succesfully kick a ring Mika Kuoppala
2013-06-07  3:58 ` Ben Widawsky
2013-06-07  7:33   ` Chris Wilson
2013-06-10  7:49 Chris Wilson

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.