All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/4] context reset stats fixes for ppgtt
@ 2014-01-29 15:05 Mika Kuoppala
  2014-01-29 15:05 ` [PATCH 1/4] drm/i915: Use i915_hw_context to set reset stats Mika Kuoppala
                   ` (3 more replies)
  0 siblings, 4 replies; 14+ messages in thread
From: Mika Kuoppala @ 2014-01-29 15:05 UTC (permalink / raw)
  To: intel-gfx; +Cc: miku

These are on top of drm-intel-nightly.

Series tries to address Ben's concerns on the previous
version at: 1389968431-24123-1-git-send-email-mika.kuoppala@intel.com

Mika Kuoppala (4):
  drm/i915: Use i915_hw_context to set reset stats
  drm/i915: Tune down debug output when context is banned
  drm/i915: Use hangcheck score to find guilty context
  drm/i915: Get rid of acthd based guilty batch search

 drivers/gpu/drm/i915/i915_drv.h         |    5 +
 drivers/gpu/drm/i915/i915_gem.c         |  158 ++++++++++---------------------
 drivers/gpu/drm/i915/i915_gem_context.c |    9 +-
 drivers/gpu/drm/i915/i915_irq.c         |    3 +-
 drivers/gpu/drm/i915/intel_ringbuffer.h |    2 +
 5 files changed, 58 insertions(+), 119 deletions(-)

-- 
1.7.9.5

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 1/4] drm/i915: Use i915_hw_context to set reset stats
  2014-01-29 15:05 [PATCH 0/4] context reset stats fixes for ppgtt Mika Kuoppala
@ 2014-01-29 15:05 ` Mika Kuoppala
  2014-01-29 20:38   ` Ben Widawsky
  2014-01-30 14:01   ` [PATCH v2 " Mika Kuoppala
  2014-01-29 15:05 ` [PATCH 2/4] drm/i915: Tune down debug output when context is banned Mika Kuoppala
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 14+ messages in thread
From: Mika Kuoppala @ 2014-01-29 15:05 UTC (permalink / raw)
  To: intel-gfx; +Cc: miku

With full ppgtt support drm_i915_file_private gained knowledge
about the default context. Also reset stats are now inside
i915_hw_context so we can use proper abstraction.

Suggested-by: Ben Widawsky <ben@bwidawsk.net>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c |   44 ++++++++++++++++++++-------------------
 1 file changed, 23 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 39770f7..e41d520 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2305,11 +2305,17 @@ static bool i915_request_guilty(struct drm_i915_gem_request *request,
 	return false;
 }
 
-static bool i915_context_is_banned(const struct i915_ctx_hang_stats *hs)
+static bool i915_context_is_banned(const struct i915_hw_context *ctx)
 {
-	const unsigned long elapsed = get_seconds() - hs->guilty_ts;
+	struct drm_i915_private *dev_priv;
+	unsigned long elapsed;
 
-	if (hs->banned)
+	BUG_ON(!ctx->last_ring);
+
+	dev_priv = ctx->last_ring->dev->dev_private;
+	elapsed = get_seconds() - ctx->hang_stats.guilty_ts;
+
+	if (ctx->hang_stats.banned)
 		return true;
 
 	if (elapsed <= DRM_I915_CTX_BAN_PERIOD) {
@@ -2324,9 +2330,10 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring,
 				  struct drm_i915_gem_request *request,
 				  u32 acthd)
 {
-	struct i915_ctx_hang_stats *hs = NULL;
 	bool inside, guilty;
 	unsigned long offset = 0;
+	struct i915_hw_context *ctx = request->ctx;
+	struct i915_ctx_hang_stats *hs;
 
 	/* Innocent until proven guilty */
 	guilty = false;
@@ -2341,28 +2348,23 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring,
 			  ring->name,
 			  inside ? "inside" : "flushing",
 			  offset,
-			  request->ctx ? request->ctx->id : 0,
+			  ctx ? ctx->id : 0,
 			  acthd);
 
 		guilty = true;
 	}
 
-	/* If contexts are disabled or this is the default context, use
-	 * file_priv->reset_state
-	 */
-	if (request->ctx && request->ctx->id != DEFAULT_CONTEXT_ID)
-		hs = &request->ctx->hang_stats;
-	else if (request->file_priv)
-		hs = &request->file_priv->private_default_ctx->hang_stats;
-
-	if (hs) {
-		if (guilty) {
-			hs->banned = i915_context_is_banned(hs);
-			hs->batch_active++;
-			hs->guilty_ts = get_seconds();
-		} else {
-			hs->batch_pending++;
-		}
+	if (WARN_ON(!ctx))
+		return;
+
+	hs = &ctx->hang_stats;
+
+	if (guilty) {
+		hs->banned = i915_context_is_banned(ctx);
+		hs->batch_active++;
+		hs->guilty_ts = get_seconds();
+	} else {
+		hs->batch_pending++;
 	}
 }
 
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 2/4] drm/i915: Tune down debug output when context is banned
  2014-01-29 15:05 [PATCH 0/4] context reset stats fixes for ppgtt Mika Kuoppala
  2014-01-29 15:05 ` [PATCH 1/4] drm/i915: Use i915_hw_context to set reset stats Mika Kuoppala
@ 2014-01-29 15:05 ` Mika Kuoppala
  2014-01-29 20:47   ` Ben Widawsky
  2014-01-30 14:05   ` [PATCH v3 " Mika Kuoppala
  2014-01-29 15:05 ` [PATCH 3/4] drm/i915: Use hangcheck score to find guilty context Mika Kuoppala
  2014-01-29 15:05 ` [PATCH 4/4] drm/i915: Get rid of acthd based guilty batch search Mika Kuoppala
  3 siblings, 2 replies; 14+ messages in thread
From: Mika Kuoppala @ 2014-01-29 15:05 UTC (permalink / raw)
  To: intel-gfx; +Cc: miku

If we have stopped rings then we know that test is running
so no need for spam. In addition, only spam when default
context gets banned.

v2: - make sure default context ban gets shown (Chris)
    - use helper for checking for default context, everywhere (Chris)

Reference: https://bugs.freedesktop.org/show_bug.cgi?id=73652
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h         |    5 +++++
 drivers/gpu/drm/i915/i915_gem.c         |    5 ++++-
 drivers/gpu/drm/i915/i915_gem_context.c |    9 ++-------
 3 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 3673ba1..b34fd31 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2271,6 +2271,11 @@ static inline void i915_gem_context_unreference(struct i915_hw_context *ctx)
 		kref_put(&ctx->ref, i915_gem_context_free);
 }
 
+static inline bool i915_gem_context_is_default(const struct i915_hw_context *c)
+{
+	return c->id == DEFAULT_CONTEXT_ID;
+}
+
 int i915_gem_context_create_ioctl(struct drm_device *dev, void *data,
 				  struct drm_file *file);
 int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data,
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index e41d520..8f9ac0a 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2319,7 +2319,10 @@ static bool i915_context_is_banned(const struct i915_hw_context *ctx)
 		return true;
 
 	if (elapsed <= DRM_I915_CTX_BAN_PERIOD) {
-		DRM_ERROR("context hanging too fast, declaring banned!\n");
+		if (dev_priv->gpu_error.stop_rings == 0 &&
+		    i915_gem_context_is_default(ctx))
+			DRM_ERROR("context hanging too fast, banning!\n");
+
 		return true;
 	}
 
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 2b0598e..985c1ed 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -228,11 +228,6 @@ err_out:
 	return ERR_PTR(ret);
 }
 
-static inline bool is_default_context(struct i915_hw_context *ctx)
-{
-	return (ctx->id == DEFAULT_CONTEXT_ID);
-}
-
 /**
  * The default context needs to exist per ring that uses contexts. It stores the
  * context state of the GPU for applications that don't utilize HW contexts, as
@@ -478,7 +473,7 @@ static int context_idr_cleanup(int id, void *p, void *data)
 	struct i915_hw_context *ctx = p;
 
 	/* Ignore the default context because close will handle it */
-	if (is_default_context(ctx))
+	if (i915_gem_context_is_default(ctx))
 		return 0;
 
 	i915_gem_context_unreference(ctx);
@@ -656,7 +651,7 @@ static int do_switch(struct intel_ring_buffer *ring,
 		vma->bind_vma(vma, to->obj->cache_level, GLOBAL_BIND);
 	}
 
-	if (!to->is_initialized || is_default_context(to))
+	if (!to->is_initialized || i915_gem_context_is_default(to))
 		hw_flags |= MI_RESTORE_INHIBIT;
 
 	ret = mi_set_context(ring, to, hw_flags);
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 3/4] drm/i915: Use hangcheck score to find guilty context
  2014-01-29 15:05 [PATCH 0/4] context reset stats fixes for ppgtt Mika Kuoppala
  2014-01-29 15:05 ` [PATCH 1/4] drm/i915: Use i915_hw_context to set reset stats Mika Kuoppala
  2014-01-29 15:05 ` [PATCH 2/4] drm/i915: Tune down debug output when context is banned Mika Kuoppala
@ 2014-01-29 15:05 ` Mika Kuoppala
  2014-01-29 15:05 ` [PATCH 4/4] drm/i915: Get rid of acthd based guilty batch search Mika Kuoppala
  3 siblings, 0 replies; 14+ messages in thread
From: Mika Kuoppala @ 2014-01-29 15:05 UTC (permalink / raw)
  To: intel-gfx; +Cc: miku

With full ppgtt using acthd is not enough to find guilty
batch buffer. We get multiple false positives as acthd is
per vm.

Instead of scanning which vm was running on a ring,
to find corressponding context, use a different, simpler,
strategy of finding batches that caused gpu hang:

If hangcheck has declared ring to be hung, find first non complete
request on that ring and claim it was guilty.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=73652
Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c         |   42 +++++++++++++++++++++----------
 drivers/gpu/drm/i915/i915_irq.c         |    3 +--
 drivers/gpu/drm/i915/intel_ringbuffer.h |    2 ++
 3 files changed, 32 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 8f9ac0a..a46a1a7 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2331,21 +2331,19 @@ static bool i915_context_is_banned(const struct i915_hw_context *ctx)
 
 static void i915_set_reset_status(struct intel_ring_buffer *ring,
 				  struct drm_i915_gem_request *request,
-				  u32 acthd)
+				  const bool guilty)
 {
-	bool inside, guilty;
+	const u32 acthd = intel_ring_get_active_head(ring);
+	bool inside;
 	unsigned long offset = 0;
 	struct i915_hw_context *ctx = request->ctx;
 	struct i915_ctx_hang_stats *hs;
 
-	/* Innocent until proven guilty */
-	guilty = false;
-
 	if (request->batch_obj)
 		offset = i915_gem_obj_offset(request->batch_obj,
 					     request_to_vm(request));
 
-	if (ring->hangcheck.action != HANGCHECK_WAIT &&
+	if (guilty &&
 	    i915_request_guilty(request, acthd, &inside)) {
 		DRM_DEBUG("%s hung %s bo (0x%lx ctx %d) at 0x%x\n",
 			  ring->name,
@@ -2353,8 +2351,6 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring,
 			  offset,
 			  ctx ? ctx->id : 0,
 			  acthd);
-
-		guilty = true;
 	}
 
 	if (WARN_ON(!ctx))
@@ -2382,19 +2378,39 @@ static void i915_gem_free_request(struct drm_i915_gem_request *request)
 	kfree(request);
 }
 
-static void i915_gem_reset_ring_status(struct drm_i915_private *dev_priv,
-				       struct intel_ring_buffer *ring)
+static struct drm_i915_gem_request *
+i915_gem_find_first_non_complete(struct intel_ring_buffer *ring)
 {
-	u32 completed_seqno = ring->get_seqno(ring, false);
-	u32 acthd = intel_ring_get_active_head(ring);
 	struct drm_i915_gem_request *request;
+	const u32 completed_seqno = ring->get_seqno(ring, false);
 
 	list_for_each_entry(request, &ring->request_list, list) {
 		if (i915_seqno_passed(completed_seqno, request->seqno))
 			continue;
 
-		i915_set_reset_status(ring, request, acthd);
+		return request;
 	}
+
+	return NULL;
+}
+
+static void i915_gem_reset_ring_status(struct drm_i915_private *dev_priv,
+				       struct intel_ring_buffer *ring)
+{
+	struct drm_i915_gem_request *request;
+	bool ring_hung;
+
+	request = i915_gem_find_first_non_complete(ring);
+
+	if (request == NULL)
+		return;
+
+	ring_hung = ring->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG;
+
+	i915_set_reset_status(ring, request, ring_hung);
+
+	list_for_each_entry_continue(request, &ring->request_list, list)
+		i915_set_reset_status(ring, request, false);
 }
 
 static void i915_gem_reset_ring_cleanup(struct drm_i915_private *dev_priv,
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 72ade87..e4eb1988 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2493,7 +2493,6 @@ static void i915_hangcheck_elapsed(unsigned long data)
 #define BUSY 1
 #define KICK 5
 #define HUNG 20
-#define FIRE 30
 
 	if (!i915.enable_hangcheck)
 		return;
@@ -2577,7 +2576,7 @@ static void i915_hangcheck_elapsed(unsigned long data)
 	}
 
 	for_each_ring(ring, dev_priv, i) {
-		if (ring->hangcheck.score > FIRE) {
+		if (ring->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG) {
 			DRM_INFO("%s on %s\n",
 				 stuck[i] ? "stuck" : "no progress",
 				 ring->name);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 71a73f4..38c757e 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -41,6 +41,8 @@ enum intel_ring_hangcheck_action {
 	HANGCHECK_HUNG,
 };
 
+#define HANGCHECK_SCORE_RING_HUNG 31
+
 struct intel_ring_hangcheck {
 	bool deadlock;
 	u32 seqno;
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 4/4] drm/i915: Get rid of acthd based guilty batch search
  2014-01-29 15:05 [PATCH 0/4] context reset stats fixes for ppgtt Mika Kuoppala
                   ` (2 preceding siblings ...)
  2014-01-29 15:05 ` [PATCH 3/4] drm/i915: Use hangcheck score to find guilty context Mika Kuoppala
@ 2014-01-29 15:05 ` Mika Kuoppala
  2014-01-29 21:44   ` Ben Widawsky
  3 siblings, 1 reply; 14+ messages in thread
From: Mika Kuoppala @ 2014-01-29 15:05 UTC (permalink / raw)
  To: intel-gfx; +Cc: miku

As we seek the guilty batch using request and hangcheck
score, this code is not needed anymore.

Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c |   91 ++-------------------------------------
 1 file changed, 4 insertions(+), 87 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index a46a1a7..8637898 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2241,70 +2241,6 @@ i915_gem_request_remove_from_client(struct drm_i915_gem_request *request)
 	spin_unlock(&file_priv->mm.lock);
 }
 
-static bool i915_head_inside_object(u32 acthd, struct drm_i915_gem_object *obj,
-				    struct i915_address_space *vm)
-{
-	if (acthd >= i915_gem_obj_offset(obj, vm) &&
-	    acthd < i915_gem_obj_offset(obj, vm) + obj->base.size)
-		return true;
-
-	return false;
-}
-
-static bool i915_head_inside_request(const u32 acthd_unmasked,
-				     const u32 request_start,
-				     const u32 request_end)
-{
-	const u32 acthd = acthd_unmasked & HEAD_ADDR;
-
-	if (request_start < request_end) {
-		if (acthd >= request_start && acthd < request_end)
-			return true;
-	} else if (request_start > request_end) {
-		if (acthd >= request_start || acthd < request_end)
-			return true;
-	}
-
-	return false;
-}
-
-static struct i915_address_space *
-request_to_vm(struct drm_i915_gem_request *request)
-{
-	struct drm_i915_private *dev_priv = request->ring->dev->dev_private;
-	struct i915_address_space *vm;
-
-	if (request->ctx)
-		vm = request->ctx->vm;
-	else
-		vm = &dev_priv->gtt.base;
-
-	return vm;
-}
-
-static bool i915_request_guilty(struct drm_i915_gem_request *request,
-				const u32 acthd, bool *inside)
-{
-	/* There is a possibility that unmasked head address
-	 * pointing inside the ring, matches the batch_obj address range.
-	 * However this is extremely unlikely.
-	 */
-	if (request->batch_obj) {
-		if (i915_head_inside_object(acthd, request->batch_obj,
-					    request_to_vm(request))) {
-			*inside = true;
-			return true;
-		}
-	}
-
-	if (i915_head_inside_request(acthd, request->head, request->tail)) {
-		*inside = false;
-		return true;
-	}
-
-	return false;
-}
-
 static bool i915_context_is_banned(const struct i915_hw_context *ctx)
 {
 	struct drm_i915_private *dev_priv;
@@ -2329,30 +2265,11 @@ static bool i915_context_is_banned(const struct i915_hw_context *ctx)
 	return false;
 }
 
-static void i915_set_reset_status(struct intel_ring_buffer *ring,
-				  struct drm_i915_gem_request *request,
-				  const bool guilty)
+static void i915_set_reset_status(struct i915_hw_context *ctx,
+				 const bool guilty)
 {
-	const u32 acthd = intel_ring_get_active_head(ring);
-	bool inside;
-	unsigned long offset = 0;
-	struct i915_hw_context *ctx = request->ctx;
 	struct i915_ctx_hang_stats *hs;
 
-	if (request->batch_obj)
-		offset = i915_gem_obj_offset(request->batch_obj,
-					     request_to_vm(request));
-
-	if (guilty &&
-	    i915_request_guilty(request, acthd, &inside)) {
-		DRM_DEBUG("%s hung %s bo (0x%lx ctx %d) at 0x%x\n",
-			  ring->name,
-			  inside ? "inside" : "flushing",
-			  offset,
-			  ctx ? ctx->id : 0,
-			  acthd);
-	}
-
 	if (WARN_ON(!ctx))
 		return;
 
@@ -2407,10 +2324,10 @@ static void i915_gem_reset_ring_status(struct drm_i915_private *dev_priv,
 
 	ring_hung = ring->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG;
 
-	i915_set_reset_status(ring, request, ring_hung);
+	i915_set_reset_status(request->ctx, ring_hung);
 
 	list_for_each_entry_continue(request, &ring->request_list, list)
-		i915_set_reset_status(ring, request, false);
+		i915_set_reset_status(request->ctx, false);
 }
 
 static void i915_gem_reset_ring_cleanup(struct drm_i915_private *dev_priv,
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/4] drm/i915: Use i915_hw_context to set reset stats
  2014-01-29 15:05 ` [PATCH 1/4] drm/i915: Use i915_hw_context to set reset stats Mika Kuoppala
@ 2014-01-29 20:38   ` Ben Widawsky
  2014-01-30 14:01   ` [PATCH v2 " Mika Kuoppala
  1 sibling, 0 replies; 14+ messages in thread
From: Ben Widawsky @ 2014-01-29 20:38 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx, miku

On Wed, Jan 29, 2014 at 05:05:36PM +0200, Mika Kuoppala wrote:
> With full ppgtt support drm_i915_file_private gained knowledge
> about the default context. Also reset stats are now inside
> i915_hw_context so we can use proper abstraction.
> 
> Suggested-by: Ben Widawsky <ben@bwidawsk.net>
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem.c |   44 ++++++++++++++++++++-------------------
>  1 file changed, 23 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 39770f7..e41d520 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2305,11 +2305,17 @@ static bool i915_request_guilty(struct drm_i915_gem_request *request,
>  	return false;
>  }
>  
> -static bool i915_context_is_banned(const struct i915_ctx_hang_stats *hs)
> +static bool i915_context_is_banned(const struct i915_hw_context *ctx)
>  {
> -	const unsigned long elapsed = get_seconds() - hs->guilty_ts;
> +	struct drm_i915_private *dev_priv;
> +	unsigned long elapsed;
>  
> -	if (hs->banned)
> +	BUG_ON(!ctx->last_ring);

If you're that worried, just pass in dev, or dev_priv. I think it's a
valid BUG_ON btw, just a weird place to put it. I'd vote to shove the
BUG_ON in i915_set_reset_status() - and drop this completely.

> +
> +	dev_priv = ctx->last_ring->dev->dev_private;
> +	elapsed = get_seconds() - ctx->hang_stats.guilty_ts;
> +
> +	if (ctx->hang_stats.banned)
>  		return true;
>  
>  	if (elapsed <= DRM_I915_CTX_BAN_PERIOD) {
> @@ -2324,9 +2330,10 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring,
>  				  struct drm_i915_gem_request *request,
>  				  u32 acthd)
>  {
> -	struct i915_ctx_hang_stats *hs = NULL;
>  	bool inside, guilty;
>  	unsigned long offset = 0;
> +	struct i915_hw_context *ctx = request->ctx;
> +	struct i915_ctx_hang_stats *hs;
>  
>  	/* Innocent until proven guilty */
>  	guilty = false;
> @@ -2341,28 +2348,23 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring,
>  			  ring->name,
>  			  inside ? "inside" : "flushing",
>  			  offset,
> -			  request->ctx ? request->ctx->id : 0,
> +			  ctx ? ctx->id : 0,

You may as well drop this ternary and move your WARN_ON(!ctx) up above.


With or without my requests:
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>

>  			  acthd);
>  
>  		guilty = true;
>  	}
>  
> -	/* If contexts are disabled or this is the default context, use
> -	 * file_priv->reset_state
> -	 */
> -	if (request->ctx && request->ctx->id != DEFAULT_CONTEXT_ID)
> -		hs = &request->ctx->hang_stats;
> -	else if (request->file_priv)
> -		hs = &request->file_priv->private_default_ctx->hang_stats;
> -
> -	if (hs) {
> -		if (guilty) {
> -			hs->banned = i915_context_is_banned(hs);
> -			hs->batch_active++;
> -			hs->guilty_ts = get_seconds();
> -		} else {
> -			hs->batch_pending++;
> -		}
> +	if (WARN_ON(!ctx))
> +		return;
> +
> +	hs = &ctx->hang_stats;
> +
> +	if (guilty) {
> +		hs->banned = i915_context_is_banned(ctx);
> +		hs->batch_active++;
> +		hs->guilty_ts = get_seconds();
> +	} else {
> +		hs->batch_pending++;
>  	}
>  }
>  

-- 
Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/4] drm/i915: Tune down debug output when context is banned
  2014-01-29 15:05 ` [PATCH 2/4] drm/i915: Tune down debug output when context is banned Mika Kuoppala
@ 2014-01-29 20:47   ` Ben Widawsky
  2014-01-29 20:54     ` Daniel Vetter
  2014-01-30 14:05   ` [PATCH v3 " Mika Kuoppala
  1 sibling, 1 reply; 14+ messages in thread
From: Ben Widawsky @ 2014-01-29 20:47 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx, miku

On Wed, Jan 29, 2014 at 05:05:37PM +0200, Mika Kuoppala wrote:
> If we have stopped rings then we know that test is running
> so no need for spam. In addition, only spam when default
> context gets banned.
> 
> v2: - make sure default context ban gets shown (Chris)
>     - use helper for checking for default context, everywhere (Chris)
> 
> Reference: https://bugs.freedesktop.org/show_bug.cgi?id=73652
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h         |    5 +++++
>  drivers/gpu/drm/i915/i915_gem.c         |    5 ++++-
>  drivers/gpu/drm/i915/i915_gem_context.c |    9 ++-------
>  3 files changed, 11 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 3673ba1..b34fd31 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2271,6 +2271,11 @@ static inline void i915_gem_context_unreference(struct i915_hw_context *ctx)
>  		kref_put(&ctx->ref, i915_gem_context_free);
>  }
>  
> +static inline bool i915_gem_context_is_default(const struct i915_hw_context *c)
> +{
> +	return c->id == DEFAULT_CONTEXT_ID;
> +}
> +
>  int i915_gem_context_create_ioctl(struct drm_device *dev, void *data,
>  				  struct drm_file *file);
>  int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data,
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index e41d520..8f9ac0a 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2319,7 +2319,10 @@ static bool i915_context_is_banned(const struct i915_hw_context *ctx)
>  		return true;
>  
>  	if (elapsed <= DRM_I915_CTX_BAN_PERIOD) {
> -		DRM_ERROR("context hanging too fast, declaring banned!\n");
> +		if (dev_priv->gpu_error.stop_rings == 0 &&
> +		    i915_gem_context_is_default(ctx))
> +			DRM_ERROR("context hanging too fast, banning!\n");
> +

else DRM_DEBUG[_DRIVER](...)?

Reviewed-by: Ben Widawsky <ben@bwidawsk.net>

>  		return true;
>  	}
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 2b0598e..985c1ed 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -228,11 +228,6 @@ err_out:
>  	return ERR_PTR(ret);
>  }
>  
> -static inline bool is_default_context(struct i915_hw_context *ctx)
> -{
> -	return (ctx->id == DEFAULT_CONTEXT_ID);
> -}
> -
>  /**
>   * The default context needs to exist per ring that uses contexts. It stores the
>   * context state of the GPU for applications that don't utilize HW contexts, as
> @@ -478,7 +473,7 @@ static int context_idr_cleanup(int id, void *p, void *data)
>  	struct i915_hw_context *ctx = p;
>  
>  	/* Ignore the default context because close will handle it */
> -	if (is_default_context(ctx))
> +	if (i915_gem_context_is_default(ctx))
>  		return 0;
>  
>  	i915_gem_context_unreference(ctx);
> @@ -656,7 +651,7 @@ static int do_switch(struct intel_ring_buffer *ring,
>  		vma->bind_vma(vma, to->obj->cache_level, GLOBAL_BIND);
>  	}
>  
> -	if (!to->is_initialized || is_default_context(to))
> +	if (!to->is_initialized || i915_gem_context_is_default(to))
>  		hw_flags |= MI_RESTORE_INHIBIT;
>  
>  	ret = mi_set_context(ring, to, hw_flags);
> -- 
> 1.7.9.5
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/4] drm/i915: Tune down debug output when context is banned
  2014-01-29 20:47   ` Ben Widawsky
@ 2014-01-29 20:54     ` Daniel Vetter
  0 siblings, 0 replies; 14+ messages in thread
From: Daniel Vetter @ 2014-01-29 20:54 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: intel-gfx, miku

On Wed, Jan 29, 2014 at 12:47:13PM -0800, Ben Widawsky wrote:
> On Wed, Jan 29, 2014 at 05:05:37PM +0200, Mika Kuoppala wrote:
> > If we have stopped rings then we know that test is running
> > so no need for spam. In addition, only spam when default
> > context gets banned.
> > 
> > v2: - make sure default context ban gets shown (Chris)
> >     - use helper for checking for default context, everywhere (Chris)
> > 
> > Reference: https://bugs.freedesktop.org/show_bug.cgi?id=73652
> > Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_drv.h         |    5 +++++
> >  drivers/gpu/drm/i915/i915_gem.c         |    5 ++++-
> >  drivers/gpu/drm/i915/i915_gem_context.c |    9 ++-------
> >  3 files changed, 11 insertions(+), 8 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > index 3673ba1..b34fd31 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -2271,6 +2271,11 @@ static inline void i915_gem_context_unreference(struct i915_hw_context *ctx)
> >  		kref_put(&ctx->ref, i915_gem_context_free);
> >  }
> >  
> > +static inline bool i915_gem_context_is_default(const struct i915_hw_context *c)
> > +{
> > +	return c->id == DEFAULT_CONTEXT_ID;
> > +}
> > +
> >  int i915_gem_context_create_ioctl(struct drm_device *dev, void *data,
> >  				  struct drm_file *file);
> >  int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data,
> > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> > index e41d520..8f9ac0a 100644
> > --- a/drivers/gpu/drm/i915/i915_gem.c
> > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > @@ -2319,7 +2319,10 @@ static bool i915_context_is_banned(const struct i915_hw_context *ctx)
> >  		return true;
> >  
> >  	if (elapsed <= DRM_I915_CTX_BAN_PERIOD) {
> > -		DRM_ERROR("context hanging too fast, declaring banned!\n");
> > +		if (dev_priv->gpu_error.stop_rings == 0 &&
> > +		    i915_gem_context_is_default(ctx))
> > +			DRM_ERROR("context hanging too fast, banning!\n");
> > +
> 
> else DRM_DEBUG[_DRIVER](...)?

If you do that maybe use "gpu hanging too fast" for the default context,
and "context hanging too fast" for the others ...
-Daniel

> 
> Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
> 
> >  		return true;
> >  	}
> >  
> > diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> > index 2b0598e..985c1ed 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_context.c
> > +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> > @@ -228,11 +228,6 @@ err_out:
> >  	return ERR_PTR(ret);
> >  }
> >  
> > -static inline bool is_default_context(struct i915_hw_context *ctx)
> > -{
> > -	return (ctx->id == DEFAULT_CONTEXT_ID);
> > -}
> > -
> >  /**
> >   * The default context needs to exist per ring that uses contexts. It stores the
> >   * context state of the GPU for applications that don't utilize HW contexts, as
> > @@ -478,7 +473,7 @@ static int context_idr_cleanup(int id, void *p, void *data)
> >  	struct i915_hw_context *ctx = p;
> >  
> >  	/* Ignore the default context because close will handle it */
> > -	if (is_default_context(ctx))
> > +	if (i915_gem_context_is_default(ctx))
> >  		return 0;
> >  
> >  	i915_gem_context_unreference(ctx);
> > @@ -656,7 +651,7 @@ static int do_switch(struct intel_ring_buffer *ring,
> >  		vma->bind_vma(vma, to->obj->cache_level, GLOBAL_BIND);
> >  	}
> >  
> > -	if (!to->is_initialized || is_default_context(to))
> > +	if (!to->is_initialized || i915_gem_context_is_default(to))
> >  		hw_flags |= MI_RESTORE_INHIBIT;
> >  
> >  	ret = mi_set_context(ring, to, hw_flags);
> > -- 
> > 1.7.9.5
> > 
> > _______________________________________________
> > Intel-gfx mailing list
> > Intel-gfx@lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> 
> -- 
> Ben Widawsky, Intel Open Source Technology Center
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 4/4] drm/i915: Get rid of acthd based guilty batch search
  2014-01-29 15:05 ` [PATCH 4/4] drm/i915: Get rid of acthd based guilty batch search Mika Kuoppala
@ 2014-01-29 21:44   ` Ben Widawsky
  2014-01-30 12:59     ` Chris Wilson
  0 siblings, 1 reply; 14+ messages in thread
From: Ben Widawsky @ 2014-01-29 21:44 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx, miku

On Wed, Jan 29, 2014 at 05:05:39PM +0200, Mika Kuoppala wrote:
> As we seek the guilty batch using request and hangcheck
> score, this code is not needed anymore.
> 
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem.c |   91 ++-------------------------------------
>  1 file changed, 4 insertions(+), 87 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index a46a1a7..8637898 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2241,70 +2241,6 @@ i915_gem_request_remove_from_client(struct drm_i915_gem_request *request)
>  	spin_unlock(&file_priv->mm.lock);
>  }
>  
> -static bool i915_head_inside_object(u32 acthd, struct drm_i915_gem_object *obj,
> -				    struct i915_address_space *vm)
> -{
> -	if (acthd >= i915_gem_obj_offset(obj, vm) &&
> -	    acthd < i915_gem_obj_offset(obj, vm) + obj->base.size)
> -		return true;
> -
> -	return false;
> -}
> -
> -static bool i915_head_inside_request(const u32 acthd_unmasked,
> -				     const u32 request_start,
> -				     const u32 request_end)
> -{
> -	const u32 acthd = acthd_unmasked & HEAD_ADDR;
> -
> -	if (request_start < request_end) {
> -		if (acthd >= request_start && acthd < request_end)
> -			return true;
> -	} else if (request_start > request_end) {
> -		if (acthd >= request_start || acthd < request_end)
> -			return true;
> -	}
> -
> -	return false;
> -}
> -
> -static struct i915_address_space *
> -request_to_vm(struct drm_i915_gem_request *request)
> -{
> -	struct drm_i915_private *dev_priv = request->ring->dev->dev_private;
> -	struct i915_address_space *vm;
> -
> -	if (request->ctx)
> -		vm = request->ctx->vm;
> -	else
> -		vm = &dev_priv->gtt.base;
> -
> -	return vm;
> -}
> -
> -static bool i915_request_guilty(struct drm_i915_gem_request *request,
> -				const u32 acthd, bool *inside)
> -{
> -	/* There is a possibility that unmasked head address
> -	 * pointing inside the ring, matches the batch_obj address range.
> -	 * However this is extremely unlikely.
> -	 */
> -	if (request->batch_obj) {
> -		if (i915_head_inside_object(acthd, request->batch_obj,
> -					    request_to_vm(request))) {
> -			*inside = true;
> -			return true;
> -		}
> -	}
> -
> -	if (i915_head_inside_request(acthd, request->head, request->tail)) {
> -		*inside = false;
> -		return true;
> -	}
> -
> -	return false;
> -}
> -
>  static bool i915_context_is_banned(const struct i915_hw_context *ctx)
>  {
>  	struct drm_i915_private *dev_priv;
> @@ -2329,30 +2265,11 @@ static bool i915_context_is_banned(const struct i915_hw_context *ctx)
>  	return false;
>  }
>  
> -static void i915_set_reset_status(struct intel_ring_buffer *ring,
> -				  struct drm_i915_gem_request *request,
> -				  const bool guilty)
> +static void i915_set_reset_status(struct i915_hw_context *ctx,
> +				 const bool guilty)
>  {
> -	const u32 acthd = intel_ring_get_active_head(ring);
> -	bool inside;
> -	unsigned long offset = 0;
> -	struct i915_hw_context *ctx = request->ctx;
>  	struct i915_ctx_hang_stats *hs;
>  
> -	if (request->batch_obj)
> -		offset = i915_gem_obj_offset(request->batch_obj,
> -					     request_to_vm(request));
> -
> -	if (guilty &&
> -	    i915_request_guilty(request, acthd, &inside)) {
> -		DRM_DEBUG("%s hung %s bo (0x%lx ctx %d) at 0x%x\n",
> -			  ring->name,
> -			  inside ? "inside" : "flushing",
> -			  offset,
> -			  ctx ? ctx->id : 0,
> -			  acthd);
> -	}
> -
>  	if (WARN_ON(!ctx))
>  		return;
>  
> @@ -2407,10 +2324,10 @@ static void i915_gem_reset_ring_status(struct drm_i915_private *dev_priv,
>  
>  	ring_hung = ring->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG;
>  
> -	i915_set_reset_status(ring, request, ring_hung);
> +	i915_set_reset_status(request->ctx, ring_hung);
>  
>  	list_for_each_entry_continue(request, &ring->request_list, list)
> -		i915_set_reset_status(ring, request, false);
> +		i915_set_reset_status(request->ctx, false);
>  }
>  
>  static void i915_gem_reset_ring_cleanup(struct drm_i915_private *dev_priv,

I can't say that I'm a huge fan of calling i915_set_reset_status twice
(I bit my lip while reading the last patch). To me it suggests that the
interface probably ended up a bit poorly designed. I can live with it
though, I just couldn't bite my lip for 2 patches in a row :-)

I guess I've missed how this solves the issue I poked about in the
original series. However, the code overall is a big improvement, and
even in the unlikely case that I am not just being blind to your
solution- the odds of having multiple hung rings are slim enough that I
can live with that either way.

I did put some requests in the patches. Each already had an
unconditional r-b. Therefore, the series is:
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>

Last word: As I've discussed with Chris too, I am overall a bit wary of
removing any use upon hardware for doing a lot of these error triage,
detection and collection functions. I really like that no matter how
bonghits our driver gets, we can read certain registers to try to figure
things out. I say this now since I think after this series I will no
longer have a leg to stand on in the, we shouldn't use requests for
error collection, discussion. Thanks for reading my rant.

-- 
Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 4/4] drm/i915: Get rid of acthd based guilty batch search
  2014-01-29 21:44   ` Ben Widawsky
@ 2014-01-30 12:59     ` Chris Wilson
  0 siblings, 0 replies; 14+ messages in thread
From: Chris Wilson @ 2014-01-30 12:59 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: intel-gfx, miku

On Wed, Jan 29, 2014 at 01:44:34PM -0800, Ben Widawsky wrote:
> Last word: As I've discussed with Chris too, I am overall a bit wary of
> removing any use upon hardware for doing a lot of these error triage,
> detection and collection functions. I really like that no matter how
> bonghits our driver gets, we can read certain registers to try to figure
> things out. I say this now since I think after this series I will no
> longer have a leg to stand on in the, we shouldn't use requests for
> error collection, discussion. Thanks for reading my rant.

I know exactly what you mean. But without a complete snapshot of the
hardware and memory (including old contents of overwritten buffers) we
will always be missing something when we come to post-mortem debugging.

I trust tracking activity by seqno though, and in the error states where
that itself has been erroneous the fault has stood out (and had been the
root cause of the hang anyway). Hence my feeling that it is exactly what
I want for port-mortem debugging.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v2 1/4] drm/i915: Use i915_hw_context to set reset stats
  2014-01-29 15:05 ` [PATCH 1/4] drm/i915: Use i915_hw_context to set reset stats Mika Kuoppala
  2014-01-29 20:38   ` Ben Widawsky
@ 2014-01-30 14:01   ` Mika Kuoppala
  2014-01-30 15:26     ` Daniel Vetter
  1 sibling, 1 reply; 14+ messages in thread
From: Mika Kuoppala @ 2014-01-30 14:01 UTC (permalink / raw)
  To: intel-gfx

With full ppgtt support drm_i915_file_private gained knowledge
about the default context. Also reset stats are now inside
i915_hw_context so we can use proper abstraction.

v2: Move BUG_ON and WARN_ON to more proper locations (Ben)

Suggested-by: Ben Widawsky <ben@bwidawsk.net>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
---
 drivers/gpu/drm/i915/i915_gem.c |   44 ++++++++++++++++++++-------------------
 1 file changed, 23 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 39770f7..4b7e6a2 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2305,11 +2305,15 @@ static bool i915_request_guilty(struct drm_i915_gem_request *request,
 	return false;
 }
 
-static bool i915_context_is_banned(const struct i915_ctx_hang_stats *hs)
+static bool i915_context_is_banned(const struct i915_hw_context *ctx)
 {
-	const unsigned long elapsed = get_seconds() - hs->guilty_ts;
+	struct drm_i915_private *dev_priv;
+	unsigned long elapsed;
 
-	if (hs->banned)
+	dev_priv = ctx->last_ring->dev->dev_private;
+	elapsed = get_seconds() - ctx->hang_stats.guilty_ts;
+
+	if (ctx->hang_stats.banned)
 		return true;
 
 	if (elapsed <= DRM_I915_CTX_BAN_PERIOD) {
@@ -2324,9 +2328,13 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring,
 				  struct drm_i915_gem_request *request,
 				  u32 acthd)
 {
-	struct i915_ctx_hang_stats *hs = NULL;
 	bool inside, guilty;
 	unsigned long offset = 0;
+	struct i915_hw_context *ctx = request->ctx;
+	struct i915_ctx_hang_stats *hs;
+
+	if (WARN_ON(!ctx))
+		return;
 
 	/* Innocent until proven guilty */
 	guilty = false;
@@ -2341,28 +2349,22 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring,
 			  ring->name,
 			  inside ? "inside" : "flushing",
 			  offset,
-			  request->ctx ? request->ctx->id : 0,
+			  ctx->id,
 			  acthd);
 
 		guilty = true;
 	}
 
-	/* If contexts are disabled or this is the default context, use
-	 * file_priv->reset_state
-	 */
-	if (request->ctx && request->ctx->id != DEFAULT_CONTEXT_ID)
-		hs = &request->ctx->hang_stats;
-	else if (request->file_priv)
-		hs = &request->file_priv->private_default_ctx->hang_stats;
-
-	if (hs) {
-		if (guilty) {
-			hs->banned = i915_context_is_banned(hs);
-			hs->batch_active++;
-			hs->guilty_ts = get_seconds();
-		} else {
-			hs->batch_pending++;
-		}
+	BUG_ON(!ctx->last_ring);
+
+	hs = &ctx->hang_stats;
+
+	if (guilty) {
+		hs->banned = i915_context_is_banned(ctx);
+		hs->batch_active++;
+		hs->guilty_ts = get_seconds();
+	} else {
+		hs->batch_pending++;
 	}
 }
 
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH v3 2/4] drm/i915: Tune down debug output when context is banned
  2014-01-29 15:05 ` [PATCH 2/4] drm/i915: Tune down debug output when context is banned Mika Kuoppala
  2014-01-29 20:47   ` Ben Widawsky
@ 2014-01-30 14:05   ` Mika Kuoppala
  1 sibling, 0 replies; 14+ messages in thread
From: Mika Kuoppala @ 2014-01-30 14:05 UTC (permalink / raw)
  To: intel-gfx

If we have stopped rings then we know that test is running
so no need for spam. In addition, only spam when default
context gets banned.

v2: - make sure default context ban gets shown (Chris)
    - use helper for checking for default context, everywhere (Chris)

v3: - dont be quiet when debug is set (Ben, Daniel)

Reference: https://bugs.freedesktop.org/show_bug.cgi?id=73652
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
---
 drivers/gpu/drm/i915/i915_drv.h         |    5 +++++
 drivers/gpu/drm/i915/i915_gem.c         |    8 +++++++-
 drivers/gpu/drm/i915/i915_gem_context.c |    9 ++-------
 3 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 3673ba1..b34fd31 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2271,6 +2271,11 @@ static inline void i915_gem_context_unreference(struct i915_hw_context *ctx)
 		kref_put(&ctx->ref, i915_gem_context_free);
 }
 
+static inline bool i915_gem_context_is_default(const struct i915_hw_context *c)
+{
+	return c->id == DEFAULT_CONTEXT_ID;
+}
+
 int i915_gem_context_create_ioctl(struct drm_device *dev, void *data,
 				  struct drm_file *file);
 int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data,
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 4b7e6a2..0d1760f 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2317,7 +2317,13 @@ static bool i915_context_is_banned(const struct i915_hw_context *ctx)
 		return true;
 
 	if (elapsed <= DRM_I915_CTX_BAN_PERIOD) {
-		DRM_ERROR("context hanging too fast, declaring banned!\n");
+		if (dev_priv->gpu_error.stop_rings == 0 &&
+		    i915_gem_context_is_default(ctx)) {
+			DRM_ERROR("gpu hanging too fast, banning!\n");
+		} else {
+			DRM_DEBUG("context hanging too fast, banning!\n");
+		}
+
 		return true;
 	}
 
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 2b0598e..985c1ed 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -228,11 +228,6 @@ err_out:
 	return ERR_PTR(ret);
 }
 
-static inline bool is_default_context(struct i915_hw_context *ctx)
-{
-	return (ctx->id == DEFAULT_CONTEXT_ID);
-}
-
 /**
  * The default context needs to exist per ring that uses contexts. It stores the
  * context state of the GPU for applications that don't utilize HW contexts, as
@@ -478,7 +473,7 @@ static int context_idr_cleanup(int id, void *p, void *data)
 	struct i915_hw_context *ctx = p;
 
 	/* Ignore the default context because close will handle it */
-	if (is_default_context(ctx))
+	if (i915_gem_context_is_default(ctx))
 		return 0;
 
 	i915_gem_context_unreference(ctx);
@@ -656,7 +651,7 @@ static int do_switch(struct intel_ring_buffer *ring,
 		vma->bind_vma(vma, to->obj->cache_level, GLOBAL_BIND);
 	}
 
-	if (!to->is_initialized || is_default_context(to))
+	if (!to->is_initialized || i915_gem_context_is_default(to))
 		hw_flags |= MI_RESTORE_INHIBIT;
 
 	ret = mi_set_context(ring, to, hw_flags);
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v2 1/4] drm/i915: Use i915_hw_context to set reset stats
  2014-01-30 14:01   ` [PATCH v2 " Mika Kuoppala
@ 2014-01-30 15:26     ` Daniel Vetter
  0 siblings, 0 replies; 14+ messages in thread
From: Daniel Vetter @ 2014-01-30 15:26 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx

On Thu, Jan 30, 2014 at 04:01:15PM +0200, Mika Kuoppala wrote:
> With full ppgtt support drm_i915_file_private gained knowledge
> about the default context. Also reset stats are now inside
> i915_hw_context so we can use proper abstraction.
> 
> v2: Move BUG_ON and WARN_ON to more proper locations (Ben)
> 
> Suggested-by: Ben Widawsky <ben@bwidawsk.net>
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> Reviewed-by: Ben Widawsky <ben@bwidawsk.net>

[snip]

> +	BUG_ON(!ctx->last_ring);

I've changed this to a WARN_ON - killing the entire driver over this seems
a bit excessive.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v3 2/4] drm/i915: Tune down debug output when context is banned
       [not found] <id:1391007939-5741-3-git-send-email-mika.kuoppala@intel.com>
@ 2014-01-30 14:03 ` Mika Kuoppala
  0 siblings, 0 replies; 14+ messages in thread
From: Mika Kuoppala @ 2014-01-30 14:03 UTC (permalink / raw)
  To: intel-gfx

If we have stopped rings then we know that test is running
so no need for spam. In addition, only spam when default
context gets banned.

v2: - make sure default context ban gets shown (Chris)
    - use helper for checking for default context, everywhere (Chris)

v3: - dont be quiet when debug is set (Ben, Daniel)

Reference: https://bugs.freedesktop.org/show_bug.cgi?id=73652
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
---
 drivers/gpu/drm/i915/i915_drv.h         |    5 +++++
 drivers/gpu/drm/i915/i915_gem.c         |    8 +++++++-
 drivers/gpu/drm/i915/i915_gem_context.c |    9 ++-------
 3 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 3673ba1..b34fd31 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2271,6 +2271,11 @@ static inline void i915_gem_context_unreference(struct i915_hw_context *ctx)
 		kref_put(&ctx->ref, i915_gem_context_free);
 }
 
+static inline bool i915_gem_context_is_default(const struct i915_hw_context *c)
+{
+	return c->id == DEFAULT_CONTEXT_ID;
+}
+
 int i915_gem_context_create_ioctl(struct drm_device *dev, void *data,
 				  struct drm_file *file);
 int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data,
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 4b7e6a2..0d1760f 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2317,7 +2317,13 @@ static bool i915_context_is_banned(const struct i915_hw_context *ctx)
 		return true;
 
 	if (elapsed <= DRM_I915_CTX_BAN_PERIOD) {
-		DRM_ERROR("context hanging too fast, declaring banned!\n");
+		if (dev_priv->gpu_error.stop_rings == 0 &&
+		    i915_gem_context_is_default(ctx)) {
+			DRM_ERROR("gpu hanging too fast, banning!\n");
+		} else {
+			DRM_DEBUG("context hanging too fast, banning!\n");
+		}
+
 		return true;
 	}
 
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 2b0598e..985c1ed 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -228,11 +228,6 @@ err_out:
 	return ERR_PTR(ret);
 }
 
-static inline bool is_default_context(struct i915_hw_context *ctx)
-{
-	return (ctx->id == DEFAULT_CONTEXT_ID);
-}
-
 /**
  * The default context needs to exist per ring that uses contexts. It stores the
  * context state of the GPU for applications that don't utilize HW contexts, as
@@ -478,7 +473,7 @@ static int context_idr_cleanup(int id, void *p, void *data)
 	struct i915_hw_context *ctx = p;
 
 	/* Ignore the default context because close will handle it */
-	if (is_default_context(ctx))
+	if (i915_gem_context_is_default(ctx))
 		return 0;
 
 	i915_gem_context_unreference(ctx);
@@ -656,7 +651,7 @@ static int do_switch(struct intel_ring_buffer *ring,
 		vma->bind_vma(vma, to->obj->cache_level, GLOBAL_BIND);
 	}
 
-	if (!to->is_initialized || is_default_context(to))
+	if (!to->is_initialized || i915_gem_context_is_default(to))
 		hw_flags |= MI_RESTORE_INHIBIT;
 
 	ret = mi_set_context(ring, to, hw_flags);
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2014-01-30 15:26 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-01-29 15:05 [PATCH 0/4] context reset stats fixes for ppgtt Mika Kuoppala
2014-01-29 15:05 ` [PATCH 1/4] drm/i915: Use i915_hw_context to set reset stats Mika Kuoppala
2014-01-29 20:38   ` Ben Widawsky
2014-01-30 14:01   ` [PATCH v2 " Mika Kuoppala
2014-01-30 15:26     ` Daniel Vetter
2014-01-29 15:05 ` [PATCH 2/4] drm/i915: Tune down debug output when context is banned Mika Kuoppala
2014-01-29 20:47   ` Ben Widawsky
2014-01-29 20:54     ` Daniel Vetter
2014-01-30 14:05   ` [PATCH v3 " Mika Kuoppala
2014-01-29 15:05 ` [PATCH 3/4] drm/i915: Use hangcheck score to find guilty context Mika Kuoppala
2014-01-29 15:05 ` [PATCH 4/4] drm/i915: Get rid of acthd based guilty batch search Mika Kuoppala
2014-01-29 21:44   ` Ben Widawsky
2014-01-30 12:59     ` Chris Wilson
     [not found] <id:1391007939-5741-3-git-send-email-mika.kuoppala@intel.com>
2014-01-30 14:03 ` [PATCH v3 2/4] drm/i915: Tune down debug output when context is banned Mika Kuoppala

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.