All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] drm/i915: Record more information about the hanging contexts
@ 2017-01-29  9:24 Chris Wilson
  2017-01-30 15:24 ` Mika Kuoppala
  2017-01-30 17:24 ` ✗ Fi.CI.BAT: warning for drm/i915: Record more information about the hanging contexts (rev2) Patchwork
  0 siblings, 2 replies; 3+ messages in thread
From: Chris Wilson @ 2017-01-29  9:24 UTC (permalink / raw)
  To: intel-gfx; +Cc: Mika Kuoppala

Include extra information such as the user_handle and hw_id so that
userspace can identify which of their contexts hung, useful if they are
performing self-diagnostics.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h       | 14 +++++--
 drivers/gpu/drm/i915/i915_gpu_error.c | 77 ++++++++++++++++++++++-------------
 2 files changed, 59 insertions(+), 32 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index c1fde816db63..7e7bc4504c94 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -970,6 +970,16 @@ struct drm_i915_error_state {
 		u32 semaphore_mboxes[I915_NUM_ENGINES - 1];
 		struct intel_instdone instdone;
 
+		struct drm_i915_error_context {
+			char comm[TASK_COMM_LEN];
+			int pid;
+			u32 handle;
+			u32 hw_id;
+			int ban_score;
+			int active;
+			int guilty;
+		} context;
+
 		struct drm_i915_error_object {
 			u64 gtt_offset;
 			u64 gtt_size;
@@ -1003,10 +1013,6 @@ struct drm_i915_error_state {
 				u32 pp_dir_base;
 			};
 		} vm_info;
-
-		pid_t pid;
-		char comm[TASK_COMM_LEN];
-		int context_bans;
 	} engine[I915_NUM_ENGINES];
 
 	struct drm_i915_error_buffer {
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index e5375323eb06..5283fe815a4d 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -384,6 +384,15 @@ static void error_print_request(struct drm_i915_error_state_buf *m,
 		   erq->head, erq->tail);
 }
 
+static void error_print_context(struct drm_i915_error_state_buf *m,
+				const char *header,
+				struct drm_i915_error_context *ctx)
+{
+	err_printf(m, "%s%s[%d] user_handle %d hw_id %d, ban score %d guilty %d active %d\n",
+		   header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id,
+		   ctx->ban_score, ctx->guilty, ctx->active);
+}
+
 static void error_print_engine(struct drm_i915_error_state_buf *m,
 			       struct drm_i915_error_engine *ee)
 {
@@ -457,6 +466,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
 
 	error_print_request(m, "  ELSP[0]: ", &ee->execlist[0]);
 	error_print_request(m, "  ELSP[1]: ", &ee->execlist[1]);
+	error_print_context(m, "  Active context: ", &ee->context);
 }
 
 void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
@@ -562,12 +572,12 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 
 	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
 		if (error->engine[i].hangcheck_stalled &&
-		    error->engine[i].pid != -1) {
-			err_printf(m, "Active process (on ring %s): %s [%d], context bans %d\n",
+		    error->engine[i].context.pid) {
+			err_printf(m, "Active process (on ring %s): %s [%d], score %d\n",
 				   engine_str(i),
-				   error->engine[i].comm,
-				   error->engine[i].pid,
-				   error->engine[i].context_bans);
+				   error->engine[i].context.comm,
+				   error->engine[i].context.pid,
+				   error->engine[i].context.ban_score);
 		}
 	}
 	err_printf(m, "Reset count: %u\n", error->reset_count);
@@ -658,11 +668,13 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 		obj = ee->batchbuffer;
 		if (obj) {
 			err_puts(m, dev_priv->engine[i]->name);
-			if (ee->pid != -1)
-				err_printf(m, " (submitted by %s [%d], bans %d)",
-					   ee->comm,
-					   ee->pid,
-					   ee->context_bans);
+			if (ee->context.pid)
+				err_printf(m, " (submitted by %s [%d], ctx %d [%d], score %d)",
+					   ee->context.comm,
+					   ee->context.pid,
+					   ee->context.handle,
+					   ee->context.hw_id,
+					   ee->context.ban_score);
 			err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
 				   upper_32_bits(obj->gtt_offset),
 				   lower_32_bits(obj->gtt_offset));
@@ -1267,6 +1279,28 @@ static void error_record_engine_execlists(struct intel_engine_cs *engine,
 				       &ee->execlist[n]);
 }
 
+static void record_context(struct drm_i915_error_context *e,
+			   struct i915_gem_context *ctx)
+{
+	if (ctx->pid) {
+		struct task_struct *task;
+
+		rcu_read_lock();
+		task = pid_task(ctx->pid, PIDTYPE_PID);
+		if (task) {
+			strcpy(e->comm, task->comm);
+			e->pid = task->pid;
+		}
+		rcu_read_unlock();
+	}
+
+	e->handle = ctx->user_handle;
+	e->hw_id = ctx->hw_id;
+	e->ban_score = ctx->ban_score;
+	e->guilty = ctx->guilty_count;
+	e->active = ctx->active_count;
+}
+
 static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
 				  struct drm_i915_error_state *error)
 {
@@ -1281,7 +1315,6 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
 		struct drm_i915_error_engine *ee = &error->engine[i];
 		struct drm_i915_gem_request *request;
 
-		ee->pid = -1;
 		ee->engine_id = -1;
 
 		if (!engine)
@@ -1296,11 +1329,12 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
 		request = i915_gem_find_active_request(engine);
 		if (request) {
 			struct intel_ring *ring;
-			struct pid *pid;
 
 			ee->vm = request->ctx->ppgtt ?
 				&request->ctx->ppgtt->base : &ggtt->base;
 
+			record_context(&ee->context, request->ctx);
+
 			/* We need to copy these to an anonymous buffer
 			 * as the simplest method to avoid being overwritten
 			 * by userspace.
@@ -1318,19 +1352,6 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
 				i915_error_object_create(dev_priv,
 							 request->ctx->engine[i].state);
 
-			pid = request->ctx->pid;
-			if (pid) {
-				struct task_struct *task;
-
-				rcu_read_lock();
-				task = pid_task(pid, PIDTYPE_PID);
-				if (task) {
-					strcpy(ee->comm, task->comm);
-					ee->pid = task->pid;
-				}
-				rcu_read_unlock();
-			}
-
 			error->simulated |=
 				i915_gem_context_no_error_capture(request->ctx);
 
@@ -1534,12 +1555,12 @@ static void i915_error_capture_msg(struct drm_i915_private *dev_priv,
 			"GPU HANG: ecode %d:%d:0x%08x",
 			INTEL_GEN(dev_priv), engine_id, ecode);
 
-	if (engine_id != -1 && error->engine[engine_id].pid != -1)
+	if (engine_id != -1 && error->engine[engine_id].context.pid)
 		len += scnprintf(error->error_msg + len,
 				 sizeof(error->error_msg) - len,
 				 ", in %s [%d]",
-				 error->engine[engine_id].comm,
-				 error->engine[engine_id].pid);
+				 error->engine[engine_id].context.comm,
+				 error->engine[engine_id].context.pid);
 
 	scnprintf(error->error_msg + len, sizeof(error->error_msg) - len,
 		  ", reason: %s, action: %s",
-- 
2.11.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH v2] drm/i915: Record more information about the hanging contexts
  2017-01-29  9:24 [PATCH v2] drm/i915: Record more information about the hanging contexts Chris Wilson
@ 2017-01-30 15:24 ` Mika Kuoppala
  2017-01-30 17:24 ` ✗ Fi.CI.BAT: warning for drm/i915: Record more information about the hanging contexts (rev2) Patchwork
  1 sibling, 0 replies; 3+ messages in thread
From: Mika Kuoppala @ 2017-01-30 15:24 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Include extra information such as the user_handle and hw_id so that
> userspace can identify which of their contexts hung, useful if they are
> performing self-diagnostics.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h       | 14 +++++--
>  drivers/gpu/drm/i915/i915_gpu_error.c | 77 ++++++++++++++++++++++-------------
>  2 files changed, 59 insertions(+), 32 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index c1fde816db63..7e7bc4504c94 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -970,6 +970,16 @@ struct drm_i915_error_state {
>  		u32 semaphore_mboxes[I915_NUM_ENGINES - 1];
>  		struct intel_instdone instdone;
>  
> +		struct drm_i915_error_context {
> +			char comm[TASK_COMM_LEN];
> +			int pid;

s/int/pid_t

Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>

> +			u32 handle;
> +			u32 hw_id;
> +			int ban_score;
> +			int active;
> +			int guilty;
> +		} context;
> +
>  		struct drm_i915_error_object {
>  			u64 gtt_offset;
>  			u64 gtt_size;
> @@ -1003,10 +1013,6 @@ struct drm_i915_error_state {
>  				u32 pp_dir_base;
>  			};
>  		} vm_info;
> -
> -		pid_t pid;
> -		char comm[TASK_COMM_LEN];
> -		int context_bans;
>  	} engine[I915_NUM_ENGINES];
>  
>  	struct drm_i915_error_buffer {
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index e5375323eb06..5283fe815a4d 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -384,6 +384,15 @@ static void error_print_request(struct drm_i915_error_state_buf *m,
>  		   erq->head, erq->tail);
>  }
>  
> +static void error_print_context(struct drm_i915_error_state_buf *m,
> +				const char *header,
> +				struct drm_i915_error_context *ctx)
> +{
> +	err_printf(m, "%s%s[%d] user_handle %d hw_id %d, ban score %d guilty %d active %d\n",
> +		   header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id,
> +		   ctx->ban_score, ctx->guilty, ctx->active);
> +}
> +
>  static void error_print_engine(struct drm_i915_error_state_buf *m,
>  			       struct drm_i915_error_engine *ee)
>  {
> @@ -457,6 +466,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
>  
>  	error_print_request(m, "  ELSP[0]: ", &ee->execlist[0]);
>  	error_print_request(m, "  ELSP[1]: ", &ee->execlist[1]);
> +	error_print_context(m, "  Active context: ", &ee->context);
>  }
>  
>  void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
> @@ -562,12 +572,12 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
>  
>  	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
>  		if (error->engine[i].hangcheck_stalled &&
> -		    error->engine[i].pid != -1) {
> -			err_printf(m, "Active process (on ring %s): %s [%d], context bans %d\n",
> +		    error->engine[i].context.pid) {
> +			err_printf(m, "Active process (on ring %s): %s [%d], score %d\n",
>  				   engine_str(i),
> -				   error->engine[i].comm,
> -				   error->engine[i].pid,
> -				   error->engine[i].context_bans);
> +				   error->engine[i].context.comm,
> +				   error->engine[i].context.pid,
> +				   error->engine[i].context.ban_score);
>  		}
>  	}
>  	err_printf(m, "Reset count: %u\n", error->reset_count);
> @@ -658,11 +668,13 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
>  		obj = ee->batchbuffer;
>  		if (obj) {
>  			err_puts(m, dev_priv->engine[i]->name);
> -			if (ee->pid != -1)
> -				err_printf(m, " (submitted by %s [%d], bans %d)",
> -					   ee->comm,
> -					   ee->pid,
> -					   ee->context_bans);
> +			if (ee->context.pid)
> +				err_printf(m, " (submitted by %s [%d], ctx %d [%d], score %d)",
> +					   ee->context.comm,
> +					   ee->context.pid,
> +					   ee->context.handle,
> +					   ee->context.hw_id,
> +					   ee->context.ban_score);
>  			err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
>  				   upper_32_bits(obj->gtt_offset),
>  				   lower_32_bits(obj->gtt_offset));
> @@ -1267,6 +1279,28 @@ static void error_record_engine_execlists(struct intel_engine_cs *engine,
>  				       &ee->execlist[n]);
>  }
>  
> +static void record_context(struct drm_i915_error_context *e,
> +			   struct i915_gem_context *ctx)
> +{
> +	if (ctx->pid) {
> +		struct task_struct *task;
> +
> +		rcu_read_lock();
> +		task = pid_task(ctx->pid, PIDTYPE_PID);
> +		if (task) {
> +			strcpy(e->comm, task->comm);
> +			e->pid = task->pid;
> +		}
> +		rcu_read_unlock();
> +	}
> +
> +	e->handle = ctx->user_handle;
> +	e->hw_id = ctx->hw_id;
> +	e->ban_score = ctx->ban_score;
> +	e->guilty = ctx->guilty_count;
> +	e->active = ctx->active_count;
> +}
> +
>  static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
>  				  struct drm_i915_error_state *error)
>  {
> @@ -1281,7 +1315,6 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
>  		struct drm_i915_error_engine *ee = &error->engine[i];
>  		struct drm_i915_gem_request *request;
>  
> -		ee->pid = -1;
>  		ee->engine_id = -1;
>  
>  		if (!engine)
> @@ -1296,11 +1329,12 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
>  		request = i915_gem_find_active_request(engine);
>  		if (request) {
>  			struct intel_ring *ring;
> -			struct pid *pid;
>  
>  			ee->vm = request->ctx->ppgtt ?
>  				&request->ctx->ppgtt->base : &ggtt->base;
>  
> +			record_context(&ee->context, request->ctx);
> +
>  			/* We need to copy these to an anonymous buffer
>  			 * as the simplest method to avoid being overwritten
>  			 * by userspace.
> @@ -1318,19 +1352,6 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
>  				i915_error_object_create(dev_priv,
>  							 request->ctx->engine[i].state);
>  
> -			pid = request->ctx->pid;
> -			if (pid) {
> -				struct task_struct *task;
> -
> -				rcu_read_lock();
> -				task = pid_task(pid, PIDTYPE_PID);
> -				if (task) {
> -					strcpy(ee->comm, task->comm);
> -					ee->pid = task->pid;
> -				}
> -				rcu_read_unlock();
> -			}
> -
>  			error->simulated |=
>  				i915_gem_context_no_error_capture(request->ctx);
>  
> @@ -1534,12 +1555,12 @@ static void i915_error_capture_msg(struct drm_i915_private *dev_priv,
>  			"GPU HANG: ecode %d:%d:0x%08x",
>  			INTEL_GEN(dev_priv), engine_id, ecode);
>  
> -	if (engine_id != -1 && error->engine[engine_id].pid != -1)
> +	if (engine_id != -1 && error->engine[engine_id].context.pid)
>  		len += scnprintf(error->error_msg + len,
>  				 sizeof(error->error_msg) - len,
>  				 ", in %s [%d]",
> -				 error->engine[engine_id].comm,
> -				 error->engine[engine_id].pid);
> +				 error->engine[engine_id].context.comm,
> +				 error->engine[engine_id].context.pid);
>  
>  	scnprintf(error->error_msg + len, sizeof(error->error_msg) - len,
>  		  ", reason: %s, action: %s",
> -- 
> 2.11.0
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 3+ messages in thread

* ✗ Fi.CI.BAT: warning for drm/i915: Record more information about the hanging contexts (rev2)
  2017-01-29  9:24 [PATCH v2] drm/i915: Record more information about the hanging contexts Chris Wilson
  2017-01-30 15:24 ` Mika Kuoppala
@ 2017-01-30 17:24 ` Patchwork
  1 sibling, 0 replies; 3+ messages in thread
From: Patchwork @ 2017-01-30 17:24 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: drm/i915: Record more information about the hanging contexts (rev2)
URL   : https://patchwork.freedesktop.org/series/18662/
State : warning

== Summary ==

Series 18662v2 drm/i915: Record more information about the hanging contexts
https://patchwork.freedesktop.org/api/1.0/series/18662/revisions/2/mbox/

Test kms_force_connector_basic:
        Subgroup force-connector-state:
                pass       -> DMESG-WARN (fi-snb-2520m)

fi-bdw-5557u     total:246  pass:232  dwarn:0   dfail:0   fail:0   skip:14 
fi-bsw-n3050     total:246  pass:207  dwarn:0   dfail:0   fail:0   skip:39 
fi-bxt-j4205     total:246  pass:224  dwarn:0   dfail:0   fail:0   skip:22 
fi-bxt-t5700     total:78   pass:65   dwarn:0   dfail:0   fail:0   skip:12 
fi-byt-j1900     total:246  pass:219  dwarn:0   dfail:0   fail:0   skip:27 
fi-byt-n2820     total:246  pass:215  dwarn:0   dfail:0   fail:0   skip:31 
fi-hsw-4770      total:246  pass:227  dwarn:0   dfail:0   fail:0   skip:19 
fi-hsw-4770r     total:246  pass:227  dwarn:0   dfail:0   fail:0   skip:19 
fi-ivb-3520m     total:246  pass:225  dwarn:0   dfail:0   fail:0   skip:21 
fi-ivb-3770      total:246  pass:225  dwarn:0   dfail:0   fail:0   skip:21 
fi-kbl-7500u     total:246  pass:223  dwarn:0   dfail:0   fail:2   skip:21 
fi-skl-6260u     total:246  pass:233  dwarn:0   dfail:0   fail:0   skip:13 
fi-skl-6700hq    total:246  pass:226  dwarn:0   dfail:0   fail:0   skip:20 
fi-skl-6700k     total:246  pass:221  dwarn:4   dfail:0   fail:0   skip:21 
fi-skl-6770hq    total:246  pass:233  dwarn:0   dfail:0   fail:0   skip:13 
fi-snb-2520m     total:246  pass:214  dwarn:1   dfail:0   fail:0   skip:31 
fi-snb-2600      total:246  pass:214  dwarn:0   dfail:0   fail:0   skip:32 

e1cc28133f0474a85da1fc4017686c701b564312 drm-tip: 2017y-01m-30d-15h-50m-59s UTC integration manifest
1e19d96 drm/i915: Record more information about the hanging contexts

== Logs ==

For more details see: https://intel-gfx-ci.01.org/CI/Patchwork_3640/
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2017-01-30 17:24 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-01-29  9:24 [PATCH v2] drm/i915: Record more information about the hanging contexts Chris Wilson
2017-01-30 15:24 ` Mika Kuoppala
2017-01-30 17:24 ` ✗ Fi.CI.BAT: warning for drm/i915: Record more information about the hanging contexts (rev2) Patchwork

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.