All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/1] drm/i915: Copy user requested buffers into the error state
@ 2017-01-24 11:16 Mika Kuoppala
  2017-01-24 12:54 ` ✓ Fi.CI.BAT: success for series starting with [1/1] " Patchwork
  2017-01-24 19:44 ` [PATCH 1/1] " Ben Widawsky
  0 siblings, 2 replies; 5+ messages in thread
From: Mika Kuoppala @ 2017-01-24 11:16 UTC (permalink / raw)
  To: intel-gfx; +Cc: Ben Widawsky

From: Chris Wilson <chris@chris-wilson.co.uk>

Introduce a new execobject.flag (EXEC_OBJECT_CAPTURE) that userspace may
use to indicate that it wants the contents of this buffer preserved in
the error state (/sys/class/drm/cardN/error) following a GPU hang
involving this batch.

Use this at your discretion: the contents of the error state, although
compressed, are allocated with GFP_ATOMIC (i.e. limited) and kept for all
eternity (until the error state is destroyed).

v2: rebased from Chris's tree on top of drm-tip (Mika)

Based on an earlier patch by Ben Widawsky <ben@bwidawsk.net>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> (v1)
Cc: Ben Widawsky <ben@bwidawsk.net>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Abdiel Janulgue <abdiel.janulgue@linux.intel.com>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c            |  1 +
 drivers/gpu/drm/i915/i915_drv.h            |  3 +++
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 12 +++++++++
 drivers/gpu/drm/i915/i915_gem_request.c    | 16 ++++++++++++
 drivers/gpu/drm/i915/i915_gem_request.h    | 11 ++++++++
 drivers/gpu/drm/i915/i915_gpu_error.c      | 40 +++++++++++++++++++++++++++++-
 include/uapi/drm/i915_drm.h                |  9 ++++++-
 7 files changed, 90 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index ca168b2..3207b70 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -349,6 +349,7 @@ static int i915_getparam(struct drm_device *dev, void *data,
 	case I915_PARAM_HAS_EXEC_HANDLE_LUT:
 	case I915_PARAM_HAS_COHERENT_PHYS_GTT:
 	case I915_PARAM_HAS_EXEC_SOFTPIN:
+	case I915_PARAM_HAS_EXEC_CAPTURE:
 		/* For the time being all of these are always true;
 		 * if some supported hardware does not have one of these
 		 * features this value needs to be provided from
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 2446280..1300c30b 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -978,6 +978,9 @@ struct drm_i915_error_state {
 			u32 *pages[0];
 		} *ringbuffer, *batchbuffer, *wa_batchbuffer, *ctx, *hws_page;
 
+		struct drm_i915_error_object **user_bo;
+		long user_bo_count;
+
 		struct drm_i915_error_object *wa_ctx;
 
 		struct drm_i915_error_request {
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index c66e905..6a1a2c2 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1111,6 +1111,18 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
 	list_for_each_entry(vma, vmas, exec_list) {
 		struct drm_i915_gem_object *obj = vma->obj;
 
+		if (vma->exec_entry->flags & EXEC_OBJECT_CAPTURE) {
+			struct i915_gem_capture_list *capture;
+
+			capture = kmalloc(sizeof(*capture), GFP_KERNEL);
+			if (unlikely(!capture))
+				return -ENOMEM;
+
+			capture->next = req->capture_list;
+			capture->vma = vma;
+			req->capture_list = capture;
+		}
+
 		ret = i915_gem_request_await_object
 			(req, obj, obj->base.pending_write_domain);
 		if (ret)
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 72b7f7d..acd09d7 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -204,6 +204,19 @@ void i915_gem_retire_noop(struct i915_gem_active *active,
 	/* Space left intentionally blank */
 }
 
+static void request_free_capture_list(struct drm_i915_gem_request *request)
+{
+	struct i915_gem_capture_list *capture;
+
+	capture = request->capture_list;
+	while (capture) {
+		struct i915_gem_capture_list *next = capture->next;
+
+		kfree(capture);
+		capture = next;
+	}
+}
+
 static void i915_gem_request_retire(struct drm_i915_gem_request *request)
 {
 	struct intel_engine_cs *engine = request->engine;
@@ -238,6 +251,8 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
 				 msecs_to_jiffies(100));
 	}
 
+	request_free_capture_list(request);
+
 	/* Walk through the active list, calling retire on each. This allows
 	 * objects to track their GPU activity and mark themselves as idle
 	 * when their *last* active request is completed (updating state
@@ -589,6 +604,7 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 	req->global_seqno = 0;
 	req->file_priv = NULL;
 	req->batch = NULL;
+	req->capture_list = NULL;
 
 	/*
 	 * Reserve space in the ring buffer for all the commands required to
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index ea511f0..b380b7e 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -71,6 +71,11 @@ struct i915_priotree {
 #define I915_PRIORITY_MIN (-I915_PRIORITY_MAX)
 };
 
+struct i915_gem_capture_list {
+	struct i915_gem_capture_list *next;
+	struct i915_vma *vma;
+};
+
 /**
  * Request queue structure.
  *
@@ -174,6 +179,12 @@ struct drm_i915_gem_request {
 	 * error state dump only).
 	 */
 	struct i915_vma *batch;
+	/** Additional buffers requested by userspace to be captured upon
+	 * a GPU hang. The vma/obj on this list are protected by their
+	 * active reference - all objects on this list must also be
+	 * on the active_list (of their final request).
+	 */
+	struct i915_gem_capture_list *capture_list;
 	struct list_head active_list;
 
 	/** Time at which this request was emitted, in jiffies. */
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index e537532..5c8531b 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -669,6 +669,10 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 			print_error_obj(m, dev_priv->engine[i], NULL, obj);
 		}
 
+		for (j = 0; j < ee->user_bo_count; j++)
+			print_error_obj(m, dev_priv->engine[i],
+					"user", ee->user_bo[j]);
+
 		if (ee->num_requests) {
 			err_printf(m, "%s --- %d requests\n",
 				   dev_priv->engine[i]->name,
@@ -774,11 +778,15 @@ static void i915_error_state_free(struct kref *error_ref)
 {
 	struct drm_i915_error_state *error = container_of(error_ref,
 							  typeof(*error), ref);
-	int i;
+	long i, j;
 
 	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
 		struct drm_i915_error_engine *ee = &error->engine[i];
 
+		for (j = 0; j < ee->user_bo_count; j++)
+			i915_error_object_free(ee->user_bo[j]);
+		kfree(ee->user_bo);
+
 		i915_error_object_free(ee->batchbuffer);
 		i915_error_object_free(ee->wa_batchbuffer);
 		i915_error_object_free(ee->ringbuffer);
@@ -1267,6 +1275,35 @@ static void error_record_engine_execlists(struct intel_engine_cs *engine,
 				       &ee->execlist[n]);
 }
 
+static void request_record_user_bo(struct drm_i915_gem_request *request,
+				   struct drm_i915_error_engine *ee)
+{
+	struct i915_gem_capture_list *c;
+	struct drm_i915_error_object **bo;
+	long count;
+
+	count = 0;
+	for (c = request->capture_list; c; c = c->next)
+		count++;
+
+	bo = NULL;
+	if (count)
+		bo = kcalloc(count, sizeof(*bo), GFP_ATOMIC);
+	if (!bo)
+		return;
+
+	count = 0;
+	for (c = request->capture_list; c; c = c->next) {
+		bo[count] = i915_error_object_create(request->i915, c->vma);
+		if (!bo[count])
+			break;
+		count++;
+	}
+
+	ee->user_bo = bo;
+	ee->user_bo_count = count;
+}
+
 static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
 				  struct drm_i915_error_state *error)
 {
@@ -1313,6 +1350,7 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
 				ee->wa_batchbuffer =
 					i915_error_object_create(dev_priv,
 								 engine->scratch);
+			request_record_user_bo(request, ee);
 
 			ee->ctx =
 				i915_error_object_create(dev_priv,
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 57093b4..a16e322 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -397,6 +397,12 @@ typedef struct drm_i915_irq_wait {
 #define I915_PARAM_HAS_SCHEDULER	 41
 #define I915_PARAM_HUC_STATUS		 42
 
+/* Query whether DRM_I915_GEM_EXECBUFFER2 supports the ability to capture
+ * user specified buffers for post-mortem debugging of GPU hangs. See
+ * EXEC_OBJECT_CAPTURE.
+ */
+#define I915_PARAM_HAS_EXEC_CAPTURE	 43
+
 typedef struct drm_i915_getparam {
 	__s32 param;
 	/*
@@ -737,8 +743,9 @@ struct drm_i915_gem_exec_object2 {
 #define EXEC_OBJECT_SUPPORTS_48B_ADDRESS (1<<3)
 #define EXEC_OBJECT_PINNED		 (1<<4)
 #define EXEC_OBJECT_PAD_TO_SIZE		 (1<<5)
+#define EXEC_OBJECT_CAPTURE		 (1<<6)
 /* All remaining bits are MBZ and RESERVED FOR FUTURE USE */
-#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_PAD_TO_SIZE<<1)
+#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_CAPTURE<<1)
 	__u64 flags;
 
 	union {
-- 
2.7.4

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* ✓ Fi.CI.BAT: success for series starting with [1/1] drm/i915: Copy user requested buffers into the error state
  2017-01-24 11:16 [PATCH 1/1] drm/i915: Copy user requested buffers into the error state Mika Kuoppala
@ 2017-01-24 12:54 ` Patchwork
  2017-01-24 19:44 ` [PATCH 1/1] " Ben Widawsky
  1 sibling, 0 replies; 5+ messages in thread
From: Patchwork @ 2017-01-24 12:54 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/1] drm/i915: Copy user requested buffers into the error state
URL   : https://patchwork.freedesktop.org/series/18476/
State : success

== Summary ==

Series 18476v1 Series without cover letter
https://patchwork.freedesktop.org/api/1.0/series/18476/revisions/1/mbox/


fi-bdw-5557u     total:246  pass:232  dwarn:0   dfail:0   fail:0   skip:14 
fi-bsw-n3050     total:246  pass:207  dwarn:0   dfail:0   fail:0   skip:39 
fi-bxt-j4205     total:246  pass:224  dwarn:0   dfail:0   fail:0   skip:22 
fi-bxt-t5700     total:79   pass:66   dwarn:0   dfail:0   fail:0   skip:12 
fi-byt-j1900     total:246  pass:219  dwarn:0   dfail:0   fail:0   skip:27 
fi-byt-n2820     total:246  pass:215  dwarn:0   dfail:0   fail:0   skip:31 
fi-hsw-4770      total:246  pass:227  dwarn:0   dfail:0   fail:0   skip:19 
fi-hsw-4770r     total:246  pass:227  dwarn:0   dfail:0   fail:0   skip:19 
fi-ivb-3520m     total:246  pass:225  dwarn:0   dfail:0   fail:0   skip:21 
fi-ivb-3770      total:246  pass:225  dwarn:0   dfail:0   fail:0   skip:21 
fi-kbl-7500u     total:246  pass:225  dwarn:0   dfail:0   fail:0   skip:21 
fi-skl-6260u     total:246  pass:233  dwarn:0   dfail:0   fail:0   skip:13 
fi-skl-6700hq    total:246  pass:226  dwarn:0   dfail:0   fail:0   skip:20 
fi-skl-6700k     total:246  pass:222  dwarn:3   dfail:0   fail:0   skip:21 
fi-skl-6770hq    total:246  pass:233  dwarn:0   dfail:0   fail:0   skip:13 
fi-snb-2520m     total:246  pass:215  dwarn:0   dfail:0   fail:0   skip:31 
fi-snb-2600      total:246  pass:214  dwarn:0   dfail:0   fail:0   skip:32 

64fc20ef2f4bf8a6b563a812485fc6ac86637fcd drm-tip: 2017y-01m-24d-10h-02m-05s UTC integration manifest
63f9a6e drm/i915: Copy user requested buffers into the error state

== Logs ==

For more details see: https://intel-gfx-ci.01.org/CI/Patchwork_3591/
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 1/1] drm/i915: Copy user requested buffers into the error state
  2017-01-24 11:16 [PATCH 1/1] drm/i915: Copy user requested buffers into the error state Mika Kuoppala
  2017-01-24 12:54 ` ✓ Fi.CI.BAT: success for series starting with [1/1] " Patchwork
@ 2017-01-24 19:44 ` Ben Widawsky
  2017-01-24 21:17   ` Chris Wilson
  2017-01-24 21:32   ` Chris Wilson
  1 sibling, 2 replies; 5+ messages in thread
From: Ben Widawsky @ 2017-01-24 19:44 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx

On 17-01-24 13:16:56, Mika Kuoppala wrote:
>From: Chris Wilson <chris@chris-wilson.co.uk>
>
>Introduce a new execobject.flag (EXEC_OBJECT_CAPTURE) that userspace may
>use to indicate that it wants the contents of this buffer preserved in
>the error state (/sys/class/drm/cardN/error) following a GPU hang
>involving this batch.
>
>Use this at your discretion: the contents of the error state, although
>compressed, are allocated with GFP_ATOMIC (i.e. limited) and kept for all
>eternity (until the error state is destroyed).
>
>v2: rebased from Chris's tree on top of drm-tip (Mika)
>
>Based on an earlier patch by Ben Widawsky <ben@bwidawsk.net>
>Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> (v1)
>Cc: Ben Widawsky <ben@bwidawsk.net>
>Cc: Matt Turner <mattst88@gmail.com>
>Cc: Abdiel Janulgue <abdiel.janulgue@linux.intel.com>
>Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
>---
> drivers/gpu/drm/i915/i915_drv.c            |  1 +
> drivers/gpu/drm/i915/i915_drv.h            |  3 +++
> drivers/gpu/drm/i915/i915_gem_execbuffer.c | 12 +++++++++
> drivers/gpu/drm/i915/i915_gem_request.c    | 16 ++++++++++++
> drivers/gpu/drm/i915/i915_gem_request.h    | 11 ++++++++
> drivers/gpu/drm/i915/i915_gpu_error.c      | 40 +++++++++++++++++++++++++++++-
> include/uapi/drm/i915_drm.h                |  9 ++++++-
> 7 files changed, 90 insertions(+), 2 deletions(-)
>
>diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
>index ca168b2..3207b70 100644
>--- a/drivers/gpu/drm/i915/i915_drv.c
>+++ b/drivers/gpu/drm/i915/i915_drv.c
>@@ -349,6 +349,7 @@ static int i915_getparam(struct drm_device *dev, void *data,
> 	case I915_PARAM_HAS_EXEC_HANDLE_LUT:
> 	case I915_PARAM_HAS_COHERENT_PHYS_GTT:
> 	case I915_PARAM_HAS_EXEC_SOFTPIN:
>+	case I915_PARAM_HAS_EXEC_CAPTURE:
> 		/* For the time being all of these are always true;
> 		 * if some supported hardware does not have one of these
> 		 * features this value needs to be provided from
>diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>index 2446280..1300c30b 100644
>--- a/drivers/gpu/drm/i915/i915_drv.h
>+++ b/drivers/gpu/drm/i915/i915_drv.h
>@@ -978,6 +978,9 @@ struct drm_i915_error_state {
> 			u32 *pages[0];
> 		} *ringbuffer, *batchbuffer, *wa_batchbuffer, *ctx, *hws_page;
>
>+		struct drm_i915_error_object **user_bo;
>+		long user_bo_count;
>+
> 		struct drm_i915_error_object *wa_ctx;
>
> 		struct drm_i915_error_request {
>diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
>index c66e905..6a1a2c2 100644
>--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
>+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
>@@ -1111,6 +1111,18 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
> 	list_for_each_entry(vma, vmas, exec_list) {
> 		struct drm_i915_gem_object *obj = vma->obj;
>
>+		if (vma->exec_entry->flags & EXEC_OBJECT_CAPTURE) {
>+			struct i915_gem_capture_list *capture;
>+
>+			capture = kmalloc(sizeof(*capture), GFP_KERNEL);
>+			if (unlikely(!capture))
>+				return -ENOMEM;
>+
>+			capture->next = req->capture_list;
>+			capture->vma = vma;
>+			req->capture_list = capture;
>+		}
>+
> 		ret = i915_gem_request_await_object
> 			(req, obj, obj->base.pending_write_domain);
> 		if (ret)
>diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
>index 72b7f7d..acd09d7 100644
>--- a/drivers/gpu/drm/i915/i915_gem_request.c
>+++ b/drivers/gpu/drm/i915/i915_gem_request.c
>@@ -204,6 +204,19 @@ void i915_gem_retire_noop(struct i915_gem_active *active,
> 	/* Space left intentionally blank */
> }
>
>+static void request_free_capture_list(struct drm_i915_gem_request *request)
>+{
>+	struct i915_gem_capture_list *capture;
>+
>+	capture = request->capture_list;
>+	while (capture) {
>+		struct i915_gem_capture_list *next = capture->next;
>+
>+		kfree(capture);
>+		capture = next;
>+	}
>+}
>+
> static void i915_gem_request_retire(struct drm_i915_gem_request *request)
> {
> 	struct intel_engine_cs *engine = request->engine;
>@@ -238,6 +251,8 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
> 				 msecs_to_jiffies(100));
> 	}
>
>+	request_free_capture_list(request);
>+
> 	/* Walk through the active list, calling retire on each. This allows
> 	 * objects to track their GPU activity and mark themselves as idle
> 	 * when their *last* active request is completed (updating state
>@@ -589,6 +604,7 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
> 	req->global_seqno = 0;
> 	req->file_priv = NULL;
> 	req->batch = NULL;
>+	req->capture_list = NULL;
>
> 	/*
> 	 * Reserve space in the ring buffer for all the commands required to
>diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
>index ea511f0..b380b7e 100644
>--- a/drivers/gpu/drm/i915/i915_gem_request.h
>+++ b/drivers/gpu/drm/i915/i915_gem_request.h
>@@ -71,6 +71,11 @@ struct i915_priotree {
> #define I915_PRIORITY_MIN (-I915_PRIORITY_MAX)
> };
>
>+struct i915_gem_capture_list {
>+	struct i915_gem_capture_list *next;
>+	struct i915_vma *vma;
>+};
>+
> /**
>  * Request queue structure.
>  *
>@@ -174,6 +179,12 @@ struct drm_i915_gem_request {
> 	 * error state dump only).
> 	 */
> 	struct i915_vma *batch;
>+	/** Additional buffers requested by userspace to be captured upon
>+	 * a GPU hang. The vma/obj on this list are protected by their
>+	 * active reference - all objects on this list must also be
>+	 * on the active_list (of their final request).
>+	 */
>+	struct i915_gem_capture_list *capture_list;
> 	struct list_head active_list;
>
> 	/** Time at which this request was emitted, in jiffies. */
>diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
>index e537532..5c8531b 100644
>--- a/drivers/gpu/drm/i915/i915_gpu_error.c
>+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
>@@ -669,6 +669,10 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
> 			print_error_obj(m, dev_priv->engine[i], NULL, obj);
> 		}
>
>+		for (j = 0; j < ee->user_bo_count; j++)
>+			print_error_obj(m, dev_priv->engine[i],
>+					"user", ee->user_bo[j]);
>+

We'll need a way to be able to figure out which bo corresponds to what we asked
to capture. I'm not quite sure what the best way to do this is, but I'd like
userspace to have to make minimal effort to figure it out.

> 		if (ee->num_requests) {
> 			err_printf(m, "%s --- %d requests\n",
> 				   dev_priv->engine[i]->name,
>@@ -774,11 +778,15 @@ static void i915_error_state_free(struct kref *error_ref)
> {
> 	struct drm_i915_error_state *error = container_of(error_ref,
> 							  typeof(*error), ref);
>-	int i;
>+	long i, j;
>
> 	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
> 		struct drm_i915_error_engine *ee = &error->engine[i];
>
>+		for (j = 0; j < ee->user_bo_count; j++)
>+			i915_error_object_free(ee->user_bo[j]);
>+		kfree(ee->user_bo);
>+
> 		i915_error_object_free(ee->batchbuffer);
> 		i915_error_object_free(ee->wa_batchbuffer);
> 		i915_error_object_free(ee->ringbuffer);
>@@ -1267,6 +1275,35 @@ static void error_record_engine_execlists(struct intel_engine_cs *engine,
> 				       &ee->execlist[n]);
> }
>
>+static void request_record_user_bo(struct drm_i915_gem_request *request,
>+				   struct drm_i915_error_engine *ee)
>+{
>+	struct i915_gem_capture_list *c;
>+	struct drm_i915_error_object **bo;
>+	long count;
>+
>+	count = 0;
>+	for (c = request->capture_list; c; c = c->next)
>+		count++;
>+
>+	bo = NULL;
>+	if (count)
>+		bo = kcalloc(count, sizeof(*bo), GFP_ATOMIC);
>+	if (!bo)
>+		return;
>+
>+	count = 0;
>+	for (c = request->capture_list; c; c = c->next) {
>+		bo[count] = i915_error_object_create(request->i915, c->vma);
>+		if (!bo[count])
>+			break;
>+		count++;
>+	}
>+
>+	ee->user_bo = bo;
>+	ee->user_bo_count = count;
>+}
>+
> static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
> 				  struct drm_i915_error_state *error)
> {
>@@ -1313,6 +1350,7 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
> 				ee->wa_batchbuffer =
> 					i915_error_object_create(dev_priv,
> 								 engine->scratch);
>+			request_record_user_bo(request, ee);
>
> 			ee->ctx =
> 				i915_error_object_create(dev_priv,
>diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
>index 57093b4..a16e322 100644
>--- a/include/uapi/drm/i915_drm.h
>+++ b/include/uapi/drm/i915_drm.h
>@@ -397,6 +397,12 @@ typedef struct drm_i915_irq_wait {
> #define I915_PARAM_HAS_SCHEDULER	 41
> #define I915_PARAM_HUC_STATUS		 42
>
>+/* Query whether DRM_I915_GEM_EXECBUFFER2 supports the ability to capture
>+ * user specified buffers for post-mortem debugging of GPU hangs. See
>+ * EXEC_OBJECT_CAPTURE.
>+ */
>+#define I915_PARAM_HAS_EXEC_CAPTURE	 43
>+
> typedef struct drm_i915_getparam {
> 	__s32 param;
> 	/*
>@@ -737,8 +743,9 @@ struct drm_i915_gem_exec_object2 {
> #define EXEC_OBJECT_SUPPORTS_48B_ADDRESS (1<<3)
> #define EXEC_OBJECT_PINNED		 (1<<4)
> #define EXEC_OBJECT_PAD_TO_SIZE		 (1<<5)
>+#define EXEC_OBJECT_CAPTURE		 (1<<6)
> /* All remaining bits are MBZ and RESERVED FOR FUTURE USE */
>-#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_PAD_TO_SIZE<<1)
>+#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_CAPTURE<<1)
> 	__u64 flags;
>
> 	union {
>-- 
>2.7.4
>

-- 
Ben Widawsky, Intel Open Source Technology Center
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 1/1] drm/i915: Copy user requested buffers into the error state
  2017-01-24 19:44 ` [PATCH 1/1] " Ben Widawsky
@ 2017-01-24 21:17   ` Chris Wilson
  2017-01-24 21:32   ` Chris Wilson
  1 sibling, 0 replies; 5+ messages in thread
From: Chris Wilson @ 2017-01-24 21:17 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: intel-gfx

On Tue, Jan 24, 2017 at 11:44:26AM -0800, Ben Widawsky wrote:
> On 17-01-24 13:16:56, Mika Kuoppala wrote:
> >+		for (j = 0; j < ee->user_bo_count; j++)
> >+			print_error_obj(m, dev_priv->engine[i],
> >+					"user", ee->user_bo[j]);
> >+
> 
> We'll need a way to be able to figure out which bo corresponds to what we asked
> to capture. I'm not quite sure what the best way to do this is, but I'd like
> userspace to have to make minimal effort to figure it out.

You know the offset of each bo you request and that matches the output
here. That's how I checked the selected bo was captured in my testcase.

Given that the offset (in this context) will be fixed from submission to
hang and you are accurately tracking the offset of each bo in each
context, that should be enough to reverse map from the error to
userspace struct. Hmm, actually identifying which context hung is not
obvious from the error state at the moment.

I think we should also start sketching ideas to make error state parsing
easier. A more structured output, json like?
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 1/1] drm/i915: Copy user requested buffers into the error state
  2017-01-24 19:44 ` [PATCH 1/1] " Ben Widawsky
  2017-01-24 21:17   ` Chris Wilson
@ 2017-01-24 21:32   ` Chris Wilson
  1 sibling, 0 replies; 5+ messages in thread
From: Chris Wilson @ 2017-01-24 21:32 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: intel-gfx

On Tue, Jan 24, 2017 at 11:44:26AM -0800, Ben Widawsky wrote:
> >+		for (j = 0; j < ee->user_bo_count; j++)
> >+			print_error_obj(m, dev_priv->engine[i],
> >+					"user", ee->user_bo[j]);
> >+
> 
> We'll need a way to be able to figure out which bo corresponds to what we asked
> to capture. I'm not quite sure what the best way to do this is, but I'd like
> userspace to have to make minimal effort to figure it out.

On the alternative offline front, I thought that one of the bos
captured would be an aux buffer that contained such user debug info - it
can even patch the offsets of the other bo into its tables.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2017-01-24 21:32 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-01-24 11:16 [PATCH 1/1] drm/i915: Copy user requested buffers into the error state Mika Kuoppala
2017-01-24 12:54 ` ✓ Fi.CI.BAT: success for series starting with [1/1] " Patchwork
2017-01-24 19:44 ` [PATCH 1/1] " Ben Widawsky
2017-01-24 21:17   ` Chris Wilson
2017-01-24 21:32   ` Chris Wilson

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.