All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/i915: Record pid/comm of hanging task
@ 2014-02-05 16:40 Chris Wilson
  2014-02-05 17:20 ` Jesse Barnes
  2014-02-06 12:57 ` Chris Wilson
  0 siblings, 2 replies; 3+ messages in thread
From: Chris Wilson @ 2014-02-05 16:40 UTC (permalink / raw)
  To: intel-gfx; +Cc: Ben Widawsky

After finding the guilty batch and request, we can use it to find the
process that submitted the batch and then add the culprit into the error
state.

This is a slightly different approach from Ben's in that instead of
adding the extra information into the struct i915_hw_context, we use the
information already captured in struct drm_file which is then referenced
from the request.

Link: http://lists.freedesktop.org/archives/intel-gfx/2013-August/032280.html
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Ben Widawsky <ben@bwidawsk.net>
---
 drivers/gpu/drm/i915/i915_drv.h       |  4 ++
 drivers/gpu/drm/i915/i915_gem.c       |  1 +
 drivers/gpu/drm/i915/i915_gpu_error.c | 69 ++++++++++++++++++++++-------------
 3 files changed, 49 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 1d317f8c9f05..2007062e4a35 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -374,6 +374,9 @@ struct drm_i915_error_state {
 				u32 pp_dir_base;
 			};
 		} vm_info;
+
+		pid_t pid;
+		char comm[TASK_COMM_LEN];
 	} ring[I915_NUM_RINGS];
 
 	struct drm_i915_error_buffer {
@@ -1825,6 +1828,7 @@ struct drm_i915_gem_request {
 
 struct drm_i915_file_private {
 	struct drm_i915_private *dev_priv;
+	struct drm_file *file;
 
 	struct {
 		spinlock_t lock;
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 157877bd7a0b..605e4ab636e8 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4981,6 +4981,7 @@ int i915_gem_open(struct drm_device *dev, struct drm_file *file)
 
 	file->driver_priv = file_priv;
 	file_priv->dev_priv = dev->dev_private;
+	file_priv->file = file;
 
 	spin_lock_init(&file_priv->mm.lock);
 	INIT_LIST_HEAD(&file_priv->mm.request_list);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index ea588c66ffc4..386bf52dd16a 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -309,6 +309,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	struct drm_i915_error_state *error = error_priv->error;
 	int i, j, page, offset, elt;
+	int max_hangcheck_score;
 
 	if (!error) {
 		err_printf(m, "no error state collected\n");
@@ -318,6 +319,20 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 	err_printf(m, "Time: %ld s %ld us\n", error->time.tv_sec,
 		   error->time.tv_usec);
 	err_printf(m, "Kernel: " UTS_RELEASE "\n");
+	max_hangcheck_score = 0;
+	for (i = 0; i < ARRAY_SIZE(error->ring); i++) {
+		if (error->ring[i].hangcheck_score > max_hangcheck_score)
+			max_hangcheck_score = error->ring[i].hangcheck_score;
+	}
+	for (i = 0; i < ARRAY_SIZE(error->ring); i++) {
+		if (error->ring[i].hangcheck_score == max_hangcheck_score &&
+		    error->ring[i].pid != -1) {
+			err_printf(m, "Active process (on ring %s): %s [%d]\n",
+				   ring_str(i),
+				   error->ring[i].comm,
+				   error->ring[i].pid);
+		}
+	}
 	err_printf(m, "PCI ID: 0x%04x\n", dev->pdev->device);
 	err_printf(m, "EIR: 0x%08x\n", error->eir);
 	err_printf(m, "IER: 0x%08x\n", error->ier);
@@ -363,8 +378,11 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 		struct drm_i915_error_object *obj;
 
 		if ((obj = error->ring[i].batchbuffer)) {
-			err_printf(m, "%s --- gtt_offset = 0x%08x\n",
-				   dev_priv->ring[i].name,
+			err_puts(m, dev_priv->ring[i].name);
+			if (error->ring[i].pid != -1)
+				err_printf(m, " (submitted by %s [%d])",
+					   error->ring[i].comm, error->ring[i].pid);
+			err_printf(m, " --- gtt_offset = 0x%08x\n",
 				   obj->gtt_offset);
 			offset = 0;
 			for (page = 0; page < obj->page_count; page++) {
@@ -698,9 +716,9 @@ static void i915_gem_record_fences(struct drm_device *dev,
 	}
 }
 
-static struct drm_i915_error_object *
-i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
-			     struct intel_ring_buffer *ring)
+static struct drm_i915_gem_request *
+i915_error_first_request(struct drm_i915_private *dev_priv,
+			 struct intel_ring_buffer *ring)
 {
 	struct drm_i915_gem_request *request;
 	u32 seqno;
@@ -708,29 +726,12 @@ i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
 	if (!ring->get_seqno)
 		return NULL;
 
-	if (HAS_BROKEN_CS_TLB(dev_priv->dev)) {
-		struct drm_i915_gem_object *obj;
-		u32 acthd = I915_READ(ACTHD);
-
-		if (WARN_ON(ring->id != RCS))
-			return NULL;
-
-		obj = ring->scratch.obj;
-		if (obj != NULL &&
-		    acthd >= i915_gem_obj_ggtt_offset(obj) &&
-		    acthd < i915_gem_obj_ggtt_offset(obj) + obj->base.size)
-			return i915_error_ggtt_object_create(dev_priv, obj);
-	}
-
 	seqno = ring->get_seqno(ring, false);
 	list_for_each_entry(request, &ring->request_list, list) {
 		if (i915_seqno_passed(seqno, request->seqno))
 			continue;
 
-		/* We need to copy these to an anonymous buffer as the simplest
-		 * method to avoid being overwritten by userspace.
-		 */
-		return i915_error_object_create(dev_priv, request->batch_obj, request->ctx->vm);
+		return request;
 	}
 
 	return NULL;
@@ -884,8 +885,26 @@ static void i915_gem_record_rings(struct drm_device *dev,
 
 		i915_record_ring_state(dev, ring, &error->ring[i]);
 
-		error->ring[i].batchbuffer =
-			i915_error_first_batchbuffer(dev_priv, ring);
+		error->ring[i].pid = -1;
+		request = i915_error_first_request(dev_priv, ring);
+		if (request) {
+			/* We need to copy these to an anonymous buffer as the simplest
+			 * method to avoid being overwritten by userspace.
+			 */
+			error->ring[i].batchbuffer =
+				i915_error_object_create(dev_priv, request->batch_obj, request->ctx->vm);
+			if (request->file_priv) {
+				struct task_struct *task;
+
+				rcu_read_lock();
+				task = pid_task(request->file_priv->file->pid, PIDTYPE_PID);
+				if (task) {
+					strcpy(error->ring[i].comm, task->comm);
+					error->ring[i].pid = task->pid;
+				}
+				rcu_read_unlock();
+			}
+		}
 
 		error->ring[i].ringbuffer =
 			i915_error_ggtt_object_create(dev_priv, ring->obj);
-- 
1.9.rc1

^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH] drm/i915: Record pid/comm of hanging task
  2014-02-05 16:40 [PATCH] drm/i915: Record pid/comm of hanging task Chris Wilson
@ 2014-02-05 17:20 ` Jesse Barnes
  2014-02-06 12:57 ` Chris Wilson
  1 sibling, 0 replies; 3+ messages in thread
From: Jesse Barnes @ 2014-02-05 17:20 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx, Ben Widawsky

On Wed,  5 Feb 2014 16:40:41 +0000
Chris Wilson <chris@chris-wilson.co.uk> wrote:

> After finding the guilty batch and request, we can use it to find the
> process that submitted the batch and then add the culprit into the error
> state.
> 
> This is a slightly different approach from Ben's in that instead of
> adding the extra information into the struct i915_hw_context, we use the
> information already captured in struct drm_file which is then referenced
> from the request.
> 
> Link: http://lists.freedesktop.org/archives/intel-gfx/2013-August/032280.html
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Ben Widawsky <ben@bwidawsk.net>

Would be nice to have this in dmesg, so I can tell if it's my bitcoin
miner or a chrome tab that's killing things.

Thanks,
Jesse

^ permalink raw reply	[flat|nested] 3+ messages in thread

* [PATCH] drm/i915: Record pid/comm of hanging task
  2014-02-05 16:40 [PATCH] drm/i915: Record pid/comm of hanging task Chris Wilson
  2014-02-05 17:20 ` Jesse Barnes
@ 2014-02-06 12:57 ` Chris Wilson
  1 sibling, 0 replies; 3+ messages in thread
From: Chris Wilson @ 2014-02-06 12:57 UTC (permalink / raw)
  To: intel-gfx; +Cc: Ben Widawsky

After finding the guilty batch and request, we can use it to find the
process that submitted the batch and then add the culprit into the error
state.

This is a slightly different approach from Ben's in that instead of
adding the extra information into the struct i915_hw_context, we use the
information already captured in struct drm_file which is then referenced
from the request.

v2: Also capture the workaround buffer for gen2, so that we can compare
    its contents against the intended batch for the active request.

Link: http://lists.freedesktop.org/archives/intel-gfx/2013-August/032280.html
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Ben Widawsky <ben@bwidawsk.net>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---

Jesse would like similar information included with the error message, as
Mika is working on improving that error message, I delegate that task to
him. ;-)
-Chris

---
 drivers/gpu/drm/i915/i915_drv.h       |   6 +-
 drivers/gpu/drm/i915/i915_gem.c       |   1 +
 drivers/gpu/drm/i915/i915_gpu_error.c | 115 +++++++++++++++++++++-------------
 3 files changed, 78 insertions(+), 44 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 1d317f8c9f05..159f94637f3a 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -359,7 +359,7 @@ struct drm_i915_error_state {
 			int page_count;
 			u32 gtt_offset;
 			u32 *pages[0];
-		} *ringbuffer, *batchbuffer, *ctx, *hws_page;
+		} *ringbuffer, *batchbuffer, *wa_batchbuffer, *ctx, *hws_page;
 
 		struct drm_i915_error_request {
 			long jiffies;
@@ -374,6 +374,9 @@ struct drm_i915_error_state {
 				u32 pp_dir_base;
 			};
 		} vm_info;
+
+		pid_t pid;
+		char comm[TASK_COMM_LEN];
 	} ring[I915_NUM_RINGS];
 
 	struct drm_i915_error_buffer {
@@ -1825,6 +1828,7 @@ struct drm_i915_gem_request {
 
 struct drm_i915_file_private {
 	struct drm_i915_private *dev_priv;
+	struct drm_file *file;
 
 	struct {
 		spinlock_t lock;
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 157877bd7a0b..605e4ab636e8 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4981,6 +4981,7 @@ int i915_gem_open(struct drm_device *dev, struct drm_file *file)
 
 	file->driver_priv = file_priv;
 	file_priv->dev_priv = dev->dev_private;
+	file_priv->file = file;
 
 	spin_lock_init(&file_priv->mm.lock);
 	INIT_LIST_HEAD(&file_priv->mm.request_list);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 0ff158939f1d..c1da48b5189d 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -302,13 +302,28 @@ void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
 	va_end(args);
 }
 
+static void print_error_obj(struct drm_i915_error_state_buf *m,
+			    struct drm_i915_error_object *obj)
+{
+	int page, offset, elt;
+
+	for (page = offset = 0; page < obj->page_count; page++) {
+		for (elt = 0; elt < PAGE_SIZE/4; elt++) {
+			err_printf(m, "%08x :  %08x\n", offset,
+				   obj->pages[page][elt]);
+			offset += 4;
+		}
+	}
+}
+
 int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 			    const struct i915_error_state_file_priv *error_priv)
 {
 	struct drm_device *dev = error_priv->dev;
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	struct drm_i915_error_state *error = error_priv->error;
-	int i, j, page, offset, elt;
+	int i, j, offset, elt;
+	int max_hangcheck_score;
 
 	if (!error) {
 		err_printf(m, "no error state collected\n");
@@ -318,6 +333,20 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 	err_printf(m, "Time: %ld s %ld us\n", error->time.tv_sec,
 		   error->time.tv_usec);
 	err_printf(m, "Kernel: " UTS_RELEASE "\n");
+	max_hangcheck_score = 0;
+	for (i = 0; i < ARRAY_SIZE(error->ring); i++) {
+		if (error->ring[i].hangcheck_score > max_hangcheck_score)
+			max_hangcheck_score = error->ring[i].hangcheck_score;
+	}
+	for (i = 0; i < ARRAY_SIZE(error->ring); i++) {
+		if (error->ring[i].hangcheck_score == max_hangcheck_score &&
+		    error->ring[i].pid != -1) {
+			err_printf(m, "Active process (on ring %s): %s [%d]\n",
+				   ring_str(i),
+				   error->ring[i].comm,
+				   error->ring[i].pid);
+		}
+	}
 	err_printf(m, "PCI ID: 0x%04x\n", dev->pdev->device);
 	err_printf(m, "EIR: 0x%08x\n", error->eir);
 	err_printf(m, "IER: 0x%08x\n", error->ier);
@@ -363,17 +392,19 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 		struct drm_i915_error_object *obj;
 
 		if ((obj = error->ring[i].batchbuffer)) {
-			err_printf(m, "%s --- gtt_offset = 0x%08x\n",
-				   dev_priv->ring[i].name,
+			err_puts(m, dev_priv->ring[i].name);
+			if (error->ring[i].pid != -1)
+				err_printf(m, " (submitted by %s [%d])",
+					   error->ring[i].comm, error->ring[i].pid);
+			err_printf(m, " --- gtt_offset = 0x%08x\n",
 				   obj->gtt_offset);
-			offset = 0;
-			for (page = 0; page < obj->page_count; page++) {
-				for (elt = 0; elt < PAGE_SIZE/4; elt++) {
-					err_printf(m, "%08x :  %08x\n", offset,
-						   obj->pages[page][elt]);
-					offset += 4;
-				}
-			}
+			print_error_obj(m, obj);
+		}
+
+		if ((obj = error->ring[i].wa_batchbuffer)) {
+			err_printf(m, "%s (w/a) --- gtt_offset = 0x%08x\n",
+				   dev_priv->ring[i].name, obj->gtt_offset);
+			print_error_obj(m, obj);
 		}
 
 		if (error->ring[i].num_requests) {
@@ -392,15 +423,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 			err_printf(m, "%s --- ringbuffer = 0x%08x\n",
 				   dev_priv->ring[i].name,
 				   obj->gtt_offset);
-			offset = 0;
-			for (page = 0; page < obj->page_count; page++) {
-				for (elt = 0; elt < PAGE_SIZE/4; elt++) {
-					err_printf(m, "%08x :  %08x\n",
-						   offset,
-						   obj->pages[page][elt]);
-					offset += 4;
-				}
-			}
+			print_error_obj(m, obj);
 		}
 
 		if ((obj = error->ring[i].hws_page)) {
@@ -725,9 +748,9 @@ static void i915_gem_record_fences(struct drm_device *dev,
 	}
 }
 
-static struct drm_i915_error_object *
-i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
-			     struct intel_ring_buffer *ring)
+static struct drm_i915_gem_request *
+i915_error_first_request(struct drm_i915_private *dev_priv,
+			 struct intel_ring_buffer *ring)
 {
 	struct drm_i915_gem_request *request;
 	u32 seqno;
@@ -735,29 +758,12 @@ i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
 	if (!ring->get_seqno)
 		return NULL;
 
-	if (HAS_BROKEN_CS_TLB(dev_priv->dev)) {
-		struct drm_i915_gem_object *obj;
-		u32 acthd = I915_READ(ACTHD);
-
-		if (WARN_ON(ring->id != RCS))
-			return NULL;
-
-		obj = ring->scratch.obj;
-		if (obj != NULL &&
-		    acthd >= i915_gem_obj_ggtt_offset(obj) &&
-		    acthd < i915_gem_obj_ggtt_offset(obj) + obj->base.size)
-			return i915_error_ggtt_object_create(dev_priv, obj);
-	}
-
 	seqno = ring->get_seqno(ring, false);
 	list_for_each_entry(request, &ring->request_list, list) {
 		if (i915_seqno_passed(seqno, request->seqno))
 			continue;
 
-		/* We need to copy these to an anonymous buffer as the simplest
-		 * method to avoid being overwritten by userspace.
-		 */
-		return i915_error_object_create(dev_priv, request->batch_obj, request->ctx->vm);
+		return request;
 	}
 
 	return NULL;
@@ -911,8 +917,31 @@ static void i915_gem_record_rings(struct drm_device *dev,
 
 		i915_record_ring_state(dev, ring, &error->ring[i]);
 
-		error->ring[i].batchbuffer =
-			i915_error_first_batchbuffer(dev_priv, ring);
+		error->ring[i].pid = -1;
+		request = i915_error_first_request(dev_priv, ring);
+		if (request) {
+			/* We need to copy these to an anonymous buffer as the simplest
+			 * method to avoid being overwritten by userspace.
+			 */
+			error->ring[i].batchbuffer =
+				i915_error_object_create(dev_priv, request->batch_obj, request->ctx->vm);
+
+			if (HAS_BROKEN_CS_TLB(dev_priv->dev) && ring->scratch.obj)
+				error->ring[i].wa_batchbuffer =
+					i915_error_ggtt_object_create(dev_priv, ring->scratch.obj);
+
+			if (request->file_priv) {
+				struct task_struct *task;
+
+				rcu_read_lock();
+				task = pid_task(request->file_priv->file->pid, PIDTYPE_PID);
+				if (task) {
+					strcpy(error->ring[i].comm, task->comm);
+					error->ring[i].pid = task->pid;
+				}
+				rcu_read_unlock();
+			}
+		}
 
 		error->ring[i].ringbuffer =
 			i915_error_ggtt_object_create(dev_priv, ring->obj);
-- 
1.9.rc1

^ permalink raw reply related	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2014-02-06 12:57 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-02-05 16:40 [PATCH] drm/i915: Record pid/comm of hanging task Chris Wilson
2014-02-05 17:20 ` Jesse Barnes
2014-02-06 12:57 ` Chris Wilson

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.