All of lore.kernel.org
 help / color / mirror / Atom feed
From: Chris Wilson <chris@chris-wilson.co.uk>
To: intel-gfx@lists.freedesktop.org
Subject: [PATCH 14/17] drm/i915: Only include active engines in the capture state
Date: Tue, 30 Jul 2019 14:30:32 +0100	[thread overview]
Message-ID: <20190730133035.1977-15-chris@chris-wilson.co.uk> (raw)
In-Reply-To: <20190730133035.1977-1-chris@chris-wilson.co.uk>

Skip printing out idle engines that did not contribute to the GPU hang.
As the number of engines gets ever larger, we have increasing noise in
the error state where typically there is only one guilty request on one
engine that we need to inspect.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gpu_error.c | 238 +++++++++++---------------
 drivers/gpu/drm/i915/i915_gpu_error.h |   7 +-
 2 files changed, 104 insertions(+), 141 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 0c0f255000c2..5d8e9287f9ca 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -49,27 +49,6 @@
 #define ALLOW_FAIL (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN)
 #define ATOMIC_MAYFAIL (GFP_ATOMIC | __GFP_NOWARN)
 
-static inline const struct intel_engine_cs *
-engine_lookup(const struct drm_i915_private *i915, unsigned int id)
-{
-	if (id >= I915_NUM_ENGINES)
-		return NULL;
-
-	return i915->engine[id];
-}
-
-static inline const char *
-__engine_name(const struct intel_engine_cs *engine)
-{
-	return engine ? engine->name : "";
-}
-
-static const char *
-engine_name(const struct drm_i915_private *i915, unsigned int id)
-{
-	return __engine_name(engine_lookup(i915, id));
-}
-
 static void __sg_set_buf(struct scatterlist *sg,
 			 void *addr, unsigned int len, loff_t it)
 {
@@ -447,7 +426,7 @@ static void error_print_instdone(struct drm_i915_error_state_buf *m,
 	err_printf(m, "  INSTDONE: 0x%08x\n",
 		   ee->instdone.instdone);
 
-	if (ee->engine_id != RCS0 || INTEL_GEN(m->i915) <= 3)
+	if (ee->engine->class != RENDER_CLASS || INTEL_GEN(m->i915) <= 3)
 		return;
 
 	err_printf(m, "  SC_INSTDONE: 0x%08x\n",
@@ -501,8 +480,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
 {
 	int n;
 
-	err_printf(m, "%s command stream:\n",
-		   engine_name(m->i915, ee->engine_id));
+	err_printf(m, "%s command stream:\n", ee->engine->name);
 	err_printf(m, "  IDLE?: %s\n", yesno(ee->idle));
 	err_printf(m, "  START: 0x%08x\n", ee->start);
 	err_printf(m, "  HEAD:  0x%08x [0x%08x]\n", ee->head, ee->rq_head);
@@ -578,9 +556,9 @@ void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
 }
 
 static void print_error_obj(struct drm_i915_error_state_buf *m,
-			    struct intel_engine_cs *engine,
+			    const struct intel_engine_cs *engine,
 			    const char *name,
-			    struct drm_i915_error_object *obj)
+			    const struct drm_i915_error_object *obj)
 {
 	char out[ASCII85_BUFSZ];
 	int page;
@@ -677,7 +655,7 @@ static void err_free_sgl(struct scatterlist *sgl)
 static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
 			       struct i915_gpu_state *error)
 {
-	struct drm_i915_error_object *obj;
+	const struct drm_i915_error_engine *ee;
 	struct timespec64 ts;
 	int i, j;
 
@@ -701,15 +679,12 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
 		   jiffies_to_msecs(jiffies - error->capture),
 		   jiffies_to_msecs(error->capture - error->epoch));
 
-	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
-		if (!error->engine[i].context.pid)
-			continue;
-
+	for (ee = error->engine; ee; ee = ee->next)
 		err_printf(m, "Active process (on ring %s): %s [%d]\n",
-			   engine_name(m->i915, i),
-			   error->engine[i].context.comm,
-			   error->engine[i].context.pid);
-	}
+			   ee->engine->name,
+			   ee->context.comm,
+			   ee->context.pid);
+
 	err_printf(m, "Reset count: %u\n", error->reset_count);
 	err_printf(m, "Suspend count: %u\n", error->suspend_count);
 	err_printf(m, "Platform: %s\n", intel_platform_name(error->device_info.platform));
@@ -758,17 +733,15 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
 	if (IS_GEN(m->i915, 7))
 		err_printf(m, "ERR_INT: 0x%08x\n", error->err_int);
 
-	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
-		if (error->engine[i].engine_id != -1)
-			error_print_engine(m, &error->engine[i], error->epoch);
-	}
+	for (ee = error->engine; ee; ee = ee->next)
+		error_print_engine(m, ee, error->epoch);
 
-	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
-		const struct drm_i915_error_engine *ee = &error->engine[i];
+	for (ee = error->engine; ee; ee = ee->next) {
+		const struct drm_i915_error_object *obj;
 
 		obj = ee->batchbuffer;
 		if (obj) {
-			err_puts(m, m->i915->engine[i]->name);
+			err_puts(m, ee->engine->name);
 			if (ee->context.pid)
 				err_printf(m, " (submitted by %s [%d])",
 					   ee->context.comm,
@@ -776,16 +749,15 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
 			err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
 				   upper_32_bits(obj->gtt_offset),
 				   lower_32_bits(obj->gtt_offset));
-			print_error_obj(m, m->i915->engine[i], NULL, obj);
+			print_error_obj(m, ee->engine, NULL, obj);
 		}
 
 		for (j = 0; j < ee->user_bo_count; j++)
-			print_error_obj(m, m->i915->engine[i],
-					"user", ee->user_bo[j]);
+			print_error_obj(m, ee->engine, "user", ee->user_bo[j]);
 
 		if (ee->num_requests) {
 			err_printf(m, "%s --- %d requests\n",
-				   m->i915->engine[i]->name,
+				   ee->engine->name,
 				   ee->num_requests);
 			for (j = 0; j < ee->num_requests; j++)
 				error_print_request(m, " ",
@@ -793,22 +765,13 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
 						    error->epoch);
 		}
 
-		print_error_obj(m, m->i915->engine[i],
-				"ringbuffer", ee->ringbuffer);
-
-		print_error_obj(m, m->i915->engine[i],
-				"HW Status", ee->hws_page);
-
-		print_error_obj(m, m->i915->engine[i],
-				"HW context", ee->ctx);
-
-		print_error_obj(m, m->i915->engine[i],
-				"WA context", ee->wa_ctx);
-
-		print_error_obj(m, m->i915->engine[i],
+		print_error_obj(m, ee->engine, "ringbuffer", ee->ringbuffer);
+		print_error_obj(m, ee->engine, "HW Status", ee->hws_page);
+		print_error_obj(m, ee->engine, "HW context", ee->ctx);
+		print_error_obj(m, ee->engine, "WA context", ee->wa_ctx);
+		print_error_obj(m, ee->engine,
 				"WA batchbuffer", ee->wa_batchbuffer);
-
-		print_error_obj(m, m->i915->engine[i],
+		print_error_obj(m, ee->engine,
 				"NULL context", ee->default_state);
 	}
 
@@ -957,13 +920,15 @@ void __i915_gpu_state_free(struct kref *error_ref)
 {
 	struct i915_gpu_state *error =
 		container_of(error_ref, typeof(*error), ref);
-	long i, j;
+	long i;
 
-	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
-		struct drm_i915_error_engine *ee = &error->engine[i];
+	while (error->engine) {
+		struct drm_i915_error_engine *ee = error->engine;
 
-		for (j = 0; j < ee->user_bo_count; j++)
-			i915_error_object_free(ee->user_bo[j]);
+		error->engine = ee->next;
+
+		for (i = 0; i < ee->user_bo_count; i++)
+			i915_error_object_free(ee->user_bo[i]);
 		kfree(ee->user_bo);
 
 		i915_error_object_free(ee->batchbuffer);
@@ -974,6 +939,7 @@ void __i915_gpu_state_free(struct kref *error_ref)
 		i915_error_object_free(ee->wa_ctx);
 
 		kfree(ee->requests);
+		kfree(ee);
 	}
 
 	kfree(error->overlay);
@@ -1055,23 +1021,17 @@ i915_error_object_create(struct drm_i915_private *i915,
  *
  * It's only a small step better than a random number in its current form.
  */
-static u32 i915_error_generate_code(struct i915_gpu_state *error,
-				    intel_engine_mask_t engine_mask)
+static u32 i915_error_generate_code(struct i915_gpu_state *error)
 {
+	const struct drm_i915_error_engine *ee = error->engine;
+
 	/*
 	 * IPEHR would be an ideal way to detect errors, as it's the gross
 	 * measure of "the command that hung." However, has some very common
 	 * synchronization commands which almost always appear in the case
 	 * strictly a client bug. Use instdone to differentiate those some.
 	 */
-	if (engine_mask) {
-		struct drm_i915_error_engine *ee =
-			&error->engine[ffs(engine_mask)];
-
-		return ee->ipehr ^ ee->instdone.instdone;
-	}
-
-	return 0;
+	return ee ? ee->ipehr ^ ee->instdone.instdone : 0;
 }
 
 static void gem_record_fences(struct i915_gpu_state *error)
@@ -1282,9 +1242,11 @@ static void error_record_engine_execlists(const struct intel_engine_cs *engine,
 	ee->num_ports = n;
 }
 
-static void record_context(struct drm_i915_error_context *e,
-			   struct i915_gem_context *ctx)
+static bool record_context(struct drm_i915_error_context *e,
+			   const struct i915_request *rq)
 {
+	const struct i915_gem_context *ctx = rq->gem_context;
+
 	if (ctx->pid) {
 		struct task_struct *task;
 
@@ -1301,6 +1263,8 @@ static void record_context(struct drm_i915_error_context *e,
 	e->sched_attr = ctx->sched;
 	e->guilty = atomic_read(&ctx->guilty_count);
 	e->active = atomic_read(&ctx->active_count);
+
+	return i915_gem_context_no_error_capture(ctx);
 }
 
 struct capture_vma {
@@ -1395,74 +1359,67 @@ static void
 gem_record_rings(struct i915_gpu_state *error, struct compress *compress)
 {
 	struct drm_i915_private *i915 = error->i915;
-	int i;
+	struct intel_engine_cs *engine;
+	struct drm_i915_error_engine *ee;
+
+	ee = kzalloc(sizeof(*ee), GFP_KERNEL);
+	if (!ee)
+		return;
 
-	for (i = 0; i < I915_NUM_ENGINES; i++) {
-		struct intel_engine_cs *engine = i915->engine[i];
-		struct drm_i915_error_engine *ee = &error->engine[i];
+	for_each_user_engine(engine, i915) {
 		struct capture_vma *capture = NULL;
 		struct i915_request *request;
 		unsigned long flags;
 
-		ee->engine_id = -1;
-
-		if (!engine)
-			continue;
-
-		ee->engine_id = i;
-
 		/* Refill our page pool before entering atomic section */
 		pool_refill(&compress->pool, ALLOW_FAIL);
 
-		error_record_engine_registers(error, engine, ee);
-		error_record_engine_execlists(engine, ee);
-
 		spin_lock_irqsave(&engine->active.lock, flags);
 		request = intel_engine_find_active_request(engine);
-		if (request) {
-			struct i915_gem_context *ctx = request->gem_context;
-			struct intel_ring *ring = request->ring;
-
-			record_context(&ee->context, ctx);
-
-			/*
-			 * We need to copy these to an anonymous buffer
-			 * as the simplest method to avoid being overwritten
-			 * by userspace.
-			 */
-			capture = capture_vma(capture,
-					      request->batch,
-					      &ee->batchbuffer);
+		if (!request) {
+			spin_unlock_irqrestore(&engine->active.lock, flags);
+			continue;
+		}
 
-			if (HAS_BROKEN_CS_TLB(i915))
-				capture = capture_vma(capture,
-						      engine->gt->scratch,
-						      &ee->wa_batchbuffer);
+		error->simulated |= record_context(&ee->context, request);
 
-			capture = request_record_user_bo(request, ee, capture);
+		/*
+		 * We need to copy these to an anonymous buffer
+		 * as the simplest method to avoid being overwritten
+		 * by userspace.
+		 */
+		capture = capture_vma(capture,
+				      request->batch,
+				      &ee->batchbuffer);
 
+		if (HAS_BROKEN_CS_TLB(i915))
 			capture = capture_vma(capture,
-					      request->hw_context->state,
-					      &ee->ctx);
+					      engine->gt->scratch,
+					      &ee->wa_batchbuffer);
 
-			capture = capture_vma(capture,
-					      ring->vma,
-					      &ee->ringbuffer);
+		capture = request_record_user_bo(request, ee, capture);
 
-			error->simulated |=
-				i915_gem_context_no_error_capture(ctx);
+		capture = capture_vma(capture,
+				      request->hw_context->state,
+				      &ee->ctx);
 
-			ee->rq_head = request->head;
-			ee->rq_post = request->postfix;
-			ee->rq_tail = request->tail;
+		capture = capture_vma(capture,
+				      request->ring->vma,
+				      &ee->ringbuffer);
 
-			ee->cpu_ring_head = ring->head;
-			ee->cpu_ring_tail = ring->tail;
+		ee->cpu_ring_head = request->ring->head;
+		ee->cpu_ring_tail = request->ring->tail;
 
-			engine_record_requests(engine, request, ee);
-		}
+		ee->rq_head = request->head;
+		ee->rq_post = request->postfix;
+		ee->rq_tail = request->tail;
+
+		engine_record_requests(engine, request, ee);
 		spin_unlock_irqrestore(&engine->active.lock, flags);
 
+		error_record_engine_registers(error, engine, ee);
+		error_record_engine_execlists(engine, ee);
+
 		while (capture) {
 			struct capture_vma *this = capture;
 			struct i915_vma *vma = *this->slot;
@@ -1489,7 +1446,18 @@ gem_record_rings(struct i915_gpu_state *error, struct compress *compress)
 
 		ee->default_state =
 			capture_object(i915, engine->default_state, compress);
+
+		ee->engine = engine;
+
+		ee->next = error->engine;
+		error->engine = ee;
+
+		ee = kzalloc(sizeof(*ee), GFP_KERNEL);
+		if (!ee)
+			return;
 	}
+
+	kfree(ee);
 }
 
 static void
@@ -1618,24 +1586,18 @@ error_msg(struct i915_gpu_state *error,
 	  intel_engine_mask_t engines, const char *msg)
 {
 	int len;
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(error->engine); i++)
-		if (!error->engine[i].context.pid)
-			engines &= ~BIT(i);
 
 	len = scnprintf(error->error_msg, sizeof(error->error_msg),
 			"GPU HANG: ecode %d:%x:0x%08x",
 			INTEL_GEN(error->i915), engines,
-			i915_error_generate_code(error, engines));
-	if (engines) {
+			i915_error_generate_code(error));
+	if (error->engine) {
 		/* Just show the first executing process, more is confusing */
-		i = __ffs(engines);
 		len += scnprintf(error->error_msg + len,
 				 sizeof(error->error_msg) - len,
 				 ", in %s [%d]",
-				 error->engine[i].context.comm,
-				 error->engine[i].context.pid);
+				 error->engine->context.comm,
+				 error->engine->context.pid);
 	}
 	if (msg)
 		len += scnprintf(error->error_msg + len,
@@ -1676,12 +1638,10 @@ static void capture_params(struct i915_gpu_state *error)
 
 static unsigned long capture_find_epoch(const struct i915_gpu_state *error)
 {
+	const struct drm_i915_error_engine *ee;
 	unsigned long epoch = error->capture;
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
-		const struct drm_i915_error_engine *ee = &error->engine[i];
 
+	for (ee = error->engine; ee; ee = ee->next) {
 		if (ee->hangcheck_timestamp &&
 		    time_before(ee->hangcheck_timestamp, epoch))
 			epoch = ee->hangcheck_timestamp;
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
index a24c35107d16..df9f57766626 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -81,7 +81,8 @@ struct i915_gpu_state {
 	struct intel_display_error_state *display;
 
 	struct drm_i915_error_engine {
-		int engine_id;
+		const struct intel_engine_cs *engine;
+
 		/* Software tracked state */
 		bool idle;
 		unsigned long hangcheck_timestamp;
@@ -158,7 +159,9 @@ struct i915_gpu_state {
 				u32 pp_dir_base;
 			};
 		} vm_info;
-	} engine[I915_NUM_ENGINES];
+
+		struct drm_i915_error_engine *next;
+	} *engine;
 
 	struct scatterlist *sgl, *fit;
 };
-- 
2.22.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

  parent reply	other threads:[~2019-07-30 13:30 UTC|newest]

Thread overview: 39+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-07-30 13:30 Quick and dirty intel_gt_pm.c rebase Chris Wilson
2019-07-30 13:30 ` [PATCH 01/17] drm/i915/execlists: Always clear pending&inflight requests on reset Chris Wilson
2019-08-01  8:08   ` Andi Shyti
2019-08-01  8:13     ` Chris Wilson
2019-07-30 13:30 ` [PATCH 02/17] drm/i915: Allow sharing the idle-barrier from other kernel requests Chris Wilson
2019-07-30 13:30 ` [PATCH 03/17] drm/i915: Flush extra hard after writing relocations through the GTT Chris Wilson
2019-07-30 13:30 ` [PATCH 04/17] drm/i915: Use drm_i915_private directly from drv_get_drvdata() Chris Wilson
2019-08-05 17:05   ` Andi Shyti
2019-08-05 18:01     ` Chris Wilson
2019-07-30 13:30 ` [PATCH 05/17] drm/i915/gem: Make caps.scheduler static Chris Wilson
2019-08-05 17:08   ` Andi Shyti
2019-08-05 18:07     ` Chris Wilson
2019-07-30 13:30 ` [PATCH 06/17] drm/i915: Move aliasing_ppgtt underneath its i915_ggtt Chris Wilson
2019-07-30 13:58   ` Tvrtko Ursulin
2019-07-30 14:12     ` Chris Wilson
2019-07-30 13:30 ` [PATCH 07/17] drm/i915/gt: Provide a local intel_context.vm Chris Wilson
2019-07-30 13:30 ` [PATCH 08/17] drm/i915: Remove lrc default desc from GEM context Chris Wilson
2019-07-30 22:57   ` Kumar Valsan, Prathap
2019-07-30 13:30 ` [PATCH 09/17] drm/i915: Push the ring creation flags to the backend Chris Wilson
2019-08-05 17:08   ` Andi Shyti
2019-09-02 13:59     ` Tvrtko Ursulin
2019-09-06 18:18       ` Chris Wilson
2019-09-02 14:17   ` Tvrtko Ursulin
2019-07-30 13:30 ` [PATCH 10/17] drm/i915: Hide unshrinkable context objects from the shrinker Chris Wilson
2019-08-02 16:01   ` Matthew Auld
2019-07-30 13:30 ` [PATCH 11/17] drm/i915/gt: Move the [class][inst] lookup for engines onto the GT Chris Wilson
2019-07-30 13:30 ` [PATCH 12/17] drm/i915: Use intel_engine_lookup_user for probing HAS_BSD etc Chris Wilson
2019-08-05 17:08   ` Andi Shyti
2019-07-30 13:30 ` [PATCH 13/17] drm/i915: Isolate i915_getparam_ioctl() Chris Wilson
2019-08-05 17:09   ` Andi Shyti
2019-07-30 13:30 ` Chris Wilson [this message]
2019-07-30 13:30 ` [PATCH 15/17] drm/i915: Flush the freed object list on file close Chris Wilson
2019-08-02 17:00   ` Matthew Auld
2019-08-02 19:46     ` Chris Wilson
2019-07-30 13:30 ` [PATCH 16/17] drm/i915: Make debugfs/per_file_stats scale better Chris Wilson
2019-07-30 13:30 ` [PATCH 17/17] drm/i915/gt: Extract GT runtime power management from intel_pm.c Chris Wilson
2019-07-30 14:00 ` ✗ Fi.CI.CHECKPATCH: warning for series starting with [01/17] drm/i915/execlists: Always clear pending&inflight requests on reset Patchwork
2019-07-30 14:09 ` ✗ Fi.CI.SPARSE: " Patchwork
2019-07-30 14:38 ` ✗ Fi.CI.BAT: failure " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190730133035.1977-15-chris@chris-wilson.co.uk \
    --to=chris@chris-wilson.co.uk \
    --cc=intel-gfx@lists.freedesktop.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.