All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/i915: Report if an unbannable context is involved in a GPU hang
@ 2018-02-05  9:41 Chris Wilson
  2018-02-05 10:01 ` Mika Kuoppala
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: Chris Wilson @ 2018-02-05  9:41 UTC (permalink / raw)
  To: intel-gfx; +Cc: Mika

Since unbannable contexts are special and supposed not to be causing GPU
hangs in the first place, make it clear when they are implicated in said
hang. In practice, most unbannable contexts are those created by igt
for the express purpose of throwing untold thousands of hangs at the GPU
and wish to keep doing so to finish the test. Normally they are cleaned
up, but it's when they or the other unbannable kernel contexts stay
stuck in an erroneous state that we need to worry and so need
highlighting.

Suggested-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com
---
 drivers/gpu/drm/i915/i915_drv.h       |  1 +
 drivers/gpu/drm/i915/i915_gpu_error.c | 21 +++++++++++++++------
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 4e158aab36d6..d6b5ac2a563d 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -555,6 +555,7 @@ struct i915_gpu_state {
 			int ban_score;
 			int active;
 			int guilty;
+			bool bannable;
 		} context;
 
 		struct drm_i915_error_object {
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index a81351d9e3a6..67c902412193 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -396,6 +396,11 @@ static void error_print_instdone(struct drm_i915_error_state_buf *m,
 			   ee->instdone.row[slice][subslice]);
 }
 
+static const char *bannable(const struct drm_i915_error_context *ctx)
+{
+	return ctx->bannable ? "" : " (unbannable)";
+}
+
 static void error_print_request(struct drm_i915_error_state_buf *m,
 				const char *prefix,
 				const struct drm_i915_error_request *erq)
@@ -414,9 +419,10 @@ static void error_print_context(struct drm_i915_error_state_buf *m,
 				const char *header,
 				const struct drm_i915_error_context *ctx)
 {
-	err_printf(m, "%s%s[%d] user_handle %d hw_id %d, prio %d, ban score %d guilty %d active %d\n",
+	err_printf(m, "%s%s[%d] user_handle %d hw_id %d, prio %d, ban score %d%s guilty %d active %d\n",
 		   header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id,
-		   ctx->priority, ctx->ban_score, ctx->guilty, ctx->active);
+		   ctx->priority, ctx->ban_score, bannable(ctx),
+		   ctx->guilty, ctx->active);
 }
 
 static void error_print_engine(struct drm_i915_error_state_buf *m,
@@ -644,11 +650,12 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
 		if (error->engine[i].hangcheck_stalled &&
 		    error->engine[i].context.pid) {
-			err_printf(m, "Active process (on ring %s): %s [%d], score %d\n",
+			err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n",
 				   engine_name(m->i915, i),
 				   error->engine[i].context.comm,
 				   error->engine[i].context.pid,
-				   error->engine[i].context.ban_score);
+				   error->engine[i].context.ban_score,
+				   bannable(&error->engine[i].context));
 		}
 	}
 	err_printf(m, "Reset count: %u\n", error->reset_count);
@@ -736,12 +743,13 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 		if (obj) {
 			err_puts(m, dev_priv->engine[i]->name);
 			if (ee->context.pid)
-				err_printf(m, " (submitted by %s [%d], ctx %d [%d], score %d)",
+				err_printf(m, " (submitted by %s [%d], ctx %d [%d], score %d%s)",
 					   ee->context.comm,
 					   ee->context.pid,
 					   ee->context.handle,
 					   ee->context.hw_id,
-					   ee->context.ban_score);
+					   ee->context.ban_score,
+					   bannable(&ee->context));
 			err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
 				   upper_32_bits(obj->gtt_offset),
 				   lower_32_bits(obj->gtt_offset));
@@ -1383,6 +1391,7 @@ static void record_context(struct drm_i915_error_context *e,
 	e->hw_id = ctx->hw_id;
 	e->priority = ctx->priority;
 	e->ban_score = atomic_read(&ctx->ban_score);
+	e->bannable = i915_gem_context_is_bannable(ctx);
 	e->guilty = atomic_read(&ctx->guilty_count);
 	e->active = atomic_read(&ctx->active_count);
 }
-- 
2.15.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH] drm/i915: Report if an unbannable context is involved in a GPU hang
  2018-02-05  9:41 [PATCH] drm/i915: Report if an unbannable context is involved in a GPU hang Chris Wilson
@ 2018-02-05 10:01 ` Mika Kuoppala
  2018-02-05 10:37 ` ✓ Fi.CI.BAT: success for " Patchwork
  2018-02-05 11:31 ` ✓ Fi.CI.IGT: " Patchwork
  2 siblings, 0 replies; 4+ messages in thread
From: Mika Kuoppala @ 2018-02-05 10:01 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: Mika

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Since unbannable contexts are special and supposed not to be causing GPU
> hangs in the first place, make it clear when they are implicated in said
> hang. In practice, most unbannable contexts are those created by igt
> for the express purpose of throwing untold thousands of hangs at the GPU
> and wish to keep doing so to finish the test. Normally they are cleaned
> up, but it's when they or the other unbannable kernel contexts stay
> stuck in an erroneous state that we need to worry and so need
> highlighting.
>
> Suggested-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com

+>

Well, this should make things obvious if this happens.

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

> ---
>  drivers/gpu/drm/i915/i915_drv.h       |  1 +
>  drivers/gpu/drm/i915/i915_gpu_error.c | 21 +++++++++++++++------
>  2 files changed, 16 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 4e158aab36d6..d6b5ac2a563d 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -555,6 +555,7 @@ struct i915_gpu_state {
>  			int ban_score;
>  			int active;
>  			int guilty;
> +			bool bannable;
>  		} context;
>  
>  		struct drm_i915_error_object {
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index a81351d9e3a6..67c902412193 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -396,6 +396,11 @@ static void error_print_instdone(struct drm_i915_error_state_buf *m,
>  			   ee->instdone.row[slice][subslice]);
>  }
>  
> +static const char *bannable(const struct drm_i915_error_context *ctx)
> +{
> +	return ctx->bannable ? "" : " (unbannable)";
> +}
> +
>  static void error_print_request(struct drm_i915_error_state_buf *m,
>  				const char *prefix,
>  				const struct drm_i915_error_request *erq)
> @@ -414,9 +419,10 @@ static void error_print_context(struct drm_i915_error_state_buf *m,
>  				const char *header,
>  				const struct drm_i915_error_context *ctx)
>  {
> -	err_printf(m, "%s%s[%d] user_handle %d hw_id %d, prio %d, ban score %d guilty %d active %d\n",
> +	err_printf(m, "%s%s[%d] user_handle %d hw_id %d, prio %d, ban score %d%s guilty %d active %d\n",
>  		   header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id,
> -		   ctx->priority, ctx->ban_score, ctx->guilty, ctx->active);
> +		   ctx->priority, ctx->ban_score, bannable(ctx),
> +		   ctx->guilty, ctx->active);
>  }
>  
>  static void error_print_engine(struct drm_i915_error_state_buf *m,
> @@ -644,11 +650,12 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
>  	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
>  		if (error->engine[i].hangcheck_stalled &&
>  		    error->engine[i].context.pid) {
> -			err_printf(m, "Active process (on ring %s): %s [%d], score %d\n",
> +			err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n",
>  				   engine_name(m->i915, i),
>  				   error->engine[i].context.comm,
>  				   error->engine[i].context.pid,
> -				   error->engine[i].context.ban_score);
> +				   error->engine[i].context.ban_score,
> +				   bannable(&error->engine[i].context));
>  		}
>  	}
>  	err_printf(m, "Reset count: %u\n", error->reset_count);
> @@ -736,12 +743,13 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
>  		if (obj) {
>  			err_puts(m, dev_priv->engine[i]->name);
>  			if (ee->context.pid)
> -				err_printf(m, " (submitted by %s [%d], ctx %d [%d], score %d)",
> +				err_printf(m, " (submitted by %s [%d], ctx %d [%d], score %d%s)",
>  					   ee->context.comm,
>  					   ee->context.pid,
>  					   ee->context.handle,
>  					   ee->context.hw_id,
> -					   ee->context.ban_score);
> +					   ee->context.ban_score,
> +					   bannable(&ee->context));
>  			err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
>  				   upper_32_bits(obj->gtt_offset),
>  				   lower_32_bits(obj->gtt_offset));
> @@ -1383,6 +1391,7 @@ static void record_context(struct drm_i915_error_context *e,
>  	e->hw_id = ctx->hw_id;
>  	e->priority = ctx->priority;
>  	e->ban_score = atomic_read(&ctx->ban_score);
> +	e->bannable = i915_gem_context_is_bannable(ctx);
>  	e->guilty = atomic_read(&ctx->guilty_count);
>  	e->active = atomic_read(&ctx->active_count);
>  }
> -- 
> 2.15.1
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 4+ messages in thread

* ✓ Fi.CI.BAT: success for drm/i915: Report if an unbannable context is involved in a GPU hang
  2018-02-05  9:41 [PATCH] drm/i915: Report if an unbannable context is involved in a GPU hang Chris Wilson
  2018-02-05 10:01 ` Mika Kuoppala
@ 2018-02-05 10:37 ` Patchwork
  2018-02-05 11:31 ` ✓ Fi.CI.IGT: " Patchwork
  2 siblings, 0 replies; 4+ messages in thread
From: Patchwork @ 2018-02-05 10:37 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: drm/i915: Report if an unbannable context is involved in a GPU hang
URL   : https://patchwork.freedesktop.org/series/37648/
State : success

== Summary ==

Series 37648v1 drm/i915: Report if an unbannable context is involved in a GPU hang
https://patchwork.freedesktop.org/api/1.0/series/37648/revisions/1/mbox/

Test gem_mmap_gtt:
        Subgroup basic-small-bo-tiledx:
                pass       -> FAIL       (fi-gdg-551) fdo#102575
Test kms_pipe_crc_basic:
        Subgroup suspend-read-crc-pipe-b:
                pass       -> INCOMPLETE (fi-snb-2520m) fdo#103713

fdo#102575 https://bugs.freedesktop.org/show_bug.cgi?id=102575
fdo#103713 https://bugs.freedesktop.org/show_bug.cgi?id=103713

fi-bdw-5557u     total:288  pass:267  dwarn:0   dfail:0   fail:0   skip:21  time:417s
fi-bdw-gvtdvm    total:288  pass:264  dwarn:0   dfail:0   fail:0   skip:24  time:420s
fi-blb-e6850     total:288  pass:223  dwarn:1   dfail:0   fail:0   skip:64  time:372s
fi-bsw-n3050     total:288  pass:242  dwarn:0   dfail:0   fail:0   skip:46  time:482s
fi-bwr-2160      total:288  pass:183  dwarn:0   dfail:0   fail:0   skip:105 time:281s
fi-bxt-dsi       total:288  pass:258  dwarn:0   dfail:0   fail:0   skip:30  time:481s
fi-bxt-j4205     total:288  pass:259  dwarn:0   dfail:0   fail:0   skip:29  time:482s
fi-byt-j1900     total:288  pass:253  dwarn:0   dfail:0   fail:0   skip:35  time:469s
fi-byt-n2820     total:288  pass:249  dwarn:0   dfail:0   fail:0   skip:39  time:456s
fi-cfl-s2        total:288  pass:262  dwarn:0   dfail:0   fail:0   skip:26  time:559s
fi-elk-e7500     total:288  pass:229  dwarn:0   dfail:0   fail:0   skip:59  time:412s
fi-gdg-551       total:288  pass:179  dwarn:0   dfail:0   fail:1   skip:108 time:278s
fi-glk-1         total:288  pass:260  dwarn:0   dfail:0   fail:0   skip:28  time:512s
fi-hsw-4770      total:288  pass:261  dwarn:0   dfail:0   fail:0   skip:27  time:389s
fi-hsw-4770r     total:288  pass:261  dwarn:0   dfail:0   fail:0   skip:27  time:395s
fi-ilk-650       total:288  pass:228  dwarn:0   dfail:0   fail:0   skip:60  time:409s
fi-ivb-3520m     total:288  pass:259  dwarn:0   dfail:0   fail:0   skip:29  time:452s
fi-ivb-3770      total:288  pass:255  dwarn:0   dfail:0   fail:0   skip:33  time:411s
fi-kbl-7500u     total:288  pass:263  dwarn:1   dfail:0   fail:0   skip:24  time:453s
fi-kbl-7560u     total:288  pass:269  dwarn:0   dfail:0   fail:0   skip:19  time:501s
fi-kbl-7567u     total:288  pass:268  dwarn:0   dfail:0   fail:0   skip:20  time:450s
fi-kbl-r         total:288  pass:261  dwarn:0   dfail:0   fail:0   skip:27  time:499s
fi-pnv-d510      total:288  pass:222  dwarn:1   dfail:0   fail:0   skip:65  time:575s
fi-skl-6260u     total:288  pass:268  dwarn:0   dfail:0   fail:0   skip:20  time:423s
fi-skl-6600u     total:288  pass:261  dwarn:0   dfail:0   fail:0   skip:27  time:505s
fi-skl-6700hq    total:288  pass:262  dwarn:0   dfail:0   fail:0   skip:26  time:526s
fi-skl-6700k2    total:288  pass:264  dwarn:0   dfail:0   fail:0   skip:24  time:490s
fi-skl-6770hq    total:288  pass:268  dwarn:0   dfail:0   fail:0   skip:20  time:479s
fi-skl-guc       total:288  pass:260  dwarn:0   dfail:0   fail:0   skip:28  time:419s
fi-skl-gvtdvm    total:288  pass:265  dwarn:0   dfail:0   fail:0   skip:23  time:427s
fi-snb-2520m     total:245  pass:211  dwarn:0   dfail:0   fail:0   skip:33 
fi-snb-2600      total:288  pass:248  dwarn:0   dfail:0   fail:0   skip:40  time:397s
Blacklisted hosts:
fi-glk-dsi       total:288  pass:258  dwarn:0   dfail:0   fail:0   skip:30  time:466s

2e76a2952923eba64c4f9baf461613bc42ee997a drm-tip: 2018y-02m-02d-20h-33m-12s UTC integration manifest
2166dc72adba drm/i915: Report if an unbannable context is involved in a GPU hang

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_7884/issues.html
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 4+ messages in thread

* ✓ Fi.CI.IGT: success for drm/i915: Report if an unbannable context is involved in a GPU hang
  2018-02-05  9:41 [PATCH] drm/i915: Report if an unbannable context is involved in a GPU hang Chris Wilson
  2018-02-05 10:01 ` Mika Kuoppala
  2018-02-05 10:37 ` ✓ Fi.CI.BAT: success for " Patchwork
@ 2018-02-05 11:31 ` Patchwork
  2 siblings, 0 replies; 4+ messages in thread
From: Patchwork @ 2018-02-05 11:31 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: drm/i915: Report if an unbannable context is involved in a GPU hang
URL   : https://patchwork.freedesktop.org/series/37648/
State : success

== Summary ==

Test perf:
        Subgroup enable-disable:
                pass       -> FAIL       (shard-apl) fdo#103715
Test gem_eio:
        Subgroup in-flight-contexts:
                dmesg-warn -> PASS       (shard-snb) fdo#104058
Test kms_cursor_legacy:
        Subgroup flip-vs-cursor-atomic:
                fail       -> PASS       (shard-hsw) fdo#102670 +1
Test kms_flip:
        Subgroup 2x-flip-vs-expired-vblank-interruptible:
                fail       -> PASS       (shard-hsw) fdo#102887
        Subgroup 2x-plain-flip-ts-check:
                fail       -> PASS       (shard-hsw) fdo#100368
Test perf_pmu:
        Subgroup busy-double-start-vcs0:
                pass       -> INCOMPLETE (shard-apl) fdo#103927

fdo#103715 https://bugs.freedesktop.org/show_bug.cgi?id=103715
fdo#104058 https://bugs.freedesktop.org/show_bug.cgi?id=104058
fdo#102670 https://bugs.freedesktop.org/show_bug.cgi?id=102670
fdo#102887 https://bugs.freedesktop.org/show_bug.cgi?id=102887
fdo#100368 https://bugs.freedesktop.org/show_bug.cgi?id=100368
fdo#103927 https://bugs.freedesktop.org/show_bug.cgi?id=103927

shard-apl        total:2795 pass:1724 dwarn:1   dfail:0   fail:20  skip:1049 time:12004s
shard-hsw        total:2836 pass:1733 dwarn:1   dfail:0   fail:11  skip:1090 time:11572s
shard-snb        total:2836 pass:1328 dwarn:1   dfail:0   fail:10  skip:1497 time:6379s
Blacklisted hosts:
shard-kbl        total:2836 pass:1871 dwarn:1   dfail:1   fail:21  skip:942 time:9408s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_7884/shards.html
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2018-02-05 11:31 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-02-05  9:41 [PATCH] drm/i915: Report if an unbannable context is involved in a GPU hang Chris Wilson
2018-02-05 10:01 ` Mika Kuoppala
2018-02-05 10:37 ` ✓ Fi.CI.BAT: success for " Patchwork
2018-02-05 11:31 ` ✓ Fi.CI.IGT: " Patchwork

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.