All of lore.kernel.org
 help / color / mirror / Atom feed
* [CI] drm/i915: Optionally disable automatic recovery after a GPU reset
@ 2019-02-18 10:58 Chris Wilson
  2019-02-18 11:28 ` ✗ Fi.CI.CHECKPATCH: warning for drm/i915: Optionally disable automatic recovery after a GPU reset (rev3) Patchwork
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: Chris Wilson @ 2019-02-18 10:58 UTC (permalink / raw)
  To: intel-gfx

Some clients, such as mesa, may only emit minimal incremental batches
that rely on the logical context state from previous batches. They know
that recovery is impossible after a hang as their required GPU state is
lost, and that each in flight and subsequent batch will hang (resetting
the context image back to default perpetuating the problem).

To avoid getting into the state in the first place, we can allow clients
to opt out of automatic recovery and elect to ban any guilty context
following a hang. This prevents the continual stream of hangs and allows
the client to recreate their context and rebuild the state from scratch.

v2: Prefer calling it recoverable rather than unrecoverable.

References: https://lists.freedesktop.org/archives/mesa-dev/2019-February/215431.html
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Kenneth Graunke <kenneth@whitecape.org>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Acked-by: Kenneth Graunke <kenneth@whitecape.org> # for mesa
---
 drivers/gpu/drm/i915/i915_gem_context.c | 15 +++++++++++++++
 drivers/gpu/drm/i915/i915_gem_context.h | 16 ++++++++++++++++
 drivers/gpu/drm/i915/i915_reset.c       |  3 ++-
 include/uapi/drm/i915_drm.h             | 20 ++++++++++++++++++++
 4 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 280813a4bf82..da21c843fed8 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -401,6 +401,8 @@ __create_hw_context(struct drm_i915_private *dev_priv,
 	ctx->remap_slice = ALL_L3_SLICES(dev_priv);
 
 	i915_gem_context_set_bannable(ctx);
+	i915_gem_context_set_recoverable(ctx);
+
 	ctx->ring_size = 4 * PAGE_SIZE;
 	ctx->desc_template =
 		default_desc_template(dev_priv, dev_priv->mm.aliasing_ppgtt);
@@ -951,6 +953,10 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 		args->size = 0;
 		args->value = i915_gem_context_is_bannable(ctx);
 		break;
+	case I915_CONTEXT_PARAM_RECOVERABLE:
+		args->size = 0;
+		args->value = i915_gem_context_is_recoverable(ctx);
+		break;
 	case I915_CONTEXT_PARAM_PRIORITY:
 		args->size = 0;
 		args->value = ctx->sched.priority >> I915_USER_PRIORITY_SHIFT;
@@ -1285,6 +1291,15 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
 			i915_gem_context_clear_bannable(ctx);
 		break;
 
+	case I915_CONTEXT_PARAM_RECOVERABLE:
+		if (args->size)
+			ret = -EINVAL;
+		else if (args->value)
+			i915_gem_context_set_recoverable(ctx);
+		else
+			i915_gem_context_clear_recoverable(ctx);
+		break;
+
 	case I915_CONTEXT_PARAM_PRIORITY:
 		{
 			s64 priority = args->value;
diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
index ca150a764c24..071108d34ae0 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.h
+++ b/drivers/gpu/drm/i915/i915_gem_context.h
@@ -134,6 +134,7 @@ struct i915_gem_context {
 #define UCONTEXT_NO_ZEROMAP		0
 #define UCONTEXT_NO_ERROR_CAPTURE	1
 #define UCONTEXT_BANNABLE		2
+#define UCONTEXT_RECOVERABLE		3
 
 	/**
 	 * @flags: small set of booleans
@@ -270,6 +271,21 @@ static inline void i915_gem_context_clear_bannable(struct i915_gem_context *ctx)
 	clear_bit(UCONTEXT_BANNABLE, &ctx->user_flags);
 }
 
+static inline bool i915_gem_context_is_recoverable(const struct i915_gem_context *ctx)
+{
+	return test_bit(UCONTEXT_RECOVERABLE, &ctx->user_flags);
+}
+
+static inline void i915_gem_context_set_recoverable(struct i915_gem_context *ctx)
+{
+	set_bit(UCONTEXT_RECOVERABLE, &ctx->user_flags);
+}
+
+static inline void i915_gem_context_clear_recoverable(struct i915_gem_context *ctx)
+{
+	clear_bit(UCONTEXT_RECOVERABLE, &ctx->user_flags);
+}
+
 static inline bool i915_gem_context_is_banned(const struct i915_gem_context *ctx)
 {
 	return test_bit(CONTEXT_BANNED, &ctx->flags);
diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index 5a067a4b3d5d..1911e00d2581 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -66,7 +66,8 @@ static bool context_mark_guilty(struct i915_gem_context *ctx)
 
 	bannable = i915_gem_context_is_bannable(ctx);
 	score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
-	banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
+	banned = (!i915_gem_context_is_recoverable(ctx) ||
+		  score >= CONTEXT_SCORE_BAN_THRESHOLD);
 
 	/* Cool contexts don't accumulate client ban score */
 	if (!bannable)
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 397810fa2d33..c890b7992d5c 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1491,6 +1491,26 @@ struct drm_i915_gem_context_param {
 	 * drm_i915_gem_context_param_sseu.
 	 */
 #define I915_CONTEXT_PARAM_SSEU		0x7
+
+/*
+ * Not all clients may want to attempt automatic recover of a context after
+ * a hang (for example, some clients may only submit very small incremental
+ * batches relying on known logical state of previous batches which will never
+ * recover correctly and each attempt will hang), and so would prefer that
+ * the context is forever banned instead.
+ *
+ * If set to false (0), after a reset, subsequent (and in flight) rendering
+ * from this context is discarded, and the client will need to create a new
+ * context to use instead.
+ *
+ * If set to true (1), the kernel will automatically attempt to recover the
+ * context by skipping the hanging batch and executing the next batch starting
+ * from the default context state (discarding the incomplete logical context
+ * state lost due to the reset).
+ *
+ * On creation, all new contexts are marked as recoverable.
+ */
+#define I915_CONTEXT_PARAM_RECOVERABLE	0x8
 	__u64 value;
 };
 
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* ✗ Fi.CI.CHECKPATCH: warning for drm/i915: Optionally disable automatic recovery after a GPU reset (rev3)
  2019-02-18 10:58 [CI] drm/i915: Optionally disable automatic recovery after a GPU reset Chris Wilson
@ 2019-02-18 11:28 ` Patchwork
  2019-02-18 11:48 ` ✓ Fi.CI.BAT: success " Patchwork
  2019-02-18 16:01 ` ✗ Fi.CI.IGT: failure " Patchwork
  2 siblings, 0 replies; 4+ messages in thread
From: Patchwork @ 2019-02-18 11:28 UTC (permalink / raw)
  To: intel-gfx

== Series Details ==

Series: drm/i915: Optionally disable automatic recovery after a GPU reset (rev3)
URL   : https://patchwork.freedesktop.org/series/50458/
State : warning

== Summary ==

$ dim checkpatch origin/drm-tip
1a729371c753 drm/i915: Optionally disable automatic recovery after a GPU reset
-:20: WARNING:COMMIT_LOG_LONG_LINE: Possible unwrapped commit description (prefer a maximum 75 chars per line)
#20: 
References: https://lists.freedesktop.org/archives/mesa-dev/2019-February/215431.html

total: 0 errors, 1 warnings, 0 checks, 96 lines checked

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 4+ messages in thread

* ✓ Fi.CI.BAT: success for drm/i915: Optionally disable automatic recovery after a GPU reset (rev3)
  2019-02-18 10:58 [CI] drm/i915: Optionally disable automatic recovery after a GPU reset Chris Wilson
  2019-02-18 11:28 ` ✗ Fi.CI.CHECKPATCH: warning for drm/i915: Optionally disable automatic recovery after a GPU reset (rev3) Patchwork
@ 2019-02-18 11:48 ` Patchwork
  2019-02-18 16:01 ` ✗ Fi.CI.IGT: failure " Patchwork
  2 siblings, 0 replies; 4+ messages in thread
From: Patchwork @ 2019-02-18 11:48 UTC (permalink / raw)
  To: intel-gfx

== Series Details ==

Series: drm/i915: Optionally disable automatic recovery after a GPU reset (rev3)
URL   : https://patchwork.freedesktop.org/series/50458/
State : success

== Summary ==

CI Bug Log - changes from CI_DRM_5625 -> Patchwork_12244
====================================================

Summary
-------

  **SUCCESS**

  No regressions found.

  External URL: https://patchwork.freedesktop.org/api/1.0/series/50458/revisions/3/mbox/

Known issues
------------

  Here are the changes found in Patchwork_12244 that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@kms_pipe_crc_basic@read-crc-pipe-a:
    - fi-byt-clapper:     PASS -> FAIL [fdo#107362]

  
#### Possible fixes ####

  * igt@kms_pipe_crc_basic@suspend-read-crc-pipe-a:
    - fi-byt-clapper:     FAIL [fdo#103191] / [fdo#107362] -> PASS +1

  * igt@pm_rpm@basic-pci-d3-state:
    - fi-bsw-kefka:       {SKIP} [fdo#109271] -> PASS

  * igt@pm_rpm@basic-rte:
    - fi-bsw-kefka:       FAIL [fdo#108800] -> PASS

  
  [fdo#103191]: https://bugs.freedesktop.org/show_bug.cgi?id=103191
  [fdo#107362]: https://bugs.freedesktop.org/show_bug.cgi?id=107362
  [fdo#108800]: https://bugs.freedesktop.org/show_bug.cgi?id=108800
  [fdo#109271]: https://bugs.freedesktop.org/show_bug.cgi?id=109271


Participating hosts (46 -> 40)
------------------------------

  Missing    (6): fi-ilk-m540 fi-hsw-4200u fi-byt-squawks fi-bsw-cyan fi-ctg-p8600 fi-icl-y 


Build changes
-------------

    * Linux: CI_DRM_5625 -> Patchwork_12244

  CI_DRM_5625: de5ff4bca0978ede380791d39b7afa8812622469 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_4834: 6727e17c00b2ad0a801f96fc8a2afe404b95ec88 @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
  Patchwork_12244: 1a729371c75313b2a73a019bb81102e9e4c78c61 @ git://anongit.freedesktop.org/gfx-ci/linux


== Linux commits ==

1a729371c753 drm/i915: Optionally disable automatic recovery after a GPU reset

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_12244/
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 4+ messages in thread

* ✗ Fi.CI.IGT: failure for drm/i915: Optionally disable automatic recovery after a GPU reset (rev3)
  2019-02-18 10:58 [CI] drm/i915: Optionally disable automatic recovery after a GPU reset Chris Wilson
  2019-02-18 11:28 ` ✗ Fi.CI.CHECKPATCH: warning for drm/i915: Optionally disable automatic recovery after a GPU reset (rev3) Patchwork
  2019-02-18 11:48 ` ✓ Fi.CI.BAT: success " Patchwork
@ 2019-02-18 16:01 ` Patchwork
  2 siblings, 0 replies; 4+ messages in thread
From: Patchwork @ 2019-02-18 16:01 UTC (permalink / raw)
  To: intel-gfx

== Series Details ==

Series: drm/i915: Optionally disable automatic recovery after a GPU reset (rev3)
URL   : https://patchwork.freedesktop.org/series/50458/
State : failure

== Summary ==

CI Bug Log - changes from CI_DRM_5625_full -> Patchwork_12244_full
====================================================

Summary
-------

  **FAILURE**

  Serious unknown changes coming with Patchwork_12244_full absolutely need to be
  verified manually.
  
  If you think the reported changes have nothing to do with the changes
  introduced in Patchwork_12244_full, please notify your bug team to allow them
  to document this new failure mode, which will reduce false positives in CI.

  

Possible new issues
-------------------

  Here are the unknown changes that may have been introduced in Patchwork_12244_full:

### IGT changes ###

#### Possible regressions ####

  * igt@gem_ctx_param@invalid-param-set:
    - shard-kbl:          PASS -> FAIL
    - shard-hsw:          PASS -> FAIL
    - shard-glk:          PASS -> FAIL
    - shard-iclb:         PASS -> FAIL
    - shard-apl:          PASS -> FAIL

  
Known issues
------------

  Here are the changes found in Patchwork_12244_full that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@gem_ctx_param@invalid-param-get:
    - shard-apl:          PASS -> FAIL [fdo#109559]
    - shard-iclb:         PASS -> FAIL [fdo#109559]
    - shard-glk:          PASS -> FAIL [fdo#109559]
    - shard-snb:          PASS -> FAIL [fdo#109559]
    - shard-hsw:          PASS -> FAIL [fdo#109559]
    - shard-kbl:          NOTRUN -> FAIL [fdo#109559]

  * igt@i915_selftest@live_workarounds:
    - shard-iclb:         PASS -> DMESG-FAIL [fdo#108954]

  * igt@kms_atomic_transition@1x-modeset-transitions-nonblocking-fencing:
    - shard-apl:          PASS -> FAIL [fdo#109660]

  * igt@kms_busy@extended-pageflip-modeset-hang-oldfb-render-a:
    - shard-hsw:          NOTRUN -> DMESG-WARN [fdo#107956]

  * igt@kms_ccs@pipe-a-crc-sprite-planes-basic:
    - shard-glk:          PASS -> FAIL [fdo#108145]

  * igt@kms_color@pipe-c-ctm-max:
    - shard-iclb:         NOTRUN -> DMESG-FAIL [fdo#109624]

  * igt@kms_cursor_crc@cursor-64x21-sliding:
    - shard-apl:          PASS -> FAIL [fdo#103232]

  * igt@kms_frontbuffer_tracking@fbc-1p-primscrn-spr-indfb-draw-mmap-gtt:
    - shard-apl:          PASS -> FAIL [fdo#103167] +1

  * igt@kms_frontbuffer_tracking@fbc-1p-primscrn-spr-indfb-move:
    - shard-iclb:         PASS -> FAIL [fdo#103167] +2

  * igt@kms_frontbuffer_tracking@fbc-2p-primscrn-spr-indfb-onoff:
    - shard-glk:          PASS -> FAIL [fdo#103167] +1

  * igt@kms_plane_alpha_blend@pipe-a-alpha-basic:
    - shard-kbl:          NOTRUN -> FAIL [fdo#108145] / [fdo#108590]

  * igt@kms_plane_alpha_blend@pipe-a-constant-alpha-max:
    - shard-apl:          NOTRUN -> FAIL [fdo#108145]

  * igt@kms_plane_multiple@atomic-pipe-b-tiling-x:
    - shard-iclb:         PASS -> FAIL [fdo#103166]

  * igt@kms_plane_multiple@atomic-pipe-b-tiling-y:
    - shard-apl:          PASS -> FAIL [fdo#103166] +2

  * igt@kms_plane_multiple@atomic-pipe-c-tiling-none:
    - shard-glk:          PASS -> FAIL [fdo#103166] +2

  * igt@kms_vblank@pipe-c-ts-continuation-modeset:
    - shard-apl:          NOTRUN -> FAIL [fdo#104894]

  * igt@pm_rpm@gem-mmap-cpu:
    - shard-iclb:         PASS -> DMESG-WARN [fdo#107724] +2

  
#### Possible fixes ####

  * igt@kms_cursor_crc@cursor-128x42-offscreen:
    - shard-hsw:          INCOMPLETE [fdo#103540] -> PASS

  * igt@kms_cursor_crc@cursor-256x256-suspend:
    - shard-apl:          FAIL [fdo#103191] / [fdo#103232] -> PASS

  * igt@kms_frontbuffer_tracking@fbc-1p-primscrn-spr-indfb-draw-mmap-cpu:
    - shard-apl:          FAIL [fdo#103167] -> PASS +1

  * igt@kms_frontbuffer_tracking@fbc-2p-primscrn-spr-indfb-draw-blt:
    - shard-glk:          FAIL [fdo#103167] -> PASS

  * igt@kms_frontbuffer_tracking@psr-1p-primscrn-spr-indfb-draw-mmap-wc:
    - shard-iclb:         FAIL [fdo#103167] -> PASS +3

  * igt@kms_plane_multiple@atomic-pipe-a-tiling-y:
    - shard-apl:          FAIL [fdo#103166] -> PASS +1

  * igt@kms_plane_multiple@atomic-pipe-b-tiling-y:
    - shard-glk:          FAIL [fdo#103166] -> PASS +1

  * igt@kms_plane_multiple@atomic-pipe-b-tiling-yf:
    - shard-iclb:         FAIL [fdo#103166] -> PASS +4

  * igt@kms_rotation_crc@multiplane-rotation-cropping-bottom:
    - shard-kbl:          DMESG-FAIL [fdo#105763] -> PASS

  * igt@kms_setmode@basic:
    - shard-apl:          FAIL [fdo#99912] -> PASS

  * igt@kms_sysfs_edid_timing:
    - shard-iclb:         FAIL [fdo#100047] -> PASS

  * igt@pm_rpm@system-suspend-modeset:
    - shard-iclb:         DMESG-WARN [fdo#107724] -> PASS

  
  {name}: This element is suppressed. This means it is ignored when computing
          the status of the difference (SUCCESS, WARNING, or FAILURE).

  [fdo#100047]: https://bugs.freedesktop.org/show_bug.cgi?id=100047
  [fdo#103166]: https://bugs.freedesktop.org/show_bug.cgi?id=103166
  [fdo#103167]: https://bugs.freedesktop.org/show_bug.cgi?id=103167
  [fdo#103191]: https://bugs.freedesktop.org/show_bug.cgi?id=103191
  [fdo#103232]: https://bugs.freedesktop.org/show_bug.cgi?id=103232
  [fdo#103540]: https://bugs.freedesktop.org/show_bug.cgi?id=103540
  [fdo#104894]: https://bugs.freedesktop.org/show_bug.cgi?id=104894
  [fdo#105763]: https://bugs.freedesktop.org/show_bug.cgi?id=105763
  [fdo#107724]: https://bugs.freedesktop.org/show_bug.cgi?id=107724
  [fdo#107956]: https://bugs.freedesktop.org/show_bug.cgi?id=107956
  [fdo#108145]: https://bugs.freedesktop.org/show_bug.cgi?id=108145
  [fdo#108590]: https://bugs.freedesktop.org/show_bug.cgi?id=108590
  [fdo#108954]: https://bugs.freedesktop.org/show_bug.cgi?id=108954
  [fdo#109271]: https://bugs.freedesktop.org/show_bug.cgi?id=109271
  [fdo#109274]: https://bugs.freedesktop.org/show_bug.cgi?id=109274
  [fdo#109278]: https://bugs.freedesktop.org/show_bug.cgi?id=109278
  [fdo#109279]: https://bugs.freedesktop.org/show_bug.cgi?id=109279
  [fdo#109280]: https://bugs.freedesktop.org/show_bug.cgi?id=109280
  [fdo#109284]: https://bugs.freedesktop.org/show_bug.cgi?id=109284
  [fdo#109441]: https://bugs.freedesktop.org/show_bug.cgi?id=109441
  [fdo#109559]: https://bugs.freedesktop.org/show_bug.cgi?id=109559
  [fdo#109624]: https://bugs.freedesktop.org/show_bug.cgi?id=109624
  [fdo#109660]: https://bugs.freedesktop.org/show_bug.cgi?id=109660
  [fdo#99912]: https://bugs.freedesktop.org/show_bug.cgi?id=99912


Participating hosts (7 -> 6)
------------------------------

  Missing    (1): shard-skl 


Build changes
-------------

    * Linux: CI_DRM_5625 -> Patchwork_12244

  CI_DRM_5625: de5ff4bca0978ede380791d39b7afa8812622469 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_4834: 6727e17c00b2ad0a801f96fc8a2afe404b95ec88 @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
  Patchwork_12244: 1a729371c75313b2a73a019bb81102e9e4c78c61 @ git://anongit.freedesktop.org/gfx-ci/linux
  piglit_4509: fdc5a4ca11124ab8413c7988896eec4c97336694 @ git://anongit.freedesktop.org/piglit

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_12244/
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2019-02-18 16:01 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-02-18 10:58 [CI] drm/i915: Optionally disable automatic recovery after a GPU reset Chris Wilson
2019-02-18 11:28 ` ✗ Fi.CI.CHECKPATCH: warning for drm/i915: Optionally disable automatic recovery after a GPU reset (rev3) Patchwork
2019-02-18 11:48 ` ✓ Fi.CI.BAT: success " Patchwork
2019-02-18 16:01 ` ✗ Fi.CI.IGT: failure " Patchwork

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.