All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/i915: Optionally disable automatic recovery after a GPU reset
@ 2018-10-02 19:07 Chris Wilson
  2018-10-02 19:47 ` ✓ Fi.CI.BAT: success for " Patchwork
                   ` (4 more replies)
  0 siblings, 5 replies; 8+ messages in thread
From: Chris Wilson @ 2018-10-02 19:07 UTC (permalink / raw)
  To: intel-gfx; +Cc: Kenneth Graunke, Mika Kuoppala

Some clients, such as mesa, may only emit minimal incremental batches
that rely on the logical context state from previous batches. They know
that recovery is impossible after a hang as their required GPU state is
lost, and that each in flight and subsequent batch will hang (resetting
the context image back to default perpetuating the problem).

To avoid getting into the state in the first place, we can allow clients
to opt out of automatic recovery and elect to ban any guilty context
following a hang. This prevents the continual stream of hangs and allows
the client to recreate their context and rebuild the state from scratch.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Kenneth Graunke <kenneth@whitecape.org>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c         |  3 ++-
 drivers/gpu/drm/i915/i915_gem_context.c | 12 ++++++++++++
 drivers/gpu/drm/i915/i915_gem_context.h | 16 ++++++++++++++++
 include/uapi/drm/i915_drm.h             | 20 ++++++++++++++++++++
 4 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 7d45e71100bc..eee06d90d460 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3018,7 +3018,8 @@ static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
 
 	bannable = i915_gem_context_is_bannable(ctx);
 	score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
-	banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
+	banned = (i915_gem_context_is_unrecoverable(ctx) ||
+		  score >= CONTEXT_SCORE_BAN_THRESHOLD);
 
 	/* Cool contexts don't accumulate client ban score */
 	if (!bannable)
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 8cbe58070561..2d5e4119786a 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -878,6 +878,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 	case I915_CONTEXT_PARAM_BANNABLE:
 		args->value = i915_gem_context_is_bannable(ctx);
 		break;
+	case I915_CONTEXT_PARAM_RECOVERABLE:
+		args->value = !i915_gem_context_is_unrecoverable(ctx);
+		break;
 	case I915_CONTEXT_PARAM_PRIORITY:
 		args->value = ctx->sched.priority >> I915_USER_PRIORITY_SHIFT;
 		break;
@@ -933,6 +936,15 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
 			i915_gem_context_clear_bannable(ctx);
 		break;
 
+	case I915_CONTEXT_PARAM_RECOVERABLE:
+		if (args->size)
+			ret = -EINVAL;
+		else if (args->value)
+			i915_gem_context_clear_unrecoverable(ctx);
+		else
+			i915_gem_context_set_unrecoverable(ctx);
+		break;
+
 	case I915_CONTEXT_PARAM_PRIORITY:
 		{
 			s64 priority = args->value;
diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
index 08165f6a0a84..2d6b8b0307e5 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.h
+++ b/drivers/gpu/drm/i915/i915_gem_context.h
@@ -123,6 +123,7 @@ struct i915_gem_context {
 #define UCONTEXT_NO_ZEROMAP		0
 #define UCONTEXT_NO_ERROR_CAPTURE	1
 #define UCONTEXT_BANNABLE		2
+#define UCONTEXT_NO_RECOVERY		3
 
 	/**
 	 * @flags: small set of booleans
@@ -247,6 +248,21 @@ static inline void i915_gem_context_clear_bannable(struct i915_gem_context *ctx)
 	clear_bit(UCONTEXT_BANNABLE, &ctx->user_flags);
 }
 
+static inline bool i915_gem_context_is_unrecoverable(const struct i915_gem_context *ctx)
+{
+	return test_bit(UCONTEXT_NO_RECOVERY, &ctx->user_flags);
+}
+
+static inline void i915_gem_context_set_unrecoverable(struct i915_gem_context *ctx)
+{
+	set_bit(UCONTEXT_NO_RECOVERY, &ctx->user_flags);
+}
+
+static inline void i915_gem_context_clear_unrecoverable(struct i915_gem_context *ctx)
+{
+	clear_bit(UCONTEXT_NO_RECOVERY, &ctx->user_flags);
+}
+
 static inline bool i915_gem_context_is_banned(const struct i915_gem_context *ctx)
 {
 	return test_bit(CONTEXT_BANNED, &ctx->flags);
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 298b2e197744..eeb258a12b95 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1486,6 +1486,26 @@ struct drm_i915_gem_context_param {
 #define   I915_CONTEXT_MAX_USER_PRIORITY	1023 /* inclusive */
 #define   I915_CONTEXT_DEFAULT_PRIORITY		0
 #define   I915_CONTEXT_MIN_USER_PRIORITY	-1023 /* inclusive */
+
+/*
+ * Not all clients may want to attempt automatic recover of a context after
+ * a hang (for example, some clients may only submit very small incremental
+ * batches relying on known logical state of previous batches which will never
+ * recover correctly and each attempt will hang), and so would prefer that
+ * the context is forever banned instead.
+ *
+ * If set to false (0), after a reset, subsequent (and in flight) rendering
+ * from this context is discarded, and the client will need to create a new
+ * context to use instead.
+ *
+ * If set to true (1), the kernel will automatically attempt to recover the
+ * context by skipping the hanging batch and executing the next batch starting
+ * from the default context state (discarding the incomplete logical context
+ * state lost due to the reset).
+ *
+ * On creation, all new contexts are marked as recoverable.
+ */
+#define I915_CONTEXT_PARAM_RECOVERABLE	0x7
 	__u64 value;
 };
 
-- 
2.19.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* ✓ Fi.CI.BAT: success for drm/i915: Optionally disable automatic recovery after a GPU reset
  2018-10-02 19:07 [PATCH] drm/i915: Optionally disable automatic recovery after a GPU reset Chris Wilson
@ 2018-10-02 19:47 ` Patchwork
  2018-10-03  6:22 ` [PATCH] " Mika Kuoppala
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 8+ messages in thread
From: Patchwork @ 2018-10-02 19:47 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: drm/i915: Optionally disable automatic recovery after a GPU reset
URL   : https://patchwork.freedesktop.org/series/50458/
State : success

== Summary ==

= CI Bug Log - changes from CI_DRM_4915 -> Patchwork_10329 =

== Summary - WARNING ==

  Minor unknown changes coming with Patchwork_10329 need to be verified
  manually.
  
  If you think the reported changes have nothing to do with the changes
  introduced in Patchwork_10329, please notify your bug team to allow them
  to document this new failure mode, which will reduce false positives in CI.

  External URL: https://patchwork.freedesktop.org/api/1.0/series/50458/revisions/1/mbox/

== Possible new issues ==

  Here are the unknown changes that may have been introduced in Patchwork_10329:

  === IGT changes ===

    ==== Warnings ====

    igt@drv_selftest@live_guc:
      fi-glk-j4005:       SKIP -> PASS

    
== Known issues ==

  Here are the changes found in Patchwork_10329 that come from known issues:

  === IGT changes ===

    ==== Issues hit ====

    igt@kms_frontbuffer_tracking@basic:
      fi-byt-clapper:     PASS -> FAIL (fdo#103167)

    igt@kms_pipe_crc_basic@suspend-read-crc-pipe-a:
      fi-bdw-samus:       NOTRUN -> INCOMPLETE (fdo#107773)

    
    ==== Possible fixes ====

    igt@drv_selftest@live_coherency:
      fi-gdg-551:         DMESG-FAIL (fdo#107164) -> PASS

    igt@drv_selftest@live_execlists:
      fi-glk-j4005:       INCOMPLETE (k.org#198133, fdo#103359) -> PASS

    igt@gem_exec_suspend@basic-s3:
      fi-bdw-samus:       INCOMPLETE (fdo#107773) -> PASS

    igt@kms_pipe_crc_basic@read-crc-pipe-a-frame-sequence:
      fi-byt-clapper:     FAIL (fdo#103191, fdo#107362) -> PASS

    igt@kms_pipe_crc_basic@suspend-read-crc-pipe-b:
      fi-skl-guc:         FAIL (fdo#103191) -> PASS

    
  fdo#103167 https://bugs.freedesktop.org/show_bug.cgi?id=103167
  fdo#103191 https://bugs.freedesktop.org/show_bug.cgi?id=103191
  fdo#103359 https://bugs.freedesktop.org/show_bug.cgi?id=103359
  fdo#107164 https://bugs.freedesktop.org/show_bug.cgi?id=107164
  fdo#107362 https://bugs.freedesktop.org/show_bug.cgi?id=107362
  fdo#107773 https://bugs.freedesktop.org/show_bug.cgi?id=107773
  k.org#198133 https://bugzilla.kernel.org/show_bug.cgi?id=198133


== Participating hosts (46 -> 41) ==

  Missing    (5): fi-bsw-cyan fi-byt-squawks fi-icl-u2 fi-skl-6260u fi-pnv-d510 


== Build changes ==

    * Linux: CI_DRM_4915 -> Patchwork_10329

  CI_DRM_4915: 26e7a7d954a9c28b97af8ca7813f430fd9117232 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_4660: d0975646c50568e66e65b44b81d28232d059b94e @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
  Patchwork_10329: 96793c4e00847805704679317f5e403d80ca0948 @ git://anongit.freedesktop.org/gfx-ci/linux


== Linux commits ==

96793c4e0084 drm/i915: Optionally disable automatic recovery after a GPU reset

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_10329/issues.html
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] drm/i915: Optionally disable automatic recovery after a GPU reset
  2018-10-02 19:07 [PATCH] drm/i915: Optionally disable automatic recovery after a GPU reset Chris Wilson
  2018-10-02 19:47 ` ✓ Fi.CI.BAT: success for " Patchwork
@ 2018-10-03  6:22 ` Mika Kuoppala
  2018-10-03  7:22   ` Chris Wilson
  2018-10-03 10:59 ` [PATCH v2] " Chris Wilson
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 8+ messages in thread
From: Mika Kuoppala @ 2018-10-03  6:22 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: Kenneth Graunke

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Some clients, such as mesa, may only emit minimal incremental batches
> that rely on the logical context state from previous batches. They know
> that recovery is impossible after a hang as their required GPU state is
> lost, and that each in flight and subsequent batch will hang (resetting
> the context image back to default perpetuating the problem).
>
> To avoid getting into the state in the first place, we can allow clients
> to opt out of automatic recovery and elect to ban any guilty context
> following a hang. This prevents the continual stream of hangs and allows
> the client to recreate their context and rebuild the state from scratch.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Kenneth Graunke <kenneth@whitecape.org>
> Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem.c         |  3 ++-
>  drivers/gpu/drm/i915/i915_gem_context.c | 12 ++++++++++++
>  drivers/gpu/drm/i915/i915_gem_context.h | 16 ++++++++++++++++
>  include/uapi/drm/i915_drm.h             | 20 ++++++++++++++++++++
>  4 files changed, 50 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 7d45e71100bc..eee06d90d460 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3018,7 +3018,8 @@ static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
>  
>  	bannable = i915_gem_context_is_bannable(ctx);
>  	score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
> -	banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
> +	banned = (i915_gem_context_is_unrecoverable(ctx) ||
> +		  score >= CONTEXT_SCORE_BAN_THRESHOLD);

We treat banned contexts rather aggressively on client level
banning scoring.

Should we give some leeway if a client tells it don't
need recovery, instead of being more harsh on them?

As with this, third hang would lead to client ban.

>  
>  	/* Cool contexts don't accumulate client ban score */
>  	if (!bannable)
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 8cbe58070561..2d5e4119786a 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -878,6 +878,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
>  	case I915_CONTEXT_PARAM_BANNABLE:
>  		args->value = i915_gem_context_is_bannable(ctx);
>  		break;
> +	case I915_CONTEXT_PARAM_RECOVERABLE:
> +		args->value = !i915_gem_context_is_unrecoverable(ctx);

Pondering here why not the internal coding be also
i915_gem_context_is_recoverable(ctx). Atleast in here
would read better.

-Mika

> +		break;
>  	case I915_CONTEXT_PARAM_PRIORITY:
>  		args->value = ctx->sched.priority >> I915_USER_PRIORITY_SHIFT;
>  		break;
> @@ -933,6 +936,15 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
>  			i915_gem_context_clear_bannable(ctx);
>  		break;
>  
> +	case I915_CONTEXT_PARAM_RECOVERABLE:
> +		if (args->size)
> +			ret = -EINVAL;
> +		else if (args->value)
> +			i915_gem_context_clear_unrecoverable(ctx);
> +		else
> +			i915_gem_context_set_unrecoverable(ctx);
> +		break;
> +
>  	case I915_CONTEXT_PARAM_PRIORITY:
>  		{
>  			s64 priority = args->value;
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
> index 08165f6a0a84..2d6b8b0307e5 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.h
> +++ b/drivers/gpu/drm/i915/i915_gem_context.h
> @@ -123,6 +123,7 @@ struct i915_gem_context {
>  #define UCONTEXT_NO_ZEROMAP		0
>  #define UCONTEXT_NO_ERROR_CAPTURE	1
>  #define UCONTEXT_BANNABLE		2
> +#define UCONTEXT_NO_RECOVERY		3
>  
>  	/**
>  	 * @flags: small set of booleans
> @@ -247,6 +248,21 @@ static inline void i915_gem_context_clear_bannable(struct i915_gem_context *ctx)
>  	clear_bit(UCONTEXT_BANNABLE, &ctx->user_flags);
>  }
>  
> +static inline bool i915_gem_context_is_unrecoverable(const struct i915_gem_context *ctx)
> +{
> +	return test_bit(UCONTEXT_NO_RECOVERY, &ctx->user_flags);
> +}
> +
> +static inline void i915_gem_context_set_unrecoverable(struct i915_gem_context *ctx)
> +{
> +	set_bit(UCONTEXT_NO_RECOVERY, &ctx->user_flags);
> +}
> +
> +static inline void i915_gem_context_clear_unrecoverable(struct i915_gem_context *ctx)
> +{
> +	clear_bit(UCONTEXT_NO_RECOVERY, &ctx->user_flags);
> +}
> +
>  static inline bool i915_gem_context_is_banned(const struct i915_gem_context *ctx)
>  {
>  	return test_bit(CONTEXT_BANNED, &ctx->flags);
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 298b2e197744..eeb258a12b95 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -1486,6 +1486,26 @@ struct drm_i915_gem_context_param {
>  #define   I915_CONTEXT_MAX_USER_PRIORITY	1023 /* inclusive */
>  #define   I915_CONTEXT_DEFAULT_PRIORITY		0
>  #define   I915_CONTEXT_MIN_USER_PRIORITY	-1023 /* inclusive */
> +
> +/*
> + * Not all clients may want to attempt automatic recover of a context after
> + * a hang (for example, some clients may only submit very small incremental
> + * batches relying on known logical state of previous batches which will never
> + * recover correctly and each attempt will hang), and so would prefer that
> + * the context is forever banned instead.
> + *
> + * If set to false (0), after a reset, subsequent (and in flight) rendering
> + * from this context is discarded, and the client will need to create a new
> + * context to use instead.
> + *
> + * If set to true (1), the kernel will automatically attempt to recover the
> + * context by skipping the hanging batch and executing the next batch starting
> + * from the default context state (discarding the incomplete logical context
> + * state lost due to the reset).
> + *
> + * On creation, all new contexts are marked as recoverable.
> + */
> +#define I915_CONTEXT_PARAM_RECOVERABLE	0x7
>  	__u64 value;
>  };
>  
> -- 
> 2.19.0
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] drm/i915: Optionally disable automatic recovery after a GPU reset
  2018-10-03  6:22 ` [PATCH] " Mika Kuoppala
@ 2018-10-03  7:22   ` Chris Wilson
  0 siblings, 0 replies; 8+ messages in thread
From: Chris Wilson @ 2018-10-03  7:22 UTC (permalink / raw)
  To: Mika Kuoppala, intel-gfx; +Cc: Kenneth Graunke

Quoting Mika Kuoppala (2018-10-03 07:22:13)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > Some clients, such as mesa, may only emit minimal incremental batches
> > that rely on the logical context state from previous batches. They know
> > that recovery is impossible after a hang as their required GPU state is
> > lost, and that each in flight and subsequent batch will hang (resetting
> > the context image back to default perpetuating the problem).
> >
> > To avoid getting into the state in the first place, we can allow clients
> > to opt out of automatic recovery and elect to ban any guilty context
> > following a hang. This prevents the continual stream of hangs and allows
> > the client to recreate their context and rebuild the state from scratch.
> >
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Kenneth Graunke <kenneth@whitecape.org>
> > Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_gem.c         |  3 ++-
> >  drivers/gpu/drm/i915/i915_gem_context.c | 12 ++++++++++++
> >  drivers/gpu/drm/i915/i915_gem_context.h | 16 ++++++++++++++++
> >  include/uapi/drm/i915_drm.h             | 20 ++++++++++++++++++++
> >  4 files changed, 50 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> > index 7d45e71100bc..eee06d90d460 100644
> > --- a/drivers/gpu/drm/i915/i915_gem.c
> > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > @@ -3018,7 +3018,8 @@ static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
> >  
> >       bannable = i915_gem_context_is_bannable(ctx);
> >       score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
> > -     banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
> > +     banned = (i915_gem_context_is_unrecoverable(ctx) ||
> > +               score >= CONTEXT_SCORE_BAN_THRESHOLD);
> 
> We treat banned contexts rather aggressively on client level
> banning scoring.
> 
> Should we give some leeway if a client tells it don't
> need recovery, instead of being more harsh on them?
> 
> As with this, third hang would lead to client ban.

Yes, no. I was thinking of the same problem, but ultimately I decided
that it was not worth pursuing atm. It still takes 10s (or thereabouts)
for us to detect a hang, so a client can lock the machine up for 30s
even with the "quicker" recovery. As stated, the client will be banned
anyway as it cannot recover without taking action itself (just after a
few broken batches). So from that perspective, it doesn't look any worse
than before. If it becomes a more noticeable problem (WebGL) we can open
a dialogue as to how to proceed.

> >       /* Cool contexts don't accumulate client ban score */
> >       if (!bannable)
> > diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> > index 8cbe58070561..2d5e4119786a 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_context.c
> > +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> > @@ -878,6 +878,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
> >       case I915_CONTEXT_PARAM_BANNABLE:
> >               args->value = i915_gem_context_is_bannable(ctx);
> >               break;
> > +     case I915_CONTEXT_PARAM_RECOVERABLE:
> > +             args->value = !i915_gem_context_is_unrecoverable(ctx);
> 
> Pondering here why not the internal coding be also
> i915_gem_context_is_recoverable(ctx). Atleast in here
> would read better.

6 of one, half a dozen of the other. Improve one place, makes the other
equally less readable. I'd buy the argument that we should prefer the
default state to be expressed in the name.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH v2] drm/i915: Optionally disable automatic recovery after a GPU reset
  2018-10-02 19:07 [PATCH] drm/i915: Optionally disable automatic recovery after a GPU reset Chris Wilson
  2018-10-02 19:47 ` ✓ Fi.CI.BAT: success for " Patchwork
  2018-10-03  6:22 ` [PATCH] " Mika Kuoppala
@ 2018-10-03 10:59 ` Chris Wilson
  2018-10-04  9:58   ` Mika Kuoppala
  2018-10-03 13:02 ` ✓ Fi.CI.BAT: success for drm/i915: Optionally disable automatic recovery after a GPU reset (rev2) Patchwork
  2018-10-03 21:13 ` ✗ Fi.CI.IGT: failure " Patchwork
  4 siblings, 1 reply; 8+ messages in thread
From: Chris Wilson @ 2018-10-03 10:59 UTC (permalink / raw)
  To: intel-gfx; +Cc: Kenneth Graunke, Mika Kuoppala

Some clients, such as mesa, may only emit minimal incremental batches
that rely on the logical context state from previous batches. They know
that recovery is impossible after a hang as their required GPU state is
lost, and that each in flight and subsequent batch will hang (resetting
the context image back to default perpetuating the problem).

To avoid getting into the state in the first place, we can allow clients
to opt out of automatic recovery and elect to ban any guilty context
following a hang. This prevents the continual stream of hangs and allows
the client to recreate their context and rebuild the state from scratch.

v2: Prefer calling it recoverable rather than unrecoverable.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Kenneth Graunke <kenneth@whitecape.org>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c         |  3 ++-
 drivers/gpu/drm/i915/i915_gem_context.c | 13 +++++++++++++
 drivers/gpu/drm/i915/i915_gem_context.h | 16 ++++++++++++++++
 include/uapi/drm/i915_drm.h             | 20 ++++++++++++++++++++
 4 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 7d45e71100bc..9b02d70fbabf 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3018,7 +3018,8 @@ static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
 
 	bannable = i915_gem_context_is_bannable(ctx);
 	score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
-	banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
+	banned = (!i915_gem_context_is_recoverable(ctx) ||
+		  score >= CONTEXT_SCORE_BAN_THRESHOLD);
 
 	/* Cool contexts don't accumulate client ban score */
 	if (!bannable)
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 8cbe58070561..b31da0375c8c 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -338,6 +338,7 @@ __create_hw_context(struct drm_i915_private *dev_priv,
 	list_add_tail(&ctx->link, &dev_priv->contexts.list);
 	ctx->i915 = dev_priv;
 	ctx->sched.priority = I915_USER_PRIORITY(I915_PRIORITY_NORMAL);
+	ctx->user_flags = BIT(UCONTEXT_RECOVERABLE);
 
 	for (n = 0; n < ARRAY_SIZE(ctx->__engine); n++) {
 		struct intel_context *ce = &ctx->__engine[n];
@@ -878,6 +879,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 	case I915_CONTEXT_PARAM_BANNABLE:
 		args->value = i915_gem_context_is_bannable(ctx);
 		break;
+	case I915_CONTEXT_PARAM_RECOVERABLE:
+		args->value = i915_gem_context_is_recoverable(ctx);
+		break;
 	case I915_CONTEXT_PARAM_PRIORITY:
 		args->value = ctx->sched.priority >> I915_USER_PRIORITY_SHIFT;
 		break;
@@ -933,6 +937,15 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
 			i915_gem_context_clear_bannable(ctx);
 		break;
 
+	case I915_CONTEXT_PARAM_RECOVERABLE:
+		if (args->size)
+			ret = -EINVAL;
+		else if (args->value)
+			i915_gem_context_set_recoverable(ctx);
+		else
+			i915_gem_context_clear_recoverable(ctx);
+		break;
+
 	case I915_CONTEXT_PARAM_PRIORITY:
 		{
 			s64 priority = args->value;
diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
index 08165f6a0a84..397d50b826d7 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.h
+++ b/drivers/gpu/drm/i915/i915_gem_context.h
@@ -123,6 +123,7 @@ struct i915_gem_context {
 #define UCONTEXT_NO_ZEROMAP		0
 #define UCONTEXT_NO_ERROR_CAPTURE	1
 #define UCONTEXT_BANNABLE		2
+#define UCONTEXT_RECOVERABLE		3
 
 	/**
 	 * @flags: small set of booleans
@@ -247,6 +248,21 @@ static inline void i915_gem_context_clear_bannable(struct i915_gem_context *ctx)
 	clear_bit(UCONTEXT_BANNABLE, &ctx->user_flags);
 }
 
+static inline bool i915_gem_context_is_recoverable(const struct i915_gem_context *ctx)
+{
+	return test_bit(UCONTEXT_RECOVERABLE, &ctx->user_flags);
+}
+
+static inline void i915_gem_context_set_recoverable(struct i915_gem_context *ctx)
+{
+	set_bit(UCONTEXT_RECOVERABLE, &ctx->user_flags);
+}
+
+static inline void i915_gem_context_clear_recoverable(struct i915_gem_context *ctx)
+{
+	clear_bit(UCONTEXT_RECOVERABLE, &ctx->user_flags);
+}
+
 static inline bool i915_gem_context_is_banned(const struct i915_gem_context *ctx)
 {
 	return test_bit(CONTEXT_BANNED, &ctx->flags);
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 298b2e197744..eeb258a12b95 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1486,6 +1486,26 @@ struct drm_i915_gem_context_param {
 #define   I915_CONTEXT_MAX_USER_PRIORITY	1023 /* inclusive */
 #define   I915_CONTEXT_DEFAULT_PRIORITY		0
 #define   I915_CONTEXT_MIN_USER_PRIORITY	-1023 /* inclusive */
+
+/*
+ * Not all clients may want to attempt automatic recover of a context after
+ * a hang (for example, some clients may only submit very small incremental
+ * batches relying on known logical state of previous batches which will never
+ * recover correctly and each attempt will hang), and so would prefer that
+ * the context is forever banned instead.
+ *
+ * If set to false (0), after a reset, subsequent (and in flight) rendering
+ * from this context is discarded, and the client will need to create a new
+ * context to use instead.
+ *
+ * If set to true (1), the kernel will automatically attempt to recover the
+ * context by skipping the hanging batch and executing the next batch starting
+ * from the default context state (discarding the incomplete logical context
+ * state lost due to the reset).
+ *
+ * On creation, all new contexts are marked as recoverable.
+ */
+#define I915_CONTEXT_PARAM_RECOVERABLE	0x7
 	__u64 value;
 };
 
-- 
2.19.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* ✓ Fi.CI.BAT: success for drm/i915: Optionally disable automatic recovery after a GPU reset (rev2)
  2018-10-02 19:07 [PATCH] drm/i915: Optionally disable automatic recovery after a GPU reset Chris Wilson
                   ` (2 preceding siblings ...)
  2018-10-03 10:59 ` [PATCH v2] " Chris Wilson
@ 2018-10-03 13:02 ` Patchwork
  2018-10-03 21:13 ` ✗ Fi.CI.IGT: failure " Patchwork
  4 siblings, 0 replies; 8+ messages in thread
From: Patchwork @ 2018-10-03 13:02 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: drm/i915: Optionally disable automatic recovery after a GPU reset (rev2)
URL   : https://patchwork.freedesktop.org/series/50458/
State : success

== Summary ==

= CI Bug Log - changes from CI_DRM_4919 -> Patchwork_10339 =

== Summary - WARNING ==

  Minor unknown changes coming with Patchwork_10339 need to be verified
  manually.
  
  If you think the reported changes have nothing to do with the changes
  introduced in Patchwork_10339, please notify your bug team to allow them
  to document this new failure mode, which will reduce false positives in CI.

  External URL: https://patchwork.freedesktop.org/api/1.0/series/50458/revisions/2/mbox/

== Possible new issues ==

  Here are the unknown changes that may have been introduced in Patchwork_10339:

  === IGT changes ===

    ==== Warnings ====

    igt@pm_rpm@module-reload:
      fi-hsw-4770r:       PASS -> SKIP

    
== Known issues ==

  Here are the changes found in Patchwork_10339 that come from known issues:

  === IGT changes ===

    ==== Issues hit ====

    igt@amdgpu/amd_basic@cs-compute:
      fi-kbl-8809g:       NOTRUN -> FAIL (fdo#108094)

    igt@amdgpu/amd_prime@amd-to-i915:
      fi-kbl-8809g:       NOTRUN -> FAIL (fdo#107341)

    igt@drv_module_reload@basic-reload:
      fi-glk-j4005:       NOTRUN -> DMESG-WARN (fdo#106248, fdo#106725)

    igt@drv_module_reload@basic-reload-inject:
      fi-hsw-4770r:       PASS -> DMESG-WARN (fdo#107924, fdo#107425)

    igt@gem_exec_suspend@basic-s3:
      fi-bdw-samus:       PASS -> INCOMPLETE (fdo#107773)

    igt@kms_flip@basic-flip-vs-modeset:
      fi-glk-j4005:       NOTRUN -> DMESG-WARN (fdo#106097)

    igt@kms_frontbuffer_tracking@basic:
      fi-byt-clapper:     NOTRUN -> FAIL (fdo#103167)

    igt@kms_pipe_crc_basic@nonblocking-crc-pipe-b-frame-sequence:
      fi-byt-clapper:     NOTRUN -> FAIL (fdo#107362, fdo#103191)

    igt@kms_pipe_crc_basic@suspend-read-crc-pipe-b:
      fi-blb-e6850:       PASS -> INCOMPLETE (fdo#107718)

    igt@pm_rpm@module-reload:
      fi-skl-6600u:       PASS -> INCOMPLETE (fdo#107807)
      fi-glk-j4005:       NOTRUN -> FAIL (fdo#104767)

    
    ==== Possible fixes ====

    igt@gem_exec_suspend@basic-s3:
      fi-byt-clapper:     INCOMPLETE (fdo#102657) -> PASS

    
  fdo#102657 https://bugs.freedesktop.org/show_bug.cgi?id=102657
  fdo#103167 https://bugs.freedesktop.org/show_bug.cgi?id=103167
  fdo#103191 https://bugs.freedesktop.org/show_bug.cgi?id=103191
  fdo#104767 https://bugs.freedesktop.org/show_bug.cgi?id=104767
  fdo#106097 https://bugs.freedesktop.org/show_bug.cgi?id=106097
  fdo#106248 https://bugs.freedesktop.org/show_bug.cgi?id=106248
  fdo#106725 https://bugs.freedesktop.org/show_bug.cgi?id=106725
  fdo#107341 https://bugs.freedesktop.org/show_bug.cgi?id=107341
  fdo#107362 https://bugs.freedesktop.org/show_bug.cgi?id=107362
  fdo#107425 https://bugs.freedesktop.org/show_bug.cgi?id=107425
  fdo#107718 https://bugs.freedesktop.org/show_bug.cgi?id=107718
  fdo#107773 https://bugs.freedesktop.org/show_bug.cgi?id=107773
  fdo#107807 https://bugs.freedesktop.org/show_bug.cgi?id=107807
  fdo#107924 https://bugs.freedesktop.org/show_bug.cgi?id=107924
  fdo#108094 https://bugs.freedesktop.org/show_bug.cgi?id=108094


== Participating hosts (49 -> 43) ==

  Additional (1): fi-glk-j4005 
  Missing    (7): fi-hsw-4200u fi-byt-squawks fi-bsw-cyan fi-icl-u2 fi-cfl-guc fi-ctg-p8600 fi-icl-u 


== Build changes ==

    * Linux: CI_DRM_4919 -> Patchwork_10339

  CI_DRM_4919: e489eb0296673790264b25266ef45aae7d1ab566 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_4665: 267870165d9ef66b4ab423e4efe7bacba023d75e @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
  Patchwork_10339: aaa4a238fd2f72bd4e7970197f93a70d436c3a9a @ git://anongit.freedesktop.org/gfx-ci/linux


== Linux commits ==

aaa4a238fd2f drm/i915: Optionally disable automatic recovery after a GPU reset

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_10339/issues.html
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* ✗ Fi.CI.IGT: failure for drm/i915: Optionally disable automatic recovery after a GPU reset (rev2)
  2018-10-02 19:07 [PATCH] drm/i915: Optionally disable automatic recovery after a GPU reset Chris Wilson
                   ` (3 preceding siblings ...)
  2018-10-03 13:02 ` ✓ Fi.CI.BAT: success for drm/i915: Optionally disable automatic recovery after a GPU reset (rev2) Patchwork
@ 2018-10-03 21:13 ` Patchwork
  4 siblings, 0 replies; 8+ messages in thread
From: Patchwork @ 2018-10-03 21:13 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: drm/i915: Optionally disable automatic recovery after a GPU reset (rev2)
URL   : https://patchwork.freedesktop.org/series/50458/
State : failure

== Summary ==

= CI Bug Log - changes from CI_DRM_4919_full -> Patchwork_10339_full =

== Summary - FAILURE ==

  Serious unknown changes coming with Patchwork_10339_full absolutely need to be
  verified manually.
  
  If you think the reported changes have nothing to do with the changes
  introduced in Patchwork_10339_full, please notify your bug team to allow them
  to document this new failure mode, which will reduce false positives in CI.

  

== Possible new issues ==

  Here are the unknown changes that may have been introduced in Patchwork_10339_full:

  === IGT changes ===

    ==== Possible regressions ====

    igt@gem_ctx_param@invalid-param-get:
      shard-skl:          PASS -> FAIL +1
      shard-apl:          PASS -> FAIL +1
      shard-glk:          PASS -> FAIL +1

    igt@gem_ctx_param@invalid-param-set:
      shard-kbl:          PASS -> FAIL +1
      shard-hsw:          PASS -> FAIL +1
      shard-snb:          PASS -> FAIL +1

    
== Known issues ==

  Here are the changes found in Patchwork_10339_full that come from known issues:

  === IGT changes ===

    ==== Issues hit ====

    igt@drv_suspend@debugfs-reader:
      shard-skl:          NOTRUN -> INCOMPLETE (fdo#104108, fdo#107773) +1

    igt@gem_exec_big:
      shard-hsw:          PASS -> TIMEOUT (fdo#107937)

    igt@gem_userptr_blits@readonly-unsync:
      shard-kbl:          PASS -> INCOMPLETE (fdo#103665)

    igt@kms_cursor_crc@cursor-128x128-random:
      shard-apl:          PASS -> FAIL (fdo#103232) +4

    igt@kms_cursor_crc@cursor-128x42-onscreen:
      shard-glk:          PASS -> FAIL (fdo#103232) +2

    igt@kms_cursor_crc@cursor-64x64-suspend:
      shard-apl:          PASS -> FAIL (fdo#103232, fdo#103191)

    igt@kms_flip@2x-flip-vs-expired-vblank:
      shard-glk:          PASS -> FAIL (fdo#105363)

    igt@kms_frontbuffer_tracking@fbc-1p-primscrn-spr-indfb-draw-blt:
      shard-glk:          PASS -> FAIL (fdo#103167) +3

    igt@kms_frontbuffer_tracking@fbc-1p-primscrn-spr-indfb-draw-mmap-gtt:
      shard-apl:          PASS -> FAIL (fdo#103167) +3

    igt@kms_pipe_crc_basic@suspend-read-crc-pipe-c:
      shard-glk:          PASS -> DMESG-WARN (fdo#106538, fdo#105763)

    {igt@kms_plane_alpha_blend@pipe-a-constant-alpha-max}:
      shard-skl:          NOTRUN -> FAIL (fdo#108145)

    {igt@kms_plane_alpha_blend@pipe-c-constant-alpha-max}:
      shard-glk:          PASS -> FAIL (fdo#108145) +2

    igt@kms_plane_multiple@atomic-pipe-a-tiling-y:
      shard-apl:          PASS -> FAIL (fdo#103166)

    igt@kms_plane_multiple@atomic-pipe-b-tiling-y:
      shard-glk:          PASS -> FAIL (fdo#103166) +4

    igt@kms_vblank@pipe-c-ts-continuation-dpms-suspend:
      shard-skl:          PASS -> INCOMPLETE (fdo#104108)

    igt@kms_vblank@pipe-c-ts-continuation-suspend:
      shard-apl:          PASS -> INCOMPLETE (fdo#103927)

    igt@pm_rpm@fences:
      shard-skl:          PASS -> INCOMPLETE (fdo#107807)

    
    ==== Possible fixes ====

    igt@kms_busy@extended-pageflip-hang-newfb-render-c:
      shard-apl:          DMESG-WARN (fdo#107956) -> PASS

    igt@kms_busy@extended-pageflip-modeset-hang-oldfb-render-a:
      shard-kbl:          DMESG-WARN (fdo#107956) -> PASS

    igt@kms_cursor_crc@cursor-128x42-random:
      shard-skl:          FAIL (fdo#103232) -> PASS

    igt@kms_cursor_crc@cursor-256x256-sliding:
      shard-glk:          FAIL (fdo#103232) -> PASS

    igt@kms_cursor_crc@cursor-64x21-random:
      shard-apl:          FAIL (fdo#103232) -> PASS

    igt@kms_draw_crc@draw-method-xrgb2101010-blt-untiled:
      shard-skl:          FAIL (fdo#103184) -> PASS +1

    igt@kms_draw_crc@draw-method-xrgb8888-pwrite-ytiled:
      shard-glk:          FAIL -> PASS

    igt@kms_frontbuffer_tracking@fbc-1p-offscren-pri-shrfb-draw-mmap-gtt:
      shard-skl:          FAIL (fdo#103167) -> PASS +5

    igt@kms_frontbuffer_tracking@fbc-1p-primscrn-pri-indfb-draw-mmap-wc:
      shard-skl:          FAIL (fdo#105682) -> PASS

    igt@kms_frontbuffer_tracking@fbc-1p-primscrn-spr-indfb-draw-mmap-wc:
      shard-apl:          FAIL (fdo#103167) -> PASS

    igt@kms_frontbuffer_tracking@fbc-1p-primscrn-spr-indfb-fullscreen:
      shard-glk:          FAIL (fdo#103167) -> PASS +3

    igt@kms_plane@pixel-format-pipe-a-planes:
      shard-glk:          FAIL (fdo#103166) -> PASS

    igt@kms_plane@plane-panning-bottom-right-suspend-pipe-c-planes:
      shard-kbl:          INCOMPLETE (fdo#103665) -> PASS

    igt@kms_plane@plane-position-covered-pipe-a-planes:
      shard-apl:          FAIL (fdo#103166) -> PASS +3

    {igt@kms_plane_alpha_blend@pipe-a-coverage-7efc}:
      shard-skl:          FAIL (fdo#108145) -> PASS

    {igt@kms_plane_alpha_blend@pipe-b-alpha-opaque-fb}:
      shard-glk:          FAIL (fdo#108145) -> PASS

    
  {name}: This element is suppressed. This means it is ignored when computing
          the status of the difference (SUCCESS, WARNING, or FAILURE).

  fdo#103166 https://bugs.freedesktop.org/show_bug.cgi?id=103166
  fdo#103167 https://bugs.freedesktop.org/show_bug.cgi?id=103167
  fdo#103184 https://bugs.freedesktop.org/show_bug.cgi?id=103184
  fdo#103191 https://bugs.freedesktop.org/show_bug.cgi?id=103191
  fdo#103232 https://bugs.freedesktop.org/show_bug.cgi?id=103232
  fdo#103665 https://bugs.freedesktop.org/show_bug.cgi?id=103665
  fdo#103927 https://bugs.freedesktop.org/show_bug.cgi?id=103927
  fdo#104108 https://bugs.freedesktop.org/show_bug.cgi?id=104108
  fdo#105363 https://bugs.freedesktop.org/show_bug.cgi?id=105363
  fdo#105682 https://bugs.freedesktop.org/show_bug.cgi?id=105682
  fdo#105763 https://bugs.freedesktop.org/show_bug.cgi?id=105763
  fdo#106538 https://bugs.freedesktop.org/show_bug.cgi?id=106538
  fdo#107773 https://bugs.freedesktop.org/show_bug.cgi?id=107773
  fdo#107807 https://bugs.freedesktop.org/show_bug.cgi?id=107807
  fdo#107937 https://bugs.freedesktop.org/show_bug.cgi?id=107937
  fdo#107956 https://bugs.freedesktop.org/show_bug.cgi?id=107956
  fdo#108145 https://bugs.freedesktop.org/show_bug.cgi?id=108145


== Participating hosts (6 -> 6) ==

  No changes in participating hosts


== Build changes ==

    * Linux: CI_DRM_4919 -> Patchwork_10339

  CI_DRM_4919: e489eb0296673790264b25266ef45aae7d1ab566 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_4665: 267870165d9ef66b4ab423e4efe7bacba023d75e @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
  Patchwork_10339: aaa4a238fd2f72bd4e7970197f93a70d436c3a9a @ git://anongit.freedesktop.org/gfx-ci/linux
  piglit_4509: fdc5a4ca11124ab8413c7988896eec4c97336694 @ git://anongit.freedesktop.org/piglit

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_10339/shards.html
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v2] drm/i915: Optionally disable automatic recovery after a GPU reset
  2018-10-03 10:59 ` [PATCH v2] " Chris Wilson
@ 2018-10-04  9:58   ` Mika Kuoppala
  0 siblings, 0 replies; 8+ messages in thread
From: Mika Kuoppala @ 2018-10-04  9:58 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: Kenneth Graunke

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Some clients, such as mesa, may only emit minimal incremental batches
> that rely on the logical context state from previous batches. They know
> that recovery is impossible after a hang as their required GPU state is
> lost, and that each in flight and subsequent batch will hang (resetting
> the context image back to default perpetuating the problem).
>
> To avoid getting into the state in the first place, we can allow clients
> to opt out of automatic recovery and elect to ban any guilty context
> following a hang. This prevents the continual stream of hangs and allows
> the client to recreate their context and rebuild the state from scratch.
>
> v2: Prefer calling it recoverable rather than unrecoverable.
>

Ok. You went forward with this. I am fine with both
flavours,

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Kenneth Graunke <kenneth@whitecape.org>
> Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem.c         |  3 ++-
>  drivers/gpu/drm/i915/i915_gem_context.c | 13 +++++++++++++
>  drivers/gpu/drm/i915/i915_gem_context.h | 16 ++++++++++++++++
>  include/uapi/drm/i915_drm.h             | 20 ++++++++++++++++++++
>  4 files changed, 51 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 7d45e71100bc..9b02d70fbabf 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3018,7 +3018,8 @@ static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
>  
>  	bannable = i915_gem_context_is_bannable(ctx);
>  	score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
> -	banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
> +	banned = (!i915_gem_context_is_recoverable(ctx) ||
> +		  score >= CONTEXT_SCORE_BAN_THRESHOLD);
>  
>  	/* Cool contexts don't accumulate client ban score */
>  	if (!bannable)
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 8cbe58070561..b31da0375c8c 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -338,6 +338,7 @@ __create_hw_context(struct drm_i915_private *dev_priv,
>  	list_add_tail(&ctx->link, &dev_priv->contexts.list);
>  	ctx->i915 = dev_priv;
>  	ctx->sched.priority = I915_USER_PRIORITY(I915_PRIORITY_NORMAL);
> +	ctx->user_flags = BIT(UCONTEXT_RECOVERABLE);
>  
>  	for (n = 0; n < ARRAY_SIZE(ctx->__engine); n++) {
>  		struct intel_context *ce = &ctx->__engine[n];
> @@ -878,6 +879,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
>  	case I915_CONTEXT_PARAM_BANNABLE:
>  		args->value = i915_gem_context_is_bannable(ctx);
>  		break;
> +	case I915_CONTEXT_PARAM_RECOVERABLE:
> +		args->value = i915_gem_context_is_recoverable(ctx);
> +		break;
>  	case I915_CONTEXT_PARAM_PRIORITY:
>  		args->value = ctx->sched.priority >> I915_USER_PRIORITY_SHIFT;
>  		break;
> @@ -933,6 +937,15 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
>  			i915_gem_context_clear_bannable(ctx);
>  		break;
>  
> +	case I915_CONTEXT_PARAM_RECOVERABLE:
> +		if (args->size)
> +			ret = -EINVAL;
> +		else if (args->value)
> +			i915_gem_context_set_recoverable(ctx);
> +		else
> +			i915_gem_context_clear_recoverable(ctx);
> +		break;
> +
>  	case I915_CONTEXT_PARAM_PRIORITY:
>  		{
>  			s64 priority = args->value;
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
> index 08165f6a0a84..397d50b826d7 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.h
> +++ b/drivers/gpu/drm/i915/i915_gem_context.h
> @@ -123,6 +123,7 @@ struct i915_gem_context {
>  #define UCONTEXT_NO_ZEROMAP		0
>  #define UCONTEXT_NO_ERROR_CAPTURE	1
>  #define UCONTEXT_BANNABLE		2
> +#define UCONTEXT_RECOVERABLE		3
>  
>  	/**
>  	 * @flags: small set of booleans
> @@ -247,6 +248,21 @@ static inline void i915_gem_context_clear_bannable(struct i915_gem_context *ctx)
>  	clear_bit(UCONTEXT_BANNABLE, &ctx->user_flags);
>  }
>  
> +static inline bool i915_gem_context_is_recoverable(const struct i915_gem_context *ctx)
> +{
> +	return test_bit(UCONTEXT_RECOVERABLE, &ctx->user_flags);
> +}
> +
> +static inline void i915_gem_context_set_recoverable(struct i915_gem_context *ctx)
> +{
> +	set_bit(UCONTEXT_RECOVERABLE, &ctx->user_flags);
> +}
> +
> +static inline void i915_gem_context_clear_recoverable(struct i915_gem_context *ctx)
> +{
> +	clear_bit(UCONTEXT_RECOVERABLE, &ctx->user_flags);
> +}
> +
>  static inline bool i915_gem_context_is_banned(const struct i915_gem_context *ctx)
>  {
>  	return test_bit(CONTEXT_BANNED, &ctx->flags);
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 298b2e197744..eeb258a12b95 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -1486,6 +1486,26 @@ struct drm_i915_gem_context_param {
>  #define   I915_CONTEXT_MAX_USER_PRIORITY	1023 /* inclusive */
>  #define   I915_CONTEXT_DEFAULT_PRIORITY		0
>  #define   I915_CONTEXT_MIN_USER_PRIORITY	-1023 /* inclusive */
> +
> +/*
> + * Not all clients may want to attempt automatic recover of a context after
> + * a hang (for example, some clients may only submit very small incremental
> + * batches relying on known logical state of previous batches which will never
> + * recover correctly and each attempt will hang), and so would prefer that
> + * the context is forever banned instead.
> + *
> + * If set to false (0), after a reset, subsequent (and in flight) rendering
> + * from this context is discarded, and the client will need to create a new
> + * context to use instead.
> + *
> + * If set to true (1), the kernel will automatically attempt to recover the
> + * context by skipping the hanging batch and executing the next batch starting
> + * from the default context state (discarding the incomplete logical context
> + * state lost due to the reset).
> + *
> + * On creation, all new contexts are marked as recoverable.
> + */
> +#define I915_CONTEXT_PARAM_RECOVERABLE	0x7
>  	__u64 value;
>  };
>  
> -- 
> 2.19.0
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2018-10-04  9:58 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-10-02 19:07 [PATCH] drm/i915: Optionally disable automatic recovery after a GPU reset Chris Wilson
2018-10-02 19:47 ` ✓ Fi.CI.BAT: success for " Patchwork
2018-10-03  6:22 ` [PATCH] " Mika Kuoppala
2018-10-03  7:22   ` Chris Wilson
2018-10-03 10:59 ` [PATCH v2] " Chris Wilson
2018-10-04  9:58   ` Mika Kuoppala
2018-10-03 13:02 ` ✓ Fi.CI.BAT: success for drm/i915: Optionally disable automatic recovery after a GPU reset (rev2) Patchwork
2018-10-03 21:13 ` ✗ Fi.CI.IGT: failure " Patchwork

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.