Re: [PATCH v5 3/5] drm/i915: Watchdog timeout: Ringbuffer command emission for gen8+

From: Chris Wilson <chris@chris-wilson.co.uk>
To: Carlos Santa <carlos.santa@intel.com>, intel-gfx@lists.freedesktop.org
Cc: Michel Thierry <michel.thierry@intel.com>
Subject: Re: [PATCH v5 3/5] drm/i915: Watchdog timeout: Ringbuffer command emission for gen8+
Date: Sat, 30 Mar 2019 09:01:10 +0000	[thread overview]
Message-ID: <155393647005.24691.2200638315146110295@skylake-alporthouse-com> (raw)
In-Reply-To: <20190322234118.65980-4-carlos.santa@intel.com>

Quoting Carlos Santa (2019-03-22 23:41:16)
> From: Michel Thierry <michel.thierry@intel.com>
> 
> Emit the required commands into the ring buffer for starting and
> stopping the watchdog timer before/after batch buffer start during
> batch buffer submission.

I'm expecting to see some discussion of how this is handled across
preemption here since you are inside an arbitration enabled section.

> v2: Support watchdog threshold per context engine, merge lri commands,
> and move watchdog commands emission to emit_bb_start. Request space of
> combined start_watchdog, bb_start and stop_watchdog to avoid any error
> after emitting bb_start.
> 
> v3: There were too many req->engine in emit_bb_start.
> Use GEM_BUG_ON instead of returning a very late EINVAL in the remote
> case of watchdog misprogramming; set correct LRI cmd size in
> emit_stop_watchdog. (Chris)
> 
> v4: Rebase.
> v5: use to_intel_context instead of ctx->engine.
> v6: Rebase.
> v7: Rebase,
>     Store gpu watchdog capability in engine flag (Tvrtko)
>     Store WATCHDOG_DISABLE magic # in engine (Tvrtko)
>     No need to declare emit_{start|stop}_watchdog as vfuncs (Tvrtko)
>     Replace flag watchdog_running with enable_watchdog (Tvrtko)
>     Emit a single MI_NOOP by conditionally checking whether the #
>     of emitted OPs is odd (Tvrtko)
> v8: Rebase
> 
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Antonio Argenziano <antonio.argenziano@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
> Signed-off-by: Carlos Santa <carlos.santa@intel.com>
> ---
>  drivers/gpu/drm/i915/intel_context_types.h |  4 +
>  drivers/gpu/drm/i915/intel_engine_cs.c     |  2 +
>  drivers/gpu/drm/i915/intel_engine_types.h  | 17 ++++-
>  drivers/gpu/drm/i915/intel_lrc.c           | 89 +++++++++++++++++++++-
>  drivers/gpu/drm/i915/intel_lrc.h           |  2 +
>  5 files changed, 106 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_context_types.h b/drivers/gpu/drm/i915/intel_context_types.h
> index 6dc9b4b9067b..e56fc263568e 100644
> --- a/drivers/gpu/drm/i915/intel_context_types.h
> +++ b/drivers/gpu/drm/i915/intel_context_types.h
> @@ -51,6 +51,10 @@ struct intel_context {
>         u64 lrc_desc;
>  
>         atomic_t pin_count;
> +       /** watchdog_threshold: hw watchdog threshold value,
> +        * in clock counts
> +        */

Gah. Why would you put it here? Between a tightly coupled count + mutex.

> +       u32 watchdog_threshold;
>         struct mutex pin_mutex; /* guards pinning and associated on-gpuing */
>  
>         /**
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> index 88cf0fc07623..d4ea07b70904 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -324,6 +324,8 @@ intel_engine_setup(struct drm_i915_private *dev_priv,
>         if (engine->context_size)
>                 DRIVER_CAPS(dev_priv)->has_logical_contexts = true;
>  
> +       engine->watchdog_disable_id = get_watchdog_disable(engine);
> +
>         /* Nothing to do here, execute in order of dependencies */
>         engine->schedule = NULL;
>  
> diff --git a/drivers/gpu/drm/i915/intel_engine_types.h b/drivers/gpu/drm/i915/intel_engine_types.h
> index c4f66b774e7c..1f99b536471d 100644
> --- a/drivers/gpu/drm/i915/intel_engine_types.h
> +++ b/drivers/gpu/drm/i915/intel_engine_types.h
> @@ -260,6 +260,7 @@ struct intel_engine_cs {
>         unsigned int guc_id;
>         intel_engine_mask_t mask;
>  
> +       u32 watchdog_disable_id;

You've just put this between a pair of u8s.

>         u8 uabi_class;
>  
>         u8 class;
> @@ -422,10 +423,12 @@ struct intel_engine_cs {
>  
>         struct intel_engine_hangcheck hangcheck;
>  
> -#define I915_ENGINE_NEEDS_CMD_PARSER BIT(0)
> -#define I915_ENGINE_SUPPORTS_STATS   BIT(1)
> -#define I915_ENGINE_HAS_PREEMPTION   BIT(2)
> -#define I915_ENGINE_HAS_SEMAPHORES   BIT(3)
> +#define I915_ENGINE_NEEDS_CMD_PARSER  BIT(0)
> +#define I915_ENGINE_SUPPORTS_STATS    BIT(1)
> +#define I915_ENGINE_HAS_PREEMPTION    BIT(2)
> +#define I915_ENGINE_HAS_SEMAPHORES    BIT(3)
> +#define I915_ENGINE_SUPPORTS_WATCHDOG BIT(4)
> +
>         unsigned int flags;
>  
>         /*
> @@ -509,6 +512,12 @@ intel_engine_has_semaphores(const struct intel_engine_cs *engine)
>         return engine->flags & I915_ENGINE_HAS_SEMAPHORES;
>  }
>  
> +static inline bool
> +intel_engine_supports_watchdog(const struct intel_engine_cs *engine)
> +{
> +       return engine->flags & I915_ENGINE_SUPPORTS_WATCHDOG;
> +}
> +
>  #define instdone_slice_mask(dev_priv__) \
>         (IS_GEN(dev_priv__, 7) ? \
>          1 : RUNTIME_INFO(dev_priv__)->sseu.slice_mask)
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 85785a94f6ae..78ea54a5dbc3 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -2036,16 +2036,75 @@ static void execlists_reset_finish(struct intel_engine_cs *engine)
>                   atomic_read(&execlists->tasklet.count));
>  }
>  
> +static u32 *gen8_emit_start_watchdog(struct i915_request *rq, u32 *cs)
> +{
> +       struct intel_engine_cs *engine = rq->engine;
> +       struct i915_gem_context *ctx = rq->gem_context;
> +       struct intel_context *ce = intel_context_lookup(ctx, engine);
> +
> +       GEM_BUG_ON(!intel_engine_supports_watchdog(engine));
> +
> +       /*
> +        * watchdog register must never be programmed to zero. This would
> +        * cause the watchdog counter to exceed and not allow the engine to
> +        * go into IDLE state
> +        */
> +       GEM_BUG_ON(ce->watchdog_threshold == 0);
> +
> +       /* Set counter period */
> +       *cs++ = MI_LOAD_REGISTER_IMM(2);
> +       *cs++ = i915_mmio_reg_offset(RING_THRESH(engine->mmio_base));
> +       *cs++ = ce->watchdog_threshold;
> +       /* Start counter */
> +       *cs++ = i915_mmio_reg_offset(RING_CNTR(engine->mmio_base));
> +       *cs++ = GEN8_WATCHDOG_ENABLE;

Hmm, so no watchdog seqno.

> +       return cs;
> +}
> +
> +static u32 *gen8_emit_stop_watchdog(struct i915_request *rq, u32 *cs)
> +{
> +       struct intel_engine_cs *engine = rq->engine;
> +
> +       GEM_BUG_ON(!intel_engine_supports_watchdog(engine));
> +
> +       *cs++ = MI_LOAD_REGISTER_IMM(1);
> +       *cs++ = i915_mmio_reg_offset(RING_CNTR(engine->mmio_base));
> +       *cs++ = engine->watchdog_disable_id;
> +
> +       return cs;
> +}
> +
>  static int gen8_emit_bb_start(struct i915_request *rq,
>                               u64 offset, u32 len,
>                               const unsigned int flags)
>  {
> +       struct intel_engine_cs *engine = rq->engine;
> +       struct i915_gem_context *ctx = rq->gem_context;
> +       struct intel_context *ce = intel_context_lookup(ctx, engine);

Ahem. This keeps on getting worse.

>         u32 *cs;
> +       u32 num_dwords;
> +       bool enable_watchdog = false;
>  
> -       cs = intel_ring_begin(rq, 6);
> +       /* bb_start only */
> +       num_dwords = 6;
> +
> +       /* check if watchdog will be required */
> +       if (ce->watchdog_threshold != 0) {
> +               /* + start_watchdog (6) + stop_watchdog (4) */
> +               num_dwords += 10;
> +               enable_watchdog = true;
> +       }
> +
> +       cs = intel_ring_begin(rq, num_dwords);
>         if (IS_ERR(cs))
>                 return PTR_ERR(cs);
>  
> +       if (enable_watchdog) {
> +               /* Start watchdog timer */

Please don't simply repeat code in comments.

> +               cs = gen8_emit_start_watchdog(rq, cs);
> +       }
> +
>         /*
>          * WaDisableCtxRestoreArbitration:bdw,chv
>          *
> @@ -2072,10 +2131,16 @@ static int gen8_emit_bb_start(struct i915_request *rq,
>         *cs++ = upper_32_bits(offset);
>  
>         *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
> -       *cs++ = MI_NOOP;
>  
> -       intel_ring_advance(rq, cs);
> +       if (enable_watchdog) {
> +               /* Cancel watchdog timer */
> +               cs = gen8_emit_stop_watchdog(rq, cs);
> +       }
> +
> +       if (*cs%2 != 0)
> +               *cs++ = MI_NOOP;

This is wrong. cs points into the unset portion of the ring. The
watchdog commands are even, so there is no reason to move the original
NOOP, or at least no reason to make it conditional.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx