Re: [Intel-gfx] [Patch v3 2/2] drm/i915: Introduce Wa_1401127433

From: Matt Roper <matthew.d.roper@intel.com>
To: Matt Atwood <matthew.s.atwood@intel.com>
Cc: intel-gfx@lists.freedesktop.org
Subject: Re: [Intel-gfx] [Patch v3 2/2] drm/i915: Introduce Wa_1401127433
Date: Fri, 1 Dec 2023 14:05:10 -0800	[thread overview]
Message-ID: <20231201220510.GD1327160@mdroper-desk1.amr.corp.intel.com> (raw)
In-Reply-To: <20231129205122.3464299-2-matthew.s.atwood@intel.com>

You're missing a digit in the patch subject.

On Wed, Nov 29, 2023 at 12:51:22PM -0800, Matt Atwood wrote:
> Wa_14011274333 applies to gen 12.0->12.55. There are regions of
> registers that restore to default values on resume from rc6. The

It would be more clear to write this sentence as "The TDL power context
is not properly restored on RC6 exit, causing registers in the context
to revert to their hardware default values on RC6 exit."

> hardware has a built in register (CTX_WA_PTR), that can point to a
> region of memory full of commands to restore non default values on rc6
> resume.
> 
> Based off patch by Tilak Tangudu.
> 
> v2: Use correct lineage number, more generically apply workarounds for
> all registers impacted, move workaround to gt/intel_workarounds.c
> (MattR)
> 
> v3: Correctly use intel_engine_cs, intel_engine_regs, use
> reg_in_range_table() for wa registers impacted search, move the majority
> of functional changes into intel_workarounds.c, free up pinned memory on
> engine tear down, be more verbose in commit message and wa comments, use
> generic function for platforms applied to. (MattR)
> 
> Signed-off-by: Matt Atwood <matthew.s.atwood@intel.com>
> ---
>  drivers/gpu/drm/i915/gt/intel_engine_cs.c    |   2 +
>  drivers/gpu/drm/i915/gt/intel_engine_regs.h  |   4 +
>  drivers/gpu/drm/i915/gt/intel_engine_types.h |   3 +
>  drivers/gpu/drm/i915/gt/intel_workarounds.c  | 114 +++++++++++++++++++
>  drivers/gpu/drm/i915/gt/intel_workarounds.h  |   1 +
>  5 files changed, 124 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> index 40687806d22a6..d3628462bfb3e 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> @@ -1573,6 +1573,8 @@ void intel_engine_cleanup_common(struct intel_engine_cs *engine)
>  	intel_wa_list_free(&engine->ctx_wa_list);
>  	intel_wa_list_free(&engine->wa_list);
>  	intel_wa_list_free(&engine->whitelist);
> +
> +	intel_ctx_wa_bb_fini(engine);
>  }
>  
>  /**
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_regs.h b/drivers/gpu/drm/i915/gt/intel_engine_regs.h
> index a8eac59e37793..becb466f5910d 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_regs.h
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_regs.h
> @@ -251,6 +251,10 @@
>  #define GEN11_VECS_SFC_USAGE(base)		_MMIO((base) + 0x2014)
>  #define   GEN11_VECS_SFC_USAGE_BIT		(1 << 0)
>  
> +#define CTX_WA_PTR(base)			_MMIO((base) + 0x2058)

This isn't the right offset; you've accidentally pre-added the RCS base.
The definition should be (base + 0x58).

> +#define   CTX_WA_PTR_ADDR_MASK			REG_GENMASK(31, 12)
> +#define   CTX_WA_VALID				REG_BIT(0)
> +
>  #define RING_HWS_PGA_GEN6(base)	_MMIO((base) + 0x2080)
>  
>  #define GEN12_HCP_SFC_LOCK_STATUS(base)		_MMIO((base) + 0x2914)
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> index 960e6be2042fe..95ff3e1adf11e 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> @@ -388,6 +388,9 @@ struct intel_engine_cs {
>  	u32 context_size;
>  	u32 mmio_base;
>  
> +	u32 *ctx_wa_bb;
> +	struct i915_vma *vma;

You should pick a more descriptive name here.  Otherwise nobody will
know what this is a vma for.

> +
>  	struct intel_engine_tlb_inv tlb_inv;
>  
>  	/*
> diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c
> index 9bc0654efdc0c..1d33555039725 100644
> --- a/drivers/gpu/drm/i915/gt/intel_workarounds.c
> +++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c
> @@ -5,6 +5,8 @@
>  
>  #include "i915_drv.h"
>  #include "i915_reg.h"
> +#include "i915_perf.h"
> +#include "gem/i915_gem_internal.h"
>  #include "intel_context.h"
>  #include "intel_engine_pm.h"
>  #include "intel_engine_regs.h"
> @@ -14,6 +16,7 @@
>  #include "intel_gt_print.h"
>  #include "intel_gt_regs.h"
>  #include "intel_ring.h"
> +#include "intel_uncore.h"
>  #include "intel_workarounds.h"
>  
>  /**
> @@ -2985,6 +2988,105 @@ general_render_compute_wa_init(struct intel_engine_cs *engine, struct i915_wa_li
>  	}
>  }
>  
> +static const struct i915_range tdl_power_ctx[] = {
> +	{ .start = 0xe100, .end = 0xe100 },
> +	{ .start = 0xe180, .end = 0xe194 },
> +	{},
> +};
> +
> +static void
> +emit_ctx_wa_bb(struct intel_engine_cs *engine, struct i915_wa_list *wal)
> +{
> +	const struct i915_wa *wa;
> +	struct intel_uncore *uncore = engine->gt->uncore;
> +	int i, count = 0;
> +	u32 *cs = engine->ctx_wa_bb;
> +
> +	*cs++ = MI_LOAD_REGISTER_IMM(1);

I think you meant to come back and patch up the length of the
instruction after emitting however many registers we find, right?  It
looks like you forgot that part.

> +	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
> +		if (reg_in_range_table(i915_mmio_reg_offset(wa->mcr_reg), tdl_power_ctx)) {
> +			*cs++ = i915_mmio_reg_offset(wa->mcr_reg);
> +			*cs++ = wa->set;

Part of the reason this is so easy to handle is because all of the
registers in the TDL power context are masked, so we don't have to worry
about clobbering unrelated bits when re-programming the workaround.  You
might want to add a comment noting that so that it doesn't look like
we're just reprogramming the whole register here.

> +			count++;
> +		}
> +	}
> +	*cs++ = MI_BATCH_BUFFER_END;
> +
> +	if (count != 0) {
> +		i915_gem_object_flush_map(engine->vma->obj);
> +		intel_uncore_write(uncore, CTX_WA_PTR(engine->mmio_base),
> +				   REG_FIELD_PREP(CTX_WA_PTR_ADDR_MASK,
> +						  i915_vma_offset(engine->vma) & GENMASK(31, 12)) |
> +				   CTX_WA_VALID);

CTX_WA_PTR gets cleared on render domain reset, so you need to make sure
it gets re-applied properly as well.  You should probably do this actual
register write in intel_engine_apply_workarounds() (to make sure it's
re-applied on the non-GuC platforms) and also add this register to the
guc ADS regset so that the GuC will save/restore it around resets on
ADL-P.

If it turned out you didn't actually need the batchbuffer, you could
release its object here and clear the vma reference in the engine.  The
presence/absence of that pointer can be used as the condition to
determine whether the register should be written in
intel_engine_apply_workarounds().

> +	}
> +}
> +
> +static int ctx_wa_bb_init(struct intel_engine_cs *engine)
> +{
> +	struct drm_i915_private *i915 = engine->i915;
> +	struct intel_gt *gt = engine->gt;
> +	struct drm_i915_gem_object *obj;
> +	struct i915_vma *vma;
> +	void *batch;
> +	struct i915_gem_ww_ctx ww;
> +	int err;
> +
> +	obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
> +	if (IS_ERR(obj))
> +		return PTR_ERR(obj);
> +
> +	vma = i915_vma_instance(obj, &gt->ggtt->vm, NULL);
> +	if (IS_ERR(vma)) {
> +		err = PTR_ERR(vma);
> +		goto err;
> +	}
> +
> +	engine->vma = vma;
> +	i915_gem_ww_ctx_init(&ww, true);
> +
> +retry:
> +	err = i915_gem_object_lock(obj, &ww);
> +	if (!err)
> +		err = i915_ggtt_pin(engine->vma, &ww, 0, PIN_HIGH);
> +	if (err)
> +		goto err_ww_fini;
> +
> +	batch = i915_gem_object_pin_map(obj, I915_MAP_WB);
> +	if (IS_ERR(batch)) {
> +		err = PTR_ERR(batch);
> +		goto err_unpin;
> +	}
> +	engine->ctx_wa_bb = batch;
> +
> +	return 0;

Aren't we still holding locks in the wait-wound context?

> +
> +err_unpin:
> +	i915_vma_unpin(engine->vma);
> +
> +err_ww_fini:
> +	if (err == -EDEADLK) {
> +		err = i915_gem_ww_ctx_backoff(&ww);
> +		if (!err)
> +			goto retry;
> +	}
> +
> +	i915_gem_ww_ctx_fini(&ww);
> +	i915_vma_put(engine->vma);
> +
> +err:
> +	i915_gem_object_put(obj);
> +	return err;
> +}
> +
> +void intel_ctx_wa_bb_fini(struct intel_engine_cs *engine)
> +{
> +	i915_vma_unpin_and_release(&engine->vma, I915_VMA_RELEASE_MAP);

You might want to clear the vma pointer as well to allow this function
to be used in more places, as noted above.

> +}
> +
> +#define NEEDS_CTX_WABB(engine) ( \

It might be better to put the name of the workaround in the macro here.
There are lots of potential reasons why we might want to execute a batch
on RC6 exit; this workaround is just one specific reason, but there may
be others in the future.

> +	IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 0), IP_VER(12, 55)) && \

This isn't needed on 12.55.  It was a pre-production-only workaround for
DG2-G10, and it never applied to DG2-G11 or DG2-G12.  So 12.00 - 12.10
would be the proper range.

Note that if you did need to program this on DG2, then you would have
also needed to take Wa_18018699957 into account, which involves
switching to use BB_PER_CTX_PTR instead of CTX_WA_PTR.

> +	engine->class == RENDER_CLASS)
> +
>  static void
>  engine_init_workarounds(struct intel_engine_cs *engine, struct i915_wa_list *wal)
>  {
> @@ -3007,6 +3109,18 @@ engine_init_workarounds(struct intel_engine_cs *engine, struct i915_wa_list *wal
>  		rcs_engine_wa_init(engine, wal);
>  	else
>  		xcs_engine_wa_init(engine, wal);
> +
> +	/* Wa_14011274333
> +	 * After the workaround list has been populated some platforms have
> +	 * regions of addresses that do not retain their values after resuming
> +	 * from rc6. For these platforms, add all workarounds in these regions
> +	 * to a batch buffer, this batch buffer is stored in memory and
> +	 * executes on every rc6 resume.
> +	 */
> +	if (NEEDS_CTX_WABB(engine)) {
> +		ctx_wa_bb_init(engine);

This function returns errors, but you're not handling them here, so
failure will be silently ignored.

Matt

> +		emit_ctx_wa_bb(engine, wal);
> +	}
>  }
>  
>  void intel_engine_init_workarounds(struct intel_engine_cs *engine)
> diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.h b/drivers/gpu/drm/i915/gt/intel_workarounds.h
> index 9beaab77c7f0b..fe8946b0c7b67 100644
> --- a/drivers/gpu/drm/i915/gt/intel_workarounds.h
> +++ b/drivers/gpu/drm/i915/gt/intel_workarounds.h
> @@ -36,4 +36,5 @@ void intel_engine_apply_workarounds(struct intel_engine_cs *engine);
>  int intel_engine_verify_workarounds(struct intel_engine_cs *engine,
>  				    const char *from);
>  
> +void intel_ctx_wa_bb_fini(struct intel_engine_cs *engine);
>  #endif
> -- 
> 2.40.1
> 

-- 
Matt Roper
Graphics Software Engineer
Linux GPU Platform Enablement
Intel Corporation