Re: [PATCH v3] drm/i915: Rework GPU reset sequence to match driver load & thaw

From: "Mcaulay, Alistair" <alistair.mcaulay@intel.com>
To: "Daniel Vetter (daniel@ffwll.ch)" <daniel@ffwll.ch>,
	"intel-gfx@lists.freedesktop.org"
	<intel-gfx@lists.freedesktop.org>
Subject: Re: [PATCH v3] drm/i915: Rework GPU reset sequence to match driver load & thaw
Date: Thu, 21 Aug 2014 12:38:58 +0000	[thread overview]
Message-ID: <2F6A3166A8653C4D914E172D478C0F012E4FFE5C@IRSMSX105.ger.corp.intel.com> (raw)
In-Reply-To: <877g24bqss.fsf@gaia.fi.intel.com>

Hi Daniel,

Is there anything else needing done before this patch can be merged?

Thanks,
Alistair.

> -----Original Message-----
> From: Mika Kuoppala [mailto:mika.kuoppala@linux.intel.com]
> Sent: Tuesday, August 19, 2014 1:36 PM
> To: Mcaulay, Alistair; intel-gfx@lists.freedesktop.org
> Subject: RE: [PATCH v3] drm/i915: Rework GPU reset sequence to match
> driver load & thaw
> 
> "Mcaulay, Alistair" <alistair.mcaulay@intel.com> writes:
> 
> > Hi Mika,
> >
> > can you please review this patch, and verify it fixes the issues in your
> previous review.
> >
> > Thanks,
> > Alistair.
> >
> >> -----Original Message-----
> >> From: Mcaulay, Alistair
> >> Sent: Friday, August 15, 2014 6:52 PM
> >> To: intel-gfx@lists.freedesktop.org
> >> Cc: Mcaulay, Alistair
> >> Subject: [PATCH v3] drm/i915: Rework GPU reset sequence to match
> >> driver load & thaw
> >>
> >> From: "McAulay, Alistair" <alistair.mcaulay@intel.com>
> >>
> >> This patch is to address Daniels concerns over different code during reset:
> >>
> >> http://lists.freedesktop.org/archives/intel-gfx/2014-June/047758.html
> >>
> >> "The reason for aiming as hard as possible to use the exact same code
> >> for driver load, gpu reset and runtime pm/system resume is that we've
> >> simply seen too many bugs due to slight variations and unintended
> omissions."
> >>
> >> Tested using igt drv_hangman.
> >>
> >> V2: Cleaner way of preventing check_wedge returning -EAGAIN
> >> V3: Clean the last_context during reset, to ensure do_switch() does
> >> the MI_SET_CONTEXT. As per review.
> >> Signed-off-by: McAulay, Alistair <alistair.mcaulay@intel.com>
> >> ---
> 
> Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>
> 
> >>  drivers/gpu/drm/i915/i915_drv.c         |  6 +++
> >>  drivers/gpu/drm/i915/i915_drv.h         |  3 ++
> >>  drivers/gpu/drm/i915/i915_gem.c         |  4 +-
> >>  drivers/gpu/drm/i915/i915_gem_context.c | 33 +++-------------
> >>  drivers/gpu/drm/i915/i915_gem_gtt.c     | 67 +++++--------------------------
> --
> >>  drivers/gpu/drm/i915/i915_gem_gtt.h     |  3 +-
> >>  6 files changed, 28 insertions(+), 88 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/i915/i915_drv.c
> >> b/drivers/gpu/drm/i915/i915_drv.c index 5e4fefd..3bfafe6 100644
> >> --- a/drivers/gpu/drm/i915/i915_drv.c
> >> +++ b/drivers/gpu/drm/i915/i915_drv.c
> >> @@ -806,7 +806,13 @@ int i915_reset(struct drm_device *dev)
> >>  			!dev_priv->ums.mm_suspended) {
> >>  		dev_priv->ums.mm_suspended = 0;
> >>
> >> +		/* Used to prevent gem_check_wedged returning -EAGAIN
> >> during gpu reset */
> >> +		dev_priv->gpu_error.reload_in_reset = true;
> >> +
> >>  		ret = i915_gem_init_hw(dev);
> >> +
> >> +		dev_priv->gpu_error.reload_in_reset = false;
> >> +
> >>  		mutex_unlock(&dev->struct_mutex);
> >>  		if (ret) {
> >>  			DRM_ERROR("Failed hw init on reset %d\n", ret); diff
> --git
> >> a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> >> index 991b663..116daff 100644
> >> --- a/drivers/gpu/drm/i915/i915_drv.h
> >> +++ b/drivers/gpu/drm/i915/i915_drv.h
> >> @@ -1217,6 +1217,9 @@ struct i915_gpu_error {
> >>
> >>  	/* For missed irq/seqno simulation. */
> >>  	unsigned int test_irq_rings;
> >> +
> >> +	/* Used to prevent gem_check_wedged returning -EAGAIN during
> >> gpu reset   */
> >> +	bool reload_in_reset;
> >>  };
> >>
> >>  enum modeset_restore {
> >> diff --git a/drivers/gpu/drm/i915/i915_gem.c
> >> b/drivers/gpu/drm/i915/i915_gem.c index ef047bc..e7396eb 100644
> >> --- a/drivers/gpu/drm/i915/i915_gem.c
> >> +++ b/drivers/gpu/drm/i915/i915_gem.c
> >> @@ -1085,7 +1085,9 @@ i915_gem_check_wedge(struct i915_gpu_error
> >> *error,
> >>  		if (i915_terminally_wedged(error))
> >>  			return -EIO;
> >>
> >> -		return -EAGAIN;
> >> +		/* Check if GPU Reset is in progress */
> >> +		if (!error->reload_in_reset)
> >> +			return -EAGAIN;
> >>  	}
> >>
> >>  	return 0;
> >> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c
> >> b/drivers/gpu/drm/i915/i915_gem_context.c
> >> index de72a28..9378ad8 100644
> >> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> >> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> >> @@ -377,34 +377,17 @@ void i915_gem_context_reset(struct
> drm_device
> >> *dev)
> >>  	struct drm_i915_private *dev_priv = dev->dev_private;
> >>  	int i;
> >>
> >> -	/* Prevent the hardware from restoring the last context (which
> >> hung) on
> >> -	 * the next switch */
> >>  	for (i = 0; i < I915_NUM_RINGS; i++) {
> >>  		struct intel_engine_cs *ring = &dev_priv->ring[i];
> >> -		struct intel_context *dctx = ring->default_context;
> >>  		struct intel_context *lctx = ring->last_context;
> >>
> >> -		/* Do a fake switch to the default context */
> >> -		if (lctx == dctx)
> >> -			continue;
> >> -
> >> -		if (!lctx)
> >> -			continue;
> >> +		if (lctx) {
> >> +			if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
> >> +				i915_gem_object_ggtt_unpin(lctx-
> >> >legacy_hw_ctx.rcs_state);
> >>
> >> -		if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
> >> -			WARN_ON(i915_gem_obj_ggtt_pin(dctx-
> >> >legacy_hw_ctx.rcs_state,
> >> -
> >> get_context_alignment(dev), 0));
> >> -			/* Fake a finish/inactive */
> >> -			dctx->legacy_hw_ctx.rcs_state->base.write_domain
> >> = 0;
> >> -			dctx->legacy_hw_ctx.rcs_state->active = 0;
> >> +			i915_gem_context_unreference(lctx);
> >> +			ring->last_context = NULL;
> >>  		}
> >> -
> >> -		if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
> >> -			i915_gem_object_ggtt_unpin(lctx-
> >> >legacy_hw_ctx.rcs_state);
> >> -
> >> -		i915_gem_context_unreference(lctx);
> >> -		i915_gem_context_reference(dctx);
> >> -		ring->last_context = dctx;
> >>  	}
> >>  }
> >>
> >> @@ -498,10 +481,6 @@ int i915_gem_context_enable(struct
> >> drm_i915_private *dev_priv)
> >>  		ppgtt->enable(ppgtt);
> >>  	}
> >>
> >> -	/* FIXME: We should make this work, even in reset */
> >> -	if (i915_reset_in_progress(&dev_priv->gpu_error))
> >> -		return 0;
> >> -
> >>  	BUG_ON(!dev_priv->ring[RCS].default_context);
> >>
> >>  	for_each_ring(ring, dev_priv, i) {
> >> @@ -645,7 +624,7 @@ static int do_switch(struct intel_engine_cs *ring,
> >>  	from = ring->last_context;
> >>
> >>  	if (USES_FULL_PPGTT(ring->dev)) {
> >> -		ret = ppgtt->switch_mm(ppgtt, ring, false);
> >> +		ret = ppgtt->switch_mm(ppgtt, ring);
> >>  		if (ret)
> >>  			goto unpin_out;
> >>  	}
> >> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c
> >> b/drivers/gpu/drm/i915/i915_gem_gtt.c
> >> index 5188936..450c8a9 100644
> >> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> >> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> >> @@ -216,19 +216,12 @@ static gen6_gtt_pte_t
> >> iris_pte_encode(dma_addr_t addr,
> >>
> >>  /* Broadwell Page Directory Pointer Descriptors */  static int
> >> gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
> >> -			   uint64_t val, bool synchronous)
> >> +			   uint64_t val)
> >>  {
> >> -	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> >>  	int ret;
> >>
> >>  	BUG_ON(entry >= 4);
> >>
> >> -	if (synchronous) {
> >> -		I915_WRITE(GEN8_RING_PDP_UDW(ring, entry), val >> 32);
> >> -		I915_WRITE(GEN8_RING_PDP_LDW(ring, entry), (u32)val);
> >> -		return 0;
> >> -	}
> >> -
> >>  	ret = intel_ring_begin(ring, 6);
> >>  	if (ret)
> >>  		return ret;
> >> @@ -245,8 +238,7 @@ static int gen8_write_pdp(struct intel_engine_cs
> >> *ring, unsigned entry,  }
> >>
> >>  static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
> >> -			  struct intel_engine_cs *ring,
> >> -			  bool synchronous)
> >> +			  struct intel_engine_cs *ring)
> >>  {
> >>  	int i, ret;
> >>
> >> @@ -255,7 +247,7 @@ static int gen8_mm_switch(struct i915_hw_ppgtt
> >> *ppgtt,
> >>
> >>  	for (i = used_pd - 1; i >= 0; i--) {
> >>  		dma_addr_t addr = ppgtt->pd_dma_addr[i];
> >> -		ret = gen8_write_pdp(ring, i, addr, synchronous);
> >> +		ret = gen8_write_pdp(ring, i, addr);
> >>  		if (ret)
> >>  			return ret;
> >>  	}
> >> @@ -724,29 +716,10 @@ static uint32_t get_pd_offset(struct
> >> i915_hw_ppgtt
> >> *ppgtt)  }
> >>
> >>  static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
> >> -			 struct intel_engine_cs *ring,
> >> -			 bool synchronous)
> >> +			 struct intel_engine_cs *ring)
> >>  {
> >> -	struct drm_device *dev = ppgtt->base.dev;
> >> -	struct drm_i915_private *dev_priv = dev->dev_private;
> >>  	int ret;
> >>
> >> -	/* If we're in reset, we can assume the GPU is sufficiently idle to
> >> -	 * manually frob these bits. Ideally we could use the ring functions,
> >> -	 * except our error handling makes it quite difficult (can't use
> >> -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
> >> -	 *
> >> -	 * FIXME: We should try not to special case reset
> >> -	 */
> >> -	if (synchronous ||
> >> -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
> >> -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> >> -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> >> -		I915_WRITE(RING_PP_DIR_BASE(ring),
> >> get_pd_offset(ppgtt));
> >> -		POSTING_READ(RING_PP_DIR_BASE(ring));
> >> -		return 0;
> >> -	}
> >> -
> >>  	/* NB: TLBs must be flushed and invalidated before a switch */
> >>  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS,
> >> I915_GEM_GPU_DOMAINS);
> >>  	if (ret)
> >> @@ -768,29 +741,10 @@ static int hsw_mm_switch(struct i915_hw_ppgtt
> >> *ppgtt,  }
> >>
> >>  static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
> >> -			  struct intel_engine_cs *ring,
> >> -			  bool synchronous)
> >> +			  struct intel_engine_cs *ring)
> >>  {
> >> -	struct drm_device *dev = ppgtt->base.dev;
> >> -	struct drm_i915_private *dev_priv = dev->dev_private;
> >>  	int ret;
> >>
> >> -	/* If we're in reset, we can assume the GPU is sufficiently idle to
> >> -	 * manually frob these bits. Ideally we could use the ring functions,
> >> -	 * except our error handling makes it quite difficult (can't use
> >> -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
> >> -	 *
> >> -	 * FIXME: We should try not to special case reset
> >> -	 */
> >> -	if (synchronous ||
> >> -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
> >> -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> >> -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> >> -		I915_WRITE(RING_PP_DIR_BASE(ring),
> >> get_pd_offset(ppgtt));
> >> -		POSTING_READ(RING_PP_DIR_BASE(ring));
> >> -		return 0;
> >> -	}
> >> -
> >>  	/* NB: TLBs must be flushed and invalidated before a switch */
> >>  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS,
> >> I915_GEM_GPU_DOMAINS);
> >>  	if (ret)
> >> @@ -819,14 +773,11 @@ static int gen7_mm_switch(struct
> i915_hw_ppgtt
> >> *ppgtt,  }
> >>
> >>  static int gen6_mm_switch(struct i915_hw_ppgtt *ppgtt,
> >> -			  struct intel_engine_cs *ring,
> >> -			  bool synchronous)
> >> +			  struct intel_engine_cs *ring)
> >>  {
> >>  	struct drm_device *dev = ppgtt->base.dev;
> >>  	struct drm_i915_private *dev_priv = dev->dev_private;
> >>
> >> -	if (!synchronous)
> >> -		return 0;
> >>
> >>  	I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> >>  	I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt)); @@ -
> >> 852,7 +803,7 @@ static int gen8_ppgtt_enable(struct i915_hw_ppgtt
> *ppgtt)
> >>  		if (USES_FULL_PPGTT(dev))
> >>  			continue;
> >>
> >> -		ret = ppgtt->switch_mm(ppgtt, ring, true);
> >> +		ret = ppgtt->switch_mm(ppgtt, ring);
> >>  		if (ret)
> >>  			goto err_out;
> >>  	}
> >> @@ -897,7 +848,7 @@ static int gen7_ppgtt_enable(struct i915_hw_ppgtt
> >> *ppgtt)
> >>  		if (USES_FULL_PPGTT(dev))
> >>  			continue;
> >>
> >> -		ret = ppgtt->switch_mm(ppgtt, ring, true);
> >> +		ret = ppgtt->switch_mm(ppgtt, ring);
> >>  		if (ret)
> >>  			return ret;
> >>  	}
> >> @@ -926,7 +877,7 @@ static int gen6_ppgtt_enable(struct i915_hw_ppgtt
> >> *ppgtt)
> >>  	I915_WRITE(GFX_MODE,
> >> _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE));
> >>
> >>  	for_each_ring(ring, dev_priv, i) {
> >> -		int ret = ppgtt->switch_mm(ppgtt, ring, true);
> >> +		int ret = ppgtt->switch_mm(ppgtt, ring);
> >>  		if (ret)
> >>  			return ret;
> >>  	}
> >> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h
> >> b/drivers/gpu/drm/i915/i915_gem_gtt.h
> >> index 8d6f7c1..bf1e4fc 100644
> >> --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
> >> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
> >> @@ -262,8 +262,7 @@ struct i915_hw_ppgtt {
> >>
> >>  	int (*enable)(struct i915_hw_ppgtt *ppgtt);
> >>  	int (*switch_mm)(struct i915_hw_ppgtt *ppgtt,
> >> -			 struct intel_engine_cs *ring,
> >> -			 bool synchronous);
> >> +			 struct intel_engine_cs *ring);
> >>  	void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file
> >> *m);  };
> >>
> >> --
> >> 2.0.4