All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/i915: Rework GPU reset sequence to match driver load & thaw
@ 2014-07-16 15:05 alistair.mcaulay
  2014-07-26  1:05 ` Ben Widawsky
  2014-08-05  8:47 ` [PATCH v2] " alistair.mcaulay
  0 siblings, 2 replies; 30+ messages in thread
From: alistair.mcaulay @ 2014-07-16 15:05 UTC (permalink / raw)
  To: intel-gfx

From: "McAulay, Alistair" <alistair.mcaulay@intel.com>

This patch is to address Daniel's concerns over different code during reset:

http://lists.freedesktop.org/archives/intel-gfx/2014-June/047758.html

"The reason for aiming as hard as possible to use the exact same code for
driver load, gpu reset and runtime pm/system resume is that we've simply
seen too many bugs due to slight variations and unintended omissions."

Tested using igt drv_hangman.

Signed-off-by: McAulay, Alistair <alistair.mcaulay@intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c         |  2 -
 drivers/gpu/drm/i915/i915_gem_context.c | 42 +--------------------
 drivers/gpu/drm/i915/i915_gem_gtt.c     | 67 +++++----------------------------
 drivers/gpu/drm/i915/i915_gem_gtt.h     |  3 +-
 drivers/gpu/drm/i915/intel_ringbuffer.c |  4 +-
 5 files changed, 14 insertions(+), 104 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index ef047bc..b38e086 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2590,8 +2590,6 @@ void i915_gem_reset(struct drm_device *dev)
 	for_each_ring(ring, dev_priv, i)
 		i915_gem_reset_ring_cleanup(dev_priv, ring);
 
-	i915_gem_context_reset(dev);
-
 	i915_gem_restore_fences(dev);
 }
 
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index de72a28..d96219f 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -372,42 +372,6 @@ err_destroy:
 	return ERR_PTR(ret);
 }
 
-void i915_gem_context_reset(struct drm_device *dev)
-{
-	struct drm_i915_private *dev_priv = dev->dev_private;
-	int i;
-
-	/* Prevent the hardware from restoring the last context (which hung) on
-	 * the next switch */
-	for (i = 0; i < I915_NUM_RINGS; i++) {
-		struct intel_engine_cs *ring = &dev_priv->ring[i];
-		struct intel_context *dctx = ring->default_context;
-		struct intel_context *lctx = ring->last_context;
-
-		/* Do a fake switch to the default context */
-		if (lctx == dctx)
-			continue;
-
-		if (!lctx)
-			continue;
-
-		if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
-			WARN_ON(i915_gem_obj_ggtt_pin(dctx->legacy_hw_ctx.rcs_state,
-						      get_context_alignment(dev), 0));
-			/* Fake a finish/inactive */
-			dctx->legacy_hw_ctx.rcs_state->base.write_domain = 0;
-			dctx->legacy_hw_ctx.rcs_state->active = 0;
-		}
-
-		if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
-			i915_gem_object_ggtt_unpin(lctx->legacy_hw_ctx.rcs_state);
-
-		i915_gem_context_unreference(lctx);
-		i915_gem_context_reference(dctx);
-		ring->last_context = dctx;
-	}
-}
-
 int i915_gem_context_init(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
@@ -498,10 +462,6 @@ int i915_gem_context_enable(struct drm_i915_private *dev_priv)
 		ppgtt->enable(ppgtt);
 	}
 
-	/* FIXME: We should make this work, even in reset */
-	if (i915_reset_in_progress(&dev_priv->gpu_error))
-		return 0;
-
 	BUG_ON(!dev_priv->ring[RCS].default_context);
 
 	for_each_ring(ring, dev_priv, i) {
@@ -645,7 +605,7 @@ static int do_switch(struct intel_engine_cs *ring,
 	from = ring->last_context;
 
 	if (USES_FULL_PPGTT(ring->dev)) {
-		ret = ppgtt->switch_mm(ppgtt, ring, false);
+		ret = ppgtt->switch_mm(ppgtt, ring);
 		if (ret)
 			goto unpin_out;
 	}
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 5188936..450c8a9 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -216,19 +216,12 @@ static gen6_gtt_pte_t iris_pte_encode(dma_addr_t addr,
 
 /* Broadwell Page Directory Pointer Descriptors */
 static int gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
-			   uint64_t val, bool synchronous)
+			   uint64_t val)
 {
-	struct drm_i915_private *dev_priv = ring->dev->dev_private;
 	int ret;
 
 	BUG_ON(entry >= 4);
 
-	if (synchronous) {
-		I915_WRITE(GEN8_RING_PDP_UDW(ring, entry), val >> 32);
-		I915_WRITE(GEN8_RING_PDP_LDW(ring, entry), (u32)val);
-		return 0;
-	}
-
 	ret = intel_ring_begin(ring, 6);
 	if (ret)
 		return ret;
@@ -245,8 +238,7 @@ static int gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
 }
 
 static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
-			  struct intel_engine_cs *ring,
-			  bool synchronous)
+			  struct intel_engine_cs *ring)
 {
 	int i, ret;
 
@@ -255,7 +247,7 @@ static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
 
 	for (i = used_pd - 1; i >= 0; i--) {
 		dma_addr_t addr = ppgtt->pd_dma_addr[i];
-		ret = gen8_write_pdp(ring, i, addr, synchronous);
+		ret = gen8_write_pdp(ring, i, addr);
 		if (ret)
 			return ret;
 	}
@@ -724,29 +716,10 @@ static uint32_t get_pd_offset(struct i915_hw_ppgtt *ppgtt)
 }
 
 static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
-			 struct intel_engine_cs *ring,
-			 bool synchronous)
+			 struct intel_engine_cs *ring)
 {
-	struct drm_device *dev = ppgtt->base.dev;
-	struct drm_i915_private *dev_priv = dev->dev_private;
 	int ret;
 
-	/* If we're in reset, we can assume the GPU is sufficiently idle to
-	 * manually frob these bits. Ideally we could use the ring functions,
-	 * except our error handling makes it quite difficult (can't use
-	 * intel_ring_begin, ring->flush, or intel_ring_advance)
-	 *
-	 * FIXME: We should try not to special case reset
-	 */
-	if (synchronous ||
-	    i915_reset_in_progress(&dev_priv->gpu_error)) {
-		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
-		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
-		I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt));
-		POSTING_READ(RING_PP_DIR_BASE(ring));
-		return 0;
-	}
-
 	/* NB: TLBs must be flushed and invalidated before a switch */
 	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, I915_GEM_GPU_DOMAINS);
 	if (ret)
@@ -768,29 +741,10 @@ static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
 }
 
 static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
-			  struct intel_engine_cs *ring,
-			  bool synchronous)
+			  struct intel_engine_cs *ring)
 {
-	struct drm_device *dev = ppgtt->base.dev;
-	struct drm_i915_private *dev_priv = dev->dev_private;
 	int ret;
 
-	/* If we're in reset, we can assume the GPU is sufficiently idle to
-	 * manually frob these bits. Ideally we could use the ring functions,
-	 * except our error handling makes it quite difficult (can't use
-	 * intel_ring_begin, ring->flush, or intel_ring_advance)
-	 *
-	 * FIXME: We should try not to special case reset
-	 */
-	if (synchronous ||
-	    i915_reset_in_progress(&dev_priv->gpu_error)) {
-		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
-		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
-		I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt));
-		POSTING_READ(RING_PP_DIR_BASE(ring));
-		return 0;
-	}
-
 	/* NB: TLBs must be flushed and invalidated before a switch */
 	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, I915_GEM_GPU_DOMAINS);
 	if (ret)
@@ -819,14 +773,11 @@ static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
 }
 
 static int gen6_mm_switch(struct i915_hw_ppgtt *ppgtt,
-			  struct intel_engine_cs *ring,
-			  bool synchronous)
+			  struct intel_engine_cs *ring)
 {
 	struct drm_device *dev = ppgtt->base.dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
-	if (!synchronous)
-		return 0;
 
 	I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
 	I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt));
@@ -852,7 +803,7 @@ static int gen8_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
 		if (USES_FULL_PPGTT(dev))
 			continue;
 
-		ret = ppgtt->switch_mm(ppgtt, ring, true);
+		ret = ppgtt->switch_mm(ppgtt, ring);
 		if (ret)
 			goto err_out;
 	}
@@ -897,7 +848,7 @@ static int gen7_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
 		if (USES_FULL_PPGTT(dev))
 			continue;
 
-		ret = ppgtt->switch_mm(ppgtt, ring, true);
+		ret = ppgtt->switch_mm(ppgtt, ring);
 		if (ret)
 			return ret;
 	}
@@ -926,7 +877,7 @@ static int gen6_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
 	I915_WRITE(GFX_MODE, _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE));
 
 	for_each_ring(ring, dev_priv, i) {
-		int ret = ppgtt->switch_mm(ppgtt, ring, true);
+		int ret = ppgtt->switch_mm(ppgtt, ring);
 		if (ret)
 			return ret;
 	}
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
index 8d6f7c1..bf1e4fc 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.h
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
@@ -262,8 +262,7 @@ struct i915_hw_ppgtt {
 
 	int (*enable)(struct i915_hw_ppgtt *ppgtt);
 	int (*switch_mm)(struct i915_hw_ppgtt *ppgtt,
-			 struct intel_engine_cs *ring,
-			 bool synchronous);
+			 struct intel_engine_cs *ring);
 	void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file *m);
 };
 
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 599709e..e33c2e1 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -1832,7 +1832,9 @@ int intel_ring_begin(struct intel_engine_cs *ring,
 
 	ret = i915_gem_check_wedge(&dev_priv->gpu_error,
 				   dev_priv->mm.interruptible);
-	if (ret)
+
+	/* -EAGAIN means a reset is in progress, it is Ok to continue */
+	if (ret && (ret != -EAGAIN))
 		return ret;
 
 	ret = __intel_ring_prepare(ring, num_dwords * sizeof(uint32_t));
-- 
2.0.0

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [PATCH] drm/i915: Rework GPU reset sequence to match driver load & thaw
  2014-07-16 15:05 [PATCH] drm/i915: Rework GPU reset sequence to match driver load & thaw alistair.mcaulay
@ 2014-07-26  1:05 ` Ben Widawsky
  2014-07-28  9:26   ` Daniel Vetter
  2014-08-05  8:47 ` [PATCH v2] " alistair.mcaulay
  1 sibling, 1 reply; 30+ messages in thread
From: Ben Widawsky @ 2014-07-26  1:05 UTC (permalink / raw)
  To: alistair.mcaulay; +Cc: intel-gfx

On Wed, Jul 16, 2014 at 04:05:59PM +0100, alistair.mcaulay@intel.com wrote:
> From: "McAulay, Alistair" <alistair.mcaulay@intel.com>
> 
> This patch is to address Daniel's concerns over different code during reset:
> 
> http://lists.freedesktop.org/archives/intel-gfx/2014-June/047758.html
> 
> "The reason for aiming as hard as possible to use the exact same code for
> driver load, gpu reset and runtime pm/system resume is that we've simply
> seen too many bugs due to slight variations and unintended omissions."
> 
> Tested using igt drv_hangman.
> 
> Signed-off-by: McAulay, Alistair <alistair.mcaulay@intel.com>

2 quick comments before I actually do a real review.
1. Did you actually run this with and without full ppgtt?
2. I don't think this is actually fulfilling what Daniel is requesting,
though we can let him comment.
3. Did you really do #1?

Assuming you satisfied #1, can you please post the igt results for the
permutations (pre patch w/ and w/o ppgtt; post patch w/ and w/o ppgtt)?

I really want this data because I spent *a lot* of time with these
specific areas in the PPGTT work, and I am somewhat skeptical that enough of
the code has changed that this will magically work. I also tried the
trickiness with the ring handling functions, and never succeeded. Also,
with the context stuff, I'm simply not convinced it can magically
vanish. If igt looks good, and Daniel agrees that this is what he
actually wanted, I will go fishing for corner cases and do the review.

Thanks.

> ---
>  drivers/gpu/drm/i915/i915_gem.c         |  2 -
>  drivers/gpu/drm/i915/i915_gem_context.c | 42 +--------------------
>  drivers/gpu/drm/i915/i915_gem_gtt.c     | 67 +++++----------------------------
>  drivers/gpu/drm/i915/i915_gem_gtt.h     |  3 +-
>  drivers/gpu/drm/i915/intel_ringbuffer.c |  4 +-
>  5 files changed, 14 insertions(+), 104 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index ef047bc..b38e086 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2590,8 +2590,6 @@ void i915_gem_reset(struct drm_device *dev)
>  	for_each_ring(ring, dev_priv, i)
>  		i915_gem_reset_ring_cleanup(dev_priv, ring);
>  
> -	i915_gem_context_reset(dev);
> -
>  	i915_gem_restore_fences(dev);
>  }
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index de72a28..d96219f 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -372,42 +372,6 @@ err_destroy:
>  	return ERR_PTR(ret);
>  }
>  
> -void i915_gem_context_reset(struct drm_device *dev)
> -{
> -	struct drm_i915_private *dev_priv = dev->dev_private;
> -	int i;
> -
> -	/* Prevent the hardware from restoring the last context (which hung) on
> -	 * the next switch */
> -	for (i = 0; i < I915_NUM_RINGS; i++) {
> -		struct intel_engine_cs *ring = &dev_priv->ring[i];
> -		struct intel_context *dctx = ring->default_context;
> -		struct intel_context *lctx = ring->last_context;
> -
> -		/* Do a fake switch to the default context */
> -		if (lctx == dctx)
> -			continue;
> -
> -		if (!lctx)
> -			continue;
> -
> -		if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
> -			WARN_ON(i915_gem_obj_ggtt_pin(dctx->legacy_hw_ctx.rcs_state,
> -						      get_context_alignment(dev), 0));
> -			/* Fake a finish/inactive */
> -			dctx->legacy_hw_ctx.rcs_state->base.write_domain = 0;
> -			dctx->legacy_hw_ctx.rcs_state->active = 0;
> -		}
> -
> -		if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
> -			i915_gem_object_ggtt_unpin(lctx->legacy_hw_ctx.rcs_state);
> -
> -		i915_gem_context_unreference(lctx);
> -		i915_gem_context_reference(dctx);
> -		ring->last_context = dctx;
> -	}
> -}
> -
>  int i915_gem_context_init(struct drm_device *dev)
>  {
>  	struct drm_i915_private *dev_priv = dev->dev_private;
> @@ -498,10 +462,6 @@ int i915_gem_context_enable(struct drm_i915_private *dev_priv)
>  		ppgtt->enable(ppgtt);
>  	}
>  
> -	/* FIXME: We should make this work, even in reset */
> -	if (i915_reset_in_progress(&dev_priv->gpu_error))
> -		return 0;
> -
>  	BUG_ON(!dev_priv->ring[RCS].default_context);
>  
>  	for_each_ring(ring, dev_priv, i) {
> @@ -645,7 +605,7 @@ static int do_switch(struct intel_engine_cs *ring,
>  	from = ring->last_context;
>  
>  	if (USES_FULL_PPGTT(ring->dev)) {
> -		ret = ppgtt->switch_mm(ppgtt, ring, false);
> +		ret = ppgtt->switch_mm(ppgtt, ring);
>  		if (ret)
>  			goto unpin_out;
>  	}
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index 5188936..450c8a9 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -216,19 +216,12 @@ static gen6_gtt_pte_t iris_pte_encode(dma_addr_t addr,
>  
>  /* Broadwell Page Directory Pointer Descriptors */
>  static int gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
> -			   uint64_t val, bool synchronous)
> +			   uint64_t val)
>  {
> -	struct drm_i915_private *dev_priv = ring->dev->dev_private;
>  	int ret;
>  
>  	BUG_ON(entry >= 4);
>  
> -	if (synchronous) {
> -		I915_WRITE(GEN8_RING_PDP_UDW(ring, entry), val >> 32);
> -		I915_WRITE(GEN8_RING_PDP_LDW(ring, entry), (u32)val);
> -		return 0;
> -	}
> -
>  	ret = intel_ring_begin(ring, 6);
>  	if (ret)
>  		return ret;
> @@ -245,8 +238,7 @@ static int gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
>  }
>  
>  static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
> -			  struct intel_engine_cs *ring,
> -			  bool synchronous)
> +			  struct intel_engine_cs *ring)
>  {
>  	int i, ret;
>  
> @@ -255,7 +247,7 @@ static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
>  
>  	for (i = used_pd - 1; i >= 0; i--) {
>  		dma_addr_t addr = ppgtt->pd_dma_addr[i];
> -		ret = gen8_write_pdp(ring, i, addr, synchronous);
> +		ret = gen8_write_pdp(ring, i, addr);
>  		if (ret)
>  			return ret;
>  	}
> @@ -724,29 +716,10 @@ static uint32_t get_pd_offset(struct i915_hw_ppgtt *ppgtt)
>  }
>  
>  static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
> -			 struct intel_engine_cs *ring,
> -			 bool synchronous)
> +			 struct intel_engine_cs *ring)
>  {
> -	struct drm_device *dev = ppgtt->base.dev;
> -	struct drm_i915_private *dev_priv = dev->dev_private;
>  	int ret;
>  
> -	/* If we're in reset, we can assume the GPU is sufficiently idle to
> -	 * manually frob these bits. Ideally we could use the ring functions,
> -	 * except our error handling makes it quite difficult (can't use
> -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
> -	 *
> -	 * FIXME: We should try not to special case reset
> -	 */
> -	if (synchronous ||
> -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
> -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> -		I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt));
> -		POSTING_READ(RING_PP_DIR_BASE(ring));
> -		return 0;
> -	}
> -
>  	/* NB: TLBs must be flushed and invalidated before a switch */
>  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, I915_GEM_GPU_DOMAINS);
>  	if (ret)
> @@ -768,29 +741,10 @@ static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
>  }
>  
>  static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
> -			  struct intel_engine_cs *ring,
> -			  bool synchronous)
> +			  struct intel_engine_cs *ring)
>  {
> -	struct drm_device *dev = ppgtt->base.dev;
> -	struct drm_i915_private *dev_priv = dev->dev_private;
>  	int ret;
>  
> -	/* If we're in reset, we can assume the GPU is sufficiently idle to
> -	 * manually frob these bits. Ideally we could use the ring functions,
> -	 * except our error handling makes it quite difficult (can't use
> -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
> -	 *
> -	 * FIXME: We should try not to special case reset
> -	 */
> -	if (synchronous ||
> -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
> -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> -		I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt));
> -		POSTING_READ(RING_PP_DIR_BASE(ring));
> -		return 0;
> -	}
> -
>  	/* NB: TLBs must be flushed and invalidated before a switch */
>  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, I915_GEM_GPU_DOMAINS);
>  	if (ret)
> @@ -819,14 +773,11 @@ static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
>  }
>  
>  static int gen6_mm_switch(struct i915_hw_ppgtt *ppgtt,
> -			  struct intel_engine_cs *ring,
> -			  bool synchronous)
> +			  struct intel_engine_cs *ring)
>  {
>  	struct drm_device *dev = ppgtt->base.dev;
>  	struct drm_i915_private *dev_priv = dev->dev_private;
>  
> -	if (!synchronous)
> -		return 0;
>  
>  	I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
>  	I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt));
> @@ -852,7 +803,7 @@ static int gen8_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
>  		if (USES_FULL_PPGTT(dev))
>  			continue;
>  
> -		ret = ppgtt->switch_mm(ppgtt, ring, true);
> +		ret = ppgtt->switch_mm(ppgtt, ring);
>  		if (ret)
>  			goto err_out;
>  	}
> @@ -897,7 +848,7 @@ static int gen7_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
>  		if (USES_FULL_PPGTT(dev))
>  			continue;
>  
> -		ret = ppgtt->switch_mm(ppgtt, ring, true);
> +		ret = ppgtt->switch_mm(ppgtt, ring);
>  		if (ret)
>  			return ret;
>  	}
> @@ -926,7 +877,7 @@ static int gen6_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
>  	I915_WRITE(GFX_MODE, _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE));
>  
>  	for_each_ring(ring, dev_priv, i) {
> -		int ret = ppgtt->switch_mm(ppgtt, ring, true);
> +		int ret = ppgtt->switch_mm(ppgtt, ring);
>  		if (ret)
>  			return ret;
>  	}
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
> index 8d6f7c1..bf1e4fc 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
> @@ -262,8 +262,7 @@ struct i915_hw_ppgtt {
>  
>  	int (*enable)(struct i915_hw_ppgtt *ppgtt);
>  	int (*switch_mm)(struct i915_hw_ppgtt *ppgtt,
> -			 struct intel_engine_cs *ring,
> -			 bool synchronous);
> +			 struct intel_engine_cs *ring);
>  	void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file *m);
>  };
>  
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 599709e..e33c2e1 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -1832,7 +1832,9 @@ int intel_ring_begin(struct intel_engine_cs *ring,
>  
>  	ret = i915_gem_check_wedge(&dev_priv->gpu_error,
>  				   dev_priv->mm.interruptible);
> -	if (ret)
> +
> +	/* -EAGAIN means a reset is in progress, it is Ok to continue */
> +	if (ret && (ret != -EAGAIN))
>  		return ret;
>  
>  	ret = __intel_ring_prepare(ring, num_dwords * sizeof(uint32_t));
> -- 
> 2.0.0
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] drm/i915: Rework GPU reset sequence to match driver load & thaw
  2014-07-26  1:05 ` Ben Widawsky
@ 2014-07-28  9:26   ` Daniel Vetter
  2014-07-28 17:12     ` Mcaulay, Alistair
  2014-07-29  7:36     ` Chris Wilson
  0 siblings, 2 replies; 30+ messages in thread
From: Daniel Vetter @ 2014-07-28  9:26 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: intel-gfx

On Fri, Jul 25, 2014 at 06:05:29PM -0700, Ben Widawsky wrote:
> On Wed, Jul 16, 2014 at 04:05:59PM +0100, alistair.mcaulay@intel.com wrote:
> > From: "McAulay, Alistair" <alistair.mcaulay@intel.com>
> > 
> > This patch is to address Daniel's concerns over different code during reset:
> > 
> > http://lists.freedesktop.org/archives/intel-gfx/2014-June/047758.html
> > 
> > "The reason for aiming as hard as possible to use the exact same code for
> > driver load, gpu reset and runtime pm/system resume is that we've simply
> > seen too many bugs due to slight variations and unintended omissions."
> > 
> > Tested using igt drv_hangman.
> > 
> > Signed-off-by: McAulay, Alistair <alistair.mcaulay@intel.com>
> 
> 2 quick comments before I actually do a real review.
> 1. Did you actually run this with and without full ppgtt?
> 2. I don't think this is actually fulfilling what Daniel is requesting,
> though we can let him comment.

Mostly looks like what I think we need. Comments below.

> 3. Did you really do #1?
> 
> Assuming you satisfied #1, can you please post the igt results for the
> permutations (pre patch w/ and w/o ppgtt; post patch w/ and w/o ppgtt)?
> 
> I really want this data because I spent *a lot* of time with these
> specific areas in the PPGTT work, and I am somewhat skeptical that enough of
> the code has changed that this will magically work. I also tried the
> trickiness with the ring handling functions, and never succeeded. Also,
> with the context stuff, I'm simply not convinced it can magically
> vanish. If igt looks good, and Daniel agrees that this is what he
> actually wanted, I will go fishing for corner cases and do the review.

I think the hack in ring_begin might explain why it never worked before.
But fully agreed, we really need to test this well (and fill gaps if igt
misses anything around resets - we don't have any systematic gpu reset
coverage anywhere outside of igt).

> 
> Thanks.
> 
> > ---
> >  drivers/gpu/drm/i915/i915_gem.c         |  2 -
> >  drivers/gpu/drm/i915/i915_gem_context.c | 42 +--------------------
> >  drivers/gpu/drm/i915/i915_gem_gtt.c     | 67 +++++----------------------------
> >  drivers/gpu/drm/i915/i915_gem_gtt.h     |  3 +-
> >  drivers/gpu/drm/i915/intel_ringbuffer.c |  4 +-
> >  5 files changed, 14 insertions(+), 104 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> > index ef047bc..b38e086 100644
> > --- a/drivers/gpu/drm/i915/i915_gem.c
> > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > @@ -2590,8 +2590,6 @@ void i915_gem_reset(struct drm_device *dev)
> >  	for_each_ring(ring, dev_priv, i)
> >  		i915_gem_reset_ring_cleanup(dev_priv, ring);
> >  
> > -	i915_gem_context_reset(dev);
> > -
> >  	i915_gem_restore_fences(dev);
> >  }
> >  
> > diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> > index de72a28..d96219f 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_context.c
> > +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> > @@ -372,42 +372,6 @@ err_destroy:
> >  	return ERR_PTR(ret);
> >  }
> >  
> > -void i915_gem_context_reset(struct drm_device *dev)
> > -{
> > -	struct drm_i915_private *dev_priv = dev->dev_private;
> > -	int i;
> > -
> > -	/* Prevent the hardware from restoring the last context (which hung) on
> > -	 * the next switch */
> > -	for (i = 0; i < I915_NUM_RINGS; i++) {
> > -		struct intel_engine_cs *ring = &dev_priv->ring[i];
> > -		struct intel_context *dctx = ring->default_context;
> > -		struct intel_context *lctx = ring->last_context;
> > -
> > -		/* Do a fake switch to the default context */
> > -		if (lctx == dctx)
> > -			continue;
> > -
> > -		if (!lctx)
> > -			continue;
> > -
> > -		if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
> > -			WARN_ON(i915_gem_obj_ggtt_pin(dctx->legacy_hw_ctx.rcs_state,
> > -						      get_context_alignment(dev), 0));
> > -			/* Fake a finish/inactive */
> > -			dctx->legacy_hw_ctx.rcs_state->base.write_domain = 0;
> > -			dctx->legacy_hw_ctx.rcs_state->active = 0;
> > -		}
> > -
> > -		if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
> > -			i915_gem_object_ggtt_unpin(lctx->legacy_hw_ctx.rcs_state);
> > -
> > -		i915_gem_context_unreference(lctx);
> > -		i915_gem_context_reference(dctx);
> > -		ring->last_context = dctx;
> > -	}
> > -}

I don't understand why we no longer need this - after reset we probably
have the default context loaded (if we reuse the driver load sequence
exactly), so I expect that we must reset the software tracking
accordingly.

> > -
> >  int i915_gem_context_init(struct drm_device *dev)
> >  {
> >  	struct drm_i915_private *dev_priv = dev->dev_private;
> > @@ -498,10 +462,6 @@ int i915_gem_context_enable(struct drm_i915_private *dev_priv)
> >  		ppgtt->enable(ppgtt);
> >  	}
> >  
> > -	/* FIXME: We should make this work, even in reset */
> > -	if (i915_reset_in_progress(&dev_priv->gpu_error))
> > -		return 0;
> > -
> >  	BUG_ON(!dev_priv->ring[RCS].default_context);
> >  
> >  	for_each_ring(ring, dev_priv, i) {
> > @@ -645,7 +605,7 @@ static int do_switch(struct intel_engine_cs *ring,
> >  	from = ring->last_context;
> >  
> >  	if (USES_FULL_PPGTT(ring->dev)) {
> > -		ret = ppgtt->switch_mm(ppgtt, ring, false);
> > +		ret = ppgtt->switch_mm(ppgtt, ring);
> >  		if (ret)
> >  			goto unpin_out;
> >  	}
> > diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> > index 5188936..450c8a9 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> > +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> > @@ -216,19 +216,12 @@ static gen6_gtt_pte_t iris_pte_encode(dma_addr_t addr,
> >  
> >  /* Broadwell Page Directory Pointer Descriptors */
> >  static int gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
> > -			   uint64_t val, bool synchronous)
> > +			   uint64_t val)
> >  {
> > -	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> >  	int ret;
> >  
> >  	BUG_ON(entry >= 4);
> >  
> > -	if (synchronous) {
> > -		I915_WRITE(GEN8_RING_PDP_UDW(ring, entry), val >> 32);
> > -		I915_WRITE(GEN8_RING_PDP_LDW(ring, entry), (u32)val);
> > -		return 0;
> > -	}
> > -
> >  	ret = intel_ring_begin(ring, 6);
> >  	if (ret)
> >  		return ret;
> > @@ -245,8 +238,7 @@ static int gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
> >  }
> >  
> >  static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
> > -			  struct intel_engine_cs *ring,
> > -			  bool synchronous)
> > +			  struct intel_engine_cs *ring)
> >  {
> >  	int i, ret;
> >  
> > @@ -255,7 +247,7 @@ static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
> >  
> >  	for (i = used_pd - 1; i >= 0; i--) {
> >  		dma_addr_t addr = ppgtt->pd_dma_addr[i];
> > -		ret = gen8_write_pdp(ring, i, addr, synchronous);
> > +		ret = gen8_write_pdp(ring, i, addr);
> >  		if (ret)
> >  			return ret;
> >  	}
> > @@ -724,29 +716,10 @@ static uint32_t get_pd_offset(struct i915_hw_ppgtt *ppgtt)
> >  }
> >  
> >  static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
> > -			 struct intel_engine_cs *ring,
> > -			 bool synchronous)
> > +			 struct intel_engine_cs *ring)
> >  {
> > -	struct drm_device *dev = ppgtt->base.dev;
> > -	struct drm_i915_private *dev_priv = dev->dev_private;
> >  	int ret;
> >  
> > -	/* If we're in reset, we can assume the GPU is sufficiently idle to
> > -	 * manually frob these bits. Ideally we could use the ring functions,
> > -	 * except our error handling makes it quite difficult (can't use
> > -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
> > -	 *
> > -	 * FIXME: We should try not to special case reset
> > -	 */
> > -	if (synchronous ||
> > -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
> > -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> > -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> > -		I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt));
> > -		POSTING_READ(RING_PP_DIR_BASE(ring));
> > -		return 0;
> > -	}
> > -
> >  	/* NB: TLBs must be flushed and invalidated before a switch */
> >  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, I915_GEM_GPU_DOMAINS);
> >  	if (ret)
> > @@ -768,29 +741,10 @@ static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
> >  }
> >  
> >  static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
> > -			  struct intel_engine_cs *ring,
> > -			  bool synchronous)
> > +			  struct intel_engine_cs *ring)
> >  {
> > -	struct drm_device *dev = ppgtt->base.dev;
> > -	struct drm_i915_private *dev_priv = dev->dev_private;
> >  	int ret;
> >  
> > -	/* If we're in reset, we can assume the GPU is sufficiently idle to
> > -	 * manually frob these bits. Ideally we could use the ring functions,
> > -	 * except our error handling makes it quite difficult (can't use
> > -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
> > -	 *
> > -	 * FIXME: We should try not to special case reset
> > -	 */
> > -	if (synchronous ||
> > -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
> > -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> > -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> > -		I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt));
> > -		POSTING_READ(RING_PP_DIR_BASE(ring));
> > -		return 0;
> > -	}
> > -
> >  	/* NB: TLBs must be flushed and invalidated before a switch */
> >  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, I915_GEM_GPU_DOMAINS);
> >  	if (ret)
> > @@ -819,14 +773,11 @@ static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
> >  }
> >  
> >  static int gen6_mm_switch(struct i915_hw_ppgtt *ppgtt,
> > -			  struct intel_engine_cs *ring,
> > -			  bool synchronous)
> > +			  struct intel_engine_cs *ring)
> >  {
> >  	struct drm_device *dev = ppgtt->base.dev;
> >  	struct drm_i915_private *dev_priv = dev->dev_private;
> >  
> > -	if (!synchronous)
> > -		return 0;
> >  
> >  	I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> >  	I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt));
> > @@ -852,7 +803,7 @@ static int gen8_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
> >  		if (USES_FULL_PPGTT(dev))
> >  			continue;
> >  
> > -		ret = ppgtt->switch_mm(ppgtt, ring, true);
> > +		ret = ppgtt->switch_mm(ppgtt, ring);
> >  		if (ret)
> >  			goto err_out;
> >  	}
> > @@ -897,7 +848,7 @@ static int gen7_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
> >  		if (USES_FULL_PPGTT(dev))
> >  			continue;
> >  
> > -		ret = ppgtt->switch_mm(ppgtt, ring, true);
> > +		ret = ppgtt->switch_mm(ppgtt, ring);
> >  		if (ret)
> >  			return ret;
> >  	}
> > @@ -926,7 +877,7 @@ static int gen6_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
> >  	I915_WRITE(GFX_MODE, _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE));
> >  
> >  	for_each_ring(ring, dev_priv, i) {
> > -		int ret = ppgtt->switch_mm(ppgtt, ring, true);
> > +		int ret = ppgtt->switch_mm(ppgtt, ring);
> >  		if (ret)
> >  			return ret;
> >  	}
> > diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
> > index 8d6f7c1..bf1e4fc 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
> > +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
> > @@ -262,8 +262,7 @@ struct i915_hw_ppgtt {
> >  
> >  	int (*enable)(struct i915_hw_ppgtt *ppgtt);
> >  	int (*switch_mm)(struct i915_hw_ppgtt *ppgtt,
> > -			 struct intel_engine_cs *ring,
> > -			 bool synchronous);
> > +			 struct intel_engine_cs *ring);
> >  	void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file *m);
> >  };
> >  
> > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > index 599709e..e33c2e1 100644
> > --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > @@ -1832,7 +1832,9 @@ int intel_ring_begin(struct intel_engine_cs *ring,
> >  
> >  	ret = i915_gem_check_wedge(&dev_priv->gpu_error,
> >  				   dev_priv->mm.interruptible);
> > -	if (ret)
> > +
> > +	/* -EAGAIN means a reset is in progress, it is Ok to continue */
> > +	if (ret && (ret != -EAGAIN))
> >  		return ret;

Oh, I guess that's the tricky bit why the old approach never worked -
because reset_in_progress is set we failed the context/ppgtt loading
through the rings and screwed up.

Problem with your approach is that we want to bail out here if a reset is
in progress, so we can't just eat the EAGAIN. If we do that we potentially
deadlock or overflow the ring.

I think we need a different hack here, and a few layers down (i.e. at the
place where we actually generate that offending -EAGAIN).

- Around the re-init sequence in the reset function we set
  dev_priv->mm.reload_in_reset or similar. Since we hold dev->struct_mutex
  no one will see that, as long as we never leak it out of the critical
  section.

- In the ring_begin code that checks for gpu hangs we ignore
  reset_in_progress if this bit is set.

- Both places need fairly big comments to explain what exactly is going
  on.

Thanks, Daniel

> >  
> >  	ret = __intel_ring_prepare(ring, num_dwords * sizeof(uint32_t));
> > -- 
> > 2.0.0
> > 
> > _______________________________________________
> > Intel-gfx mailing list
> > Intel-gfx@lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> 
> -- 
> Ben Widawsky, Intel Open Source Technology Center
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] drm/i915: Rework GPU reset sequence to match driver load & thaw
  2014-07-28  9:26   ` Daniel Vetter
@ 2014-07-28 17:12     ` Mcaulay, Alistair
  2014-07-29  0:16       ` Ben Widawsky
  2014-07-29  7:36     ` Chris Wilson
  1 sibling, 1 reply; 30+ messages in thread
From: Mcaulay, Alistair @ 2014-07-28 17:12 UTC (permalink / raw)
  To: Daniel Vetter, Ben Widawsky; +Cc: intel-gfx

Hi Ben / Daniel,
I agree that this needs to be properly tested. Are there any particular igt tests you would suggest I use?
I've been running:
drv_hangman, drv_suspend, gem_hangcheck_forcewake.

Also, do you have a set of PPGTT patches that should work with these tests? Michel sent me a set of patches to enable
PPGTT, but these 3 tests fail with the patches.

Thanks,
Alistair.

-----Original Message-----
From: Daniel Vetter [mailto:daniel.vetter@ffwll.ch] On Behalf Of Daniel Vetter
Sent: Monday, July 28, 2014 10:27 AM
To: Ben Widawsky
Cc: Mcaulay, Alistair; intel-gfx@lists.freedesktop.org
Subject: Re: [Intel-gfx] [PATCH] drm/i915: Rework GPU reset sequence to match driver load & thaw

On Fri, Jul 25, 2014 at 06:05:29PM -0700, Ben Widawsky wrote:
> On Wed, Jul 16, 2014 at 04:05:59PM +0100, alistair.mcaulay@intel.com wrote:
> > From: "McAulay, Alistair" <alistair.mcaulay@intel.com>
> > 
> > This patch is to address Daniels concerns over different code during reset:
> > 
> > http://lists.freedesktop.org/archives/intel-gfx/2014-June/047758.html
> > 
> > "The reason for aiming as hard as possible to use the exact same 
> > code for driver load, gpu reset and runtime pm/system resume is that 
> > we've simply seen too many bugs due to slight variations and unintended omissions."
> > 
> > Tested using igt drv_hangman.
> > 
> > Signed-off-by: McAulay, Alistair <alistair.mcaulay@intel.com>
> 
> 2 quick comments before I actually do a real review.
> 1. Did you actually run this with and without full ppgtt?
> 2. I don't think this is actually fulfilling what Daniel is 
> requesting, though we can let him comment.

Mostly looks like what I think we need. Comments below.

> 3. Did you really do #1?
> 
> Assuming you satisfied #1, can you please post the igt results for the 
> permutations (pre patch w/ and w/o ppgtt; post patch w/ and w/o ppgtt)
> 
> I really want this data because I spent *a lot* of time with these 
> specific areas in the PPGTT work, and I am somewhat skeptical that enough 
> of the code has changed that this will magically work. I also tried 
> the trickiness with the ring handling functions, and never succeeded. 
> Also, with the context stuff, I'm simply not convinced it can 
> magically vanish. If igt looks good, and Daniel agrees that this is 
> what he actually wanted, I will go fishing for corner cases and do the review.

I think the hack in ring_begin might explain why it never worked before.
But fully agreed, we really need to test this well (and fill gaps if igt misses anything around resets - we don't have any systematic gpu reset coverage anywhere outside of igt).

> 
> Thanks.
> 
> > ---
> >  drivers/gpu/drm/i915/i915_gem.c         |  2 -
> >  drivers/gpu/drm/i915/i915_gem_context.c | 42 +--------------------
> >  drivers/gpu/drm/i915/i915_gem_gtt.c     | 67 +++++----------------------------
> >  drivers/gpu/drm/i915/i915_gem_gtt.h     |  3 +-
> >  drivers/gpu/drm/i915/intel_ringbuffer.c |  4 +-
> >  5 files changed, 14 insertions(+), 104 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_gem.c 
> > b/drivers/gpu/drm/i915/i915_gem.c index ef047bc..b38e086 100644
> > --- a/drivers/gpu/drm/i915/i915_gem.c
> > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > @@ -2590,8 +2590,6 @@ void i915_gem_reset(struct drm_device *dev)
> >  	for_each_ring(ring, dev_priv, i)
> >  		i915_gem_reset_ring_cleanup(dev_priv, ring);
> >  
> > -	i915_gem_context_reset(dev);
> > -
> >  	i915_gem_restore_fences(dev);
> >  }
> >  
> > diff --git a/drivers/gpu/drm/i915/i915_gem_context.c 
> > b/drivers/gpu/drm/i915/i915_gem_context.c
> > index de72a28..d96219f 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_context.c
> > +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> > @@ -372,42 +372,6 @@ err_destroy:
> >  	return ERR_PTR(ret);
> >  }
> >  
> > -void i915_gem_context_reset(struct drm_device *dev) -{
> > -	struct drm_i915_private *dev_priv = dev->dev_private;
> > -	int i;
> > -
> > -	/* Prevent the hardware from restoring the last context (which hung) on
> > -	 * the next switch */
> > -	for (i = 0; i < I915_NUM_RINGS; i++) {
> > -		struct intel_engine_cs *ring = &dev_priv->ring[i];
> > -		struct intel_context *dctx = ring->default_context;
> > -		struct intel_context *lctx = ring->last_context;
> > -
> > -		/* Do a fake switch to the default context */
> > -		if (lctx == dctx)
> > -			continue;
> > -
> > -		if (!lctx)
> > -			continue;
> > -
> > -		if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
> > -			WARN_ON(i915_gem_obj_ggtt_pin(dctx->legacy_hw_ctx.rcs_state,
> > -						      get_context_alignment(dev), 0));
> > -			/* Fake a finish/inactive */
> > -			dctx->legacy_hw_ctx.rcs_state->base.write_domain = 0;
> > -			dctx->legacy_hw_ctx.rcs_state->active = 0;
> > -		}
> > -
> > -		if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
> > -			i915_gem_object_ggtt_unpin(lctx->legacy_hw_ctx.rcs_state);
> > -
> > -		i915_gem_context_unreference(lctx);
> > -		i915_gem_context_reference(dctx);
> > -		ring->last_context = dctx;
> > -	}
> > -}

I don't understand why we no longer need this - after reset we probably have the default context loaded (if we reuse the driver load sequence exactly), so I expect that we must reset the software tracking accordingly.

> > -
> >  int i915_gem_context_init(struct drm_device *dev)  {
> >  	struct drm_i915_private *dev_priv = dev->dev_private; @@ -498,10 
> > +462,6 @@ int i915_gem_context_enable(struct drm_i915_private *dev_priv)
> >  		ppgtt->enable(ppgtt);
> >  	}
> >  
> > -	/* FIXME: We should make this work, even in reset */
> > -	if (i915_reset_in_progress(&dev_priv->gpu_error))
> > -		return 0;
> > -
> >  	BUG_ON(!dev_priv->ring[RCS].default_context);
> >  
> >  	for_each_ring(ring, dev_priv, i) { @@ -645,7 +605,7 @@ static int 
> > do_switch(struct intel_engine_cs *ring,
> >  	from = ring->last_context;
> >  
> >  	if (USES_FULL_PPGTT(ring->dev)) {
> > -		ret = ppgtt->switch_mm(ppgtt, ring, false);
> > +		ret = ppgtt->switch_mm(ppgtt, ring);
> >  		if (ret)
> >  			goto unpin_out;
> >  	}
> > diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c 
> > b/drivers/gpu/drm/i915/i915_gem_gtt.c
> > index 5188936..450c8a9 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> > +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> > @@ -216,19 +216,12 @@ static gen6_gtt_pte_t 
> > iris_pte_encode(dma_addr_t addr,
> >  
> >  /* Broadwell Page Directory Pointer Descriptors */  static int 
> > gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
> > -			   uint64_t val, bool synchronous)
> > +			   uint64_t val)
> >  {
> > -	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> >  	int ret;
> >  
> >  	BUG_ON(entry >= 4);
> >  
> > -	if (synchronous) {
> > -		I915_WRITE(GEN8_RING_PDP_UDW(ring, entry), val >> 32);
> > -		I915_WRITE(GEN8_RING_PDP_LDW(ring, entry), (u32)val);
> > -		return 0;
> > -	}
> > -
> >  	ret = intel_ring_begin(ring, 6);
> >  	if (ret)
> >  		return ret;
> > @@ -245,8 +238,7 @@ static int gen8_write_pdp(struct intel_engine_cs 
> > *ring, unsigned entry,  }
> >  
> >  static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
> > -			  struct intel_engine_cs *ring,
> > -			  bool synchronous)
> > +			  struct intel_engine_cs *ring)
> >  {
> >  	int i, ret;
> >  
> > @@ -255,7 +247,7 @@ static int gen8_mm_switch(struct i915_hw_ppgtt 
> > *ppgtt,
> >  
> >  	for (i = used_pd - 1; i >= 0; i--) {
> >  		dma_addr_t addr = ppgtt->pd_dma_addr[i];
> > -		ret = gen8_write_pdp(ring, i, addr, synchronous);
> > +		ret = gen8_write_pdp(ring, i, addr);
> >  		if (ret)
> >  			return ret;
> >  	}
> > @@ -724,29 +716,10 @@ static uint32_t get_pd_offset(struct 
> > i915_hw_ppgtt *ppgtt)  }
> >  
> >  static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
> > -			 struct intel_engine_cs *ring,
> > -			 bool synchronous)
> > +			 struct intel_engine_cs *ring)
> >  {
> > -	struct drm_device *dev = ppgtt->base.dev;
> > -	struct drm_i915_private *dev_priv = dev->dev_private;
> >  	int ret;
> >  
> > -	/* If we're in reset, we can assume the GPU is sufficiently idle to
> > -	 * manually frob these bits. Ideally we could use the ring functions,
> > -	 * except our error handling makes it quite difficult (can't use
> > -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
> > -	 *
> > -	 * FIXME: We should try not to special case reset
> > -	 */
> > -	if (synchronous ||
> > -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
> > -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> > -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> > -		I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt));
> > -		POSTING_READ(RING_PP_DIR_BASE(ring));
> > -		return 0;
> > -	}
> > -
> >  	/* NB: TLBs must be flushed and invalidated before a switch */
> >  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, I915_GEM_GPU_DOMAINS);
> >  	if (ret)
> > @@ -768,29 +741,10 @@ static int hsw_mm_switch(struct i915_hw_ppgtt 
> > *ppgtt,  }
> >  
> >  static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
> > -			  struct intel_engine_cs *ring,
> > -			  bool synchronous)
> > +			  struct intel_engine_cs *ring)
> >  {
> > -	struct drm_device *dev = ppgtt->base.dev;
> > -	struct drm_i915_private *dev_priv = dev->dev_private;
> >  	int ret;
> >  
> > -	/* If we're in reset, we can assume the GPU is sufficiently idle to
> > -	 * manually frob these bits. Ideally we could use the ring functions,
> > -	 * except our error handling makes it quite difficult (can't use
> > -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
> > -	 *
> > -	 * FIXME: We should try not to special case reset
> > -	 */
> > -	if (synchronous ||
> > -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
> > -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> > -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> > -		I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt));
> > -		POSTING_READ(RING_PP_DIR_BASE(ring));
> > -		return 0;
> > -	}
> > -
> >  	/* NB: TLBs must be flushed and invalidated before a switch */
> >  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, I915_GEM_GPU_DOMAINS);
> >  	if (ret)
> > @@ -819,14 +773,11 @@ static int gen7_mm_switch(struct i915_hw_ppgtt 
> > *ppgtt,  }
> >  
> >  static int gen6_mm_switch(struct i915_hw_ppgtt *ppgtt,
> > -			  struct intel_engine_cs *ring,
> > -			  bool synchronous)
> > +			  struct intel_engine_cs *ring)
> >  {
> >  	struct drm_device *dev = ppgtt->base.dev;
> >  	struct drm_i915_private *dev_priv = dev->dev_private;
> >  
> > -	if (!synchronous)
> > -		return 0;
> >  
> >  	I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> >  	I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt)); @@ 
> > -852,7 +803,7 @@ static int gen8_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
> >  		if (USES_FULL_PPGTT(dev))
> >  			continue;
> >  
> > -		ret = ppgtt->switch_mm(ppgtt, ring, true);
> > +		ret = ppgtt->switch_mm(ppgtt, ring);
> >  		if (ret)
> >  			goto err_out;
> >  	}
> > @@ -897,7 +848,7 @@ static int gen7_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
> >  		if (USES_FULL_PPGTT(dev))
> >  			continue;
> >  
> > -		ret = ppgtt->switch_mm(ppgtt, ring, true);
> > +		ret = ppgtt->switch_mm(ppgtt, ring);
> >  		if (ret)
> >  			return ret;
> >  	}
> > @@ -926,7 +877,7 @@ static int gen6_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
> >  	I915_WRITE(GFX_MODE, _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE));
> >  
> >  	for_each_ring(ring, dev_priv, i) {
> > -		int ret = ppgtt->switch_mm(ppgtt, ring, true);
> > +		int ret = ppgtt->switch_mm(ppgtt, ring);
> >  		if (ret)
> >  			return ret;
> >  	}
> > diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h 
> > b/drivers/gpu/drm/i915/i915_gem_gtt.h
> > index 8d6f7c1..bf1e4fc 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
> > +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
> > @@ -262,8 +262,7 @@ struct i915_hw_ppgtt {
> >  
> >  	int (*enable)(struct i915_hw_ppgtt *ppgtt);
> >  	int (*switch_mm)(struct i915_hw_ppgtt *ppgtt,
> > -			 struct intel_engine_cs *ring,
> > -			 bool synchronous);
> > +			 struct intel_engine_cs *ring);
> >  	void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file 
> > *m);  };
> >  
> > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c 
> > b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > index 599709e..e33c2e1 100644
> > --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > @@ -1832,7 +1832,9 @@ int intel_ring_begin(struct intel_engine_cs 
> > *ring,
> >  
> >  	ret = i915_gem_check_wedge(&dev_priv->gpu_error,
> >  				   dev_priv->mm.interruptible);
> > -	if (ret)
> > +
> > +	/* -EAGAIN means a reset is in progress, it is Ok to continue */
> > +	if (ret && (ret != -EAGAIN))
> >  		return ret;

Oh, I guess that's the tricky bit why the old approach never worked - because reset_in_progress is set we failed the context/ppgtt loading through the rings and screwed up.

Problem with your approach is that we want to bail out here if a reset is in progress, so we can't just eat the EAGAIN. If we do that we potentially deadlock or overflow the ring.

I think we need a different hack here, and a few layers down (i.e. at the place where we actually generate that offending -EAGAIN).

- Around the re-init sequence in the reset function we set
  dev_priv->mm.reload_in_reset or similar. Since we hold dev->struct_mutex
  no one will see that, as long as we never leak it out of the critical
  section.

- In the ring_begin code that checks for gpu hangs we ignore
  reset_in_progress if this bit is set.

- Both places need fairly big comments to explain what exactly is going
  on.

Thanks, Daniel

> >  
> >  	ret = __intel_ring_prepare(ring, num_dwords * sizeof(uint32_t));
> > --
> > 2.0.0
> > 
> > _______________________________________________
> > Intel-gfx mailing list
> > Intel-gfx@lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> 
> --
> Ben Widawsky, Intel Open Source Technology Center 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

--
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] drm/i915: Rework GPU reset sequence to match driver load & thaw
  2014-07-28 17:12     ` Mcaulay, Alistair
@ 2014-07-29  0:16       ` Ben Widawsky
  2014-07-29 17:25         ` Mcaulay, Alistair
  0 siblings, 1 reply; 30+ messages in thread
From: Ben Widawsky @ 2014-07-29  0:16 UTC (permalink / raw)
  To: Mcaulay, Alistair; +Cc: intel-gfx

On Mon, Jul 28, 2014 at 05:12:59PM +0000, Mcaulay, Alistair wrote:
> Hi Ben / Daniel,
> I agree that this needs to be properly tested. Are there any particular igt tests you would suggest I use?
> I've been running:
> drv_hangman, drv_suspend, gem_hangcheck_forcewake.

I thought IGT had added some random reset stuff in the way that we have
for signals. However, it looks like I imagined it. I guess you can add
gem_hang, gem_reset_stats, and tests/debugfs_wedged to that list. Daniel
can probably provide any I might have missed.

The way that igt quiesces everything these days really hurts the ability
to test multi-process. If every test starts off with no work, and
running the default context, things are pretty trivial. Similarly,
running these tests in isolation, even if it isn't quiescing, doesn't
help the situation. The way I wrote the code originally was through
debugging hangs on a desktop as I developed patches, and not with IGT
(though drv_hangman could catch many issues). I'd definitely recommend
trying to invoke hangs on a running desktop. I'd advise doing this by
modifying mesa to submit a crap batch, I can provide you more details on
how to do this if you need it. Also try to disable the quiescing in IGT
and run more than these tests in isolation.

> 
> Also do you have a set of PPGTT Patches that should work with these tests. Michel sent me a set of patches to enable
> PPGTT, but these 3 tests fail with the patches.

I will try to reproduce this on my patch series when I have some time
and if nothing else, that should be a good preparation/refresher for the
patch review anyway. The patches I wrote shouldn't have touched much on
these paths - not sure if Michel changed anything there.

With the patch on top of what Michel sent you, does everything pass?

> 
> Thanks,
> Alistair.
> 
> -----Original Message-----
> From: Daniel Vetter [mailto:daniel.vetter@ffwll.ch] On Behalf Of Daniel Vetter
> Sent: Monday, July 28, 2014 10:27 AM
> To: Ben Widawsky
> Cc: Mcaulay, Alistair; intel-gfx@lists.freedesktop.org
> Subject: Re: [Intel-gfx] [PATCH] drm/i915: Rework GPU reset sequence to match driver load & thaw
> 
> On Fri, Jul 25, 2014 at 06:05:29PM -0700, Ben Widawsky wrote:
> > On Wed, Jul 16, 2014 at 04:05:59PM +0100, alistair.mcaulay@intel.com wrote:
> > > From: "McAulay, Alistair" <alistair.mcaulay@intel.com>
> > > 
> > > This patch is to address Daniels concerns over different code during reset:
> > > 
> > > http://lists.freedesktop.org/archives/intel-gfx/2014-June/047758.html
> > > 
> > > "The reason for aiming as hard as possible to use the exact same 
> > > code for driver load, gpu reset and runtime pm/system resume is that 
> > > we've simply seen too many bugs due to slight variations and unintended omissions."
> > > 
> > > Tested using igt drv_hangman.
> > > 
> > > Signed-off-by: McAulay, Alistair <alistair.mcaulay@intel.com>
> > 
> > 2 quick comments before I actually do a real review.
> > 1. Did you actually run this with and without full ppgtt?
> > 2. I don't think this is actually fulfilling what Daniel is 
> > requesting, though we can let him comment.
> 
> Mostly looks like what I think we need. Comments below.
> 
> > 3. Did you really do #1?
> > 
> > Assuming you satisfied #1, can you please post the igt results for the 
> > permutations (pre patch w/ and w/o ppgtt; post patch w/ and w/o ppgtt)
> > 
> > I really want this data because I spent *a lot* of time with these 
> > specific areas in the PPGTT work, and I am somewhat skeptical that enough 
> > of the code has changed that this will magically work. I also tried 
> > the trickiness with the ring handling functions, and never succeeded. 
> > Also, with the context stuff, I'm simply not convinced it can 
> > magically vanish. If igt looks good, and Daniel agrees that this is 
> > what he actually wanted, I will go fishing for corner cases and do the review.
> 
> I think the hack in ring_begin might explain why it never worked before.
> But fully agreed, we really need to test this well (and fill gaps if igt misses anything around resets - we don't have any systematic gpu reset coverage anywhere outside of igt).
> 
> > 
> > Thanks.
> > 
> > > ---
> > >  drivers/gpu/drm/i915/i915_gem.c         |  2 -
> > >  drivers/gpu/drm/i915/i915_gem_context.c | 42 +--------------------
> > >  drivers/gpu/drm/i915/i915_gem_gtt.c     | 67 +++++----------------------------
> > >  drivers/gpu/drm/i915/i915_gem_gtt.h     |  3 +-
> > >  drivers/gpu/drm/i915/intel_ringbuffer.c |  4 +-
> > >  5 files changed, 14 insertions(+), 104 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/i915/i915_gem.c 
> > > b/drivers/gpu/drm/i915/i915_gem.c index ef047bc..b38e086 100644
> > > --- a/drivers/gpu/drm/i915/i915_gem.c
> > > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > > @@ -2590,8 +2590,6 @@ void i915_gem_reset(struct drm_device *dev)
> > >  	for_each_ring(ring, dev_priv, i)
> > >  		i915_gem_reset_ring_cleanup(dev_priv, ring);
> > >  
> > > -	i915_gem_context_reset(dev);
> > > -
> > >  	i915_gem_restore_fences(dev);
> > >  }
> > >  
> > > diff --git a/drivers/gpu/drm/i915/i915_gem_context.c 
> > > b/drivers/gpu/drm/i915/i915_gem_context.c
> > > index de72a28..d96219f 100644
> > > --- a/drivers/gpu/drm/i915/i915_gem_context.c
> > > +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> > > @@ -372,42 +372,6 @@ err_destroy:
> > >  	return ERR_PTR(ret);
> > >  }
> > >  
> > > -void i915_gem_context_reset(struct drm_device *dev) -{
> > > -	struct drm_i915_private *dev_priv = dev->dev_private;
> > > -	int i;
> > > -
> > > -	/* Prevent the hardware from restoring the last context (which hung) on
> > > -	 * the next switch */
> > > -	for (i = 0; i < I915_NUM_RINGS; i++) {
> > > -		struct intel_engine_cs *ring = &dev_priv->ring[i];
> > > -		struct intel_context *dctx = ring->default_context;
> > > -		struct intel_context *lctx = ring->last_context;
> > > -
> > > -		/* Do a fake switch to the default context */
> > > -		if (lctx == dctx)
> > > -			continue;
> > > -
> > > -		if (!lctx)
> > > -			continue;
> > > -
> > > -		if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
> > > -			WARN_ON(i915_gem_obj_ggtt_pin(dctx->legacy_hw_ctx.rcs_state,
> > > -						      get_context_alignment(dev), 0));
> > > -			/* Fake a finish/inactive */
> > > -			dctx->legacy_hw_ctx.rcs_state->base.write_domain = 0;
> > > -			dctx->legacy_hw_ctx.rcs_state->active = 0;
> > > -		}
> > > -
> > > -		if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
> > > -			i915_gem_object_ggtt_unpin(lctx->legacy_hw_ctx.rcs_state);
> > > -
> > > -		i915_gem_context_unreference(lctx);
> > > -		i915_gem_context_reference(dctx);
> > > -		ring->last_context = dctx;
> > > -	}
> > > -}
> 
> I don't understand why we no longer need this - after reset we probably have the default context loaded (if we reuse the driver load sequence exactly), so I expect that we must reset the software tracking accordingly.
> 
> > > -
> > >  int i915_gem_context_init(struct drm_device *dev)  {
> > >  	struct drm_i915_private *dev_priv = dev->dev_private; @@ -498,10 
> > > +462,6 @@ int i915_gem_context_enable(struct drm_i915_private *dev_priv)
> > >  		ppgtt->enable(ppgtt);
> > >  	}
> > >  
> > > -	/* FIXME: We should make this work, even in reset */
> > > -	if (i915_reset_in_progress(&dev_priv->gpu_error))
> > > -		return 0;
> > > -
> > >  	BUG_ON(!dev_priv->ring[RCS].default_context);
> > >  
> > >  	for_each_ring(ring, dev_priv, i) { @@ -645,7 +605,7 @@ static int 
> > > do_switch(struct intel_engine_cs *ring,
> > >  	from = ring->last_context;
> > >  
> > >  	if (USES_FULL_PPGTT(ring->dev)) {
> > > -		ret = ppgtt->switch_mm(ppgtt, ring, false);
> > > +		ret = ppgtt->switch_mm(ppgtt, ring);
> > >  		if (ret)
> > >  			goto unpin_out;
> > >  	}
> > > diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c 
> > > b/drivers/gpu/drm/i915/i915_gem_gtt.c
> > > index 5188936..450c8a9 100644
> > > --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> > > +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> > > @@ -216,19 +216,12 @@ static gen6_gtt_pte_t 
> > > iris_pte_encode(dma_addr_t addr,
> > >  
> > >  /* Broadwell Page Directory Pointer Descriptors */  static int 
> > > gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
> > > -			   uint64_t val, bool synchronous)
> > > +			   uint64_t val)
> > >  {
> > > -	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> > >  	int ret;
> > >  
> > >  	BUG_ON(entry >= 4);
> > >  
> > > -	if (synchronous) {
> > > -		I915_WRITE(GEN8_RING_PDP_UDW(ring, entry), val >> 32);
> > > -		I915_WRITE(GEN8_RING_PDP_LDW(ring, entry), (u32)val);
> > > -		return 0;
> > > -	}
> > > -
> > >  	ret = intel_ring_begin(ring, 6);
> > >  	if (ret)
> > >  		return ret;
> > > @@ -245,8 +238,7 @@ static int gen8_write_pdp(struct intel_engine_cs 
> > > *ring, unsigned entry,  }
> > >  
> > >  static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
> > > -			  struct intel_engine_cs *ring,
> > > -			  bool synchronous)
> > > +			  struct intel_engine_cs *ring)
> > >  {
> > >  	int i, ret;
> > >  
> > > @@ -255,7 +247,7 @@ static int gen8_mm_switch(struct i915_hw_ppgtt 
> > > *ppgtt,
> > >  
> > >  	for (i = used_pd - 1; i >= 0; i--) {
> > >  		dma_addr_t addr = ppgtt->pd_dma_addr[i];
> > > -		ret = gen8_write_pdp(ring, i, addr, synchronous);
> > > +		ret = gen8_write_pdp(ring, i, addr);
> > >  		if (ret)
> > >  			return ret;
> > >  	}
> > > @@ -724,29 +716,10 @@ static uint32_t get_pd_offset(struct 
> > > i915_hw_ppgtt *ppgtt)  }
> > >  
> > >  static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
> > > -			 struct intel_engine_cs *ring,
> > > -			 bool synchronous)
> > > +			 struct intel_engine_cs *ring)
> > >  {
> > > -	struct drm_device *dev = ppgtt->base.dev;
> > > -	struct drm_i915_private *dev_priv = dev->dev_private;
> > >  	int ret;
> > >  
> > > -	/* If we're in reset, we can assume the GPU is sufficiently idle to
> > > -	 * manually frob these bits. Ideally we could use the ring functions,
> > > -	 * except our error handling makes it quite difficult (can't use
> > > -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
> > > -	 *
> > > -	 * FIXME: We should try not to special case reset
> > > -	 */
> > > -	if (synchronous ||
> > > -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
> > > -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> > > -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> > > -		I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt));
> > > -		POSTING_READ(RING_PP_DIR_BASE(ring));
> > > -		return 0;
> > > -	}
> > > -
> > >  	/* NB: TLBs must be flushed and invalidated before a switch */
> > >  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, I915_GEM_GPU_DOMAINS);
> > >  	if (ret)
> > > @@ -768,29 +741,10 @@ static int hsw_mm_switch(struct i915_hw_ppgtt 
> > > *ppgtt,  }
> > >  
> > >  static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
> > > -			  struct intel_engine_cs *ring,
> > > -			  bool synchronous)
> > > +			  struct intel_engine_cs *ring)
> > >  {
> > > -	struct drm_device *dev = ppgtt->base.dev;
> > > -	struct drm_i915_private *dev_priv = dev->dev_private;
> > >  	int ret;
> > >  
> > > -	/* If we're in reset, we can assume the GPU is sufficiently idle to
> > > -	 * manually frob these bits. Ideally we could use the ring functions,
> > > -	 * except our error handling makes it quite difficult (can't use
> > > -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
> > > -	 *
> > > -	 * FIXME: We should try not to special case reset
> > > -	 */
> > > -	if (synchronous ||
> > > -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
> > > -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> > > -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> > > -		I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt));
> > > -		POSTING_READ(RING_PP_DIR_BASE(ring));
> > > -		return 0;
> > > -	}
> > > -
> > >  	/* NB: TLBs must be flushed and invalidated before a switch */
> > >  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, I915_GEM_GPU_DOMAINS);
> > >  	if (ret)
> > > @@ -819,14 +773,11 @@ static int gen7_mm_switch(struct i915_hw_ppgtt 
> > > *ppgtt,  }
> > >  
> > >  static int gen6_mm_switch(struct i915_hw_ppgtt *ppgtt,
> > > -			  struct intel_engine_cs *ring,
> > > -			  bool synchronous)
> > > +			  struct intel_engine_cs *ring)
> > >  {
> > >  	struct drm_device *dev = ppgtt->base.dev;
> > >  	struct drm_i915_private *dev_priv = dev->dev_private;
> > >  
> > > -	if (!synchronous)
> > > -		return 0;
> > >  
> > >  	I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> > >  	I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt)); @@ 
> > > -852,7 +803,7 @@ static int gen8_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
> > >  		if (USES_FULL_PPGTT(dev))
> > >  			continue;
> > >  
> > > -		ret = ppgtt->switch_mm(ppgtt, ring, true);
> > > +		ret = ppgtt->switch_mm(ppgtt, ring);
> > >  		if (ret)
> > >  			goto err_out;
> > >  	}
> > > @@ -897,7 +848,7 @@ static int gen7_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
> > >  		if (USES_FULL_PPGTT(dev))
> > >  			continue;
> > >  
> > > -		ret = ppgtt->switch_mm(ppgtt, ring, true);
> > > +		ret = ppgtt->switch_mm(ppgtt, ring);
> > >  		if (ret)
> > >  			return ret;
> > >  	}
> > > @@ -926,7 +877,7 @@ static int gen6_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
> > >  	I915_WRITE(GFX_MODE, _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE));
> > >  
> > >  	for_each_ring(ring, dev_priv, i) {
> > > -		int ret = ppgtt->switch_mm(ppgtt, ring, true);
> > > +		int ret = ppgtt->switch_mm(ppgtt, ring);
> > >  		if (ret)
> > >  			return ret;
> > >  	}
> > > diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h 
> > > b/drivers/gpu/drm/i915/i915_gem_gtt.h
> > > index 8d6f7c1..bf1e4fc 100644
> > > --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
> > > +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
> > > @@ -262,8 +262,7 @@ struct i915_hw_ppgtt {
> > >  
> > >  	int (*enable)(struct i915_hw_ppgtt *ppgtt);
> > >  	int (*switch_mm)(struct i915_hw_ppgtt *ppgtt,
> > > -			 struct intel_engine_cs *ring,
> > > -			 bool synchronous);
> > > +			 struct intel_engine_cs *ring);
> > >  	void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file 
> > > *m);  };
> > >  
> > > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c 
> > > b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > > index 599709e..e33c2e1 100644
> > > --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> > > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > > @@ -1832,7 +1832,9 @@ int intel_ring_begin(struct intel_engine_cs 
> > > *ring,
> > >  
> > >  	ret = i915_gem_check_wedge(&dev_priv->gpu_error,
> > >  				   dev_priv->mm.interruptible);
> > > -	if (ret)
> > > +
> > > +	/* -EAGAIN means a reset is in progress, it is Ok to continue */
> > > +	if (ret && (ret != -EAGAIN))
> > >  		return ret;
> 
> Oh, I guess that's the tricky bit why the old approach never worked - because reset_in_progress is set we failed the context/ppgtt loading through the rings and screwed up.
> 
> Problem with your approach is that we want to bail out here if a reset is in progress, so we can't just eat the EAGAIN. If we do that we potentially deadlock or overflow the ring.
> 
> I think we need a different hack here, and a few layers down (i.e. at the place where we actually generate that offending -EAGAIN).
> 
> - Around the re-init sequence in the reset function we set
>   dev_priv->mm.reload_in_reset or similar. Since we hold dev->struct_mutex
>   no one will see that, as long as we never leak it out of the critical
>   section.
> 
> - In the ring_begin code that checks for gpu hangs we ignore
>   reset_in_progress if this bit is set.
> 
> - Both places need fairly big comments to explain what exactly is going
>   on.
> 
> Thanks, Daniel
> 
> > >  
> > >  	ret = __intel_ring_prepare(ring, num_dwords * sizeof(uint32_t));
> > > --
> > > 2.0.0
> > > 
> > > _______________________________________________
> > > Intel-gfx mailing list
> > > Intel-gfx@lists.freedesktop.org
> > > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> > 
> > --
> > Ben Widawsky, Intel Open Source Technology Center 
> > _______________________________________________
> > Intel-gfx mailing list
> > Intel-gfx@lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> 
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> +41 (0) 79 365 57 48 - http://blog.ffwll.ch

-- 
Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] drm/i915: Rework GPU reset sequence to match driver load & thaw
  2014-07-28  9:26   ` Daniel Vetter
  2014-07-28 17:12     ` Mcaulay, Alistair
@ 2014-07-29  7:36     ` Chris Wilson
  2014-07-29 10:32       ` Daniel Vetter
  1 sibling, 1 reply; 30+ messages in thread
From: Chris Wilson @ 2014-07-29  7:36 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: Ben Widawsky, intel-gfx

On Mon, Jul 28, 2014 at 11:26:38AM +0200, Daniel Vetter wrote:
> Oh, I guess that's the tricky bit why the old approach never worked -
> because reset_in_progress is set we failed the context/ppgtt loading
> through the rings and screwed up.
> 
> Problem with your approach is that we want to bail out here if a reset is
> in progress, so we can't just eat the EAGAIN. If we do that we potentially
> deadlock or overflow the ring.
> 
> I think we need a different hack here, and a few layers down (i.e. at the
> place where we actually generate that offending -EAGAIN).
> 
> - Around the re-init sequence in the reset function we set
>   dev_priv->mm.reload_in_reset or similar. Since we hold dev->struct_mutex
>   no one will see that, as long as we never leak it out of the critical
>   section.
> 
> - In the ring_begin code that checks for gpu hangs we ignore
>   reset_in_progress if this bit is set.
> 
> - Both places need fairly big comments to explain what exactly is going
>   on.

This is going from bad to worse. I think you can do better if you looked
at the problem afresh.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] drm/i915: Rework GPU reset sequence to match driver load & thaw
  2014-07-29  7:36     ` Chris Wilson
@ 2014-07-29 10:32       ` Daniel Vetter
  2014-07-30 16:59         ` Mcaulay, Alistair
  0 siblings, 1 reply; 30+ messages in thread
From: Daniel Vetter @ 2014-07-29 10:32 UTC (permalink / raw)
  To: Chris Wilson, Daniel Vetter, Ben Widawsky, intel-gfx

On Tue, Jul 29, 2014 at 08:36:33AM +0100, Chris Wilson wrote:
> On Mon, Jul 28, 2014 at 11:26:38AM +0200, Daniel Vetter wrote:
> > Oh, I guess that's the tricky bit why the old approach never worked -
> > because reset_in_progress is set we failed the context/ppgtt loading
> > through the rings and screwed up.
> > 
> > Problem with your approach is that we want to bail out here if a reset is
> > in progress, so we can't just eat the EAGAIN. If we do that we potentially
> > deadlock or overflow the ring.
> > 
> > I think we need a different hack here, and a few layers down (i.e. at the
> > place where we actually generate that offending -EAGAIN).
> > 
> > - Around the re-init sequence in the reset function we set
> >   dev_priv->mm.reload_in_reset or similar. Since we hold dev->struct_mutex
> >   no one will see that, as long as we never leak it out of the critical
> >   section.
> > 
> > - In the ring_begin code that checks for gpu hangs we ignore
> >   reset_in_progress if this bit is set.
> > 
> > - Both places need fairly big comments to explain what exactly is going
> >   on.
> 
> This is going from bad to worse. I think you can do better if you looked
> at the problem afresh.

Well, we can't really reset reset_in_progress at that point, since not all of the
reset is done yet. Especially the modeset stuff. So I don't think that
reordering the reset sequence would get us out of this ugly spot. And I
don't see any other solution really. Do you?
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] drm/i915: Rework GPU reset sequence to match driver load & thaw
  2014-07-29  0:16       ` Ben Widawsky
@ 2014-07-29 17:25         ` Mcaulay, Alistair
  2014-07-29 18:12           ` Daniel Vetter
  0 siblings, 1 reply; 30+ messages in thread
From: Mcaulay, Alistair @ 2014-07-29 17:25 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: intel-gfx


drv_suspend and gem_hangcheck_forcewake are working fine now with PPGTT enabled, both with and without my patch.

The results are the same with and without my patch for:

$ sudo ~/drm_nightly/intel-gpu-tools/tests/drv_hangman 
IGT-Version: 1.7-g70e6ed9 (x86_64) (Linux: 3.16.0-rc5+ x86_64)
Subtest error-state-debugfs-entry: SUCCESS
Subtest error-state-sysfs-entry: SUCCESS
Subtest ring-stop-sysfs-entry: SUCCESS
Subtest error-state-basic: SUCCESS
Subtest error-state-capture-render: SUCCESS
Test assertion failure function check_error_state, file drv_hangman.c:303:
Failed assertion: gtt_offset == expected_offset
Subtest error-state-capture-bsd: FAIL
Test assertion failure function check_error_state, file drv_hangman.c:303:
Failed assertion: gtt_offset == expected_offset
Subtest error-state-capture-blt: FAIL
Test assertion failure function check_error_state, file drv_hangman.c:303:
Failed assertion: gtt_offset == expected_offset
Subtest error-state-capture-vebox: FAIL

$ sudo ~/drm_nightly/intel-gpu-tools/tests/gem_reset_stats
IGT-Version: 1.7-g70e6ed9 (x86_64) (Linux: 3.16.0-rc5+ x86_64)
Subtest params: SUCCESS
Subtest params-ctx-render: SUCCESS
Subtest reset-stats-render: SUCCESS
Subtest reset-stats-ctx-render: SUCCESS
Subtest ban-render: SUCCESS
Subtest ban-ctx-render: SUCCESS
Subtest reset-count-render: SUCCESS
Subtest reset-count-ctx-render: SUCCESS
Subtest unrelated-ctx-render: SUCCESS
Subtest close-pending-render: SUCCESS
Subtest close-pending-ctx-render: SUCCESS
<test now hangs>


Both good without PPGTT.
I haven't yet seen a regression with the patch.

-----Original Message-----
From: Ben Widawsky [mailto:ben@bwidawsk.net] 
Sent: Tuesday, July 29, 2014 1:17 AM
To: Mcaulay, Alistair
Cc: Daniel Vetter; intel-gfx@lists.freedesktop.org
Subject: Re: [Intel-gfx] [PATCH] drm/i915: Rework GPU reset sequence to match driver load & thaw

On Mon, Jul 28, 2014 at 05:12:59PM +0000, Mcaulay, Alistair wrote:
> Hi Ben / Daniel,
> I agree that this needs to be properly tested. Are there any particular igt tests you would suggest I use?
> I've been running:
> drv_hangman, drv_suspend, gem_hangcheck_forcewake.

I thought IGT had added some random reset stuff in the way that we have for signals. However, it looks like I imagined it. I guess you can add gem_hang, gem_reset_stats, and tests/debugfs_wedged to that list. Daniel can probably provide any I might have missed.

The way that igt quiesces everything these days really hurts the ability to test multi-process. If every test starts off with no work, and running the default context, things are pretty trivial. Similarly, running these tests in isolation, even if it isn't quiescing, doesn't help the situation. The way I wrote the code originally was through debugging hangs on a desktop as I developed patches, and not with IGT (though drv_hangman could catch many issues). I'd definitely recommend trying to invoke hangs on a running desktop. I'd advise doing this by modifying mesa to submit a crap batch; I can provide you more details on how to do this if you need it. Also try to disable the quiescing in IGT and run more than these tests in isolation.

> 
> Also do you have a set of PPGTT Patches that should work with these 
> tests? Michel sent me a set of patches to enable PPGTT, but these 3 tests fail with the patches.

I will try to reproduce this on my patch series when I have some time and if nothing else, that should be a good preparation/refresher for the patch review anyway. The patches I wrote shouldn't have touched much on these paths - not sure if Michel changed anything there.

With patch on top of what Michel sent you, everything passes?

> 
> Thanks,
> Alistair.
> 
> -----Original Message-----
> From: Daniel Vetter [mailto:daniel.vetter@ffwll.ch] On Behalf Of 
> Daniel Vetter
> Sent: Monday, July 28, 2014 10:27 AM
> To: Ben Widawsky
> Cc: Mcaulay, Alistair; intel-gfx@lists.freedesktop.org
> Subject: Re: [Intel-gfx] [PATCH] drm/i915: Rework GPU reset sequence 
> to match driver load & thaw
> 
> On Fri, Jul 25, 2014 at 06:05:29PM -0700, Ben Widawsky wrote:
> > On Wed, Jul 16, 2014 at 04:05:59PM +0100, alistair.mcaulay@intel.com wrote:
> > > From: "McAulay, Alistair" <alistair.mcaulay@intel.com>
> > > 
> > > > This patch is to address Daniel's concerns over different code during reset:
> > > 
> > > > http://lists.freedesktop.org/archives/intel-gfx/2014-June/047758.html
> > > 
> > > "The reason for aiming as hard as possible to use the exact same 
> > > code for driver load, gpu reset and runtime pm/system resume is 
> > > that we've simply seen too many bugs due to slight variations and unintended omissions."
> > > 
> > > Tested using igt drv_hangman.
> > > 
> > > Signed-off-by: McAulay, Alistair <alistair.mcaulay@intel.com>
> > 
> > 2 quick comments before I actually do a real review.
> > 1. Did you actually run this with and without full ppgtt?
> > 2. I don't think this is actually fulfilling what Daniel is 
> > requesting, though we can let him comment.
> 
> Mostly looks like what I think we need. Comments below.
> 
> > > 3. Did you really do #1?
> > 
> > > Assuming you satisfied #1, can you please post the igt results for 
> > the permutations (pre patch w/ and w/o ppgtt; post patch w/ and w/o 
> > ppgtt)
> > 
> > I really want this data because I spent *a lot* of time with these 
> > > specific areas in the PPGTT work, and I am somewhat skeptical that enough 
> > of the code has changed that this will magically work. I also tried 
> > the trickiness with the ring handling functions, and never succeeded.
> > Also, with the context stuff, I'm simply not convinced it can 
> > magically vanish. If igt looks good, and Daniel agrees that this is 
> > what he actually wanted, I will go fishing for corner cases and do the review.
> 
> I think the hack in ring_begin might explain why it never worked before.
> But fully agreed, we really need to test this well (and fill gaps if igt misses anything around resets - we don't have any systematic gpu reset coverage anywhere outside of igt).
> 
> > 
> > Thanks.
> > 
> > > ---
> > >  drivers/gpu/drm/i915/i915_gem.c         |  2 -
> > >  drivers/gpu/drm/i915/i915_gem_context.c | 42 +--------------------
> > >  drivers/gpu/drm/i915/i915_gem_gtt.c     | 67 +++++----------------------------
> > >  drivers/gpu/drm/i915/i915_gem_gtt.h     |  3 +-
> > >  drivers/gpu/drm/i915/intel_ringbuffer.c |  4 +-
> > >  5 files changed, 14 insertions(+), 104 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/i915/i915_gem.c 
> > > b/drivers/gpu/drm/i915/i915_gem.c index ef047bc..b38e086 100644
> > > --- a/drivers/gpu/drm/i915/i915_gem.c
> > > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > > @@ -2590,8 +2590,6 @@ void i915_gem_reset(struct drm_device *dev)
> > >  	for_each_ring(ring, dev_priv, i)
> > >  		i915_gem_reset_ring_cleanup(dev_priv, ring);
> > >  
> > > -	i915_gem_context_reset(dev);
> > > -
> > >  	i915_gem_restore_fences(dev);
> > >  }
> > >  
> > > diff --git a/drivers/gpu/drm/i915/i915_gem_context.c
> > > b/drivers/gpu/drm/i915/i915_gem_context.c
> > > index de72a28..d96219f 100644
> > > --- a/drivers/gpu/drm/i915/i915_gem_context.c
> > > +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> > > @@ -372,42 +372,6 @@ err_destroy:
> > >  	return ERR_PTR(ret);
> > >  }
> > >  
> > > -void i915_gem_context_reset(struct drm_device *dev) -{
> > > -	struct drm_i915_private *dev_priv = dev->dev_private;
> > > -	int i;
> > > -
> > > -	/* Prevent the hardware from restoring the last context (which hung) on
> > > -	 * the next switch */
> > > -	for (i = 0; i < I915_NUM_RINGS; i++) {
> > > -		struct intel_engine_cs *ring = &dev_priv->ring[i];
> > > -		struct intel_context *dctx = ring->default_context;
> > > -		struct intel_context *lctx = ring->last_context;
> > > -
> > > -		/* Do a fake switch to the default context */
> > > -		if (lctx == dctx)
> > > -			continue;
> > > -
> > > -		if (!lctx)
> > > -			continue;
> > > -
> > > -		if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
> > > -			WARN_ON(i915_gem_obj_ggtt_pin(dctx->legacy_hw_ctx.rcs_state,
> > > -						      get_context_alignment(dev), 0));
> > > -			/* Fake a finish/inactive */
> > > -			dctx->legacy_hw_ctx.rcs_state->base.write_domain = 0;
> > > -			dctx->legacy_hw_ctx.rcs_state->active = 0;
> > > -		}
> > > -
> > > -		if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
> > > -			i915_gem_object_ggtt_unpin(lctx->legacy_hw_ctx.rcs_state);
> > > -
> > > -		i915_gem_context_unreference(lctx);
> > > -		i915_gem_context_reference(dctx);
> > > -		ring->last_context = dctx;
> > > -	}
> > > -}
> 
> I don't understand why we no longer need this - after reset we probably have the default context loaded (if we reuse the driver load sequence exactly), so I expect that we must reset the software tracking accordingly.
> 
> > > -
> > >  int i915_gem_context_init(struct drm_device *dev)  {
> > >  	struct drm_i915_private *dev_priv = dev->dev_private; @@ -498,10
> > > +462,6 @@ int i915_gem_context_enable(struct drm_i915_private 
> > > +*dev_priv)
> > >  		ppgtt->enable(ppgtt);
> > >  	}
> > >  
> > > -	/* FIXME: We should make this work, even in reset */
> > > -	if (i915_reset_in_progress(&dev_priv->gpu_error))
> > > -		return 0;
> > > -
> > >  	BUG_ON(!dev_priv->ring[RCS].default_context);
> > >  
> > >  	for_each_ring(ring, dev_priv, i) { @@ -645,7 +605,7 @@ static 
> > > int do_switch(struct intel_engine_cs *ring,
> > >  	from = ring->last_context;
> > >  
> > >  	if (USES_FULL_PPGTT(ring->dev)) {
> > > -		ret = ppgtt->switch_mm(ppgtt, ring, false);
> > > +		ret = ppgtt->switch_mm(ppgtt, ring);
> > >  		if (ret)
> > >  			goto unpin_out;
> > >  	}
> > > diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c
> > > b/drivers/gpu/drm/i915/i915_gem_gtt.c
> > > index 5188936..450c8a9 100644
> > > --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> > > +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> > > @@ -216,19 +216,12 @@ static gen6_gtt_pte_t 
> > > iris_pte_encode(dma_addr_t addr,
> > >  
> > >  /* Broadwell Page Directory Pointer Descriptors */  static int 
> > > gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
> > > -			   uint64_t val, bool synchronous)
> > > +			   uint64_t val)
> > >  {
> > > -	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> > >  	int ret;
> > >  
> > >  	BUG_ON(entry >= 4);
> > >  
> > > -	if (synchronous) {
> > > -		I915_WRITE(GEN8_RING_PDP_UDW(ring, entry), val >> 32);
> > > -		I915_WRITE(GEN8_RING_PDP_LDW(ring, entry), (u32)val);
> > > -		return 0;
> > > -	}
> > > -
> > >  	ret = intel_ring_begin(ring, 6);
> > >  	if (ret)
> > >  		return ret;
> > > @@ -245,8 +238,7 @@ static int gen8_write_pdp(struct 
> > > intel_engine_cs *ring, unsigned entry,  }
> > >  
> > >  static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
> > > -			  struct intel_engine_cs *ring,
> > > -			  bool synchronous)
> > > +			  struct intel_engine_cs *ring)
> > >  {
> > >  	int i, ret;
> > >  
> > > @@ -255,7 +247,7 @@ static int gen8_mm_switch(struct i915_hw_ppgtt 
> > > *ppgtt,
> > >  
> > >  	for (i = used_pd - 1; i >= 0; i--) {
> > >  		dma_addr_t addr = ppgtt->pd_dma_addr[i];
> > > -		ret = gen8_write_pdp(ring, i, addr, synchronous);
> > > +		ret = gen8_write_pdp(ring, i, addr);
> > >  		if (ret)
> > >  			return ret;
> > >  	}
> > > @@ -724,29 +716,10 @@ static uint32_t get_pd_offset(struct 
> > > i915_hw_ppgtt *ppgtt)  }
> > >  
> > >  static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
> > > -			 struct intel_engine_cs *ring,
> > > -			 bool synchronous)
> > > +			 struct intel_engine_cs *ring)
> > >  {
> > > -	struct drm_device *dev = ppgtt->base.dev;
> > > -	struct drm_i915_private *dev_priv = dev->dev_private;
> > >  	int ret;
> > >  
> > > -	/* If we're in reset, we can assume the GPU is sufficiently idle to
> > > -	 * manually frob these bits. Ideally we could use the ring functions,
> > > -	 * except our error handling makes it quite difficult (can't use
> > > -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
> > > -	 *
> > > -	 * FIXME: We should try not to special case reset
> > > -	 */
> > > -	if (synchronous ||
> > > -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
> > > -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> > > -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> > > -		I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt));
> > > -		POSTING_READ(RING_PP_DIR_BASE(ring));
> > > -		return 0;
> > > -	}
> > > -
> > >  	/* NB: TLBs must be flushed and invalidated before a switch */
> > >  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, I915_GEM_GPU_DOMAINS);
> > >  	if (ret)
> > > @@ -768,29 +741,10 @@ static int hsw_mm_switch(struct 
> > > i915_hw_ppgtt *ppgtt,  }
> > >  
> > >  static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
> > > -			  struct intel_engine_cs *ring,
> > > -			  bool synchronous)
> > > +			  struct intel_engine_cs *ring)
> > >  {
> > > -	struct drm_device *dev = ppgtt->base.dev;
> > > -	struct drm_i915_private *dev_priv = dev->dev_private;
> > >  	int ret;
> > >  
> > > -	/* If we're in reset, we can assume the GPU is sufficiently idle to
> > > -	 * manually frob these bits. Ideally we could use the ring functions,
> > > -	 * except our error handling makes it quite difficult (can't use
> > > -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
> > > -	 *
> > > -	 * FIXME: We should try not to special case reset
> > > -	 */
> > > -	if (synchronous ||
> > > -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
> > > -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> > > -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> > > -		I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt));
> > > -		POSTING_READ(RING_PP_DIR_BASE(ring));
> > > -		return 0;
> > > -	}
> > > -
> > >  	/* NB: TLBs must be flushed and invalidated before a switch */
> > >  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, I915_GEM_GPU_DOMAINS);
> > >  	if (ret)
> > > @@ -819,14 +773,11 @@ static int gen7_mm_switch(struct 
> > > i915_hw_ppgtt *ppgtt,  }
> > >  
> > >  static int gen6_mm_switch(struct i915_hw_ppgtt *ppgtt,
> > > -			  struct intel_engine_cs *ring,
> > > -			  bool synchronous)
> > > +			  struct intel_engine_cs *ring)
> > >  {
> > >  	struct drm_device *dev = ppgtt->base.dev;
> > >  	struct drm_i915_private *dev_priv = dev->dev_private;
> > >  
> > > -	if (!synchronous)
> > > -		return 0;
> > >  
> > >  	I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> > >  	I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt)); @@
> > > -852,7 +803,7 @@ static int gen8_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
> > >  		if (USES_FULL_PPGTT(dev))
> > >  			continue;
> > >  
> > > -		ret = ppgtt->switch_mm(ppgtt, ring, true);
> > > +		ret = ppgtt->switch_mm(ppgtt, ring);
> > >  		if (ret)
> > >  			goto err_out;
> > >  	}
> > > @@ -897,7 +848,7 @@ static int gen7_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
> > >  		if (USES_FULL_PPGTT(dev))
> > >  			continue;
> > >  
> > > -		ret = ppgtt->switch_mm(ppgtt, ring, true);
> > > +		ret = ppgtt->switch_mm(ppgtt, ring);
> > >  		if (ret)
> > >  			return ret;
> > >  	}
> > > @@ -926,7 +877,7 @@ static int gen6_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
> > >  	I915_WRITE(GFX_MODE, _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE));
> > >  
> > >  	for_each_ring(ring, dev_priv, i) {
> > > -		int ret = ppgtt->switch_mm(ppgtt, ring, true);
> > > +		int ret = ppgtt->switch_mm(ppgtt, ring);
> > >  		if (ret)
> > >  			return ret;
> > >  	}
> > > diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h
> > > b/drivers/gpu/drm/i915/i915_gem_gtt.h
> > > index 8d6f7c1..bf1e4fc 100644
> > > --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
> > > +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
> > > @@ -262,8 +262,7 @@ struct i915_hw_ppgtt {
> > >  
> > >  	int (*enable)(struct i915_hw_ppgtt *ppgtt);
> > >  	int (*switch_mm)(struct i915_hw_ppgtt *ppgtt,
> > > -			 struct intel_engine_cs *ring,
> > > -			 bool synchronous);
> > > +			 struct intel_engine_cs *ring);
> > >  	void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file 
> > > *m);  };
> > >  
> > > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c
> > > b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > > index 599709e..e33c2e1 100644
> > > --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> > > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > > @@ -1832,7 +1832,9 @@ int intel_ring_begin(struct intel_engine_cs 
> > > *ring,
> > >  
> > >  	ret = i915_gem_check_wedge(&dev_priv->gpu_error,
> > >  				   dev_priv->mm.interruptible);
> > > -	if (ret)
> > > +
> > > +	/* -EAGAIN means a reset is in progress, it is Ok to continue */
> > > +	if (ret && (ret != -EAGAIN))
> > >  		return ret;
> 
> Oh, I guess that's the tricky bit why the old approach never worked - because reset_in_progress is set we failed the context/ppgtt loading through the rings and screwed up.
> 
> Problem with your approach is that we want to bail out here if a reset is in progress, so we can't just eat the EAGAIN. If we do that we potentially deadlock or overflow the ring.
> 
> I think we need a different hack here, and a few layers down (i.e. at the place where we actually generate that offending -EAGAIN).
> 
> - Around the re-init sequence in the reset function we set
>   dev_priv->mm.reload_in_reset or similar. Since we hold dev->struct_mutex
>   no one will see that, as long as we never leak it out of the critical
>   section.
> 
> - In the ring_begin code that checks for gpu hangs we ignore
>   reset_in_progress if this bit is set.
> 
> - Both places need fairly big comments to explain what exactly is going
>   on.
> 
> Thanks, Daniel
> 
> > >  
> > >  	ret = __intel_ring_prepare(ring, num_dwords * sizeof(uint32_t));
> > > --
> > > 2.0.0
> > > 
> > > _______________________________________________
> > > Intel-gfx mailing list
> > > Intel-gfx@lists.freedesktop.org
> > > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> > 
> > --
> > Ben Widawsky, Intel Open Source Technology Center 
> > _______________________________________________
> > Intel-gfx mailing list
> > Intel-gfx@lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> 
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> +41 (0) 79 365 57 48 - http://blog.ffwll.ch

--
Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] drm/i915: Rework GPU reset sequence to match driver load & thaw
  2014-07-29 17:25         ` Mcaulay, Alistair
@ 2014-07-29 18:12           ` Daniel Vetter
  0 siblings, 0 replies; 30+ messages in thread
From: Daniel Vetter @ 2014-07-29 18:12 UTC (permalink / raw)
  To: Mcaulay, Alistair; +Cc: Ben Widawsky, intel-gfx

On Tue, Jul 29, 2014 at 05:25:48PM +0000, Mcaulay, Alistair wrote:
> 
> drv_suspend,  gem_hangcheck_forcewake are working fine now with PPGTT enabled. Both with and without my patch.
> 
> The results are the same with and without my patch for:
> 
> $ sudo ~/drm_nightly/intel-gpu-tools/tests/drv_hangman 
> IGT-Version: 1.7-g70e6ed9 (x86_64) (Linux: 3.16.0-rc5+ x86_64)
> Subtest error-state-debugfs-entry: SUCCESS
> Subtest error-state-sysfs-entry: SUCCESS
> Subtest ring-stop-sysfs-entry: SUCCESS
> Subtest error-state-basic: SUCCESS
> Subtest error-state-capture-render: SUCCESS
> Test assertion failure function check_error_state, file drv_hangman.c:303:
> Failed assertion: gtt_offset == expected_offset
> Subtest error-state-capture-bsd: FAIL
> Test assertion failure function check_error_state, file drv_hangman.c:303:
> Failed assertion: gtt_offset == expected_offset
> Subtest error-state-capture-blt: FAIL
> Test assertion failure function check_error_state, file drv_hangman.c:303:
> Failed assertion: gtt_offset == expected_offset
> Subtest error-state-capture-vebox: FAIL
> 
> $ sudo ~/drm_nightly/intel-gpu-tools/tests/gem_reset_stats
> IGT-Version: 1.7-g70e6ed9 (x86_64) (Linux: 3.16.0-rc5+ x86_64)
> Subtest params: SUCCESS
> Subtest params-ctx-render: SUCCESS
> Subtest reset-stats-render: SUCCESS
> Subtest reset-stats-ctx-render: SUCCESS
> Subtest ban-render: SUCCESS
> Subtest ban-ctx-render: SUCCESS
> Subtest reset-count-render: SUCCESS
> Subtest reset-count-ctx-render: SUCCESS
> Subtest unrelated-ctx-render: SUCCESS
> Subtest close-pending-render: SUCCESS
> Subtest close-pending-ctx-render: SUCCESS
> <test now hangs>
> 
> 
> Both good without PPGTT.
> I haven't yet seen a regression with the patch.

gem_ringfill might be able to hit the -EAGAIN issue. But that testcase is
missing a subtest with signals to interrupt the ioctls. I'll add that.
-Daniel

> 
> -----Original Message-----
> From: Ben Widawsky [mailto:ben@bwidawsk.net] 
> Sent: Tuesday, July 29, 2014 1:17 AM
> To: Mcaulay, Alistair
> Cc: Daniel Vetter; intel-gfx@lists.freedesktop.org
> Subject: Re: [Intel-gfx] [PATCH] drm/i915: Rework GPU reset sequence to match driver load & thaw
> 
> On Mon, Jul 28, 2014 at 05:12:59PM +0000, Mcaulay, Alistair wrote:
> > Hi Ben / Daniel,
> > I agree that this needs to be properly tested. Are there any particular igt tests you would suggest I use?
> > I've been running:
> > drv_hangman, drv_suspend, gem_hangcheck_forcewake.
> 
> I thought IGT had added some random reset stuff in the way that we have for signals. However, it looks like I imagined it. I guess you can add gem_hang, gem_reset_stats, and tests/debugfs_wedged to that list. Daniel can probably provide any I might have missed.
> 
> The way that igt quiesces everything these days really hurts the ability to test multi-process. If every test starts off with no work, and running the default context, things are pretty trivial. Similarly, running these tests in isolation, even if it isn't quiescing, doesn't help the situation. The way I wrote the code originally was through debugging hangs on a desktop as I developed patches, and not with IGT (though drv_hangman could catch many issues). I'd definitely recommend trying to invoke hangs on a running desktop. I'd advise doing this by modifying mesa to submit a crap batch; I can provide you more details on how to do this if you need it. Also try to disable the quiescing in IGT and run more than these tests in isolation.
> 
> > 
> > Also do you have a set of PPGTT Patches that should work with these 
> > tests? Michel sent me a set of patches to enable PPGTT, but these 3 tests fail with the patches.
> 
> I will try to reproduce this on my patch series when I have some time and if nothing else, that should be a good preparation/refresher for the patch review anyway. The patches I wrote shouldn't have touched much on these paths - not sure if Michel changed anything there.
> 
> With patch on top of what Michel sent you, everything passes?
> 
> > 
> > Thanks,
> > Alistair.
> > 
> > -----Original Message-----
> > From: Daniel Vetter [mailto:daniel.vetter@ffwll.ch] On Behalf Of 
> > Daniel Vetter
> > Sent: Monday, July 28, 2014 10:27 AM
> > To: Ben Widawsky
> > Cc: Mcaulay, Alistair; intel-gfx@lists.freedesktop.org
> > Subject: Re: [Intel-gfx] [PATCH] drm/i915: Rework GPU reset sequence 
> > to match driver load & thaw
> > 
> > On Fri, Jul 25, 2014 at 06:05:29PM -0700, Ben Widawsky wrote:
> > > On Wed, Jul 16, 2014 at 04:05:59PM +0100, alistair.mcaulay@intel.com wrote:
> > > > From: "McAulay, Alistair" <alistair.mcaulay@intel.com>
> > > > 
> > > > > This patch is to address Daniel's concerns over different code during reset:
> > > > 
> > > > > http://lists.freedesktop.org/archives/intel-gfx/2014-June/047758.html
> > > > 
> > > > "The reason for aiming as hard as possible to use the exact same 
> > > > code for driver load, gpu reset and runtime pm/system resume is 
> > > > that we've simply seen too many bugs due to slight variations and unintended omissions."
> > > > 
> > > > Tested using igt drv_hangman.
> > > > 
> > > > Signed-off-by: McAulay, Alistair <alistair.mcaulay@intel.com>
> > > 
> > > 2 quick comments before I actually do a real review.
> > > 1. Did you actually run this with and without full ppgtt?
> > > 2. I don't think this is actually fulfilling what Daniel is 
> > > requesting, though we can let him comment.
> > 
> > Mostly looks like what I think we need. Comments below.
> > 
> > > > 3. Did you really do #1?
> > > 
> > > > Assuming you satisfied #1, can you please post the igt results for 
> > > the permutations (pre patch w/ and w/o ppgtt; post patch w/ and w/o 
> > > ppgtt)
> > > 
> > > I really want this data because I spent *a lot* of time with these 
> > > > specific areas in the PPGTT work, and I am somewhat skeptical that enough 
> > > of the code has changed that this will magically work. I also tried 
> > > the trickiness with the ring handling functions, and never succeeded.
> > > Also, with the context stuff, I'm simply not convinced it can 
> > > magically vanish. If igt looks good, and Daniel agrees that this is 
> > > what he actually wanted, I will go fishing for corner cases and do the review.
> > 
> > I think the hack in ring_begin might explain why it never worked before.
> > But fully agreed, we really need to test this well (and fill gaps if igt misses anything around resets - we don't have any systematic gpu reset coverage anywhere outside of igt).
> > 
> > > 
> > > Thanks.
> > > 
> > > > ---
> > > >  drivers/gpu/drm/i915/i915_gem.c         |  2 -
> > > >  drivers/gpu/drm/i915/i915_gem_context.c | 42 +--------------------
> > > >  drivers/gpu/drm/i915/i915_gem_gtt.c     | 67 +++++----------------------------
> > > >  drivers/gpu/drm/i915/i915_gem_gtt.h     |  3 +-
> > > >  drivers/gpu/drm/i915/intel_ringbuffer.c |  4 +-
> > > >  5 files changed, 14 insertions(+), 104 deletions(-)
> > > > 
> > > > diff --git a/drivers/gpu/drm/i915/i915_gem.c 
> > > > b/drivers/gpu/drm/i915/i915_gem.c index ef047bc..b38e086 100644
> > > > --- a/drivers/gpu/drm/i915/i915_gem.c
> > > > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > > > @@ -2590,8 +2590,6 @@ void i915_gem_reset(struct drm_device *dev)
> > > >  	for_each_ring(ring, dev_priv, i)
> > > >  		i915_gem_reset_ring_cleanup(dev_priv, ring);
> > > >  
> > > > -	i915_gem_context_reset(dev);
> > > > -
> > > >  	i915_gem_restore_fences(dev);
> > > >  }
> > > >  
> > > > diff --git a/drivers/gpu/drm/i915/i915_gem_context.c
> > > > b/drivers/gpu/drm/i915/i915_gem_context.c
> > > > index de72a28..d96219f 100644
> > > > --- a/drivers/gpu/drm/i915/i915_gem_context.c
> > > > +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> > > > @@ -372,42 +372,6 @@ err_destroy:
> > > >  	return ERR_PTR(ret);
> > > >  }
> > > >  
> > > > -void i915_gem_context_reset(struct drm_device *dev) -{
> > > > -	struct drm_i915_private *dev_priv = dev->dev_private;
> > > > -	int i;
> > > > -
> > > > -	/* Prevent the hardware from restoring the last context (which hung) on
> > > > -	 * the next switch */
> > > > -	for (i = 0; i < I915_NUM_RINGS; i++) {
> > > > -		struct intel_engine_cs *ring = &dev_priv->ring[i];
> > > > -		struct intel_context *dctx = ring->default_context;
> > > > -		struct intel_context *lctx = ring->last_context;
> > > > -
> > > > -		/* Do a fake switch to the default context */
> > > > -		if (lctx == dctx)
> > > > -			continue;
> > > > -
> > > > -		if (!lctx)
> > > > -			continue;
> > > > -
> > > > -		if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
> > > > -			WARN_ON(i915_gem_obj_ggtt_pin(dctx->legacy_hw_ctx.rcs_state,
> > > > -						      get_context_alignment(dev), 0));
> > > > -			/* Fake a finish/inactive */
> > > > -			dctx->legacy_hw_ctx.rcs_state->base.write_domain = 0;
> > > > -			dctx->legacy_hw_ctx.rcs_state->active = 0;
> > > > -		}
> > > > -
> > > > -		if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
> > > > -			i915_gem_object_ggtt_unpin(lctx->legacy_hw_ctx.rcs_state);
> > > > -
> > > > -		i915_gem_context_unreference(lctx);
> > > > -		i915_gem_context_reference(dctx);
> > > > -		ring->last_context = dctx;
> > > > -	}
> > > > -}
> > 
> > I don't understand why we no longer need this - after reset we probably have the default context loaded (if we reuse the driver load sequence exactly), so I expect that we must reset the software tracking accordingly.
> > 
> > > > -
> > > >  int i915_gem_context_init(struct drm_device *dev)  {
> > > >  	struct drm_i915_private *dev_priv = dev->dev_private; @@ -498,10
> > > > +462,6 @@ int i915_gem_context_enable(struct drm_i915_private 
> > > > +*dev_priv)
> > > >  		ppgtt->enable(ppgtt);
> > > >  	}
> > > >  
> > > > -	/* FIXME: We should make this work, even in reset */
> > > > -	if (i915_reset_in_progress(&dev_priv->gpu_error))
> > > > -		return 0;
> > > > -
> > > >  	BUG_ON(!dev_priv->ring[RCS].default_context);
> > > >  
> > > >  	for_each_ring(ring, dev_priv, i) { @@ -645,7 +605,7 @@ static 
> > > > int do_switch(struct intel_engine_cs *ring,
> > > >  	from = ring->last_context;
> > > >  
> > > >  	if (USES_FULL_PPGTT(ring->dev)) {
> > > > -		ret = ppgtt->switch_mm(ppgtt, ring, false);
> > > > +		ret = ppgtt->switch_mm(ppgtt, ring);
> > > >  		if (ret)
> > > >  			goto unpin_out;
> > > >  	}
> > > > diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c
> > > > b/drivers/gpu/drm/i915/i915_gem_gtt.c
> > > > index 5188936..450c8a9 100644
> > > > --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> > > > +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> > > > @@ -216,19 +216,12 @@ static gen6_gtt_pte_t 
> > > > iris_pte_encode(dma_addr_t addr,
> > > >  
> > > >  /* Broadwell Page Directory Pointer Descriptors */  static int 
> > > > gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
> > > > -			   uint64_t val, bool synchronous)
> > > > +			   uint64_t val)
> > > >  {
> > > > -	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> > > >  	int ret;
> > > >  
> > > >  	BUG_ON(entry >= 4);
> > > >  
> > > > -	if (synchronous) {
> > > > -		I915_WRITE(GEN8_RING_PDP_UDW(ring, entry), val >> 32);
> > > > -		I915_WRITE(GEN8_RING_PDP_LDW(ring, entry), (u32)val);
> > > > -		return 0;
> > > > -	}
> > > > -
> > > >  	ret = intel_ring_begin(ring, 6);
> > > >  	if (ret)
> > > >  		return ret;
> > > > @@ -245,8 +238,7 @@ static int gen8_write_pdp(struct 
> > > > intel_engine_cs *ring, unsigned entry,  }
> > > >  
> > > >  static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
> > > > -			  struct intel_engine_cs *ring,
> > > > -			  bool synchronous)
> > > > +			  struct intel_engine_cs *ring)
> > > >  {
> > > >  	int i, ret;
> > > >  
> > > > @@ -255,7 +247,7 @@ static int gen8_mm_switch(struct i915_hw_ppgtt 
> > > > *ppgtt,
> > > >  
> > > >  	for (i = used_pd - 1; i >= 0; i--) {
> > > >  		dma_addr_t addr = ppgtt->pd_dma_addr[i];
> > > > -		ret = gen8_write_pdp(ring, i, addr, synchronous);
> > > > +		ret = gen8_write_pdp(ring, i, addr);
> > > >  		if (ret)
> > > >  			return ret;
> > > >  	}
> > > > @@ -724,29 +716,10 @@ static uint32_t get_pd_offset(struct 
> > > > i915_hw_ppgtt *ppgtt)  }
> > > >  
> > > >  static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
> > > > -			 struct intel_engine_cs *ring,
> > > > -			 bool synchronous)
> > > > +			 struct intel_engine_cs *ring)
> > > >  {
> > > > -	struct drm_device *dev = ppgtt->base.dev;
> > > > -	struct drm_i915_private *dev_priv = dev->dev_private;
> > > >  	int ret;
> > > >  
> > > > -	/* If we're in reset, we can assume the GPU is sufficiently idle to
> > > > -	 * manually frob these bits. Ideally we could use the ring functions,
> > > > -	 * except our error handling makes it quite difficult (can't use
> > > > -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
> > > > -	 *
> > > > -	 * FIXME: We should try not to special case reset
> > > > -	 */
> > > > -	if (synchronous ||
> > > > -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
> > > > -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> > > > -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> > > > -		I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt));
> > > > -		POSTING_READ(RING_PP_DIR_BASE(ring));
> > > > -		return 0;
> > > > -	}
> > > > -
> > > >  	/* NB: TLBs must be flushed and invalidated before a switch */
> > > >  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, I915_GEM_GPU_DOMAINS);
> > > >  	if (ret)
> > > > @@ -768,29 +741,10 @@ static int hsw_mm_switch(struct 
> > > > i915_hw_ppgtt *ppgtt,  }
> > > >  
> > > >  static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
> > > > -			  struct intel_engine_cs *ring,
> > > > -			  bool synchronous)
> > > > +			  struct intel_engine_cs *ring)
> > > >  {
> > > > -	struct drm_device *dev = ppgtt->base.dev;
> > > > -	struct drm_i915_private *dev_priv = dev->dev_private;
> > > >  	int ret;
> > > >  
> > > > -	/* If we're in reset, we can assume the GPU is sufficiently idle to
> > > > -	 * manually frob these bits. Ideally we could use the ring functions,
> > > > -	 * except our error handling makes it quite difficult (can't use
> > > > -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
> > > > -	 *
> > > > -	 * FIXME: We should try not to special case reset
> > > > -	 */
> > > > -	if (synchronous ||
> > > > -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
> > > > -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> > > > -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> > > > -		I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt));
> > > > -		POSTING_READ(RING_PP_DIR_BASE(ring));
> > > > -		return 0;
> > > > -	}
> > > > -
> > > >  	/* NB: TLBs must be flushed and invalidated before a switch */
> > > >  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, I915_GEM_GPU_DOMAINS);
> > > >  	if (ret)
> > > > @@ -819,14 +773,11 @@ static int gen7_mm_switch(struct 
> > > > i915_hw_ppgtt *ppgtt,  }
> > > >  
> > > >  static int gen6_mm_switch(struct i915_hw_ppgtt *ppgtt,
> > > > -			  struct intel_engine_cs *ring,
> > > > -			  bool synchronous)
> > > > +			  struct intel_engine_cs *ring)
> > > >  {
> > > >  	struct drm_device *dev = ppgtt->base.dev;
> > > >  	struct drm_i915_private *dev_priv = dev->dev_private;
> > > >  
> > > > -	if (!synchronous)
> > > > -		return 0;
> > > >  
> > > >  	I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> > > >  	I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt)); @@
> > > > -852,7 +803,7 @@ static int gen8_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
> > > >  		if (USES_FULL_PPGTT(dev))
> > > >  			continue;
> > > >  
> > > > -		ret = ppgtt->switch_mm(ppgtt, ring, true);
> > > > +		ret = ppgtt->switch_mm(ppgtt, ring);
> > > >  		if (ret)
> > > >  			goto err_out;
> > > >  	}
> > > > @@ -897,7 +848,7 @@ static int gen7_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
> > > >  		if (USES_FULL_PPGTT(dev))
> > > >  			continue;
> > > >  
> > > > -		ret = ppgtt->switch_mm(ppgtt, ring, true);
> > > > +		ret = ppgtt->switch_mm(ppgtt, ring);
> > > >  		if (ret)
> > > >  			return ret;
> > > >  	}
> > > > @@ -926,7 +877,7 @@ static int gen6_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
> > > >  	I915_WRITE(GFX_MODE, _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE));
> > > >  
> > > >  	for_each_ring(ring, dev_priv, i) {
> > > > -		int ret = ppgtt->switch_mm(ppgtt, ring, true);
> > > > +		int ret = ppgtt->switch_mm(ppgtt, ring);
> > > >  		if (ret)
> > > >  			return ret;
> > > >  	}
> > > > diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h
> > > > b/drivers/gpu/drm/i915/i915_gem_gtt.h
> > > > index 8d6f7c1..bf1e4fc 100644
> > > > --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
> > > > +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
> > > > @@ -262,8 +262,7 @@ struct i915_hw_ppgtt {
> > > >  
> > > >  	int (*enable)(struct i915_hw_ppgtt *ppgtt);
> > > >  	int (*switch_mm)(struct i915_hw_ppgtt *ppgtt,
> > > > -			 struct intel_engine_cs *ring,
> > > > -			 bool synchronous);
> > > > +			 struct intel_engine_cs *ring);
> > > >  	void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file 
> > > > *m);  };
> > > >  
> > > > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c
> > > > b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > > > index 599709e..e33c2e1 100644
> > > > --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> > > > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > > > @@ -1832,7 +1832,9 @@ int intel_ring_begin(struct intel_engine_cs 
> > > > *ring,
> > > >  
> > > >  	ret = i915_gem_check_wedge(&dev_priv->gpu_error,
> > > >  				   dev_priv->mm.interruptible);
> > > > -	if (ret)
> > > > +
> > > > +	/* -EAGAIN means a reset is in progress, it is Ok to continue */
> > > > +	if (ret && (ret != -EAGAIN))
> > > >  		return ret;
> > 
> > Oh, I guess that's the tricky bit why the old approach never worked - because reset_in_progress is set we failed the context/ppgtt loading through the rings and screwed up.
> > 
> > Problem with your approach is that we want to bail out here if a reset is in progress, so we can't just eat the EAGAIN. If we do that we potentially deadlock or overflow the ring.
> > 
> > I think we need a different hack here, and a few layers down (i.e. at the place where we actually generate that offending -EAGAIN).
> > 
> > - Around the re-init sequence in the reset function we set
> >   dev_priv->mm.reload_in_reset or similar. Since we hold dev->struct_mutex
> >   no one will see that, as long as we never leak it out of the critical
> >   section.
> > 
> > - In the ring_begin code that checks for gpu hangs we ignore
> >   reset_in_progress if this bit is set.
> > 
> > - Both places need fairly big comments to explain what exactly is going
> >   on.
> > 
> > Thanks, Daniel
> > 
> > > >  
> > > >  	ret = __intel_ring_prepare(ring, num_dwords * sizeof(uint32_t));
> > > > --
> > > > 2.0.0
> > > > 
> > > > _______________________________________________
> > > > Intel-gfx mailing list
> > > > Intel-gfx@lists.freedesktop.org
> > > > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> > > 
> > > --
> > > Ben Widawsky, Intel Open Source Technology Center 
> > > _______________________________________________
> > > Intel-gfx mailing list
> > > Intel-gfx@lists.freedesktop.org
> > > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> > 
> > --
> > Daniel Vetter
> > Software Engineer, Intel Corporation
> > +41 (0) 79 365 57 48 - http://blog.ffwll.ch
> 
> --
> Ben Widawsky, Intel Open Source Technology Center

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] drm/i915: Rework GPU reset sequence to match driver load & thaw
  2014-07-29 10:32       ` Daniel Vetter
@ 2014-07-30 16:59         ` Mcaulay, Alistair
  2014-07-30 21:00           ` Daniel Vetter
  0 siblings, 1 reply; 30+ messages in thread
From: Mcaulay, Alistair @ 2014-07-30 16:59 UTC (permalink / raw)
  To: Daniel Vetter, Chris Wilson, Ben Widawsky, intel-gfx

Hi Daniel,

could you please be clearer on the change you mean.  I think you mean something functionally equivalent to the code below, but done in a less hacky way.
(This slight change has made no change to test results)
Or is the idea to return at a different point to this?
I couldn't find " dev_priv->mm.reload_in_reset or similar" in the code. The only thing I can find is error->reset_counter,
which is used in check_wedge(). Bottom bit set means RESET_IN_PROGRESS, top bit means WEDGED


 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
 +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
 @@ -1832,7 +1832,9 @@ int intel_ring_begin(struct intel_engine_cs 
 *ring,
  
  	ret = i915_gem_check_wedge(&dev_priv->gpu_error,
  				   dev_priv->mm.interruptible);
 -	if (ret)
 +
 +	/* -EAGAIN means a reset is in progress, it is Ok to return */
 +	if (ret == -EAGAIN)
 + 		return 0;
 + 	if (ret)
 +		return ret;
  
  	ret = __intel_ring_prepare(ring, num_dwords * sizeof(uint32_t));

Alistair.

-----Original Message-----
From: Intel-gfx [mailto:intel-gfx-bounces@lists.freedesktop.org] On Behalf Of Daniel Vetter
Sent: Tuesday, July 29, 2014 11:33 AM
To: Chris Wilson; Daniel Vetter; Ben Widawsky; intel-gfx@lists.freedesktop.org
Subject: Re: [Intel-gfx] [PATCH] drm/i915: Rework GPU reset sequence to match driver load & thaw

On Tue, Jul 29, 2014 at 08:36:33AM +0100, Chris Wilson wrote:
> On Mon, Jul 28, 2014 at 11:26:38AM +0200, Daniel Vetter wrote:
> > Oh, I guess that's the tricky bit why the old approach never worked 
> > - because reset_in_progress is set we failed the context/ppgtt 
> > loading through the rings and screwed up.
> > 
> > Problem with your approach is that we want to bail out here if a 
> > reset is in progress, so we can't just eat the EAGAIN. If we do that 
> > we potentially deadlock or overflow the ring.
> > 
> > I think we need a different hack here, and a few layers down (i.e. 
> > at the place where we actually generate that offending -EAGAIN).
> > 
> > - Around the re-init sequence in the reset function we set
> >   dev_priv->mm.reload_in_reset or similar. Since we hold dev->struct_mutex
> >   no one will see that, as long as we never leak it out of the critical
> >   section.
> > 
> > - In the ring_begin code that checks for gpu hangs we ignore
> >   reset_in_progress if this bit is set.
> > 
> > - Both places need fairly big comments to explain what exactly is going
> >   on.
> 
> This is going from bad to worse. I think you can do better if you 
> looked at the problem afresh.

Well we can't really reset reset_in_progress at that point, since not all reset is done yet. Especially the modeset stuff. So I don't think that reordering the reset sequence would get us out of this ugly spot. And I don't see any other solution really. Do you?
-Daniel
--
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] drm/i915: Rework GPU reset sequence to match driver load & thaw
  2014-07-30 16:59         ` Mcaulay, Alistair
@ 2014-07-30 21:00           ` Daniel Vetter
  2014-07-31 16:37             ` Mcaulay, Alistair
  0 siblings, 1 reply; 30+ messages in thread
From: Daniel Vetter @ 2014-07-30 21:00 UTC (permalink / raw)
  To: Mcaulay, Alistair; +Cc: Ben Widawsky, intel-gfx

On Wed, Jul 30, 2014 at 04:59:33PM +0000, Mcaulay, Alistair wrote:
> Hi Daniel,
> 
> could you please be clearer on the change you mean.  I think you mean something functionally equivalent to the code below, but done in a less hacky way.
> (This slight change has made no change to test results)
> Or is the idea to return at a different point to this?
> I couldn't find " dev_priv->mm.reload_in_reset or similar" in the code. The only thing I can find is error->reset_counter,
> which is used in check_wedge(). Bottom bit set means RESET_IN_PROGRESS, top bit means WEDGED

Well, I meant that you have to add a new dev_priv->mm.reload_in_reset.
And the below won't work since in all other places but when doing a gpu
reset we want the -EAGAIN to reach callers. Actually it's really important
that if we have an -EAGAIN we don't eat it.

And I guess the check for mm.reload_in_reset should actually be in
gem_check_wedged.
-Daniel

> 
> 
>  --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
>  +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
>  @@ -1832,7 +1832,9 @@ int intel_ring_begin(struct intel_engine_cs 
>  *ring,
>   
>   	ret = i915_gem_check_wedge(&dev_priv->gpu_error,
>   				   dev_priv->mm.interruptible);
>  -	if (ret)
>  +
>  +	/* -EAGAIN means a reset is in progress, it is Ok to return */
>  +	if (ret == -EAGAIN)
>  + 		return 0;
>  + 	if (ret)
>  +		return ret;
>   
>   	ret = __intel_ring_prepare(ring, num_dwords * sizeof(uint32_t));
> 
> Alistair.
> 
> -----Original Message-----
> From: Intel-gfx [mailto:intel-gfx-bounces@lists.freedesktop.org] On Behalf Of Daniel Vetter
> Sent: Tuesday, July 29, 2014 11:33 AM
> To: Chris Wilson; Daniel Vetter; Ben Widawsky; intel-gfx@lists.freedesktop.org
> Subject: Re: [Intel-gfx] [PATCH] drm/i915: Rework GPU reset sequence to match driver load & thaw
> 
> On Tue, Jul 29, 2014 at 08:36:33AM +0100, Chris Wilson wrote:
> > On Mon, Jul 28, 2014 at 11:26:38AM +0200, Daniel Vetter wrote:
> > > Oh, I guess that's the tricky bit why the old approach never worked 
> > > - because reset_in_progress is set we failed the context/ppgtt 
> > > loading through the rings and screwed up.
> > > 
> > > Problem with your approach is that we want to bail out here if a 
> > > reset is in progress, so we can't just eat the EAGAIN. If we do that 
> > > we potentially deadlock or overflow the ring.
> > > 
> > > I think we need a different hack here, and a few layers down (i.e. 
> > > at the place where we actually generate that offending -EAGAIN).
> > > 
> > > > - Around the re-init sequence in the reset function we set
> > > >   dev_priv->mm.reload_in_reset or similar. Since we hold dev->struct_mutex
> > > >   no one will see that, as long as we never leak it out of the critical
> > > >   section.
> > > 
> > > - In the ring_begin code that checks for gpu hangs we ignore
> > >   reset_in_progress if this bit is set.
> > > 
> > > - Both places need fairly big comments to explain what exactly is going
> > >   on.
> > 
> > This is going from bad to worse. I think you can do better if you 
> > looked at the problem afresh.
> 
> Well we can't really reset reset_in_progress at that point, since not all reset is done yet. Especially the modeset stuff. So I don't think that reordering the reset sequence would get us out of this ugly spot. And I don't see any other solution really. Do you?
> -Daniel
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> +41 (0) 79 365 57 48 - http://blog.ffwll.ch
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] drm/i915: Rework GPU reset sequence to match driver load & thaw
  2014-07-30 21:00           ` Daniel Vetter
@ 2014-07-31 16:37             ` Mcaulay, Alistair
  2014-08-04  7:52               ` Daniel Vetter
  0 siblings, 1 reply; 30+ messages in thread
From: Mcaulay, Alistair @ 2014-07-31 16:37 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: Ben Widawsky, intel-gfx

Hi Daniel,

Something more like this then?  (and revert the change to intel_ring_begin(), putting it back to how it was )

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 991b663..b811ff2 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1217,6 +1217,9 @@ struct i915_gpu_error {
 
 	/* For missed irq/seqno simulation. */
 	unsigned int test_irq_rings;
+
+	/* Used to prevent gem_check_wedged returning -EAGAIN during gpu reset   */
+	bool reload_in_progress;
 };
 
 enum modeset_restore {
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index b38e086..a25d3b5 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1085,7 +1085,9 @@ i915_gem_check_wedge(struct i915_gpu_error *error,
 		if (i915_terminally_wedged(error))
 			return -EIO;
 
-		return -EAGAIN;
+		/* Check if GPU Reset is in progress */
+		if (!error->reload_in_reset)
+			return -EAGAIN;
 	}
 
 	return 0;
@@ -2579,6 +2581,8 @@ void i915_gem_reset(struct drm_device *dev)
 	struct intel_engine_cs *ring;
 	int i;
 
+	/* Used to prevent gem_check_wedged returning -EAGAIN during gpu reset */
+	dev_priv->gpu_error.reload_in_reset = true;
 	/*
 	 * Before we free the objects from the requests, we need to inspect
 	 * them for finding the guilty party. As the requests only borrow
@@ -2591,6 +2595,8 @@ void i915_gem_reset(struct drm_device *dev)
 		i915_gem_reset_ring_cleanup(dev_priv, ring);
 
 	i915_gem_restore_fences(dev);
+
+	dev_priv->gpu_error.reload_in_reset = false;
 }


-----Original Message-----
From: Daniel Vetter [mailto:daniel.vetter@ffwll.ch] On Behalf Of Daniel Vetter
Sent: Wednesday, July 30, 2014 10:01 PM
To: Mcaulay, Alistair
Cc: Daniel Vetter; Chris Wilson; Ben Widawsky; intel-gfx@lists.freedesktop.org
Subject: Re: [Intel-gfx] [PATCH] drm/i915: Rework GPU reset sequence to match driver load & thaw

On Wed, Jul 30, 2014 at 04:59:33PM +0000, Mcaulay, Alistair wrote:
> Hi Daniel,
> 
> could you please be clearer on the change you mean.  I think you mean something functionally equivalent to the code below, but done in a less hacky way.
> (This slight change has made no change to test results) Or is the idea 
> to return at a different point to this?
> I couldn't find " dev_priv->mm.reload_in_reset or similar" in the 
> code. The only thing I can find is error->reset_counter, which is used 
> in check_wedge(). Bottom bit set means RESET_IN_PROGRESS, top bit 
> means WEDGED

Well, I meant that you have to add a new dev_priv->mm.reload_in_reset.
And the below won't work since in all other places but when doing a gpu reset we want the -EAGAIN to reach callers. Actually it's really important that if we have an -EAGAIN we don't eat it.

And I guess the check for mm.reload_in_reset should actually be in gem_check_wedged.
-Daniel

> 
> 
>  --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
>  +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
>  @@ -1832,7 +1832,9 @@ int intel_ring_begin(struct intel_engine_cs  
> *ring,
>   
>   	ret = i915_gem_check_wedge(&dev_priv->gpu_error,
>   				   dev_priv->mm.interruptible);
>  -	if (ret)
>  +
>  +	/* -EAGAIN means a reset is in progress, it is Ok to return */
>  +	if (ret == -EAGAIN)
>  + 		return 0;
>  + 	if (ret)
>  +		return ret;
>   
>   	ret = __intel_ring_prepare(ring, num_dwords * sizeof(uint32_t));
> 
> Alistair.
> 
> -----Original Message-----
> From: Intel-gfx [mailto:intel-gfx-bounces@lists.freedesktop.org] On 
> Behalf Of Daniel Vetter
> Sent: Tuesday, July 29, 2014 11:33 AM
> To: Chris Wilson; Daniel Vetter; Ben Widawsky; 
> intel-gfx@lists.freedesktop.org
> Subject: Re: [Intel-gfx] [PATCH] drm/i915: Rework GPU reset sequence 
> to match driver load & thaw
> 
> On Tue, Jul 29, 2014 at 08:36:33AM +0100, Chris Wilson wrote:
> > On Mon, Jul 28, 2014 at 11:26:38AM +0200, Daniel Vetter wrote:
> > > Oh, I guess that's the tricky bit why the old approach never 
> > > worked
> > > - because reset_in_progress is set we failed the context/ppgtt 
> > > loading through the rings and screwed up.
> > > 
> > > Problem with your approach is that we want to bail out here if a 
> > > reset is in progress, so we can't just eat the EAGAIN. If we do 
> > > that we potentially deadlock or overflow the ring.
> > > 
> > > I think we need a different hack here, and a few layers down (i.e. 
> > > at the place where we actually generate that offending -EAGAIN).
> > > 
> > > > - Around the re-init sequence in the reset function we set
> > > >   dev_priv->mm.reload_in_reset or similar. Since we hold dev->struct_mutex
> > > >   no one will see that, as long as we never leak it out of the critical
> > > >   section.
> > > 
> > > - In the ring_begin code that checks for gpu hangs we ignore
> > >   reset_in_progress if this bit is set.
> > > 
> > > - Both places need fairly big comments to explain what exactly is going
> > >   on.
> > 
> > This is going from bad to worse. I think you can do better if you 
> > looked at the problem afresh.
> 
> Well we can't really reset reset_in_progress at that point, since not all reset is done yet. Especially the modeset stuff. So I don't think that reordering the reset sequence would get us out of this ugly spot. And I don't see any other solution really. Do you?
> -Daniel
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> +41 (0) 79 365 57 48 - http://blog.ffwll.ch
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

--
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [PATCH] drm/i915: Rework GPU reset sequence to match driver load & thaw
  2014-07-31 16:37             ` Mcaulay, Alistair
@ 2014-08-04  7:52               ` Daniel Vetter
  0 siblings, 0 replies; 30+ messages in thread
From: Daniel Vetter @ 2014-08-04  7:52 UTC (permalink / raw)
  To: Mcaulay, Alistair; +Cc: Ben Widawsky, intel-gfx

On Thu, Jul 31, 2014 at 04:37:14PM +0000, Mcaulay, Alistair wrote:
> Hi Daniel,
> 
> Something more like this then?  (and revert the change to intel_ring_begin(), putting it back to how it was )

Yeah, roughly. Except that I would place the reload_in_reset wrapping in
the i915_reset function. It is paramount that we never leak this outside
of the dev->struct_mutex protection so that other threads can't ever
observe this to be set. So putting it right next to the mutex locking is
better.

Also I think you've wrapped the wrong function - the re-init is done in
i915_gem_init_hw, this here just resets the software state (mostly) and is
done before the actual gpu hw reset is done. gem_init_hw is only run if
the reset succeeds.
-Daniel

> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 991b663..b811ff2 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1217,6 +1217,9 @@ struct i915_gpu_error {
>  
>  	/* For missed irq/seqno simulation. */
>  	unsigned int test_irq_rings;
> +
> +	/* Used to prevent gem_check_wedged returning -EAGAIN during gpu reset   */
> +	bool reload_in_progress;
>  };
>  
>  enum modeset_restore {
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index b38e086..a25d3b5 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1085,7 +1085,9 @@ i915_gem_check_wedge(struct i915_gpu_error *error,
>  		if (i915_terminally_wedged(error))
>  			return -EIO;
>  
> -		return -EAGAIN;
> +		/* Check if GPU Reset is in progress */
> +		if (!error->reload_in_reset)
> +			return -EAGAIN;
>  	}
>  
>  	return 0;
> @@ -2579,6 +2581,8 @@ void i915_gem_reset(struct drm_device *dev)
>  	struct intel_engine_cs *ring;
>  	int i;
>  
> +	/* Used to prevent gem_check_wedged returning -EAGAIN during gpu reset */
> +	dev_priv->gpu_error.reload_in_reset = true;
>  	/*
>  	 * Before we free the objects from the requests, we need to inspect
>  	 * them for finding the guilty party. As the requests only borrow
> @@ -2591,6 +2595,8 @@ void i915_gem_reset(struct drm_device *dev)
>  		i915_gem_reset_ring_cleanup(dev_priv, ring);
>  
>  	i915_gem_restore_fences(dev);
> +
> +	dev_priv->gpu_error.reload_in_reset = false;
>  }
> 
> 
> -----Original Message-----
> From: Daniel Vetter [mailto:daniel.vetter@ffwll.ch] On Behalf Of Daniel Vetter
> Sent: Wednesday, July 30, 2014 10:01 PM
> To: Mcaulay, Alistair
> Cc: Daniel Vetter; Chris Wilson; Ben Widawsky; intel-gfx@lists.freedesktop.org
> Subject: Re: [Intel-gfx] [PATCH] drm/i915: Rework GPU reset sequence to match driver load & thaw
> 
> On Wed, Jul 30, 2014 at 04:59:33PM +0000, Mcaulay, Alistair wrote:
> > Hi Daniel,
> > 
> > could you please be clearer on the change you mean.  I think you mean something functionally equivalent to the code below, but done in a less hacky way.
> > (This slight change has made no change to test results) Or is the idea 
> > to return at a different point to this?
> > I couldn't find " dev_priv->mm.reload_in_reset or similar" in the 
> > code. The only thing I can find is error->reset_counter, which is used 
> > in check_wedge(). Bottom bit set means RESET_IN_PROGRESS, top bit 
> > means WEDGED
> 
> Well, I meant that you have to add a new dev_priv->mm.reload_in_reset.
> And the below won't work since in all other places but when doing a gpu reset we want the -EAGAIN to reach callers. Actually it's really important that if we have an -EAGAIN we don't eat it.
> 
> And I guess the check for mm.reload_in_reset should actually be in gem_check_wedged.
> -Daniel
> 
> > 
> > 
> >  --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> >  +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> >  @@ -1832,7 +1832,9 @@ int intel_ring_begin(struct intel_engine_cs  
> > *ring,
> >   
> >   	ret = i915_gem_check_wedge(&dev_priv->gpu_error,
> >   				   dev_priv->mm.interruptible);
> >  -	if (ret)
> >  +
> >  +	/* -EAGAIN means a reset is in progress, it is Ok to return */
> >  +	if (ret == -EAGAIN)
> >  + 		return 0;
> >  + 	if (ret)
> >  +		return ret;
> >   
> >   	ret = __intel_ring_prepare(ring, num_dwords * sizeof(uint32_t));
> > 
> > Alistair.
> > 
> > -----Original Message-----
> > From: Intel-gfx [mailto:intel-gfx-bounces@lists.freedesktop.org] On 
> > Behalf Of Daniel Vetter
> > Sent: Tuesday, July 29, 2014 11:33 AM
> > To: Chris Wilson; Daniel Vetter; Ben Widawsky; 
> > intel-gfx@lists.freedesktop.org
> > Subject: Re: [Intel-gfx] [PATCH] drm/i915: Rework GPU reset sequence 
> > to match driver load & thaw
> > 
> > On Tue, Jul 29, 2014 at 08:36:33AM +0100, Chris Wilson wrote:
> > > On Mon, Jul 28, 2014 at 11:26:38AM +0200, Daniel Vetter wrote:
> > > > Oh, I guess that's the tricky bit why the old approach never 
> > > > worked
> > > > - because reset_in_progress is set we failed the context/ppgtt 
> > > > loading through the rings and screwed up.
> > > > 
> > > > Problem with your approach is that we want to bail out here if a 
> > > > reset is in progress, so we can't just eat the EAGAIN. If we do 
> > > > that we potentially deadlock or overflow the ring.
> > > > 
> > > > I think we need a different hack here, and a few layers down (i.e. 
> > > > at the place where we actually generate that offending -EAGAIN).
> > > > 
> > > > > - Around the re-init sequence in the reset function we set
> > > > >   dev_priv->mm.reload_in_reset or similar. Since we hold dev->struct_mutex
> > > > >   no one will see that, as long as we never leak it out of the critical
> > > > >   section.
> > > > 
> > > > - In the ring_begin code that checks for gpu hangs we ignore
> > > >   reset_in_progress if this bit is set.
> > > > 
> > > > - Both places need fairly big comments to explain what exactly is going
> > > >   on.
> > > 
> > > This is going from bad to worse. I think you can do better if you 
> > > looked at the problem afresh.
> > 
> > Well we can't really reset reset_in_progress at that point, since not all reset is done yet. Especially the modeset stuff. So I don't think that reordering the reset sequence would get us out of this ugly spot. And I don't see any other solution really. Do you?
> > -Daniel
> > --
> > Daniel Vetter
> > Software Engineer, Intel Corporation
> > +41 (0) 79 365 57 48 - http://blog.ffwll.ch
> > _______________________________________________
> > Intel-gfx mailing list
> > Intel-gfx@lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> 
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> +41 (0) 79 365 57 48 - http://blog.ffwll.ch

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH v2] drm/i915: Rework GPU reset sequence to match driver load & thaw
  2014-07-16 15:05 [PATCH] drm/i915: Rework GPU reset sequence to match driver load & thaw alistair.mcaulay
  2014-07-26  1:05 ` Ben Widawsky
@ 2014-08-05  8:47 ` alistair.mcaulay
  2014-08-06 12:58   ` Mcaulay, Alistair
  2014-08-06 16:24   ` Mika Kuoppala
  1 sibling, 2 replies; 30+ messages in thread
From: alistair.mcaulay @ 2014-08-05  8:47 UTC (permalink / raw)
  To: intel-gfx

From: "McAulay, Alistair" <alistair.mcaulay@intel.com>

This patch is to address Daniel's concerns over different code during reset:

http://lists.freedesktop.org/archives/intel-gfx/2014-June/047758.html

"The reason for aiming as hard as possible to use the exact same code for
driver load, gpu reset and runtime pm/system resume is that we've simply
seen too many bugs due to slight variations and unintended omissions."

Tested using igt drv_hangman.

V2: Cleaner way of preventing check_wedge returning -EAGAIN

Signed-off-by: McAulay, Alistair <alistair.mcaulay@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c         |  6 +++
 drivers/gpu/drm/i915/i915_drv.h         |  3 ++
 drivers/gpu/drm/i915/i915_gem.c         |  6 +--
 drivers/gpu/drm/i915/i915_gem_context.c | 42 +--------------------
 drivers/gpu/drm/i915/i915_gem_gtt.c     | 67 +++++----------------------------
 drivers/gpu/drm/i915/i915_gem_gtt.h     |  3 +-
 6 files changed, 23 insertions(+), 104 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 5e4fefd..3bfafe6 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -806,7 +806,13 @@ int i915_reset(struct drm_device *dev)
 			!dev_priv->ums.mm_suspended) {
 		dev_priv->ums.mm_suspended = 0;
 
+		/* Used to prevent gem_check_wedged returning -EAGAIN during gpu reset */
+		dev_priv->gpu_error.reload_in_reset = true;
+
 		ret = i915_gem_init_hw(dev);
+
+		dev_priv->gpu_error.reload_in_reset = false;
+
 		mutex_unlock(&dev->struct_mutex);
 		if (ret) {
 			DRM_ERROR("Failed hw init on reset %d\n", ret);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 991b663..116daff 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1217,6 +1217,9 @@ struct i915_gpu_error {
 
 	/* For missed irq/seqno simulation. */
 	unsigned int test_irq_rings;
+
+	/* Used to prevent gem_check_wedged returning -EAGAIN during gpu reset   */
+	bool reload_in_reset;
 };
 
 enum modeset_restore {
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index ef047bc..14e1770 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1085,7 +1085,9 @@ i915_gem_check_wedge(struct i915_gpu_error *error,
 		if (i915_terminally_wedged(error))
 			return -EIO;
 
-		return -EAGAIN;
+		/* Check if GPU Reset is in progress */
+		if (!error->reload_in_reset)
+			return -EAGAIN;
 	}
 
 	return 0;
@@ -2590,8 +2592,6 @@ void i915_gem_reset(struct drm_device *dev)
 	for_each_ring(ring, dev_priv, i)
 		i915_gem_reset_ring_cleanup(dev_priv, ring);
 
-	i915_gem_context_reset(dev);
-
 	i915_gem_restore_fences(dev);
 }
 
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index de72a28..d96219f 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -372,42 +372,6 @@ err_destroy:
 	return ERR_PTR(ret);
 }
 
-void i915_gem_context_reset(struct drm_device *dev)
-{
-	struct drm_i915_private *dev_priv = dev->dev_private;
-	int i;
-
-	/* Prevent the hardware from restoring the last context (which hung) on
-	 * the next switch */
-	for (i = 0; i < I915_NUM_RINGS; i++) {
-		struct intel_engine_cs *ring = &dev_priv->ring[i];
-		struct intel_context *dctx = ring->default_context;
-		struct intel_context *lctx = ring->last_context;
-
-		/* Do a fake switch to the default context */
-		if (lctx == dctx)
-			continue;
-
-		if (!lctx)
-			continue;
-
-		if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
-			WARN_ON(i915_gem_obj_ggtt_pin(dctx->legacy_hw_ctx.rcs_state,
-						      get_context_alignment(dev), 0));
-			/* Fake a finish/inactive */
-			dctx->legacy_hw_ctx.rcs_state->base.write_domain = 0;
-			dctx->legacy_hw_ctx.rcs_state->active = 0;
-		}
-
-		if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
-			i915_gem_object_ggtt_unpin(lctx->legacy_hw_ctx.rcs_state);
-
-		i915_gem_context_unreference(lctx);
-		i915_gem_context_reference(dctx);
-		ring->last_context = dctx;
-	}
-}
-
 int i915_gem_context_init(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
@@ -498,10 +462,6 @@ int i915_gem_context_enable(struct drm_i915_private *dev_priv)
 		ppgtt->enable(ppgtt);
 	}
 
-	/* FIXME: We should make this work, even in reset */
-	if (i915_reset_in_progress(&dev_priv->gpu_error))
-		return 0;
-
 	BUG_ON(!dev_priv->ring[RCS].default_context);
 
 	for_each_ring(ring, dev_priv, i) {
@@ -645,7 +605,7 @@ static int do_switch(struct intel_engine_cs *ring,
 	from = ring->last_context;
 
 	if (USES_FULL_PPGTT(ring->dev)) {
-		ret = ppgtt->switch_mm(ppgtt, ring, false);
+		ret = ppgtt->switch_mm(ppgtt, ring);
 		if (ret)
 			goto unpin_out;
 	}
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 5188936..450c8a9 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -216,19 +216,12 @@ static gen6_gtt_pte_t iris_pte_encode(dma_addr_t addr,
 
 /* Broadwell Page Directory Pointer Descriptors */
 static int gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
-			   uint64_t val, bool synchronous)
+			   uint64_t val)
 {
-	struct drm_i915_private *dev_priv = ring->dev->dev_private;
 	int ret;
 
 	BUG_ON(entry >= 4);
 
-	if (synchronous) {
-		I915_WRITE(GEN8_RING_PDP_UDW(ring, entry), val >> 32);
-		I915_WRITE(GEN8_RING_PDP_LDW(ring, entry), (u32)val);
-		return 0;
-	}
-
 	ret = intel_ring_begin(ring, 6);
 	if (ret)
 		return ret;
@@ -245,8 +238,7 @@ static int gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
 }
 
 static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
-			  struct intel_engine_cs *ring,
-			  bool synchronous)
+			  struct intel_engine_cs *ring)
 {
 	int i, ret;
 
@@ -255,7 +247,7 @@ static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
 
 	for (i = used_pd - 1; i >= 0; i--) {
 		dma_addr_t addr = ppgtt->pd_dma_addr[i];
-		ret = gen8_write_pdp(ring, i, addr, synchronous);
+		ret = gen8_write_pdp(ring, i, addr);
 		if (ret)
 			return ret;
 	}
@@ -724,29 +716,10 @@ static uint32_t get_pd_offset(struct i915_hw_ppgtt *ppgtt)
 }
 
 static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
-			 struct intel_engine_cs *ring,
-			 bool synchronous)
+			 struct intel_engine_cs *ring)
 {
-	struct drm_device *dev = ppgtt->base.dev;
-	struct drm_i915_private *dev_priv = dev->dev_private;
 	int ret;
 
-	/* If we're in reset, we can assume the GPU is sufficiently idle to
-	 * manually frob these bits. Ideally we could use the ring functions,
-	 * except our error handling makes it quite difficult (can't use
-	 * intel_ring_begin, ring->flush, or intel_ring_advance)
-	 *
-	 * FIXME: We should try not to special case reset
-	 */
-	if (synchronous ||
-	    i915_reset_in_progress(&dev_priv->gpu_error)) {
-		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
-		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
-		I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt));
-		POSTING_READ(RING_PP_DIR_BASE(ring));
-		return 0;
-	}
-
 	/* NB: TLBs must be flushed and invalidated before a switch */
 	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, I915_GEM_GPU_DOMAINS);
 	if (ret)
@@ -768,29 +741,10 @@ static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
 }
 
 static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
-			  struct intel_engine_cs *ring,
-			  bool synchronous)
+			  struct intel_engine_cs *ring)
 {
-	struct drm_device *dev = ppgtt->base.dev;
-	struct drm_i915_private *dev_priv = dev->dev_private;
 	int ret;
 
-	/* If we're in reset, we can assume the GPU is sufficiently idle to
-	 * manually frob these bits. Ideally we could use the ring functions,
-	 * except our error handling makes it quite difficult (can't use
-	 * intel_ring_begin, ring->flush, or intel_ring_advance)
-	 *
-	 * FIXME: We should try not to special case reset
-	 */
-	if (synchronous ||
-	    i915_reset_in_progress(&dev_priv->gpu_error)) {
-		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
-		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
-		I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt));
-		POSTING_READ(RING_PP_DIR_BASE(ring));
-		return 0;
-	}
-
 	/* NB: TLBs must be flushed and invalidated before a switch */
 	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, I915_GEM_GPU_DOMAINS);
 	if (ret)
@@ -819,14 +773,11 @@ static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
 }
 
 static int gen6_mm_switch(struct i915_hw_ppgtt *ppgtt,
-			  struct intel_engine_cs *ring,
-			  bool synchronous)
+			  struct intel_engine_cs *ring)
 {
 	struct drm_device *dev = ppgtt->base.dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
-	if (!synchronous)
-		return 0;
 
 	I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
 	I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt));
@@ -852,7 +803,7 @@ static int gen8_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
 		if (USES_FULL_PPGTT(dev))
 			continue;
 
-		ret = ppgtt->switch_mm(ppgtt, ring, true);
+		ret = ppgtt->switch_mm(ppgtt, ring);
 		if (ret)
 			goto err_out;
 	}
@@ -897,7 +848,7 @@ static int gen7_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
 		if (USES_FULL_PPGTT(dev))
 			continue;
 
-		ret = ppgtt->switch_mm(ppgtt, ring, true);
+		ret = ppgtt->switch_mm(ppgtt, ring);
 		if (ret)
 			return ret;
 	}
@@ -926,7 +877,7 @@ static int gen6_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
 	I915_WRITE(GFX_MODE, _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE));
 
 	for_each_ring(ring, dev_priv, i) {
-		int ret = ppgtt->switch_mm(ppgtt, ring, true);
+		int ret = ppgtt->switch_mm(ppgtt, ring);
 		if (ret)
 			return ret;
 	}
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
index 8d6f7c1..bf1e4fc 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.h
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
@@ -262,8 +262,7 @@ struct i915_hw_ppgtt {
 
 	int (*enable)(struct i915_hw_ppgtt *ppgtt);
 	int (*switch_mm)(struct i915_hw_ppgtt *ppgtt,
-			 struct intel_engine_cs *ring,
-			 bool synchronous);
+			 struct intel_engine_cs *ring);
 	void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file *m);
 };
 
-- 
2.0.0

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [PATCH v2] drm/i915: Rework GPU reset sequence to match driver load & thaw
  2014-08-05  8:47 ` [PATCH v2] " alistair.mcaulay
@ 2014-08-06 12:58   ` Mcaulay, Alistair
  2014-08-06 16:24   ` Mika Kuoppala
  1 sibling, 0 replies; 30+ messages in thread
From: Mcaulay, Alistair @ 2014-08-06 12:58 UTC (permalink / raw)
  To: Daniel Vetter (daniel@ffwll.ch); +Cc: intel-gfx


Hi Daniel,

I think this new patch fixes your issues with the previous one. Could you please let me know?

Thanks,
Alistair.

> -----Original Message-----
> From: Mcaulay, Alistair
> Sent: Tuesday, August 05, 2014 9:47 AM
> To: intel-gfx@lists.freedesktop.org
> Cc: Mcaulay, Alistair
> Subject: [PATCH v2] drm/i915: Rework GPU reset sequence to match driver
> load & thaw
> 
> From: "McAulay, Alistair" <alistair.mcaulay@intel.com>
> 
> This patch is to address Daniels concerns over different code during reset:
> 
> http://lists.freedesktop.org/archives/intel-gfx/2014-June/047758.html
> 
> "The reason for aiming as hard as possible to use the exact same code for
> driver load, gpu reset and runtime pm/system resume is that we've simply
> seen too many bugs due to slight variations and unintended omissions."
> 
> Tested using igt drv_hangman.
> 
> V2: Cleaner way of preventing check_wedge returning -EAGAIN
> 
> Signed-off-by: McAulay, Alistair <alistair.mcaulay@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.c         |  6 +++
>  drivers/gpu/drm/i915/i915_drv.h         |  3 ++
>  drivers/gpu/drm/i915/i915_gem.c         |  6 +--
>  drivers/gpu/drm/i915/i915_gem_context.c | 42 +--------------------
>  drivers/gpu/drm/i915/i915_gem_gtt.c     | 67 +++++----------------------------
>  drivers/gpu/drm/i915/i915_gem_gtt.h     |  3 +-
>  6 files changed, 23 insertions(+), 104 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.c
> b/drivers/gpu/drm/i915/i915_drv.c index 5e4fefd..3bfafe6 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -806,7 +806,13 @@ int i915_reset(struct drm_device *dev)
>  			!dev_priv->ums.mm_suspended) {
>  		dev_priv->ums.mm_suspended = 0;
> 
> +		/* Used to prevent gem_check_wedged returning -EAGAIN
> during gpu reset */
> +		dev_priv->gpu_error.reload_in_reset = true;
> +
>  		ret = i915_gem_init_hw(dev);
> +
> +		dev_priv->gpu_error.reload_in_reset = false;
> +
>  		mutex_unlock(&dev->struct_mutex);
>  		if (ret) {
>  			DRM_ERROR("Failed hw init on reset %d\n", ret); diff
> --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 991b663..116daff 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1217,6 +1217,9 @@ struct i915_gpu_error {
> 
>  	/* For missed irq/seqno simulation. */
>  	unsigned int test_irq_rings;
> +
> +	/* Used to prevent gem_check_wedged returning -EAGAIN during
> gpu reset   */
> +	bool reload_in_reset;
>  };
> 
>  enum modeset_restore {
> diff --git a/drivers/gpu/drm/i915/i915_gem.c
> b/drivers/gpu/drm/i915/i915_gem.c index ef047bc..14e1770 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1085,7 +1085,9 @@ i915_gem_check_wedge(struct i915_gpu_error
> *error,
>  		if (i915_terminally_wedged(error))
>  			return -EIO;
> 
> -		return -EAGAIN;
> +		/* Check if GPU Reset is in progress */
> +		if (!error->reload_in_reset)
> +			return -EAGAIN;
>  	}
> 
>  	return 0;
> @@ -2590,8 +2592,6 @@ void i915_gem_reset(struct drm_device *dev)
>  	for_each_ring(ring, dev_priv, i)
>  		i915_gem_reset_ring_cleanup(dev_priv, ring);
> 
> -	i915_gem_context_reset(dev);
> -
>  	i915_gem_restore_fences(dev);
>  }
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c
> b/drivers/gpu/drm/i915/i915_gem_context.c
> index de72a28..d96219f 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -372,42 +372,6 @@ err_destroy:
>  	return ERR_PTR(ret);
>  }
> 
> -void i915_gem_context_reset(struct drm_device *dev) -{
> -	struct drm_i915_private *dev_priv = dev->dev_private;
> -	int i;
> -
> -	/* Prevent the hardware from restoring the last context (which
> hung) on
> -	 * the next switch */
> -	for (i = 0; i < I915_NUM_RINGS; i++) {
> -		struct intel_engine_cs *ring = &dev_priv->ring[i];
> -		struct intel_context *dctx = ring->default_context;
> -		struct intel_context *lctx = ring->last_context;
> -
> -		/* Do a fake switch to the default context */
> -		if (lctx == dctx)
> -			continue;
> -
> -		if (!lctx)
> -			continue;
> -
> -		if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
> -			WARN_ON(i915_gem_obj_ggtt_pin(dctx-
> >legacy_hw_ctx.rcs_state,
> -
> get_context_alignment(dev), 0));
> -			/* Fake a finish/inactive */
> -			dctx->legacy_hw_ctx.rcs_state->base.write_domain
> = 0;
> -			dctx->legacy_hw_ctx.rcs_state->active = 0;
> -		}
> -
> -		if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
> -			i915_gem_object_ggtt_unpin(lctx-
> >legacy_hw_ctx.rcs_state);
> -
> -		i915_gem_context_unreference(lctx);
> -		i915_gem_context_reference(dctx);
> -		ring->last_context = dctx;
> -	}
> -}
> -
>  int i915_gem_context_init(struct drm_device *dev)  {
>  	struct drm_i915_private *dev_priv = dev->dev_private; @@ -498,10
> +462,6 @@ int i915_gem_context_enable(struct drm_i915_private
> *dev_priv)
>  		ppgtt->enable(ppgtt);
>  	}
> 
> -	/* FIXME: We should make this work, even in reset */
> -	if (i915_reset_in_progress(&dev_priv->gpu_error))
> -		return 0;
> -
>  	BUG_ON(!dev_priv->ring[RCS].default_context);
> 
>  	for_each_ring(ring, dev_priv, i) {
> @@ -645,7 +605,7 @@ static int do_switch(struct intel_engine_cs *ring,
>  	from = ring->last_context;
> 
>  	if (USES_FULL_PPGTT(ring->dev)) {
> -		ret = ppgtt->switch_mm(ppgtt, ring, false);
> +		ret = ppgtt->switch_mm(ppgtt, ring);
>  		if (ret)
>  			goto unpin_out;
>  	}
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c
> b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index 5188936..450c8a9 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -216,19 +216,12 @@ static gen6_gtt_pte_t
> iris_pte_encode(dma_addr_t addr,
> 
>  /* Broadwell Page Directory Pointer Descriptors */  static int
> gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
> -			   uint64_t val, bool synchronous)
> +			   uint64_t val)
>  {
> -	struct drm_i915_private *dev_priv = ring->dev->dev_private;
>  	int ret;
> 
>  	BUG_ON(entry >= 4);
> 
> -	if (synchronous) {
> -		I915_WRITE(GEN8_RING_PDP_UDW(ring, entry), val >> 32);
> -		I915_WRITE(GEN8_RING_PDP_LDW(ring, entry), (u32)val);
> -		return 0;
> -	}
> -
>  	ret = intel_ring_begin(ring, 6);
>  	if (ret)
>  		return ret;
> @@ -245,8 +238,7 @@ static int gen8_write_pdp(struct intel_engine_cs
> *ring, unsigned entry,  }
> 
>  static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
> -			  struct intel_engine_cs *ring,
> -			  bool synchronous)
> +			  struct intel_engine_cs *ring)
>  {
>  	int i, ret;
> 
> @@ -255,7 +247,7 @@ static int gen8_mm_switch(struct i915_hw_ppgtt
> *ppgtt,
> 
>  	for (i = used_pd - 1; i >= 0; i--) {
>  		dma_addr_t addr = ppgtt->pd_dma_addr[i];
> -		ret = gen8_write_pdp(ring, i, addr, synchronous);
> +		ret = gen8_write_pdp(ring, i, addr);
>  		if (ret)
>  			return ret;
>  	}
> @@ -724,29 +716,10 @@ static uint32_t get_pd_offset(struct i915_hw_ppgtt
> *ppgtt)  }
> 
>  static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
> -			 struct intel_engine_cs *ring,
> -			 bool synchronous)
> +			 struct intel_engine_cs *ring)
>  {
> -	struct drm_device *dev = ppgtt->base.dev;
> -	struct drm_i915_private *dev_priv = dev->dev_private;
>  	int ret;
> 
> -	/* If we're in reset, we can assume the GPU is sufficiently idle to
> -	 * manually frob these bits. Ideally we could use the ring functions,
> -	 * except our error handling makes it quite difficult (can't use
> -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
> -	 *
> -	 * FIXME: We should try not to special case reset
> -	 */
> -	if (synchronous ||
> -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
> -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> -		I915_WRITE(RING_PP_DIR_BASE(ring),
> get_pd_offset(ppgtt));
> -		POSTING_READ(RING_PP_DIR_BASE(ring));
> -		return 0;
> -	}
> -
>  	/* NB: TLBs must be flushed and invalidated before a switch */
>  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS,
> I915_GEM_GPU_DOMAINS);
>  	if (ret)
> @@ -768,29 +741,10 @@ static int hsw_mm_switch(struct i915_hw_ppgtt
> *ppgtt,  }
> 
>  static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
> -			  struct intel_engine_cs *ring,
> -			  bool synchronous)
> +			  struct intel_engine_cs *ring)
>  {
> -	struct drm_device *dev = ppgtt->base.dev;
> -	struct drm_i915_private *dev_priv = dev->dev_private;
>  	int ret;
> 
> -	/* If we're in reset, we can assume the GPU is sufficiently idle to
> -	 * manually frob these bits. Ideally we could use the ring functions,
> -	 * except our error handling makes it quite difficult (can't use
> -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
> -	 *
> -	 * FIXME: We should try not to special case reset
> -	 */
> -	if (synchronous ||
> -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
> -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> -		I915_WRITE(RING_PP_DIR_BASE(ring),
> get_pd_offset(ppgtt));
> -		POSTING_READ(RING_PP_DIR_BASE(ring));
> -		return 0;
> -	}
> -
>  	/* NB: TLBs must be flushed and invalidated before a switch */
>  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS,
> I915_GEM_GPU_DOMAINS);
>  	if (ret)
> @@ -819,14 +773,11 @@ static int gen7_mm_switch(struct i915_hw_ppgtt
> *ppgtt,  }
> 
>  static int gen6_mm_switch(struct i915_hw_ppgtt *ppgtt,
> -			  struct intel_engine_cs *ring,
> -			  bool synchronous)
> +			  struct intel_engine_cs *ring)
>  {
>  	struct drm_device *dev = ppgtt->base.dev;
>  	struct drm_i915_private *dev_priv = dev->dev_private;
> 
> -	if (!synchronous)
> -		return 0;
> 
>  	I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
>  	I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt)); @@ -
> 852,7 +803,7 @@ static int gen8_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
>  		if (USES_FULL_PPGTT(dev))
>  			continue;
> 
> -		ret = ppgtt->switch_mm(ppgtt, ring, true);
> +		ret = ppgtt->switch_mm(ppgtt, ring);
>  		if (ret)
>  			goto err_out;
>  	}
> @@ -897,7 +848,7 @@ static int gen7_ppgtt_enable(struct i915_hw_ppgtt
> *ppgtt)
>  		if (USES_FULL_PPGTT(dev))
>  			continue;
> 
> -		ret = ppgtt->switch_mm(ppgtt, ring, true);
> +		ret = ppgtt->switch_mm(ppgtt, ring);
>  		if (ret)
>  			return ret;
>  	}
> @@ -926,7 +877,7 @@ static int gen6_ppgtt_enable(struct i915_hw_ppgtt
> *ppgtt)
>  	I915_WRITE(GFX_MODE,
> _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE));
> 
>  	for_each_ring(ring, dev_priv, i) {
> -		int ret = ppgtt->switch_mm(ppgtt, ring, true);
> +		int ret = ppgtt->switch_mm(ppgtt, ring);
>  		if (ret)
>  			return ret;
>  	}
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h
> b/drivers/gpu/drm/i915/i915_gem_gtt.h
> index 8d6f7c1..bf1e4fc 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
> @@ -262,8 +262,7 @@ struct i915_hw_ppgtt {
> 
>  	int (*enable)(struct i915_hw_ppgtt *ppgtt);
>  	int (*switch_mm)(struct i915_hw_ppgtt *ppgtt,
> -			 struct intel_engine_cs *ring,
> -			 bool synchronous);
> +			 struct intel_engine_cs *ring);
>  	void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file
> *m);  };
> 
> --
> 2.0.0

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v2] drm/i915: Rework GPU reset sequence to match driver load & thaw
  2014-08-05  8:47 ` [PATCH v2] " alistair.mcaulay
  2014-08-06 12:58   ` Mcaulay, Alistair
@ 2014-08-06 16:24   ` Mika Kuoppala
  2014-08-15 13:33     ` Mcaulay, Alistair
  1 sibling, 1 reply; 30+ messages in thread
From: Mika Kuoppala @ 2014-08-06 16:24 UTC (permalink / raw)
  To: alistair.mcaulay, intel-gfx


Hi,

alistair.mcaulay@intel.com writes:

> From: "McAulay, Alistair" <alistair.mcaulay@intel.com>
>
> This patch is to address Daniels concerns over different code during reset:
>
> http://lists.freedesktop.org/archives/intel-gfx/2014-June/047758.html
>
> "The reason for aiming as hard as possible to use the exact same code for
> driver load, gpu reset and runtime pm/system resume is that we've simply
> seen too many bugs due to slight variations and unintended omissions."
>
> Tested using igt drv_hangman.
>
> V2: Cleaner way of preventing check_wedge returning -EAGAIN
>
> Signed-off-by: McAulay, Alistair <alistair.mcaulay@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.c         |  6 +++
>  drivers/gpu/drm/i915/i915_drv.h         |  3 ++
>  drivers/gpu/drm/i915/i915_gem.c         |  6 +--
>  drivers/gpu/drm/i915/i915_gem_context.c | 42 +--------------------
>  drivers/gpu/drm/i915/i915_gem_gtt.c     | 67 +++++----------------------------
>  drivers/gpu/drm/i915/i915_gem_gtt.h     |  3 +-
>  6 files changed, 23 insertions(+), 104 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index 5e4fefd..3bfafe6 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -806,7 +806,13 @@ int i915_reset(struct drm_device *dev)
>  			!dev_priv->ums.mm_suspended) {
>  		dev_priv->ums.mm_suspended = 0;
>  
> +		/* Used to prevent gem_check_wedged returning -EAGAIN during gpu reset */
> +		dev_priv->gpu_error.reload_in_reset = true;
> +
>  		ret = i915_gem_init_hw(dev);
> +
> +		dev_priv->gpu_error.reload_in_reset = false;
> +
>  		mutex_unlock(&dev->struct_mutex);
>  		if (ret) {
>  			DRM_ERROR("Failed hw init on reset %d\n", ret);
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 991b663..116daff 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1217,6 +1217,9 @@ struct i915_gpu_error {
>  
>  	/* For missed irq/seqno simulation. */
>  	unsigned int test_irq_rings;
> +
> +	/* Used to prevent gem_check_wedged returning -EAGAIN during gpu reset   */
> +	bool reload_in_reset;
>  };
>  
>  enum modeset_restore {
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index ef047bc..14e1770 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1085,7 +1085,9 @@ i915_gem_check_wedge(struct i915_gpu_error *error,
>  		if (i915_terminally_wedged(error))
>  			return -EIO;
>  
> -		return -EAGAIN;
> +		/* Check if GPU Reset is in progress */
> +		if (!error->reload_in_reset)
> +			return -EAGAIN;
>  	}
>  
>  	return 0;
> @@ -2590,8 +2592,6 @@ void i915_gem_reset(struct drm_device *dev)
>  	for_each_ring(ring, dev_priv, i)
>  		i915_gem_reset_ring_cleanup(dev_priv, ring);
>  
> -	i915_gem_context_reset(dev);
> -
>  	i915_gem_restore_fences(dev);
>  }
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index de72a28..d96219f 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -372,42 +372,6 @@ err_destroy:
>  	return ERR_PTR(ret);
>  }
>  
> -void i915_gem_context_reset(struct drm_device *dev)
> -{
> -	struct drm_i915_private *dev_priv = dev->dev_private;
> -	int i;
> -
> -	/* Prevent the hardware from restoring the last context (which hung) on
> -	 * the next switch */
> -	for (i = 0; i < I915_NUM_RINGS; i++) {
> -		struct intel_engine_cs *ring = &dev_priv->ring[i];
> -		struct intel_context *dctx = ring->default_context;
> -		struct intel_context *lctx = ring->last_context;
> -
> -		/* Do a fake switch to the default context */
> -		if (lctx == dctx)
> -			continue;
> -
> -		if (!lctx)
> -			continue;
> -
> -		if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
> -			WARN_ON(i915_gem_obj_ggtt_pin(dctx->legacy_hw_ctx.rcs_state,
> -						      get_context_alignment(dev), 0));
> -			/* Fake a finish/inactive */
> -			dctx->legacy_hw_ctx.rcs_state->base.write_domain = 0;
> -			dctx->legacy_hw_ctx.rcs_state->active = 0;
> -		}
> -
> -		if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
> -			i915_gem_object_ggtt_unpin(lctx->legacy_hw_ctx.rcs_state);
> -
> -		i915_gem_context_unreference(lctx);
> -		i915_gem_context_reference(dctx);
> -		ring->last_context = dctx;
> -	}
> -}
> -

I am with Daniel on this one. I don't understand how we can throw
everything in here away.

We need to force hw to switch to a working context, after reset,
so that our internal state tracking matches.

Further, if we aim for more unification, I think we should make it
so that the initial render state also gets run after reset.

If we clean up the last context for each ring and set up the default context
carefully, i915_gem_context_enable() will then switch to the default contexts
and reinit them using the initial render state. Something like this:

void i915_gem_context_reset(struct drm_device *dev)
{
	struct drm_i915_private *dev_priv = dev->dev_private;
	int i;

	for (i = 0; i < I915_NUM_RINGS; i++) {
		struct intel_engine_cs *ring = &dev_priv->ring[i];
		struct intel_context *lctx = ring->last_context;
		struct intel_context *dctx = ring->default_context;

		if (lctx) {
			if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
				i915_gem_object_ggtt_unpin(lctx->legacy_hw_ctx.rcs_state);

			i915_gem_context_unreference(lctx);
			ring->last_context = NULL;
		}

		if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
			dctx->legacy_hw_ctx.rcs_state->base.write_domain = 0;
			dctx->legacy_hw_ctx.rcs_state->active = 0;
			dctx->legacy_hw_ctx.initialized = false;
		}
	}
}

The state would be closer to what we get after a module reload.

-Mika

>  int i915_gem_context_init(struct drm_device *dev)
>  {
>  	struct drm_i915_private *dev_priv = dev->dev_private;
> @@ -498,10 +462,6 @@ int i915_gem_context_enable(struct drm_i915_private *dev_priv)
>  		ppgtt->enable(ppgtt);
>  	}
>  
> -	/* FIXME: We should make this work, even in reset */
> -	if (i915_reset_in_progress(&dev_priv->gpu_error))
> -		return 0;
> -
>  	BUG_ON(!dev_priv->ring[RCS].default_context);
>  
>  	for_each_ring(ring, dev_priv, i) {
> @@ -645,7 +605,7 @@ static int do_switch(struct intel_engine_cs *ring,
>  	from = ring->last_context;
>  
>  	if (USES_FULL_PPGTT(ring->dev)) {
> -		ret = ppgtt->switch_mm(ppgtt, ring, false);
> +		ret = ppgtt->switch_mm(ppgtt, ring);
>  		if (ret)
>  			goto unpin_out;
>  	}
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index 5188936..450c8a9 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -216,19 +216,12 @@ static gen6_gtt_pte_t iris_pte_encode(dma_addr_t addr,
>  
>  /* Broadwell Page Directory Pointer Descriptors */
>  static int gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
> -			   uint64_t val, bool synchronous)
> +			   uint64_t val)
>  {
> -	struct drm_i915_private *dev_priv = ring->dev->dev_private;
>  	int ret;
>  
>  	BUG_ON(entry >= 4);
>  
> -	if (synchronous) {
> -		I915_WRITE(GEN8_RING_PDP_UDW(ring, entry), val >> 32);
> -		I915_WRITE(GEN8_RING_PDP_LDW(ring, entry), (u32)val);
> -		return 0;
> -	}
> -
>  	ret = intel_ring_begin(ring, 6);
>  	if (ret)
>  		return ret;
> @@ -245,8 +238,7 @@ static int gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
>  }
>  
>  static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
> -			  struct intel_engine_cs *ring,
> -			  bool synchronous)
> +			  struct intel_engine_cs *ring)
>  {
>  	int i, ret;
>  
> @@ -255,7 +247,7 @@ static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
>  
>  	for (i = used_pd - 1; i >= 0; i--) {
>  		dma_addr_t addr = ppgtt->pd_dma_addr[i];
> -		ret = gen8_write_pdp(ring, i, addr, synchronous);
> +		ret = gen8_write_pdp(ring, i, addr);
>  		if (ret)
>  			return ret;
>  	}
> @@ -724,29 +716,10 @@ static uint32_t get_pd_offset(struct i915_hw_ppgtt *ppgtt)
>  }
>  
>  static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
> -			 struct intel_engine_cs *ring,
> -			 bool synchronous)
> +			 struct intel_engine_cs *ring)
>  {
> -	struct drm_device *dev = ppgtt->base.dev;
> -	struct drm_i915_private *dev_priv = dev->dev_private;
>  	int ret;
>  
> -	/* If we're in reset, we can assume the GPU is sufficiently idle to
> -	 * manually frob these bits. Ideally we could use the ring functions,
> -	 * except our error handling makes it quite difficult (can't use
> -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
> -	 *
> -	 * FIXME: We should try not to special case reset
> -	 */
> -	if (synchronous ||
> -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
> -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> -		I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt));
> -		POSTING_READ(RING_PP_DIR_BASE(ring));
> -		return 0;
> -	}
> -
>  	/* NB: TLBs must be flushed and invalidated before a switch */
>  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, I915_GEM_GPU_DOMAINS);
>  	if (ret)
> @@ -768,29 +741,10 @@ static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
>  }
>  
>  static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
> -			  struct intel_engine_cs *ring,
> -			  bool synchronous)
> +			  struct intel_engine_cs *ring)
>  {
> -	struct drm_device *dev = ppgtt->base.dev;
> -	struct drm_i915_private *dev_priv = dev->dev_private;
>  	int ret;
>  
> -	/* If we're in reset, we can assume the GPU is sufficiently idle to
> -	 * manually frob these bits. Ideally we could use the ring functions,
> -	 * except our error handling makes it quite difficult (can't use
> -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
> -	 *
> -	 * FIXME: We should try not to special case reset
> -	 */
> -	if (synchronous ||
> -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
> -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> -		I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt));
> -		POSTING_READ(RING_PP_DIR_BASE(ring));
> -		return 0;
> -	}
> -
>  	/* NB: TLBs must be flushed and invalidated before a switch */
>  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, I915_GEM_GPU_DOMAINS);
>  	if (ret)
> @@ -819,14 +773,11 @@ static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
>  }
>  
>  static int gen6_mm_switch(struct i915_hw_ppgtt *ppgtt,
> -			  struct intel_engine_cs *ring,
> -			  bool synchronous)
> +			  struct intel_engine_cs *ring)
>  {
>  	struct drm_device *dev = ppgtt->base.dev;
>  	struct drm_i915_private *dev_priv = dev->dev_private;
>  
> -	if (!synchronous)
> -		return 0;
>  
>  	I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
>  	I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt));
> @@ -852,7 +803,7 @@ static int gen8_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
>  		if (USES_FULL_PPGTT(dev))
>  			continue;
>  
> -		ret = ppgtt->switch_mm(ppgtt, ring, true);
> +		ret = ppgtt->switch_mm(ppgtt, ring);
>  		if (ret)
>  			goto err_out;
>  	}
> @@ -897,7 +848,7 @@ static int gen7_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
>  		if (USES_FULL_PPGTT(dev))
>  			continue;
>  
> -		ret = ppgtt->switch_mm(ppgtt, ring, true);
> +		ret = ppgtt->switch_mm(ppgtt, ring);
>  		if (ret)
>  			return ret;
>  	}
> @@ -926,7 +877,7 @@ static int gen6_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
>  	I915_WRITE(GFX_MODE, _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE));
>  
>  	for_each_ring(ring, dev_priv, i) {
> -		int ret = ppgtt->switch_mm(ppgtt, ring, true);
> +		int ret = ppgtt->switch_mm(ppgtt, ring);
>  		if (ret)
>  			return ret;
>  	}
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
> index 8d6f7c1..bf1e4fc 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
> @@ -262,8 +262,7 @@ struct i915_hw_ppgtt {
>  
>  	int (*enable)(struct i915_hw_ppgtt *ppgtt);
>  	int (*switch_mm)(struct i915_hw_ppgtt *ppgtt,
> -			 struct intel_engine_cs *ring,
> -			 bool synchronous);
> +			 struct intel_engine_cs *ring);
>  	void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file *m);
>  };
>  
> -- 
> 2.0.0
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v2] drm/i915: Rework GPU reset sequence to match driver load & thaw
  2014-08-06 16:24   ` Mika Kuoppala
@ 2014-08-15 13:33     ` Mcaulay, Alistair
  2014-08-15 15:41       ` Daniel Vetter
  2014-08-15 17:03       ` Mika Kuoppala
  0 siblings, 2 replies; 30+ messages in thread
From: Mcaulay, Alistair @ 2014-08-15 13:33 UTC (permalink / raw)
  To: Mika Kuoppala, Daniel Vetter, intel-gfx

Hi Mika / Daniel,

Below is the basic code path of a reset, as changed by my patch:

i915_reset()
{
	....
	i915_gem_reset() -> This used to call i915_gem_context_reset(), which has now been removed. 
	.....
	i915_gem_init_hw()
		.....
		i915_gem_context_enable() -> This used to return during reset. Now it doesn't
		.....
			for each ring, i915_switch_context(default)
				do_switch();
		.....

	.....
}

" I am with Daniel on this one. I don't understand how can we throw everything in here away."
Did you maybe mean Ben?
Daniel, I thought you were happy with the implementation and that V2 fixed your last concern. Could you please comment?

" We need to force hw to switch to a working context, after reset, so that our internal state tracking matches."
I believe this patch does that using i915_switch_context(), rather than the hacky i915_gem_context_reset().

Alistair.

> -----Original Message-----
> From: Mika Kuoppala [mailto:mika.kuoppala@linux.intel.com]
> Sent: Wednesday, August 06, 2014 5:25 PM
> To: Mcaulay, Alistair; intel-gfx@lists.freedesktop.org
> Subject: Re: [Intel-gfx] [PATCH v2] drm/i915: Rework GPU reset sequence to
> match driver load & thaw
> 
> 
> Hi,
> 
> alistair.mcaulay@intel.com writes:
> 
> > From: "McAulay, Alistair" <alistair.mcaulay@intel.com>
> >
> > This patch is to address Daniels concerns over different code during reset:
> >
> > http://lists.freedesktop.org/archives/intel-gfx/2014-June/047758.html
> >
> > "The reason for aiming as hard as possible to use the exact same code
> > for driver load, gpu reset and runtime pm/system resume is that we've
> > simply seen too many bugs due to slight variations and unintended
> omissions."
> >
> > Tested using igt drv_hangman.
> >
> > V2: Cleaner way of preventing check_wedge returning -EAGAIN
> >
> > Signed-off-by: McAulay, Alistair <alistair.mcaulay@intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_drv.c         |  6 +++
> >  drivers/gpu/drm/i915/i915_drv.h         |  3 ++
> >  drivers/gpu/drm/i915/i915_gem.c         |  6 +--
> >  drivers/gpu/drm/i915/i915_gem_context.c | 42 +--------------------
> >  drivers/gpu/drm/i915/i915_gem_gtt.c     | 67 +++++----------------------------
> >  drivers/gpu/drm/i915/i915_gem_gtt.h     |  3 +-
> >  6 files changed, 23 insertions(+), 104 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_drv.c
> > b/drivers/gpu/drm/i915/i915_drv.c index 5e4fefd..3bfafe6 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.c
> > +++ b/drivers/gpu/drm/i915/i915_drv.c
> > @@ -806,7 +806,13 @@ int i915_reset(struct drm_device *dev)
> >  			!dev_priv->ums.mm_suspended) {
> >  		dev_priv->ums.mm_suspended = 0;
> >
> > +		/* Used to prevent gem_check_wedged returning -EAGAIN
> during gpu reset */
> > +		dev_priv->gpu_error.reload_in_reset = true;
> > +
> >  		ret = i915_gem_init_hw(dev);
> > +
> > +		dev_priv->gpu_error.reload_in_reset = false;
> > +
> >  		mutex_unlock(&dev->struct_mutex);
> >  		if (ret) {
> >  			DRM_ERROR("Failed hw init on reset %d\n", ret); diff
> --git
> > a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > index 991b663..116daff 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -1217,6 +1217,9 @@ struct i915_gpu_error {
> >
> >  	/* For missed irq/seqno simulation. */
> >  	unsigned int test_irq_rings;
> > +
> > +	/* Used to prevent gem_check_wedged returning -EAGAIN during
> gpu reset   */
> > +	bool reload_in_reset;
> >  };
> >
> >  enum modeset_restore {
> > diff --git a/drivers/gpu/drm/i915/i915_gem.c
> > b/drivers/gpu/drm/i915/i915_gem.c index ef047bc..14e1770 100644
> > --- a/drivers/gpu/drm/i915/i915_gem.c
> > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > @@ -1085,7 +1085,9 @@ i915_gem_check_wedge(struct i915_gpu_error
> *error,
> >  		if (i915_terminally_wedged(error))
> >  			return -EIO;
> >
> > -		return -EAGAIN;
> > +		/* Check if GPU Reset is in progress */
> > +		if (!error->reload_in_reset)
> > +			return -EAGAIN;
> >  	}
> >
> >  	return 0;
> > @@ -2590,8 +2592,6 @@ void i915_gem_reset(struct drm_device *dev)
> >  	for_each_ring(ring, dev_priv, i)
> >  		i915_gem_reset_ring_cleanup(dev_priv, ring);
> >
> > -	i915_gem_context_reset(dev);
> > -
> >  	i915_gem_restore_fences(dev);
> >  }
> >
> > diff --git a/drivers/gpu/drm/i915/i915_gem_context.c
> > b/drivers/gpu/drm/i915/i915_gem_context.c
> > index de72a28..d96219f 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_context.c
> > +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> > @@ -372,42 +372,6 @@ err_destroy:
> >  	return ERR_PTR(ret);
> >  }
> >
> > -void i915_gem_context_reset(struct drm_device *dev) -{
> > -	struct drm_i915_private *dev_priv = dev->dev_private;
> > -	int i;
> > -
> > -	/* Prevent the hardware from restoring the last context (which
> hung) on
> > -	 * the next switch */
> > -	for (i = 0; i < I915_NUM_RINGS; i++) {
> > -		struct intel_engine_cs *ring = &dev_priv->ring[i];
> > -		struct intel_context *dctx = ring->default_context;
> > -		struct intel_context *lctx = ring->last_context;
> > -
> > -		/* Do a fake switch to the default context */
> > -		if (lctx == dctx)
> > -			continue;
> > -
> > -		if (!lctx)
> > -			continue;
> > -
> > -		if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
> > -			WARN_ON(i915_gem_obj_ggtt_pin(dctx-
> >legacy_hw_ctx.rcs_state,
> > -
> get_context_alignment(dev), 0));
> > -			/* Fake a finish/inactive */
> > -			dctx->legacy_hw_ctx.rcs_state->base.write_domain
> = 0;
> > -			dctx->legacy_hw_ctx.rcs_state->active = 0;
> > -		}
> > -
> > -		if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
> > -			i915_gem_object_ggtt_unpin(lctx-
> >legacy_hw_ctx.rcs_state);
> > -
> > -		i915_gem_context_unreference(lctx);
> > -		i915_gem_context_reference(dctx);
> > -		ring->last_context = dctx;
> > -	}
> > -}
> > -
> 
> I am with Daniel on this one. I don't understand how can we throw
> everything in here away.
> 
> We need to force hw to switch to a working context, after reset, so that our
> internal state tracking matches.
> 
> Further, if we aim to more unification I think we should make it so that the
> initial render state will get run, also after reset.
> 
> If we cleanup the last context for each ring set default context carefully,
> i915_gem_context_enable() will then switch to default contexts and reinit
> them using the initial render state. Something like this:
> 
> void i915_gem_context_reset(struct drm_device *dev) {
> 	struct drm_i915_private *dev_priv = dev->dev_private;
> 	int i;
> 
> 	for (i = 0; i < I915_NUM_RINGS; i++) {
> 		struct intel_engine_cs *ring = &dev_priv->ring[i];
> 		struct intel_context *lctx = ring->last_context;
> 		struct intel_context *dctx = ring->default_context;
> 
> 		if (lctx) {
> 			if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
> 				i915_gem_object_ggtt_unpin(lctx-
> >legacy_hw_ctx.rcs_state);
> 
> 			i915_gem_context_unreference(lctx);
> 			ring->last_context = NULL;
> 		}
> 
> 		if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
> 			dctx->legacy_hw_ctx.rcs_state->base.write_domain
> = 0;
> 			dctx->legacy_hw_ctx.rcs_state->active = 0;
> 			dctx->legacy_hw_ctx.initialized = false;
> 		}
> 	}
> }
> 
> The state would be closer of what we get after module reload.
> 
> -Mika
> 
> >  int i915_gem_context_init(struct drm_device *dev)  {
> >  	struct drm_i915_private *dev_priv = dev->dev_private; @@ -498,10
> > +462,6 @@ int i915_gem_context_enable(struct drm_i915_private
> *dev_priv)
> >  		ppgtt->enable(ppgtt);
> >  	}
> >
> > -	/* FIXME: We should make this work, even in reset */
> > -	if (i915_reset_in_progress(&dev_priv->gpu_error))
> > -		return 0;
> > -
> >  	BUG_ON(!dev_priv->ring[RCS].default_context);
> >
> >  	for_each_ring(ring, dev_priv, i) {
> > @@ -645,7 +605,7 @@ static int do_switch(struct intel_engine_cs *ring,
> >  	from = ring->last_context;
> >
> >  	if (USES_FULL_PPGTT(ring->dev)) {
> > -		ret = ppgtt->switch_mm(ppgtt, ring, false);
> > +		ret = ppgtt->switch_mm(ppgtt, ring);
> >  		if (ret)
> >  			goto unpin_out;
> >  	}
> > diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c
> > b/drivers/gpu/drm/i915/i915_gem_gtt.c
> > index 5188936..450c8a9 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> > +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> > @@ -216,19 +216,12 @@ static gen6_gtt_pte_t
> iris_pte_encode(dma_addr_t
> > addr,
> >
> >  /* Broadwell Page Directory Pointer Descriptors */  static int
> > gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
> > -			   uint64_t val, bool synchronous)
> > +			   uint64_t val)
> >  {
> > -	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> >  	int ret;
> >
> >  	BUG_ON(entry >= 4);
> >
> > -	if (synchronous) {
> > -		I915_WRITE(GEN8_RING_PDP_UDW(ring, entry), val >> 32);
> > -		I915_WRITE(GEN8_RING_PDP_LDW(ring, entry), (u32)val);
> > -		return 0;
> > -	}
> > -
> >  	ret = intel_ring_begin(ring, 6);
> >  	if (ret)
> >  		return ret;
> > @@ -245,8 +238,7 @@ static int gen8_write_pdp(struct intel_engine_cs
> > *ring, unsigned entry,  }
> >
> >  static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
> > -			  struct intel_engine_cs *ring,
> > -			  bool synchronous)
> > +			  struct intel_engine_cs *ring)
> >  {
> >  	int i, ret;
> >
> > @@ -255,7 +247,7 @@ static int gen8_mm_switch(struct i915_hw_ppgtt
> > *ppgtt,
> >
> >  	for (i = used_pd - 1; i >= 0; i--) {
> >  		dma_addr_t addr = ppgtt->pd_dma_addr[i];
> > -		ret = gen8_write_pdp(ring, i, addr, synchronous);
> > +		ret = gen8_write_pdp(ring, i, addr);
> >  		if (ret)
> >  			return ret;
> >  	}
> > @@ -724,29 +716,10 @@ static uint32_t get_pd_offset(struct
> > i915_hw_ppgtt *ppgtt)  }
> >
> >  static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
> > -			 struct intel_engine_cs *ring,
> > -			 bool synchronous)
> > +			 struct intel_engine_cs *ring)
> >  {
> > -	struct drm_device *dev = ppgtt->base.dev;
> > -	struct drm_i915_private *dev_priv = dev->dev_private;
> >  	int ret;
> >
> > -	/* If we're in reset, we can assume the GPU is sufficiently idle to
> > -	 * manually frob these bits. Ideally we could use the ring functions,
> > -	 * except our error handling makes it quite difficult (can't use
> > -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
> > -	 *
> > -	 * FIXME: We should try not to special case reset
> > -	 */
> > -	if (synchronous ||
> > -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
> > -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> > -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> > -		I915_WRITE(RING_PP_DIR_BASE(ring),
> get_pd_offset(ppgtt));
> > -		POSTING_READ(RING_PP_DIR_BASE(ring));
> > -		return 0;
> > -	}
> > -
> >  	/* NB: TLBs must be flushed and invalidated before a switch */
> >  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS,
> I915_GEM_GPU_DOMAINS);
> >  	if (ret)
> > @@ -768,29 +741,10 @@ static int hsw_mm_switch(struct i915_hw_ppgtt
> > *ppgtt,  }
> >
> >  static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
> > -			  struct intel_engine_cs *ring,
> > -			  bool synchronous)
> > +			  struct intel_engine_cs *ring)
> >  {
> > -	struct drm_device *dev = ppgtt->base.dev;
> > -	struct drm_i915_private *dev_priv = dev->dev_private;
> >  	int ret;
> >
> > -	/* If we're in reset, we can assume the GPU is sufficiently idle to
> > -	 * manually frob these bits. Ideally we could use the ring functions,
> > -	 * except our error handling makes it quite difficult (can't use
> > -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
> > -	 *
> > -	 * FIXME: We should try not to special case reset
> > -	 */
> > -	if (synchronous ||
> > -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
> > -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> > -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> > -		I915_WRITE(RING_PP_DIR_BASE(ring),
> get_pd_offset(ppgtt));
> > -		POSTING_READ(RING_PP_DIR_BASE(ring));
> > -		return 0;
> > -	}
> > -
> >  	/* NB: TLBs must be flushed and invalidated before a switch */
> >  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS,
> I915_GEM_GPU_DOMAINS);
> >  	if (ret)
> > @@ -819,14 +773,11 @@ static int gen7_mm_switch(struct i915_hw_ppgtt
> > *ppgtt,  }
> >
> >  static int gen6_mm_switch(struct i915_hw_ppgtt *ppgtt,
> > -			  struct intel_engine_cs *ring,
> > -			  bool synchronous)
> > +			  struct intel_engine_cs *ring)
> >  {
> >  	struct drm_device *dev = ppgtt->base.dev;
> >  	struct drm_i915_private *dev_priv = dev->dev_private;
> >
> > -	if (!synchronous)
> > -		return 0;
> >
> >  	I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> >  	I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt)); @@ -
> 852,7
> > +803,7 @@ static int gen8_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
> >  		if (USES_FULL_PPGTT(dev))
> >  			continue;
> >
> > -		ret = ppgtt->switch_mm(ppgtt, ring, true);
> > +		ret = ppgtt->switch_mm(ppgtt, ring);
> >  		if (ret)
> >  			goto err_out;
> >  	}
> > @@ -897,7 +848,7 @@ static int gen7_ppgtt_enable(struct i915_hw_ppgtt
> *ppgtt)
> >  		if (USES_FULL_PPGTT(dev))
> >  			continue;
> >
> > -		ret = ppgtt->switch_mm(ppgtt, ring, true);
> > +		ret = ppgtt->switch_mm(ppgtt, ring);
> >  		if (ret)
> >  			return ret;
> >  	}
> > @@ -926,7 +877,7 @@ static int gen6_ppgtt_enable(struct i915_hw_ppgtt
> *ppgtt)
> >  	I915_WRITE(GFX_MODE,
> _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE));
> >
> >  	for_each_ring(ring, dev_priv, i) {
> > -		int ret = ppgtt->switch_mm(ppgtt, ring, true);
> > +		int ret = ppgtt->switch_mm(ppgtt, ring);
> >  		if (ret)
> >  			return ret;
> >  	}
> > diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h
> > b/drivers/gpu/drm/i915/i915_gem_gtt.h
> > index 8d6f7c1..bf1e4fc 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
> > +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
> > @@ -262,8 +262,7 @@ struct i915_hw_ppgtt {
> >
> >  	int (*enable)(struct i915_hw_ppgtt *ppgtt);
> >  	int (*switch_mm)(struct i915_hw_ppgtt *ppgtt,
> > -			 struct intel_engine_cs *ring,
> > -			 bool synchronous);
> > +			 struct intel_engine_cs *ring);
> >  	void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file
> *m);
> > };
> >
> > --
> > 2.0.0
> >
> > _______________________________________________
> > Intel-gfx mailing list
> > Intel-gfx@lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v2] drm/i915: Rework GPU reset sequence to match driver load & thaw
  2014-08-15 13:33     ` Mcaulay, Alistair
@ 2014-08-15 15:41       ` Daniel Vetter
  2014-08-15 17:03       ` Mika Kuoppala
  1 sibling, 0 replies; 30+ messages in thread
From: Daniel Vetter @ 2014-08-15 15:41 UTC (permalink / raw)
  To: Mcaulay, Alistair; +Cc: intel-gfx

On Fri, Aug 15, 2014 at 3:33 PM, Mcaulay, Alistair
<alistair.mcaulay@intel.com> wrote:
> below is the basic code path of a reset which has been changed by my patch:
>
> i915_reset()
> {
>         ....
>         i915_gem_reset() -> This used to call i915_gem_context_reset(), which has now been removed.
>         .....
>         i915_gem_init_hw()
>                 .....
>                 i915_gem_context_enable() -> This used to return during reset. Now it doesn't
>                 .....
>                         for each ring, i915_switch_context(default)
>                                 do_switch();
>                 .....
>
>         .....
> }
>
> " I am with Daniel on this one. I don't understand how can we throw everything in here away."
> Did you maybe mean Ben?
> Daniel, I thought you were happy with the implementation, and V2 fixed your last concern, could you please comment.

I'm happy with the underlying fix, but I didn't check all the details
and instead signed up Mika for that. Helps me scale and also makes
sure that more people understand tricky parts.

> " We need to force hw to switch to a working context, after reset, so that our internal state tracking matches."
> I believe this patch does that using i915_switch_context, rather than the hacky i915_gem_context_reset()

I'm half in a plane already, so I'll leave it to you and Mika to
figure this out. Maybe the only thing missing is a bit more
explanation in the commit message of why exactly we can remove all that
code. On a quick glance both Mika's concern and your reply make sense.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v2] drm/i915: Rework GPU reset sequence to match driver load & thaw
  2014-08-15 13:33     ` Mcaulay, Alistair
  2014-08-15 15:41       ` Daniel Vetter
@ 2014-08-15 17:03       ` Mika Kuoppala
  2014-08-15 17:51         ` [PATCH v3] " alistair.mcaulay
  1 sibling, 1 reply; 30+ messages in thread
From: Mika Kuoppala @ 2014-08-15 17:03 UTC (permalink / raw)
  To: Mcaulay, Alistair, Daniel Vetter, intel-gfx

"Mcaulay, Alistair" <alistair.mcaulay@intel.com> writes:

> Hi Mika / Daniel,
>
> below is the basic code path of a reset which has been changed by my patch:
>
> i915_reset()
> {
> 	....
> 	i915_gem_reset() -> This used to call i915_gem_context_reset(), which has now been removed. 
> 	.....
> 	i915_gem_init_hw()
> 		.....
> 		i915_gem_context_enable() -> This used to return during reset. Now it doesn't
> 		.....
> 			for each ring, i915_switch_context(default)
> 				do_switch();
> 		.....
>
> 	.....
> }
>
> " I am with Daniel on this one. I don't understand how can we throw everything in here away."
> Did you maybe mean Ben?
> Daniel, I thought you were happy with the implementation, and V2 fixed your last concern, could you please comment.
>
> " We need to force hw to switch to a working context, after reset, so that our internal state tracking matches."
> I believe this patch does that using i915_switch_context, rather than the hacky i915_gem_context_reset()

Our internal state tracking will be ok after i915_gem_context_enable()
has been called. All rings have been set to the default context.

But what happens with this sequence:

- render ring was running in default context 
- reset happens
- we call i915_gem_context_enable 
- do_switch will get called 
- it figures out that the last context is the same as the one we are
  switching to (from == to) and it bails out
- we never wrote anything to the ring, so we still have the pre-reset
  context running. MI_SET_CONTEXT was never run.

Even if the reset does not clear the CCID, I think it is safest to
force a MI_SET_CONTEXT here.

Further, if the default context was mangled before the reset,
we should reinitialize it to a known state by running
i915_gem_render_state_init() for it. But this can be
considered possible future work.

-Mika

> Alistair.
>
>> -----Original Message-----
>> From: Mika Kuoppala [mailto:mika.kuoppala@linux.intel.com]
>> Sent: Wednesday, August 06, 2014 5:25 PM
>> To: Mcaulay, Alistair; intel-gfx@lists.freedesktop.org
>> Subject: Re: [Intel-gfx] [PATCH v2] drm/i915: Rework GPU reset sequence to
>> match driver load & thaw
>> 
>> 
>> Hi,
>> 
>> alistair.mcaulay@intel.com writes:
>> 
>> > From: "McAulay, Alistair" <alistair.mcaulay@intel.com>
>> >
>> > This patch is to address Daniels concerns over different code during reset:
>> >
>> > http://lists.freedesktop.org/archives/intel-gfx/2014-June/047758.html
>> >
>> > "The reason for aiming as hard as possible to use the exact same code
>> > for driver load, gpu reset and runtime pm/system resume is that we've
>> > simply seen too many bugs due to slight variations and unintended
>> omissions."
>> >
>> > Tested using igt drv_hangman.
>> >
>> > V2: Cleaner way of preventing check_wedge returning -EAGAIN
>> >
>> > Signed-off-by: McAulay, Alistair <alistair.mcaulay@intel.com>
>> > ---
>> >  drivers/gpu/drm/i915/i915_drv.c         |  6 +++
>> >  drivers/gpu/drm/i915/i915_drv.h         |  3 ++
>> >  drivers/gpu/drm/i915/i915_gem.c         |  6 +--
>> >  drivers/gpu/drm/i915/i915_gem_context.c | 42 +--------------------
>> >  drivers/gpu/drm/i915/i915_gem_gtt.c     | 67 +++++----------------------------
>> >  drivers/gpu/drm/i915/i915_gem_gtt.h     |  3 +-
>> >  6 files changed, 23 insertions(+), 104 deletions(-)
>> >
>> > diff --git a/drivers/gpu/drm/i915/i915_drv.c
>> > b/drivers/gpu/drm/i915/i915_drv.c index 5e4fefd..3bfafe6 100644
>> > --- a/drivers/gpu/drm/i915/i915_drv.c
>> > +++ b/drivers/gpu/drm/i915/i915_drv.c
>> > @@ -806,7 +806,13 @@ int i915_reset(struct drm_device *dev)
>> >  			!dev_priv->ums.mm_suspended) {
>> >  		dev_priv->ums.mm_suspended = 0;
>> >
>> > +		/* Used to prevent gem_check_wedged returning -EAGAIN
>> during gpu reset */
>> > +		dev_priv->gpu_error.reload_in_reset = true;
>> > +
>> >  		ret = i915_gem_init_hw(dev);
>> > +
>> > +		dev_priv->gpu_error.reload_in_reset = false;
>> > +
>> >  		mutex_unlock(&dev->struct_mutex);
>> >  		if (ret) {
>> >  			DRM_ERROR("Failed hw init on reset %d\n", ret); diff
>> --git
>> > a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>> > index 991b663..116daff 100644
>> > --- a/drivers/gpu/drm/i915/i915_drv.h
>> > +++ b/drivers/gpu/drm/i915/i915_drv.h
>> > @@ -1217,6 +1217,9 @@ struct i915_gpu_error {
>> >
>> >  	/* For missed irq/seqno simulation. */
>> >  	unsigned int test_irq_rings;
>> > +
>> > +	/* Used to prevent gem_check_wedged returning -EAGAIN during
>> gpu reset   */
>> > +	bool reload_in_reset;
>> >  };
>> >
>> >  enum modeset_restore {
>> > diff --git a/drivers/gpu/drm/i915/i915_gem.c
>> > b/drivers/gpu/drm/i915/i915_gem.c index ef047bc..14e1770 100644
>> > --- a/drivers/gpu/drm/i915/i915_gem.c
>> > +++ b/drivers/gpu/drm/i915/i915_gem.c
>> > @@ -1085,7 +1085,9 @@ i915_gem_check_wedge(struct i915_gpu_error
>> *error,
>> >  		if (i915_terminally_wedged(error))
>> >  			return -EIO;
>> >
>> > -		return -EAGAIN;
>> > +		/* Check if GPU Reset is in progress */
>> > +		if (!error->reload_in_reset)
>> > +			return -EAGAIN;
>> >  	}
>> >
>> >  	return 0;
>> > @@ -2590,8 +2592,6 @@ void i915_gem_reset(struct drm_device *dev)
>> >  	for_each_ring(ring, dev_priv, i)
>> >  		i915_gem_reset_ring_cleanup(dev_priv, ring);
>> >
>> > -	i915_gem_context_reset(dev);
>> > -
>> >  	i915_gem_restore_fences(dev);
>> >  }
>> >
>> > diff --git a/drivers/gpu/drm/i915/i915_gem_context.c
>> > b/drivers/gpu/drm/i915/i915_gem_context.c
>> > index de72a28..d96219f 100644
>> > --- a/drivers/gpu/drm/i915/i915_gem_context.c
>> > +++ b/drivers/gpu/drm/i915/i915_gem_context.c
>> > @@ -372,42 +372,6 @@ err_destroy:
>> >  	return ERR_PTR(ret);
>> >  }
>> >
>> > -void i915_gem_context_reset(struct drm_device *dev) -{
>> > -	struct drm_i915_private *dev_priv = dev->dev_private;
>> > -	int i;
>> > -
>> > -	/* Prevent the hardware from restoring the last context (which
>> hung) on
>> > -	 * the next switch */
>> > -	for (i = 0; i < I915_NUM_RINGS; i++) {
>> > -		struct intel_engine_cs *ring = &dev_priv->ring[i];
>> > -		struct intel_context *dctx = ring->default_context;
>> > -		struct intel_context *lctx = ring->last_context;
>> > -
>> > -		/* Do a fake switch to the default context */
>> > -		if (lctx == dctx)
>> > -			continue;
>> > -
>> > -		if (!lctx)
>> > -			continue;
>> > -
>> > -		if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
>> > -			WARN_ON(i915_gem_obj_ggtt_pin(dctx-
>> >legacy_hw_ctx.rcs_state,
>> > -
>> get_context_alignment(dev), 0));
>> > -			/* Fake a finish/inactive */
>> > -			dctx->legacy_hw_ctx.rcs_state->base.write_domain
>> = 0;
>> > -			dctx->legacy_hw_ctx.rcs_state->active = 0;
>> > -		}
>> > -
>> > -		if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
>> > -			i915_gem_object_ggtt_unpin(lctx-
>> >legacy_hw_ctx.rcs_state);
>> > -
>> > -		i915_gem_context_unreference(lctx);
>> > -		i915_gem_context_reference(dctx);
>> > -		ring->last_context = dctx;
>> > -	}
>> > -}
>> > -
>> 
>> I am with Daniel on this one. I don't understand how can we throw
>> everything in here away.
>> 
>> We need to force hw to switch to a working context, after reset, so that our
>> internal state tracking matches.
>> 
>> Further, if we aim to more unification I think we should make it so that the
>> initial render state will get run, also after reset.
>> 
>> If we cleanup the last context for each ring set default context carefully,
>> i915_gem_context_enable() will then switch to default contexts and reinit
>> them using the initial render state. Something like this:
>> 
>> void i915_gem_context_reset(struct drm_device *dev) {
>> 	struct drm_i915_private *dev_priv = dev->dev_private;
>> 	int i;
>> 
>> 	for (i = 0; i < I915_NUM_RINGS; i++) {
>> 		struct intel_engine_cs *ring = &dev_priv->ring[i];
>> 		struct intel_context *lctx = ring->last_context;
>> 		struct intel_context *dctx = ring->default_context;
>> 
>> 		if (lctx) {
>> 			if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
>> 				i915_gem_object_ggtt_unpin(lctx-
>> >legacy_hw_ctx.rcs_state);
>> 
>> 			i915_gem_context_unreference(lctx);
>> 			ring->last_context = NULL;
>> 		}
>> 
>> 		if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
>> 			dctx->legacy_hw_ctx.rcs_state->base.write_domain
>> = 0;
>> 			dctx->legacy_hw_ctx.rcs_state->active = 0;
>> 			dctx->legacy_hw_ctx.initialized = false;
>> 		}
>> 	}
>> }
>> 
>> The state would be closer of what we get after module reload.
>> 
>> -Mika
>> 
>> >  int i915_gem_context_init(struct drm_device *dev)  {
>> >  	struct drm_i915_private *dev_priv = dev->dev_private; @@ -498,10
>> > +462,6 @@ int i915_gem_context_enable(struct drm_i915_private
>> *dev_priv)
>> >  		ppgtt->enable(ppgtt);
>> >  	}
>> >
>> > -	/* FIXME: We should make this work, even in reset */
>> > -	if (i915_reset_in_progress(&dev_priv->gpu_error))
>> > -		return 0;
>> > -
>> >  	BUG_ON(!dev_priv->ring[RCS].default_context);
>> >
>> >  	for_each_ring(ring, dev_priv, i) {
>> > @@ -645,7 +605,7 @@ static int do_switch(struct intel_engine_cs *ring,
>> >  	from = ring->last_context;
>> >
>> >  	if (USES_FULL_PPGTT(ring->dev)) {
>> > -		ret = ppgtt->switch_mm(ppgtt, ring, false);
>> > +		ret = ppgtt->switch_mm(ppgtt, ring);
>> >  		if (ret)
>> >  			goto unpin_out;
>> >  	}
>> > diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c
>> > b/drivers/gpu/drm/i915/i915_gem_gtt.c
>> > index 5188936..450c8a9 100644
>> > --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
>> > +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
>> > @@ -216,19 +216,12 @@ static gen6_gtt_pte_t
>> iris_pte_encode(dma_addr_t
>> > addr,
>> >
>> >  /* Broadwell Page Directory Pointer Descriptors */  static int
>> > gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
>> > -			   uint64_t val, bool synchronous)
>> > +			   uint64_t val)
>> >  {
>> > -	struct drm_i915_private *dev_priv = ring->dev->dev_private;
>> >  	int ret;
>> >
>> >  	BUG_ON(entry >= 4);
>> >
>> > -	if (synchronous) {
>> > -		I915_WRITE(GEN8_RING_PDP_UDW(ring, entry), val >> 32);
>> > -		I915_WRITE(GEN8_RING_PDP_LDW(ring, entry), (u32)val);
>> > -		return 0;
>> > -	}
>> > -
>> >  	ret = intel_ring_begin(ring, 6);
>> >  	if (ret)
>> >  		return ret;
>> > @@ -245,8 +238,7 @@ static int gen8_write_pdp(struct intel_engine_cs
>> > *ring, unsigned entry,  }
>> >
>> >  static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
>> > -			  struct intel_engine_cs *ring,
>> > -			  bool synchronous)
>> > +			  struct intel_engine_cs *ring)
>> >  {
>> >  	int i, ret;
>> >
>> > @@ -255,7 +247,7 @@ static int gen8_mm_switch(struct i915_hw_ppgtt
>> > *ppgtt,
>> >
>> >  	for (i = used_pd - 1; i >= 0; i--) {
>> >  		dma_addr_t addr = ppgtt->pd_dma_addr[i];
>> > -		ret = gen8_write_pdp(ring, i, addr, synchronous);
>> > +		ret = gen8_write_pdp(ring, i, addr);
>> >  		if (ret)
>> >  			return ret;
>> >  	}
>> > @@ -724,29 +716,10 @@ static uint32_t get_pd_offset(struct
>> > i915_hw_ppgtt *ppgtt)  }
>> >
>> >  static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
>> > -			 struct intel_engine_cs *ring,
>> > -			 bool synchronous)
>> > +			 struct intel_engine_cs *ring)
>> >  {
>> > -	struct drm_device *dev = ppgtt->base.dev;
>> > -	struct drm_i915_private *dev_priv = dev->dev_private;
>> >  	int ret;
>> >
>> > -	/* If we're in reset, we can assume the GPU is sufficiently idle to
>> > -	 * manually frob these bits. Ideally we could use the ring functions,
>> > -	 * except our error handling makes it quite difficult (can't use
>> > -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
>> > -	 *
>> > -	 * FIXME: We should try not to special case reset
>> > -	 */
>> > -	if (synchronous ||
>> > -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
>> > -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
>> > -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
>> > -		I915_WRITE(RING_PP_DIR_BASE(ring),
>> get_pd_offset(ppgtt));
>> > -		POSTING_READ(RING_PP_DIR_BASE(ring));
>> > -		return 0;
>> > -	}
>> > -
>> >  	/* NB: TLBs must be flushed and invalidated before a switch */
>> >  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS,
>> I915_GEM_GPU_DOMAINS);
>> >  	if (ret)
>> > @@ -768,29 +741,10 @@ static int hsw_mm_switch(struct i915_hw_ppgtt
>> > *ppgtt,  }
>> >
>> >  static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
>> > -			  struct intel_engine_cs *ring,
>> > -			  bool synchronous)
>> > +			  struct intel_engine_cs *ring)
>> >  {
>> > -	struct drm_device *dev = ppgtt->base.dev;
>> > -	struct drm_i915_private *dev_priv = dev->dev_private;
>> >  	int ret;
>> >
>> > -	/* If we're in reset, we can assume the GPU is sufficiently idle to
>> > -	 * manually frob these bits. Ideally we could use the ring functions,
>> > -	 * except our error handling makes it quite difficult (can't use
>> > -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
>> > -	 *
>> > -	 * FIXME: We should try not to special case reset
>> > -	 */
>> > -	if (synchronous ||
>> > -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
>> > -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
>> > -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
>> > -		I915_WRITE(RING_PP_DIR_BASE(ring),
>> get_pd_offset(ppgtt));
>> > -		POSTING_READ(RING_PP_DIR_BASE(ring));
>> > -		return 0;
>> > -	}
>> > -
>> >  	/* NB: TLBs must be flushed and invalidated before a switch */
>> >  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS,
>> I915_GEM_GPU_DOMAINS);
>> >  	if (ret)
>> > @@ -819,14 +773,11 @@ static int gen7_mm_switch(struct i915_hw_ppgtt
>> > *ppgtt,  }
>> >
>> >  static int gen6_mm_switch(struct i915_hw_ppgtt *ppgtt,
>> > -			  struct intel_engine_cs *ring,
>> > -			  bool synchronous)
>> > +			  struct intel_engine_cs *ring)
>> >  {
>> >  	struct drm_device *dev = ppgtt->base.dev;
>> >  	struct drm_i915_private *dev_priv = dev->dev_private;
>> >
>> > -	if (!synchronous)
>> > -		return 0;
>> >
>> >  	I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
>> >  	I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt)); @@ -
>> 852,7
>> > +803,7 @@ static int gen8_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
>> >  		if (USES_FULL_PPGTT(dev))
>> >  			continue;
>> >
>> > -		ret = ppgtt->switch_mm(ppgtt, ring, true);
>> > +		ret = ppgtt->switch_mm(ppgtt, ring);
>> >  		if (ret)
>> >  			goto err_out;
>> >  	}
>> > @@ -897,7 +848,7 @@ static int gen7_ppgtt_enable(struct i915_hw_ppgtt
>> *ppgtt)
>> >  		if (USES_FULL_PPGTT(dev))
>> >  			continue;
>> >
>> > -		ret = ppgtt->switch_mm(ppgtt, ring, true);
>> > +		ret = ppgtt->switch_mm(ppgtt, ring);
>> >  		if (ret)
>> >  			return ret;
>> >  	}
>> > @@ -926,7 +877,7 @@ static int gen6_ppgtt_enable(struct i915_hw_ppgtt
>> *ppgtt)
>> >  	I915_WRITE(GFX_MODE,
>> _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE));
>> >
>> >  	for_each_ring(ring, dev_priv, i) {
>> > -		int ret = ppgtt->switch_mm(ppgtt, ring, true);
>> > +		int ret = ppgtt->switch_mm(ppgtt, ring);
>> >  		if (ret)
>> >  			return ret;
>> >  	}
>> > diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h
>> > b/drivers/gpu/drm/i915/i915_gem_gtt.h
>> > index 8d6f7c1..bf1e4fc 100644
>> > --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
>> > +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
>> > @@ -262,8 +262,7 @@ struct i915_hw_ppgtt {
>> >
>> >  	int (*enable)(struct i915_hw_ppgtt *ppgtt);
>> >  	int (*switch_mm)(struct i915_hw_ppgtt *ppgtt,
>> > -			 struct intel_engine_cs *ring,
>> > -			 bool synchronous);
>> > +			 struct intel_engine_cs *ring);
>> >  	void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file
>> *m);
>> > };
>> >
>> > --
>> > 2.0.0
>> >
>> > _______________________________________________
>> > Intel-gfx mailing list
>> > Intel-gfx@lists.freedesktop.org
>> > http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH v3] drm/i915: Rework GPU reset sequence to match driver load & thaw
  2014-08-15 17:03       ` Mika Kuoppala
@ 2014-08-15 17:51         ` alistair.mcaulay
  2014-08-19 10:12           ` Mcaulay, Alistair
  2014-08-20 14:46           ` Daniel, Thomas
  0 siblings, 2 replies; 30+ messages in thread
From: alistair.mcaulay @ 2014-08-15 17:51 UTC (permalink / raw)
  To: intel-gfx

From: "McAulay, Alistair" <alistair.mcaulay@intel.com>

This patch is to address Daniel's concerns over different code during reset:

http://lists.freedesktop.org/archives/intel-gfx/2014-June/047758.html

"The reason for aiming as hard as possible to use the exact same code for
driver load, gpu reset and runtime pm/system resume is that we've simply
seen too many bugs due to slight variations and unintended omissions."

Tested using igt drv_hangman.

V2: Cleaner way of preventing check_wedge returning -EAGAIN
V3: Clear the last_context during reset, to ensure do_switch() does the
    MI_SET_CONTEXT, as per review.
Signed-off-by: McAulay, Alistair <alistair.mcaulay@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c         |  6 +++
 drivers/gpu/drm/i915/i915_drv.h         |  3 ++
 drivers/gpu/drm/i915/i915_gem.c         |  4 +-
 drivers/gpu/drm/i915/i915_gem_context.c | 33 +++-------------
 drivers/gpu/drm/i915/i915_gem_gtt.c     | 67 +++++----------------------------
 drivers/gpu/drm/i915/i915_gem_gtt.h     |  3 +-
 6 files changed, 28 insertions(+), 88 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 5e4fefd..3bfafe6 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -806,7 +806,13 @@ int i915_reset(struct drm_device *dev)
 			!dev_priv->ums.mm_suspended) {
 		dev_priv->ums.mm_suspended = 0;
 
+		/* Used to prevent gem_check_wedged returning -EAGAIN during gpu reset */
+		dev_priv->gpu_error.reload_in_reset = true;
+
 		ret = i915_gem_init_hw(dev);
+
+		dev_priv->gpu_error.reload_in_reset = false;
+
 		mutex_unlock(&dev->struct_mutex);
 		if (ret) {
 			DRM_ERROR("Failed hw init on reset %d\n", ret);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 991b663..116daff 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1217,6 +1217,9 @@ struct i915_gpu_error {
 
 	/* For missed irq/seqno simulation. */
 	unsigned int test_irq_rings;
+
+	/* Used to prevent gem_check_wedged returning -EAGAIN during gpu reset   */
+	bool reload_in_reset;
 };
 
 enum modeset_restore {
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index ef047bc..e7396eb 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1085,7 +1085,9 @@ i915_gem_check_wedge(struct i915_gpu_error *error,
 		if (i915_terminally_wedged(error))
 			return -EIO;
 
-		return -EAGAIN;
+		/* Check if GPU Reset is in progress */
+		if (!error->reload_in_reset)
+			return -EAGAIN;
 	}
 
 	return 0;
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index de72a28..9378ad8 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -377,34 +377,17 @@ void i915_gem_context_reset(struct drm_device *dev)
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	int i;
 
-	/* Prevent the hardware from restoring the last context (which hung) on
-	 * the next switch */
 	for (i = 0; i < I915_NUM_RINGS; i++) {
 		struct intel_engine_cs *ring = &dev_priv->ring[i];
-		struct intel_context *dctx = ring->default_context;
 		struct intel_context *lctx = ring->last_context;
 
-		/* Do a fake switch to the default context */
-		if (lctx == dctx)
-			continue;
-
-		if (!lctx)
-			continue;
+		if (lctx) {
+			if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
+				i915_gem_object_ggtt_unpin(lctx->legacy_hw_ctx.rcs_state);
 
-		if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
-			WARN_ON(i915_gem_obj_ggtt_pin(dctx->legacy_hw_ctx.rcs_state,
-						      get_context_alignment(dev), 0));
-			/* Fake a finish/inactive */
-			dctx->legacy_hw_ctx.rcs_state->base.write_domain = 0;
-			dctx->legacy_hw_ctx.rcs_state->active = 0;
+			i915_gem_context_unreference(lctx);
+			ring->last_context = NULL;
 		}
-
-		if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
-			i915_gem_object_ggtt_unpin(lctx->legacy_hw_ctx.rcs_state);
-
-		i915_gem_context_unreference(lctx);
-		i915_gem_context_reference(dctx);
-		ring->last_context = dctx;
 	}
 }
 
@@ -498,10 +481,6 @@ int i915_gem_context_enable(struct drm_i915_private *dev_priv)
 		ppgtt->enable(ppgtt);
 	}
 
-	/* FIXME: We should make this work, even in reset */
-	if (i915_reset_in_progress(&dev_priv->gpu_error))
-		return 0;
-
 	BUG_ON(!dev_priv->ring[RCS].default_context);
 
 	for_each_ring(ring, dev_priv, i) {
@@ -645,7 +624,7 @@ static int do_switch(struct intel_engine_cs *ring,
 	from = ring->last_context;
 
 	if (USES_FULL_PPGTT(ring->dev)) {
-		ret = ppgtt->switch_mm(ppgtt, ring, false);
+		ret = ppgtt->switch_mm(ppgtt, ring);
 		if (ret)
 			goto unpin_out;
 	}
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 5188936..450c8a9 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -216,19 +216,12 @@ static gen6_gtt_pte_t iris_pte_encode(dma_addr_t addr,
 
 /* Broadwell Page Directory Pointer Descriptors */
 static int gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
-			   uint64_t val, bool synchronous)
+			   uint64_t val)
 {
-	struct drm_i915_private *dev_priv = ring->dev->dev_private;
 	int ret;
 
 	BUG_ON(entry >= 4);
 
-	if (synchronous) {
-		I915_WRITE(GEN8_RING_PDP_UDW(ring, entry), val >> 32);
-		I915_WRITE(GEN8_RING_PDP_LDW(ring, entry), (u32)val);
-		return 0;
-	}
-
 	ret = intel_ring_begin(ring, 6);
 	if (ret)
 		return ret;
@@ -245,8 +238,7 @@ static int gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
 }
 
 static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
-			  struct intel_engine_cs *ring,
-			  bool synchronous)
+			  struct intel_engine_cs *ring)
 {
 	int i, ret;
 
@@ -255,7 +247,7 @@ static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
 
 	for (i = used_pd - 1; i >= 0; i--) {
 		dma_addr_t addr = ppgtt->pd_dma_addr[i];
-		ret = gen8_write_pdp(ring, i, addr, synchronous);
+		ret = gen8_write_pdp(ring, i, addr);
 		if (ret)
 			return ret;
 	}
@@ -724,29 +716,10 @@ static uint32_t get_pd_offset(struct i915_hw_ppgtt *ppgtt)
 }
 
 static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
-			 struct intel_engine_cs *ring,
-			 bool synchronous)
+			 struct intel_engine_cs *ring)
 {
-	struct drm_device *dev = ppgtt->base.dev;
-	struct drm_i915_private *dev_priv = dev->dev_private;
 	int ret;
 
-	/* If we're in reset, we can assume the GPU is sufficiently idle to
-	 * manually frob these bits. Ideally we could use the ring functions,
-	 * except our error handling makes it quite difficult (can't use
-	 * intel_ring_begin, ring->flush, or intel_ring_advance)
-	 *
-	 * FIXME: We should try not to special case reset
-	 */
-	if (synchronous ||
-	    i915_reset_in_progress(&dev_priv->gpu_error)) {
-		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
-		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
-		I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt));
-		POSTING_READ(RING_PP_DIR_BASE(ring));
-		return 0;
-	}
-
 	/* NB: TLBs must be flushed and invalidated before a switch */
 	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, I915_GEM_GPU_DOMAINS);
 	if (ret)
@@ -768,29 +741,10 @@ static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
 }
 
 static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
-			  struct intel_engine_cs *ring,
-			  bool synchronous)
+			  struct intel_engine_cs *ring)
 {
-	struct drm_device *dev = ppgtt->base.dev;
-	struct drm_i915_private *dev_priv = dev->dev_private;
 	int ret;
 
-	/* If we're in reset, we can assume the GPU is sufficiently idle to
-	 * manually frob these bits. Ideally we could use the ring functions,
-	 * except our error handling makes it quite difficult (can't use
-	 * intel_ring_begin, ring->flush, or intel_ring_advance)
-	 *
-	 * FIXME: We should try not to special case reset
-	 */
-	if (synchronous ||
-	    i915_reset_in_progress(&dev_priv->gpu_error)) {
-		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
-		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
-		I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt));
-		POSTING_READ(RING_PP_DIR_BASE(ring));
-		return 0;
-	}
-
 	/* NB: TLBs must be flushed and invalidated before a switch */
 	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, I915_GEM_GPU_DOMAINS);
 	if (ret)
@@ -819,14 +773,11 @@ static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
 }
 
 static int gen6_mm_switch(struct i915_hw_ppgtt *ppgtt,
-			  struct intel_engine_cs *ring,
-			  bool synchronous)
+			  struct intel_engine_cs *ring)
 {
 	struct drm_device *dev = ppgtt->base.dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
-	if (!synchronous)
-		return 0;
 
 	I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
 	I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt));
@@ -852,7 +803,7 @@ static int gen8_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
 		if (USES_FULL_PPGTT(dev))
 			continue;
 
-		ret = ppgtt->switch_mm(ppgtt, ring, true);
+		ret = ppgtt->switch_mm(ppgtt, ring);
 		if (ret)
 			goto err_out;
 	}
@@ -897,7 +848,7 @@ static int gen7_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
 		if (USES_FULL_PPGTT(dev))
 			continue;
 
-		ret = ppgtt->switch_mm(ppgtt, ring, true);
+		ret = ppgtt->switch_mm(ppgtt, ring);
 		if (ret)
 			return ret;
 	}
@@ -926,7 +877,7 @@ static int gen6_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
 	I915_WRITE(GFX_MODE, _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE));
 
 	for_each_ring(ring, dev_priv, i) {
-		int ret = ppgtt->switch_mm(ppgtt, ring, true);
+		int ret = ppgtt->switch_mm(ppgtt, ring);
 		if (ret)
 			return ret;
 	}
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
index 8d6f7c1..bf1e4fc 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.h
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
@@ -262,8 +262,7 @@ struct i915_hw_ppgtt {
 
 	int (*enable)(struct i915_hw_ppgtt *ppgtt);
 	int (*switch_mm)(struct i915_hw_ppgtt *ppgtt,
-			 struct intel_engine_cs *ring,
-			 bool synchronous);
+			 struct intel_engine_cs *ring);
 	void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file *m);
 };
 
-- 
2.0.4

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [PATCH v3] drm/i915: Rework GPU reset sequence to match driver load & thaw
  2014-08-15 17:51         ` [PATCH v3] " alistair.mcaulay
@ 2014-08-19 10:12           ` Mcaulay, Alistair
  2014-08-19 12:35             ` Mika Kuoppala
  2014-08-20 14:46           ` Daniel, Thomas
  1 sibling, 1 reply; 30+ messages in thread
From: Mcaulay, Alistair @ 2014-08-19 10:12 UTC (permalink / raw)
  To: Mika Kuoppala (mika.kuoppala@linux.intel.com), intel-gfx

Hi Mika,

Can you please review this patch and verify that it fixes the issues raised in your previous review?

Thanks,
Alistair.

> -----Original Message-----
> From: Mcaulay, Alistair
> Sent: Friday, August 15, 2014 6:52 PM
> To: intel-gfx@lists.freedesktop.org
> Cc: Mcaulay, Alistair
> Subject: [PATCH v3] drm/i915: Rework GPU reset sequence to match driver
> load & thaw
> 
> From: "McAulay, Alistair" <alistair.mcaulay@intel.com>
> 
> This patch is to address Daniels concerns over different code during reset:
> 
> http://lists.freedesktop.org/archives/intel-gfx/2014-June/047758.html
> 
> "The reason for aiming as hard as possible to use the exact same code for
> driver load, gpu reset and runtime pm/system resume is that we've simply
> seen too many bugs due to slight variations and unintended omissions."
> 
> Tested using igt drv_hangman.
> 
> V2: Cleaner way of preventing check_wedge returning -EAGAIN
> V3: Clean the last_context during reset, to ensure do_switch() does the
> MI_SET_CONTEXT. As per review.
> Signed-off-by: McAulay, Alistair <alistair.mcaulay@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.c         |  6 +++
>  drivers/gpu/drm/i915/i915_drv.h         |  3 ++
>  drivers/gpu/drm/i915/i915_gem.c         |  4 +-
>  drivers/gpu/drm/i915/i915_gem_context.c | 33 +++-------------
>  drivers/gpu/drm/i915/i915_gem_gtt.c     | 67 +++++----------------------------
>  drivers/gpu/drm/i915/i915_gem_gtt.h     |  3 +-
>  6 files changed, 28 insertions(+), 88 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.c
> b/drivers/gpu/drm/i915/i915_drv.c index 5e4fefd..3bfafe6 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -806,7 +806,13 @@ int i915_reset(struct drm_device *dev)
>  			!dev_priv->ums.mm_suspended) {
>  		dev_priv->ums.mm_suspended = 0;
> 
> +		/* Used to prevent gem_check_wedged returning -EAGAIN
> during gpu reset */
> +		dev_priv->gpu_error.reload_in_reset = true;
> +
>  		ret = i915_gem_init_hw(dev);
> +
> +		dev_priv->gpu_error.reload_in_reset = false;
> +
>  		mutex_unlock(&dev->struct_mutex);
>  		if (ret) {
>  			DRM_ERROR("Failed hw init on reset %d\n", ret); diff
> --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 991b663..116daff 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1217,6 +1217,9 @@ struct i915_gpu_error {
> 
>  	/* For missed irq/seqno simulation. */
>  	unsigned int test_irq_rings;
> +
> +	/* Used to prevent gem_check_wedged returning -EAGAIN during
> gpu reset   */
> +	bool reload_in_reset;
>  };
> 
>  enum modeset_restore {
> diff --git a/drivers/gpu/drm/i915/i915_gem.c
> b/drivers/gpu/drm/i915/i915_gem.c index ef047bc..e7396eb 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1085,7 +1085,9 @@ i915_gem_check_wedge(struct i915_gpu_error
> *error,
>  		if (i915_terminally_wedged(error))
>  			return -EIO;
> 
> -		return -EAGAIN;
> +		/* Check if GPU Reset is in progress */
> +		if (!error->reload_in_reset)
> +			return -EAGAIN;
>  	}
> 
>  	return 0;
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c
> b/drivers/gpu/drm/i915/i915_gem_context.c
> index de72a28..9378ad8 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -377,34 +377,17 @@ void i915_gem_context_reset(struct drm_device
> *dev)
>  	struct drm_i915_private *dev_priv = dev->dev_private;
>  	int i;
> 
> -	/* Prevent the hardware from restoring the last context (which
> hung) on
> -	 * the next switch */
>  	for (i = 0; i < I915_NUM_RINGS; i++) {
>  		struct intel_engine_cs *ring = &dev_priv->ring[i];
> -		struct intel_context *dctx = ring->default_context;
>  		struct intel_context *lctx = ring->last_context;
> 
> -		/* Do a fake switch to the default context */
> -		if (lctx == dctx)
> -			continue;
> -
> -		if (!lctx)
> -			continue;
> +		if (lctx) {
> +			if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
> +				i915_gem_object_ggtt_unpin(lctx-
> >legacy_hw_ctx.rcs_state);
> 
> -		if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
> -			WARN_ON(i915_gem_obj_ggtt_pin(dctx-
> >legacy_hw_ctx.rcs_state,
> -
> get_context_alignment(dev), 0));
> -			/* Fake a finish/inactive */
> -			dctx->legacy_hw_ctx.rcs_state->base.write_domain
> = 0;
> -			dctx->legacy_hw_ctx.rcs_state->active = 0;
> +			i915_gem_context_unreference(lctx);
> +			ring->last_context = NULL;
>  		}
> -
> -		if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
> -			i915_gem_object_ggtt_unpin(lctx-
> >legacy_hw_ctx.rcs_state);
> -
> -		i915_gem_context_unreference(lctx);
> -		i915_gem_context_reference(dctx);
> -		ring->last_context = dctx;
>  	}
>  }
> 
> @@ -498,10 +481,6 @@ int i915_gem_context_enable(struct
> drm_i915_private *dev_priv)
>  		ppgtt->enable(ppgtt);
>  	}
> 
> -	/* FIXME: We should make this work, even in reset */
> -	if (i915_reset_in_progress(&dev_priv->gpu_error))
> -		return 0;
> -
>  	BUG_ON(!dev_priv->ring[RCS].default_context);
> 
>  	for_each_ring(ring, dev_priv, i) {
> @@ -645,7 +624,7 @@ static int do_switch(struct intel_engine_cs *ring,
>  	from = ring->last_context;
> 
>  	if (USES_FULL_PPGTT(ring->dev)) {
> -		ret = ppgtt->switch_mm(ppgtt, ring, false);
> +		ret = ppgtt->switch_mm(ppgtt, ring);
>  		if (ret)
>  			goto unpin_out;
>  	}
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c
> b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index 5188936..450c8a9 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -216,19 +216,12 @@ static gen6_gtt_pte_t
> iris_pte_encode(dma_addr_t addr,
> 
>  /* Broadwell Page Directory Pointer Descriptors */  static int
> gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
> -			   uint64_t val, bool synchronous)
> +			   uint64_t val)
>  {
> -	struct drm_i915_private *dev_priv = ring->dev->dev_private;
>  	int ret;
> 
>  	BUG_ON(entry >= 4);
> 
> -	if (synchronous) {
> -		I915_WRITE(GEN8_RING_PDP_UDW(ring, entry), val >> 32);
> -		I915_WRITE(GEN8_RING_PDP_LDW(ring, entry), (u32)val);
> -		return 0;
> -	}
> -
>  	ret = intel_ring_begin(ring, 6);
>  	if (ret)
>  		return ret;
> @@ -245,8 +238,7 @@ static int gen8_write_pdp(struct intel_engine_cs
> *ring, unsigned entry,  }
> 
>  static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
> -			  struct intel_engine_cs *ring,
> -			  bool synchronous)
> +			  struct intel_engine_cs *ring)
>  {
>  	int i, ret;
> 
> @@ -255,7 +247,7 @@ static int gen8_mm_switch(struct i915_hw_ppgtt
> *ppgtt,
> 
>  	for (i = used_pd - 1; i >= 0; i--) {
>  		dma_addr_t addr = ppgtt->pd_dma_addr[i];
> -		ret = gen8_write_pdp(ring, i, addr, synchronous);
> +		ret = gen8_write_pdp(ring, i, addr);
>  		if (ret)
>  			return ret;
>  	}
> @@ -724,29 +716,10 @@ static uint32_t get_pd_offset(struct i915_hw_ppgtt
> *ppgtt)  }
> 
>  static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
> -			 struct intel_engine_cs *ring,
> -			 bool synchronous)
> +			 struct intel_engine_cs *ring)
>  {
> -	struct drm_device *dev = ppgtt->base.dev;
> -	struct drm_i915_private *dev_priv = dev->dev_private;
>  	int ret;
> 
> -	/* If we're in reset, we can assume the GPU is sufficiently idle to
> -	 * manually frob these bits. Ideally we could use the ring functions,
> -	 * except our error handling makes it quite difficult (can't use
> -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
> -	 *
> -	 * FIXME: We should try not to special case reset
> -	 */
> -	if (synchronous ||
> -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
> -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> -		I915_WRITE(RING_PP_DIR_BASE(ring),
> get_pd_offset(ppgtt));
> -		POSTING_READ(RING_PP_DIR_BASE(ring));
> -		return 0;
> -	}
> -
>  	/* NB: TLBs must be flushed and invalidated before a switch */
>  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS,
> I915_GEM_GPU_DOMAINS);
>  	if (ret)
> @@ -768,29 +741,10 @@ static int hsw_mm_switch(struct i915_hw_ppgtt
> *ppgtt,  }
> 
>  static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
> -			  struct intel_engine_cs *ring,
> -			  bool synchronous)
> +			  struct intel_engine_cs *ring)
>  {
> -	struct drm_device *dev = ppgtt->base.dev;
> -	struct drm_i915_private *dev_priv = dev->dev_private;
>  	int ret;
> 
> -	/* If we're in reset, we can assume the GPU is sufficiently idle to
> -	 * manually frob these bits. Ideally we could use the ring functions,
> -	 * except our error handling makes it quite difficult (can't use
> -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
> -	 *
> -	 * FIXME: We should try not to special case reset
> -	 */
> -	if (synchronous ||
> -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
> -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> -		I915_WRITE(RING_PP_DIR_BASE(ring),
> get_pd_offset(ppgtt));
> -		POSTING_READ(RING_PP_DIR_BASE(ring));
> -		return 0;
> -	}
> -
>  	/* NB: TLBs must be flushed and invalidated before a switch */
>  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS,
> I915_GEM_GPU_DOMAINS);
>  	if (ret)
> @@ -819,14 +773,11 @@ static int gen7_mm_switch(struct i915_hw_ppgtt
> *ppgtt,  }
> 
>  static int gen6_mm_switch(struct i915_hw_ppgtt *ppgtt,
> -			  struct intel_engine_cs *ring,
> -			  bool synchronous)
> +			  struct intel_engine_cs *ring)
>  {
>  	struct drm_device *dev = ppgtt->base.dev;
>  	struct drm_i915_private *dev_priv = dev->dev_private;
> 
> -	if (!synchronous)
> -		return 0;
> 
>  	I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
>  	I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt)); @@ -
> 852,7 +803,7 @@ static int gen8_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
>  		if (USES_FULL_PPGTT(dev))
>  			continue;
> 
> -		ret = ppgtt->switch_mm(ppgtt, ring, true);
> +		ret = ppgtt->switch_mm(ppgtt, ring);
>  		if (ret)
>  			goto err_out;
>  	}
> @@ -897,7 +848,7 @@ static int gen7_ppgtt_enable(struct i915_hw_ppgtt
> *ppgtt)
>  		if (USES_FULL_PPGTT(dev))
>  			continue;
> 
> -		ret = ppgtt->switch_mm(ppgtt, ring, true);
> +		ret = ppgtt->switch_mm(ppgtt, ring);
>  		if (ret)
>  			return ret;
>  	}
> @@ -926,7 +877,7 @@ static int gen6_ppgtt_enable(struct i915_hw_ppgtt
> *ppgtt)
>  	I915_WRITE(GFX_MODE,
> _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE));
> 
>  	for_each_ring(ring, dev_priv, i) {
> -		int ret = ppgtt->switch_mm(ppgtt, ring, true);
> +		int ret = ppgtt->switch_mm(ppgtt, ring);
>  		if (ret)
>  			return ret;
>  	}
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h
> b/drivers/gpu/drm/i915/i915_gem_gtt.h
> index 8d6f7c1..bf1e4fc 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
> @@ -262,8 +262,7 @@ struct i915_hw_ppgtt {
> 
>  	int (*enable)(struct i915_hw_ppgtt *ppgtt);
>  	int (*switch_mm)(struct i915_hw_ppgtt *ppgtt,
> -			 struct intel_engine_cs *ring,
> -			 bool synchronous);
> +			 struct intel_engine_cs *ring);
>  	void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file
> *m);  };
> 
> --
> 2.0.4

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v3] drm/i915: Rework GPU reset sequence to match driver load & thaw
  2014-08-19 10:12           ` Mcaulay, Alistair
@ 2014-08-19 12:35             ` Mika Kuoppala
  2014-08-21 12:38               ` Mcaulay, Alistair
  0 siblings, 1 reply; 30+ messages in thread
From: Mika Kuoppala @ 2014-08-19 12:35 UTC (permalink / raw)
  To: Mcaulay, Alistair, intel-gfx

"Mcaulay, Alistair" <alistair.mcaulay@intel.com> writes:

> Hi Mika,
>
> can you please review this patch, and verify it fixes the issues in your previous review.
>
> Thanks,
> Alistair.
>
>> -----Original Message-----
>> From: Mcaulay, Alistair
>> Sent: Friday, August 15, 2014 6:52 PM
>> To: intel-gfx@lists.freedesktop.org
>> Cc: Mcaulay, Alistair
>> Subject: [PATCH v3] drm/i915: Rework GPU reset sequence to match driver
>> load & thaw
>> 
>> From: "McAulay, Alistair" <alistair.mcaulay@intel.com>
>> 
>> This patch is to address Daniels concerns over different code during reset:
>> 
>> http://lists.freedesktop.org/archives/intel-gfx/2014-June/047758.html
>> 
>> "The reason for aiming as hard as possible to use the exact same code for
>> driver load, gpu reset and runtime pm/system resume is that we've simply
>> seen too many bugs due to slight variations and unintended omissions."
>> 
>> Tested using igt drv_hangman.
>> 
>> V2: Cleaner way of preventing check_wedge returning -EAGAIN
>> V3: Clean the last_context during reset, to ensure do_switch() does the
>> MI_SET_CONTEXT. As per review.
>> Signed-off-by: McAulay, Alistair <alistair.mcaulay@intel.com>
>> ---

Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>

>>  drivers/gpu/drm/i915/i915_drv.c         |  6 +++
>>  drivers/gpu/drm/i915/i915_drv.h         |  3 ++
>>  drivers/gpu/drm/i915/i915_gem.c         |  4 +-
>>  drivers/gpu/drm/i915/i915_gem_context.c | 33 +++-------------
>>  drivers/gpu/drm/i915/i915_gem_gtt.c     | 67 +++++----------------------------
>>  drivers/gpu/drm/i915/i915_gem_gtt.h     |  3 +-
>>  6 files changed, 28 insertions(+), 88 deletions(-)
>> 
>> diff --git a/drivers/gpu/drm/i915/i915_drv.c
>> b/drivers/gpu/drm/i915/i915_drv.c index 5e4fefd..3bfafe6 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.c
>> +++ b/drivers/gpu/drm/i915/i915_drv.c
>> @@ -806,7 +806,13 @@ int i915_reset(struct drm_device *dev)
>>  			!dev_priv->ums.mm_suspended) {
>>  		dev_priv->ums.mm_suspended = 0;
>> 
>> +		/* Used to prevent gem_check_wedged returning -EAGAIN
>> during gpu reset */
>> +		dev_priv->gpu_error.reload_in_reset = true;
>> +
>>  		ret = i915_gem_init_hw(dev);
>> +
>> +		dev_priv->gpu_error.reload_in_reset = false;
>> +
>>  		mutex_unlock(&dev->struct_mutex);
>>  		if (ret) {
>>  			DRM_ERROR("Failed hw init on reset %d\n", ret); diff
>> --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>> index 991b663..116daff 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -1217,6 +1217,9 @@ struct i915_gpu_error {
>> 
>>  	/* For missed irq/seqno simulation. */
>>  	unsigned int test_irq_rings;
>> +
>> +	/* Used to prevent gem_check_wedged returning -EAGAIN during
>> gpu reset   */
>> +	bool reload_in_reset;
>>  };
>> 
>>  enum modeset_restore {
>> diff --git a/drivers/gpu/drm/i915/i915_gem.c
>> b/drivers/gpu/drm/i915/i915_gem.c index ef047bc..e7396eb 100644
>> --- a/drivers/gpu/drm/i915/i915_gem.c
>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>> @@ -1085,7 +1085,9 @@ i915_gem_check_wedge(struct i915_gpu_error
>> *error,
>>  		if (i915_terminally_wedged(error))
>>  			return -EIO;
>> 
>> -		return -EAGAIN;
>> +		/* Check if GPU Reset is in progress */
>> +		if (!error->reload_in_reset)
>> +			return -EAGAIN;
>>  	}
>> 
>>  	return 0;
>> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c
>> b/drivers/gpu/drm/i915/i915_gem_context.c
>> index de72a28..9378ad8 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_context.c
>> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
>> @@ -377,34 +377,17 @@ void i915_gem_context_reset(struct drm_device
>> *dev)
>>  	struct drm_i915_private *dev_priv = dev->dev_private;
>>  	int i;
>> 
>> -	/* Prevent the hardware from restoring the last context (which
>> hung) on
>> -	 * the next switch */
>>  	for (i = 0; i < I915_NUM_RINGS; i++) {
>>  		struct intel_engine_cs *ring = &dev_priv->ring[i];
>> -		struct intel_context *dctx = ring->default_context;
>>  		struct intel_context *lctx = ring->last_context;
>> 
>> -		/* Do a fake switch to the default context */
>> -		if (lctx == dctx)
>> -			continue;
>> -
>> -		if (!lctx)
>> -			continue;
>> +		if (lctx) {
>> +			if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
>> +				i915_gem_object_ggtt_unpin(lctx-
>> >legacy_hw_ctx.rcs_state);
>> 
>> -		if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
>> -			WARN_ON(i915_gem_obj_ggtt_pin(dctx-
>> >legacy_hw_ctx.rcs_state,
>> -
>> get_context_alignment(dev), 0));
>> -			/* Fake a finish/inactive */
>> -			dctx->legacy_hw_ctx.rcs_state->base.write_domain
>> = 0;
>> -			dctx->legacy_hw_ctx.rcs_state->active = 0;
>> +			i915_gem_context_unreference(lctx);
>> +			ring->last_context = NULL;
>>  		}
>> -
>> -		if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
>> -			i915_gem_object_ggtt_unpin(lctx-
>> >legacy_hw_ctx.rcs_state);
>> -
>> -		i915_gem_context_unreference(lctx);
>> -		i915_gem_context_reference(dctx);
>> -		ring->last_context = dctx;
>>  	}
>>  }
>> 
>> @@ -498,10 +481,6 @@ int i915_gem_context_enable(struct
>> drm_i915_private *dev_priv)
>>  		ppgtt->enable(ppgtt);
>>  	}
>> 
>> -	/* FIXME: We should make this work, even in reset */
>> -	if (i915_reset_in_progress(&dev_priv->gpu_error))
>> -		return 0;
>> -
>>  	BUG_ON(!dev_priv->ring[RCS].default_context);
>> 
>>  	for_each_ring(ring, dev_priv, i) {
>> @@ -645,7 +624,7 @@ static int do_switch(struct intel_engine_cs *ring,
>>  	from = ring->last_context;
>> 
>>  	if (USES_FULL_PPGTT(ring->dev)) {
>> -		ret = ppgtt->switch_mm(ppgtt, ring, false);
>> +		ret = ppgtt->switch_mm(ppgtt, ring);
>>  		if (ret)
>>  			goto unpin_out;
>>  	}
>> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c
>> b/drivers/gpu/drm/i915/i915_gem_gtt.c
>> index 5188936..450c8a9 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
>> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
>> @@ -216,19 +216,12 @@ static gen6_gtt_pte_t
>> iris_pte_encode(dma_addr_t addr,
>> 
>>  /* Broadwell Page Directory Pointer Descriptors */  static int
>> gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
>> -			   uint64_t val, bool synchronous)
>> +			   uint64_t val)
>>  {
>> -	struct drm_i915_private *dev_priv = ring->dev->dev_private;
>>  	int ret;
>> 
>>  	BUG_ON(entry >= 4);
>> 
>> -	if (synchronous) {
>> -		I915_WRITE(GEN8_RING_PDP_UDW(ring, entry), val >> 32);
>> -		I915_WRITE(GEN8_RING_PDP_LDW(ring, entry), (u32)val);
>> -		return 0;
>> -	}
>> -
>>  	ret = intel_ring_begin(ring, 6);
>>  	if (ret)
>>  		return ret;
>> @@ -245,8 +238,7 @@ static int gen8_write_pdp(struct intel_engine_cs
>> *ring, unsigned entry,  }
>> 
>>  static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
>> -			  struct intel_engine_cs *ring,
>> -			  bool synchronous)
>> +			  struct intel_engine_cs *ring)
>>  {
>>  	int i, ret;
>> 
>> @@ -255,7 +247,7 @@ static int gen8_mm_switch(struct i915_hw_ppgtt
>> *ppgtt,
>> 
>>  	for (i = used_pd - 1; i >= 0; i--) {
>>  		dma_addr_t addr = ppgtt->pd_dma_addr[i];
>> -		ret = gen8_write_pdp(ring, i, addr, synchronous);
>> +		ret = gen8_write_pdp(ring, i, addr);
>>  		if (ret)
>>  			return ret;
>>  	}
>> @@ -724,29 +716,10 @@ static uint32_t get_pd_offset(struct i915_hw_ppgtt
>> *ppgtt)  }
>> 
>>  static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
>> -			 struct intel_engine_cs *ring,
>> -			 bool synchronous)
>> +			 struct intel_engine_cs *ring)
>>  {
>> -	struct drm_device *dev = ppgtt->base.dev;
>> -	struct drm_i915_private *dev_priv = dev->dev_private;
>>  	int ret;
>> 
>> -	/* If we're in reset, we can assume the GPU is sufficiently idle to
>> -	 * manually frob these bits. Ideally we could use the ring functions,
>> -	 * except our error handling makes it quite difficult (can't use
>> -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
>> -	 *
>> -	 * FIXME: We should try not to special case reset
>> -	 */
>> -	if (synchronous ||
>> -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
>> -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
>> -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
>> -		I915_WRITE(RING_PP_DIR_BASE(ring),
>> get_pd_offset(ppgtt));
>> -		POSTING_READ(RING_PP_DIR_BASE(ring));
>> -		return 0;
>> -	}
>> -
>>  	/* NB: TLBs must be flushed and invalidated before a switch */
>>  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS,
>> I915_GEM_GPU_DOMAINS);
>>  	if (ret)
>> @@ -768,29 +741,10 @@ static int hsw_mm_switch(struct i915_hw_ppgtt
>> *ppgtt,  }
>> 
>>  static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
>> -			  struct intel_engine_cs *ring,
>> -			  bool synchronous)
>> +			  struct intel_engine_cs *ring)
>>  {
>> -	struct drm_device *dev = ppgtt->base.dev;
>> -	struct drm_i915_private *dev_priv = dev->dev_private;
>>  	int ret;
>> 
>> -	/* If we're in reset, we can assume the GPU is sufficiently idle to
>> -	 * manually frob these bits. Ideally we could use the ring functions,
>> -	 * except our error handling makes it quite difficult (can't use
>> -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
>> -	 *
>> -	 * FIXME: We should try not to special case reset
>> -	 */
>> -	if (synchronous ||
>> -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
>> -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
>> -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
>> -		I915_WRITE(RING_PP_DIR_BASE(ring),
>> get_pd_offset(ppgtt));
>> -		POSTING_READ(RING_PP_DIR_BASE(ring));
>> -		return 0;
>> -	}
>> -
>>  	/* NB: TLBs must be flushed and invalidated before a switch */
>>  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS,
>> I915_GEM_GPU_DOMAINS);
>>  	if (ret)
>> @@ -819,14 +773,11 @@ static int gen7_mm_switch(struct i915_hw_ppgtt
>> *ppgtt,  }
>> 
>>  static int gen6_mm_switch(struct i915_hw_ppgtt *ppgtt,
>> -			  struct intel_engine_cs *ring,
>> -			  bool synchronous)
>> +			  struct intel_engine_cs *ring)
>>  {
>>  	struct drm_device *dev = ppgtt->base.dev;
>>  	struct drm_i915_private *dev_priv = dev->dev_private;
>> 
>> -	if (!synchronous)
>> -		return 0;
>> 
>>  	I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
>>  	I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt)); @@ -
>> 852,7 +803,7 @@ static int gen8_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
>>  		if (USES_FULL_PPGTT(dev))
>>  			continue;
>> 
>> -		ret = ppgtt->switch_mm(ppgtt, ring, true);
>> +		ret = ppgtt->switch_mm(ppgtt, ring);
>>  		if (ret)
>>  			goto err_out;
>>  	}
>> @@ -897,7 +848,7 @@ static int gen7_ppgtt_enable(struct i915_hw_ppgtt
>> *ppgtt)
>>  		if (USES_FULL_PPGTT(dev))
>>  			continue;
>> 
>> -		ret = ppgtt->switch_mm(ppgtt, ring, true);
>> +		ret = ppgtt->switch_mm(ppgtt, ring);
>>  		if (ret)
>>  			return ret;
>>  	}
>> @@ -926,7 +877,7 @@ static int gen6_ppgtt_enable(struct i915_hw_ppgtt
>> *ppgtt)
>>  	I915_WRITE(GFX_MODE,
>> _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE));
>> 
>>  	for_each_ring(ring, dev_priv, i) {
>> -		int ret = ppgtt->switch_mm(ppgtt, ring, true);
>> +		int ret = ppgtt->switch_mm(ppgtt, ring);
>>  		if (ret)
>>  			return ret;
>>  	}
>> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h
>> b/drivers/gpu/drm/i915/i915_gem_gtt.h
>> index 8d6f7c1..bf1e4fc 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
>> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
>> @@ -262,8 +262,7 @@ struct i915_hw_ppgtt {
>> 
>>  	int (*enable)(struct i915_hw_ppgtt *ppgtt);
>>  	int (*switch_mm)(struct i915_hw_ppgtt *ppgtt,
>> -			 struct intel_engine_cs *ring,
>> -			 bool synchronous);
>> +			 struct intel_engine_cs *ring);
>>  	void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file
>> *m);  };
>> 
>> --
>> 2.0.4

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v3] drm/i915: Rework GPU reset sequence to match driver load & thaw
  2014-08-15 17:51         ` [PATCH v3] " alistair.mcaulay
  2014-08-19 10:12           ` Mcaulay, Alistair
@ 2014-08-20 14:46           ` Daniel, Thomas
  2014-08-20 14:58             ` Chris Wilson
  1 sibling, 1 reply; 30+ messages in thread
From: Daniel, Thomas @ 2014-08-20 14:46 UTC (permalink / raw)
  To: Mcaulay, Alistair, intel-gfx



> -----Original Message-----
> From: Intel-gfx [mailto:intel-gfx-bounces@lists.freedesktop.org] On Behalf
> Of alistair.mcaulay@intel.com
> Sent: Friday, August 15, 2014 6:52 PM
> To: intel-gfx@lists.freedesktop.org
> Subject: [Intel-gfx] [PATCH v3] drm/i915: Rework GPU reset sequence to
> match driver load & thaw
> 
> From: "McAulay, Alistair" <alistair.mcaulay@intel.com>
> 
> This patch is to address Daniels concerns over different code during reset:
> 
> http://lists.freedesktop.org/archives/intel-gfx/2014-June/047758.html
> 
> "The reason for aiming as hard as possible to use the exact same code for
> driver load, gpu reset and runtime pm/system resume is that we've simply
> seen too many bugs due to slight variations and unintended omissions."
> 
> Tested using igt drv_hangman.
> 
> V2: Cleaner way of preventing check_wedge returning -EAGAIN
> V3: Clean the last_context during reset, to ensure do_switch() does the
> MI_SET_CONTEXT. As per review.
> Signed-off-by: McAulay, Alistair <alistair.mcaulay@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.c         |  6 +++
>  drivers/gpu/drm/i915/i915_drv.h         |  3 ++
>  drivers/gpu/drm/i915/i915_gem.c         |  4 +-
>  drivers/gpu/drm/i915/i915_gem_context.c | 33 +++-------------
>  drivers/gpu/drm/i915/i915_gem_gtt.c     | 67 +++++----------------------------
>  drivers/gpu/drm/i915/i915_gem_gtt.h     |  3 +-
>  6 files changed, 28 insertions(+), 88 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.c
> b/drivers/gpu/drm/i915/i915_drv.c index 5e4fefd..3bfafe6 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -806,7 +806,13 @@ int i915_reset(struct drm_device *dev)
>  			!dev_priv->ums.mm_suspended) {
>  		dev_priv->ums.mm_suspended = 0;
> 
> +		/* Used to prevent gem_check_wedged returning -EAGAIN
> during gpu reset */
> +		dev_priv->gpu_error.reload_in_reset = true;
> +
>  		ret = i915_gem_init_hw(dev);
> +
> +		dev_priv->gpu_error.reload_in_reset = false;
> +
>  		mutex_unlock(&dev->struct_mutex);
>  		if (ret) {
>  			DRM_ERROR("Failed hw init on reset %d\n", ret); diff
> --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 991b663..116daff 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1217,6 +1217,9 @@ struct i915_gpu_error {
> 
>  	/* For missed irq/seqno simulation. */
>  	unsigned int test_irq_rings;
> +
> +	/* Used to prevent gem_check_wedged returning -EAGAIN during
> gpu reset   */
> +	bool reload_in_reset;
>  };
> 
>  enum modeset_restore {
> diff --git a/drivers/gpu/drm/i915/i915_gem.c
> b/drivers/gpu/drm/i915/i915_gem.c index ef047bc..e7396eb 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1085,7 +1085,9 @@ i915_gem_check_wedge(struct i915_gpu_error
> *error,
>  		if (i915_terminally_wedged(error))
>  			return -EIO;
> 
> -		return -EAGAIN;
> +		/* Check if GPU Reset is in progress */
> +		if (!error->reload_in_reset)
> +			return -EAGAIN;
>  	}
> 
>  	return 0;
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c
> b/drivers/gpu/drm/i915/i915_gem_context.c
> index de72a28..9378ad8 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -377,34 +377,17 @@ void i915_gem_context_reset(struct drm_device
> *dev)
>  	struct drm_i915_private *dev_priv = dev->dev_private;
>  	int i;
> 
> -	/* Prevent the hardware from restoring the last context (which
> hung) on
> -	 * the next switch */
>  	for (i = 0; i < I915_NUM_RINGS; i++) {
>  		struct intel_engine_cs *ring = &dev_priv->ring[i];
> -		struct intel_context *dctx = ring->default_context;
>  		struct intel_context *lctx = ring->last_context;
> 
> -		/* Do a fake switch to the default context */
> -		if (lctx == dctx)
> -			continue;
> -
> -		if (!lctx)
> -			continue;
> +		if (lctx) {
> +			if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
> +				i915_gem_object_ggtt_unpin(lctx-
> >legacy_hw_ctx.rcs_state);
> 
> -		if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
> -			WARN_ON(i915_gem_obj_ggtt_pin(dctx-
> >legacy_hw_ctx.rcs_state,
> -
> get_context_alignment(dev), 0));
> -			/* Fake a finish/inactive */
> -			dctx->legacy_hw_ctx.rcs_state->base.write_domain
> = 0;
> -			dctx->legacy_hw_ctx.rcs_state->active = 0;
> +			i915_gem_context_unreference(lctx);
> +			ring->last_context = NULL;
>  		}
> -
> -		if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
> -			i915_gem_object_ggtt_unpin(lctx-
> >legacy_hw_ctx.rcs_state);
> -
> -		i915_gem_context_unreference(lctx);
> -		i915_gem_context_reference(dctx);
> -		ring->last_context = dctx;
>  	}
>  }
> 
> @@ -498,10 +481,6 @@ int i915_gem_context_enable(struct
> drm_i915_private *dev_priv)
>  		ppgtt->enable(ppgtt);
>  	}
> 
> -	/* FIXME: We should make this work, even in reset */
> -	if (i915_reset_in_progress(&dev_priv->gpu_error))
> -		return 0;
> -
>  	BUG_ON(!dev_priv->ring[RCS].default_context);
> 
>  	for_each_ring(ring, dev_priv, i) {
> @@ -645,7 +624,7 @@ static int do_switch(struct intel_engine_cs *ring,
>  	from = ring->last_context;
> 
>  	if (USES_FULL_PPGTT(ring->dev)) {
> -		ret = ppgtt->switch_mm(ppgtt, ring, false);
> +		ret = ppgtt->switch_mm(ppgtt, ring);
>  		if (ret)
>  			goto unpin_out;
>  	}
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c
> b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index 5188936..450c8a9 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -216,19 +216,12 @@ static gen6_gtt_pte_t
> iris_pte_encode(dma_addr_t addr,
> 
>  /* Broadwell Page Directory Pointer Descriptors */  static int
> gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
> -			   uint64_t val, bool synchronous)
> +			   uint64_t val)
>  {
> -	struct drm_i915_private *dev_priv = ring->dev->dev_private;
>  	int ret;
> 
>  	BUG_ON(entry >= 4);
> 
> -	if (synchronous) {
> -		I915_WRITE(GEN8_RING_PDP_UDW(ring, entry), val >> 32);
> -		I915_WRITE(GEN8_RING_PDP_LDW(ring, entry), (u32)val);
> -		return 0;
> -	}
Removing the synchronous path breaks execlist mode, because gen8_write_pdp()
will then always try to write directly to the ring.  I will post a patch to
fix this by no-oping i915_ppgtt_init() for execlists.

Thomas.

> -
>  	ret = intel_ring_begin(ring, 6);
>  	if (ret)
>  		return ret;
> @@ -245,8 +238,7 @@ static int gen8_write_pdp(struct intel_engine_cs
> *ring, unsigned entry,  }
> 
>  static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
> -			  struct intel_engine_cs *ring,
> -			  bool synchronous)
> +			  struct intel_engine_cs *ring)
>  {
>  	int i, ret;
> 
> @@ -255,7 +247,7 @@ static int gen8_mm_switch(struct i915_hw_ppgtt
> *ppgtt,
> 
>  	for (i = used_pd - 1; i >= 0; i--) {
>  		dma_addr_t addr = ppgtt->pd_dma_addr[i];
> -		ret = gen8_write_pdp(ring, i, addr, synchronous);
> +		ret = gen8_write_pdp(ring, i, addr);
>  		if (ret)
>  			return ret;
>  	}
> @@ -724,29 +716,10 @@ static uint32_t get_pd_offset(struct i915_hw_ppgtt
> *ppgtt)  }
> 
>  static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
> -			 struct intel_engine_cs *ring,
> -			 bool synchronous)
> +			 struct intel_engine_cs *ring)
>  {
> -	struct drm_device *dev = ppgtt->base.dev;
> -	struct drm_i915_private *dev_priv = dev->dev_private;
>  	int ret;
> 
> -	/* If we're in reset, we can assume the GPU is sufficiently idle to
> -	 * manually frob these bits. Ideally we could use the ring functions,
> -	 * except our error handling makes it quite difficult (can't use
> -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
> -	 *
> -	 * FIXME: We should try not to special case reset
> -	 */
> -	if (synchronous ||
> -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
> -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> -		I915_WRITE(RING_PP_DIR_BASE(ring),
> get_pd_offset(ppgtt));
> -		POSTING_READ(RING_PP_DIR_BASE(ring));
> -		return 0;
> -	}
> -
>  	/* NB: TLBs must be flushed and invalidated before a switch */
>  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS,
> I915_GEM_GPU_DOMAINS);
>  	if (ret)
> @@ -768,29 +741,10 @@ static int hsw_mm_switch(struct i915_hw_ppgtt
> *ppgtt,  }
> 
>  static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
> -			  struct intel_engine_cs *ring,
> -			  bool synchronous)
> +			  struct intel_engine_cs *ring)
>  {
> -	struct drm_device *dev = ppgtt->base.dev;
> -	struct drm_i915_private *dev_priv = dev->dev_private;
>  	int ret;
> 
> -	/* If we're in reset, we can assume the GPU is sufficiently idle to
> -	 * manually frob these bits. Ideally we could use the ring functions,
> -	 * except our error handling makes it quite difficult (can't use
> -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
> -	 *
> -	 * FIXME: We should try not to special case reset
> -	 */
> -	if (synchronous ||
> -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
> -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> -		I915_WRITE(RING_PP_DIR_BASE(ring),
> get_pd_offset(ppgtt));
> -		POSTING_READ(RING_PP_DIR_BASE(ring));
> -		return 0;
> -	}
> -
>  	/* NB: TLBs must be flushed and invalidated before a switch */
>  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS,
> I915_GEM_GPU_DOMAINS);
>  	if (ret)
> @@ -819,14 +773,11 @@ static int gen7_mm_switch(struct i915_hw_ppgtt
> *ppgtt,  }
> 
>  static int gen6_mm_switch(struct i915_hw_ppgtt *ppgtt,
> -			  struct intel_engine_cs *ring,
> -			  bool synchronous)
> +			  struct intel_engine_cs *ring)
>  {
>  	struct drm_device *dev = ppgtt->base.dev;
>  	struct drm_i915_private *dev_priv = dev->dev_private;
> 
> -	if (!synchronous)
> -		return 0;
> 
>  	I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
>  	I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt)); @@ -
> 852,7 +803,7 @@ static int gen8_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
>  		if (USES_FULL_PPGTT(dev))
>  			continue;
> 
> -		ret = ppgtt->switch_mm(ppgtt, ring, true);
> +		ret = ppgtt->switch_mm(ppgtt, ring);
>  		if (ret)
>  			goto err_out;
>  	}
> @@ -897,7 +848,7 @@ static int gen7_ppgtt_enable(struct i915_hw_ppgtt
> *ppgtt)
>  		if (USES_FULL_PPGTT(dev))
>  			continue;
> 
> -		ret = ppgtt->switch_mm(ppgtt, ring, true);
> +		ret = ppgtt->switch_mm(ppgtt, ring);
>  		if (ret)
>  			return ret;
>  	}
> @@ -926,7 +877,7 @@ static int gen6_ppgtt_enable(struct i915_hw_ppgtt
> *ppgtt)
>  	I915_WRITE(GFX_MODE,
> _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE));
> 
>  	for_each_ring(ring, dev_priv, i) {
> -		int ret = ppgtt->switch_mm(ppgtt, ring, true);
> +		int ret = ppgtt->switch_mm(ppgtt, ring);
>  		if (ret)
>  			return ret;
>  	}
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h
> b/drivers/gpu/drm/i915/i915_gem_gtt.h
> index 8d6f7c1..bf1e4fc 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
> @@ -262,8 +262,7 @@ struct i915_hw_ppgtt {
> 
>  	int (*enable)(struct i915_hw_ppgtt *ppgtt);
>  	int (*switch_mm)(struct i915_hw_ppgtt *ppgtt,
> -			 struct intel_engine_cs *ring,
> -			 bool synchronous);
> +			 struct intel_engine_cs *ring);
>  	void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file
> *m);  };
> 
> --
> 2.0.4
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v3] drm/i915: Rework GPU reset sequence to match driver load & thaw
  2014-08-20 14:46           ` Daniel, Thomas
@ 2014-08-20 14:58             ` Chris Wilson
  2014-08-20 15:21               ` Mcaulay, Alistair
  0 siblings, 1 reply; 30+ messages in thread
From: Chris Wilson @ 2014-08-20 14:58 UTC (permalink / raw)
  To: Daniel, Thomas; +Cc: intel-gfx

On Wed, Aug 20, 2014 at 02:46:37PM +0000, Daniel, Thomas wrote:
> 
> 
> > -----Original Message-----
> > From: Intel-gfx [mailto:intel-gfx-bounces@lists.freedesktop.org] On Behalf
> > Of alistair.mcaulay@intel.com
> > Sent: Friday, August 15, 2014 6:52 PM
> > To: intel-gfx@lists.freedesktop.org
> > Subject: [Intel-gfx] [PATCH v3] drm/i915: Rework GPU reset sequence to
> > match driver load & thaw
> > 
> > From: "McAulay, Alistair" <alistair.mcaulay@intel.com>
> > 
> > This patch is to address Daniels concerns over different code during reset:
> > 
> > http://lists.freedesktop.org/archives/intel-gfx/2014-June/047758.html
> > 
> > "The reason for aiming as hard as possible to use the exact same code for
> > driver load, gpu reset and runtime pm/system resume is that we've simply
> > seen too many bugs due to slight variations and unintended omissions."
> > 
> > Tested using igt drv_hangman.
> > 
> > V2: Cleaner way of preventing check_wedge returning -EAGAIN
> > V3: Clean the last_context during reset, to ensure do_switch() does the
> > MI_SET_CONTEXT. As per review.
> > Signed-off-by: McAulay, Alistair <alistair.mcaulay@intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_drv.c         |  6 +++
> >  drivers/gpu/drm/i915/i915_drv.h         |  3 ++
> >  drivers/gpu/drm/i915/i915_gem.c         |  4 +-
> >  drivers/gpu/drm/i915/i915_gem_context.c | 33 +++-------------
> >  drivers/gpu/drm/i915/i915_gem_gtt.c     | 67 +++++----------------------------
> >  drivers/gpu/drm/i915/i915_gem_gtt.h     |  3 +-
> >  6 files changed, 28 insertions(+), 88 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_drv.c
> > b/drivers/gpu/drm/i915/i915_drv.c index 5e4fefd..3bfafe6 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.c
> > +++ b/drivers/gpu/drm/i915/i915_drv.c
> > @@ -806,7 +806,13 @@ int i915_reset(struct drm_device *dev)
> >  			!dev_priv->ums.mm_suspended) {
> >  		dev_priv->ums.mm_suspended = 0;
> > 
> > +		/* Used to prevent gem_check_wedged returning -EAGAIN
> > during gpu reset */
> > +		dev_priv->gpu_error.reload_in_reset = true;
> > +
> >  		ret = i915_gem_init_hw(dev);
> > +
> > +		dev_priv->gpu_error.reload_in_reset = false;
> > +
> >  		mutex_unlock(&dev->struct_mutex);
> >  		if (ret) {
> >  			DRM_ERROR("Failed hw init on reset %d\n", ret); diff
> > --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > index 991b663..116daff 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -1217,6 +1217,9 @@ struct i915_gpu_error {
> > 
> >  	/* For missed irq/seqno simulation. */
> >  	unsigned int test_irq_rings;
> > +
> > +	/* Used to prevent gem_check_wedged returning -EAGAIN during
> > gpu reset   */
> > +	bool reload_in_reset;
> >  };
> > 
> >  enum modeset_restore {
> > diff --git a/drivers/gpu/drm/i915/i915_gem.c
> > b/drivers/gpu/drm/i915/i915_gem.c index ef047bc..e7396eb 100644
> > --- a/drivers/gpu/drm/i915/i915_gem.c
> > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > @@ -1085,7 +1085,9 @@ i915_gem_check_wedge(struct i915_gpu_error
> > *error,
> >  		if (i915_terminally_wedged(error))
> >  			return -EIO;
> > 
> > -		return -EAGAIN;
> > +		/* Check if GPU Reset is in progress */
> > +		if (!error->reload_in_reset)
> > +			return -EAGAIN;

This is silly. You already have the same flag above. Look closer.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v3] drm/i915: Rework GPU reset sequence to match driver load & thaw
  2014-08-20 14:58             ` Chris Wilson
@ 2014-08-20 15:21               ` Mcaulay, Alistair
  2014-08-20 15:56                 ` Chris Wilson
  0 siblings, 1 reply; 30+ messages in thread
From: Mcaulay, Alistair @ 2014-08-20 15:21 UTC (permalink / raw)
  To: Chris Wilson, Daniel, Thomas; +Cc: intel-gfx



> -----Original Message-----
> From: Chris Wilson [mailto:chris@chris-wilson.co.uk]
> Sent: Wednesday, August 20, 2014 3:58 PM
> To: Daniel, Thomas
> Cc: Mcaulay, Alistair; intel-gfx@lists.freedesktop.org
> Subject: Re: [Intel-gfx] [PATCH v3] drm/i915: Rework GPU reset sequence to
> match driver load & thaw
> 
> On Wed, Aug 20, 2014 at 02:46:37PM +0000, Daniel, Thomas wrote:
> >
> >
> > > -----Original Message-----
> > > From: Intel-gfx [mailto:intel-gfx-bounces@lists.freedesktop.org] On
> > > Behalf Of alistair.mcaulay@intel.com
> > > Sent: Friday, August 15, 2014 6:52 PM
> > > To: intel-gfx@lists.freedesktop.org
> > > Subject: [Intel-gfx] [PATCH v3] drm/i915: Rework GPU reset sequence
> > > to match driver load & thaw
> > >
> > > From: "McAulay, Alistair" <alistair.mcaulay@intel.com>
> > >
> > > This patch is to address Daniels concerns over different code during reset:
> > >
> > > http://lists.freedesktop.org/archives/intel-gfx/2014-June/047758.htm
> > > l
> > >
> > > "The reason for aiming as hard as possible to use the exact same
> > > code for driver load, gpu reset and runtime pm/system resume is that
> > > we've simply seen too many bugs due to slight variations and unintended
> omissions."
> > >
> > > Tested using igt drv_hangman.
> > >
> > > V2: Cleaner way of preventing check_wedge returning -EAGAIN
> > > V3: Clean the last_context during reset, to ensure do_switch() does
> > > the MI_SET_CONTEXT. As per review.
> > > Signed-off-by: McAulay, Alistair <alistair.mcaulay@intel.com>
> > > ---
> > >  drivers/gpu/drm/i915/i915_drv.c         |  6 +++
> > >  drivers/gpu/drm/i915/i915_drv.h         |  3 ++
> > >  drivers/gpu/drm/i915/i915_gem.c         |  4 +-
> > >  drivers/gpu/drm/i915/i915_gem_context.c | 33 +++-------------
> > >  drivers/gpu/drm/i915/i915_gem_gtt.c     | 67 +++++--------------------------
> --
> > >  drivers/gpu/drm/i915/i915_gem_gtt.h     |  3 +-
> > >  6 files changed, 28 insertions(+), 88 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/i915/i915_drv.c
> > > b/drivers/gpu/drm/i915/i915_drv.c index 5e4fefd..3bfafe6 100644
> > > --- a/drivers/gpu/drm/i915/i915_drv.c
> > > +++ b/drivers/gpu/drm/i915/i915_drv.c
> > > @@ -806,7 +806,13 @@ int i915_reset(struct drm_device *dev)
> > >  			!dev_priv->ums.mm_suspended) {
> > >  		dev_priv->ums.mm_suspended = 0;
> > >
> > > +		/* Used to prevent gem_check_wedged returning -EAGAIN
> > > during gpu reset */
> > > +		dev_priv->gpu_error.reload_in_reset = true;
> > > +
> > >  		ret = i915_gem_init_hw(dev);
> > > +
> > > +		dev_priv->gpu_error.reload_in_reset = false;
> > > +
> > >  		mutex_unlock(&dev->struct_mutex);
> > >  		if (ret) {
> > >  			DRM_ERROR("Failed hw init on reset %d\n", ret); diff
> --git
> > > a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > > index 991b663..116daff 100644
> > > --- a/drivers/gpu/drm/i915/i915_drv.h
> > > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > > @@ -1217,6 +1217,9 @@ struct i915_gpu_error {
> > >
> > >  	/* For missed irq/seqno simulation. */
> > >  	unsigned int test_irq_rings;
> > > +
> > > +	/* Used to prevent gem_check_wedged returning -EAGAIN during
> > > gpu reset   */
> > > +	bool reload_in_reset;
> > >  };
> > >
> > >  enum modeset_restore {
> > > diff --git a/drivers/gpu/drm/i915/i915_gem.c
> > > b/drivers/gpu/drm/i915/i915_gem.c index ef047bc..e7396eb 100644
> > > --- a/drivers/gpu/drm/i915/i915_gem.c
> > > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > > @@ -1085,7 +1085,9 @@ i915_gem_check_wedge(struct i915_gpu_error
> > > *error,
> > >  		if (i915_terminally_wedged(error))
> > >  			return -EIO;
> > >
> > > -		return -EAGAIN;
> > > +		/* Check if GPU Reset is in progress */
> > > +		if (!error->reload_in_reset)
> > > +			return -EAGAIN;
> 
> This is silly. You already have the same flag above. Look closer.
> -Chris
> 
> --
> Chris Wilson, Intel Open Source Technology Centre

It is not the same. This is a special case when re-initialising the hw. This flag is to allow gem_init_hw() to complete successfully during reset. 
At any other point during reset, -EAGAIN should be returned.

Alistair.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v3] drm/i915: Rework GPU reset sequence to match driver load & thaw
  2014-08-20 15:21               ` Mcaulay, Alistair
@ 2014-08-20 15:56                 ` Chris Wilson
  2014-08-25 20:18                   ` Daniel Vetter
  0 siblings, 1 reply; 30+ messages in thread
From: Chris Wilson @ 2014-08-20 15:56 UTC (permalink / raw)
  To: Mcaulay, Alistair; +Cc: intel-gfx

On Wed, Aug 20, 2014 at 03:21:55PM +0000, Mcaulay, Alistair wrote:
> It is not the same. This is a special case when re-initialising the hw. This flag is to allow gem_init_hw() to complete successfully during reset. 
> At any other point during reset, -EAGAIN should be returned.

Indeed. You've missed the point. Look closer at the reset counter and
reset ordering.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v3] drm/i915: Rework GPU reset sequence to match driver load & thaw
  2014-08-19 12:35             ` Mika Kuoppala
@ 2014-08-21 12:38               ` Mcaulay, Alistair
  2014-08-25 20:28                 ` Daniel Vetter
  0 siblings, 1 reply; 30+ messages in thread
From: Mcaulay, Alistair @ 2014-08-21 12:38 UTC (permalink / raw)
  To: Daniel Vetter (daniel@ffwll.ch), intel-gfx

Hi Daniel,

Is there anything else that needs to be done before this patch can be merged?

Thanks,
Alistair.

> -----Original Message-----
> From: Mika Kuoppala [mailto:mika.kuoppala@linux.intel.com]
> Sent: Tuesday, August 19, 2014 1:36 PM
> To: Mcaulay, Alistair; intel-gfx@lists.freedesktop.org
> Subject: RE: [PATCH v3] drm/i915: Rework GPU reset sequence to match
> driver load & thaw
> 
> "Mcaulay, Alistair" <alistair.mcaulay@intel.com> writes:
> 
> > Hi Mika,
> >
> > can you please review this patch, and verify it fixes the issues in your
> previous review.
> >
> > Thanks,
> > Alistair.
> >
> >> -----Original Message-----
> >> From: Mcaulay, Alistair
> >> Sent: Friday, August 15, 2014 6:52 PM
> >> To: intel-gfx@lists.freedesktop.org
> >> Cc: Mcaulay, Alistair
> >> Subject: [PATCH v3] drm/i915: Rework GPU reset sequence to match
> >> driver load & thaw
> >>
> >> From: "McAulay, Alistair" <alistair.mcaulay@intel.com>
> >>
> >> This patch is to address Daniels concerns over different code during reset:
> >>
> >> http://lists.freedesktop.org/archives/intel-gfx/2014-June/047758.html
> >>
> >> "The reason for aiming as hard as possible to use the exact same code
> >> for driver load, gpu reset and runtime pm/system resume is that we've
> >> simply seen too many bugs due to slight variations and unintended
> omissions."
> >>
> >> Tested using igt drv_hangman.
> >>
> >> V2: Cleaner way of preventing check_wedge returning -EAGAIN
> >> V3: Clean the last_context during reset, to ensure do_switch() does
> >> the MI_SET_CONTEXT. As per review.
> >> Signed-off-by: McAulay, Alistair <alistair.mcaulay@intel.com>
> >> ---
> 
> Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>
> 
> >>  drivers/gpu/drm/i915/i915_drv.c         |  6 +++
> >>  drivers/gpu/drm/i915/i915_drv.h         |  3 ++
> >>  drivers/gpu/drm/i915/i915_gem.c         |  4 +-
> >>  drivers/gpu/drm/i915/i915_gem_context.c | 33 +++-------------
> >>  drivers/gpu/drm/i915/i915_gem_gtt.c     | 67 +++++--------------------------
> --
> >>  drivers/gpu/drm/i915/i915_gem_gtt.h     |  3 +-
> >>  6 files changed, 28 insertions(+), 88 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/i915/i915_drv.c
> >> b/drivers/gpu/drm/i915/i915_drv.c index 5e4fefd..3bfafe6 100644
> >> --- a/drivers/gpu/drm/i915/i915_drv.c
> >> +++ b/drivers/gpu/drm/i915/i915_drv.c
> >> @@ -806,7 +806,13 @@ int i915_reset(struct drm_device *dev)
> >>  			!dev_priv->ums.mm_suspended) {
> >>  		dev_priv->ums.mm_suspended = 0;
> >>
> >> +		/* Used to prevent gem_check_wedged returning -EAGAIN
> >> during gpu reset */
> >> +		dev_priv->gpu_error.reload_in_reset = true;
> >> +
> >>  		ret = i915_gem_init_hw(dev);
> >> +
> >> +		dev_priv->gpu_error.reload_in_reset = false;
> >> +
> >>  		mutex_unlock(&dev->struct_mutex);
> >>  		if (ret) {
> >>  			DRM_ERROR("Failed hw init on reset %d\n", ret); diff
> --git
> >> a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> >> index 991b663..116daff 100644
> >> --- a/drivers/gpu/drm/i915/i915_drv.h
> >> +++ b/drivers/gpu/drm/i915/i915_drv.h
> >> @@ -1217,6 +1217,9 @@ struct i915_gpu_error {
> >>
> >>  	/* For missed irq/seqno simulation. */
> >>  	unsigned int test_irq_rings;
> >> +
> >> +	/* Used to prevent gem_check_wedged returning -EAGAIN during
> >> gpu reset   */
> >> +	bool reload_in_reset;
> >>  };
> >>
> >>  enum modeset_restore {
> >> diff --git a/drivers/gpu/drm/i915/i915_gem.c
> >> b/drivers/gpu/drm/i915/i915_gem.c index ef047bc..e7396eb 100644
> >> --- a/drivers/gpu/drm/i915/i915_gem.c
> >> +++ b/drivers/gpu/drm/i915/i915_gem.c
> >> @@ -1085,7 +1085,9 @@ i915_gem_check_wedge(struct i915_gpu_error
> >> *error,
> >>  		if (i915_terminally_wedged(error))
> >>  			return -EIO;
> >>
> >> -		return -EAGAIN;
> >> +		/* Check if GPU Reset is in progress */
> >> +		if (!error->reload_in_reset)
> >> +			return -EAGAIN;
> >>  	}
> >>
> >>  	return 0;
> >> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c
> >> b/drivers/gpu/drm/i915/i915_gem_context.c
> >> index de72a28..9378ad8 100644
> >> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> >> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> >> @@ -377,34 +377,17 @@ void i915_gem_context_reset(struct
> drm_device
> >> *dev)
> >>  	struct drm_i915_private *dev_priv = dev->dev_private;
> >>  	int i;
> >>
> >> -	/* Prevent the hardware from restoring the last context (which
> >> hung) on
> >> -	 * the next switch */
> >>  	for (i = 0; i < I915_NUM_RINGS; i++) {
> >>  		struct intel_engine_cs *ring = &dev_priv->ring[i];
> >> -		struct intel_context *dctx = ring->default_context;
> >>  		struct intel_context *lctx = ring->last_context;
> >>
> >> -		/* Do a fake switch to the default context */
> >> -		if (lctx == dctx)
> >> -			continue;
> >> -
> >> -		if (!lctx)
> >> -			continue;
> >> +		if (lctx) {
> >> +			if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
> >> +				i915_gem_object_ggtt_unpin(lctx-
> >> >legacy_hw_ctx.rcs_state);
> >>
> >> -		if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
> >> -			WARN_ON(i915_gem_obj_ggtt_pin(dctx-
> >> >legacy_hw_ctx.rcs_state,
> >> -
> >> get_context_alignment(dev), 0));
> >> -			/* Fake a finish/inactive */
> >> -			dctx->legacy_hw_ctx.rcs_state->base.write_domain
> >> = 0;
> >> -			dctx->legacy_hw_ctx.rcs_state->active = 0;
> >> +			i915_gem_context_unreference(lctx);
> >> +			ring->last_context = NULL;
> >>  		}
> >> -
> >> -		if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
> >> -			i915_gem_object_ggtt_unpin(lctx-
> >> >legacy_hw_ctx.rcs_state);
> >> -
> >> -		i915_gem_context_unreference(lctx);
> >> -		i915_gem_context_reference(dctx);
> >> -		ring->last_context = dctx;
> >>  	}
> >>  }
> >>
> >> @@ -498,10 +481,6 @@ int i915_gem_context_enable(struct
> >> drm_i915_private *dev_priv)
> >>  		ppgtt->enable(ppgtt);
> >>  	}
> >>
> >> -	/* FIXME: We should make this work, even in reset */
> >> -	if (i915_reset_in_progress(&dev_priv->gpu_error))
> >> -		return 0;
> >> -
> >>  	BUG_ON(!dev_priv->ring[RCS].default_context);
> >>
> >>  	for_each_ring(ring, dev_priv, i) {
> >> @@ -645,7 +624,7 @@ static int do_switch(struct intel_engine_cs *ring,
> >>  	from = ring->last_context;
> >>
> >>  	if (USES_FULL_PPGTT(ring->dev)) {
> >> -		ret = ppgtt->switch_mm(ppgtt, ring, false);
> >> +		ret = ppgtt->switch_mm(ppgtt, ring);
> >>  		if (ret)
> >>  			goto unpin_out;
> >>  	}
> >> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c
> >> b/drivers/gpu/drm/i915/i915_gem_gtt.c
> >> index 5188936..450c8a9 100644
> >> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> >> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> >> @@ -216,19 +216,12 @@ static gen6_gtt_pte_t
> >> iris_pte_encode(dma_addr_t addr,
> >>
> >>  /* Broadwell Page Directory Pointer Descriptors */  static int
> >> gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
> >> -			   uint64_t val, bool synchronous)
> >> +			   uint64_t val)
> >>  {
> >> -	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> >>  	int ret;
> >>
> >>  	BUG_ON(entry >= 4);
> >>
> >> -	if (synchronous) {
> >> -		I915_WRITE(GEN8_RING_PDP_UDW(ring, entry), val >> 32);
> >> -		I915_WRITE(GEN8_RING_PDP_LDW(ring, entry), (u32)val);
> >> -		return 0;
> >> -	}
> >> -
> >>  	ret = intel_ring_begin(ring, 6);
> >>  	if (ret)
> >>  		return ret;
> >> @@ -245,8 +238,7 @@ static int gen8_write_pdp(struct intel_engine_cs
> >> *ring, unsigned entry,  }
> >>
> >>  static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
> >> -			  struct intel_engine_cs *ring,
> >> -			  bool synchronous)
> >> +			  struct intel_engine_cs *ring)
> >>  {
> >>  	int i, ret;
> >>
> >> @@ -255,7 +247,7 @@ static int gen8_mm_switch(struct i915_hw_ppgtt
> >> *ppgtt,
> >>
> >>  	for (i = used_pd - 1; i >= 0; i--) {
> >>  		dma_addr_t addr = ppgtt->pd_dma_addr[i];
> >> -		ret = gen8_write_pdp(ring, i, addr, synchronous);
> >> +		ret = gen8_write_pdp(ring, i, addr);
> >>  		if (ret)
> >>  			return ret;
> >>  	}
> >> @@ -724,29 +716,10 @@ static uint32_t get_pd_offset(struct
> >> i915_hw_ppgtt
> >> *ppgtt)  }
> >>
> >>  static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
> >> -			 struct intel_engine_cs *ring,
> >> -			 bool synchronous)
> >> +			 struct intel_engine_cs *ring)
> >>  {
> >> -	struct drm_device *dev = ppgtt->base.dev;
> >> -	struct drm_i915_private *dev_priv = dev->dev_private;
> >>  	int ret;
> >>
> >> -	/* If we're in reset, we can assume the GPU is sufficiently idle to
> >> -	 * manually frob these bits. Ideally we could use the ring functions,
> >> -	 * except our error handling makes it quite difficult (can't use
> >> -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
> >> -	 *
> >> -	 * FIXME: We should try not to special case reset
> >> -	 */
> >> -	if (synchronous ||
> >> -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
> >> -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> >> -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> >> -		I915_WRITE(RING_PP_DIR_BASE(ring),
> >> get_pd_offset(ppgtt));
> >> -		POSTING_READ(RING_PP_DIR_BASE(ring));
> >> -		return 0;
> >> -	}
> >> -
> >>  	/* NB: TLBs must be flushed and invalidated before a switch */
> >>  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS,
> >> I915_GEM_GPU_DOMAINS);
> >>  	if (ret)
> >> @@ -768,29 +741,10 @@ static int hsw_mm_switch(struct i915_hw_ppgtt
> >> *ppgtt,  }
> >>
> >>  static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
> >> -			  struct intel_engine_cs *ring,
> >> -			  bool synchronous)
> >> +			  struct intel_engine_cs *ring)
> >>  {
> >> -	struct drm_device *dev = ppgtt->base.dev;
> >> -	struct drm_i915_private *dev_priv = dev->dev_private;
> >>  	int ret;
> >>
> >> -	/* If we're in reset, we can assume the GPU is sufficiently idle to
> >> -	 * manually frob these bits. Ideally we could use the ring functions,
> >> -	 * except our error handling makes it quite difficult (can't use
> >> -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
> >> -	 *
> >> -	 * FIXME: We should try not to special case reset
> >> -	 */
> >> -	if (synchronous ||
> >> -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
> >> -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> >> -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> >> -		I915_WRITE(RING_PP_DIR_BASE(ring),
> >> get_pd_offset(ppgtt));
> >> -		POSTING_READ(RING_PP_DIR_BASE(ring));
> >> -		return 0;
> >> -	}
> >> -
> >>  	/* NB: TLBs must be flushed and invalidated before a switch */
> >>  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS,
> >> I915_GEM_GPU_DOMAINS);
> >>  	if (ret)
> >> @@ -819,14 +773,11 @@ static int gen7_mm_switch(struct
> i915_hw_ppgtt
> >> *ppgtt,  }
> >>
> >>  static int gen6_mm_switch(struct i915_hw_ppgtt *ppgtt,
> >> -			  struct intel_engine_cs *ring,
> >> -			  bool synchronous)
> >> +			  struct intel_engine_cs *ring)
> >>  {
> >>  	struct drm_device *dev = ppgtt->base.dev;
> >>  	struct drm_i915_private *dev_priv = dev->dev_private;
> >>
> >> -	if (!synchronous)
> >> -		return 0;
> >>
> >>  	I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> >>  	I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt)); @@ -
> >> 852,7 +803,7 @@ static int gen8_ppgtt_enable(struct i915_hw_ppgtt
> *ppgtt)
> >>  		if (USES_FULL_PPGTT(dev))
> >>  			continue;
> >>
> >> -		ret = ppgtt->switch_mm(ppgtt, ring, true);
> >> +		ret = ppgtt->switch_mm(ppgtt, ring);
> >>  		if (ret)
> >>  			goto err_out;
> >>  	}
> >> @@ -897,7 +848,7 @@ static int gen7_ppgtt_enable(struct i915_hw_ppgtt
> >> *ppgtt)
> >>  		if (USES_FULL_PPGTT(dev))
> >>  			continue;
> >>
> >> -		ret = ppgtt->switch_mm(ppgtt, ring, true);
> >> +		ret = ppgtt->switch_mm(ppgtt, ring);
> >>  		if (ret)
> >>  			return ret;
> >>  	}
> >> @@ -926,7 +877,7 @@ static int gen6_ppgtt_enable(struct i915_hw_ppgtt
> >> *ppgtt)
> >>  	I915_WRITE(GFX_MODE,
> >> _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE));
> >>
> >>  	for_each_ring(ring, dev_priv, i) {
> >> -		int ret = ppgtt->switch_mm(ppgtt, ring, true);
> >> +		int ret = ppgtt->switch_mm(ppgtt, ring);
> >>  		if (ret)
> >>  			return ret;
> >>  	}
> >> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h
> >> b/drivers/gpu/drm/i915/i915_gem_gtt.h
> >> index 8d6f7c1..bf1e4fc 100644
> >> --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
> >> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
> >> @@ -262,8 +262,7 @@ struct i915_hw_ppgtt {
> >>
> >>  	int (*enable)(struct i915_hw_ppgtt *ppgtt);
> >>  	int (*switch_mm)(struct i915_hw_ppgtt *ppgtt,
> >> -			 struct intel_engine_cs *ring,
> >> -			 bool synchronous);
> >> +			 struct intel_engine_cs *ring);
> >>  	void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file
> >> *m);  };
> >>
> >> --
> >> 2.0.4

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v3] drm/i915: Rework GPU reset sequence to match driver load & thaw
  2014-08-20 15:56                 ` Chris Wilson
@ 2014-08-25 20:18                   ` Daniel Vetter
  2014-08-26  6:09                     ` Chris Wilson
  0 siblings, 1 reply; 30+ messages in thread
From: Daniel Vetter @ 2014-08-25 20:18 UTC (permalink / raw)
  To: Chris Wilson, Mcaulay, Alistair, Daniel, Thomas, intel-gfx

On Wed, Aug 20, 2014 at 04:56:41PM +0100, Chris Wilson wrote:
> On Wed, Aug 20, 2014 at 03:21:55PM +0000, Mcaulay, Alistair wrote:
> > It is not the same. This is a special case when re-initialising the hw. This flag is to allow gem_init_hw() to complete successfully during reset. 
> > At any other point during reset, -EAGAIN should be returned.
> 
> Indeed. You've missed the point. Look closer at the reset counter and
> reset ordering.

We could try to mark the gpu as reset again before starting the reinit to
avoid this kludge. But that has the problem that if the ring init fails we
have a bit of a mess in marking the gpu terminally wedged. So I think overall
it's not prettier than what we have here ... And if I'm mistaken I guess I can
put on my idiot hat and merge the fixup ;-)
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v3] drm/i915: Rework GPU reset sequence to match driver load & thaw
  2014-08-21 12:38               ` Mcaulay, Alistair
@ 2014-08-25 20:28                 ` Daniel Vetter
  0 siblings, 0 replies; 30+ messages in thread
From: Daniel Vetter @ 2014-08-25 20:28 UTC (permalink / raw)
  To: Mcaulay, Alistair; +Cc: intel-gfx

On Thu, Aug 21, 2014 at 12:38:58PM +0000, Mcaulay, Alistair wrote:
> Hi Daniel,
> 
> Is there anything else that needs to be done before this patch can be merged?

Me getting working internet mostly. Patch merged with the comment in
check_wedge a bit extended. Also had to rebase, please double-check that I
didn't fumble it.

Thanks, Daniel

> 
> Thanks,
> Alistair.
> 
> > -----Original Message-----
> > From: Mika Kuoppala [mailto:mika.kuoppala@linux.intel.com]
> > Sent: Tuesday, August 19, 2014 1:36 PM
> > To: Mcaulay, Alistair; intel-gfx@lists.freedesktop.org
> > Subject: RE: [PATCH v3] drm/i915: Rework GPU reset sequence to match
> > driver load & thaw
> > 
> > "Mcaulay, Alistair" <alistair.mcaulay@intel.com> writes:
> > 
> > > Hi Mika,
> > >
> > > can you please review this patch, and verify it fixes the issues in your
> > previous review.
> > >
> > > Thanks,
> > > Alistair.
> > >
> > >> -----Original Message-----
> > >> From: Mcaulay, Alistair
> > >> Sent: Friday, August 15, 2014 6:52 PM
> > >> To: intel-gfx@lists.freedesktop.org
> > >> Cc: Mcaulay, Alistair
> > >> Subject: [PATCH v3] drm/i915: Rework GPU reset sequence to match
> > >> driver load & thaw
> > >>
> > >> From: "McAulay, Alistair" <alistair.mcaulay@intel.com>
> > >>
> > >> This patch is to address Daniel's concerns over different code during reset:
> > >>
> > >> http://lists.freedesktop.org/archives/intel-gfx/2014-June/047758.html
> > >>
> > >> "The reason for aiming as hard as possible to use the exact same code
> > >> for driver load, gpu reset and runtime pm/system resume is that we've
> > >> simply seen too many bugs due to slight variations and unintended
> > omissions."
> > >>
> > >> Tested using igt drv_hangman.
> > >>
> > >> V2: Cleaner way of preventing check_wedge returning -EAGAIN
> > >> V3: Clean the last_context during reset, to ensure do_switch() does
> > >> the MI_SET_CONTEXT. As per review.
> > >> Signed-off-by: McAulay, Alistair <alistair.mcaulay@intel.com>
> > >> ---
> > 
> > Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>
> > 
> > >>  drivers/gpu/drm/i915/i915_drv.c         |  6 +++
> > >>  drivers/gpu/drm/i915/i915_drv.h         |  3 ++
> > >>  drivers/gpu/drm/i915/i915_gem.c         |  4 +-
> > >>  drivers/gpu/drm/i915/i915_gem_context.c | 33 +++-------------
> > >>  drivers/gpu/drm/i915/i915_gem_gtt.c     | 67 +++++--------------------------
> > --
> > >>  drivers/gpu/drm/i915/i915_gem_gtt.h     |  3 +-
> > >>  6 files changed, 28 insertions(+), 88 deletions(-)
> > >>
> > >> diff --git a/drivers/gpu/drm/i915/i915_drv.c
> > >> b/drivers/gpu/drm/i915/i915_drv.c index 5e4fefd..3bfafe6 100644
> > >> --- a/drivers/gpu/drm/i915/i915_drv.c
> > >> +++ b/drivers/gpu/drm/i915/i915_drv.c
> > >> @@ -806,7 +806,13 @@ int i915_reset(struct drm_device *dev)
> > >>  			!dev_priv->ums.mm_suspended) {
> > >>  		dev_priv->ums.mm_suspended = 0;
> > >>
> > >> +		/* Used to prevent gem_check_wedged returning -EAGAIN
> > >> during gpu reset */
> > >> +		dev_priv->gpu_error.reload_in_reset = true;
> > >> +
> > >>  		ret = i915_gem_init_hw(dev);
> > >> +
> > >> +		dev_priv->gpu_error.reload_in_reset = false;
> > >> +
> > >>  		mutex_unlock(&dev->struct_mutex);
> > >>  		if (ret) {
> > >>  			DRM_ERROR("Failed hw init on reset %d\n", ret); diff
> > --git
> > >> a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > >> index 991b663..116daff 100644
> > >> --- a/drivers/gpu/drm/i915/i915_drv.h
> > >> +++ b/drivers/gpu/drm/i915/i915_drv.h
> > >> @@ -1217,6 +1217,9 @@ struct i915_gpu_error {
> > >>
> > >>  	/* For missed irq/seqno simulation. */
> > >>  	unsigned int test_irq_rings;
> > >> +
> > >> +	/* Used to prevent gem_check_wedged returning -EAGAIN during
> > >> gpu reset   */
> > >> +	bool reload_in_reset;
> > >>  };
> > >>
> > >>  enum modeset_restore {
> > >> diff --git a/drivers/gpu/drm/i915/i915_gem.c
> > >> b/drivers/gpu/drm/i915/i915_gem.c index ef047bc..e7396eb 100644
> > >> --- a/drivers/gpu/drm/i915/i915_gem.c
> > >> +++ b/drivers/gpu/drm/i915/i915_gem.c
> > >> @@ -1085,7 +1085,9 @@ i915_gem_check_wedge(struct i915_gpu_error
> > >> *error,
> > >>  		if (i915_terminally_wedged(error))
> > >>  			return -EIO;
> > >>
> > >> -		return -EAGAIN;
> > >> +		/* Check if GPU Reset is in progress */
> > >> +		if (!error->reload_in_reset)
> > >> +			return -EAGAIN;
> > >>  	}
> > >>
> > >>  	return 0;
> > >> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c
> > >> b/drivers/gpu/drm/i915/i915_gem_context.c
> > >> index de72a28..9378ad8 100644
> > >> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> > >> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> > >> @@ -377,34 +377,17 @@ void i915_gem_context_reset(struct
> > drm_device
> > >> *dev)
> > >>  	struct drm_i915_private *dev_priv = dev->dev_private;
> > >>  	int i;
> > >>
> > >> -	/* Prevent the hardware from restoring the last context (which
> > >> hung) on
> > >> -	 * the next switch */
> > >>  	for (i = 0; i < I915_NUM_RINGS; i++) {
> > >>  		struct intel_engine_cs *ring = &dev_priv->ring[i];
> > >> -		struct intel_context *dctx = ring->default_context;
> > >>  		struct intel_context *lctx = ring->last_context;
> > >>
> > >> -		/* Do a fake switch to the default context */
> > >> -		if (lctx == dctx)
> > >> -			continue;
> > >> -
> > >> -		if (!lctx)
> > >> -			continue;
> > >> +		if (lctx) {
> > >> +			if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
> > >> +				i915_gem_object_ggtt_unpin(lctx-
> > >> >legacy_hw_ctx.rcs_state);
> > >>
> > >> -		if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
> > >> -			WARN_ON(i915_gem_obj_ggtt_pin(dctx-
> > >> >legacy_hw_ctx.rcs_state,
> > >> -
> > >> get_context_alignment(dev), 0));
> > >> -			/* Fake a finish/inactive */
> > >> -			dctx->legacy_hw_ctx.rcs_state->base.write_domain
> > >> = 0;
> > >> -			dctx->legacy_hw_ctx.rcs_state->active = 0;
> > >> +			i915_gem_context_unreference(lctx);
> > >> +			ring->last_context = NULL;
> > >>  		}
> > >> -
> > >> -		if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
> > >> -			i915_gem_object_ggtt_unpin(lctx-
> > >> >legacy_hw_ctx.rcs_state);
> > >> -
> > >> -		i915_gem_context_unreference(lctx);
> > >> -		i915_gem_context_reference(dctx);
> > >> -		ring->last_context = dctx;
> > >>  	}
> > >>  }
> > >>
> > >> @@ -498,10 +481,6 @@ int i915_gem_context_enable(struct
> > >> drm_i915_private *dev_priv)
> > >>  		ppgtt->enable(ppgtt);
> > >>  	}
> > >>
> > >> -	/* FIXME: We should make this work, even in reset */
> > >> -	if (i915_reset_in_progress(&dev_priv->gpu_error))
> > >> -		return 0;
> > >> -
> > >>  	BUG_ON(!dev_priv->ring[RCS].default_context);
> > >>
> > >>  	for_each_ring(ring, dev_priv, i) {
> > >> @@ -645,7 +624,7 @@ static int do_switch(struct intel_engine_cs *ring,
> > >>  	from = ring->last_context;
> > >>
> > >>  	if (USES_FULL_PPGTT(ring->dev)) {
> > >> -		ret = ppgtt->switch_mm(ppgtt, ring, false);
> > >> +		ret = ppgtt->switch_mm(ppgtt, ring);
> > >>  		if (ret)
> > >>  			goto unpin_out;
> > >>  	}
> > >> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c
> > >> b/drivers/gpu/drm/i915/i915_gem_gtt.c
> > >> index 5188936..450c8a9 100644
> > >> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> > >> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> > >> @@ -216,19 +216,12 @@ static gen6_gtt_pte_t
> > >> iris_pte_encode(dma_addr_t addr,
> > >>
> > >>  /* Broadwell Page Directory Pointer Descriptors */  static int
> > >> gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
> > >> -			   uint64_t val, bool synchronous)
> > >> +			   uint64_t val)
> > >>  {
> > >> -	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> > >>  	int ret;
> > >>
> > >>  	BUG_ON(entry >= 4);
> > >>
> > >> -	if (synchronous) {
> > >> -		I915_WRITE(GEN8_RING_PDP_UDW(ring, entry), val >> 32);
> > >> -		I915_WRITE(GEN8_RING_PDP_LDW(ring, entry), (u32)val);
> > >> -		return 0;
> > >> -	}
> > >> -
> > >>  	ret = intel_ring_begin(ring, 6);
> > >>  	if (ret)
> > >>  		return ret;
> > >> @@ -245,8 +238,7 @@ static int gen8_write_pdp(struct intel_engine_cs
> > >> *ring, unsigned entry,  }
> > >>
> > >>  static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
> > >> -			  struct intel_engine_cs *ring,
> > >> -			  bool synchronous)
> > >> +			  struct intel_engine_cs *ring)
> > >>  {
> > >>  	int i, ret;
> > >>
> > >> @@ -255,7 +247,7 @@ static int gen8_mm_switch(struct i915_hw_ppgtt
> > >> *ppgtt,
> > >>
> > >>  	for (i = used_pd - 1; i >= 0; i--) {
> > >>  		dma_addr_t addr = ppgtt->pd_dma_addr[i];
> > >> -		ret = gen8_write_pdp(ring, i, addr, synchronous);
> > >> +		ret = gen8_write_pdp(ring, i, addr);
> > >>  		if (ret)
> > >>  			return ret;
> > >>  	}
> > >> @@ -724,29 +716,10 @@ static uint32_t get_pd_offset(struct
> > >> i915_hw_ppgtt
> > >> *ppgtt)  }
> > >>
> > >>  static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
> > >> -			 struct intel_engine_cs *ring,
> > >> -			 bool synchronous)
> > >> +			 struct intel_engine_cs *ring)
> > >>  {
> > >> -	struct drm_device *dev = ppgtt->base.dev;
> > >> -	struct drm_i915_private *dev_priv = dev->dev_private;
> > >>  	int ret;
> > >>
> > >> -	/* If we're in reset, we can assume the GPU is sufficiently idle to
> > >> -	 * manually frob these bits. Ideally we could use the ring functions,
> > >> -	 * except our error handling makes it quite difficult (can't use
> > >> -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
> > >> -	 *
> > >> -	 * FIXME: We should try not to special case reset
> > >> -	 */
> > >> -	if (synchronous ||
> > >> -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
> > >> -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> > >> -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> > >> -		I915_WRITE(RING_PP_DIR_BASE(ring),
> > >> get_pd_offset(ppgtt));
> > >> -		POSTING_READ(RING_PP_DIR_BASE(ring));
> > >> -		return 0;
> > >> -	}
> > >> -
> > >>  	/* NB: TLBs must be flushed and invalidated before a switch */
> > >>  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS,
> > >> I915_GEM_GPU_DOMAINS);
> > >>  	if (ret)
> > >> @@ -768,29 +741,10 @@ static int hsw_mm_switch(struct i915_hw_ppgtt
> > >> *ppgtt,  }
> > >>
> > >>  static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
> > >> -			  struct intel_engine_cs *ring,
> > >> -			  bool synchronous)
> > >> +			  struct intel_engine_cs *ring)
> > >>  {
> > >> -	struct drm_device *dev = ppgtt->base.dev;
> > >> -	struct drm_i915_private *dev_priv = dev->dev_private;
> > >>  	int ret;
> > >>
> > >> -	/* If we're in reset, we can assume the GPU is sufficiently idle to
> > >> -	 * manually frob these bits. Ideally we could use the ring functions,
> > >> -	 * except our error handling makes it quite difficult (can't use
> > >> -	 * intel_ring_begin, ring->flush, or intel_ring_advance)
> > >> -	 *
> > >> -	 * FIXME: We should try not to special case reset
> > >> -	 */
> > >> -	if (synchronous ||
> > >> -	    i915_reset_in_progress(&dev_priv->gpu_error)) {
> > >> -		WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> > >> -		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> > >> -		I915_WRITE(RING_PP_DIR_BASE(ring),
> > >> get_pd_offset(ppgtt));
> > >> -		POSTING_READ(RING_PP_DIR_BASE(ring));
> > >> -		return 0;
> > >> -	}
> > >> -
> > >>  	/* NB: TLBs must be flushed and invalidated before a switch */
> > >>  	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS,
> > >> I915_GEM_GPU_DOMAINS);
> > >>  	if (ret)
> > >> @@ -819,14 +773,11 @@ static int gen7_mm_switch(struct
> > i915_hw_ppgtt
> > >> *ppgtt,  }
> > >>
> > >>  static int gen6_mm_switch(struct i915_hw_ppgtt *ppgtt,
> > >> -			  struct intel_engine_cs *ring,
> > >> -			  bool synchronous)
> > >> +			  struct intel_engine_cs *ring)
> > >>  {
> > >>  	struct drm_device *dev = ppgtt->base.dev;
> > >>  	struct drm_i915_private *dev_priv = dev->dev_private;
> > >>
> > >> -	if (!synchronous)
> > >> -		return 0;
> > >>
> > >>  	I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> > >>  	I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt)); @@ -
> > >> 852,7 +803,7 @@ static int gen8_ppgtt_enable(struct i915_hw_ppgtt
> > *ppgtt)
> > >>  		if (USES_FULL_PPGTT(dev))
> > >>  			continue;
> > >>
> > >> -		ret = ppgtt->switch_mm(ppgtt, ring, true);
> > >> +		ret = ppgtt->switch_mm(ppgtt, ring);
> > >>  		if (ret)
> > >>  			goto err_out;
> > >>  	}
> > >> @@ -897,7 +848,7 @@ static int gen7_ppgtt_enable(struct i915_hw_ppgtt
> > >> *ppgtt)
> > >>  		if (USES_FULL_PPGTT(dev))
> > >>  			continue;
> > >>
> > >> -		ret = ppgtt->switch_mm(ppgtt, ring, true);
> > >> +		ret = ppgtt->switch_mm(ppgtt, ring);
> > >>  		if (ret)
> > >>  			return ret;
> > >>  	}
> > >> @@ -926,7 +877,7 @@ static int gen6_ppgtt_enable(struct i915_hw_ppgtt
> > >> *ppgtt)
> > >>  	I915_WRITE(GFX_MODE,
> > >> _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE));
> > >>
> > >>  	for_each_ring(ring, dev_priv, i) {
> > >> -		int ret = ppgtt->switch_mm(ppgtt, ring, true);
> > >> +		int ret = ppgtt->switch_mm(ppgtt, ring);
> > >>  		if (ret)
> > >>  			return ret;
> > >>  	}
> > >> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h
> > >> b/drivers/gpu/drm/i915/i915_gem_gtt.h
> > >> index 8d6f7c1..bf1e4fc 100644
> > >> --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
> > >> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
> > >> @@ -262,8 +262,7 @@ struct i915_hw_ppgtt {
> > >>
> > >>  	int (*enable)(struct i915_hw_ppgtt *ppgtt);
> > >>  	int (*switch_mm)(struct i915_hw_ppgtt *ppgtt,
> > >> -			 struct intel_engine_cs *ring,
> > >> -			 bool synchronous);
> > >> +			 struct intel_engine_cs *ring);
> > >>  	void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file
> > >> *m);  };
> > >>
> > >> --
> > >> 2.0.4

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v3] drm/i915: Rework GPU reset sequence to match driver load & thaw
  2014-08-25 20:18                   ` Daniel Vetter
@ 2014-08-26  6:09                     ` Chris Wilson
  0 siblings, 0 replies; 30+ messages in thread
From: Chris Wilson @ 2014-08-26  6:09 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: intel-gfx

On Mon, Aug 25, 2014 at 10:18:09PM +0200, Daniel Vetter wrote:
> On Wed, Aug 20, 2014 at 04:56:41PM +0100, Chris Wilson wrote:
> > On Wed, Aug 20, 2014 at 03:21:55PM +0000, Mcaulay, Alistair wrote:
> > > It is not the same. This is a special case when re-initialising the hw. This flag is to allow gem_init_hw() to complete successfully during reset. 
> > > At any other point during reset, -EAGAIN should be returned.
> > 
> > Indeed. You've missed the point. Look closer at the reset counter and
> > reset ordering.
> 
> We could try to mark the gpu as reset again before starting the reinit to
> avoid this kludge. But that has the problem that if the ring init fails we
> have a bit of a mess in marking the gpu terminally wedged. So I think overall
> it's not prettier than what we have here ... And if I'm mistaken I guess I can
> put on my idiot hat and merge the fixup ;-)

See other patches during the week. Marking the reset as complete after
performing the reset and before resuming the hardware is not ugly. If
the reset fails, you still have the wedge | in-progress. If the reset
succeeds but resume fails, you then have wedge | !in-progress.

Seriously, read the request patch to see how it straightens out ring
access, full stop.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 30+ messages in thread

end of thread, other threads:[~2014-08-26  6:09 UTC | newest]

Thread overview: 30+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-07-16 15:05 [PATCH] drm/i915: Rework GPU reset sequence to match driver load & thaw alistair.mcaulay
2014-07-26  1:05 ` Ben Widawsky
2014-07-28  9:26   ` Daniel Vetter
2014-07-28 17:12     ` Mcaulay, Alistair
2014-07-29  0:16       ` Ben Widawsky
2014-07-29 17:25         ` Mcaulay, Alistair
2014-07-29 18:12           ` Daniel Vetter
2014-07-29  7:36     ` Chris Wilson
2014-07-29 10:32       ` Daniel Vetter
2014-07-30 16:59         ` Mcaulay, Alistair
2014-07-30 21:00           ` Daniel Vetter
2014-07-31 16:37             ` Mcaulay, Alistair
2014-08-04  7:52               ` Daniel Vetter
2014-08-05  8:47 ` [PATCH v2] " alistair.mcaulay
2014-08-06 12:58   ` Mcaulay, Alistair
2014-08-06 16:24   ` Mika Kuoppala
2014-08-15 13:33     ` Mcaulay, Alistair
2014-08-15 15:41       ` Daniel Vetter
2014-08-15 17:03       ` Mika Kuoppala
2014-08-15 17:51         ` [PATCH v3] " alistair.mcaulay
2014-08-19 10:12           ` Mcaulay, Alistair
2014-08-19 12:35             ` Mika Kuoppala
2014-08-21 12:38               ` Mcaulay, Alistair
2014-08-25 20:28                 ` Daniel Vetter
2014-08-20 14:46           ` Daniel, Thomas
2014-08-20 14:58             ` Chris Wilson
2014-08-20 15:21               ` Mcaulay, Alistair
2014-08-20 15:56                 ` Chris Wilson
2014-08-25 20:18                   ` Daniel Vetter
2014-08-26  6:09                     ` Chris Wilson

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.