All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v6] drm/amdgpu: handle IH ring1 overflow
@ 2021-11-24 22:58 Philip Yang
  2021-11-25  1:56 ` Felix Kuehling
  0 siblings, 1 reply; 5+ messages in thread
From: Philip Yang @ 2021-11-24 22:58 UTC (permalink / raw)
  To: amd-gfx; +Cc: Philip Yang, Felix.Kuehling, christian.koenig

IH ring1 is used to process GPU retry fault, overflow is enabled to
drain retry fault because we want receive other interrupts while
handling retry fault to recover range. There is no overflow flag set
when wptr pass rptr. Use timestamp of rptr and wptr to handle overflow
and drain retry fault.

Add amdgpu_ih_function interface decode_iv_ts for different chips to get
timestamp from IV entry with different iv size and timestamp offset.
amdgpu_ih_decode_iv_ts_helper is used for vega10, vega20, navi10.

Drain retry fault is done if processed_timestamp is equal to or larger
than checkpoint timestamp. Page fault handler skips retry fault entry if
entry timestamp goes backward.

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c | 58 +++++++++++++-------------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h | 16 ++++++-
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |  5 +++
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  5 +++
 drivers/gpu/drm/amd/amdgpu/navi10_ih.c |  1 +
 drivers/gpu/drm/amd/amdgpu/vega10_ih.c |  1 +
 drivers/gpu/drm/amd/amdgpu/vega20_ih.c |  1 +
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c   |  2 +-
 8 files changed, 56 insertions(+), 33 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
index 0c7963dfacad..3e043acaab82 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
@@ -164,52 +164,32 @@ void amdgpu_ih_ring_write(struct amdgpu_ih_ring *ih, const uint32_t *iv,
 	}
 }
 
-/* Waiter helper that checks current rptr matches or passes checkpoint wptr */
-static bool amdgpu_ih_has_checkpoint_processed(struct amdgpu_device *adev,
-					struct amdgpu_ih_ring *ih,
-					uint32_t checkpoint_wptr,
-					uint32_t *prev_rptr)
-{
-	uint32_t cur_rptr = ih->rptr | (*prev_rptr & ~ih->ptr_mask);
-
-	/* rptr has wrapped. */
-	if (cur_rptr < *prev_rptr)
-		cur_rptr += ih->ptr_mask + 1;
-	*prev_rptr = cur_rptr;
-
-	/* check ring is empty to workaround missing wptr overflow flag */
-	return cur_rptr >= checkpoint_wptr ||
-	       (cur_rptr & ih->ptr_mask) == amdgpu_ih_get_wptr(adev, ih);
-}
-
 /**
- * amdgpu_ih_wait_on_checkpoint_process - wait to process IVs up to checkpoint
+ * amdgpu_ih_wait_on_checkpoint_process_ts - wait to process IVs up to checkpoint
  *
  * @adev: amdgpu_device pointer
  * @ih: ih ring to process
  *
  * Used to ensure ring has processed IVs up to the checkpoint write pointer.
  */
-int amdgpu_ih_wait_on_checkpoint_process(struct amdgpu_device *adev,
+int amdgpu_ih_wait_on_checkpoint_process_ts(struct amdgpu_device *adev,
 					struct amdgpu_ih_ring *ih)
 {
-	uint32_t checkpoint_wptr, rptr;
+	uint32_t checkpoint_wptr;
+	uint64_t checkpoint_ts;
+	long timeout = HZ;
 
 	if (!ih->enabled || adev->shutdown)
 		return -ENODEV;
 
 	checkpoint_wptr = amdgpu_ih_get_wptr(adev, ih);
-	/* Order wptr with rptr. */
+	/* Order wptr with ring data. */
 	rmb();
-	rptr = READ_ONCE(ih->rptr);
-
-	/* wptr has wrapped. */
-	if (rptr > checkpoint_wptr)
-		checkpoint_wptr += ih->ptr_mask + 1;
+	checkpoint_ts = amdgpu_ih_decode_iv_ts(adev, ih, checkpoint_wptr, -1);
 
-	return wait_event_interruptible(ih->wait_process,
-				amdgpu_ih_has_checkpoint_processed(adev, ih,
-						checkpoint_wptr, &rptr));
+	return wait_event_interruptible_timeout(ih->wait_process,
+		    !amdgpu_ih_ts_after(ih->processed_timestamp, checkpoint_ts),
+		    timeout);
 }
 
 /**
@@ -298,4 +278,22 @@ void amdgpu_ih_decode_iv_helper(struct amdgpu_device *adev,
 
 	/* wptr/rptr are in bytes! */
 	ih->rptr += 32;
+	if (ih == &adev->irq.ih1 &&
+	    amdgpu_ih_ts_after(ih->processed_timestamp, entry->timestamp))
+		ih->processed_timestamp = entry->timestamp;
+}
+
+uint64_t amdgpu_ih_decode_iv_ts_helper(struct amdgpu_ih_ring *ih, u32 rptr,
+				       signed int offset)
+{
+	uint32_t iv_size = 32;
+	uint32_t dw1, dw2;
+	uint32_t index;
+
+	rptr += iv_size * offset;
+	index = (rptr & ih->ptr_mask) >> 2;
+
+	dw1 = le32_to_cpu(ih->ring[index + 1]);
+	dw2 = le32_to_cpu(ih->ring[index + 2]);
+	return dw1 | ((u64)(dw2 & 0xffff) << 32);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
index 0649b59830a5..dd1c2eded6b9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
@@ -68,20 +68,30 @@ struct amdgpu_ih_ring {
 
 	/* For waiting on IH processing at checkpoint. */
 	wait_queue_head_t wait_process;
+	uint64_t		processed_timestamp;
 };
 
+/* return true if time stamp t2 is after t1 with 48bit wrap around */
+#define amdgpu_ih_ts_after(t1, t2) \
+		(((int64_t)((t2) << 16) - (int64_t)((t1) << 16)) > 0LL)
+
 /* provided by the ih block */
 struct amdgpu_ih_funcs {
 	/* ring read/write ptr handling, called from interrupt context */
 	u32 (*get_wptr)(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih);
 	void (*decode_iv)(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih,
 			  struct amdgpu_iv_entry *entry);
+	uint64_t (*decode_iv_ts)(struct amdgpu_ih_ring *ih, u32 rptr,
+				 signed int offset);
 	void (*set_rptr)(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih);
 };
 
 #define amdgpu_ih_get_wptr(adev, ih) (adev)->irq.ih_funcs->get_wptr((adev), (ih))
 #define amdgpu_ih_decode_iv(adev, iv) \
 	(adev)->irq.ih_funcs->decode_iv((adev), (ih), (iv))
+#define amdgpu_ih_decode_iv_ts(adev, ih, rptr, offset) \
+	(WARN_ON_ONCE(!(adev)->irq.ih_funcs->decode_iv_ts) ? 0 : \
+	(adev)->irq.ih_funcs->decode_iv_ts((ih), (rptr), (offset)))
 #define amdgpu_ih_set_rptr(adev, ih) (adev)->irq.ih_funcs->set_rptr((adev), (ih))
 
 int amdgpu_ih_ring_init(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih,
@@ -89,10 +99,12 @@ int amdgpu_ih_ring_init(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih,
 void amdgpu_ih_ring_fini(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih);
 void amdgpu_ih_ring_write(struct amdgpu_ih_ring *ih, const uint32_t *iv,
 			  unsigned int num_dw);
-int amdgpu_ih_wait_on_checkpoint_process(struct amdgpu_device *adev,
-					struct amdgpu_ih_ring *ih);
+int amdgpu_ih_wait_on_checkpoint_process_ts(struct amdgpu_device *adev,
+					    struct amdgpu_ih_ring *ih);
 int amdgpu_ih_process(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih);
 void amdgpu_ih_decode_iv_helper(struct amdgpu_device *adev,
 				struct amdgpu_ih_ring *ih,
 				struct amdgpu_iv_entry *entry);
+uint64_t amdgpu_ih_decode_iv_ts_helper(struct amdgpu_ih_ring *ih, u32 rptr,
+				       signed int offset);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 3ec5ff5a6dbe..b129898db433 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -119,6 +119,11 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
 			return 1;
 		}
 
+		/* Stale retry fault if timestamp goes backward */
+		if (entry->ih == &adev->irq.ih1 &&
+		    amdgpu_ih_ts_after(entry->timestamp, entry->ih->processed_timestamp))
+			return 1;
+
 		/* Try to handle the recoverable page faults by filling page
 		 * tables
 		 */
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index cb82404df534..c0d9b254bbb5 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -535,6 +535,11 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
 			return 1;
 		}
 
+		/* Stale retry fault if timestamp goes backward */
+		if (entry->ih == &adev->irq.ih1 &&
+		    amdgpu_ih_ts_after(entry->timestamp, entry->ih->processed_timestamp))
+			return 1;
+
 		/* Try to handle the recoverable page faults by filling page
 		 * tables
 		 */
diff --git a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
index 38241cf0e1f1..8ce5b8ca1fd7 100644
--- a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
@@ -716,6 +716,7 @@ static const struct amd_ip_funcs navi10_ih_ip_funcs = {
 static const struct amdgpu_ih_funcs navi10_ih_funcs = {
 	.get_wptr = navi10_ih_get_wptr,
 	.decode_iv = amdgpu_ih_decode_iv_helper,
+	.decode_iv_ts = amdgpu_ih_decode_iv_ts_helper,
 	.set_rptr = navi10_ih_set_rptr
 };
 
diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
index a9ca6988009e..3070466f54e1 100644
--- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
@@ -640,6 +640,7 @@ const struct amd_ip_funcs vega10_ih_ip_funcs = {
 static const struct amdgpu_ih_funcs vega10_ih_funcs = {
 	.get_wptr = vega10_ih_get_wptr,
 	.decode_iv = amdgpu_ih_decode_iv_helper,
+	.decode_iv_ts = amdgpu_ih_decode_iv_ts_helper,
 	.set_rptr = vega10_ih_set_rptr
 };
 
diff --git a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
index f51dfc38ac65..3b4eb8285943 100644
--- a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
@@ -688,6 +688,7 @@ const struct amd_ip_funcs vega20_ih_ip_funcs = {
 static const struct amdgpu_ih_funcs vega20_ih_funcs = {
 	.get_wptr = vega20_ih_get_wptr,
 	.decode_iv = amdgpu_ih_decode_iv_helper,
+	.decode_iv_ts = amdgpu_ih_decode_iv_ts_helper,
 	.set_rptr = vega20_ih_set_rptr
 };
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 10868d5b549f..663489ae56d7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1974,7 +1974,7 @@ static void svm_range_drain_retry_fault(struct svm_range_list *svms)
 
 		pr_debug("drain retry fault gpu %d svms %p\n", i, svms);
 
-		amdgpu_ih_wait_on_checkpoint_process(pdd->dev->adev,
+		amdgpu_ih_wait_on_checkpoint_process_ts(pdd->dev->adev,
 						     &pdd->dev->adev->irq.ih1);
 		pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms);
 	}
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH v6] drm/amdgpu: handle IH ring1 overflow
  2021-11-24 22:58 [PATCH v6] drm/amdgpu: handle IH ring1 overflow Philip Yang
@ 2021-11-25  1:56 ` Felix Kuehling
  2021-11-25  7:11   ` Christian König
  2021-11-25 15:09   ` philip yang
  0 siblings, 2 replies; 5+ messages in thread
From: Felix Kuehling @ 2021-11-25  1:56 UTC (permalink / raw)
  To: Philip Yang, amd-gfx; +Cc: christian.koenig

Am 2021-11-24 um 5:58 p.m. schrieb Philip Yang:
> IH ring1 is used to process GPU retry fault, overflow is enabled to
> drain retry fault because we want receive other interrupts while
> handling retry fault to recover range. There is no overflow flag set
> when wptr pass rptr. Use timestamp of rptr and wptr to handle overflow
> and drain retry fault.
>
> Add amdgpu_ih_function interface decode_iv_ts for different chips to get
> timestamp from IV entry with different iv size and timestamp offset.
> amdgpu_ih_decode_iv_ts_helper is used for vega10, vega20, navi10.
>
> Drain retry fault is done if processed_timestamp is equal to or larger
> than checkpoint timestamp. Page fault handler skips retry fault entry if
> entry timestamp goes backward.
>
> Signed-off-by: Philip Yang <Philip.Yang@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c | 58 +++++++++++++-------------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h | 16 ++++++-
>  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |  5 +++
>  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  5 +++
>  drivers/gpu/drm/amd/amdgpu/navi10_ih.c |  1 +
>  drivers/gpu/drm/amd/amdgpu/vega10_ih.c |  1 +
>  drivers/gpu/drm/amd/amdgpu/vega20_ih.c |  1 +
>  drivers/gpu/drm/amd/amdkfd/kfd_svm.c   |  2 +-
>  8 files changed, 56 insertions(+), 33 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
> index 0c7963dfacad..3e043acaab82 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
> @@ -164,52 +164,32 @@ void amdgpu_ih_ring_write(struct amdgpu_ih_ring *ih, const uint32_t *iv,
>  	}
>  }
>  
> -/* Waiter helper that checks current rptr matches or passes checkpoint wptr */
> -static bool amdgpu_ih_has_checkpoint_processed(struct amdgpu_device *adev,
> -					struct amdgpu_ih_ring *ih,
> -					uint32_t checkpoint_wptr,
> -					uint32_t *prev_rptr)
> -{
> -	uint32_t cur_rptr = ih->rptr | (*prev_rptr & ~ih->ptr_mask);
> -
> -	/* rptr has wrapped. */
> -	if (cur_rptr < *prev_rptr)
> -		cur_rptr += ih->ptr_mask + 1;
> -	*prev_rptr = cur_rptr;
> -
> -	/* check ring is empty to workaround missing wptr overflow flag */
> -	return cur_rptr >= checkpoint_wptr ||
> -	       (cur_rptr & ih->ptr_mask) == amdgpu_ih_get_wptr(adev, ih);
> -}
> -
>  /**
> - * amdgpu_ih_wait_on_checkpoint_process - wait to process IVs up to checkpoint
> + * amdgpu_ih_wait_on_checkpoint_process_ts - wait to process IVs up to checkpoint
>   *
>   * @adev: amdgpu_device pointer
>   * @ih: ih ring to process
>   *
>   * Used to ensure ring has processed IVs up to the checkpoint write pointer.
>   */
> -int amdgpu_ih_wait_on_checkpoint_process(struct amdgpu_device *adev,
> +int amdgpu_ih_wait_on_checkpoint_process_ts(struct amdgpu_device *adev,
>  					struct amdgpu_ih_ring *ih)
>  {
> -	uint32_t checkpoint_wptr, rptr;
> +	uint32_t checkpoint_wptr;
> +	uint64_t checkpoint_ts;
> +	long timeout = HZ;
>  
>  	if (!ih->enabled || adev->shutdown)
>  		return -ENODEV;
>  
>  	checkpoint_wptr = amdgpu_ih_get_wptr(adev, ih);
> -	/* Order wptr with rptr. */
> +	/* Order wptr with ring data. */
>  	rmb();
> -	rptr = READ_ONCE(ih->rptr);
> -
> -	/* wptr has wrapped. */
> -	if (rptr > checkpoint_wptr)
> -		checkpoint_wptr += ih->ptr_mask + 1;
> +	checkpoint_ts = amdgpu_ih_decode_iv_ts(adev, ih, checkpoint_wptr, -1);
>  
> -	return wait_event_interruptible(ih->wait_process,
> -				amdgpu_ih_has_checkpoint_processed(adev, ih,
> -						checkpoint_wptr, &rptr));
> +	return wait_event_interruptible_timeout(ih->wait_process,
> +		    !amdgpu_ih_ts_after(ih->processed_timestamp, checkpoint_ts),
> +		    timeout);
>  }
>  
>  /**
> @@ -298,4 +278,22 @@ void amdgpu_ih_decode_iv_helper(struct amdgpu_device *adev,
>  
>  	/* wptr/rptr are in bytes! */
>  	ih->rptr += 32;
> +	if (ih == &adev->irq.ih1 &&
> +	    amdgpu_ih_ts_after(ih->processed_timestamp, entry->timestamp))
> +		ih->processed_timestamp = entry->timestamp;

I'd call this "latest_decoded_timestamp". At this point it hasn't been
processed yet.

Also, I think it would be safe and cheap enough to do this on all IH
rings, in case someone finds it useful for something else, e.g. using
amdgpu_ih_wait_on_checkpoint_process_ts on IH ring 0.


> +}
> +
> +uint64_t amdgpu_ih_decode_iv_ts_helper(struct amdgpu_ih_ring *ih, u32 rptr,
> +				       signed int offset)
> +{
> +	uint32_t iv_size = 32;
> +	uint32_t dw1, dw2;
> +	uint32_t index;
> +
> +	rptr += iv_size * offset;
> +	index = (rptr & ih->ptr_mask) >> 2;
> +
> +	dw1 = le32_to_cpu(ih->ring[index + 1]);
> +	dw2 = le32_to_cpu(ih->ring[index + 2]);
> +	return dw1 | ((u64)(dw2 & 0xffff) << 32);
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
> index 0649b59830a5..dd1c2eded6b9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
> @@ -68,20 +68,30 @@ struct amdgpu_ih_ring {
>  
>  	/* For waiting on IH processing at checkpoint. */
>  	wait_queue_head_t wait_process;
> +	uint64_t		processed_timestamp;
>  };
>  
> +/* return true if time stamp t2 is after t1 with 48bit wrap around */
> +#define amdgpu_ih_ts_after(t1, t2) \
> +		(((int64_t)((t2) << 16) - (int64_t)((t1) << 16)) > 0LL)
> +
>  /* provided by the ih block */
>  struct amdgpu_ih_funcs {
>  	/* ring read/write ptr handling, called from interrupt context */
>  	u32 (*get_wptr)(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih);
>  	void (*decode_iv)(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih,
>  			  struct amdgpu_iv_entry *entry);
> +	uint64_t (*decode_iv_ts)(struct amdgpu_ih_ring *ih, u32 rptr,
> +				 signed int offset);
>  	void (*set_rptr)(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih);
>  };
>  
>  #define amdgpu_ih_get_wptr(adev, ih) (adev)->irq.ih_funcs->get_wptr((adev), (ih))
>  #define amdgpu_ih_decode_iv(adev, iv) \
>  	(adev)->irq.ih_funcs->decode_iv((adev), (ih), (iv))
> +#define amdgpu_ih_decode_iv_ts(adev, ih, rptr, offset) \
> +	(WARN_ON_ONCE(!(adev)->irq.ih_funcs->decode_iv_ts) ? 0 : \
> +	(adev)->irq.ih_funcs->decode_iv_ts((ih), (rptr), (offset)))
>  #define amdgpu_ih_set_rptr(adev, ih) (adev)->irq.ih_funcs->set_rptr((adev), (ih))
>  
>  int amdgpu_ih_ring_init(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih,
> @@ -89,10 +99,12 @@ int amdgpu_ih_ring_init(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih,
>  void amdgpu_ih_ring_fini(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih);
>  void amdgpu_ih_ring_write(struct amdgpu_ih_ring *ih, const uint32_t *iv,
>  			  unsigned int num_dw);
> -int amdgpu_ih_wait_on_checkpoint_process(struct amdgpu_device *adev,
> -					struct amdgpu_ih_ring *ih);
> +int amdgpu_ih_wait_on_checkpoint_process_ts(struct amdgpu_device *adev,
> +					    struct amdgpu_ih_ring *ih);
>  int amdgpu_ih_process(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih);
>  void amdgpu_ih_decode_iv_helper(struct amdgpu_device *adev,
>  				struct amdgpu_ih_ring *ih,
>  				struct amdgpu_iv_entry *entry);
> +uint64_t amdgpu_ih_decode_iv_ts_helper(struct amdgpu_ih_ring *ih, u32 rptr,
> +				       signed int offset);
>  #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 3ec5ff5a6dbe..b129898db433 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -119,6 +119,11 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
>  			return 1;
>  		}
>  
> +		/* Stale retry fault if timestamp goes backward */
> +		if (entry->ih == &adev->irq.ih1 &&
> +		    amdgpu_ih_ts_after(entry->timestamp, entry->ih->processed_timestamp))
> +			return 1;
> +

This check should go before amdgpu_gmc_filter_faults. Otherwise
amdgpu_gmc_filter_faults may later drop a real fault because it added a
stale fault in its hash table.


>  		/* Try to handle the recoverable page faults by filling page
>  		 * tables
>  		 */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index cb82404df534..c0d9b254bbb5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -535,6 +535,11 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
>  			return 1;
>  		}
>  
> +		/* Stale retry fault if timestamp goes backward */
> +		if (entry->ih == &adev->irq.ih1 &&
> +		    amdgpu_ih_ts_after(entry->timestamp, entry->ih->processed_timestamp))
> +			return 1;
> +

Same as above.

Regards,
  Felix


>  		/* Try to handle the recoverable page faults by filling page
>  		 * tables
>  		 */
> diff --git a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
> index 38241cf0e1f1..8ce5b8ca1fd7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
> @@ -716,6 +716,7 @@ static const struct amd_ip_funcs navi10_ih_ip_funcs = {
>  static const struct amdgpu_ih_funcs navi10_ih_funcs = {
>  	.get_wptr = navi10_ih_get_wptr,
>  	.decode_iv = amdgpu_ih_decode_iv_helper,
> +	.decode_iv_ts = amdgpu_ih_decode_iv_ts_helper,
>  	.set_rptr = navi10_ih_set_rptr
>  };
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> index a9ca6988009e..3070466f54e1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> @@ -640,6 +640,7 @@ const struct amd_ip_funcs vega10_ih_ip_funcs = {
>  static const struct amdgpu_ih_funcs vega10_ih_funcs = {
>  	.get_wptr = vega10_ih_get_wptr,
>  	.decode_iv = amdgpu_ih_decode_iv_helper,
> +	.decode_iv_ts = amdgpu_ih_decode_iv_ts_helper,
>  	.set_rptr = vega10_ih_set_rptr
>  };
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
> index f51dfc38ac65..3b4eb8285943 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
> @@ -688,6 +688,7 @@ const struct amd_ip_funcs vega20_ih_ip_funcs = {
>  static const struct amdgpu_ih_funcs vega20_ih_funcs = {
>  	.get_wptr = vega20_ih_get_wptr,
>  	.decode_iv = amdgpu_ih_decode_iv_helper,
> +	.decode_iv_ts = amdgpu_ih_decode_iv_ts_helper,
>  	.set_rptr = vega20_ih_set_rptr
>  };
>  
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index 10868d5b549f..663489ae56d7 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -1974,7 +1974,7 @@ static void svm_range_drain_retry_fault(struct svm_range_list *svms)
>  
>  		pr_debug("drain retry fault gpu %d svms %p\n", i, svms);
>  
> -		amdgpu_ih_wait_on_checkpoint_process(pdd->dev->adev,
> +		amdgpu_ih_wait_on_checkpoint_process_ts(pdd->dev->adev,
>  						     &pdd->dev->adev->irq.ih1);
>  		pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms);
>  	}

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v6] drm/amdgpu: handle IH ring1 overflow
  2021-11-25  1:56 ` Felix Kuehling
@ 2021-11-25  7:11   ` Christian König
  2021-11-25 15:12     ` philip yang
  2021-11-25 15:09   ` philip yang
  1 sibling, 1 reply; 5+ messages in thread
From: Christian König @ 2021-11-25  7:11 UTC (permalink / raw)
  To: Felix Kuehling, Philip Yang, amd-gfx

Am 25.11.21 um 02:56 schrieb Felix Kuehling:
> Am 2021-11-24 um 5:58 p.m. schrieb Philip Yang:
> [SNIP]
>>   #define amdgpu_ih_get_wptr(adev, ih) (adev)->irq.ih_funcs->get_wptr((adev), (ih))
>>   #define amdgpu_ih_decode_iv(adev, iv) \
>>   	(adev)->irq.ih_funcs->decode_iv((adev), (ih), (iv))
>> +#define amdgpu_ih_decode_iv_ts(adev, ih, rptr, offset) \
>> +	(WARN_ON_ONCE(!(adev)->irq.ih_funcs->decode_iv_ts) ? 0 : \
>> +	(adev)->irq.ih_funcs->decode_iv_ts((ih), (rptr), (offset)))
>>   #define amdgpu_ih_set_rptr(adev, ih) (adev)->irq.ih_funcs->set_rptr((adev), (ih))
>>   
>>   int amdgpu_ih_ring_init(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih,
>> @@ -89,10 +99,12 @@ int amdgpu_ih_ring_init(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih,
>>   void amdgpu_ih_ring_fini(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih);
>>   void amdgpu_ih_ring_write(struct amdgpu_ih_ring *ih, const uint32_t *iv,
>>   			  unsigned int num_dw);
>> -int amdgpu_ih_wait_on_checkpoint_process(struct amdgpu_device *adev,
>> -					struct amdgpu_ih_ring *ih);
>> +int amdgpu_ih_wait_on_checkpoint_process_ts(struct amdgpu_device *adev,
>> +					    struct amdgpu_ih_ring *ih);
>>   int amdgpu_ih_process(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih);
>>   void amdgpu_ih_decode_iv_helper(struct amdgpu_device *adev,
>>   				struct amdgpu_ih_ring *ih,
>>   				struct amdgpu_iv_entry *entry);
>> +uint64_t amdgpu_ih_decode_iv_ts_helper(struct amdgpu_ih_ring *ih, u32 rptr,
>> +				       signed int offset);
>>   #endif
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> index 3ec5ff5a6dbe..b129898db433 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> @@ -119,6 +119,11 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
>>   			return 1;
>>   		}
>>   
>> +		/* Stale retry fault if timestamp goes backward */
>> +		if (entry->ih == &adev->irq.ih1 &&
>> +		    amdgpu_ih_ts_after(entry->timestamp, entry->ih->processed_timestamp))
>> +			return 1;
>> +
> This check should go before amdgpu_gmc_filter_faults. Otherwise
> amdgpu_gmc_filter_faults may later drop a real fault because it added a
> stale fault in its hash table.

I was already wondering if we shouldn't move all of this completely into 
amdgpu_gmc_filter_faults().

I mean essentially we are filtering faults here once more, just based on 
a different criteria.

Regards,
Christian.


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v6] drm/amdgpu: handle IH ring1 overflow
  2021-11-25  1:56 ` Felix Kuehling
  2021-11-25  7:11   ` Christian König
@ 2021-11-25 15:09   ` philip yang
  1 sibling, 0 replies; 5+ messages in thread
From: philip yang @ 2021-11-25 15:09 UTC (permalink / raw)
  To: Felix Kuehling, Philip Yang, amd-gfx; +Cc: christian.koenig

[-- Attachment #1: Type: text/html, Size: 12146 bytes --]

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v6] drm/amdgpu: handle IH ring1 overflow
  2021-11-25  7:11   ` Christian König
@ 2021-11-25 15:12     ` philip yang
  0 siblings, 0 replies; 5+ messages in thread
From: philip yang @ 2021-11-25 15:12 UTC (permalink / raw)
  To: Christian König, Felix Kuehling, Philip Yang, amd-gfx

[-- Attachment #1: Type: text/html, Size: 5557 bytes --]

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2021-11-25 15:12 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-11-24 22:58 [PATCH v6] drm/amdgpu: handle IH ring1 overflow Philip Yang
2021-11-25  1:56 ` Felix Kuehling
2021-11-25  7:11   ` Christian König
2021-11-25 15:12     ` philip yang
2021-11-25 15:09   ` philip yang

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.