All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/4] drm/amdgpu: add ring soft recovery v3
@ 2018-08-23 11:23 Christian König
       [not found] ` <20180823112334.10321-1-christian.koenig-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 7+ messages in thread
From: Christian König @ 2018-08-23 11:23 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Instead of hammering hard on the GPU try a soft recovery first.

v2: reorder code a bit
v3: increase timeout to 10ms, increment GPU reset counter

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c  |  6 ++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 25 +++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  4 ++++
 3 files changed, 35 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 265ff90f4e01..d93e31a5c4e7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -33,6 +33,12 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
 	struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
 	struct amdgpu_job *job = to_amdgpu_job(s_job);
 
+	if (amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
+		DRM_ERROR("ring %s timeout, but soft recovered\n",
+			  s_job->sched->name);
+		return;
+	}
+
 	DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
 		  job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
 		  ring->fence_drv.sync_seq);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 5dfd26be1eec..d445acb3d956 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -383,6 +383,31 @@ void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
 	amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask);
 }
 
+/**
+ * amdgpu_ring_soft_recovery - try to soft recover a ring lockup
+ * @ring: ring to try the recovery on
+ * @vmid: VMID we try to get going again
+ * @fence: timedout fence; NULL (or no callback) makes this return false
+ *
+ * Bumps the GPU reset counter, then retries the ring's soft_recovery
+ * callback for at most 10ms. Returns true if @fence signaled by then.
+ */
+bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
+			       struct dma_fence *fence)
+{
+	ktime_t deadline = ktime_add_us(ktime_get(), 10000);
+
+	if (!ring->funcs->soft_recovery || !fence)
+		return false;
+
+	atomic_inc(&ring->adev->gpu_reset_counter);
+	while (!dma_fence_is_signaled(fence) &&
+	       ktime_to_ns(ktime_sub(deadline, ktime_get())) > 0)
+		ring->funcs->soft_recovery(ring, vmid);
+
+	return dma_fence_is_signaled(fence);
+}
+
 /*
  * Debugfs info
  */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 409fdd9b9710..9cc239968e40 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -168,6 +168,8 @@ struct amdgpu_ring_funcs {
 	/* priority functions */
 	void (*set_priority) (struct amdgpu_ring *ring,
 			      enum drm_sched_priority priority);
+	/* Try to soft recover the ring to make the fence signal */
+	void (*soft_recovery)(struct amdgpu_ring *ring, unsigned vmid);
 };
 
 struct amdgpu_ring {
@@ -260,6 +262,8 @@ void amdgpu_ring_fini(struct amdgpu_ring *ring);
 void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
 						uint32_t reg0, uint32_t val0,
 						uint32_t reg1, uint32_t val1);
+bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
+			       struct dma_fence *fence);
 
 static inline void amdgpu_ring_clear_ring(struct amdgpu_ring *ring)
 {
-- 
2.14.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 2/4] drm/amdgpu: implement soft_recovery for GFX7
       [not found] ` <20180823112334.10321-1-christian.koenig-5C7GfCeVMHo@public.gmane.org>
@ 2018-08-23 11:23   ` Christian König
  2018-08-23 11:23   ` [PATCH 3/4] drm/amdgpu: implement soft_recovery for GFX8 v2 Christian König
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 7+ messages in thread
From: Christian König @ 2018-08-23 11:23 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Try to kill waves on the SQ.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
index 95452c5a9df6..a15d9c0f233b 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
@@ -4212,6 +4212,18 @@ static void gfx_v7_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
 	amdgpu_ring_write(ring, (1 << (oa_size + oa_base)) - (1 << oa_base));
 }
 
+static void gfx_v7_0_ring_soft_recovery(struct amdgpu_ring *ring, unsigned vmid)
+{
+	struct amdgpu_device *adev = ring->adev; /* presumably used by WREG32() - confirm */
+	uint32_t value = 0;
+
+	value = REG_SET_FIELD(value, SQ_CMD, CMD, 0x03); /* patch text: "kill waves" - confirm encoding */
+	value = REG_SET_FIELD(value, SQ_CMD, MODE, 0x01);
+	value = REG_SET_FIELD(value, SQ_CMD, CHECK_VMID, 1);
+	value = REG_SET_FIELD(value, SQ_CMD, VM_ID, vmid); /* VMID handed in by the caller */
+	WREG32(mmSQ_CMD, value); /* issue the assembled SQ_CMD value */
+}
+
 static uint32_t wave_read_ind(struct amdgpu_device *adev, uint32_t simd, uint32_t wave, uint32_t address)
 {
 	WREG32(mmSQ_IND_INDEX,
@@ -5088,6 +5100,7 @@ static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_gfx = {
 	.pad_ib = amdgpu_ring_generic_pad_ib,
 	.emit_cntxcntl = gfx_v7_ring_emit_cntxcntl,
 	.emit_wreg = gfx_v7_0_ring_emit_wreg,
+	.soft_recovery = gfx_v7_0_ring_soft_recovery,
 };
 
 static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_compute = {
-- 
2.14.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 3/4] drm/amdgpu: implement soft_recovery for GFX8 v2
       [not found] ` <20180823112334.10321-1-christian.koenig-5C7GfCeVMHo@public.gmane.org>
  2018-08-23 11:23   ` [PATCH 2/4] drm/amdgpu: implement soft_recovery for GFX7 Christian König
@ 2018-08-23 11:23   ` Christian König
  2018-08-23 11:23   ` [PATCH 4/4] drm/amdgpu: implement soft_recovery for GFX9 Christian König
                     ` (2 subsequent siblings)
  4 siblings, 0 replies; 7+ messages in thread
From: Christian König @ 2018-08-23 11:23 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Try to kill waves on the SQ.

v2: only for the GFX ring for now.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 282dba6cce86..9de940a65c80 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -6714,6 +6714,18 @@ static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg,
 	amdgpu_ring_write(ring, val);
 }
 
+static void gfx_v8_0_ring_soft_recovery(struct amdgpu_ring *ring, unsigned vmid)
+{
+	struct amdgpu_device *adev = ring->adev; /* presumably used by WREG32() - confirm */
+	uint32_t value = 0;
+
+	value = REG_SET_FIELD(value, SQ_CMD, CMD, 0x03); /* patch text: "kill waves" - confirm encoding */
+	value = REG_SET_FIELD(value, SQ_CMD, MODE, 0x01);
+	value = REG_SET_FIELD(value, SQ_CMD, CHECK_VMID, 1);
+	value = REG_SET_FIELD(value, SQ_CMD, VM_ID, vmid); /* VMID handed in by the caller */
+	WREG32(mmSQ_CMD, value); /* issue the assembled SQ_CMD value */
+}
+
 static void gfx_v8_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
 						 enum amdgpu_interrupt_state state)
 {
@@ -7171,6 +7183,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
 	.init_cond_exec = gfx_v8_0_ring_emit_init_cond_exec,
 	.patch_cond_exec = gfx_v8_0_ring_emit_patch_cond_exec,
 	.emit_wreg = gfx_v8_0_ring_emit_wreg,
+	.soft_recovery = gfx_v8_0_ring_soft_recovery,
 };
 
 static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {
-- 
2.14.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 4/4] drm/amdgpu: implement soft_recovery for GFX9
       [not found] ` <20180823112334.10321-1-christian.koenig-5C7GfCeVMHo@public.gmane.org>
  2018-08-23 11:23   ` [PATCH 2/4] drm/amdgpu: implement soft_recovery for GFX7 Christian König
  2018-08-23 11:23   ` [PATCH 3/4] drm/amdgpu: implement soft_recovery for GFX8 v2 Christian König
@ 2018-08-23 11:23   ` Christian König
  2018-08-23 13:08   ` [PATCH 1/4] drm/amdgpu: add ring soft recovery v3 Huang Rui
  2018-08-23 15:20   ` Zhu, Rex
  4 siblings, 0 replies; 7+ messages in thread
From: Christian König @ 2018-08-23 11:23 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Try to kill waves on the SQ.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 44707f94b2c5..ab5cacea967b 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4421,6 +4421,18 @@ static void gfx_v9_0_ring_emit_reg_write_reg_wait(struct amdgpu_ring *ring,
 							   ref, mask);
 }
 
+static void gfx_v9_0_ring_soft_recovery(struct amdgpu_ring *ring, unsigned vmid)
+{
+	struct amdgpu_device *adev = ring->adev; /* presumably used by WREG32() - confirm */
+	uint32_t value = 0;
+
+	value = REG_SET_FIELD(value, SQ_CMD, CMD, 0x03); /* patch text: "kill waves" - confirm encoding */
+	value = REG_SET_FIELD(value, SQ_CMD, MODE, 0x01);
+	value = REG_SET_FIELD(value, SQ_CMD, CHECK_VMID, 1);
+	value = REG_SET_FIELD(value, SQ_CMD, VM_ID, vmid); /* VMID handed in by the caller */
+	WREG32(mmSQ_CMD, value); /* issue the assembled SQ_CMD value */
+}
+
 static void gfx_v9_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
 						 enum amdgpu_interrupt_state state)
 {
@@ -4743,6 +4755,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
 	.emit_wreg = gfx_v9_0_ring_emit_wreg,
 	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
 	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
+	.soft_recovery = gfx_v9_0_ring_soft_recovery,
 };
 
 static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
-- 
2.14.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH 1/4] drm/amdgpu: add ring soft recovery v3
       [not found] ` <20180823112334.10321-1-christian.koenig-5C7GfCeVMHo@public.gmane.org>
                     ` (2 preceding siblings ...)
  2018-08-23 11:23   ` [PATCH 4/4] drm/amdgpu: implement soft_recovery for GFX9 Christian König
@ 2018-08-23 13:08   ` Huang Rui
  2018-08-23 15:20   ` Zhu, Rex
  4 siblings, 0 replies; 7+ messages in thread
From: Huang Rui @ 2018-08-23 13:08 UTC (permalink / raw)
  To: Christian König; +Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On Thu, Aug 23, 2018 at 01:23:31PM +0200, Christian König wrote:
> Instead of hammering hard on the GPU try a soft recovery first.
> 
> v2: reorder code a bit
> v3: increase timeout to 10ms, increment GPU reset counter
> 
> Signed-off-by: Christian König <christian.koenig@amd.com>

Series is Reviewed-by: Huang Rui <ray.huang@amd.com>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c  |  6 ++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 25 +++++++++++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  4 ++++
>  3 files changed, 35 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 265ff90f4e01..d93e31a5c4e7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -33,6 +33,12 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
>  	struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
>  	struct amdgpu_job *job = to_amdgpu_job(s_job);
>  
> +	if (amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
> +		DRM_ERROR("ring %s timeout, but soft recovered\n",
> +			  s_job->sched->name);
> +		return;
> +	}
> +
>  	DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
>  		  job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
>  		  ring->fence_drv.sync_seq);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> index 5dfd26be1eec..d445acb3d956 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> @@ -383,6 +383,31 @@ void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
>  	amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask);
>  }
>  
> +/**
> + * amdgpu_ring_soft_recovery - try to soft recover a ring lockup
> + *
> + * @ring: ring to try the recovery on
> + * @vmid: VMID we try to get going again
> + * @fence: timedout fence
> + *
> + * Tries to get a ring proceeding again when it is stuck.
> + */
> +bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
> +			       struct dma_fence *fence)
> +{
> +	ktime_t deadline = ktime_add_us(ktime_get(), 10000);
> +
> +	if (!ring->funcs->soft_recovery)
> +		return false;
> +
> +	atomic_inc(&adev->gpu_reset_counter);
> +	while (!dma_fence_is_signaled(fence) &&
> +	       ktime_to_ns(ktime_sub(deadline, ktime_get())) > 0)
> +		ring->funcs->soft_recovery(ring, vmid);
> +
> +	return dma_fence_is_signaled(fence);
> +}
> +
>  /*
>   * Debugfs info
>   */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 409fdd9b9710..9cc239968e40 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -168,6 +168,8 @@ struct amdgpu_ring_funcs {
>  	/* priority functions */
>  	void (*set_priority) (struct amdgpu_ring *ring,
>  			      enum drm_sched_priority priority);
> +	/* Try to soft recover the ring to make the fence signal */
> +	void (*soft_recovery)(struct amdgpu_ring *ring, unsigned vmid);
>  };
>  
>  struct amdgpu_ring {
> @@ -260,6 +262,8 @@ void amdgpu_ring_fini(struct amdgpu_ring *ring);
>  void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
>  						uint32_t reg0, uint32_t val0,
>  						uint32_t reg1, uint32_t val1);
> +bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
> +			       struct dma_fence *fence);
>  
>  static inline void amdgpu_ring_clear_ring(struct amdgpu_ring *ring)
>  {
> -- 
> 2.14.1
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 7+ messages in thread

* RE: [PATCH 1/4] drm/amdgpu: add ring soft recovery v3
       [not found] ` <20180823112334.10321-1-christian.koenig-5C7GfCeVMHo@public.gmane.org>
                     ` (3 preceding siblings ...)
  2018-08-23 13:08   ` [PATCH 1/4] drm/amdgpu: add ring soft recovery v3 Huang Rui
@ 2018-08-23 15:20   ` Zhu, Rex
       [not found]     ` <BYAPR12MB2775BAADFC74D3301884B5F2FB370-ZGDeBxoHBPmJeBUhB162ZQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  4 siblings, 1 reply; 7+ messages in thread
From: Zhu, Rex @ 2018-08-23 15:20 UTC (permalink / raw)
  To: Christian König, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW



> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
> Christian König
> Sent: Thursday, August 23, 2018 7:24 PM
> To: amd-gfx@lists.freedesktop.org
> Subject: [PATCH 1/4] drm/amdgpu: add ring soft recovery v3
> 
> Instead of hammering hard on the GPU try a soft recovery first.
> 
> v2: reorder code a bit
> v3: increase timeout to 10ms, increment GPU reset counter
> 
> Signed-off-by: Christian König <christian.koenig@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c  |  6 ++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 25
> +++++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  4 ++++
>  3 files changed, 35 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 265ff90f4e01..d93e31a5c4e7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -33,6 +33,12 @@ static void amdgpu_job_timedout(struct
> drm_sched_job *s_job)
>  	struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
>  	struct amdgpu_job *job = to_amdgpu_job(s_job);
> 
> +	if (amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence-
> >parent)) {
> +		DRM_ERROR("ring %s timeout, but soft recovered\n",
> +			  s_job->sched->name);
> +		return;
> +	}
> +
>  	DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
>  		  job->base.sched->name, atomic_read(&ring-
> >fence_drv.last_seq),
>  		  ring->fence_drv.sync_seq);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> index 5dfd26be1eec..d445acb3d956 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> @@ -383,6 +383,31 @@ void
> amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
>  	amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask);  }
> 
> +/**
> + * amdgpu_ring_soft_recovery - try to soft recover a ring lockup
> + *
> + * @ring: ring to try the recovery on
> + * @vmid: VMID we try to get going again
> + * @fence: timedout fence
> + *
> + * Tries to get a ring proceeding again when it is stuck.
> + */
> +bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int
> vmid,
> +			       struct dma_fence *fence)
> +{
> +	ktime_t deadline = ktime_add_us(ktime_get(), 10000);
> +
> +	if (!ring->funcs->soft_recovery)
> +		return false;
> +
> +	atomic_inc(&adev->gpu_reset_counter);
> +	while (!dma_fence_is_signaled(fence) &&
> +	       ktime_to_ns(ktime_sub(deadline, ktime_get())) > 0)
> +		ring->funcs->soft_recovery(ring, vmid);
Hi Christian,

Is it necessary to add a udelay() here?

Regards
Rex
> +	return dma_fence_is_signaled(fence);
> +}
> +
>  /*
>   * Debugfs info
>   */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 409fdd9b9710..9cc239968e40 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -168,6 +168,8 @@ struct amdgpu_ring_funcs {
>  	/* priority functions */
>  	void (*set_priority) (struct amdgpu_ring *ring,
>  			      enum drm_sched_priority priority);
> +	/* Try to soft recover the ring to make the fence signal */
> +	void (*soft_recovery)(struct amdgpu_ring *ring, unsigned vmid);
>  };
> 
>  struct amdgpu_ring {
> @@ -260,6 +262,8 @@ void amdgpu_ring_fini(struct amdgpu_ring *ring);
> void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring
> *ring,
>  						uint32_t reg0, uint32_t val0,
>  						uint32_t reg1, uint32_t val1);
> +bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int
> vmid,
> +			       struct dma_fence *fence);
> 
>  static inline void amdgpu_ring_clear_ring(struct amdgpu_ring *ring)  {
> --
> 2.14.1
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 1/4] drm/amdgpu: add ring soft recovery v3
       [not found]     ` <BYAPR12MB2775BAADFC74D3301884B5F2FB370-ZGDeBxoHBPmJeBUhB162ZQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2018-08-23 18:25       ` Christian König
  0 siblings, 0 replies; 7+ messages in thread
From: Christian König @ 2018-08-23 18:25 UTC (permalink / raw)
  To: Zhu, Rex, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Am 23.08.2018 um 17:20 schrieb Zhu, Rex:
>
>> -----Original Message-----
>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>> Christian König
>> Sent: Thursday, August 23, 2018 7:24 PM
>> To: amd-gfx@lists.freedesktop.org
>> Subject: [PATCH 1/4] drm/amdgpu: add ring soft recovery v3
>>
>> Instead of hammering hard on the GPU try a soft recovery first.
>>
>> v2: reorder code a bit
>> v3: increase timeout to 10ms, increment GPU reset counter
>>
>> Signed-off-by: Christian König <christian.koenig@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c  |  6 ++++++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 25
>> +++++++++++++++++++++++++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  4 ++++
>>   3 files changed, 35 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> index 265ff90f4e01..d93e31a5c4e7 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> @@ -33,6 +33,12 @@ static void amdgpu_job_timedout(struct
>> drm_sched_job *s_job)
>>   	struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
>>   	struct amdgpu_job *job = to_amdgpu_job(s_job);
>>
>> +	if (amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence-
>>> parent)) {
>> +		DRM_ERROR("ring %s timeout, but soft recovered\n",
>> +			  s_job->sched->name);
>> +		return;
>> +	}
>> +
>>   	DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
>>   		  job->base.sched->name, atomic_read(&ring-
>>> fence_drv.last_seq),
>>   		  ring->fence_drv.sync_seq);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> index 5dfd26be1eec..d445acb3d956 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> @@ -383,6 +383,31 @@ void
>> amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
>>   	amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask);  }
>>
>> +/**
>> + * amdgpu_ring_soft_recovery - try to soft recover a ring lockup
>> + *
>> + * @ring: ring to try the recovery on
>> + * @vmid: VMID we try to get going again
>> + * @fence: timedout fence
>> + *
>> + * Tries to get a ring proceeding again when it is stuck.
>> + */
>> +bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int
>> vmid,
>> +			       struct dma_fence *fence)
>> +{
>> +	ktime_t deadline = ktime_add_us(ktime_get(), 10000);
>> +
>> +	if (!ring->funcs->soft_recovery)
>> +		return false;
>> +
>> +	atomic_inc(&adev->gpu_reset_counter);
>> +	while (!dma_fence_is_signaled(fence) &&
>> +	       ktime_to_ns(ktime_sub(deadline, ktime_get())) > 0)
>> +		ring->funcs->soft_recovery(ring, vmid);
> Hi Christian,
>
> Is it necessary to add a udelay() here?

No, I don't think so.

Christian.

>
> Regards
> Rex
>> +	return dma_fence_is_signaled(fence);
>> +}
>> +
>>   /*
>>    * Debugfs info
>>    */
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> index 409fdd9b9710..9cc239968e40 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> @@ -168,6 +168,8 @@ struct amdgpu_ring_funcs {
>>   	/* priority functions */
>>   	void (*set_priority) (struct amdgpu_ring *ring,
>>   			      enum drm_sched_priority priority);
>> +	/* Try to soft recover the ring to make the fence signal */
>> +	void (*soft_recovery)(struct amdgpu_ring *ring, unsigned vmid);
>>   };
>>
>>   struct amdgpu_ring {
>> @@ -260,6 +262,8 @@ void amdgpu_ring_fini(struct amdgpu_ring *ring);
>> void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring
>> *ring,
>>   						uint32_t reg0, uint32_t val0,
>>   						uint32_t reg1, uint32_t val1);
>> +bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int
>> vmid,
>> +			       struct dma_fence *fence);
>>
>>   static inline void amdgpu_ring_clear_ring(struct amdgpu_ring *ring)  {
>> --
>> 2.14.1
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2018-08-23 18:25 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-08-23 11:23 [PATCH 1/4] drm/amdgpu: add ring soft recovery v3 Christian König
     [not found] ` <20180823112334.10321-1-christian.koenig-5C7GfCeVMHo@public.gmane.org>
2018-08-23 11:23   ` [PATCH 2/4] drm/amdgpu: implement soft_recovery for GFX7 Christian König
2018-08-23 11:23   ` [PATCH 3/4] drm/amdgpu: implement soft_recovery for GFX8 v2 Christian König
2018-08-23 11:23   ` [PATCH 4/4] drm/amdgpu: implement soft_recovery for GFX9 Christian König
2018-08-23 13:08   ` [PATCH 1/4] drm/amdgpu: add ring soft recovery v3 Huang Rui
2018-08-23 15:20   ` Zhu, Rex
     [not found]     ` <BYAPR12MB2775BAADFC74D3301884B5F2FB370-ZGDeBxoHBPmJeBUhB162ZQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2018-08-23 18:25       ` Christian König

This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.