All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/5] drm/amdgpu: cleanup GPU recovery check a bit
@ 2018-08-22 10:05 Christian König
       [not found] ` <20180822100531.52863-1-christian.koenig-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Christian König @ 2018-08-22 10:05 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Check if we should call the function instead of providing the forced
flag.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 38 ++++++++++++++++++++----------
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c    |  4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  3 ++-
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c      |  4 ++--
 drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c      |  3 ++-
 7 files changed, 36 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 19ef7711d944..340e40d03d54 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1158,8 +1158,9 @@ int emu_soc_asic_init(struct amdgpu_device *adev);
 #define amdgpu_asic_need_full_reset(adev) (adev)->asic_funcs->need_full_reset((adev))
 
 /* Common functions */
+bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev);
 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
-			      struct amdgpu_job* job, bool force);
+			      struct amdgpu_job* job);
 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev);
 bool amdgpu_device_need_post(struct amdgpu_device *adev);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index c23339d8ae2d..9f5e4be76d5e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3244,32 +3244,44 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
 	return r;
 }
 
+/**
+ * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
+ *
+ * @adev: amdgpu device pointer
+ *
+ * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
+ * a hung GPU.
+ */
+bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
+{
+	if (!amdgpu_device_ip_check_soft_reset(adev)) {
+		DRM_INFO("Timeout, but no hardware hang detected.\n");
+		return false;
+	}
+
+	if (amdgpu_gpu_recovery == 0 || (amdgpu_gpu_recovery == -1  &&
+					 !amdgpu_sriov_vf(adev))) {
+		DRM_INFO("GPU recovery disabled.\n");
+		return false;
+	}
+
+	return true;
+}
+
 /**
  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
  *
  * @adev: amdgpu device pointer
  * @job: which job trigger hang
- * @force: forces reset regardless of amdgpu_gpu_recovery
  *
  * Attempt to reset the GPU if it has hung (all asics).
  * Returns 0 for success or an error on failure.
  */
 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
-			      struct amdgpu_job *job, bool force)
+			      struct amdgpu_job *job)
 {
 	int i, r, resched;
 
-	if (!force && !amdgpu_device_ip_check_soft_reset(adev)) {
-		DRM_INFO("No hardware hang detected. Did some blocks stall?\n");
-		return 0;
-	}
-
-	if (!force && (amdgpu_gpu_recovery == 0 ||
-			(amdgpu_gpu_recovery == -1  && !amdgpu_sriov_vf(adev)))) {
-		DRM_INFO("GPU recovery disabled.\n");
-		return 0;
-	}
-
 	dev_info(adev->dev, "GPU reset begin!\n");
 
 	mutex_lock(&adev->lock_reset);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index e74d620d9699..68cccebb8463 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -702,7 +702,7 @@ static int amdgpu_debugfs_gpu_recover(struct seq_file *m, void *data)
 	struct amdgpu_device *adev = dev->dev_private;
 
 	seq_printf(m, "gpu recover\n");
-	amdgpu_device_gpu_recover(adev, NULL, true);
+	amdgpu_device_gpu_recover(adev, NULL);
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
index 1abf5b5bac9e..b927e8798534 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
@@ -105,8 +105,8 @@ static void amdgpu_irq_reset_work_func(struct work_struct *work)
 	struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
 						  reset_work);
 
-	if (!amdgpu_sriov_vf(adev))
-		amdgpu_device_gpu_recover(adev, NULL, false);
+	if (!amdgpu_sriov_vf(adev) && amdgpu_device_should_recover_gpu(adev))
+		amdgpu_device_gpu_recover(adev, NULL);
 }
 
 /**
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 391e2f7c03aa..265ff90f4e01 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -37,7 +37,8 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
 		  job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
 		  ring->fence_drv.sync_seq);
 
-	amdgpu_device_gpu_recover(ring->adev, job, false);
+	if (amdgpu_device_should_recover_gpu(ring->adev))
+		amdgpu_device_gpu_recover(ring->adev, job);
 }
 
 int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 078f70faedcb..8cbb4655896a 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -266,8 +266,8 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
 	}
 
 	/* Trigger recovery for world switch failure if no TDR */
-	if (amdgpu_lockup_timeout == 0)
-		amdgpu_device_gpu_recover(adev, NULL, true);
+	if (amdgpu_device_should_recover_gpu(adev))
+		amdgpu_device_gpu_recover(adev, NULL);
 }
 
 static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
index 9fc1c37344ce..842567b53df5 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
@@ -521,7 +521,8 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
 	}
 
 	/* Trigger recovery due to world switch failure */
-	amdgpu_device_gpu_recover(adev, NULL, false);
+	if (amdgpu_device_should_recover_gpu(adev))
+		amdgpu_device_gpu_recover(adev, NULL);
 }
 
 static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev,
-- 
2.14.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 2/5] drm/amdgpu: add ring soft recovery v2
       [not found] ` <20180822100531.52863-1-christian.koenig-5C7GfCeVMHo@public.gmane.org>
@ 2018-08-22 10:05   ` Christian König
       [not found]     ` <20180822100531.52863-2-christian.koenig-5C7GfCeVMHo@public.gmane.org>
  2018-08-22 10:05   ` [PATCH 3/5] drm/amdgpu: implement soft_recovery for GFX7 Christian König
                     ` (3 subsequent siblings)
  4 siblings, 1 reply; 14+ messages in thread
From: Christian König @ 2018-08-22 10:05 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Instead of hammering hard on the GPU try a soft recovery first.

v2: reorder code a bit

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c  |  6 ++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 24 ++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  4 ++++
 3 files changed, 34 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 265ff90f4e01..d93e31a5c4e7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -33,6 +33,12 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
 	struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
 	struct amdgpu_job *job = to_amdgpu_job(s_job);
 
+	if (amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
+		DRM_ERROR("ring %s timeout, but soft recovered\n",
+			  s_job->sched->name);
+		return;
+	}
+
 	DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
 		  job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
 		  ring->fence_drv.sync_seq);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 5dfd26be1eec..c045a4e38ad1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -383,6 +383,30 @@ void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
 	amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask);
 }
 
+/**
+ * amdgpu_ring_soft_recovery - try to soft recover a ring lockup
+ *
+ * @ring: ring to try the recovery on
+ * @vmid: VMID we try to get going again
+ * @fence: timedout fence
+ *
+ * Tries to get a ring proceeding again when it is stuck.
+ */
+bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
+			       struct dma_fence *fence)
+{
+	ktime_t deadline = ktime_add_us(ktime_get(), 1000);
+
+	if (!ring->funcs->soft_recovery)
+		return false;
+
+	while (!dma_fence_is_signaled(fence) &&
+	       ktime_to_ns(ktime_sub(deadline, ktime_get())) > 0)
+		ring->funcs->soft_recovery(ring, vmid);
+
+	return dma_fence_is_signaled(fence);
+}
+
 /*
  * Debugfs info
  */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 409fdd9b9710..9cc239968e40 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -168,6 +168,8 @@ struct amdgpu_ring_funcs {
 	/* priority functions */
 	void (*set_priority) (struct amdgpu_ring *ring,
 			      enum drm_sched_priority priority);
+	/* Try to soft recover the ring to make the fence signal */
+	void (*soft_recovery)(struct amdgpu_ring *ring, unsigned vmid);
 };
 
 struct amdgpu_ring {
@@ -260,6 +262,8 @@ void amdgpu_ring_fini(struct amdgpu_ring *ring);
 void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
 						uint32_t reg0, uint32_t val0,
 						uint32_t reg1, uint32_t val1);
+bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
+			       struct dma_fence *fence);
 
 static inline void amdgpu_ring_clear_ring(struct amdgpu_ring *ring)
 {
-- 
2.14.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 3/5] drm/amdgpu: implement soft_recovery for GFX7
       [not found] ` <20180822100531.52863-1-christian.koenig-5C7GfCeVMHo@public.gmane.org>
  2018-08-22 10:05   ` [PATCH 2/5] drm/amdgpu: add ring soft recovery v2 Christian König
@ 2018-08-22 10:05   ` Christian König
  2018-08-22 10:05   ` [PATCH 4/5] drm/amdgpu: implement soft_recovery for GFX8 v2 Christian König
                     ` (2 subsequent siblings)
  4 siblings, 0 replies; 14+ messages in thread
From: Christian König @ 2018-08-22 10:05 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Try to kill waves on the SQ.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
index 95452c5a9df6..a15d9c0f233b 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
@@ -4212,6 +4212,18 @@ static void gfx_v7_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
 	amdgpu_ring_write(ring, (1 << (oa_size + oa_base)) - (1 << oa_base));
 }
 
+static void gfx_v7_0_ring_soft_recovery(struct amdgpu_ring *ring, unsigned vmid)
+{
+	struct amdgpu_device *adev = ring->adev;
+	uint32_t value = 0;
+
+	value = REG_SET_FIELD(value, SQ_CMD, CMD, 0x03);
+	value = REG_SET_FIELD(value, SQ_CMD, MODE, 0x01);
+	value = REG_SET_FIELD(value, SQ_CMD, CHECK_VMID, 1);
+	value = REG_SET_FIELD(value, SQ_CMD, VM_ID, vmid);
+	WREG32(mmSQ_CMD, value);
+}
+
 static uint32_t wave_read_ind(struct amdgpu_device *adev, uint32_t simd, uint32_t wave, uint32_t address)
 {
 	WREG32(mmSQ_IND_INDEX,
@@ -5088,6 +5100,7 @@ static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_gfx = {
 	.pad_ib = amdgpu_ring_generic_pad_ib,
 	.emit_cntxcntl = gfx_v7_ring_emit_cntxcntl,
 	.emit_wreg = gfx_v7_0_ring_emit_wreg,
+	.soft_recovery = gfx_v7_0_ring_soft_recovery,
 };
 
 static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_compute = {
-- 
2.14.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 4/5] drm/amdgpu: implement soft_recovery for GFX8 v2
       [not found] ` <20180822100531.52863-1-christian.koenig-5C7GfCeVMHo@public.gmane.org>
  2018-08-22 10:05   ` [PATCH 2/5] drm/amdgpu: add ring soft recovery v2 Christian König
  2018-08-22 10:05   ` [PATCH 3/5] drm/amdgpu: implement soft_recovery for GFX7 Christian König
@ 2018-08-22 10:05   ` Christian König
  2018-08-22 10:05   ` [PATCH 5/5] drm/amdgpu: implement soft_recovery for GFX9 Christian König
  2018-08-22 14:56   ` [PATCH 1/5] drm/amdgpu: cleanup GPU recovery check a bit Andrey Grodzovsky
  4 siblings, 0 replies; 14+ messages in thread
From: Christian König @ 2018-08-22 10:05 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Try to kill waves on the SQ.

v2: only for the GFX ring for now.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 282dba6cce86..9de940a65c80 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -6714,6 +6714,18 @@ static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg,
 	amdgpu_ring_write(ring, val);
 }
 
+static void gfx_v8_0_ring_soft_recovery(struct amdgpu_ring *ring, unsigned vmid)
+{
+	struct amdgpu_device *adev = ring->adev;
+	uint32_t value = 0;
+
+	value = REG_SET_FIELD(value, SQ_CMD, CMD, 0x03);
+	value = REG_SET_FIELD(value, SQ_CMD, MODE, 0x01);
+	value = REG_SET_FIELD(value, SQ_CMD, CHECK_VMID, 1);
+	value = REG_SET_FIELD(value, SQ_CMD, VM_ID, vmid);
+	WREG32(mmSQ_CMD, value);
+}
+
 static void gfx_v8_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
 						 enum amdgpu_interrupt_state state)
 {
@@ -7171,6 +7183,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
 	.init_cond_exec = gfx_v8_0_ring_emit_init_cond_exec,
 	.patch_cond_exec = gfx_v8_0_ring_emit_patch_cond_exec,
 	.emit_wreg = gfx_v8_0_ring_emit_wreg,
+	.soft_recovery = gfx_v8_0_ring_soft_recovery,
 };
 
 static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {
-- 
2.14.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 5/5] drm/amdgpu: implement soft_recovery for GFX9
       [not found] ` <20180822100531.52863-1-christian.koenig-5C7GfCeVMHo@public.gmane.org>
                     ` (2 preceding siblings ...)
  2018-08-22 10:05   ` [PATCH 4/5] drm/amdgpu: implement soft_recovery for GFX8 v2 Christian König
@ 2018-08-22 10:05   ` Christian König
  2018-08-22 14:56   ` [PATCH 1/5] drm/amdgpu: cleanup GPU recovery check a bit Andrey Grodzovsky
  4 siblings, 0 replies; 14+ messages in thread
From: Christian König @ 2018-08-22 10:05 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Try to kill waves on the SQ.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 44707f94b2c5..ab5cacea967b 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4421,6 +4421,18 @@ static void gfx_v9_0_ring_emit_reg_write_reg_wait(struct amdgpu_ring *ring,
 							   ref, mask);
 }
 
+static void gfx_v9_0_ring_soft_recovery(struct amdgpu_ring *ring, unsigned vmid)
+{
+	struct amdgpu_device *adev = ring->adev;
+	uint32_t value = 0;
+
+	value = REG_SET_FIELD(value, SQ_CMD, CMD, 0x03);
+	value = REG_SET_FIELD(value, SQ_CMD, MODE, 0x01);
+	value = REG_SET_FIELD(value, SQ_CMD, CHECK_VMID, 1);
+	value = REG_SET_FIELD(value, SQ_CMD, VM_ID, vmid);
+	WREG32(mmSQ_CMD, value);
+}
+
 static void gfx_v9_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
 						 enum amdgpu_interrupt_state state)
 {
@@ -4743,6 +4755,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
 	.emit_wreg = gfx_v9_0_ring_emit_wreg,
 	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
 	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
+	.soft_recovery = gfx_v9_0_ring_soft_recovery,
 };
 
 static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
-- 
2.14.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/5] drm/amdgpu: cleanup GPU recovery check a bit
       [not found] ` <20180822100531.52863-1-christian.koenig-5C7GfCeVMHo@public.gmane.org>
                     ` (3 preceding siblings ...)
  2018-08-22 10:05   ` [PATCH 5/5] drm/amdgpu: implement soft_recovery for GFX9 Christian König
@ 2018-08-22 14:56   ` Andrey Grodzovsky
  4 siblings, 0 replies; 14+ messages in thread
From: Andrey Grodzovsky @ 2018-08-22 14:56 UTC (permalink / raw)
  To: Christian König, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Series is Acked-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>

Andrey


On 08/22/2018 06:05 AM, Christian König wrote:
> Check if we should call the function instead of providing the forced
> flag.
>
> Signed-off-by: Christian König <christian.koenig@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  3 ++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 38 ++++++++++++++++++++----------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  |  2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c    |  4 ++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  3 ++-
>   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c      |  4 ++--
>   drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c      |  3 ++-
>   7 files changed, 36 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 19ef7711d944..340e40d03d54 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1158,8 +1158,9 @@ int emu_soc_asic_init(struct amdgpu_device *adev);
>   #define amdgpu_asic_need_full_reset(adev) (adev)->asic_funcs->need_full_reset((adev))
>   
>   /* Common functions */
> +bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev);
>   int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
> -			      struct amdgpu_job* job, bool force);
> +			      struct amdgpu_job* job);
>   void amdgpu_device_pci_config_reset(struct amdgpu_device *adev);
>   bool amdgpu_device_need_post(struct amdgpu_device *adev);
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index c23339d8ae2d..9f5e4be76d5e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3244,32 +3244,44 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
>   	return r;
>   }
>   
> +/**
> + * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
> + *
> + * @adev: amdgpu device pointer
> + *
> + * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
> + * a hung GPU.
> + */
> +bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
> +{
> +	if (!amdgpu_device_ip_check_soft_reset(adev)) {
> +		DRM_INFO("Timeout, but no hardware hang detected.\n");
> +		return false;
> +	}
> +
> +	if (amdgpu_gpu_recovery == 0 || (amdgpu_gpu_recovery == -1  &&
> +					 !amdgpu_sriov_vf(adev))) {
> +		DRM_INFO("GPU recovery disabled.\n");
> +		return false;
> +	}
> +
> +	return true;
> +}
> +
>   /**
>    * amdgpu_device_gpu_recover - reset the asic and recover scheduler
>    *
>    * @adev: amdgpu device pointer
>    * @job: which job trigger hang
> - * @force: forces reset regardless of amdgpu_gpu_recovery
>    *
>    * Attempt to reset the GPU if it has hung (all asics).
>    * Returns 0 for success or an error on failure.
>    */
>   int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
> -			      struct amdgpu_job *job, bool force)
> +			      struct amdgpu_job *job)
>   {
>   	int i, r, resched;
>   
> -	if (!force && !amdgpu_device_ip_check_soft_reset(adev)) {
> -		DRM_INFO("No hardware hang detected. Did some blocks stall?\n");
> -		return 0;
> -	}
> -
> -	if (!force && (amdgpu_gpu_recovery == 0 ||
> -			(amdgpu_gpu_recovery == -1  && !amdgpu_sriov_vf(adev)))) {
> -		DRM_INFO("GPU recovery disabled.\n");
> -		return 0;
> -	}
> -
>   	dev_info(adev->dev, "GPU reset begin!\n");
>   
>   	mutex_lock(&adev->lock_reset);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index e74d620d9699..68cccebb8463 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -702,7 +702,7 @@ static int amdgpu_debugfs_gpu_recover(struct seq_file *m, void *data)
>   	struct amdgpu_device *adev = dev->dev_private;
>   
>   	seq_printf(m, "gpu recover\n");
> -	amdgpu_device_gpu_recover(adev, NULL, true);
> +	amdgpu_device_gpu_recover(adev, NULL);
>   
>   	return 0;
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> index 1abf5b5bac9e..b927e8798534 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> @@ -105,8 +105,8 @@ static void amdgpu_irq_reset_work_func(struct work_struct *work)
>   	struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
>   						  reset_work);
>   
> -	if (!amdgpu_sriov_vf(adev))
> -		amdgpu_device_gpu_recover(adev, NULL, false);
> +	if (!amdgpu_sriov_vf(adev) && amdgpu_device_should_recover_gpu(adev))
> +		amdgpu_device_gpu_recover(adev, NULL);
>   }
>   
>   /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 391e2f7c03aa..265ff90f4e01 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -37,7 +37,8 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
>   		  job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
>   		  ring->fence_drv.sync_seq);
>   
> -	amdgpu_device_gpu_recover(ring->adev, job, false);
> +	if (amdgpu_device_should_recover_gpu(ring->adev))
> +		amdgpu_device_gpu_recover(ring->adev, job);
>   }
>   
>   int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> index 078f70faedcb..8cbb4655896a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> @@ -266,8 +266,8 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
>   	}
>   
>   	/* Trigger recovery for world switch failure if no TDR */
> -	if (amdgpu_lockup_timeout == 0)
> -		amdgpu_device_gpu_recover(adev, NULL, true);
> +	if (amdgpu_device_should_recover_gpu(adev))
> +		amdgpu_device_gpu_recover(adev, NULL);
>   }
>   
>   static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev,
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> index 9fc1c37344ce..842567b53df5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> @@ -521,7 +521,8 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
>   	}
>   
>   	/* Trigger recovery due to world switch failure */
> -	amdgpu_device_gpu_recover(adev, NULL, false);
> +	if (amdgpu_device_should_recover_gpu(adev))
> +		amdgpu_device_gpu_recover(adev, NULL);
>   }
>   
>   static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/5] drm/amdgpu: add ring soft recovery v2
       [not found]     ` <20180822100531.52863-2-christian.koenig-5C7GfCeVMHo@public.gmane.org>
@ 2018-08-22 16:55       ` Alex Deucher
       [not found]         ` <CADnq5_PU5TT4Q9oE+egAd2S4UgDENLu-Las_fnUZrNJUS=rmCw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Alex Deucher @ 2018-08-22 16:55 UTC (permalink / raw)
  To: Christian König; +Cc: amd-gfx list

On Wed, Aug 22, 2018 at 6:05 AM Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
>
> Instead of hammering hard on the GPU try a soft recovery first.
>
> v2: reorder code a bit
>
> Signed-off-by: Christian König <christian.koenig@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c  |  6 ++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 24 ++++++++++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  4 ++++
>  3 files changed, 34 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 265ff90f4e01..d93e31a5c4e7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -33,6 +33,12 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
>         struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
>         struct amdgpu_job *job = to_amdgpu_job(s_job);
>
> +       if (amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
> +               DRM_ERROR("ring %s timeout, but soft recovered\n",
> +                         s_job->sched->name);
> +               return;
> +       }

I think we should still bubble up the error to userspace even if we
can recover.  Data is lost when the wave is killed.  We should treat
it like a GPU reset.

Alex

> +
>         DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
>                   job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
>                   ring->fence_drv.sync_seq);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> index 5dfd26be1eec..c045a4e38ad1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> @@ -383,6 +383,30 @@ void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
>         amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask);
>  }
>
> +/**
> + * amdgpu_ring_soft_recovery - try to soft recover a ring lockup
> + *
> + * @ring: ring to try the recovery on
> + * @vmid: VMID we try to get going again
> + * @fence: timedout fence
> + *
> + * Tries to get a ring proceeding again when it is stuck.
> + */
> +bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
> +                              struct dma_fence *fence)
> +{
> +       ktime_t deadline = ktime_add_us(ktime_get(), 1000);
> +
> +       if (!ring->funcs->soft_recovery)
> +               return false;
> +
> +       while (!dma_fence_is_signaled(fence) &&
> +              ktime_to_ns(ktime_sub(deadline, ktime_get())) > 0)
> +               ring->funcs->soft_recovery(ring, vmid);
> +
> +       return dma_fence_is_signaled(fence);
> +}
> +
>  /*
>   * Debugfs info
>   */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 409fdd9b9710..9cc239968e40 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -168,6 +168,8 @@ struct amdgpu_ring_funcs {
>         /* priority functions */
>         void (*set_priority) (struct amdgpu_ring *ring,
>                               enum drm_sched_priority priority);
> +       /* Try to soft recover the ring to make the fence signal */
> +       void (*soft_recovery)(struct amdgpu_ring *ring, unsigned vmid);
>  };
>
>  struct amdgpu_ring {
> @@ -260,6 +262,8 @@ void amdgpu_ring_fini(struct amdgpu_ring *ring);
>  void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
>                                                 uint32_t reg0, uint32_t val0,
>                                                 uint32_t reg1, uint32_t val1);
> +bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
> +                              struct dma_fence *fence);
>
>  static inline void amdgpu_ring_clear_ring(struct amdgpu_ring *ring)
>  {
> --
> 2.14.1
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/5] drm/amdgpu: add ring soft recovery v2
       [not found]         ` <CADnq5_PU5TT4Q9oE+egAd2S4UgDENLu-Las_fnUZrNJUS=rmCw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2018-08-22 19:32           ` Marek Olšák
       [not found]             ` <CAAxE2A5L5UjsWVMSBGVe82GyVS3LAPtVh-5QBb5Ope5ZVEHKig-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  2018-08-23  7:17           ` Huang Rui
  1 sibling, 1 reply; 14+ messages in thread
From: Marek Olšák @ 2018-08-22 19:32 UTC (permalink / raw)
  To: Alex Deucher; +Cc: Christian König, amd-gfx mailing list

On Wed, Aug 22, 2018 at 12:56 PM Alex Deucher <alexdeucher@gmail.com> wrote:
>
> On Wed, Aug 22, 2018 at 6:05 AM Christian König
> <ckoenig.leichtzumerken@gmail.com> wrote:
> >
> > Instead of hammering hard on the GPU try a soft recovery first.
> >
> > v2: reorder code a bit
> >
> > Signed-off-by: Christian König <christian.koenig@amd.com>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c  |  6 ++++++
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 24 ++++++++++++++++++++++++
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  4 ++++
> >  3 files changed, 34 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > index 265ff90f4e01..d93e31a5c4e7 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > @@ -33,6 +33,12 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
> >         struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
> >         struct amdgpu_job *job = to_amdgpu_job(s_job);
> >
> > +       if (amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
> > +               DRM_ERROR("ring %s timeout, but soft recovered\n",
> > +                         s_job->sched->name);
> > +               return;
> > +       }
>
> I think we should still bubble up the error to userspace even if we
> can recover.  Data is lost when the wave is killed.  We should treat
> it like a GPU reset.

Yes, please increment gpu_reset_counter, so that we are compliant with
OpenGL. Being able to recover from infinite loops is great, but test
suites also expect this to be properly reported to userspace via the
per-context query.

Also please bump the deadline to 1 second. Even if you kill all
shaders, the IB can also contain CP DMA, which may take longer than 1
ms.

Marek

Marek

>
> Alex
>
> > +
> >         DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
> >                   job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
> >                   ring->fence_drv.sync_seq);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> > index 5dfd26be1eec..c045a4e38ad1 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> > @@ -383,6 +383,30 @@ void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
> >         amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask);
> >  }
> >
> > +/**
> > + * amdgpu_ring_soft_recovery - try to soft recover a ring lockup
> > + *
> > + * @ring: ring to try the recovery on
> > + * @vmid: VMID we try to get going again
> > + * @fence: timedout fence
> > + *
> > + * Tries to get a ring proceeding again when it is stuck.
> > + */
> > +bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
> > +                              struct dma_fence *fence)
> > +{
> > +       ktime_t deadline = ktime_add_us(ktime_get(), 1000);
> > +
> > +       if (!ring->funcs->soft_recovery)
> > +               return false;
> > +
> > +       while (!dma_fence_is_signaled(fence) &&
> > +              ktime_to_ns(ktime_sub(deadline, ktime_get())) > 0)
> > +               ring->funcs->soft_recovery(ring, vmid);
> > +
> > +       return dma_fence_is_signaled(fence);
> > +}
> > +
> >  /*
> >   * Debugfs info
> >   */
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> > index 409fdd9b9710..9cc239968e40 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> > @@ -168,6 +168,8 @@ struct amdgpu_ring_funcs {
> >         /* priority functions */
> >         void (*set_priority) (struct amdgpu_ring *ring,
> >                               enum drm_sched_priority priority);
> > +       /* Try to soft recover the ring to make the fence signal */
> > +       void (*soft_recovery)(struct amdgpu_ring *ring, unsigned vmid);
> >  };
> >
> >  struct amdgpu_ring {
> > @@ -260,6 +262,8 @@ void amdgpu_ring_fini(struct amdgpu_ring *ring);
> >  void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
> >                                                 uint32_t reg0, uint32_t val0,
> >                                                 uint32_t reg1, uint32_t val1);
> > +bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
> > +                              struct dma_fence *fence);
> >
> >  static inline void amdgpu_ring_clear_ring(struct amdgpu_ring *ring)
> >  {
> > --
> > 2.14.1
> >
> > _______________________________________________
> > amd-gfx mailing list
> > amd-gfx@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/5] drm/amdgpu: add ring soft recovery v2
       [not found]             ` <CAAxE2A5L5UjsWVMSBGVe82GyVS3LAPtVh-5QBb5Ope5ZVEHKig-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2018-08-23  6:51               ` Christian König
       [not found]                 ` <b4b062be-2bc8-035b-e5db-2d17de698826-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Christian König @ 2018-08-23  6:51 UTC (permalink / raw)
  To: Marek Olšák, Alex Deucher; +Cc: amd-gfx mailing list

Am 22.08.2018 um 21:32 schrieb Marek Olšák:
> On Wed, Aug 22, 2018 at 12:56 PM Alex Deucher <alexdeucher@gmail.com> wrote:
>> On Wed, Aug 22, 2018 at 6:05 AM Christian König
>> <ckoenig.leichtzumerken@gmail.com> wrote:
>>> Instead of hammering hard on the GPU try a soft recovery first.
>>>
>>> v2: reorder code a bit
>>>
>>> Signed-off-by: Christian König <christian.koenig@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c  |  6 ++++++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 24 ++++++++++++++++++++++++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  4 ++++
>>>   3 files changed, 34 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> index 265ff90f4e01..d93e31a5c4e7 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> @@ -33,6 +33,12 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
>>>          struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
>>>          struct amdgpu_job *job = to_amdgpu_job(s_job);
>>>
>>> +       if (amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
>>> +               DRM_ERROR("ring %s timeout, but soft recovered\n",
>>> +                         s_job->sched->name);
>>> +               return;
>>> +       }
>> I think we should still bubble up the error to userspace even if we
>> can recover.  Data is lost when the wave is killed.  We should treat
>> it like a GPU reset.
> Yes, please increment gpu_reset_counter, so that we are compliant with
> OpenGL. Being able to recover from infinite loops is great, but test
> suites also expect this to be properly reported to userspace via the
> per-context query.

Sure that shouldn't be a problem.

> Also please bump the deadline to 1 second. Even you if you kill all
> shaders, the IB can also contain CP DMA, which may take longer than 1
> ms.

Is there any way we can get feedback from the SQ on whether the kill
was successful?

1 second is way too long, since in the case of a blocked MC we need to
start the hard reset relatively fast.

Regards,
Christian.

>
> Marek
>
> Marek
>
>> Alex
>>
>>> +
>>>          DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
>>>                    job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
>>>                    ring->fence_drv.sync_seq);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> index 5dfd26be1eec..c045a4e38ad1 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> @@ -383,6 +383,30 @@ void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
>>>          amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask);
>>>   }
>>>
>>> +/**
>>> + * amdgpu_ring_soft_recovery - try to soft recover a ring lockup
>>> + *
>>> + * @ring: ring to try the recovery on
>>> + * @vmid: VMID we try to get going again
>>> + * @fence: timedout fence
>>> + *
>>> + * Tries to get a ring proceeding again when it is stuck.
>>> + */
>>> +bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
>>> +                              struct dma_fence *fence)
>>> +{
>>> +       ktime_t deadline = ktime_add_us(ktime_get(), 1000);
>>> +
>>> +       if (!ring->funcs->soft_recovery)
>>> +               return false;
>>> +
>>> +       while (!dma_fence_is_signaled(fence) &&
>>> +              ktime_to_ns(ktime_sub(deadline, ktime_get())) > 0)
>>> +               ring->funcs->soft_recovery(ring, vmid);
>>> +
>>> +       return dma_fence_is_signaled(fence);
>>> +}
>>> +
>>>   /*
>>>    * Debugfs info
>>>    */
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> index 409fdd9b9710..9cc239968e40 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> @@ -168,6 +168,8 @@ struct amdgpu_ring_funcs {
>>>          /* priority functions */
>>>          void (*set_priority) (struct amdgpu_ring *ring,
>>>                                enum drm_sched_priority priority);
>>> +       /* Try to soft recover the ring to make the fence signal */
>>> +       void (*soft_recovery)(struct amdgpu_ring *ring, unsigned vmid);
>>>   };
>>>
>>>   struct amdgpu_ring {
>>> @@ -260,6 +262,8 @@ void amdgpu_ring_fini(struct amdgpu_ring *ring);
>>>   void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
>>>                                                  uint32_t reg0, uint32_t val0,
>>>                                                  uint32_t reg1, uint32_t val1);
>>> +bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
>>> +                              struct dma_fence *fence);
>>>
>>>   static inline void amdgpu_ring_clear_ring(struct amdgpu_ring *ring)
>>>   {
>>> --
>>> 2.14.1
>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/5] drm/amdgpu: add ring soft recovery v2
       [not found]         ` <CADnq5_PU5TT4Q9oE+egAd2S4UgDENLu-Las_fnUZrNJUS=rmCw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  2018-08-22 19:32           ` Marek Olšák
@ 2018-08-23  7:17           ` Huang Rui
  2018-08-23  9:21             ` Christian König
  1 sibling, 1 reply; 14+ messages in thread
From: Huang Rui @ 2018-08-23  7:17 UTC (permalink / raw)
  To: Alex Deucher; +Cc: Christian König, amd-gfx list

On Wed, Aug 22, 2018 at 12:55:43PM -0400, Alex Deucher wrote:
> On Wed, Aug 22, 2018 at 6:05 AM Christian König
> <ckoenig.leichtzumerken@gmail.com> wrote:
> >
> > Instead of hammering hard on the GPU try a soft recovery first.
> >
> > v2: reorder code a bit
> >
> > Signed-off-by: Christian König <christian.koenig@amd.com>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c  |  6 ++++++
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 24 ++++++++++++++++++++++++
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  4 ++++
> >  3 files changed, 34 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > index 265ff90f4e01..d93e31a5c4e7 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > @@ -33,6 +33,12 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
> >         struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
> >         struct amdgpu_job *job = to_amdgpu_job(s_job);
> >
> > +       if (amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
> > +               DRM_ERROR("ring %s timeout, but soft recovered\n",
> > +                         s_job->sched->name);
> > +               return;
> > +       }
> 
> I think we should still bubble up the error to userspace even if we
> can recover.  Data is lost when the wave is killed.  We should treat
> it like a GPU reset.
> 

May I know what the wavefront stands for? Why can we do the "light"
recovery here rather than a reset?

Thanks,
Ray

> Alex
> 
> > +
> >         DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
> >                   job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
> >                   ring->fence_drv.sync_seq);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> > index 5dfd26be1eec..c045a4e38ad1 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> > @@ -383,6 +383,30 @@ void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
> >         amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask);
> >  }
> >
> > +/**
> > + * amdgpu_ring_soft_recovery - try to soft recover a ring lockup
> > + *
> > + * @ring: ring to try the recovery on
> > + * @vmid: VMID we try to get going again
> > + * @fence: timedout fence
> > + *
> > + * Tries to get a ring proceeding again when it is stuck.
> > + */
> > +bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
> > +                              struct dma_fence *fence)
> > +{
> > +       ktime_t deadline = ktime_add_us(ktime_get(), 1000);
> > +
> > +       if (!ring->funcs->soft_recovery)
> > +               return false;
> > +
> > +       while (!dma_fence_is_signaled(fence) &&
> > +              ktime_to_ns(ktime_sub(deadline, ktime_get())) > 0)
> > +               ring->funcs->soft_recovery(ring, vmid);
> > +
> > +       return dma_fence_is_signaled(fence);
> > +}
> > +
> >  /*
> >   * Debugfs info
> >   */
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> > index 409fdd9b9710..9cc239968e40 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> > @@ -168,6 +168,8 @@ struct amdgpu_ring_funcs {
> >         /* priority functions */
> >         void (*set_priority) (struct amdgpu_ring *ring,
> >                               enum drm_sched_priority priority);
> > +       /* Try to soft recover the ring to make the fence signal */
> > +       void (*soft_recovery)(struct amdgpu_ring *ring, unsigned vmid);
> >  };
> >
> >  struct amdgpu_ring {
> > @@ -260,6 +262,8 @@ void amdgpu_ring_fini(struct amdgpu_ring *ring);
> >  void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
> >                                                 uint32_t reg0, uint32_t val0,
> >                                                 uint32_t reg1, uint32_t val1);
> > +bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
> > +                              struct dma_fence *fence);
> >
> >  static inline void amdgpu_ring_clear_ring(struct amdgpu_ring *ring)
> >  {
> > --
> > 2.14.1
> >
> > _______________________________________________
> > amd-gfx mailing list
> > amd-gfx@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/5] drm/amdgpu: add ring soft recovery v2
  2018-08-23  7:17           ` Huang Rui
@ 2018-08-23  9:21             ` Christian König
       [not found]               ` <11e0af50-f589-b2e1-3431-8125fe2014ad-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Christian König @ 2018-08-23  9:21 UTC (permalink / raw)
  To: Huang Rui, Alex Deucher; +Cc: amd-gfx list

Am 23.08.2018 um 09:17 schrieb Huang Rui:
> On Wed, Aug 22, 2018 at 12:55:43PM -0400, Alex Deucher wrote:
>> On Wed, Aug 22, 2018 at 6:05 AM Christian König
>> <ckoenig.leichtzumerken@gmail.com> wrote:
>>> Instead of hammering hard on the GPU try a soft recovery first.
>>>
>>> v2: reorder code a bit
>>>
>>> Signed-off-by: Christian König <christian.koenig@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c  |  6 ++++++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 24 ++++++++++++++++++++++++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  4 ++++
>>>   3 files changed, 34 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> index 265ff90f4e01..d93e31a5c4e7 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> @@ -33,6 +33,12 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
>>>          struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
>>>          struct amdgpu_job *job = to_amdgpu_job(s_job);
>>>
>>> +       if (amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
>>> +               DRM_ERROR("ring %s timeout, but soft recovered\n",
>>> +                         s_job->sched->name);
>>> +               return;
>>> +       }
>> I think we should still bubble up the error to userspace even if we
>> can recover.  Data is lost when the wave is killed.  We should treat
>> it like a GPU reset.
>>
> May I know what does the wavefront stand for? Why we can do the "light"
> recover than reset here?

Wavefront means a running shader in the SQ.

Basically this only covers the case when the application sends down a 
shader with an endless loop to the hardware. Here we just kill the 
shader and try to continue.

When you run into a hang because of a corrupted resource descriptor you
usually need a full ASIC reset to get out of that again.

Regards,
Christian.

>
> Thanks,
> Ray
>
>> Alex
>>
>>> +
>>>          DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
>>>                    job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
>>>                    ring->fence_drv.sync_seq);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> index 5dfd26be1eec..c045a4e38ad1 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> @@ -383,6 +383,30 @@ void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
>>>          amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask);
>>>   }
>>>
>>> +/**
>>> + * amdgpu_ring_soft_recovery - try to soft recover a ring lockup
>>> + *
>>> + * @ring: ring to try the recovery on
>>> + * @vmid: VMID we try to get going again
>>> + * @fence: timedout fence
>>> + *
>>> + * Tries to get a ring proceeding again when it is stuck.
>>> + */
>>> +bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
>>> +                              struct dma_fence *fence)
>>> +{
>>> +       ktime_t deadline = ktime_add_us(ktime_get(), 1000);
>>> +
>>> +       if (!ring->funcs->soft_recovery)
>>> +               return false;
>>> +
>>> +       while (!dma_fence_is_signaled(fence) &&
>>> +              ktime_to_ns(ktime_sub(deadline, ktime_get())) > 0)
>>> +               ring->funcs->soft_recovery(ring, vmid);
>>> +
>>> +       return dma_fence_is_signaled(fence);
>>> +}
>>> +
>>>   /*
>>>    * Debugfs info
>>>    */
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> index 409fdd9b9710..9cc239968e40 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> @@ -168,6 +168,8 @@ struct amdgpu_ring_funcs {
>>>          /* priority functions */
>>>          void (*set_priority) (struct amdgpu_ring *ring,
>>>                                enum drm_sched_priority priority);
>>> +       /* Try to soft recover the ring to make the fence signal */
>>> +       void (*soft_recovery)(struct amdgpu_ring *ring, unsigned vmid);
>>>   };
>>>
>>>   struct amdgpu_ring {
>>> @@ -260,6 +262,8 @@ void amdgpu_ring_fini(struct amdgpu_ring *ring);
>>>   void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
>>>                                                  uint32_t reg0, uint32_t val0,
>>>                                                  uint32_t reg1, uint32_t val1);
>>> +bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
>>> +                              struct dma_fence *fence);
>>>
>>>   static inline void amdgpu_ring_clear_ring(struct amdgpu_ring *ring)
>>>   {
>>> --
>>> 2.14.1
>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/5] drm/amdgpu: add ring soft recovery v2
       [not found]               ` <11e0af50-f589-b2e1-3431-8125fe2014ad-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2018-08-23 13:03                 ` Huang Rui
  0 siblings, 0 replies; 14+ messages in thread
From: Huang Rui @ 2018-08-23 13:03 UTC (permalink / raw)
  To: Koenig, Christian; +Cc: Alex Deucher, amd-gfx list

On Thu, Aug 23, 2018 at 05:21:53PM +0800, Christian König wrote:
> Am 23.08.2018 um 09:17 schrieb Huang Rui:
> > On Wed, Aug 22, 2018 at 12:55:43PM -0400, Alex Deucher wrote:
> >> On Wed, Aug 22, 2018 at 6:05 AM Christian König
> >> <ckoenig.leichtzumerken@gmail.com> wrote:
> >>> Instead of hammering hard on the GPU try a soft recovery first.
> >>>
> >>> v2: reorder code a bit
> >>>
> >>> Signed-off-by: Christian König <christian.koenig@amd.com>
> >>> ---
> >>>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c  |  6 ++++++
> >>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 24 ++++++++++++++++++++++++
> >>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  4 ++++
> >>>   3 files changed, 34 insertions(+)
> >>>
> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> >>> index 265ff90f4e01..d93e31a5c4e7 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> >>> @@ -33,6 +33,12 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
> >>>          struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
> >>>          struct amdgpu_job *job = to_amdgpu_job(s_job);
> >>>
> >>> +       if (amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
> >>> +               DRM_ERROR("ring %s timeout, but soft recovered\n",
> >>> +                         s_job->sched->name);
> >>> +               return;
> >>> +       }
> >> I think we should still bubble up the error to userspace even if we
> >> can recover.  Data is lost when the wave is killed.  We should treat
> >> it like a GPU reset.
> >>
> > May I know what does the wavefront stand for? Why we can do the "light"
> > recover than reset here?
> 
> Wavefront means a running shader in the SQ.
> 
> Basically this only covers the case when the application sends down a 
> shader with an endless loop to the hardware. Here we just kill the 
> shader and try to continue.
> 
> When you run into a hang because of a corrupted resource descriptor you 
> need usually need a full ASIC reset to get out of that again.
> 

Good to know this, thank you.

Series are Acked-by: Huang Rui <ray.huang@amd.com>

> Regards,
> Christian.
> 
> >
> > Thanks,
> > Ray
> >
> >> Alex
> >>
> >>> +
> >>>          DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
> >>>                    job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
> >>>                    ring->fence_drv.sync_seq);
> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> >>> index 5dfd26be1eec..c045a4e38ad1 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> >>> @@ -383,6 +383,30 @@ void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
> >>>          amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask);
> >>>   }
> >>>
> >>> +/**
> >>> + * amdgpu_ring_soft_recovery - try to soft recover a ring lockup
> >>> + *
> >>> + * @ring: ring to try the recovery on
> >>> + * @vmid: VMID we try to get going again
> >>> + * @fence: timedout fence
> >>> + *
> >>> + * Tries to get a ring proceeding again when it is stuck.
> >>> + */
> >>> +bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
> >>> +                              struct dma_fence *fence)
> >>> +{
> >>> +       ktime_t deadline = ktime_add_us(ktime_get(), 1000);
> >>> +
> >>> +       if (!ring->funcs->soft_recovery)
> >>> +               return false;
> >>> +
> >>> +       while (!dma_fence_is_signaled(fence) &&
> >>> +              ktime_to_ns(ktime_sub(deadline, ktime_get())) > 0)
> >>> +               ring->funcs->soft_recovery(ring, vmid);
> >>> +
> >>> +       return dma_fence_is_signaled(fence);
> >>> +}
> >>> +
> >>>   /*
> >>>    * Debugfs info
> >>>    */
> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> >>> index 409fdd9b9710..9cc239968e40 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> >>> @@ -168,6 +168,8 @@ struct amdgpu_ring_funcs {
> >>>          /* priority functions */
> >>>          void (*set_priority) (struct amdgpu_ring *ring,
> >>>                                enum drm_sched_priority priority);
> >>> +       /* Try to soft recover the ring to make the fence signal */
> >>> +       void (*soft_recovery)(struct amdgpu_ring *ring, unsigned vmid);
> >>>   };
> >>>
> >>>   struct amdgpu_ring {
> >>> @@ -260,6 +262,8 @@ void amdgpu_ring_fini(struct amdgpu_ring *ring);
> >>>   void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
> >>>                                                  uint32_t reg0, uint32_t val0,
> >>>                                                  uint32_t reg1, uint32_t val1);
> >>> +bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
> >>> +                              struct dma_fence *fence);
> >>>
> >>>   static inline void amdgpu_ring_clear_ring(struct amdgpu_ring *ring)
> >>>   {
> >>> --
> >>> 2.14.1
> >>>
> >>> _______________________________________________
> >>> amd-gfx mailing list
> >>> amd-gfx@lists.freedesktop.org
> >>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> >> _______________________________________________
> >> amd-gfx mailing list
> >> amd-gfx@lists.freedesktop.org
> >> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> 
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/5] drm/amdgpu: add ring soft recovery v2
       [not found]                 ` <b4b062be-2bc8-035b-e5db-2d17de698826-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2018-08-23 20:21                   ` Marek Olšák
       [not found]                     ` <CAAxE2A4TKbg-Z07u9nsr=k3ujzA7p8m-2NEiAFmHP8Lab_Zy4g-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Marek Olšák @ 2018-08-23 20:21 UTC (permalink / raw)
  To: Christian König; +Cc: Alex Deucher, amd-gfx mailing list

On Thu, Aug 23, 2018 at 2:51 AM Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
>
> Am 22.08.2018 um 21:32 schrieb Marek Olšák:
> > On Wed, Aug 22, 2018 at 12:56 PM Alex Deucher <alexdeucher@gmail.com> wrote:
> >> On Wed, Aug 22, 2018 at 6:05 AM Christian König
> >> <ckoenig.leichtzumerken@gmail.com> wrote:
> >>> Instead of hammering hard on the GPU try a soft recovery first.
> >>>
> >>> v2: reorder code a bit
> >>>
> >>> Signed-off-by: Christian König <christian.koenig@amd.com>
> >>> ---
> >>>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c  |  6 ++++++
> >>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 24 ++++++++++++++++++++++++
> >>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  4 ++++
> >>>   3 files changed, 34 insertions(+)
> >>>
> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> >>> index 265ff90f4e01..d93e31a5c4e7 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> >>> @@ -33,6 +33,12 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
> >>>          struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
> >>>          struct amdgpu_job *job = to_amdgpu_job(s_job);
> >>>
> >>> +       if (amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
> >>> +               DRM_ERROR("ring %s timeout, but soft recovered\n",
> >>> +                         s_job->sched->name);
> >>> +               return;
> >>> +       }
> >> I think we should still bubble up the error to userspace even if we
> >> can recover.  Data is lost when the wave is killed.  We should treat
> >> it like a GPU reset.
> > Yes, please increment gpu_reset_counter, so that we are compliant with
> > OpenGL. Being able to recover from infinite loops is great, but test
> > suites also expect this to be properly reported to userspace via the
> > per-context query.
>
> Sure that shouldn't be a problem.
>
> > Also please bump the deadline to 1 second. Even if you kill all
> > shaders, the IB can also contain CP DMA, which may take longer than 1
> > ms.
>
> Is there any way we can get a feedback from the SQ if the kill was
> successful?

I don't think so. The kill should be finished pretty quickly, but more
waves with infinite loops may be waiting to be launched, so you still
need to repeat the kill command. And we should ideally repeat it for 1
second.

The reason is that vertex shader waves take a lot of time to launch. A
very very very large draw call can keep launching new waves for 1
second with the same infinite loop. You would have to soft-reset all
VGTs to stop that.

>
> 1 second is way too long, since in the case of a blocked MC we need to
> start up hard reset relatively fast.

10 seconds have already passed.

I think that some hangs from corrupted descriptors may still be
recoverable just by killing waves.

Marek

>
> Regards,
> Christian.
>
> >
> > Marek
> >
> > Marek
> >
> >> Alex
> >>
> >>> +
> >>>          DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
> >>>                    job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
> >>>                    ring->fence_drv.sync_seq);
> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> >>> index 5dfd26be1eec..c045a4e38ad1 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> >>> @@ -383,6 +383,30 @@ void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
> >>>          amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask);
> >>>   }
> >>>
> >>> +/**
> >>> + * amdgpu_ring_soft_recovery - try to soft recover a ring lockup
> >>> + *
> >>> + * @ring: ring to try the recovery on
> >>> + * @vmid: VMID we try to get going again
> >>> + * @fence: timedout fence
> >>> + *
> >>> + * Tries to get a ring proceeding again when it is stuck.
> >>> + */
> >>> +bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
> >>> +                              struct dma_fence *fence)
> >>> +{
> >>> +       ktime_t deadline = ktime_add_us(ktime_get(), 1000);
> >>> +
> >>> +       if (!ring->funcs->soft_recovery)
> >>> +               return false;
> >>> +
> >>> +       while (!dma_fence_is_signaled(fence) &&
> >>> +              ktime_to_ns(ktime_sub(deadline, ktime_get())) > 0)
> >>> +               ring->funcs->soft_recovery(ring, vmid);
> >>> +
> >>> +       return dma_fence_is_signaled(fence);
> >>> +}
> >>> +
> >>>   /*
> >>>    * Debugfs info
> >>>    */
> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> >>> index 409fdd9b9710..9cc239968e40 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> >>> @@ -168,6 +168,8 @@ struct amdgpu_ring_funcs {
> >>>          /* priority functions */
> >>>          void (*set_priority) (struct amdgpu_ring *ring,
> >>>                                enum drm_sched_priority priority);
> >>> +       /* Try to soft recover the ring to make the fence signal */
> >>> +       void (*soft_recovery)(struct amdgpu_ring *ring, unsigned vmid);
> >>>   };
> >>>
> >>>   struct amdgpu_ring {
> >>> @@ -260,6 +262,8 @@ void amdgpu_ring_fini(struct amdgpu_ring *ring);
> >>>   void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
> >>>                                                  uint32_t reg0, uint32_t val0,
> >>>                                                  uint32_t reg1, uint32_t val1);
> >>> +bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
> >>> +                              struct dma_fence *fence);
> >>>
> >>>   static inline void amdgpu_ring_clear_ring(struct amdgpu_ring *ring)
> >>>   {
> >>> --
> >>> 2.14.1
> >>>
> >>> _______________________________________________
> >>> amd-gfx mailing list
> >>> amd-gfx@lists.freedesktop.org
> >>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> >> _______________________________________________
> >> amd-gfx mailing list
> >> amd-gfx@lists.freedesktop.org
> >> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/5] drm/amdgpu: add ring soft recovery v2
       [not found]                     ` <CAAxE2A4TKbg-Z07u9nsr=k3ujzA7p8m-2NEiAFmHP8Lab_Zy4g-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2018-08-24  7:28                       ` Christian König
  0 siblings, 0 replies; 14+ messages in thread
From: Christian König @ 2018-08-24  7:28 UTC (permalink / raw)
  To: Marek Olšák, Christian König
  Cc: Alex Deucher, amd-gfx mailing list

Am 23.08.2018 um 22:21 schrieb Marek Olšák:
> On Thu, Aug 23, 2018 at 2:51 AM Christian König
> <ckoenig.leichtzumerken@gmail.com> wrote:
>> Am 22.08.2018 um 21:32 schrieb Marek Olšák:
>>> On Wed, Aug 22, 2018 at 12:56 PM Alex Deucher <alexdeucher@gmail.com> wrote:
>>>> On Wed, Aug 22, 2018 at 6:05 AM Christian König
>>>> <ckoenig.leichtzumerken@gmail.com> wrote:
>>>>> Instead of hammering hard on the GPU try a soft recovery first.
>>>>>
>>>>> v2: reorder code a bit
>>>>>
>>>>> Signed-off-by: Christian König <christian.koenig@amd.com>
>>>>> ---
>>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_job.c  |  6 ++++++
>>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 24 ++++++++++++++++++++++++
>>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  4 ++++
>>>>>    3 files changed, 34 insertions(+)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>>> index 265ff90f4e01..d93e31a5c4e7 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>>> @@ -33,6 +33,12 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
>>>>>           struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
>>>>>           struct amdgpu_job *job = to_amdgpu_job(s_job);
>>>>>
>>>>> +       if (amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
>>>>> +               DRM_ERROR("ring %s timeout, but soft recovered\n",
>>>>> +                         s_job->sched->name);
>>>>> +               return;
>>>>> +       }
>>>> I think we should still bubble up the error to userspace even if we
>>>> can recover.  Data is lost when the wave is killed.  We should treat
>>>> it like a GPU reset.
>>> Yes, please increment gpu_reset_counter, so that we are compliant with
>>> OpenGL. Being able to recover from infinite loops is great, but test
>>> suites also expect this to be properly reported to userspace via the
>>> per-context query.
>> Sure that shouldn't be a problem.
>>
>>> Also please bump the deadline to 1 second. Even you if you kill all
>>> shaders, the IB can also contain CP DMA, which may take longer than 1
>>> ms.
>> Is there any way we can get a feedback from the SQ if the kill was
>> successfully?
> I don't think so. The kill should be finished pretty quickly, but more
> waves with infinite loops may be waiting to be launched, so you still
> need to repeat the kill command. And we should ideally repeat it for 1
> second.
>
> The reason is that vertex shader waves take a lot of time to launch. A
> very very very large draw call can keep launching new waves for 1
> second with the same infinite loop. You would have to soft-reset all
> VGTs to stop that.
>
>> 1 second is way to long, since in the case of a blocked MC we need to
>> start up hard reset relative fast.
> 10 seconds have already passed.

That is a good argument, but I still find 1 second way too long.

Keep in mind that we hammer on the SQ here to kill the waves as soon as 
they are launched.

> I think that some hangs from corrupted descriptors may still be
> recoverable just by killing waves.

Agreed, especially since Vega10 the memory paths have become much more 
robust with regard to error handling.

Christian.

>
> Marek
>
>> Regards,
>> Christian.
>>
>>> Marek
>>>
>>> Marek
>>>
>>>> Alex
>>>>
>>>>> +
>>>>>           DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
>>>>>                     job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
>>>>>                     ring->fence_drv.sync_seq);
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>>>> index 5dfd26be1eec..c045a4e38ad1 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>>>> @@ -383,6 +383,30 @@ void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
>>>>>           amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask);
>>>>>    }
>>>>>
>>>>> +/**
>>>>> + * amdgpu_ring_soft_recovery - try to soft recover a ring lockup
>>>>> + *
>>>>> + * @ring: ring to try the recovery on
>>>>> + * @vmid: VMID we try to get going again
>>>>> + * @fence: timedout fence
>>>>> + *
>>>>> + * Tries to get a ring proceeding again when it is stuck.
>>>>> + */
>>>>> +bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
>>>>> +                              struct dma_fence *fence)
>>>>> +{
>>>>> +       ktime_t deadline = ktime_add_us(ktime_get(), 1000);
>>>>> +
>>>>> +       if (!ring->funcs->soft_recovery)
>>>>> +               return false;
>>>>> +
>>>>> +       while (!dma_fence_is_signaled(fence) &&
>>>>> +              ktime_to_ns(ktime_sub(deadline, ktime_get())) > 0)
>>>>> +               ring->funcs->soft_recovery(ring, vmid);
>>>>> +
>>>>> +       return dma_fence_is_signaled(fence);
>>>>> +}
>>>>> +
>>>>>    /*
>>>>>     * Debugfs info
>>>>>     */
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>> index 409fdd9b9710..9cc239968e40 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>> @@ -168,6 +168,8 @@ struct amdgpu_ring_funcs {
>>>>>           /* priority functions */
>>>>>           void (*set_priority) (struct amdgpu_ring *ring,
>>>>>                                 enum drm_sched_priority priority);
>>>>> +       /* Try to soft recover the ring to make the fence signal */
>>>>> +       void (*soft_recovery)(struct amdgpu_ring *ring, unsigned vmid);
>>>>>    };
>>>>>
>>>>>    struct amdgpu_ring {
>>>>> @@ -260,6 +262,8 @@ void amdgpu_ring_fini(struct amdgpu_ring *ring);
>>>>>    void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
>>>>>                                                   uint32_t reg0, uint32_t val0,
>>>>>                                                   uint32_t reg1, uint32_t val1);
>>>>> +bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
>>>>> +                              struct dma_fence *fence);
>>>>>
>>>>>    static inline void amdgpu_ring_clear_ring(struct amdgpu_ring *ring)
>>>>>    {
>>>>> --
>>>>> 2.14.1
>>>>>
>>>>> _______________________________________________
>>>>> amd-gfx mailing list
>>>>> amd-gfx@lists.freedesktop.org
>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2018-08-24  7:28 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-08-22 10:05 [PATCH 1/5] drm/amdgpu: cleanup GPU recovery check a bit Christian König
     [not found] ` <20180822100531.52863-1-christian.koenig-5C7GfCeVMHo@public.gmane.org>
2018-08-22 10:05   ` [PATCH 2/5] drm/amdgpu: add ring soft recovery v2 Christian König
     [not found]     ` <20180822100531.52863-2-christian.koenig-5C7GfCeVMHo@public.gmane.org>
2018-08-22 16:55       ` Alex Deucher
     [not found]         ` <CADnq5_PU5TT4Q9oE+egAd2S4UgDENLu-Las_fnUZrNJUS=rmCw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2018-08-22 19:32           ` Marek Olšák
     [not found]             ` <CAAxE2A5L5UjsWVMSBGVe82GyVS3LAPtVh-5QBb5Ope5ZVEHKig-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2018-08-23  6:51               ` Christian König
     [not found]                 ` <b4b062be-2bc8-035b-e5db-2d17de698826-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2018-08-23 20:21                   ` Marek Olšák
     [not found]                     ` <CAAxE2A4TKbg-Z07u9nsr=k3ujzA7p8m-2NEiAFmHP8Lab_Zy4g-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2018-08-24  7:28                       ` Christian König
2018-08-23  7:17           ` Huang Rui
2018-08-23  9:21             ` Christian König
     [not found]               ` <11e0af50-f589-b2e1-3431-8125fe2014ad-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2018-08-23 13:03                 ` Huang Rui
2018-08-22 10:05   ` [PATCH 3/5] drm/amdgpu: implement soft_recovery for GFX7 Christian König
2018-08-22 10:05   ` [PATCH 4/5] drm/amdgpu: implement soft_recovery for GFX8 v2 Christian König
2018-08-22 10:05   ` [PATCH 5/5] drm/amdgpu: implement soft_recovery for GFX9 Christian König
2018-08-22 14:56   ` [PATCH 1/5] drm/amdgpu: cleanup GPU recovery check a bit Andrey Grodzovsky

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.