All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/5] drm/amd/amdgpu revert "implement tdr advanced mode"
@ 2022-11-09  9:50 Christian König
  2022-11-09  9:50 ` [PATCH 2/5] drm/amdgpu: stop resubmitting jobs for GPU reset v2 Christian König
                   ` (4 more replies)
  0 siblings, 5 replies; 8+ messages in thread
From: Christian König @ 2022-11-09  9:50 UTC (permalink / raw)
  To: Alexander.Deucher, daniel.vetter, dri-devel, Shaoyun.Liu
  Cc: Christian König

This reverts commit e6c6338f393b74ac0b303d567bb918b44ae7ad75.

This feature basically re-submits one job after another to
figure out which one was the one causing a hang.

This is obviously incompatible with gang-submit which requires
that multiple jobs run at the same time. It's also absolutely
not helpful to crash the hardware multiple times if a clean
recovery is desired.

For testing and debugging environments we should rather disable
recovery alltogether to be able to inspect the state with a hw
debugger.

Additional to that the sw implementation is clearly buggy and causes
reference count issues for the hardware fence.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 103 ---------------------
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |   2 +-
 drivers/gpu/drm/scheduler/sched_main.c     |  58 ++----------
 include/drm/gpu_scheduler.h                |   3 -
 4 files changed, 10 insertions(+), 156 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 5b9f992e4607..0da55fd97df8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5077,94 +5077,6 @@ static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
 	return 0;
 }
 
-static void amdgpu_device_recheck_guilty_jobs(
-	struct amdgpu_device *adev, struct list_head *device_list_handle,
-	struct amdgpu_reset_context *reset_context)
-{
-	int i, r = 0;
-
-	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
-		struct amdgpu_ring *ring = adev->rings[i];
-		int ret = 0;
-		struct drm_sched_job *s_job;
-
-		if (!ring || !ring->sched.thread)
-			continue;
-
-		s_job = list_first_entry_or_null(&ring->sched.pending_list,
-				struct drm_sched_job, list);
-		if (s_job == NULL)
-			continue;
-
-		/* clear job's guilty and depend the folowing step to decide the real one */
-		drm_sched_reset_karma(s_job);
-		drm_sched_resubmit_jobs_ext(&ring->sched, 1);
-
-		if (!s_job->s_fence->parent) {
-			DRM_WARN("Failed to get a HW fence for job!");
-			continue;
-		}
-
-		ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
-		if (ret == 0) { /* timeout */
-			DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
-						ring->sched.name, s_job->id);
-
-
-			amdgpu_fence_driver_isr_toggle(adev, true);
-
-			/* Clear this failed job from fence array */
-			amdgpu_fence_driver_clear_job_fences(ring);
-
-			amdgpu_fence_driver_isr_toggle(adev, false);
-
-			/* Since the job won't signal and we go for
-			 * another resubmit drop this parent pointer
-			 */
-			dma_fence_put(s_job->s_fence->parent);
-			s_job->s_fence->parent = NULL;
-
-			/* set guilty */
-			drm_sched_increase_karma(s_job);
-			amdgpu_reset_prepare_hwcontext(adev, reset_context);
-retry:
-			/* do hw reset */
-			if (amdgpu_sriov_vf(adev)) {
-				amdgpu_virt_fini_data_exchange(adev);
-				r = amdgpu_device_reset_sriov(adev, false);
-				if (r)
-					adev->asic_reset_res = r;
-			} else {
-				clear_bit(AMDGPU_SKIP_HW_RESET,
-					  &reset_context->flags);
-				r = amdgpu_do_asic_reset(device_list_handle,
-							 reset_context);
-				if (r && r == -EAGAIN)
-					goto retry;
-			}
-
-			/*
-			 * add reset counter so that the following
-			 * resubmitted job could flush vmid
-			 */
-			atomic_inc(&adev->gpu_reset_counter);
-			continue;
-		}
-
-		/* got the hw fence, signal finished fence */
-		atomic_dec(ring->sched.score);
-		dma_fence_get(&s_job->s_fence->finished);
-		dma_fence_signal(&s_job->s_fence->finished);
-		dma_fence_put(&s_job->s_fence->finished);
-
-		/* remove node from list and free the job */
-		spin_lock(&ring->sched.job_list_lock);
-		list_del_init(&s_job->list);
-		spin_unlock(&ring->sched.job_list_lock);
-		ring->sched.ops->free_job(s_job);
-	}
-}
-
 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -5185,7 +5097,6 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
 
 }
 
-
 /**
  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
  *
@@ -5208,7 +5119,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	int i, r = 0;
 	bool need_emergency_restart = false;
 	bool audio_suspended = false;
-	int tmp_vram_lost_counter;
 	bool gpu_reset_for_dev_remove = false;
 
 	gpu_reset_for_dev_remove =
@@ -5354,7 +5264,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		amdgpu_device_stop_pending_resets(tmp_adev);
 	}
 
-	tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
 	/* Actual ASIC resets if needed.*/
 	/* Host driver will handle XGMI hive reset for SRIOV */
 	if (amdgpu_sriov_vf(adev)) {
@@ -5379,18 +5288,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	/* Post ASIC reset for all devs .*/
 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
 
-		/*
-		 * Sometimes a later bad compute job can block a good gfx job as gfx
-		 * and compute ring share internal GC HW mutually. We add an additional
-		 * guilty jobs recheck step to find the real guilty job, it synchronously
-		 * submits and pends for the first job being signaled. If it gets timeout,
-		 * we identify it as a real guilty job.
-		 */
-		if (amdgpu_gpu_recovery == 2 &&
-			!(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
-			amdgpu_device_recheck_guilty_jobs(
-				tmp_adev, device_list_handle, reset_context);
-
 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 			struct amdgpu_ring *ring = tmp_adev->rings[i];
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 8e97e95aca8c..a6820603214f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -519,7 +519,7 @@ module_param_named(compute_multipipe, amdgpu_compute_multipipe, int, 0444);
  * DOC: gpu_recovery (int)
  * Set to enable GPU recovery mechanism (1 = enable, 0 = disable). The default is -1 (auto, disabled except SRIOV).
  */
-MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (2 = advanced tdr mode, 1 = enable, 0 = disable, -1 = auto)");
+MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (1 = enable, 0 = disable, -1 = auto)");
 module_param_named(gpu_recovery, amdgpu_gpu_recovery, int, 0444);
 
 /**
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 68317d3a7a27..e77e1fd16732 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -355,27 +355,6 @@ static void drm_sched_job_timedout(struct work_struct *work)
 	}
 }
 
- /**
-  * drm_sched_increase_karma - Update sched_entity guilty flag
-  *
-  * @bad: The job guilty of time out
-  *
-  * Increment on every hang caused by the 'bad' job. If this exceeds the hang
-  * limit of the scheduler then the respective sched entity is marked guilty and
-  * jobs from it will not be scheduled further
-  */
-void drm_sched_increase_karma(struct drm_sched_job *bad)
-{
-	drm_sched_increase_karma_ext(bad, 1);
-}
-EXPORT_SYMBOL(drm_sched_increase_karma);
-
-void drm_sched_reset_karma(struct drm_sched_job *bad)
-{
-	drm_sched_increase_karma_ext(bad, 0);
-}
-EXPORT_SYMBOL(drm_sched_reset_karma);
-
 /**
  * drm_sched_stop - stop the scheduler
  *
@@ -516,32 +495,15 @@ EXPORT_SYMBOL(drm_sched_start);
  *
  */
 void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched)
-{
-	drm_sched_resubmit_jobs_ext(sched, INT_MAX);
-}
-EXPORT_SYMBOL(drm_sched_resubmit_jobs);
-
-/**
- * drm_sched_resubmit_jobs_ext - helper to relunch certain number of jobs from mirror ring list
- *
- * @sched: scheduler instance
- * @max: job numbers to relaunch
- *
- */
-void drm_sched_resubmit_jobs_ext(struct drm_gpu_scheduler *sched, int max)
 {
 	struct drm_sched_job *s_job, *tmp;
 	uint64_t guilty_context;
 	bool found_guilty = false;
 	struct dma_fence *fence;
-	int i = 0;
 
 	list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) {
 		struct drm_sched_fence *s_fence = s_job->s_fence;
 
-		if (i >= max)
-			break;
-
 		if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
 			found_guilty = true;
 			guilty_context = s_job->s_fence->scheduled.context;
@@ -551,7 +513,6 @@ void drm_sched_resubmit_jobs_ext(struct drm_gpu_scheduler *sched, int max)
 			dma_fence_set_error(&s_fence->finished, -ECANCELED);
 
 		fence = sched->ops->run_job(s_job);
-		i++;
 
 		if (IS_ERR_OR_NULL(fence)) {
 			if (IS_ERR(fence))
@@ -567,7 +528,7 @@ void drm_sched_resubmit_jobs_ext(struct drm_gpu_scheduler *sched, int max)
 		}
 	}
 }
-EXPORT_SYMBOL(drm_sched_resubmit_jobs_ext);
+EXPORT_SYMBOL(drm_sched_resubmit_jobs);
 
 /**
  * drm_sched_job_init - init a scheduler job
@@ -1082,13 +1043,15 @@ void drm_sched_fini(struct drm_gpu_scheduler *sched)
 EXPORT_SYMBOL(drm_sched_fini);
 
 /**
- * drm_sched_increase_karma_ext - Update sched_entity guilty flag
+ * drm_sched_increase_karma - Update sched_entity guilty flag
  *
  * @bad: The job guilty of time out
- * @type: type for increase/reset karma
  *
+ * Increment on every hang caused by the 'bad' job. If this exceeds the hang
+ * limit of the scheduler then the respective sched entity is marked guilty and
+ * jobs from it will not be scheduled further
  */
-void drm_sched_increase_karma_ext(struct drm_sched_job *bad, int type)
+void drm_sched_increase_karma(struct drm_sched_job *bad)
 {
 	int i;
 	struct drm_sched_entity *tmp;
@@ -1100,10 +1063,7 @@ void drm_sched_increase_karma_ext(struct drm_sched_job *bad, int type)
 	 * corrupt but keep in mind that kernel jobs always considered good.
 	 */
 	if (bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) {
-		if (type == 0)
-			atomic_set(&bad->karma, 0);
-		else if (type == 1)
-			atomic_inc(&bad->karma);
+		atomic_inc(&bad->karma);
 
 		for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_KERNEL;
 		     i++) {
@@ -1114,7 +1074,7 @@ void drm_sched_increase_karma_ext(struct drm_sched_job *bad, int type)
 				if (bad->s_fence->scheduled.context ==
 				    entity->fence_context) {
 					if (entity->guilty)
-						atomic_set(entity->guilty, type);
+						atomic_set(entity->guilty, 1);
 					break;
 				}
 			}
@@ -1124,4 +1084,4 @@ void drm_sched_increase_karma_ext(struct drm_sched_job *bad, int type)
 		}
 	}
 }
-EXPORT_SYMBOL(drm_sched_increase_karma_ext);
+EXPORT_SYMBOL(drm_sched_increase_karma);
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 289a33e80639..156601fd7053 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -497,10 +497,7 @@ void drm_sched_wakeup(struct drm_gpu_scheduler *sched);
 void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad);
 void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery);
 void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched);
-void drm_sched_resubmit_jobs_ext(struct drm_gpu_scheduler *sched, int max);
 void drm_sched_increase_karma(struct drm_sched_job *bad);
-void drm_sched_reset_karma(struct drm_sched_job *bad);
-void drm_sched_increase_karma_ext(struct drm_sched_job *bad, int type);
 bool drm_sched_dependency_optimized(struct dma_fence* fence,
 				    struct drm_sched_entity *entity);
 void drm_sched_fault(struct drm_gpu_scheduler *sched);
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 2/5] drm/amdgpu: stop resubmitting jobs for GPU reset v2
  2022-11-09  9:50 [PATCH 1/5] drm/amd/amdgpu revert "implement tdr advanced mode" Christian König
@ 2022-11-09  9:50 ` Christian König
  2022-11-09  9:50 ` [PATCH 3/5] drm/amdgpu: stop resubmittting jobs in amdgpu_pci_resume Christian König
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 8+ messages in thread
From: Christian König @ 2022-11-09  9:50 UTC (permalink / raw)
  To: Alexander.Deucher, daniel.vetter, dri-devel, Shaoyun.Liu
  Cc: Christian König

Re-submitting IBs by the kernel has many problems because pre-
requisite state is not automatically re-created as well. In
other words neither binary semaphores nor things like ring
buffer pointers are in the state they should be when the
hardware starts to work on the IBs again.

Additional to that even after more than 5 years of
developing this feature it is still not stable and we have
massively problems getting the reference counts right.

As discussed with user space developers this behavior is not
helpful in the first place. For graphics and multimedia
workloads it makes much more sense to either completely
re-create the context or at least re-submitting the IBs
from userspace.

For compute use cases re-submitting is also not very
helpful since userspace must rely on the accuracy of
the result.

Because of this we stop this practice and instead just
properly note that the fence submission was canceled. The
only use case we keep the re-submission for now is SRIOV
and function level resets.

v2: as suggested by Sshaoyun stop resubmitting jobs even for SRIOV

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 0da55fd97df8..3a51c4c61833 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5294,11 +5294,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 			if (!ring || !ring->sched.thread)
 				continue;
 
-			/* No point to resubmit jobs if we didn't HW reset*/
-			if (!tmp_adev->asic_reset_res && !job_signaled)
-				drm_sched_resubmit_jobs(&ring->sched);
-
-			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
+			drm_sched_start(&ring->sched, true);
 		}
 
 		if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3))
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 3/5] drm/amdgpu: stop resubmittting jobs in amdgpu_pci_resume
  2022-11-09  9:50 [PATCH 1/5] drm/amd/amdgpu revert "implement tdr advanced mode" Christian König
  2022-11-09  9:50 ` [PATCH 2/5] drm/amdgpu: stop resubmitting jobs for GPU reset v2 Christian König
@ 2022-11-09  9:50 ` Christian König
  2022-11-10 19:43   ` Alex Deucher
  2022-11-09  9:50 ` [PATCH 4/5] drm/scheduler: cleanup define Christian König
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 8+ messages in thread
From: Christian König @ 2022-11-09  9:50 UTC (permalink / raw)
  To: Alexander.Deucher, daniel.vetter, dri-devel, Shaoyun.Liu
  Cc: Christian König

As far as I can see this is not really recoverable since a PCI reset
clears VRAM.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 3a51c4c61833..8564d4a8e3e8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5747,8 +5747,6 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
 		if (!ring || !ring->sched.thread)
 			continue;
 
-
-		drm_sched_resubmit_jobs(&ring->sched);
 		drm_sched_start(&ring->sched, true);
 	}
 
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 4/5] drm/scheduler: cleanup define
  2022-11-09  9:50 [PATCH 1/5] drm/amd/amdgpu revert "implement tdr advanced mode" Christian König
  2022-11-09  9:50 ` [PATCH 2/5] drm/amdgpu: stop resubmitting jobs for GPU reset v2 Christian König
  2022-11-09  9:50 ` [PATCH 3/5] drm/amdgpu: stop resubmittting jobs in amdgpu_pci_resume Christian König
@ 2022-11-09  9:50 ` Christian König
  2022-11-09  9:50 ` [PATCH 5/5] drm/scheduler: deprecate drm_sched_resubmit_jobs Christian König
  2022-11-10 19:45 ` [PATCH 1/5] drm/amd/amdgpu revert "implement tdr advanced mode" Alex Deucher
  4 siblings, 0 replies; 8+ messages in thread
From: Christian König @ 2022-11-09  9:50 UTC (permalink / raw)
  To: Alexander.Deucher, daniel.vetter, dri-devel, Shaoyun.Liu
  Cc: Christian König

Remove some not implemented function define

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 include/drm/gpu_scheduler.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 156601fd7053..73a2327d6b00 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -501,7 +501,6 @@ void drm_sched_increase_karma(struct drm_sched_job *bad);
 bool drm_sched_dependency_optimized(struct dma_fence* fence,
 				    struct drm_sched_entity *entity);
 void drm_sched_fault(struct drm_gpu_scheduler *sched);
-void drm_sched_job_kickout(struct drm_sched_job *s_job);
 
 void drm_sched_rq_add_entity(struct drm_sched_rq *rq,
 			     struct drm_sched_entity *entity);
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 5/5] drm/scheduler: deprecate drm_sched_resubmit_jobs
  2022-11-09  9:50 [PATCH 1/5] drm/amd/amdgpu revert "implement tdr advanced mode" Christian König
                   ` (2 preceding siblings ...)
  2022-11-09  9:50 ` [PATCH 4/5] drm/scheduler: cleanup define Christian König
@ 2022-11-09  9:50 ` Christian König
  2022-11-10 19:45 ` [PATCH 1/5] drm/amd/amdgpu revert "implement tdr advanced mode" Alex Deucher
  4 siblings, 0 replies; 8+ messages in thread
From: Christian König @ 2022-11-09  9:50 UTC (permalink / raw)
  To: Alexander.Deucher, daniel.vetter, dri-devel, Shaoyun.Liu
  Cc: Christian König

This interface is not working as it should.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/scheduler/sched_main.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index e77e1fd16732..9eacce8aae3f 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -489,10 +489,21 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
 EXPORT_SYMBOL(drm_sched_start);
 
 /**
- * drm_sched_resubmit_jobs - helper to relaunch jobs from the pending list
+ * drm_sched_resubmit_jobs - Deprecated, don't use in new code!
  *
  * @sched: scheduler instance
  *
+ * Re-submitting jobs was a concept AMD came up as cheap way to implement
+ * recovery after a job timeout.
+ *
+ * This turned out to be not working very well. First of all there are many
+ * problem with the dma_fence implementation and requirements. Either the
+ * implementation is risking deadlocks with core memory management or violating
+ * documented implementation details of the dma_fence object.
+ *
+ * Drivers can still save and restore their state for recovery operations, but
+ * we shouldn't make this a general scheduler feature around the dma_fence
+ * interface.
  */
 void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched)
 {
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH 3/5] drm/amdgpu: stop resubmittting jobs in amdgpu_pci_resume
  2022-11-09  9:50 ` [PATCH 3/5] drm/amdgpu: stop resubmittting jobs in amdgpu_pci_resume Christian König
@ 2022-11-10 19:43   ` Alex Deucher
  0 siblings, 0 replies; 8+ messages in thread
From: Alex Deucher @ 2022-11-10 19:43 UTC (permalink / raw)
  To: Christian König
  Cc: Alexander.Deucher, daniel.vetter, Christian König,
	dri-devel, Shaoyun.Liu

On Wed, Nov 9, 2022 at 4:50 AM Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
>
> As far as I can see this is not really recoverable since a PCI reset
> clears VRAM.

Might be more clear to say the state of VRAM is unreliable due to a
PCI event like an AER event or a link reset or DPC event.  The reset
itself may not clear VRAM directly (e.g., mode2 reset or APUs).

Alex

>
> Signed-off-by: Christian König <christian.koenig@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 --
>  1 file changed, 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 3a51c4c61833..8564d4a8e3e8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5747,8 +5747,6 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
>                 if (!ring || !ring->sched.thread)
>                         continue;
>
> -
> -               drm_sched_resubmit_jobs(&ring->sched);
>                 drm_sched_start(&ring->sched, true);
>         }
>
> --
> 2.34.1
>

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/5] drm/amd/amdgpu revert "implement tdr advanced mode"
  2022-11-09  9:50 [PATCH 1/5] drm/amd/amdgpu revert "implement tdr advanced mode" Christian König
                   ` (3 preceding siblings ...)
  2022-11-09  9:50 ` [PATCH 5/5] drm/scheduler: deprecate drm_sched_resubmit_jobs Christian König
@ 2022-11-10 19:45 ` Alex Deucher
  4 siblings, 0 replies; 8+ messages in thread
From: Alex Deucher @ 2022-11-10 19:45 UTC (permalink / raw)
  To: Christian König
  Cc: Alexander.Deucher, daniel.vetter, Christian König,
	dri-devel, Shaoyun.Liu

Some comments on patch 3.  With that addressed, the series is:
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

On Wed, Nov 9, 2022 at 4:50 AM Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
>
> This reverts commit e6c6338f393b74ac0b303d567bb918b44ae7ad75.
>
> This feature basically re-submits one job after another to
> figure out which one was the one causing a hang.
>
> This is obviously incompatible with gang-submit which requires
> that multiple jobs run at the same time. It's also absolutely
> not helpful to crash the hardware multiple times if a clean
> recovery is desired.
>
> For testing and debugging environments we should rather disable
> recovery alltogether to be able to inspect the state with a hw
> debugger.
>
> Additional to that the sw implementation is clearly buggy and causes
> reference count issues for the hardware fence.
>
> Signed-off-by: Christian König <christian.koenig@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 103 ---------------------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |   2 +-
>  drivers/gpu/drm/scheduler/sched_main.c     |  58 ++----------
>  include/drm/gpu_scheduler.h                |   3 -
>  4 files changed, 10 insertions(+), 156 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 5b9f992e4607..0da55fd97df8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5077,94 +5077,6 @@ static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
>         return 0;
>  }
>
> -static void amdgpu_device_recheck_guilty_jobs(
> -       struct amdgpu_device *adev, struct list_head *device_list_handle,
> -       struct amdgpu_reset_context *reset_context)
> -{
> -       int i, r = 0;
> -
> -       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> -               struct amdgpu_ring *ring = adev->rings[i];
> -               int ret = 0;
> -               struct drm_sched_job *s_job;
> -
> -               if (!ring || !ring->sched.thread)
> -                       continue;
> -
> -               s_job = list_first_entry_or_null(&ring->sched.pending_list,
> -                               struct drm_sched_job, list);
> -               if (s_job == NULL)
> -                       continue;
> -
> -               /* clear job's guilty and depend the folowing step to decide the real one */
> -               drm_sched_reset_karma(s_job);
> -               drm_sched_resubmit_jobs_ext(&ring->sched, 1);
> -
> -               if (!s_job->s_fence->parent) {
> -                       DRM_WARN("Failed to get a HW fence for job!");
> -                       continue;
> -               }
> -
> -               ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
> -               if (ret == 0) { /* timeout */
> -                       DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
> -                                               ring->sched.name, s_job->id);
> -
> -
> -                       amdgpu_fence_driver_isr_toggle(adev, true);
> -
> -                       /* Clear this failed job from fence array */
> -                       amdgpu_fence_driver_clear_job_fences(ring);
> -
> -                       amdgpu_fence_driver_isr_toggle(adev, false);
> -
> -                       /* Since the job won't signal and we go for
> -                        * another resubmit drop this parent pointer
> -                        */
> -                       dma_fence_put(s_job->s_fence->parent);
> -                       s_job->s_fence->parent = NULL;
> -
> -                       /* set guilty */
> -                       drm_sched_increase_karma(s_job);
> -                       amdgpu_reset_prepare_hwcontext(adev, reset_context);
> -retry:
> -                       /* do hw reset */
> -                       if (amdgpu_sriov_vf(adev)) {
> -                               amdgpu_virt_fini_data_exchange(adev);
> -                               r = amdgpu_device_reset_sriov(adev, false);
> -                               if (r)
> -                                       adev->asic_reset_res = r;
> -                       } else {
> -                               clear_bit(AMDGPU_SKIP_HW_RESET,
> -                                         &reset_context->flags);
> -                               r = amdgpu_do_asic_reset(device_list_handle,
> -                                                        reset_context);
> -                               if (r && r == -EAGAIN)
> -                                       goto retry;
> -                       }
> -
> -                       /*
> -                        * add reset counter so that the following
> -                        * resubmitted job could flush vmid
> -                        */
> -                       atomic_inc(&adev->gpu_reset_counter);
> -                       continue;
> -               }
> -
> -               /* got the hw fence, signal finished fence */
> -               atomic_dec(ring->sched.score);
> -               dma_fence_get(&s_job->s_fence->finished);
> -               dma_fence_signal(&s_job->s_fence->finished);
> -               dma_fence_put(&s_job->s_fence->finished);
> -
> -               /* remove node from list and free the job */
> -               spin_lock(&ring->sched.job_list_lock);
> -               list_del_init(&s_job->list);
> -               spin_unlock(&ring->sched.job_list_lock);
> -               ring->sched.ops->free_job(s_job);
> -       }
> -}
> -
>  static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
>  {
>         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> @@ -5185,7 +5097,6 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
>
>  }
>
> -
>  /**
>   * amdgpu_device_gpu_recover - reset the asic and recover scheduler
>   *
> @@ -5208,7 +5119,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>         int i, r = 0;
>         bool need_emergency_restart = false;
>         bool audio_suspended = false;
> -       int tmp_vram_lost_counter;
>         bool gpu_reset_for_dev_remove = false;
>
>         gpu_reset_for_dev_remove =
> @@ -5354,7 +5264,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>                 amdgpu_device_stop_pending_resets(tmp_adev);
>         }
>
> -       tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
>         /* Actual ASIC resets if needed.*/
>         /* Host driver will handle XGMI hive reset for SRIOV */
>         if (amdgpu_sriov_vf(adev)) {
> @@ -5379,18 +5288,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>         /* Post ASIC reset for all devs .*/
>         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
>
> -               /*
> -                * Sometimes a later bad compute job can block a good gfx job as gfx
> -                * and compute ring share internal GC HW mutually. We add an additional
> -                * guilty jobs recheck step to find the real guilty job, it synchronously
> -                * submits and pends for the first job being signaled. If it gets timeout,
> -                * we identify it as a real guilty job.
> -                */
> -               if (amdgpu_gpu_recovery == 2 &&
> -                       !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
> -                       amdgpu_device_recheck_guilty_jobs(
> -                               tmp_adev, device_list_handle, reset_context);
> -
>                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>                         struct amdgpu_ring *ring = tmp_adev->rings[i];
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 8e97e95aca8c..a6820603214f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -519,7 +519,7 @@ module_param_named(compute_multipipe, amdgpu_compute_multipipe, int, 0444);
>   * DOC: gpu_recovery (int)
>   * Set to enable GPU recovery mechanism (1 = enable, 0 = disable). The default is -1 (auto, disabled except SRIOV).
>   */
> -MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (2 = advanced tdr mode, 1 = enable, 0 = disable, -1 = auto)");
> +MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (1 = enable, 0 = disable, -1 = auto)");
>  module_param_named(gpu_recovery, amdgpu_gpu_recovery, int, 0444);
>
>  /**
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index 68317d3a7a27..e77e1fd16732 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -355,27 +355,6 @@ static void drm_sched_job_timedout(struct work_struct *work)
>         }
>  }
>
> - /**
> -  * drm_sched_increase_karma - Update sched_entity guilty flag
> -  *
> -  * @bad: The job guilty of time out
> -  *
> -  * Increment on every hang caused by the 'bad' job. If this exceeds the hang
> -  * limit of the scheduler then the respective sched entity is marked guilty and
> -  * jobs from it will not be scheduled further
> -  */
> -void drm_sched_increase_karma(struct drm_sched_job *bad)
> -{
> -       drm_sched_increase_karma_ext(bad, 1);
> -}
> -EXPORT_SYMBOL(drm_sched_increase_karma);
> -
> -void drm_sched_reset_karma(struct drm_sched_job *bad)
> -{
> -       drm_sched_increase_karma_ext(bad, 0);
> -}
> -EXPORT_SYMBOL(drm_sched_reset_karma);
> -
>  /**
>   * drm_sched_stop - stop the scheduler
>   *
> @@ -516,32 +495,15 @@ EXPORT_SYMBOL(drm_sched_start);
>   *
>   */
>  void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched)
> -{
> -       drm_sched_resubmit_jobs_ext(sched, INT_MAX);
> -}
> -EXPORT_SYMBOL(drm_sched_resubmit_jobs);
> -
> -/**
> - * drm_sched_resubmit_jobs_ext - helper to relunch certain number of jobs from mirror ring list
> - *
> - * @sched: scheduler instance
> - * @max: job numbers to relaunch
> - *
> - */
> -void drm_sched_resubmit_jobs_ext(struct drm_gpu_scheduler *sched, int max)
>  {
>         struct drm_sched_job *s_job, *tmp;
>         uint64_t guilty_context;
>         bool found_guilty = false;
>         struct dma_fence *fence;
> -       int i = 0;
>
>         list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) {
>                 struct drm_sched_fence *s_fence = s_job->s_fence;
>
> -               if (i >= max)
> -                       break;
> -
>                 if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
>                         found_guilty = true;
>                         guilty_context = s_job->s_fence->scheduled.context;
> @@ -551,7 +513,6 @@ void drm_sched_resubmit_jobs_ext(struct drm_gpu_scheduler *sched, int max)
>                         dma_fence_set_error(&s_fence->finished, -ECANCELED);
>
>                 fence = sched->ops->run_job(s_job);
> -               i++;
>
>                 if (IS_ERR_OR_NULL(fence)) {
>                         if (IS_ERR(fence))
> @@ -567,7 +528,7 @@ void drm_sched_resubmit_jobs_ext(struct drm_gpu_scheduler *sched, int max)
>                 }
>         }
>  }
> -EXPORT_SYMBOL(drm_sched_resubmit_jobs_ext);
> +EXPORT_SYMBOL(drm_sched_resubmit_jobs);
>
>  /**
>   * drm_sched_job_init - init a scheduler job
> @@ -1082,13 +1043,15 @@ void drm_sched_fini(struct drm_gpu_scheduler *sched)
>  EXPORT_SYMBOL(drm_sched_fini);
>
>  /**
> - * drm_sched_increase_karma_ext - Update sched_entity guilty flag
> + * drm_sched_increase_karma - Update sched_entity guilty flag
>   *
>   * @bad: The job guilty of time out
> - * @type: type for increase/reset karma
>   *
> + * Increment on every hang caused by the 'bad' job. If this exceeds the hang
> + * limit of the scheduler then the respective sched entity is marked guilty and
> + * jobs from it will not be scheduled further
>   */
> -void drm_sched_increase_karma_ext(struct drm_sched_job *bad, int type)
> +void drm_sched_increase_karma(struct drm_sched_job *bad)
>  {
>         int i;
>         struct drm_sched_entity *tmp;
> @@ -1100,10 +1063,7 @@ void drm_sched_increase_karma_ext(struct drm_sched_job *bad, int type)
>          * corrupt but keep in mind that kernel jobs always considered good.
>          */
>         if (bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) {
> -               if (type == 0)
> -                       atomic_set(&bad->karma, 0);
> -               else if (type == 1)
> -                       atomic_inc(&bad->karma);
> +               atomic_inc(&bad->karma);
>
>                 for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_KERNEL;
>                      i++) {
> @@ -1114,7 +1074,7 @@ void drm_sched_increase_karma_ext(struct drm_sched_job *bad, int type)
>                                 if (bad->s_fence->scheduled.context ==
>                                     entity->fence_context) {
>                                         if (entity->guilty)
> -                                               atomic_set(entity->guilty, type);
> +                                               atomic_set(entity->guilty, 1);
>                                         break;
>                                 }
>                         }
> @@ -1124,4 +1084,4 @@ void drm_sched_increase_karma_ext(struct drm_sched_job *bad, int type)
>                 }
>         }
>  }
> -EXPORT_SYMBOL(drm_sched_increase_karma_ext);
> +EXPORT_SYMBOL(drm_sched_increase_karma);
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index 289a33e80639..156601fd7053 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -497,10 +497,7 @@ void drm_sched_wakeup(struct drm_gpu_scheduler *sched);
>  void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad);
>  void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery);
>  void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched);
> -void drm_sched_resubmit_jobs_ext(struct drm_gpu_scheduler *sched, int max);
>  void drm_sched_increase_karma(struct drm_sched_job *bad);
> -void drm_sched_reset_karma(struct drm_sched_job *bad);
> -void drm_sched_increase_karma_ext(struct drm_sched_job *bad, int type);
>  bool drm_sched_dependency_optimized(struct dma_fence* fence,
>                                     struct drm_sched_entity *entity);
>  void drm_sched_fault(struct drm_gpu_scheduler *sched);
> --
> 2.34.1
>

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH 5/5] drm/scheduler: deprecate drm_sched_resubmit_jobs
  2022-10-26 15:35 Christian König
@ 2022-10-26 15:35 ` Christian König
  0 siblings, 0 replies; 8+ messages in thread
From: Christian König @ 2022-10-26 15:35 UTC (permalink / raw)
  To: luben.tuikov, vprosyak, Alexander.Deucher, daniel.vetter,
	amd-gfx, dri-devel
  Cc: Christian König

This interface is not working as it should.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/scheduler/sched_main.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index bb28f31bff6f..ecd4afab4adb 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -489,10 +489,21 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
 EXPORT_SYMBOL(drm_sched_start);
 
 /**
- * drm_sched_resubmit_jobs - helper to relaunch jobs from the pending list
+ * drm_sched_resubmit_jobs - Deprecated, don't use in new code!
  *
  * @sched: scheduler instance
  *
+ * Re-submitting jobs was a concept AMD came up as cheap way to implement
+ * recovery after a job timeout.
+ *
+ * This turned out to be not working very well. First of all there are many
+ * problem with the dma_fence implementation and requirements. Either the
+ * implementation is risking deadlocks with core memory management or violating
+ * documented implementation details of the dma_fence object.
+ *
+ * Drivers can still save and restore their state for recovery operations, but
+ * we shouldn't make this a general scheduler feature around the dma_fence
+ * interface.
  */
 void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched)
 {
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2022-11-10 19:45 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-11-09  9:50 [PATCH 1/5] drm/amd/amdgpu revert "implement tdr advanced mode" Christian König
2022-11-09  9:50 ` [PATCH 2/5] drm/amdgpu: stop resubmitting jobs for GPU reset v2 Christian König
2022-11-09  9:50 ` [PATCH 3/5] drm/amdgpu: stop resubmittting jobs in amdgpu_pci_resume Christian König
2022-11-10 19:43   ` Alex Deucher
2022-11-09  9:50 ` [PATCH 4/5] drm/scheduler: cleanup define Christian König
2022-11-09  9:50 ` [PATCH 5/5] drm/scheduler: deprecate drm_sched_resubmit_jobs Christian König
2022-11-10 19:45 ` [PATCH 1/5] drm/amd/amdgpu revert "implement tdr advanced mode" Alex Deucher
  -- strict thread matches above, loose matches on Subject: below --
2022-10-26 15:35 Christian König
2022-10-26 15:35 ` [PATCH 5/5] drm/scheduler: deprecate drm_sched_resubmit_jobs Christian König

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.