* [PATCH 1/5] drm/amd/amdgpu revert "implement tdr advanced mode"
From: Christian König @ 2022-10-26 15:35 UTC
  To: luben.tuikov, vprosyak, Alexander.Deucher, daniel.vetter,
	amd-gfx, dri-devel
  Cc: Christian König

This reverts commit e6c6338f393b74ac0b303d567bb918b44ae7ad75.

This feature basically re-submits one job after another to
figure out which one caused the hang.

This is obviously incompatible with gang-submit which requires
that multiple jobs run at the same time. It's also absolutely
not helpful to crash the hardware multiple times if a clean
recovery is desired.

For testing and debugging environments we should rather disable
recovery altogether so that the state can be inspected with a HW
debugger.

In addition to that, the SW implementation is clearly buggy and
causes reference counting issues for the hardware fence.
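
For example, recovery could be disabled for such a debugging session
through the existing gpu_recovery module parameter (illustrative
invocation; the parameter is documented in the amdgpu_drv.c hunk below):

  modprobe amdgpu gpu_recovery=0

or by booting with amdgpu.gpu_recovery=0 on the kernel command line.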

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 103 ---------------------
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |   2 +-
 drivers/gpu/drm/scheduler/sched_main.c     |  58 ++----------
 include/drm/gpu_scheduler.h                |   3 -
 4 files changed, 10 insertions(+), 156 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 6f958603c8cc..d4584e577b51 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5070,94 +5070,6 @@ static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
 	return 0;
 }
 
-static void amdgpu_device_recheck_guilty_jobs(
-	struct amdgpu_device *adev, struct list_head *device_list_handle,
-	struct amdgpu_reset_context *reset_context)
-{
-	int i, r = 0;
-
-	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
-		struct amdgpu_ring *ring = adev->rings[i];
-		int ret = 0;
-		struct drm_sched_job *s_job;
-
-		if (!ring || !ring->sched.thread)
-			continue;
-
-		s_job = list_first_entry_or_null(&ring->sched.pending_list,
-				struct drm_sched_job, list);
-		if (s_job == NULL)
-			continue;
-
-		/* clear job's guilty and depend the folowing step to decide the real one */
-		drm_sched_reset_karma(s_job);
-		drm_sched_resubmit_jobs_ext(&ring->sched, 1);
-
-		if (!s_job->s_fence->parent) {
-			DRM_WARN("Failed to get a HW fence for job!");
-			continue;
-		}
-
-		ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
-		if (ret == 0) { /* timeout */
-			DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
-						ring->sched.name, s_job->id);
-
-
-			amdgpu_fence_driver_isr_toggle(adev, true);
-
-			/* Clear this failed job from fence array */
-			amdgpu_fence_driver_clear_job_fences(ring);
-
-			amdgpu_fence_driver_isr_toggle(adev, false);
-
-			/* Since the job won't signal and we go for
-			 * another resubmit drop this parent pointer
-			 */
-			dma_fence_put(s_job->s_fence->parent);
-			s_job->s_fence->parent = NULL;
-
-			/* set guilty */
-			drm_sched_increase_karma(s_job);
-			amdgpu_reset_prepare_hwcontext(adev, reset_context);
-retry:
-			/* do hw reset */
-			if (amdgpu_sriov_vf(adev)) {
-				amdgpu_virt_fini_data_exchange(adev);
-				r = amdgpu_device_reset_sriov(adev, false);
-				if (r)
-					adev->asic_reset_res = r;
-			} else {
-				clear_bit(AMDGPU_SKIP_HW_RESET,
-					  &reset_context->flags);
-				r = amdgpu_do_asic_reset(device_list_handle,
-							 reset_context);
-				if (r && r == -EAGAIN)
-					goto retry;
-			}
-
-			/*
-			 * add reset counter so that the following
-			 * resubmitted job could flush vmid
-			 */
-			atomic_inc(&adev->gpu_reset_counter);
-			continue;
-		}
-
-		/* got the hw fence, signal finished fence */
-		atomic_dec(ring->sched.score);
-		dma_fence_get(&s_job->s_fence->finished);
-		dma_fence_signal(&s_job->s_fence->finished);
-		dma_fence_put(&s_job->s_fence->finished);
-
-		/* remove node from list and free the job */
-		spin_lock(&ring->sched.job_list_lock);
-		list_del_init(&s_job->list);
-		spin_unlock(&ring->sched.job_list_lock);
-		ring->sched.ops->free_job(s_job);
-	}
-}
-
 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -5178,7 +5090,6 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
 
 }
 
-
 /**
  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
  *
@@ -5201,7 +5112,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	int i, r = 0;
 	bool need_emergency_restart = false;
 	bool audio_suspended = false;
-	int tmp_vram_lost_counter;
 	bool gpu_reset_for_dev_remove = false;
 
 	gpu_reset_for_dev_remove =
@@ -5347,7 +5257,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		amdgpu_device_stop_pending_resets(tmp_adev);
 	}
 
-	tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
 	/* Actual ASIC resets if needed.*/
 	/* Host driver will handle XGMI hive reset for SRIOV */
 	if (amdgpu_sriov_vf(adev)) {
@@ -5372,18 +5281,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	/* Post ASIC reset for all devs .*/
 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
 
-		/*
-		 * Sometimes a later bad compute job can block a good gfx job as gfx
-		 * and compute ring share internal GC HW mutually. We add an additional
-		 * guilty jobs recheck step to find the real guilty job, it synchronously
-		 * submits and pends for the first job being signaled. If it gets timeout,
-		 * we identify it as a real guilty job.
-		 */
-		if (amdgpu_gpu_recovery == 2 &&
-			!(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
-			amdgpu_device_recheck_guilty_jobs(
-				tmp_adev, device_list_handle, reset_context);
-
 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 			struct amdgpu_ring *ring = tmp_adev->rings[i];
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 16f6a313335e..f177d8e5b665 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -519,7 +519,7 @@ module_param_named(compute_multipipe, amdgpu_compute_multipipe, int, 0444);
  * DOC: gpu_recovery (int)
  * Set to enable GPU recovery mechanism (1 = enable, 0 = disable). The default is -1 (auto, disabled except SRIOV).
  */
-MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (2 = advanced tdr mode, 1 = enable, 0 = disable, -1 = auto)");
+MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (1 = enable, 0 = disable, -1 = auto)");
 module_param_named(gpu_recovery, amdgpu_gpu_recovery, int, 0444);
 
 /**
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index e0ab14e0fb6b..bb28f31bff6f 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -355,27 +355,6 @@ static void drm_sched_job_timedout(struct work_struct *work)
 	}
 }
 
- /**
-  * drm_sched_increase_karma - Update sched_entity guilty flag
-  *
-  * @bad: The job guilty of time out
-  *
-  * Increment on every hang caused by the 'bad' job. If this exceeds the hang
-  * limit of the scheduler then the respective sched entity is marked guilty and
-  * jobs from it will not be scheduled further
-  */
-void drm_sched_increase_karma(struct drm_sched_job *bad)
-{
-	drm_sched_increase_karma_ext(bad, 1);
-}
-EXPORT_SYMBOL(drm_sched_increase_karma);
-
-void drm_sched_reset_karma(struct drm_sched_job *bad)
-{
-	drm_sched_increase_karma_ext(bad, 0);
-}
-EXPORT_SYMBOL(drm_sched_reset_karma);
-
 /**
  * drm_sched_stop - stop the scheduler
  *
@@ -516,32 +495,15 @@ EXPORT_SYMBOL(drm_sched_start);
  *
  */
 void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched)
-{
-	drm_sched_resubmit_jobs_ext(sched, INT_MAX);
-}
-EXPORT_SYMBOL(drm_sched_resubmit_jobs);
-
-/**
- * drm_sched_resubmit_jobs_ext - helper to relunch certain number of jobs from mirror ring list
- *
- * @sched: scheduler instance
- * @max: job numbers to relaunch
- *
- */
-void drm_sched_resubmit_jobs_ext(struct drm_gpu_scheduler *sched, int max)
 {
 	struct drm_sched_job *s_job, *tmp;
 	uint64_t guilty_context;
 	bool found_guilty = false;
 	struct dma_fence *fence;
-	int i = 0;
 
 	list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) {
 		struct drm_sched_fence *s_fence = s_job->s_fence;
 
-		if (i >= max)
-			break;
-
 		if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
 			found_guilty = true;
 			guilty_context = s_job->s_fence->scheduled.context;
@@ -551,7 +513,6 @@ void drm_sched_resubmit_jobs_ext(struct drm_gpu_scheduler *sched, int max)
 			dma_fence_set_error(&s_fence->finished, -ECANCELED);
 
 		fence = sched->ops->run_job(s_job);
-		i++;
 
 		if (IS_ERR_OR_NULL(fence)) {
 			if (IS_ERR(fence))
@@ -567,7 +528,7 @@ void drm_sched_resubmit_jobs_ext(struct drm_gpu_scheduler *sched, int max)
 		}
 	}
 }
-EXPORT_SYMBOL(drm_sched_resubmit_jobs_ext);
+EXPORT_SYMBOL(drm_sched_resubmit_jobs);
 
 /**
  * drm_sched_job_init - init a scheduler job
@@ -1081,13 +1042,15 @@ void drm_sched_fini(struct drm_gpu_scheduler *sched)
 EXPORT_SYMBOL(drm_sched_fini);
 
 /**
- * drm_sched_increase_karma_ext - Update sched_entity guilty flag
+ * drm_sched_increase_karma - Update sched_entity guilty flag
  *
  * @bad: The job guilty of time out
- * @type: type for increase/reset karma
  *
+ * Increment on every hang caused by the 'bad' job. If this exceeds the hang
+ * limit of the scheduler then the respective sched entity is marked guilty and
+ * jobs from it will not be scheduled further
  */
-void drm_sched_increase_karma_ext(struct drm_sched_job *bad, int type)
+void drm_sched_increase_karma(struct drm_sched_job *bad)
 {
 	int i;
 	struct drm_sched_entity *tmp;
@@ -1099,10 +1062,7 @@ void drm_sched_increase_karma_ext(struct drm_sched_job *bad, int type)
 	 * corrupt but keep in mind that kernel jobs always considered good.
 	 */
 	if (bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) {
-		if (type == 0)
-			atomic_set(&bad->karma, 0);
-		else if (type == 1)
-			atomic_inc(&bad->karma);
+		atomic_inc(&bad->karma);
 
 		for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_KERNEL;
 		     i++) {
@@ -1113,7 +1073,7 @@ void drm_sched_increase_karma_ext(struct drm_sched_job *bad, int type)
 				if (bad->s_fence->scheduled.context ==
 				    entity->fence_context) {
 					if (entity->guilty)
-						atomic_set(entity->guilty, type);
+						atomic_set(entity->guilty, 1);
 					break;
 				}
 			}
@@ -1123,4 +1083,4 @@ void drm_sched_increase_karma_ext(struct drm_sched_job *bad, int type)
 		}
 	}
 }
-EXPORT_SYMBOL(drm_sched_increase_karma_ext);
+EXPORT_SYMBOL(drm_sched_increase_karma);
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 0fca8f38bee4..c564be29c5ae 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -488,10 +488,7 @@ void drm_sched_wakeup(struct drm_gpu_scheduler *sched);
 void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad);
 void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery);
 void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched);
-void drm_sched_resubmit_jobs_ext(struct drm_gpu_scheduler *sched, int max);
 void drm_sched_increase_karma(struct drm_sched_job *bad);
-void drm_sched_reset_karma(struct drm_sched_job *bad);
-void drm_sched_increase_karma_ext(struct drm_sched_job *bad, int type);
 bool drm_sched_dependency_optimized(struct dma_fence* fence,
 				    struct drm_sched_entity *entity);
 void drm_sched_fault(struct drm_gpu_scheduler *sched);
-- 
2.25.1



* [PATCH 2/5] drm/amdgpu: stop resubmitting jobs for bare metal reset
From: Christian König @ 2022-10-26 15:35 UTC
  To: luben.tuikov, vprosyak, Alexander.Deucher, daniel.vetter,
	amd-gfx, dri-devel
  Cc: Christian König

Re-submitting IBs by the kernel has many problems because
prerequisite state is not automatically re-created as well. In
other words, neither binary semaphores nor things like ring
buffer pointers are in the state they should be in when the
hardware starts to work on the IBs again.

In addition to that, even after more than 5 years of
developing this feature it is still not stable and we have
massive problems getting the reference counts right.

As discussed with user space developers this behavior is not
helpful in the first place. For graphics and multimedia
workloads it makes much more sense to either completely
re-create the context or at least re-submit the IBs
from userspace.

For compute use cases re-submitting is also not very
helpful since userspace must be able to rely on the
accuracy of the result.

Because of this we stop this practice and instead just
properly note that the fence submission was canceled. The
only use case where we keep the re-submission for now is
SRIOV function level resets.
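
A minimal sketch of what "properly note" means here, mirroring the
guilty-context handling already in drm_sched_resubmit_jobs(); given a
struct drm_sched_fence *s_fence for a canceled job, the error must be
set before the finished fence is signaled, and waiters can then observe
it via dma_fence_get_status():

	/* mark the job canceled instead of re-running it */
	dma_fence_set_error(&s_fence->finished, -ECANCELED);
	dma_fence_signal(&s_fence->finished);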

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index d4584e577b51..39e94feba1ac 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5288,7 +5288,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 				continue;
 
 			/* No point to resubmit jobs if we didn't HW reset*/
-			if (!tmp_adev->asic_reset_res && !job_signaled)
+			if (!tmp_adev->asic_reset_res && !job_signaled &&
+			    amdgpu_sriov_vf(tmp_adev))
 				drm_sched_resubmit_jobs(&ring->sched);
 
 			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
-- 
2.25.1



* [PATCH 3/5] drm/amdgpu: stop resubmitting jobs in amdgpu_pci_resume
From: Christian König @ 2022-10-26 15:35 UTC
  To: luben.tuikov, vprosyak, Alexander.Deucher, daniel.vetter,
	amd-gfx, dri-devel
  Cc: Christian König

As far as I can see this is not really recoverable since a PCI reset
clears VRAM.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 39e94feba1ac..b1827e804363 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5745,8 +5745,6 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
 		if (!ring || !ring->sched.thread)
 			continue;
 
-
-		drm_sched_resubmit_jobs(&ring->sched);
 		drm_sched_start(&ring->sched, true);
 	}
 
-- 
2.25.1



* [PATCH 4/5] drm/scheduler: cleanup define
From: Christian König @ 2022-10-26 15:35 UTC
  To: luben.tuikov, vprosyak, Alexander.Deucher, daniel.vetter,
	amd-gfx, dri-devel
  Cc: Christian König

Remove a declaration for a function that was never implemented.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 include/drm/gpu_scheduler.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index c564be29c5ae..d646ff2fd557 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -492,7 +492,6 @@ void drm_sched_increase_karma(struct drm_sched_job *bad);
 bool drm_sched_dependency_optimized(struct dma_fence* fence,
 				    struct drm_sched_entity *entity);
 void drm_sched_fault(struct drm_gpu_scheduler *sched);
-void drm_sched_job_kickout(struct drm_sched_job *s_job);
 
 void drm_sched_rq_add_entity(struct drm_sched_rq *rq,
 			     struct drm_sched_entity *entity);
-- 
2.25.1



* [PATCH 5/5] drm/scheduler: deprecate drm_sched_resubmit_jobs
From: Christian König @ 2022-10-26 15:35 UTC
  To: luben.tuikov, vprosyak, Alexander.Deucher, daniel.vetter,
	amd-gfx, dri-devel
  Cc: Christian König

This interface is not working as it should.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/scheduler/sched_main.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index bb28f31bff6f..ecd4afab4adb 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -489,10 +489,21 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
 EXPORT_SYMBOL(drm_sched_start);
 
 /**
- * drm_sched_resubmit_jobs - helper to relaunch jobs from the pending list
+ * drm_sched_resubmit_jobs - Deprecated, don't use in new code!
  *
  * @sched: scheduler instance
  *
+ * Re-submitting jobs was a concept AMD came up with as a cheap way to
+ * implement recovery after a job timeout.
+ *
+ * This turned out to not work very well. First of all, there are many
+ * problems with the dma_fence implementation and requirements. Either the
+ * implementation risks deadlocks with core memory management or violates
+ * documented implementation details of the dma_fence object.
+ *
+ * Drivers can still save and restore their state for recovery operations, but
+ * we shouldn't make this a general scheduler feature around the dma_fence
+ * interface.
  */
 void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched)
 {
-- 
2.25.1



* RE: [PATCH 2/5] drm/amdgpu: stop resubmitting jobs for bare metal reset
From: Liu, Shaoyun @ 2022-10-26 16:10 UTC
  To: Christian König, Tuikov, Luben, Prosyak, Vitaly, Deucher,
	Alexander, daniel.vetter, amd-gfx, dri-devel
  Cc: Koenig, Christian

The user space shouldn't care about SRIOV or not; I don't think we need to keep the re-submission for SRIOV as well. The reset from SRIOV could trigger the host to do a whole GPU reset, which will have the same issue as bare metal.

Regards
Shaoyun.liu

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Christian König
Sent: Wednesday, October 26, 2022 11:36 AM
To: Tuikov, Luben <Luben.Tuikov@amd.com>; Prosyak, Vitaly <Vitaly.Prosyak@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; daniel.vetter@ffwll.ch; amd-gfx@lists.freedesktop.org; dri-devel@lists.freedesktop.org
Cc: Koenig, Christian <Christian.Koenig@amd.com>
Subject: [PATCH 2/5] drm/amdgpu: stop resubmitting jobs for bare metal reset

[snip]



* Re: [PATCH 2/5] drm/amdgpu: stop resubmitting jobs for bare metal reset
From: Christian König @ 2022-10-26 17:26 UTC
  To: Liu, Shaoyun, Tuikov, Luben, Prosyak, Vitaly, Deucher, Alexander,
	daniel.vetter, amd-gfx, dri-devel

The problem is that this re-submitting is currently an integral part of 
how SRIOV works.

The host can send a function level reset request to the clients when it
sees that some schedule switching didn't work as expected, and in this
case (and only this case) the hardware has actually never started to
even work on the IBs. So the re-submission is actually safe from this side.

But in general you are right, the sw side is just completely broken 
because we came up with a bunch of rather strict rules for the dma_fence 
implementation (and those rules are perfectly valid and necessary).
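
To illustrate, this is the gating from patch 2, annotated; it keeps the
re-submission only for the SRIOV case described above, while a bare
metal reset now just cancels the jobs:

	/* hunk from patch 2 of this series, annotated */
	if (!tmp_adev->asic_reset_res && !job_signaled &&
	    amdgpu_sriov_vf(tmp_adev))
		drm_sched_resubmit_jobs(&ring->sched);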

Regards,
Christian.

On 26.10.22 at 18:10, Liu, Shaoyun wrote:
> The user space shouldn't care about SRIOV or not; I don't think we need to keep the re-submission for SRIOV as well. The reset from SRIOV could trigger the host to do a whole GPU reset, which will have the same issue as bare metal.
>
> Regards
> Shaoyun.liu
>
> [snip]



* RE: [PATCH 2/5] drm/amdgpu: stop resubmitting jobs for bare metal reset
From: Liu, Shaoyun @ 2022-10-26 18:13 UTC
  To: Koenig, Christian, Tuikov, Luben, Prosyak, Vitaly, Deucher,
	 Alexander, daniel.vetter, amd-gfx, dri-devel

SRIOV already has its own reset routine, amdgpu_device_reset_sriov; we try to put the SRIOV-specific sequence inside this function. For the rest (re-submitting etc.) we should try to have the same behavior as bare metal.
Can we just not do the re-submission for all kinds of reset, since the kernel already signals the reset event to user level (at least for the compute stack)?

Regards,
Shaoyun.liu

-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com>
Sent: Wednesday, October 26, 2022 1:27 PM
To: Liu, Shaoyun <Shaoyun.Liu@amd.com>; Tuikov, Luben <Luben.Tuikov@amd.com>; Prosyak, Vitaly <Vitaly.Prosyak@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; daniel.vetter@ffwll.ch; amd-gfx@lists.freedesktop.org; dri-devel@lists.freedesktop.org
Subject: Re: [PATCH 2/5] drm/amdgpu: stop resubmitting jobs for bare metal reset

The problem is that this re-submitting is currently an integral part of how SRIOV works.

The host can send a function level reset request to the clients when it sees that some schedule switching didn't work as expected, and in this case (and only this case) the hardware has actually never started to even work on the IBs. So the re-submission is actually safe from this side.

But in general you are right, the sw side is just completely broken because we came up with a bunch of rather strict rules for the dma_fence implementation (and those rules are perfectly valid and necessary).

Regards,
Christian.

[snip]



* Re: [PATCH 2/5] drm/amdgpu: stop resubmitting jobs for bare metal reset
From: Christian König @ 2022-10-27  6:04 UTC
  To: Liu, Shaoyun, Tuikov, Luben, Prosyak, Vitaly, Deucher, Alexander,
	daniel.vetter, amd-gfx, dri-devel

Hi Shaoyun,

Yes, absolutely. If you say that this is OK then I'm fine with that as well.

Thanks,
Christian.

On 26.10.22 at 20:13, Liu, Shaoyun wrote:
> SRIOV already has its own reset routine, amdgpu_device_reset_sriov; we try to put the SRIOV-specific sequence inside this function. For the rest (re-submitting etc.) we should try to have the same behavior as bare metal.
> Can we just not do the re-submission for all kinds of reset, since the kernel already signals the reset event to user level (at least for the compute stack)?
>
> Regards,
> Shaoyun.liu
>
> [snip]



* Re: [PATCH 1/5] drm/amd/amdgpu revert "implement tdr advanced mode"
From: Yin, ZhenGuo (Chris) @ 2023-01-28  8:28 UTC
  To: Christian König, luben.tuikov, vprosyak, Alexander.Deucher,
	daniel.vetter, amd-gfx, dri-devel
  Cc: jingwen.chen2, Christian König, monk.liu

Hi Christian,

A later bad compute job can block a good gfx job, so the original TDR
design finds the wrong guilty job (the good gfx job).

Advanced TDR re-submits jobs in order to find the real guilty job (the
bad compute job).

After reverting this commit, how does the new gang-submit guarantee the
isolation between compute jobs and gfx jobs?

On 10/26/2022 11:35 PM, Christian König wrote:
> This reverts commit e6c6338f393b74ac0b303d567bb918b44ae7ad75.
>
> This feature basically re-submits one job after another to
> figure out which one caused the hang.
>
> This is obviously incompatible with gang-submit which requires
> that multiple jobs run at the same time. It's also absolutely
> not helpful to crash the hardware multiple times if a clean
> recovery is desired.
>
> For testing and debugging environments we should rather disable
> recovery altogether so that the state can be inspected with a HW
> debugger.
>
> In addition to that, the SW implementation is clearly buggy and
> causes reference counting issues for the hardware fence.
>
> Signed-off-by: Christian König <christian.koenig@amd.com>
> [snip]


* Re: [PATCH 1/5] drm/amd/amdgpu revert "implement tdr advanced mode"
From: Luben Tuikov @ 2023-01-30 21:46 UTC
  To: Christian König, vprosyak, Alexander.Deucher, daniel.vetter,
	amd-gfx, dri-devel
  Cc: Christian König

The series is,

Acked-by: Luben Tuikov <luben.tuikov@amd.com>

We don't want the kernel to be in the business of retrying clients'
requests. Instead we want the kernel to provide a conduit for such
requests to be sent, executed by the GPU, and a result returned.
If the kernel cannot process requests for any reason, e.g. GPU
is being reset, or in recovery, or OOM, or there's a problem with
the request, etc., the driver should return the request back to
the client (user space), and let the client decide how to proceed.

Regards,
Luben

On 2022-10-26 11:35, Christian König wrote:
> This reverts commit e6c6338f393b74ac0b303d567bb918b44ae7ad75.
> 
> This feature basically re-submits one job after another to
> figure out which one caused the hang.
>
> This is obviously incompatible with gang-submit which requires
> that multiple jobs run at the same time. It's also absolutely
> not helpful to crash the hardware multiple times if a clean
> recovery is desired.
>
> For testing and debugging environments we should rather disable
> recovery altogether so that the state can be inspected with a HW
> debugger.
>
> In addition to that, the SW implementation is clearly buggy and
> causes reference counting issues for the hardware fence.
> 
> Signed-off-by: Christian König <christian.koenig@amd.com>
> [snip]
>  		}
>  	}
>  }
> -EXPORT_SYMBOL(drm_sched_resubmit_jobs_ext);
> +EXPORT_SYMBOL(drm_sched_resubmit_jobs);
>  
>  /**
>   * drm_sched_job_init - init a scheduler job
> @@ -1081,13 +1042,15 @@ void drm_sched_fini(struct drm_gpu_scheduler *sched)
>  EXPORT_SYMBOL(drm_sched_fini);
>  
>  /**
> - * drm_sched_increase_karma_ext - Update sched_entity guilty flag
> + * drm_sched_increase_karma - Update sched_entity guilty flag
>   *
>   * @bad: The job guilty of time out
> - * @type: type for increase/reset karma
>   *
> + * Increment on every hang caused by the 'bad' job. If this exceeds the hang
> + * limit of the scheduler then the respective sched entity is marked guilty and
> + * jobs from it will not be scheduled further
>   */
> -void drm_sched_increase_karma_ext(struct drm_sched_job *bad, int type)
> +void drm_sched_increase_karma(struct drm_sched_job *bad)
>  {
>  	int i;
>  	struct drm_sched_entity *tmp;
> @@ -1099,10 +1062,7 @@ void drm_sched_increase_karma_ext(struct drm_sched_job *bad, int type)
>  	 * corrupt but keep in mind that kernel jobs always considered good.
>  	 */
>  	if (bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) {
> -		if (type == 0)
> -			atomic_set(&bad->karma, 0);
> -		else if (type == 1)
> -			atomic_inc(&bad->karma);
> +		atomic_inc(&bad->karma);
>  
>  		for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_KERNEL;
>  		     i++) {
> @@ -1113,7 +1073,7 @@ void drm_sched_increase_karma_ext(struct drm_sched_job *bad, int type)
>  				if (bad->s_fence->scheduled.context ==
>  				    entity->fence_context) {
>  					if (entity->guilty)
> -						atomic_set(entity->guilty, type);
> +						atomic_set(entity->guilty, 1);
>  					break;
>  				}
>  			}
> @@ -1123,4 +1083,4 @@ void drm_sched_increase_karma_ext(struct drm_sched_job *bad, int type)
>  		}
>  	}
>  }
> -EXPORT_SYMBOL(drm_sched_increase_karma_ext);
> +EXPORT_SYMBOL(drm_sched_increase_karma);
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index 0fca8f38bee4..c564be29c5ae 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -488,10 +488,7 @@ void drm_sched_wakeup(struct drm_gpu_scheduler *sched);
>  void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad);
>  void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery);
>  void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched);
> -void drm_sched_resubmit_jobs_ext(struct drm_gpu_scheduler *sched, int max);
>  void drm_sched_increase_karma(struct drm_sched_job *bad);
> -void drm_sched_reset_karma(struct drm_sched_job *bad);
> -void drm_sched_increase_karma_ext(struct drm_sched_job *bad, int type);
>  bool drm_sched_dependency_optimized(struct dma_fence* fence,
>  				    struct drm_sched_entity *entity);
>  void drm_sched_fault(struct drm_gpu_scheduler *sched);

-- 
Regards,
Luben



* Re: [PATCH 1/5] drm/amd/amdgpu revert "implement tdr advanced mode"
  2023-01-28  8:28 ` [PATCH 1/5] drm/amd/amdgpu revert "implement tdr advanced mode" Yin, ZhenGuo (Chris)
@ 2023-01-31  9:55   ` Christian König
  0 siblings, 0 replies; 14+ messages in thread
From: Christian König @ 2023-01-31  9:55 UTC (permalink / raw)
  To: Yin, ZhenGuo (Chris),
	Christian König, luben.tuikov, vprosyak, Alexander.Deucher,
	daniel.vetter, amd-gfx, dri-devel
  Cc: jingwen.chen2, monk.liu

> A later bad compute job can block a good gfx job

The gang submit/barrier approach makes sure that only one application at 
a time can use the gfx/compute block.

So when application B makes a compute submission while a GFX submission 
of application A is still running, we will wait for that GFX submission 
to complete before B is allowed to access the hardware.

Two submissions to the same engine will wait in the hardware for the 
first submission to fully complete before the second is started.

This way neither concurrent submissions to different engines 
(compute/gfx) nor subsequent submissions to the same engine let 
different applications access the hardware at the same time.

It's essentially the same requirement we have for gang submit, which is 
now well tested and has been in production for a couple of months. So we 
already know that the approach works.
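
To make that concrete, here is a minimal sketch of the dependency rule 
described above. This is illustrative pseudocode, not the actual amdgpu 
gang submit code: the helper name and the two static variables are 
invented for the example, and locking is omitted.

    /* Fence and fence context of the last submission that used the
     * shared GC (gfx/compute) block. Both invented for this sketch.
     */
    static struct dma_fence *gc_last_fence;
    static u64 gc_last_context;

    /* Returns a fence the new job must wait on before it may start,
     * or NULL if it can run right away.
     */
    static struct dma_fence *
    gc_serialize_submission(struct drm_sched_job *s_job)
    {
        u64 ctx = s_job->s_fence->finished.context;
        struct dma_fence *wait = NULL;

        /* A submission from a different application may only start
         * once the previous owner's work has fully completed.
         * Back-to-back submissions from the same application are
         * already ordered by the hardware.
         */
        if (gc_last_fence && ctx != gc_last_context)
            wait = dma_fence_get(gc_last_fence);

        /* Remember the new owner; drop the reference to the old one. */
        dma_fence_put(gc_last_fence);
        gc_last_fence = dma_fence_get(&s_job->s_fence->finished);
        gc_last_context = ctx;

        return wait;
    }

With a rule like this in the submission path, B's compute job carries a 
dependency on A's gfx fence, so different applications never use the GC 
block at the same time, which is the isolation the revert relies on.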

Regards,
Christian.

On 28.01.23 09:28, Yin, ZhenGuo (Chris) wrote:
> Hi, Christian
>
> A later bad compute job can block a good gfx job, so the original TDR 
> design finds the wrong guilty job (the good gfx job).
>
> Advanced TDR re-submits jobs one by one in order to find the real 
> guilty job (the bad compute job).
>
> After reverting this commit, how does the new gang-submit guarantee 
> the isolation between compute jobs and gfx jobs?
>
> On 10/26/2022 11:35 PM, Christian König wrote:
>> [full patch quoted in reply snipped; see the original posting above]


* Re: [PATCH 1/5] drm/amd/amdgpu revert "implement tdr advanced mode"
  2022-11-09  9:50 Christian König
@ 2022-11-10 19:45 ` Alex Deucher
  0 siblings, 0 replies; 14+ messages in thread
From: Alex Deucher @ 2022-11-10 19:45 UTC (permalink / raw)
  To: Christian König
  Cc: Alexander.Deucher, daniel.vetter, Christian König,
	dri-devel, Shaoyun.Liu

Some comments on patch 3.  With that addressed, the series is:
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

On Wed, Nov 9, 2022 at 4:50 AM Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
>
> [full patch quoted in reply snipped; see the 2022-11-09 posting below]


* [PATCH 1/5] drm/amd/amdgpu revert "implement tdr advanced mode"
@ 2022-11-09  9:50 Christian König
  2022-11-10 19:45 ` Alex Deucher
  0 siblings, 1 reply; 14+ messages in thread
From: Christian König @ 2022-11-09  9:50 UTC (permalink / raw)
  To: Alexander.Deucher, daniel.vetter, dri-devel, Shaoyun.Liu
  Cc: Christian König

This reverts commit e6c6338f393b74ac0b303d567bb918b44ae7ad75.

This feature basically re-submits one job after another to
figure out which one caused the hang.

This is obviously incompatible with gang-submit, which requires
that multiple jobs run at the same time. It's also absolutely
not helpful to crash the hardware multiple times if a clean
recovery is desired.

For testing and debugging environments we should rather disable
recovery altogether to be able to inspect the state with a hw
debugger.

Additionally, the SW implementation is clearly buggy and causes
reference count issues for the hardware fence.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 103 ---------------------
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |   2 +-
 drivers/gpu/drm/scheduler/sched_main.c     |  58 ++----------
 include/drm/gpu_scheduler.h                |   3 -
 4 files changed, 10 insertions(+), 156 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 5b9f992e4607..0da55fd97df8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5077,94 +5077,6 @@ static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
 	return 0;
 }
 
-static void amdgpu_device_recheck_guilty_jobs(
-	struct amdgpu_device *adev, struct list_head *device_list_handle,
-	struct amdgpu_reset_context *reset_context)
-{
-	int i, r = 0;
-
-	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
-		struct amdgpu_ring *ring = adev->rings[i];
-		int ret = 0;
-		struct drm_sched_job *s_job;
-
-		if (!ring || !ring->sched.thread)
-			continue;
-
-		s_job = list_first_entry_or_null(&ring->sched.pending_list,
-				struct drm_sched_job, list);
-		if (s_job == NULL)
-			continue;
-
-		/* clear job's guilty and depend the folowing step to decide the real one */
-		drm_sched_reset_karma(s_job);
-		drm_sched_resubmit_jobs_ext(&ring->sched, 1);
-
-		if (!s_job->s_fence->parent) {
-			DRM_WARN("Failed to get a HW fence for job!");
-			continue;
-		}
-
-		ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
-		if (ret == 0) { /* timeout */
-			DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
-						ring->sched.name, s_job->id);
-
-
-			amdgpu_fence_driver_isr_toggle(adev, true);
-
-			/* Clear this failed job from fence array */
-			amdgpu_fence_driver_clear_job_fences(ring);
-
-			amdgpu_fence_driver_isr_toggle(adev, false);
-
-			/* Since the job won't signal and we go for
-			 * another resubmit drop this parent pointer
-			 */
-			dma_fence_put(s_job->s_fence->parent);
-			s_job->s_fence->parent = NULL;
-
-			/* set guilty */
-			drm_sched_increase_karma(s_job);
-			amdgpu_reset_prepare_hwcontext(adev, reset_context);
-retry:
-			/* do hw reset */
-			if (amdgpu_sriov_vf(adev)) {
-				amdgpu_virt_fini_data_exchange(adev);
-				r = amdgpu_device_reset_sriov(adev, false);
-				if (r)
-					adev->asic_reset_res = r;
-			} else {
-				clear_bit(AMDGPU_SKIP_HW_RESET,
-					  &reset_context->flags);
-				r = amdgpu_do_asic_reset(device_list_handle,
-							 reset_context);
-				if (r && r == -EAGAIN)
-					goto retry;
-			}
-
-			/*
-			 * add reset counter so that the following
-			 * resubmitted job could flush vmid
-			 */
-			atomic_inc(&adev->gpu_reset_counter);
-			continue;
-		}
-
-		/* got the hw fence, signal finished fence */
-		atomic_dec(ring->sched.score);
-		dma_fence_get(&s_job->s_fence->finished);
-		dma_fence_signal(&s_job->s_fence->finished);
-		dma_fence_put(&s_job->s_fence->finished);
-
-		/* remove node from list and free the job */
-		spin_lock(&ring->sched.job_list_lock);
-		list_del_init(&s_job->list);
-		spin_unlock(&ring->sched.job_list_lock);
-		ring->sched.ops->free_job(s_job);
-	}
-}
-
 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -5185,7 +5097,6 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
 
 }
 
-
 /**
  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
  *
@@ -5208,7 +5119,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	int i, r = 0;
 	bool need_emergency_restart = false;
 	bool audio_suspended = false;
-	int tmp_vram_lost_counter;
 	bool gpu_reset_for_dev_remove = false;
 
 	gpu_reset_for_dev_remove =
@@ -5354,7 +5264,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		amdgpu_device_stop_pending_resets(tmp_adev);
 	}
 
-	tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
 	/* Actual ASIC resets if needed.*/
 	/* Host driver will handle XGMI hive reset for SRIOV */
 	if (amdgpu_sriov_vf(adev)) {
@@ -5379,18 +5288,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	/* Post ASIC reset for all devs .*/
 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
 
-		/*
-		 * Sometimes a later bad compute job can block a good gfx job as gfx
-		 * and compute ring share internal GC HW mutually. We add an additional
-		 * guilty jobs recheck step to find the real guilty job, it synchronously
-		 * submits and pends for the first job being signaled. If it gets timeout,
-		 * we identify it as a real guilty job.
-		 */
-		if (amdgpu_gpu_recovery == 2 &&
-			!(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
-			amdgpu_device_recheck_guilty_jobs(
-				tmp_adev, device_list_handle, reset_context);
-
 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 			struct amdgpu_ring *ring = tmp_adev->rings[i];
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 8e97e95aca8c..a6820603214f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -519,7 +519,7 @@ module_param_named(compute_multipipe, amdgpu_compute_multipipe, int, 0444);
  * DOC: gpu_recovery (int)
  * Set to enable GPU recovery mechanism (1 = enable, 0 = disable). The default is -1 (auto, disabled except SRIOV).
  */
-MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (2 = advanced tdr mode, 1 = enable, 0 = disable, -1 = auto)");
+MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (1 = enable, 0 = disable, -1 = auto)");
 module_param_named(gpu_recovery, amdgpu_gpu_recovery, int, 0444);
 
 /**
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 68317d3a7a27..e77e1fd16732 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -355,27 +355,6 @@ static void drm_sched_job_timedout(struct work_struct *work)
 	}
 }
 
- /**
-  * drm_sched_increase_karma - Update sched_entity guilty flag
-  *
-  * @bad: The job guilty of time out
-  *
-  * Increment on every hang caused by the 'bad' job. If this exceeds the hang
-  * limit of the scheduler then the respective sched entity is marked guilty and
-  * jobs from it will not be scheduled further
-  */
-void drm_sched_increase_karma(struct drm_sched_job *bad)
-{
-	drm_sched_increase_karma_ext(bad, 1);
-}
-EXPORT_SYMBOL(drm_sched_increase_karma);
-
-void drm_sched_reset_karma(struct drm_sched_job *bad)
-{
-	drm_sched_increase_karma_ext(bad, 0);
-}
-EXPORT_SYMBOL(drm_sched_reset_karma);
-
 /**
  * drm_sched_stop - stop the scheduler
  *
@@ -516,32 +495,15 @@ EXPORT_SYMBOL(drm_sched_start);
  *
  */
 void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched)
-{
-	drm_sched_resubmit_jobs_ext(sched, INT_MAX);
-}
-EXPORT_SYMBOL(drm_sched_resubmit_jobs);
-
-/**
- * drm_sched_resubmit_jobs_ext - helper to relunch certain number of jobs from mirror ring list
- *
- * @sched: scheduler instance
- * @max: job numbers to relaunch
- *
- */
-void drm_sched_resubmit_jobs_ext(struct drm_gpu_scheduler *sched, int max)
 {
 	struct drm_sched_job *s_job, *tmp;
 	uint64_t guilty_context;
 	bool found_guilty = false;
 	struct dma_fence *fence;
-	int i = 0;
 
 	list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) {
 		struct drm_sched_fence *s_fence = s_job->s_fence;
 
-		if (i >= max)
-			break;
-
 		if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
 			found_guilty = true;
 			guilty_context = s_job->s_fence->scheduled.context;
@@ -551,7 +513,6 @@ void drm_sched_resubmit_jobs_ext(struct drm_gpu_scheduler *sched, int max)
 			dma_fence_set_error(&s_fence->finished, -ECANCELED);
 
 		fence = sched->ops->run_job(s_job);
-		i++;
 
 		if (IS_ERR_OR_NULL(fence)) {
 			if (IS_ERR(fence))
@@ -567,7 +528,7 @@ void drm_sched_resubmit_jobs_ext(struct drm_gpu_scheduler *sched, int max)
 		}
 	}
 }
-EXPORT_SYMBOL(drm_sched_resubmit_jobs_ext);
+EXPORT_SYMBOL(drm_sched_resubmit_jobs);
 
 /**
  * drm_sched_job_init - init a scheduler job
@@ -1082,13 +1043,15 @@ void drm_sched_fini(struct drm_gpu_scheduler *sched)
 EXPORT_SYMBOL(drm_sched_fini);
 
 /**
- * drm_sched_increase_karma_ext - Update sched_entity guilty flag
+ * drm_sched_increase_karma - Update sched_entity guilty flag
  *
  * @bad: The job guilty of time out
- * @type: type for increase/reset karma
  *
+ * Increment on every hang caused by the 'bad' job. If this exceeds the hang
+ * limit of the scheduler then the respective sched entity is marked guilty and
+ * jobs from it will not be scheduled further
  */
-void drm_sched_increase_karma_ext(struct drm_sched_job *bad, int type)
+void drm_sched_increase_karma(struct drm_sched_job *bad)
 {
 	int i;
 	struct drm_sched_entity *tmp;
@@ -1100,10 +1063,7 @@ void drm_sched_increase_karma_ext(struct drm_sched_job *bad, int type)
 	 * corrupt but keep in mind that kernel jobs always considered good.
 	 */
 	if (bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) {
-		if (type == 0)
-			atomic_set(&bad->karma, 0);
-		else if (type == 1)
-			atomic_inc(&bad->karma);
+		atomic_inc(&bad->karma);
 
 		for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_KERNEL;
 		     i++) {
@@ -1114,7 +1074,7 @@ void drm_sched_increase_karma_ext(struct drm_sched_job *bad, int type)
 				if (bad->s_fence->scheduled.context ==
 				    entity->fence_context) {
 					if (entity->guilty)
-						atomic_set(entity->guilty, type);
+						atomic_set(entity->guilty, 1);
 					break;
 				}
 			}
@@ -1124,4 +1084,4 @@ void drm_sched_increase_karma_ext(struct drm_sched_job *bad, int type)
 		}
 	}
 }
-EXPORT_SYMBOL(drm_sched_increase_karma_ext);
+EXPORT_SYMBOL(drm_sched_increase_karma);
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 289a33e80639..156601fd7053 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -497,10 +497,7 @@ void drm_sched_wakeup(struct drm_gpu_scheduler *sched);
 void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad);
 void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery);
 void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched);
-void drm_sched_resubmit_jobs_ext(struct drm_gpu_scheduler *sched, int max);
 void drm_sched_increase_karma(struct drm_sched_job *bad);
-void drm_sched_reset_karma(struct drm_sched_job *bad);
-void drm_sched_increase_karma_ext(struct drm_sched_job *bad, int type);
 bool drm_sched_dependency_optimized(struct dma_fence* fence,
 				    struct drm_sched_entity *entity);
 void drm_sched_fault(struct drm_gpu_scheduler *sched);
-- 
2.34.1



end of thread

Thread overview: 14+ messages
2022-10-26 15:35 [PATCH 1/5] drm/amd/amdgpu revert "implement tdr advanced mode" Christian König
2022-10-26 15:35 ` [PATCH 2/5] drm/amdgpu: stop resubmitting jobs for bare metal reset Christian König
2022-10-26 16:10   ` Liu, Shaoyun
2022-10-26 17:26     ` Christian König
2022-10-26 18:13       ` Liu, Shaoyun
2022-10-27  6:04         ` Christian König
2022-10-26 15:35 ` [PATCH 3/5] drm/amdgpu: stop resubmittting jobs in amdgpu_pci_resume Christian König
2022-10-26 15:35 ` [PATCH 4/5] drm/scheduler: cleanup define Christian König
2022-10-26 15:35 ` [PATCH 5/5] drm/scheduler: deprecate drm_sched_resubmit_jobs Christian König
2023-01-28  8:28 ` [PATCH 1/5] drm/amd/amdgpu revert "implement tdr advanced mode" Yin, ZhenGuo (Chris)
2023-01-31  9:55   ` Christian König
2023-01-30 21:46 ` Luben Tuikov
2022-11-09  9:50 Christian König
2022-11-10 19:45 ` Alex Deucher
