All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v4] drm/amd/amdgpu implement tdr advanced mode
@ 2021-03-10 11:19 Jack Zhang
  2021-03-10 12:54 ` Christian König
  0 siblings, 1 reply; 6+ messages in thread
From: Jack Zhang @ 2021-03-10 11:19 UTC (permalink / raw)
  To: amd-gfx, Christian.Koenig, Andrey.Grodzovsky, Monk.Liu, Emily.Deng
  Cc: Jack Zhang

[Why]
Previous tdr design treats the first job in job_timeout as the bad job.
But sometimes a later bad compute job can block a good gfx job and
cause an unexpected gfx job timeout because gfx and compute ring share
internal GC HW mutually.

[How]
This patch implements an advanced tdr mode. It involves an additional
synchronous pre-resubmit step (Step0 Resubmit) before the normal resubmit
step in order to find the real bad job.

1. At the Step0 Resubmit stage, it synchronously submits the first job
and waits for it to be signaled. If it times out, we identify it as
guilty and do a hw reset. After that, we do the normal resubmit step to
resubmit the remaining jobs.

2. Re-insert Bailing job to mirror_list, and leave it to be handled by
the main reset thread.

3. For whole gpu reset(vram lost), do resubmit as the old way.

Signed-off-by: Jack Zhang <Jack.Zhang1@amd.com>
Change-Id: I408357f10b9034caaa1b83610e19e514c5fbaaf2
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 73 ++++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  2 +-
 drivers/gpu/drm/scheduler/sched_main.c     | 75 ++++++++++++++++++++++
 include/drm/gpu_scheduler.h                |  2 +
 4 files changed, 148 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e247c3a2ec08..02056355a055 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4639,7 +4639,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	int i, r = 0;
 	bool need_emergency_restart = false;
 	bool audio_suspended = false;
-
+	int	tmp_vram_lost_counter;
 	/*
 	 * Special case: RAS triggered and full reset isn't supported
 	 */
@@ -4689,9 +4689,14 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
 					job ? job->base.id : -1);
 
-		/* even we skipped this reset, still need to set the job to guilty */
-		if (job)
+		if (job) {
+			/* re-insert node to avoid memory leak */
+			spin_lock(&job->base.sched->job_list_lock);
+			list_add(&job->base.node, &job->base.sched->pending_list);
+			spin_unlock(&job->base.sched->job_list_lock);
+			/* even we skipped this reset, still need to set the job to guilty */
 			drm_sched_increase_karma(&job->base);
+		}
 		goto skip_recovery;
 	}
 
@@ -4788,6 +4793,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		}
 	}
 
+	tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
 	/* Actual ASIC resets if needed.*/
 	/* TODO Implement XGMI hive reset logic for SRIOV */
 	if (amdgpu_sriov_vf(adev)) {
@@ -4805,6 +4811,67 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	/* Post ASIC reset for all devs .*/
 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
 
+		if (amdgpu_gpu_recovery == 2 &&
+			!(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter))) {
+
+			for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+				struct amdgpu_ring *ring = tmp_adev->rings[i];
+				int ret = 0;
+				struct drm_sched_job *s_job;
+
+				if (!ring || !ring->sched.thread)
+					continue;
+
+				/* No point to resubmit jobs if we didn't HW reset*/
+				if (!tmp_adev->asic_reset_res && !job_signaled) {
+
+					s_job = list_first_entry_or_null(&ring->sched.pending_list, struct drm_sched_job, list);
+					if (s_job == NULL)
+						continue;
+
+					/* clear job's guilty and depend the folowing step to decide the real one */
+					drm_sched_reset_karma(s_job);
+					drm_sched_resubmit_jobs2(&ring->sched, 1);
+					ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
+
+					if (ret == 0) { /* timeout */
+						DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n", ring->sched.name, s_job->id);
+						/* set guilty */
+						drm_sched_increase_karma(s_job);
+
+						/* do hw reset */
+						if (amdgpu_sriov_vf(adev)) {
+							amdgpu_virt_fini_data_exchange(adev);
+							r = amdgpu_device_reset_sriov(adev, false);
+							if (r)
+								adev->asic_reset_res = r;
+						} else {
+							r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
+							if (r && r == -EAGAIN)
+								goto retry;
+						}
+
+						/* add reset counter so that the following resubmitted job could flush vmid */
+						atomic_inc(&tmp_adev->gpu_reset_counter);
+						continue;
+					}
+
+					/* got the hw fence, signal finished fence */
+					atomic_dec(&ring->sched.num_jobs);
+					dma_fence_get(&s_job->s_fence->finished);
+					dma_fence_signal(&s_job->s_fence->finished);
+					dma_fence_put(&s_job->s_fence->finished);
+
+
+					/* remove node from list and free the job */
+					spin_lock(&ring->sched.job_list_lock);
+					list_del_init(&s_job->node);
+					spin_unlock(&ring->sched.job_list_lock);
+					ring->sched.ops->free_job(s_job);
+				}
+			}
+		}
+
 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 			struct amdgpu_ring *ring = tmp_adev->rings[i];
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 865f924772b0..9c3f4edb7532 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -509,7 +509,7 @@ module_param_named(compute_multipipe, amdgpu_compute_multipipe, int, 0444);
  * DOC: gpu_recovery (int)
  * Set to enable GPU recovery mechanism (1 = enable, 0 = disable). The default is -1 (auto, disabled except SRIOV).
  */
-MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (1 = enable, 0 = disable, -1 = auto)");
+MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (2 = advanced tdr mode, 1 = enable, 0 = disable, -1 = auto)");
 module_param_named(gpu_recovery, amdgpu_gpu_recovery, int, 0444);
 
 /**
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index d82a7ebf6099..99a6a8bddd6f 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -395,6 +395,81 @@ void drm_sched_increase_karma(struct drm_sched_job *bad)
 }
 EXPORT_SYMBOL(drm_sched_increase_karma);
 
+
+void drm_sched_resubmit_jobs2(struct drm_gpu_scheduler *sched, int max)
+{
+	struct drm_sched_job *s_job, *tmp;
+	uint64_t guilty_context;
+	bool found_guilty = false;
+	struct dma_fence *fence;
+	int i = 0;
+
+	list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) {
+		struct drm_sched_fence *s_fence = s_job->s_fence;
+
+		if (i>=max)
+			break;
+
+		if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
+			found_guilty = true;
+			guilty_context = s_job->s_fence->scheduled.context;
+		}
+
+		if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
+			dma_fence_set_error(&s_fence->finished, -ECANCELED);
+
+		dma_fence_put(s_job->s_fence->parent);
+		fence = sched->ops->run_job(s_job);
+		i++;
+
+		if (IS_ERR_OR_NULL(fence)) {
+			if (IS_ERR(fence))
+				dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
+
+			s_job->s_fence->parent = NULL;
+		} else {
+			s_job->s_fence->parent = fence;
+		}
+	}
+}
+EXPORT_SYMBOL(drm_sched_resubmit_jobs2);
+
+
+
+void drm_sched_reset_karma(struct drm_sched_job *bad)
+{
+	int i;
+	struct drm_sched_entity *tmp;
+	struct drm_sched_entity *entity;
+	struct drm_gpu_scheduler *sched = bad->sched;
+
+	/* don't reset @bad's karma if it's from KERNEL RQ,
+	 * because sometimes GPU hang would cause kernel jobs (like VM updating jobs)
+	 * corrupt but keep in mind that kernel jobs always considered good.
+	 */
+	if (bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) {
+		atomic_set(&bad->karma, 0);
+		for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_KERNEL;
+		     i++) {
+			struct drm_sched_rq *rq = &sched->sched_rq[i];
+
+			spin_lock(&rq->lock);
+			list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
+				if (bad->s_fence->scheduled.context ==
+				    entity->fence_context) {
+						if (entity->guilty)
+							atomic_set(entity->guilty, 0);
+					break;
+				}
+			}
+			spin_unlock(&rq->lock);
+			if (&entity->list != &rq->entities)
+				break;
+		}
+	}
+}
+EXPORT_SYMBOL(drm_sched_reset_karma);
+
 /**
  * drm_sched_stop - stop the scheduler
  *
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 1c815e0a14ed..01c609149731 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -321,7 +321,9 @@ void drm_sched_wakeup(struct drm_gpu_scheduler *sched);
 void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad);
 void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery);
 void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched);
+void drm_sched_resubmit_jobs2(struct drm_gpu_scheduler *sched, int max);
 void drm_sched_increase_karma(struct drm_sched_job *bad);
+void drm_sched_reset_karma(struct drm_sched_job *bad);
 bool drm_sched_dependency_optimized(struct dma_fence* fence,
 				    struct drm_sched_entity *entity);
 void drm_sched_fault(struct drm_gpu_scheduler *sched);
-- 
2.25.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH v4] drm/amd/amdgpu implement tdr advanced mode
  2021-03-10 11:19 [PATCH v4] drm/amd/amdgpu implement tdr advanced mode Jack Zhang
@ 2021-03-10 12:54 ` Christian König
  2021-03-10 13:36   ` Zhang, Jack (Jian)
  0 siblings, 1 reply; 6+ messages in thread
From: Christian König @ 2021-03-10 12:54 UTC (permalink / raw)
  To: Jack Zhang, amd-gfx, Andrey.Grodzovsky, Monk.Liu, Emily.Deng

Am 10.03.21 um 12:19 schrieb Jack Zhang:
> [Why]
> Previous tdr design treats the first job in job_timeout as the bad job.
> But sometimes a later bad compute job can block a good gfx job and
> cause an unexpected gfx job timeout because gfx and compute ring share
> internal GC HW mutually.
>
> [How]
> This patch implements an advanced tdr mode.It involves an additinal
> synchronous pre-resubmit step(Step0 Resubmit) before normal resubmit
> step in order to find the real bad job.
>
> 1. At Step0 Resubmit stage, it synchronously submits and pends for the
> first job being signaled. If it gets timeout, we identify it as guilty
> and do hw reset. After that, we would do the normal resubmit step to
> resubmit left jobs.
>
> 2. Re-insert Bailing job to mirror_list, and leave it to be handled by
> the main reset thread.
>
> 3. For whole gpu reset(vram lost), do resubmit as the old way.
>
> Signed-off-by: Jack Zhang <Jack.Zhang1@amd.com>
> Change-Id: I408357f10b9034caaa1b83610e19e514c5fbaaf2
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 73 ++++++++++++++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  2 +-
>   drivers/gpu/drm/scheduler/sched_main.c     | 75 ++++++++++++++++++++++
>   include/drm/gpu_scheduler.h                |  2 +
>   4 files changed, 148 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index e247c3a2ec08..02056355a055 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4639,7 +4639,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   	int i, r = 0;
>   	bool need_emergency_restart = false;
>   	bool audio_suspended = false;
> -
> +	int	tmp_vram_lost_counter;

Please keep the empty line between declaration and code.

In general give that patch a pass with checkpatch.pl since there are a
couple of other places which need fixing at first glance.

Christian.


>   	/*
>   	 * Special case: RAS triggered and full reset isn't supported
>   	 */
> @@ -4689,9 +4689,14 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   		dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
>   					job ? job->base.id : -1);
>   
> -		/* even we skipped this reset, still need to set the job to guilty */
> -		if (job)
> +		if (job) {
> +			/* re-insert node to avoid memory leak */
> +			spin_lock(&job->base.sched->job_list_lock);
> +			list_add(&job->base.node, &job->base.sched->pending_list);
> +			spin_unlock(&job->base.sched->job_list_lock);
> +			/* even we skipped this reset, still need to set the job to guilty */
>   			drm_sched_increase_karma(&job->base);
> +		}
>   		goto skip_recovery;
>   	}
>   
> @@ -4788,6 +4793,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   		}
>   	}
>   
> +	tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
>   	/* Actual ASIC resets if needed.*/
>   	/* TODO Implement XGMI hive reset logic for SRIOV */
>   	if (amdgpu_sriov_vf(adev)) {
> @@ -4805,6 +4811,67 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   	/* Post ASIC reset for all devs .*/
>   	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>   
> +		if (amdgpu_gpu_recovery == 2 &&
> +			!(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter))) {
> +
> +			for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> +				struct amdgpu_ring *ring = tmp_adev->rings[i];
> +				int ret = 0;
> +				struct drm_sched_job *s_job;
> +
> +				if (!ring || !ring->sched.thread)
> +					continue;
> +
> +				/* No point to resubmit jobs if we didn't HW reset*/
> +				if (!tmp_adev->asic_reset_res && !job_signaled) {
> +
> +					s_job = list_first_entry_or_null(&ring->sched.pending_list, struct drm_sched_job, list);
> +					if (s_job == NULL)
> +						continue;
> +
> +					/* clear job's guilty and depend the folowing step to decide the real one */
> +					drm_sched_reset_karma(s_job);
> +					drm_sched_resubmit_jobs2(&ring->sched, 1);
> +					ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
> +
> +					if (ret == 0) { /* timeout */
> +						DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n", ring->sched.name, s_job->id);
> +						/* set guilty */
> +						drm_sched_increase_karma(s_job);
> +
> +						/* do hw reset */
> +						if (amdgpu_sriov_vf(adev)) {
> +							amdgpu_virt_fini_data_exchange(adev);
> +							r = amdgpu_device_reset_sriov(adev, false);
> +							if (r)
> +								adev->asic_reset_res = r;
> +						} else {
> +							r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
> +							if (r && r == -EAGAIN)
> +								goto retry;
> +						}
> +
> +						/* add reset counter so that the following resubmitted job could flush vmid */
> +						atomic_inc(&tmp_adev->gpu_reset_counter);
> +						continue;
> +					}
> +
> +					/* got the hw fence, signal finished fence */
> +					atomic_dec(&ring->sched.num_jobs);
> +					dma_fence_get(&s_job->s_fence->finished);
> +					dma_fence_signal(&s_job->s_fence->finished);
> +					dma_fence_put(&s_job->s_fence->finished);
> +
> +
> +					/* remove node from list and free the job */
> +					spin_lock(&ring->sched.job_list_lock);
> +					list_del_init(&s_job->node);
> +					spin_unlock(&ring->sched.job_list_lock);
> +					ring->sched.ops->free_job(s_job);
> +				}
> +			}
> +		}
> +
>   		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>   			struct amdgpu_ring *ring = tmp_adev->rings[i];
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 865f924772b0..9c3f4edb7532 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -509,7 +509,7 @@ module_param_named(compute_multipipe, amdgpu_compute_multipipe, int, 0444);
>    * DOC: gpu_recovery (int)
>    * Set to enable GPU recovery mechanism (1 = enable, 0 = disable). The default is -1 (auto, disabled except SRIOV).
>    */
> -MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (1 = enable, 0 = disable, -1 = auto)");
> +MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (2 = advanced tdr mode, 1 = enable, 0 = disable, -1 = auto)");
>   module_param_named(gpu_recovery, amdgpu_gpu_recovery, int, 0444);
>   
>   /**
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index d82a7ebf6099..99a6a8bddd6f 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -395,6 +395,81 @@ void drm_sched_increase_karma(struct drm_sched_job *bad)
>   }
>   EXPORT_SYMBOL(drm_sched_increase_karma);
>   
> +
> +void drm_sched_resubmit_jobs2(struct drm_gpu_scheduler *sched, int max)
> +{
> +	struct drm_sched_job *s_job, *tmp;
> +	uint64_t guilty_context;
> +	bool found_guilty = false;
> +	struct dma_fence *fence;
> +	int i = 0;
> +
> +	list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) {
> +		struct drm_sched_fence *s_fence = s_job->s_fence;
> +
> +		if (i>=max)
> +			break;
> +
> +		if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
> +			found_guilty = true;
> +			guilty_context = s_job->s_fence->scheduled.context;
> +		}
> +
> +		if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
> +			dma_fence_set_error(&s_fence->finished, -ECANCELED);
> +
> +		dma_fence_put(s_job->s_fence->parent);
> +		fence = sched->ops->run_job(s_job);
> +		i++;
> +
> +		if (IS_ERR_OR_NULL(fence)) {
> +			if (IS_ERR(fence))
> +				dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
> +
> +			s_job->s_fence->parent = NULL;
> +		} else {
> +			s_job->s_fence->parent = fence;
> +		}
> +	}
> +}
> +EXPORT_SYMBOL(drm_sched_resubmit_jobs2);
> +
> +
> +
> +void drm_sched_reset_karma(struct drm_sched_job *bad)
> +{
> +	int i;
> +	struct drm_sched_entity *tmp;
> +	struct drm_sched_entity *entity;
> +	struct drm_gpu_scheduler *sched = bad->sched;
> +
> +	/* don't reset @bad's karma if it's from KERNEL RQ,
> +	 * because sometimes GPU hang would cause kernel jobs (like VM updating jobs)
> +	 * corrupt but keep in mind that kernel jobs always considered good.
> +	 */
> +	if (bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) {
> +		atomic_set(&bad->karma, 0);
> +		for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_KERNEL;
> +		     i++) {
> +			struct drm_sched_rq *rq = &sched->sched_rq[i];
> +
> +			spin_lock(&rq->lock);
> +			list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
> +				if (bad->s_fence->scheduled.context ==
> +				    entity->fence_context) {
> +						if (entity->guilty)
> +							atomic_set(entity->guilty, 0);
> +					break;
> +				}
> +			}
> +			spin_unlock(&rq->lock);
> +			if (&entity->list != &rq->entities)
> +				break;
> +		}
> +	}
> +}
> +EXPORT_SYMBOL(drm_sched_reset_karma);
> +
>   /**
>    * drm_sched_stop - stop the scheduler
>    *
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index 1c815e0a14ed..01c609149731 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -321,7 +321,9 @@ void drm_sched_wakeup(struct drm_gpu_scheduler *sched);
>   void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad);
>   void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery);
>   void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched);
> +void drm_sched_resubmit_jobs2(struct drm_gpu_scheduler *sched, int max);
>   void drm_sched_increase_karma(struct drm_sched_job *bad);
> +void drm_sched_reset_karma(struct drm_sched_job *bad);
>   bool drm_sched_dependency_optimized(struct dma_fence* fence,
>   				    struct drm_sched_entity *entity);
>   void drm_sched_fault(struct drm_gpu_scheduler *sched);

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v4] drm/amd/amdgpu implement tdr advanced mode
  2021-03-10 12:54 ` Christian König
@ 2021-03-10 13:36   ` Zhang, Jack (Jian)
  2021-03-10 13:49     ` Christian König
  0 siblings, 1 reply; 6+ messages in thread
From: Zhang, Jack (Jian) @ 2021-03-10 13:36 UTC (permalink / raw)
  To: amd-gfx, Grodzovsky, Andrey, Liu, Monk, Deng, Emily, Koenig, Christian


[-- Attachment #1.1: Type: text/plain, Size: 13290 bytes --]

[AMD Official Use Only - Internal Distribution Only]

Thanks Christian, just did the checkpatch scan.  Please see the V6 patch

/Jack


________________________________
From: Koenig, Christian <Christian.Koenig@amd.com>
Sent: Wednesday, March 10, 2021 8:54:53 PM
To: Zhang, Jack (Jian) <Jack.Zhang1@amd.com>; amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Deng, Emily <Emily.Deng@amd.com>
Subject: Re: [PATCH v4] drm/amd/amdgpu implement tdr advanced mode

Am 10.03.21 um 12:19 schrieb Jack Zhang:
> [Why]
> Previous tdr design treats the first job in job_timeout as the bad job.
> But sometimes a later bad compute job can block a good gfx job and
> cause an unexpected gfx job timeout because gfx and compute ring share
> internal GC HW mutually.
>
> [How]
> This patch implements an advanced tdr mode.It involves an additinal
> synchronous pre-resubmit step(Step0 Resubmit) before normal resubmit
> step in order to find the real bad job.
>
> 1. At Step0 Resubmit stage, it synchronously submits and pends for the
> first job being signaled. If it gets timeout, we identify it as guilty
> and do hw reset. After that, we would do the normal resubmit step to
> resubmit left jobs.
>
> 2. Re-insert Bailing job to mirror_list, and leave it to be handled by
> the main reset thread.
>
> 3. For whole gpu reset(vram lost), do resubmit as the old way.
>
> Signed-off-by: Jack Zhang <Jack.Zhang1@amd.com>
> Change-Id: I408357f10b9034caaa1b83610e19e514c5fbaaf2
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 73 ++++++++++++++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  2 +-
>   drivers/gpu/drm/scheduler/sched_main.c     | 75 ++++++++++++++++++++++
>   include/drm/gpu_scheduler.h                |  2 +
>   4 files changed, 148 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index e247c3a2ec08..02056355a055 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4639,7 +4639,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>        int i, r = 0;
>        bool need_emergency_restart = false;
>        bool audio_suspended = false;
> -
> +     int     tmp_vram_lost_counter;

Please keep the empty line between declaration and code.

In general give that patch a pass with checkpatch.pl since there are a
couple of other places which need fixing at first glance.

Christian.


>        /*
>         * Special case: RAS triggered and full reset isn't supported
>         */
> @@ -4689,9 +4689,14 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>                dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
>                                        job ? job->base.id : -1);
>
> -             /* even we skipped this reset, still need to set the job to guilty */
> -             if (job)
> +             if (job) {
> +                     /* re-insert node to avoid memory leak */
> +                     spin_lock(&job->base.sched->job_list_lock);
> +                     list_add(&job->base.node, &job->base.sched->pending_list);
> +                     spin_unlock(&job->base.sched->job_list_lock);
> +                     /* even we skipped this reset, still need to set the job to guilty */
>                        drm_sched_increase_karma(&job->base);
> +             }
>                goto skip_recovery;
>        }
>
> @@ -4788,6 +4793,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>                }
>        }
>
> +     tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
>        /* Actual ASIC resets if needed.*/
>        /* TODO Implement XGMI hive reset logic for SRIOV */
>        if (amdgpu_sriov_vf(adev)) {
> @@ -4805,6 +4811,67 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>        /* Post ASIC reset for all devs .*/
>        list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>
> +             if (amdgpu_gpu_recovery == 2 &&
> +                     !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter))) {
> +
> +                     for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> +                             struct amdgpu_ring *ring = tmp_adev->rings[i];
> +                             int ret = 0;
> +                             struct drm_sched_job *s_job;
> +
> +                             if (!ring || !ring->sched.thread)
> +                                     continue;
> +
> +                             /* No point to resubmit jobs if we didn't HW reset*/
> +                             if (!tmp_adev->asic_reset_res && !job_signaled) {
> +
> +                                     s_job = list_first_entry_or_null(&ring->sched.pending_list, struct drm_sched_job, list);
> +                                     if (s_job == NULL)
> +                                             continue;
> +
> +                                     /* clear job's guilty and depend the folowing step to decide the real one */
> +                                     drm_sched_reset_karma(s_job);
> +                                     drm_sched_resubmit_jobs2(&ring->sched, 1);
> +                                     ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
> +
> +                                     if (ret == 0) { /* timeout */
> +                                             DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n", ring->sched.name, s_job->id);
> +                                             /* set guilty */
> +                                             drm_sched_increase_karma(s_job);
> +
> +                                             /* do hw reset */
> +                                             if (amdgpu_sriov_vf(adev)) {
> +                                                     amdgpu_virt_fini_data_exchange(adev);
> +                                                     r = amdgpu_device_reset_sriov(adev, false);
> +                                                     if (r)
> +                                                             adev->asic_reset_res = r;
> +                                             } else {
> +                                                     r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
> +                                                     if (r && r == -EAGAIN)
> +                                                             goto retry;
> +                                             }
> +
> +                                             /* add reset counter so that the following resubmitted job could flush vmid */
> +                                             atomic_inc(&tmp_adev->gpu_reset_counter);
> +                                             continue;
> +                                     }
> +
> +                                     /* got the hw fence, signal finished fence */
> +                                     atomic_dec(&ring->sched.num_jobs);
> +                                     dma_fence_get(&s_job->s_fence->finished);
> +                                     dma_fence_signal(&s_job->s_fence->finished);
> +                                     dma_fence_put(&s_job->s_fence->finished);
> +
> +
> +                                     /* remove node from list and free the job */
> +                                     spin_lock(&ring->sched.job_list_lock);
> +                                     list_del_init(&s_job->node);
> +                                     spin_unlock(&ring->sched.job_list_lock);
> +                                     ring->sched.ops->free_job(s_job);
> +                             }
> +                     }
> +             }
> +
>                for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>                        struct amdgpu_ring *ring = tmp_adev->rings[i];
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 865f924772b0..9c3f4edb7532 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -509,7 +509,7 @@ module_param_named(compute_multipipe, amdgpu_compute_multipipe, int, 0444);
>    * DOC: gpu_recovery (int)
>    * Set to enable GPU recovery mechanism (1 = enable, 0 = disable). The default is -1 (auto, disabled except SRIOV).
>    */
> -MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (1 = enable, 0 = disable, -1 = auto)");
> +MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (2 = advanced tdr mode, 1 = enable, 0 = disable, -1 = auto)");
>   module_param_named(gpu_recovery, amdgpu_gpu_recovery, int, 0444);
>
>   /**
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index d82a7ebf6099..99a6a8bddd6f 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -395,6 +395,81 @@ void drm_sched_increase_karma(struct drm_sched_job *bad)
>   }
>   EXPORT_SYMBOL(drm_sched_increase_karma);
>
> +
> +void drm_sched_resubmit_jobs2(struct drm_gpu_scheduler *sched, int max)
> +{
> +     struct drm_sched_job *s_job, *tmp;
> +     uint64_t guilty_context;
> +     bool found_guilty = false;
> +     struct dma_fence *fence;
> +     int i = 0;
> +
> +     list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) {
> +             struct drm_sched_fence *s_fence = s_job->s_fence;
> +
> +             if (i>=max)
> +                     break;
> +
> +             if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
> +                     found_guilty = true;
> +                     guilty_context = s_job->s_fence->scheduled.context;
> +             }
> +
> +             if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
> +                     dma_fence_set_error(&s_fence->finished, -ECANCELED);
> +
> +             dma_fence_put(s_job->s_fence->parent);
> +             fence = sched->ops->run_job(s_job);
> +             i++;
> +
> +             if (IS_ERR_OR_NULL(fence)) {
> +                     if (IS_ERR(fence))
> +                             dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
> +
> +                     s_job->s_fence->parent = NULL;
> +             } else {
> +                     s_job->s_fence->parent = fence;
> +             }
> +     }
> +}
> +EXPORT_SYMBOL(drm_sched_resubmit_jobs2);
> +
> +
> +
> +void drm_sched_reset_karma(struct drm_sched_job *bad)
> +{
> +     int i;
> +     struct drm_sched_entity *tmp;
> +     struct drm_sched_entity *entity;
> +     struct drm_gpu_scheduler *sched = bad->sched;
> +
> +     /* don't reset @bad's karma if it's from KERNEL RQ,
> +      * because sometimes GPU hang would cause kernel jobs (like VM updating jobs)
> +      * corrupt but keep in mind that kernel jobs always considered good.
> +      */
> +     if (bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) {
> +             atomic_set(&bad->karma, 0);
> +             for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_KERNEL;
> +                  i++) {
> +                     struct drm_sched_rq *rq = &sched->sched_rq[i];
> +
> +                     spin_lock(&rq->lock);
> +                     list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
> +                             if (bad->s_fence->scheduled.context ==
> +                                 entity->fence_context) {
> +                                             if (entity->guilty)
> +                                                     atomic_set(entity->guilty, 0);
> +                                     break;
> +                             }
> +                     }
> +                     spin_unlock(&rq->lock);
> +                     if (&entity->list != &rq->entities)
> +                             break;
> +             }
> +     }
> +}
> +EXPORT_SYMBOL(drm_sched_reset_karma);
> +
>   /**
>    * drm_sched_stop - stop the scheduler
>    *
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index 1c815e0a14ed..01c609149731 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -321,7 +321,9 @@ void drm_sched_wakeup(struct drm_gpu_scheduler *sched);
>   void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad);
>   void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery);
>   void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched);
> +void drm_sched_resubmit_jobs2(struct drm_gpu_scheduler *sched, int max);
>   void drm_sched_increase_karma(struct drm_sched_job *bad);
> +void drm_sched_reset_karma(struct drm_sched_job *bad);
>   bool drm_sched_dependency_optimized(struct dma_fence* fence,
>                                    struct drm_sched_entity *entity);
>   void drm_sched_fault(struct drm_gpu_scheduler *sched);


[-- Attachment #1.2: Type: text/html, Size: 32526 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v4] drm/amd/amdgpu implement tdr advanced mode
  2021-03-10 13:36   ` Zhang, Jack (Jian)
@ 2021-03-10 13:49     ` Christian König
  2021-03-10 14:05       ` Zhang, Jack (Jian)
  0 siblings, 1 reply; 6+ messages in thread
From: Christian König @ 2021-03-10 13:49 UTC (permalink / raw)
  To: Zhang, Jack (Jian), amd-gfx, Grodzovsky, Andrey, Liu, Monk, Deng, Emily


[-- Attachment #1.1: Type: text/plain, Size: 15011 bytes --]

Andrey needs to review the rest, but essentially I don't see the reason 
why you need this drm_sched_resubmit_jobs2().

Christian.

Am 10.03.21 um 14:36 schrieb Zhang, Jack (Jian):
>
> [AMD Official Use Only - Internal Distribution Only]
>
>
> Thanks Christian, just did the checkpatch scan.  Please see the V6 patch
>
> /Jack
>
>
> ------------------------------------------------------------------------
> *From:* Koenig, Christian <Christian.Koenig@amd.com>
> *Sent:* Wednesday, March 10, 2021 8:54:53 PM
> *To:* Zhang, Jack (Jian) <Jack.Zhang1@amd.com>; 
> amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>; 
> Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Liu, Monk 
> <Monk.Liu@amd.com>; Deng, Emily <Emily.Deng@amd.com>
> *Subject:* Re: [PATCH v4] drm/amd/amdgpu implement tdr advanced mode
> Am 10.03.21 um 12:19 schrieb Jack Zhang:
> > [Why]
> > Previous tdr design treats the first job in job_timeout as the bad job.
> > But sometimes a later bad compute job can block a good gfx job and
> > cause an unexpected gfx job timeout because gfx and compute ring share
> > internal GC HW mutually.
> >
> > [How]
> > This patch implements an advanced tdr mode. It involves an additional
> > synchronous pre-resubmit step(Step0 Resubmit) before normal resubmit
> > step in order to find the real bad job.
> >
> > 1. At Step0 Resubmit stage, it synchronously submits and pends for the
> > first job being signaled. If it gets timeout, we identify it as guilty
> > and do hw reset. After that, we would do the normal resubmit step to
> > resubmit left jobs.
> >
> > 2. Re-insert Bailing job to mirror_list, and leave it to be handled by
> > the main reset thread.
> >
> > 3. For whole gpu reset(vram lost), do resubmit as the old way.
> >
> > Signed-off-by: Jack Zhang <Jack.Zhang1@amd.com>
> > Change-Id: I408357f10b9034caaa1b83610e19e514c5fbaaf2
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 73 ++++++++++++++++++++-
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  2 +-
> >   drivers/gpu/drm/scheduler/sched_main.c     | 75 ++++++++++++++++++++++
> >   include/drm/gpu_scheduler.h                |  2 +
> >   4 files changed, 148 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > index e247c3a2ec08..02056355a055 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > @@ -4639,7 +4639,7 @@ int amdgpu_device_gpu_recover(struct 
> amdgpu_device *adev,
> >        int i, r = 0;
> >        bool need_emergency_restart = false;
> >        bool audio_suspended = false;
> > -
> > +     int     tmp_vram_lost_counter;
>
> Please keep the empty line between declaration and code.
>
> In general give that patch a pass with checkpatch.pl since there are a
> couple of other places which need fixing at first glance.
>
> Christian.
>
>
> >        /*
> >         * Special case: RAS triggered and full reset isn't supported
> >         */
> > @@ -4689,9 +4689,14 @@ int amdgpu_device_gpu_recover(struct 
> amdgpu_device *adev,
> >                dev_info(adev->dev, "Bailing on TDR for s_job:%llx, 
> as another already in progress",
> >                                        job ? job->base.id : -1);
> >
> > -             /* even we skipped this reset, still need to set the 
> job to guilty */
> > -             if (job)
> > +             if (job) {
> > +                     /* re-insert node to avoid memory leak */
> > + spin_lock(&job->base.sched->job_list_lock);
> > + list_add(&job->base.node, &job->base.sched->pending_list);
> > + spin_unlock(&job->base.sched->job_list_lock);
> > +                     /* even we skipped this reset, still need to 
> set the job to guilty */
> > drm_sched_increase_karma(&job->base);
> > +             }
> >                goto skip_recovery;
> >        }
> >
> > @@ -4788,6 +4793,7 @@ int amdgpu_device_gpu_recover(struct 
> amdgpu_device *adev,
> >                }
> >        }
> >
> > +     tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
> >        /* Actual ASIC resets if needed.*/
> >        /* TODO Implement XGMI hive reset logic for SRIOV */
> >        if (amdgpu_sriov_vf(adev)) {
> > @@ -4805,6 +4811,67 @@ int amdgpu_device_gpu_recover(struct 
> amdgpu_device *adev,
> >        /* Post ASIC reset for all devs .*/
> >        list_for_each_entry(tmp_adev, device_list_handle, 
> gmc.xgmi.head) {
> >
> > +             if (amdgpu_gpu_recovery == 2 &&
> > +                     !(tmp_vram_lost_counter < 
> atomic_read(&adev->vram_lost_counter))) {
> > +
> > +                     for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> > +                             struct amdgpu_ring *ring = 
> tmp_adev->rings[i];
> > +                             int ret = 0;
> > +                             struct drm_sched_job *s_job;
> > +
> > +                             if (!ring || !ring->sched.thread)
> > +                                     continue;
> > +
> > +                             /* No point to resubmit jobs if we 
> didn't HW reset*/
> > +                             if (!tmp_adev->asic_reset_res && 
> !job_signaled) {
> > +
> > +                                     s_job = 
> list_first_entry_or_null(&ring->sched.pending_list, struct 
> drm_sched_job, list);
> > +                                     if (s_job == NULL)
> > + continue;
> > +
> > +                                     /* clear job's guilty and 
> depend the following step to decide the real one */
> > + drm_sched_reset_karma(s_job);
> > + drm_sched_resubmit_jobs2(&ring->sched, 1);
> > +                                     ret = 
> dma_fence_wait_timeout(s_job->s_fence->parent, false, 
> ring->sched.timeout);
> > +
> > +                                     if (ret == 0) { /* timeout */
> > + DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n", 
> ring->sched.name, s_job->id);
> > +                                             /* set guilty */
> > + drm_sched_increase_karma(s_job);
> > +
> > +                                             /* do hw reset */
> > +                                             if 
> (amdgpu_sriov_vf(adev)) {
> > + amdgpu_virt_fini_data_exchange(adev);
> > +                                                     r = 
> amdgpu_device_reset_sriov(adev, false);
> > +                                                     if (r)
> > + adev->asic_reset_res = r;
> > +                                             } else {
> > +                                                     r = 
> amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
> > +                                                     if (r && r == 
> -EAGAIN)
> > + goto retry;
> > +                                             }
> > +
> > +                                             /* add reset counter 
> so that the following resubmitted job could flush vmid */
> > + atomic_inc(&tmp_adev->gpu_reset_counter);
> > + continue;
> > +                                     }
> > +
> > +                                     /* got the hw fence, signal 
> finished fence */
> > + atomic_dec(&ring->sched.num_jobs);
> > + dma_fence_get(&s_job->s_fence->finished);
> > + dma_fence_signal(&s_job->s_fence->finished);
> > + dma_fence_put(&s_job->s_fence->finished);
> > +
> > +
> > +                                     /* remove node from list and 
> free the job */
> > + spin_lock(&ring->sched.job_list_lock);
> > + list_del_init(&s_job->node);
> > + spin_unlock(&ring->sched.job_list_lock);
> > + ring->sched.ops->free_job(s_job);
> > +                             }
> > +                     }
> > +             }
> > +
> >                for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> >                        struct amdgpu_ring *ring = tmp_adev->rings[i];
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > index 865f924772b0..9c3f4edb7532 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > @@ -509,7 +509,7 @@ module_param_named(compute_multipipe, 
> amdgpu_compute_multipipe, int, 0444);
> >    * DOC: gpu_recovery (int)
> >    * Set to enable GPU recovery mechanism (1 = enable, 0 = disable). 
> The default is -1 (auto, disabled except SRIOV).
> >    */
> > -MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (1 = 
> enable, 0 = disable, -1 = auto)");
> > +MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (2 = 
> advanced tdr mode, 1 = enable, 0 = disable, -1 = auto)");
> >   module_param_named(gpu_recovery, amdgpu_gpu_recovery, int, 0444);
> >
> >   /**
> > diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
> b/drivers/gpu/drm/scheduler/sched_main.c
> > index d82a7ebf6099..99a6a8bddd6f 100644
> > --- a/drivers/gpu/drm/scheduler/sched_main.c
> > +++ b/drivers/gpu/drm/scheduler/sched_main.c
> > @@ -395,6 +395,81 @@ void drm_sched_increase_karma(struct 
> drm_sched_job *bad)
> >   }
> >   EXPORT_SYMBOL(drm_sched_increase_karma);
> >
> > +
> > +void drm_sched_resubmit_jobs2(struct drm_gpu_scheduler *sched, int max)
> > +{
> > +     struct drm_sched_job *s_job, *tmp;
> > +     uint64_t guilty_context;
> > +     bool found_guilty = false;
> > +     struct dma_fence *fence;
> > +     int i = 0;
> > +
> > +     list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) {
> > +             struct drm_sched_fence *s_fence = s_job->s_fence;
> > +
> > +             if (i>=max)
> > +                     break;
> > +
> > +             if (!found_guilty && atomic_read(&s_job->karma) > 
> sched->hang_limit) {
> > +                     found_guilty = true;
> > +                     guilty_context = 
> s_job->s_fence->scheduled.context;
> > +             }
> > +
> > +             if (found_guilty && s_job->s_fence->scheduled.context 
> == guilty_context)
> > + dma_fence_set_error(&s_fence->finished, -ECANCELED);
> > +
> > + dma_fence_put(s_job->s_fence->parent);
> > +             fence = sched->ops->run_job(s_job);
> > +             i++;
> > +
> > +             if (IS_ERR_OR_NULL(fence)) {
> > +                     if (IS_ERR(fence))
> > + dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
> > +
> > +                     s_job->s_fence->parent = NULL;
> > +             } else {
> > +                     s_job->s_fence->parent = fence;
> > +             }
> > +     }
> > +}
> > +EXPORT_SYMBOL(drm_sched_resubmit_jobs2);
> > +
> > +
> > +
> > +void drm_sched_reset_karma(struct drm_sched_job *bad)
> > +{
> > +     int i;
> > +     struct drm_sched_entity *tmp;
> > +     struct drm_sched_entity *entity;
> > +     struct drm_gpu_scheduler *sched = bad->sched;
> > +
> > +     /* don't reset @bad's karma if it's from KERNEL RQ,
> > +      * because sometimes GPU hang would cause kernel jobs (like VM 
> updating jobs)
> > +      * corrupt but keep in mind that kernel jobs always considered 
> good.
> > +      */
> > +     if (bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) {
> > +             atomic_set(&bad->karma, 0);
> > +             for (i = DRM_SCHED_PRIORITY_MIN; i < 
> DRM_SCHED_PRIORITY_KERNEL;
> > +                  i++) {
> > +                     struct drm_sched_rq *rq = &sched->sched_rq[i];
> > +
> > +                     spin_lock(&rq->lock);
> > + list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
> > +                             if (bad->s_fence->scheduled.context ==
> > + entity->fence_context) {
> > +                                             if (entity->guilty)
> > + atomic_set(entity->guilty, 0);
> > +                                     break;
> > +                             }
> > +                     }
> > + spin_unlock(&rq->lock);
> > +                     if (&entity->list != &rq->entities)
> > +                             break;
> > +             }
> > +     }
> > +}
> > +EXPORT_SYMBOL(drm_sched_reset_karma);
> > +
> >   /**
> >    * drm_sched_stop - stop the scheduler
> >    *
> > diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> > index 1c815e0a14ed..01c609149731 100644
> > --- a/include/drm/gpu_scheduler.h
> > +++ b/include/drm/gpu_scheduler.h
> > @@ -321,7 +321,9 @@ void drm_sched_wakeup(struct drm_gpu_scheduler 
> *sched);
> >   void drm_sched_stop(struct drm_gpu_scheduler *sched, struct 
> drm_sched_job *bad);
> >   void drm_sched_start(struct drm_gpu_scheduler *sched, bool 
> full_recovery);
> >   void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched);
> > +void drm_sched_resubmit_jobs2(struct drm_gpu_scheduler *sched, int 
> max);
> >   void drm_sched_increase_karma(struct drm_sched_job *bad);
> > +void drm_sched_reset_karma(struct drm_sched_job *bad);
> >   bool drm_sched_dependency_optimized(struct dma_fence* fence,
> >                                    struct drm_sched_entity *entity);
> >   void drm_sched_fault(struct drm_gpu_scheduler *sched);
>


[-- Attachment #1.2: Type: text/html, Size: 40315 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v4] drm/amd/amdgpu implement tdr advanced mode
  2021-03-10 13:49     ` Christian König
@ 2021-03-10 14:05       ` Zhang, Jack (Jian)
  2021-03-10 14:09         ` Christian König
  0 siblings, 1 reply; 6+ messages in thread
From: Zhang, Jack (Jian) @ 2021-03-10 14:05 UTC (permalink / raw)
  To: amd-gfx, Grodzovsky, Andrey, Liu, Monk, Deng, Emily, Koenig, Christian


[-- Attachment #1.1: Type: text/plain, Size: 14544 bytes --]

[AMD Official Use Only - Internal Distribution Only]


As Monk previously explained in another mail session. Because the drm_sched_resubmit_jobs submit all jobs which does not meet our needs.

We need a new APi  drm_sched_resubmit_jobs2()
to submit the first job of a ring. (also we can leverage the error handling for Null or Error fence inside this function ).

It seems clean to use this jobs2 function than writing the run_job + some error handling in the main logic.


Thanks,
/Jack





________________________________
发件人: Koenig, Christian <Christian.Koenig@amd.com>
发送时间: 2021年3月10日星期三 下午9:49
收件人: Zhang, Jack (Jian); amd-gfx@lists.freedesktop.org; Grodzovsky, Andrey; Liu, Monk; Deng, Emily
主题: Re: [PATCH v4] drm/amd/amdgpu implement tdr advanced mode

Andrey needs to review the reste, but essentially I don't see the reason why you need this drm_sched_resubmit_jobs2().

Christian.

Am 10.03.21 um 14:36 schrieb Zhang, Jack (Jian):

[AMD Official Use Only - Internal Distribution Only]

Thanks Christian, just did the checkpatch scan.  Please see the V6 patch

/Jack


________________________________
From: Koenig, Christian <Christian.Koenig@amd.com><mailto:Christian.Koenig@amd.com>
Sent: Wednesday, March 10, 2021 8:54:53 PM
To: Zhang, Jack (Jian) <Jack.Zhang1@amd.com><mailto:Jack.Zhang1@amd.com>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> <amd-gfx@lists.freedesktop.org><mailto:amd-gfx@lists.freedesktop.org>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com><mailto:Andrey.Grodzovsky@amd.com>; Liu, Monk <Monk.Liu@amd.com><mailto:Monk.Liu@amd.com>; Deng, Emily <Emily.Deng@amd.com><mailto:Emily.Deng@amd.com>
Subject: Re: [PATCH v4] drm/amd/amdgpu implement tdr advanced mode

Am 10.03.21 um 12:19 schrieb Jack Zhang:
> [Why]
> Previous tdr design treats the first job in job_timeout as the bad job.
> But sometimes a later bad compute job can block a good gfx job and
> cause an unexpected gfx job timeout because gfx and compute ring share
> internal GC HW mutually.
>
> [How]
> This patch implements an advanced tdr mode. It involves an additional
> synchronous pre-resubmit step(Step0 Resubmit) before normal resubmit
> step in order to find the real bad job.
>
> 1. At Step0 Resubmit stage, it synchronously submits and pends for the
> first job being signaled. If it gets timeout, we identify it as guilty
> and do hw reset. After that, we would do the normal resubmit step to
> resubmit left jobs.
>
> 2. Re-insert Bailing job to mirror_list, and leave it to be handled by
> the main reset thread.
>
> 3. For whole gpu reset(vram lost), do resubmit as the old way.
>
> Signed-off-by: Jack Zhang <Jack.Zhang1@amd.com><mailto:Jack.Zhang1@amd.com>
> Change-Id: I408357f10b9034caaa1b83610e19e514c5fbaaf2
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 73 ++++++++++++++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  2 +-
>   drivers/gpu/drm/scheduler/sched_main.c     | 75 ++++++++++++++++++++++
>   include/drm/gpu_scheduler.h                |  2 +
>   4 files changed, 148 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index e247c3a2ec08..02056355a055 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4639,7 +4639,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>        int i, r = 0;
>        bool need_emergency_restart = false;
>        bool audio_suspended = false;
> -
> +     int     tmp_vram_lost_counter;

Please keep the empty line between declaration and code.

In general give that patch a pass with checkpatch.pl since there are a
couple of other places which need fixing at first glance.

Christian.


>        /*
>         * Special case: RAS triggered and full reset isn't supported
>         */
> @@ -4689,9 +4689,14 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>                dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
>                                        job ? job->base.id : -1);
>
> -             /* even we skipped this reset, still need to set the job to guilty */
> -             if (job)
> +             if (job) {
> +                     /* re-insert node to avoid memory leak */
> +                     spin_lock(&job->base.sched->job_list_lock);
> +                     list_add(&job->base.node, &job->base.sched->pending_list);
> +                     spin_unlock(&job->base.sched->job_list_lock);
> +                     /* even we skipped this reset, still need to set the job to guilty */
>                        drm_sched_increase_karma(&job->base);
> +             }
>                goto skip_recovery;
>        }
>
> @@ -4788,6 +4793,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>                }
>        }
>
> +     tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
>        /* Actual ASIC resets if needed.*/
>        /* TODO Implement XGMI hive reset logic for SRIOV */
>        if (amdgpu_sriov_vf(adev)) {
> @@ -4805,6 +4811,67 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>        /* Post ASIC reset for all devs .*/
>        list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>
> +             if (amdgpu_gpu_recovery == 2 &&
> +                     !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter))) {
> +
> +                     for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> +                             struct amdgpu_ring *ring = tmp_adev->rings[i];
> +                             int ret = 0;
> +                             struct drm_sched_job *s_job;
> +
> +                             if (!ring || !ring->sched.thread)
> +                                     continue;
> +
> +                             /* No point to resubmit jobs if we didn't HW reset*/
> +                             if (!tmp_adev->asic_reset_res && !job_signaled) {
> +
> +                                     s_job = list_first_entry_or_null(&ring->sched.pending_list, struct drm_sched_job, list);
> +                                     if (s_job == NULL)
> +                                             continue;
> +
> +                                     /* clear job's guilty and depend the following step to decide the real one */
> +                                     drm_sched_reset_karma(s_job);
> +                                     drm_sched_resubmit_jobs2(&ring->sched, 1);
> +                                     ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
> +
> +                                     if (ret == 0) { /* timeout */
> +                                             DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n", ring->sched.name, s_job->id);
> +                                             /* set guilty */
> +                                             drm_sched_increase_karma(s_job);
> +
> +                                             /* do hw reset */
> +                                             if (amdgpu_sriov_vf(adev)) {
> +                                                     amdgpu_virt_fini_data_exchange(adev);
> +                                                     r = amdgpu_device_reset_sriov(adev, false);
> +                                                     if (r)
> +                                                             adev->asic_reset_res = r;
> +                                             } else {
> +                                                     r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
> +                                                     if (r && r == -EAGAIN)
> +                                                             goto retry;
> +                                             }
> +
> +                                             /* add reset counter so that the following resubmitted job could flush vmid */
> +                                             atomic_inc(&tmp_adev->gpu_reset_counter);
> +                                             continue;
> +                                     }
> +
> +                                     /* got the hw fence, signal finished fence */
> +                                     atomic_dec(&ring->sched.num_jobs);
> +                                     dma_fence_get(&s_job->s_fence->finished);
> +                                     dma_fence_signal(&s_job->s_fence->finished);
> +                                     dma_fence_put(&s_job->s_fence->finished);
> +
> +
> +                                     /* remove node from list and free the job */
> +                                     spin_lock(&ring->sched.job_list_lock);
> +                                     list_del_init(&s_job->node);
> +                                     spin_unlock(&ring->sched.job_list_lock);
> +                                     ring->sched.ops->free_job(s_job);
> +                             }
> +                     }
> +             }
> +
>                for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>                        struct amdgpu_ring *ring = tmp_adev->rings[i];
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 865f924772b0..9c3f4edb7532 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -509,7 +509,7 @@ module_param_named(compute_multipipe, amdgpu_compute_multipipe, int, 0444);
>    * DOC: gpu_recovery (int)
>    * Set to enable GPU recovery mechanism (1 = enable, 0 = disable). The default is -1 (auto, disabled except SRIOV).
>    */
> -MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (1 = enable, 0 = disable, -1 = auto)");
> +MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (2 = advanced tdr mode, 1 = enable, 0 = disable, -1 = auto)");
>   module_param_named(gpu_recovery, amdgpu_gpu_recovery, int, 0444);
>
>   /**
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index d82a7ebf6099..99a6a8bddd6f 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -395,6 +395,81 @@ void drm_sched_increase_karma(struct drm_sched_job *bad)
>   }
>   EXPORT_SYMBOL(drm_sched_increase_karma);
>
> +
> +void drm_sched_resubmit_jobs2(struct drm_gpu_scheduler *sched, int max)
> +{
> +     struct drm_sched_job *s_job, *tmp;
> +     uint64_t guilty_context;
> +     bool found_guilty = false;
> +     struct dma_fence *fence;
> +     int i = 0;
> +
> +     list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) {
> +             struct drm_sched_fence *s_fence = s_job->s_fence;
> +
> +             if (i>=max)
> +                     break;
> +
> +             if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
> +                     found_guilty = true;
> +                     guilty_context = s_job->s_fence->scheduled.context;
> +             }
> +
> +             if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
> +                     dma_fence_set_error(&s_fence->finished, -ECANCELED);
> +
> +             dma_fence_put(s_job->s_fence->parent);
> +             fence = sched->ops->run_job(s_job);
> +             i++;
> +
> +             if (IS_ERR_OR_NULL(fence)) {
> +                     if (IS_ERR(fence))
> +                             dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
> +
> +                     s_job->s_fence->parent = NULL;
> +             } else {
> +                     s_job->s_fence->parent = fence;
> +             }
> +     }
> +}
> +EXPORT_SYMBOL(drm_sched_resubmit_jobs2);
> +
> +
> +
> +void drm_sched_reset_karma(struct drm_sched_job *bad)
> +{
> +     int i;
> +     struct drm_sched_entity *tmp;
> +     struct drm_sched_entity *entity;
> +     struct drm_gpu_scheduler *sched = bad->sched;
> +
> +     /* don't reset @bad's karma if it's from KERNEL RQ,
> +      * because sometimes GPU hang would cause kernel jobs (like VM updating jobs)
> +      * corrupt but keep in mind that kernel jobs always considered good.
> +      */
> +     if (bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) {
> +             atomic_set(&bad->karma, 0);
> +             for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_KERNEL;
> +                  i++) {
> +                     struct drm_sched_rq *rq = &sched->sched_rq[i];
> +
> +                     spin_lock(&rq->lock);
> +                     list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
> +                             if (bad->s_fence->scheduled.context ==
> +                                 entity->fence_context) {
> +                                             if (entity->guilty)
> +                                                     atomic_set(entity->guilty, 0);
> +                                     break;
> +                             }
> +                     }
> +                     spin_unlock(&rq->lock);
> +                     if (&entity->list != &rq->entities)
> +                             break;
> +             }
> +     }
> +}
> +EXPORT_SYMBOL(drm_sched_reset_karma);
> +
>   /**
>    * drm_sched_stop - stop the scheduler
>    *
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index 1c815e0a14ed..01c609149731 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -321,7 +321,9 @@ void drm_sched_wakeup(struct drm_gpu_scheduler *sched);
>   void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad);
>   void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery);
>   void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched);
> +void drm_sched_resubmit_jobs2(struct drm_gpu_scheduler *sched, int max);
>   void drm_sched_increase_karma(struct drm_sched_job *bad);
> +void drm_sched_reset_karma(struct drm_sched_job *bad);
>   bool drm_sched_dependency_optimized(struct dma_fence* fence,
>                                    struct drm_sched_entity *entity);
>   void drm_sched_fault(struct drm_gpu_scheduler *sched);




[-- Attachment #1.2: Type: text/html, Size: 36910 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v4] drm/amd/amdgpu implement tdr advanced mode
  2021-03-10 14:05       ` Zhang, Jack (Jian)
@ 2021-03-10 14:09         ` Christian König
  0 siblings, 0 replies; 6+ messages in thread
From: Christian König @ 2021-03-10 14:09 UTC (permalink / raw)
  To: Zhang, Jack (Jian), amd-gfx, Grodzovsky, Andrey, Liu, Monk, Deng, Emily


[-- Attachment #1.1: Type: text/plain, Size: 15227 bytes --]

Yeah, but wouldn't it be feasible to extend the existing function with 
parameters what to do like Andrey suggested? (It's possible that I just 
missed the response for this).

Christian.

Am 10.03.21 um 15:05 schrieb Zhang, Jack (Jian):
>
> [AMD Official Use Only - Internal Distribution Only]
>
>
>
> As Monk previously explained in another mail session. Because the 
> drm_sched_resubmit_jobs submit all jobs which does not meet our needs.
>
> We need a new APi  drm_sched_resubmit_jobs2()
> to submit the first job of a ring. (also we can leverage the error 
> handling for Null or Error fence inside this function ).
>
> It seems clean to use this jobs2 function than writing the run_job + 
> some error handling in the main logic.
>
>
> Thanks,
> /Jack
>
>
>
>
>
> ------------------------------------------------------------------------
> *发件人:* Koenig, Christian <Christian.Koenig@amd.com>
> *发送时间:* 2021年3月10日星期三 下午9:49
> *收件人:* Zhang, Jack (Jian); amd-gfx@lists.freedesktop.org; Grodzovsky, 
> Andrey; Liu, Monk; Deng, Emily
> *主题:* Re: [PATCH v4] drm/amd/amdgpu implement tdr advanced mode
>
> Andrey needs to review the reste, but essentially I don't see the 
> reason why you need this drm_sched_resubmit_jobs2().
>
> Christian.
>
> Am 10.03.21 um 14:36 schrieb Zhang, Jack (Jian):
>>
>> [AMD Official Use Only - Internal Distribution Only]
>>
>>
>> Thanks Christian, just did the checkpatch scan.  Please see the V6 patch
>>
>> /Jack
>>
>>
>> ------------------------------------------------------------------------
>> *From:* Koenig, Christian <Christian.Koenig@amd.com>
>> *Sent:* Wednesday, March 10, 2021 8:54:53 PM
>> *To:* Zhang, Jack (Jian) <Jack.Zhang1@amd.com>; 
>> amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>; 
>> Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Liu, Monk 
>> <Monk.Liu@amd.com>; Deng, Emily <Emily.Deng@amd.com>
>> *Subject:* Re: [PATCH v4] drm/amd/amdgpu implement tdr advanced mode
>> Am 10.03.21 um 12:19 schrieb Jack Zhang:
>> > [Why]
>> > Previous tdr design treats the first job in job_timeout as the bad job.
>> > But sometimes a later bad compute job can block a good gfx job and
>> > cause an unexpected gfx job timeout because gfx and compute ring share
>> > internal GC HW mutually.
>> >
>> > [How]
> >> > This patch implements an advanced tdr mode. It involves an additional
>> > synchronous pre-resubmit step(Step0 Resubmit) before normal resubmit
>> > step in order to find the real bad job.
>> >
> >> > 1. At Step0 Resubmit stage, it synchronously submits and waits for the
>> > first job being signaled. If it gets timeout, we identify it as guilty
>> > and do hw reset. After that, we would do the normal resubmit step to
> >> > resubmit the remaining jobs.
>> >
>> > 2. Re-insert Bailing job to mirror_list, and leave it to be handled by
>> > the main reset thread.
>> >
>> > 3. For whole gpu reset(vram lost), do resubmit as the old way.
>> >
>> > Signed-off-by: Jack Zhang <Jack.Zhang1@amd.com>
>> > Change-Id: I408357f10b9034caaa1b83610e19e514c5fbaaf2
>> > ---
>> >   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 73 ++++++++++++++++++++-
>> >   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  2 +-
>> >   drivers/gpu/drm/scheduler/sched_main.c | 75 ++++++++++++++++++++++
>> >   include/drm/gpu_scheduler.h |  2 +
>> >   4 files changed, 148 insertions(+), 4 deletions(-)
>> >
>> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> > index e247c3a2ec08..02056355a055 100644
>> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> > @@ -4639,7 +4639,7 @@ int amdgpu_device_gpu_recover(struct 
>> amdgpu_device *adev,
>> >        int i, r = 0;
>> >        bool need_emergency_restart = false;
>> >        bool audio_suspended = false;
>> > -
>> > +     int     tmp_vram_lost_counter;
>>
>> Please keep the empoty line between declaration and code.
>>
>> In general give that patch a pass with checkpath.pl since there are a
>> couple of other place which needs fixing at first glance.
>>
>> Christian.
>>
>>
>> >        /*
>> >         * Special case: RAS triggered and full reset isn't supported
>> >         */
>> > @@ -4689,9 +4689,14 @@ int amdgpu_device_gpu_recover(struct 
>> amdgpu_device *adev,
>> >                dev_info(adev->dev, "Bailing on TDR for s_job:%llx, 
>> as another already in progress",
>> >                                        job ? job->base.id : -1);
>> >
>> > -             /* even we skipped this reset, still need to set the 
>> job to guilty */
>> > -             if (job)
>> > +             if (job) {
>> > +                     /* re-insert node to avoid memory leak */
>> > + spin_lock(&job->base.sched->job_list_lock);
>> > + list_add(&job->base.node, &job->base.sched->pending_list);
>> > + spin_unlock(&job->base.sched->job_list_lock);
>> > +                     /* even we skipped this reset, still need to 
>> set the job to guilty */
>> > drm_sched_increase_karma(&job->base);
>> > +             }
>> >                goto skip_recovery;
>> >        }
>> >
>> > @@ -4788,6 +4793,7 @@ int amdgpu_device_gpu_recover(struct 
>> amdgpu_device *adev,
>> >                }
>> >        }
>> >
>> > +     tmp_vram_lost_counter = 
>> atomic_read(&((adev)->vram_lost_counter));
>> >        /* Actual ASIC resets if needed.*/
>> >        /* TODO Implement XGMI hive reset logic for SRIOV */
>> >        if (amdgpu_sriov_vf(adev)) {
>> > @@ -4805,6 +4811,67 @@ int amdgpu_device_gpu_recover(struct 
>> amdgpu_device *adev,
>> >        /* Post ASIC reset for all devs .*/
>> >        list_for_each_entry(tmp_adev, device_list_handle, 
>> gmc.xgmi.head) {
>> >
>> > +             if (amdgpu_gpu_recovery == 2 &&
>> > +                     !(tmp_vram_lost_counter < 
>> atomic_read(&adev->vram_lost_counter))) {
>> > +
>> > +                     for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> > +                             struct amdgpu_ring *ring = 
>> tmp_adev->rings[i];
>> > +                             int ret = 0;
>> > +                             struct drm_sched_job *s_job;
>> > +
>> > +                             if (!ring || !ring->sched.thread)
>> > + continue;
>> > +
>> > +                             /* No point to resubmit jobs if we 
>> didn't HW reset*/
>> > +                             if (!tmp_adev->asic_reset_res && 
>> !job_signaled) {
>> > +
>> > +                                     s_job = 
>> list_first_entry_or_null(&ring->sched.pending_list, struct 
>> drm_sched_job, list);
>> > +                                     if (s_job == NULL)
>> > + continue;
>> > +
>> > +                                     /* clear job's guilty and 
>> depend the folowing step to decide the real one */
>> > + drm_sched_reset_karma(s_job);
>> > + drm_sched_resubmit_jobs2(&ring->sched, 1);
>> > +                                     ret = 
>> dma_fence_wait_timeout(s_job->s_fence->parent, false, 
>> ring->sched.timeout);
>> > +
>> > +                                     if (ret == 0) { /* timeout */
>> > + DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n", 
>> ring->sched.name, s_job->id);
>> > + /* set guilty */
>> > + drm_sched_increase_karma(s_job);
>> > +
>> > + /* do hw reset */
>> > + if (amdgpu_sriov_vf(adev)) {
>> > + amdgpu_virt_fini_data_exchange(adev);
>> > + r = amdgpu_device_reset_sriov(adev, false);
>> > + if (r)
>> > + adev->asic_reset_res = r;
>> > + } else {
>> > + r  = amdgpu_do_asic_reset(hive, device_list_handle, 
>> &need_full_reset, false);
>> > + if (r && r == -EAGAIN)
>> > + goto retry;
>> > + }
>> > +
>> > + /* add reset counter so that the following resubmitted job could 
>> flush vmid */
>> > + atomic_inc(&tmp_adev->gpu_reset_counter);
>> > + continue;
>> > +                                     }
>> > +
>> > +                                     /* got the hw fence, signal 
>> finished fence */
>> > + atomic_dec(&ring->sched.num_jobs);
>> > + dma_fence_get(&s_job->s_fence->finished);
>> > + dma_fence_signal(&s_job->s_fence->finished);
>> > + dma_fence_put(&s_job->s_fence->finished);
>> > +
>> > +
>> > +                                     /* remove node from list and 
>> free the job */
>> > + spin_lock(&ring->sched.job_list_lock);
>> > + list_del_init(&s_job->node);
>> > + spin_unlock(&ring->sched.job_list_lock);
>> > + ring->sched.ops->free_job(s_job);
>> > +                             }
>> > +                     }
>> > +             }
>> > +
>> >                for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> >                        struct amdgpu_ring *ring = tmp_adev->rings[i];
>> >
>> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> > index 865f924772b0..9c3f4edb7532 100644
>> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> > @@ -509,7 +509,7 @@ module_param_named(compute_multipipe, 
>> amdgpu_compute_multipipe, int, 0444);
>> >    * DOC: gpu_recovery (int)
>> >    * Set to enable GPU recovery mechanism (1 = enable, 0 = 
>> disable). The default is -1 (auto, disabled except SRIOV).
>> >    */
>> > -MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (1 
>> = enable, 0 = disable, -1 = auto)");
>> > +MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (2 
>> = advanced tdr mode, 1 = enable, 0 = disable, -1 = auto)");
>> >   module_param_named(gpu_recovery, amdgpu_gpu_recovery, int, 0444);
>> >
>> >   /**
>> > diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
>> b/drivers/gpu/drm/scheduler/sched_main.c
>> > index d82a7ebf6099..99a6a8bddd6f 100644
>> > --- a/drivers/gpu/drm/scheduler/sched_main.c
>> > +++ b/drivers/gpu/drm/scheduler/sched_main.c
>> > @@ -395,6 +395,81 @@ void drm_sched_increase_karma(struct 
>> drm_sched_job *bad)
>> >   }
>> >   EXPORT_SYMBOL(drm_sched_increase_karma);
>> >
>> > +
>> > +void drm_sched_resubmit_jobs2(struct drm_gpu_scheduler *sched, int 
>> max)
>> > +{
>> > +     struct drm_sched_job *s_job, *tmp;
>> > +     uint64_t guilty_context;
>> > +     bool found_guilty = false;
>> > +     struct dma_fence *fence;
>> > +     int i = 0;
>> > +
>> > +     list_for_each_entry_safe(s_job, tmp, &sched->pending_list, 
>> list) {
>> > +             struct drm_sched_fence *s_fence = s_job->s_fence;
>> > +
>> > +             if (i>=max)
>> > +                     break;
>> > +
>> > +             if (!found_guilty && atomic_read(&s_job->karma) > 
>> sched->hang_limit) {
>> > +                     found_guilty = true;
>> > +                     guilty_context = 
>> s_job->s_fence->scheduled.context;
>> > +             }
>> > +
>> > +             if (found_guilty && s_job->s_fence->scheduled.context 
>> == guilty_context)
>> > + dma_fence_set_error(&s_fence->finished, -ECANCELED);
>> > +
>> > + dma_fence_put(s_job->s_fence->parent);
>> > +             fence = sched->ops->run_job(s_job);
>> > +             i++;
>> > +
>> > +             if (IS_ERR_OR_NULL(fence)) {
>> > +                     if (IS_ERR(fence))
>> > + dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
>> > +
>> > + s_job->s_fence->parent = NULL;
>> > +             } else {
>> > + s_job->s_fence->parent = fence;
>> > +             }
>> > +     }
>> > +}
>> > +EXPORT_SYMBOL(drm_sched_resubmit_jobs2);
>> > +
>> > +
>> > +
>> > +void drm_sched_reset_karma(struct drm_sched_job *bad)
>> > +{
>> > +     int i;
>> > +     struct drm_sched_entity *tmp;
>> > +     struct drm_sched_entity *entity;
>> > +     struct drm_gpu_scheduler *sched = bad->sched;
>> > +
>> > +     /* don't reset @bad's karma if it's from KERNEL RQ,
>> > +      * because sometimes GPU hang would cause kernel jobs (like 
>> VM updating jobs)
>> > +      * corrupt but keep in mind that kernel jobs always 
>> considered good.
>> > +      */
>> > +     if (bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) {
>> > +             atomic_set(&bad->karma, 0);
>> > +             for (i = DRM_SCHED_PRIORITY_MIN; i < 
>> DRM_SCHED_PRIORITY_KERNEL;
>> > +                  i++) {
>> > +                     struct drm_sched_rq *rq = &sched->sched_rq[i];
>> > +
>> > + spin_lock(&rq->lock);
>> > + list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
>> > +                             if (bad->s_fence->scheduled.context ==
>> > + entity->fence_context) {
>> > + if (entity->guilty)
>> > + atomic_set(entity->guilty, 0);
>> > +                                     break;
>> > +                             }
>> > +                     }
>> > + spin_unlock(&rq->lock);
>> > +                     if (&entity->list != &rq->entities)
>> > +                             break;
>> > +             }
>> > +     }
>> > +}
>> > +EXPORT_SYMBOL(drm_sched_reset_karma);
>> > +
>> >   /**
>> >    * drm_sched_stop - stop the scheduler
>> >    *
>> > diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>> > index 1c815e0a14ed..01c609149731 100644
>> > --- a/include/drm/gpu_scheduler.h
>> > +++ b/include/drm/gpu_scheduler.h
>> > @@ -321,7 +321,9 @@ void drm_sched_wakeup(struct drm_gpu_scheduler 
>> *sched);
>> >   void drm_sched_stop(struct drm_gpu_scheduler *sched, struct 
>> drm_sched_job *bad);
>> >   void drm_sched_start(struct drm_gpu_scheduler *sched, bool 
>> full_recovery);
>> >   void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched);
>> > +void drm_sched_resubmit_jobs2(struct drm_gpu_scheduler *sched, int 
>> max);
>> >   void drm_sched_increase_karma(struct drm_sched_job *bad);
>> > +void drm_sched_reset_karma(struct drm_sched_job *bad);
>> >   bool drm_sched_dependency_optimized(struct dma_fence* fence,
>> >                                    struct drm_sched_entity *entity);
>> >   void drm_sched_fault(struct drm_gpu_scheduler *sched);
>>
>
>


[-- Attachment #1.2: Type: text/html, Size: 48660 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2021-03-10 14:09 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-10 11:19 [PATCH v4] drm/amd/amdgpu implement tdr advanced mode Jack Zhang
2021-03-10 12:54 ` Christian König
2021-03-10 13:36   ` Zhang, Jack (Jian)
2021-03-10 13:49     ` Christian König
2021-03-10 14:05       ` Zhang, Jack (Jian)
2021-03-10 14:09         ` Christian König

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.