* [PATCH v2] drm/amd/amdgpu implement tdr advanced mode
@ 2021-03-06 17:25 Jack Zhang
  2021-03-06 19:03 ` Christian König
  0 siblings, 1 reply; 13+ messages in thread
From: Jack Zhang @ 2021-03-06 17:25 UTC (permalink / raw)
  To: amd-gfx, christian.koenig, Andrey.Grodzovsky, Monk.Liu, Emily.Deng
  Cc: Jack Zhang

[Why]
Previous tdr design treats the first job in job_timeout as the bad job.
But sometimes a later bad compute job can block a good gfx job and
cause an unexpected gfx job timeout because the gfx and compute rings
share internal GC HW.

[How]
This patch implements an advanced tdr mode. It involves an additional
synchronous pre-resubmit step (Step0 Resubmit) before the normal
resubmit step in order to find the real bad job.

1. For a bailing TDR job, re-insert it into the mirror_list, don't set
it guilty, and leave it to be handled by the main reset thread.

2. Don't set the job guilty in pre_asic_reset; leave it to be handled
by the Step0 Resubmit stage.

3. At the Step0 Resubmit stage, first resubmit the jobs asynchronously,
then iterate each ring's mirror_list and synchronously wait for each hw
fence to signal. If a job's hw fence times out, identify that job as
the guilty one and do a hw reset to recover the hw. After that, do the
normal resubmit step to resubmit the remaining jobs.

4. For a whole gpu reset (vram lost), skip Step0 Resubmit since every
job after vram lost is considered a bad job.

5. Introduce the concept of an "Affinity Group".
Doing two hw resets is not necessary when only one ring among several
hw-related rings has jobs. Thus, we introduce the "affinity group":
hw-related rings, such as the gfx and compute rings, can be added to a
common affinity group. When a tdr happens, we iterate all rings in the
affinity group and skip the Step0 Resubmit stage if only one ring's
mirror_list has valid sched jobs.

V2:
    -fix a cherry-pick mistake for bailing TDR handling.

    -do the affinity_group check according to the bad job's sched
     rather than the default "1" so that multiple affinity groups can
     be pre-defined in the future.

Signed-off-by: Jack Zhang <Jack.Zhang1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 102 +++++++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |   2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  47 ++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.h    |   2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c   |  27 ++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |   1 +
 include/drm/gpu_scheduler.h                |   1 +
 7 files changed, 173 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e247c3a2ec08..8632d7071292 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4188,6 +4188,37 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
 	return false;
 }
 
+bool amdgpu_affinity_group_has_only_or_null_working_ring(struct amdgpu_device *adev, struct drm_sched_job *s_job)
+{
+       int i;
+       int working_ring_num = 0;
+
+	/*
+	 * The job is considered as the real bad one
+	 * if job's sched is not in affinity group
+	 */
+	if (s_job->sched.affinity_group == 0)
+			return true;
+
+       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+               struct amdgpu_ring *ring = adev->rings[i];
+
+               if (!ring || !ring->sched.thread)
+                       continue;
+
+               /* for non-empty affinity ring, increase working_ring_num */
+               if (ring->sched.affinity_group == s_job->sched.affinity_group) {
+                       if (!list_empty(&ring->sched.ring_mirror_list))
+                               working_ring_num++;
+               }
+       }
+
+       if (working_ring_num > 1) {
+               return false;
+       }
+       return true;
+}
+
 /**
  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
  *
@@ -4310,8 +4341,10 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 		amdgpu_fence_driver_force_completion(ring);
 	}
 
-	if(job)
-		drm_sched_increase_karma(&job->base);
+	if (amdgpu_gpu_recovery != 2) {
+		if (job)
+			drm_sched_increase_karma(&job->base);
+	}
 
 	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
 	if (!amdgpu_sriov_vf(adev)) {
@@ -4639,7 +4672,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	int i, r = 0;
 	bool need_emergency_restart = false;
 	bool audio_suspended = false;
-
+	int	tmp_vram_lost_counter;
 	/*
 	 * Special case: RAS triggered and full reset isn't supported
 	 */
@@ -4690,8 +4723,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 					job ? job->base.id : -1);
 
 		/* even we skipped this reset, still need to set the job to guilty */
-		if (job)
-			drm_sched_increase_karma(&job->base);
+		if (job) {
+			if (amdgpu_gpu_recovery == 2) {
+				if (&job->base) {
+					spin_lock(&job->base.sched->job_list_lock);
+					list_add(&job->base.node, &job->base.sched->ring_mirror_list);
+					spin_unlock(&job->base.sched->job_list_lock);
+				}
+			} else
+				drm_sched_increase_karma(&job->base);
+		}
 		goto skip_recovery;
 	}
 
@@ -4788,6 +4829,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		}
 	}
 
+	tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
 	/* Actual ASIC resets if needed.*/
 	/* TODO Implement XGMI hive reset logic for SRIOV */
 	if (amdgpu_sriov_vf(adev)) {
@@ -4804,18 +4846,64 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
 	/* Post ASIC reset for all devs .*/
 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+		int step = 1;
 
+		if (amdgpu_gpu_recovery == 2) {
+			if (amdgpu_affinity_group_has_only_or_null_working_ring(adev,&job->base)
+				|| tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)) {
+				DRM_INFO("Skip Stage0 Resubmit Stage\n");
+				/* set guilty */
+				drm_sched_increase_karma(&job->base);
+				step = 1;
+			} else {
+				DRM_INFO("Do Stage0 Resubmit Stage\n");
+				step = 0;
+			}
+		}
+
+retry_resubmit:
 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 			struct amdgpu_ring *ring = tmp_adev->rings[i];
+			int ret = 0;
+			struct drm_sched_job *s_bad_job = NULL;
 
 			if (!ring || !ring->sched.thread)
 				continue;
 
 			/* No point to resubmit jobs if we didn't HW reset*/
-			if (!tmp_adev->asic_reset_res && !job_signaled)
+			if (!tmp_adev->asic_reset_res && !job_signaled) {
+
 				drm_sched_resubmit_jobs(&ring->sched);
 
-			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
+				if (amdgpu_gpu_recovery == 2 && step == 0) {
+					ret = amdgpu_wait_resubmitted_jobs_completion(&ring->sched, ring->sched.timeout, &s_bad_job);
+					if (ret == -1) {
+						DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n", ring->sched.name, s_bad_job->id);
+						/* set guilty */
+						drm_sched_increase_karma(s_bad_job);
+
+						/* do hw reset */
+						if (amdgpu_sriov_vf(adev)) {
+							amdgpu_virt_fini_data_exchange(adev);
+							r = amdgpu_device_reset_sriov(adev, false);
+							if (r)
+								adev->asic_reset_res = r;
+						} else {
+							r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
+							if (r && r == -EAGAIN)
+								goto retry;
+						}
+
+						/* add reset counter so that the following resubmitted job could flush vmid */
+						atomic_inc(&tmp_adev->gpu_reset_counter);
+						step = 1;
+						goto retry_resubmit;
+					}
+				}
+			}
+
+			if (step == 1)
+				drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
 		}
 
 		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 865f924772b0..9c3f4edb7532 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -509,7 +509,7 @@ module_param_named(compute_multipipe, amdgpu_compute_multipipe, int, 0444);
  * DOC: gpu_recovery (int)
  * Set to enable GPU recovery mechanism (1 = enable, 0 = disable). The default is -1 (auto, disabled except SRIOV).
  */
-MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (1 = enable, 0 = disable, -1 = auto)");
+MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (2 = advanced tdr mode, 1 = enable, 0 = disable, -1 = auto)");
 module_param_named(gpu_recovery, amdgpu_gpu_recovery, int, 0444);
 
 /**
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 759b34799221..28cda321157a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -281,6 +281,53 @@ void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched)
 	}
 }
 
+int amdgpu_wait_resubmitted_jobs_completion(struct drm_gpu_scheduler *sched, long timeout, struct drm_sched_job **s_bad_job)
+{
+	struct drm_sched_job *s_job, *tmp;
+	int ret = 0;
+
+	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
+		struct drm_sched_fence *s_fence = s_job->s_fence;
+
+			if (s_fence->parent == NULL) { /* fail to get a hw fence */
+				/* process a job */
+				atomic_dec(&sched->num_jobs);
+				dma_fence_get(&s_fence->finished);
+				dma_fence_signal(&s_fence->finished);
+				dma_fence_put(&s_fence->finished);
+
+				/* remove node from mirror_list and free the job */
+				spin_lock(&sched->job_list_lock);
+				list_del_init(&s_job->node);
+				spin_unlock(&sched->job_list_lock);
+				sched->ops->free_job(s_job);
+				continue;
+			}
+
+			ret = dma_fence_wait_timeout(s_fence->parent, false, timeout);
+
+			if (ret > 0) { /* succeed */
+				/* process a job */
+				atomic_dec(&sched->num_jobs);
+				dma_fence_get(&s_fence->finished);
+				dma_fence_signal(&s_fence->finished);
+				dma_fence_put(&s_fence->finished);
+
+				/* remove node from mirror_list and free the job */
+				spin_lock(&sched->job_list_lock);
+				list_del_init(&s_job->node);
+				spin_unlock(&sched->job_list_lock);
+				sched->ops->free_job(s_job);
+				continue;
+			} else if (ret == 0) {
+				*s_bad_job = s_job;
+				return -1; /* timeout */
+			}
+	}
+
+	return 0;
+}
+
 const struct drm_sched_backend_ops amdgpu_sched_ops = {
 	.dependency = amdgpu_job_dependency,
 	.run_job = amdgpu_job_run,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
index 81caac9b958a..25292f4699fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
@@ -76,5 +76,5 @@ int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring,
 			     struct dma_fence **fence);
 
 void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched);
-
+int amdgpu_wait_resubmitted_jobs_completion(struct drm_gpu_scheduler *sched, long timeout, struct drm_sched_job **s_bad_job);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index b644c78475fd..cb50bfc80bc9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -35,6 +35,11 @@
 #include "amdgpu.h"
 #include "atom.h"
 
+static char *amdgpu_affinity_group[] = {
+"gfx",
+"comp"
+};
+
 /*
  * Rings
  * Most engines on the GPU are fed via ring buffers.  Ring
@@ -189,6 +194,7 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct amdgpu_ring *ring,
 		ring->adev = adev;
 		ring->idx = adev->num_rings++;
 		adev->rings[ring->idx] = ring;
+		amdgpu_ring_set_affinity_group(ring);
 		r = amdgpu_fence_driver_init_ring(ring, sched_hw_submission);
 		if (r)
 			return r;
@@ -459,3 +465,24 @@ int amdgpu_ring_test_helper(struct amdgpu_ring *ring)
 	ring->sched.ready = !r;
 	return r;
 }
+
+int amdgpu_ring_set_affinity_group(struct amdgpu_ring *ring)
+{
+       struct amdgpu_device *adev = ring->adev;
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(amdgpu_affinity_group); i++) {
+               char *temp_name = amdgpu_affinity_group[i];
+
+               /* set ring's affinity_group bit if find it in affinity_group list */
+               if (strncmp(ring->name, temp_name, strlen(temp_name)) == 0) {
+                       DRM_DEV_INFO(adev->dev, "set ring:%s in affinity_group\n",
+                             ring->name);
+                       ring->sched.affinity_group = 1;
+                       return 0;
+               }
+       }
+
+       ring->sched.affinity_group = 0;
+       return 0;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 56acec1075ac..6b0d217e6f5a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -350,4 +350,5 @@ int amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
 			     struct amdgpu_ring *ring);
 void amdgpu_debugfs_ring_fini(struct amdgpu_ring *ring);
 
+int amdgpu_ring_set_affinity_group(struct amdgpu_ring *ring);
 #endif
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 1c815e0a14ed..589cbaea35dc 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -301,6 +301,7 @@ struct drm_gpu_scheduler {
 	atomic_t                        _score;
 	bool				ready;
 	bool				free_guilty;
+	int				affinity_group;
 };
 
 int drm_sched_init(struct drm_gpu_scheduler *sched,
-- 
2.25.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

* Re: [PATCH v2] drm/amd/amdgpu implement tdr advanced mode
  2021-03-06 17:25 [PATCH v2] drm/amd/amdgpu implement tdr advanced mode Jack Zhang
@ 2021-03-06 19:03 ` Christian König
  2021-03-08  4:06   ` Liu, Monk
  0 siblings, 1 reply; 13+ messages in thread
From: Christian König @ 2021-03-06 19:03 UTC (permalink / raw)
  To: Jack Zhang, amd-gfx, Andrey.Grodzovsky, Monk.Liu, Emily.Deng

Hi Jack,

Well, first of all, please completely drop the affinity group stuff
from this patch. We should concentrate on one feature at a time.

Then the implementation is way too complicated. All you need to do is
insert a dma_fence_wait after re-scheduling each job after a reset.
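
Roughly something like the sketch below, just to illustrate the idea
(completely untested, and the helper name is made up): after
drm_sched_resubmit_jobs() you walk the mirror list once and block on
each hardware fence; the first fence that does not signal within the
timeout points at the real offender.

static struct drm_sched_job *
amdgpu_find_hanging_job(struct drm_gpu_scheduler *sched, long timeout)
{
	struct drm_sched_job *s_job;

	list_for_each_entry(s_job, &sched->ring_mirror_list, node) {
		struct dma_fence *parent = s_job->s_fence->parent;

		/* job never made it to the hw ring, nothing to wait for */
		if (!parent)
			continue;

		/* dma_fence_wait_timeout() returns 0 on timeout */
		if (dma_fence_wait_timeout(parent, false, timeout) == 0)
			return s_job;
	}

	return NULL;
}

Then call drm_sched_increase_karma() on whatever that returns and do
the hw reset again, all of it inside amdgpu.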

In addition to that, this feature is completely AMD specific and
shouldn't affect the common scheduler in any way.

Regards,
Christian.

On 06.03.21 at 18:25, Jack Zhang wrote:
> [Why]
> Previous tdr design treats the first job in job_timeout as the bad job.
> But sometimes a later bad compute job can block a good gfx job and
> cause an unexpected gfx job timeout because gfx and compute ring share
> internal GC HW mutually.
>
> [How]
> This patch implements an advanced tdr mode.It involves an additinal
> synchronous pre-resubmit step(Step0 Resubmit) before normal resubmit
> step in order to find the real bad job.
>
> 1. For Bailing TDR job, re-insert it to mirror_list, don't set it to
> guilty and leave it to be handled by the main reset thread.
>
> 2. Don't set the job to guilty in pre_asic_reset, and leave it to be
> handled by Step0 Resubmit Stage.
>
> 3. At Step0 Resubmit stage, it first resubmit jobs asynchronously,
> then it iterate each ring mirror_list, synchronously pend for each hw
> fence being signaled. If the a job's hw fence get timeout, we identify
> it as guilty and do hw reset to recover hw. After that, we would do the
> normal resubmit step to resubmit left jobs.
>
> 4. For whole gpu reset(vram lost), skip Step0 Resubmit as each job
> after vram lost was considered as bad job.
>
> 5. Involve the concept "Affinity Group".
> Doing two hw resets is not necessary when there's only one ring that
> has jobs among some hw-related rings.Thus, we involve "affinity group".
> Hw-related rings could be added into a common affinity group, such as
> gfx and compute ring. When tdr happens, we iterate all rings in
> affinity group, skip Step0 Resubmit stage if there's only one ring's
> mirror_list that has valid sched jobs.
>
> V2:
>      -fix a cherry-pick mistake for bailing TDR handling.
>
>      -do affinity_group check according to the bad job's sched rather
>       than the default "1" so that there could be multiple affinity
>       groups being pre-defined in future.
>
> Signed-off-by: Jack Zhang <Jack.Zhang1@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 102 +++++++++++++++++++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |   2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  47 ++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h    |   2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c   |  27 ++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |   1 +
>   include/drm/gpu_scheduler.h                |   1 +
>   7 files changed, 173 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index e247c3a2ec08..8632d7071292 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4188,6 +4188,37 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
>   	return false;
>   }
>   
> +bool amdgpu_affinity_group_has_only_or_null_working_ring(struct amdgpu_device *adev, struct drm_sched_job *s_job)
> +{
> +       int i;
> +       int working_ring_num = 0;
> +
> +	/*
> +	 * The job is considered as the real bad one
> +	 * if job's sched is not in affinity group
> +	 */
> +	if (s_job->sched.affinity_group == 0)
> +			return true;
> +
> +       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> +               struct amdgpu_ring *ring = adev->rings[i];
> +
> +               if (!ring || !ring->sched.thread)
> +                       continue;
> +
> +               /* for non-empty affinity ring, increase working_ring_num */
> +               if (ring->sched.affinity_group == s_job->sched.affinity_group) {
> +                       if (!list_empty(&ring->sched.ring_mirror_list))
> +                               working_ring_num++;
> +               }
> +       }
> +
> +       if (working_ring_num > 1) {
> +               return false;
> +       }
> +       return true;
> +}
> +
>   /**
>    * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
>    *
> @@ -4310,8 +4341,10 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>   		amdgpu_fence_driver_force_completion(ring);
>   	}
>   
> -	if(job)
> -		drm_sched_increase_karma(&job->base);
> +	if (amdgpu_gpu_recovery != 2) {
> +		if (job)
> +			drm_sched_increase_karma(&job->base);
> +	}
>   
>   	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
>   	if (!amdgpu_sriov_vf(adev)) {
> @@ -4639,7 +4672,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   	int i, r = 0;
>   	bool need_emergency_restart = false;
>   	bool audio_suspended = false;
> -
> +	int	tmp_vram_lost_counter;
>   	/*
>   	 * Special case: RAS triggered and full reset isn't supported
>   	 */
> @@ -4690,8 +4723,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   					job ? job->base.id : -1);
>   
>   		/* even we skipped this reset, still need to set the job to guilty */
> -		if (job)
> -			drm_sched_increase_karma(&job->base);
> +		if (job) {
> +			if (amdgpu_gpu_recovery == 2) {
> +				if (&job->base) {
> +					spin_lock(&job->base.sched->job_list_lock);
> +					list_add(&job->base.node, &job->base.sched->ring_mirror_list);
> +					spin_unlock(&job->base.sched->job_list_lock);
> +				}
> +			} else
> +				drm_sched_increase_karma(&job->base);
> +		}
>   		goto skip_recovery;
>   	}
>   
> @@ -4788,6 +4829,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   		}
>   	}
>   
> +	tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
>   	/* Actual ASIC resets if needed.*/
>   	/* TODO Implement XGMI hive reset logic for SRIOV */
>   	if (amdgpu_sriov_vf(adev)) {
> @@ -4804,18 +4846,64 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   
>   	/* Post ASIC reset for all devs .*/
>   	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
> +		int step = 1;
>   
> +		if (amdgpu_gpu_recovery == 2) {
> +			if (amdgpu_affinity_group_has_only_or_null_working_ring(adev,&job->base)
> +				|| tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)) {
> +				DRM_INFO("Skip Stage0 Resubmit Stage\n");
> +				/* set guilty */
> +				drm_sched_increase_karma(&job->base);
> +				step = 1;
> +			} else {
> +				DRM_INFO("Do Stage0 Resubmit Stage\n");
> +				step = 0;
> +			}
> +		}
> +
> +retry_resubmit:
>   		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>   			struct amdgpu_ring *ring = tmp_adev->rings[i];
> +			int ret = 0;
> +			struct drm_sched_job *s_bad_job = NULL;
>   
>   			if (!ring || !ring->sched.thread)
>   				continue;
>   
>   			/* No point to resubmit jobs if we didn't HW reset*/
> -			if (!tmp_adev->asic_reset_res && !job_signaled)
> +			if (!tmp_adev->asic_reset_res && !job_signaled) {
> +
>   				drm_sched_resubmit_jobs(&ring->sched);
>   
> -			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
> +				if (amdgpu_gpu_recovery == 2 && step == 0) {
> +					ret = amdgpu_wait_resubmitted_jobs_completion(&ring->sched, ring->sched.timeout, &s_bad_job);
> +					if (ret == -1) {
> +						DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n", ring->sched.name, s_bad_job->id);
> +						/* set guilty */
> +						drm_sched_increase_karma(s_bad_job);
> +
> +						/* do hw reset */
> +						if (amdgpu_sriov_vf(adev)) {
> +							amdgpu_virt_fini_data_exchange(adev);
> +							r = amdgpu_device_reset_sriov(adev, false);
> +							if (r)
> +								adev->asic_reset_res = r;
> +						} else {
> +							r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
> +							if (r && r == -EAGAIN)
> +								goto retry;
> +						}
> +
> +						/* add reset counter so that the following resubmitted job could flush vmid */
> +						atomic_inc(&tmp_adev->gpu_reset_counter);
> +						step = 1;
> +						goto retry_resubmit;
> +					}
> +				}
> +			}
> +
> +			if (step == 1)
> +				drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
>   		}
>   
>   		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 865f924772b0..9c3f4edb7532 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -509,7 +509,7 @@ module_param_named(compute_multipipe, amdgpu_compute_multipipe, int, 0444);
>    * DOC: gpu_recovery (int)
>    * Set to enable GPU recovery mechanism (1 = enable, 0 = disable). The default is -1 (auto, disabled except SRIOV).
>    */
> -MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (1 = enable, 0 = disable, -1 = auto)");
> +MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (2 = advanced tdr mode, 1 = enable, 0 = disable, -1 = auto)");
>   module_param_named(gpu_recovery, amdgpu_gpu_recovery, int, 0444);
>   
>   /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 759b34799221..28cda321157a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -281,6 +281,53 @@ void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched)
>   	}
>   }
>   
> +int amdgpu_wait_resubmitted_jobs_completion(struct drm_gpu_scheduler *sched, long timeout, struct drm_sched_job **s_bad_job)
> +{
> +	struct drm_sched_job *s_job, *tmp;
> +	int ret = 0;
> +
> +	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
> +		struct drm_sched_fence *s_fence = s_job->s_fence;
> +
> +			if (s_fence->parent == NULL) { /* fail to get a hw fence */
> +				/* process a job */
> +				atomic_dec(&sched->num_jobs);
> +				dma_fence_get(&s_fence->finished);
> +				dma_fence_signal(&s_fence->finished);
> +				dma_fence_put(&s_fence->finished);
> +
> +				/* remove node from mirror_list and free the job */
> +				spin_lock(&sched->job_list_lock);
> +				list_del_init(&s_job->node);
> +				spin_unlock(&sched->job_list_lock);
> +				sched->ops->free_job(s_job);
> +				continue;
> +			}
> +
> +			ret = dma_fence_wait_timeout(s_fence->parent, false, timeout);
> +
> +			if (ret > 0) { /* succeed */
> +				/* process a job */
> +				atomic_dec(&sched->num_jobs);
> +				dma_fence_get(&s_fence->finished);
> +				dma_fence_signal(&s_fence->finished);
> +				dma_fence_put(&s_fence->finished);
> +
> +				/* remove node from mirror_list and free the job */
> +				spin_lock(&sched->job_list_lock);
> +				list_del_init(&s_job->node);
> +				spin_unlock(&sched->job_list_lock);
> +				sched->ops->free_job(s_job);
> +				continue;
> +			} else if (ret == 0) {
> +				*s_bad_job = s_job;
> +				return -1; /* timeout */
> +			}
> +	}
> +
> +	return 0;
> +}
> +
>   const struct drm_sched_backend_ops amdgpu_sched_ops = {
>   	.dependency = amdgpu_job_dependency,
>   	.run_job = amdgpu_job_run,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> index 81caac9b958a..25292f4699fb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> @@ -76,5 +76,5 @@ int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring,
>   			     struct dma_fence **fence);
>   
>   void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched);
> -
> +int amdgpu_wait_resubmitted_jobs_completion(struct drm_gpu_scheduler *sched, long timeout, struct drm_sched_job **s_bad_job);
>   #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> index b644c78475fd..cb50bfc80bc9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> @@ -35,6 +35,11 @@
>   #include "amdgpu.h"
>   #include "atom.h"
>   
> +static char *amdgpu_affinity_group[] = {
> +"gfx",
> +"comp"
> +};
> +
>   /*
>    * Rings
>    * Most engines on the GPU are fed via ring buffers.  Ring
> @@ -189,6 +194,7 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct amdgpu_ring *ring,
>   		ring->adev = adev;
>   		ring->idx = adev->num_rings++;
>   		adev->rings[ring->idx] = ring;
> +		amdgpu_ring_set_affinity_group(ring);
>   		r = amdgpu_fence_driver_init_ring(ring, sched_hw_submission);
>   		if (r)
>   			return r;
> @@ -459,3 +465,24 @@ int amdgpu_ring_test_helper(struct amdgpu_ring *ring)
>   	ring->sched.ready = !r;
>   	return r;
>   }
> +
> +int amdgpu_ring_set_affinity_group(struct amdgpu_ring *ring)
> +{
> +       struct amdgpu_device *adev = ring->adev;
> +       int i;
> +
> +       for (i = 0; i < ARRAY_SIZE(amdgpu_affinity_group); i++) {
> +               char *temp_name = amdgpu_affinity_group[i];
> +
> +               /* set ring's affinity_group bit if find it in affinity_group list */
> +               if (strncmp(ring->name, temp_name, strlen(temp_name)) == 0) {
> +                       DRM_DEV_INFO(adev->dev, "set ring:%s in affinity_group\n",
> +                             ring->name);
> +                       ring->sched.affinity_group = 1;
> +                       return 0;
> +               }
> +       }
> +
> +       ring->sched.affinity_group = 0;
> +       return 0;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 56acec1075ac..6b0d217e6f5a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -350,4 +350,5 @@ int amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
>   			     struct amdgpu_ring *ring);
>   void amdgpu_debugfs_ring_fini(struct amdgpu_ring *ring);
>   
> +int amdgpu_ring_set_affinity_group(struct amdgpu_ring *ring);
>   #endif
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index 1c815e0a14ed..589cbaea35dc 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -301,6 +301,7 @@ struct drm_gpu_scheduler {
>   	atomic_t                        _score;
>   	bool				ready;
>   	bool				free_guilty;
> +	int				affinity_group;
>   };
>   
>   int drm_sched_init(struct drm_gpu_scheduler *sched,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

* RE: [PATCH v2] drm/amd/amdgpu implement tdr advanced mode
  2021-03-06 19:03 ` Christian König
@ 2021-03-08  4:06   ` Liu, Monk
  2021-03-08  4:22     ` Zhang, Jack (Jian)
  2021-03-08  7:53     ` Christian König
  0 siblings, 2 replies; 13+ messages in thread
From: Liu, Monk @ 2021-03-08  4:06 UTC (permalink / raw)
  To: Koenig, Christian, Zhang, Jack (Jian),
	amd-gfx, Grodzovsky, Andrey, Deng, Emily

[AMD Official Use Only - Internal Distribution Only]

>> Well, first of all, please completely drop the affinity group stuff from this patch. We should concentrate on one feature at a time.
We need it to expedite the process; we can introduce this change in another patch.


>> Then the implementation is way too complicated. All you need to do is insert a dma_fence_wait after re-scheduling each job after a reset.
No, that's not true. During drm_sched_resubmit_jobs() all the jobs in the mirror list are put into the hw ring, but we can only allow the first job into the ring
to catch the real guilty one (otherwise it is possible that a later job in the ring also has a bug and affects our judgement).
So we need to implement a new drm_sched_resubmit_jobs2() along these lines (the lines marked /* new */ are the additions over the existing drm_sched_resubmit_jobs()):

void drm_sched_resubmit_jobs2(struct drm_gpu_scheduler *sched, int max)
{
	struct drm_sched_job *s_job, *tmp;
	uint64_t guilty_context;
	bool found_guilty = false;
	struct dma_fence *fence;
	int i = 0;						/* new */

	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
		struct drm_sched_fence *s_fence = s_job->s_fence;

		if (i >= max)					/* new */
			break;					/* new */

		if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
			found_guilty = true;
			guilty_context = s_job->s_fence->scheduled.context;
		}

		if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
			dma_fence_set_error(&s_fence->finished, -ECANCELED);

		dma_fence_put(s_job->s_fence->parent);
		fence = sched->ops->run_job(s_job);
		i++;						/* new */

		if (IS_ERR_OR_NULL(fence)) {
			if (IS_ERR(fence))
				dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));

			s_job->s_fence->parent = NULL;
		} else {
			s_job->s_fence->parent = fence;
		}
	}
}
EXPORT_SYMBOL(drm_sched_resubmit_jobs2);
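
The gpu recover path in amdgpu could then feed the hw one job at a time, roughly like the untested sketch below (retirement is simplified here; the num_jobs accounting and fence reference handling are left out):

	while (!list_empty(&sched->ring_mirror_list)) {
		struct drm_sched_job *s_job =
			list_first_entry(&sched->ring_mirror_list,
					 struct drm_sched_job, node);

		/* push only the first pending job to the hw ring */
		drm_sched_resubmit_jobs2(sched, 1);

		if (s_job->s_fence->parent &&
		    dma_fence_wait_timeout(s_job->s_fence->parent, false,
					   sched->timeout) == 0) {
			/* this one hung: the real guilty job, needs another hw reset */
			drm_sched_increase_karma(s_job);
			break;
		}

		/* job finished (or never reached the hw), retire it and move on */
		dma_fence_signal(&s_job->s_fence->finished);
		spin_lock(&sched->job_list_lock);
		list_del_init(&s_job->node);
		spin_unlock(&sched->job_list_lock);
		sched->ops->free_job(s_job);
	}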



Thanks 

------------------------------------------
Monk Liu | Cloud-GPU Core team
------------------------------------------

-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com> 
Sent: Sunday, March 7, 2021 3:03 AM
To: Zhang, Jack (Jian) <Jack.Zhang1@amd.com>; amd-gfx@lists.freedesktop.org; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Deng, Emily <Emily.Deng@amd.com>
Subject: Re: [PATCH v2] drm/amd/amdgpu implement tdr advanced mode

Hi Jack,

well first of all please completely drop the affinity group stuff from this patch. We should concentrate on one feature at at time.

Then the implementation is way to complicate. All you need to do is insert a dma_fence_wait after re-scheduling each job after a reset.

Additional to that this feature is completely AMD specific and shouldn't affect the common scheduler in any way.

Regards,
Christian.

On 06.03.21 at 18:25, Jack Zhang wrote:
> [Why]
> Previous tdr design treats the first job in job_timeout as the bad job.
> But sometimes a later bad compute job can block a good gfx job and 
> cause an unexpected gfx job timeout because gfx and compute ring share 
> internal GC HW mutually.
>
> [How]
> This patch implements an advanced tdr mode.It involves an additinal 
> synchronous pre-resubmit step(Step0 Resubmit) before normal resubmit 
> step in order to find the real bad job.
>
> 1. For Bailing TDR job, re-insert it to mirror_list, don't set it to 
> guilty and leave it to be handled by the main reset thread.
>
> 2. Don't set the job to guilty in pre_asic_reset, and leave it to be 
> handled by Step0 Resubmit Stage.
>
> 3. At Step0 Resubmit stage, it first resubmit jobs asynchronously, 
> then it iterate each ring mirror_list, synchronously pend for each hw 
> fence being signaled. If the a job's hw fence get timeout, we identify 
> it as guilty and do hw reset to recover hw. After that, we would do 
> the normal resubmit step to resubmit left jobs.
>
> 4. For whole gpu reset(vram lost), skip Step0 Resubmit as each job 
> after vram lost was considered as bad job.
>
> 5. Involve the concept "Affinity Group".
> Doing two hw resets is not necessary when there's only one ring that 
> has jobs among some hw-related rings.Thus, we involve "affinity group".
> Hw-related rings could be added into a common affinity group, such as 
> gfx and compute ring. When tdr happens, we iterate all rings in 
> affinity group, skip Step0 Resubmit stage if there's only one ring's 
> mirror_list that has valid sched jobs.
>
> V2:
>      -fix a cherry-pick mistake for bailing TDR handling.
>
>      -do affinity_group check according to the bad job's sched rather
>       than the default "1" so that there could be multiple affinity
>       groups being pre-defined in future.
>
> Signed-off-by: Jack Zhang <Jack.Zhang1@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 102 +++++++++++++++++++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |   2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  47 ++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h    |   2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c   |  27 ++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |   1 +
>   include/drm/gpu_scheduler.h                |   1 +
>   7 files changed, 173 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index e247c3a2ec08..8632d7071292 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4188,6 +4188,37 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
>   	return false;
>   }
>   
> +bool amdgpu_affinity_group_has_only_or_null_working_ring(struct 
> +amdgpu_device *adev, struct drm_sched_job *s_job) {
> +       int i;
> +       int working_ring_num = 0;
> +
> +	/*
> +	 * The job is considered as the real bad one
> +	 * if job's sched is not in affinity group
> +	 */
> +	if (s_job->sched.affinity_group == 0)
> +			return true;
> +
> +       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> +               struct amdgpu_ring *ring = adev->rings[i];
> +
> +               if (!ring || !ring->sched.thread)
> +                       continue;
> +
> +               /* for non-empty affinity ring, increase working_ring_num */
> +               if (ring->sched.affinity_group == s_job->sched.affinity_group) {
> +                       if (!list_empty(&ring->sched.ring_mirror_list))
> +                               working_ring_num++;
> +               }
> +       }
> +
> +       if (working_ring_num > 1) {
> +               return false;
> +       }
> +       return true;
> +}
> +
>   /**
>    * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
>    *
> @@ -4310,8 +4341,10 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>   		amdgpu_fence_driver_force_completion(ring);
>   	}
>   
> -	if(job)
> -		drm_sched_increase_karma(&job->base);
> +	if (amdgpu_gpu_recovery != 2) {
> +		if (job)
> +			drm_sched_increase_karma(&job->base);
> +	}
>   
>   	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
>   	if (!amdgpu_sriov_vf(adev)) {
> @@ -4639,7 +4672,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   	int i, r = 0;
>   	bool need_emergency_restart = false;
>   	bool audio_suspended = false;
> -
> +	int	tmp_vram_lost_counter;
>   	/*
>   	 * Special case: RAS triggered and full reset isn't supported
>   	 */
> @@ -4690,8 +4723,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   					job ? job->base.id : -1);
>   
>   		/* even we skipped this reset, still need to set the job to guilty */
> -		if (job)
> -			drm_sched_increase_karma(&job->base);
> +		if (job) {
> +			if (amdgpu_gpu_recovery == 2) {
> +				if (&job->base) {
> +					spin_lock(&job->base.sched->job_list_lock);
> +					list_add(&job->base.node, &job->base.sched->ring_mirror_list);
> +					spin_unlock(&job->base.sched->job_list_lock);
> +				}
> +			} else
> +				drm_sched_increase_karma(&job->base);
> +		}
>   		goto skip_recovery;
>   	}
>   
> @@ -4788,6 +4829,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   		}
>   	}
>   
> +	tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
>   	/* Actual ASIC resets if needed.*/
>   	/* TODO Implement XGMI hive reset logic for SRIOV */
>   	if (amdgpu_sriov_vf(adev)) {
> @@ -4804,18 +4846,64 @@ int amdgpu_device_gpu_recover(struct 
> amdgpu_device *adev,
>   
>   	/* Post ASIC reset for all devs .*/
>   	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
> +		int step = 1;
>   
> +		if (amdgpu_gpu_recovery == 2) {
> +			if (amdgpu_affinity_group_has_only_or_null_working_ring(adev,&job->base)
> +				|| tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)) {
> +				DRM_INFO("Skip Stage0 Resubmit Stage\n");
> +				/* set guilty */
> +				drm_sched_increase_karma(&job->base);
> +				step = 1;
> +			} else {
> +				DRM_INFO("Do Stage0 Resubmit Stage\n");
> +				step = 0;
> +			}
> +		}
> +
> +retry_resubmit:
>   		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>   			struct amdgpu_ring *ring = tmp_adev->rings[i];
> +			int ret = 0;
> +			struct drm_sched_job *s_bad_job = NULL;
>   
>   			if (!ring || !ring->sched.thread)
>   				continue;
>   
>   			/* No point to resubmit jobs if we didn't HW reset*/
> -			if (!tmp_adev->asic_reset_res && !job_signaled)
> +			if (!tmp_adev->asic_reset_res && !job_signaled) {
> +
>   				drm_sched_resubmit_jobs(&ring->sched);
>   
> -			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
> +				if (amdgpu_gpu_recovery == 2 && step == 0) {
> +					ret = amdgpu_wait_resubmitted_jobs_completion(&ring->sched, ring->sched.timeout, &s_bad_job);
> +					if (ret == -1) {
> +						DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n", ring->sched.name, s_bad_job->id);
> +						/* set guilty */
> +						drm_sched_increase_karma(s_bad_job);
> +
> +						/* do hw reset */
> +						if (amdgpu_sriov_vf(adev)) {
> +							amdgpu_virt_fini_data_exchange(adev);
> +							r = amdgpu_device_reset_sriov(adev, false);
> +							if (r)
> +								adev->asic_reset_res = r;
> +						} else {
> +							r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
> +							if (r && r == -EAGAIN)
> +								goto retry;
> +						}
> +
> +						/* add reset counter so that the following resubmitted job could flush vmid */
> +						atomic_inc(&tmp_adev->gpu_reset_counter);
> +						step = 1;
> +						goto retry_resubmit;
> +					}
> +				}
> +			}
> +
> +			if (step == 1)
> +				drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
>   		}
>   
>   		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) { 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 865f924772b0..9c3f4edb7532 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -509,7 +509,7 @@ module_param_named(compute_multipipe, amdgpu_compute_multipipe, int, 0444);
>    * DOC: gpu_recovery (int)
>    * Set to enable GPU recovery mechanism (1 = enable, 0 = disable). The default is -1 (auto, disabled except SRIOV).
>    */
> -MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (1 = 
> enable, 0 = disable, -1 = auto)");
> +MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (2 = 
> +advanced tdr mode, 1 = enable, 0 = disable, -1 = auto)");
>   module_param_named(gpu_recovery, amdgpu_gpu_recovery, int, 0444);
>   
>   /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 759b34799221..28cda321157a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -281,6 +281,53 @@ void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched)
>   	}
>   }
>   
> +int amdgpu_wait_resubmitted_jobs_completion(struct drm_gpu_scheduler 
> +*sched, long timeout, struct drm_sched_job **s_bad_job) {
> +	struct drm_sched_job *s_job, *tmp;
> +	int ret = 0;
> +
> +	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
> +		struct drm_sched_fence *s_fence = s_job->s_fence;
> +
> +			if (s_fence->parent == NULL) { /* fail to get a hw fence */
> +				/* process a job */
> +				atomic_dec(&sched->num_jobs);
> +				dma_fence_get(&s_fence->finished);
> +				dma_fence_signal(&s_fence->finished);
> +				dma_fence_put(&s_fence->finished);
> +
> +				/* remove node from mirror_list and free the job */
> +				spin_lock(&sched->job_list_lock);
> +				list_del_init(&s_job->node);
> +				spin_unlock(&sched->job_list_lock);
> +				sched->ops->free_job(s_job);
> +				continue;
> +			}
> +
> +			ret = dma_fence_wait_timeout(s_fence->parent, false, timeout);
> +
> +			if (ret > 0) { /* succeed */
> +				/* process a job */
> +				atomic_dec(&sched->num_jobs);
> +				dma_fence_get(&s_fence->finished);
> +				dma_fence_signal(&s_fence->finished);
> +				dma_fence_put(&s_fence->finished);
> +
> +				/* remove node from mirror_list and free the job */
> +				spin_lock(&sched->job_list_lock);
> +				list_del_init(&s_job->node);
> +				spin_unlock(&sched->job_list_lock);
> +				sched->ops->free_job(s_job);
> +				continue;
> +			} else if (ret == 0) {
> +				*s_bad_job = s_job;
> +				return -1; /* timeout */
> +			}
> +	}
> +
> +	return 0;
> +}
> +
>   const struct drm_sched_backend_ops amdgpu_sched_ops = {
>   	.dependency = amdgpu_job_dependency,
>   	.run_job = amdgpu_job_run,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> index 81caac9b958a..25292f4699fb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> @@ -76,5 +76,5 @@ int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring,
>   			     struct dma_fence **fence);
>   
>   void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler 
> *sched);
> -
> +int amdgpu_wait_resubmitted_jobs_completion(struct drm_gpu_scheduler 
> +*sched, long timeout, struct drm_sched_job **s_bad_job);
>   #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> index b644c78475fd..cb50bfc80bc9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> @@ -35,6 +35,11 @@
>   #include "amdgpu.h"
>   #include "atom.h"
>   
> +static char *amdgpu_affinity_group[] = { "gfx", "comp"
> +};
> +
>   /*
>    * Rings
>    * Most engines on the GPU are fed via ring buffers.  Ring @@ -189,6 
> +194,7 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct amdgpu_ring *ring,
>   		ring->adev = adev;
>   		ring->idx = adev->num_rings++;
>   		adev->rings[ring->idx] = ring;
> +		amdgpu_ring_set_affinity_group(ring);
>   		r = amdgpu_fence_driver_init_ring(ring, sched_hw_submission);
>   		if (r)
>   			return r;
> @@ -459,3 +465,24 @@ int amdgpu_ring_test_helper(struct amdgpu_ring *ring)
>   	ring->sched.ready = !r;
>   	return r;
>   }
> +
> +int amdgpu_ring_set_affinity_group(struct amdgpu_ring *ring) {
> +       struct amdgpu_device *adev = ring->adev;
> +       int i;
> +
> +       for (i = 0; i < ARRAY_SIZE(amdgpu_affinity_group); i++) {
> +               char *temp_name = amdgpu_affinity_group[i];
> +
> +               /* set ring's affinity_group bit if find it in affinity_group list */
> +               if (strncmp(ring->name, temp_name, strlen(temp_name)) == 0) {
> +                       DRM_DEV_INFO(adev->dev, "set ring:%s in affinity_group\n",
> +                             ring->name);
> +                       ring->sched.affinity_group = 1;
> +                       return 0;
> +               }
> +       }
> +
> +       ring->sched.affinity_group = 0;
> +       return 0;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 56acec1075ac..6b0d217e6f5a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -350,4 +350,5 @@ int amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
>   			     struct amdgpu_ring *ring);
>   void amdgpu_debugfs_ring_fini(struct amdgpu_ring *ring);
>   
> +int amdgpu_ring_set_affinity_group(struct amdgpu_ring *ring);
>   #endif
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h 
> index 1c815e0a14ed..589cbaea35dc 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -301,6 +301,7 @@ struct drm_gpu_scheduler {
>   	atomic_t                        _score;
>   	bool				ready;
>   	bool				free_guilty;
> +	int				affinity_group;
>   };
>   
>   int drm_sched_init(struct drm_gpu_scheduler *sched,
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

* RE: [PATCH v2] drm/amd/amdgpu implement tdr advanced mode
  2021-03-08  4:06   ` Liu, Monk
@ 2021-03-08  4:22     ` Zhang, Jack (Jian)
  2021-03-08  7:53     ` Christian König
  1 sibling, 0 replies; 13+ messages in thread
From: Zhang, Jack (Jian) @ 2021-03-08  4:22 UTC (permalink / raw)
  To: Liu, Monk, Koenig, Christian, amd-gfx, Grodzovsky, Andrey, Deng, Emily

Hi, Christian and Monk,

Thanks so much for these valuable ideas. I will follow the advice.

>> Then the implementation is way too complicated. All you need to do is insert a dma_fence_wait after re-scheduling each job after a reset.
Sure. If I re-implement drm_sched_resubmit_jobs2() and add a dma_fence_wait inside it, the change will be simpler.
I will do the change and submit it for review later.
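
Inside the new helper it could look roughly like this (a very rough, untested sketch; the fence reference handling and the guilty-context bookkeeping from the original loop are omitted):

	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
		struct dma_fence *fence = sched->ops->run_job(s_job);

		if (IS_ERR_OR_NULL(fence)) {
			s_job->s_fence->parent = NULL;
			continue;
		}
		s_job->s_fence->parent = fence;

		/* block until this job's hw fence signals; the first one
		 * that times out is the real guilty job
		 */
		if (dma_fence_wait_timeout(fence, false, sched->timeout) == 0) {
			drm_sched_increase_karma(s_job);
			break;
		}
	}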

Thanks a lot for your help.
/Jack

-----Original Message-----
From: Liu, Monk <Monk.Liu@amd.com> 
Sent: Monday, March 8, 2021 12:06 PM
To: Koenig, Christian <Christian.Koenig@amd.com>; Zhang, Jack (Jian) <Jack.Zhang1@amd.com>; amd-gfx@lists.freedesktop.org; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Deng, Emily <Emily.Deng@amd.com>
Subject: RE: [PATCH v2] drm/amd/amdgpu implement tdr advanced mode

[AMD Official Use Only - Internal Distribution Only]

>> well first of all please completely drop the affinity group stuff from this patch. We should concentrate on one feature at at time.
We need it to expedite the process, we can introduce this change in another patch


>> Then the implementation is way to complicate. All you need to do is insert a dma_fence_wait after re-scheduling each job after a reset.
No that's not true, during the " drm_sched_resubmit_jobs" it will put all jobs in mirror list into the hw ring, but we can only allow the first job to the ring To catch the real guilty one (otherwise it is possible that the later job in the ring also has bug and affect our judgement) So we need to implement a new " drm_sched_resubmit_jobs2()", like this way:

drm_sched_resubmit_jobs2()
~ 499 void drm_sched_resubmit_jobs2(struct drm_gpu_scheduler *sched, int max)
  500 {
  501     struct drm_sched_job *s_job, *tmp;
  502     uint64_t guilty_context;
  503     bool found_guilty = false;
  504     struct dma_fence *fence;
+ 505     int i = 0;
  506
  507     list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
  508         struct drm_sched_fence *s_fence = s_job->s_fence;
  509
+ 510         if (i >= max)
+ 511             break;
+ 512
  513         if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
  514             found_guilty = true;
  515             guilty_context = s_job->s_fence->scheduled.context;
  516         }
  517
  518         if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
  519             dma_fence_set_error(&s_fence->finished, -ECANCELED);
  520
  521         dma_fence_put(s_job->s_fence->parent);
  522         fence = sched->ops->run_job(s_job);
+ 523         i++;
  524
  525         if (IS_ERR_OR_NULL(fence)) {
  526             if (IS_ERR(fence))
  527                 dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
  528
  529             s_job->s_fence->parent = NULL;
  530         } else {
  531             s_job->s_fence->parent = fence;
  532         }
  533
  534
  535     }
  536 }
  537 EXPORT_SYMBOL(drm_sched_resubmit_jobs);
  538



Thanks 

------------------------------------------
Monk Liu | Cloud-GPU Core team
------------------------------------------

-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com>
Sent: Sunday, March 7, 2021 3:03 AM
To: Zhang, Jack (Jian) <Jack.Zhang1@amd.com>; amd-gfx@lists.freedesktop.org; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Deng, Emily <Emily.Deng@amd.com>
Subject: Re: [PATCH v2] drm/amd/amdgpu implement tdr advanced mode

Hi Jack,

well first of all please completely drop the affinity group stuff from this patch. We should concentrate on one feature at at time.

Then the implementation is way to complicate. All you need to do is insert a dma_fence_wait after re-scheduling each job after a reset.

Additional to that this feature is completely AMD specific and shouldn't affect the common scheduler in any way.

Regards,
Christian.

On 06.03.21 at 18:25, Jack Zhang wrote:
> [Why]
> Previous tdr design treats the first job in job_timeout as the bad job.
> But sometimes a later bad compute job can block a good gfx job and 
> cause an unexpected gfx job timeout because gfx and compute ring share 
> internal GC HW mutually.
>
> [How]
> This patch implements an advanced tdr mode.It involves an additinal 
> synchronous pre-resubmit step(Step0 Resubmit) before normal resubmit 
> step in order to find the real bad job.
>
> 1. For Bailing TDR job, re-insert it to mirror_list, don't set it to 
> guilty and leave it to be handled by the main reset thread.
>
> 2. Don't set the job to guilty in pre_asic_reset, and leave it to be 
> handled by Step0 Resubmit Stage.
>
> 3. At Step0 Resubmit stage, it first resubmit jobs asynchronously, 
> then it iterate each ring mirror_list, synchronously pend for each hw 
> fence being signaled. If the a job's hw fence get timeout, we identify 
> it as guilty and do hw reset to recover hw. After that, we would do 
> the normal resubmit step to resubmit left jobs.
>
> 4. For whole gpu reset(vram lost), skip Step0 Resubmit as each job 
> after vram lost was considered as bad job.
>
> 5. Involve the concept "Affinity Group".
> Doing two hw resets is not necessary when there's only one ring that 
> has jobs among some hw-related rings.Thus, we involve "affinity group".
> Hw-related rings could be added into a common affinity group, such as 
> gfx and compute ring. When tdr happens, we iterate all rings in 
> affinity group, skip Step0 Resubmit stage if there's only one ring's 
> mirror_list that has valid sched jobs.
>
> V2:
>      -fix a cherry-pick mistake for bailing TDR handling.
>
>      -do affinity_group check according to the bad job's sched rather
>       than the default "1" so that there could be multiple affinity
>       groups being pre-defined in future.
>
> Signed-off-by: Jack Zhang <Jack.Zhang1@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 102 +++++++++++++++++++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |   2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  47 ++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h    |   2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c   |  27 ++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |   1 +
>   include/drm/gpu_scheduler.h                |   1 +
>   7 files changed, 173 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index e247c3a2ec08..8632d7071292 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4188,6 +4188,37 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
>   	return false;
>   }
>   
> +bool amdgpu_affinity_group_has_only_or_null_working_ring(struct
> +amdgpu_device *adev, struct drm_sched_job *s_job) {
> +       int i;
> +       int working_ring_num = 0;
> +
> +	/*
> +	 * The job is considered as the real bad one
> +	 * if job's sched is not in affinity group
> +	 */
> +	if (s_job->sched.affinity_group == 0)
> +			return true;
> +
> +       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> +               struct amdgpu_ring *ring = adev->rings[i];
> +
> +               if (!ring || !ring->sched.thread)
> +                       continue;
> +
> +               /* for non-empty affinity ring, increase working_ring_num */
> +               if (ring->sched.affinity_group == s_job->sched.affinity_group) {
> +                       if (!list_empty(&ring->sched.ring_mirror_list))
> +                               working_ring_num++;
> +               }
> +       }
> +
> +       if (working_ring_num > 1) {
> +               return false;
> +       }
> +       return true;
> +}
> +
>   /**
>    * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
>    *
> @@ -4310,8 +4341,10 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>   		amdgpu_fence_driver_force_completion(ring);
>   	}
>   
> -	if(job)
> -		drm_sched_increase_karma(&job->base);
> +	if (amdgpu_gpu_recovery != 2) {
> +		if (job)
> +			drm_sched_increase_karma(&job->base);
> +	}
>   
>   	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
>   	if (!amdgpu_sriov_vf(adev)) {
> @@ -4639,7 +4672,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   	int i, r = 0;
>   	bool need_emergency_restart = false;
>   	bool audio_suspended = false;
> -
> +	int	tmp_vram_lost_counter;
>   	/*
>   	 * Special case: RAS triggered and full reset isn't supported
>   	 */
> @@ -4690,8 +4723,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   					job ? job->base.id : -1);
>   
>   		/* even we skipped this reset, still need to set the job to guilty */
> -		if (job)
> -			drm_sched_increase_karma(&job->base);
> +		if (job) {
> +			if (amdgpu_gpu_recovery == 2) {
> +				if (&job->base) {
> +					spin_lock(&job->base.sched->job_list_lock);
> +					list_add(&job->base.node, &job->base.sched->ring_mirror_list);
> +					spin_unlock(&job->base.sched->job_list_lock);
> +				}
> +			} else
> +				drm_sched_increase_karma(&job->base);
> +		}
>   		goto skip_recovery;
>   	}
>   
> @@ -4788,6 +4829,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   		}
>   	}
>   
> +	tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
>   	/* Actual ASIC resets if needed.*/
>   	/* TODO Implement XGMI hive reset logic for SRIOV */
>   	if (amdgpu_sriov_vf(adev)) {
> @@ -4804,18 +4846,64 @@ int amdgpu_device_gpu_recover(struct 
> amdgpu_device *adev,
>   
>   	/* Post ASIC reset for all devs .*/
>   	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
> +		int step = 1;
>   
> +		if (amdgpu_gpu_recovery == 2) {
> +			if (amdgpu_affinity_group_has_only_or_null_working_ring(adev,&job->base)
> +				|| tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)) {
> +				DRM_INFO("Skip Stage0 Resubmit Stage\n");
> +				/* set guilty */
> +				drm_sched_increase_karma(&job->base);
> +				step = 1;
> +			} else {
> +				DRM_INFO("Do Stage0 Resubmit Stage\n");
> +				step = 0;
> +			}
> +		}
> +
> +retry_resubmit:
>   		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>   			struct amdgpu_ring *ring = tmp_adev->rings[i];
> +			int ret = 0;
> +			struct drm_sched_job *s_bad_job = NULL;
>   
>   			if (!ring || !ring->sched.thread)
>   				continue;
>   
>   			/* No point to resubmit jobs if we didn't HW reset*/
> -			if (!tmp_adev->asic_reset_res && !job_signaled)
> +			if (!tmp_adev->asic_reset_res && !job_signaled) {
> +
>   				drm_sched_resubmit_jobs(&ring->sched);
>   
> -			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
> +				if (amdgpu_gpu_recovery == 2 && step == 0) {
> +					ret = amdgpu_wait_resubmitted_jobs_completion(&ring->sched, ring->sched.timeout, &s_bad_job);
> +					if (ret == -1) {
> +						DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n", ring->sched.name, s_bad_job->id);
> +						/* set guilty */
> +						drm_sched_increase_karma(s_bad_job);
> +
> +						/* do hw reset */
> +						if (amdgpu_sriov_vf(adev)) {
> +							amdgpu_virt_fini_data_exchange(adev);
> +							r = amdgpu_device_reset_sriov(adev, false);
> +							if (r)
> +								adev->asic_reset_res = r;
> +						} else {
> +							r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
> +							if (r && r == -EAGAIN)
> +								goto retry;
> +						}
> +
> +						/* add reset counter so that the following resubmitted job could flush vmid */
> +						atomic_inc(&tmp_adev->gpu_reset_counter);
> +						step = 1;
> +						goto retry_resubmit;
> +					}
> +				}
> +			}
> +
> +			if (step == 1)
> +				drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
>   		}
>   
>   		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) { 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 865f924772b0..9c3f4edb7532 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -509,7 +509,7 @@ module_param_named(compute_multipipe, amdgpu_compute_multipipe, int, 0444);
>    * DOC: gpu_recovery (int)
>    * Set to enable GPU recovery mechanism (1 = enable, 0 = disable). The default is -1 (auto, disabled except SRIOV).
>    */
> -MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (1 = enable, 0 = disable, -1 = auto)");
> +MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (2 = advanced tdr mode, 1 = enable, 0 = disable, -1 = auto)");
>   module_param_named(gpu_recovery, amdgpu_gpu_recovery, int, 0444);
>   
>   /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 759b34799221..28cda321157a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -281,6 +281,53 @@ void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched)
>   	}
>   }
>   
> +int amdgpu_wait_resubmitted_jobs_completion(struct drm_gpu_scheduler *sched, long timeout, struct drm_sched_job **s_bad_job) {
> +	struct drm_sched_job *s_job, *tmp;
> +	int ret = 0;
> +
> +	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
> +		struct drm_sched_fence *s_fence = s_job->s_fence;
> +
> +			if (s_fence->parent == NULL) { /* fail to get a hw fence */
> +				/* process a job */
> +				atomic_dec(&sched->num_jobs);
> +				dma_fence_get(&s_fence->finished);
> +				dma_fence_signal(&s_fence->finished);
> +				dma_fence_put(&s_fence->finished);
> +
> +				/* remove node from mirror_list and free the job */
> +				spin_lock(&sched->job_list_lock);
> +				list_del_init(&s_job->node);
> +				spin_unlock(&sched->job_list_lock);
> +				sched->ops->free_job(s_job);
> +				continue;
> +			}
> +
> +			ret = dma_fence_wait_timeout(s_fence->parent, false, timeout);
> +
> +			if (ret > 0) { /* succeed */
> +				/* process a job */
> +				atomic_dec(&sched->num_jobs);
> +				dma_fence_get(&s_fence->finished);
> +				dma_fence_signal(&s_fence->finished);
> +				dma_fence_put(&s_fence->finished);
> +
> +				/* remove node from mirror_list and free the job */
> +				spin_lock(&sched->job_list_lock);
> +				list_del_init(&s_job->node);
> +				spin_unlock(&sched->job_list_lock);
> +				sched->ops->free_job(s_job);
> +				continue;
> +			} else if (ret == 0) {
> +				*s_bad_job = s_job;
> +				return -1; /* timeout */
> +			}
> +	}
> +
> +	return 0;
> +}
> +
>   const struct drm_sched_backend_ops amdgpu_sched_ops = {
>   	.dependency = amdgpu_job_dependency,
>   	.run_job = amdgpu_job_run,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> index 81caac9b958a..25292f4699fb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> @@ -76,5 +76,5 @@ int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring,
>   			     struct dma_fence **fence);
>   
>   void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched);
> -
> +int amdgpu_wait_resubmitted_jobs_completion(struct drm_gpu_scheduler *sched, long timeout, struct drm_sched_job **s_bad_job);
>   #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> index b644c78475fd..cb50bfc80bc9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> @@ -35,6 +35,11 @@
>   #include "amdgpu.h"
>   #include "atom.h"
>   
> +static char *amdgpu_affinity_group[] = { "gfx", "comp"
> +};
> +
>   /*
>    * Rings
>    * Most engines on the GPU are fed via ring buffers.  Ring
> @@ -189,6 +194,7 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct amdgpu_ring *ring,
>   		ring->adev = adev;
>   		ring->idx = adev->num_rings++;
>   		adev->rings[ring->idx] = ring;
> +		amdgpu_ring_set_affinity_group(ring);
>   		r = amdgpu_fence_driver_init_ring(ring, sched_hw_submission);
>   		if (r)
>   			return r;
> @@ -459,3 +465,24 @@ int amdgpu_ring_test_helper(struct amdgpu_ring *ring)
>   	ring->sched.ready = !r;
>   	return r;
>   }
> +
> +int amdgpu_ring_set_affinity_group(struct amdgpu_ring *ring) {
> +       struct amdgpu_device *adev = ring->adev;
> +       int i;
> +
> +       for (i = 0; i < ARRAY_SIZE(amdgpu_affinity_group); i++) {
> +               char *temp_name = amdgpu_affinity_group[i];
> +
> +               /* set ring's affinity_group bit if find it in affinity_group list */
> +               if (strncmp(ring->name, temp_name, strlen(temp_name)) == 0) {
> +                       DRM_DEV_INFO(adev->dev, "set ring:%s in affinity_group\n",
> +                             ring->name);
> +                       ring->sched.affinity_group = 1;
> +                       return 0;
> +               }
> +       }
> +
> +       ring->sched.affinity_group = 0;
> +       return 0;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 56acec1075ac..6b0d217e6f5a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -350,4 +350,5 @@ int amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
>   			     struct amdgpu_ring *ring);
>   void amdgpu_debugfs_ring_fini(struct amdgpu_ring *ring);
>   
> +int amdgpu_ring_set_affinity_group(struct amdgpu_ring *ring);
>   #endif
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h 
> index 1c815e0a14ed..589cbaea35dc 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -301,6 +301,7 @@ struct drm_gpu_scheduler {
>   	atomic_t                        _score;
>   	bool				ready;
>   	bool				free_guilty;
> +	int				affinity_group;
>   };
>   
>   int drm_sched_init(struct drm_gpu_scheduler *sched,
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] drm/amd/amdgpu implement tdr advanced mode
  2021-03-08  4:06   ` Liu, Monk
  2021-03-08  4:22     ` Zhang, Jack (Jian)
@ 2021-03-08  7:53     ` Christian König
  2021-03-08 12:43       ` Zhang, Jack (Jian)
  2021-03-09  7:29       ` Liu, Monk
  1 sibling, 2 replies; 13+ messages in thread
From: Christian König @ 2021-03-08  7:53 UTC (permalink / raw)
  To: Liu, Monk, Zhang, Jack (Jian), amd-gfx, Grodzovsky, Andrey, Deng, Emily



On 08.03.21 at 05:06, Liu, Monk wrote:
> [AMD Official Use Only - Internal Distribution Only]
>
>>> well first of all please completely drop the affinity group stuff from this patch. We should concentrate on one feature at a time.
> We need it to expedite the process; we can introduce this change in another patch.
>
>
>>> Then the implementation is way too complicated. All you need to do is insert a dma_fence_wait after re-scheduling each job after a reset.
> No, that's not true. During "drm_sched_resubmit_jobs" all jobs on the mirror list are put into the hw ring, but we can only allow the first job into the ring
> to catch the real guilty one (otherwise it is possible that a later job in the ring also has a bug and affects our judgement).
> So we need to implement a new "drm_sched_resubmit_jobs2()", like this:

Something like this. But since waiting for the guilty job is AMD 
specific we should rather rework the stuff from the beginning.

What I have in mind is the following:
1. Add a reference from the scheduler fence back to the job which is 
cleared only when the scheduler fence finishes.
2. Completely drop the ring_mirror_list and replace it with a kfifo of 
pointers to the active scheduler fences.
3. Replace drm_sched_resubmit_jobs with a drm_sched_for_each_active() 
macro which allows drivers to iterate over all the active jobs and 
resubmit/wait/mark them as guilty and so on (rough sketch below).
4. Remove the guilty/karma handling from the scheduler. This is 
something AMD specific and shouldn't leak into common code.
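
As a rough illustration of points 2 and 3, the iteration could look something
like the sketch below. drm_sched_for_each_active(), the active_fences kfifo and
the s_fence->job back-reference are assumptions taken from the list above, not
existing scheduler API:

/*
 * Sketch only: active_fences would be the kfifo of active scheduler
 * fences from point 2, and s_fence->job the back-reference from point 1.
 */
#define drm_sched_for_each_active(sched, s_fence)			\
	while (kfifo_out(&(sched)->active_fences, &(s_fence), 1))

static void example_resubmit_and_find_guilty(struct drm_gpu_scheduler *sched)
{
	struct drm_sched_fence *s_fence;

	drm_sched_for_each_active(sched, s_fence) {
		struct dma_fence *fence;

		/* re-run the job this scheduler fence belongs to */
		fence = sched->ops->run_job(s_fence->job);
		if (IS_ERR_OR_NULL(fence))
			continue;
		s_fence->parent = fence;

		/* wait synchronously; a timeout marks the real guilty job */
		if (dma_fence_wait_timeout(fence, false, sched->timeout) <= 0)
			dma_fence_set_error(&s_fence->finished, -ETIME);
	}
}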

Regards,
Christian.

>
> drm_sched_resubmit_jobs2()
> ~ 499 void drm_sched_resubmit_jobs2(struct drm_gpu_scheduler *sched, int max)
>    500 {
>    501     struct drm_sched_job *s_job, *tmp;
>    502     uint64_t guilty_context;
>    503     bool found_guilty = false;
>    504     struct dma_fence *fence;
> + 505     int i = 0;
>    506
>    507     list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
>    508         struct drm_sched_fence *s_fence = s_job->s_fence;
>    509
> + 510         if (i >= max)
> + 511             break;
> + 512
>    513         if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
>    514             found_guilty = true;
>    515             guilty_context = s_job->s_fence->scheduled.context;
>    516         }
>    517
>    518         if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
>    519             dma_fence_set_error(&s_fence->finished, -ECANCELED);
>    520
>    521         dma_fence_put(s_job->s_fence->parent);
>    522         fence = sched->ops->run_job(s_job);
> + 523         i++;
>    524
>    525         if (IS_ERR_OR_NULL(fence)) {
>    526             if (IS_ERR(fence))
>    527                 dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
>    528
>    529             s_job->s_fence->parent = NULL;
>    530         } else {
>    531             s_job->s_fence->parent = fence;
>    532         }
>    533
>    534
>    535     }
>    536 }
>    537 EXPORT_SYMBOL(drm_sched_resubmit_jobs);
>    538
>
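> For illustration, a caller could then push only the first pending job and
> wait on its hw fence, roughly like this (sketch only, locking omitted, not
> part of the posted patch):
>
> 	struct drm_sched_job *s_job;
>
> 	/* resubmit just the first job of this ring's mirror list */
> 	drm_sched_resubmit_jobs2(&ring->sched, 1);
>
> 	s_job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
> 					 struct drm_sched_job, node);
> 	if (s_job && s_job->s_fence->parent &&
> 	    dma_fence_wait_timeout(s_job->s_fence->parent, false,
> 				   ring->sched.timeout) == 0)
> 		/* it never signals, so this is the real guilty job */
> 		drm_sched_increase_karma(s_job);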
>
>
> Thanks
>
> ------------------------------------------
> Monk Liu | Cloud-GPU Core team
> ------------------------------------------
>
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Sunday, March 7, 2021 3:03 AM
> To: Zhang, Jack (Jian) <Jack.Zhang1@amd.com>; amd-gfx@lists.freedesktop.org; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Deng, Emily <Emily.Deng@amd.com>
> Subject: Re: [PATCH v2] drm/amd/amdgpu implement tdr advanced mode
>
> Hi Jack,
>
> well first of all please completely drop the affinity group stuff from this patch. We should concentrate on one feature at a time.
>
> Then the implementation is way too complicated. All you need to do is insert a dma_fence_wait after re-scheduling each job after a reset.
>
> In addition to that, this feature is completely AMD specific and shouldn't affect the common scheduler in any way.
>
> Regards,
> Christian.
>
> On 06.03.21 at 18:25, Jack Zhang wrote:
>> [Why]
>> Previous tdr design treats the first job in job_timeout as the bad job.
>> But sometimes a later bad compute job can block a good gfx job and
>> cause an unexpected gfx job timeout because gfx and compute ring share
>> internal GC HW mutually.
>>
>> [How]
>> This patch implements an advanced tdr mode.It involves an additinal
>> synchronous pre-resubmit step(Step0 Resubmit) before normal resubmit
>> step in order to find the real bad job.
>>
>> 1. For Bailing TDR job, re-insert it to mirror_list, don't set it to
>> guilty and leave it to be handled by the main reset thread.
>>
>> 2. Don't set the job to guilty in pre_asic_reset, and leave it to be
>> handled by Step0 Resubmit Stage.
>>
>> 3. At Step0 Resubmit stage, it first resubmit jobs asynchronously,
>> then it iterate each ring mirror_list, synchronously pend for each hw
>> fence being signaled. If the a job's hw fence get timeout, we identify
>> it as guilty and do hw reset to recover hw. After that, we would do
>> the normal resubmit step to resubmit left jobs.
>>
>> 4. For whole gpu reset(vram lost), skip Step0 Resubmit as each job
>> after vram lost was considered as bad job.
>>
>> 5. Involve the concept "Affinity Group".
>> Doing two hw resets is not necessary when there's only one ring that
>> has jobs among some hw-related rings.Thus, we involve "affinity group".
>> Hw-related rings could be added into a common affinity group, such as
>> gfx and compute ring. When tdr happens, we iterate all rings in
>> affinity group, skip Step0 Resubmit stage if there's only one ring's
>> mirror_list that has valid sched jobs.
>>
>> V2:
>>       -fix a cherry-pick mistake for bailing TDR handling.
>>
>>       -do affinity_group check according to the bad job's sched rather
>>        than the default "1" so that there could be multiple affinity
>>        groups being pre-defined in future.
>>
>> Signed-off-by: Jack Zhang <Jack.Zhang1@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 102 +++++++++++++++++++--
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |   2 +-
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  47 ++++++++++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_job.h    |   2 +-
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c   |  27 ++++++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |   1 +
>>    include/drm/gpu_scheduler.h                |   1 +
>>    7 files changed, 173 insertions(+), 9 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index e247c3a2ec08..8632d7071292 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -4188,6 +4188,37 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
>>    	return false;
>>    }
>>    
>> +bool amdgpu_affinity_group_has_only_or_null_working_ring(struct
>> +amdgpu_device *adev, struct drm_sched_job *s_job) {
>> +       int i;
>> +       int working_ring_num = 0;
>> +
>> +	/*
>> +	 * The job is considered as the real bad one
>> +	 * if job's sched is not in affinity group
>> +	 */
>> +	if (s_job->sched.affinity_group == 0)
>> +			return true;
>> +
>> +       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> +               struct amdgpu_ring *ring = adev->rings[i];
>> +
>> +               if (!ring || !ring->sched.thread)
>> +                       continue;
>> +
>> +               /* for non-empty affinity ring, increase working_ring_num */
>> +               if (ring->sched.affinity_group == s_job->sched.affinity_group) {
>> +                       if (!list_empty(&ring->sched.ring_mirror_list))
>> +                               working_ring_num++;
>> +               }
>> +       }
>> +
>> +       if (working_ring_num > 1) {
>> +               return false;
>> +       }
>> +       return true;
>> +}
>> +
>>    /**
>>     * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
>>     *
>> @@ -4310,8 +4341,10 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>    		amdgpu_fence_driver_force_completion(ring);
>>    	}
>>    
>> -	if(job)
>> -		drm_sched_increase_karma(&job->base);
>> +	if (amdgpu_gpu_recovery != 2) {
>> +		if (job)
>> +			drm_sched_increase_karma(&job->base);
>> +	}
>>    
>>    	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
>>    	if (!amdgpu_sriov_vf(adev)) {
>> @@ -4639,7 +4672,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>    	int i, r = 0;
>>    	bool need_emergency_restart = false;
>>    	bool audio_suspended = false;
>> -
>> +	int	tmp_vram_lost_counter;
>>    	/*
>>    	 * Special case: RAS triggered and full reset isn't supported
>>    	 */
>> @@ -4690,8 +4723,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>    					job ? job->base.id : -1);
>>    
>>    		/* even we skipped this reset, still need to set the job to guilty */
>> -		if (job)
>> -			drm_sched_increase_karma(&job->base);
>> +		if (job) {
>> +			if (amdgpu_gpu_recovery == 2) {
>> +				if (&job->base) {
>> +					spin_lock(&job->base.sched->job_list_lock);
>> +					list_add(&job->base.node, &job->base.sched->ring_mirror_list);
>> +					spin_unlock(&job->base.sched->job_list_lock);
>> +				}
>> +			} else
>> +				drm_sched_increase_karma(&job->base);
>> +		}
>>    		goto skip_recovery;
>>    	}
>>    
>> @@ -4788,6 +4829,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>    		}
>>    	}
>>    
>> +	tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
>>    	/* Actual ASIC resets if needed.*/
>>    	/* TODO Implement XGMI hive reset logic for SRIOV */
>>    	if (amdgpu_sriov_vf(adev)) {
>> @@ -4804,18 +4846,64 @@ int amdgpu_device_gpu_recover(struct
>> amdgpu_device *adev,
>>    
>>    	/* Post ASIC reset for all devs .*/
>>    	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>> +		int step = 1;
>>    
>> +		if (amdgpu_gpu_recovery == 2) {
>> +			if (amdgpu_affinity_group_has_only_or_null_working_ring(adev,&job->base)
>> +				|| tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)) {
>> +				DRM_INFO("Skip Stage0 Resubmit Stage\n");
>> +				/* set guilty */
>> +				drm_sched_increase_karma(&job->base);
>> +				step = 1;
>> +			} else {
>> +				DRM_INFO("Do Stage0 Resubmit Stage\n");
>> +				step = 0;
>> +			}
>> +		}
>> +
>> +retry_resubmit:
>>    		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>    			struct amdgpu_ring *ring = tmp_adev->rings[i];
>> +			int ret = 0;
>> +			struct drm_sched_job *s_bad_job = NULL;
>>    
>>    			if (!ring || !ring->sched.thread)
>>    				continue;
>>    
>>    			/* No point to resubmit jobs if we didn't HW reset*/
>> -			if (!tmp_adev->asic_reset_res && !job_signaled)
>> +			if (!tmp_adev->asic_reset_res && !job_signaled) {
>> +
>>    				drm_sched_resubmit_jobs(&ring->sched);
>>    
>> -			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
>> +				if (amdgpu_gpu_recovery == 2 && step == 0) {
>> +					ret = amdgpu_wait_resubmitted_jobs_completion(&ring->sched, ring->sched.timeout, &s_bad_job);
>> +					if (ret == -1) {
>> +						DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n", ring->sched.name, s_bad_job->id);
>> +						/* set guilty */
>> +						drm_sched_increase_karma(s_bad_job);
>> +
>> +						/* do hw reset */
>> +						if (amdgpu_sriov_vf(adev)) {
>> +							amdgpu_virt_fini_data_exchange(adev);
>> +							r = amdgpu_device_reset_sriov(adev, false);
>> +							if (r)
>> +								adev->asic_reset_res = r;
>> +						} else {
>> +							r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
>> +							if (r && r == -EAGAIN)
>> +								goto retry;
>> +						}
>> +
>> +						/* add reset counter so that the following resubmitted job could flush vmid */
>> +						atomic_inc(&tmp_adev->gpu_reset_counter);
>> +						step = 1;
>> +						goto retry_resubmit;
>> +					}
>> +				}
>> +			}
>> +
>> +			if (step == 1)
>> +				drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
>>    		}
>>    
>>    		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> index 865f924772b0..9c3f4edb7532 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> @@ -509,7 +509,7 @@ module_param_named(compute_multipipe, amdgpu_compute_multipipe, int, 0444);
>>     * DOC: gpu_recovery (int)
>>     * Set to enable GPU recovery mechanism (1 = enable, 0 = disable). The default is -1 (auto, disabled except SRIOV).
>>     */
>> -MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (1 =
>> enable, 0 = disable, -1 = auto)");
>> +MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (2 =
>> +advanced tdr mode, 1 = enable, 0 = disable, -1 = auto)");
>>    module_param_named(gpu_recovery, amdgpu_gpu_recovery, int, 0444);
>>    
>>    /**
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> index 759b34799221..28cda321157a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> @@ -281,6 +281,53 @@ void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched)
>>    	}
>>    }
>>    
>> +int amdgpu_wait_resubmitted_jobs_completion(struct drm_gpu_scheduler
>> +*sched, long timeout, struct drm_sched_job **s_bad_job) {
>> +	struct drm_sched_job *s_job, *tmp;
>> +	int ret = 0;
>> +
>> +	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
>> +		struct drm_sched_fence *s_fence = s_job->s_fence;
>> +
>> +			if (s_fence->parent == NULL) { /* fail to get a hw fence */
>> +				/* process a job */
>> +				atomic_dec(&sched->num_jobs);
>> +				dma_fence_get(&s_fence->finished);
>> +				dma_fence_signal(&s_fence->finished);
>> +				dma_fence_put(&s_fence->finished);
>> +
>> +				/* remove node from mirror_list and free the job */
>> +				spin_lock(&sched->job_list_lock);
>> +				list_del_init(&s_job->node);
>> +				spin_unlock(&sched->job_list_lock);
>> +				sched->ops->free_job(s_job);
>> +				continue;
>> +			}
>> +
>> +			ret = dma_fence_wait_timeout(s_fence->parent, false, timeout);
>> +
>> +			if (ret > 0) { /* succeed */
>> +				/* process a job */
>> +				atomic_dec(&sched->num_jobs);
>> +				dma_fence_get(&s_fence->finished);
>> +				dma_fence_signal(&s_fence->finished);
>> +				dma_fence_put(&s_fence->finished);
>> +
>> +				/* remove node from mirror_list and free the job */
>> +				spin_lock(&sched->job_list_lock);
>> +				list_del_init(&s_job->node);
>> +				spin_unlock(&sched->job_list_lock);
>> +				sched->ops->free_job(s_job);
>> +				continue;
>> +			} else if (ret == 0) {
>> +				*s_bad_job = s_job;
>> +				return -1; /* timeout */
>> +			}
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>>    const struct drm_sched_backend_ops amdgpu_sched_ops = {
>>    	.dependency = amdgpu_job_dependency,
>>    	.run_job = amdgpu_job_run,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> index 81caac9b958a..25292f4699fb 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> @@ -76,5 +76,5 @@ int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring,
>>    			     struct dma_fence **fence);
>>    
>>    void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler
>> *sched);
>> -
>> +int amdgpu_wait_resubmitted_jobs_completion(struct drm_gpu_scheduler
>> +*sched, long timeout, struct drm_sched_job **s_bad_job);
>>    #endif
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> index b644c78475fd..cb50bfc80bc9 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> @@ -35,6 +35,11 @@
>>    #include "amdgpu.h"
>>    #include "atom.h"
>>    
>> +static char *amdgpu_affinity_group[] = { "gfx", "comp"
>> +};
>> +
>>    /*
>>     * Rings
>>     * Most engines on the GPU are fed via ring buffers.  Ring @@ -189,6
>> +194,7 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct amdgpu_ring *ring,
>>    		ring->adev = adev;
>>    		ring->idx = adev->num_rings++;
>>    		adev->rings[ring->idx] = ring;
>> +		amdgpu_ring_set_affinity_group(ring);
>>    		r = amdgpu_fence_driver_init_ring(ring, sched_hw_submission);
>>    		if (r)
>>    			return r;
>> @@ -459,3 +465,24 @@ int amdgpu_ring_test_helper(struct amdgpu_ring *ring)
>>    	ring->sched.ready = !r;
>>    	return r;
>>    }
>> +
>> +int amdgpu_ring_set_affinity_group(struct amdgpu_ring *ring) {
>> +       struct amdgpu_device *adev = ring->adev;
>> +       int i;
>> +
>> +       for (i = 0; i < ARRAY_SIZE(amdgpu_affinity_group); i++) {
>> +               char *temp_name = amdgpu_affinity_group[i];
>> +
>> +               /* set ring's affinity_group bit if find it in affinity_group list */
>> +               if (strncmp(ring->name, temp_name, strlen(temp_name)) == 0) {
>> +                       DRM_DEV_INFO(adev->dev, "set ring:%s in affinity_group\n",
>> +                             ring->name);
>> +                       ring->sched.affinity_group = 1;
>> +                       return 0;
>> +               }
>> +       }
>> +
>> +       ring->sched.affinity_group = 0;
>> +       return 0;
>> +}
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> index 56acec1075ac..6b0d217e6f5a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> @@ -350,4 +350,5 @@ int amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
>>    			     struct amdgpu_ring *ring);
>>    void amdgpu_debugfs_ring_fini(struct amdgpu_ring *ring);
>>    
>> +int amdgpu_ring_set_affinity_group(struct amdgpu_ring *ring);
>>    #endif
>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>> index 1c815e0a14ed..589cbaea35dc 100644
>> --- a/include/drm/gpu_scheduler.h
>> +++ b/include/drm/gpu_scheduler.h
>> @@ -301,6 +301,7 @@ struct drm_gpu_scheduler {
>>    	atomic_t                        _score;
>>    	bool				ready;
>>    	bool				free_guilty;
>> +	int				affinity_group;
>>    };
>>    
>>    int drm_sched_init(struct drm_gpu_scheduler *sched,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: [PATCH v2] drm/amd/amdgpu implement tdr advanced mode
  2021-03-08  7:53     ` Christian König
@ 2021-03-08 12:43       ` Zhang, Jack (Jian)
  2021-03-08 12:45         ` Christian König
  2021-03-09  7:29       ` Liu, Monk
  1 sibling, 1 reply; 13+ messages in thread
From: Zhang, Jack (Jian) @ 2021-03-08 12:43 UTC (permalink / raw)
  To: Koenig, Christian, Liu, Monk, amd-gfx, Grodzovsky, Andrey, Deng, Emily

[AMD Public Use]

Hi, Christian,

I made some changes in the V3 patch that insert a dma_fence_wait for the first jobs after the jobs are resubmitted.
It seems simpler than the V2 patch. Is this what you originally had in mind?
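
Roughly, the idea is the following (only a sketch of the approach for
discussion here, not the actual V3 diff):

	/* after the ASIC reset, resubmit all jobs as before */
	drm_sched_resubmit_jobs(&ring->sched);

	if (amdgpu_gpu_recovery == 2) {
		/* then synchronously wait on the first resubmitted job's hw
		 * fence; if it times out, that job is the real bad one
		 */
		struct drm_sched_job *s_job =
			list_first_entry_or_null(&ring->sched.ring_mirror_list,
						 struct drm_sched_job, node);

		if (s_job && s_job->s_fence->parent &&
		    dma_fence_wait_timeout(s_job->s_fence->parent, false,
					   ring->sched.timeout) == 0)
			drm_sched_increase_karma(s_job);
	}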

Thanks,
Jack

-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com> 
Sent: Monday, March 8, 2021 3:53 PM
To: Liu, Monk <Monk.Liu@amd.com>; Zhang, Jack (Jian) <Jack.Zhang1@amd.com>; amd-gfx@lists.freedesktop.org; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Deng, Emily <Emily.Deng@amd.com>
Subject: Re: [PATCH v2] drm/amd/amdgpu implement tdr advanced mode



On 08.03.21 at 05:06, Liu, Monk wrote:
> [AMD Official Use Only - Internal Distribution Only]
>
>>> well first of all please completely drop the affinity group stuff from this patch. We should concentrate on one feature at a time.
> We need it to expedite the process; we can introduce this change in
> another patch.
>
>
>>> Then the implementation is way too complicated. All you need to do is insert a dma_fence_wait after re-scheduling each job after a reset.
> No, that's not true. During "drm_sched_resubmit_jobs" all jobs on the
> mirror list are put into the hw ring, but we can only allow the first job
> into the ring to catch the real guilty one (otherwise it is possible that a later job in the ring also has a bug and affects our judgement). So we need to implement a new "drm_sched_resubmit_jobs2()", like this:

Something like this. But since waiting for the guilty job is AMD specific we should rather rework the stuff from the beginning.

What I have in mind is the following:
1. Add a reference from the scheduler fence back to the job which is cleared only when the scheduler fence finishes.
2. Completely drop the ring_mirror_list and replace it with a kfifo of pointers to the active scheduler fences.
3. Replace drm_sched_resubmit_jobs with a drm_sched_for_each_active() macro which allows drivers to iterate over all the active jobs and resubmit/wait/mark them as guilty etc etc..
4. Remove the guilty/karma handling from the scheduler. This is something AMD specific and shouldn't leak into common code.

Regards,
Christian.

>
> drm_sched_resubmit_jobs2()
> ~ 499 void drm_sched_resubmit_jobs2(struct drm_gpu_scheduler *sched, int max)
>    500 {
>    501     struct drm_sched_job *s_job, *tmp;
>    502     uint64_t guilty_context;
>    503     bool found_guilty = false;
>    504     struct dma_fence *fence;
> + 505     int i = 0;
>    506
>    507     list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
>    508         struct drm_sched_fence *s_fence = s_job->s_fence;
>    509
> + 510         if (i >= max)
> + 511             break;
> + 512
>    513         if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
>    514             found_guilty = true;
>    515             guilty_context = s_job->s_fence->scheduled.context;
>    516         }
>    517
>    518         if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
>    519             dma_fence_set_error(&s_fence->finished, -ECANCELED);
>    520
>    521         dma_fence_put(s_job->s_fence->parent);
>    522         fence = sched->ops->run_job(s_job);
> + 523         i++;
>    524
>    525         if (IS_ERR_OR_NULL(fence)) {
>    526             if (IS_ERR(fence))
>    527                 dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
>    528
>    529             s_job->s_fence->parent = NULL;
>    530         } else {
>    531             s_job->s_fence->parent = fence;
>    532         }
>    533
>    534
>    535     }
>    536 }
>    537 EXPORT_SYMBOL(drm_sched_resubmit_jobs);
>    538
>
>
>
> Thanks
>
> ------------------------------------------
> Monk Liu | Cloud-GPU Core team
> ------------------------------------------
>
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Sunday, March 7, 2021 3:03 AM
> To: Zhang, Jack (Jian) <Jack.Zhang1@amd.com>; 
> amd-gfx@lists.freedesktop.org; Grodzovsky, Andrey 
> <Andrey.Grodzovsky@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Deng, Emily 
> <Emily.Deng@amd.com>
> Subject: Re: [PATCH v2] drm/amd/amdgpu implement tdr advanced mode
>
> Hi Jack,
>
> well first of all please completely drop the affinity group stuff from this patch. We should concentrate on one feature at a time.
>
> Then the implementation is way too complicated. All you need to do is insert a dma_fence_wait after re-scheduling each job after a reset.
>
> In addition to that, this feature is completely AMD specific and shouldn't affect the common scheduler in any way.
>
> Regards,
> Christian.
>
> On 06.03.21 at 18:25, Jack Zhang wrote:
>> [Why]
>> Previous tdr design treats the first job in job_timeout as the bad job.
>> But sometimes a later bad compute job can block a good gfx job and 
>> cause an unexpected gfx job timeout because gfx and compute ring 
>> share internal GC HW mutually.
>>
>> [How]
>> This patch implements an advanced tdr mode.It involves an additinal 
>> synchronous pre-resubmit step(Step0 Resubmit) before normal resubmit 
>> step in order to find the real bad job.
>>
>> 1. For Bailing TDR job, re-insert it to mirror_list, don't set it to 
>> guilty and leave it to be handled by the main reset thread.
>>
>> 2. Don't set the job to guilty in pre_asic_reset, and leave it to be 
>> handled by Step0 Resubmit Stage.
>>
>> 3. At Step0 Resubmit stage, it first resubmit jobs asynchronously, 
>> then it iterate each ring mirror_list, synchronously pend for each hw 
>> fence being signaled. If the a job's hw fence get timeout, we 
>> identify it as guilty and do hw reset to recover hw. After that, we 
>> would do the normal resubmit step to resubmit left jobs.
>>
>> 4. For whole gpu reset(vram lost), skip Step0 Resubmit as each job 
>> after vram lost was considered as bad job.
>>
>> 5. Involve the concept "Affinity Group".
>> Doing two hw resets is not necessary when there's only one ring that 
>> has jobs among some hw-related rings.Thus, we involve "affinity group".
>> Hw-related rings could be added into a common affinity group, such as 
>> gfx and compute ring. When tdr happens, we iterate all rings in 
>> affinity group, skip Step0 Resubmit stage if there's only one ring's 
>> mirror_list that has valid sched jobs.
>>
>> V2:
>>       -fix a cherry-pick mistake for bailing TDR handling.
>>
>>       -do affinity_group check according to the bad job's sched rather
>>        than the default "1" so that there could be multiple affinity
>>        groups being pre-defined in future.
>>
>> Signed-off-by: Jack Zhang <Jack.Zhang1@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 102 +++++++++++++++++++--
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |   2 +-
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  47 ++++++++++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_job.h    |   2 +-
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c   |  27 ++++++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |   1 +
>>    include/drm/gpu_scheduler.h                |   1 +
>>    7 files changed, 173 insertions(+), 9 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index e247c3a2ec08..8632d7071292 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -4188,6 +4188,37 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
>>    	return false;
>>    }
>>    
>> +bool amdgpu_affinity_group_has_only_or_null_working_ring(struct
>> +amdgpu_device *adev, struct drm_sched_job *s_job) {
>> +       int i;
>> +       int working_ring_num = 0;
>> +
>> +	/*
>> +	 * The job is considered as the real bad one
>> +	 * if job's sched is not in affinity group
>> +	 */
>> +	if (s_job->sched.affinity_group == 0)
>> +			return true;
>> +
>> +       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> +               struct amdgpu_ring *ring = adev->rings[i];
>> +
>> +               if (!ring || !ring->sched.thread)
>> +                       continue;
>> +
>> +               /* for non-empty affinity ring, increase working_ring_num */
>> +               if (ring->sched.affinity_group == s_job->sched.affinity_group) {
>> +                       if (!list_empty(&ring->sched.ring_mirror_list))
>> +                               working_ring_num++;
>> +               }
>> +       }
>> +
>> +       if (working_ring_num > 1) {
>> +               return false;
>> +       }
>> +       return true;
>> +}
>> +
>>    /**
>>     * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
>>     *
>> @@ -4310,8 +4341,10 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>    		amdgpu_fence_driver_force_completion(ring);
>>    	}
>>    
>> -	if(job)
>> -		drm_sched_increase_karma(&job->base);
>> +	if (amdgpu_gpu_recovery != 2) {
>> +		if (job)
>> +			drm_sched_increase_karma(&job->base);
>> +	}
>>    
>>    	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
>>    	if (!amdgpu_sriov_vf(adev)) {
>> @@ -4639,7 +4672,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>    	int i, r = 0;
>>    	bool need_emergency_restart = false;
>>    	bool audio_suspended = false;
>> -
>> +	int	tmp_vram_lost_counter;
>>    	/*
>>    	 * Special case: RAS triggered and full reset isn't supported
>>    	 */
>> @@ -4690,8 +4723,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>    					job ? job->base.id : -1);
>>    
>>    		/* even we skipped this reset, still need to set the job to guilty */
>> -		if (job)
>> -			drm_sched_increase_karma(&job->base);
>> +		if (job) {
>> +			if (amdgpu_gpu_recovery == 2) {
>> +				if (&job->base) {
>> +					spin_lock(&job->base.sched->job_list_lock);
>> +					list_add(&job->base.node, &job->base.sched->ring_mirror_list);
>> +					spin_unlock(&job->base.sched->job_list_lock);
>> +				}
>> +			} else
>> +				drm_sched_increase_karma(&job->base);
>> +		}
>>    		goto skip_recovery;
>>    	}
>>    
>> @@ -4788,6 +4829,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>    		}
>>    	}
>>    
>> +	tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
>>    	/* Actual ASIC resets if needed.*/
>>    	/* TODO Implement XGMI hive reset logic for SRIOV */
>>    	if (amdgpu_sriov_vf(adev)) {
>> @@ -4804,18 +4846,64 @@ int amdgpu_device_gpu_recover(struct 
>> amdgpu_device *adev,
>>    
>>    	/* Post ASIC reset for all devs .*/
>>    	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) 
>> {
>> +		int step = 1;
>>    
>> +		if (amdgpu_gpu_recovery == 2) {
>> +			if (amdgpu_affinity_group_has_only_or_null_working_ring(adev,&job->base)
>> +				|| tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)) {
>> +				DRM_INFO("Skip Stage0 Resubmit Stage\n");
>> +				/* set guilty */
>> +				drm_sched_increase_karma(&job->base);
>> +				step = 1;
>> +			} else {
>> +				DRM_INFO("Do Stage0 Resubmit Stage\n");
>> +				step = 0;
>> +			}
>> +		}
>> +
>> +retry_resubmit:
>>    		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>    			struct amdgpu_ring *ring = tmp_adev->rings[i];
>> +			int ret = 0;
>> +			struct drm_sched_job *s_bad_job = NULL;
>>    
>>    			if (!ring || !ring->sched.thread)
>>    				continue;
>>    
>>    			/* No point to resubmit jobs if we didn't HW reset*/
>> -			if (!tmp_adev->asic_reset_res && !job_signaled)
>> +			if (!tmp_adev->asic_reset_res && !job_signaled) {
>> +
>>    				drm_sched_resubmit_jobs(&ring->sched);
>>    
>> -			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
>> +				if (amdgpu_gpu_recovery == 2 && step == 0) {
>> +					ret = amdgpu_wait_resubmitted_jobs_completion(&ring->sched, ring->sched.timeout, &s_bad_job);
>> +					if (ret == -1) {
>> +						DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n", ring->sched.name, s_bad_job->id);
>> +						/* set guilty */
>> +						drm_sched_increase_karma(s_bad_job);
>> +
>> +						/* do hw reset */
>> +						if (amdgpu_sriov_vf(adev)) {
>> +							amdgpu_virt_fini_data_exchange(adev);
>> +							r = amdgpu_device_reset_sriov(adev, false);
>> +							if (r)
>> +								adev->asic_reset_res = r;
>> +						} else {
>> +							r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
>> +							if (r && r == -EAGAIN)
>> +								goto retry;
>> +						}
>> +
>> +						/* add reset counter so that the following resubmitted job could flush vmid */
>> +						atomic_inc(&tmp_adev->gpu_reset_counter);
>> +						step = 1;
>> +						goto retry_resubmit;
>> +					}
>> +				}
>> +			}
>> +
>> +			if (step == 1)
>> +				drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
>>    		}
>>    
>>    		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) { 
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> index 865f924772b0..9c3f4edb7532 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> @@ -509,7 +509,7 @@ module_param_named(compute_multipipe, amdgpu_compute_multipipe, int, 0444);
>>     * DOC: gpu_recovery (int)
>>     * Set to enable GPU recovery mechanism (1 = enable, 0 = disable). The default is -1 (auto, disabled except SRIOV).
>>     */
>> -MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (1 = 
>> enable, 0 = disable, -1 = auto)");
>> +MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (2 = 
>> +advanced tdr mode, 1 = enable, 0 = disable, -1 = auto)");
>>    module_param_named(gpu_recovery, amdgpu_gpu_recovery, int, 0444);
>>    
>>    /**
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> index 759b34799221..28cda321157a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> @@ -281,6 +281,53 @@ void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched)
>>    	}
>>    }
>>    
>> +int amdgpu_wait_resubmitted_jobs_completion(struct drm_gpu_scheduler 
>> +*sched, long timeout, struct drm_sched_job **s_bad_job) {
>> +	struct drm_sched_job *s_job, *tmp;
>> +	int ret = 0;
>> +
>> +	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
>> +		struct drm_sched_fence *s_fence = s_job->s_fence;
>> +
>> +			if (s_fence->parent == NULL) { /* fail to get a hw fence */
>> +				/* process a job */
>> +				atomic_dec(&sched->num_jobs);
>> +				dma_fence_get(&s_fence->finished);
>> +				dma_fence_signal(&s_fence->finished);
>> +				dma_fence_put(&s_fence->finished);
>> +
>> +				/* remove node from mirror_list and free the job */
>> +				spin_lock(&sched->job_list_lock);
>> +				list_del_init(&s_job->node);
>> +				spin_unlock(&sched->job_list_lock);
>> +				sched->ops->free_job(s_job);
>> +				continue;
>> +			}
>> +
>> +			ret = dma_fence_wait_timeout(s_fence->parent, false, timeout);
>> +
>> +			if (ret > 0) { /* succeed */
>> +				/* process a job */
>> +				atomic_dec(&sched->num_jobs);
>> +				dma_fence_get(&s_fence->finished);
>> +				dma_fence_signal(&s_fence->finished);
>> +				dma_fence_put(&s_fence->finished);
>> +
>> +				/* remove node from mirror_list and free the job */
>> +				spin_lock(&sched->job_list_lock);
>> +				list_del_init(&s_job->node);
>> +				spin_unlock(&sched->job_list_lock);
>> +				sched->ops->free_job(s_job);
>> +				continue;
>> +			} else if (ret == 0) {
>> +				*s_bad_job = s_job;
>> +				return -1; /* timeout */
>> +			}
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>>    const struct drm_sched_backend_ops amdgpu_sched_ops = {
>>    	.dependency = amdgpu_job_dependency,
>>    	.run_job = amdgpu_job_run,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> index 81caac9b958a..25292f4699fb 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> @@ -76,5 +76,5 @@ int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring,
>>    			     struct dma_fence **fence);
>>    
>>    void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler 
>> *sched);
>> -
>> +int amdgpu_wait_resubmitted_jobs_completion(struct drm_gpu_scheduler 
>> +*sched, long timeout, struct drm_sched_job **s_bad_job);
>>    #endif
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> index b644c78475fd..cb50bfc80bc9 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> @@ -35,6 +35,11 @@
>>    #include "amdgpu.h"
>>    #include "atom.h"
>>    
>> +static char *amdgpu_affinity_group[] = { "gfx", "comp"
>> +};
>> +
>>    /*
>>     * Rings
>>     * Most engines on the GPU are fed via ring buffers.  Ring @@ 
>> -189,6
>> +194,7 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct 
>> +amdgpu_ring *ring,
>>    		ring->adev = adev;
>>    		ring->idx = adev->num_rings++;
>>    		adev->rings[ring->idx] = ring;
>> +		amdgpu_ring_set_affinity_group(ring);
>>    		r = amdgpu_fence_driver_init_ring(ring, sched_hw_submission);
>>    		if (r)
>>    			return r;
>> @@ -459,3 +465,24 @@ int amdgpu_ring_test_helper(struct amdgpu_ring *ring)
>>    	ring->sched.ready = !r;
>>    	return r;
>>    }
>> +
>> +int amdgpu_ring_set_affinity_group(struct amdgpu_ring *ring) {
>> +       struct amdgpu_device *adev = ring->adev;
>> +       int i;
>> +
>> +       for (i = 0; i < ARRAY_SIZE(amdgpu_affinity_group); i++) {
>> +               char *temp_name = amdgpu_affinity_group[i];
>> +
>> +               /* set ring's affinity_group bit if find it in affinity_group list */
>> +               if (strncmp(ring->name, temp_name, strlen(temp_name)) == 0) {
>> +                       DRM_DEV_INFO(adev->dev, "set ring:%s in affinity_group\n",
>> +                             ring->name);
>> +                       ring->sched.affinity_group = 1;
>> +                       return 0;
>> +               }
>> +       }
>> +
>> +       ring->sched.affinity_group = 0;
>> +       return 0;
>> +}
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> index 56acec1075ac..6b0d217e6f5a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> @@ -350,4 +350,5 @@ int amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
>>    			     struct amdgpu_ring *ring);
>>    void amdgpu_debugfs_ring_fini(struct amdgpu_ring *ring);
>>    
>> +int amdgpu_ring_set_affinity_group(struct amdgpu_ring *ring);
>>    #endif
>> diff --git a/include/drm/gpu_scheduler.h 
>> b/include/drm/gpu_scheduler.h index 1c815e0a14ed..589cbaea35dc 100644
>> --- a/include/drm/gpu_scheduler.h
>> +++ b/include/drm/gpu_scheduler.h
>> @@ -301,6 +301,7 @@ struct drm_gpu_scheduler {
>>    	atomic_t                        _score;
>>    	bool				ready;
>>    	bool				free_guilty;
>> +	int				affinity_group;
>>    };
>>    
>>    int drm_sched_init(struct drm_gpu_scheduler *sched,
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] drm/amd/amdgpu implement tdr advanced mode
  2021-03-08 12:43       ` Zhang, Jack (Jian)
@ 2021-03-08 12:45         ` Christian König
  2021-03-09  7:03           ` Zhang, Jack (Jian)
  0 siblings, 1 reply; 13+ messages in thread
From: Christian König @ 2021-03-08 12:45 UTC (permalink / raw)
  To: Zhang, Jack (Jian), Liu, Monk, amd-gfx, Grodzovsky, Andrey, Deng, Emily

Hi Jack,

yes that comes pretty close. I'm going over the patch right now.

Some things still look a bit complicated to me, but I need to wrap my 
head around how and why we are doing it this way once more.

Christian.

On 08.03.21 at 13:43, Zhang, Jack (Jian) wrote:
> [AMD Public Use]
>
> Hi, Christian,
>
> I made some changes in the V3 patch that insert a dma_fence_wait for the first jobs after the jobs are resubmitted.
> It seems simpler than the V2 patch. Is this what you originally had in mind?
>
> Thanks,
> Jack
>
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Monday, March 8, 2021 3:53 PM
> To: Liu, Monk <Monk.Liu@amd.com>; Zhang, Jack (Jian) <Jack.Zhang1@amd.com>; amd-gfx@lists.freedesktop.org; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Deng, Emily <Emily.Deng@amd.com>
> Subject: Re: [PATCH v2] drm/amd/amdgpu implement tdr advanced mode
>
>
>
> On 08.03.21 at 05:06, Liu, Monk wrote:
>> [AMD Official Use Only - Internal Distribution Only]
>>
>>>> well first of all please completely drop the affinity group stuff from this patch. We should concentrate on one feature at a time.
>> We need it to expedite the process; we can introduce this change in
>> another patch.
>>
>>
>>>> Then the implementation is way too complicated. All you need to do is insert a dma_fence_wait after re-scheduling each job after a reset.
>> No, that's not true. During "drm_sched_resubmit_jobs" all jobs on the
>> mirror list are put into the hw ring, but we can only allow the first job
>> into the ring to catch the real guilty one (otherwise it is possible that a later job in the ring also has a bug and affects our judgement). So we need to implement a new "drm_sched_resubmit_jobs2()", like this:
> Something like this. But since waiting for the guilty job is AMD specific we should rather rework the stuff from the beginning.
>
> What I have in mind is the following:
> 1. Add a reference from the scheduler fence back to the job which is cleared only when the scheduler fence finishes.
> 2. Completely drop the ring_mirror_list and replace it with a kfifo of pointers to the active scheduler fences.
> 3. Replace drm_sched_resubmit_jobs with a drm_sched_for_each_active() macro which allows drivers to iterate over all the active jobs and resubmit/wait/mark them as guilty etc etc..
> 4. Remove the guilty/karma handling from the scheduler. This is something AMD specific and shouldn't leak into common code.
>
> Regards,
> Christian.
>
>> drm_sched_resubmit_jobs2()
>> ~ 499 void drm_sched_resubmit_jobs2(struct drm_gpu_scheduler *sched, int max)
>>     500 {
>>     501     struct drm_sched_job *s_job, *tmp;
>>     502     uint64_t guilty_context;
>>     503     bool found_guilty = false;
>>     504     struct dma_fence *fence;
>> + 505     int i = 0;
>>     506
>>     507     list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
>>     508         struct drm_sched_fence *s_fence = s_job->s_fence;
>>     509
>> + 510         if (i >= max)
>> + 511             break;
>> + 512
>>     513         if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
>>     514             found_guilty = true;
>>     515             guilty_context = s_job->s_fence->scheduled.context;
>>     516         }
>>     517
>>     518         if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
>>     519             dma_fence_set_error(&s_fence->finished, -ECANCELED);
>>     520
>>     521         dma_fence_put(s_job->s_fence->parent);
>>     522         fence = sched->ops->run_job(s_job);
>> + 523         i++;
>>     524
>>     525         if (IS_ERR_OR_NULL(fence)) {
>>     526             if (IS_ERR(fence))
>>     527                 dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
>>     528
>>     529             s_job->s_fence->parent = NULL;
>>     530         } else {
>>     531             s_job->s_fence->parent = fence;
>>     532         }
>>     533
>>     534
>>     535     }
>>     536 }
>>     537 EXPORT_SYMBOL(drm_sched_resubmit_jobs);
>>     538
>>
>>
>>
>> Thanks
>>
>> ------------------------------------------
>> Monk Liu | Cloud-GPU Core team
>> ------------------------------------------
>>
>> -----Original Message-----
>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> Sent: Sunday, March 7, 2021 3:03 AM
>> To: Zhang, Jack (Jian) <Jack.Zhang1@amd.com>;
>> amd-gfx@lists.freedesktop.org; Grodzovsky, Andrey
>> <Andrey.Grodzovsky@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Deng, Emily
>> <Emily.Deng@amd.com>
>> Subject: Re: [PATCH v2] drm/amd/amdgpu implement tdr advanced mode
>>
>> Hi Jack,
>>
>> well first of all please completely drop the affinity group stuff from this patch. We should concentrate on one feature at a time.
>>
>> Then the implementation is way too complicated. All you need to do is insert a dma_fence_wait after re-scheduling each job after a reset.
>>
>> In addition to that, this feature is completely AMD specific and shouldn't affect the common scheduler in any way.
>>
>> Regards,
>> Christian.
>>
>> On 06.03.21 at 18:25, Jack Zhang wrote:
>>> [Why]
>>> Previous tdr design treats the first job in job_timeout as the bad job.
>>> But sometimes a later bad compute job can block a good gfx job and
>>> cause an unexpected gfx job timeout because gfx and compute ring
>>> share internal GC HW mutually.
>>>
>>> [How]
>>> This patch implements an advanced tdr mode.It involves an additinal
>>> synchronous pre-resubmit step(Step0 Resubmit) before normal resubmit
>>> step in order to find the real bad job.
>>>
>>> 1. For Bailing TDR job, re-insert it to mirror_list, don't set it to
>>> guilty and leave it to be handled by the main reset thread.
>>>
>>> 2. Don't set the job to guilty in pre_asic_reset, and leave it to be
>>> handled by Step0 Resubmit Stage.
>>>
>>> 3. At Step0 Resubmit stage, it first resubmit jobs asynchronously,
>>> then it iterate each ring mirror_list, synchronously pend for each hw
>>> fence being signaled. If the a job's hw fence get timeout, we
>>> identify it as guilty and do hw reset to recover hw. After that, we
>>> would do the normal resubmit step to resubmit left jobs.
>>>
>>> 4. For whole gpu reset(vram lost), skip Step0 Resubmit as each job
>>> after vram lost was considered as bad job.
>>>
>>> 5. Involve the concept "Affinity Group".
>>> Doing two hw resets is not necessary when there's only one ring that
>>> has jobs among some hw-related rings.Thus, we involve "affinity group".
>>> Hw-related rings could be added into a common affinity group, such as
>>> gfx and compute ring. When tdr happens, we iterate all rings in
>>> affinity group, skip Step0 Resubmit stage if there's only one ring's
>>> mirror_list that has valid sched jobs.
>>>
>>> V2:
>>>        -fix a cherry-pick mistake for bailing TDR handling.
>>>
>>>        -do affinity_group check according to the bad job's sched rather
>>>         than the default "1" so that there could be multiple affinity
>>>         groups being pre-defined in future.
>>>
>>> Signed-off-by: Jack Zhang <Jack.Zhang1@amd.com>
>>> ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 102 +++++++++++++++++++--
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |   2 +-
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  47 ++++++++++
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_job.h    |   2 +-
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c   |  27 ++++++
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |   1 +
>>>     include/drm/gpu_scheduler.h                |   1 +
>>>     7 files changed, 173 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index e247c3a2ec08..8632d7071292 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -4188,6 +4188,37 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
>>>     	return false;
>>>     }
>>>     
>>> +bool amdgpu_affinity_group_has_only_or_null_working_ring(struct
>>> +amdgpu_device *adev, struct drm_sched_job *s_job) {
>>> +       int i;
>>> +       int working_ring_num = 0;
>>> +
>>> +	/*
>>> +	 * The job is considered as the real bad one
>>> +	 * if job's sched is not in affinity group
>>> +	 */
>>> +	if (s_job->sched.affinity_group == 0)
>>> +			return true;
>>> +
>>> +       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>> +               struct amdgpu_ring *ring = adev->rings[i];
>>> +
>>> +               if (!ring || !ring->sched.thread)
>>> +                       continue;
>>> +
>>> +               /* for non-empty affinity ring, increase working_ring_num */
>>> +               if (ring->sched.affinity_group == s_job->sched.affinity_group) {
>>> +                       if (!list_empty(&ring->sched.ring_mirror_list))
>>> +                               working_ring_num++;
>>> +               }
>>> +       }
>>> +
>>> +       if (working_ring_num > 1) {
>>> +               return false;
>>> +       }
>>> +       return true;
>>> +}
>>> +
>>>     /**
>>>      * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
>>>      *
>>> @@ -4310,8 +4341,10 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>     		amdgpu_fence_driver_force_completion(ring);
>>>     	}
>>>     
>>> -	if(job)
>>> -		drm_sched_increase_karma(&job->base);
>>> +	if (amdgpu_gpu_recovery != 2) {
>>> +		if (job)
>>> +			drm_sched_increase_karma(&job->base);
>>> +	}
>>>     
>>>     	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
>>>     	if (!amdgpu_sriov_vf(adev)) {
>>> @@ -4639,7 +4672,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>     	int i, r = 0;
>>>     	bool need_emergency_restart = false;
>>>     	bool audio_suspended = false;
>>> -
>>> +	int	tmp_vram_lost_counter;
>>>     	/*
>>>     	 * Special case: RAS triggered and full reset isn't supported
>>>     	 */
>>> @@ -4690,8 +4723,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>     					job ? job->base.id : -1);
>>>     
>>>     		/* even we skipped this reset, still need to set the job to guilty */
>>> -		if (job)
>>> -			drm_sched_increase_karma(&job->base);
>>> +		if (job) {
>>> +			if (amdgpu_gpu_recovery == 2) {
>>> +				if (&job->base) {
>>> +					spin_lock(&job->base.sched->job_list_lock);
>>> +					list_add(&job->base.node, &job->base.sched->ring_mirror_list);
>>> +					spin_unlock(&job->base.sched->job_list_lock);
>>> +				}
>>> +			} else
>>> +				drm_sched_increase_karma(&job->base);
>>> +		}
>>>     		goto skip_recovery;
>>>     	}
>>>     
>>> @@ -4788,6 +4829,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>     		}
>>>     	}
>>>     
>>> +	tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
>>>     	/* Actual ASIC resets if needed.*/
>>>     	/* TODO Implement XGMI hive reset logic for SRIOV */
>>>     	if (amdgpu_sriov_vf(adev)) {
>>> @@ -4804,18 +4846,64 @@ int amdgpu_device_gpu_recover(struct
>>> amdgpu_device *adev,
>>>     
>>>     	/* Post ASIC reset for all devs .*/
>>>     	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head)
>>> {
>>> +		int step = 1;
>>>     
>>> +		if (amdgpu_gpu_recovery == 2) {
>>> +			if (amdgpu_affinity_group_has_only_or_null_working_ring(adev,&job->base)
>>> +				|| tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)) {
>>> +				DRM_INFO("Skip Stage0 Resubmit Stage\n");
>>> +				/* set guilty */
>>> +				drm_sched_increase_karma(&job->base);
>>> +				step = 1;
>>> +			} else {
>>> +				DRM_INFO("Do Stage0 Resubmit Stage\n");
>>> +				step = 0;
>>> +			}
>>> +		}
>>> +
>>> +retry_resubmit:
>>>     		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>>     			struct amdgpu_ring *ring = tmp_adev->rings[i];
>>> +			int ret = 0;
>>> +			struct drm_sched_job *s_bad_job = NULL;
>>>     
>>>     			if (!ring || !ring->sched.thread)
>>>     				continue;
>>>     
>>>     			/* No point to resubmit jobs if we didn't HW reset*/
>>> -			if (!tmp_adev->asic_reset_res && !job_signaled)
>>> +			if (!tmp_adev->asic_reset_res && !job_signaled) {
>>> +
>>>     				drm_sched_resubmit_jobs(&ring->sched);
>>>     
>>> -			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
>>> +				if (amdgpu_gpu_recovery == 2 && step == 0) {
>>> +					ret = amdgpu_wait_resubmitted_jobs_completion(&ring->sched, ring->sched.timeout, &s_bad_job);
>>> +					if (ret == -1) {
>>> +						DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n", ring->sched.name, s_bad_job->id);
>>> +						/* set guilty */
>>> +						drm_sched_increase_karma(s_bad_job);
>>> +
>>> +						/* do hw reset */
>>> +						if (amdgpu_sriov_vf(adev)) {
>>> +							amdgpu_virt_fini_data_exchange(adev);
>>> +							r = amdgpu_device_reset_sriov(adev, false);
>>> +							if (r)
>>> +								adev->asic_reset_res = r;
>>> +						} else {
>>> +							r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
>>> +							if (r && r == -EAGAIN)
>>> +								goto retry;
>>> +						}
>>> +
>>> +						/* add reset counter so that the following resubmitted job could flush vmid */
>>> +						atomic_inc(&tmp_adev->gpu_reset_counter);
>>> +						step = 1;
>>> +						goto retry_resubmit;
>>> +					}
>>> +				}
>>> +			}
>>> +
>>> +			if (step == 1)
>>> +				drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
>>>     		}
>>>     
>>>     		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> index 865f924772b0..9c3f4edb7532 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> @@ -509,7 +509,7 @@ module_param_named(compute_multipipe, amdgpu_compute_multipipe, int, 0444);
>>>      * DOC: gpu_recovery (int)
>>>      * Set to enable GPU recovery mechanism (1 = enable, 0 = disable). The default is -1 (auto, disabled except SRIOV).
>>>      */
>>> -MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (1 =
>>> enable, 0 = disable, -1 = auto)");
>>> +MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (2 =
>>> +advanced tdr mode, 1 = enable, 0 = disable, -1 = auto)");
>>>     module_param_named(gpu_recovery, amdgpu_gpu_recovery, int, 0444);
>>>     
>>>     /**
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> index 759b34799221..28cda321157a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> @@ -281,6 +281,53 @@ void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched)
>>>     	}
>>>     }
>>>     
>>> +int amdgpu_wait_resubmitted_jobs_completion(struct drm_gpu_scheduler
>>> +*sched, long timeout, struct drm_sched_job **s_bad_job) {
>>> +	struct drm_sched_job *s_job, *tmp;
>>> +	int ret = 0;
>>> +
>>> +	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
>>> +		struct drm_sched_fence *s_fence = s_job->s_fence;
>>> +
>>> +			if (s_fence->parent == NULL) { /* fail to get a hw fence */
>>> +				/* process a job */
>>> +				atomic_dec(&sched->num_jobs);
>>> +				dma_fence_get(&s_fence->finished);
>>> +				dma_fence_signal(&s_fence->finished);
>>> +				dma_fence_put(&s_fence->finished);
>>> +
>>> +				/* remove node from mirror_list and free the job */
>>> +				spin_lock(&sched->job_list_lock);
>>> +				list_del_init(&s_job->node);
>>> +				spin_unlock(&sched->job_list_lock);
>>> +				sched->ops->free_job(s_job);
>>> +				continue;
>>> +			}
>>> +
>>> +			ret = dma_fence_wait_timeout(s_fence->parent, false, timeout);
>>> +
>>> +			if (ret > 0) { /* succeed */
>>> +				/* process a job */
>>> +				atomic_dec(&sched->num_jobs);
>>> +				dma_fence_get(&s_fence->finished);
>>> +				dma_fence_signal(&s_fence->finished);
>>> +				dma_fence_put(&s_fence->finished);
>>> +
>>> +				/* remove node from mirror_list and free the job */
>>> +				spin_lock(&sched->job_list_lock);
>>> +				list_del_init(&s_job->node);
>>> +				spin_unlock(&sched->job_list_lock);
>>> +				sched->ops->free_job(s_job);
>>> +				continue;
>>> +			} else if (ret == 0) {
>>> +				*s_bad_job = s_job;
>>> +				return -1; /* timeout */
>>> +			}
>>> +	}
>>> +
>>> +	return 0;
>>> +}
>>> +
>>>     const struct drm_sched_backend_ops amdgpu_sched_ops = {
>>>     	.dependency = amdgpu_job_dependency,
>>>     	.run_job = amdgpu_job_run,
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> index 81caac9b958a..25292f4699fb 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> @@ -76,5 +76,5 @@ int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring,
>>>     			     struct dma_fence **fence);
>>>     
>>>     void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler
>>> *sched);
>>> -
>>> +int amdgpu_wait_resubmitted_jobs_completion(struct drm_gpu_scheduler
>>> +*sched, long timeout, struct drm_sched_job **s_bad_job);
>>>     #endif
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> index b644c78475fd..cb50bfc80bc9 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> @@ -35,6 +35,11 @@
>>>     #include "amdgpu.h"
>>>     #include "atom.h"
>>>     
>>> +static char *amdgpu_affinity_group[] = { "gfx", "comp"
>>> +};
>>> +
>>>     /*
>>>      * Rings
>>>      * Most engines on the GPU are fed via ring buffers.  Ring @@
>>> -189,6
>>> +194,7 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct
>>> +amdgpu_ring *ring,
>>>     		ring->adev = adev;
>>>     		ring->idx = adev->num_rings++;
>>>     		adev->rings[ring->idx] = ring;
>>> +		amdgpu_ring_set_affinity_group(ring);
>>>     		r = amdgpu_fence_driver_init_ring(ring, sched_hw_submission);
>>>     		if (r)
>>>     			return r;
>>> @@ -459,3 +465,24 @@ int amdgpu_ring_test_helper(struct amdgpu_ring *ring)
>>>     	ring->sched.ready = !r;
>>>     	return r;
>>>     }
>>> +
>>> +int amdgpu_ring_set_affinity_group(struct amdgpu_ring *ring) {
>>> +       struct amdgpu_device *adev = ring->adev;
>>> +       int i;
>>> +
>>> +       for (i = 0; i < ARRAY_SIZE(amdgpu_affinity_group); i++) {
>>> +               char *temp_name = amdgpu_affinity_group[i];
>>> +
>>> +               /* set ring's affinity_group bit if find it in affinity_group list */
>>> +               if (strncmp(ring->name, temp_name, strlen(temp_name)) == 0) {
>>> +                       DRM_DEV_INFO(adev->dev, "set ring:%s in affinity_group\n",
>>> +                             ring->name);
>>> +                       ring->sched.affinity_group = 1;
>>> +                       return 0;
>>> +               }
>>> +       }
>>> +
>>> +       ring->sched.affinity_group = 0;
>>> +       return 0;
>>> +}
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> index 56acec1075ac..6b0d217e6f5a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> @@ -350,4 +350,5 @@ int amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
>>>     			     struct amdgpu_ring *ring);
>>>     void amdgpu_debugfs_ring_fini(struct amdgpu_ring *ring);
>>>     
>>> +int amdgpu_ring_set_affinity_group(struct amdgpu_ring *ring);
>>>     #endif
>>> diff --git a/include/drm/gpu_scheduler.h
>>> b/include/drm/gpu_scheduler.h index 1c815e0a14ed..589cbaea35dc 100644
>>> --- a/include/drm/gpu_scheduler.h
>>> +++ b/include/drm/gpu_scheduler.h
>>> @@ -301,6 +301,7 @@ struct drm_gpu_scheduler {
>>>     	atomic_t                        _score;
>>>     	bool				ready;
>>>     	bool				free_guilty;
>>> +	int				affinity_group;
>>>     };
>>>     
>>>     int drm_sched_init(struct drm_gpu_scheduler *sched,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


* RE: [PATCH v2] drm/amd/amdgpu implement tdr advanced mode
  2021-03-08 12:45         ` Christian König
@ 2021-03-09  7:03           ` Zhang, Jack (Jian)
  0 siblings, 0 replies; 13+ messages in thread
From: Zhang, Jack (Jian) @ 2021-03-09  7:03 UTC (permalink / raw)
  To: Koenig, Christian, Liu, Monk, amd-gfx, Grodzovsky, Andrey, Deng, Emily

[AMD Official Use Only - Internal Distribution Only]

Hi, Christian,

Since this change is fairly critical to our project, we would be grateful if you could review it.

Is there anything that is not clear enough that I could help to explain?

Again,
Thanks for your huge help with this problem.

Thanks,
Jack

-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com>
Sent: Monday, March 8, 2021 8:45 PM
To: Zhang, Jack (Jian) <Jack.Zhang1@amd.com>; Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Deng, Emily <Emily.Deng@amd.com>
Subject: Re: [PATCH v2] drm/amd/amdgpu implement tdr advanced mode

Hi Jack,

yes that comes pretty close. I'm going over the patch right now.

Some things still look a bit complicated to me, but I need to wrap my head around how and why we are doing it this way once more.

Christian.

Am 08.03.21 um 13:43 schrieb Zhang, Jack (Jian):
> [AMD Public Use]
>
> Hi, Christian,
>
> I made some changes in the V3 patch that insert a dma_fence_wait for the first job after resubmitting jobs.
> It seems simpler than the V2 patch. Is this what you first had in mind?
>
> Thanks,
> Jack
>
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Monday, March 8, 2021 3:53 PM
> To: Liu, Monk <Monk.Liu@amd.com>; Zhang, Jack (Jian)
> <Jack.Zhang1@amd.com>; amd-gfx@lists.freedesktop.org; Grodzovsky,
> Andrey <Andrey.Grodzovsky@amd.com>; Deng, Emily <Emily.Deng@amd.com>
> Subject: Re: [PATCH v2] drm/amd/amdgpu implement tdr advanced mode
>
>
>
> Am 08.03.21 um 05:06 schrieb Liu, Monk:
>> [AMD Official Use Only - Internal Distribution Only]
>>
>>>> well first of all please completely drop the affinity group stuff from this patch. We should concentrate on one feature at a time.
>> We need it to expedite the process, we can introduce this change in
>> another patch
>>
>>
>>>> Then the implementation is way too complicated. All you need to do is insert a dma_fence_wait after re-scheduling each job after a reset.
>> No, that's not true. During drm_sched_resubmit_jobs it will put
>> all jobs in the mirror list into the hw ring, but we can only allow the
>> first job into the ring to catch the real guilty one (otherwise it is possible that a later job in the ring also has a bug and affects our judgement). So we need to implement a new drm_sched_resubmit_jobs2(), like this:
> Something like this. But since waiting for the guilty job is AMD specific we should rather rework the stuff from the beginning.
>
> What I have in mind is the following:
> 1. Add a reference from the scheduler fence back to the job which is cleared only when the scheduler fence finishes.
> 2. Completely drop the ring_mirror_list and replace it with a kfifo of pointers to the active scheduler fences.
> 3. Replace drm_sched_resubmit_jobs with a drm_sched_for_each_active() macro which allows drivers to iterate over all the active jobs and resubmit/wait/mark them as guilty etc etc..
> 4. Remove the guilty/karma handling from the scheduler. This is something AMD specific and shouldn't leak into common code.
>
> Regards,
> Christian.
>
>> drm_sched_resubmit_jobs2()
>> ~ 499 void drm_sched_resubmit_jobs2(struct drm_gpu_scheduler *sched, int max)
>>     500 {
>>     501     struct drm_sched_job *s_job, *tmp;
>>     502     uint64_t guilty_context;
>>     503     bool found_guilty = false;
>>     504     struct dma_fence *fence;
>> + 505     int i = 0;
>>     506
>>     507     list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
>>     508         struct drm_sched_fence *s_fence = s_job->s_fence;
>>     509
>> + 510         if (i >= max)
>> + 511             break;
>> + 512
>>     513         if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
>>     514             found_guilty = true;
>>     515             guilty_context = s_job->s_fence->scheduled.context;
>>     516         }
>>     517
>>     518         if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
>>     519             dma_fence_set_error(&s_fence->finished, -ECANCELED);
>>     520
>>     521         dma_fence_put(s_job->s_fence->parent);
>>     522         fence = sched->ops->run_job(s_job);
>> + 523         i++;
>>     524
>>     525         if (IS_ERR_OR_NULL(fence)) {
>>     526             if (IS_ERR(fence))
>>     527                 dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
>>     528
>>     529             s_job->s_fence->parent = NULL;
>>     530         } else {
>>     531             s_job->s_fence->parent = fence;
>>     532         }
>>     533
>>     534
>>     535     }
>>     536 }
>>     537 EXPORT_SYMBOL(drm_sched_resubmit_jobs2);
>>     538
>>
>>
>>
>> Thanks
>>
>> ------------------------------------------
>> Monk Liu | Cloud-GPU Core team
>> ------------------------------------------
>>
>> -----Original Message-----
>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> Sent: Sunday, March 7, 2021 3:03 AM
>> To: Zhang, Jack (Jian) <Jack.Zhang1@amd.com>;
>> amd-gfx@lists.freedesktop.org; Grodzovsky, Andrey
>> <Andrey.Grodzovsky@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Deng,
>> Emily <Emily.Deng@amd.com>
>> Subject: Re: [PATCH v2] drm/amd/amdgpu implement tdr advanced mode
>>
>> Hi Jack,
>>
>> well first of all please completely drop the affinity group stuff from this patch. We should concentrate on one feature at a time.
>>
>> Then the implementation is way too complicated. All you need to do is insert a dma_fence_wait after re-scheduling each job after a reset.
>>
>> Additional to that this feature is completely AMD specific and shouldn't affect the common scheduler in any way.
>>
>> Regards,
>> Christian.
>>
>> Am 06.03.21 um 18:25 schrieb Jack Zhang:
>>> [Why]
>>> Previous tdr design treats the first job in job_timeout as the bad job.
>>> But sometimes a later bad compute job can block a good gfx job and
>>> cause an unexpected gfx job timeout because gfx and compute ring
>>> share internal GC HW mutually.
>>>
>>> [How]
>>> This patch implements an advanced tdr mode. It involves an additional
>>> synchronous pre-resubmit step(Step0 Resubmit) before normal resubmit
>>> step in order to find the real bad job.
>>>
>>> 1. For Bailing TDR job, re-insert it to mirror_list, don't set it to
>>> guilty and leave it to be handled by the main reset thread.
>>>
>>> 2. Don't set the job to guilty in pre_asic_reset, and leave it to be
>>> handled by Step0 Resubmit Stage.
>>>
>>> 3. At Step0 Resubmit stage, it first resubmit jobs asynchronously,
>>> then it iterate each ring mirror_list, synchronously pend for each
>>> hw fence being signaled. If the a job's hw fence get timeout, we
>>> identify it as guilty and do hw reset to recover hw. After that, we
>>> would do the normal resubmit step to resubmit left jobs.
>>>
>>> 4. For whole gpu reset(vram lost), skip Step0 Resubmit as each job
>>> after vram lost was considered as bad job.
>>>
>>> 5. Involve the concept "Affinity Group".
>>> Doing two hw resets is not necessary when there's only one ring that
>>> has jobs among some hw-related rings. Thus, we involve "affinity group".
>>> Hw-related rings could be added into a common affinity group, such
>>> as gfx and compute ring. When tdr happens, we iterate all rings in
>>> affinity group, skip Step0 Resubmit stage if there's only one ring's
>>> mirror_list that has valid sched jobs.
>>>
>>> V2:
>>>        -fix a cherry-pick mistake for bailing TDR handling.
>>>
>>>        -do affinity_group check according to the bad job's sched rather
>>>         than the default "1" so that there could be multiple affinity
>>>         groups being pre-defined in future.
>>>
>>> Signed-off-by: Jack Zhang <Jack.Zhang1@amd.com>
>>> ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 102 +++++++++++++++++++--
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |   2 +-
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  47 ++++++++++
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_job.h    |   2 +-
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c   |  27 ++++++
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |   1 +
>>>     include/drm/gpu_scheduler.h                |   1 +
>>>     7 files changed, 173 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index e247c3a2ec08..8632d7071292 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -4188,6 +4188,37 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
>>>     return false;
>>>     }
>>>
>>> +bool amdgpu_affinity_group_has_only_or_null_working_ring(struct
>>> +amdgpu_device *adev, struct drm_sched_job *s_job) {
>>> +       int i;
>>> +       int working_ring_num = 0;
>>> +
>>> +/*
>>> + * The job is considered as the real bad one
>>> + * if job's sched is not in affinity group
>>> + */
>>> +if (s_job->sched.affinity_group == 0)
>>> +return true;
>>> +
>>> +       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>> +               struct amdgpu_ring *ring = adev->rings[i];
>>> +
>>> +               if (!ring || !ring->sched.thread)
>>> +                       continue;
>>> +
>>> +               /* for non-empty affinity ring, increase working_ring_num */
>>> +               if (ring->sched.affinity_group == s_job->sched.affinity_group) {
>>> +                       if (!list_empty(&ring->sched.ring_mirror_list))
>>> +                               working_ring_num++;
>>> +               }
>>> +       }
>>> +
>>> +       if (working_ring_num > 1) {
>>> +               return false;
>>> +       }
>>> +       return true;
>>> +}
>>> +
>>>     /**
>>>      * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
>>>      *
>>> @@ -4310,8 +4341,10 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>     amdgpu_fence_driver_force_completion(ring);
>>>     }
>>>
>>> -if(job)
>>> -drm_sched_increase_karma(&job->base);
>>> +if (amdgpu_gpu_recovery != 2) {
>>> +if (job)
>>> +drm_sched_increase_karma(&job->base);
>>> +}
>>>
>>>     /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
>>>     if (!amdgpu_sriov_vf(adev)) {
>>> @@ -4639,7 +4672,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>     int i, r = 0;
>>>     bool need_emergency_restart = false;
>>>     bool audio_suspended = false;
>>> -
>>> +int tmp_vram_lost_counter;
>>>     /*
>>>      * Special case: RAS triggered and full reset isn't supported
>>>      */
>>> @@ -4690,8 +4723,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>     job ? job->base.id : -1);
>>>
>>>     /* even we skipped this reset, still need to set the job to guilty */
>>> -if (job)
>>> -drm_sched_increase_karma(&job->base);
>>> +if (job) {
>>> +if (amdgpu_gpu_recovery == 2) {
>>> +if (&job->base) {
>>> +spin_lock(&job->base.sched->job_list_lock);
>>> +list_add(&job->base.node, &job->base.sched->ring_mirror_list);
>>> +spin_unlock(&job->base.sched->job_list_lock);
>>> +}
>>> +} else
>>> +drm_sched_increase_karma(&job->base);
>>> +}
>>>     goto skip_recovery;
>>>     }
>>>
>>> @@ -4788,6 +4829,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>     }
>>>     }
>>>
>>> +tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
>>>     /* Actual ASIC resets if needed.*/
>>>     /* TODO Implement XGMI hive reset logic for SRIOV */
>>>     if (amdgpu_sriov_vf(adev)) {
>>> @@ -4804,18 +4846,64 @@ int amdgpu_device_gpu_recover(struct
>>> amdgpu_device *adev,
>>>
>>>     /* Post ASIC reset for all devs .*/
>>>     list_for_each_entry(tmp_adev, device_list_handle,
>>> gmc.xgmi.head) {
>>> +int step = 1;
>>>
>>> +if (amdgpu_gpu_recovery == 2) {
>>> +if (amdgpu_affinity_group_has_only_or_null_working_ring(adev,&job->base)
>>> +|| tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)) {
>>> +DRM_INFO("Skip Stage0 Resubmit Stage\n");
>>> +/* set guilty */
>>> +drm_sched_increase_karma(&job->base);
>>> +step = 1;
>>> +} else {
>>> +DRM_INFO("Do Stage0 Resubmit Stage\n");
>>> +step = 0;
>>> +}
>>> +}
>>> +
>>> +retry_resubmit:
>>>     for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>>     struct amdgpu_ring *ring = tmp_adev->rings[i];
>>> +int ret = 0;
>>> +struct drm_sched_job *s_bad_job = NULL;
>>>
>>>     if (!ring || !ring->sched.thread)
>>>     continue;
>>>
>>>     /* No point to resubmit jobs if we didn't HW reset*/
>>> -if (!tmp_adev->asic_reset_res && !job_signaled)
>>> +if (!tmp_adev->asic_reset_res && !job_signaled) {
>>> +
>>>     drm_sched_resubmit_jobs(&ring->sched);
>>>
>>> -drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
>>> +if (amdgpu_gpu_recovery == 2 && step == 0) {
>>> +ret = amdgpu_wait_resubmitted_jobs_completion(&ring->sched, ring->sched.timeout, &s_bad_job);
>>> +if (ret == -1) {
>>> +DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n", ring->sched.name, s_bad_job->id);
>>> +/* set guilty */
>>> +drm_sched_increase_karma(s_bad_job);
>>> +
>>> +/* do hw reset */
>>> +if (amdgpu_sriov_vf(adev)) {
>>> +amdgpu_virt_fini_data_exchange(adev);
>>> +r = amdgpu_device_reset_sriov(adev, false);
>>> +if (r)
>>> +adev->asic_reset_res = r;
>>> +} else {
>>> +r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
>>> +if (r && r == -EAGAIN)
>>> +goto retry;
>>> +}
>>> +
>>> +/* add reset counter so that the following resubmitted job could flush vmid */
>>> +atomic_inc(&tmp_adev->gpu_reset_counter);
>>> +step = 1;
>>> +goto retry_resubmit;
>>> +}
>>> +}
>>> +}
>>> +
>>> +if (step == 1)
>>> +drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
>>>     }
>>>
>>>     if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled)
>>> { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> index 865f924772b0..9c3f4edb7532 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> @@ -509,7 +509,7 @@ module_param_named(compute_multipipe, amdgpu_compute_multipipe, int, 0444);
>>>      * DOC: gpu_recovery (int)
>>>      * Set to enable GPU recovery mechanism (1 = enable, 0 = disable). The default is -1 (auto, disabled except SRIOV).
>>>      */
>>> -MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (1 =
>>> enable, 0 = disable, -1 = auto)");
>>> +MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (2 =
>>> +advanced tdr mode, 1 = enable, 0 = disable, -1 = auto)");
>>>     module_param_named(gpu_recovery, amdgpu_gpu_recovery, int,
>>> 0444);
>>>
>>>     /**
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> index 759b34799221..28cda321157a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> @@ -281,6 +281,53 @@ void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched)
>>>     }
>>>     }
>>>
>>> +int amdgpu_wait_resubmitted_jobs_completion(struct
>>> +drm_gpu_scheduler *sched, long timeout, struct drm_sched_job **s_bad_job) {
>>> +struct drm_sched_job *s_job, *tmp;
>>> +int ret = 0;
>>> +
>>> +list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
>>> +struct drm_sched_fence *s_fence = s_job->s_fence;
>>> +
>>> +if (s_fence->parent == NULL) { /* fail to get a hw fence */
>>> +/* process a job */
>>> +atomic_dec(&sched->num_jobs);
>>> +dma_fence_get(&s_fence->finished);
>>> +dma_fence_signal(&s_fence->finished);
>>> +dma_fence_put(&s_fence->finished);
>>> +
>>> +/* remove node from mirror_list and free the job */
>>> +spin_lock(&sched->job_list_lock);
>>> +list_del_init(&s_job->node);
>>> +spin_unlock(&sched->job_list_lock);
>>> +sched->ops->free_job(s_job);
>>> +continue;
>>> +}
>>> +
>>> +ret = dma_fence_wait_timeout(s_fence->parent, false, timeout);
>>> +
>>> +if (ret > 0) { /* succeed */
>>> +/* process a job */
>>> +atomic_dec(&sched->num_jobs);
>>> +dma_fence_get(&s_fence->finished);
>>> +dma_fence_signal(&s_fence->finished);
>>> +dma_fence_put(&s_fence->finished);
>>> +
>>> +/* remove node from mirror_list and free the job */
>>> +spin_lock(&sched->job_list_lock);
>>> +list_del_init(&s_job->node);
>>> +spin_unlock(&sched->job_list_lock);
>>> +sched->ops->free_job(s_job);
>>> +continue;
>>> +} else if (ret == 0) {
>>> +*s_bad_job = s_job;
>>> +return -1; /* timeout */
>>> +}
>>> +}
>>> +
>>> +return 0;
>>> +}
>>> +
>>>     const struct drm_sched_backend_ops amdgpu_sched_ops = {
>>>     .dependency = amdgpu_job_dependency,
>>>     .run_job = amdgpu_job_run,
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> index 81caac9b958a..25292f4699fb 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> @@ -76,5 +76,5 @@ int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring,
>>>          struct dma_fence **fence);
>>>
>>>     void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler
>>> *sched);
>>> -
>>> +int amdgpu_wait_resubmitted_jobs_completion(struct
>>> +drm_gpu_scheduler *sched, long timeout, struct drm_sched_job
>>> +**s_bad_job);
>>>     #endif
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> index b644c78475fd..cb50bfc80bc9 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> @@ -35,6 +35,11 @@
>>>     #include "amdgpu.h"
>>>     #include "atom.h"
>>>
>>> +static char *amdgpu_affinity_group[] = { "gfx", "comp"
>>> +};
>>> +
>>>     /*
>>>      * Rings
>>>      * Most engines on the GPU are fed via ring buffers.  Ring @@
>>> -189,6
>>> +194,7 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct
>>> +amdgpu_ring *ring,
>>>     ring->adev = adev;
>>>     ring->idx = adev->num_rings++;
>>>     adev->rings[ring->idx] = ring;
>>> +amdgpu_ring_set_affinity_group(ring);
>>>     r = amdgpu_fence_driver_init_ring(ring, sched_hw_submission);
>>>     if (r)
>>>     return r;
>>> @@ -459,3 +465,24 @@ int amdgpu_ring_test_helper(struct amdgpu_ring *ring)
>>>     ring->sched.ready = !r;
>>>     return r;
>>>     }
>>> +
>>> +int amdgpu_ring_set_affinity_group(struct amdgpu_ring *ring) {
>>> +       struct amdgpu_device *adev = ring->adev;
>>> +       int i;
>>> +
>>> +       for (i = 0; i < ARRAY_SIZE(amdgpu_affinity_group); i++) {
>>> +               char *temp_name = amdgpu_affinity_group[i];
>>> +
>>> +               /* set ring's affinity_group bit if find it in affinity_group list */
>>> +               if (strncmp(ring->name, temp_name, strlen(temp_name)) == 0) {
>>> +                       DRM_DEV_INFO(adev->dev, "set ring:%s in affinity_group\n",
>>> +                             ring->name);
>>> +                       ring->sched.affinity_group = 1;
>>> +                       return 0;
>>> +               }
>>> +       }
>>> +
>>> +       ring->sched.affinity_group = 0;
>>> +       return 0;
>>> +}
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> index 56acec1075ac..6b0d217e6f5a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> @@ -350,4 +350,5 @@ int amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
>>>          struct amdgpu_ring *ring);
>>>     void amdgpu_debugfs_ring_fini(struct amdgpu_ring *ring);
>>>
>>> +int amdgpu_ring_set_affinity_group(struct amdgpu_ring *ring);
>>>     #endif
>>> diff --git a/include/drm/gpu_scheduler.h
>>> b/include/drm/gpu_scheduler.h index 1c815e0a14ed..589cbaea35dc
>>> 100644
>>> --- a/include/drm/gpu_scheduler.h
>>> +++ b/include/drm/gpu_scheduler.h
>>> @@ -301,6 +301,7 @@ struct drm_gpu_scheduler {
>>>     atomic_t                        _score;
>>>     bool ready;
>>>     bool free_guilty;
>>> +int affinity_group;
>>>     };
>>>
>>>     int drm_sched_init(struct drm_gpu_scheduler *sched,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


* RE: [PATCH v2] drm/amd/amdgpu implement tdr advanced mode
  2021-03-08  7:53     ` Christian König
  2021-03-08 12:43       ` Zhang, Jack (Jian)
@ 2021-03-09  7:29       ` Liu, Monk
  2021-03-09 11:16         ` Christian König
  1 sibling, 1 reply; 13+ messages in thread
From: Liu, Monk @ 2021-03-09  7:29 UTC (permalink / raw)
  To: Koenig, Christian, Zhang, Jack (Jian),
	amd-gfx, Grodzovsky, Andrey, Deng, Emily

[AMD Official Use Only - Internal Distribution Only]

Christian

what is feasible and practical now is:
1) we implement the advanced TDR mode upstream first (so we can copy the same scheme into our LTS kernel) -- if you want, we can avoid changing the drm/scheduler code, but that approach was already rejected by you as too complicated
2) then we retire the mirror list concept and rework the drm/scheduler with a KFIFO (rough sketch below)
3) remove the guilty/karma handling from the scheduler
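
For 2) and 3), just so we are aligned on the direction, below is a very rough sketch of how I read your kfifo idea (quoted further down). This is purely an illustration: sketch_sched, active_entry and sketch_find_guilty are made-up names, nothing here is existing code or part of Jack's patch, and locking/refcounting are omitted:

/* rough sketch only -- none of this exists today */
#include <linux/kfifo.h>
#include <linux/err.h>
#include <linux/dma-fence.h>
#include <drm/gpu_scheduler.h>

/* the scheduler would keep the active scheduler fences (each carrying a
 * back-reference to its job, as you proposed) in a kfifo instead of the
 * ring_mirror_list
 */
struct active_entry {
	struct drm_sched_fence	*s_fence;
	struct drm_sched_job	*job;
};

struct sketch_sched {
	struct drm_gpu_scheduler	base;
	DECLARE_KFIFO(active, struct active_entry, 64);
};

/* after the hw reset the driver walks the fifo itself: resubmit one job
 * at a time and wait for its hw fence, so the first timeout points at
 * the real bad job; the guilty/karma handling stays inside amdgpu
 */
static struct drm_sched_job *sketch_find_guilty(struct sketch_sched *s)
{
	struct active_entry e;

	while (kfifo_peek(&s->active, &e)) {
		struct dma_fence *hw_fence = s->base.ops->run_job(e.job);

		if (IS_ERR_OR_NULL(hw_fence) ||
		    dma_fence_wait_timeout(hw_fence, false, s->base.timeout) <= 0)
			return e.job;		/* this is the real guilty job */

		kfifo_skip(&s->active);		/* this one completed fine */
	}

	return NULL;				/* nobody timed out */
}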

So I basically agree with you on the spirit of the above changes: hide those AMD-internal concepts and tricks in the vendor's driver and keep the scheduler simple and scalable.
But that definitely needs a longer design discussion, so why don't we focus on our current problems now?
As long as the new change doesn't regress, it is still a good change based on the current TDR implementation.

I would propose we only change the AMD code this time. Jack's first version patch didn't touch the scheduler part, but you stated it was too complicated and rejected it.

So the allowable revision option is what Jack did in v2, which needs to introduce a new scheduler API, drm_sched_resubmit_jobs2().
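
To make that concrete, the Step0 stage inside amdgpu_device_gpu_recover() could then look roughly like the fragment below. Again this is only an illustration: it reuses the drm_sched_resubmit_jobs2() idea from my earlier mail quoted below, sketch_step0 is a made-up name, and locking/error handling are omitted:

/* sketch: assumes the usual amdgpu/scheduler headers and the proposed
 * drm_sched_resubmit_jobs2(sched, max); not compile-tested
 */
static int sketch_step0(struct amdgpu_device *adev,
			struct drm_sched_job **s_bad_job)
{
	int i;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];
		struct drm_sched_job *s_job;

		if (!ring || !ring->sched.thread)
			continue;

		/* push only the first pending job of this ring to the hw */
		drm_sched_resubmit_jobs2(&ring->sched, 1);

		/* wait for its hw fence; a timeout means we found the bad job */
		s_job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
						 struct drm_sched_job, node);
		if (!s_job || !s_job->s_fence->parent)
			continue;

		if (dma_fence_wait_timeout(s_job->s_fence->parent, false,
					   ring->sched.timeout) <= 0) {
			*s_bad_job = s_job;
			return -ETIMEDOUT;
		}
	}

	return 0;
}

This would replace the open-coded waiting that v2 does in amdgpu_wait_resubmitted_jobs_completion(), and the normal drm_sched_resubmit_jobs()/drm_sched_start() pass would still push the remaining jobs once the guilty one is known.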

Hah: --( what do you think ?

Thanks 

------------------------------------------
Monk Liu | Cloud-GPU Core team
------------------------------------------

-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com> 
Sent: Monday, March 8, 2021 3:53 PM
To: Liu, Monk <Monk.Liu@amd.com>; Zhang, Jack (Jian) <Jack.Zhang1@amd.com>; amd-gfx@lists.freedesktop.org; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Deng, Emily <Emily.Deng@amd.com>
Subject: Re: [PATCH v2] drm/amd/amdgpu implement tdr advanced mode



Am 08.03.21 um 05:06 schrieb Liu, Monk:
> [AMD Official Use Only - Internal Distribution Only]
>
>>> well first of all please completely drop the affinity group stuff from this patch. We should concentrate on one feature at a time.
> We need it to expedite the process, we can introduce this change in 
> another patch
>
>
>>> Then the implementation is way too complicated. All you need to do is insert a dma_fence_wait after re-scheduling each job after a reset.
> No, that's not true. During drm_sched_resubmit_jobs it will put
> all jobs in the mirror list into the hw ring, but we can only allow the
> first job into the ring to catch the real guilty one (otherwise it is possible that a later job in the ring also has a bug and affects our judgement). So we need to implement a new drm_sched_resubmit_jobs2(), like this:

Something like this. But since waiting for the guilty job is AMD specific we should rather rework the stuff from the beginning.

What I have in mind is the following:
1. Add a reference from the scheduler fence back to the job which is cleared only when the scheduler fence finishes.
2. Completely drop the ring_mirror_list and replace it with a kfifo of pointers to the active scheduler fences.
3. Replace drm_sched_resubmit_jobs with a drm_sched_for_each_active() macro which allows drivers to iterate over all the active jobs and resubmit/wait/mark them as guilty etc etc..
4. Remove the guilty/karma handling from the scheduler. This is something AMD specific and shouldn't leak into common code.

Regards,
Christian.

>
> drm_sched_resubmit_jobs2()
> ~ 499 void drm_sched_resubmit_jobs2(struct drm_gpu_scheduler *sched, int max)
>    500 {
>    501     struct drm_sched_job *s_job, *tmp;
>    502     uint64_t guilty_context;
>    503     bool found_guilty = false;
>    504     struct dma_fence *fence;
> + 505     int i = 0;
>    506
>    507     list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
>    508         struct drm_sched_fence *s_fence = s_job->s_fence;
>    509
> + 510         if (i >= max)
> + 511             break;
> + 512
>    513         if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
>    514             found_guilty = true;
>    515             guilty_context = s_job->s_fence->scheduled.context;
>    516         }
>    517
>    518         if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
>    519             dma_fence_set_error(&s_fence->finished, -ECANCELED);
>    520
>    521         dma_fence_put(s_job->s_fence->parent);
>    522         fence = sched->ops->run_job(s_job);
> + 523         i++;
>    524
>    525         if (IS_ERR_OR_NULL(fence)) {
>    526             if (IS_ERR(fence))
>    527                 dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
>    528
>    529             s_job->s_fence->parent = NULL;
>    530         } else {
>    531             s_job->s_fence->parent = fence;
>    532         }
>    533
>    534
>    535     }
>    536 }
>    537 EXPORT_SYMBOL(drm_sched_resubmit_jobs2);
>    538
>
>
>
> Thanks
>
> ------------------------------------------
> Monk Liu | Cloud-GPU Core team
> ------------------------------------------
>
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Sunday, March 7, 2021 3:03 AM
> To: Zhang, Jack (Jian) <Jack.Zhang1@amd.com>; 
> amd-gfx@lists.freedesktop.org; Grodzovsky, Andrey 
> <Andrey.Grodzovsky@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Deng, Emily 
> <Emily.Deng@amd.com>
> Subject: Re: [PATCH v2] drm/amd/amdgpu implement tdr advanced mode
>
> Hi Jack,
>
> well first of all please completely drop the affinity group stuff from this patch. We should concentrate on one feature at a time.
>
> Then the implementation is way too complicated. All you need to do is insert a dma_fence_wait after re-scheduling each job after a reset.
>
> Additional to that this feature is completely AMD specific and shouldn't affect the common scheduler in any way.
>
> Regards,
> Christian.
>
> Am 06.03.21 um 18:25 schrieb Jack Zhang:
>> [Why]
>> Previous tdr design treats the first job in job_timeout as the bad job.
>> But sometimes a later bad compute job can block a good gfx job and 
>> cause an unexpected gfx job timeout because gfx and compute ring 
>> share internal GC HW mutually.
>>
>> [How]
>> This patch implements an advanced tdr mode. It involves an additional
>> synchronous pre-resubmit step(Step0 Resubmit) before normal resubmit 
>> step in order to find the real bad job.
>>
>> 1. For Bailing TDR job, re-insert it to mirror_list, don't set it to 
>> guilty and leave it to be handled by the main reset thread.
>>
>> 2. Don't set the job to guilty in pre_asic_reset, and leave it to be 
>> handled by Step0 Resubmit Stage.
>>
>> 3. At Step0 Resubmit stage, it first resubmit jobs asynchronously, 
>> then it iterate each ring mirror_list, synchronously pend for each hw 
>> fence being signaled. If the a job's hw fence get timeout, we 
>> identify it as guilty and do hw reset to recover hw. After that, we 
>> would do the normal resubmit step to resubmit left jobs.
>>
>> 4. For whole gpu reset(vram lost), skip Step0 Resubmit as each job 
>> after vram lost was considered as bad job.
>>
>> 5. Involve the concept "Affinity Group".
>> Doing two hw resets is not necessary when there's only one ring that 
>> has jobs among some hw-related rings. Thus, we involve "affinity group".
>> Hw-related rings could be added into a common affinity group, such as 
>> gfx and compute ring. When tdr happens, we iterate all rings in 
>> affinity group, skip Step0 Resubmit stage if there's only one ring's 
>> mirror_list that has valid sched jobs.
>>
>> V2:
>>       -fix a cherry-pick mistake for bailing TDR handling.
>>
>>       -do affinity_group check according to the bad job's sched rather
>>        than the default "1" so that there could be multiple affinity
>>        groups being pre-defined in future.
>>
>> Signed-off-by: Jack Zhang <Jack.Zhang1@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 102 +++++++++++++++++++--
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |   2 +-
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  47 ++++++++++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_job.h    |   2 +-
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c   |  27 ++++++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |   1 +
>>    include/drm/gpu_scheduler.h                |   1 +
>>    7 files changed, 173 insertions(+), 9 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index e247c3a2ec08..8632d7071292 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -4188,6 +4188,37 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
>>    	return false;
>>    }
>>    
>> +bool amdgpu_affinity_group_has_only_or_null_working_ring(struct
>> +amdgpu_device *adev, struct drm_sched_job *s_job) {
>> +       int i;
>> +       int working_ring_num = 0;
>> +
>> +	/*
>> +	 * The job is considered as the real bad one
>> +	 * if job's sched is not in affinity group
>> +	 */
>> +	if (s_job->sched.affinity_group == 0)
>> +			return true;
>> +
>> +       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> +               struct amdgpu_ring *ring = adev->rings[i];
>> +
>> +               if (!ring || !ring->sched.thread)
>> +                       continue;
>> +
>> +               /* for non-empty affinity ring, increase working_ring_num */
>> +               if (ring->sched.affinity_group == s_job->sched.affinity_group) {
>> +                       if (!list_empty(&ring->sched.ring_mirror_list))
>> +                               working_ring_num++;
>> +               }
>> +       }
>> +
>> +       if (working_ring_num > 1) {
>> +               return false;
>> +       }
>> +       return true;
>> +}
>> +
>>    /**
>>     * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
>>     *
>> @@ -4310,8 +4341,10 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>    		amdgpu_fence_driver_force_completion(ring);
>>    	}
>>    
>> -	if(job)
>> -		drm_sched_increase_karma(&job->base);
>> +	if (amdgpu_gpu_recovery != 2) {
>> +		if (job)
>> +			drm_sched_increase_karma(&job->base);
>> +	}
>>    
>>    	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
>>    	if (!amdgpu_sriov_vf(adev)) {
>> @@ -4639,7 +4672,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>    	int i, r = 0;
>>    	bool need_emergency_restart = false;
>>    	bool audio_suspended = false;
>> -
>> +	int	tmp_vram_lost_counter;
>>    	/*
>>    	 * Special case: RAS triggered and full reset isn't supported
>>    	 */
>> @@ -4690,8 +4723,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>    					job ? job->base.id : -1);
>>    
>>    		/* even we skipped this reset, still need to set the job to guilty */
>> -		if (job)
>> -			drm_sched_increase_karma(&job->base);
>> +		if (job) {
>> +			if (amdgpu_gpu_recovery == 2) {
>> +				if (&job->base) {
>> +					spin_lock(&job->base.sched->job_list_lock);
>> +					list_add(&job->base.node, &job->base.sched->ring_mirror_list);
>> +					spin_unlock(&job->base.sched->job_list_lock);
>> +				}
>> +			} else
>> +				drm_sched_increase_karma(&job->base);
>> +		}
>>    		goto skip_recovery;
>>    	}
>>    
>> @@ -4788,6 +4829,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>    		}
>>    	}
>>    
>> +	tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
>>    	/* Actual ASIC resets if needed.*/
>>    	/* TODO Implement XGMI hive reset logic for SRIOV */
>>    	if (amdgpu_sriov_vf(adev)) {
>> @@ -4804,18 +4846,64 @@ int amdgpu_device_gpu_recover(struct 
>> amdgpu_device *adev,
>>    
>>    	/* Post ASIC reset for all devs .*/
>>    	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) 
>> {
>> +		int step = 1;
>>    
>> +		if (amdgpu_gpu_recovery == 2) {
>> +			if (amdgpu_affinity_group_has_only_or_null_working_ring(adev,&job->base)
>> +				|| tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)) {
>> +				DRM_INFO("Skip Stage0 Resubmit Stage\n");
>> +				/* set guilty */
>> +				drm_sched_increase_karma(&job->base);
>> +				step = 1;
>> +			} else {
>> +				DRM_INFO("Do Stage0 Resubmit Stage\n");
>> +				step = 0;
>> +			}
>> +		}
>> +
>> +retry_resubmit:
>>    		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>    			struct amdgpu_ring *ring = tmp_adev->rings[i];
>> +			int ret = 0;
>> +			struct drm_sched_job *s_bad_job = NULL;
>>    
>>    			if (!ring || !ring->sched.thread)
>>    				continue;
>>    
>>    			/* No point to resubmit jobs if we didn't HW reset*/
>> -			if (!tmp_adev->asic_reset_res && !job_signaled)
>> +			if (!tmp_adev->asic_reset_res && !job_signaled) {
>> +
>>    				drm_sched_resubmit_jobs(&ring->sched);
>>    
>> -			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
>> +				if (amdgpu_gpu_recovery == 2 && step == 0) {
>> +					ret = amdgpu_wait_resubmitted_jobs_completion(&ring->sched, ring->sched.timeout, &s_bad_job);
>> +					if (ret == -1) {
>> +						DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n", ring->sched.name, s_bad_job->id);
>> +						/* set guilty */
>> +						drm_sched_increase_karma(s_bad_job);
>> +
>> +						/* do hw reset */
>> +						if (amdgpu_sriov_vf(adev)) {
>> +							amdgpu_virt_fini_data_exchange(adev);
>> +							r = amdgpu_device_reset_sriov(adev, false);
>> +							if (r)
>> +								adev->asic_reset_res = r;
>> +						} else {
>> +							r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
>> +							if (r && r == -EAGAIN)
>> +								goto retry;
>> +						}
>> +
>> +						/* add reset counter so that the following resubmitted job could flush vmid */
>> +						atomic_inc(&tmp_adev->gpu_reset_counter);
>> +						step = 1;
>> +						goto retry_resubmit;
>> +					}
>> +				}
>> +			}
>> +
>> +			if (step == 1)
>> +				drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
>>    		}
>>    
>>    		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) { 
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> index 865f924772b0..9c3f4edb7532 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> @@ -509,7 +509,7 @@ module_param_named(compute_multipipe, amdgpu_compute_multipipe, int, 0444);
>>     * DOC: gpu_recovery (int)
>>     * Set to enable GPU recovery mechanism (1 = enable, 0 = disable). The default is -1 (auto, disabled except SRIOV).
>>     */
>> -MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (1 = 
>> enable, 0 = disable, -1 = auto)");
>> +MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (2 = 
>> +advanced tdr mode, 1 = enable, 0 = disable, -1 = auto)");
>>    module_param_named(gpu_recovery, amdgpu_gpu_recovery, int, 0444);
>>    
>>    /**
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> index 759b34799221..28cda321157a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> @@ -281,6 +281,53 @@ void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched)
>>    	}
>>    }
>>    
>> +int amdgpu_wait_resubmitted_jobs_completion(struct drm_gpu_scheduler 
>> +*sched, long timeout, struct drm_sched_job **s_bad_job) {
>> +	struct drm_sched_job *s_job, *tmp;
>> +	int ret = 0;
>> +
>> +	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
>> +		struct drm_sched_fence *s_fence = s_job->s_fence;
>> +
>> +			if (s_fence->parent == NULL) { /* fail to get a hw fence */
>> +				/* process a job */
>> +				atomic_dec(&sched->num_jobs);
>> +				dma_fence_get(&s_fence->finished);
>> +				dma_fence_signal(&s_fence->finished);
>> +				dma_fence_put(&s_fence->finished);
>> +
>> +				/* remove node from mirror_list and free the job */
>> +				spin_lock(&sched->job_list_lock);
>> +				list_del_init(&s_job->node);
>> +				spin_unlock(&sched->job_list_lock);
>> +				sched->ops->free_job(s_job);
>> +				continue;
>> +			}
>> +
>> +			ret = dma_fence_wait_timeout(s_fence->parent, false, timeout);
>> +
>> +			if (ret > 0) { /* succeed */
>> +				/* process a job */
>> +				atomic_dec(&sched->num_jobs);
>> +				dma_fence_get(&s_fence->finished);
>> +				dma_fence_signal(&s_fence->finished);
>> +				dma_fence_put(&s_fence->finished);
>> +
>> +				/* remove node from mirror_list and free the job */
>> +				spin_lock(&sched->job_list_lock);
>> +				list_del_init(&s_job->node);
>> +				spin_unlock(&sched->job_list_lock);
>> +				sched->ops->free_job(s_job);
>> +				continue;
>> +			} else if (ret == 0) {
>> +				*s_bad_job = s_job;
>> +				return -1; /* timeout */
>> +			}
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>>    const struct drm_sched_backend_ops amdgpu_sched_ops = {
>>    	.dependency = amdgpu_job_dependency,
>>    	.run_job = amdgpu_job_run,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> index 81caac9b958a..25292f4699fb 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> @@ -76,5 +76,5 @@ int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring,
>>    			     struct dma_fence **fence);
>>    
>>    void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler 
>> *sched);
>> -
>> +int amdgpu_wait_resubmitted_jobs_completion(struct drm_gpu_scheduler 
>> +*sched, long timeout, struct drm_sched_job **s_bad_job);
>>    #endif
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> index b644c78475fd..cb50bfc80bc9 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> @@ -35,6 +35,11 @@
>>    #include "amdgpu.h"
>>    #include "atom.h"
>>    
>> +static char *amdgpu_affinity_group[] = { "gfx", "comp"
>> +};
>> +
>>    /*
>>     * Rings
>>     * Most engines on the GPU are fed via ring buffers.  Ring @@ 
>> -189,6
>> +194,7 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct 
>> +amdgpu_ring *ring,
>>    		ring->adev = adev;
>>    		ring->idx = adev->num_rings++;
>>    		adev->rings[ring->idx] = ring;
>> +		amdgpu_ring_set_affinity_group(ring);
>>    		r = amdgpu_fence_driver_init_ring(ring, sched_hw_submission);
>>    		if (r)
>>    			return r;
>> @@ -459,3 +465,24 @@ int amdgpu_ring_test_helper(struct amdgpu_ring *ring)
>>    	ring->sched.ready = !r;
>>    	return r;
>>    }
>> +
>> +int amdgpu_ring_set_affinity_group(struct amdgpu_ring *ring) {
>> +       struct amdgpu_device *adev = ring->adev;
>> +       int i;
>> +
>> +       for (i = 0; i < ARRAY_SIZE(amdgpu_affinity_group); i++) {
>> +               char *temp_name = amdgpu_affinity_group[i];
>> +
>> +               /* set ring's affinity_group bit if find it in affinity_group list */
>> +               if (strncmp(ring->name, temp_name, strlen(temp_name)) == 0) {
>> +                       DRM_DEV_INFO(adev->dev, "set ring:%s in affinity_group\n",
>> +                             ring->name);
>> +                       ring->sched.affinity_group = 1;
>> +                       return 0;
>> +               }
>> +       }
>> +
>> +       ring->sched.affinity_group = 0;
>> +       return 0;
>> +}
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> index 56acec1075ac..6b0d217e6f5a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> @@ -350,4 +350,5 @@ int amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
>>    			     struct amdgpu_ring *ring);
>>    void amdgpu_debugfs_ring_fini(struct amdgpu_ring *ring);
>>    
>> +int amdgpu_ring_set_affinity_group(struct amdgpu_ring *ring);
>>    #endif
>> diff --git a/include/drm/gpu_scheduler.h 
>> b/include/drm/gpu_scheduler.h index 1c815e0a14ed..589cbaea35dc 100644
>> --- a/include/drm/gpu_scheduler.h
>> +++ b/include/drm/gpu_scheduler.h
>> @@ -301,6 +301,7 @@ struct drm_gpu_scheduler {
>>    	atomic_t                        _score;
>>    	bool				ready;
>>    	bool				free_guilty;
>> +	int				affinity_group;
>>    };
>>    
>>    int drm_sched_init(struct drm_gpu_scheduler *sched,
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


* Re: [PATCH v2] drm/amd/amdgpu implement tdr advanced mode
  2021-03-09  7:29       ` Liu, Monk
@ 2021-03-09 11:16         ` Christian König
  2021-03-09 16:48           ` Andrey Grodzovsky
  0 siblings, 1 reply; 13+ messages in thread
From: Christian König @ 2021-03-09 11:16 UTC (permalink / raw)
  To: Liu, Monk, Zhang, Jack (Jian), amd-gfx, Grodzovsky, Andrey, Deng, Emily

Yeah, those are some really good points. I completely agree that we 
shouldn't do any larger cleanup right now.

But I think we still need some more review on this. I most likely won't 
have enough time to look into this before the weekend.

Andrey can you take a look as well?

Thanks,
Christian.

Am 09.03.21 um 08:29 schrieb Liu, Monk:
> [AMD Official Use Only - Internal Distribution Only]
>
> Christian
>
> what is feasible and practical now is:
> 1) we implement the advanced TDR mode upstream first (so we can copy the same scheme into our LTS kernel) -- if you want, we can avoid changing the drm/scheduler code, but that approach was already rejected by you as too complicated
> 2) then we retire the mirror list concept and rework the drm/scheduler with a KFIFO
> 3) remove the guilty/karma handling from the scheduler
>
> So I basically agree with you on the spirit of the above changes: hide those AMD-internal concepts and tricks in the vendor's driver and keep the scheduler simple and scalable.
> But that definitely needs a longer design discussion, so why don't we focus on our current problems now?
> As long as the new change doesn't regress, it is still a good change based on the current TDR implementation.
>
> I would propose we only change the AMD code this time. Jack's first version patch didn't touch the scheduler part, but you stated it was too complicated and rejected it.
>
> So the allowable revision option is what Jack did in v2, which needs to introduce a new scheduler API, drm_sched_resubmit_jobs2().
>
> Hah: --( what do you think ?
>
> Thanks
>
> ------------------------------------------
> Monk Liu | Cloud-GPU Core team
> ------------------------------------------
>
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Monday, March 8, 2021 3:53 PM
> To: Liu, Monk <Monk.Liu@amd.com>; Zhang, Jack (Jian) <Jack.Zhang1@amd.com>; amd-gfx@lists.freedesktop.org; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Deng, Emily <Emily.Deng@amd.com>
> Subject: Re: [PATCH v2] drm/amd/amdgpu implement tdr advanced mode
>
>
>
> Am 08.03.21 um 05:06 schrieb Liu, Monk:
>> [AMD Official Use Only - Internal Distribution Only]
>>
>>>> well first of all please completely drop the affinity group stuff from this patch. We should concentrate on one feature at a time.
>> We need it to expedite the process, we can introduce this change in
>> another patch
>>
>>
>>>> Then the implementation is way too complicated. All you need to do is insert a dma_fence_wait after re-scheduling each job after a reset.
>> No, that's not true. During drm_sched_resubmit_jobs it will put
>> all jobs in the mirror list into the hw ring, but we can only allow the
>> first job into the ring to catch the real guilty one (otherwise it is possible that a later job in the ring also has a bug and affects our judgement). So we need to implement a new drm_sched_resubmit_jobs2(), like this:
> Something like this. But since waiting for the guilty job is AMD specific we should rather rework the stuff from the beginning.
>
> What I have in mind is the following:
> 1. Add a reference from the scheduler fence back to the job which is cleared only when the scheduler fence finishes.
> 2. Completely drop the ring_mirror_list and replace it with a kfifo of pointers to the active scheduler fences.
> 3. Replace drm_sched_resubmit_jobs with a drm_sched_for_each_active() macro which allows drivers to iterate over all the active jobs and resubmit/wait/mark them as guilty etc etc..
> 4. Remove the guilty/karma handling from the scheduler. This is something AMD specific and shouldn't leak into common code.
>
> Regards,
> Christian.
>
>> drm_sched_resubmit_jobs2()
>> ~ 499 void drm_sched_resubmit_jobs2(struct drm_gpu_scheduler *sched, int max)
>>     500 {
>>     501     struct drm_sched_job *s_job, *tmp;
>>     502     uint64_t guilty_context;
>>     503     bool found_guilty = false;
>>     504     struct dma_fence *fence;
>> + 505     int i = 0;
>>     506
>>     507     list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
>>     508         struct drm_sched_fence *s_fence = s_job->s_fence;
>>     509
>> + 510         if (i >= max)
>> + 511             break;
>> + 512
>>     513         if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
>>     514             found_guilty = true;
>>     515             guilty_context = s_job->s_fence->scheduled.context;
>>     516         }
>>     517
>>     518         if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
>>     519             dma_fence_set_error(&s_fence->finished, -ECANCELED);
>>     520
>>     521         dma_fence_put(s_job->s_fence->parent);
>>     522         fence = sched->ops->run_job(s_job);
>> + 523         i++;
>>     524
>>     525         if (IS_ERR_OR_NULL(fence)) {
>>     526             if (IS_ERR(fence))
>>     527                 dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
>>     528
>>     529             s_job->s_fence->parent = NULL;
>>     530         } else {
>>     531             s_job->s_fence->parent = fence;
>>     532         }
>>     533
>>     534
>>     535     }
>>     536 }
>>     537 EXPORT_SYMBOL(drm_sched_resubmit_jobs);
>>     538
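
As a rough illustration only, assuming the drm_sched_resubmit_jobs2() proposed above were available as-is, the amdgpu step-0 path could then push a single job and wait on its hw fence roughly like this; the call below is a sketch against the proposal, not existing scheduler API, and it assumes the scheduler thread is already parked during recovery:

	struct drm_sched_job *s_job;

	/* push only the first job from the mirror list to the hw ring */
	drm_sched_resubmit_jobs2(&ring->sched, 1);

	s_job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
					 struct drm_sched_job, node);
	if (s_job && s_job->s_fence->parent &&
	    dma_fence_wait_timeout(s_job->s_fence->parent, false,
				   ring->sched.timeout) == 0) {
		/* this job's hw fence never signaled: treat it as the guilty one */
		drm_sched_increase_karma(s_job);
	}
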
>>
>>
>>
>> Thanks
>>
>> ------------------------------------------
>> Monk Liu | Cloud-GPU Core team
>> ------------------------------------------
>>
>> -----Original Message-----
>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> Sent: Sunday, March 7, 2021 3:03 AM
>> To: Zhang, Jack (Jian) <Jack.Zhang1@amd.com>;
>> amd-gfx@lists.freedesktop.org; Grodzovsky, Andrey
>> <Andrey.Grodzovsky@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Deng, Emily
>> <Emily.Deng@amd.com>
>> Subject: Re: [PATCH v2] drm/amd/amdgpu implement tdr advanced mode
>>
>> Hi Jack,
>>
>> well first of all please completely drop the affinity group stuff from this patch. We should concentrate on one feature at a time.
>>
>> Then the implementation is way too complicated. All you need to do is insert a dma_fence_wait after re-scheduling each job after a reset.
>>
>> Additionally, this feature is completely AMD specific and shouldn't affect the common scheduler in any way.
>>
>> Regards,
>> Christian.
>>
>> Am 06.03.21 um 18:25 schrieb Jack Zhang:
>>> [Why]
>>> Previous tdr design treats the first job in job_timeout as the bad job.
>>> But sometimes a later bad compute job can block a good gfx job and
>>> cause an unexpected gfx job timeout because gfx and compute ring
>>> share internal GC HW mutually.
>>>
>>> [How]
>>> This patch implements an advanced tdr mode.It involves an additinal
>>> synchronous pre-resubmit step(Step0 Resubmit) before normal resubmit
>>> step in order to find the real bad job.
>>>
>>> 1. For Bailing TDR job, re-insert it to mirror_list, don't set it to
>>> guilty and leave it to be handled by the main reset thread.
>>>
>>> 2. Don't set the job to guilty in pre_asic_reset, and leave it to be
>>> handled by Step0 Resubmit Stage.
>>>
>>> 3. At Step0 Resubmit stage, it first resubmit jobs asynchronously,
>>> then it iterate each ring mirror_list, synchronously pend for each hw
>>> fence being signaled. If the a job's hw fence get timeout, we
>>> identify it as guilty and do hw reset to recover hw. After that, we
>>> would do the normal resubmit step to resubmit left jobs.
>>>
>>> 4. For whole gpu reset(vram lost), skip Step0 Resubmit as each job
>>> after vram lost was considered as bad job.
>>>
>>> 5. Involve the concept "Affinity Group".
>>> Doing two hw resets is not necessary when there's only one ring that
>>> has jobs among some hw-related rings.Thus, we involve "affinity group".
>>> Hw-related rings could be added into a common affinity group, such as
>>> gfx and compute ring. When tdr happens, we iterate all rings in
>>> affinity group, skip Step0 Resubmit stage if there's only one ring's
>>> mirror_list that has valid sched jobs.
>>>
>>> V2:
>>>        -fix a cherry-pick mistake for bailing TDR handling.
>>>
>>>        -do affinity_group check according to the bad job's sched rather
>>>         than the default "1" so that there could be multiple affinity
>>>         groups being pre-defined in future.
>>>
>>> Signed-off-by: Jack Zhang <Jack.Zhang1@amd.com>
>>> ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 102 +++++++++++++++++++--
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |   2 +-
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  47 ++++++++++
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_job.h    |   2 +-
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c   |  27 ++++++
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |   1 +
>>>     include/drm/gpu_scheduler.h                |   1 +
>>>     7 files changed, 173 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index e247c3a2ec08..8632d7071292 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -4188,6 +4188,37 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
>>>     	return false;
>>>     }
>>>     
>>> +bool amdgpu_affinity_group_has_only_or_null_working_ring(struct
>>> +amdgpu_device *adev, struct drm_sched_job *s_job) {
>>> +       int i;
>>> +       int working_ring_num = 0;
>>> +
>>> +	/*
>>> +	 * The job is considered as the real bad one
>>> +	 * if job's sched is not in affinity group
>>> +	 */
>>> +	if (s_job->sched.affinity_group == 0)
>>> +			return true;
>>> +
>>> +       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>> +               struct amdgpu_ring *ring = adev->rings[i];
>>> +
>>> +               if (!ring || !ring->sched.thread)
>>> +                       continue;
>>> +
>>> +               /* for non-empty affinity ring, increase working_ring_num */
>>> +               if (ring->sched.affinity_group == s_job->sched.affinity_group) {
>>> +                       if (!list_empty(&ring->sched.ring_mirror_list))
>>> +                               working_ring_num++;
>>> +               }
>>> +       }
>>> +
>>> +       if (working_ring_num > 1) {
>>> +               return false;
>>> +       }
>>> +       return true;
>>> +}
>>> +
>>>     /**
>>>      * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
>>>      *
>>> @@ -4310,8 +4341,10 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>     		amdgpu_fence_driver_force_completion(ring);
>>>     	}
>>>     
>>> -	if(job)
>>> -		drm_sched_increase_karma(&job->base);
>>> +	if (amdgpu_gpu_recovery != 2) {
>>> +		if (job)
>>> +			drm_sched_increase_karma(&job->base);
>>> +	}
>>>     
>>>     	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
>>>     	if (!amdgpu_sriov_vf(adev)) {
>>> @@ -4639,7 +4672,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>     	int i, r = 0;
>>>     	bool need_emergency_restart = false;
>>>     	bool audio_suspended = false;
>>> -
>>> +	int	tmp_vram_lost_counter;
>>>     	/*
>>>     	 * Special case: RAS triggered and full reset isn't supported
>>>     	 */
>>> @@ -4690,8 +4723,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>     					job ? job->base.id : -1);
>>>     
>>>     		/* even we skipped this reset, still need to set the job to guilty */
>>> -		if (job)
>>> -			drm_sched_increase_karma(&job->base);
>>> +		if (job) {
>>> +			if (amdgpu_gpu_recovery == 2) {
>>> +				if (&job->base) {
>>> +					spin_lock(&job->base.sched->job_list_lock);
>>> +					list_add(&job->base.node, &job->base.sched->ring_mirror_list);
>>> +					spin_unlock(&job->base.sched->job_list_lock);
>>> +				}
>>> +			} else
>>> +				drm_sched_increase_karma(&job->base);
>>> +		}
>>>     		goto skip_recovery;
>>>     	}
>>>     
>>> @@ -4788,6 +4829,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>     		}
>>>     	}
>>>     
>>> +	tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
>>>     	/* Actual ASIC resets if needed.*/
>>>     	/* TODO Implement XGMI hive reset logic for SRIOV */
>>>     	if (amdgpu_sriov_vf(adev)) {
>>> @@ -4804,18 +4846,64 @@ int amdgpu_device_gpu_recover(struct
>>> amdgpu_device *adev,
>>>     
>>>     	/* Post ASIC reset for all devs .*/
>>>     	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head)
>>> {
>>> +		int step = 1;
>>>     
>>> +		if (amdgpu_gpu_recovery == 2) {
>>> +			if (amdgpu_affinity_group_has_only_or_null_working_ring(adev,&job->base)
>>> +				|| tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)) {
>>> +				DRM_INFO("Skip Stage0 Resubmit Stage\n");
>>> +				/* set guilty */
>>> +				drm_sched_increase_karma(&job->base);
>>> +				step = 1;
>>> +			} else {
>>> +				DRM_INFO("Do Stage0 Resubmit Stage\n");
>>> +				step = 0;
>>> +			}
>>> +		}
>>> +
>>> +retry_resubmit:
>>>     		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>>     			struct amdgpu_ring *ring = tmp_adev->rings[i];
>>> +			int ret = 0;
>>> +			struct drm_sched_job *s_bad_job = NULL;
>>>     
>>>     			if (!ring || !ring->sched.thread)
>>>     				continue;
>>>     
>>>     			/* No point to resubmit jobs if we didn't HW reset*/
>>> -			if (!tmp_adev->asic_reset_res && !job_signaled)
>>> +			if (!tmp_adev->asic_reset_res && !job_signaled) {
>>> +
>>>     				drm_sched_resubmit_jobs(&ring->sched);
>>>     
>>> -			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
>>> +				if (amdgpu_gpu_recovery == 2 && step == 0) {
>>> +					ret = amdgpu_wait_resubmitted_jobs_completion(&ring->sched, ring->sched.timeout, &s_bad_job);
>>> +					if (ret == -1) {
>>> +						DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n", ring->sched.name, s_bad_job->id);
>>> +						/* set guilty */
>>> +						drm_sched_increase_karma(s_bad_job);
>>> +
>>> +						/* do hw reset */
>>> +						if (amdgpu_sriov_vf(adev)) {
>>> +							amdgpu_virt_fini_data_exchange(adev);
>>> +							r = amdgpu_device_reset_sriov(adev, false);
>>> +							if (r)
>>> +								adev->asic_reset_res = r;
>>> +						} else {
>>> +							r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
>>> +							if (r && r == -EAGAIN)
>>> +								goto retry;
>>> +						}
>>> +
>>> +						/* add reset counter so that the following resubmitted job could flush vmid */
>>> +						atomic_inc(&tmp_adev->gpu_reset_counter);
>>> +						step = 1;
>>> +						goto retry_resubmit;
>>> +					}
>>> +				}
>>> +			}
>>> +
>>> +			if (step == 1)
>>> +				drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
>>>     		}
>>>     
>>>     		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> index 865f924772b0..9c3f4edb7532 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> @@ -509,7 +509,7 @@ module_param_named(compute_multipipe, amdgpu_compute_multipipe, int, 0444);
>>>      * DOC: gpu_recovery (int)
>>>      * Set to enable GPU recovery mechanism (1 = enable, 0 = disable). The default is -1 (auto, disabled except SRIOV).
>>>      */
>>> -MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (1 =
>>> enable, 0 = disable, -1 = auto)");
>>> +MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (2 =
>>> +advanced tdr mode, 1 = enable, 0 = disable, -1 = auto)");
>>>     module_param_named(gpu_recovery, amdgpu_gpu_recovery, int, 0444);
>>>     
>>>     /**
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> index 759b34799221..28cda321157a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> @@ -281,6 +281,53 @@ void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched)
>>>     	}
>>>     }
>>>     
>>> +int amdgpu_wait_resubmitted_jobs_completion(struct drm_gpu_scheduler
>>> +*sched, long timeout, struct drm_sched_job **s_bad_job) {
>>> +	struct drm_sched_job *s_job, *tmp;
>>> +	int ret = 0;
>>> +
>>> +	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
>>> +		struct drm_sched_fence *s_fence = s_job->s_fence;
>>> +
>>> +			if (s_fence->parent == NULL) { /* fail to get a hw fence */
>>> +				/* process a job */
>>> +				atomic_dec(&sched->num_jobs);
>>> +				dma_fence_get(&s_fence->finished);
>>> +				dma_fence_signal(&s_fence->finished);
>>> +				dma_fence_put(&s_fence->finished);
>>> +
>>> +				/* remove node from mirror_list and free the job */
>>> +				spin_lock(&sched->job_list_lock);
>>> +				list_del_init(&s_job->node);
>>> +				spin_unlock(&sched->job_list_lock);
>>> +				sched->ops->free_job(s_job);
>>> +				continue;
>>> +			}
>>> +
>>> +			ret = dma_fence_wait_timeout(s_fence->parent, false, timeout);
>>> +
>>> +			if (ret > 0) { /* succeed */
>>> +				/* process a job */
>>> +				atomic_dec(&sched->num_jobs);
>>> +				dma_fence_get(&s_fence->finished);
>>> +				dma_fence_signal(&s_fence->finished);
>>> +				dma_fence_put(&s_fence->finished);
>>> +
>>> +				/* remove node from mirror_list and free the job */
>>> +				spin_lock(&sched->job_list_lock);
>>> +				list_del_init(&s_job->node);
>>> +				spin_unlock(&sched->job_list_lock);
>>> +				sched->ops->free_job(s_job);
>>> +				continue;
>>> +			} else if (ret == 0) {
>>> +				*s_bad_job = s_job;
>>> +				return -1; /* timeout */
>>> +			}
>>> +	}
>>> +
>>> +	return 0;
>>> +}
>>> +
>>>     const struct drm_sched_backend_ops amdgpu_sched_ops = {
>>>     	.dependency = amdgpu_job_dependency,
>>>     	.run_job = amdgpu_job_run,
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> index 81caac9b958a..25292f4699fb 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> @@ -76,5 +76,5 @@ int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring,
>>>     			     struct dma_fence **fence);
>>>     
>>>     void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler
>>> *sched);
>>> -
>>> +int amdgpu_wait_resubmitted_jobs_completion(struct drm_gpu_scheduler
>>> +*sched, long timeout, struct drm_sched_job **s_bad_job);
>>>     #endif
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> index b644c78475fd..cb50bfc80bc9 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> @@ -35,6 +35,11 @@
>>>     #include "amdgpu.h"
>>>     #include "atom.h"
>>>     
>>> +static char *amdgpu_affinity_group[] = { "gfx", "comp"
>>> +};
>>> +
>>>     /*
>>>      * Rings
>>>      * Most engines on the GPU are fed via ring buffers.  Ring @@
>>> -189,6
>>> +194,7 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct
>>> +amdgpu_ring *ring,
>>>     		ring->adev = adev;
>>>     		ring->idx = adev->num_rings++;
>>>     		adev->rings[ring->idx] = ring;
>>> +		amdgpu_ring_set_affinity_group(ring);
>>>     		r = amdgpu_fence_driver_init_ring(ring, sched_hw_submission);
>>>     		if (r)
>>>     			return r;
>>> @@ -459,3 +465,24 @@ int amdgpu_ring_test_helper(struct amdgpu_ring *ring)
>>>     	ring->sched.ready = !r;
>>>     	return r;
>>>     }
>>> +
>>> +int amdgpu_ring_set_affinity_group(struct amdgpu_ring *ring) {
>>> +       struct amdgpu_device *adev = ring->adev;
>>> +       int i;
>>> +
>>> +       for (i = 0; i < ARRAY_SIZE(amdgpu_affinity_group); i++) {
>>> +               char *temp_name = amdgpu_affinity_group[i];
>>> +
>>> +               /* set ring's affinity_group bit if find it in affinity_group list */
>>> +               if (strncmp(ring->name, temp_name, strlen(temp_name)) == 0) {
>>> +                       DRM_DEV_INFO(adev->dev, "set ring:%s in affinity_group\n",
>>> +                             ring->name);
>>> +                       ring->sched.affinity_group = 1;
>>> +                       return 0;
>>> +               }
>>> +       }
>>> +
>>> +       ring->sched.affinity_group = 0;
>>> +       return 0;
>>> +}
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> index 56acec1075ac..6b0d217e6f5a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> @@ -350,4 +350,5 @@ int amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
>>>     			     struct amdgpu_ring *ring);
>>>     void amdgpu_debugfs_ring_fini(struct amdgpu_ring *ring);
>>>     
>>> +int amdgpu_ring_set_affinity_group(struct amdgpu_ring *ring);
>>>     #endif
>>> diff --git a/include/drm/gpu_scheduler.h
>>> b/include/drm/gpu_scheduler.h index 1c815e0a14ed..589cbaea35dc 100644
>>> --- a/include/drm/gpu_scheduler.h
>>> +++ b/include/drm/gpu_scheduler.h
>>> @@ -301,6 +301,7 @@ struct drm_gpu_scheduler {
>>>     	atomic_t                        _score;
>>>     	bool				ready;
>>>     	bool				free_guilty;
>>> +	int				affinity_group;
>>>     };
>>>     
>>>     int drm_sched_init(struct drm_gpu_scheduler *sched,


* Re: [PATCH v2] drm/amd/amdgpu implement tdr advanced mode
  2021-03-09 11:16         ` Christian König
@ 2021-03-09 16:48           ` Andrey Grodzovsky
  2021-03-09 17:41             ` Christian König
  0 siblings, 1 reply; 13+ messages in thread
From: Andrey Grodzovsky @ 2021-03-09 16:48 UTC (permalink / raw)
  To: Christian König, Liu, Monk, Zhang, Jack (Jian),
	amd-gfx, Deng, Emily

If we are talking about '[PATCH v3] drm/amd/amdgpu implement tdr advanced
mode', which was sent yesterday, then I already went over it and only had
2 cosmetic comments.

Andrey

On 2021-03-09 6:16 a.m., Christian König wrote:
> Yeah, those are some really good points. I completely agree that we 
> shouldn't do any larger cleanup right now.
> 
> But I think we still need some more review on this. I most likely won't 
> have enough time to look into this before the weekend.
> 
> Andrey can you take a look as well?
> 
> Thanks,
> Christian.
> 
> Am 09.03.21 um 08:29 schrieb Liu, Monk:
>> [AMD Official Use Only - Internal Distribution Only]
>>
>> Christian
>>
>> what is feasible and practical now is:
>> 1) we implement the advanced TDR mode in upstream first (so we can 
>> copy the same scheme in our LTS kernel) -- if you want we can avoid 
>> changing the drm/scheduler part of the code, but that approach was 
>> already rejected by you as too complicated
>> 2) then we retire the mirror list concept and rework the drm/scheduler 
>> with a KFIFO
>> 3) remove the guilty/karma handling from the scheduler
>>
>> So I basically agree with you on the spirit of the above changes: hide 
>> those AMD-internal concepts or tricks in the vendor's driver part and 
>> keep the scheduler simple and scalable.
>> But that definitely needs a longer design and discussion, so why don't 
>> we focus on our current problems now;
>> as long as the new change doesn't regress, it is still a good change 
>> based on the current TDR implementation.
>>
>> I would propose we only change the AMD part of the code this time. Jack's 
>> first version patch didn't touch the scheduler part, but you stated it was 
>> too complicated and rejected it.
>>
>> So the allowable revision option is what Jack did in v2, which needs 
>> to introduce a new scheduler API, drm_sched_resubmit_jobs2().
>>
>> Hah :-( what do you think?
>>
>> Thanks
>>
>> ------------------------------------------
>> Monk Liu | Cloud-GPU Core team
>> ------------------------------------------
>>
>> -----Original Message-----
>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> Sent: Monday, March 8, 2021 3:53 PM
>> To: Liu, Monk <Monk.Liu@amd.com>; Zhang, Jack (Jian) 
>> <Jack.Zhang1@amd.com>; amd-gfx@lists.freedesktop.org; Grodzovsky, 
>> Andrey <Andrey.Grodzovsky@amd.com>; Deng, Emily <Emily.Deng@amd.com>
>> Subject: Re: [PATCH v2] drm/amd/amdgpu implement tdr advanced mode
>>
>>
>>
>> Am 08.03.21 um 05:06 schrieb Liu, Monk:
>>> [AMD Official Use Only - Internal Distribution Only]
>>>
>>>>> well first of all please completely drop the affinity group stuff 
>>>>> from this patch. We should concentrate on one feature at a time.
>>> We need it to expedite the process; we can introduce this change in
>>> another patch.
>>>
>>>
>>>>> Then the implementation is way too complicated. All you need to do is 
>>>>> insert a dma_fence_wait after re-scheduling each job after a reset.
>>> No, that's not true. During drm_sched_resubmit_jobs() it will put
>>> all jobs in the mirror list into the hw ring, but we can only allow the
>>> first job into the ring to catch the real guilty one (otherwise it is 
>>> possible that a later job in the ring also has a bug and affects our 
>>> judgement). So we need to implement a new 
>>> drm_sched_resubmit_jobs2(), like this:
>> Something like this. But since waiting for the guilty job is AMD 
>> specific we should rather rework the stuff from the beginning.
>>
>> What I have in mind is the following:
>> 1. Add a reference from the scheduler fence back to the job which is 
>> cleared only when the scheduler fence finishes.
>> 2. Completely drop the ring_mirror_list and replace it with a kfifo of 
>> pointers to the active scheduler fences.
>> 3. Replace drm_sched_resubmit_jobs with a drm_sched_for_each_active() 
>> macro which allows drivers to iterate over all the active jobs and 
>> resubmit/wait/mark them as guilty etc etc..
>> 4. Remove the guilty/karma handling from the scheduler. This is 
>> something AMD specific and shouldn't leak into common code.
>>
>> Regards,
>> Christian.
>>
>>> drm_sched_resubmit_jobs2()
>>> ~ 499 void drm_sched_resubmit_jobs2(struct drm_gpu_scheduler *sched, 
>>> int max)
>>>     500 {
>>>     501     struct drm_sched_job *s_job, *tmp;
>>>     502     uint64_t guilty_context;
>>>     503     bool found_guilty = false;
>>>     504     struct dma_fence *fence;
>>> + 505     int i = 0;
>>>     506
>>>     507     list_for_each_entry_safe(s_job, tmp, 
>>> &sched->ring_mirror_list, node) {
>>>     508         struct drm_sched_fence *s_fence = s_job->s_fence;
>>>     509
>>> + 510         if (i >= max)
>>> + 511             break;
>>> + 512
>>>     513         if (!found_guilty && atomic_read(&s_job->karma) > 
>>> sched->hang_limit) {
>>>     514             found_guilty = true;
>>>     515             guilty_context = s_job->s_fence->scheduled.context;
>>>     516         }
>>>     517
>>>     518         if (found_guilty && s_job->s_fence->scheduled.context 
>>> == guilty_context)
>>>     519             dma_fence_set_error(&s_fence->finished, -ECANCELED);
>>>     520
>>>     521         dma_fence_put(s_job->s_fence->parent);
>>>     522         fence = sched->ops->run_job(s_job);
>>> + 523         i++;
>>>     524
>>>     525         if (IS_ERR_OR_NULL(fence)) {
>>>     526             if (IS_ERR(fence))
>>>     527                 dma_fence_set_error(&s_fence->finished, 
>>> PTR_ERR(fence));
>>>     528
>>>     529             s_job->s_fence->parent = NULL;
>>>     530         } else {
>>>     531             s_job->s_fence->parent = fence;
>>>     532         }
>>>     533
>>>     534
>>>     535     }
>>>     536 }
>>>     537 EXPORT_SYMBOL(drm_sched_resubmit_jobs);
>>>     538
>>>
>>>
>>>
>>> Thanks
>>>
>>> ------------------------------------------
>>> Monk Liu | Cloud-GPU Core team
>>> ------------------------------------------
>>>
>>> -----Original Message-----
>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>> Sent: Sunday, March 7, 2021 3:03 AM
>>> To: Zhang, Jack (Jian) <Jack.Zhang1@amd.com>;
>>> amd-gfx@lists.freedesktop.org; Grodzovsky, Andrey
>>> <Andrey.Grodzovsky@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Deng, Emily
>>> <Emily.Deng@amd.com>
>>> Subject: Re: [PATCH v2] drm/amd/amdgpu implement tdr advanced mode
>>>
>>> Hi Jack,
>>>
>>> well first of all please completely drop the affinity group stuff 
>>> from this patch. We should concentrate on one feature at a time.
>>>
>>> Then the implementation is way too complicated. All you need to do is 
>>> insert a dma_fence_wait after re-scheduling each job after a reset.
>>>
>>> Additionally, this feature is completely AMD specific and 
>>> shouldn't affect the common scheduler in any way.
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 06.03.21 um 18:25 schrieb Jack Zhang:
>>>> [Why]
>>>> Previous tdr design treats the first job in job_timeout as the bad job.
>>>> But sometimes a later bad compute job can block a good gfx job and
>>>> cause an unexpected gfx job timeout because gfx and compute ring
>>>> share internal GC HW mutually.
>>>>
>>>> [How]
>>>> This patch implements an advanced tdr mode.It involves an additinal
>>>> synchronous pre-resubmit step(Step0 Resubmit) before normal resubmit
>>>> step in order to find the real bad job.
>>>>
>>>> 1. For Bailing TDR job, re-insert it to mirror_list, don't set it to
>>>> guilty and leave it to be handled by the main reset thread.
>>>>
>>>> 2. Don't set the job to guilty in pre_asic_reset, and leave it to be
>>>> handled by Step0 Resubmit Stage.
>>>>
>>>> 3. At Step0 Resubmit stage, it first resubmit jobs asynchronously,
>>>> then it iterate each ring mirror_list, synchronously pend for each hw
>>>> fence being signaled. If the a job's hw fence get timeout, we
>>>> identify it as guilty and do hw reset to recover hw. After that, we
>>>> would do the normal resubmit step to resubmit left jobs.
>>>>
>>>> 4. For whole gpu reset(vram lost), skip Step0 Resubmit as each job
>>>> after vram lost was considered as bad job.
>>>>
>>>> 5. Involve the concept "Affinity Group".
>>>> Doing two hw resets is not necessary when there's only one ring that
>>>> has jobs among some hw-related rings.Thus, we involve "affinity group".
>>>> Hw-related rings could be added into a common affinity group, such as
>>>> gfx and compute ring. When tdr happens, we iterate all rings in
>>>> affinity group, skip Step0 Resubmit stage if there's only one ring's
>>>> mirror_list that has valid sched jobs.
>>>>
>>>> V2:
>>>>        -fix a cherry-pick mistake for bailing TDR handling.
>>>>
>>>>        -do affinity_group check according to the bad job's sched rather
>>>>         than the default "1" so that there could be multiple affinity
>>>>         groups being pre-defined in future.
>>>>
>>>> Signed-off-by: Jack Zhang <Jack.Zhang1@amd.com>
>>>> ---
>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 102 
>>>> +++++++++++++++++++--
>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |   2 +-
>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  47 ++++++++++
>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_job.h    |   2 +-
>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c   |  27 ++++++
>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |   1 +
>>>>     include/drm/gpu_scheduler.h                |   1 +
>>>>     7 files changed, 173 insertions(+), 9 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> index e247c3a2ec08..8632d7071292 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> @@ -4188,6 +4188,37 @@ bool amdgpu_device_has_job_running(struct 
>>>> amdgpu_device *adev)
>>>>         return false;
>>>>     }
>>>> +bool amdgpu_affinity_group_has_only_or_null_working_ring(struct
>>>> +amdgpu_device *adev, struct drm_sched_job *s_job) {
>>>> +       int i;
>>>> +       int working_ring_num = 0;
>>>> +
>>>> +    /*
>>>> +     * The job is considered as the real bad one
>>>> +     * if job's sched is not in affinity group
>>>> +     */
>>>> +    if (s_job->sched.affinity_group == 0)
>>>> +            return true;
>>>> +
>>>> +       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>>> +               struct amdgpu_ring *ring = adev->rings[i];
>>>> +
>>>> +               if (!ring || !ring->sched.thread)
>>>> +                       continue;
>>>> +
>>>> +               /* for non-empty affinity ring, increase 
>>>> working_ring_num */
>>>> +               if (ring->sched.affinity_group == 
>>>> s_job->sched.affinity_group) {
>>>> +                       if (!list_empty(&ring->sched.ring_mirror_list))
>>>> +                               working_ring_num++;
>>>> +               }
>>>> +       }
>>>> +
>>>> +       if (working_ring_num > 1) {
>>>> +               return false;
>>>> +       }
>>>> +       return true;
>>>> +}
>>>> +
>>>>     /**
>>>>      * amdgpu_device_should_recover_gpu - check if we should try GPU 
>>>> recovery
>>>>      *
>>>> @@ -4310,8 +4341,10 @@ static int 
>>>> amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>>             amdgpu_fence_driver_force_completion(ring);
>>>>         }
>>>> -    if(job)
>>>> -        drm_sched_increase_karma(&job->base);
>>>> +    if (amdgpu_gpu_recovery != 2) {
>>>> +        if (job)
>>>> +            drm_sched_increase_karma(&job->base);
>>>> +    }
>>>>         /* Don't suspend on bare metal if we are not going to HW 
>>>> reset the ASIC */
>>>>         if (!amdgpu_sriov_vf(adev)) {
>>>> @@ -4639,7 +4672,7 @@ int amdgpu_device_gpu_recover(struct 
>>>> amdgpu_device *adev,
>>>>         int i, r = 0;
>>>>         bool need_emergency_restart = false;
>>>>         bool audio_suspended = false;
>>>> -
>>>> +    int    tmp_vram_lost_counter;
>>>>         /*
>>>>          * Special case: RAS triggered and full reset isn't supported
>>>>          */
>>>> @@ -4690,8 +4723,16 @@ int amdgpu_device_gpu_recover(struct 
>>>> amdgpu_device *adev,
>>>>                         job ? job->base.id : -1);
>>>>             /* even we skipped this reset, still need to set the job 
>>>> to guilty */
>>>> -        if (job)
>>>> -            drm_sched_increase_karma(&job->base);
>>>> +        if (job) {
>>>> +            if (amdgpu_gpu_recovery == 2) {
>>>> +                if (&job->base) {
>>>> +                    spin_lock(&job->base.sched->job_list_lock);
>>>> +                    list_add(&job->base.node, 
>>>> &job->base.sched->ring_mirror_list);
>>>> +                    spin_unlock(&job->base.sched->job_list_lock);
>>>> +                }
>>>> +            } else
>>>> +                drm_sched_increase_karma(&job->base);
>>>> +        }
>>>>             goto skip_recovery;
>>>>         }
>>>> @@ -4788,6 +4829,7 @@ int amdgpu_device_gpu_recover(struct 
>>>> amdgpu_device *adev,
>>>>             }
>>>>         }
>>>> +    tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
>>>>         /* Actual ASIC resets if needed.*/
>>>>         /* TODO Implement XGMI hive reset logic for SRIOV */
>>>>         if (amdgpu_sriov_vf(adev)) {
>>>> @@ -4804,18 +4846,64 @@ int amdgpu_device_gpu_recover(struct
>>>> amdgpu_device *adev,
>>>>         /* Post ASIC reset for all devs .*/
>>>>         list_for_each_entry(tmp_adev, device_list_handle, 
>>>> gmc.xgmi.head)
>>>> {
>>>> +        int step = 1;
>>>> +        if (amdgpu_gpu_recovery == 2) {
>>>> +            if 
>>>> (amdgpu_affinity_group_has_only_or_null_working_ring(adev,&job->base)
>>>> +                || tmp_vram_lost_counter < 
>>>> atomic_read(&adev->vram_lost_counter)) {
>>>> +                DRM_INFO("Skip Stage0 Resubmit Stage\n");
>>>> +                /* set guilty */
>>>> +                drm_sched_increase_karma(&job->base);
>>>> +                step = 1;
>>>> +            } else {
>>>> +                DRM_INFO("Do Stage0 Resubmit Stage\n");
>>>> +                step = 0;
>>>> +            }
>>>> +        }
>>>> +
>>>> +retry_resubmit:
>>>>             for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>>>                 struct amdgpu_ring *ring = tmp_adev->rings[i];
>>>> +            int ret = 0;
>>>> +            struct drm_sched_job *s_bad_job = NULL;
>>>>                 if (!ring || !ring->sched.thread)
>>>>                     continue;
>>>>                 /* No point to resubmit jobs if we didn't HW reset*/
>>>> -            if (!tmp_adev->asic_reset_res && !job_signaled)
>>>> +            if (!tmp_adev->asic_reset_res && !job_signaled) {
>>>> +
>>>>                     drm_sched_resubmit_jobs(&ring->sched);
>>>> -            drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
>>>> +                if (amdgpu_gpu_recovery == 2 && step == 0) {
>>>> +                    ret = 
>>>> amdgpu_wait_resubmitted_jobs_completion(&ring->sched, 
>>>> ring->sched.timeout, &s_bad_job);
>>>> +                    if (ret == -1) {
>>>> +                        DRM_ERROR("Found the real bad job! ring:%s, 
>>>> job_id:%llx\n", ring->sched.name, s_bad_job->id);
>>>> +                        /* set guilty */
>>>> +                        drm_sched_increase_karma(s_bad_job);
>>>> +
>>>> +                        /* do hw reset */
>>>> +                        if (amdgpu_sriov_vf(adev)) {
>>>> +                            amdgpu_virt_fini_data_exchange(adev);
>>>> +                            r = amdgpu_device_reset_sriov(adev, 
>>>> false);
>>>> +                            if (r)
>>>> +                                adev->asic_reset_res = r;
>>>> +                        } else {
>>>> +                            r  = amdgpu_do_asic_reset(hive, 
>>>> device_list_handle, &need_full_reset, false);
>>>> +                            if (r && r == -EAGAIN)
>>>> +                                goto retry;
>>>> +                        }
>>>> +
>>>> +                        /* add reset counter so that the following 
>>>> resubmitted job could flush vmid */
>>>> +                        atomic_inc(&tmp_adev->gpu_reset_counter);
>>>> +                        step = 1;
>>>> +                        goto retry_resubmit;
>>>> +                    }
>>>> +                }
>>>> +            }
>>>> +
>>>> +            if (step == 1)
>>>> +                drm_sched_start(&ring->sched, 
>>>> !tmp_adev->asic_reset_res);
>>>>             }
>>>>             if (!amdgpu_device_has_dc_support(tmp_adev) && 
>>>> !job_signaled) {
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> index 865f924772b0..9c3f4edb7532 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> @@ -509,7 +509,7 @@ module_param_named(compute_multipipe, 
>>>> amdgpu_compute_multipipe, int, 0444);
>>>>      * DOC: gpu_recovery (int)
>>>>      * Set to enable GPU recovery mechanism (1 = enable, 0 = 
>>>> disable). The default is -1 (auto, disabled except SRIOV).
>>>>      */
>>>> -MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (1 =
>>>> enable, 0 = disable, -1 = auto)");
>>>> +MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (2 =
>>>> +advanced tdr mode, 1 = enable, 0 = disable, -1 = auto)");
>>>>     module_param_named(gpu_recovery, amdgpu_gpu_recovery, int, 0444);
>>>>     /**
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>> index 759b34799221..28cda321157a 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>> @@ -281,6 +281,53 @@ void amdgpu_job_stop_all_jobs_on_sched(struct 
>>>> drm_gpu_scheduler *sched)
>>>>         }
>>>>     }
>>>> +int amdgpu_wait_resubmitted_jobs_completion(struct drm_gpu_scheduler
>>>> +*sched, long timeout, struct drm_sched_job **s_bad_job) {
>>>> +    struct drm_sched_job *s_job, *tmp;
>>>> +    int ret = 0;
>>>> +
>>>> +    list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, 
>>>> node) {
>>>> +        struct drm_sched_fence *s_fence = s_job->s_fence;
>>>> +
>>>> +            if (s_fence->parent == NULL) { /* fail to get a hw 
>>>> fence */
>>>> +                /* process a job */
>>>> +                atomic_dec(&sched->num_jobs);
>>>> +                dma_fence_get(&s_fence->finished);
>>>> +                dma_fence_signal(&s_fence->finished);
>>>> +                dma_fence_put(&s_fence->finished);
>>>> +
>>>> +                /* remove node from mirror_list and free the job */
>>>> +                spin_lock(&sched->job_list_lock);
>>>> +                list_del_init(&s_job->node);
>>>> +                spin_unlock(&sched->job_list_lock);
>>>> +                sched->ops->free_job(s_job);
>>>> +                continue;
>>>> +            }
>>>> +
>>>> +            ret = dma_fence_wait_timeout(s_fence->parent, false, 
>>>> timeout);
>>>> +
>>>> +            if (ret > 0) { /* succeed */
>>>> +                /* process a job */
>>>> +                atomic_dec(&sched->num_jobs);
>>>> +                dma_fence_get(&s_fence->finished);
>>>> +                dma_fence_signal(&s_fence->finished);
>>>> +                dma_fence_put(&s_fence->finished);
>>>> +
>>>> +                /* remove node from mirror_list and free the job */
>>>> +                spin_lock(&sched->job_list_lock);
>>>> +                list_del_init(&s_job->node);
>>>> +                spin_unlock(&sched->job_list_lock);
>>>> +                sched->ops->free_job(s_job);
>>>> +                continue;
>>>> +            } else if (ret == 0) {
>>>> +                *s_bad_job = s_job;
>>>> +                return -1; /* timeout */
>>>> +            }
>>>> +    }
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>>     const struct drm_sched_backend_ops amdgpu_sched_ops = {
>>>>         .dependency = amdgpu_job_dependency,
>>>>         .run_job = amdgpu_job_run,
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> index 81caac9b958a..25292f4699fb 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> @@ -76,5 +76,5 @@ int amdgpu_job_submit_direct(struct amdgpu_job 
>>>> *job, struct amdgpu_ring *ring,
>>>>                      struct dma_fence **fence);
>>>>     void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler
>>>> *sched);
>>>> -
>>>> +int amdgpu_wait_resubmitted_jobs_completion(struct drm_gpu_scheduler
>>>> +*sched, long timeout, struct drm_sched_job **s_bad_job);
>>>>     #endif
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>>> index b644c78475fd..cb50bfc80bc9 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>>> @@ -35,6 +35,11 @@
>>>>     #include "amdgpu.h"
>>>>     #include "atom.h"
>>>> +static char *amdgpu_affinity_group[] = { "gfx", "comp"
>>>> +};
>>>> +
>>>>     /*
>>>>      * Rings
>>>>      * Most engines on the GPU are fed via ring buffers.  Ring @@
>>>> -189,6
>>>> +194,7 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct
>>>> +amdgpu_ring *ring,
>>>>             ring->adev = adev;
>>>>             ring->idx = adev->num_rings++;
>>>>             adev->rings[ring->idx] = ring;
>>>> +        amdgpu_ring_set_affinity_group(ring);
>>>>             r = amdgpu_fence_driver_init_ring(ring, 
>>>> sched_hw_submission);
>>>>             if (r)
>>>>                 return r;
>>>> @@ -459,3 +465,24 @@ int amdgpu_ring_test_helper(struct amdgpu_ring 
>>>> *ring)
>>>>         ring->sched.ready = !r;
>>>>         return r;
>>>>     }
>>>> +
>>>> +int amdgpu_ring_set_affinity_group(struct amdgpu_ring *ring) {
>>>> +       struct amdgpu_device *adev = ring->adev;
>>>> +       int i;
>>>> +
>>>> +       for (i = 0; i < ARRAY_SIZE(amdgpu_affinity_group); i++) {
>>>> +               char *temp_name = amdgpu_affinity_group[i];
>>>> +
>>>> +               /* set ring's affinity_group bit if find it in 
>>>> affinity_group list */
>>>> +               if (strncmp(ring->name, temp_name, 
>>>> strlen(temp_name)) == 0) {
>>>> +                       DRM_DEV_INFO(adev->dev, "set ring:%s in 
>>>> affinity_group\n",
>>>> +                             ring->name);
>>>> +                       ring->sched.affinity_group = 1;
>>>> +                       return 0;
>>>> +               }
>>>> +       }
>>>> +
>>>> +       ring->sched.affinity_group = 0;
>>>> +       return 0;
>>>> +}
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> index 56acec1075ac..6b0d217e6f5a 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> @@ -350,4 +350,5 @@ int amdgpu_debugfs_ring_init(struct 
>>>> amdgpu_device *adev,
>>>>                      struct amdgpu_ring *ring);
>>>>     void amdgpu_debugfs_ring_fini(struct amdgpu_ring *ring);
>>>> +int amdgpu_ring_set_affinity_group(struct amdgpu_ring *ring);
>>>>     #endif
>>>> diff --git a/include/drm/gpu_scheduler.h
>>>> b/include/drm/gpu_scheduler.h index 1c815e0a14ed..589cbaea35dc 100644
>>>> --- a/include/drm/gpu_scheduler.h
>>>> +++ b/include/drm/gpu_scheduler.h
>>>> @@ -301,6 +301,7 @@ struct drm_gpu_scheduler {
>>>>         atomic_t                        _score;
>>>>         bool                ready;
>>>>         bool                free_guilty;
>>>> +    int                affinity_group;
>>>>     };
>>>>     int drm_sched_init(struct drm_gpu_scheduler *sched,
> 

* Re: [PATCH v2] drm/amd/amdgpu implement tdr advanced mode
  2021-03-09 16:48           ` Andrey Grodzovsky
@ 2021-03-09 17:41             ` Christian König
  2021-03-10 11:26               ` Zhang, Jack (Jian)
  0 siblings, 1 reply; 13+ messages in thread
From: Christian König @ 2021-03-09 17:41 UTC (permalink / raw)
  To: Andrey Grodzovsky, Liu, Monk, Zhang, Jack (Jian), amd-gfx, Deng, Emily

Yeah, sorry. I saw your reply only after I had written this mail.

I think we can go ahead if you don't have any other concerns about this.

Christian.

Am 09.03.21 um 17:48 schrieb Andrey Grodzovsky:
> If we are talking about '[PATCH v3] drm/amd/amdgpu implement tdr 
> advanced mode', which was sent yesterday, then I already went over it 
> and only had 2 cosmetic comments.
>
> Andrey
>
> On 2021-03-09 6:16 a.m., Christian König wrote:
>> Yeah, those are some really good points. I completely agree that we 
>> shouldn't do any larger cleanup right now.
>>
>> But I think we still need some more review on this. I most likely 
>> won't have enough time to look into this before the weekend.
>>
>> Andrey can you take a look as well?
>>
>> Thanks,
>> Christian.
>>
>> Am 09.03.21 um 08:29 schrieb Liu, Monk:
>>> [AMD Official Use Only - Internal Distribution Only]
>>>
>>> Christian
>>>
>>> what is feasible and practical now is:
>>> 1) we implement the advanced TDR mode in upstream first (so we can 
>>> copy the same scheme in our LTS kernel) -- if you want we can avoid 
>>> changing the drm/scheduler part of the code, but that approach was 
>>> already rejected by you as too complicated
>>> 2) then we retire the mirror list concept and rework the 
>>> drm/scheduler with a KFIFO
>>> 3) remove the guilty/karma handling from the scheduler
>>>
>>> So I basically agree with you on the spirit of the above changes: hide 
>>> those AMD-internal concepts or tricks in the vendor's driver part and 
>>> keep the scheduler simple and scalable.
>>> But that definitely needs a longer design and discussion, so why 
>>> don't we focus on our current problems now;
>>> as long as the new change doesn't regress, it is still a good change 
>>> based on the current TDR implementation.
>>>
>>> I would propose we only change the AMD part of the code this time. Jack's 
>>> first version patch didn't touch the scheduler part, but you stated it 
>>> was too complicated and rejected it.
>>>
>>> So the allowable revision option is what Jack did in v2, which needs 
>>> to introduce a new scheduler API, drm_sched_resubmit_jobs2().
>>>
>>> Hah :-( what do you think?
>>>
>>> Thanks
>>>
>>> ------------------------------------------
>>> Monk Liu | Cloud-GPU Core team
>>> ------------------------------------------
>>>
>>> -----Original Message-----
>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>> Sent: Monday, March 8, 2021 3:53 PM
>>> To: Liu, Monk <Monk.Liu@amd.com>; Zhang, Jack (Jian) 
>>> <Jack.Zhang1@amd.com>; amd-gfx@lists.freedesktop.org; Grodzovsky, 
>>> Andrey <Andrey.Grodzovsky@amd.com>; Deng, Emily <Emily.Deng@amd.com>
>>> Subject: Re: [PATCH v2] drm/amd/amdgpu implement tdr advanced mode
>>>
>>>
>>>
>>> Am 08.03.21 um 05:06 schrieb Liu, Monk:
>>>> [AMD Official Use Only - Internal Distribution Only]
>>>>
>>>>>> well first of all please completely drop the affinity group stuff 
>>>>>> from this patch. We should concentrate on one feature at a time.
>>>> We need it to expedite the process; we can introduce this change in
>>>> another patch.
>>>>
>>>>
>>>>>> Then the implementation is way too complicated. All you need to do 
>>>>>> is insert a dma_fence_wait after re-scheduling each job after a 
>>>>>> reset.
>>>> No, that's not true. During drm_sched_resubmit_jobs() it will put
>>>> all jobs in the mirror list into the hw ring, but we can only allow the
>>>> first job into the ring to catch the real guilty one (otherwise it is 
>>>> possible that a later job in the ring also has a bug and affects our 
>>>> judgement). So we need to implement a new 
>>>> drm_sched_resubmit_jobs2(), like this:
>>> Something like this. But since waiting for the guilty job is AMD 
>>> specific we should rather rework the stuff from the beginning.
>>>
>>> What I have in mind is the following:
>>> 1. Add a reference from the scheduler fence back to the job which is 
>>> cleared only when the scheduler fence finishes.
>>> 2. Completely drop the ring_mirror_list and replace it with a kfifo 
>>> of pointers to the active scheduler fences.
>>> 3. Replace drm_sched_resubmit_jobs with a 
>>> drm_sched_for_each_active() macro which allows drivers to iterate 
>>> over all the active jobs and resubmit/wait/mark them as guilty etc 
>>> etc..
>>> 4. Remove the guilty/karma handling from the scheduler. This is 
>>> something AMD specific and shouldn't leak into common code.
>>>
>>> Regards,
>>> Christian.
>>>
>>>> drm_sched_resubmit_jobs2()
>>>> ~ 499 void drm_sched_resubmit_jobs2(struct drm_gpu_scheduler 
>>>> *sched, int max)
>>>>     500 {
>>>>     501     struct drm_sched_job *s_job, *tmp;
>>>>     502     uint64_t guilty_context;
>>>>     503     bool found_guilty = false;
>>>>     504     struct dma_fence *fence;
>>>> + 505     int i = 0;
>>>>     506
>>>>     507     list_for_each_entry_safe(s_job, tmp, 
>>>> &sched->ring_mirror_list, node) {
>>>>     508         struct drm_sched_fence *s_fence = s_job->s_fence;
>>>>     509
>>>> + 510         if (i >= max)
>>>> + 511             break;
>>>> + 512
>>>>     513         if (!found_guilty && atomic_read(&s_job->karma) > 
>>>> sched->hang_limit) {
>>>>     514             found_guilty = true;
>>>>     515             guilty_context = 
>>>> s_job->s_fence->scheduled.context;
>>>>     516         }
>>>>     517
>>>>     518         if (found_guilty && 
>>>> s_job->s_fence->scheduled.context == guilty_context)
>>>>     519 dma_fence_set_error(&s_fence->finished, -ECANCELED);
>>>>     520
>>>>     521         dma_fence_put(s_job->s_fence->parent);
>>>>     522         fence = sched->ops->run_job(s_job);
>>>> + 523         i++;
>>>>     524
>>>>     525         if (IS_ERR_OR_NULL(fence)) {
>>>>     526             if (IS_ERR(fence))
>>>>     527 dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
>>>>     528
>>>>     529             s_job->s_fence->parent = NULL;
>>>>     530         } else {
>>>>     531             s_job->s_fence->parent = fence;
>>>>     532         }
>>>>     533
>>>>     534
>>>>     535     }
>>>>     536 }
>>>>     537 EXPORT_SYMBOL(drm_sched_resubmit_jobs);
>>>>     538
>>>>
>>>>
>>>>
>>>> Thanks
>>>>
>>>> ------------------------------------------
>>>> Monk Liu | Cloud-GPU Core team
>>>> ------------------------------------------
>>>>
>>>> -----Original Message-----
>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>> Sent: Sunday, March 7, 2021 3:03 AM
>>>> To: Zhang, Jack (Jian) <Jack.Zhang1@amd.com>;
>>>> amd-gfx@lists.freedesktop.org; Grodzovsky, Andrey
>>>> <Andrey.Grodzovsky@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Deng, Emily
>>>> <Emily.Deng@amd.com>
>>>> Subject: Re: [PATCH v2] drm/amd/amdgpu implement tdr advanced mode
>>>>
>>>> Hi Jack,
>>>>
>>>> well first of all please completely drop the affinity group stuff 
>>>> from this patch. We should concentrate on one feature at a time.
>>>>
>>>> Then the implementation is way too complicated. All you need to do is 
>>>> insert a dma_fence_wait after re-scheduling each job after a reset.
>>>>
>>>> Additionally, this feature is completely AMD specific and 
>>>> shouldn't affect the common scheduler in any way.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>> Am 06.03.21 um 18:25 schrieb Jack Zhang:
>>>>> [Why]
>>>>> Previous tdr design treats the first job in job_timeout as the bad 
>>>>> job.
>>>>> But sometimes a later bad compute job can block a good gfx job and
>>>>> cause an unexpected gfx job timeout because gfx and compute ring
>>>>> share internal GC HW mutually.
>>>>>
>>>>> [How]
>>>>> This patch implements an advanced tdr mode.It involves an additinal
>>>>> synchronous pre-resubmit step(Step0 Resubmit) before normal resubmit
>>>>> step in order to find the real bad job.
>>>>>
>>>>> 1. For Bailing TDR job, re-insert it to mirror_list, don't set it to
>>>>> guilty and leave it to be handled by the main reset thread.
>>>>>
>>>>> 2. Don't set the job to guilty in pre_asic_reset, and leave it to be
>>>>> handled by Step0 Resubmit Stage.
>>>>>
>>>>> 3. At Step0 Resubmit stage, it first resubmit jobs asynchronously,
>>>>> then it iterate each ring mirror_list, synchronously pend for each hw
>>>>> fence being signaled. If the a job's hw fence get timeout, we
>>>>> identify it as guilty and do hw reset to recover hw. After that, we
>>>>> would do the normal resubmit step to resubmit left jobs.
>>>>>
>>>>> 4. For whole gpu reset(vram lost), skip Step0 Resubmit as each job
>>>>> after vram lost was considered as bad job.
>>>>>
>>>>> 5. Involve the concept "Affinity Group".
>>>>> Doing two hw resets is not necessary when there's only one ring that
>>>>> has jobs among some hw-related rings.Thus, we involve "affinity 
>>>>> group".
>>>>> Hw-related rings could be added into a common affinity group, such as
>>>>> gfx and compute ring. When tdr happens, we iterate all rings in
>>>>> affinity group, skip Step0 Resubmit stage if there's only one ring's
>>>>> mirror_list that has valid sched jobs.
>>>>>
>>>>> V2:
>>>>>        -fix a cherry-pick mistake for bailing TDR handling.
>>>>>
>>>>>        -do affinity_group check according to the bad job's sched 
>>>>> rather
>>>>>         than the default "1" so that there could be multiple affinity
>>>>>         groups being pre-defined in future.
>>>>>
>>>>> Signed-off-by: Jack Zhang <Jack.Zhang1@amd.com>
>>>>> ---
>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 102 
>>>>> +++++++++++++++++++--
>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |   2 +-
>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  47 ++++++++++
>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_job.h    |   2 +-
>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c   |  27 ++++++
>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |   1 +
>>>>>     include/drm/gpu_scheduler.h                |   1 +
>>>>>     7 files changed, 173 insertions(+), 9 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> index e247c3a2ec08..8632d7071292 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> @@ -4188,6 +4188,37 @@ bool amdgpu_device_has_job_running(struct 
>>>>> amdgpu_device *adev)
>>>>>         return false;
>>>>>     }
>>>>> +bool amdgpu_affinity_group_has_only_or_null_working_ring(struct
>>>>> +amdgpu_device *adev, struct drm_sched_job *s_job) {
>>>>> +       int i;
>>>>> +       int working_ring_num = 0;
>>>>> +
>>>>> +    /*
>>>>> +     * The job is considered as the real bad one
>>>>> +     * if job's sched is not in affinity group
>>>>> +     */
>>>>> +    if (s_job->sched.affinity_group == 0)
>>>>> +            return true;
>>>>> +
>>>>> +       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>>>> +               struct amdgpu_ring *ring = adev->rings[i];
>>>>> +
>>>>> +               if (!ring || !ring->sched.thread)
>>>>> +                       continue;
>>>>> +
>>>>> +               /* for non-empty affinity ring, increase 
>>>>> working_ring_num */
>>>>> +               if (ring->sched.affinity_group == 
>>>>> s_job->sched.affinity_group) {
>>>>> +                       if 
>>>>> (!list_empty(&ring->sched.ring_mirror_list))
>>>>> +                               working_ring_num++;
>>>>> +               }
>>>>> +       }
>>>>> +
>>>>> +       if (working_ring_num > 1) {
>>>>> +               return false;
>>>>> +       }
>>>>> +       return true;
>>>>> +}
>>>>> +
>>>>>     /**
>>>>>      * amdgpu_device_should_recover_gpu - check if we should try 
>>>>> GPU recovery
>>>>>      *
>>>>> @@ -4310,8 +4341,10 @@ static int 
>>>>> amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>>>             amdgpu_fence_driver_force_completion(ring);
>>>>>         }
>>>>> -    if(job)
>>>>> -        drm_sched_increase_karma(&job->base);
>>>>> +    if (amdgpu_gpu_recovery != 2) {
>>>>> +        if (job)
>>>>> +            drm_sched_increase_karma(&job->base);
>>>>> +    }
>>>>>         /* Don't suspend on bare metal if we are not going to HW 
>>>>> reset the ASIC */
>>>>>         if (!amdgpu_sriov_vf(adev)) {
>>>>> @@ -4639,7 +4672,7 @@ int amdgpu_device_gpu_recover(struct 
>>>>> amdgpu_device *adev,
>>>>>         int i, r = 0;
>>>>>         bool need_emergency_restart = false;
>>>>>         bool audio_suspended = false;
>>>>> -
>>>>> +    int    tmp_vram_lost_counter;
>>>>>         /*
>>>>>          * Special case: RAS triggered and full reset isn't supported
>>>>>          */
>>>>> @@ -4690,8 +4723,16 @@ int amdgpu_device_gpu_recover(struct 
>>>>> amdgpu_device *adev,
>>>>>                         job ? job->base.id : -1);
>>>>>             /* even we skipped this reset, still need to set the 
>>>>> job to guilty */
>>>>> -        if (job)
>>>>> -            drm_sched_increase_karma(&job->base);
>>>>> +        if (job) {
>>>>> +            if (amdgpu_gpu_recovery == 2) {
>>>>> +                if (&job->base) {
>>>>> + spin_lock(&job->base.sched->job_list_lock);
>>>>> +                    list_add(&job->base.node, 
>>>>> &job->base.sched->ring_mirror_list);
>>>>> + spin_unlock(&job->base.sched->job_list_lock);
>>>>> +                }
>>>>> +            } else
>>>>> + drm_sched_increase_karma(&job->base);
>>>>> +        }
>>>>>             goto skip_recovery;
>>>>>         }
>>>>> @@ -4788,6 +4829,7 @@ int amdgpu_device_gpu_recover(struct 
>>>>> amdgpu_device *adev,
>>>>>             }
>>>>>         }
>>>>> +    tmp_vram_lost_counter = 
>>>>> atomic_read(&((adev)->vram_lost_counter));
>>>>>         /* Actual ASIC resets if needed.*/
>>>>>         /* TODO Implement XGMI hive reset logic for SRIOV */
>>>>>         if (amdgpu_sriov_vf(adev)) {
>>>>> @@ -4804,18 +4846,64 @@ int amdgpu_device_gpu_recover(struct
>>>>> amdgpu_device *adev,
>>>>>         /* Post ASIC reset for all devs .*/
>>>>>         list_for_each_entry(tmp_adev, device_list_handle, 
>>>>> gmc.xgmi.head)
>>>>> {
>>>>> +        int step = 1;
>>>>> +        if (amdgpu_gpu_recovery == 2) {
>>>>> +            if 
>>>>> (amdgpu_affinity_group_has_only_or_null_working_ring(adev,&job->base)
>>>>> +                || tmp_vram_lost_counter < 
>>>>> atomic_read(&adev->vram_lost_counter)) {
>>>>> +                DRM_INFO("Skip Stage0 Resubmit Stage\n");
>>>>> +                /* set guilty */
>>>>> + drm_sched_increase_karma(&job->base);
>>>>> +                step = 1;
>>>>> +            } else {
>>>>> +                DRM_INFO("Do Stage0 Resubmit Stage\n");
>>>>> +                step = 0;
>>>>> +            }
>>>>> +        }
>>>>> +
>>>>> +retry_resubmit:
>>>>>             for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>>>>                 struct amdgpu_ring *ring = tmp_adev->rings[i];
>>>>> +            int ret = 0;
>>>>> +            struct drm_sched_job *s_bad_job = NULL;
>>>>>                 if (!ring || !ring->sched.thread)
>>>>>                     continue;
>>>>>                 /* No point to resubmit jobs if we didn't HW reset*/
>>>>> -            if (!tmp_adev->asic_reset_res && !job_signaled)
>>>>> +            if (!tmp_adev->asic_reset_res && !job_signaled) {
>>>>> +
>>>>> drm_sched_resubmit_jobs(&ring->sched);
>>>>> -            drm_sched_start(&ring->sched, 
>>>>> !tmp_adev->asic_reset_res);
>>>>> +                if (amdgpu_gpu_recovery == 2 && step == 0) {
>>>>> +                    ret = 
>>>>> amdgpu_wait_resubmitted_jobs_completion(&ring->sched, 
>>>>> ring->sched.timeout, &s_bad_job);
>>>>> +                    if (ret == -1) {
>>>>> +                        DRM_ERROR("Found the real bad job! 
>>>>> ring:%s, job_id:%llx\n", ring->sched.name, s_bad_job->id);
>>>>> +                        /* set guilty */
>>>>> + drm_sched_increase_karma(s_bad_job);
>>>>> +
>>>>> +                        /* do hw reset */
>>>>> +                        if (amdgpu_sriov_vf(adev)) {
>>>>> + amdgpu_virt_fini_data_exchange(adev);
>>>>> +                            r = amdgpu_device_reset_sriov(adev, 
>>>>> false);
>>>>> +                            if (r)
>>>>> +                                adev->asic_reset_res = r;
>>>>> +                        } else {
>>>>> +                            r  = amdgpu_do_asic_reset(hive, 
>>>>> device_list_handle, &need_full_reset, false);
>>>>> +                            if (r && r == -EAGAIN)
>>>>> +                                goto retry;
>>>>> +                        }
>>>>> +
>>>>> +                        /* add reset counter so that the 
>>>>> following resubmitted job could flush vmid */
>>>>> + atomic_inc(&tmp_adev->gpu_reset_counter);
>>>>> +                        step = 1;
>>>>> +                        goto retry_resubmit;
>>>>> +                    }
>>>>> +                }
>>>>> +            }
>>>>> +
>>>>> +            if (step == 1)
>>>>> +                drm_sched_start(&ring->sched, 
>>>>> !tmp_adev->asic_reset_res);
>>>>>             }
>>>>>             if (!amdgpu_device_has_dc_support(tmp_adev) && 
>>>>> !job_signaled) {
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>>> index 865f924772b0..9c3f4edb7532 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>>> @@ -509,7 +509,7 @@ module_param_named(compute_multipipe, 
>>>>> amdgpu_compute_multipipe, int, 0444);
>>>>>      * DOC: gpu_recovery (int)
>>>>>      * Set to enable GPU recovery mechanism (1 = enable, 0 = 
>>>>> disable). The default is -1 (auto, disabled except SRIOV).
>>>>>      */
>>>>> -MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (1 =
>>>>> enable, 0 = disable, -1 = auto)");
>>>>> +MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (2 =
>>>>> +advanced tdr mode, 1 = enable, 0 = disable, -1 = auto)");
>>>>>     module_param_named(gpu_recovery, amdgpu_gpu_recovery, int, 0444);
>>>>>     /**
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>>> index 759b34799221..28cda321157a 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>>> @@ -281,6 +281,53 @@ void amdgpu_job_stop_all_jobs_on_sched(struct 
>>>>> drm_gpu_scheduler *sched)
>>>>>         }
>>>>>     }
>>>>> +int amdgpu_wait_resubmitted_jobs_completion(struct drm_gpu_scheduler
>>>>> +*sched, long timeout, struct drm_sched_job **s_bad_job) {
>>>>> +    struct drm_sched_job *s_job, *tmp;
>>>>> +    int ret = 0;
>>>>> +
>>>>> +    list_for_each_entry_safe(s_job, tmp, 
>>>>> &sched->ring_mirror_list, node) {
>>>>> +        struct drm_sched_fence *s_fence = s_job->s_fence;
>>>>> +
>>>>> +            if (s_fence->parent == NULL) { /* fail to get a hw 
>>>>> fence */
>>>>> +                /* process a job */
>>>>> +                atomic_dec(&sched->num_jobs);
>>>>> +                dma_fence_get(&s_fence->finished);
>>>>> + dma_fence_signal(&s_fence->finished);
>>>>> +                dma_fence_put(&s_fence->finished);
>>>>> +
>>>>> +                /* remove node from mirror_list and free the job */
>>>>> +                spin_lock(&sched->job_list_lock);
>>>>> +                list_del_init(&s_job->node);
>>>>> + spin_unlock(&sched->job_list_lock);
>>>>> +                sched->ops->free_job(s_job);
>>>>> +                continue;
>>>>> +            }
>>>>> +
>>>>> +            ret = dma_fence_wait_timeout(s_fence->parent, false, 
>>>>> timeout);
>>>>> +
>>>>> +            if (ret > 0) { /* succeed */
>>>>> +                /* process a job */
>>>>> +                atomic_dec(&sched->num_jobs);
>>>>> +                dma_fence_get(&s_fence->finished);
>>>>> + dma_fence_signal(&s_fence->finished);
>>>>> +                dma_fence_put(&s_fence->finished);
>>>>> +
>>>>> +                /* remove node from mirror_list and free the job */
>>>>> +                spin_lock(&sched->job_list_lock);
>>>>> +                list_del_init(&s_job->node);
>>>>> + spin_unlock(&sched->job_list_lock);
>>>>> +                sched->ops->free_job(s_job);
>>>>> +                continue;
>>>>> +            } else if (ret == 0) {
>>>>> +                *s_bad_job = s_job;
>>>>> +                return -1; /* timeout */
>>>>> +            }
>>>>> +    }
>>>>> +
>>>>> +    return 0;
>>>>> +}
>>>>> +
>>>>>     const struct drm_sched_backend_ops amdgpu_sched_ops = {
>>>>>         .dependency = amdgpu_job_dependency,
>>>>>         .run_job = amdgpu_job_run,
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>>> index 81caac9b958a..25292f4699fb 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>>> @@ -76,5 +76,5 @@ int amdgpu_job_submit_direct(struct amdgpu_job 
>>>>> *job, struct amdgpu_ring *ring,
>>>>>                      struct dma_fence **fence);
>>>>>     void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler
>>>>> *sched);
>>>>> -
>>>>> +int amdgpu_wait_resubmitted_jobs_completion(struct drm_gpu_scheduler
>>>>> +*sched, long timeout, struct drm_sched_job **s_bad_job);
>>>>>     #endif
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>>>> index b644c78475fd..cb50bfc80bc9 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>>>> @@ -35,6 +35,11 @@
>>>>>     #include "amdgpu.h"
>>>>>     #include "atom.h"
>>>>> +static char *amdgpu_affinity_group[] = { "gfx", "comp"
>>>>> +};
>>>>> +
>>>>>     /*
>>>>>      * Rings
>>>>>      * Most engines on the GPU are fed via ring buffers. Ring @@
>>>>> -189,6
>>>>> +194,7 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct
>>>>> +amdgpu_ring *ring,
>>>>>             ring->adev = adev;
>>>>>             ring->idx = adev->num_rings++;
>>>>>             adev->rings[ring->idx] = ring;
>>>>> +        amdgpu_ring_set_affinity_group(ring);
>>>>>             r = amdgpu_fence_driver_init_ring(ring, 
>>>>> sched_hw_submission);
>>>>>             if (r)
>>>>>                 return r;
>>>>> @@ -459,3 +465,24 @@ int amdgpu_ring_test_helper(struct 
>>>>> amdgpu_ring *ring)
>>>>>         ring->sched.ready = !r;
>>>>>         return r;
>>>>>     }
>>>>> +
>>>>> +int amdgpu_ring_set_affinity_group(struct amdgpu_ring *ring) {
>>>>> +       struct amdgpu_device *adev = ring->adev;
>>>>> +       int i;
>>>>> +
>>>>> +       for (i = 0; i < ARRAY_SIZE(amdgpu_affinity_group); i++) {
>>>>> +               char *temp_name = amdgpu_affinity_group[i];
>>>>> +
>>>>> +               /* set ring's affinity_group bit if find it in 
>>>>> affinity_group list */
>>>>> +               if (strncmp(ring->name, temp_name, 
>>>>> strlen(temp_name)) == 0) {
>>>>> +                       DRM_DEV_INFO(adev->dev, "set ring:%s in 
>>>>> affinity_group\n",
>>>>> +                             ring->name);
>>>>> +                       ring->sched.affinity_group = 1;
>>>>> +                       return 0;
>>>>> +               }
>>>>> +       }
>>>>> +
>>>>> +       ring->sched.affinity_group = 0;
>>>>> +       return 0;
>>>>> +}
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>> index 56acec1075ac..6b0d217e6f5a 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>> @@ -350,4 +350,5 @@ int amdgpu_debugfs_ring_init(struct 
>>>>> amdgpu_device *adev,
>>>>>                      struct amdgpu_ring *ring);
>>>>>     void amdgpu_debugfs_ring_fini(struct amdgpu_ring *ring);
>>>>> +int amdgpu_ring_set_affinity_group(struct amdgpu_ring *ring);
>>>>>     #endif
>>>>> diff --git a/include/drm/gpu_scheduler.h
>>>>> b/include/drm/gpu_scheduler.h index 1c815e0a14ed..589cbaea35dc 100644
>>>>> --- a/include/drm/gpu_scheduler.h
>>>>> +++ b/include/drm/gpu_scheduler.h
>>>>> @@ -301,6 +301,7 @@ struct drm_gpu_scheduler {
>>>>>         atomic_t                        _score;
>>>>>         bool                ready;
>>>>>         bool                free_guilty;
>>>>> +    int                affinity_group;
>>>>>     };
>>>>>     int drm_sched_init(struct drm_gpu_scheduler *sched,
>>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: [PATCH v2] drm/amd/amdgpu implement tdr advanced mode
  2021-03-09 17:41             ` Christian König
@ 2021-03-10 11:26               ` Zhang, Jack (Jian)
  0 siblings, 0 replies; 13+ messages in thread
From: Zhang, Jack (Jian) @ 2021-03-10 11:26 UTC (permalink / raw)
  To: Koenig, Christian, Grodzovsky, Andrey, Liu, Monk, amd-gfx, Deng, Emily

[AMD Official Use Only - Internal Distribution Only]

Hi, Andrey and Christian,

I have uploaded the V4 patch, which incorporates the previous review comments.
Would you please help review it?

The basic idea is to add an additional resubmit step before the normal resubmit.

In this resubmit step, we submit the first job of each ring and wait for its hw fence.

If the fence is signaled, we signal the job's finished fence, delete its node from the mirror list and free the job.
If the fence times out, we do a hw reset and mark the job as guilty.

After we have verified the first job of each ring this way, we know that all finished
jobs have been cleared and that every hung job has been marked guilty.

Then we can safely do the normal resubmit step, roughly as sketched below.
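A rough sketch of the idea, based on the helpers from the v2 patch quoted below
(the V4 details may differ slightly; error handling, SR-IOV and XGMI paths are
omitted here):

    /* inside the post-reset loop of amdgpu_device_gpu_recover(), per device */
    for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
        struct amdgpu_ring *ring = adev->rings[i];
        struct drm_sched_job *bad_job = NULL;

        if (!ring || !ring->sched.thread)
            continue;

        /* Step0: push the pending jobs back to the hw ring ... */
        drm_sched_resubmit_jobs(&ring->sched);

        /* ... and synchronously wait for their hw fences */
        if (amdgpu_wait_resubmitted_jobs_completion(&ring->sched,
                ring->sched.timeout, &bad_job) == -1) {
            /* the job whose hw fence timed out is the real culprit */
            drm_sched_increase_karma(bad_job);
            /* do the hw reset here, then fall back to the normal resubmit */
        }
    }

The advanced mode is selected through the existing module parameter, i.e.
amdgpu.gpu_recovery=2 on the kernel command line (or gpu_recovery=2 when
loading the amdgpu module).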

Thanks,
Jack

-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com>
Sent: Wednesday, March 10, 2021 1:42 AM
To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Zhang, Jack (Jian) <Jack.Zhang1@amd.com>; amd-gfx@lists.freedesktop.org; Deng, Emily <Emily.Deng@amd.com>
Subject: Re: [PATCH v2] drm/amd/amdgpu implement tdr advanced mode

Yeah, sorry. I only saw your reply after I had written this mail.

I think we can go ahead if you don't have any other concerns on this.

Christian.

Am 09.03.21 um 17:48 schrieb Andrey Grodzovsky:
> If we are talking about '[PATCH v3] drm/amd/amdgpu implement tdr
> advanced mode', which was sent yesterday, then I already went over it
> and only had 2 cosmetic comments.
>
> Andrey
>
> On 2021-03-09 6:16 a.m., Christian König wrote:
>> Yeah, those are some really good points. I completely agree that we
>> shouldn't do any larger cleanup right now.
>>
>> But I think we still need some more review on this. I most likely
>> won't have enough time to look into this before the weekend.
>>
>> Andrey can you take a look as well?
>>
>> Thanks,
>> Christian.
>>
>> Am 09.03.21 um 08:29 schrieb Liu, Monk:
>>> [AMD Official Use Only - Internal Distribution Only]
>>>
>>> Christian
>>>
>>> what is feasible and practical now is:
>>> 1) we implement the advanced TDR mode upstream first (so we can copy
>>> the same scheme into our LTS kernel) -- if you want we can avoid
>>> changing the drm/scheduler code, but that approach was already
>>> rejected by you as too complicated
>>> 2) then we retire the mirror-list concept and rework the
>>> drm/scheduler with a KFIFO
>>> 3) remove the guilty/karma handling from the scheduler
>>>
>>> So I basically agree with you on the spirit of the above changes: hide
>>> those AMD-internal concepts and tricks in the vendor driver and keep
>>> the scheduler simple and scalable. But that definitely needs a longer
>>> design discussion, so why don't we focus on our current problems now?
>>> As long as the new change doesn't regress anything, it is still a good
>>> change based on the current TDR implementation.
>>>
>>> I would propose we only change the AMD code this time. Jack's first
>>> version of the patch didn't touch the scheduler, but you stated it was
>>> too complicated and rejected it.
>>>
>>> So the remaining option is what Jack did in v2, which needs to
>>> introduce a new scheduler API, drm_sched_resubmit_jobs2().
>>>
>>> Hah :-( what do you think?
>>>
>>> Thanks
>>>
>>> ------------------------------------------
>>> Monk Liu | Cloud-GPU Core team
>>> ------------------------------------------
>>>
>>> -----Original Message-----
>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>> Sent: Monday, March 8, 2021 3:53 PM
>>> To: Liu, Monk <Monk.Liu@amd.com>; Zhang, Jack (Jian)
>>> <Jack.Zhang1@amd.com>; amd-gfx@lists.freedesktop.org; Grodzovsky,
>>> Andrey <Andrey.Grodzovsky@amd.com>; Deng, Emily <Emily.Deng@amd.com>
>>> Subject: Re: [PATCH v2] drm/amd/amdgpu implement tdr advanced mode
>>>
>>>
>>>
>>> Am 08.03.21 um 05:06 schrieb Liu, Monk:
>>>> [AMD Official Use Only - Internal Distribution Only]
>>>>
>>>>>> well first of all please completely drop the affinity group stuff
>>>>>> from this patch. We should concentrate on one feature at a time.
>>>> We need it to expedite the process; we can introduce this change in
>>>> another patch.
>>>>
>>>>
>>>>>> Then the implementation is way too complicated. All you need to do
>>>>>> is insert a dma_fence_wait after re-scheduling each job after a
>>>>>> reset.
>>>> No, that's not true. During drm_sched_resubmit_jobs() it will put
>>>> all jobs in the mirror list into the hw ring, but we can only allow
>>>> the first job into the ring in order to catch the real guilty one
>>>> (otherwise it is possible that a later job in the ring also has a
>>>> bug and affects our judgement). So we need to implement a new
>>>> drm_sched_resubmit_jobs2(), like this:
>>> Something like this. But since waiting for the guilty job is
>>> AMD-specific, we should rather rework this from the beginning.
>>>
>>> What I have in mind is the following:
>>> 1. Add a reference from the scheduler fence back to the job which is
>>> cleared only when the scheduler fence finishes.
>>> 2. Completely drop the ring_mirror_list and replace it with a kfifo
>>> of pointers to the active scheduler fences.
>>> 3. Replace drm_sched_resubmit_jobs with a
>>> drm_sched_for_each_active() macro which allows drivers to iterate
>>> over all the active jobs and resubmit/wait/mark them as guilty, and
>>> so on (a rough sketch follows below this list).
>>> 4. Remove the guilty/karma handling from the scheduler. This is
>>> something AMD specific and shouldn't leak into common code.
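(Purely for illustration, a rough sketch of how point 3 could look from the
driver's side; the iteration macro, the job back-reference from point 1 and
the waiting/marking shown here are assumptions, none of this exists yet:)

    struct drm_sched_fence *s_fence;
    struct dma_fence *fence;

    drm_sched_for_each_active(sched, s_fence) {
        struct drm_sched_job *s_job = s_fence->job; /* back-ref from point 1 */

        /* resubmit the job ... */
        fence = sched->ops->run_job(s_job);
        s_fence->parent = fence;

        /* ... and let the driver wait and pick the real bad job itself */
        if (!fence || dma_fence_wait_timeout(fence, false, sched->timeout) <= 0) {
            /* driver-specific: treat s_job as the bad job, do a hw reset, ... */
            break;
        }
    }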
>>>
>>> Regards,
>>> Christian.
>>>
>>>> drm_sched_resubmit_jobs2()
>>>> ~ 499 void drm_sched_resubmit_jobs2(struct drm_gpu_scheduler
>>>> *sched, int max)
>>>>     500 {
>>>>     501     struct drm_sched_job *s_job, *tmp;
>>>>     502     uint64_t guilty_context;
>>>>     503     bool found_guilty = false;
>>>>     504     struct dma_fence *fence;
>>>> + 505     int i = 0;
>>>>     506
>>>>     507     list_for_each_entry_safe(s_job, tmp,
>>>> &sched->ring_mirror_list, node) {
>>>>     508         struct drm_sched_fence *s_fence = s_job->s_fence;
>>>>     509
>>>> + 510         if (i >= max)
>>>> + 511             break;
>>>> + 512
>>>>     513         if (!found_guilty && atomic_read(&s_job->karma) >
>>>> sched->hang_limit) {
>>>>     514             found_guilty = true;
>>>>     515             guilty_context =
>>>> s_job->s_fence->scheduled.context;
>>>>     516         }
>>>>     517
>>>>     518         if (found_guilty &&
>>>> s_job->s_fence->scheduled.context == guilty_context)
>>>>     519 dma_fence_set_error(&s_fence->finished, -ECANCELED);
>>>>     520
>>>>     521         dma_fence_put(s_job->s_fence->parent);
>>>>     522         fence = sched->ops->run_job(s_job);
>>>> + 523         i++;
>>>>     524
>>>>     525         if (IS_ERR_OR_NULL(fence)) {
>>>>     526             if (IS_ERR(fence))
>>>>     527 dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
>>>>     528
>>>>     529             s_job->s_fence->parent = NULL;
>>>>     530         } else {
>>>>     531             s_job->s_fence->parent = fence;
>>>>     532         }
>>>>     533
>>>>     534
>>>>     535     }
>>>>     536 }
>>>>     537 EXPORT_SYMBOL(drm_sched_resubmit_jobs);
>>>>     537 EXPORT_SYMBOL(drm_sched_resubmit_jobs2);
>>>>
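(For what it's worth, the intended driver-side usage would presumably look
something like this; only an illustration of the idea, not tested code:)

    struct drm_sched_job *s_job;

    /* only let the oldest pending job of this ring reach the hardware */
    drm_sched_resubmit_jobs2(&ring->sched, 1);

    s_job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
                                     struct drm_sched_job, node);
    if (s_job && s_job->s_fence->parent &&
        dma_fence_wait_timeout(s_job->s_fence->parent, false,
                               ring->sched.timeout) <= 0) {
        /* this job is the real culprit: do the hw reset and mark it guilty */
    }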
>>>>
>>>>
>>>> Thanks
>>>>
>>>> ------------------------------------------
>>>> Monk Liu | Cloud-GPU Core team
>>>> ------------------------------------------
>>>>
>>>> -----Original Message-----
>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>> Sent: Sunday, March 7, 2021 3:03 AM
>>>> To: Zhang, Jack (Jian) <Jack.Zhang1@amd.com>;
>>>> amd-gfx@lists.freedesktop.org; Grodzovsky, Andrey
>>>> <Andrey.Grodzovsky@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Deng,
>>>> Emily <Emily.Deng@amd.com>
>>>> Subject: Re: [PATCH v2] drm/amd/amdgpu implement tdr advanced mode
>>>>
>>>> Hi Jack,
>>>>
>>>> well first of all please completely drop the affinity group stuff
>>>> from this patch. We should concentrate on one feature at a time.
>>>>
>>>> Then the implementation is way too complicated. All you need to do is
>>>> insert a dma_fence_wait after re-scheduling each job after a reset.
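(Presumably something along these lines, as a minimal sketch of that
suggestion; whether a plain dma_fence_wait or a timed wait is used here is an
open detail:)

    struct drm_sched_job *s_job;

    drm_sched_resubmit_jobs(&ring->sched);

    list_for_each_entry(s_job, &ring->sched.ring_mirror_list, node) {
        if (!s_job->s_fence->parent)
            continue;
        /* if this wait times out, s_job is the one that actually hangs */
        if (dma_fence_wait_timeout(s_job->s_fence->parent, false,
                                   ring->sched.timeout) <= 0)
            break;
    }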
>>>>
>>>> In addition to that, this feature is completely AMD-specific and
>>>> shouldn't affect the common scheduler in any way.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>> Am 06.03.21 um 18:25 schrieb Jack Zhang:
>>>>> [Why]
>>>>> Previous tdr design treats the first job in job_timeout as the bad
>>>>> job.
>>>>> But sometimes a later bad compute job can block a good gfx job and
>>>>> cause an unexpected gfx job timeout because gfx and compute ring
>>>>> share internal GC HW mutually.
>>>>>
>>>>> [How]
>>>>> This patch implements an advanced tdr mode.It involves an
>>>>> additinal synchronous pre-resubmit step(Step0 Resubmit) before
>>>>> normal resubmit step in order to find the real bad job.
>>>>>
>>>>> 1. For Bailing TDR job, re-insert it to mirror_list, don't set it
>>>>> to guilty and leave it to be handled by the main reset thread.
>>>>>
>>>>> 2. Don't set the job to guilty in pre_asic_reset, and leave it to
>>>>> be handled by Step0 Resubmit Stage.
>>>>>
>>>>> 3. At Step0 Resubmit stage, it first resubmit jobs asynchronously,
>>>>> then it iterate each ring mirror_list, synchronously pend for each
>>>>> hw fence being signaled. If the a job's hw fence get timeout, we
>>>>> identify it as guilty and do hw reset to recover hw. After that,
>>>>> we would do the normal resubmit step to resubmit left jobs.
>>>>>
>>>>> 4. For whole gpu reset(vram lost), skip Step0 Resubmit as each job
>>>>> after vram lost was considered as bad job.
>>>>>
>>>>> 5. Involve the concept "Affinity Group".
>>>>> Doing two hw resets is not necessary when there's only one ring
>>>>> that has jobs among some hw-related rings.Thus, we involve
>>>>> "affinity group".
>>>>> Hw-related rings could be added into a common affinity group, such
>>>>> as gfx and compute ring. When tdr happens, we iterate all rings in
>>>>> affinity group, skip Step0 Resubmit stage if there's only one
>>>>> ring's mirror_list that has valid sched jobs.
>>>>>
>>>>> V2:
>>>>>        -fix a cherry-pick mistake for bailing TDR handling.
>>>>>
>>>>>        -do affinity_group check according to the bad job's sched
>>>>> rather
>>>>>         than the default "1" so that there could be multiple
>>>>> affinity
>>>>>         groups being pre-defined in future.
>>>>>
>>>>> Signed-off-by: Jack Zhang <Jack.Zhang1@amd.com>
>>>>> ---
>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 102
>>>>> +++++++++++++++++++--
>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |   2 +-
>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  47 ++++++++++
>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_job.h    |   2 +-
>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c   |  27 ++++++
>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |   1 +
>>>>>     include/drm/gpu_scheduler.h                |   1 +
>>>>>     7 files changed, 173 insertions(+), 9 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> index e247c3a2ec08..8632d7071292 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> @@ -4188,6 +4188,37 @@ bool amdgpu_device_has_job_running(struct
>>>>> amdgpu_device *adev)
>>>>>         return false;
>>>>>     }
>>>>> +bool amdgpu_affinity_group_has_only_or_null_working_ring(struct
>>>>> +amdgpu_device *adev, struct drm_sched_job *s_job) {
>>>>> +       int i;
>>>>> +       int working_ring_num = 0;
>>>>> +
>>>>> +    /*
>>>>> +     * The job is considered as the real bad one
>>>>> +     * if job's sched is not in affinity group
>>>>> +     */
>>>>> +    if (s_job->sched.affinity_group == 0)
>>>>> +            return true;
>>>>> +
>>>>> +       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>>>> +               struct amdgpu_ring *ring = adev->rings[i];
>>>>> +
>>>>> +               if (!ring || !ring->sched.thread)
>>>>> +                       continue;
>>>>> +
>>>>> +               /* for non-empty affinity ring, increase
>>>>> working_ring_num */
>>>>> +               if (ring->sched.affinity_group ==
>>>>> s_job->sched.affinity_group) {
>>>>> +                       if
>>>>> (!list_empty(&ring->sched.ring_mirror_list))
>>>>> +                               working_ring_num++;
>>>>> +               }
>>>>> +       }
>>>>> +
>>>>> +       if (working_ring_num > 1) {
>>>>> +               return false;
>>>>> +       }
>>>>> +       return true;
>>>>> +}
>>>>> +
>>>>>     /**
>>>>>      * amdgpu_device_should_recover_gpu - check if we should try
>>>>> GPU recovery
>>>>>      *
>>>>> @@ -4310,8 +4341,10 @@ static int
>>>>> amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>>>             amdgpu_fence_driver_force_completion(ring);
>>>>>         }
>>>>> -    if(job)
>>>>> -        drm_sched_increase_karma(&job->base);
>>>>> +    if (amdgpu_gpu_recovery != 2) {
>>>>> +        if (job)
>>>>> +            drm_sched_increase_karma(&job->base);
>>>>> +    }
>>>>>         /* Don't suspend on bare metal if we are not going to HW
>>>>> reset the ASIC */
>>>>>         if (!amdgpu_sriov_vf(adev)) { @@ -4639,7 +4672,7 @@ int
>>>>> amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>>>         int i, r = 0;
>>>>>         bool need_emergency_restart = false;
>>>>>         bool audio_suspended = false;
>>>>> -
>>>>> +    int    tmp_vram_lost_counter;
>>>>>         /*
>>>>>          * Special case: RAS triggered and full reset isn't
>>>>> supported
>>>>>          */
>>>>> @@ -4690,8 +4723,16 @@ int amdgpu_device_gpu_recover(struct
>>>>> amdgpu_device *adev,
>>>>>                         job ? job->base.id : -1);
>>>>>             /* even we skipped this reset, still need to set the
>>>>> job to guilty */
>>>>> -        if (job)
>>>>> -            drm_sched_increase_karma(&job->base);
>>>>> +        if (job) {
>>>>> +            if (amdgpu_gpu_recovery == 2) {
>>>>> +                if (&job->base) {
>>>>> +spin_lock(&job->base.sched->job_list_lock);
>>>>> +                    list_add(&job->base.node,
>>>>> &job->base.sched->ring_mirror_list);
>>>>> + spin_unlock(&job->base.sched->job_list_lock);
>>>>> +                }
>>>>> +            } else
>>>>> + drm_sched_increase_karma(&job->base);
>>>>> +        }
>>>>>             goto skip_recovery;
>>>>>         }
>>>>> @@ -4788,6 +4829,7 @@ int amdgpu_device_gpu_recover(struct
>>>>> amdgpu_device *adev,
>>>>>             }
>>>>>         }
>>>>> +    tmp_vram_lost_counter =
>>>>> atomic_read(&((adev)->vram_lost_counter));
>>>>>         /* Actual ASIC resets if needed.*/
>>>>>         /* TODO Implement XGMI hive reset logic for SRIOV */
>>>>>         if (amdgpu_sriov_vf(adev)) { @@ -4804,18 +4846,64 @@ int
>>>>> amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>>>         /* Post ASIC reset for all devs .*/
>>>>>         list_for_each_entry(tmp_adev, device_list_handle,
>>>>> gmc.xgmi.head)
>>>>> {
>>>>> +        int step = 1;
>>>>> +        if (amdgpu_gpu_recovery == 2) {
>>>>> +            if
>>>>> (amdgpu_affinity_group_has_only_or_null_working_ring(adev,&job->ba
>>>>> se)
>>>>> +                || tmp_vram_lost_counter <
>>>>> atomic_read(&adev->vram_lost_counter)) {
>>>>> +                DRM_INFO("Skip Stage0 Resubmit Stage\n");
>>>>> +                /* set guilty */
>>>>> + drm_sched_increase_karma(&job->base);
>>>>> +                step = 1;
>>>>> +            } else {
>>>>> +                DRM_INFO("Do Stage0 Resubmit Stage\n");
>>>>> +                step = 0;
>>>>> +            }
>>>>> +        }
>>>>> +
>>>>> +retry_resubmit:
>>>>>             for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>>>>                 struct amdgpu_ring *ring = tmp_adev->rings[i];
>>>>> +            int ret = 0;
>>>>> +            struct drm_sched_job *s_bad_job = NULL;
>>>>>                 if (!ring || !ring->sched.thread)
>>>>>                     continue;
>>>>>                 /* No point to resubmit jobs if we didn't HW
>>>>> reset*/
>>>>> -            if (!tmp_adev->asic_reset_res && !job_signaled)
>>>>> +            if (!tmp_adev->asic_reset_res && !job_signaled) {
>>>>> +
>>>>> drm_sched_resubmit_jobs(&ring->sched);
>>>>> -            drm_sched_start(&ring->sched,
>>>>> !tmp_adev->asic_reset_res);
>>>>> +                if (amdgpu_gpu_recovery == 2 && step == 0) {
>>>>> +                    ret =
>>>>> amdgpu_wait_resubmitted_jobs_completion(&ring->sched,
>>>>> ring->sched.timeout, &s_bad_job);
>>>>> +                    if (ret == -1) {
>>>>> +                        DRM_ERROR("Found the real bad job!
>>>>> ring:%s, job_id:%llx\n", ring->sched.name, s_bad_job->id);
>>>>> +                        /* set guilty */
>>>>> +drm_sched_increase_karma(s_bad_job);
>>>>> +
>>>>> +                        /* do hw reset */
>>>>> +                        if (amdgpu_sriov_vf(adev)) {
>>>>> +amdgpu_virt_fini_data_exchange(adev);
>>>>> +                            r = amdgpu_device_reset_sriov(adev,
>>>>> false);
>>>>> +                            if (r)
>>>>> +                                adev->asic_reset_res = r;
>>>>> +                        } else {
>>>>> +                            r  = amdgpu_do_asic_reset(hive,
>>>>> device_list_handle, &need_full_reset, false);
>>>>> +                            if (r && r == -EAGAIN)
>>>>> +                                goto retry;
>>>>> +                        }
>>>>> +
>>>>> +                        /* add reset counter so that the
>>>>> following resubmitted job could flush vmid */
>>>>> + atomic_inc(&tmp_adev->gpu_reset_counter);
>>>>> +                        step = 1;
>>>>> +                        goto retry_resubmit;
>>>>> +                    }
>>>>> +                }
>>>>> +            }
>>>>> +
>>>>> +            if (step == 1)
>>>>> +                drm_sched_start(&ring->sched,
>>>>> !tmp_adev->asic_reset_res);
>>>>>             }
>>>>>             if (!amdgpu_device_has_dc_support(tmp_adev) &&
>>>>> !job_signaled) {
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>>> index 865f924772b0..9c3f4edb7532 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>>> @@ -509,7 +509,7 @@ module_param_named(compute_multipipe,
>>>>> amdgpu_compute_multipipe, int, 0444);
>>>>>      * DOC: gpu_recovery (int)
>>>>>      * Set to enable GPU recovery mechanism (1 = enable, 0 =
>>>>> disable). The default is -1 (auto, disabled except SRIOV).
>>>>>      */
>>>>> -MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (1
>>>>> = enable, 0 = disable, -1 = auto)");
>>>>> +MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (2
>>>>> += advanced tdr mode, 1 = enable, 0 = disable, -1 = auto)");
>>>>>     module_param_named(gpu_recovery, amdgpu_gpu_recovery, int,
>>>>> 0444);
>>>>>     /**
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>>> index 759b34799221..28cda321157a 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>>> @@ -281,6 +281,53 @@ void amdgpu_job_stop_all_jobs_on_sched(struct
>>>>> drm_gpu_scheduler *sched)
>>>>>         }
>>>>>     }
>>>>> +int amdgpu_wait_resubmitted_jobs_completion(struct
>>>>> +drm_gpu_scheduler *sched, long timeout, struct drm_sched_job
>>>>> +**s_bad_job) {
>>>>> +    struct drm_sched_job *s_job, *tmp;
>>>>> +    int ret = 0;
>>>>> +
>>>>> +    list_for_each_entry_safe(s_job, tmp,
>>>>> &sched->ring_mirror_list, node) {
>>>>> +        struct drm_sched_fence *s_fence = s_job->s_fence;
>>>>> +
>>>>> +            if (s_fence->parent == NULL) { /* fail to get a hw
>>>>> fence */
>>>>> +                /* process a job */
>>>>> +                atomic_dec(&sched->num_jobs);
>>>>> +                dma_fence_get(&s_fence->finished);
>>>>> + dma_fence_signal(&s_fence->finished);
>>>>> +                dma_fence_put(&s_fence->finished);
>>>>> +
>>>>> +                /* remove node from mirror_list and free the job
>>>>> +*/
>>>>> +                spin_lock(&sched->job_list_lock);
>>>>> +                list_del_init(&s_job->node);
>>>>> +spin_unlock(&sched->job_list_lock);
>>>>> +                sched->ops->free_job(s_job);
>>>>> +                continue;
>>>>> +            }
>>>>> +
>>>>> +            ret = dma_fence_wait_timeout(s_fence->parent, false,
>>>>> timeout);
>>>>> +
>>>>> +            if (ret > 0) { /* succeed */
>>>>> +                /* process a job */
>>>>> +                atomic_dec(&sched->num_jobs);
>>>>> +                dma_fence_get(&s_fence->finished);
>>>>> + dma_fence_signal(&s_fence->finished);
>>>>> +                dma_fence_put(&s_fence->finished);
>>>>> +
>>>>> +                /* remove node from mirror_list and free the job
>>>>> +*/
>>>>> +                spin_lock(&sched->job_list_lock);
>>>>> +                list_del_init(&s_job->node);
>>>>> +spin_unlock(&sched->job_list_lock);
>>>>> +                sched->ops->free_job(s_job);
>>>>> +                continue;
>>>>> +            } else if (ret == 0) {
>>>>> +                *s_bad_job = s_job;
>>>>> +                return -1; /* timeout */
>>>>> +            }
>>>>> +    }
>>>>> +
>>>>> +    return 0;
>>>>> +}
>>>>> +
>>>>>     const struct drm_sched_backend_ops amdgpu_sched_ops = {
>>>>>         .dependency = amdgpu_job_dependency,
>>>>>         .run_job = amdgpu_job_run, diff --git
>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>>> index 81caac9b958a..25292f4699fb 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>>> @@ -76,5 +76,5 @@ int amdgpu_job_submit_direct(struct amdgpu_job
>>>>> *job, struct amdgpu_ring *ring,
>>>>>                      struct dma_fence **fence);
>>>>>     void amdgpu_job_stop_all_jobs_on_sched(struct
>>>>> drm_gpu_scheduler *sched);
>>>>> -
>>>>> +int amdgpu_wait_resubmitted_jobs_completion(struct
>>>>> +drm_gpu_scheduler *sched, long timeout, struct drm_sched_job
>>>>> +**s_bad_job);
>>>>>     #endif
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>>>> index b644c78475fd..cb50bfc80bc9 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>>>> @@ -35,6 +35,11 @@
>>>>>     #include "amdgpu.h"
>>>>>     #include "atom.h"
>>>>> +static char *amdgpu_affinity_group[] = { "gfx", "comp"
>>>>> +};
>>>>> +
>>>>>     /*
>>>>>      * Rings
>>>>>      * Most engines on the GPU are fed via ring buffers. Ring @@
>>>>> -189,6
>>>>> +194,7 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct
>>>>> +amdgpu_ring *ring,
>>>>>             ring->adev = adev;
>>>>>             ring->idx = adev->num_rings++;
>>>>>             adev->rings[ring->idx] = ring;
>>>>> +        amdgpu_ring_set_affinity_group(ring);
>>>>>             r = amdgpu_fence_driver_init_ring(ring,
>>>>> sched_hw_submission);
>>>>>             if (r)
>>>>>                 return r;
>>>>> @@ -459,3 +465,24 @@ int amdgpu_ring_test_helper(struct
>>>>> amdgpu_ring *ring)
>>>>>         ring->sched.ready = !r;
>>>>>         return r;
>>>>>     }
>>>>> +
>>>>> +int amdgpu_ring_set_affinity_group(struct amdgpu_ring *ring) {
>>>>> +       struct amdgpu_device *adev = ring->adev;
>>>>> +       int i;
>>>>> +
>>>>> +       for (i = 0; i < ARRAY_SIZE(amdgpu_affinity_group); i++) {
>>>>> +               char *temp_name = amdgpu_affinity_group[i];
>>>>> +
>>>>> +               /* set ring's affinity_group bit if find it in
>>>>> affinity_group list */
>>>>> +               if (strncmp(ring->name, temp_name,
>>>>> strlen(temp_name)) == 0) {
>>>>> +                       DRM_DEV_INFO(adev->dev, "set ring:%s in
>>>>> affinity_group\n",
>>>>> +                             ring->name);
>>>>> +                       ring->sched.affinity_group = 1;
>>>>> +                       return 0;
>>>>> +               }
>>>>> +       }
>>>>> +
>>>>> +       ring->sched.affinity_group = 0;
>>>>> +       return 0;
>>>>> +}
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>> index 56acec1075ac..6b0d217e6f5a 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>> @@ -350,4 +350,5 @@ int amdgpu_debugfs_ring_init(struct
>>>>> amdgpu_device *adev,
>>>>>                      struct amdgpu_ring *ring);
>>>>>     void amdgpu_debugfs_ring_fini(struct amdgpu_ring *ring);
>>>>> +int amdgpu_ring_set_affinity_group(struct amdgpu_ring *ring);
>>>>>     #endif
>>>>> diff --git a/include/drm/gpu_scheduler.h
>>>>> b/include/drm/gpu_scheduler.h index 1c815e0a14ed..589cbaea35dc
>>>>> 100644
>>>>> --- a/include/drm/gpu_scheduler.h
>>>>> +++ b/include/drm/gpu_scheduler.h
>>>>> @@ -301,6 +301,7 @@ struct drm_gpu_scheduler {
>>>>>         atomic_t                        _score;
>>>>>         bool                ready;
>>>>>         bool                free_guilty;
>>>>> +    int                affinity_group;
>>>>>     };
>>>>>     int drm_sched_init(struct drm_gpu_scheduler *sched,
>>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2021-03-10 11:27 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-06 17:25 [PATCH v2] drm/amd/amdgpu implement tdr advanced mode Jack Zhang
2021-03-06 19:03 ` Christian König
2021-03-08  4:06   ` Liu, Monk
2021-03-08  4:22     ` Zhang, Jack (Jian)
2021-03-08  7:53     ` Christian König
2021-03-08 12:43       ` Zhang, Jack (Jian)
2021-03-08 12:45         ` Christian König
2021-03-09  7:03           ` Zhang, Jack (Jian)
2021-03-09  7:29       ` Liu, Monk
2021-03-09 11:16         ` Christian König
2021-03-09 16:48           ` Andrey Grodzovsky
2021-03-09 17:41             ` Christian König
2021-03-10 11:26               ` Zhang, Jack (Jian)
