All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdgpu: This patch handles the GPU recovery faliure in sriov environment by retrying the reset if the first reset fails. To determine the condition of retry, a new function amdgpu_is_retry_sriov_reset() is added which returns true if failure is due to ETIMEDOUT, EINVAL or EBUSY, otherwise return false.
@ 2022-01-31 15:21 Surbhi Kakarya
  0 siblings, 0 replies; only message in thread
From: Surbhi Kakarya @ 2022-01-31 15:21 UTC (permalink / raw)
  To: amd-gfx, Bokun.Zhang, HaiJun.Chang, Monk.Liu, Alexander.Deucher,
	Kelly.Zytaruk
  Cc: Surbhi Kakarya

It also handles the return status in Post Asic Reset by updating the return code
with asic_reset_res and eventually return the return code in amdgpu_job_timedout().

Change-Id: I45b9743adb548606aef8774496527d29fb3de0af
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 34 ++++++++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  6 +++-
 2 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 53af2623c58f..8a742b77eef8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5026,6 +5026,21 @@ static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
 	return 0;
 }
 
+/**
+ * amdgpu_is_retry_sriov_reset - check if we should retry sriov reset
+ *
+ * Check amdgpu_is_retry_sriov_reset and return status to see if we should retry reset.
+ */
+static bool amdgpu_is_retry_sriov_reset(int r)
+{
+
+        if(r == -EBUSY || r == -ETIMEDOUT || r == -EINVAL)
+                return true;
+        else
+                return false;
+
+}
+
 static void amdgpu_device_recheck_guilty_jobs(
 	struct amdgpu_device *adev, struct list_head *device_list_handle,
 	struct amdgpu_reset_context *reset_context)
@@ -5064,8 +5079,13 @@ static void amdgpu_device_recheck_guilty_jobs(
 			if (amdgpu_sriov_vf(adev)) {
 				amdgpu_virt_fini_data_exchange(adev);
 				r = amdgpu_device_reset_sriov(adev, false);
-				if (r)
+				if (r) {
 					adev->asic_reset_res = r;
+					if (amdgpu_is_retry_sriov_reset(r)) {
+						adev->asic_reset_res = 0;
+						goto retry;
+					}
+				}
 			} else {
 				clear_bit(AMDGPU_SKIP_HW_RESET,
 					  &reset_context->flags);
@@ -5299,8 +5319,13 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	/* Host driver will handle XGMI hive reset for SRIOV */
 	if (amdgpu_sriov_vf(adev)) {
 		r = amdgpu_device_reset_sriov(adev, job ? false : true);
-		if (r)
-			adev->asic_reset_res = r;
+                if (r) {
+                        adev->asic_reset_res = r;
+                        if (amdgpu_is_retry_sriov_reset(r)) {
+				adev->asic_reset_res = 0;
+				goto retry;
+                        }
+                }
 	} else {
 		r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
 		if (r && r == -EAGAIN)
@@ -5341,6 +5366,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
 		}
 
+		if (tmp_adev->asic_reset_res)
+			r = tmp_adev->asic_reset_res;
+
 		tmp_adev->asic_reset_res = 0;
 
 		if (r) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e0730ea56a8c..1f0fb21ac15a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -37,6 +37,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 	struct amdgpu_task_info ti;
 	struct amdgpu_device *adev = ring->adev;
 	int idx;
+	int r = 0;
 
 	if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
 		DRM_INFO("%s - device unplugged skipping recovery on scheduler:%s",
@@ -63,7 +64,10 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 		  ti.process_name, ti.tgid, ti.task_name, ti.pid);
 
 	if (amdgpu_device_should_recover_gpu(ring->adev)) {
-		amdgpu_device_gpu_recover(ring->adev, job);
+		r = amdgpu_device_gpu_recover(ring->adev, job);
+		if (r)
+			DRM_ERROR("GPU Recovery Failed: %d\n",r);
+
 	} else {
 		drm_sched_suspend_timeout(&ring->sched);
 		if (amdgpu_sriov_vf(adev))
-- 
2.25.1


^ permalink raw reply related	[flat|nested] only message in thread

only message in thread, other threads:[~2022-01-31 15:27 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-01-31 15:21 [PATCH] drm/amdgpu: This patch handles the GPU recovery faliure in sriov environment by retrying the reset if the first reset fails. To determine the condition of retry, a new function amdgpu_is_retry_sriov_reset() is added which returns true if failure is due to ETIMEDOUT, EINVAL or EBUSY, otherwise return false Surbhi Kakarya

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.