All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdgpu: Handle the GPU recovery failure in SRIOV environment.
@ 2022-02-01 21:47 Surbhi Kakarya
  2022-02-02  7:44 ` Christian König
                   ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: Surbhi Kakarya @ 2022-02-01 21:47 UTC (permalink / raw)
  To: amd-gfx, Bokun.Zhang, HaiJun.Chang, Monk.Liu, Alexander.Deucher,
	Kelly.Zytaruk
  Cc: Surbhi Kakarya

This patch handles the GPU recovery faliure in sriov environment by
retrying the reset if the first reset fails. To determine the condition of retry, a
new function amdgpu_is_retry_sriov_reset() is added which returns true if failure is due
to ETIMEDOUT, EINVAL or EBUSY, otherwise return false. MAX_RETRY_LIMIT is used to
limit the retry to 2.

It also handles the return status in Post Asic Reset by updating the return code
with asic_reset_res and eventually return the return code in amdgpu_job_timedout().

Signed-off-by: Surbhi Kakarya <Surbhi.Kakarya@amd.com>
Change-Id: Ib2e408819b4780e6963e1dc078c3410cd512e6e8
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 47 ++++++++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  6 ++-
 2 files changed, 49 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 53af2623c58f..f50c18cb38c8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -89,6 +89,7 @@ MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/yellow_carp_gpu_info.bin");
 
 #define AMDGPU_RESUME_MS		2000
+#define MAX_RETRY_LIMIT		2
 
 const char *amdgpu_asic_name[] = {
 	"TAHITI",
@@ -5026,11 +5027,27 @@ static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
 	return 0;
 }
 
+/**
+ * amdgpu_is_retry_sriov_reset - check if we should retry sriov reset
+ *
+ * Check amdgpu_is_retry_sriov_reset and return status to see if we should retry reset.
+ */
+static bool amdgpu_is_retry_sriov_reset(int r)
+{
+
+        if(r == -EBUSY || r == -ETIMEDOUT || r == -EINVAL)
+                return true;
+        else
+                return false;
+
+}
+
 static void amdgpu_device_recheck_guilty_jobs(
 	struct amdgpu_device *adev, struct list_head *device_list_handle,
 	struct amdgpu_reset_context *reset_context)
 {
 	int i, r = 0;
+	int retry_limit = 0;
 
 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 		struct amdgpu_ring *ring = adev->rings[i];
@@ -5064,8 +5081,18 @@ static void amdgpu_device_recheck_guilty_jobs(
 			if (amdgpu_sriov_vf(adev)) {
 				amdgpu_virt_fini_data_exchange(adev);
 				r = amdgpu_device_reset_sriov(adev, false);
-				if (r)
+				if (r) {
 					adev->asic_reset_res = r;
+					if (amdgpu_is_retry_sriov_reset(r)) {
+						adev->asic_reset_res = 0;
+						if (retry_limit < MAX_RETRY_LIMIT) {
+							retry_limit++;
+							goto retry;
+						}
+						else
+							DRM_ERROR("GPU reset retry is beyond the retry limit\n");
+					}
+				}
 			} else {
 				clear_bit(AMDGPU_SKIP_HW_RESET,
 					  &reset_context->flags);
@@ -5122,6 +5149,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	bool locked = false;
 	int tmp_vram_lost_counter;
 	struct amdgpu_reset_context reset_context;
+	int retry_limit = 0;
 
 	memset(&reset_context, 0, sizeof(reset_context));
 
@@ -5299,8 +5327,18 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	/* Host driver will handle XGMI hive reset for SRIOV */
 	if (amdgpu_sriov_vf(adev)) {
 		r = amdgpu_device_reset_sriov(adev, job ? false : true);
-		if (r)
-			adev->asic_reset_res = r;
+                if (r) {
+                        adev->asic_reset_res = r;
+                        if (amdgpu_is_retry_sriov_reset(r)) {
+				adev->asic_reset_res = 0;
+				if (retry_limit < MAX_RETRY_LIMIT) {
+					retry_limit++;
+					goto retry;
+				}
+				else
+					DRM_ERROR("GPU reset retry is beyond the retry limit\n");
+                        }
+                }
 	} else {
 		r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
 		if (r && r == -EAGAIN)
@@ -5341,6 +5379,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
 		}
 
+		if (tmp_adev->asic_reset_res)
+			r = tmp_adev->asic_reset_res;
+
 		tmp_adev->asic_reset_res = 0;
 
 		if (r) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e0730ea56a8c..1f0fb21ac15a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -37,6 +37,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 	struct amdgpu_task_info ti;
 	struct amdgpu_device *adev = ring->adev;
 	int idx;
+	int r = 0;
 
 	if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
 		DRM_INFO("%s - device unplugged skipping recovery on scheduler:%s",
@@ -63,7 +64,10 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 		  ti.process_name, ti.tgid, ti.task_name, ti.pid);
 
 	if (amdgpu_device_should_recover_gpu(ring->adev)) {
-		amdgpu_device_gpu_recover(ring->adev, job);
+		r = amdgpu_device_gpu_recover(ring->adev, job);
+		if (r)
+			DRM_ERROR("GPU Recovery Failed: %d\n",r);
+
 	} else {
 		drm_sched_suspend_timeout(&ring->sched);
 		if (amdgpu_sriov_vf(adev))
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread
* [PATCH] drm/amdgpu: Handle the GPU recovery failure in SRIOV environment.
@ 2022-01-31 15:35 Surbhi Kakarya
  2022-01-31 20:23 ` Alex Deucher
  2022-02-01 10:03 ` Christian König
  0 siblings, 2 replies; 7+ messages in thread
From: Surbhi Kakarya @ 2022-01-31 15:35 UTC (permalink / raw)
  To: amd-gfx, Bokun.Zhang, HaiJun.Chang, Monk.Liu, Alexander.Deucher,
	Kelly.Zytaruk
  Cc: Surbhi Kakarya

This patch handles the GPU recovery faliure in sriov environment by
retrying the reset if the first reset fails. To determine the condition of retry, a
new function amdgpu_is_retry_sriov_reset() is added which returns true if failure is due
to ETIMEDOUT, EINVAL or EBUSY, otherwise return false.

It also handles the return status in Post Asic Reset by updating the return code
with asic_reset_res and eventually return the return code in amdgpu_job_timedout().

Change-Id: I45b9743adb548606aef8774496527d29fb3de0af
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 34 ++++++++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  6 +++-
 2 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 53af2623c58f..8a742b77eef8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5026,6 +5026,21 @@ static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
 	return 0;
 }
 
+/**
+ * amdgpu_is_retry_sriov_reset - check if we should retry sriov reset
+ *
+ * Check amdgpu_is_retry_sriov_reset and return status to see if we should retry reset.
+ */
+static bool amdgpu_is_retry_sriov_reset(int r)
+{
+
+        if(r == -EBUSY || r == -ETIMEDOUT || r == -EINVAL)
+                return true;
+        else
+                return false;
+
+}
+
 static void amdgpu_device_recheck_guilty_jobs(
 	struct amdgpu_device *adev, struct list_head *device_list_handle,
 	struct amdgpu_reset_context *reset_context)
@@ -5064,8 +5079,13 @@ static void amdgpu_device_recheck_guilty_jobs(
 			if (amdgpu_sriov_vf(adev)) {
 				amdgpu_virt_fini_data_exchange(adev);
 				r = amdgpu_device_reset_sriov(adev, false);
-				if (r)
+				if (r) {
 					adev->asic_reset_res = r;
+					if (amdgpu_is_retry_sriov_reset(r)) {
+						adev->asic_reset_res = 0;
+						goto retry;
+					}
+				}
 			} else {
 				clear_bit(AMDGPU_SKIP_HW_RESET,
 					  &reset_context->flags);
@@ -5299,8 +5319,13 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	/* Host driver will handle XGMI hive reset for SRIOV */
 	if (amdgpu_sriov_vf(adev)) {
 		r = amdgpu_device_reset_sriov(adev, job ? false : true);
-		if (r)
-			adev->asic_reset_res = r;
+                if (r) {
+                        adev->asic_reset_res = r;
+                        if (amdgpu_is_retry_sriov_reset(r)) {
+				adev->asic_reset_res = 0;
+				goto retry;
+                        }
+                }
 	} else {
 		r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
 		if (r && r == -EAGAIN)
@@ -5341,6 +5366,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
 		}
 
+		if (tmp_adev->asic_reset_res)
+			r = tmp_adev->asic_reset_res;
+
 		tmp_adev->asic_reset_res = 0;
 
 		if (r) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e0730ea56a8c..1f0fb21ac15a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -37,6 +37,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 	struct amdgpu_task_info ti;
 	struct amdgpu_device *adev = ring->adev;
 	int idx;
+	int r = 0;
 
 	if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
 		DRM_INFO("%s - device unplugged skipping recovery on scheduler:%s",
@@ -63,7 +64,10 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 		  ti.process_name, ti.tgid, ti.task_name, ti.pid);
 
 	if (amdgpu_device_should_recover_gpu(ring->adev)) {
-		amdgpu_device_gpu_recover(ring->adev, job);
+		r = amdgpu_device_gpu_recover(ring->adev, job);
+		if (r)
+			DRM_ERROR("GPU Recovery Failed: %d\n",r);
+
 	} else {
 		drm_sched_suspend_timeout(&ring->sched);
 		if (amdgpu_sriov_vf(adev))
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2022-02-02 16:17 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-02-01 21:47 [PATCH] drm/amdgpu: Handle the GPU recovery failure in SRIOV environment Surbhi Kakarya
2022-02-02  7:44 ` Christian König
2022-02-02 15:43 ` Andrey Grodzovsky
2022-02-02 16:17 ` Felix Kuehling
  -- strict thread matches above, loose matches on Subject: below --
2022-01-31 15:35 Surbhi Kakarya
2022-01-31 20:23 ` Alex Deucher
2022-02-01 10:03 ` Christian König

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.