All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdgpu: Handle the GPU recovery failure in SRIOV environment.
@ 2022-02-01 21:47 Surbhi Kakarya
  2022-02-02  7:44 ` Christian König
                   ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: Surbhi Kakarya @ 2022-02-01 21:47 UTC (permalink / raw)
  To: amd-gfx, Bokun.Zhang, HaiJun.Chang, Monk.Liu, Alexander.Deucher,
	Kelly.Zytaruk
  Cc: Surbhi Kakarya

This patch handles the GPU recovery failure in sriov environment by
retrying the reset if the first reset fails. To determine the condition of retry, a
new function amdgpu_is_retry_sriov_reset() is added which returns true if failure is due
to ETIMEDOUT, EINVAL or EBUSY, otherwise return false. MAX_RETRY_LIMIT is used to
limit the retry to 2.

It also handles the return status in Post Asic Reset by updating the return code
with asic_reset_res and eventually return the return code in amdgpu_job_timedout().

Signed-off-by: Surbhi Kakarya <Surbhi.Kakarya@amd.com>
Change-Id: Ib2e408819b4780e6963e1dc078c3410cd512e6e8
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 47 ++++++++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  6 ++-
 2 files changed, 49 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 53af2623c58f..f50c18cb38c8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -89,6 +89,7 @@ MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/yellow_carp_gpu_info.bin");
 
 #define AMDGPU_RESUME_MS		2000
+#define MAX_RETRY_LIMIT		2
 
 const char *amdgpu_asic_name[] = {
 	"TAHITI",
@@ -5026,11 +5027,27 @@ static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
 	return 0;
 }
 
+/**
+ * amdgpu_is_retry_sriov_reset - check if we should retry sriov reset
+ *
+ * Check amdgpu_is_retry_sriov_reset and return status to see if we should retry reset.
+ */
+static bool amdgpu_is_retry_sriov_reset(int r)
+{
+
+        if(r == -EBUSY || r == -ETIMEDOUT || r == -EINVAL)
+                return true;
+        else
+                return false;
+
+}
+
 static void amdgpu_device_recheck_guilty_jobs(
 	struct amdgpu_device *adev, struct list_head *device_list_handle,
 	struct amdgpu_reset_context *reset_context)
 {
 	int i, r = 0;
+	int retry_limit = 0;
 
 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 		struct amdgpu_ring *ring = adev->rings[i];
@@ -5064,8 +5081,18 @@ static void amdgpu_device_recheck_guilty_jobs(
 			if (amdgpu_sriov_vf(adev)) {
 				amdgpu_virt_fini_data_exchange(adev);
 				r = amdgpu_device_reset_sriov(adev, false);
-				if (r)
+				if (r) {
 					adev->asic_reset_res = r;
+					if (amdgpu_is_retry_sriov_reset(r)) {
+						adev->asic_reset_res = 0;
+						if (retry_limit < MAX_RETRY_LIMIT) {
+							retry_limit++;
+							goto retry;
+						}
+						else
+							DRM_ERROR("GPU reset retry is beyond the retry limit\n");
+					}
+				}
 			} else {
 				clear_bit(AMDGPU_SKIP_HW_RESET,
 					  &reset_context->flags);
@@ -5122,6 +5149,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	bool locked = false;
 	int tmp_vram_lost_counter;
 	struct amdgpu_reset_context reset_context;
+	int retry_limit = 0;
 
 	memset(&reset_context, 0, sizeof(reset_context));
 
@@ -5299,8 +5327,18 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	/* Host driver will handle XGMI hive reset for SRIOV */
 	if (amdgpu_sriov_vf(adev)) {
 		r = amdgpu_device_reset_sriov(adev, job ? false : true);
-		if (r)
-			adev->asic_reset_res = r;
+                if (r) {
+                        adev->asic_reset_res = r;
+                        if (amdgpu_is_retry_sriov_reset(r)) {
+				adev->asic_reset_res = 0;
+				if (retry_limit < MAX_RETRY_LIMIT) {
+					retry_limit++;
+					goto retry;
+				}
+				else
+					DRM_ERROR("GPU reset retry is beyond the retry limit\n");
+                        }
+                }
 	} else {
 		r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
 		if (r && r == -EAGAIN)
@@ -5341,6 +5379,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
 		}
 
+		if (tmp_adev->asic_reset_res)
+			r = tmp_adev->asic_reset_res;
+
 		tmp_adev->asic_reset_res = 0;
 
 		if (r) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e0730ea56a8c..1f0fb21ac15a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -37,6 +37,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 	struct amdgpu_task_info ti;
 	struct amdgpu_device *adev = ring->adev;
 	int idx;
+	int r = 0;
 
 	if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
 		DRM_INFO("%s - device unplugged skipping recovery on scheduler:%s",
@@ -63,7 +64,10 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 		  ti.process_name, ti.tgid, ti.task_name, ti.pid);
 
 	if (amdgpu_device_should_recover_gpu(ring->adev)) {
-		amdgpu_device_gpu_recover(ring->adev, job);
+		r = amdgpu_device_gpu_recover(ring->adev, job);
+		if (r)
+			DRM_ERROR("GPU Recovery Failed: %d\n",r);
+
 	} else {
 		drm_sched_suspend_timeout(&ring->sched);
 		if (amdgpu_sriov_vf(adev))
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH] drm/amdgpu: Handle the GPU recovery failure in SRIOV environment.
  2022-02-01 21:47 [PATCH] drm/amdgpu: Handle the GPU recovery failure in SRIOV environment Surbhi Kakarya
@ 2022-02-02  7:44 ` Christian König
  2022-02-02 15:43 ` Andrey Grodzovsky
  2022-02-02 16:17 ` Felix Kuehling
  2 siblings, 0 replies; 7+ messages in thread
From: Christian König @ 2022-02-02  7:44 UTC (permalink / raw)
  To: Surbhi Kakarya, amd-gfx, Bokun.Zhang, HaiJun.Chang, Monk.Liu,
	Alexander.Deucher, Kelly.Zytaruk, Andrey Grodzovsky

[Adding Andrey as well]

Am 01.02.22 um 22:47 schrieb Surbhi Kakarya:
> This patch handles the GPU recovery faliure in sriov environment by
> retrying the reset if the first reset fails. To determine the condition of retry, a
> new function amdgpu_is_retry_sriov_reset() is added which returns true if failure is due
> to ETIMEDOUT, EINVAL or EBUSY, otherwise return false. MAX_RETRY_LIMIT is used to
> limit the retry to 2.
>
> It also handles the return status in Post Asic Reset by updating the return code
> with asic_reset_res and eventually return the return code in amdgpu_job_timedout().
>
> Signed-off-by: Surbhi Kakarya <Surbhi.Kakarya@amd.com>
> Change-Id: Ib2e408819b4780e6963e1dc078c3410cd512e6e8
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 47 ++++++++++++++++++++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  6 ++-
>   2 files changed, 49 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 53af2623c58f..f50c18cb38c8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -89,6 +89,7 @@ MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin");
>   MODULE_FIRMWARE("amdgpu/yellow_carp_gpu_info.bin");
>   
>   #define AMDGPU_RESUME_MS		2000
> +#define MAX_RETRY_LIMIT		2

Please use an AMDGPU_ prefix for all defines.

>   
>   const char *amdgpu_asic_name[] = {
>   	"TAHITI",
> @@ -5026,11 +5027,27 @@ static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
>   	return 0;
>   }
>   
> +/**
> + * amdgpu_is_retry_sriov_reset - check if we should retry sriov reset
> + *
> + * Check amdgpu_is_retry_sriov_reset and return status to see if we should retry reset.
> + */
> +static bool amdgpu_is_retry_sriov_reset(int r)

Please use an amdgpu_device_ prefix here.

> +{
> +
> +        if(r == -EBUSY || r == -ETIMEDOUT || r == -EINVAL)
> +                return true;
> +        else
> +                return false;
> +
> +}
> +
>   static void amdgpu_device_recheck_guilty_jobs(
>   	struct amdgpu_device *adev, struct list_head *device_list_handle,
>   	struct amdgpu_reset_context *reset_context)
>   {
>   	int i, r = 0;
> +	int retry_limit = 0;
>   
>   	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>   		struct amdgpu_ring *ring = adev->rings[i];
> @@ -5064,8 +5081,18 @@ static void amdgpu_device_recheck_guilty_jobs(
>   			if (amdgpu_sriov_vf(adev)) {
>   				amdgpu_virt_fini_data_exchange(adev);
>   				r = amdgpu_device_reset_sriov(adev, false);
> -				if (r)
> +				if (r) {
>   					adev->asic_reset_res = r;
> +					if (amdgpu_is_retry_sriov_reset(r)) {
> +						adev->asic_reset_res = 0;
> +						if (retry_limit < MAX_RETRY_LIMIT) {
> +							retry_limit++;
> +							goto retry;
> +						}
> +						else
> +							DRM_ERROR("GPU reset retry is beyond the retry limit\n");
> +					}
> +				}

That looks like this should rather be inside the 
amdgpu_device_reset_sriov() function.

In addition to that, please check the coding style with checkpatch.pl.



>   			} else {
>   				clear_bit(AMDGPU_SKIP_HW_RESET,
>   					  &reset_context->flags);
> @@ -5122,6 +5149,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   	bool locked = false;
>   	int tmp_vram_lost_counter;
>   	struct amdgpu_reset_context reset_context;
> +	int retry_limit = 0;
>   
>   	memset(&reset_context, 0, sizeof(reset_context));
>   
> @@ -5299,8 +5327,18 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   	/* Host driver will handle XGMI hive reset for SRIOV */
>   	if (amdgpu_sriov_vf(adev)) {
>   		r = amdgpu_device_reset_sriov(adev, job ? false : true);
> -		if (r)
> -			adev->asic_reset_res = r;
> +                if (r) {
> +                        adev->asic_reset_res = r;
> +                        if (amdgpu_is_retry_sriov_reset(r)) {
> +				adev->asic_reset_res = 0;
> +				if (retry_limit < MAX_RETRY_LIMIT) {
> +					retry_limit++;
> +					goto retry;
> +				}
> +				else
> +					DRM_ERROR("GPU reset retry is beyond the retry limit\n");
> +                        }
> +                }
>   	} else {
>   		r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
>   		if (r && r == -EAGAIN)
> @@ -5341,6 +5379,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
>   		}
>   
> +		if (tmp_adev->asic_reset_res)
> +			r = tmp_adev->asic_reset_res;
> +
>   		tmp_adev->asic_reset_res = 0;
>   
>   		if (r) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index e0730ea56a8c..1f0fb21ac15a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -37,6 +37,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>   	struct amdgpu_task_info ti;
>   	struct amdgpu_device *adev = ring->adev;
>   	int idx;
> +	int r = 0;

Please don't initialize local variables if it isn't necessary.

Regards,
Christian.

>   
>   	if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
>   		DRM_INFO("%s - device unplugged skipping recovery on scheduler:%s",
> @@ -63,7 +64,10 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>   		  ti.process_name, ti.tgid, ti.task_name, ti.pid);
>   
>   	if (amdgpu_device_should_recover_gpu(ring->adev)) {
> -		amdgpu_device_gpu_recover(ring->adev, job);
> +		r = amdgpu_device_gpu_recover(ring->adev, job);
> +		if (r)
> +			DRM_ERROR("GPU Recovery Failed: %d\n",r);
> +
>   	} else {
>   		drm_sched_suspend_timeout(&ring->sched);
>   		if (amdgpu_sriov_vf(adev))


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] drm/amdgpu: Handle the GPU recovery failure in SRIOV environment.
  2022-02-01 21:47 [PATCH] drm/amdgpu: Handle the GPU recovery failure in SRIOV environment Surbhi Kakarya
  2022-02-02  7:44 ` Christian König
@ 2022-02-02 15:43 ` Andrey Grodzovsky
  2022-02-02 16:17 ` Felix Kuehling
  2 siblings, 0 replies; 7+ messages in thread
From: Andrey Grodzovsky @ 2022-02-02 15:43 UTC (permalink / raw)
  To: Surbhi Kakarya, amd-gfx, Bokun.Zhang, HaiJun.Chang, Monk.Liu,
	Alexander.Deucher, Kelly.Zytaruk


On 2022-02-01 16:47, Surbhi Kakarya wrote:
> This patch handles the GPU recovery faliure in sriov environment by
> retrying the reset if the first reset fails. To determine the condition of retry, a
> new function amdgpu_is_retry_sriov_reset() is added which returns true if failure is due
> to ETIMEDOUT, EINVAL or EBUSY, otherwise return false. MAX_RETRY_LIMIT is used to
> limit the retry to 2.
>
> It also handles the return status in Post Asic Reset by updating the return code
> with asic_reset_res and eventually return the return code in amdgpu_job_timedout().
>
> Signed-off-by: Surbhi Kakarya <Surbhi.Kakarya@amd.com>
> Change-Id: Ib2e408819b4780e6963e1dc078c3410cd512e6e8
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 47 ++++++++++++++++++++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  6 ++-
>   2 files changed, 49 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 53af2623c58f..f50c18cb38c8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -89,6 +89,7 @@ MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin");
>   MODULE_FIRMWARE("amdgpu/yellow_carp_gpu_info.bin");
>   
>   #define AMDGPU_RESUME_MS		2000
> +#define MAX_RETRY_LIMIT		2
>   
>   const char *amdgpu_asic_name[] = {
>   	"TAHITI",
> @@ -5026,11 +5027,27 @@ static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
>   	return 0;
>   }
>   
> +/**
> + * amdgpu_is_retry_sriov_reset - check if we should retry sriov reset
> + *
> + * Check amdgpu_is_retry_sriov_reset and return status to see if we should retry reset.
> + */
> +static bool amdgpu_is_retry_sriov_reset(int r)
> +{
> +
> +        if(r == -EBUSY || r == -ETIMEDOUT || r == -EINVAL)
> +                return true;
> +        else
> +                return false;
> +
> +}
> +
>   static void amdgpu_device_recheck_guilty_jobs(
>   	struct amdgpu_device *adev, struct list_head *device_list_handle,
>   	struct amdgpu_reset_context *reset_context)
>   {
>   	int i, r = 0;
> +	int retry_limit = 0;
>   
>   	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>   		struct amdgpu_ring *ring = adev->rings[i];
> @@ -5064,8 +5081,18 @@ static void amdgpu_device_recheck_guilty_jobs(
>   			if (amdgpu_sriov_vf(adev)) {
>   				amdgpu_virt_fini_data_exchange(adev);
>   				r = amdgpu_device_reset_sriov(adev, false);
> -				if (r)
> +				if (r) {
>   					adev->asic_reset_res = r;
> +					if (amdgpu_is_retry_sriov_reset(r)) {
> +						adev->asic_reset_res = 0;
> +						if (retry_limit < MAX_RETRY_LIMIT) {
> +							retry_limit++;
> +							goto retry;
> +						}
> +						else
> +							DRM_ERROR("GPU reset retry is beyond the retry limit\n");
> +					}
> +				}
>   			} else {
>   				clear_bit(AMDGPU_SKIP_HW_RESET,
>   					  &reset_context->flags);
> @@ -5122,6 +5149,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   	bool locked = false;
>   	int tmp_vram_lost_counter;
>   	struct amdgpu_reset_context reset_context;
> +	int retry_limit = 0;
>   
>   	memset(&reset_context, 0, sizeof(reset_context));
>   
> @@ -5299,8 +5327,18 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   	/* Host driver will handle XGMI hive reset for SRIOV */
>   	if (amdgpu_sriov_vf(adev)) {
>   		r = amdgpu_device_reset_sriov(adev, job ? false : true);
> -		if (r)
> -			adev->asic_reset_res = r;
> +                if (r) {
> +                        adev->asic_reset_res = r;
> +                        if (amdgpu_is_retry_sriov_reset(r)) {
> +				adev->asic_reset_res = 0;
> +				if (retry_limit < MAX_RETRY_LIMIT) {
> +					retry_limit++;
> +					goto retry;
> +				}
> +				else
> +					DRM_ERROR("GPU reset retry is beyond the retry limit\n");


Same comment as Christian: could you move this retry handling
inside amdgpu_device_reset_sriov
so as to avoid code duplication here and above? Other than that, looks good
to me.

Andrey


> +                        }
> +                }
>   	} else {
>   		r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
>   		if (r && r == -EAGAIN)
> @@ -5341,6 +5379,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
>   		}
>   
> +		if (tmp_adev->asic_reset_res)
> +			r = tmp_adev->asic_reset_res;
> +
>   		tmp_adev->asic_reset_res = 0;
>   
>   		if (r) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index e0730ea56a8c..1f0fb21ac15a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -37,6 +37,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>   	struct amdgpu_task_info ti;
>   	struct amdgpu_device *adev = ring->adev;
>   	int idx;
> +	int r = 0;
>   
>   	if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
>   		DRM_INFO("%s - device unplugged skipping recovery on scheduler:%s",
> @@ -63,7 +64,10 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>   		  ti.process_name, ti.tgid, ti.task_name, ti.pid);
>   
>   	if (amdgpu_device_should_recover_gpu(ring->adev)) {
> -		amdgpu_device_gpu_recover(ring->adev, job);
> +		r = amdgpu_device_gpu_recover(ring->adev, job);
> +		if (r)
> +			DRM_ERROR("GPU Recovery Failed: %d\n",r);
> +
>   	} else {
>   		drm_sched_suspend_timeout(&ring->sched);
>   		if (amdgpu_sriov_vf(adev))

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] drm/amdgpu: Handle the GPU recovery failure in SRIOV environment.
  2022-02-01 21:47 [PATCH] drm/amdgpu: Handle the GPU recovery failure in SRIOV environment Surbhi Kakarya
  2022-02-02  7:44 ` Christian König
  2022-02-02 15:43 ` Andrey Grodzovsky
@ 2022-02-02 16:17 ` Felix Kuehling
  2 siblings, 0 replies; 7+ messages in thread
From: Felix Kuehling @ 2022-02-02 16:17 UTC (permalink / raw)
  To: Surbhi Kakarya, amd-gfx, Bokun.Zhang, HaiJun.Chang, Monk.Liu,
	Alexander.Deucher, Kelly.Zytaruk

Am 2022-02-01 um 16:47 schrieb Surbhi Kakarya:
> This patch handles the GPU recovery faliure in sriov environment by
> retrying the reset if the first reset fails. To determine the condition of retry, a
> new function amdgpu_is_retry_sriov_reset() is added which returns true if failure is due
> to ETIMEDOUT, EINVAL or EBUSY, otherwise return false. MAX_RETRY_LIMIT is used to
> limit the retry to 2.
>
> It also handles the return status in Post Asic Reset by updating the return code
> with asic_reset_res and eventually return the return code in amdgpu_job_timedout().
>
> Signed-off-by: Surbhi Kakarya <Surbhi.Kakarya@amd.com>
> Change-Id: Ib2e408819b4780e6963e1dc078c3410cd512e6e8
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 47 ++++++++++++++++++++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  6 ++-
>   2 files changed, 49 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 53af2623c58f..f50c18cb38c8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -89,6 +89,7 @@ MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin");
>   MODULE_FIRMWARE("amdgpu/yellow_carp_gpu_info.bin");
>   
>   #define AMDGPU_RESUME_MS		2000
> +#define MAX_RETRY_LIMIT		2
>   
>   const char *amdgpu_asic_name[] = {
>   	"TAHITI",
> @@ -5026,11 +5027,27 @@ static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
>   	return 0;
>   }
>   
> +/**
> + * amdgpu_is_retry_sriov_reset - check if we should retry sriov reset
> + *
> + * Check amdgpu_is_retry_sriov_reset and return status to see if we should retry reset.
> + */
> +static bool amdgpu_is_retry_sriov_reset(int r)
> +{
> +
> +        if(r == -EBUSY || r == -ETIMEDOUT || r == -EINVAL)
> +                return true;
> +        else
> +                return false;
> +
> +}

The missing space between "if" and "(" should cause a checkpatch coding 
style warning. Please run your patch through checkpatch.pl.

That said, this function could be much simpler, maybe even a macro instead:

#define RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)

Regards,
   Felix


> +
>   static void amdgpu_device_recheck_guilty_jobs(
>   	struct amdgpu_device *adev, struct list_head *device_list_handle,
>   	struct amdgpu_reset_context *reset_context)
>   {
>   	int i, r = 0;
> +	int retry_limit = 0;
>   
>   	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>   		struct amdgpu_ring *ring = adev->rings[i];
> @@ -5064,8 +5081,18 @@ static void amdgpu_device_recheck_guilty_jobs(
>   			if (amdgpu_sriov_vf(adev)) {
>   				amdgpu_virt_fini_data_exchange(adev);
>   				r = amdgpu_device_reset_sriov(adev, false);
> -				if (r)
> +				if (r) {
>   					adev->asic_reset_res = r;
> +					if (amdgpu_is_retry_sriov_reset(r)) {
> +						adev->asic_reset_res = 0;
> +						if (retry_limit < MAX_RETRY_LIMIT) {
> +							retry_limit++;
> +							goto retry;
> +						}
> +						else
> +							DRM_ERROR("GPU reset retry is beyond the retry limit\n");
> +					}
> +				}
>   			} else {
>   				clear_bit(AMDGPU_SKIP_HW_RESET,
>   					  &reset_context->flags);
> @@ -5122,6 +5149,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   	bool locked = false;
>   	int tmp_vram_lost_counter;
>   	struct amdgpu_reset_context reset_context;
> +	int retry_limit = 0;
>   
>   	memset(&reset_context, 0, sizeof(reset_context));
>   
> @@ -5299,8 +5327,18 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   	/* Host driver will handle XGMI hive reset for SRIOV */
>   	if (amdgpu_sriov_vf(adev)) {
>   		r = amdgpu_device_reset_sriov(adev, job ? false : true);
> -		if (r)
> -			adev->asic_reset_res = r;
> +                if (r) {
> +                        adev->asic_reset_res = r;
> +                        if (amdgpu_is_retry_sriov_reset(r)) {
> +				adev->asic_reset_res = 0;
> +				if (retry_limit < MAX_RETRY_LIMIT) {
> +					retry_limit++;
> +					goto retry;
> +				}
> +				else
> +					DRM_ERROR("GPU reset retry is beyond the retry limit\n");
> +                        }
> +                }
>   	} else {
>   		r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
>   		if (r && r == -EAGAIN)
> @@ -5341,6 +5379,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
>   		}
>   
> +		if (tmp_adev->asic_reset_res)
> +			r = tmp_adev->asic_reset_res;
> +
>   		tmp_adev->asic_reset_res = 0;
>   
>   		if (r) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index e0730ea56a8c..1f0fb21ac15a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -37,6 +37,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>   	struct amdgpu_task_info ti;
>   	struct amdgpu_device *adev = ring->adev;
>   	int idx;
> +	int r = 0;
>   
>   	if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
>   		DRM_INFO("%s - device unplugged skipping recovery on scheduler:%s",
> @@ -63,7 +64,10 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>   		  ti.process_name, ti.tgid, ti.task_name, ti.pid);
>   
>   	if (amdgpu_device_should_recover_gpu(ring->adev)) {
> -		amdgpu_device_gpu_recover(ring->adev, job);
> +		r = amdgpu_device_gpu_recover(ring->adev, job);
> +		if (r)
> +			DRM_ERROR("GPU Recovery Failed: %d\n",r);
> +
>   	} else {
>   		drm_sched_suspend_timeout(&ring->sched);
>   		if (amdgpu_sriov_vf(adev))

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] drm/amdgpu: Handle the GPU recovery failure in SRIOV environment.
  2022-01-31 15:35 Surbhi Kakarya
  2022-01-31 20:23 ` Alex Deucher
@ 2022-02-01 10:03 ` Christian König
  1 sibling, 0 replies; 7+ messages in thread
From: Christian König @ 2022-02-01 10:03 UTC (permalink / raw)
  To: Surbhi Kakarya, amd-gfx, Bokun.Zhang, HaiJun.Chang, Monk.Liu,
	Alexander.Deucher, Kelly.Zytaruk, Andrey Grodzovsky

Am 31.01.22 um 16:35 schrieb Surbhi Kakarya:
> This patch handles the GPU recovery faliure in sriov environment by
> retrying the reset if the first reset fails. To determine the condition of retry, a
> new function amdgpu_is_retry_sriov_reset() is added which returns true if failure is due
> to ETIMEDOUT, EINVAL or EBUSY, otherwise return false.
>
> It also handles the return status in Post Asic Reset by updating the return code
> with asic_reset_res and eventually return the return code in amdgpu_job_timedout().

That patch is certainly a NAK.

The retry should never be in the job, but rather in the device code itself.

Please sync up with Andrey and Monk about that.

Regards,
Christian.

>
> Change-Id: I45b9743adb548606aef8774496527d29fb3de0af
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 34 ++++++++++++++++++++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  6 +++-
>   2 files changed, 36 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 53af2623c58f..8a742b77eef8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5026,6 +5026,21 @@ static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
>   	return 0;
>   }
>   
> +/**
> + * amdgpu_is_retry_sriov_reset - check if we should retry sriov reset
> + *
> + * Check amdgpu_is_retry_sriov_reset and return status to see if we should retry reset.
> + */
> +static bool amdgpu_is_retry_sriov_reset(int r)
> +{
> +
> +        if(r == -EBUSY || r == -ETIMEDOUT || r == -EINVAL)
> +                return true;
> +        else
> +                return false;
> +
> +}
> +
>   static void amdgpu_device_recheck_guilty_jobs(
>   	struct amdgpu_device *adev, struct list_head *device_list_handle,
>   	struct amdgpu_reset_context *reset_context)
> @@ -5064,8 +5079,13 @@ static void amdgpu_device_recheck_guilty_jobs(
>   			if (amdgpu_sriov_vf(adev)) {
>   				amdgpu_virt_fini_data_exchange(adev);
>   				r = amdgpu_device_reset_sriov(adev, false);
> -				if (r)
> +				if (r) {
>   					adev->asic_reset_res = r;
> +					if (amdgpu_is_retry_sriov_reset(r)) {
> +						adev->asic_reset_res = 0;
> +						goto retry;
> +					}
> +				}
>   			} else {
>   				clear_bit(AMDGPU_SKIP_HW_RESET,
>   					  &reset_context->flags);
> @@ -5299,8 +5319,13 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   	/* Host driver will handle XGMI hive reset for SRIOV */
>   	if (amdgpu_sriov_vf(adev)) {
>   		r = amdgpu_device_reset_sriov(adev, job ? false : true);
> -		if (r)
> -			adev->asic_reset_res = r;
> +                if (r) {
> +                        adev->asic_reset_res = r;
> +                        if (amdgpu_is_retry_sriov_reset(r)) {
> +				adev->asic_reset_res = 0;
> +				goto retry;
> +                        }
> +                }
>   	} else {
>   		r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
>   		if (r && r == -EAGAIN)
> @@ -5341,6 +5366,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
>   		}
>   
> +		if (tmp_adev->asic_reset_res)
> +			r = tmp_adev->asic_reset_res;
> +
>   		tmp_adev->asic_reset_res = 0;
>   
>   		if (r) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index e0730ea56a8c..1f0fb21ac15a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -37,6 +37,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>   	struct amdgpu_task_info ti;
>   	struct amdgpu_device *adev = ring->adev;
>   	int idx;
> +	int r = 0;
>   
>   	if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
>   		DRM_INFO("%s - device unplugged skipping recovery on scheduler:%s",
> @@ -63,7 +64,10 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>   		  ti.process_name, ti.tgid, ti.task_name, ti.pid);
>   
>   	if (amdgpu_device_should_recover_gpu(ring->adev)) {
> -		amdgpu_device_gpu_recover(ring->adev, job);
> +		r = amdgpu_device_gpu_recover(ring->adev, job);
> +		if (r)
> +			DRM_ERROR("GPU Recovery Failed: %d\n",r);
> +
>   	} else {
>   		drm_sched_suspend_timeout(&ring->sched);
>   		if (amdgpu_sriov_vf(adev))


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] drm/amdgpu: Handle the GPU recovery failure in SRIOV environment.
  2022-01-31 15:35 Surbhi Kakarya
@ 2022-01-31 20:23 ` Alex Deucher
  2022-02-01 10:03 ` Christian König
  1 sibling, 0 replies; 7+ messages in thread
From: Alex Deucher @ 2022-01-31 20:23 UTC (permalink / raw)
  To: Surbhi Kakarya
  Cc: Bokun Zhang, Zytaruk, Kelly, amd-gfx list, Chang, HaiJun,
	Deucher, Alexander, monk.liu

On Mon, Jan 31, 2022 at 10:35 AM Surbhi Kakarya <surbhi.kakarya@amd.com> wrote:
>
> This patch handles the GPU recovery faliure in sriov environment by
> retrying the reset if the first reset fails. To determine the condition of retry, a
> new function amdgpu_is_retry_sriov_reset() is added which returns true if failure is due
> to ETIMEDOUT, EINVAL or EBUSY, otherwise return false.
>
> It also handles the return status in Post Asic Reset by updating the return code
> with asic_reset_res and eventually return the return code in amdgpu_job_timedout().
>
> Change-Id: I45b9743adb548606aef8774496527d29fb3de0af

Missing your s-o-b.  Also, does this help on bare metal as well?  If
so, we should make this generic and also set a retry limit.

Alex


> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 34 ++++++++++++++++++++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  6 +++-
>  2 files changed, 36 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 53af2623c58f..8a742b77eef8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5026,6 +5026,21 @@ static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
>         return 0;
>  }
>
> +/**
> + * amdgpu_is_retry_sriov_reset - check if we should retry sriov reset
> + *
> + * Check amdgpu_is_retry_sriov_reset and return status to see if we should retry reset.
> + */
> +static bool amdgpu_is_retry_sriov_reset(int r)
> +{
> +
> +        if(r == -EBUSY || r == -ETIMEDOUT || r == -EINVAL)
> +                return true;
> +        else
> +                return false;
> +
> +}
> +
>  static void amdgpu_device_recheck_guilty_jobs(
>         struct amdgpu_device *adev, struct list_head *device_list_handle,
>         struct amdgpu_reset_context *reset_context)
> @@ -5064,8 +5079,13 @@ static void amdgpu_device_recheck_guilty_jobs(
>                         if (amdgpu_sriov_vf(adev)) {
>                                 amdgpu_virt_fini_data_exchange(adev);
>                                 r = amdgpu_device_reset_sriov(adev, false);
> -                               if (r)
> +                               if (r) {
>                                         adev->asic_reset_res = r;
> +                                       if (amdgpu_is_retry_sriov_reset(r)) {
> +                                               adev->asic_reset_res = 0;
> +                                               goto retry;
> +                                       }
> +                               }
>                         } else {
>                                 clear_bit(AMDGPU_SKIP_HW_RESET,
>                                           &reset_context->flags);
> @@ -5299,8 +5319,13 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>         /* Host driver will handle XGMI hive reset for SRIOV */
>         if (amdgpu_sriov_vf(adev)) {
>                 r = amdgpu_device_reset_sriov(adev, job ? false : true);
> -               if (r)
> -                       adev->asic_reset_res = r;
> +                if (r) {
> +                        adev->asic_reset_res = r;
> +                        if (amdgpu_is_retry_sriov_reset(r)) {
> +                               adev->asic_reset_res = 0;
> +                               goto retry;
> +                        }
> +                }
>         } else {
>                 r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
>                 if (r && r == -EAGAIN)
> @@ -5341,6 +5366,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>                         drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
>                 }
>
> +               if (tmp_adev->asic_reset_res)
> +                       r = tmp_adev->asic_reset_res;
> +
>                 tmp_adev->asic_reset_res = 0;
>
>                 if (r) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index e0730ea56a8c..1f0fb21ac15a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -37,6 +37,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>         struct amdgpu_task_info ti;
>         struct amdgpu_device *adev = ring->adev;
>         int idx;
> +       int r = 0;
>
>         if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
>                 DRM_INFO("%s - device unplugged skipping recovery on scheduler:%s",
> @@ -63,7 +64,10 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>                   ti.process_name, ti.tgid, ti.task_name, ti.pid);
>
>         if (amdgpu_device_should_recover_gpu(ring->adev)) {
> -               amdgpu_device_gpu_recover(ring->adev, job);
> +               r = amdgpu_device_gpu_recover(ring->adev, job);
> +               if (r)
> +                       DRM_ERROR("GPU Recovery Failed: %d\n",r);
> +
>         } else {
>                 drm_sched_suspend_timeout(&ring->sched);
>                 if (amdgpu_sriov_vf(adev))
> --
> 2.25.1
>

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH] drm/amdgpu: Handle the GPU recovery failure in SRIOV environment.
@ 2022-01-31 15:35 Surbhi Kakarya
  2022-01-31 20:23 ` Alex Deucher
  2022-02-01 10:03 ` Christian König
  0 siblings, 2 replies; 7+ messages in thread
From: Surbhi Kakarya @ 2022-01-31 15:35 UTC (permalink / raw)
  To: amd-gfx, Bokun.Zhang, HaiJun.Chang, Monk.Liu, Alexander.Deucher,
	Kelly.Zytaruk
  Cc: Surbhi Kakarya

This patch handles GPU recovery failure in an SRIOV environment by
retrying the reset if the first reset fails. To decide whether to retry, a
new function amdgpu_is_retry_sriov_reset() is added, which returns true if the
failure is due to ETIMEDOUT, EINVAL or EBUSY, and false otherwise.

It also handles the return status in the post-ASIC-reset path by updating the
return code with asic_reset_res and eventually returning it to
amdgpu_job_timedout(), which logs the failure.

Change-Id: I45b9743adb548606aef8774496527d29fb3de0af
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 34 ++++++++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  6 +++-
 2 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 53af2623c58f..8a742b77eef8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5026,6 +5026,21 @@ static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
 	return 0;
 }
 
+/**
+ * amdgpu_is_retry_sriov_reset - check if we should retry sriov reset
+ *
+ * Check amdgpu_is_retry_sriov_reset and return status to see if we should retry reset.
+ */
+static bool amdgpu_is_retry_sriov_reset(int r)
+{
+
+        if(r == -EBUSY || r == -ETIMEDOUT || r == -EINVAL)
+                return true;
+        else
+                return false;
+
+}
+
 static void amdgpu_device_recheck_guilty_jobs(
 	struct amdgpu_device *adev, struct list_head *device_list_handle,
 	struct amdgpu_reset_context *reset_context)
@@ -5064,8 +5079,13 @@ static void amdgpu_device_recheck_guilty_jobs(
 			if (amdgpu_sriov_vf(adev)) {
 				amdgpu_virt_fini_data_exchange(adev);
 				r = amdgpu_device_reset_sriov(adev, false);
-				if (r)
+				if (r) {
 					adev->asic_reset_res = r;
+					if (amdgpu_is_retry_sriov_reset(r)) {
+						adev->asic_reset_res = 0;
+						goto retry;
+					}
+				}
 			} else {
 				clear_bit(AMDGPU_SKIP_HW_RESET,
 					  &reset_context->flags);
@@ -5299,8 +5319,13 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	/* Host driver will handle XGMI hive reset for SRIOV */
 	if (amdgpu_sriov_vf(adev)) {
 		r = amdgpu_device_reset_sriov(adev, job ? false : true);
-		if (r)
-			adev->asic_reset_res = r;
+                if (r) {
+                        adev->asic_reset_res = r;
+                        if (amdgpu_is_retry_sriov_reset(r)) {
+				adev->asic_reset_res = 0;
+				goto retry;
+                        }
+                }
 	} else {
 		r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
 		if (r && r == -EAGAIN)
@@ -5341,6 +5366,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
 		}
 
+		if (tmp_adev->asic_reset_res)
+			r = tmp_adev->asic_reset_res;
+
 		tmp_adev->asic_reset_res = 0;
 
 		if (r) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e0730ea56a8c..1f0fb21ac15a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -37,6 +37,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 	struct amdgpu_task_info ti;
 	struct amdgpu_device *adev = ring->adev;
 	int idx;
+	int r = 0;
 
 	if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
 		DRM_INFO("%s - device unplugged skipping recovery on scheduler:%s",
@@ -63,7 +64,10 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 		  ti.process_name, ti.tgid, ti.task_name, ti.pid);
 
 	if (amdgpu_device_should_recover_gpu(ring->adev)) {
-		amdgpu_device_gpu_recover(ring->adev, job);
+		r = amdgpu_device_gpu_recover(ring->adev, job);
+		if (r)
+			DRM_ERROR("GPU Recovery Failed: %d\n",r);
+
 	} else {
 		drm_sched_suspend_timeout(&ring->sched);
 		if (amdgpu_sriov_vf(adev))
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2022-02-02 16:17 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-02-01 21:47 [PATCH] drm/amdgpu: Handle the GPU recovery failure in SRIOV environment Surbhi Kakarya
2022-02-02  7:44 ` Christian König
2022-02-02 15:43 ` Andrey Grodzovsky
2022-02-02 16:17 ` Felix Kuehling
  -- strict thread matches above, loose matches on Subject: below --
2022-01-31 15:35 Surbhi Kakarya
2022-01-31 20:23 ` Alex Deucher
2022-02-01 10:03 ` Christian König

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.