Re: [PATCH] drm/amdgpu: Handle the GPU recovery failure in SRIOV environment.

From: Alex Deucher <alexdeucher@gmail.com>
To: Surbhi Kakarya <surbhi.kakarya@amd.com>
Cc: Bokun Zhang <Bokun.Zhang@amd.com>,
	"Zytaruk, Kelly" <Kelly.Zytaruk@amd.com>,
	amd-gfx list <amd-gfx@lists.freedesktop.org>,
	"Chang, HaiJun" <HaiJun.Chang@amd.com>,
	"Deucher, Alexander" <Alexander.Deucher@amd.com>,
	"monk.liu" <Monk.Liu@amd.com>
Subject: Re: [PATCH] drm/amdgpu: Handle the GPU recovery failure in SRIOV environment.
Date: Mon, 31 Jan 2022 15:23:19 -0500	[thread overview]
Message-ID: <CADnq5_Pv8mVuKE4--HhvcsF_ff7X_OMo_2nVPqRAY=NprOEu-Q@mail.gmail.com> (raw)
In-Reply-To: <20220131153527.11051-1-surbhi.kakarya@amd.com>

On Mon, Jan 31, 2022 at 10:35 AM Surbhi Kakarya <surbhi.kakarya@amd.com> wrote:
>
> This patch handles GPU recovery failure in an SRIOV environment by
> retrying the reset if the first reset fails. To determine whether to retry, a
> new function amdgpu_is_retry_sriov_reset() is added, which returns true if the failure is due
> to ETIMEDOUT, EINVAL or EBUSY, and false otherwise.
>
> It also handles the return status in the post-ASIC-reset path by updating the return code
> with asic_reset_res, which is then passed back to amdgpu_job_timedout() and logged on failure.
>
> Change-Id: I45b9743adb548606aef8774496527d29fb3de0af

Missing your Signed-off-by (s-o-b).  Also, does this help on bare metal as well?  If
so, we should make this generic and also set a retry limit.

Alex

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 34 ++++++++++++++++++++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  6 +++-
>  2 files changed, 36 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 53af2623c58f..8a742b77eef8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5026,6 +5026,21 @@ static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
>         return 0;
>  }
>
> +/**
> + * amdgpu_is_retry_sriov_reset - check if we should retry sriov reset
> + *
> + * Check the return status of an SRIOV reset to see if we should retry the reset.
> + */
> +static bool amdgpu_is_retry_sriov_reset(int r)
> +{
> +
> +        if(r == -EBUSY || r == -ETIMEDOUT || r == -EINVAL)
> +                return true;
> +        else
> +                return false;
> +
> +}
> +
>  static void amdgpu_device_recheck_guilty_jobs(
>         struct amdgpu_device *adev, struct list_head *device_list_handle,
>         struct amdgpu_reset_context *reset_context)
> @@ -5064,8 +5079,13 @@ static void amdgpu_device_recheck_guilty_jobs(
>                         if (amdgpu_sriov_vf(adev)) {
>                                 amdgpu_virt_fini_data_exchange(adev);
>                                 r = amdgpu_device_reset_sriov(adev, false);
> -                               if (r)
> +                               if (r) {
>                                         adev->asic_reset_res = r;
> +                                       if (amdgpu_is_retry_sriov_reset(r)) {
> +                                               adev->asic_reset_res = 0;
> +                                               goto retry;
> +                                       }
> +                               }
>                         } else {
>                                 clear_bit(AMDGPU_SKIP_HW_RESET,
>                                           &reset_context->flags);
> @@ -5299,8 +5319,13 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>         /* Host driver will handle XGMI hive reset for SRIOV */
>         if (amdgpu_sriov_vf(adev)) {
>                 r = amdgpu_device_reset_sriov(adev, job ? false : true);
> -               if (r)
> -                       adev->asic_reset_res = r;
> +                if (r) {
> +                        adev->asic_reset_res = r;
> +                        if (amdgpu_is_retry_sriov_reset(r)) {
> +                               adev->asic_reset_res = 0;
> +                               goto retry;
> +                        }
> +                }
>         } else {
>                 r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
>                 if (r && r == -EAGAIN)
> @@ -5341,6 +5366,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>                         drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
>                 }
>
> +               if (tmp_adev->asic_reset_res)
> +                       r = tmp_adev->asic_reset_res;
> +
>                 tmp_adev->asic_reset_res = 0;
>
>                 if (r) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index e0730ea56a8c..1f0fb21ac15a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -37,6 +37,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>         struct amdgpu_task_info ti;
>         struct amdgpu_device *adev = ring->adev;
>         int idx;
> +       int r = 0;
>
>         if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
>                 DRM_INFO("%s - device unplugged skipping recovery on scheduler:%s",
> @@ -63,7 +64,10 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>                   ti.process_name, ti.tgid, ti.task_name, ti.pid);
>
>         if (amdgpu_device_should_recover_gpu(ring->adev)) {
> -               amdgpu_device_gpu_recover(ring->adev, job);
> +               r = amdgpu_device_gpu_recover(ring->adev, job);
> +               if (r)
> +                       DRM_ERROR("GPU Recovery Failed: %d\n",r);
> +
>         } else {
>                 drm_sched_suspend_timeout(&ring->sched);
>                 if (amdgpu_sriov_vf(adev))
> --
> 2.25.1
>