Re: [PATCH v3 5/7] drm/amdgpu: Add work_struct for GPU reset from kfd.

From: "Christian König" <christian.koenig@amd.com>
To: Andrey Grodzovsky <andrey.grodzovsky@amd.com>,
	amd-gfx@lists.freedesktop.org
Cc: Zoy.Bai@amd.com, lijo.lazar@amd.com
Subject: Re: [PATCH v3 5/7] drm/amdgpu: Add work_struct for GPU reset from kfd.
Date: Mon, 30 May 2022 09:54:34 +0200	[thread overview]
Message-ID: <4fdeb19a-35e3-da7b-380f-ff5efda7b4b4@amd.com> (raw)
In-Reply-To: <20220525190447.239867-6-andrey.grodzovsky@amd.com>

Am 25.05.22 um 21:04 schrieb Andrey Grodzovsky:
> We need to have a work_struct to cancel this reset if another
> is already in progress.
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>

Reviewed-by: Christian König <christian.koenig@amd.com>

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 15 ++++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 31 ----------------------
>   3 files changed, 15 insertions(+), 32 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index 1f8161cd507f..a23abc0e86e7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -33,6 +33,7 @@
>   #include <uapi/linux/kfd_ioctl.h>
>   #include "amdgpu_ras.h"
>   #include "amdgpu_umc.h"
> +#include "amdgpu_reset.h"
>   
>   /* Total memory size in system memory and all GPU VRAM. Used to
>    * estimate worst case amount of memory to reserve for page tables
> @@ -122,6 +123,15 @@ static void amdgpu_doorbell_get_kfd_info(struct amdgpu_device *adev,
>   	}
>   }
>   
> +
> +static void amdgpu_amdkfd_reset_work(struct work_struct *work)
> +{
> +	struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
> +						  kfd.reset_work);
> +
> +	amdgpu_device_gpu_recover_imp(adev, NULL);
> +}
> +
>   void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
>   {
>   	int i;
> @@ -180,6 +190,8 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
>   
>   		adev->kfd.init_complete = kgd2kfd_device_init(adev->kfd.dev,
>   						adev_to_drm(adev), &gpu_resources);
> +
> +		INIT_WORK(&adev->kfd.reset_work, amdgpu_amdkfd_reset_work);
>   	}
>   }
>   
> @@ -247,7 +259,8 @@ int amdgpu_amdkfd_post_reset(struct amdgpu_device *adev)
>   void amdgpu_amdkfd_gpu_reset(struct amdgpu_device *adev)
>   {
>   	if (amdgpu_device_should_recover_gpu(adev))
> -		amdgpu_device_gpu_recover(adev, NULL);
> +		amdgpu_reset_domain_schedule(adev->reset_domain,
> +					     &adev->kfd.reset_work);
>   }
>   
>   int amdgpu_amdkfd_alloc_gtt_mem(struct amdgpu_device *adev, size_t size,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index f8b9f27adcf5..e0709af5a326 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -96,6 +96,7 @@ struct amdgpu_kfd_dev {
>   	struct kfd_dev *dev;
>   	uint64_t vram_used;
>   	bool init_complete;
> +	struct work_struct reset_work;
>   };
>   
>   enum kgd_engine_type {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index bfdd8883089a..e3e2a5d17cc2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5312,37 +5312,6 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
>   	return r;
>   }
>   
> -struct amdgpu_recover_work_struct {
> -	struct work_struct base;
> -	struct amdgpu_device *adev;
> -	struct amdgpu_job *job;
> -	int ret;
> -};
> -
> -static void amdgpu_device_queue_gpu_recover_work(struct work_struct *work)
> -{
> -	struct amdgpu_recover_work_struct *recover_work = container_of(work, struct amdgpu_recover_work_struct, base);
> -
> -	amdgpu_device_gpu_recover_imp(recover_work->adev, recover_work->job);
> -}
> -/*
> - * Serialize gpu recover into reset domain single threaded wq
> - */
> -int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
> -				    struct amdgpu_job *job)
> -{
> -	struct amdgpu_recover_work_struct work = {.adev = adev, .job = job};
> -
> -	INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work);
> -
> -	if (!amdgpu_reset_domain_schedule(adev->reset_domain, &work.base))
> -		return -EAGAIN;
> -
> -	flush_work(&work.base);
> -
> -	return atomic_read(&adev->reset_domain->reset_res);
> -}
> -
>   /**
>    * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot
>    *