On 2022-03-08 12:20, Somalapuram, Amaranath wrote:
>
>
> On 3/8/2022 10:00 PM, Sharma, Shashank wrote:
>> Hello Andrey
>>
>> On 3/8/2022 5:26 PM, Andrey Grodzovsky wrote:
>>>
>>> On 2022-03-07 11:26, Shashank Sharma wrote:
>>>> From: Shashank Sharma <shashank.sharma@amd.com>
>>>>
>>>> This patch adds a work function, which will get scheduled
>>>> in event of a GPU reset, and will send a uevent to user with
>>>> some reset context information, like a PID and some flags.
>>>
>>>
>>> Where is the actual scheduling of the work function ? Shouldn't
>>> there be a patch for that too ?
>>>
>>
>> Yes, Amar is working on that patch, on top of these patches. They 
>> should be out soon. I thought it was a good idea to get quick 
>> feedback on the basic patches before we build something on top of it.
>>
> schedule_work() will be called in the function amdgpu_do_asic_reset ()
>

I haven't followed the requirements closely, so I may be missing something,
but what about a job timeout that was resolved by soft recovery - do you
need to cover that case too? Or is there no need to restart the user
application in that case, and hence you don't care?

Andrey


> after getting vram_lost info:
>
> vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
>
> update  amdgpu_reset_event_ctx and call schedule_work()
>
>   * vram_lost
>   * reset_context->job->vm->task_info.process_name
>   * reset_context->job->vm->task_info.pid
>
> Regards,
> S.Amarnath
>> - Shashank
>>
>>> Andrey
>>>
>>>
>>>>
>>>> The userspace can do some recovery and post-processing work
>>>> based on this event.
>>>>
>>>> V2:
>>>> - Changed the name of the work to gpu_reset_event_work
>>>>    (Christian)
>>>> - Added a structure to accommodate some additional information
>>>>    (like a PID and some flags)
>>>>
>>>> Cc: Alexander Deucher <alexander.deucher@amd.com>
>>>> Cc: Christian Koenig <christian.koenig@amd.com>
>>>> Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  7 +++++++
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 +++++++++++++++++++
>>>>   2 files changed, 26 insertions(+)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> index d8b854fcbffa..7df219fe363f 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> @@ -813,6 +813,11 @@ struct amd_powerplay {
>>>>   #define AMDGPU_RESET_MAGIC_NUM 64
>>>>   #define AMDGPU_MAX_DF_PERFMONS 4
>>>>   #define AMDGPU_PRODUCT_NAME_LEN 64
>>>> +struct amdgpu_reset_event_ctx {
>>>> +    uint64_t pid;
>>>> +    uint32_t flags;
>>>> +};
>>>> +
>>>>   struct amdgpu_device {
>>>>       struct device            *dev;
>>>>       struct pci_dev            *pdev;
>>>> @@ -1063,6 +1068,7 @@ struct amdgpu_device {
>>>>       int asic_reset_res;
>>>>       struct work_struct        xgmi_reset_work;
>>>> +    struct work_struct        gpu_reset_event_work;
>>>>       struct list_head        reset_list;
>>>>       long                gfx_timeout;
>>>> @@ -1097,6 +1103,7 @@ struct amdgpu_device {
>>>>       pci_channel_state_t        pci_channel_state;
>>>>       struct amdgpu_reset_control     *reset_cntl;
>>>> +    struct amdgpu_reset_event_ctx   reset_event_ctx;
>>>>       uint32_t ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];
>>>>       bool                ram_is_direct_mapped;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> index ed077de426d9..c43d099da06d 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> @@ -73,6 +73,7 @@
>>>>   #include <linux/pm_runtime.h>
>>>>   #include <drm/drm_drv.h>
>>>> +#include <drm/drm_sysfs.h>
>>>>   MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
>>>>   MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
>>>> @@ -3277,6 +3278,23 @@ bool amdgpu_device_has_dc_support(struct 
>>>> amdgpu_device *adev)
>>>>       return amdgpu_device_asic_has_dc_support(adev->asic_type);
>>>>   }
>>>> +static void amdgpu_device_reset_event_func(struct work_struct 
>>>> *__work)
>>>> +{
>>>> +    struct amdgpu_device *adev = container_of(__work, struct 
>>>> amdgpu_device,
>>>> +                          gpu_reset_event_work);
>>>> +    struct amdgpu_reset_event_ctx *event_ctx = 
>>>> &adev->reset_event_ctx;
>>>> +
>>>> +    /*
>>>> +     * A GPU reset has happened, notify userspace and pass the
>>>> +     * following information:
>>>> +     *    - pid of the process involved,
>>>> +     *    - if the VRAM is valid or not,
>>>> +     *    - indicate that userspace may want to collect the ftrace 
>>>> event
>>>> +     * data from the trace event.
>>>> +     */
>>>> +    drm_sysfs_reset_event(&adev->ddev, event_ctx->pid, 
>>>> event_ctx->flags);
>>>> +}
>>>> +
>>>>   static void amdgpu_device_xgmi_reset_func(struct work_struct 
>>>> *__work)
>>>>   {
>>>>       struct amdgpu_device *adev =
>>>> @@ -3525,6 +3543,7 @@ int amdgpu_device_init(struct amdgpu_device 
>>>> *adev,
>>>>                 amdgpu_device_delay_enable_gfx_off);
>>>>       INIT_WORK(&adev->xgmi_reset_work, 
>>>> amdgpu_device_xgmi_reset_func);
>>>> +    INIT_WORK(&adev->gpu_reset_event_work, 
>>>> amdgpu_device_reset_event_func);
>>>>       adev->gfx.gfx_off_req_count = 1;
>>>>       adev->pm.ac_power = power_supply_is_system_supplied() > 0;