On 2022-03-08 12:20, Somalapuram, Amaranath wrote: > > > On 3/8/2022 10:00 PM, Sharma, Shashank wrote: >> Hello Andrey >> >> On 3/8/2022 5:26 PM, Andrey Grodzovsky wrote: >>> >>> On 2022-03-07 11:26, Shashank Sharma wrote: >>>> From: Shashank Sharma >>>> >>>> This patch adds a work function, which will get scheduled >>>> in the event of a GPU reset, and will send a uevent to user with >>>> some reset context information, like a PID and some flags. >>> >>> >>> Where is the actual scheduling of the work function ? Shouldn't >>> there be a patch for that too ? >>> >> >> Yes, Amar is working on that patch, on top of these patches. They >> should be out soon. I thought it was a good idea to get quick >> feedback on the basic patches before we build something on top of it. >> > schedule_work() will be called in the function amdgpu_do_asic_reset () > I didn't follow closely on the requirements and so I don't know but, what about job timeout that was able to soft recover - do you need to cover this too ? Or in this case no need to restart user application and you hence don't care ? Andrey > after getting vram_lost info: > > vram_lost = amdgpu_device_check_vram_lost(tmp_adev); > > update  amdgpu_reset_event_ctx and call schedule_work() > > * vram_lost > * reset_context->job->vm->task_info.process_name > * reset_context->job->vm->task_info.pid > > Regards, > S.Amarnath >> - Shashank >> >>> Andrey >>> >>> >>>> >>>> The userspace can do some recovery and post-processing work >>>> based on this event. 
>>>> >>>> V2: >>>> - Changed the name of the work to gpu_reset_event_work >>>>    (Christian) >>>> - Added a structure to accommodate some additional information >>>>    (like a PID and some flags) >>>> >>>> Cc: Alexander Deucher >>>> Cc: Christian Koenig >>>> Signed-off-by: Shashank Sharma >>>> --- >>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  7 +++++++ >>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 +++++++++++++++++++ >>>>   2 files changed, 26 insertions(+) >>>> >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>> index d8b854fcbffa..7df219fe363f 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>> @@ -813,6 +813,11 @@ struct amd_powerplay { >>>>   #define AMDGPU_RESET_MAGIC_NUM 64 >>>>   #define AMDGPU_MAX_DF_PERFMONS 4 >>>>   #define AMDGPU_PRODUCT_NAME_LEN 64 >>>> +struct amdgpu_reset_event_ctx { >>>> +    uint64_t pid; >>>> +    uint32_t flags; >>>> +}; >>>> + >>>>   struct amdgpu_device { >>>>       struct device            *dev; >>>>       struct pci_dev            *pdev; >>>> @@ -1063,6 +1068,7 @@ struct amdgpu_device { >>>>       int asic_reset_res; >>>>       struct work_struct        xgmi_reset_work; >>>> +    struct work_struct        gpu_reset_event_work; >>>>       struct list_head        reset_list; >>>>       long                gfx_timeout; >>>> @@ -1097,6 +1103,7 @@ struct amdgpu_device { >>>>       pci_channel_state_t        pci_channel_state; >>>>       struct amdgpu_reset_control     *reset_cntl; >>>> +    struct amdgpu_reset_event_ctx   reset_event_ctx; >>>>       uint32_t ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE]; >>>>       bool                ram_is_direct_mapped; >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>> index ed077de426d9..c43d099da06d 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>> @@ -73,6 +73,7 @@ >>>>   #include >>>>   #include >>>> +#include >>>>   MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); >>>>   MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); >>>> @@ -3277,6 +3278,23 @@ bool amdgpu_device_has_dc_support(struct >>>> amdgpu_device *adev) >>>>       return amdgpu_device_asic_has_dc_support(adev->asic_type); >>>>   } >>>> +static void amdgpu_device_reset_event_func(struct work_struct >>>> *__work) >>>> +{ >>>> +    struct amdgpu_device *adev = container_of(__work, struct >>>> amdgpu_device, >>>> +                          gpu_reset_event_work); >>>> +    struct amdgpu_reset_event_ctx *event_ctx = >>>> &adev->reset_event_ctx; >>>> + >>>> +    /* >>>> +     * A GPU reset has happened, indicate the userspace and pass the >>>> +     * following information: >>>> +     *    - pid of the process involved, >>>> +     *    - if the VRAM is valid or not, >>>> +     *    - indicate that userspace may want to collect the ftrace >>>> event >>>> +     * data from the trace event. >>>> +     */ >>>> +    drm_sysfs_reset_event(&adev->ddev, event_ctx->pid, >>>> event_ctx->flags); >>>> +} >>>> + >>>>   static void amdgpu_device_xgmi_reset_func(struct work_struct >>>> *__work) >>>>   { >>>>       struct amdgpu_device *adev = >>>> @@ -3525,6 +3543,7 @@ int amdgpu_device_init(struct amdgpu_device >>>> *adev, >>>>                 amdgpu_device_delay_enable_gfx_off); >>>>       INIT_WORK(&adev->xgmi_reset_work, >>>> amdgpu_device_xgmi_reset_func); >>>> +    INIT_WORK(&adev->gpu_reset_event_work, >>>> amdgpu_device_reset_event_func); >>>>       adev->gfx.gfx_off_req_count = 1; >>>>       adev->pm.ac_power = power_supply_is_system_supplied() > 0;