All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] drm: Add GPU reset sysfs event
@ 2022-03-07 16:26 Shashank Sharma
  2022-03-07 16:26 ` [PATCH 2/2] drm/amdgpu: add work function for GPU reset event Shashank Sharma
                   ` (3 more replies)
  0 siblings, 4 replies; 32+ messages in thread
From: Shashank Sharma @ 2022-03-07 16:26 UTC (permalink / raw)
  To: amd-gfx
  Cc: Alexandar Deucher, amaranath.somalapuram, Christian Koenig,
	shashank.sharma

From: Shashank Sharma <shashank.sharma@amd.com>

This patch adds a new sysfs event, which will indicate
the userland about a GPU reset, and can also provide
some information like:
- which PID was involved in the GPU reset
- what was the GPU status (using flags)

This patch also introduces the first flag of the flags
bitmap, which can be appended as and when required.

Cc: Alexandar Deucher <alexander.deucher@amd.com>
Cc: Christian Koenig <christian.koenig@amd.com>
Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
---
 drivers/gpu/drm/drm_sysfs.c | 24 ++++++++++++++++++++++++
 include/drm/drm_sysfs.h     |  3 +++
 2 files changed, 27 insertions(+)

diff --git a/drivers/gpu/drm/drm_sysfs.c b/drivers/gpu/drm/drm_sysfs.c
index 430e00b16eec..52a015161431 100644
--- a/drivers/gpu/drm/drm_sysfs.c
+++ b/drivers/gpu/drm/drm_sysfs.c
@@ -409,6 +409,30 @@ void drm_sysfs_hotplug_event(struct drm_device *dev)
 }
 EXPORT_SYMBOL(drm_sysfs_hotplug_event);
 
+/**
+ * drm_sysfs_reset_event - generate a DRM uevent to indicate GPU reset
+ * @dev: DRM device
+ * @pid: The process ID involve with the reset
+ * @flags: Any other information about the GPU status
+ *
+ * Send a uevent for the DRM device specified by @dev. This indicates
+ * user that a GPU reset has occurred, so that the interested client
+ * can take any recovery or profiling measure, when required.
+ */
+void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, uint32_t flags)
+{
+	unsigned char pid_str[21], flags_str[15];
+	unsigned char reset_str[] = "RESET=1";
+	char *envp[] = { reset_str, pid_str, flags_str, NULL };
+
+	DRM_DEBUG("generating reset event\n");
+
+	snprintf(pid_str, ARRAY_SIZE(pid_str), "PID=%lu", pid);
+	snprintf(flags_str, ARRAY_SIZE(flags_str), "FLAGS=%u", flags);
+	kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp);
+}
+EXPORT_SYMBOL(drm_sysfs_reset_event);
+
 /**
  * drm_sysfs_connector_hotplug_event - generate a DRM uevent for any connector
  * change
diff --git a/include/drm/drm_sysfs.h b/include/drm/drm_sysfs.h
index 6273cac44e47..63f00fe8054c 100644
--- a/include/drm/drm_sysfs.h
+++ b/include/drm/drm_sysfs.h
@@ -2,6 +2,8 @@
 #ifndef _DRM_SYSFS_H_
 #define _DRM_SYSFS_H_
 
+#define DRM_GPU_RESET_FLAG_VRAM_VALID (1 << 0)
+
 struct drm_device;
 struct device;
 struct drm_connector;
@@ -11,6 +13,7 @@ int drm_class_device_register(struct device *dev);
 void drm_class_device_unregister(struct device *dev);
 
 void drm_sysfs_hotplug_event(struct drm_device *dev);
+void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, uint32_t reset_flags);
 void drm_sysfs_connector_hotplug_event(struct drm_connector *connector);
 void drm_sysfs_connector_status_event(struct drm_connector *connector,
 				      struct drm_property *property);
-- 
2.32.0


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [PATCH 2/2] drm/amdgpu: add work function for GPU reset event
  2022-03-07 16:26 [PATCH 1/2] drm: Add GPU reset sysfs event Shashank Sharma
@ 2022-03-07 16:26 ` Shashank Sharma
  2022-03-07 16:46   ` Somalapuram, Amaranath
                     ` (2 more replies)
  2022-03-08  6:32 ` [PATCH 1/2] drm: Add GPU reset sysfs event Somalapuram, Amaranath
                   ` (2 subsequent siblings)
  3 siblings, 3 replies; 32+ messages in thread
From: Shashank Sharma @ 2022-03-07 16:26 UTC (permalink / raw)
  To: amd-gfx
  Cc: Alexander Deucher, amaranath.somalapuram, Christian Koenig,
	shashank.sharma

From: Shashank Sharma <shashank.sharma@amd.com>

This patch adds a work function, which will get scheduled
in event of a GPU reset, and will send a uevent to user with
some reset context infomration, like a PID and some flags.

The userspace can do some recovery and post-processing work
based on this event.

V2:
- Changed the name of the work to gpu_reset_event_work
  (Christian)
- Added a structure to accommodate some additional information
  (like a PID and some flags)

Cc: Alexander Deucher <alexander.deucher@amd.com>
Cc: Christian Koenig <christian.koenig@amd.com>
Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  7 +++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 +++++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index d8b854fcbffa..7df219fe363f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -813,6 +813,11 @@ struct amd_powerplay {
 #define AMDGPU_RESET_MAGIC_NUM 64
 #define AMDGPU_MAX_DF_PERFMONS 4
 #define AMDGPU_PRODUCT_NAME_LEN 64
+struct amdgpu_reset_event_ctx {
+	uint64_t pid;
+	uint32_t flags;
+};
+
 struct amdgpu_device {
 	struct device			*dev;
 	struct pci_dev			*pdev;
@@ -1063,6 +1068,7 @@ struct amdgpu_device {
 
 	int asic_reset_res;
 	struct work_struct		xgmi_reset_work;
+	struct work_struct		gpu_reset_event_work;
 	struct list_head		reset_list;
 
 	long				gfx_timeout;
@@ -1097,6 +1103,7 @@ struct amdgpu_device {
 	pci_channel_state_t		pci_channel_state;
 
 	struct amdgpu_reset_control     *reset_cntl;
+	struct amdgpu_reset_event_ctx   reset_event_ctx;
 	uint32_t                        ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];
 
 	bool				ram_is_direct_mapped;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index ed077de426d9..c43d099da06d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -73,6 +73,7 @@
 #include <linux/pm_runtime.h>
 
 #include <drm/drm_drv.h>
+#include <drm/drm_sysfs.h>
 
 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
@@ -3277,6 +3278,23 @@ bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
 	return amdgpu_device_asic_has_dc_support(adev->asic_type);
 }
 
+static void amdgpu_device_reset_event_func(struct work_struct *__work)
+{
+	struct amdgpu_device *adev = container_of(__work, struct amdgpu_device,
+						  gpu_reset_event_work);
+	struct amdgpu_reset_event_ctx *event_ctx = &adev->reset_event_ctx;
+
+	/*
+	 * A GPU reset has happened, indicate the userspace and pass the
+	 * following information:
+	 *	- pid of the process involved,
+	 *	- if the VRAM is valid or not,
+	 *	- indicate that userspace may want to collect the ftrace event
+	 * data from the trace event.
+	 */
+	drm_sysfs_reset_event(&adev->ddev, event_ctx->pid, event_ctx->flags);
+}
+
 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
 {
 	struct amdgpu_device *adev =
@@ -3525,6 +3543,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 			  amdgpu_device_delay_enable_gfx_off);
 
 	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
+	INIT_WORK(&adev->gpu_reset_event_work, amdgpu_device_reset_event_func);
 
 	adev->gfx.gfx_off_req_count = 1;
 	adev->pm.ac_power = power_supply_is_system_supplied() > 0;
-- 
2.32.0


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: add work function for GPU reset event
  2022-03-07 16:26 ` [PATCH 2/2] drm/amdgpu: add work function for GPU reset event Shashank Sharma
@ 2022-03-07 16:46   ` Somalapuram, Amaranath
  2022-03-07 17:05     ` Sharma, Shashank
  2022-03-08  7:15   ` Christian König
  2022-03-08 16:26   ` Andrey Grodzovsky
  2 siblings, 1 reply; 32+ messages in thread
From: Somalapuram, Amaranath @ 2022-03-07 16:46 UTC (permalink / raw)
  To: Shashank Sharma, amd-gfx
  Cc: Alexander Deucher, amaranath.somalapuram, Christian Koenig,
	shashank.sharma


On 3/7/2022 9:56 PM, Shashank Sharma wrote:
> From: Shashank Sharma <shashank.sharma@amd.com>
>
> This patch adds a work function, which will get scheduled
> in event of a GPU reset, and will send a uevent to user with
> some reset context infomration, like a PID and some flags.
>
> The userspace can do some recovery and post-processing work
> based on this event.
>
> V2:
> - Changed the name of the work to gpu_reset_event_work
>    (Christian)
> - Added a structure to accommodate some additional information
>    (like a PID and some flags)
>
> Cc: Alexander Deucher <alexander.deucher@amd.com>
> Cc: Christian Koenig <christian.koenig@amd.com>
> Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  7 +++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 +++++++++++++++++++
>   2 files changed, 26 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index d8b854fcbffa..7df219fe363f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -813,6 +813,11 @@ struct amd_powerplay {
>   #define AMDGPU_RESET_MAGIC_NUM 64
>   #define AMDGPU_MAX_DF_PERFMONS 4
>   #define AMDGPU_PRODUCT_NAME_LEN 64
> +struct amdgpu_reset_event_ctx {
> +	uint64_t pid;
> +	uint32_t flags;
> +};
> +
>   struct amdgpu_device {
>   	struct device			*dev;
>   	struct pci_dev			*pdev;
> @@ -1063,6 +1068,7 @@ struct amdgpu_device {
>   
>   	int asic_reset_res;
>   	struct work_struct		xgmi_reset_work;
> +	struct work_struct		gpu_reset_event_work;
>   	struct list_head		reset_list;
>   
>   	long				gfx_timeout;
> @@ -1097,6 +1103,7 @@ struct amdgpu_device {
>   	pci_channel_state_t		pci_channel_state;
>   
>   	struct amdgpu_reset_control     *reset_cntl;
> +	struct amdgpu_reset_event_ctx   reset_event_ctx;
>   	uint32_t                        ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];
>   
>   	bool				ram_is_direct_mapped;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index ed077de426d9..c43d099da06d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -73,6 +73,7 @@
>   #include <linux/pm_runtime.h>
>   
>   #include <drm/drm_drv.h>
> +#include <drm/drm_sysfs.h>
>   
>   MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
>   MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
> @@ -3277,6 +3278,23 @@ bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
>   	return amdgpu_device_asic_has_dc_support(adev->asic_type);
>   }
>   
> +static void amdgpu_device_reset_event_func(struct work_struct *__work)
> +{
> +	struct amdgpu_device *adev = container_of(__work, struct amdgpu_device,
> +						  gpu_reset_event_work);

I am trying same thing but adev context is lost.

schedule_work() in amdgpu_do_asic_reset after getting/reading vram_lost 
= amdgpu_device_check_vram_lost(tmp_adev);

Regards,

S.Amarnath

> +	struct amdgpu_reset_event_ctx *event_ctx = &adev->reset_event_ctx;
> +
> +	/*
> +	 * A GPU reset has happened, indicate the userspace and pass the
> +	 * following information:
> +	 *	- pid of the process involved,
> +	 *	- if the VRAM is valid or not,
> +	 *	- indicate that userspace may want to collect the ftrace event
> +	 * data from the trace event.
> +	 */
> +	drm_sysfs_reset_event(&adev->ddev, event_ctx->pid, event_ctx->flags);
> +}
> +
>   static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
>   {
>   	struct amdgpu_device *adev =
> @@ -3525,6 +3543,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   			  amdgpu_device_delay_enable_gfx_off);
>   
>   	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
> +	INIT_WORK(&adev->gpu_reset_event_work, amdgpu_device_reset_event_func);
>   
>   	adev->gfx.gfx_off_req_count = 1;
>   	adev->pm.ac_power = power_supply_is_system_supplied() > 0;

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: add work function for GPU reset event
  2022-03-07 16:46   ` Somalapuram, Amaranath
@ 2022-03-07 17:05     ` Sharma, Shashank
  0 siblings, 0 replies; 32+ messages in thread
From: Sharma, Shashank @ 2022-03-07 17:05 UTC (permalink / raw)
  To: Somalapuram, Amaranath, Shashank Sharma, amd-gfx
  Cc: Alexander Deucher, amaranath.somalapuram, Christian Koenig

Hey Amar,

On 3/7/2022 5:46 PM, Somalapuram, Amaranath wrote:
> 
> On 3/7/2022 9:56 PM, Shashank Sharma wrote:
>> From: Shashank Sharma <shashank.sharma@amd.com>
>>
>> This patch adds a work function, which will get scheduled
>> in event of a GPU reset, and will send a uevent to user with
>> some reset context infomration, like a PID and some flags.
>>
>> The userspace can do some recovery and post-processing work
>> based on this event.
>>
>> V2:
>> - Changed the name of the work to gpu_reset_event_work
>>    (Christian)
>> - Added a structure to accommodate some additional information
>>    (like a PID and some flags)
>>
>> Cc: Alexander Deucher <alexander.deucher@amd.com>
>> Cc: Christian Koenig <christian.koenig@amd.com>
>> Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  7 +++++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 +++++++++++++++++++
>>   2 files changed, 26 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index d8b854fcbffa..7df219fe363f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -813,6 +813,11 @@ struct amd_powerplay {
>>   #define AMDGPU_RESET_MAGIC_NUM 64
>>   #define AMDGPU_MAX_DF_PERFMONS 4
>>   #define AMDGPU_PRODUCT_NAME_LEN 64
>> +struct amdgpu_reset_event_ctx {
>> +    uint64_t pid;
>> +    uint32_t flags;
>> +};
>> +
>>   struct amdgpu_device {
>>       struct device            *dev;
>>       struct pci_dev            *pdev;
>> @@ -1063,6 +1068,7 @@ struct amdgpu_device {
>>       int asic_reset_res;
>>       struct work_struct        xgmi_reset_work;
>> +    struct work_struct        gpu_reset_event_work;
>>       struct list_head        reset_list;
>>       long                gfx_timeout;
>> @@ -1097,6 +1103,7 @@ struct amdgpu_device {
>>       pci_channel_state_t        pci_channel_state;
>>       struct amdgpu_reset_control     *reset_cntl;
>> +    struct amdgpu_reset_event_ctx   reset_event_ctx;
>>       uint32_t                        
>> ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];
>>       bool                ram_is_direct_mapped;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index ed077de426d9..c43d099da06d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -73,6 +73,7 @@
>>   #include <linux/pm_runtime.h>
>>   #include <drm/drm_drv.h>
>> +#include <drm/drm_sysfs.h>
>>   MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
>>   MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
>> @@ -3277,6 +3278,23 @@ bool amdgpu_device_has_dc_support(struct 
>> amdgpu_device *adev)
>>       return amdgpu_device_asic_has_dc_support(adev->asic_type);
>>   }
>> +static void amdgpu_device_reset_event_func(struct work_struct *__work)
>> +{
>> +    struct amdgpu_device *adev = container_of(__work, struct 
>> amdgpu_device,
>> +                          gpu_reset_event_work);
> 
> I am trying same thing but adev context is lost.
> 
> schedule_work() in amdgpu_do_asic_reset after getting/reading vram_lost 
> = amdgpu_device_check_vram_lost(tmp_adev);
> 

I am not sure if I understand your point correctly, but in this patch I 
have introduced a struct amdgpu_reset_event_ctx, which already is a part 
of adev, and has space to store both PID as well as flags.

So all you have to do is:
- GPU reset happens
- Save the pid in adev.reset_event_ctx.pid
- If VRAM valid, Save the vram status in adev.reset_event_ctx->flags |= 
(BIT0)

and schedule this work function. If the data is saved properly, it will 
reach the work function and the event will be sent.

- Shashank

> Regards,
> 
> S.Amarnath
> 
>> +    struct amdgpu_reset_event_ctx *event_ctx = &adev->reset_event_ctx;
>> +
>> +    /*
>> +     * A GPU reset has happened, indicate the userspace and pass the
>> +     * following information:
>> +     *    - pid of the process involved,
>> +     *    - if the VRAM is valid or not,
>> +     *    - indicate that userspace may want to collect the ftrace event
>> +     * data from the trace event.
>> +     */
>> +    drm_sysfs_reset_event(&adev->ddev, event_ctx->pid, 
>> event_ctx->flags);
>> +}
>> +
>>   static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
>>   {
>>       struct amdgpu_device *adev =
>> @@ -3525,6 +3543,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>                 amdgpu_device_delay_enable_gfx_off);
>>       INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
>> +    INIT_WORK(&adev->gpu_reset_event_work, 
>> amdgpu_device_reset_event_func);
>>       adev->gfx.gfx_off_req_count = 1;
>>       adev->pm.ac_power = power_supply_is_system_supplied() > 0;

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] drm: Add GPU reset sysfs event
  2022-03-07 16:26 [PATCH 1/2] drm: Add GPU reset sysfs event Shashank Sharma
  2022-03-07 16:26 ` [PATCH 2/2] drm/amdgpu: add work function for GPU reset event Shashank Sharma
@ 2022-03-08  6:32 ` Somalapuram, Amaranath
  2022-03-08  7:06 ` Christian König
  2022-03-08 16:25 ` Andrey Grodzovsky
  3 siblings, 0 replies; 32+ messages in thread
From: Somalapuram, Amaranath @ 2022-03-08  6:32 UTC (permalink / raw)
  To: Shashank Sharma, amd-gfx
  Cc: Alexandar Deucher, amaranath.somalapuram, Christian Koenig,
	shashank.sharma

[-- Attachment #1: Type: text/plain, Size: 3039 bytes --]


On 3/7/2022 9:56 PM, Shashank Sharma wrote:
> From: Shashank Sharma<shashank.sharma@amd.com>
>
> This patch adds a new sysfs event, which will indicate
> the userland about a GPU reset, and can also provide
> some information like:
> - which PID was involved in the GPU reset
> - what was the GPU status (using flags)
>
> This patch also introduces the first flag of the flags
> bitmap, which can be appended as and when required.
>
> Cc: Alexandar Deucher<alexander.deucher@amd.com>
> Cc: Christian Koenig<christian.koenig@amd.com>
> Signed-off-by: Shashank Sharma<shashank.sharma@amd.com>
> ---
>   drivers/gpu/drm/drm_sysfs.c | 24 ++++++++++++++++++++++++
>   include/drm/drm_sysfs.h     |  3 +++
>   2 files changed, 27 insertions(+)
>
> diff --git a/drivers/gpu/drm/drm_sysfs.c b/drivers/gpu/drm/drm_sysfs.c
> index 430e00b16eec..52a015161431 100644
> --- a/drivers/gpu/drm/drm_sysfs.c
> +++ b/drivers/gpu/drm/drm_sysfs.c
> @@ -409,6 +409,30 @@ void drm_sysfs_hotplug_event(struct drm_device *dev)
>   }
>   EXPORT_SYMBOL(drm_sysfs_hotplug_event);
>   
> +/**
> + * drm_sysfs_reset_event - generate a DRM uevent to indicate GPU reset
> + * @dev: DRM device
> + * @pid: The process ID involve with the reset
> + * @flags: Any other information about the GPU status
> + *
> + * Send a uevent for the DRM device specified by @dev. This indicates
> + * user that a GPU reset has occurred, so that the interested client
> + * can take any recovery or profiling measure, when required.
> + */
> +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, uint32_t flags)
we need process name
char  process_name[TASK_COMM_LEN];
> +{
> +	unsigned char pid_str[21], flags_str[15];
> +	unsigned char reset_str[] = "RESET=1";
> +	char *envp[] = { reset_str, pid_str, flags_str, NULL };
> +
> +	DRM_DEBUG("generating reset event\n");
> +
> +	snprintf(pid_str, ARRAY_SIZE(pid_str), "PID=%lu", pid);
> +	snprintf(flags_str, ARRAY_SIZE(flags_str), "FLAGS=%u", flags);
> +	kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp);
> +}
> +EXPORT_SYMBOL(drm_sysfs_reset_event);
> +
>   /**
>    * drm_sysfs_connector_hotplug_event - generate a DRM uevent for any connector
>    * change
> diff --git a/include/drm/drm_sysfs.h b/include/drm/drm_sysfs.h
> index 6273cac44e47..63f00fe8054c 100644
> --- a/include/drm/drm_sysfs.h
> +++ b/include/drm/drm_sysfs.h
> @@ -2,6 +2,8 @@
>   #ifndef _DRM_SYSFS_H_
>   #define _DRM_SYSFS_H_
>   
> +#define DRM_GPU_RESET_FLAG_VRAM_VALID (1 << 0)
> +
>   struct drm_device;
>   struct device;
>   struct drm_connector;
> @@ -11,6 +13,7 @@ int drm_class_device_register(struct device *dev);
>   void drm_class_device_unregister(struct device *dev);
>   
>   void drm_sysfs_hotplug_event(struct drm_device *dev);
> +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, uint32_t reset_flags);
>   void drm_sysfs_connector_hotplug_event(struct drm_connector *connector);
>   void drm_sysfs_connector_status_event(struct drm_connector *connector,
>   				      struct drm_property *property);

[-- Attachment #2: Type: text/html, Size: 4138 bytes --]

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] drm: Add GPU reset sysfs event
  2022-03-07 16:26 [PATCH 1/2] drm: Add GPU reset sysfs event Shashank Sharma
  2022-03-07 16:26 ` [PATCH 2/2] drm/amdgpu: add work function for GPU reset event Shashank Sharma
  2022-03-08  6:32 ` [PATCH 1/2] drm: Add GPU reset sysfs event Somalapuram, Amaranath
@ 2022-03-08  7:06 ` Christian König
  2022-03-08  9:31   ` Sharma, Shashank
  2022-03-08 16:25 ` Andrey Grodzovsky
  3 siblings, 1 reply; 32+ messages in thread
From: Christian König @ 2022-03-08  7:06 UTC (permalink / raw)
  To: Shashank Sharma, amd-gfx
  Cc: Alexandar Deucher, amaranath.somalapuram, Christian Koenig,
	shashank.sharma

Am 07.03.22 um 17:26 schrieb Shashank Sharma:
> From: Shashank Sharma <shashank.sharma@amd.com>
>
> This patch adds a new sysfs event, which will indicate
> the userland about a GPU reset, and can also provide
> some information like:
> - which PID was involved in the GPU reset
> - what was the GPU status (using flags)
>
> This patch also introduces the first flag of the flags
> bitmap, which can be appended as and when required.

Make sure to CC the dri-devel mailing list when reviewing this.

>
> Cc: Alexandar Deucher <alexander.deucher@amd.com>
> Cc: Christian Koenig <christian.koenig@amd.com>
> Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
> ---
>   drivers/gpu/drm/drm_sysfs.c | 24 ++++++++++++++++++++++++
>   include/drm/drm_sysfs.h     |  3 +++
>   2 files changed, 27 insertions(+)
>
> diff --git a/drivers/gpu/drm/drm_sysfs.c b/drivers/gpu/drm/drm_sysfs.c
> index 430e00b16eec..52a015161431 100644
> --- a/drivers/gpu/drm/drm_sysfs.c
> +++ b/drivers/gpu/drm/drm_sysfs.c
> @@ -409,6 +409,30 @@ void drm_sysfs_hotplug_event(struct drm_device *dev)
>   }
>   EXPORT_SYMBOL(drm_sysfs_hotplug_event);
>   
> +/**
> + * drm_sysfs_reset_event - generate a DRM uevent to indicate GPU reset
> + * @dev: DRM device
> + * @pid: The process ID involve with the reset
> + * @flags: Any other information about the GPU status
> + *
> + * Send a uevent for the DRM device specified by @dev. This indicates
> + * user that a GPU reset has occurred, so that the interested client
> + * can take any recovery or profiling measure, when required.
> + */
> +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, uint32_t flags)

The PID is usually only 32bit, but even better would be to use pid_t.

> +{
> +	unsigned char pid_str[21], flags_str[15];
> +	unsigned char reset_str[] = "RESET=1";
> +	char *envp[] = { reset_str, pid_str, flags_str, NULL };
> +
> +	DRM_DEBUG("generating reset event\n");
> +
> +	snprintf(pid_str, ARRAY_SIZE(pid_str), "PID=%lu", pid);
> +	snprintf(flags_str, ARRAY_SIZE(flags_str), "FLAGS=%u", flags);
> +	kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp);
> +}
> +EXPORT_SYMBOL(drm_sysfs_reset_event);
> +
>   /**
>    * drm_sysfs_connector_hotplug_event - generate a DRM uevent for any connector
>    * change
> diff --git a/include/drm/drm_sysfs.h b/include/drm/drm_sysfs.h
> index 6273cac44e47..63f00fe8054c 100644
> --- a/include/drm/drm_sysfs.h
> +++ b/include/drm/drm_sysfs.h
> @@ -2,6 +2,8 @@
>   #ifndef _DRM_SYSFS_H_
>   #define _DRM_SYSFS_H_
>   
> +#define DRM_GPU_RESET_FLAG_VRAM_VALID (1 << 0)

Probably better to define that the other way around, e.g. 
DRM_GPU_RESET_FLAG_VRAM_LOST.

Apart from that looks good to me.

Christian.

> +
>   struct drm_device;
>   struct device;
>   struct drm_connector;
> @@ -11,6 +13,7 @@ int drm_class_device_register(struct device *dev);
>   void drm_class_device_unregister(struct device *dev);
>   
>   void drm_sysfs_hotplug_event(struct drm_device *dev);
> +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, uint32_t reset_flags);
>   void drm_sysfs_connector_hotplug_event(struct drm_connector *connector);
>   void drm_sysfs_connector_status_event(struct drm_connector *connector,
>   				      struct drm_property *property);


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: add work function for GPU reset event
  2022-03-07 16:26 ` [PATCH 2/2] drm/amdgpu: add work function for GPU reset event Shashank Sharma
  2022-03-07 16:46   ` Somalapuram, Amaranath
@ 2022-03-08  7:15   ` Christian König
  2022-03-08  9:35     ` Sharma, Shashank
  2022-03-08 16:26   ` Andrey Grodzovsky
  2 siblings, 1 reply; 32+ messages in thread
From: Christian König @ 2022-03-08  7:15 UTC (permalink / raw)
  To: Shashank Sharma, amd-gfx
  Cc: Alexander Deucher, amaranath.somalapuram, shashank.sharma

Am 07.03.22 um 17:26 schrieb Shashank Sharma:
> From: Shashank Sharma <shashank.sharma@amd.com>
>
> This patch adds a work function, which will get scheduled
> in event of a GPU reset, and will send a uevent to user with
> some reset context infomration, like a PID and some flags.
>
> The userspace can do some recovery and post-processing work
> based on this event.
>
> V2:
> - Changed the name of the work to gpu_reset_event_work
>    (Christian)
> - Added a structure to accommodate some additional information
>    (like a PID and some flags)
>
> Cc: Alexander Deucher <alexander.deucher@amd.com>
> Cc: Christian Koenig <christian.koenig@amd.com>
> Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  7 +++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 +++++++++++++++++++
>   2 files changed, 26 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index d8b854fcbffa..7df219fe363f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -813,6 +813,11 @@ struct amd_powerplay {
>   #define AMDGPU_RESET_MAGIC_NUM 64
>   #define AMDGPU_MAX_DF_PERFMONS 4
>   #define AMDGPU_PRODUCT_NAME_LEN 64
> +struct amdgpu_reset_event_ctx {
> +	uint64_t pid;
> +	uint32_t flags;
> +};
> +

Please don't put any new structures into amdgpu.h. If I'm not completely 
mistaken Andrey has created a new header for all the reset related stuff.

I would also reconsider the name, at least drop the _ctx suffix.

Regards,
Christian.

>   struct amdgpu_device {
>   	struct device			*dev;
>   	struct pci_dev			*pdev;
> @@ -1063,6 +1068,7 @@ struct amdgpu_device {
>   
>   	int asic_reset_res;
>   	struct work_struct		xgmi_reset_work;
> +	struct work_struct		gpu_reset_event_work;
>   	struct list_head		reset_list;
>   
>   	long				gfx_timeout;
> @@ -1097,6 +1103,7 @@ struct amdgpu_device {
>   	pci_channel_state_t		pci_channel_state;
>   
>   	struct amdgpu_reset_control     *reset_cntl;
> +	struct amdgpu_reset_event_ctx   reset_event_ctx;
>   	uint32_t                        ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];
>   
>   	bool				ram_is_direct_mapped;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index ed077de426d9..c43d099da06d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -73,6 +73,7 @@
>   #include <linux/pm_runtime.h>
>   
>   #include <drm/drm_drv.h>
> +#include <drm/drm_sysfs.h>
>   
>   MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
>   MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
> @@ -3277,6 +3278,23 @@ bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
>   	return amdgpu_device_asic_has_dc_support(adev->asic_type);
>   }
>   
> +static void amdgpu_device_reset_event_func(struct work_struct *__work)
> +{
> +	struct amdgpu_device *adev = container_of(__work, struct amdgpu_device,
> +						  gpu_reset_event_work);
> +	struct amdgpu_reset_event_ctx *event_ctx = &adev->reset_event_ctx;
> +
> +	/*
> +	 * A GPU reset has happened, indicate the userspace and pass the
> +	 * following information:
> +	 *	- pid of the process involved,
> +	 *	- if the VRAM is valid or not,
> +	 *	- indicate that userspace may want to collect the ftrace event
> +	 * data from the trace event.
> +	 */
> +	drm_sysfs_reset_event(&adev->ddev, event_ctx->pid, event_ctx->flags);
> +}
> +
>   static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
>   {
>   	struct amdgpu_device *adev =
> @@ -3525,6 +3543,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   			  amdgpu_device_delay_enable_gfx_off);
>   
>   	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
> +	INIT_WORK(&adev->gpu_reset_event_work, amdgpu_device_reset_event_func);
>   
>   	adev->gfx.gfx_off_req_count = 1;
>   	adev->pm.ac_power = power_supply_is_system_supplied() > 0;


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] drm: Add GPU reset sysfs event
  2022-03-08  7:06 ` Christian König
@ 2022-03-08  9:31   ` Sharma, Shashank
  2022-03-08 10:32     ` Christian König
  0 siblings, 1 reply; 32+ messages in thread
From: Sharma, Shashank @ 2022-03-08  9:31 UTC (permalink / raw)
  To: Christian König, Shashank Sharma, amd-gfx
  Cc: Alexandar Deucher, amaranath.somalapuram, Christian Koenig



On 3/8/2022 8:06 AM, Christian König wrote:
> Am 07.03.22 um 17:26 schrieb Shashank Sharma:
>> From: Shashank Sharma <shashank.sharma@amd.com>
>>
>> This patch adds a new sysfs event, which will indicate
>> the userland about a GPU reset, and can also provide
>> some information like:
>> - which PID was involved in the GPU reset
>> - what was the GPU status (using flags)
>>
>> This patch also introduces the first flag of the flags
>> bitmap, which can be appended as and when required.
> 
> Make sure to CC the dri-devel mailing list when reviewing this.
Got it,

I was also curious if we want to move the reset_ctx structure itself to 
DRM layer, like
drm_reset_event_ctx {
	u32 pid;
	u32 flags;
	char process_name[64];
};

and then:
void drm_sysfs_reset_event(struct drm_device *dev, drm_reset_event_ctx 
*ctx);

> 
>>
>> Cc: Alexandar Deucher <alexander.deucher@amd.com>
>> Cc: Christian Koenig <christian.koenig@amd.com>
>> Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
>> ---
>>   drivers/gpu/drm/drm_sysfs.c | 24 ++++++++++++++++++++++++
>>   include/drm/drm_sysfs.h     |  3 +++
>>   2 files changed, 27 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/drm_sysfs.c b/drivers/gpu/drm/drm_sysfs.c
>> index 430e00b16eec..52a015161431 100644
>> --- a/drivers/gpu/drm/drm_sysfs.c
>> +++ b/drivers/gpu/drm/drm_sysfs.c
>> @@ -409,6 +409,30 @@ void drm_sysfs_hotplug_event(struct drm_device *dev)
>>   }
>>   EXPORT_SYMBOL(drm_sysfs_hotplug_event);
>> +/**
>> + * drm_sysfs_reset_event - generate a DRM uevent to indicate GPU reset
>> + * @dev: DRM device
>> + * @pid: The process ID involve with the reset
>> + * @flags: Any other information about the GPU status
>> + *
>> + * Send a uevent for the DRM device specified by @dev. This indicates
>> + * user that a GPU reset has occurred, so that the interested client
>> + * can take any recovery or profiling measure, when required.
>> + */
>> +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, 
>> uint32_t flags)
> 
> The PID is usually only 32bit, but even better would be to use pid_t.
> 
>> +{
>> +    unsigned char pid_str[21], flags_str[15];
>> +    unsigned char reset_str[] = "RESET=1";
>> +    char *envp[] = { reset_str, pid_str, flags_str, NULL };
>> +
>> +    DRM_DEBUG("generating reset event\n");
>> +
>> +    snprintf(pid_str, ARRAY_SIZE(pid_str), "PID=%lu", pid);
>> +    snprintf(flags_str, ARRAY_SIZE(flags_str), "FLAGS=%u", flags);
>> +    kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp);
>> +}
>> +EXPORT_SYMBOL(drm_sysfs_reset_event);
>> +
>>   /**
>>    * drm_sysfs_connector_hotplug_event - generate a DRM uevent for any 
>> connector
>>    * change
>> diff --git a/include/drm/drm_sysfs.h b/include/drm/drm_sysfs.h
>> index 6273cac44e47..63f00fe8054c 100644
>> --- a/include/drm/drm_sysfs.h
>> +++ b/include/drm/drm_sysfs.h
>> @@ -2,6 +2,8 @@
>>   #ifndef _DRM_SYSFS_H_
>>   #define _DRM_SYSFS_H_
>> +#define DRM_GPU_RESET_FLAG_VRAM_VALID (1 << 0)
> 
> Probably better to define that the other way around, e.g. 
> DRM_GPU_RESET_FLAG_VRAM_LOST.
> 
> Apart from that looks good to me.
> 
Got it, noted.
- Shashank

> Christian.
> 
>> +
>>   struct drm_device;
>>   struct device;
>>   struct drm_connector;
>> @@ -11,6 +13,7 @@ int drm_class_device_register(struct device *dev);
>>   void drm_class_device_unregister(struct device *dev);
>>   void drm_sysfs_hotplug_event(struct drm_device *dev);
>> +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, 
>> uint32_t reset_flags);
>>   void drm_sysfs_connector_hotplug_event(struct drm_connector 
>> *connector);
>>   void drm_sysfs_connector_status_event(struct drm_connector *connector,
>>                         struct drm_property *property);
> 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: add work function for GPU reset event
  2022-03-08  7:15   ` Christian König
@ 2022-03-08  9:35     ` Sharma, Shashank
  0 siblings, 0 replies; 32+ messages in thread
From: Sharma, Shashank @ 2022-03-08  9:35 UTC (permalink / raw)
  To: Christian König, Shashank Sharma, amd-gfx
  Cc: Alexander Deucher, amaranath.somalapuram



On 3/8/2022 8:15 AM, Christian König wrote:
> Am 07.03.22 um 17:26 schrieb Shashank Sharma:
>> From: Shashank Sharma <shashank.sharma@amd.com>
>>
>> This patch adds a work function, which will get scheduled
>> in event of a GPU reset, and will send a uevent to user with
>> some reset context infomration, like a PID and some flags.
>>
>> The userspace can do some recovery and post-processing work
>> based on this event.
>>
>> V2:
>> - Changed the name of the work to gpu_reset_event_work
>>    (Christian)
>> - Added a structure to accommodate some additional information
>>    (like a PID and some flags)
>>
>> Cc: Alexander Deucher <alexander.deucher@amd.com>
>> Cc: Christian Koenig <christian.koenig@amd.com>
>> Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  7 +++++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 +++++++++++++++++++
>>   2 files changed, 26 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index d8b854fcbffa..7df219fe363f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -813,6 +813,11 @@ struct amd_powerplay {
>>   #define AMDGPU_RESET_MAGIC_NUM 64
>>   #define AMDGPU_MAX_DF_PERFMONS 4
>>   #define AMDGPU_PRODUCT_NAME_LEN 64
>> +struct amdgpu_reset_event_ctx {
>> +    uint64_t pid;
>> +    uint32_t flags;
>> +};
>> +
> 
> Please don't put any new structures into amdgpu.h. If I'm not completely 
> mistaken Andrey has created a new header for all the reset related stuff. >
> I would also reconsider the name, at least drop the _ctx suffix.
> 

(Same as comment from other patch) should we move this structure in drm 
level (drm_sysfs.h) like

drm_reset_event {
	u32 pid;
	u32 flags;
	char pname[64];
};

- Shashank

> Regards,
> Christian.
> 
>>   struct amdgpu_device {
>>       struct device            *dev;
>>       struct pci_dev            *pdev;
>> @@ -1063,6 +1068,7 @@ struct amdgpu_device {
>>       int asic_reset_res;
>>       struct work_struct        xgmi_reset_work;
>> +    struct work_struct        gpu_reset_event_work;
>>       struct list_head        reset_list;
>>       long                gfx_timeout;
>> @@ -1097,6 +1103,7 @@ struct amdgpu_device {
>>       pci_channel_state_t        pci_channel_state;
>>       struct amdgpu_reset_control     *reset_cntl;
>> +    struct amdgpu_reset_event_ctx   reset_event_ctx;
>>       uint32_t                        
>> ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];
>>       bool                ram_is_direct_mapped;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index ed077de426d9..c43d099da06d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -73,6 +73,7 @@
>>   #include <linux/pm_runtime.h>
>>   #include <drm/drm_drv.h>
>> +#include <drm/drm_sysfs.h>
>>   MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
>>   MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
>> @@ -3277,6 +3278,23 @@ bool amdgpu_device_has_dc_support(struct 
>> amdgpu_device *adev)
>>       return amdgpu_device_asic_has_dc_support(adev->asic_type);
>>   }
>> +static void amdgpu_device_reset_event_func(struct work_struct *__work)
>> +{
>> +    struct amdgpu_device *adev = container_of(__work, struct 
>> amdgpu_device,
>> +                          gpu_reset_event_work);
>> +    struct amdgpu_reset_event_ctx *event_ctx = &adev->reset_event_ctx;
>> +
>> +    /*
>> +     * A GPU reset has happened, indicate the userspace and pass the
>> +     * following information:
>> +     *    - pid of the process involved,
>> +     *    - if the VRAM is valid or not,
>> +     *    - indicate that userspace may want to collect the ftrace event
>> +     * data from the trace event.
>> +     */
>> +    drm_sysfs_reset_event(&adev->ddev, event_ctx->pid, 
>> event_ctx->flags);
>> +}
>> +
>>   static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
>>   {
>>       struct amdgpu_device *adev =
>> @@ -3525,6 +3543,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>                 amdgpu_device_delay_enable_gfx_off);
>>       INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
>> +    INIT_WORK(&adev->gpu_reset_event_work, 
>> amdgpu_device_reset_event_func);
>>       adev->gfx.gfx_off_req_count = 1;
>>       adev->pm.ac_power = power_supply_is_system_supplied() > 0;
> 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] drm: Add GPU reset sysfs event
  2022-03-08  9:31   ` Sharma, Shashank
@ 2022-03-08 10:32     ` Christian König
  2022-03-08 11:56       ` Sharma, Shashank
  0 siblings, 1 reply; 32+ messages in thread
From: Christian König @ 2022-03-08 10:32 UTC (permalink / raw)
  To: Sharma, Shashank, Christian König, Shashank Sharma, amd-gfx
  Cc: Alexandar Deucher, amaranath.somalapuram

Am 08.03.22 um 10:31 schrieb Sharma, Shashank:
>
>
> On 3/8/2022 8:06 AM, Christian König wrote:
>> Am 07.03.22 um 17:26 schrieb Shashank Sharma:
>>> From: Shashank Sharma <shashank.sharma@amd.com>
>>>
>>> This patch adds a new sysfs event, which will indicate
>>> the userland about a GPU reset, and can also provide
>>> some information like:
>>> - which PID was involved in the GPU reset
>>> - what was the GPU status (using flags)
>>>
>>> This patch also introduces the first flag of the flags
>>> bitmap, which can be appended as and when required.
>>
>> Make sure to CC the dri-devel mailing list when reviewing this.
> Got it,
>
> I was also curious if we want to move the reset_ctx structure itself 
> to DRM layer, like
> drm_reset_event_ctx {
>     u32 pid;
>     u32 flags;
>     char process_name[64];
> };

I was entertaining that thought as well.

But if we do this I would go even a step further and also move the reset 
work item into the DRM layer as well.

You might also look like into migrating the exiting i915 code which uses 
udev to signal GPU resets to this function as well.

Regards,
Christian.

>
> and then:
> void drm_sysfs_reset_event(struct drm_device *dev, drm_reset_event_ctx 
> *ctx);
>
>>
>>>
>>> Cc: Alexandar Deucher <alexander.deucher@amd.com>
>>> Cc: Christian Koenig <christian.koenig@amd.com>
>>> Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
>>> ---
>>>   drivers/gpu/drm/drm_sysfs.c | 24 ++++++++++++++++++++++++
>>>   include/drm/drm_sysfs.h     |  3 +++
>>>   2 files changed, 27 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/drm_sysfs.c b/drivers/gpu/drm/drm_sysfs.c
>>> index 430e00b16eec..52a015161431 100644
>>> --- a/drivers/gpu/drm/drm_sysfs.c
>>> +++ b/drivers/gpu/drm/drm_sysfs.c
>>> @@ -409,6 +409,30 @@ void drm_sysfs_hotplug_event(struct drm_device 
>>> *dev)
>>>   }
>>>   EXPORT_SYMBOL(drm_sysfs_hotplug_event);
>>> +/**
>>> + * drm_sysfs_reset_event - generate a DRM uevent to indicate GPU reset
>>> + * @dev: DRM device
>>> + * @pid: The process ID involve with the reset
>>> + * @flags: Any other information about the GPU status
>>> + *
>>> + * Send a uevent for the DRM device specified by @dev. This indicates
>>> + * user that a GPU reset has occurred, so that the interested client
>>> + * can take any recovery or profiling measure, when required.
>>> + */
>>> +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, 
>>> uint32_t flags)
>>
>> The PID is usually only 32bit, but even better would be to use pid_t.
>>
>>> +{
>>> +    unsigned char pid_str[21], flags_str[15];
>>> +    unsigned char reset_str[] = "RESET=1";
>>> +    char *envp[] = { reset_str, pid_str, flags_str, NULL };
>>> +
>>> +    DRM_DEBUG("generating reset event\n");
>>> +
>>> +    snprintf(pid_str, ARRAY_SIZE(pid_str), "PID=%lu", pid);
>>> +    snprintf(flags_str, ARRAY_SIZE(flags_str), "FLAGS=%u", flags);
>>> + kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp);
>>> +}
>>> +EXPORT_SYMBOL(drm_sysfs_reset_event);
>>> +
>>>   /**
>>>    * drm_sysfs_connector_hotplug_event - generate a DRM uevent for 
>>> any connector
>>>    * change
>>> diff --git a/include/drm/drm_sysfs.h b/include/drm/drm_sysfs.h
>>> index 6273cac44e47..63f00fe8054c 100644
>>> --- a/include/drm/drm_sysfs.h
>>> +++ b/include/drm/drm_sysfs.h
>>> @@ -2,6 +2,8 @@
>>>   #ifndef _DRM_SYSFS_H_
>>>   #define _DRM_SYSFS_H_
>>> +#define DRM_GPU_RESET_FLAG_VRAM_VALID (1 << 0)
>>
>> Probably better to define that the other way around, e.g. 
>> DRM_GPU_RESET_FLAG_VRAM_LOST.
>>
>> Apart from that looks good to me.
>>
> Got it, noted.
> - Shashank
>
>> Christian.
>>
>>> +
>>>   struct drm_device;
>>>   struct device;
>>>   struct drm_connector;
>>> @@ -11,6 +13,7 @@ int drm_class_device_register(struct device *dev);
>>>   void drm_class_device_unregister(struct device *dev);
>>>   void drm_sysfs_hotplug_event(struct drm_device *dev);
>>> +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, 
>>> uint32_t reset_flags);
>>>   void drm_sysfs_connector_hotplug_event(struct drm_connector 
>>> *connector);
>>>   void drm_sysfs_connector_status_event(struct drm_connector 
>>> *connector,
>>>                         struct drm_property *property);
>>


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] drm: Add GPU reset sysfs event
  2022-03-08 10:32     ` Christian König
@ 2022-03-08 11:56       ` Sharma, Shashank
  2022-03-08 15:37         ` Somalapuram, Amaranath
  2022-03-08 16:40         ` Sharma, Shashank
  0 siblings, 2 replies; 32+ messages in thread
From: Sharma, Shashank @ 2022-03-08 11:56 UTC (permalink / raw)
  To: Christian König, Christian König, Shashank Sharma, amd-gfx
  Cc: Alexandar Deucher, amaranath.somalapuram



On 3/8/2022 11:32 AM, Christian König wrote:
> Am 08.03.22 um 10:31 schrieb Sharma, Shashank:
>>
>>
>> On 3/8/2022 8:06 AM, Christian König wrote:
>>> Am 07.03.22 um 17:26 schrieb Shashank Sharma:
>>>> From: Shashank Sharma <shashank.sharma@amd.com>
>>>>
>>>> This patch adds a new sysfs event, which will indicate
>>>> the userland about a GPU reset, and can also provide
>>>> some information like:
>>>> - which PID was involved in the GPU reset
>>>> - what was the GPU status (using flags)
>>>>
>>>> This patch also introduces the first flag of the flags
>>>> bitmap, which can be appended as and when required.
>>>
>>> Make sure to CC the dri-devel mailing list when reviewing this.
>> Got it,
>>
>> I was also curious if we want to move the reset_ctx structure itself 
>> to DRM layer, like
>> drm_reset_event_ctx {
>>     u32 pid;
>>     u32 flags;
>>     char process_name[64];
>> };
> 
> I was entertaining that thought as well.
> 
> But if we do this I would go even a step further and also move the reset 
> work item into the DRM layer as well.
> 
> You might also look like into migrating the exiting i915 code which uses 
> udev to signal GPU resets to this function as well.
> 
> Regards,
> Christian.

That seems like a good idea, let me quickly dive into i915 and check 
this out.

Shashank
> 
>>
>> and then:
>> void drm_sysfs_reset_event(struct drm_device *dev, drm_reset_event_ctx 
>> *ctx);
>>
>>>
>>>>
>>>> Cc: Alexandar Deucher <alexander.deucher@amd.com>
>>>> Cc: Christian Koenig <christian.koenig@amd.com>
>>>> Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
>>>> ---
>>>>   drivers/gpu/drm/drm_sysfs.c | 24 ++++++++++++++++++++++++
>>>>   include/drm/drm_sysfs.h     |  3 +++
>>>>   2 files changed, 27 insertions(+)
>>>>
>>>> diff --git a/drivers/gpu/drm/drm_sysfs.c b/drivers/gpu/drm/drm_sysfs.c
>>>> index 430e00b16eec..52a015161431 100644
>>>> --- a/drivers/gpu/drm/drm_sysfs.c
>>>> +++ b/drivers/gpu/drm/drm_sysfs.c
>>>> @@ -409,6 +409,30 @@ void drm_sysfs_hotplug_event(struct drm_device 
>>>> *dev)
>>>>   }
>>>>   EXPORT_SYMBOL(drm_sysfs_hotplug_event);
>>>> +/**
>>>> + * drm_sysfs_reset_event - generate a DRM uevent to indicate GPU reset
>>>> + * @dev: DRM device
>>>> + * @pid: The process ID involve with the reset
>>>> + * @flags: Any other information about the GPU status
>>>> + *
>>>> + * Send a uevent for the DRM device specified by @dev. This indicates
>>>> + * user that a GPU reset has occurred, so that the interested client
>>>> + * can take any recovery or profiling measure, when required.
>>>> + */
>>>> +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, 
>>>> uint32_t flags)
>>>
>>> The PID is usually only 32bit, but even better would be to use pid_t.
>>>
>>>> +{
>>>> +    unsigned char pid_str[21], flags_str[15];
>>>> +    unsigned char reset_str[] = "RESET=1";
>>>> +    char *envp[] = { reset_str, pid_str, flags_str, NULL };
>>>> +
>>>> +    DRM_DEBUG("generating reset event\n");
>>>> +
>>>> +    snprintf(pid_str, ARRAY_SIZE(pid_str), "PID=%lu", pid);
>>>> +    snprintf(flags_str, ARRAY_SIZE(flags_str), "FLAGS=%u", flags);
>>>> + kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp);
>>>> +}
>>>> +EXPORT_SYMBOL(drm_sysfs_reset_event);
>>>> +
>>>>   /**
>>>>    * drm_sysfs_connector_hotplug_event - generate a DRM uevent for 
>>>> any connector
>>>>    * change
>>>> diff --git a/include/drm/drm_sysfs.h b/include/drm/drm_sysfs.h
>>>> index 6273cac44e47..63f00fe8054c 100644
>>>> --- a/include/drm/drm_sysfs.h
>>>> +++ b/include/drm/drm_sysfs.h
>>>> @@ -2,6 +2,8 @@
>>>>   #ifndef _DRM_SYSFS_H_
>>>>   #define _DRM_SYSFS_H_
>>>> +#define DRM_GPU_RESET_FLAG_VRAM_VALID (1 << 0)
>>>
>>> Probably better to define that the other way around, e.g. 
>>> DRM_GPU_RESET_FLAG_VRAM_LOST.
>>>
>>> Apart from that looks good to me.
>>>
>> Got it, noted.
>> - Shashank
>>
>>> Christian.
>>>
>>>> +
>>>>   struct drm_device;
>>>>   struct device;
>>>>   struct drm_connector;
>>>> @@ -11,6 +13,7 @@ int drm_class_device_register(struct device *dev);
>>>>   void drm_class_device_unregister(struct device *dev);
>>>>   void drm_sysfs_hotplug_event(struct drm_device *dev);
>>>> +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, 
>>>> uint32_t reset_flags);
>>>>   void drm_sysfs_connector_hotplug_event(struct drm_connector 
>>>> *connector);
>>>>   void drm_sysfs_connector_status_event(struct drm_connector 
>>>> *connector,
>>>>                         struct drm_property *property);
>>>
> 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] drm: Add GPU reset sysfs event
  2022-03-08 11:56       ` Sharma, Shashank
@ 2022-03-08 15:37         ` Somalapuram, Amaranath
  2022-03-09  8:02           ` Christian König
  2022-03-08 16:40         ` Sharma, Shashank
  1 sibling, 1 reply; 32+ messages in thread
From: Somalapuram, Amaranath @ 2022-03-08 15:37 UTC (permalink / raw)
  To: Sharma, Shashank, Christian König, Christian König,
	Shashank Sharma, amd-gfx
  Cc: Alexandar Deucher, amaranath.somalapuram


On 3/8/2022 5:26 PM, Sharma, Shashank wrote:
>
>
> On 3/8/2022 11:32 AM, Christian König wrote:
>> Am 08.03.22 um 10:31 schrieb Sharma, Shashank:
>>>
>>>
>>> On 3/8/2022 8:06 AM, Christian König wrote:
>>>> Am 07.03.22 um 17:26 schrieb Shashank Sharma:
>>>>> From: Shashank Sharma <shashank.sharma@amd.com>
>>>>>
>>>>> This patch adds a new sysfs event, which will indicate
>>>>> the userland about a GPU reset, and can also provide
>>>>> some information like:
>>>>> - which PID was involved in the GPU reset
>>>>> - what was the GPU status (using flags)
>>>>>
>>>>> This patch also introduces the first flag of the flags
>>>>> bitmap, which can be appended as and when required.
>>>>
>>>> Make sure to CC the dri-devel mailing list when reviewing this.
>>> Got it,
>>>
>>> I was also curious if we want to move the reset_ctx structure itself 
>>> to DRM layer, like
>>> drm_reset_event_ctx {
>>>     u32 pid;
>>>     u32 flags;
>>>     char process_name[64];
>>> };
>>
>> I was entertaining that thought as well.
>>
>> But if we do this I would go even a step further and also move the 
>> reset work item into the DRM layer as well.
>>
>> You might also look like into migrating the exiting i915 code which 
>> uses udev to signal GPU resets to this function as well.
>>
Hi Christian,

Can we access adev in common drm (even if we can access adev it will not 
be common code)

move work function to drm need to be protected(i.e reset_domain->sem), 
adding something like reset_sem to drm_device?

Regards,

S.Amarnath

>> Regards,
>> Christian.
>
> That seems like a good idea, let me quickly dive into i915 and check 
> this out.
>
> Shashank
>>
>>>
>>> and then:
>>> void drm_sysfs_reset_event(struct drm_device *dev, 
>>> drm_reset_event_ctx *ctx);
>>>
>>>>
>>>>>
>>>>> Cc: Alexandar Deucher <alexander.deucher@amd.com>
>>>>> Cc: Christian Koenig <christian.koenig@amd.com>
>>>>> Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
>>>>> ---
>>>>>   drivers/gpu/drm/drm_sysfs.c | 24 ++++++++++++++++++++++++
>>>>>   include/drm/drm_sysfs.h     |  3 +++
>>>>>   2 files changed, 27 insertions(+)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/drm_sysfs.c 
>>>>> b/drivers/gpu/drm/drm_sysfs.c
>>>>> index 430e00b16eec..52a015161431 100644
>>>>> --- a/drivers/gpu/drm/drm_sysfs.c
>>>>> +++ b/drivers/gpu/drm/drm_sysfs.c
>>>>> @@ -409,6 +409,30 @@ void drm_sysfs_hotplug_event(struct 
>>>>> drm_device *dev)
>>>>>   }
>>>>>   EXPORT_SYMBOL(drm_sysfs_hotplug_event);
>>>>> +/**
>>>>> + * drm_sysfs_reset_event - generate a DRM uevent to indicate GPU 
>>>>> reset
>>>>> + * @dev: DRM device
>>>>> + * @pid: The process ID involve with the reset
>>>>> + * @flags: Any other information about the GPU status
>>>>> + *
>>>>> + * Send a uevent for the DRM device specified by @dev. This 
>>>>> indicates
>>>>> + * user that a GPU reset has occurred, so that the interested client
>>>>> + * can take any recovery or profiling measure, when required.
>>>>> + */
>>>>> +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, 
>>>>> uint32_t flags)
>>>>
>>>> The PID is usually only 32bit, but even better would be to use pid_t.
>>>>
>>>>> +{
>>>>> +    unsigned char pid_str[21], flags_str[15];
>>>>> +    unsigned char reset_str[] = "RESET=1";
>>>>> +    char *envp[] = { reset_str, pid_str, flags_str, NULL };
>>>>> +
>>>>> +    DRM_DEBUG("generating reset event\n");
>>>>> +
>>>>> +    snprintf(pid_str, ARRAY_SIZE(pid_str), "PID=%lu", pid);
>>>>> +    snprintf(flags_str, ARRAY_SIZE(flags_str), "FLAGS=%u", flags);
>>>>> + kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp);
>>>>> +}
>>>>> +EXPORT_SYMBOL(drm_sysfs_reset_event);
>>>>> +
>>>>>   /**
>>>>>    * drm_sysfs_connector_hotplug_event - generate a DRM uevent for 
>>>>> any connector
>>>>>    * change
>>>>> diff --git a/include/drm/drm_sysfs.h b/include/drm/drm_sysfs.h
>>>>> index 6273cac44e47..63f00fe8054c 100644
>>>>> --- a/include/drm/drm_sysfs.h
>>>>> +++ b/include/drm/drm_sysfs.h
>>>>> @@ -2,6 +2,8 @@
>>>>>   #ifndef _DRM_SYSFS_H_
>>>>>   #define _DRM_SYSFS_H_
>>>>> +#define DRM_GPU_RESET_FLAG_VRAM_VALID (1 << 0)
>>>>
>>>> Probably better to define that the other way around, e.g. 
>>>> DRM_GPU_RESET_FLAG_VRAM_LOST.
>>>>
>>>> Apart from that looks good to me.
>>>>
>>> Got it, noted.
>>> - Shashank
>>>
>>>> Christian.
>>>>
>>>>> +
>>>>>   struct drm_device;
>>>>>   struct device;
>>>>>   struct drm_connector;
>>>>> @@ -11,6 +13,7 @@ int drm_class_device_register(struct device *dev);
>>>>>   void drm_class_device_unregister(struct device *dev);
>>>>>   void drm_sysfs_hotplug_event(struct drm_device *dev);
>>>>> +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, 
>>>>> uint32_t reset_flags);
>>>>>   void drm_sysfs_connector_hotplug_event(struct drm_connector 
>>>>> *connector);
>>>>>   void drm_sysfs_connector_status_event(struct drm_connector 
>>>>> *connector,
>>>>>                         struct drm_property *property);
>>>>
>>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] drm: Add GPU reset sysfs event
  2022-03-07 16:26 [PATCH 1/2] drm: Add GPU reset sysfs event Shashank Sharma
                   ` (2 preceding siblings ...)
  2022-03-08  7:06 ` Christian König
@ 2022-03-08 16:25 ` Andrey Grodzovsky
  2022-03-08 16:35   ` Sharma, Shashank
  2022-03-08 16:36   ` Lazar, Lijo
  3 siblings, 2 replies; 32+ messages in thread
From: Andrey Grodzovsky @ 2022-03-08 16:25 UTC (permalink / raw)
  To: Shashank Sharma, amd-gfx
  Cc: Alexandar Deucher, amaranath.somalapuram, Christian Koenig,
	shashank.sharma


On 2022-03-07 11:26, Shashank Sharma wrote:
> From: Shashank Sharma <shashank.sharma@amd.com>
>
> This patch adds a new sysfs event, which will indicate
> the userland about a GPU reset, and can also provide
> some information like:
> - which PID was involved in the GPU reset
> - what was the GPU status (using flags)
>
> This patch also introduces the first flag of the flags
> bitmap, which can be appended as and when required.


I am reminding again about another important piece of info which you can add
here and that is Smart Trace Buffer dump [1]. The buffer size is HW 
specific but
from what I see there is no problem to just amend it as part of envp[] 
initialization.
bellow.

The interface to get the buffer is smu_stb_collect_info and usage can be 
seen from
frebugfs interface in smu_stb_debugfs_open

[1] - https://www.spinics.net/lists/amd-gfx/msg70751.html

Andrey


>
> Cc: Alexandar Deucher <alexander.deucher@amd.com>
> Cc: Christian Koenig <christian.koenig@amd.com>
> Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
> ---
>   drivers/gpu/drm/drm_sysfs.c | 24 ++++++++++++++++++++++++
>   include/drm/drm_sysfs.h     |  3 +++
>   2 files changed, 27 insertions(+)
>
> diff --git a/drivers/gpu/drm/drm_sysfs.c b/drivers/gpu/drm/drm_sysfs.c
> index 430e00b16eec..52a015161431 100644
> --- a/drivers/gpu/drm/drm_sysfs.c
> +++ b/drivers/gpu/drm/drm_sysfs.c
> @@ -409,6 +409,30 @@ void drm_sysfs_hotplug_event(struct drm_device *dev)
>   }
>   EXPORT_SYMBOL(drm_sysfs_hotplug_event);
>   
> +/**
> + * drm_sysfs_reset_event - generate a DRM uevent to indicate GPU reset
> + * @dev: DRM device
> + * @pid: The process ID involve with the reset
> + * @flags: Any other information about the GPU status
> + *
> + * Send a uevent for the DRM device specified by @dev. This indicates
> + * user that a GPU reset has occurred, so that the interested client
> + * can take any recovery or profiling measure, when required.
> + */
> +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, uint32_t flags)
> +{
> +	unsigned char pid_str[21], flags_str[15];
> +	unsigned char reset_str[] = "RESET=1";
> +	char *envp[] = { reset_str, pid_str, flags_str, NULL };
> +
> +	DRM_DEBUG("generating reset event\n");
> +
> +	snprintf(pid_str, ARRAY_SIZE(pid_str), "PID=%lu", pid);
> +	snprintf(flags_str, ARRAY_SIZE(flags_str), "FLAGS=%u", flags);
> +	kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp);
> +}
> +EXPORT_SYMBOL(drm_sysfs_reset_event);
> +
>   /**
>    * drm_sysfs_connector_hotplug_event - generate a DRM uevent for any connector
>    * change
> diff --git a/include/drm/drm_sysfs.h b/include/drm/drm_sysfs.h
> index 6273cac44e47..63f00fe8054c 100644
> --- a/include/drm/drm_sysfs.h
> +++ b/include/drm/drm_sysfs.h
> @@ -2,6 +2,8 @@
>   #ifndef _DRM_SYSFS_H_
>   #define _DRM_SYSFS_H_
>   
> +#define DRM_GPU_RESET_FLAG_VRAM_VALID (1 << 0)
> +
>   struct drm_device;
>   struct device;
>   struct drm_connector;
> @@ -11,6 +13,7 @@ int drm_class_device_register(struct device *dev);
>   void drm_class_device_unregister(struct device *dev);
>   
>   void drm_sysfs_hotplug_event(struct drm_device *dev);
> +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, uint32_t reset_flags);
>   void drm_sysfs_connector_hotplug_event(struct drm_connector *connector);
>   void drm_sysfs_connector_status_event(struct drm_connector *connector,
>   				      struct drm_property *property);

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: add work function for GPU reset event
  2022-03-07 16:26 ` [PATCH 2/2] drm/amdgpu: add work function for GPU reset event Shashank Sharma
  2022-03-07 16:46   ` Somalapuram, Amaranath
  2022-03-08  7:15   ` Christian König
@ 2022-03-08 16:26   ` Andrey Grodzovsky
  2022-03-08 16:30     ` Sharma, Shashank
  2 siblings, 1 reply; 32+ messages in thread
From: Andrey Grodzovsky @ 2022-03-08 16:26 UTC (permalink / raw)
  To: Shashank Sharma, amd-gfx
  Cc: Alexander Deucher, amaranath.somalapuram, Christian Koenig,
	shashank.sharma


On 2022-03-07 11:26, Shashank Sharma wrote:
> From: Shashank Sharma <shashank.sharma@amd.com>
>
> This patch adds a work function, which will get scheduled
> in event of a GPU reset, and will send a uevent to user with
> some reset context infomration, like a PID and some flags.


Where is the actual scheduling of the work function ? Shouldn't
there be a patch for that too ?

Andrey


>
> The userspace can do some recovery and post-processing work
> based on this event.
>
> V2:
> - Changed the name of the work to gpu_reset_event_work
>    (Christian)
> - Added a structure to accommodate some additional information
>    (like a PID and some flags)
>
> Cc: Alexander Deucher <alexander.deucher@amd.com>
> Cc: Christian Koenig <christian.koenig@amd.com>
> Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  7 +++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 +++++++++++++++++++
>   2 files changed, 26 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index d8b854fcbffa..7df219fe363f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -813,6 +813,11 @@ struct amd_powerplay {
>   #define AMDGPU_RESET_MAGIC_NUM 64
>   #define AMDGPU_MAX_DF_PERFMONS 4
>   #define AMDGPU_PRODUCT_NAME_LEN 64
> +struct amdgpu_reset_event_ctx {
> +	uint64_t pid;
> +	uint32_t flags;
> +};
> +
>   struct amdgpu_device {
>   	struct device			*dev;
>   	struct pci_dev			*pdev;
> @@ -1063,6 +1068,7 @@ struct amdgpu_device {
>   
>   	int asic_reset_res;
>   	struct work_struct		xgmi_reset_work;
> +	struct work_struct		gpu_reset_event_work;
>   	struct list_head		reset_list;
>   
>   	long				gfx_timeout;
> @@ -1097,6 +1103,7 @@ struct amdgpu_device {
>   	pci_channel_state_t		pci_channel_state;
>   
>   	struct amdgpu_reset_control     *reset_cntl;
> +	struct amdgpu_reset_event_ctx   reset_event_ctx;
>   	uint32_t                        ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];
>   
>   	bool				ram_is_direct_mapped;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index ed077de426d9..c43d099da06d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -73,6 +73,7 @@
>   #include <linux/pm_runtime.h>
>   
>   #include <drm/drm_drv.h>
> +#include <drm/drm_sysfs.h>
>   
>   MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
>   MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
> @@ -3277,6 +3278,23 @@ bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
>   	return amdgpu_device_asic_has_dc_support(adev->asic_type);
>   }
>   
> +static void amdgpu_device_reset_event_func(struct work_struct *__work)
> +{
> +	struct amdgpu_device *adev = container_of(__work, struct amdgpu_device,
> +						  gpu_reset_event_work);
> +	struct amdgpu_reset_event_ctx *event_ctx = &adev->reset_event_ctx;
> +
> +	/*
> +	 * A GPU reset has happened, indicate the userspace and pass the
> +	 * following information:
> +	 *	- pid of the process involved,
> +	 *	- if the VRAM is valid or not,
> +	 *	- indicate that userspace may want to collect the ftrace event
> +	 * data from the trace event.
> +	 */
> +	drm_sysfs_reset_event(&adev->ddev, event_ctx->pid, event_ctx->flags);
> +}
> +
>   static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
>   {
>   	struct amdgpu_device *adev =
> @@ -3525,6 +3543,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   			  amdgpu_device_delay_enable_gfx_off);
>   
>   	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
> +	INIT_WORK(&adev->gpu_reset_event_work, amdgpu_device_reset_event_func);
>   
>   	adev->gfx.gfx_off_req_count = 1;
>   	adev->pm.ac_power = power_supply_is_system_supplied() > 0;

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: add work function for GPU reset event
  2022-03-08 16:26   ` Andrey Grodzovsky
@ 2022-03-08 16:30     ` Sharma, Shashank
  2022-03-08 17:20       ` Somalapuram, Amaranath
  0 siblings, 1 reply; 32+ messages in thread
From: Sharma, Shashank @ 2022-03-08 16:30 UTC (permalink / raw)
  To: Andrey Grodzovsky, Shashank Sharma, amd-gfx
  Cc: Alexander Deucher, amaranath.somalapuram, Christian Koenig

Hello Andrey

On 3/8/2022 5:26 PM, Andrey Grodzovsky wrote:
> 
> On 2022-03-07 11:26, Shashank Sharma wrote:
>> From: Shashank Sharma <shashank.sharma@amd.com>
>>
>> This patch adds a work function, which will get scheduled
>> in event of a GPU reset, and will send a uevent to user with
>> some reset context infomration, like a PID and some flags.
> 
> 
> Where is the actual scheduling of the work function ? Shouldn't
> there be a patch for that too ?
> 

Yes, Amar is working on that patch, on top of these patches. They should 
be out soon. I thought it was a good idea to get quick feedback on the 
basic patches before we build something on top of it.

- Shashank

> Andrey
> 
> 
>>
>> The userspace can do some recovery and post-processing work
>> based on this event.
>>
>> V2:
>> - Changed the name of the work to gpu_reset_event_work
>>    (Christian)
>> - Added a structure to accommodate some additional information
>>    (like a PID and some flags)
>>
>> Cc: Alexander Deucher <alexander.deucher@amd.com>
>> Cc: Christian Koenig <christian.koenig@amd.com>
>> Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  7 +++++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 +++++++++++++++++++
>>   2 files changed, 26 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index d8b854fcbffa..7df219fe363f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -813,6 +813,11 @@ struct amd_powerplay {
>>   #define AMDGPU_RESET_MAGIC_NUM 64
>>   #define AMDGPU_MAX_DF_PERFMONS 4
>>   #define AMDGPU_PRODUCT_NAME_LEN 64
>> +struct amdgpu_reset_event_ctx {
>> +    uint64_t pid;
>> +    uint32_t flags;
>> +};
>> +
>>   struct amdgpu_device {
>>       struct device            *dev;
>>       struct pci_dev            *pdev;
>> @@ -1063,6 +1068,7 @@ struct amdgpu_device {
>>       int asic_reset_res;
>>       struct work_struct        xgmi_reset_work;
>> +    struct work_struct        gpu_reset_event_work;
>>       struct list_head        reset_list;
>>       long                gfx_timeout;
>> @@ -1097,6 +1103,7 @@ struct amdgpu_device {
>>       pci_channel_state_t        pci_channel_state;
>>       struct amdgpu_reset_control     *reset_cntl;
>> +    struct amdgpu_reset_event_ctx   reset_event_ctx;
>>       uint32_t                        
>> ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];
>>       bool                ram_is_direct_mapped;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index ed077de426d9..c43d099da06d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -73,6 +73,7 @@
>>   #include <linux/pm_runtime.h>
>>   #include <drm/drm_drv.h>
>> +#include <drm/drm_sysfs.h>
>>   MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
>>   MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
>> @@ -3277,6 +3278,23 @@ bool amdgpu_device_has_dc_support(struct 
>> amdgpu_device *adev)
>>       return amdgpu_device_asic_has_dc_support(adev->asic_type);
>>   }
>> +static void amdgpu_device_reset_event_func(struct work_struct *__work)
>> +{
>> +    struct amdgpu_device *adev = container_of(__work, struct 
>> amdgpu_device,
>> +                          gpu_reset_event_work);
>> +    struct amdgpu_reset_event_ctx *event_ctx = &adev->reset_event_ctx;
>> +
>> +    /*
>> +     * A GPU reset has happened, indicate the userspace and pass the
>> +     * following information:
>> +     *    - pid of the process involved,
>> +     *    - if the VRAM is valid or not,
>> +     *    - indicate that userspace may want to collect the ftrace event
>> +     * data from the trace event.
>> +     */
>> +    drm_sysfs_reset_event(&adev->ddev, event_ctx->pid, 
>> event_ctx->flags);
>> +}
>> +
>>   static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
>>   {
>>       struct amdgpu_device *adev =
>> @@ -3525,6 +3543,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>                 amdgpu_device_delay_enable_gfx_off);
>>       INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
>> +    INIT_WORK(&adev->gpu_reset_event_work, 
>> amdgpu_device_reset_event_func);
>>       adev->gfx.gfx_off_req_count = 1;
>>       adev->pm.ac_power = power_supply_is_system_supplied() > 0;

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] drm: Add GPU reset sysfs event
  2022-03-08 16:25 ` Andrey Grodzovsky
@ 2022-03-08 16:35   ` Sharma, Shashank
  2022-03-08 16:36     ` Andrey Grodzovsky
  2022-03-08 16:36   ` Lazar, Lijo
  1 sibling, 1 reply; 32+ messages in thread
From: Sharma, Shashank @ 2022-03-08 16:35 UTC (permalink / raw)
  To: Andrey Grodzovsky, Shashank Sharma, amd-gfx
  Cc: Alexandar Deucher, amaranath.somalapuram, Christian Koenig



On 3/8/2022 5:25 PM, Andrey Grodzovsky wrote:
> 
> On 2022-03-07 11:26, Shashank Sharma wrote:
>> From: Shashank Sharma <shashank.sharma@amd.com>
>>
>> This patch adds a new sysfs event, which will indicate
>> the userland about a GPU reset, and can also provide
>> some information like:
>> - which PID was involved in the GPU reset
>> - what was the GPU status (using flags)
>>
>> This patch also introduces the first flag of the flags
>> bitmap, which can be appended as and when required.
> 
> 
> I am reminding again about another important piece of info which you can 
> add
> here and that is Smart Trace Buffer dump [1]. The buffer size is HW 
> specific but
> from what I see there is no problem to just amend it as part of envp[] 
> initialization.
> bellow.
> 
> The interface to get the buffer is smu_stb_collect_info and usage can be 
> seen from
> frebugfs interface in smu_stb_debugfs_open
> 
> [1] - https://www.spinics.net/lists/amd-gfx/msg70751.html
>

Noted Andrey, thank for the reminder. As you can see, this patch is 
going into DRM layer, so as of now we are accommodating the PID and VRAM 
validity information, which is common to all the DRM drivers (not only 
AMDGPU). But as a next step, we will extend this interface to provide 
driver specific custom data as well,  and that is where we will start 
digging into STB.
- Shashank

> Andrey
> 
> 
>>
>> Cc: Alexandar Deucher <alexander.deucher@amd.com>
>> Cc: Christian Koenig <christian.koenig@amd.com>
>> Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
>> ---
>>   drivers/gpu/drm/drm_sysfs.c | 24 ++++++++++++++++++++++++
>>   include/drm/drm_sysfs.h     |  3 +++
>>   2 files changed, 27 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/drm_sysfs.c b/drivers/gpu/drm/drm_sysfs.c
>> index 430e00b16eec..52a015161431 100644
>> --- a/drivers/gpu/drm/drm_sysfs.c
>> +++ b/drivers/gpu/drm/drm_sysfs.c
>> @@ -409,6 +409,30 @@ void drm_sysfs_hotplug_event(struct drm_device *dev)
>>   }
>>   EXPORT_SYMBOL(drm_sysfs_hotplug_event);
>> +/**
>> + * drm_sysfs_reset_event - generate a DRM uevent to indicate GPU reset
>> + * @dev: DRM device
>> + * @pid: The process ID involve with the reset
>> + * @flags: Any other information about the GPU status
>> + *
>> + * Send a uevent for the DRM device specified by @dev. This indicates
>> + * user that a GPU reset has occurred, so that the interested client
>> + * can take any recovery or profiling measure, when required.
>> + */
>> +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, 
>> uint32_t flags)
>> +{
>> +    unsigned char pid_str[21], flags_str[15];
>> +    unsigned char reset_str[] = "RESET=1";
>> +    char *envp[] = { reset_str, pid_str, flags_str, NULL };
>> +
>> +    DRM_DEBUG("generating reset event\n");
>> +
>> +    snprintf(pid_str, ARRAY_SIZE(pid_str), "PID=%lu", pid);
>> +    snprintf(flags_str, ARRAY_SIZE(flags_str), "FLAGS=%u", flags);
>> +    kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp);
>> +}
>> +EXPORT_SYMBOL(drm_sysfs_reset_event);
>> +
>>   /**
>>    * drm_sysfs_connector_hotplug_event - generate a DRM uevent for any 
>> connector
>>    * change
>> diff --git a/include/drm/drm_sysfs.h b/include/drm/drm_sysfs.h
>> index 6273cac44e47..63f00fe8054c 100644
>> --- a/include/drm/drm_sysfs.h
>> +++ b/include/drm/drm_sysfs.h
>> @@ -2,6 +2,8 @@
>>   #ifndef _DRM_SYSFS_H_
>>   #define _DRM_SYSFS_H_
>> +#define DRM_GPU_RESET_FLAG_VRAM_VALID (1 << 0)
>> +
>>   struct drm_device;
>>   struct device;
>>   struct drm_connector;
>> @@ -11,6 +13,7 @@ int drm_class_device_register(struct device *dev);
>>   void drm_class_device_unregister(struct device *dev);
>>   void drm_sysfs_hotplug_event(struct drm_device *dev);
>> +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, 
>> uint32_t reset_flags);
>>   void drm_sysfs_connector_hotplug_event(struct drm_connector 
>> *connector);
>>   void drm_sysfs_connector_status_event(struct drm_connector *connector,
>>                         struct drm_property *property);

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] drm: Add GPU reset sysfs event
  2022-03-08 16:35   ` Sharma, Shashank
@ 2022-03-08 16:36     ` Andrey Grodzovsky
  0 siblings, 0 replies; 32+ messages in thread
From: Andrey Grodzovsky @ 2022-03-08 16:36 UTC (permalink / raw)
  To: Sharma, Shashank, Shashank Sharma, amd-gfx
  Cc: Alexandar Deucher, amaranath.somalapuram, Christian Koenig


On 2022-03-08 11:35, Sharma, Shashank wrote:
>
>
> On 3/8/2022 5:25 PM, Andrey Grodzovsky wrote:
>>
>> On 2022-03-07 11:26, Shashank Sharma wrote:
>>> From: Shashank Sharma <shashank.sharma@amd.com>
>>>
>>> This patch adds a new sysfs event, which will indicate
>>> the userland about a GPU reset, and can also provide
>>> some information like:
>>> - which PID was involved in the GPU reset
>>> - what was the GPU status (using flags)
>>>
>>> This patch also introduces the first flag of the flags
>>> bitmap, which can be appended as and when required.
>>
>>
>> I am reminding again about another important piece of info which you 
>> can add
>> here and that is Smart Trace Buffer dump [1]. The buffer size is HW 
>> specific but
>> from what I see there is no problem to just amend it as part of 
>> envp[] initialization.
>> bellow.
>>
>> The interface to get the buffer is smu_stb_collect_info and usage can 
>> be seen from
>> frebugfs interface in smu_stb_debugfs_open
>>
>> [1] - https://www.spinics.net/lists/amd-gfx/msg70751.html
>>
>
> Noted Andrey, thank for the reminder. As you can see, this patch is 
> going into DRM layer, so as of now we are accommodating the PID and 
> VRAM validity information, which is common to all the DRM drivers (not 
> only AMDGPU). But as a next step, we will extend this interface to 
> provide driver specific custom data as well,  and that is where we 
> will start digging into STB.
> - Shashank


Got it.

Andrey


>
>> Andrey
>>
>>
>>>
>>> Cc: Alexandar Deucher <alexander.deucher@amd.com>
>>> Cc: Christian Koenig <christian.koenig@amd.com>
>>> Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
>>> ---
>>>   drivers/gpu/drm/drm_sysfs.c | 24 ++++++++++++++++++++++++
>>>   include/drm/drm_sysfs.h     |  3 +++
>>>   2 files changed, 27 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/drm_sysfs.c b/drivers/gpu/drm/drm_sysfs.c
>>> index 430e00b16eec..52a015161431 100644
>>> --- a/drivers/gpu/drm/drm_sysfs.c
>>> +++ b/drivers/gpu/drm/drm_sysfs.c
>>> @@ -409,6 +409,30 @@ void drm_sysfs_hotplug_event(struct drm_device 
>>> *dev)
>>>   }
>>>   EXPORT_SYMBOL(drm_sysfs_hotplug_event);
>>> +/**
>>> + * drm_sysfs_reset_event - generate a DRM uevent to indicate GPU reset
>>> + * @dev: DRM device
>>> + * @pid: The process ID involve with the reset
>>> + * @flags: Any other information about the GPU status
>>> + *
>>> + * Send a uevent for the DRM device specified by @dev. This indicates
>>> + * user that a GPU reset has occurred, so that the interested client
>>> + * can take any recovery or profiling measure, when required.
>>> + */
>>> +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, 
>>> uint32_t flags)
>>> +{
>>> +    unsigned char pid_str[21], flags_str[15];
>>> +    unsigned char reset_str[] = "RESET=1";
>>> +    char *envp[] = { reset_str, pid_str, flags_str, NULL };
>>> +
>>> +    DRM_DEBUG("generating reset event\n");
>>> +
>>> +    snprintf(pid_str, ARRAY_SIZE(pid_str), "PID=%lu", pid);
>>> +    snprintf(flags_str, ARRAY_SIZE(flags_str), "FLAGS=%u", flags);
>>> + kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp);
>>> +}
>>> +EXPORT_SYMBOL(drm_sysfs_reset_event);
>>> +
>>>   /**
>>>    * drm_sysfs_connector_hotplug_event - generate a DRM uevent for 
>>> any connector
>>>    * change
>>> diff --git a/include/drm/drm_sysfs.h b/include/drm/drm_sysfs.h
>>> index 6273cac44e47..63f00fe8054c 100644
>>> --- a/include/drm/drm_sysfs.h
>>> +++ b/include/drm/drm_sysfs.h
>>> @@ -2,6 +2,8 @@
>>>   #ifndef _DRM_SYSFS_H_
>>>   #define _DRM_SYSFS_H_
>>> +#define DRM_GPU_RESET_FLAG_VRAM_VALID (1 << 0)
>>> +
>>>   struct drm_device;
>>>   struct device;
>>>   struct drm_connector;
>>> @@ -11,6 +13,7 @@ int drm_class_device_register(struct device *dev);
>>>   void drm_class_device_unregister(struct device *dev);
>>>   void drm_sysfs_hotplug_event(struct drm_device *dev);
>>> +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, 
>>> uint32_t reset_flags);
>>>   void drm_sysfs_connector_hotplug_event(struct drm_connector 
>>> *connector);
>>>   void drm_sysfs_connector_status_event(struct drm_connector 
>>> *connector,
>>>                         struct drm_property *property);

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] drm: Add GPU reset sysfs event
  2022-03-08 16:25 ` Andrey Grodzovsky
  2022-03-08 16:35   ` Sharma, Shashank
@ 2022-03-08 16:36   ` Lazar, Lijo
  2022-03-08 16:39     ` Andrey Grodzovsky
  1 sibling, 1 reply; 32+ messages in thread
From: Lazar, Lijo @ 2022-03-08 16:36 UTC (permalink / raw)
  To: Grodzovsky, Andrey, Shashank Sharma, amd-gfx
  Cc: Deucher, Alexander, Limonciello, Mario, Somalapuram, Amaranath,
	Koenig, Christian, Sharma, Shashank

[-- Attachment #1: Type: text/plain, Size: 4709 bytes --]

[AMD Official Use Only]

+Mario

I guess that means the functionality needs to be present in amdgpu for APUs also. Presently, this is taken care by PMC driver for APUs.

Thanks,
Lijo
________________________________
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Sent: Tuesday, March 8, 2022 9:55:03 PM
To: Shashank Sharma <contactshashanksharma@gmail.com>; amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Somalapuram, Amaranath <Amaranath.Somalapuram@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Sharma, Shashank <Shashank.Sharma@amd.com>
Subject: Re: [PATCH 1/2] drm: Add GPU reset sysfs event


On 2022-03-07 11:26, Shashank Sharma wrote:
> From: Shashank Sharma <shashank.sharma@amd.com>
>
> This patch adds a new sysfs event, which will indicate
> the userland about a GPU reset, and can also provide
> some information like:
> - which PID was involved in the GPU reset
> - what was the GPU status (using flags)
>
> This patch also introduces the first flag of the flags
> bitmap, which can be appended as and when required.


I am reminding again about another important piece of info which you can add
here and that is Smart Trace Buffer dump [1]. The buffer size is HW
specific but
from what I see there is no problem to just amend it as part of envp[]
initialization.
bellow.

The interface to get the buffer is smu_stb_collect_info and usage can be
seen from
frebugfs interface in smu_stb_debugfs_open

[1] - https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.spinics.net%2Flists%2Famd-gfx%2Fmsg70751.html&amp;data=04%7C01%7Clijo.lazar%40amd.com%7C80bc3f07e2d0441d44a108da012036dc%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637823535167679490%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&amp;sdata=53l7KlTf%2BICKkZkLVwFh6nRTjkAh%2FDpOat5DRoyKIx0%3D&amp;reserved=0

Andrey


>
> Cc: Alexandar Deucher <alexander.deucher@amd.com>
> Cc: Christian Koenig <christian.koenig@amd.com>
> Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
> ---
>   drivers/gpu/drm/drm_sysfs.c | 24 ++++++++++++++++++++++++
>   include/drm/drm_sysfs.h     |  3 +++
>   2 files changed, 27 insertions(+)
>
> diff --git a/drivers/gpu/drm/drm_sysfs.c b/drivers/gpu/drm/drm_sysfs.c
> index 430e00b16eec..52a015161431 100644
> --- a/drivers/gpu/drm/drm_sysfs.c
> +++ b/drivers/gpu/drm/drm_sysfs.c
> @@ -409,6 +409,30 @@ void drm_sysfs_hotplug_event(struct drm_device *dev)
>   }
>   EXPORT_SYMBOL(drm_sysfs_hotplug_event);
>
> +/**
> + * drm_sysfs_reset_event - generate a DRM uevent to indicate GPU reset
> + * @dev: DRM device
> + * @pid: The process ID involve with the reset
> + * @flags: Any other information about the GPU status
> + *
> + * Send a uevent for the DRM device specified by @dev. This indicates
> + * user that a GPU reset has occurred, so that the interested client
> + * can take any recovery or profiling measure, when required.
> + */
> +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, uint32_t flags)
> +{
> +     unsigned char pid_str[21], flags_str[15];
> +     unsigned char reset_str[] = "RESET=1";
> +     char *envp[] = { reset_str, pid_str, flags_str, NULL };
> +
> +     DRM_DEBUG("generating reset event\n");
> +
> +     snprintf(pid_str, ARRAY_SIZE(pid_str), "PID=%lu", pid);
> +     snprintf(flags_str, ARRAY_SIZE(flags_str), "FLAGS=%u", flags);
> +     kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp);
> +}
> +EXPORT_SYMBOL(drm_sysfs_reset_event);
> +
>   /**
>    * drm_sysfs_connector_hotplug_event - generate a DRM uevent for any connector
>    * change
> diff --git a/include/drm/drm_sysfs.h b/include/drm/drm_sysfs.h
> index 6273cac44e47..63f00fe8054c 100644
> --- a/include/drm/drm_sysfs.h
> +++ b/include/drm/drm_sysfs.h
> @@ -2,6 +2,8 @@
>   #ifndef _DRM_SYSFS_H_
>   #define _DRM_SYSFS_H_
>
> +#define DRM_GPU_RESET_FLAG_VRAM_VALID (1 << 0)
> +
>   struct drm_device;
>   struct device;
>   struct drm_connector;
> @@ -11,6 +13,7 @@ int drm_class_device_register(struct device *dev);
>   void drm_class_device_unregister(struct device *dev);
>
>   void drm_sysfs_hotplug_event(struct drm_device *dev);
> +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, uint32_t reset_flags);
>   void drm_sysfs_connector_hotplug_event(struct drm_connector *connector);
>   void drm_sysfs_connector_status_event(struct drm_connector *connector,
>                                      struct drm_property *property);

[-- Attachment #2: Type: text/html, Size: 7442 bytes --]

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] drm: Add GPU reset sysfs event
  2022-03-08 16:36   ` Lazar, Lijo
@ 2022-03-08 16:39     ` Andrey Grodzovsky
  2022-03-08 16:46       ` Sharma, Shashank
  0 siblings, 1 reply; 32+ messages in thread
From: Andrey Grodzovsky @ 2022-03-08 16:39 UTC (permalink / raw)
  To: Lazar, Lijo, Shashank Sharma, amd-gfx
  Cc: Deucher, Alexander, Limonciello, Mario, Somalapuram, Amaranath,
	Koenig, Christian, Sharma, Shashank

[-- Attachment #1: Type: text/plain, Size: 5736 bytes --]

As long as PMC driver provides clear interface to retrieve the info 
there should be no issue to call either amdgpu interface or PMC 
interface using IS_APU (or something alike in the code)
We probably should add a wrapper function around this logic in amdgpu.

Andrey

On 2022-03-08 11:36, Lazar, Lijo wrote:
>
> [AMD Official Use Only]
>
>
> +Mario
>
> I guess that means the functionality needs to be present in amdgpu for 
> APUs also. Presently, this is taken care by PMC driver for APUs.
>
> Thanks,
> Lijo
> ------------------------------------------------------------------------
> *From:* amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of 
> Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> *Sent:* Tuesday, March 8, 2022 9:55:03 PM
> *To:* Shashank Sharma <contactshashanksharma@gmail.com>; 
> amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
> *Cc:* Deucher, Alexander <Alexander.Deucher@amd.com>; Somalapuram, 
> Amaranath <Amaranath.Somalapuram@amd.com>; Koenig, Christian 
> <Christian.Koenig@amd.com>; Sharma, Shashank <Shashank.Sharma@amd.com>
> *Subject:* Re: [PATCH 1/2] drm: Add GPU reset sysfs event
>
> On 2022-03-07 11:26, Shashank Sharma wrote:
> > From: Shashank Sharma <shashank.sharma@amd.com>
> >
> > This patch adds a new sysfs event, which will indicate
> > the userland about a GPU reset, and can also provide
> > some information like:
> > - which PID was involved in the GPU reset
> > - what was the GPU status (using flags)
> >
> > This patch also introduces the first flag of the flags
> > bitmap, which can be appended as and when required.
>
>
> I am reminding again about another important piece of info which you 
> can add
> here and that is Smart Trace Buffer dump [1]. The buffer size is HW
> specific but
> from what I see there is no problem to just amend it as part of envp[]
> initialization.
> bellow.
>
> The interface to get the buffer is smu_stb_collect_info and usage can be
> seen from
> frebugfs interface in smu_stb_debugfs_open
>
> [1] - 
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.spinics.net%2Flists%2Famd-gfx%2Fmsg70751.html&amp;data=04%7C01%7Clijo.lazar%40amd.com%7C80bc3f07e2d0441d44a108da012036dc%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637823535167679490%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&amp;sdata=53l7KlTf%2BICKkZkLVwFh6nRTjkAh%2FDpOat5DRoyKIx0%3D&amp;reserved=0 
> <https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.spinics.net%2Flists%2Famd-gfx%2Fmsg70751.html&amp;data=04%7C01%7Clijo.lazar%40amd.com%7C80bc3f07e2d0441d44a108da012036dc%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637823535167679490%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&amp;sdata=53l7KlTf%2BICKkZkLVwFh6nRTjkAh%2FDpOat5DRoyKIx0%3D&amp;reserved=0>
>
> Andrey
>
>
> >
> > Cc: Alexandar Deucher <alexander.deucher@amd.com>
> > Cc: Christian Koenig <christian.koenig@amd.com>
> > Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
> > ---
> >   drivers/gpu/drm/drm_sysfs.c | 24 ++++++++++++++++++++++++
> >   include/drm/drm_sysfs.h     |  3 +++
> >   2 files changed, 27 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/drm_sysfs.c b/drivers/gpu/drm/drm_sysfs.c
> > index 430e00b16eec..52a015161431 100644
> > --- a/drivers/gpu/drm/drm_sysfs.c
> > +++ b/drivers/gpu/drm/drm_sysfs.c
> > @@ -409,6 +409,30 @@ void drm_sysfs_hotplug_event(struct drm_device 
> *dev)
> >   }
> >   EXPORT_SYMBOL(drm_sysfs_hotplug_event);
> >
> > +/**
> > + * drm_sysfs_reset_event - generate a DRM uevent to indicate GPU reset
> > + * @dev: DRM device
> > + * @pid: The process ID involve with the reset
> > + * @flags: Any other information about the GPU status
> > + *
> > + * Send a uevent for the DRM device specified by @dev. This indicates
> > + * user that a GPU reset has occurred, so that the interested client
> > + * can take any recovery or profiling measure, when required.
> > + */
> > +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, 
> uint32_t flags)
> > +{
> > +     unsigned char pid_str[21], flags_str[15];
> > +     unsigned char reset_str[] = "RESET=1";
> > +     char *envp[] = { reset_str, pid_str, flags_str, NULL };
> > +
> > +     DRM_DEBUG("generating reset event\n");
> > +
> > +     snprintf(pid_str, ARRAY_SIZE(pid_str), "PID=%lu", pid);
> > +     snprintf(flags_str, ARRAY_SIZE(flags_str), "FLAGS=%u", flags);
> > + kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp);
> > +}
> > +EXPORT_SYMBOL(drm_sysfs_reset_event);
> > +
> >   /**
> >    * drm_sysfs_connector_hotplug_event - generate a DRM uevent for 
> any connector
> >    * change
> > diff --git a/include/drm/drm_sysfs.h b/include/drm/drm_sysfs.h
> > index 6273cac44e47..63f00fe8054c 100644
> > --- a/include/drm/drm_sysfs.h
> > +++ b/include/drm/drm_sysfs.h
> > @@ -2,6 +2,8 @@
> >   #ifndef _DRM_SYSFS_H_
> >   #define _DRM_SYSFS_H_
> >
> > +#define DRM_GPU_RESET_FLAG_VRAM_VALID (1 << 0)
> > +
> >   struct drm_device;
> >   struct device;
> >   struct drm_connector;
> > @@ -11,6 +13,7 @@ int drm_class_device_register(struct device *dev);
> >   void drm_class_device_unregister(struct device *dev);
> >
> >   void drm_sysfs_hotplug_event(struct drm_device *dev);
> > +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, 
> uint32_t reset_flags);
> >   void drm_sysfs_connector_hotplug_event(struct drm_connector 
> *connector);
> >   void drm_sysfs_connector_status_event(struct drm_connector *connector,
> >                                      struct drm_property *property);

[-- Attachment #2: Type: text/html, Size: 11451 bytes --]

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] drm: Add GPU reset sysfs event
  2022-03-08 11:56       ` Sharma, Shashank
  2022-03-08 15:37         ` Somalapuram, Amaranath
@ 2022-03-08 16:40         ` Sharma, Shashank
  2022-03-09  8:05           ` Christian König
  1 sibling, 1 reply; 32+ messages in thread
From: Sharma, Shashank @ 2022-03-08 16:40 UTC (permalink / raw)
  To: Christian König, Christian König, Shashank Sharma, amd-gfx
  Cc: Alexandar Deucher, amaranath.somalapuram



On 3/8/2022 12:56 PM, Sharma, Shashank wrote:
> 
> 
> On 3/8/2022 11:32 AM, Christian König wrote:
>> Am 08.03.22 um 10:31 schrieb Sharma, Shashank:
>>>
>>>
>>> On 3/8/2022 8:06 AM, Christian König wrote:
>>>> Am 07.03.22 um 17:26 schrieb Shashank Sharma:
>>>>> From: Shashank Sharma <shashank.sharma@amd.com>
>>>>>
>>>>> This patch adds a new sysfs event, which will indicate
>>>>> the userland about a GPU reset, and can also provide
>>>>> some information like:
>>>>> - which PID was involved in the GPU reset
>>>>> - what was the GPU status (using flags)
>>>>>
>>>>> This patch also introduces the first flag of the flags
>>>>> bitmap, which can be appended as and when required.
>>>>
>>>> Make sure to CC the dri-devel mailing list when reviewing this.
>>> Got it,
>>>
>>> I was also curious if we want to move the reset_ctx structure itself 
>>> to DRM layer, like
>>> drm_reset_event_ctx {
>>>     u32 pid;
>>>     u32 flags;
>>>     char process_name[64];
>>> };
>>
>> I was entertaining that thought as well.
>>
>> But if we do this I would go even a step further and also move the 
>> reset work item into the DRM layer as well.
>>
>> You might also look like into migrating the exiting i915 code which 
>> uses udev to signal GPU resets to this function as well.
>>
>> Regards,
>> Christian.
> 
> That seems like a good idea, let me quickly dive into i915 and check 
> this out.
> 
> Shashank

I had a quick look at I915, and it looks like both I915 and AMDGPU 
drivers have very different methods of passing the data to the work 
function, via different internal structures. Which means it would be 
much additional work in both the drivers to move the work function 
itself in the DRM layer.

To me, now it seems like it would be better if we can just provide this 
interface to send the uevent and its structure, and the drivers can 
collect their information and pass it to WQ in their own way.

How do you feel about it ?

- Shashank

>>
>>>
>>> and then:
>>> void drm_sysfs_reset_event(struct drm_device *dev, 
>>> drm_reset_event_ctx *ctx);
>>>
>>>>
>>>>>
>>>>> Cc: Alexandar Deucher <alexander.deucher@amd.com>
>>>>> Cc: Christian Koenig <christian.koenig@amd.com>
>>>>> Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
>>>>> ---
>>>>>   drivers/gpu/drm/drm_sysfs.c | 24 ++++++++++++++++++++++++
>>>>>   include/drm/drm_sysfs.h     |  3 +++
>>>>>   2 files changed, 27 insertions(+)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/drm_sysfs.c b/drivers/gpu/drm/drm_sysfs.c
>>>>> index 430e00b16eec..52a015161431 100644
>>>>> --- a/drivers/gpu/drm/drm_sysfs.c
>>>>> +++ b/drivers/gpu/drm/drm_sysfs.c
>>>>> @@ -409,6 +409,30 @@ void drm_sysfs_hotplug_event(struct drm_device 
>>>>> *dev)
>>>>>   }
>>>>>   EXPORT_SYMBOL(drm_sysfs_hotplug_event);
>>>>> +/**
>>>>> + * drm_sysfs_reset_event - generate a DRM uevent to indicate GPU 
>>>>> reset
>>>>> + * @dev: DRM device
>>>>> + * @pid: The process ID involve with the reset
>>>>> + * @flags: Any other information about the GPU status
>>>>> + *
>>>>> + * Send a uevent for the DRM device specified by @dev. This indicates
>>>>> + * user that a GPU reset has occurred, so that the interested client
>>>>> + * can take any recovery or profiling measure, when required.
>>>>> + */
>>>>> +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, 
>>>>> uint32_t flags)
>>>>
>>>> The PID is usually only 32bit, but even better would be to use pid_t.
>>>>
>>>>> +{
>>>>> +    unsigned char pid_str[21], flags_str[15];
>>>>> +    unsigned char reset_str[] = "RESET=1";
>>>>> +    char *envp[] = { reset_str, pid_str, flags_str, NULL };
>>>>> +
>>>>> +    DRM_DEBUG("generating reset event\n");
>>>>> +
>>>>> +    snprintf(pid_str, ARRAY_SIZE(pid_str), "PID=%lu", pid);
>>>>> +    snprintf(flags_str, ARRAY_SIZE(flags_str), "FLAGS=%u", flags);
>>>>> + kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp);
>>>>> +}
>>>>> +EXPORT_SYMBOL(drm_sysfs_reset_event);
>>>>> +
>>>>>   /**
>>>>>    * drm_sysfs_connector_hotplug_event - generate a DRM uevent for 
>>>>> any connector
>>>>>    * change
>>>>> diff --git a/include/drm/drm_sysfs.h b/include/drm/drm_sysfs.h
>>>>> index 6273cac44e47..63f00fe8054c 100644
>>>>> --- a/include/drm/drm_sysfs.h
>>>>> +++ b/include/drm/drm_sysfs.h
>>>>> @@ -2,6 +2,8 @@
>>>>>   #ifndef _DRM_SYSFS_H_
>>>>>   #define _DRM_SYSFS_H_
>>>>> +#define DRM_GPU_RESET_FLAG_VRAM_VALID (1 << 0)
>>>>
>>>> Probably better to define that the other way around, e.g. 
>>>> DRM_GPU_RESET_FLAG_VRAM_LOST.
>>>>
>>>> Apart from that looks good to me.
>>>>
>>> Got it, noted.
>>> - Shashank
>>>
>>>> Christian.
>>>>
>>>>> +
>>>>>   struct drm_device;
>>>>>   struct device;
>>>>>   struct drm_connector;
>>>>> @@ -11,6 +13,7 @@ int drm_class_device_register(struct device *dev);
>>>>>   void drm_class_device_unregister(struct device *dev);
>>>>>   void drm_sysfs_hotplug_event(struct drm_device *dev);
>>>>> +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, 
>>>>> uint32_t reset_flags);
>>>>>   void drm_sysfs_connector_hotplug_event(struct drm_connector 
>>>>> *connector);
>>>>>   void drm_sysfs_connector_status_event(struct drm_connector 
>>>>> *connector,
>>>>>                         struct drm_property *property);
>>>>
>>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] drm: Add GPU reset sysfs event
  2022-03-08 16:39     ` Andrey Grodzovsky
@ 2022-03-08 16:46       ` Sharma, Shashank
  2022-03-08 16:55         ` Andrey Grodzovsky
  0 siblings, 1 reply; 32+ messages in thread
From: Sharma, Shashank @ 2022-03-08 16:46 UTC (permalink / raw)
  To: Andrey Grodzovsky, Lazar, Lijo, Shashank Sharma, amd-gfx
  Cc: Deucher, Alexander, Somalapuram, Amaranath, Koenig, Christian,
	Limonciello, Mario

I have a very limited understanding of PMC driver and its interfaces, so 
I would just go ahead and rely on Andrey's judgement/recommendation on 
this :)

- Shashank

On 3/8/2022 5:39 PM, Andrey Grodzovsky wrote:
> As long as PMC driver provides clear interface to retrieve the info 
> there should be no issue to call either amdgpu interface or PMC 
> interface using IS_APU (or something alike in the code)
> We probably should add a wrapper function around this logic in amdgpu.
> 
> Andrey
> 
> On 2022-03-08 11:36, Lazar, Lijo wrote:
>>
>> [AMD Official Use Only]
>>
>>
>> +Mario
>>
>> I guess that means the functionality needs to be present in amdgpu for 
>> APUs also. Presently, this is taken care by PMC driver for APUs.
>>
>> Thanks,
>> Lijo
>> ------------------------------------------------------------------------
>> *From:* amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of 
>> Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>> *Sent:* Tuesday, March 8, 2022 9:55:03 PM
>> *To:* Shashank Sharma <contactshashanksharma@gmail.com>; 
>> amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
>> *Cc:* Deucher, Alexander <Alexander.Deucher@amd.com>; Somalapuram, 
>> Amaranath <Amaranath.Somalapuram@amd.com>; Koenig, Christian 
>> <Christian.Koenig@amd.com>; Sharma, Shashank <Shashank.Sharma@amd.com>
>> *Subject:* Re: [PATCH 1/2] drm: Add GPU reset sysfs event
>>
>> On 2022-03-07 11:26, Shashank Sharma wrote:
>> > From: Shashank Sharma <shashank.sharma@amd.com>
>> >
>> > This patch adds a new sysfs event, which will indicate
>> > the userland about a GPU reset, and can also provide
>> > some information like:
>> > - which PID was involved in the GPU reset
>> > - what was the GPU status (using flags)
>> >
>> > This patch also introduces the first flag of the flags
>> > bitmap, which can be appended as and when required.
>>
>>
>> I am reminding again about another important piece of info which you 
>> can add
>> here and that is Smart Trace Buffer dump [1]. The buffer size is HW
>> specific but
>> from what I see there is no problem to just amend it as part of envp[]
>> initialization.
>> bellow.
>>
>> The interface to get the buffer is smu_stb_collect_info and usage can be
>> seen from
>> frebugfs interface in smu_stb_debugfs_open
>>
>> [1] - 
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.spinics.net%2Flists%2Famd-gfx%2Fmsg70751.html&amp;data=04%7C01%7Clijo.lazar%40amd.com%7C80bc3f07e2d0441d44a108da012036dc%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637823535167679490%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&amp;sdata=53l7KlTf%2BICKkZkLVwFh6nRTjkAh%2FDpOat5DRoyKIx0%3D&amp;reserved=0 
>> <https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.spinics.net%2Flists%2Famd-gfx%2Fmsg70751.html&amp;data=04%7C01%7Clijo.lazar%40amd.com%7C80bc3f07e2d0441d44a108da012036dc%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637823535167679490%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&amp;sdata=53l7KlTf%2BICKkZkLVwFh6nRTjkAh%2FDpOat5DRoyKIx0%3D&amp;reserved=0>
>>
>> Andrey
>>
>>
>> >
>> > Cc: Alexandar Deucher <alexander.deucher@amd.com>
>> > Cc: Christian Koenig <christian.koenig@amd.com>
>> > Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
>> > ---
>> >   drivers/gpu/drm/drm_sysfs.c | 24 ++++++++++++++++++++++++
>> >   include/drm/drm_sysfs.h     |  3 +++
>> >   2 files changed, 27 insertions(+)
>> >
>> > diff --git a/drivers/gpu/drm/drm_sysfs.c b/drivers/gpu/drm/drm_sysfs.c
>> > index 430e00b16eec..52a015161431 100644
>> > --- a/drivers/gpu/drm/drm_sysfs.c
>> > +++ b/drivers/gpu/drm/drm_sysfs.c
>> > @@ -409,6 +409,30 @@ void drm_sysfs_hotplug_event(struct drm_device 
>> *dev)
>> >   }
>> >   EXPORT_SYMBOL(drm_sysfs_hotplug_event);
>> >
>> > +/**
>> > + * drm_sysfs_reset_event - generate a DRM uevent to indicate GPU reset
>> > + * @dev: DRM device
>> > + * @pid: The process ID involve with the reset
>> > + * @flags: Any other information about the GPU status
>> > + *
>> > + * Send a uevent for the DRM device specified by @dev. This indicates
>> > + * user that a GPU reset has occurred, so that the interested client
>> > + * can take any recovery or profiling measure, when required.
>> > + */
>> > +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, 
>> uint32_t flags)
>> > +{
>> > +     unsigned char pid_str[21], flags_str[15];
>> > +     unsigned char reset_str[] = "RESET=1";
>> > +     char *envp[] = { reset_str, pid_str, flags_str, NULL };
>> > +
>> > +     DRM_DEBUG("generating reset event\n");
>> > +
>> > +     snprintf(pid_str, ARRAY_SIZE(pid_str), "PID=%lu", pid);
>> > +     snprintf(flags_str, ARRAY_SIZE(flags_str), "FLAGS=%u", flags);
>> > + kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp);
>> > +}
>> > +EXPORT_SYMBOL(drm_sysfs_reset_event);
>> > +
>> >   /**
>> >    * drm_sysfs_connector_hotplug_event - generate a DRM uevent for 
>> any connector
>> >    * change
>> > diff --git a/include/drm/drm_sysfs.h b/include/drm/drm_sysfs.h
>> > index 6273cac44e47..63f00fe8054c 100644
>> > --- a/include/drm/drm_sysfs.h
>> > +++ b/include/drm/drm_sysfs.h
>> > @@ -2,6 +2,8 @@
>> >   #ifndef _DRM_SYSFS_H_
>> >   #define _DRM_SYSFS_H_
>> >
>> > +#define DRM_GPU_RESET_FLAG_VRAM_VALID (1 << 0)
>> > +
>> >   struct drm_device;
>> >   struct device;
>> >   struct drm_connector;
>> > @@ -11,6 +13,7 @@ int drm_class_device_register(struct device *dev);
>> >   void drm_class_device_unregister(struct device *dev);
>> >
>> >   void drm_sysfs_hotplug_event(struct drm_device *dev);
>> > +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, 
>> uint32_t reset_flags);
>> >   void drm_sysfs_connector_hotplug_event(struct drm_connector 
>> *connector);
>> >   void drm_sysfs_connector_status_event(struct drm_connector *connector,
>> >                                      struct drm_property *property);

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] drm: Add GPU reset sysfs event
  2022-03-08 16:46       ` Sharma, Shashank
@ 2022-03-08 16:55         ` Andrey Grodzovsky
  2022-03-08 16:57           ` Sharma, Shashank
  0 siblings, 1 reply; 32+ messages in thread
From: Andrey Grodzovsky @ 2022-03-08 16:55 UTC (permalink / raw)
  To: Sharma, Shashank, Lazar, Lijo, Shashank Sharma, amd-gfx
  Cc: Deucher, Alexander, Somalapuram, Amaranath, Koenig, Christian,
	Limonciello, Mario

You can read on their side here - 
https://www.phoronix.com/scan.php?page=news_item&px=AMD-STB-Linux-5.17 
and see their patch. THey don't have as clean
interface as we do to retrieve the buffer and currently it's hard-coded 
for debugfs dump but it looks like pretty straight forward to expose 
their buffer to external
client like amdgpu.

Andrey

On 2022-03-08 11:46, Sharma, Shashank wrote:
> I have a very limited understanding of PMC driver and its interfaces, 
> so I would just go ahead and rely on Andrey's judgement/recommendation 
> on this :)
>
> - Shashank
>
> On 3/8/2022 5:39 PM, Andrey Grodzovsky wrote:
>> As long as PMC driver provides clear interface to retrieve the info 
>> there should be no issue to call either amdgpu interface or PMC 
>> interface using IS_APU (or something alike in the code)
>> We probably should add a wrapper function around this logic in amdgpu.
>>
>> Andrey
>>
>> On 2022-03-08 11:36, Lazar, Lijo wrote:
>>>
>>> [AMD Official Use Only]
>>>
>>>
>>> +Mario
>>>
>>> I guess that means the functionality needs to be present in amdgpu 
>>> for APUs also. Presently, this is taken care by PMC driver for APUs.
>>>
>>> Thanks,
>>> Lijo
>>> ------------------------------------------------------------------------ 
>>>
>>> *From:* amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of 
>>> Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>> *Sent:* Tuesday, March 8, 2022 9:55:03 PM
>>> *To:* Shashank Sharma <contactshashanksharma@gmail.com>; 
>>> amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
>>> *Cc:* Deucher, Alexander <Alexander.Deucher@amd.com>; Somalapuram, 
>>> Amaranath <Amaranath.Somalapuram@amd.com>; Koenig, Christian 
>>> <Christian.Koenig@amd.com>; Sharma, Shashank <Shashank.Sharma@amd.com>
>>> *Subject:* Re: [PATCH 1/2] drm: Add GPU reset sysfs event
>>>
>>> On 2022-03-07 11:26, Shashank Sharma wrote:
>>> > From: Shashank Sharma <shashank.sharma@amd.com>
>>> >
>>> > This patch adds a new sysfs event, which will indicate
>>> > the userland about a GPU reset, and can also provide
>>> > some information like:
>>> > - which PID was involved in the GPU reset
>>> > - what was the GPU status (using flags)
>>> >
>>> > This patch also introduces the first flag of the flags
>>> > bitmap, which can be appended as and when required.
>>>
>>>
>>> I am reminding again about another important piece of info which you 
>>> can add
>>> here and that is Smart Trace Buffer dump [1]. The buffer size is HW
>>> specific but
>>> from what I see there is no problem to just amend it as part of envp[]
>>> initialization.
>>> bellow.
>>>
>>> The interface to get the buffer is smu_stb_collect_info and usage 
>>> can be
>>> seen from
>>> frebugfs interface in smu_stb_debugfs_open
>>>
>>> [1] - 
>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.spinics.net%2Flists%2Famd-gfx%2Fmsg70751.html&amp;data=04%7C01%7Clijo.lazar%40amd.com%7C80bc3f07e2d0441d44a108da012036dc%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637823535167679490%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&amp;sdata=53l7KlTf%2BICKkZkLVwFh6nRTjkAh%2FDpOat5DRoyKIx0%3D&amp;reserved=0 
>>> <https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.spinics.net%2Flists%2Famd-gfx%2Fmsg70751.html&amp;data=04%7C01%7Clijo.lazar%40amd.com%7C80bc3f07e2d0441d44a108da012036dc%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637823535167679490%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&amp;sdata=53l7KlTf%2BICKkZkLVwFh6nRTjkAh%2FDpOat5DRoyKIx0%3D&amp;reserved=0> 
>>>
>>>
>>> Andrey
>>>
>>>
>>> >
>>> > Cc: Alexandar Deucher <alexander.deucher@amd.com>
>>> > Cc: Christian Koenig <christian.koenig@amd.com>
>>> > Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
>>> > ---
>>> >   drivers/gpu/drm/drm_sysfs.c | 24 ++++++++++++++++++++++++
>>> >   include/drm/drm_sysfs.h     |  3 +++
>>> >   2 files changed, 27 insertions(+)
>>> >
>>> > diff --git a/drivers/gpu/drm/drm_sysfs.c 
>>> b/drivers/gpu/drm/drm_sysfs.c
>>> > index 430e00b16eec..52a015161431 100644
>>> > --- a/drivers/gpu/drm/drm_sysfs.c
>>> > +++ b/drivers/gpu/drm/drm_sysfs.c
>>> > @@ -409,6 +409,30 @@ void drm_sysfs_hotplug_event(struct 
>>> drm_device *dev)
>>> >   }
>>> >   EXPORT_SYMBOL(drm_sysfs_hotplug_event);
>>> >
>>> > +/**
>>> > + * drm_sysfs_reset_event - generate a DRM uevent to indicate GPU 
>>> reset
>>> > + * @dev: DRM device
>>> > + * @pid: The process ID involve with the reset
>>> > + * @flags: Any other information about the GPU status
>>> > + *
>>> > + * Send a uevent for the DRM device specified by @dev. This 
>>> indicates
>>> > + * user that a GPU reset has occurred, so that the interested client
>>> > + * can take any recovery or profiling measure, when required.
>>> > + */
>>> > +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, 
>>> uint32_t flags)
>>> > +{
>>> > +     unsigned char pid_str[21], flags_str[15];
>>> > +     unsigned char reset_str[] = "RESET=1";
>>> > +     char *envp[] = { reset_str, pid_str, flags_str, NULL };
>>> > +
>>> > +     DRM_DEBUG("generating reset event\n");
>>> > +
>>> > +     snprintf(pid_str, ARRAY_SIZE(pid_str), "PID=%lu", pid);
>>> > +     snprintf(flags_str, ARRAY_SIZE(flags_str), "FLAGS=%u", flags);
>>> > + kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp);
>>> > +}
>>> > +EXPORT_SYMBOL(drm_sysfs_reset_event);
>>> > +
>>> >   /**
>>> >    * drm_sysfs_connector_hotplug_event - generate a DRM uevent for 
>>> any connector
>>> >    * change
>>> > diff --git a/include/drm/drm_sysfs.h b/include/drm/drm_sysfs.h
>>> > index 6273cac44e47..63f00fe8054c 100644
>>> > --- a/include/drm/drm_sysfs.h
>>> > +++ b/include/drm/drm_sysfs.h
>>> > @@ -2,6 +2,8 @@
>>> >   #ifndef _DRM_SYSFS_H_
>>> >   #define _DRM_SYSFS_H_
>>> >
>>> > +#define DRM_GPU_RESET_FLAG_VRAM_VALID (1 << 0)
>>> > +
>>> >   struct drm_device;
>>> >   struct device;
>>> >   struct drm_connector;
>>> > @@ -11,6 +13,7 @@ int drm_class_device_register(struct device *dev);
>>> >   void drm_class_device_unregister(struct device *dev);
>>> >
>>> >   void drm_sysfs_hotplug_event(struct drm_device *dev);
>>> > +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, 
>>> uint32_t reset_flags);
>>> >   void drm_sysfs_connector_hotplug_event(struct drm_connector 
>>> *connector);
>>> >   void drm_sysfs_connector_status_event(struct drm_connector 
>>> *connector,
>>> >                                      struct drm_property *property);

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] drm: Add GPU reset sysfs event
  2022-03-08 16:55         ` Andrey Grodzovsky
@ 2022-03-08 16:57           ` Sharma, Shashank
  2022-03-08 17:04             ` Somalapuram, Amaranath
  2022-03-08 17:27             ` Limonciello, Mario
  0 siblings, 2 replies; 32+ messages in thread
From: Sharma, Shashank @ 2022-03-08 16:57 UTC (permalink / raw)
  To: Andrey Grodzovsky, Lazar, Lijo, Shashank Sharma, amd-gfx
  Cc: Deucher, Alexander, Somalapuram, Amaranath, Koenig, Christian,
	Limonciello, Mario



On 3/8/2022 5:55 PM, Andrey Grodzovsky wrote:
> You can read on their side here - 
> https://www.phoronix.com/scan.php?page=news_item&px=AMD-STB-Linux-5.17 
> and see their patch. THey don't have as clean
> interface as we do to retrieve the buffer and currently it's hard-coded 
> for debugfs dump but it looks like pretty straight forward to expose 
> their buffer to external
> client like amdgpu.

Noted, thanks for the pointer.
- Shashank
> 
> Andrey
> 
> On 2022-03-08 11:46, Sharma, Shashank wrote:
>> I have a very limited understanding of PMC driver and its interfaces, 
>> so I would just go ahead and rely on Andrey's judgement/recommendation 
>> on this :)
>>
>> - Shashank
>>
>> On 3/8/2022 5:39 PM, Andrey Grodzovsky wrote:
>>> As long as PMC driver provides clear interface to retrieve the info 
>>> there should be no issue to call either amdgpu interface or PMC 
>>> interface using IS_APU (or something alike in the code)
>>> We probably should add a wrapper function around this logic in amdgpu.
>>>
>>> Andrey
>>>
>>> On 2022-03-08 11:36, Lazar, Lijo wrote:
>>>>
>>>> [AMD Official Use Only]
>>>>
>>>>
>>>> +Mario
>>>>
>>>> I guess that means the functionality needs to be present in amdgpu 
>>>> for APUs also. Presently, this is taken care by PMC driver for APUs.
>>>>
>>>> Thanks,
>>>> Lijo
>>>> ------------------------------------------------------------------------ 
>>>>
>>>> *From:* amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of 
>>>> Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>> *Sent:* Tuesday, March 8, 2022 9:55:03 PM
>>>> *To:* Shashank Sharma <contactshashanksharma@gmail.com>; 
>>>> amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
>>>> *Cc:* Deucher, Alexander <Alexander.Deucher@amd.com>; Somalapuram, 
>>>> Amaranath <Amaranath.Somalapuram@amd.com>; Koenig, Christian 
>>>> <Christian.Koenig@amd.com>; Sharma, Shashank <Shashank.Sharma@amd.com>
>>>> *Subject:* Re: [PATCH 1/2] drm: Add GPU reset sysfs event
>>>>
>>>> On 2022-03-07 11:26, Shashank Sharma wrote:
>>>> > From: Shashank Sharma <shashank.sharma@amd.com>
>>>> >
>>>> > This patch adds a new sysfs event, which will indicate
>>>> > the userland about a GPU reset, and can also provide
>>>> > some information like:
>>>> > - which PID was involved in the GPU reset
>>>> > - what was the GPU status (using flags)
>>>> >
>>>> > This patch also introduces the first flag of the flags
>>>> > bitmap, which can be appended as and when required.
>>>>
>>>>
>>>> I am reminding again about another important piece of info which you 
>>>> can add
>>>> here and that is Smart Trace Buffer dump [1]. The buffer size is HW
>>>> specific but
>>>> from what I see there is no problem to just amend it as part of envp[]
>>>> initialization.
>>>> bellow.
>>>>
>>>> The interface to get the buffer is smu_stb_collect_info and usage 
>>>> can be
>>>> seen from
>>>> frebugfs interface in smu_stb_debugfs_open
>>>>
>>>> [1] - 
>>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.spinics.net%2Flists%2Famd-gfx%2Fmsg70751.html&amp;data=04%7C01%7Clijo.lazar%40amd.com%7C80bc3f07e2d0441d44a108da012036dc%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637823535167679490%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&amp;sdata=53l7KlTf%2BICKkZkLVwFh6nRTjkAh%2FDpOat5DRoyKIx0%3D&amp;reserved=0 
>>>> <https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.spinics.net%2Flists%2Famd-gfx%2Fmsg70751.html&amp;data=04%7C01%7Clijo.lazar%40amd.com%7C80bc3f07e2d0441d44a108da012036dc%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637823535167679490%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&amp;sdata=53l7KlTf%2BICKkZkLVwFh6nRTjkAh%2FDpOat5DRoyKIx0%3D&amp;reserved=0> 
>>>>
>>>>
>>>> Andrey
>>>>
>>>>
>>>> >
>>>> > Cc: Alexandar Deucher <alexander.deucher@amd.com>
>>>> > Cc: Christian Koenig <christian.koenig@amd.com>
>>>> > Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
>>>> > ---
>>>> >   drivers/gpu/drm/drm_sysfs.c | 24 ++++++++++++++++++++++++
>>>> >   include/drm/drm_sysfs.h     |  3 +++
>>>> >   2 files changed, 27 insertions(+)
>>>> >
>>>> > diff --git a/drivers/gpu/drm/drm_sysfs.c 
>>>> b/drivers/gpu/drm/drm_sysfs.c
>>>> > index 430e00b16eec..52a015161431 100644
>>>> > --- a/drivers/gpu/drm/drm_sysfs.c
>>>> > +++ b/drivers/gpu/drm/drm_sysfs.c
>>>> > @@ -409,6 +409,30 @@ void drm_sysfs_hotplug_event(struct 
>>>> drm_device *dev)
>>>> >   }
>>>> >   EXPORT_SYMBOL(drm_sysfs_hotplug_event);
>>>> >
>>>> > +/**
>>>> > + * drm_sysfs_reset_event - generate a DRM uevent to indicate GPU 
>>>> reset
>>>> > + * @dev: DRM device
>>>> > + * @pid: The process ID involve with the reset
>>>> > + * @flags: Any other information about the GPU status
>>>> > + *
>>>> > + * Send a uevent for the DRM device specified by @dev. This 
>>>> indicates
>>>> > + * user that a GPU reset has occurred, so that the interested client
>>>> > + * can take any recovery or profiling measure, when required.
>>>> > + */
>>>> > +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, 
>>>> uint32_t flags)
>>>> > +{
>>>> > +     unsigned char pid_str[21], flags_str[15];
>>>> > +     unsigned char reset_str[] = "RESET=1";
>>>> > +     char *envp[] = { reset_str, pid_str, flags_str, NULL };
>>>> > +
>>>> > +     DRM_DEBUG("generating reset event\n");
>>>> > +
>>>> > +     snprintf(pid_str, ARRAY_SIZE(pid_str), "PID=%lu", pid);
>>>> > +     snprintf(flags_str, ARRAY_SIZE(flags_str), "FLAGS=%u", flags);
>>>> > + kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp);
>>>> > +}
>>>> > +EXPORT_SYMBOL(drm_sysfs_reset_event);
>>>> > +
>>>> >   /**
>>>> >    * drm_sysfs_connector_hotplug_event - generate a DRM uevent for 
>>>> any connector
>>>> >    * change
>>>> > diff --git a/include/drm/drm_sysfs.h b/include/drm/drm_sysfs.h
>>>> > index 6273cac44e47..63f00fe8054c 100644
>>>> > --- a/include/drm/drm_sysfs.h
>>>> > +++ b/include/drm/drm_sysfs.h
>>>> > @@ -2,6 +2,8 @@
>>>> >   #ifndef _DRM_SYSFS_H_
>>>> >   #define _DRM_SYSFS_H_
>>>> >
>>>> > +#define DRM_GPU_RESET_FLAG_VRAM_VALID (1 << 0)
>>>> > +
>>>> >   struct drm_device;
>>>> >   struct device;
>>>> >   struct drm_connector;
>>>> > @@ -11,6 +13,7 @@ int drm_class_device_register(struct device *dev);
>>>> >   void drm_class_device_unregister(struct device *dev);
>>>> >
>>>> >   void drm_sysfs_hotplug_event(struct drm_device *dev);
>>>> > +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, 
>>>> uint32_t reset_flags);
>>>> >   void drm_sysfs_connector_hotplug_event(struct drm_connector 
>>>> *connector);
>>>> >   void drm_sysfs_connector_status_event(struct drm_connector 
>>>> *connector,
>>>> >                                      struct drm_property *property);

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] drm: Add GPU reset sysfs event
  2022-03-08 16:57           ` Sharma, Shashank
@ 2022-03-08 17:04             ` Somalapuram, Amaranath
  2022-03-08 17:17               ` Andrey Grodzovsky
  2022-03-08 17:27             ` Limonciello, Mario
  1 sibling, 1 reply; 32+ messages in thread
From: Somalapuram, Amaranath @ 2022-03-08 17:04 UTC (permalink / raw)
  To: Sharma, Shashank, Andrey Grodzovsky, Lazar, Lijo,
	Shashank Sharma, amd-gfx
  Cc: Deucher, Alexander, Somalapuram, Amaranath, Koenig, Christian,
	Limonciello, Mario


On 3/8/2022 10:27 PM, Sharma, Shashank wrote:
>
>
> On 3/8/2022 5:55 PM, Andrey Grodzovsky wrote:
>> You can read on their side here - 
>> https://www.phoronix.com/scan.php?page=news_item&px=AMD-STB-Linux-5.17 
>> and see their patch. THey don't have as clean
>> interface as we do to retrieve the buffer and currently it's 
>> hard-coded for debugfs dump but it looks like pretty straight forward 
>> to expose their buffer to external
>> client like amdgpu.
>
Customer requirement is to get reset notification for there daemon with 
other info (like PID process name vram status).

Regards,
S.Amarnath
> Noted, thanks for the pointer.
> - Shashank
>>
>> Andrey
>>
>> On 2022-03-08 11:46, Sharma, Shashank wrote:
>>> I have a very limited understanding of PMC driver and its 
>>> interfaces, so I would just go ahead and rely on Andrey's 
>>> judgement/recommendation on this :)
>>>
>>> - Shashank
>>>
>>> On 3/8/2022 5:39 PM, Andrey Grodzovsky wrote:
>>>> As long as PMC driver provides clear interface to retrieve the info 
>>>> there should be no issue to call either amdgpu interface or PMC 
>>>> interface using IS_APU (or something alike in the code)
>>>> We probably should add a wrapper function around this logic in amdgpu.
>>>>
>>>> Andrey
>>>>
>>>> On 2022-03-08 11:36, Lazar, Lijo wrote:
>>>>>
>>>>> [AMD Official Use Only]
>>>>>
>>>>>
>>>>> +Mario
>>>>>
>>>>> I guess that means the functionality needs to be present in amdgpu 
>>>>> for APUs also. Presently, this is taken care by PMC driver for APUs.
>>>>>
>>>>> Thanks,
>>>>> Lijo
>>>>> ------------------------------------------------------------------------ 
>>>>>
>>>>> *From:* amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf 
>>>>> of Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>> *Sent:* Tuesday, March 8, 2022 9:55:03 PM
>>>>> *To:* Shashank Sharma <contactshashanksharma@gmail.com>; 
>>>>> amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
>>>>> *Cc:* Deucher, Alexander <Alexander.Deucher@amd.com>; Somalapuram, 
>>>>> Amaranath <Amaranath.Somalapuram@amd.com>; Koenig, Christian 
>>>>> <Christian.Koenig@amd.com>; Sharma, Shashank 
>>>>> <Shashank.Sharma@amd.com>
>>>>> *Subject:* Re: [PATCH 1/2] drm: Add GPU reset sysfs event
>>>>>
>>>>> On 2022-03-07 11:26, Shashank Sharma wrote:
>>>>> > From: Shashank Sharma <shashank.sharma@amd.com>
>>>>> >
>>>>> > This patch adds a new sysfs event, which will indicate
>>>>> > the userland about a GPU reset, and can also provide
>>>>> > some information like:
>>>>> > - which PID was involved in the GPU reset
>>>>> > - what was the GPU status (using flags)
>>>>> >
>>>>> > This patch also introduces the first flag of the flags
>>>>> > bitmap, which can be appended as and when required.
>>>>>
>>>>>
>>>>> I am reminding again about another important piece of info which 
>>>>> you can add
>>>>> here and that is Smart Trace Buffer dump [1]. The buffer size is HW
>>>>> specific but
>>>>> from what I see there is no problem to just amend it as part of 
>>>>> envp[]
>>>>> initialization.
>>>>> bellow.
>>>>>
>>>>> The interface to get the buffer is smu_stb_collect_info and usage 
>>>>> can be
>>>>> seen from
>>>>> frebugfs interface in smu_stb_debugfs_open
>>>>>
>>>>> [1] - 
>>>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.spinics.net%2Flists%2Famd-gfx%2Fmsg70751.html&amp;data=04%7C01%7Clijo.lazar%40amd.com%7C80bc3f07e2d0441d44a108da012036dc%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637823535167679490%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&amp;sdata=53l7KlTf%2BICKkZkLVwFh6nRTjkAh%2FDpOat5DRoyKIx0%3D&amp;reserved=0 
>>>>> <https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.spinics.net%2Flists%2Famd-gfx%2Fmsg70751.html&amp;data=04%7C01%7Clijo.lazar%40amd.com%7C80bc3f07e2d0441d44a108da012036dc%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637823535167679490%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&amp;sdata=53l7KlTf%2BICKkZkLVwFh6nRTjkAh%2FDpOat5DRoyKIx0%3D&amp;reserved=0> 
>>>>>
>>>>>
>>>>> Andrey
>>>>>
>>>>>
>>>>> >
>>>>> > Cc: Alexandar Deucher <alexander.deucher@amd.com>
>>>>> > Cc: Christian Koenig <christian.koenig@amd.com>
>>>>> > Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
>>>>> > ---
>>>>> >   drivers/gpu/drm/drm_sysfs.c | 24 ++++++++++++++++++++++++
>>>>> >   include/drm/drm_sysfs.h     |  3 +++
>>>>> >   2 files changed, 27 insertions(+)
>>>>> >
>>>>> > diff --git a/drivers/gpu/drm/drm_sysfs.c 
>>>>> b/drivers/gpu/drm/drm_sysfs.c
>>>>> > index 430e00b16eec..52a015161431 100644
>>>>> > --- a/drivers/gpu/drm/drm_sysfs.c
>>>>> > +++ b/drivers/gpu/drm/drm_sysfs.c
>>>>> > @@ -409,6 +409,30 @@ void drm_sysfs_hotplug_event(struct 
>>>>> drm_device *dev)
>>>>> >   }
>>>>> >   EXPORT_SYMBOL(drm_sysfs_hotplug_event);
>>>>> >
>>>>> > +/**
>>>>> > + * drm_sysfs_reset_event - generate a DRM uevent to indicate 
>>>>> GPU reset
>>>>> > + * @dev: DRM device
>>>>> > + * @pid: The process ID involve with the reset
>>>>> > + * @flags: Any other information about the GPU status
>>>>> > + *
>>>>> > + * Send a uevent for the DRM device specified by @dev. This 
>>>>> indicates
>>>>> > + * user that a GPU reset has occurred, so that the interested 
>>>>> client
>>>>> > + * can take any recovery or profiling measure, when required.
>>>>> > + */
>>>>> > +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t 
>>>>> pid, uint32_t flags)
>>>>> > +{
>>>>> > +     unsigned char pid_str[21], flags_str[15];
>>>>> > +     unsigned char reset_str[] = "RESET=1";
>>>>> > +     char *envp[] = { reset_str, pid_str, flags_str, NULL };
>>>>> > +
>>>>> > +     DRM_DEBUG("generating reset event\n");
>>>>> > +
>>>>> > +     snprintf(pid_str, ARRAY_SIZE(pid_str), "PID=%lu", pid);
>>>>> > +     snprintf(flags_str, ARRAY_SIZE(flags_str), "FLAGS=%u", 
>>>>> flags);
>>>>> > + kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp);
>>>>> > +}
>>>>> > +EXPORT_SYMBOL(drm_sysfs_reset_event);
>>>>> > +
>>>>> >   /**
>>>>> >    * drm_sysfs_connector_hotplug_event - generate a DRM uevent 
>>>>> for any connector
>>>>> >    * change
>>>>> > diff --git a/include/drm/drm_sysfs.h b/include/drm/drm_sysfs.h
>>>>> > index 6273cac44e47..63f00fe8054c 100644
>>>>> > --- a/include/drm/drm_sysfs.h
>>>>> > +++ b/include/drm/drm_sysfs.h
>>>>> > @@ -2,6 +2,8 @@
>>>>> >   #ifndef _DRM_SYSFS_H_
>>>>> >   #define _DRM_SYSFS_H_
>>>>> >
>>>>> > +#define DRM_GPU_RESET_FLAG_VRAM_VALID (1 << 0)
>>>>> > +
>>>>> >   struct drm_device;
>>>>> >   struct device;
>>>>> >   struct drm_connector;
>>>>> > @@ -11,6 +13,7 @@ int drm_class_device_register(struct device 
>>>>> *dev);
>>>>> >   void drm_class_device_unregister(struct device *dev);
>>>>> >
>>>>> >   void drm_sysfs_hotplug_event(struct drm_device *dev);
>>>>> > +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t 
>>>>> pid, uint32_t reset_flags);
>>>>> >   void drm_sysfs_connector_hotplug_event(struct drm_connector 
>>>>> *connector);
>>>>> >   void drm_sysfs_connector_status_event(struct drm_connector 
>>>>> *connector,
>>>>> >                                      struct drm_property 
>>>>> *property);

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] drm: Add GPU reset sysfs event
  2022-03-08 17:04             ` Somalapuram, Amaranath
@ 2022-03-08 17:17               ` Andrey Grodzovsky
  0 siblings, 0 replies; 32+ messages in thread
From: Andrey Grodzovsky @ 2022-03-08 17:17 UTC (permalink / raw)
  To: Somalapuram, Amaranath, Sharma, Shashank, Lazar, Lijo,
	Shashank Sharma, amd-gfx
  Cc: Deucher, Alexander, Somalapuram, Amaranath, Koenig, Christian,
	Limonciello, Mario


On 2022-03-08 12:04, Somalapuram, Amaranath wrote:
>
> On 3/8/2022 10:27 PM, Sharma, Shashank wrote:
>>
>>
>> On 3/8/2022 5:55 PM, Andrey Grodzovsky wrote:
>>> You can read on their side here - 
>>> https://www.phoronix.com/scan.php?page=news_item&px=AMD-STB-Linux-5.17 
>>> and see their patch. THey don't have as clean
>>> interface as we do to retrieve the buffer and currently it's 
>>> hard-coded for debugfs dump but it looks like pretty straight 
>>> forward to expose their buffer to external
>>> client like amdgpu.
>>
> Customer requirement is to get reset notification for there daemon 
> with other info (like PID process name vram status).


In general when a failure happens we want to have all debug info 
possible to have better ability to root cause the problem. Since this is 
an open forum I am not sure how
much i can disclose about the data in the buffer but i guarantee you it 
is very useful for debugging GPU hang causes.

Andrey


>
> Regards,
> S.Amarnath
>> Noted, thanks for the pointer.
>> - Shashank
>>>
>>> Andrey
>>>
>>> On 2022-03-08 11:46, Sharma, Shashank wrote:
>>>> I have a very limited understanding of PMC driver and its 
>>>> interfaces, so I would just go ahead and rely on Andrey's 
>>>> judgement/recommendation on this :)
>>>>
>>>> - Shashank
>>>>
>>>> On 3/8/2022 5:39 PM, Andrey Grodzovsky wrote:
>>>>> As long as PMC driver provides clear interface to retrieve the 
>>>>> info there should be no issue to call either amdgpu interface or 
>>>>> PMC interface using IS_APU (or something alike in the code)
>>>>> We probably should add a wrapper function around this logic in 
>>>>> amdgpu.
>>>>>
>>>>> Andrey
>>>>>
>>>>> On 2022-03-08 11:36, Lazar, Lijo wrote:
>>>>>>
>>>>>> [AMD Official Use Only]
>>>>>>
>>>>>>
>>>>>> +Mario
>>>>>>
>>>>>> I guess that means the functionality needs to be present in 
>>>>>> amdgpu for APUs also. Presently, this is taken care by PMC driver 
>>>>>> for APUs.
>>>>>>
>>>>>> Thanks,
>>>>>> Lijo
>>>>>> ------------------------------------------------------------------------ 
>>>>>>
>>>>>> *From:* amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf 
>>>>>> of Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>>> *Sent:* Tuesday, March 8, 2022 9:55:03 PM
>>>>>> *To:* Shashank Sharma <contactshashanksharma@gmail.com>; 
>>>>>> amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
>>>>>> *Cc:* Deucher, Alexander <Alexander.Deucher@amd.com>; 
>>>>>> Somalapuram, Amaranath <Amaranath.Somalapuram@amd.com>; Koenig, 
>>>>>> Christian <Christian.Koenig@amd.com>; Sharma, Shashank 
>>>>>> <Shashank.Sharma@amd.com>
>>>>>> *Subject:* Re: [PATCH 1/2] drm: Add GPU reset sysfs event
>>>>>>
>>>>>> On 2022-03-07 11:26, Shashank Sharma wrote:
>>>>>> > From: Shashank Sharma <shashank.sharma@amd.com>
>>>>>> >
>>>>>> > This patch adds a new sysfs event, which will indicate
>>>>>> > the userland about a GPU reset, and can also provide
>>>>>> > some information like:
>>>>>> > - which PID was involved in the GPU reset
>>>>>> > - what was the GPU status (using flags)
>>>>>> >
>>>>>> > This patch also introduces the first flag of the flags
>>>>>> > bitmap, which can be appended as and when required.
>>>>>>
>>>>>>
>>>>>> I am reminding again about another important piece of info which 
>>>>>> you can add
>>>>>> here and that is Smart Trace Buffer dump [1]. The buffer size is HW
>>>>>> specific but
>>>>>> from what I see there is no problem to just amend it as part of 
>>>>>> envp[]
>>>>>> initialization.
>>>>>> bellow.
>>>>>>
>>>>>> The interface to get the buffer is smu_stb_collect_info and usage 
>>>>>> can be
>>>>>> seen from
>>>>>> frebugfs interface in smu_stb_debugfs_open
>>>>>>
>>>>>> [1] - 
>>>>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.spinics.net%2Flists%2Famd-gfx%2Fmsg70751.html&amp;data=04%7C01%7Clijo.lazar%40amd.com%7C80bc3f07e2d0441d44a108da012036dc%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637823535167679490%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&amp;sdata=53l7KlTf%2BICKkZkLVwFh6nRTjkAh%2FDpOat5DRoyKIx0%3D&amp;reserved=0 
>>>>>> <https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.spinics.net%2Flists%2Famd-gfx%2Fmsg70751.html&amp;data=04%7C01%7Clijo.lazar%40amd.com%7C80bc3f07e2d0441d44a108da012036dc%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637823535167679490%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&amp;sdata=53l7KlTf%2BICKkZkLVwFh6nRTjkAh%2FDpOat5DRoyKIx0%3D&amp;reserved=0> 
>>>>>>
>>>>>>
>>>>>> Andrey
>>>>>>
>>>>>>
>>>>>> >
>>>>>> > Cc: Alexandar Deucher <alexander.deucher@amd.com>
>>>>>> > Cc: Christian Koenig <christian.koenig@amd.com>
>>>>>> > Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
>>>>>> > ---
>>>>>> >   drivers/gpu/drm/drm_sysfs.c | 24 ++++++++++++++++++++++++
>>>>>> >   include/drm/drm_sysfs.h     |  3 +++
>>>>>> >   2 files changed, 27 insertions(+)
>>>>>> >
>>>>>> > diff --git a/drivers/gpu/drm/drm_sysfs.c 
>>>>>> b/drivers/gpu/drm/drm_sysfs.c
>>>>>> > index 430e00b16eec..52a015161431 100644
>>>>>> > --- a/drivers/gpu/drm/drm_sysfs.c
>>>>>> > +++ b/drivers/gpu/drm/drm_sysfs.c
>>>>>> > @@ -409,6 +409,30 @@ void drm_sysfs_hotplug_event(struct 
>>>>>> drm_device *dev)
>>>>>> >   }
>>>>>> >   EXPORT_SYMBOL(drm_sysfs_hotplug_event);
>>>>>> >
>>>>>> > +/**
>>>>>> > + * drm_sysfs_reset_event - generate a DRM uevent to indicate 
>>>>>> GPU reset
>>>>>> > + * @dev: DRM device
>>>>>> > + * @pid: The process ID involve with the reset
>>>>>> > + * @flags: Any other information about the GPU status
>>>>>> > + *
>>>>>> > + * Send a uevent for the DRM device specified by @dev. This 
>>>>>> indicates
>>>>>> > + * user that a GPU reset has occurred, so that the interested 
>>>>>> client
>>>>>> > + * can take any recovery or profiling measure, when required.
>>>>>> > + */
>>>>>> > +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t 
>>>>>> pid, uint32_t flags)
>>>>>> > +{
>>>>>> > +     unsigned char pid_str[21], flags_str[15];
>>>>>> > +     unsigned char reset_str[] = "RESET=1";
>>>>>> > +     char *envp[] = { reset_str, pid_str, flags_str, NULL };
>>>>>> > +
>>>>>> > +     DRM_DEBUG("generating reset event\n");
>>>>>> > +
>>>>>> > +     snprintf(pid_str, ARRAY_SIZE(pid_str), "PID=%lu", pid);
>>>>>> > +     snprintf(flags_str, ARRAY_SIZE(flags_str), "FLAGS=%u", 
>>>>>> flags);
>>>>>> > + kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, 
>>>>>> envp);
>>>>>> > +}
>>>>>> > +EXPORT_SYMBOL(drm_sysfs_reset_event);
>>>>>> > +
>>>>>> >   /**
>>>>>> >    * drm_sysfs_connector_hotplug_event - generate a DRM uevent 
>>>>>> for any connector
>>>>>> >    * change
>>>>>> > diff --git a/include/drm/drm_sysfs.h b/include/drm/drm_sysfs.h
>>>>>> > index 6273cac44e47..63f00fe8054c 100644
>>>>>> > --- a/include/drm/drm_sysfs.h
>>>>>> > +++ b/include/drm/drm_sysfs.h
>>>>>> > @@ -2,6 +2,8 @@
>>>>>> >   #ifndef _DRM_SYSFS_H_
>>>>>> >   #define _DRM_SYSFS_H_
>>>>>> >
>>>>>> > +#define DRM_GPU_RESET_FLAG_VRAM_VALID (1 << 0)
>>>>>> > +
>>>>>> >   struct drm_device;
>>>>>> >   struct device;
>>>>>> >   struct drm_connector;
>>>>>> > @@ -11,6 +13,7 @@ int drm_class_device_register(struct device 
>>>>>> *dev);
>>>>>> >   void drm_class_device_unregister(struct device *dev);
>>>>>> >
>>>>>> >   void drm_sysfs_hotplug_event(struct drm_device *dev);
>>>>>> > +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t 
>>>>>> pid, uint32_t reset_flags);
>>>>>> >   void drm_sysfs_connector_hotplug_event(struct drm_connector 
>>>>>> *connector);
>>>>>> >   void drm_sysfs_connector_status_event(struct drm_connector 
>>>>>> *connector,
>>>>>> >                                      struct drm_property 
>>>>>> *property);

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: add work function for GPU reset event
  2022-03-08 16:30     ` Sharma, Shashank
@ 2022-03-08 17:20       ` Somalapuram, Amaranath
  2022-03-08 18:53         ` Andrey Grodzovsky
  0 siblings, 1 reply; 32+ messages in thread
From: Somalapuram, Amaranath @ 2022-03-08 17:20 UTC (permalink / raw)
  To: Sharma, Shashank, Andrey Grodzovsky, Shashank Sharma, amd-gfx
  Cc: Alexander Deucher, amaranath.somalapuram, Christian Koenig

[-- Attachment #1: Type: text/plain, Size: 5202 bytes --]


On 3/8/2022 10:00 PM, Sharma, Shashank wrote:
> Hello Andrey
>
> On 3/8/2022 5:26 PM, Andrey Grodzovsky wrote:
>>
>> On 2022-03-07 11:26, Shashank Sharma wrote:
>>> From: Shashank Sharma <shashank.sharma@amd.com>
>>>
>>> This patch adds a work function, which will get scheduled
>>> in event of a GPU reset, and will send a uevent to user with
>>> some reset context infomration, like a PID and some flags.
>>
>>
>> Where is the actual scheduling of the work function ? Shouldn't
>> there be a patch for that too ?
>>
>
> Yes, Amar is working on that patch, on top of these patches. They 
> should be out soon. I thought it was a good idea to get quick feedback 
> on the basic patches before we build something on top of it.
>
schedule_work() will be called in the function amdgpu_do_asic_reset ()

after getting vram_lost info:

vram_lost = amdgpu_device_check_vram_lost(tmp_adev);

update  amdgpu_reset_event_ctx and call schedule_work()

  * vram_lost
  * reset_context->job->vm->task_info.process_name
  * reset_context->job->vm->task_info.pid

Regards,
S.Amarnath
> - Shashank
>
>> Andrey
>>
>>
>>>
>>> The userspace can do some recovery and post-processing work
>>> based on this event.
>>>
>>> V2:
>>> - Changed the name of the work to gpu_reset_event_work
>>>    (Christian)
>>> - Added a structure to accommodate some additional information
>>>    (like a PID and some flags)
>>>
>>> Cc: Alexander Deucher <alexander.deucher@amd.com>
>>> Cc: Christian Koenig <christian.koenig@amd.com>
>>> Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  7 +++++++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 +++++++++++++++++++
>>>   2 files changed, 26 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index d8b854fcbffa..7df219fe363f 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -813,6 +813,11 @@ struct amd_powerplay {
>>>   #define AMDGPU_RESET_MAGIC_NUM 64
>>>   #define AMDGPU_MAX_DF_PERFMONS 4
>>>   #define AMDGPU_PRODUCT_NAME_LEN 64
>>> +struct amdgpu_reset_event_ctx {
>>> +    uint64_t pid;
>>> +    uint32_t flags;
>>> +};
>>> +
>>>   struct amdgpu_device {
>>>       struct device            *dev;
>>>       struct pci_dev            *pdev;
>>> @@ -1063,6 +1068,7 @@ struct amdgpu_device {
>>>       int asic_reset_res;
>>>       struct work_struct        xgmi_reset_work;
>>> +    struct work_struct        gpu_reset_event_work;
>>>       struct list_head        reset_list;
>>>       long                gfx_timeout;
>>> @@ -1097,6 +1103,7 @@ struct amdgpu_device {
>>>       pci_channel_state_t        pci_channel_state;
>>>       struct amdgpu_reset_control     *reset_cntl;
>>> +    struct amdgpu_reset_event_ctx   reset_event_ctx;
>>>       uint32_t ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];
>>>       bool                ram_is_direct_mapped;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index ed077de426d9..c43d099da06d 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -73,6 +73,7 @@
>>>   #include <linux/pm_runtime.h>
>>>   #include <drm/drm_drv.h>
>>> +#include <drm/drm_sysfs.h>
>>>   MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
>>>   MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
>>> @@ -3277,6 +3278,23 @@ bool amdgpu_device_has_dc_support(struct 
>>> amdgpu_device *adev)
>>>       return amdgpu_device_asic_has_dc_support(adev->asic_type);
>>>   }
>>> +static void amdgpu_device_reset_event_func(struct work_struct *__work)
>>> +{
>>> +    struct amdgpu_device *adev = container_of(__work, struct 
>>> amdgpu_device,
>>> +                          gpu_reset_event_work);
>>> +    struct amdgpu_reset_event_ctx *event_ctx = &adev->reset_event_ctx;
>>> +
>>> +    /*
>>> +     * A GPU reset has happened, indicate the userspace and pass the
>>> +     * following information:
>>> +     *    - pid of the process involved,
>>> +     *    - if the VRAM is valid or not,
>>> +     *    - indicate that userspace may want to collect the ftrace 
>>> event
>>> +     * data from the trace event.
>>> +     */
>>> +    drm_sysfs_reset_event(&adev->ddev, event_ctx->pid, 
>>> event_ctx->flags);
>>> +}
>>> +
>>>   static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
>>>   {
>>>       struct amdgpu_device *adev =
>>> @@ -3525,6 +3543,7 @@ int amdgpu_device_init(struct amdgpu_device 
>>> *adev,
>>>                 amdgpu_device_delay_enable_gfx_off);
>>>       INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
>>> +    INIT_WORK(&adev->gpu_reset_event_work, 
>>> amdgpu_device_reset_event_func);
>>>       adev->gfx.gfx_off_req_count = 1;
>>>       adev->pm.ac_power = power_supply_is_system_supplied() > 0;

[-- Attachment #2: Type: text/html, Size: 9872 bytes --]

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] drm: Add GPU reset sysfs event
  2022-03-08 16:57           ` Sharma, Shashank
  2022-03-08 17:04             ` Somalapuram, Amaranath
@ 2022-03-08 17:27             ` Limonciello, Mario
  2022-03-08 18:08               ` Sharma, Shashank
  1 sibling, 1 reply; 32+ messages in thread
From: Limonciello, Mario @ 2022-03-08 17:27 UTC (permalink / raw)
  To: Sharma, Shashank, Andrey Grodzovsky, Lazar, Lijo,
	Shashank Sharma, amd-gfx, Shyam-sundar S-k, Sanket Goswami
  Cc: Deucher, Alexander, Somalapuram, Amaranath, Koenig, Christian

On 3/8/2022 10:57, Sharma, Shashank wrote:
> 
> 
> On 3/8/2022 5:55 PM, Andrey Grodzovsky wrote:
>> You can read on their side here - 
>> https://www.phoronix.com/scan.php?page=news_item&px=AMD-STB-Linux-5.17 
>> and see their patch. THey don't have as clean
>> interface as we do to retrieve the buffer and currently it's 
>> hard-coded for debugfs dump but it looks like pretty straight forward 
>> to expose their buffer to external
>> client like amdgpu.
> 
> Noted, thanks for the pointer.
> - Shashank

Yeah I think this is a great idea for APU, but need to keep in mind 
amd-pmc is only activated if APU is configured for s0i3.  So in the 
IS_APU check you will need to test for set for s0i3 and return an error 
code if not set.

For APU it's currently fetched on demand from debugfs.  If you can 
"easily" export a symbol I say go for it and include a patch in your 
series.  If not my suggestion is to stub this out and let Shyam and 
Sanket fill in the stub after they can sort out the exporting it to 
another driver.

>>
>> Andrey
>>
>> On 2022-03-08 11:46, Sharma, Shashank wrote:
>>> I have a very limited understanding of PMC driver and its interfaces, 
>>> so I would just go ahead and rely on Andrey's 
>>> judgement/recommendation on this :)
>>>
>>> - Shashank
>>>
>>> On 3/8/2022 5:39 PM, Andrey Grodzovsky wrote:
>>>> As long as PMC driver provides clear interface to retrieve the info 
>>>> there should be no issue to call either amdgpu interface or PMC 
>>>> interface using IS_APU (or something alike in the code)
>>>> We probably should add a wrapper function around this logic in amdgpu.
>>>>
>>>> Andrey
>>>>
>>>> On 2022-03-08 11:36, Lazar, Lijo wrote:
>>>>>
>>>>> [AMD Official Use Only]
>>>>>
>>>>>
>>>>> +Mario
>>>>>
>>>>> I guess that means the functionality needs to be present in amdgpu 
>>>>> for APUs also. Presently, this is taken care by PMC driver for APUs.
>>>>>
>>>>> Thanks,
>>>>> Lijo
>>>>> ------------------------------------------------------------------------ 
>>>>>
>>>>> *From:* amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf 
>>>>> of Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>> *Sent:* Tuesday, March 8, 2022 9:55:03 PM
>>>>> *To:* Shashank Sharma <contactshashanksharma@gmail.com>; 
>>>>> amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
>>>>> *Cc:* Deucher, Alexander <Alexander.Deucher@amd.com>; Somalapuram, 
>>>>> Amaranath <Amaranath.Somalapuram@amd.com>; Koenig, Christian 
>>>>> <Christian.Koenig@amd.com>; Sharma, Shashank <Shashank.Sharma@amd.com>
>>>>> *Subject:* Re: [PATCH 1/2] drm: Add GPU reset sysfs event
>>>>>
>>>>> On 2022-03-07 11:26, Shashank Sharma wrote:
>>>>> > From: Shashank Sharma <shashank.sharma@amd.com>
>>>>> >
>>>>> > This patch adds a new sysfs event, which will indicate
>>>>> > the userland about a GPU reset, and can also provide
>>>>> > some information like:
>>>>> > - which PID was involved in the GPU reset
>>>>> > - what was the GPU status (using flags)
>>>>> >
>>>>> > This patch also introduces the first flag of the flags
>>>>> > bitmap, which can be appended as and when required.
>>>>>
>>>>>
>>>>> I am reminding again about another important piece of info which 
>>>>> you can add
>>>>> here and that is Smart Trace Buffer dump [1]. The buffer size is HW
>>>>> specific but
>>>>> from what I see there is no problem to just amend it as part of envp[]
>>>>> initialization.
>>>>> bellow.
>>>>>
>>>>> The interface to get the buffer is smu_stb_collect_info and usage 
>>>>> can be
>>>>> seen from
>>>>> frebugfs interface in smu_stb_debugfs_open
>>>>>
>>>>> [1] - 
>>>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.spinics.net%2Flists%2Famd-gfx%2Fmsg70751.html&amp;data=04%7C01%7Clijo.lazar%40amd.com%7C80bc3f07e2d0441d44a108da012036dc%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637823535167679490%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&amp;sdata=53l7KlTf%2BICKkZkLVwFh6nRTjkAh%2FDpOat5DRoyKIx0%3D&amp;reserved=0 
>>>>> <https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.spinics.net%2Flists%2Famd-gfx%2Fmsg70751.html&amp;data=04%7C01%7Clijo.lazar%40amd.com%7C80bc3f07e2d0441d44a108da012036dc%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637823535167679490%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&amp;sdata=53l7KlTf%2BICKkZkLVwFh6nRTjkAh%2FDpOat5DRoyKIx0%3D&amp;reserved=0> 
>>>>>
>>>>>
>>>>> Andrey
>>>>>
>>>>>
>>>>> >
>>>>> > Cc: Alexandar Deucher <alexander.deucher@amd.com>
>>>>> > Cc: Christian Koenig <christian.koenig@amd.com>
>>>>> > Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
>>>>> > ---
>>>>> >   drivers/gpu/drm/drm_sysfs.c | 24 ++++++++++++++++++++++++
>>>>> >   include/drm/drm_sysfs.h     |  3 +++
>>>>> >   2 files changed, 27 insertions(+)
>>>>> >
>>>>> > diff --git a/drivers/gpu/drm/drm_sysfs.c 
>>>>> b/drivers/gpu/drm/drm_sysfs.c
>>>>> > index 430e00b16eec..52a015161431 100644
>>>>> > --- a/drivers/gpu/drm/drm_sysfs.c
>>>>> > +++ b/drivers/gpu/drm/drm_sysfs.c
>>>>> > @@ -409,6 +409,30 @@ void drm_sysfs_hotplug_event(struct 
>>>>> drm_device *dev)
>>>>> >   }
>>>>> >   EXPORT_SYMBOL(drm_sysfs_hotplug_event);
>>>>> >
>>>>> > +/**
>>>>> > + * drm_sysfs_reset_event - generate a DRM uevent to indicate GPU 
>>>>> reset
>>>>> > + * @dev: DRM device
>>>>> > + * @pid: The process ID involve with the reset
>>>>> > + * @flags: Any other information about the GPU status
>>>>> > + *
>>>>> > + * Send a uevent for the DRM device specified by @dev. This 
>>>>> indicates
>>>>> > + * user that a GPU reset has occurred, so that the interested 
>>>>> client
>>>>> > + * can take any recovery or profiling measure, when required.
>>>>> > + */
>>>>> > +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, 
>>>>> uint32_t flags)
>>>>> > +{
>>>>> > +     unsigned char pid_str[21], flags_str[15];
>>>>> > +     unsigned char reset_str[] = "RESET=1";
>>>>> > +     char *envp[] = { reset_str, pid_str, flags_str, NULL };
>>>>> > +
>>>>> > +     DRM_DEBUG("generating reset event\n");
>>>>> > +
>>>>> > +     snprintf(pid_str, ARRAY_SIZE(pid_str), "PID=%lu", pid);
>>>>> > +     snprintf(flags_str, ARRAY_SIZE(flags_str), "FLAGS=%u", flags);
>>>>> > + kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp);
>>>>> > +}
>>>>> > +EXPORT_SYMBOL(drm_sysfs_reset_event);
>>>>> > +
>>>>> >   /**
>>>>> >    * drm_sysfs_connector_hotplug_event - generate a DRM uevent 
>>>>> for any connector
>>>>> >    * change
>>>>> > diff --git a/include/drm/drm_sysfs.h b/include/drm/drm_sysfs.h
>>>>> > index 6273cac44e47..63f00fe8054c 100644
>>>>> > --- a/include/drm/drm_sysfs.h
>>>>> > +++ b/include/drm/drm_sysfs.h
>>>>> > @@ -2,6 +2,8 @@
>>>>> >   #ifndef _DRM_SYSFS_H_
>>>>> >   #define _DRM_SYSFS_H_
>>>>> >
>>>>> > +#define DRM_GPU_RESET_FLAG_VRAM_VALID (1 << 0)
>>>>> > +
>>>>> >   struct drm_device;
>>>>> >   struct device;
>>>>> >   struct drm_connector;
>>>>> > @@ -11,6 +13,7 @@ int drm_class_device_register(struct device *dev);
>>>>> >   void drm_class_device_unregister(struct device *dev);
>>>>> >
>>>>> >   void drm_sysfs_hotplug_event(struct drm_device *dev);
>>>>> > +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, 
>>>>> uint32_t reset_flags);
>>>>> >   void drm_sysfs_connector_hotplug_event(struct drm_connector 
>>>>> *connector);
>>>>> >   void drm_sysfs_connector_status_event(struct drm_connector 
>>>>> *connector,
>>>>> >                                      struct drm_property *property);


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] drm: Add GPU reset sysfs event
  2022-03-08 17:27             ` Limonciello, Mario
@ 2022-03-08 18:08               ` Sharma, Shashank
  2022-03-08 18:10                 ` Limonciello, Mario
  0 siblings, 1 reply; 32+ messages in thread
From: Sharma, Shashank @ 2022-03-08 18:08 UTC (permalink / raw)
  To: Limonciello, Mario, Andrey Grodzovsky, Lazar, Lijo,
	Shashank Sharma, amd-gfx, Shyam-sundar S-k, Sanket Goswami
  Cc: Deucher, Alexander, Somalapuram, Amaranath, Koenig, Christian

Hey Mario,

On 3/8/2022 6:27 PM, Limonciello, Mario wrote:
> On 3/8/2022 10:57, Sharma, Shashank wrote:
>>
>>
>> On 3/8/2022 5:55 PM, Andrey Grodzovsky wrote:
>>> You can read on their side here - 
>>> https://www.phoronix.com/scan.php?page=news_item&px=AMD-STB-Linux-5.17 and 
>>> see their patch. THey don't have as clean
>>> interface as we do to retrieve the buffer and currently it's 
>>> hard-coded for debugfs dump but it looks like pretty straight forward 
>>> to expose their buffer to external
>>> client like amdgpu.

>>
>> Noted, thanks for the pointer.
>> - Shashank
> 
> Yeah I think this is a great idea for APU, but need to keep in mind 
> amd-pmc is only activated if APU is configured for s0i3.  So in the 
> IS_APU check you will need to test for set for s0i3 and return an error 
> code if not set.
> 
> For APU it's currently fetched on demand from debugfs.  If you can 
> "easily" export a symbol I say go for it and include a patch in your 
> series.  

Yep, if the info is available via debugfs, I don't see a reason why 
can't we fetch that directly from its lower layers. May I know the name 
of this debugfs entry ?

- Shashank

If not my suggestion is to stub this out and let Shyam and
> Sanket fill in the stub after they can sort out the exporting it to 
> another driver.
> 
>>>
>>> Andrey
>>>
>>> On 2022-03-08 11:46, Sharma, Shashank wrote:
>>>> I have a very limited understanding of PMC driver and its 
>>>> interfaces, so I would just go ahead and rely on Andrey's 
>>>> judgement/recommendation on this :)
>>>>
>>>> - Shashank
>>>>
>>>> On 3/8/2022 5:39 PM, Andrey Grodzovsky wrote:
>>>>> As long as PMC driver provides clear interface to retrieve the info 
>>>>> there should be no issue to call either amdgpu interface or PMC 
>>>>> interface using IS_APU (or something alike in the code)
>>>>> We probably should add a wrapper function around this logic in amdgpu.
>>>>>
>>>>> Andrey
>>>>>
>>>>> On 2022-03-08 11:36, Lazar, Lijo wrote:
>>>>>>
>>>>>> [AMD Official Use Only]
>>>>>>
>>>>>>
>>>>>> +Mario
>>>>>>
>>>>>> I guess that means the functionality needs to be present in amdgpu 
>>>>>> for APUs also. Presently, this is taken care by PMC driver for APUs.
>>>>>>
>>>>>> Thanks,
>>>>>> Lijo
>>>>>> ------------------------------------------------------------------------ 
>>>>>>
>>>>>> *From:* amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf 
>>>>>> of Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>>> *Sent:* Tuesday, March 8, 2022 9:55:03 PM
>>>>>> *To:* Shashank Sharma <contactshashanksharma@gmail.com>; 
>>>>>> amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
>>>>>> *Cc:* Deucher, Alexander <Alexander.Deucher@amd.com>; Somalapuram, 
>>>>>> Amaranath <Amaranath.Somalapuram@amd.com>; Koenig, Christian 
>>>>>> <Christian.Koenig@amd.com>; Sharma, Shashank 
>>>>>> <Shashank.Sharma@amd.com>
>>>>>> *Subject:* Re: [PATCH 1/2] drm: Add GPU reset sysfs event
>>>>>>
>>>>>> On 2022-03-07 11:26, Shashank Sharma wrote:
>>>>>> > From: Shashank Sharma <shashank.sharma@amd.com>
>>>>>> >
>>>>>> > This patch adds a new sysfs event, which will indicate
>>>>>> > the userland about a GPU reset, and can also provide
>>>>>> > some information like:
>>>>>> > - which PID was involved in the GPU reset
>>>>>> > - what was the GPU status (using flags)
>>>>>> >
>>>>>> > This patch also introduces the first flag of the flags
>>>>>> > bitmap, which can be appended as and when required.
>>>>>>
>>>>>>
>>>>>> I am reminding again about another important piece of info which 
>>>>>> you can add
>>>>>> here and that is Smart Trace Buffer dump [1]. The buffer size is HW
>>>>>> specific but
>>>>>> from what I see there is no problem to just amend it as part of 
>>>>>> envp[]
>>>>>> initialization.
>>>>>> bellow.
>>>>>>
>>>>>> The interface to get the buffer is smu_stb_collect_info and usage 
>>>>>> can be
>>>>>> seen from
>>>>>> frebugfs interface in smu_stb_debugfs_open
>>>>>>
>>>>>> [1] - 
>>>>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.spinics.net%2Flists%2Famd-gfx%2Fmsg70751.html&amp;data=04%7C01%7Clijo.lazar%40amd.com%7C80bc3f07e2d0441d44a108da012036dc%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637823535167679490%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&amp;sdata=53l7KlTf%2BICKkZkLVwFh6nRTjkAh%2FDpOat5DRoyKIx0%3D&amp;reserved=0 
>>>>>> <https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.spinics.net%2Flists%2Famd-gfx%2Fmsg70751.html&amp;data=04%7C01%7Clijo.lazar%40amd.com%7C80bc3f07e2d0441d44a108da012036dc%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637823535167679490%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&amp;sdata=53l7KlTf%2BICKkZkLVwFh6nRTjkAh%2FDpOat5DRoyKIx0%3D&amp;reserved=0> 
>>>>>>
>>>>>>
>>>>>> Andrey
>>>>>>
>>>>>>
>>>>>> >
>>>>>> > Cc: Alexandar Deucher <alexander.deucher@amd.com>
>>>>>> > Cc: Christian Koenig <christian.koenig@amd.com>
>>>>>> > Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
>>>>>> > ---
>>>>>> >   drivers/gpu/drm/drm_sysfs.c | 24 ++++++++++++++++++++++++
>>>>>> >   include/drm/drm_sysfs.h     |  3 +++
>>>>>> >   2 files changed, 27 insertions(+)
>>>>>> >
>>>>>> > diff --git a/drivers/gpu/drm/drm_sysfs.c 
>>>>>> b/drivers/gpu/drm/drm_sysfs.c
>>>>>> > index 430e00b16eec..52a015161431 100644
>>>>>> > --- a/drivers/gpu/drm/drm_sysfs.c
>>>>>> > +++ b/drivers/gpu/drm/drm_sysfs.c
>>>>>> > @@ -409,6 +409,30 @@ void drm_sysfs_hotplug_event(struct 
>>>>>> drm_device *dev)
>>>>>> >   }
>>>>>> >   EXPORT_SYMBOL(drm_sysfs_hotplug_event);
>>>>>> >
>>>>>> > +/**
>>>>>> > + * drm_sysfs_reset_event - generate a DRM uevent to indicate 
>>>>>> GPU reset
>>>>>> > + * @dev: DRM device
>>>>>> > + * @pid: The process ID involve with the reset
>>>>>> > + * @flags: Any other information about the GPU status
>>>>>> > + *
>>>>>> > + * Send a uevent for the DRM device specified by @dev. This 
>>>>>> indicates
>>>>>> > + * user that a GPU reset has occurred, so that the interested 
>>>>>> client
>>>>>> > + * can take any recovery or profiling measure, when required.
>>>>>> > + */
>>>>>> > +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t 
>>>>>> pid, uint32_t flags)
>>>>>> > +{
>>>>>> > +     unsigned char pid_str[21], flags_str[15];
>>>>>> > +     unsigned char reset_str[] = "RESET=1";
>>>>>> > +     char *envp[] = { reset_str, pid_str, flags_str, NULL };
>>>>>> > +
>>>>>> > +     DRM_DEBUG("generating reset event\n");
>>>>>> > +
>>>>>> > +     snprintf(pid_str, ARRAY_SIZE(pid_str), "PID=%lu", pid);
>>>>>> > +     snprintf(flags_str, ARRAY_SIZE(flags_str), "FLAGS=%u", 
>>>>>> flags);
>>>>>> > + kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp);
>>>>>> > +}
>>>>>> > +EXPORT_SYMBOL(drm_sysfs_reset_event);
>>>>>> > +
>>>>>> >   /**
>>>>>> >    * drm_sysfs_connector_hotplug_event - generate a DRM uevent 
>>>>>> for any connector
>>>>>> >    * change
>>>>>> > diff --git a/include/drm/drm_sysfs.h b/include/drm/drm_sysfs.h
>>>>>> > index 6273cac44e47..63f00fe8054c 100644
>>>>>> > --- a/include/drm/drm_sysfs.h
>>>>>> > +++ b/include/drm/drm_sysfs.h
>>>>>> > @@ -2,6 +2,8 @@
>>>>>> >   #ifndef _DRM_SYSFS_H_
>>>>>> >   #define _DRM_SYSFS_H_
>>>>>> >
>>>>>> > +#define DRM_GPU_RESET_FLAG_VRAM_VALID (1 << 0)
>>>>>> > +
>>>>>> >   struct drm_device;
>>>>>> >   struct device;
>>>>>> >   struct drm_connector;
>>>>>> > @@ -11,6 +13,7 @@ int drm_class_device_register(struct device 
>>>>>> *dev);
>>>>>> >   void drm_class_device_unregister(struct device *dev);
>>>>>> >
>>>>>> >   void drm_sysfs_hotplug_event(struct drm_device *dev);
>>>>>> > +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t 
>>>>>> pid, uint32_t reset_flags);
>>>>>> >   void drm_sysfs_connector_hotplug_event(struct drm_connector 
>>>>>> *connector);
>>>>>> >   void drm_sysfs_connector_status_event(struct drm_connector 
>>>>>> *connector,
>>>>>> >                                      struct drm_property 
>>>>>> *property);
> 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* RE: [PATCH 1/2] drm: Add GPU reset sysfs event
  2022-03-08 18:08               ` Sharma, Shashank
@ 2022-03-08 18:10                 ` Limonciello, Mario
  0 siblings, 0 replies; 32+ messages in thread
From: Limonciello, Mario @ 2022-03-08 18:10 UTC (permalink / raw)
  To: Sharma, Shashank, Grodzovsky, Andrey, Lazar, Lijo,
	Shashank Sharma, amd-gfx, S-k, Shyam-sundar, Goswami, Sanket
  Cc: Deucher, Alexander, Somalapuram, Amaranath, Koenig, Christian

[AMD Official Use Only]



> -----Original Message-----
> From: Sharma, Shashank <Shashank.Sharma@amd.com>
> Sent: Tuesday, March 8, 2022 12:09
> To: Limonciello, Mario <Mario.Limonciello@amd.com>; Grodzovsky, Andrey
> <Andrey.Grodzovsky@amd.com>; Lazar, Lijo <Lijo.Lazar@amd.com>; Shashank
> Sharma <contactshashanksharma@gmail.com>; amd-
> gfx@lists.freedesktop.org; S-k, Shyam-sundar <Shyam-sundar.S-k@amd.com>;
> Goswami, Sanket <Sanket.Goswami@amd.com>
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Somalapuram,
> Amaranath <Amaranath.Somalapuram@amd.com>; Koenig, Christian
> <Christian.Koenig@amd.com>
> Subject: Re: [PATCH 1/2] drm: Add GPU reset sysfs event
> 
> Hey Mario,
> 
> On 3/8/2022 6:27 PM, Limonciello, Mario wrote:
> > On 3/8/2022 10:57, Sharma, Shashank wrote:
> >>
> >>
> >> On 3/8/2022 5:55 PM, Andrey Grodzovsky wrote:
> >>> You can read on their side here -
> >>> https://www.phoronix.com/scan.php?page=news_item&px=AMD-STB-
> Linux-5.17 and
> >>> see their patch. THey don't have as clean
> >>> interface as we do to retrieve the buffer and currently it's
> >>> hard-coded for debugfs dump but it looks like pretty straight forward
> >>> to expose their buffer to external
> >>> client like amdgpu.
> 
> >>
> >> Noted, thanks for the pointer.
> >> - Shashank
> >
> > Yeah I think this is a great idea for APU, but need to keep in mind
> > amd-pmc is only activated if APU is configured for s0i3.  So in the
> > IS_APU check you will need to test for set for s0i3 and return an error
> > code if not set.
> >
> > For APU it's currently fetched on demand from debugfs.  If you can
> > "easily" export a symbol I say go for it and include a patch in your
> > series.
> 
> Yep, if the info is available via debugfs, I don't see a reason why
> can't we fetch that directly from its lower layers. May I know the name
> of this debugfs entry ?

It's called "stb_read" from debugfs.

> 
> - Shashank
> 
> If not my suggestion is to stub this out and let Shyam and
> > Sanket fill in the stub after they can sort out the exporting it to
> > another driver.
> >
> >>>
> >>> Andrey
> >>>
> >>> On 2022-03-08 11:46, Sharma, Shashank wrote:
> >>>> I have a very limited understanding of PMC driver and its
> >>>> interfaces, so I would just go ahead and rely on Andrey's
> >>>> judgement/recommendation on this :)
> >>>>
> >>>> - Shashank
> >>>>
> >>>> On 3/8/2022 5:39 PM, Andrey Grodzovsky wrote:
> >>>>> As long as PMC driver provides clear interface to retrieve the info
> >>>>> there should be no issue to call either amdgpu interface or PMC
> >>>>> interface using IS_APU (or something alike in the code)
> >>>>> We probably should add a wrapper function around this logic in amdgpu.
> >>>>>
> >>>>> Andrey
> >>>>>
> >>>>> On 2022-03-08 11:36, Lazar, Lijo wrote:
> >>>>>>
> >>>>>> [AMD Official Use Only]
> >>>>>>
> >>>>>>
> >>>>>> +Mario
> >>>>>>
> >>>>>> I guess that means the functionality needs to be present in amdgpu
> >>>>>> for APUs also. Presently, this is taken care by PMC driver for APUs.
> >>>>>>
> >>>>>> Thanks,
> >>>>>> Lijo
> >>>>>> ------------------------------------------------------------------------
> >>>>>>
> >>>>>> *From:* amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf
> >>>>>> of Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> >>>>>> *Sent:* Tuesday, March 8, 2022 9:55:03 PM
> >>>>>> *To:* Shashank Sharma <contactshashanksharma@gmail.com>;
> >>>>>> amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
> >>>>>> *Cc:* Deucher, Alexander <Alexander.Deucher@amd.com>;
> Somalapuram,
> >>>>>> Amaranath <Amaranath.Somalapuram@amd.com>; Koenig, Christian
> >>>>>> <Christian.Koenig@amd.com>; Sharma, Shashank
> >>>>>> <Shashank.Sharma@amd.com>
> >>>>>> *Subject:* Re: [PATCH 1/2] drm: Add GPU reset sysfs event
> >>>>>>
> >>>>>> On 2022-03-07 11:26, Shashank Sharma wrote:
> >>>>>> > From: Shashank Sharma <shashank.sharma@amd.com>
> >>>>>> >
> >>>>>> > This patch adds a new sysfs event, which will indicate
> >>>>>> > the userland about a GPU reset, and can also provide
> >>>>>> > some information like:
> >>>>>> > - which PID was involved in the GPU reset
> >>>>>> > - what was the GPU status (using flags)
> >>>>>> >
> >>>>>> > This patch also introduces the first flag of the flags
> >>>>>> > bitmap, which can be appended as and when required.
> >>>>>>
> >>>>>>
> >>>>>> I am reminding again about another important piece of info which
> >>>>>> you can add
> >>>>>> here and that is Smart Trace Buffer dump [1]. The buffer size is HW
> >>>>>> specific but
> >>>>>> from what I see there is no problem to just amend it as part of
> >>>>>> envp[]
> >>>>>> initialization.
> >>>>>> bellow.
> >>>>>>
> >>>>>> The interface to get the buffer is smu_stb_collect_info and usage
> >>>>>> can be
> >>>>>> seen from
> >>>>>> frebugfs interface in smu_stb_debugfs_open
> >>>>>>
> >>>>>> [1] -
> >>>>>>
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.sp
> inics.net%2Flists%2Famd-
> gfx%2Fmsg70751.html&amp;data=04%7C01%7Clijo.lazar%40amd.com%7C80bc
> 3f07e2d0441d44a108da012036dc%7C3dd8961fe4884e608e11a82d994e183d%7
> C0%7C0%7C637823535167679490%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC
> 4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&a
> mp;sdata=53l7KlTf%2BICKkZkLVwFh6nRTjkAh%2FDpOat5DRoyKIx0%3D&amp;re
> served=0
> >>>>>>
> <https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.s
> pinics.net%2Flists%2Famd-
> gfx%2Fmsg70751.html&amp;data=04%7C01%7Clijo.lazar%40amd.com%7C80bc
> 3f07e2d0441d44a108da012036dc%7C3dd8961fe4884e608e11a82d994e183d%7
> C0%7C0%7C637823535167679490%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC
> 4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&a
> mp;sdata=53l7KlTf%2BICKkZkLVwFh6nRTjkAh%2FDpOat5DRoyKIx0%3D&amp;re
> served=0>
> >>>>>>
> >>>>>>
> >>>>>> Andrey
> >>>>>>
> >>>>>>
> >>>>>> >
> >>>>>> > Cc: Alexandar Deucher <alexander.deucher@amd.com>
> >>>>>> > Cc: Christian Koenig <christian.koenig@amd.com>
> >>>>>> > Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
> >>>>>> > ---
> >>>>>> >   drivers/gpu/drm/drm_sysfs.c | 24 ++++++++++++++++++++++++
> >>>>>> >   include/drm/drm_sysfs.h     |  3 +++
> >>>>>> >   2 files changed, 27 insertions(+)
> >>>>>> >
> >>>>>> > diff --git a/drivers/gpu/drm/drm_sysfs.c
> >>>>>> b/drivers/gpu/drm/drm_sysfs.c
> >>>>>> > index 430e00b16eec..52a015161431 100644
> >>>>>> > --- a/drivers/gpu/drm/drm_sysfs.c
> >>>>>> > +++ b/drivers/gpu/drm/drm_sysfs.c
> >>>>>> > @@ -409,6 +409,30 @@ void drm_sysfs_hotplug_event(struct
> >>>>>> drm_device *dev)
> >>>>>> >   }
> >>>>>> >   EXPORT_SYMBOL(drm_sysfs_hotplug_event);
> >>>>>> >
> >>>>>> > +/**
> >>>>>> > + * drm_sysfs_reset_event - generate a DRM uevent to indicate
> >>>>>> GPU reset
> >>>>>> > + * @dev: DRM device
> >>>>>> > + * @pid: The process ID involve with the reset
> >>>>>> > + * @flags: Any other information about the GPU status
> >>>>>> > + *
> >>>>>> > + * Send a uevent for the DRM device specified by @dev. This
> >>>>>> indicates
> >>>>>> > + * user that a GPU reset has occurred, so that the interested
> >>>>>> client
> >>>>>> > + * can take any recovery or profiling measure, when required.
> >>>>>> > + */
> >>>>>> > +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t
> >>>>>> pid, uint32_t flags)
> >>>>>> > +{
> >>>>>> > +     unsigned char pid_str[21], flags_str[15];
> >>>>>> > +     unsigned char reset_str[] = "RESET=1";
> >>>>>> > +     char *envp[] = { reset_str, pid_str, flags_str, NULL };
> >>>>>> > +
> >>>>>> > +     DRM_DEBUG("generating reset event\n");
> >>>>>> > +
> >>>>>> > +     snprintf(pid_str, ARRAY_SIZE(pid_str), "PID=%lu", pid);
> >>>>>> > +     snprintf(flags_str, ARRAY_SIZE(flags_str), "FLAGS=%u",
> >>>>>> flags);
> >>>>>> > + kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE,
> envp);
> >>>>>> > +}
> >>>>>> > +EXPORT_SYMBOL(drm_sysfs_reset_event);
> >>>>>> > +
> >>>>>> >   /**
> >>>>>> >    * drm_sysfs_connector_hotplug_event - generate a DRM uevent
> >>>>>> for any connector
> >>>>>> >    * change
> >>>>>> > diff --git a/include/drm/drm_sysfs.h b/include/drm/drm_sysfs.h
> >>>>>> > index 6273cac44e47..63f00fe8054c 100644
> >>>>>> > --- a/include/drm/drm_sysfs.h
> >>>>>> > +++ b/include/drm/drm_sysfs.h
> >>>>>> > @@ -2,6 +2,8 @@
> >>>>>> >   #ifndef _DRM_SYSFS_H_
> >>>>>> >   #define _DRM_SYSFS_H_
> >>>>>> >
> >>>>>> > +#define DRM_GPU_RESET_FLAG_VRAM_VALID (1 << 0)
> >>>>>> > +
> >>>>>> >   struct drm_device;
> >>>>>> >   struct device;
> >>>>>> >   struct drm_connector;
> >>>>>> > @@ -11,6 +13,7 @@ int drm_class_device_register(struct device
> >>>>>> *dev);
> >>>>>> >   void drm_class_device_unregister(struct device *dev);
> >>>>>> >
> >>>>>> >   void drm_sysfs_hotplug_event(struct drm_device *dev);
> >>>>>> > +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t
> >>>>>> pid, uint32_t reset_flags);
> >>>>>> >   void drm_sysfs_connector_hotplug_event(struct drm_connector
> >>>>>> *connector);
> >>>>>> >   void drm_sysfs_connector_status_event(struct drm_connector
> >>>>>> *connector,
> >>>>>> >                                      struct drm_property
> >>>>>> *property);
> >

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: add work function for GPU reset event
  2022-03-08 17:20       ` Somalapuram, Amaranath
@ 2022-03-08 18:53         ` Andrey Grodzovsky
  0 siblings, 0 replies; 32+ messages in thread
From: Andrey Grodzovsky @ 2022-03-08 18:53 UTC (permalink / raw)
  To: Somalapuram, Amaranath, Sharma, Shashank, Shashank Sharma, amd-gfx
  Cc: Alexander Deucher, amaranath.somalapuram, Christian Koenig

[-- Attachment #1: Type: text/plain, Size: 5672 bytes --]


On 2022-03-08 12:20, Somalapuram, Amaranath wrote:
>
>
> On 3/8/2022 10:00 PM, Sharma, Shashank wrote:
>> Hello Andrey
>>
>> On 3/8/2022 5:26 PM, Andrey Grodzovsky wrote:
>>>
>>> On 2022-03-07 11:26, Shashank Sharma wrote:
>>>> From: Shashank Sharma <shashank.sharma@amd.com>
>>>>
>>>> This patch adds a work function, which will get scheduled
>>>> in event of a GPU reset, and will send a uevent to user with
>>>> some reset context infomration, like a PID and some flags.
>>>
>>>
>>> Where is the actual scheduling of the work function ? Shouldn't
>>> there be a patch for that too ?
>>>
>>
>> Yes, Amar is working on that patch, on top of these patches. They 
>> should be out soon. I thought it was a good idea to get quick 
>> feedback on the basic patches before we build something on top of it.
>>
> schedule_work() will be called in the function amdgpu_do_asic_reset ()
>

I didn't follow closely on the requirements and so I don't know but, 
what about
job timeout that was able to soft recover - do you need to cover this 
too ? Or
in this case no need to restart user application and you hence don't care ?

Andrey


> after getting vram_lost info:
>
> vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
>
> update  amdgpu_reset_event_ctx and call schedule_work()
>
>   * vram_lost
>   * reset_context->job->vm->task_info.process_name
>   * reset_context->job->vm->task_info.pid
>
> Regards,
> S.Amarnath
>> - Shashank
>>
>>> Andrey
>>>
>>>
>>>>
>>>> The userspace can do some recovery and post-processing work
>>>> based on this event.
>>>>
>>>> V2:
>>>> - Changed the name of the work to gpu_reset_event_work
>>>>    (Christian)
>>>> - Added a structure to accommodate some additional information
>>>>    (like a PID and some flags)
>>>>
>>>> Cc: Alexander Deucher <alexander.deucher@amd.com>
>>>> Cc: Christian Koenig <christian.koenig@amd.com>
>>>> Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  7 +++++++
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 +++++++++++++++++++
>>>>   2 files changed, 26 insertions(+)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> index d8b854fcbffa..7df219fe363f 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> @@ -813,6 +813,11 @@ struct amd_powerplay {
>>>>   #define AMDGPU_RESET_MAGIC_NUM 64
>>>>   #define AMDGPU_MAX_DF_PERFMONS 4
>>>>   #define AMDGPU_PRODUCT_NAME_LEN 64
>>>> +struct amdgpu_reset_event_ctx {
>>>> +    uint64_t pid;
>>>> +    uint32_t flags;
>>>> +};
>>>> +
>>>>   struct amdgpu_device {
>>>>       struct device            *dev;
>>>>       struct pci_dev            *pdev;
>>>> @@ -1063,6 +1068,7 @@ struct amdgpu_device {
>>>>       int asic_reset_res;
>>>>       struct work_struct        xgmi_reset_work;
>>>> +    struct work_struct        gpu_reset_event_work;
>>>>       struct list_head        reset_list;
>>>>       long                gfx_timeout;
>>>> @@ -1097,6 +1103,7 @@ struct amdgpu_device {
>>>>       pci_channel_state_t        pci_channel_state;
>>>>       struct amdgpu_reset_control     *reset_cntl;
>>>> +    struct amdgpu_reset_event_ctx   reset_event_ctx;
>>>>       uint32_t ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];
>>>>       bool                ram_is_direct_mapped;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> index ed077de426d9..c43d099da06d 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> @@ -73,6 +73,7 @@
>>>>   #include <linux/pm_runtime.h>
>>>>   #include <drm/drm_drv.h>
>>>> +#include <drm/drm_sysfs.h>
>>>>   MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
>>>>   MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
>>>> @@ -3277,6 +3278,23 @@ bool amdgpu_device_has_dc_support(struct 
>>>> amdgpu_device *adev)
>>>>       return amdgpu_device_asic_has_dc_support(adev->asic_type);
>>>>   }
>>>> +static void amdgpu_device_reset_event_func(struct work_struct 
>>>> *__work)
>>>> +{
>>>> +    struct amdgpu_device *adev = container_of(__work, struct 
>>>> amdgpu_device,
>>>> +                          gpu_reset_event_work);
>>>> +    struct amdgpu_reset_event_ctx *event_ctx = 
>>>> &adev->reset_event_ctx;
>>>> +
>>>> +    /*
>>>> +     * A GPU reset has happened, indicate the userspace and pass the
>>>> +     * following information:
>>>> +     *    - pid of the process involved,
>>>> +     *    - if the VRAM is valid or not,
>>>> +     *    - indicate that userspace may want to collect the ftrace 
>>>> event
>>>> +     * data from the trace event.
>>>> +     */
>>>> +    drm_sysfs_reset_event(&adev->ddev, event_ctx->pid, 
>>>> event_ctx->flags);
>>>> +}
>>>> +
>>>>   static void amdgpu_device_xgmi_reset_func(struct work_struct 
>>>> *__work)
>>>>   {
>>>>       struct amdgpu_device *adev =
>>>> @@ -3525,6 +3543,7 @@ int amdgpu_device_init(struct amdgpu_device 
>>>> *adev,
>>>>                 amdgpu_device_delay_enable_gfx_off);
>>>>       INIT_WORK(&adev->xgmi_reset_work, 
>>>> amdgpu_device_xgmi_reset_func);
>>>> +    INIT_WORK(&adev->gpu_reset_event_work, 
>>>> amdgpu_device_reset_event_func);
>>>>       adev->gfx.gfx_off_req_count = 1;
>>>>       adev->pm.ac_power = power_supply_is_system_supplied() > 0;

[-- Attachment #2: Type: text/html, Size: 10138 bytes --]

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] drm: Add GPU reset sysfs event
  2022-03-08 15:37         ` Somalapuram, Amaranath
@ 2022-03-09  8:02           ` Christian König
  0 siblings, 0 replies; 32+ messages in thread
From: Christian König @ 2022-03-09  8:02 UTC (permalink / raw)
  To: Somalapuram, Amaranath, Sharma, Shashank, Christian König,
	Shashank Sharma, amd-gfx
  Cc: Alexandar Deucher, amaranath.somalapuram

Am 08.03.22 um 16:37 schrieb Somalapuram, Amaranath:
>
> On 3/8/2022 5:26 PM, Sharma, Shashank wrote:
>>
>>
>> On 3/8/2022 11:32 AM, Christian König wrote:
>>> Am 08.03.22 um 10:31 schrieb Sharma, Shashank:
>>>>
>>>>
>>>> On 3/8/2022 8:06 AM, Christian König wrote:
>>>>> Am 07.03.22 um 17:26 schrieb Shashank Sharma:
>>>>>> From: Shashank Sharma <shashank.sharma@amd.com>
>>>>>>
>>>>>> This patch adds a new sysfs event, which will indicate
>>>>>> the userland about a GPU reset, and can also provide
>>>>>> some information like:
>>>>>> - which PID was involved in the GPU reset
>>>>>> - what was the GPU status (using flags)
>>>>>>
>>>>>> This patch also introduces the first flag of the flags
>>>>>> bitmap, which can be appended as and when required.
>>>>>
>>>>> Make sure to CC the dri-devel mailing list when reviewing this.
>>>> Got it,
>>>>
>>>> I was also curious if we want to move the reset_ctx structure 
>>>> itself to DRM layer, like
>>>> drm_reset_event_ctx {
>>>>     u32 pid;
>>>>     u32 flags;
>>>>     char process_name[64];
>>>> };
>>>
>>> I was entertaining that thought as well.
>>>
>>> But if we do this I would go even a step further and also move the 
>>> reset work item into the DRM layer as well.
>>>
>>> You might also look like into migrating the exiting i915 code which 
>>> uses udev to signal GPU resets to this function as well.
>>>
> Hi Christian,
>
> Can we access adev in common drm (even if we can access adev it will 
> not be common code)

No, that won't work.

>
> move work function to drm need to be protected(i.e reset_domain->sem), 
> adding something like reset_sem to drm_device?

Hui what? Why do you think the work function needs any protection? That 
doesn't make much sense.

Regards,
Christian.

>
> Regards,
>
> S.Amarnath
>
>>> Regards,
>>> Christian.
>>
>> That seems like a good idea, let me quickly dive into i915 and check 
>> this out.
>>
>> Shashank
>>>
>>>>
>>>> and then:
>>>> void drm_sysfs_reset_event(struct drm_device *dev, 
>>>> drm_reset_event_ctx *ctx);
>>>>
>>>>>
>>>>>>
>>>>>> Cc: Alexandar Deucher <alexander.deucher@amd.com>
>>>>>> Cc: Christian Koenig <christian.koenig@amd.com>
>>>>>> Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
>>>>>> ---
>>>>>>   drivers/gpu/drm/drm_sysfs.c | 24 ++++++++++++++++++++++++
>>>>>>   include/drm/drm_sysfs.h     |  3 +++
>>>>>>   2 files changed, 27 insertions(+)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/drm_sysfs.c 
>>>>>> b/drivers/gpu/drm/drm_sysfs.c
>>>>>> index 430e00b16eec..52a015161431 100644
>>>>>> --- a/drivers/gpu/drm/drm_sysfs.c
>>>>>> +++ b/drivers/gpu/drm/drm_sysfs.c
>>>>>> @@ -409,6 +409,30 @@ void drm_sysfs_hotplug_event(struct 
>>>>>> drm_device *dev)
>>>>>>   }
>>>>>>   EXPORT_SYMBOL(drm_sysfs_hotplug_event);
>>>>>> +/**
>>>>>> + * drm_sysfs_reset_event - generate a DRM uevent to indicate GPU 
>>>>>> reset
>>>>>> + * @dev: DRM device
>>>>>> + * @pid: The process ID involve with the reset
>>>>>> + * @flags: Any other information about the GPU status
>>>>>> + *
>>>>>> + * Send a uevent for the DRM device specified by @dev. This 
>>>>>> indicates
>>>>>> + * user that a GPU reset has occurred, so that the interested 
>>>>>> client
>>>>>> + * can take any recovery or profiling measure, when required.
>>>>>> + */
>>>>>> +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, 
>>>>>> uint32_t flags)
>>>>>
>>>>> The PID is usually only 32bit, but even better would be to use pid_t.
>>>>>
>>>>>> +{
>>>>>> +    unsigned char pid_str[21], flags_str[15];
>>>>>> +    unsigned char reset_str[] = "RESET=1";
>>>>>> +    char *envp[] = { reset_str, pid_str, flags_str, NULL };
>>>>>> +
>>>>>> +    DRM_DEBUG("generating reset event\n");
>>>>>> +
>>>>>> +    snprintf(pid_str, ARRAY_SIZE(pid_str), "PID=%lu", pid);
>>>>>> +    snprintf(flags_str, ARRAY_SIZE(flags_str), "FLAGS=%u", flags);
>>>>>> + kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp);
>>>>>> +}
>>>>>> +EXPORT_SYMBOL(drm_sysfs_reset_event);
>>>>>> +
>>>>>>   /**
>>>>>>    * drm_sysfs_connector_hotplug_event - generate a DRM uevent 
>>>>>> for any connector
>>>>>>    * change
>>>>>> diff --git a/include/drm/drm_sysfs.h b/include/drm/drm_sysfs.h
>>>>>> index 6273cac44e47..63f00fe8054c 100644
>>>>>> --- a/include/drm/drm_sysfs.h
>>>>>> +++ b/include/drm/drm_sysfs.h
>>>>>> @@ -2,6 +2,8 @@
>>>>>>   #ifndef _DRM_SYSFS_H_
>>>>>>   #define _DRM_SYSFS_H_
>>>>>> +#define DRM_GPU_RESET_FLAG_VRAM_VALID (1 << 0)
>>>>>
>>>>> Probably better to define that the other way around, e.g. 
>>>>> DRM_GPU_RESET_FLAG_VRAM_LOST.
>>>>>
>>>>> Apart from that looks good to me.
>>>>>
>>>> Got it, noted.
>>>> - Shashank
>>>>
>>>>> Christian.
>>>>>
>>>>>> +
>>>>>>   struct drm_device;
>>>>>>   struct device;
>>>>>>   struct drm_connector;
>>>>>> @@ -11,6 +13,7 @@ int drm_class_device_register(struct device *dev);
>>>>>>   void drm_class_device_unregister(struct device *dev);
>>>>>>   void drm_sysfs_hotplug_event(struct drm_device *dev);
>>>>>> +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, 
>>>>>> uint32_t reset_flags);
>>>>>>   void drm_sysfs_connector_hotplug_event(struct drm_connector 
>>>>>> *connector);
>>>>>>   void drm_sysfs_connector_status_event(struct drm_connector 
>>>>>> *connector,
>>>>>>                         struct drm_property *property);
>>>>>
>>>


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] drm: Add GPU reset sysfs event
  2022-03-08 16:40         ` Sharma, Shashank
@ 2022-03-09  8:05           ` Christian König
  0 siblings, 0 replies; 32+ messages in thread
From: Christian König @ 2022-03-09  8:05 UTC (permalink / raw)
  To: Sharma, Shashank, Christian König, Shashank Sharma, amd-gfx
  Cc: Alexandar Deucher, amaranath.somalapuram

Am 08.03.22 um 17:40 schrieb Sharma, Shashank:
>
>
> On 3/8/2022 12:56 PM, Sharma, Shashank wrote:
>>
>>
>> On 3/8/2022 11:32 AM, Christian König wrote:
>>> Am 08.03.22 um 10:31 schrieb Sharma, Shashank:
>>>>
>>>>
>>>> On 3/8/2022 8:06 AM, Christian König wrote:
>>>>> Am 07.03.22 um 17:26 schrieb Shashank Sharma:
>>>>>> From: Shashank Sharma <shashank.sharma@amd.com>
>>>>>>
>>>>>> This patch adds a new sysfs event, which will indicate
>>>>>> the userland about a GPU reset, and can also provide
>>>>>> some information like:
>>>>>> - which PID was involved in the GPU reset
>>>>>> - what was the GPU status (using flags)
>>>>>>
>>>>>> This patch also introduces the first flag of the flags
>>>>>> bitmap, which can be appended as and when required.
>>>>>
>>>>> Make sure to CC the dri-devel mailing list when reviewing this.
>>>> Got it,
>>>>
>>>> I was also curious if we want to move the reset_ctx structure 
>>>> itself to DRM layer, like
>>>> drm_reset_event_ctx {
>>>>     u32 pid;
>>>>     u32 flags;
>>>>     char process_name[64];
>>>> };
>>>
>>> I was entertaining that thought as well.
>>>
>>> But if we do this I would go even a step further and also move the 
>>> reset work item into the DRM layer as well.
>>>
>>> You might also look like into migrating the exiting i915 code which 
>>> uses udev to signal GPU resets to this function as well.
>>>
>>> Regards,
>>> Christian.
>>
>> That seems like a good idea, let me quickly dive into i915 and check 
>> this out.
>>
>> Shashank
>
> I had a quick look at I915, and it looks like both I915 and AMDGPU 
> drivers have very different methods of passing the data to the work 
> function, via different internal structures. Which means it would be 
> much additional work in both the drivers to move the work function 
> itself in the DRM layer.
>
> To me, now it seems like it would be better if we can just provide 
> this interface to send the uevent and its structure, and the drivers 
> can collect their information and pass it to WQ in their own way.
>
> How do you feel about it ?

That does not sounds like a good approach to me. If we add common drm 
functionality then we need to take the existing drivers into account.

What driver specific information does i915 pass to the work function?

Christian.

>
> - Shashank
>
>>>
>>>>
>>>> and then:
>>>> void drm_sysfs_reset_event(struct drm_device *dev, 
>>>> drm_reset_event_ctx *ctx);
>>>>
>>>>>
>>>>>>
>>>>>> Cc: Alexandar Deucher <alexander.deucher@amd.com>
>>>>>> Cc: Christian Koenig <christian.koenig@amd.com>
>>>>>> Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
>>>>>> ---
>>>>>>   drivers/gpu/drm/drm_sysfs.c | 24 ++++++++++++++++++++++++
>>>>>>   include/drm/drm_sysfs.h     |  3 +++
>>>>>>   2 files changed, 27 insertions(+)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/drm_sysfs.c 
>>>>>> b/drivers/gpu/drm/drm_sysfs.c
>>>>>> index 430e00b16eec..52a015161431 100644
>>>>>> --- a/drivers/gpu/drm/drm_sysfs.c
>>>>>> +++ b/drivers/gpu/drm/drm_sysfs.c
>>>>>> @@ -409,6 +409,30 @@ void drm_sysfs_hotplug_event(struct 
>>>>>> drm_device *dev)
>>>>>>   }
>>>>>>   EXPORT_SYMBOL(drm_sysfs_hotplug_event);
>>>>>> +/**
>>>>>> + * drm_sysfs_reset_event - generate a DRM uevent to indicate GPU 
>>>>>> reset
>>>>>> + * @dev: DRM device
>>>>>> + * @pid: The process ID involve with the reset
>>>>>> + * @flags: Any other information about the GPU status
>>>>>> + *
>>>>>> + * Send a uevent for the DRM device specified by @dev. This 
>>>>>> indicates
>>>>>> + * user that a GPU reset has occurred, so that the interested 
>>>>>> client
>>>>>> + * can take any recovery or profiling measure, when required.
>>>>>> + */
>>>>>> +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, 
>>>>>> uint32_t flags)
>>>>>
>>>>> The PID is usually only 32bit, but even better would be to use pid_t.
>>>>>
>>>>>> +{
>>>>>> +    unsigned char pid_str[21], flags_str[15];
>>>>>> +    unsigned char reset_str[] = "RESET=1";
>>>>>> +    char *envp[] = { reset_str, pid_str, flags_str, NULL };
>>>>>> +
>>>>>> +    DRM_DEBUG("generating reset event\n");
>>>>>> +
>>>>>> +    snprintf(pid_str, ARRAY_SIZE(pid_str), "PID=%lu", pid);
>>>>>> +    snprintf(flags_str, ARRAY_SIZE(flags_str), "FLAGS=%u", flags);
>>>>>> + kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp);
>>>>>> +}
>>>>>> +EXPORT_SYMBOL(drm_sysfs_reset_event);
>>>>>> +
>>>>>>   /**
>>>>>>    * drm_sysfs_connector_hotplug_event - generate a DRM uevent 
>>>>>> for any connector
>>>>>>    * change
>>>>>> diff --git a/include/drm/drm_sysfs.h b/include/drm/drm_sysfs.h
>>>>>> index 6273cac44e47..63f00fe8054c 100644
>>>>>> --- a/include/drm/drm_sysfs.h
>>>>>> +++ b/include/drm/drm_sysfs.h
>>>>>> @@ -2,6 +2,8 @@
>>>>>>   #ifndef _DRM_SYSFS_H_
>>>>>>   #define _DRM_SYSFS_H_
>>>>>> +#define DRM_GPU_RESET_FLAG_VRAM_VALID (1 << 0)
>>>>>
>>>>> Probably better to define that the other way around, e.g. 
>>>>> DRM_GPU_RESET_FLAG_VRAM_LOST.
>>>>>
>>>>> Apart from that looks good to me.
>>>>>
>>>> Got it, noted.
>>>> - Shashank
>>>>
>>>>> Christian.
>>>>>
>>>>>> +
>>>>>>   struct drm_device;
>>>>>>   struct device;
>>>>>>   struct drm_connector;
>>>>>> @@ -11,6 +13,7 @@ int drm_class_device_register(struct device *dev);
>>>>>>   void drm_class_device_unregister(struct device *dev);
>>>>>>   void drm_sysfs_hotplug_event(struct drm_device *dev);
>>>>>> +void drm_sysfs_reset_event(struct drm_device *dev, uint64_t pid, 
>>>>>> uint32_t reset_flags);
>>>>>>   void drm_sysfs_connector_hotplug_event(struct drm_connector 
>>>>>> *connector);
>>>>>>   void drm_sysfs_connector_status_event(struct drm_connector 
>>>>>> *connector,
>>>>>>                         struct drm_property *property);
>>>>>
>>>


^ permalink raw reply	[flat|nested] 32+ messages in thread

end of thread, other threads:[~2022-03-09  8:05 UTC | newest]

Thread overview: 32+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-03-07 16:26 [PATCH 1/2] drm: Add GPU reset sysfs event Shashank Sharma
2022-03-07 16:26 ` [PATCH 2/2] drm/amdgpu: add work function for GPU reset event Shashank Sharma
2022-03-07 16:46   ` Somalapuram, Amaranath
2022-03-07 17:05     ` Sharma, Shashank
2022-03-08  7:15   ` Christian König
2022-03-08  9:35     ` Sharma, Shashank
2022-03-08 16:26   ` Andrey Grodzovsky
2022-03-08 16:30     ` Sharma, Shashank
2022-03-08 17:20       ` Somalapuram, Amaranath
2022-03-08 18:53         ` Andrey Grodzovsky
2022-03-08  6:32 ` [PATCH 1/2] drm: Add GPU reset sysfs event Somalapuram, Amaranath
2022-03-08  7:06 ` Christian König
2022-03-08  9:31   ` Sharma, Shashank
2022-03-08 10:32     ` Christian König
2022-03-08 11:56       ` Sharma, Shashank
2022-03-08 15:37         ` Somalapuram, Amaranath
2022-03-09  8:02           ` Christian König
2022-03-08 16:40         ` Sharma, Shashank
2022-03-09  8:05           ` Christian König
2022-03-08 16:25 ` Andrey Grodzovsky
2022-03-08 16:35   ` Sharma, Shashank
2022-03-08 16:36     ` Andrey Grodzovsky
2022-03-08 16:36   ` Lazar, Lijo
2022-03-08 16:39     ` Andrey Grodzovsky
2022-03-08 16:46       ` Sharma, Shashank
2022-03-08 16:55         ` Andrey Grodzovsky
2022-03-08 16:57           ` Sharma, Shashank
2022-03-08 17:04             ` Somalapuram, Amaranath
2022-03-08 17:17               ` Andrey Grodzovsky
2022-03-08 17:27             ` Limonciello, Mario
2022-03-08 18:08               ` Sharma, Shashank
2022-03-08 18:10                 ` Limonciello, Mario

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.