amd-gfx.lists.freedesktop.org archive mirror
 help / color / mirror / Atom feed
* [PATCH Review 1/1] drm/amdgpu: Support setting recover method
@ 2024-04-11 11:11 Stanley.Yang
  2024-04-11 11:17 ` Christian König
  0 siblings, 1 reply; 5+ messages in thread
From: Stanley.Yang @ 2024-04-11 11:11 UTC (permalink / raw)
  To: amd-gfx; +Cc: Stanley.Yang

Don't modify amdgpu gpu recover get operation,
add amdgpu gpu recover set operation to select
reset method, only support mode1 and mode2 currently.

Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  3 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 37 +++++++++++++++++++---
 3 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 9c62552bec34..c82976b2b977 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1151,6 +1151,9 @@ struct amdgpu_device {
 	bool                            debug_largebar;
 	bool                            debug_disable_soft_recovery;
 	bool                            debug_use_vram_fw_buf;
+
+	/* Used to set gpu reset method */
+	int                             recover_method;
 };
 
 static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 3204b8f6edeb..8411a793be18 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3908,6 +3908,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	else
 		adev->asic_type = flags & AMD_ASIC_MASK;
 
+	adev->recover_method = AMD_RESET_METHOD_NONE;
 	adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
 	if (amdgpu_emu_mode == 1)
 		adev->usec_timeout *= 10;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 10832b470448..e388a50d11d9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -965,9 +965,37 @@ static int gpu_recover_get(void *data, u64 *val)
 	return 0;
 }
 
+static int gpu_recover_set(void *data, u64 val)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *)data;
+	struct drm_device *dev = adev_to_drm(adev);
+	int r;
+
+	/* TODO: support mode1 and mode2 currently */
+	if (val == AMD_RESET_METHOD_MODE1 ||
+		val == AMD_RESET_METHOD_MODE2)
+		adev->recover_method = val;
+	else
+		adev->recover_method = AMD_RESET_METHOD_NONE;
+
+	r = pm_runtime_get_sync(dev->dev);
+	if (r < 0) {
+		pm_runtime_put_autosuspend(dev->dev);
+		return 0;
+	}
+
+	if (amdgpu_reset_domain_schedule(adev->reset_domain, &adev->reset_work))
+		flush_work(&adev->reset_work);
+
+	pm_runtime_mark_last_busy(dev->dev);
+	pm_runtime_put_autosuspend(dev->dev);
+
+	return 0;
+}
+
 DEFINE_SHOW_ATTRIBUTE(amdgpu_debugfs_fence_info);
-DEFINE_DEBUGFS_ATTRIBUTE(amdgpu_debugfs_gpu_recover_fops, gpu_recover_get, NULL,
-			 "%lld\n");
+DEFINE_DEBUGFS_ATTRIBUTE(amdgpu_debugfs_gpu_recover_fops, gpu_recover_get,
+			 gpu_recover_set, "%lld\n");
 
 static void amdgpu_debugfs_reset_work(struct work_struct *work)
 {
@@ -978,9 +1006,10 @@ static void amdgpu_debugfs_reset_work(struct work_struct *work)
 
 	memset(&reset_context, 0, sizeof(reset_context));
 
-	reset_context.method = AMD_RESET_METHOD_NONE;
+	reset_context.method = adev->recover_method;
 	reset_context.reset_req_dev = adev;
 	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+	adev->recover_method = AMD_RESET_METHOD_NONE;
 
 	amdgpu_device_gpu_recover(adev, NULL, &reset_context);
 }
@@ -999,7 +1028,7 @@ void amdgpu_debugfs_fence_init(struct amdgpu_device *adev)
 	if (!amdgpu_sriov_vf(adev)) {
 
 		INIT_WORK(&adev->reset_work, amdgpu_debugfs_reset_work);
-		debugfs_create_file("amdgpu_gpu_recover", 0444, root, adev,
+		debugfs_create_file("amdgpu_gpu_recover", 0666, root, adev,
 				    &amdgpu_debugfs_gpu_recover_fops);
 	}
 #endif
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH Review 1/1] drm/amdgpu: Support setting recover method
  2024-04-11 11:11 [PATCH Review 1/1] drm/amdgpu: Support setting recover method Stanley.Yang
@ 2024-04-11 11:17 ` Christian König
  2024-04-11 11:30   ` Yang, Stanley
  0 siblings, 1 reply; 5+ messages in thread
From: Christian König @ 2024-04-11 11:17 UTC (permalink / raw)
  To: Stanley.Yang, amd-gfx

Am 11.04.24 um 13:11 schrieb Stanley.Yang:
> Don't modify amdgpu gpu recover get operation,
> add amdgpu gpu recover set operation to select
> reset method, only support mode1 and mode2 currently.

Well I don't think setting this from userspace is valid.

The reset method to use is determined by the hardware and environment 
(e.g. SRIOV, passthrough, whatever) and can't be chosen simply.

Regards,
Christian.

>
> Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  3 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 37 +++++++++++++++++++---
>   3 files changed, 37 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 9c62552bec34..c82976b2b977 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1151,6 +1151,9 @@ struct amdgpu_device {
>   	bool                            debug_largebar;
>   	bool                            debug_disable_soft_recovery;
>   	bool                            debug_use_vram_fw_buf;
> +
> +	/* Used to set gpu reset method */
> +	int                             recover_method;
>   };
>   
>   static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 3204b8f6edeb..8411a793be18 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3908,6 +3908,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   	else
>   		adev->asic_type = flags & AMD_ASIC_MASK;
>   
> +	adev->recover_method = AMD_RESET_METHOD_NONE;
>   	adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
>   	if (amdgpu_emu_mode == 1)
>   		adev->usec_timeout *= 10;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index 10832b470448..e388a50d11d9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -965,9 +965,37 @@ static int gpu_recover_get(void *data, u64 *val)
>   	return 0;
>   }
>   
> +static int gpu_recover_set(void *data, u64 val)
> +{
> +	struct amdgpu_device *adev = (struct amdgpu_device *)data;
> +	struct drm_device *dev = adev_to_drm(adev);
> +	int r;
> +
> +	/* TODO: support mode1 and mode2 currently */
> +	if (val == AMD_RESET_METHOD_MODE1 ||
> +		val == AMD_RESET_METHOD_MODE2)
> +		adev->recover_method = val;
> +	else
> +		adev->recover_method = AMD_RESET_METHOD_NONE;
> +
> +	r = pm_runtime_get_sync(dev->dev);
> +	if (r < 0) {
> +		pm_runtime_put_autosuspend(dev->dev);
> +		return 0;
> +	}
> +
> +	if (amdgpu_reset_domain_schedule(adev->reset_domain, &adev->reset_work))
> +		flush_work(&adev->reset_work);
> +
> +	pm_runtime_mark_last_busy(dev->dev);
> +	pm_runtime_put_autosuspend(dev->dev);
> +
> +	return 0;
> +}
> +
>   DEFINE_SHOW_ATTRIBUTE(amdgpu_debugfs_fence_info);
> -DEFINE_DEBUGFS_ATTRIBUTE(amdgpu_debugfs_gpu_recover_fops, gpu_recover_get, NULL,
> -			 "%lld\n");
> +DEFINE_DEBUGFS_ATTRIBUTE(amdgpu_debugfs_gpu_recover_fops, gpu_recover_get,
> +			 gpu_recover_set, "%lld\n");
>   
>   static void amdgpu_debugfs_reset_work(struct work_struct *work)
>   {
> @@ -978,9 +1006,10 @@ static void amdgpu_debugfs_reset_work(struct work_struct *work)
>   
>   	memset(&reset_context, 0, sizeof(reset_context));
>   
> -	reset_context.method = AMD_RESET_METHOD_NONE;
> +	reset_context.method = adev->recover_method;
>   	reset_context.reset_req_dev = adev;
>   	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
> +	adev->recover_method = AMD_RESET_METHOD_NONE;
>   
>   	amdgpu_device_gpu_recover(adev, NULL, &reset_context);
>   }
> @@ -999,7 +1028,7 @@ void amdgpu_debugfs_fence_init(struct amdgpu_device *adev)
>   	if (!amdgpu_sriov_vf(adev)) {
>   
>   		INIT_WORK(&adev->reset_work, amdgpu_debugfs_reset_work);
> -		debugfs_create_file("amdgpu_gpu_recover", 0444, root, adev,
> +		debugfs_create_file("amdgpu_gpu_recover", 0666, root, adev,
>   				    &amdgpu_debugfs_gpu_recover_fops);
>   	}
>   #endif


^ permalink raw reply	[flat|nested] 5+ messages in thread

* RE: [PATCH Review 1/1] drm/amdgpu: Support setting recover method
  2024-04-11 11:17 ` Christian König
@ 2024-04-11 11:30   ` Yang, Stanley
  2024-04-11 11:49     ` Christian König
  0 siblings, 1 reply; 5+ messages in thread
From: Yang, Stanley @ 2024-04-11 11:30 UTC (permalink / raw)
  To: Christian König, amd-gfx

[AMD Official Use Only - General]

> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Thursday, April 11, 2024 7:17 PM
> To: Yang, Stanley <Stanley.Yang@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH Review 1/1] drm/amdgpu: Support setting recover method
>
> Am 11.04.24 um 13:11 schrieb Stanley.Yang:
> > Don't modify amdgpu gpu recover get operation, add amdgpu gpu recover
> > set operation to select reset method, only support mode1 and mode2
> > currently.
>
> Well I don't think setting this from userspace is valid.
>
> The reset method to use is determined by the hardware and environment (e.g.
> SRIOV, passthrough, whatever) and can't be chosen simply.

[Stanley]: Agree, the setting is invalid for some devices not supported setting method and devices still reset with default method,
but it's valid for those devices supported setting reset method, user can conduct combination testing like mode1 test then mode2 test without
re-modprobe driver.

Regards,
Stanley
>
> Regards,
> Christian.
>
> >
> > Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  3 ++
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  1 +
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 37
> +++++++++++++++++++---
> >   3 files changed, 37 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > index 9c62552bec34..c82976b2b977 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > @@ -1151,6 +1151,9 @@ struct amdgpu_device {
> >     bool                            debug_largebar;
> >     bool                            debug_disable_soft_recovery;
> >     bool                            debug_use_vram_fw_buf;
> > +
> > +   /* Used to set gpu reset method */
> > +   int                             recover_method;
> >   };
> >
> >   static inline uint32_t amdgpu_ip_version(const struct amdgpu_device
> > *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > index 3204b8f6edeb..8411a793be18 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > @@ -3908,6 +3908,7 @@ int amdgpu_device_init(struct amdgpu_device
> *adev,
> >     else
> >             adev->asic_type = flags & AMD_ASIC_MASK;
> >
> > +   adev->recover_method = AMD_RESET_METHOD_NONE;
> >     adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
> >     if (amdgpu_emu_mode == 1)
> >             adev->usec_timeout *= 10;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> > index 10832b470448..e388a50d11d9 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> > @@ -965,9 +965,37 @@ static int gpu_recover_get(void *data, u64 *val)
> >     return 0;
> >   }
> >
> > +static int gpu_recover_set(void *data, u64 val) {
> > +   struct amdgpu_device *adev = (struct amdgpu_device *)data;
> > +   struct drm_device *dev = adev_to_drm(adev);
> > +   int r;
> > +
> > +   /* TODO: support mode1 and mode2 currently */
> > +   if (val == AMD_RESET_METHOD_MODE1 ||
> > +           val == AMD_RESET_METHOD_MODE2)
> > +           adev->recover_method = val;
> > +   else
> > +           adev->recover_method = AMD_RESET_METHOD_NONE;
> > +
> > +   r = pm_runtime_get_sync(dev->dev);
> > +   if (r < 0) {
> > +           pm_runtime_put_autosuspend(dev->dev);
> > +           return 0;
> > +   }
> > +
> > +   if (amdgpu_reset_domain_schedule(adev->reset_domain, &adev-
> >reset_work))
> > +           flush_work(&adev->reset_work);
> > +
> > +   pm_runtime_mark_last_busy(dev->dev);
> > +   pm_runtime_put_autosuspend(dev->dev);
> > +
> > +   return 0;
> > +}
> > +
> >   DEFINE_SHOW_ATTRIBUTE(amdgpu_debugfs_fence_info);
> > -DEFINE_DEBUGFS_ATTRIBUTE(amdgpu_debugfs_gpu_recover_fops,
> gpu_recover_get, NULL,
> > -                    "%lld\n");
> > +DEFINE_DEBUGFS_ATTRIBUTE(amdgpu_debugfs_gpu_recover_fops,
> gpu_recover_get,
> > +                    gpu_recover_set, "%lld\n");
> >
> >   static void amdgpu_debugfs_reset_work(struct work_struct *work)
> >   {
> > @@ -978,9 +1006,10 @@ static void amdgpu_debugfs_reset_work(struct
> > work_struct *work)
> >
> >     memset(&reset_context, 0, sizeof(reset_context));
> >
> > -   reset_context.method = AMD_RESET_METHOD_NONE;
> > +   reset_context.method = adev->recover_method;
> >     reset_context.reset_req_dev = adev;
> >     set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
> > +   adev->recover_method = AMD_RESET_METHOD_NONE;
> >
> >     amdgpu_device_gpu_recover(adev, NULL, &reset_context);
> >   }
> > @@ -999,7 +1028,7 @@ void amdgpu_debugfs_fence_init(struct
> amdgpu_device *adev)
> >     if (!amdgpu_sriov_vf(adev)) {
> >
> >             INIT_WORK(&adev->reset_work, amdgpu_debugfs_reset_work);
> > -           debugfs_create_file("amdgpu_gpu_recover", 0444, root, adev,
> > +           debugfs_create_file("amdgpu_gpu_recover", 0666, root, adev,
> >                                 &amdgpu_debugfs_gpu_recover_fops);
> >     }
> >   #endif


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH Review 1/1] drm/amdgpu: Support setting recover method
  2024-04-11 11:30   ` Yang, Stanley
@ 2024-04-11 11:49     ` Christian König
  2024-04-11 11:55       ` Christian König
  0 siblings, 1 reply; 5+ messages in thread
From: Christian König @ 2024-04-11 11:49 UTC (permalink / raw)
  To: Yang, Stanley, amd-gfx

Am 11.04.24 um 13:30 schrieb Yang, Stanley:
> [AMD Official Use Only - General]
>
>> -----Original Message-----
>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>> Sent: Thursday, April 11, 2024 7:17 PM
>> To: Yang, Stanley <Stanley.Yang@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH Review 1/1] drm/amdgpu: Support setting recover method
>>
>> Am 11.04.24 um 13:11 schrieb Stanley.Yang:
>>> Don't modify amdgpu gpu recover get operation, add amdgpu gpu recover
>>> set operation to select reset method, only support mode1 and mode2
>>> currently.
>> Well I don't think setting this from userspace is valid.
>>
>> The reset method to use is determined by the hardware and environment (e.g.
>> SRIOV, passthrough, whatever) and can't be chosen simply.
> [Stanley]: Agree, the setting is invalid for some devices not supported setting method and devices still reset with default method,
> but it's valid for those devices supported setting reset method, user can conduct combination testing like mode1 test then mode2 test without
> re-modprobe driver.

Well and the user could also shoot himself into the foot.

I really don't think that this is a valuable functionality.

Regards,
Christian.

>
> Regards,
> Stanley
>> Regards,
>> Christian.
>>
>>> Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  3 ++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  1 +
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 37
>> +++++++++++++++++++---
>>>    3 files changed, 37 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index 9c62552bec34..c82976b2b977 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -1151,6 +1151,9 @@ struct amdgpu_device {
>>>      bool                            debug_largebar;
>>>      bool                            debug_disable_soft_recovery;
>>>      bool                            debug_use_vram_fw_buf;
>>> +
>>> +   /* Used to set gpu reset method */
>>> +   int                             recover_method;
>>>    };
>>>
>>>    static inline uint32_t amdgpu_ip_version(const struct amdgpu_device
>>> *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index 3204b8f6edeb..8411a793be18 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -3908,6 +3908,7 @@ int amdgpu_device_init(struct amdgpu_device
>> *adev,
>>>      else
>>>              adev->asic_type = flags & AMD_ASIC_MASK;
>>>
>>> +   adev->recover_method = AMD_RESET_METHOD_NONE;
>>>      adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
>>>      if (amdgpu_emu_mode == 1)
>>>              adev->usec_timeout *= 10;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>> index 10832b470448..e388a50d11d9 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>> @@ -965,9 +965,37 @@ static int gpu_recover_get(void *data, u64 *val)
>>>      return 0;
>>>    }
>>>
>>> +static int gpu_recover_set(void *data, u64 val) {
>>> +   struct amdgpu_device *adev = (struct amdgpu_device *)data;
>>> +   struct drm_device *dev = adev_to_drm(adev);
>>> +   int r;
>>> +
>>> +   /* TODO: support mode1 and mode2 currently */
>>> +   if (val == AMD_RESET_METHOD_MODE1 ||
>>> +           val == AMD_RESET_METHOD_MODE2)
>>> +           adev->recover_method = val;
>>> +   else
>>> +           adev->recover_method = AMD_RESET_METHOD_NONE;
>>> +
>>> +   r = pm_runtime_get_sync(dev->dev);
>>> +   if (r < 0) {
>>> +           pm_runtime_put_autosuspend(dev->dev);
>>> +           return 0;
>>> +   }
>>> +
>>> +   if (amdgpu_reset_domain_schedule(adev->reset_domain, &adev-
>>> reset_work))
>>> +           flush_work(&adev->reset_work);
>>> +
>>> +   pm_runtime_mark_last_busy(dev->dev);
>>> +   pm_runtime_put_autosuspend(dev->dev);
>>> +
>>> +   return 0;
>>> +}
>>> +
>>>    DEFINE_SHOW_ATTRIBUTE(amdgpu_debugfs_fence_info);
>>> -DEFINE_DEBUGFS_ATTRIBUTE(amdgpu_debugfs_gpu_recover_fops,
>> gpu_recover_get, NULL,
>>> -                    "%lld\n");
>>> +DEFINE_DEBUGFS_ATTRIBUTE(amdgpu_debugfs_gpu_recover_fops,
>> gpu_recover_get,
>>> +                    gpu_recover_set, "%lld\n");
>>>
>>>    static void amdgpu_debugfs_reset_work(struct work_struct *work)
>>>    {
>>> @@ -978,9 +1006,10 @@ static void amdgpu_debugfs_reset_work(struct
>>> work_struct *work)
>>>
>>>      memset(&reset_context, 0, sizeof(reset_context));
>>>
>>> -   reset_context.method = AMD_RESET_METHOD_NONE;
>>> +   reset_context.method = adev->recover_method;
>>>      reset_context.reset_req_dev = adev;
>>>      set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
>>> +   adev->recover_method = AMD_RESET_METHOD_NONE;
>>>
>>>      amdgpu_device_gpu_recover(adev, NULL, &reset_context);
>>>    }
>>> @@ -999,7 +1028,7 @@ void amdgpu_debugfs_fence_init(struct
>> amdgpu_device *adev)
>>>      if (!amdgpu_sriov_vf(adev)) {
>>>
>>>              INIT_WORK(&adev->reset_work, amdgpu_debugfs_reset_work);
>>> -           debugfs_create_file("amdgpu_gpu_recover", 0444, root, adev,
>>> +           debugfs_create_file("amdgpu_gpu_recover", 0666, root, adev,
>>>                                  &amdgpu_debugfs_gpu_recover_fops);
>>>      }
>>>    #endif


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH Review 1/1] drm/amdgpu: Support setting recover method
  2024-04-11 11:49     ` Christian König
@ 2024-04-11 11:55       ` Christian König
  0 siblings, 0 replies; 5+ messages in thread
From: Christian König @ 2024-04-11 11:55 UTC (permalink / raw)
  To: Yang, Stanley, amd-gfx

Am 11.04.24 um 13:49 schrieb Christian König:
> Am 11.04.24 um 13:30 schrieb Yang, Stanley:
>> [AMD Official Use Only - General]
>>
>>> -----Original Message-----
>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>> Sent: Thursday, April 11, 2024 7:17 PM
>>> To: Yang, Stanley <Stanley.Yang@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH Review 1/1] drm/amdgpu: Support setting recover 
>>> method
>>>
>>> Am 11.04.24 um 13:11 schrieb Stanley.Yang:
>>>> Don't modify amdgpu gpu recover get operation, add amdgpu gpu recover
>>>> set operation to select reset method, only support mode1 and mode2
>>>> currently.
>>> Well I don't think setting this from userspace is valid.
>>>
>>> The reset method to use is determined by the hardware and 
>>> environment (e.g.
>>> SRIOV, passthrough, whatever) and can't be chosen simply.
>> [Stanley]: Agree, the setting is invalid for some devices not 
>> supported setting method and devices still reset with default method,
>> but it's valid for those devices supported setting reset method, user 
>> can conduct combination testing like mode1 test then mode2 test without
>> re-modprobe driver.
>
> Well and the user could also shoot himself into the foot.
>
> I really don't think that this is a valuable functionality.

To make clear what I mean: What you could do is to make the module 
parameter writeable.

In this case the hardware code still decides which reset method to use 
based on the module parameter in the moment the reset is requested.

That would also avoid re-loading the driver.

Regards,
Christian.

>
> Regards,
> Christian.
>
>>
>> Regards,
>> Stanley
>>> Regards,
>>> Christian.
>>>
>>>> Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
>>>> ---
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  3 ++
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  1 +
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 37
>>> +++++++++++++++++++---
>>>>    3 files changed, 37 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> index 9c62552bec34..c82976b2b977 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> @@ -1151,6 +1151,9 @@ struct amdgpu_device {
>>>>      bool                            debug_largebar;
>>>>      bool debug_disable_soft_recovery;
>>>>      bool                            debug_use_vram_fw_buf;
>>>> +
>>>> +   /* Used to set gpu reset method */
>>>> +   int                             recover_method;
>>>>    };
>>>>
>>>>    static inline uint32_t amdgpu_ip_version(const struct amdgpu_device
>>>> *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> index 3204b8f6edeb..8411a793be18 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> @@ -3908,6 +3908,7 @@ int amdgpu_device_init(struct amdgpu_device
>>> *adev,
>>>>      else
>>>>              adev->asic_type = flags & AMD_ASIC_MASK;
>>>>
>>>> +   adev->recover_method = AMD_RESET_METHOD_NONE;
>>>>      adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
>>>>      if (amdgpu_emu_mode == 1)
>>>>              adev->usec_timeout *= 10;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>> index 10832b470448..e388a50d11d9 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>> @@ -965,9 +965,37 @@ static int gpu_recover_get(void *data, u64 *val)
>>>>      return 0;
>>>>    }
>>>>
>>>> +static int gpu_recover_set(void *data, u64 val) {
>>>> +   struct amdgpu_device *adev = (struct amdgpu_device *)data;
>>>> +   struct drm_device *dev = adev_to_drm(adev);
>>>> +   int r;
>>>> +
>>>> +   /* TODO: support mode1 and mode2 currently */
>>>> +   if (val == AMD_RESET_METHOD_MODE1 ||
>>>> +           val == AMD_RESET_METHOD_MODE2)
>>>> +           adev->recover_method = val;
>>>> +   else
>>>> +           adev->recover_method = AMD_RESET_METHOD_NONE;
>>>> +
>>>> +   r = pm_runtime_get_sync(dev->dev);
>>>> +   if (r < 0) {
>>>> +           pm_runtime_put_autosuspend(dev->dev);
>>>> +           return 0;
>>>> +   }
>>>> +
>>>> +   if (amdgpu_reset_domain_schedule(adev->reset_domain, &adev-
>>>> reset_work))
>>>> +           flush_work(&adev->reset_work);
>>>> +
>>>> +   pm_runtime_mark_last_busy(dev->dev);
>>>> +   pm_runtime_put_autosuspend(dev->dev);
>>>> +
>>>> +   return 0;
>>>> +}
>>>> +
>>>>    DEFINE_SHOW_ATTRIBUTE(amdgpu_debugfs_fence_info);
>>>> -DEFINE_DEBUGFS_ATTRIBUTE(amdgpu_debugfs_gpu_recover_fops,
>>> gpu_recover_get, NULL,
>>>> -                    "%lld\n");
>>>> +DEFINE_DEBUGFS_ATTRIBUTE(amdgpu_debugfs_gpu_recover_fops,
>>> gpu_recover_get,
>>>> +                    gpu_recover_set, "%lld\n");
>>>>
>>>>    static void amdgpu_debugfs_reset_work(struct work_struct *work)
>>>>    {
>>>> @@ -978,9 +1006,10 @@ static void amdgpu_debugfs_reset_work(struct
>>>> work_struct *work)
>>>>
>>>>      memset(&reset_context, 0, sizeof(reset_context));
>>>>
>>>> -   reset_context.method = AMD_RESET_METHOD_NONE;
>>>> +   reset_context.method = adev->recover_method;
>>>>      reset_context.reset_req_dev = adev;
>>>>      set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
>>>> +   adev->recover_method = AMD_RESET_METHOD_NONE;
>>>>
>>>>      amdgpu_device_gpu_recover(adev, NULL, &reset_context);
>>>>    }
>>>> @@ -999,7 +1028,7 @@ void amdgpu_debugfs_fence_init(struct
>>> amdgpu_device *adev)
>>>>      if (!amdgpu_sriov_vf(adev)) {
>>>>
>>>>              INIT_WORK(&adev->reset_work, amdgpu_debugfs_reset_work);
>>>> -           debugfs_create_file("amdgpu_gpu_recover", 0444, root, 
>>>> adev,
>>>> +           debugfs_create_file("amdgpu_gpu_recover", 0666, root, 
>>>> adev,
>>>> &amdgpu_debugfs_gpu_recover_fops);
>>>>      }
>>>>    #endif
>


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2024-04-11 11:55 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-04-11 11:11 [PATCH Review 1/1] drm/amdgpu: Support setting recover method Stanley.Yang
2024-04-11 11:17 ` Christian König
2024-04-11 11:30   ` Yang, Stanley
2024-04-11 11:49     ` Christian König
2024-04-11 11:55       ` Christian König

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).