All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdgpu: avoid over-handle of fence driver fini in s3 test
@ 2021-07-29 10:49 Guchun Chen
  2021-07-29 11:11 ` Christian König
  0 siblings, 1 reply; 6+ messages in thread
From: Guchun Chen @ 2021-07-29 10:49 UTC (permalink / raw)
  To: amd-gfx, Likun.Gao, christian.koenig, Hawking.Zhang, alexander.deucher
  Cc: Guchun Chen

In amdgpu_fence_driver_hw_fini, no need to call drm_sched_fini to stop
scheduler in s3 test, otherwise, fence errors will occur after resume.
So introduce a new parameter to distingiush the case in this API.

Fixes: cd87a6dcf6af drm/amdgpu: adjust fence driver enable sequence
Signed-off-by: Guchun Chen <guchun.chen@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 8 +++++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   | 2 +-
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index b1d2dc39e8be..aaff8ebbb7dc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3844,7 +3844,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
 		else
 			drm_atomic_helper_shutdown(adev_to_drm(adev));
 	}
-	amdgpu_fence_driver_hw_fini(adev);
+	amdgpu_fence_driver_hw_fini(adev, false);
 
 	if (adev->pm_sysfs_en)
 		amdgpu_pm_sysfs_fini(adev);
@@ -3941,7 +3941,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
 	/* evict vram memory */
 	amdgpu_bo_evict_vram(adev);
 
-	amdgpu_fence_driver_hw_fini(adev);
+	amdgpu_fence_driver_hw_fini(adev, adev->in_suspend);
 
 	amdgpu_device_ip_suspend_phase2(adev);
 	/* evict remaining vram memory
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 49c5c7331c53..7e6a73c2919d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -515,14 +515,15 @@ int amdgpu_fence_driver_init(struct amdgpu_device *adev)
 }
 
 /**
- * amdgpu_fence_driver_fini - tear down the fence driver
+ * amdgpu_fence_driver_hw_fini - tear down the fence driver
  * for all possible rings.
  *
  * @adev: amdgpu device pointer
+ * @in_reset: indicator to distingiush device removal case or s3 case
  *
  * Tear down the fence driver for all possible rings (all asics).
  */
-void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev)
+void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev, bool in_reset)
 {
 	int i, r;
 
@@ -531,8 +532,9 @@ void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev)
 
 		if (!ring || !ring->fence_drv.initialized)
 			continue;
-		if (!ring->no_scheduler)
+		if (!ring->no_scheduler && !in_reset)
 			drm_sched_fini(&ring->sched);
+
 		/* You can't wait for HW to signal if it's gone */
 		if (!drm_dev_is_unplugged(&adev->ddev))
 			r = amdgpu_fence_wait_empty(ring);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 27adffa7658d..42cbecfc26a3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -115,7 +115,7 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
 int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring,
 				   struct amdgpu_irq_src *irq_src,
 				   unsigned irq_type);
-void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev);
+void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev, bool in_reset);
 void amdgpu_fence_driver_sw_fini(struct amdgpu_device *adev);
 void amdgpu_fence_driver_hw_init(struct amdgpu_device *adev);
 int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence,
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH] drm/amdgpu: avoid over-handle of fence driver fini in s3 test
  2021-07-29 10:49 [PATCH] drm/amdgpu: avoid over-handle of fence driver fini in s3 test Guchun Chen
@ 2021-07-29 11:11 ` Christian König
  2021-07-29 12:39   ` Chen, Guchun
  0 siblings, 1 reply; 6+ messages in thread
From: Christian König @ 2021-07-29 11:11 UTC (permalink / raw)
  To: Guchun Chen, amd-gfx, Likun.Gao, Hawking.Zhang, alexander.deucher

Am 29.07.21 um 12:49 schrieb Guchun Chen:
> In amdgpu_fence_driver_hw_fini, no need to call drm_sched_fini to stop
> scheduler in s3 test, otherwise, fence errors will occur after resume.
> So introduce a new parameter to distingiush the case in this API.

NAK, the problem is rather that drm_sched_fini() is part of the sw 
shutdown and should never be called during hw_fini.

Christian.

>
> Fixes: cd87a6dcf6af drm/amdgpu: adjust fence driver enable sequence
> Signed-off-by: Guchun Chen <guchun.chen@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 8 +++++---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   | 2 +-
>   3 files changed, 8 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index b1d2dc39e8be..aaff8ebbb7dc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3844,7 +3844,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
>   		else
>   			drm_atomic_helper_shutdown(adev_to_drm(adev));
>   	}
> -	amdgpu_fence_driver_hw_fini(adev);
> +	amdgpu_fence_driver_hw_fini(adev, false);
>   
>   	if (adev->pm_sysfs_en)
>   		amdgpu_pm_sysfs_fini(adev);
> @@ -3941,7 +3941,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
>   	/* evict vram memory */
>   	amdgpu_bo_evict_vram(adev);
>   
> -	amdgpu_fence_driver_hw_fini(adev);
> +	amdgpu_fence_driver_hw_fini(adev, adev->in_suspend);
>   
>   	amdgpu_device_ip_suspend_phase2(adev);
>   	/* evict remaining vram memory
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index 49c5c7331c53..7e6a73c2919d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -515,14 +515,15 @@ int amdgpu_fence_driver_init(struct amdgpu_device *adev)
>   }
>   
>   /**
> - * amdgpu_fence_driver_fini - tear down the fence driver
> + * amdgpu_fence_driver_hw_fini - tear down the fence driver
>    * for all possible rings.
>    *
>    * @adev: amdgpu device pointer
> + * @in_reset: indicator to distingiush device removal case or s3 case
>    *
>    * Tear down the fence driver for all possible rings (all asics).
>    */
> -void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev)
> +void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev, bool in_reset)
>   {
>   	int i, r;
>   
> @@ -531,8 +532,9 @@ void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev)
>   
>   		if (!ring || !ring->fence_drv.initialized)
>   			continue;
> -		if (!ring->no_scheduler)
> +		if (!ring->no_scheduler && !in_reset)
>   			drm_sched_fini(&ring->sched);
> +
>   		/* You can't wait for HW to signal if it's gone */
>   		if (!drm_dev_is_unplugged(&adev->ddev))
>   			r = amdgpu_fence_wait_empty(ring);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 27adffa7658d..42cbecfc26a3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -115,7 +115,7 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
>   int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring,
>   				   struct amdgpu_irq_src *irq_src,
>   				   unsigned irq_type);
> -void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev);
> +void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev, bool in_reset);
>   void amdgpu_fence_driver_sw_fini(struct amdgpu_device *adev);
>   void amdgpu_fence_driver_hw_init(struct amdgpu_device *adev);
>   int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

* RE: [PATCH] drm/amdgpu: avoid over-handle of fence driver fini in s3 test
  2021-07-29 11:11 ` Christian König
@ 2021-07-29 12:39   ` Chen, Guchun
  2021-07-29 12:50     ` Christian König
  0 siblings, 1 reply; 6+ messages in thread
From: Chen, Guchun @ 2021-07-29 12:39 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx, Gao, Likun, Zhang, Hawking, Deucher,
	Alexander

[Public]

Hi Christian,

Thanks for your feedback.

Originally, drm_sched_fini is part of amdgpu_fence_driver_hw_fini, I did not move it.
Former patch " cd87a6dcf6af drm/amdgpu: adjust fence driver enable sequence " has dropped amdgpu_fence_driver_suspend, and called amdgpu_fence_driver_hw_fini instead in function amdgpu_device_suspend. I checked the code difference between  amdgpu_fence_driver_hw_fini and amdgpu_device_suspend, they are almost the same except drm_sched_fini part, so I think we can leave it as it is, while skipping the execution of drm_sched_fini in suspend/resume case.

Regards,
Guchun

-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com> 
Sent: Thursday, July 29, 2021 7:11 PM
To: Chen, Guchun <Guchun.Chen@amd.com>; amd-gfx@lists.freedesktop.org; Gao, Likun <Likun.Gao@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>
Subject: Re: [PATCH] drm/amdgpu: avoid over-handle of fence driver fini in s3 test

Am 29.07.21 um 12:49 schrieb Guchun Chen:
> In amdgpu_fence_driver_hw_fini, no need to call drm_sched_fini to stop 
> scheduler in s3 test, otherwise, fence errors will occur after resume.
> So introduce a new parameter to distingiush the case in this API.

NAK, the problem is rather that drm_sched_fini() is part of the sw shutdown and should never be called during hw_fini.

Christian.

>
> Fixes: cd87a6dcf6af drm/amdgpu: adjust fence driver enable sequence
> Signed-off-by: Guchun Chen <guchun.chen@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 8 +++++---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   | 2 +-
>   3 files changed, 8 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index b1d2dc39e8be..aaff8ebbb7dc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3844,7 +3844,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
>   		else
>   			drm_atomic_helper_shutdown(adev_to_drm(adev));
>   	}
> -	amdgpu_fence_driver_hw_fini(adev);
> +	amdgpu_fence_driver_hw_fini(adev, false);
>   
>   	if (adev->pm_sysfs_en)
>   		amdgpu_pm_sysfs_fini(adev);
> @@ -3941,7 +3941,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
>   	/* evict vram memory */
>   	amdgpu_bo_evict_vram(adev);
>   
> -	amdgpu_fence_driver_hw_fini(adev);
> +	amdgpu_fence_driver_hw_fini(adev, adev->in_suspend);
>   
>   	amdgpu_device_ip_suspend_phase2(adev);
>   	/* evict remaining vram memory
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index 49c5c7331c53..7e6a73c2919d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -515,14 +515,15 @@ int amdgpu_fence_driver_init(struct amdgpu_device *adev)
>   }
>   
>   /**
> - * amdgpu_fence_driver_fini - tear down the fence driver
> + * amdgpu_fence_driver_hw_fini - tear down the fence driver
>    * for all possible rings.
>    *
>    * @adev: amdgpu device pointer
> + * @in_reset: indicator to distingiush device removal case or s3 case
>    *
>    * Tear down the fence driver for all possible rings (all asics).
>    */
> -void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev)
> +void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev, bool 
> +in_reset)
>   {
>   	int i, r;
>   
> @@ -531,8 +532,9 @@ void amdgpu_fence_driver_hw_fini(struct 
> amdgpu_device *adev)
>   
>   		if (!ring || !ring->fence_drv.initialized)
>   			continue;
> -		if (!ring->no_scheduler)
> +		if (!ring->no_scheduler && !in_reset)
>   			drm_sched_fini(&ring->sched);
> +
>   		/* You can't wait for HW to signal if it's gone */
>   		if (!drm_dev_is_unplugged(&adev->ddev))
>   			r = amdgpu_fence_wait_empty(ring); diff --git 
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 27adffa7658d..42cbecfc26a3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -115,7 +115,7 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
>   int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring,
>   				   struct amdgpu_irq_src *irq_src,
>   				   unsigned irq_type);
> -void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev);
> +void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev, bool 
> +in_reset);
>   void amdgpu_fence_driver_sw_fini(struct amdgpu_device *adev);
>   void amdgpu_fence_driver_hw_init(struct amdgpu_device *adev);
>   int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence 
> **fence,
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] drm/amdgpu: avoid over-handle of fence driver fini in s3 test
  2021-07-29 12:39   ` Chen, Guchun
@ 2021-07-29 12:50     ` Christian König
  2021-07-29 12:56       ` Chen, Guchun
  0 siblings, 1 reply; 6+ messages in thread
From: Christian König @ 2021-07-29 12:50 UTC (permalink / raw)
  To: Chen, Guchun, Koenig, Christian, amd-gfx, Gao, Likun, Zhang,
	Hawking, Deucher, Alexander

Hi Guchun,

see below.

Am 29.07.21 um 14:39 schrieb Chen, Guchun:
> [Public]
>
> Hi Christian,
>
> Thanks for your feedback.
>
> Originally, drm_sched_fini is part of amdgpu_fence_driver_hw_fini, I did not move it.

Yeah, not saying that this is your fault, you should just clean that up 
more thoughtfully.

> Former patch " cd87a6dcf6af drm/amdgpu: adjust fence driver enable sequence " has dropped amdgpu_fence_driver_suspend, and called amdgpu_fence_driver_hw_fini instead in function amdgpu_device_suspend. I checked the code difference between  amdgpu_fence_driver_hw_fini and amdgpu_device_suspend, they are almost the same except drm_sched_fini part, so I think we can leave it as it is, while skipping the execution of drm_sched_fini in suspend/resume case.

And exactly that's a bad idea and the reason why I already said during 
the review of patch "cd87a6dcf6af drm/amdgpu: adjust fence driver enable 
sequence" that the callers of those functions need to be adjusted instead.

1. If not already done please rename the functions as Hawking suggested 
as well, they should be amdgpu_fence_driver_hw_(init|fini) and 
amdgpu_fence_driver_sw_(init|fini).

2. Remove drm_sched_fini() from amdgpu_fence_driver_hw_fini() and add 
that to sw_fini instead.

3. Adjust the callers so that we have the same functionality as before. 
E.g. by calling both hw_fini and sw_fini at the same time.

Regards,
Christian.

>
> Regards,
> Guchun
>
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Thursday, July 29, 2021 7:11 PM
> To: Chen, Guchun <Guchun.Chen@amd.com>; amd-gfx@lists.freedesktop.org; Gao, Likun <Likun.Gao@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>
> Subject: Re: [PATCH] drm/amdgpu: avoid over-handle of fence driver fini in s3 test
>
> Am 29.07.21 um 12:49 schrieb Guchun Chen:
>> In amdgpu_fence_driver_hw_fini, no need to call drm_sched_fini to stop
>> scheduler in s3 test, otherwise, fence errors will occur after resume.
>> So introduce a new parameter to distingiush the case in this API.
> NAK, the problem is rather that drm_sched_fini() is part of the sw shutdown and should never be called during hw_fini.
>
> Christian.
>
>> Fixes: cd87a6dcf6af drm/amdgpu: adjust fence driver enable sequence
>> Signed-off-by: Guchun Chen <guchun.chen@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++--
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 8 +++++---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   | 2 +-
>>    3 files changed, 8 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index b1d2dc39e8be..aaff8ebbb7dc 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -3844,7 +3844,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
>>    		else
>>    			drm_atomic_helper_shutdown(adev_to_drm(adev));
>>    	}
>> -	amdgpu_fence_driver_hw_fini(adev);
>> +	amdgpu_fence_driver_hw_fini(adev, false);
>>    
>>    	if (adev->pm_sysfs_en)
>>    		amdgpu_pm_sysfs_fini(adev);
>> @@ -3941,7 +3941,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
>>    	/* evict vram memory */
>>    	amdgpu_bo_evict_vram(adev);
>>    
>> -	amdgpu_fence_driver_hw_fini(adev);
>> +	amdgpu_fence_driver_hw_fini(adev, adev->in_suspend);
>>    
>>    	amdgpu_device_ip_suspend_phase2(adev);
>>    	/* evict remaining vram memory
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> index 49c5c7331c53..7e6a73c2919d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> @@ -515,14 +515,15 @@ int amdgpu_fence_driver_init(struct amdgpu_device *adev)
>>    }
>>    
>>    /**
>> - * amdgpu_fence_driver_fini - tear down the fence driver
>> + * amdgpu_fence_driver_hw_fini - tear down the fence driver
>>     * for all possible rings.
>>     *
>>     * @adev: amdgpu device pointer
>> + * @in_reset: indicator to distingiush device removal case or s3 case
>>     *
>>     * Tear down the fence driver for all possible rings (all asics).
>>     */
>> -void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev)
>> +void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev, bool
>> +in_reset)
>>    {
>>    	int i, r;
>>    
>> @@ -531,8 +532,9 @@ void amdgpu_fence_driver_hw_fini(struct
>> amdgpu_device *adev)
>>    
>>    		if (!ring || !ring->fence_drv.initialized)
>>    			continue;
>> -		if (!ring->no_scheduler)
>> +		if (!ring->no_scheduler && !in_reset)
>>    			drm_sched_fini(&ring->sched);
>> +
>>    		/* You can't wait for HW to signal if it's gone */
>>    		if (!drm_dev_is_unplugged(&adev->ddev))
>>    			r = amdgpu_fence_wait_empty(ring); diff --git
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> index 27adffa7658d..42cbecfc26a3 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> @@ -115,7 +115,7 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
>>    int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring,
>>    				   struct amdgpu_irq_src *irq_src,
>>    				   unsigned irq_type);
>> -void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev);
>> +void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev, bool
>> +in_reset);
>>    void amdgpu_fence_driver_sw_fini(struct amdgpu_device *adev);
>>    void amdgpu_fence_driver_hw_init(struct amdgpu_device *adev);
>>    int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence
>> **fence,
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

* RE: [PATCH] drm/amdgpu: avoid over-handle of fence driver fini in s3 test
  2021-07-29 12:50     ` Christian König
@ 2021-07-29 12:56       ` Chen, Guchun
  2021-07-29 12:59         ` Christian König
  0 siblings, 1 reply; 6+ messages in thread
From: Chen, Guchun @ 2021-07-29 12:56 UTC (permalink / raw)
  To: Christian König, Koenig, Christian, amd-gfx, Gao, Likun,
	Zhang, Hawking, Deucher, Alexander

[Public]

Got you, so the target is to take this chance to make the code logic more clear instead of making it just workable on top of mixed functionality code.

I will post a more reasonable patch later on to address this. Thank you.

Regards,
Guchun

-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@gmail.com> 
Sent: Thursday, July 29, 2021 8:50 PM
To: Chen, Guchun <Guchun.Chen@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; amd-gfx@lists.freedesktop.org; Gao, Likun <Likun.Gao@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>
Subject: Re: [PATCH] drm/amdgpu: avoid over-handle of fence driver fini in s3 test

Hi Guchun,

see below.

Am 29.07.21 um 14:39 schrieb Chen, Guchun:
> [Public]
>
> Hi Christian,
>
> Thanks for your feedback.
>
> Originally, drm_sched_fini is part of amdgpu_fence_driver_hw_fini, I did not move it.

Yeah, not saying that this is your fault, you should just clean that up more thoughtfully.

> Former patch " cd87a6dcf6af drm/amdgpu: adjust fence driver enable sequence " has dropped amdgpu_fence_driver_suspend, and called amdgpu_fence_driver_hw_fini instead in function amdgpu_device_suspend. I checked the code difference between  amdgpu_fence_driver_hw_fini and amdgpu_device_suspend, they are almost the same except drm_sched_fini part, so I think we can leave it as it is, while skipping the execution of drm_sched_fini in suspend/resume case.

And exactly that's a bad idea and the reason why I already said during the review of patch "cd87a6dcf6af drm/amdgpu: adjust fence driver enable sequence" that the callers of those functions need to be adjusted instead.

1. If not already done please rename the functions as Hawking suggested as well, they should be amdgpu_fence_driver_hw_(init|fini) and amdgpu_fence_driver_sw_(init|fini).

2. Remove drm_sched_fini() from amdgpu_fence_driver_hw_fini() and add that to sw_fini instead.

3. Adjust the callers so that we have the same functionality as before. 
E.g. by calling both hw_fini and sw_fini at the same time.

Regards,
Christian.

>
> Regards,
> Guchun
>
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Thursday, July 29, 2021 7:11 PM
> To: Chen, Guchun <Guchun.Chen@amd.com>; amd-gfx@lists.freedesktop.org; 
> Gao, Likun <Likun.Gao@amd.com>; Zhang, Hawking 
> <Hawking.Zhang@amd.com>; Deucher, Alexander 
> <Alexander.Deucher@amd.com>
> Subject: Re: [PATCH] drm/amdgpu: avoid over-handle of fence driver 
> fini in s3 test
>
> Am 29.07.21 um 12:49 schrieb Guchun Chen:
>> In amdgpu_fence_driver_hw_fini, no need to call drm_sched_fini to 
>> stop scheduler in s3 test, otherwise, fence errors will occur after resume.
>> So introduce a new parameter to distingiush the case in this API.
> NAK, the problem is rather that drm_sched_fini() is part of the sw shutdown and should never be called during hw_fini.
>
> Christian.
>
>> Fixes: cd87a6dcf6af drm/amdgpu: adjust fence driver enable sequence
>> Signed-off-by: Guchun Chen <guchun.chen@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++--
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 8 +++++---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   | 2 +-
>>    3 files changed, 8 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index b1d2dc39e8be..aaff8ebbb7dc 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -3844,7 +3844,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
>>    		else
>>    			drm_atomic_helper_shutdown(adev_to_drm(adev));
>>    	}
>> -	amdgpu_fence_driver_hw_fini(adev);
>> +	amdgpu_fence_driver_hw_fini(adev, false);
>>    
>>    	if (adev->pm_sysfs_en)
>>    		amdgpu_pm_sysfs_fini(adev);
>> @@ -3941,7 +3941,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
>>    	/* evict vram memory */
>>    	amdgpu_bo_evict_vram(adev);
>>    
>> -	amdgpu_fence_driver_hw_fini(adev);
>> +	amdgpu_fence_driver_hw_fini(adev, adev->in_suspend);
>>    
>>    	amdgpu_device_ip_suspend_phase2(adev);
>>    	/* evict remaining vram memory
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> index 49c5c7331c53..7e6a73c2919d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> @@ -515,14 +515,15 @@ int amdgpu_fence_driver_init(struct amdgpu_device *adev)
>>    }
>>    
>>    /**
>> - * amdgpu_fence_driver_fini - tear down the fence driver
>> + * amdgpu_fence_driver_hw_fini - tear down the fence driver
>>     * for all possible rings.
>>     *
>>     * @adev: amdgpu device pointer
>> + * @in_reset: indicator to distingiush device removal case or s3 
>> + case
>>     *
>>     * Tear down the fence driver for all possible rings (all asics).
>>     */
>> -void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev)
>> +void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev, bool
>> +in_reset)
>>    {
>>    	int i, r;
>>    
>> @@ -531,8 +532,9 @@ void amdgpu_fence_driver_hw_fini(struct
>> amdgpu_device *adev)
>>    
>>    		if (!ring || !ring->fence_drv.initialized)
>>    			continue;
>> -		if (!ring->no_scheduler)
>> +		if (!ring->no_scheduler && !in_reset)
>>    			drm_sched_fini(&ring->sched);
>> +
>>    		/* You can't wait for HW to signal if it's gone */
>>    		if (!drm_dev_is_unplugged(&adev->ddev))
>>    			r = amdgpu_fence_wait_empty(ring); diff --git 
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> index 27adffa7658d..42cbecfc26a3 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> @@ -115,7 +115,7 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
>>    int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring,
>>    				   struct amdgpu_irq_src *irq_src,
>>    				   unsigned irq_type);
>> -void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev);
>> +void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev, bool 
>> +in_reset);
>>    void amdgpu_fence_driver_sw_fini(struct amdgpu_device *adev);
>>    void amdgpu_fence_driver_hw_init(struct amdgpu_device *adev);
>>    int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence 
>> **fence,
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flist
> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=04%7C01%7CGu
> chun.Chen%40amd.com%7Cb9ef02c0084240178aaa08d9528f69a3%7C3dd8961fe4884
> e608e11a82d994e183d%7C0%7C0%7C637631598168273235%7CUnknown%7CTWFpbGZsb
> 3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%
> 7C1000&amp;sdata=O7Gbls7ikrhXrGnsChLJEmPTTFgEg1XJwqydZ1BccNk%3D&amp;re
> served=0
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] drm/amdgpu: avoid over-handle of fence driver fini in s3 test
  2021-07-29 12:56       ` Chen, Guchun
@ 2021-07-29 12:59         ` Christian König
  0 siblings, 0 replies; 6+ messages in thread
From: Christian König @ 2021-07-29 12:59 UTC (permalink / raw)
  To: Chen, Guchun, Koenig, Christian, amd-gfx, Gao, Likun, Zhang,
	Hawking, Deucher, Alexander

Exactly that yes.

Thanks,
Christian.

Am 29.07.21 um 14:56 schrieb Chen, Guchun:
> [Public]
>
> Got you, so the target is to take this chance to make the code logic more clear instead of making it just workable on top of mixed functionality code.
>
> I will post a more reasonable patch later on to address this. Thank you.
>
> Regards,
> Guchun
>
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Thursday, July 29, 2021 8:50 PM
> To: Chen, Guchun <Guchun.Chen@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; amd-gfx@lists.freedesktop.org; Gao, Likun <Likun.Gao@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>
> Subject: Re: [PATCH] drm/amdgpu: avoid over-handle of fence driver fini in s3 test
>
> Hi Guchun,
>
> see below.
>
> Am 29.07.21 um 14:39 schrieb Chen, Guchun:
>> [Public]
>>
>> Hi Christian,
>>
>> Thanks for your feedback.
>>
>> Originally, drm_sched_fini is part of amdgpu_fence_driver_hw_fini, I did not move it.
> Yeah, not saying that this is your fault, you should just clean that up more thoughtfully.
>
>> Former patch " cd87a6dcf6af drm/amdgpu: adjust fence driver enable sequence " has dropped amdgpu_fence_driver_suspend, and called amdgpu_fence_driver_hw_fini instead in function amdgpu_device_suspend. I checked the code difference between  amdgpu_fence_driver_hw_fini and amdgpu_device_suspend, they are almost the same except drm_sched_fini part, so I think we can leave it as it is, while skipping the execution of drm_sched_fini in suspend/resume case.
> And exactly that's a bad idea and the reason why I already said during the review of patch "cd87a6dcf6af drm/amdgpu: adjust fence driver enable sequence" that the callers of those functions need to be adjusted instead.
>
> 1. If not already done please rename the functions as Hawking suggested as well, they should be amdgpu_fence_driver_hw_(init|fini) and amdgpu_fence_driver_sw_(init|fini).
>
> 2. Remove drm_sched_fini() from amdgpu_fence_driver_hw_fini() and add that to sw_fini instead.
>
> 3. Adjust the callers so that we have the same functionality as before.
> E.g. by calling both hw_fini and sw_fini at the same time.
>
> Regards,
> Christian.
>
>> Regards,
>> Guchun
>>
>> -----Original Message-----
>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> Sent: Thursday, July 29, 2021 7:11 PM
>> To: Chen, Guchun <Guchun.Chen@amd.com>; amd-gfx@lists.freedesktop.org;
>> Gao, Likun <Likun.Gao@amd.com>; Zhang, Hawking
>> <Hawking.Zhang@amd.com>; Deucher, Alexander
>> <Alexander.Deucher@amd.com>
>> Subject: Re: [PATCH] drm/amdgpu: avoid over-handle of fence driver
>> fini in s3 test
>>
>> Am 29.07.21 um 12:49 schrieb Guchun Chen:
>>> In amdgpu_fence_driver_hw_fini, no need to call drm_sched_fini to
>>> stop scheduler in s3 test, otherwise, fence errors will occur after resume.
>>> So introduce a new parameter to distingiush the case in this API.
>> NAK, the problem is rather that drm_sched_fini() is part of the sw shutdown and should never be called during hw_fini.
>>
>> Christian.
>>
>>> Fixes: cd87a6dcf6af drm/amdgpu: adjust fence driver enable sequence
>>> Signed-off-by: Guchun Chen <guchun.chen@amd.com>
>>> ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++--
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 8 +++++---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   | 2 +-
>>>     3 files changed, 8 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index b1d2dc39e8be..aaff8ebbb7dc 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -3844,7 +3844,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
>>>     		else
>>>     			drm_atomic_helper_shutdown(adev_to_drm(adev));
>>>     	}
>>> -	amdgpu_fence_driver_hw_fini(adev);
>>> +	amdgpu_fence_driver_hw_fini(adev, false);
>>>     
>>>     	if (adev->pm_sysfs_en)
>>>     		amdgpu_pm_sysfs_fini(adev);
>>> @@ -3941,7 +3941,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
>>>     	/* evict vram memory */
>>>     	amdgpu_bo_evict_vram(adev);
>>>     
>>> -	amdgpu_fence_driver_hw_fini(adev);
>>> +	amdgpu_fence_driver_hw_fini(adev, adev->in_suspend);
>>>     
>>>     	amdgpu_device_ip_suspend_phase2(adev);
>>>     	/* evict remaining vram memory
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>> index 49c5c7331c53..7e6a73c2919d 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>> @@ -515,14 +515,15 @@ int amdgpu_fence_driver_init(struct amdgpu_device *adev)
>>>     }
>>>     
>>>     /**
>>> - * amdgpu_fence_driver_fini - tear down the fence driver
>>> + * amdgpu_fence_driver_hw_fini - tear down the fence driver
>>>      * for all possible rings.
>>>      *
>>>      * @adev: amdgpu device pointer
>>> + * @in_reset: indicator to distingiush device removal case or s3
>>> + case
>>>      *
>>>      * Tear down the fence driver for all possible rings (all asics).
>>>      */
>>> -void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev)
>>> +void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev, bool
>>> +in_reset)
>>>     {
>>>     	int i, r;
>>>     
>>> @@ -531,8 +532,9 @@ void amdgpu_fence_driver_hw_fini(struct
>>> amdgpu_device *adev)
>>>     
>>>     		if (!ring || !ring->fence_drv.initialized)
>>>     			continue;
>>> -		if (!ring->no_scheduler)
>>> +		if (!ring->no_scheduler && !in_reset)
>>>     			drm_sched_fini(&ring->sched);
>>> +
>>>     		/* You can't wait for HW to signal if it's gone */
>>>     		if (!drm_dev_is_unplugged(&adev->ddev))
>>>     			r = amdgpu_fence_wait_empty(ring); diff --git
>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> index 27adffa7658d..42cbecfc26a3 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> @@ -115,7 +115,7 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
>>>     int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring,
>>>     				   struct amdgpu_irq_src *irq_src,
>>>     				   unsigned irq_type);
>>> -void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev);
>>> +void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev, bool
>>> +in_reset);
>>>     void amdgpu_fence_driver_sw_fini(struct amdgpu_device *adev);
>>>     void amdgpu_fence_driver_hw_init(struct amdgpu_device *adev);
>>>     int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence
>>> **fence,
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flist
>> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=04%7C01%7CGu
>> chun.Chen%40amd.com%7Cb9ef02c0084240178aaa08d9528f69a3%7C3dd8961fe4884
>> e608e11a82d994e183d%7C0%7C0%7C637631598168273235%7CUnknown%7CTWFpbGZsb
>> 3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%
>> 7C1000&amp;sdata=O7Gbls7ikrhXrGnsChLJEmPTTFgEg1XJwqydZ1BccNk%3D&amp;re
>> served=0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2021-07-29 12:59 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-29 10:49 [PATCH] drm/amdgpu: avoid over-handle of fence driver fini in s3 test Guchun Chen
2021-07-29 11:11 ` Christian König
2021-07-29 12:39   ` Chen, Guchun
2021-07-29 12:50     ` Christian König
2021-07-29 12:56       ` Chen, Guchun
2021-07-29 12:59         ` Christian König

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.