All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdgpu: Increase tlb flush timeout for sriov
@ 2021-05-19  9:32 Chengzhe Liu
  2021-05-19 10:08 ` Christian König
  0 siblings, 1 reply; 6+ messages in thread
From: Chengzhe Liu @ 2021-05-19  9:32 UTC (permalink / raw)
  To: amd-gfx
  Cc: Jack Xiao, Feifei Xu, Kevin Wang, Chengzhe Liu, Tuikov Luben,
	Deucher Alexander, Christian König, Hawking Zhang

When there is 12 VF, we need to increase the timeout

Signed-off-by: Chengzhe Liu <ChengZhe.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 6 +++++-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 6 +++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index f02dc904e4cf..a5f005c5d0ec 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -404,6 +404,7 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
 	uint32_t seq;
 	uint16_t queried_pasid;
 	bool ret;
+	uint32_t sriov_usec_timeout = 6000000;  /* wait for 12 * 500ms for SRIOV */
 	struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 
@@ -422,7 +423,10 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
 
 		amdgpu_ring_commit(ring);
 		spin_unlock(&adev->gfx.kiq.ring_lock);
-		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
+		if (amdgpu_sriov_vf(adev))
+			r = amdgpu_fence_wait_polling(ring, seq, sriov_usec_timeout);
+		else
+			r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
 		if (r < 1) {
 			dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
 			return -ETIME;
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index ceb3968d8326..e4a18d8f75c2 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -857,6 +857,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
 	uint32_t seq;
 	uint16_t queried_pasid;
 	bool ret;
+	uint32_t sriov_usec_timeout = 6000000;  /* wait for 12 * 500ms for SRIOV */
 	struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 
@@ -896,7 +897,10 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
 
 		amdgpu_ring_commit(ring);
 		spin_unlock(&adev->gfx.kiq.ring_lock);
-		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
+		if (amdgpu_sriov_vf(adev))
+			r = amdgpu_fence_wait_polling(ring, seq, sriov_usec_timeout);
+		else
+			r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
 		if (r < 1) {
 			dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
 			up_read(&adev->reset_sem);
-- 
2.25.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH] drm/amdgpu: Increase tlb flush timeout for sriov
  2021-05-19  9:32 [PATCH] drm/amdgpu: Increase tlb flush timeout for sriov Chengzhe Liu
@ 2021-05-19 10:08 ` Christian König
  2021-05-19 11:08   ` Liu, Cheng Zhe
  0 siblings, 1 reply; 6+ messages in thread
From: Christian König @ 2021-05-19 10:08 UTC (permalink / raw)
  To: Chengzhe Liu, amd-gfx
  Cc: Jack Xiao, Feifei Xu, Kevin Wang, Tuikov Luben,
	Deucher Alexander, Christian König, Hawking Zhang

Am 19.05.21 um 11:32 schrieb Chengzhe Liu:
> When there is 12 VF, we need to increase the timeout

NAK, 6 seconds is way to long to wait polling on a fence.

Why should an invalidation take that long? The engine are per VF just to 
avoid exactly that problem.

Christian.

>
> Signed-off-by: Chengzhe Liu <ChengZhe.Liu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 6 +++++-
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 6 +++++-
>   2 files changed, 10 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index f02dc904e4cf..a5f005c5d0ec 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -404,6 +404,7 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   	uint32_t seq;
>   	uint16_t queried_pasid;
>   	bool ret;
> +	uint32_t sriov_usec_timeout = 6000000;  /* wait for 12 * 500ms for SRIOV */
>   	struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   
> @@ -422,7 +423,10 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   
>   		amdgpu_ring_commit(ring);
>   		spin_unlock(&adev->gfx.kiq.ring_lock);
> -		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
> +		if (amdgpu_sriov_vf(adev))
> +			r = amdgpu_fence_wait_polling(ring, seq, sriov_usec_timeout);
> +		else
> +			r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>   		if (r < 1) {
>   			dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
>   			return -ETIME;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index ceb3968d8326..e4a18d8f75c2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -857,6 +857,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   	uint32_t seq;
>   	uint16_t queried_pasid;
>   	bool ret;
> +	uint32_t sriov_usec_timeout = 6000000;  /* wait for 12 * 500ms for SRIOV */
>   	struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   
> @@ -896,7 +897,10 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   
>   		amdgpu_ring_commit(ring);
>   		spin_unlock(&adev->gfx.kiq.ring_lock);
> -		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
> +		if (amdgpu_sriov_vf(adev))
> +			r = amdgpu_fence_wait_polling(ring, seq, sriov_usec_timeout);
> +		else
> +			r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>   		if (r < 1) {
>   			dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
>   			up_read(&adev->reset_sem);

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

* RE: [PATCH] drm/amdgpu: Increase tlb flush timeout for sriov
  2021-05-19 10:08 ` Christian König
@ 2021-05-19 11:08   ` Liu, Cheng Zhe
  2021-05-19 11:49     ` Christian König
  0 siblings, 1 reply; 6+ messages in thread
From: Liu, Cheng Zhe @ 2021-05-19 11:08 UTC (permalink / raw)
  To: Christian König, amd-gfx
  Cc: Xiao, Jack, Xu, Feifei, Chen,  Horace, Wang, Kevin(Yang),
	Tuikov, Luben, Deucher, Alexander, Koenig, Christian, Zhang,
	Hawking

[AMD Official Use Only]

We support 12 VF at most. In worst case, the first 11 all IDLE fail and do FLR, it will need 11 * 500ms to switch to the 12nd VF,
so I set 12 * 500ms  for the timeout.

-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@gmail.com> 
Sent: Wednesday, May 19, 2021 6:08 PM
To: Liu, Cheng Zhe <ChengZhe.Liu@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Xiao, Jack <Jack.Xiao@amd.com>; Xu, Feifei <Feifei.Xu@amd.com>; Wang, Kevin(Yang) <Kevin1.Wang@amd.com>; Tuikov, Luben <Luben.Tuikov@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>
Subject: Re: [PATCH] drm/amdgpu: Increase tlb flush timeout for sriov

Am 19.05.21 um 11:32 schrieb Chengzhe Liu:
> When there is 12 VF, we need to increase the timeout

NAK, 6 seconds is way to long to wait polling on a fence.

Why should an invalidation take that long? The engine are per VF just to avoid exactly that problem.

Christian.

>
> Signed-off-by: Chengzhe Liu <ChengZhe.Liu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 6 +++++-
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 6 +++++-
>   2 files changed, 10 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index f02dc904e4cf..a5f005c5d0ec 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -404,6 +404,7 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   	uint32_t seq;
>   	uint16_t queried_pasid;
>   	bool ret;
> +	uint32_t sriov_usec_timeout = 6000000;  /* wait for 12 * 500ms for 
> +SRIOV */
>   	struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   
> @@ -422,7 +423,10 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct 
> amdgpu_device *adev,
>   
>   		amdgpu_ring_commit(ring);
>   		spin_unlock(&adev->gfx.kiq.ring_lock);
> -		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
> +		if (amdgpu_sriov_vf(adev))
> +			r = amdgpu_fence_wait_polling(ring, seq, sriov_usec_timeout);
> +		else
> +			r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>   		if (r < 1) {
>   			dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
>   			return -ETIME;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index ceb3968d8326..e4a18d8f75c2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -857,6 +857,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   	uint32_t seq;
>   	uint16_t queried_pasid;
>   	bool ret;
> +	uint32_t sriov_usec_timeout = 6000000;  /* wait for 12 * 500ms for 
> +SRIOV */
>   	struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   
> @@ -896,7 +897,10 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct 
> amdgpu_device *adev,
>   
>   		amdgpu_ring_commit(ring);
>   		spin_unlock(&adev->gfx.kiq.ring_lock);
> -		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
> +		if (amdgpu_sriov_vf(adev))
> +			r = amdgpu_fence_wait_polling(ring, seq, sriov_usec_timeout);
> +		else
> +			r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>   		if (r < 1) {
>   			dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
>   			up_read(&adev->reset_sem);
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] drm/amdgpu: Increase tlb flush timeout for sriov
  2021-05-19 11:08   ` Liu, Cheng Zhe
@ 2021-05-19 11:49     ` Christian König
  2021-05-19 14:39       ` 回复: " Chen, Horace
  0 siblings, 1 reply; 6+ messages in thread
From: Christian König @ 2021-05-19 11:49 UTC (permalink / raw)
  To: Liu, Cheng Zhe, Christian König, amd-gfx
  Cc: Xiao, Jack, Xu, Feifei, Chen, Horace, Wang, Kevin(Yang),
	Tuikov, Luben, Deucher, Alexander, Zhang, Hawking

Yeah, but you can't do that it will probably trigger the watchdog timer.

The usec_timeout is named this way because it is a usec timeout. 
Anything large than 1ms is a no-go here.

When the other instances do a FLR we don't really need to wait for the 
TLB flush anyway since any FLR will kill that.

Christian.

Am 19.05.21 um 13:08 schrieb Liu, Cheng Zhe:
> [AMD Official Use Only]
>
> We support 12 VF at most. In worst case, the first 11 all IDLE fail and do FLR, it will need 11 * 500ms to switch to the 12nd VF,
> so I set 12 * 500ms  for the timeout.
>
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Wednesday, May 19, 2021 6:08 PM
> To: Liu, Cheng Zhe <ChengZhe.Liu@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Xiao, Jack <Jack.Xiao@amd.com>; Xu, Feifei <Feifei.Xu@amd.com>; Wang, Kevin(Yang) <Kevin1.Wang@amd.com>; Tuikov, Luben <Luben.Tuikov@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>
> Subject: Re: [PATCH] drm/amdgpu: Increase tlb flush timeout for sriov
>
> Am 19.05.21 um 11:32 schrieb Chengzhe Liu:
>> When there is 12 VF, we need to increase the timeout
> NAK, 6 seconds is way to long to wait polling on a fence.
>
> Why should an invalidation take that long? The engine are per VF just to avoid exactly that problem.
>
> Christian.
>
>> Signed-off-by: Chengzhe Liu <ChengZhe.Liu@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 6 +++++-
>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 6 +++++-
>>    2 files changed, 10 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> index f02dc904e4cf..a5f005c5d0ec 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> @@ -404,6 +404,7 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>    	uint32_t seq;
>>    	uint16_t queried_pasid;
>>    	bool ret;
>> +	uint32_t sriov_usec_timeout = 6000000;  /* wait for 12 * 500ms for
>> +SRIOV */
>>    	struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    
>> @@ -422,7 +423,10 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct
>> amdgpu_device *adev,
>>    
>>    		amdgpu_ring_commit(ring);
>>    		spin_unlock(&adev->gfx.kiq.ring_lock);
>> -		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>> +		if (amdgpu_sriov_vf(adev))
>> +			r = amdgpu_fence_wait_polling(ring, seq, sriov_usec_timeout);
>> +		else
>> +			r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>>    		if (r < 1) {
>>    			dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
>>    			return -ETIME;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index ceb3968d8326..e4a18d8f75c2 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -857,6 +857,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>    	uint32_t seq;
>>    	uint16_t queried_pasid;
>>    	bool ret;
>> +	uint32_t sriov_usec_timeout = 6000000;  /* wait for 12 * 500ms for
>> +SRIOV */
>>    	struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    
>> @@ -896,7 +897,10 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct
>> amdgpu_device *adev,
>>    
>>    		amdgpu_ring_commit(ring);
>>    		spin_unlock(&adev->gfx.kiq.ring_lock);
>> -		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>> +		if (amdgpu_sriov_vf(adev))
>> +			r = amdgpu_fence_wait_polling(ring, seq, sriov_usec_timeout);
>> +		else
>> +			r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>>    		if (r < 1) {
>>    			dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
>>    			up_read(&adev->reset_sem);

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

* 回复: [PATCH] drm/amdgpu: Increase tlb flush timeout for sriov
  2021-05-19 11:49     ` Christian König
@ 2021-05-19 14:39       ` Chen, Horace
  2021-05-19 14:47         ` Christian König
  0 siblings, 1 reply; 6+ messages in thread
From: Chen, Horace @ 2021-05-19 14:39 UTC (permalink / raw)
  To: Koenig, Christian, Liu, Cheng Zhe, Christian König, amd-gfx
  Cc: Xiao, Jack, Xu, Feifei, Wang,  Kevin(Yang),
	Tuikov, Luben, Deucher, Alexander, Zhang, Hawking


[-- Attachment #1.1: Type: text/plain, Size: 5361 bytes --]

[AMD Official Use Only]

Hi Christian,

I think the problem is that a non-FLRed VF will not know that another VF got an FLR, unless host triggered a whole GPU reset.
So in the worst situation, for example the VF0 to VF10 are all hang and will be FLRed one by one, the VF11 will not know that there are FLRs happened, in VF11's prespective, it just see the fence didn't come back for about 5.5(0.5*11) seconds.

Thanks & Regards,
Horace.

________________________________
发件人: Koenig, Christian <Christian.Koenig@amd.com>
发送时间: 2021年5月19日 19:49
收件人: Liu, Cheng Zhe <ChengZhe.Liu@amd.com>; Christian König <ckoenig.leichtzumerken@gmail.com>; amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
抄送: Xiao, Jack <Jack.Xiao@amd.com>; Xu, Feifei <Feifei.Xu@amd.com>; Wang, Kevin(Yang) <Kevin1.Wang@amd.com>; Tuikov, Luben <Luben.Tuikov@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Chen, Horace <Horace.Chen@amd.com>
主题: Re: [PATCH] drm/amdgpu: Increase tlb flush timeout for sriov

Yeah, but you can't do that it will probably trigger the watchdog timer.

The usec_timeout is named this way because it is a usec timeout.
Anything large than 1ms is a no-go here.

When the other instances do a FLR we don't really need to wait for the
TLB flush anyway since any FLR will kill that.

Christian.

Am 19.05.21 um 13:08 schrieb Liu, Cheng Zhe:
> [AMD Official Use Only]
>
> We support 12 VF at most. In worst case, the first 11 all IDLE fail and do FLR, it will need 11 * 500ms to switch to the 12nd VF,
> so I set 12 * 500ms  for the timeout.
>
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Wednesday, May 19, 2021 6:08 PM
> To: Liu, Cheng Zhe <ChengZhe.Liu@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Xiao, Jack <Jack.Xiao@amd.com>; Xu, Feifei <Feifei.Xu@amd.com>; Wang, Kevin(Yang) <Kevin1.Wang@amd.com>; Tuikov, Luben <Luben.Tuikov@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>
> Subject: Re: [PATCH] drm/amdgpu: Increase tlb flush timeout for sriov
>
> Am 19.05.21 um 11:32 schrieb Chengzhe Liu:
>> When there is 12 VF, we need to increase the timeout
> NAK, 6 seconds is way to long to wait polling on a fence.
>
> Why should an invalidation take that long? The engine are per VF just to avoid exactly that problem.
>
> Christian.
>
>> Signed-off-by: Chengzhe Liu <ChengZhe.Liu@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 6 +++++-
>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 6 +++++-
>>    2 files changed, 10 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> index f02dc904e4cf..a5f005c5d0ec 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> @@ -404,6 +404,7 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>       uint32_t seq;
>>       uint16_t queried_pasid;
>>       bool ret;
>> +    uint32_t sriov_usec_timeout = 6000000;  /* wait for 12 * 500ms for
>> +SRIOV */
>>       struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
>>       struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>
>> @@ -422,7 +423,10 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct
>> amdgpu_device *adev,
>>
>>               amdgpu_ring_commit(ring);
>>               spin_unlock(&adev->gfx.kiq.ring_lock);
>> -            r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>> +            if (amdgpu_sriov_vf(adev))
>> +                    r = amdgpu_fence_wait_polling(ring, seq, sriov_usec_timeout);
>> +            else
>> +                    r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>>               if (r < 1) {
>>                       dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
>>                       return -ETIME;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index ceb3968d8326..e4a18d8f75c2 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -857,6 +857,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>       uint32_t seq;
>>       uint16_t queried_pasid;
>>       bool ret;
>> +    uint32_t sriov_usec_timeout = 6000000;  /* wait for 12 * 500ms for
>> +SRIOV */
>>       struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
>>       struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>
>> @@ -896,7 +897,10 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct
>> amdgpu_device *adev,
>>
>>               amdgpu_ring_commit(ring);
>>               spin_unlock(&adev->gfx.kiq.ring_lock);
>> -            r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>> +            if (amdgpu_sriov_vf(adev))
>> +                    r = amdgpu_fence_wait_polling(ring, seq, sriov_usec_timeout);
>> +            else
>> +                    r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>>               if (r < 1) {
>>                       dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
>>                       up_read(&adev->reset_sem);


[-- Attachment #1.2: Type: text/html, Size: 9948 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: 回复: [PATCH] drm/amdgpu: Increase tlb flush timeout for sriov
  2021-05-19 14:39       ` 回复: " Chen, Horace
@ 2021-05-19 14:47         ` Christian König
  0 siblings, 0 replies; 6+ messages in thread
From: Christian König @ 2021-05-19 14:47 UTC (permalink / raw)
  To: Chen, Horace, Liu, Cheng Zhe, Christian König, amd-gfx
  Cc: Xiao, Jack, Xu, Feifei, Wang, Kevin(Yang),
	Tuikov, Luben, Deucher, Alexander, Zhang, Hawking


[-- Attachment #1.1: Type: text/plain, Size: 6494 bytes --]

Hi Horace,

that is correct, but also completely irrelevant.

What we do here is to wait for the TLB flush to avoid starting 
operations with invalid cache data.

But a parallel FLR clears the cache anyway and also prevents any new 
operation from starting, so it is perfectly valid to timeout and just 
continue with an error message.


On the other hand waiting for 6 seconds in a busy loop will most likely 
trigger the watchdog timer and potentially kill our process.

That is a rather clear no-go, we simply can't increase timeouts infinitely.

Regards,
Christian.

Am 19.05.21 um 16:39 schrieb Chen, Horace:
>
> [AMD Official Use Only]
>
>
> Hi Christian,
>
> I think the problem is that a non-FLRed VF will not know that another 
> VF got an FLR, unless host triggered a whole GPU reset.
> So in the worst situation, for example the VF0 to VF10 are all hang 
> and will be FLRed one by one, the VF11 will not know that there are 
> FLRs happened, in VF11's prespective, it just see the fence didn't 
> come back for about 5.5(0.5*11) seconds.
>
> Thanks & Regards,
> Horace.
>
> ------------------------------------------------------------------------
> *发件人:* Koenig, Christian <Christian.Koenig@amd.com>
> *发送时间:* 2021年5月19日 19:49
> *收件人:* Liu, Cheng Zhe <ChengZhe.Liu@amd.com>; Christian König 
> <ckoenig.leichtzumerken@gmail.com>; amd-gfx@lists.freedesktop.org 
> <amd-gfx@lists.freedesktop.org>
> *抄送:* Xiao, Jack <Jack.Xiao@amd.com>; Xu, Feifei <Feifei.Xu@amd.com>; 
> Wang, Kevin(Yang) <Kevin1.Wang@amd.com>; Tuikov, Luben 
> <Luben.Tuikov@amd.com>; Deucher, Alexander 
> <Alexander.Deucher@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; 
> Chen, Horace <Horace.Chen@amd.com>
> *主题:* Re: [PATCH] drm/amdgpu: Increase tlb flush timeout for sriov
> Yeah, but you can't do that it will probably trigger the watchdog timer.
>
> The usec_timeout is named this way because it is a usec timeout.
> Anything large than 1ms is a no-go here.
>
> When the other instances do a FLR we don't really need to wait for the
> TLB flush anyway since any FLR will kill that.
>
> Christian.
>
> Am 19.05.21 um 13:08 schrieb Liu, Cheng Zhe:
> > [AMD Official Use Only]
> >
> > We support 12 VF at most. In worst case, the first 11 all IDLE fail 
> and do FLR, it will need 11 * 500ms to switch to the 12nd VF,
> > so I set 12 * 500ms  for the timeout.
> >
> > -----Original Message-----
> > From: Christian König <ckoenig.leichtzumerken@gmail.com>
> > Sent: Wednesday, May 19, 2021 6:08 PM
> > To: Liu, Cheng Zhe <ChengZhe.Liu@amd.com>; amd-gfx@lists.freedesktop.org
> > Cc: Xiao, Jack <Jack.Xiao@amd.com>; Xu, Feifei <Feifei.Xu@amd.com>; 
> Wang, Kevin(Yang) <Kevin1.Wang@amd.com>; Tuikov, Luben 
> <Luben.Tuikov@amd.com>; Deucher, Alexander 
> <Alexander.Deucher@amd.com>; Koenig, Christian 
> <Christian.Koenig@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>
> > Subject: Re: [PATCH] drm/amdgpu: Increase tlb flush timeout for sriov
> >
> > Am 19.05.21 um 11:32 schrieb Chengzhe Liu:
> >> When there is 12 VF, we need to increase the timeout
> > NAK, 6 seconds is way to long to wait polling on a fence.
> >
> > Why should an invalidation take that long? The engine are per VF 
> just to avoid exactly that problem.
> >
> > Christian.
> >
> >> Signed-off-by: Chengzhe Liu <ChengZhe.Liu@amd.com>
> >> ---
> >>    drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 6 +++++-
> >>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 6 +++++-
> >>    2 files changed, 10 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> >> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> >> index f02dc904e4cf..a5f005c5d0ec 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> >> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> >> @@ -404,6 +404,7 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct 
> amdgpu_device *adev,
> >>       uint32_t seq;
> >>       uint16_t queried_pasid;
> >>       bool ret;
> >> +    uint32_t sriov_usec_timeout = 6000000; /* wait for 12 * 500ms for
> >> +SRIOV */
> >>       struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
> >>       struct amdgpu_kiq *kiq = &adev->gfx.kiq;
> >>
> >> @@ -422,7 +423,10 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct
> >> amdgpu_device *adev,
> >>
> >>               amdgpu_ring_commit(ring);
> >> spin_unlock(&adev->gfx.kiq.ring_lock);
> >> -            r = amdgpu_fence_wait_polling(ring, seq, 
> adev->usec_timeout);
> >> +            if (amdgpu_sriov_vf(adev))
> >> +                    r = amdgpu_fence_wait_polling(ring, seq, 
> sriov_usec_timeout);
> >> +            else
> >> +                    r = amdgpu_fence_wait_polling(ring, seq, 
> adev->usec_timeout);
> >>               if (r < 1) {
> >>                       dev_err(adev->dev, "wait for kiq fence error: 
> %ld.\n", r);
> >>                       return -ETIME;
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> >> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> >> index ceb3968d8326..e4a18d8f75c2 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> >> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> >> @@ -857,6 +857,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct 
> amdgpu_device *adev,
> >>       uint32_t seq;
> >>       uint16_t queried_pasid;
> >>       bool ret;
> >> +    uint32_t sriov_usec_timeout = 6000000; /* wait for 12 * 500ms for
> >> +SRIOV */
> >>       struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
> >>       struct amdgpu_kiq *kiq = &adev->gfx.kiq;
> >>
> >> @@ -896,7 +897,10 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct
> >> amdgpu_device *adev,
> >>
> >>               amdgpu_ring_commit(ring);
> >> spin_unlock(&adev->gfx.kiq.ring_lock);
> >> -            r = amdgpu_fence_wait_polling(ring, seq, 
> adev->usec_timeout);
> >> +            if (amdgpu_sriov_vf(adev))
> >> +                    r = amdgpu_fence_wait_polling(ring, seq, 
> sriov_usec_timeout);
> >> +            else
> >> +                    r = amdgpu_fence_wait_polling(ring, seq, 
> adev->usec_timeout);
> >>               if (r < 1) {
> >>                       dev_err(adev->dev, "wait for kiq fence error: 
> %ld.\n", r);
> >> up_read(&adev->reset_sem);
>


[-- Attachment #1.2: Type: text/html, Size: 15732 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2021-05-19 14:47 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-05-19  9:32 [PATCH] drm/amdgpu: Increase tlb flush timeout for sriov Chengzhe Liu
2021-05-19 10:08 ` Christian König
2021-05-19 11:08   ` Liu, Cheng Zhe
2021-05-19 11:49     ` Christian König
2021-05-19 14:39       ` 回复: " Chen, Horace
2021-05-19 14:47         ` Christian König

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.