[PATCH] drm/amdgpu: give more chance for tlb flush if failed

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH] drm/amdgpu: give more chance for tlb flush if failed
@ 2018-03-20  6:29 Emily Deng
       [not found] ` <1521527344-25641-1-git-send-email-Emily.Deng-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 5+ messages in thread
From: Emily Deng @ 2018-03-20  6:29 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Emily Deng, Monk Liu

under SR-IOV sometimes CPU based tlb flush would timeout
within the given 100ms period, instead let it fail and
continue we can give it more chance to repeat the
tlb flush on the failed VMHUB

this could fix the massive "Timeout waiting for VM flush ACK"
error during vk_encoder test.

Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index a70cbc4..517712b 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -329,13 +329,18 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev,
 {
 	/* Use register 17 for GART */
 	const unsigned eng = 17;
-	unsigned i, j;
+	unsigned i, j, loop = 0;
+	unsigned flush_done = 0;
+
+retry:
 
 	spin_lock(&adev->gmc.invalidate_lock);
 
 	for (i = 0; i < AMDGPU_MAX_VMHUBS; ++i) {
 		struct amdgpu_vmhub *hub = &adev->vmhub[i];
 		u32 tmp = gmc_v9_0_get_invalidate_req(vmid);
+		if (flush_done & (1 << i)) /* this vmhub flushed */
+			continue;
 
 		WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
 
@@ -347,8 +352,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev,
 				break;
 			cpu_relax();
 		}
-		if (j < 100)
+		if (j < 100) {
+			flush_done |= (1 << i);
 			continue;
+		}
 
 		/* Wait for ACK with a delay.*/
 		for (j = 0; j < adev->usec_timeout; j++) {
@@ -358,15 +365,22 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev,
 				break;
 			udelay(1);
 		}
-		if (j < adev->usec_timeout)
+		if (j < adev->usec_timeout) {
+			flush_done |= (1 << i);
 			continue;
-
-		DRM_ERROR("Timeout waiting for VM flush ACK!\n");
+		}
 	}
 
 	spin_unlock(&adev->gmc.invalidate_lock);
+	if (flush_done != 3) {
+		if (loop++ < 3)
+			goto retry;
+		else
+			DRM_ERROR("Timeout waiting for VM flush ACK!\n");
+	}
 }
 
+
 static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 					    unsigned vmid, uint64_t pd_addr)
 {
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH] drm/amdgpu: give more chance for tlb flush if failed
       [not found] ` <1521527344-25641-1-git-send-email-Emily.Deng-5C7GfCeVMHo@public.gmane.org>
@ 2018-03-20 10:23   ` Christian König
       [not found]     ` <1a54a624-2816-c03f-bc75-da30be2ae48e-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  0 siblings, 1 reply; 5+ messages in thread
From: Christian König @ 2018-03-20 10:23 UTC (permalink / raw)
  To: Emily Deng, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

Am 20.03.2018 um 07:29 schrieb Emily Deng:
> under SR-IOV sometimes CPU based tlb flush would timeout
> within the given 100ms period, instead let it fail and
> continue we can give it more chance to repeat the
> tlb flush on the failed VMHUB
>
> this could fix the massive "Timeout waiting for VM flush ACK"
> error during vk_encoder test.

Well that one is a big NAK since it once more just hides the real 
problem that we sometimes drop register writes.

What we did during debugging to avoid the problem is the following:
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index a70cbc45c4c1..3536d50375fa 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -338,6 +338,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct 
> amdgpu_device *adev,
>                 u32 tmp = gmc_v9_0_get_invalidate_req(vmid);
>
>                 WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
> +               while (RREG32_NO_KIQ(hub->vm_inv_eng0_req + eng) != tmp) {
> +                       DRM_ERROR("Need one more try to write the 
> VMHUB flush request!");
> +                       WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
> +               }
>
>                 /* Busy wait for ACK.*/
>                 for (j = 0; j < 100; j++) {

But that can only be a temporary workaround as well.

The question is rather can you reliable reproduce this issue with the 
vk_encoder test?

Thanks,
Christian.

>
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 24 +++++++++++++++++++-----
>   1 file changed, 19 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index a70cbc4..517712b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -329,13 +329,18 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev,
>   {
>   	/* Use register 17 for GART */
>   	const unsigned eng = 17;
> -	unsigned i, j;
> +	unsigned i, j, loop = 0;
> +	unsigned flush_done = 0;
> +
> +retry:
>   
>   	spin_lock(&adev->gmc.invalidate_lock);
>   
>   	for (i = 0; i < AMDGPU_MAX_VMHUBS; ++i) {
>   		struct amdgpu_vmhub *hub = &adev->vmhub[i];
>   		u32 tmp = gmc_v9_0_get_invalidate_req(vmid);
> +		if (flush_done & (1 << i)) /* this vmhub flushed */
> +			continue;
>   
>   		WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
>   
> @@ -347,8 +352,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev,
>   				break;
>   			cpu_relax();
>   		}
> -		if (j < 100)
> +		if (j < 100) {
> +			flush_done |= (1 << i);
>   			continue;
> +		}
>   
>   		/* Wait for ACK with a delay.*/
>   		for (j = 0; j < adev->usec_timeout; j++) {
> @@ -358,15 +365,22 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev,
>   				break;
>   			udelay(1);
>   		}
> -		if (j < adev->usec_timeout)
> +		if (j < adev->usec_timeout) {
> +			flush_done |= (1 << i);
>   			continue;
> -
> -		DRM_ERROR("Timeout waiting for VM flush ACK!\n");
> +		}
>   	}
>   
>   	spin_unlock(&adev->gmc.invalidate_lock);
> +	if (flush_done != 3) {
> +		if (loop++ < 3)
> +			goto retry;
> +		else
> +			DRM_ERROR("Timeout waiting for VM flush ACK!\n");
> +	}
>   }
>   
> +
>   static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>   					    unsigned vmid, uint64_t pd_addr)
>   {

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 5+ messages in thread

* RE: [PATCH] drm/amdgpu: give more chance for tlb flush if failed
       [not found]     ` <1a54a624-2816-c03f-bc75-da30be2ae48e-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2018-03-22  3:50       ` Deng, Emily
       [not found]         ` <CY4PR12MB112582B8413D7323ADCDD97C8FA90-rpdhrqHFk07v2MZdTKcfDgdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 5+ messages in thread
From: Deng, Emily @ 2018-03-22  3:50 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Liu, Monk

Hi Christian,
     I agree with that the patch will hide the real problem, it is just a workaround, I will change the patch as you suggest.
as the sriov has lots of issues on the staging,  maybe we could first submit the two workarounds, later, I will spend some time
to find out the root cause.
     I think the issue reproduce is reliable.

Best Wishes,
Emily Deng


> -----Original Message-----
> From: Christian König [mailto:ckoenig.leichtzumerken@gmail.com]
> Sent: Tuesday, March 20, 2018 6:24 PM
> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Liu, Monk <Monk.Liu@amd.com>
> Subject: Re: [PATCH] drm/amdgpu: give more chance for tlb flush if failed
> 
> Am 20.03.2018 um 07:29 schrieb Emily Deng:
> > under SR-IOV sometimes CPU based tlb flush would timeout within the
> > given 100ms period, instead let it fail and continue we can give it
> > more chance to repeat the tlb flush on the failed VMHUB
> >
> > this could fix the massive "Timeout waiting for VM flush ACK"
> > error during vk_encoder test.
> 
> Well that one is a big NAK since it once more just hides the real problem that
> we sometimes drop register writes.
> 
> What we did during debugging to avoid the problem is the following:
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > index a70cbc45c4c1..3536d50375fa 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > @@ -338,6 +338,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct
> > amdgpu_device *adev,
> >                 u32 tmp = gmc_v9_0_get_invalidate_req(vmid);
> >
> >                 WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
> > +               while (RREG32_NO_KIQ(hub->vm_inv_eng0_req + eng) !=
> > +tmp) {
> > +                       DRM_ERROR("Need one more try to write the
> > VMHUB flush request!");
> > +                       WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng,
> > +tmp);
> > +               }
> >
> >                 /* Busy wait for ACK.*/
> >                 for (j = 0; j < 100; j++) {
> 
> But that can only be a temporary workaround as well.
> 
> The question is rather can you reliable reproduce this issue with the
> vk_encoder test?
> 
> Thanks,
> Christian.
> 
> >
> > Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 24 +++++++++++++++++++--
> ---
> >   1 file changed, 19 insertions(+), 5 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > index a70cbc4..517712b 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > @@ -329,13 +329,18 @@ static void gmc_v9_0_flush_gpu_tlb(struct
> amdgpu_device *adev,
> >   {
> >   	/* Use register 17 for GART */
> >   	const unsigned eng = 17;
> > -	unsigned i, j;
> > +	unsigned i, j, loop = 0;
> > +	unsigned flush_done = 0;
> > +
> > +retry:
> >
> >   	spin_lock(&adev->gmc.invalidate_lock);
> >
> >   	for (i = 0; i < AMDGPU_MAX_VMHUBS; ++i) {
> >   		struct amdgpu_vmhub *hub = &adev->vmhub[i];
> >   		u32 tmp = gmc_v9_0_get_invalidate_req(vmid);
> > +		if (flush_done & (1 << i)) /* this vmhub flushed */
> > +			continue;
> >
> >   		WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
> >
> > @@ -347,8 +352,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct
> amdgpu_device *adev,
> >   				break;
> >   			cpu_relax();
> >   		}
> > -		if (j < 100)
> > +		if (j < 100) {
> > +			flush_done |= (1 << i);
> >   			continue;
> > +		}
> >
> >   		/* Wait for ACK with a delay.*/
> >   		for (j = 0; j < adev->usec_timeout; j++) { @@ -358,15 +365,22
> @@
> > static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev,
> >   				break;
> >   			udelay(1);
> >   		}
> > -		if (j < adev->usec_timeout)
> > +		if (j < adev->usec_timeout) {
> > +			flush_done |= (1 << i);
> >   			continue;
> > -
> > -		DRM_ERROR("Timeout waiting for VM flush ACK!\n");
> > +		}
> >   	}
> >
> >   	spin_unlock(&adev->gmc.invalidate_lock);
> > +	if (flush_done != 3) {
> > +		if (loop++ < 3)
> > +			goto retry;
> > +		else
> > +			DRM_ERROR("Timeout waiting for VM flush ACK!\n");
> > +	}
> >   }
> >
> > +
> >   static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
> >   					    unsigned vmid, uint64_t pd_addr)
> >   {

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] drm/amdgpu: give more chance for tlb flush if failed
       [not found]         ` <CY4PR12MB112582B8413D7323ADCDD97C8FA90-rpdhrqHFk07v2MZdTKcfDgdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2018-03-22  8:53           ` Christian König
       [not found]             ` <50377594-10f9-8aec-1b2b-acd974527503-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 5+ messages in thread
From: Christian König @ 2018-03-22  8:53 UTC (permalink / raw)
  To: Deng, Emily, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Liu, Monk

> I think the issue reproduce is reliable.
That is the really important thing.

We had the same issues with bare metal and working around them one by 
one didn't helped at all in the long run (e.g. Vega10 was still horrible 
unstable).

The only real solution was finding the root cause and eliminating it. 
For this issue as well as the one Monk investigated it turned out that 
reading some registers can cause problems when there are concurrent 
writes going on. E.g. the register read times out and leads to the write 
being dropped.

If those issues prevail even after eliminating all potential causes on 
your side we should probably setup a call with some hw people to discuss 
how to proceed further.

Regards,
Christian.

Am 22.03.2018 um 04:50 schrieb Deng, Emily:
> Hi Christian,
>       I agree with that the patch will hide the real problem, it is just a workaround, I will change the patch as you suggest.
> as the sriov has lots of issues on the staging,  maybe we could first submit the two workarounds, later, I will spend some time
> to find out the root cause.
>       I think the issue reproduce is reliable.
>
> Best Wishes,
> Emily Deng
>
>
>> -----Original Message-----
>> From: Christian König [mailto:ckoenig.leichtzumerken@gmail.com]
>> Sent: Tuesday, March 20, 2018 6:24 PM
>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>> Cc: Liu, Monk <Monk.Liu@amd.com>
>> Subject: Re: [PATCH] drm/amdgpu: give more chance for tlb flush if failed
>>
>> Am 20.03.2018 um 07:29 schrieb Emily Deng:
>>> under SR-IOV sometimes CPU based tlb flush would timeout within the
>>> given 100ms period, instead let it fail and continue we can give it
>>> more chance to repeat the tlb flush on the failed VMHUB
>>>
>>> this could fix the massive "Timeout waiting for VM flush ACK"
>>> error during vk_encoder test.
>> Well that one is a big NAK since it once more just hides the real problem that
>> we sometimes drop register writes.
>>
>> What we did during debugging to avoid the problem is the following:
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> index a70cbc45c4c1..3536d50375fa 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> @@ -338,6 +338,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct
>>> amdgpu_device *adev,
>>>                  u32 tmp = gmc_v9_0_get_invalidate_req(vmid);
>>>
>>>                  WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
>>> +               while (RREG32_NO_KIQ(hub->vm_inv_eng0_req + eng) !=
>>> +tmp) {
>>> +                       DRM_ERROR("Need one more try to write the
>>> VMHUB flush request!");
>>> +                       WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng,
>>> +tmp);
>>> +               }
>>>
>>>                  /* Busy wait for ACK.*/
>>>                  for (j = 0; j < 100; j++) {
>> But that can only be a temporary workaround as well.
>>
>> The question is rather can you reliable reproduce this issue with the
>> vk_encoder test?
>>
>> Thanks,
>> Christian.
>>
>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 24 +++++++++++++++++++--
>> ---
>>>    1 file changed, 19 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> index a70cbc4..517712b 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> @@ -329,13 +329,18 @@ static void gmc_v9_0_flush_gpu_tlb(struct
>> amdgpu_device *adev,
>>>    {
>>>    	/* Use register 17 for GART */
>>>    	const unsigned eng = 17;
>>> -	unsigned i, j;
>>> +	unsigned i, j, loop = 0;
>>> +	unsigned flush_done = 0;
>>> +
>>> +retry:
>>>
>>>    	spin_lock(&adev->gmc.invalidate_lock);
>>>
>>>    	for (i = 0; i < AMDGPU_MAX_VMHUBS; ++i) {
>>>    		struct amdgpu_vmhub *hub = &adev->vmhub[i];
>>>    		u32 tmp = gmc_v9_0_get_invalidate_req(vmid);
>>> +		if (flush_done & (1 << i)) /* this vmhub flushed */
>>> +			continue;
>>>
>>>    		WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
>>>
>>> @@ -347,8 +352,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct
>> amdgpu_device *adev,
>>>    				break;
>>>    			cpu_relax();
>>>    		}
>>> -		if (j < 100)
>>> +		if (j < 100) {
>>> +			flush_done |= (1 << i);
>>>    			continue;
>>> +		}
>>>
>>>    		/* Wait for ACK with a delay.*/
>>>    		for (j = 0; j < adev->usec_timeout; j++) { @@ -358,15 +365,22
>> @@
>>> static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev,
>>>    				break;
>>>    			udelay(1);
>>>    		}
>>> -		if (j < adev->usec_timeout)
>>> +		if (j < adev->usec_timeout) {
>>> +			flush_done |= (1 << i);
>>>    			continue;
>>> -
>>> -		DRM_ERROR("Timeout waiting for VM flush ACK!\n");
>>> +		}
>>>    	}
>>>
>>>    	spin_unlock(&adev->gmc.invalidate_lock);
>>> +	if (flush_done != 3) {
>>> +		if (loop++ < 3)
>>> +			goto retry;
>>> +		else
>>> +			DRM_ERROR("Timeout waiting for VM flush ACK!\n");
>>> +	}
>>>    }
>>>
>>> +
>>>    static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>    					    unsigned vmid, uint64_t pd_addr)
>>>    {

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 5+ messages in thread

* RE: [PATCH] drm/amdgpu: give more chance for tlb flush if failed
       [not found]             ` <50377594-10f9-8aec-1b2b-acd974527503-5C7GfCeVMHo@public.gmane.org>
@ 2018-03-23  3:13               ` Deng, Emily
  0 siblings, 0 replies; 5+ messages in thread
From: Deng, Emily @ 2018-03-23  3:13 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Liu, Monk

Hi Christian,
    Right, maybe we could learn something from hardware, and find out the root cause.

Best Wishes,
Emily Deng




> -----Original Message-----
> From: Koenig, Christian
> Sent: Thursday, March 22, 2018 4:54 PM
> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Liu, Monk <Monk.Liu@amd.com>
> Subject: Re: [PATCH] drm/amdgpu: give more chance for tlb flush if failed
> 
> > I think the issue reproduce is reliable.
> That is the really important thing.
> 
> We had the same issues with bare metal and working around them one by
> one didn't helped at all in the long run (e.g. Vega10 was still horrible
> unstable).
> 
> The only real solution was finding the root cause and eliminating it.
> For this issue as well as the one Monk investigated it turned out that reading
> some registers can cause problems when there are concurrent writes going
> on. E.g. the register read times out and leads to the write being dropped.
> 
> If those issues prevail even after eliminating all potential causes on your side
> we should probably setup a call with some hw people to discuss how to
> proceed further.
> 
> Regards,
> Christian.
> 
> Am 22.03.2018 um 04:50 schrieb Deng, Emily:
> > Hi Christian,
> >       I agree with that the patch will hide the real problem, it is just a
> workaround, I will change the patch as you suggest.
> > as the sriov has lots of issues on the staging,  maybe we could first
> > submit the two workarounds, later, I will spend some time to find out the
> root cause.
> >       I think the issue reproduce is reliable.
> >
> > Best Wishes,
> > Emily Deng
> >
> >
> >> -----Original Message-----
> >> From: Christian König [mailto:ckoenig.leichtzumerken@gmail.com]
> >> Sent: Tuesday, March 20, 2018 6:24 PM
> >> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
> >> Cc: Liu, Monk <Monk.Liu@amd.com>
> >> Subject: Re: [PATCH] drm/amdgpu: give more chance for tlb flush if
> >> failed
> >>
> >> Am 20.03.2018 um 07:29 schrieb Emily Deng:
> >>> under SR-IOV sometimes CPU based tlb flush would timeout within the
> >>> given 100ms period, instead let it fail and continue we can give it
> >>> more chance to repeat the tlb flush on the failed VMHUB
> >>>
> >>> this could fix the massive "Timeout waiting for VM flush ACK"
> >>> error during vk_encoder test.
> >> Well that one is a big NAK since it once more just hides the real
> >> problem that we sometimes drop register writes.
> >>
> >> What we did during debugging to avoid the problem is the following:
> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> >>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> >>> index a70cbc45c4c1..3536d50375fa 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> >>> @@ -338,6 +338,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct
> >>> amdgpu_device *adev,
> >>>                  u32 tmp = gmc_v9_0_get_invalidate_req(vmid);
> >>>
> >>>                  WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
> >>> +               while (RREG32_NO_KIQ(hub->vm_inv_eng0_req + eng) !=
> >>> +tmp) {
> >>> +                       DRM_ERROR("Need one more try to write the
> >>> VMHUB flush request!");
> >>> +                       WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng,
> >>> +tmp);
> >>> +               }
> >>>
> >>>                  /* Busy wait for ACK.*/
> >>>                  for (j = 0; j < 100; j++) {
> >> But that can only be a temporary workaround as well.
> >>
> >> The question is rather can you reliable reproduce this issue with the
> >> vk_encoder test?
> >>
> >> Thanks,
> >> Christian.
> >>
> >>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> >>> ---
> >>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 24
> +++++++++++++++++++--
> >> ---
> >>>    1 file changed, 19 insertions(+), 5 deletions(-)
> >>>
> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> >>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> >>> index a70cbc4..517712b 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> >>> @@ -329,13 +329,18 @@ static void gmc_v9_0_flush_gpu_tlb(struct
> >> amdgpu_device *adev,
> >>>    {
> >>>    	/* Use register 17 for GART */
> >>>    	const unsigned eng = 17;
> >>> -	unsigned i, j;
> >>> +	unsigned i, j, loop = 0;
> >>> +	unsigned flush_done = 0;
> >>> +
> >>> +retry:
> >>>
> >>>    	spin_lock(&adev->gmc.invalidate_lock);
> >>>
> >>>    	for (i = 0; i < AMDGPU_MAX_VMHUBS; ++i) {
> >>>    		struct amdgpu_vmhub *hub = &adev->vmhub[i];
> >>>    		u32 tmp = gmc_v9_0_get_invalidate_req(vmid);
> >>> +		if (flush_done & (1 << i)) /* this vmhub flushed */
> >>> +			continue;
> >>>
> >>>    		WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
> >>>
> >>> @@ -347,8 +352,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct
> >> amdgpu_device *adev,
> >>>    				break;
> >>>    			cpu_relax();
> >>>    		}
> >>> -		if (j < 100)
> >>> +		if (j < 100) {
> >>> +			flush_done |= (1 << i);
> >>>    			continue;
> >>> +		}
> >>>
> >>>    		/* Wait for ACK with a delay.*/
> >>>    		for (j = 0; j < adev->usec_timeout; j++) { @@ -358,15 +365,22
> >> @@
> >>> static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev,
> >>>    				break;
> >>>    			udelay(1);
> >>>    		}
> >>> -		if (j < adev->usec_timeout)
> >>> +		if (j < adev->usec_timeout) {
> >>> +			flush_done |= (1 << i);
> >>>    			continue;
> >>> -
> >>> -		DRM_ERROR("Timeout waiting for VM flush ACK!\n");
> >>> +		}
> >>>    	}
> >>>
> >>>    	spin_unlock(&adev->gmc.invalidate_lock);
> >>> +	if (flush_done != 3) {
> >>> +		if (loop++ < 3)
> >>> +			goto retry;
> >>> +		else
> >>> +			DRM_ERROR("Timeout waiting for VM flush ACK!\n");
> >>> +	}
> >>>    }
> >>>
> >>> +
> >>>    static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring
> *ring,
> >>>    					    unsigned vmid, uint64_t pd_addr)
> >>>    {

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2018-03-23  3:13 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-03-20  6:29 [PATCH] drm/amdgpu: give more chance for tlb flush if failed Emily Deng
     [not found] ` <1521527344-25641-1-git-send-email-Emily.Deng-5C7GfCeVMHo@public.gmane.org>
2018-03-20 10:23   ` Christian König
     [not found]     ` <1a54a624-2816-c03f-bc75-da30be2ae48e-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2018-03-22  3:50       ` Deng, Emily
     [not found]         ` <CY4PR12MB112582B8413D7323ADCDD97C8FA90-rpdhrqHFk07v2MZdTKcfDgdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2018-03-22  8:53           ` Christian König
     [not found]             ` <50377594-10f9-8aec-1b2b-acd974527503-5C7GfCeVMHo@public.gmane.org>
2018-03-23  3:13               ` Deng, Emily

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.