All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] Increase tlb flush timeout for sriov
@ 2022-08-03  9:01 Dusica Milinkovic
  2022-08-03 15:30 ` Wong, Alice
  2022-08-08 20:49 ` Alex Deucher
  0 siblings, 2 replies; 6+ messages in thread
From: Dusica Milinkovic @ 2022-08-03  9:01 UTC (permalink / raw)
  To: amd-gfx; +Cc: Dusica Milinkovic

Signed-off-by: Dusica Milinkovic <dusica.milinkovic@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 6 +++++-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 6 +++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 9ae8cdaa033e..6ab7d329916f 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -419,6 +419,7 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
 	uint32_t seq;
 	uint16_t queried_pasid;
 	bool ret;
+	uint32_t sriov_usec_timeout = 6000000;  /* wait for 12 * 500ms for SRIOV */
 	struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 
@@ -437,7 +438,10 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
 
 		amdgpu_ring_commit(ring);
 		spin_unlock(&adev->gfx.kiq.ring_lock);
-		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
+		if (amdgpu_sriov_vf(adev))
+			r = amdgpu_fence_wait_polling(ring, seq, sriov_usec_timeout);
+		else
+			r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
 		if (r < 1) {
 			dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
 			return -ETIME;
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 22761a3bb818..941a6b52fa72 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -896,6 +896,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
 	uint32_t seq;
 	uint16_t queried_pasid;
 	bool ret;
+	uint32_t sriov_usec_timeout = 6000000;  /* wait for 12 * 500ms for SRIOV */
 	struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 
@@ -935,7 +936,10 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
 
 		amdgpu_ring_commit(ring);
 		spin_unlock(&adev->gfx.kiq.ring_lock);
-		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
+		if (amdgpu_sriov_vf(adev))
+			r = amdgpu_fence_wait_polling(ring, seq, sriov_usec_timeout);
+		else
+			r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
 		if (r < 1) {
 			dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
 			up_read(&adev->reset_domain->sem);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* RE: [PATCH] Increase tlb flush timeout for sriov
  2022-08-03  9:01 [PATCH] Increase tlb flush timeout for sriov Dusica Milinkovic
@ 2022-08-03 15:30 ` Wong, Alice
  2022-08-08 20:49 ` Alex Deucher
  1 sibling, 0 replies; 6+ messages in thread
From: Wong, Alice @ 2022-08-03 15:30 UTC (permalink / raw)
  To: Milinkovic, Dusica, amd-gfx, Liu, Shaoyun; +Cc: Milinkovic, Dusica

[AMD Official Use Only - General]


-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Dusica Milinkovic
Sent: August 3, 2022 5:02 AM
To: amd-gfx@lists.freedesktop.org
Cc: Milinkovic, Dusica <Dusica.Milinkovic@amd.com>
Subject: [PATCH] Increase tlb flush timeout for sriov

Signed-off-by: Dusica Milinkovic <dusica.milinkovic@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 6 +++++-  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 6 +++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 9ae8cdaa033e..6ab7d329916f 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -419,6 +419,7 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
 	uint32_t seq;
 	uint16_t queried_pasid;
 	bool ret;
+	uint32_t sriov_usec_timeout = 6000000;  /* wait for 12 * 500ms for 
+SRIOV */
 	struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 
@@ -437,7 +438,10 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
 
 		amdgpu_ring_commit(ring);
 		spin_unlock(&adev->gfx.kiq.ring_lock);
-		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
+		if (amdgpu_sriov_vf(adev))
+			r = amdgpu_fence_wait_polling(ring, seq, sriov_usec_timeout);
+		else
+			r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
 		if (r < 1) {
 			dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
 			return -ETIME;
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 22761a3bb818..941a6b52fa72 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -896,6 +896,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
 	uint32_t seq;
 	uint16_t queried_pasid;
 	bool ret;
+	uint32_t sriov_usec_timeout = 6000000;  /* wait for 12 * 500ms for 
+SRIOV */
 	struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 
@@ -935,7 +936,10 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
 
 		amdgpu_ring_commit(ring);
 		spin_unlock(&adev->gfx.kiq.ring_lock);
-		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
+		if (amdgpu_sriov_vf(adev))
+			r = amdgpu_fence_wait_polling(ring, seq, sriov_usec_timeout);
+		else
+			r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
 		if (r < 1) {
 			dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
 			up_read(&adev->reset_domain->sem);
--
2.25.1

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH] Increase tlb flush timeout for sriov
  2022-08-03  9:01 [PATCH] Increase tlb flush timeout for sriov Dusica Milinkovic
  2022-08-03 15:30 ` Wong, Alice
@ 2022-08-08 20:49 ` Alex Deucher
  2022-08-08 22:30   ` Liu, Shaoyun
  1 sibling, 1 reply; 6+ messages in thread
From: Alex Deucher @ 2022-08-08 20:49 UTC (permalink / raw)
  To: Dusica Milinkovic; +Cc: amd-gfx

On Wed, Aug 3, 2022 at 5:02 AM Dusica Milinkovic
<dusica.milinkovic@amd.com> wrote:
>

Please include a patch description.  Why do you need a longer timeout?
 What problem does it fix?

> Signed-off-by: Dusica Milinkovic <dusica.milinkovic@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 6 +++++-
>  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 6 +++++-
>  2 files changed, 10 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 9ae8cdaa033e..6ab7d329916f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -419,6 +419,7 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>         uint32_t seq;
>         uint16_t queried_pasid;
>         bool ret;
> +       uint32_t sriov_usec_timeout = 6000000;  /* wait for 12 * 500ms for SRIOV */
>         struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
>         struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>
> @@ -437,7 +438,10 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>
>                 amdgpu_ring_commit(ring);
>                 spin_unlock(&adev->gfx.kiq.ring_lock);
> -               r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
> +               if (amdgpu_sriov_vf(adev))
> +                       r = amdgpu_fence_wait_polling(ring, seq, sriov_usec_timeout);
> +               else
> +                       r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);

What about something like this?
u32 usec_timeout = amdgpu_sriov_vf(adev) ? 6000000 :
adev->usec_timeout;  /* wait for 12 * 500ms for SRIOV */
...
r = amdgpu_fence_wait_polling(ring, seq, usec_timeout);


>                 if (r < 1) {
>                         dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
>                         return -ETIME;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 22761a3bb818..941a6b52fa72 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -896,6 +896,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>         uint32_t seq;
>         uint16_t queried_pasid;
>         bool ret;
> +       uint32_t sriov_usec_timeout = 6000000;  /* wait for 12 * 500ms for SRIOV */
>         struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
>         struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>
> @@ -935,7 +936,10 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>
>                 amdgpu_ring_commit(ring);
>                 spin_unlock(&adev->gfx.kiq.ring_lock);
> -               r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
> +               if (amdgpu_sriov_vf(adev))
> +                       r = amdgpu_fence_wait_polling(ring, seq, sriov_usec_timeout);
> +               else
> +                       r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);

Same comment here.

Alex

>                 if (r < 1) {
>                         dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
>                         up_read(&adev->reset_domain->sem);
> --
> 2.25.1
>

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] Increase tlb flush timeout for sriov
  2022-08-08 20:49 ` Alex Deucher
@ 2022-08-08 22:30   ` Liu, Shaoyun
  2022-08-09 11:19     ` Milinkovic, Dusica
  2022-08-09 17:04     ` Deucher, Alexander
  0 siblings, 2 replies; 6+ messages in thread
From: Liu, Shaoyun @ 2022-08-08 22:30 UTC (permalink / raw)
  To: Alex Deucher, Milinkovic, Dusica; +Cc: amd-gfx

[-- Attachment #1: Type: text/plain, Size: 4169 bytes --]

As I discussed with Alice ,this change is when multi-vf running compute benchmark (Luxmark) at the same time, which involves multiple vf  do the tlb invalidation at the same time. They observed kiq timeout after submit the tlb invalidate command. Although each vf has the invalidate register set, but from hw, the invalidate requests are queue to execute.

Alice, as we discussed, we can use maximum 12*100ms for the timeout , it shouldn't be 6000ms. Did you see issues with 1200 ms timeout?

Regards
Shaoyun.liu
________________________________
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of Alex Deucher <alexdeucher@gmail.com>
Sent: August 8, 2022 4:49 PM
To: Milinkovic, Dusica <Dusica.Milinkovic@amd.com>
Cc: amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
Subject: Re: [PATCH] Increase tlb flush timeout for sriov

On Wed, Aug 3, 2022 at 5:02 AM Dusica Milinkovic
<dusica.milinkovic@amd.com> wrote:
>

Please include a patch description.  Why do you need a longer timeout?
 What problem does it fix?

> Signed-off-by: Dusica Milinkovic <dusica.milinkovic@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 6 +++++-
>  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 6 +++++-
>  2 files changed, 10 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 9ae8cdaa033e..6ab7d329916f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -419,6 +419,7 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>         uint32_t seq;
>         uint16_t queried_pasid;
>         bool ret;
> +       uint32_t sriov_usec_timeout = 6000000;  /* wait for 12 * 500ms for SRIOV */
>         struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
>         struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>
> @@ -437,7 +438,10 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>
>                 amdgpu_ring_commit(ring);
>                 spin_unlock(&adev->gfx.kiq.ring_lock);
> -               r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
> +               if (amdgpu_sriov_vf(adev))
> +                       r = amdgpu_fence_wait_polling(ring, seq, sriov_usec_timeout);
> +               else
> +                       r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);

What about something like this?
u32 usec_timeout = amdgpu_sriov_vf(adev) ? 6000000 :
adev->usec_timeout;  /* wait for 12 * 500ms for SRIOV */
...
r = amdgpu_fence_wait_polling(ring, seq, usec_timeout);


>                 if (r < 1) {
>                         dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
>                         return -ETIME;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 22761a3bb818..941a6b52fa72 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -896,6 +896,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>         uint32_t seq;
>         uint16_t queried_pasid;
>         bool ret;
> +       uint32_t sriov_usec_timeout = 6000000;  /* wait for 12 * 500ms for SRIOV */
>         struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
>         struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>
> @@ -935,7 +936,10 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>
>                 amdgpu_ring_commit(ring);
>                 spin_unlock(&adev->gfx.kiq.ring_lock);
> -               r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
> +               if (amdgpu_sriov_vf(adev))
> +                       r = amdgpu_fence_wait_polling(ring, seq, sriov_usec_timeout);
> +               else
> +                       r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);

Same comment here.

Alex

>                 if (r < 1) {
>                         dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
>                         up_read(&adev->reset_domain->sem);
> --
> 2.25.1
>

[-- Attachment #2: Type: text/html, Size: 7742 bytes --]

^ permalink raw reply	[flat|nested] 6+ messages in thread

* RE: [PATCH] Increase tlb flush timeout for sriov
  2022-08-08 22:30   ` Liu, Shaoyun
@ 2022-08-09 11:19     ` Milinkovic, Dusica
  2022-08-09 17:04     ` Deucher, Alexander
  1 sibling, 0 replies; 6+ messages in thread
From: Milinkovic, Dusica @ 2022-08-09 11:19 UTC (permalink / raw)
  To: Liu, Shaoyun, Alex Deucher; +Cc: amd-gfx

[-- Attachment #1: Type: text/plain, Size: 4908 bytes --]

[AMD Official Use Only - General]

Hello all,

I just checked this with 1200ms for timeout, test case successfully passed 10 loops.
If all of you agree, I will change a value in the patch.

Regards,
Dusica

From: Liu, Shaoyun <Shaoyun.Liu@amd.com>
Sent: Tuesday, August 9, 2022 12:31 AM
To: Alex Deucher <alexdeucher@gmail.com>; Milinkovic, Dusica <Dusica.Milinkovic@amd.com>
Cc: amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] Increase tlb flush timeout for sriov

As I discussed with Alice ,this change is when multi-vf running compute benchmark (Luxmark) at the same time, which involves multiple vf  do the tlb invalidation at the same time. They observed kiq timeout after submit the tlb invalidate command. Although each vf has the invalidate register set, but from hw, the invalidate requests are queue to execute.

Alice, as we discussed, we can use maximum 12*100ms for the timeout , it shouldn't be 6000ms. Did you see issues with 1200 ms timeout?

Regards
Shaoyun.liu
________________________________
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org<mailto:amd-gfx-bounces@lists.freedesktop.org>> on behalf of Alex Deucher <alexdeucher@gmail.com<mailto:alexdeucher@gmail.com>>
Sent: August 8, 2022 4:49 PM
To: Milinkovic, Dusica <Dusica.Milinkovic@amd.com<mailto:Dusica.Milinkovic@amd.com>>
Cc: amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> <amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>>
Subject: Re: [PATCH] Increase tlb flush timeout for sriov

On Wed, Aug 3, 2022 at 5:02 AM Dusica Milinkovic
<dusica.milinkovic@amd.com<mailto:dusica.milinkovic@amd.com>> wrote:
>

Please include a patch description.  Why do you need a longer timeout?
 What problem does it fix?

> Signed-off-by: Dusica Milinkovic <dusica.milinkovic@amd.com<mailto:dusica.milinkovic@amd.com>>
> ---
>  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 6 +++++-
>  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 6 +++++-
>  2 files changed, 10 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 9ae8cdaa033e..6ab7d329916f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -419,6 +419,7 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>         uint32_t seq;
>         uint16_t queried_pasid;
>         bool ret;
> +       uint32_t sriov_usec_timeout = 6000000;  /* wait for 12 * 500ms for SRIOV */
>         struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
>         struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>
> @@ -437,7 +438,10 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>
>                 amdgpu_ring_commit(ring);
>                 spin_unlock(&adev->gfx.kiq.ring_lock);
> -               r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
> +               if (amdgpu_sriov_vf(adev))
> +                       r = amdgpu_fence_wait_polling(ring, seq, sriov_usec_timeout);
> +               else
> +                       r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);

What about something like this?
u32 usec_timeout = amdgpu_sriov_vf(adev) ? 6000000 :
adev->usec_timeout;  /* wait for 12 * 500ms for SRIOV */
...
r = amdgpu_fence_wait_polling(ring, seq, usec_timeout);


>                 if (r < 1) {
>                         dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
>                         return -ETIME;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 22761a3bb818..941a6b52fa72 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -896,6 +896,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>         uint32_t seq;
>         uint16_t queried_pasid;
>         bool ret;
> +       uint32_t sriov_usec_timeout = 6000000;  /* wait for 12 * 500ms for SRIOV */
>         struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
>         struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>
> @@ -935,7 +936,10 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>
>                 amdgpu_ring_commit(ring);
>                 spin_unlock(&adev->gfx.kiq.ring_lock);
> -               r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
> +               if (amdgpu_sriov_vf(adev))
> +                       r = amdgpu_fence_wait_polling(ring, seq, sriov_usec_timeout);
> +               else
> +                       r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);

Same comment here.

Alex

>                 if (r < 1) {
>                         dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
>                         up_read(&adev->reset_domain->sem);
> --
> 2.25.1
>

[-- Attachment #2: Type: text/html, Size: 10806 bytes --]

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] Increase tlb flush timeout for sriov
  2022-08-08 22:30   ` Liu, Shaoyun
  2022-08-09 11:19     ` Milinkovic, Dusica
@ 2022-08-09 17:04     ` Deucher, Alexander
  1 sibling, 0 replies; 6+ messages in thread
From: Deucher, Alexander @ 2022-08-09 17:04 UTC (permalink / raw)
  To: Liu, Shaoyun, Alex Deucher, Milinkovic, Dusica; +Cc: amd-gfx

[-- Attachment #1: Type: text/plain, Size: 4667 bytes --]

[AMD Official Use Only - General]

Please add these details to the patch description when you respin.
________________________________
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of Liu, Shaoyun <Shaoyun.Liu@amd.com>
Sent: Monday, August 8, 2022 6:30 PM
To: Alex Deucher <alexdeucher@gmail.com>; Milinkovic, Dusica <Dusica.Milinkovic@amd.com>
Cc: amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
Subject: Re: [PATCH] Increase tlb flush timeout for sriov

As I discussed with Alice ,this change is when multi-vf running compute benchmark (Luxmark) at the same time, which involves multiple vf  do the tlb invalidation at the same time. They observed kiq timeout after submit the tlb invalidate command. Although each vf has the invalidate register set, but from hw, the invalidate requests are queue to execute.

Alice, as we discussed, we can use maximum 12*100ms for the timeout , it shouldn't be 6000ms. Did you see issues with 1200 ms timeout?

Regards
Shaoyun.liu
________________________________
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of Alex Deucher <alexdeucher@gmail.com>
Sent: August 8, 2022 4:49 PM
To: Milinkovic, Dusica <Dusica.Milinkovic@amd.com>
Cc: amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
Subject: Re: [PATCH] Increase tlb flush timeout for sriov

On Wed, Aug 3, 2022 at 5:02 AM Dusica Milinkovic
<dusica.milinkovic@amd.com> wrote:
>

Please include a patch description.  Why do you need a longer timeout?
 What problem does it fix?

> Signed-off-by: Dusica Milinkovic <dusica.milinkovic@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 6 +++++-
>  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 6 +++++-
>  2 files changed, 10 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 9ae8cdaa033e..6ab7d329916f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -419,6 +419,7 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>         uint32_t seq;
>         uint16_t queried_pasid;
>         bool ret;
> +       uint32_t sriov_usec_timeout = 6000000;  /* wait for 12 * 500ms for SRIOV */
>         struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
>         struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>
> @@ -437,7 +438,10 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>
>                 amdgpu_ring_commit(ring);
>                 spin_unlock(&adev->gfx.kiq.ring_lock);
> -               r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
> +               if (amdgpu_sriov_vf(adev))
> +                       r = amdgpu_fence_wait_polling(ring, seq, sriov_usec_timeout);
> +               else
> +                       r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);

What about something like this?
u32 usec_timeout = amdgpu_sriov_vf(adev) ? 6000000 :
adev->usec_timeout;  /* wait for 12 * 500ms for SRIOV */
...
r = amdgpu_fence_wait_polling(ring, seq, usec_timeout);


>                 if (r < 1) {
>                         dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
>                         return -ETIME;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 22761a3bb818..941a6b52fa72 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -896,6 +896,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>         uint32_t seq;
>         uint16_t queried_pasid;
>         bool ret;
> +       uint32_t sriov_usec_timeout = 6000000;  /* wait for 12 * 500ms for SRIOV */
>         struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
>         struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>
> @@ -935,7 +936,10 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>
>                 amdgpu_ring_commit(ring);
>                 spin_unlock(&adev->gfx.kiq.ring_lock);
> -               r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
> +               if (amdgpu_sriov_vf(adev))
> +                       r = amdgpu_fence_wait_polling(ring, seq, sriov_usec_timeout);
> +               else
> +                       r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);

Same comment here.

Alex

>                 if (r < 1) {
>                         dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
>                         up_read(&adev->reset_domain->sem);
> --
> 2.25.1
>

[-- Attachment #2: Type: text/html, Size: 8884 bytes --]

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2022-08-09 17:05 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-08-03  9:01 [PATCH] Increase tlb flush timeout for sriov Dusica Milinkovic
2022-08-03 15:30 ` Wong, Alice
2022-08-08 20:49 ` Alex Deucher
2022-08-08 22:30   ` Liu, Shaoyun
2022-08-09 11:19     ` Milinkovic, Dusica
2022-08-09 17:04     ` Deucher, Alexander

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.