All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20  9:14 ` Changfeng.Zhu
  0 siblings, 0 replies; 42+ messages in thread
From: Changfeng.Zhu @ 2019-11-20  9:14 UTC (permalink / raw)
  To: Christian.Koenig-5C7GfCeVMHo, Jack.Xiao-5C7GfCeVMHo,
	Tao.Zhou1-5C7GfCeVMHo, Ray.Huang-5C7GfCeVMHo,
	Xinmei.Huang-5C7GfCeVMHo,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: changzhu

From: changzhu <Changfeng.Zhu@amd.com>

The gpuvm invalidate acknowledge state may be lost across a power-gating
off cycle. To avoid this issue in virt invalidation, add a semaphore acquire
before invalidation and a semaphore release after invalidation.

Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
 3 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index f04eb1a64271..70ffaf91cd12 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
 
 void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
 					uint32_t reg0, uint32_t reg1,
-					uint32_t ref, uint32_t mask)
+					uint32_t ref, uint32_t mask,
+					uint32_t sem)
 {
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 	struct amdgpu_ring *ring = &kiq->ring;
@@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
 	uint32_t seq;
 
 	spin_lock_irqsave(&kiq->ring_lock, flags);
-	amdgpu_ring_alloc(ring, 32);
+	amdgpu_ring_alloc(ring, 60);
+
+	/*
+	 * It may lose gpuvm invalidate acknowldege state across power-gating
+	 * off cycle, add semaphore acquire before invalidation and semaphore
+	 * release after invalidation to avoid entering power gated state
+	 * to WA the Issue
+	 */
+
+	/* a read return value of 1 means semaphore acuqire */
+	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
+	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
+	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
+
 	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
 					    ref, mask);
+	/*
+	 * add semaphore release after invalidation,
+	 * write with 0 means semaphore release
+	 */
+	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
+	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
+	amdgpu_ring_emit_wreg(ring, sem, 0);
+
 	amdgpu_fence_emit_polling(ring, &seq);
 	amdgpu_ring_commit(ring);
 	spin_unlock_irqrestore(&kiq->ring_lock, flags);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index b0b2bdc750df..bda6a2f37dc0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);
 void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);
 void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
 					uint32_t reg0, uint32_t rreg1,
-					uint32_t ref, uint32_t mask);
+					uint32_t ref, uint32_t mask,
+					uint32_t sem);
 int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
 int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);
 int amdgpu_virt_reset_gpu(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index f25cd97ba5f2..1ae59af7836a 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
 			!adev->in_gpu_reset) {
 		uint32_t req = hub->vm_inv_eng0_req + eng;
 		uint32_t ack = hub->vm_inv_eng0_ack + eng;
+		uint32_t sem = hub->vm_inv_eng0_sem + eng;
 
 		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
-				1 << vmid);
+						   1 << vmid, sem);
 		return;
 	}
 
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20  9:14 ` Changfeng.Zhu
  0 siblings, 0 replies; 42+ messages in thread
From: Changfeng.Zhu @ 2019-11-20  9:14 UTC (permalink / raw)
  To: Christian.Koenig, Jack.Xiao, Tao.Zhou1, Ray.Huang, Xinmei.Huang, amd-gfx
  Cc: changzhu

From: changzhu <Changfeng.Zhu@amd.com>

The gpuvm invalidate acknowledge state may be lost across a power-gating
off cycle. To avoid this issue in virt invalidation, add a semaphore acquire
before invalidation and a semaphore release after invalidation.

Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
 3 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index f04eb1a64271..70ffaf91cd12 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
 
 void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
 					uint32_t reg0, uint32_t reg1,
-					uint32_t ref, uint32_t mask)
+					uint32_t ref, uint32_t mask,
+					uint32_t sem)
 {
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 	struct amdgpu_ring *ring = &kiq->ring;
@@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
 	uint32_t seq;
 
 	spin_lock_irqsave(&kiq->ring_lock, flags);
-	amdgpu_ring_alloc(ring, 32);
+	amdgpu_ring_alloc(ring, 60);
+
+	/*
+	 * It may lose gpuvm invalidate acknowldege state across power-gating
+	 * off cycle, add semaphore acquire before invalidation and semaphore
+	 * release after invalidation to avoid entering power gated state
+	 * to WA the Issue
+	 */
+
+	/* a read return value of 1 means semaphore acuqire */
+	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
+	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
+	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
+
 	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
 					    ref, mask);
+	/*
+	 * add semaphore release after invalidation,
+	 * write with 0 means semaphore release
+	 */
+	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
+	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
+	amdgpu_ring_emit_wreg(ring, sem, 0);
+
 	amdgpu_fence_emit_polling(ring, &seq);
 	amdgpu_ring_commit(ring);
 	spin_unlock_irqrestore(&kiq->ring_lock, flags);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index b0b2bdc750df..bda6a2f37dc0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);
 void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);
 void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
 					uint32_t reg0, uint32_t rreg1,
-					uint32_t ref, uint32_t mask);
+					uint32_t ref, uint32_t mask,
+					uint32_t sem);
 int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
 int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);
 int amdgpu_virt_reset_gpu(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index f25cd97ba5f2..1ae59af7836a 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
 			!adev->in_gpu_reset) {
 		uint32_t req = hub->vm_inv_eng0_req + eng;
 		uint32_t ack = hub->vm_inv_eng0_ack + eng;
+		uint32_t sem = hub->vm_inv_eng0_sem + eng;
 
 		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
-				1 << vmid);
+						   1 << vmid, sem);
 		return;
 	}
 
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 11:23     ` Christian König
  0 siblings, 0 replies; 42+ messages in thread
From: Christian König @ 2019-11-20 11:23 UTC (permalink / raw)
  To: Changfeng.Zhu, Christian.Koenig-5C7GfCeVMHo,
	Jack.Xiao-5C7GfCeVMHo, Tao.Zhou1-5C7GfCeVMHo,
	Ray.Huang-5C7GfCeVMHo, Xinmei.Huang-5C7GfCeVMHo,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Deng, Emily, monk.liu

Hi Changfeng,

[adding Monk and Emily as well].

I thought more about this and came to the conclusion that this won't 
work and might result in a lockup as well.

We are using the KIQ on SRIOV for GPUVM invalidation because we need an
atomic read/modify/write cycle, since we found that the invalidation
engine is reset with every world switch.

Now accessing the semaphore registers is not atomic any more and we 
could have a world switch in between grabbing the semaphore and sending 
the VM invalidation. That either won't work or could result in a lockup 
as well.

Question for Emily and Monk: Do we support power gating of the MMHUB
with SRIOV? I don't think so, and if that's correct we could just drop
this patch.

Regards,
Christian.

Am 20.11.19 um 10:14 schrieb Changfeng.Zhu:
> From: changzhu <Changfeng.Zhu@amd.com>
>
> It may lose gpuvm invalidate acknowldege state across power-gating off
> cycle. To avoid this issue in virt invalidation, add semaphore acquire
> before invalidation and semaphore release after invalidation.
>
> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>   3 files changed, 28 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index f04eb1a64271..70ffaf91cd12 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
>   
>   void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   					uint32_t reg0, uint32_t reg1,
> -					uint32_t ref, uint32_t mask)
> +					uint32_t ref, uint32_t mask,
> +					uint32_t sem)
>   {
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *ring = &kiq->ring;
> @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   	uint32_t seq;
>   
>   	spin_lock_irqsave(&kiq->ring_lock, flags);
> -	amdgpu_ring_alloc(ring, 32);
> +	amdgpu_ring_alloc(ring, 60);
> +
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +
> +	/* a read return value of 1 means semaphore acuqire */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
> +
>   	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>   					    ref, mask);
> +	/*
> +	 * add semaphore release after invalidation,
> +	 * write with 0 means semaphore release
> +	 */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_wreg(ring, sem, 0);
> +
>   	amdgpu_fence_emit_polling(ring, &seq);
>   	amdgpu_ring_commit(ring);
>   	spin_unlock_irqrestore(&kiq->ring_lock, flags);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index b0b2bdc750df..bda6a2f37dc0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);
>   void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);
>   void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   					uint32_t reg0, uint32_t rreg1,
> -					uint32_t ref, uint32_t mask);
> +					uint32_t ref, uint32_t mask,
> +					uint32_t sem);
>   int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
>   int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);
>   int amdgpu_virt_reset_gpu(struct amdgpu_device *adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index f25cd97ba5f2..1ae59af7836a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   			!adev->in_gpu_reset) {
>   		uint32_t req = hub->vm_inv_eng0_req + eng;
>   		uint32_t ack = hub->vm_inv_eng0_ack + eng;
> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>   
>   		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
> -				1 << vmid);
> +						   1 << vmid, sem);
>   		return;
>   	}
>   

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 11:23     ` Christian König
  0 siblings, 0 replies; 42+ messages in thread
From: Christian König @ 2019-11-20 11:23 UTC (permalink / raw)
  To: Changfeng.Zhu, Christian.Koenig, Jack.Xiao, Tao.Zhou1, Ray.Huang,
	Xinmei.Huang, amd-gfx, Deng, Emily, monk.liu

Hi Changfeng,

[adding Monk and Emily as well].

I thought more about this and came to the conclusion that this won't 
work and might result in a lockup as well.

We are using the KIQ on SRIOV for GPUVM invalidation because we need an
atomic read/modify/write cycle, since we found that the invalidation
engine is reset with every world switch.

Now accessing the semaphore registers is not atomic any more and we 
could have a world switch in between grabbing the semaphore and sending 
the VM invalidation. That either won't work or could result in a lockup 
as well.

Question for Emily and Monk: Do we support power gating of the MMHUB
with SRIOV? I don't think so, and if that's correct we could just drop
this patch.

Regards,
Christian.

Am 20.11.19 um 10:14 schrieb Changfeng.Zhu:
> From: changzhu <Changfeng.Zhu@amd.com>
>
> It may lose gpuvm invalidate acknowldege state across power-gating off
> cycle. To avoid this issue in virt invalidation, add semaphore acquire
> before invalidation and semaphore release after invalidation.
>
> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>   3 files changed, 28 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index f04eb1a64271..70ffaf91cd12 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
>   
>   void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   					uint32_t reg0, uint32_t reg1,
> -					uint32_t ref, uint32_t mask)
> +					uint32_t ref, uint32_t mask,
> +					uint32_t sem)
>   {
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *ring = &kiq->ring;
> @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   	uint32_t seq;
>   
>   	spin_lock_irqsave(&kiq->ring_lock, flags);
> -	amdgpu_ring_alloc(ring, 32);
> +	amdgpu_ring_alloc(ring, 60);
> +
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +
> +	/* a read return value of 1 means semaphore acuqire */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
> +
>   	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>   					    ref, mask);
> +	/*
> +	 * add semaphore release after invalidation,
> +	 * write with 0 means semaphore release
> +	 */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_wreg(ring, sem, 0);
> +
>   	amdgpu_fence_emit_polling(ring, &seq);
>   	amdgpu_ring_commit(ring);
>   	spin_unlock_irqrestore(&kiq->ring_lock, flags);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index b0b2bdc750df..bda6a2f37dc0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);
>   void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);
>   void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   					uint32_t reg0, uint32_t rreg1,
> -					uint32_t ref, uint32_t mask);
> +					uint32_t ref, uint32_t mask,
> +					uint32_t sem);
>   int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
>   int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);
>   int amdgpu_virt_reset_gpu(struct amdgpu_device *adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index f25cd97ba5f2..1ae59af7836a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   			!adev->in_gpu_reset) {
>   		uint32_t req = hub->vm_inv_eng0_req + eng;
>   		uint32_t ack = hub->vm_inv_eng0_ack + eng;
> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>   
>   		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
> -				1 << vmid);
> +						   1 << vmid, sem);
>   		return;
>   	}
>   

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 13:18     ` Liu, Monk
  0 siblings, 0 replies; 42+ messages in thread
From: Liu, Monk @ 2019-11-20 13:18 UTC (permalink / raw)
  To: Koenig, Christian, Xiao, Jack, Zhou1, Tao, Huang, Ray, Huang,
	Shimmer, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Zhu, Changfeng

Hi Changfeng 

First of all, there is no power-gating off cycle involved in AMDGPU SRIOV, since we don't allow the VF/VM to do such things, so I find it strange that you posted something like this,
especially for the VEGA10 series, which doesn't appear to have any issue in the gpu_flush part.

Here are my questions for you:
1) Can you point me to the issue you experienced, and how to reproduce the bug?
2) If you did hit an issue, did you verify that your patch fixes it?

besides

/Monk

-----邮件原件-----
发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> 代表 Changfeng.Zhu
发送时间: 2019年11月20日 17:14
收件人: Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
抄送: Zhu, Changfeng <Changfeng.Zhu@amd.com>
主题: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt

From: changzhu <Changfeng.Zhu@amd.com>

It may lose gpuvm invalidate acknowldege state across power-gating off cycle. To avoid this issue in virt invalidation, add semaphore acquire before invalidation and semaphore release after invalidation.

Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
 3 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index f04eb1a64271..70ffaf91cd12 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
 
 void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
 					uint32_t reg0, uint32_t reg1,
-					uint32_t ref, uint32_t mask)
+					uint32_t ref, uint32_t mask,
+					uint32_t sem)
 {
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
 	uint32_t seq;
 
 	spin_lock_irqsave(&kiq->ring_lock, flags);
-	amdgpu_ring_alloc(ring, 32);
+	amdgpu_ring_alloc(ring, 60);
+
+	/*
+	 * It may lose gpuvm invalidate acknowldege state across power-gating
+	 * off cycle, add semaphore acquire before invalidation and semaphore
+	 * release after invalidation to avoid entering power gated state
+	 * to WA the Issue
+	 */
+
+	/* a read return value of 1 means semaphore acuqire */
+	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
+	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
+	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
+
 	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
 					    ref, mask);
+	/*
+	 * add semaphore release after invalidation,
+	 * write with 0 means semaphore release
+	 */
+	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
+	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
+	amdgpu_ring_emit_wreg(ring, sem, 0);
+
 	amdgpu_fence_emit_polling(ring, &seq);
 	amdgpu_ring_commit(ring);
 	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index b0b2bdc750df..bda6a2f37dc0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);  void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);  void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
 					uint32_t reg0, uint32_t rreg1,
-					uint32_t ref, uint32_t mask);
+					uint32_t ref, uint32_t mask,
+					uint32_t sem);
 int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);  int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);  int amdgpu_virt_reset_gpu(struct amdgpu_device *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index f25cd97ba5f2..1ae59af7836a 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
 			!adev->in_gpu_reset) {
 		uint32_t req = hub->vm_inv_eng0_req + eng;
 		uint32_t ack = hub->vm_inv_eng0_ack + eng;
+		uint32_t sem = hub->vm_inv_eng0_sem + eng;
 
 		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
-				1 << vmid);
+						   1 << vmid, sem);
 		return;
 	}
 
--
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 13:18     ` Liu, Monk
  0 siblings, 0 replies; 42+ messages in thread
From: Liu, Monk @ 2019-11-20 13:18 UTC (permalink / raw)
  To: Zhu, Changfeng, Koenig, Christian, Xiao, Jack, Zhou1, Tao, Huang,
	Ray, Huang, Shimmer, amd-gfx
  Cc: Zhu, Changfeng

Hi Changfeng 

First of all, there is no power-gating off cycle involved in AMDGPU SRIOV, since we don't allow the VF/VM to do such things, so I find it strange that you posted something like this,
especially for the VEGA10 series, which doesn't appear to have any issue in the gpu_flush part.

Here are my questions for you:
1) Can you point me to the issue you experienced, and how to reproduce the bug?
2) If you did hit an issue, did you verify that your patch fixes it?

besides

/Monk

-----邮件原件-----
发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> 代表 Changfeng.Zhu
发送时间: 2019年11月20日 17:14
收件人: Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
抄送: Zhu, Changfeng <Changfeng.Zhu@amd.com>
主题: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt

From: changzhu <Changfeng.Zhu@amd.com>

It may lose gpuvm invalidate acknowldege state across power-gating off cycle. To avoid this issue in virt invalidation, add semaphore acquire before invalidation and semaphore release after invalidation.

Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
 3 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index f04eb1a64271..70ffaf91cd12 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
 
 void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
 					uint32_t reg0, uint32_t reg1,
-					uint32_t ref, uint32_t mask)
+					uint32_t ref, uint32_t mask,
+					uint32_t sem)
 {
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
 	uint32_t seq;
 
 	spin_lock_irqsave(&kiq->ring_lock, flags);
-	amdgpu_ring_alloc(ring, 32);
+	amdgpu_ring_alloc(ring, 60);
+
+	/*
+	 * It may lose gpuvm invalidate acknowldege state across power-gating
+	 * off cycle, add semaphore acquire before invalidation and semaphore
+	 * release after invalidation to avoid entering power gated state
+	 * to WA the Issue
+	 */
+
+	/* a read return value of 1 means semaphore acuqire */
+	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
+	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
+	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
+
 	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
 					    ref, mask);
+	/*
+	 * add semaphore release after invalidation,
+	 * write with 0 means semaphore release
+	 */
+	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
+	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
+	amdgpu_ring_emit_wreg(ring, sem, 0);
+
 	amdgpu_fence_emit_polling(ring, &seq);
 	amdgpu_ring_commit(ring);
 	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index b0b2bdc750df..bda6a2f37dc0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);  void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);  void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
 					uint32_t reg0, uint32_t rreg1,
-					uint32_t ref, uint32_t mask);
+					uint32_t ref, uint32_t mask,
+					uint32_t sem);
 int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);  int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);  int amdgpu_virt_reset_gpu(struct amdgpu_device *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index f25cd97ba5f2..1ae59af7836a 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
 			!adev->in_gpu_reset) {
 		uint32_t req = hub->vm_inv_eng0_req + eng;
 		uint32_t ack = hub->vm_inv_eng0_ack + eng;
+		uint32_t sem = hub->vm_inv_eng0_sem + eng;
 
 		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
-				1 << vmid);
+						   1 << vmid, sem);
 		return;
 	}
 
--
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 13:20         ` Christian König
  0 siblings, 0 replies; 42+ messages in thread
From: Christian König @ 2019-11-20 13:20 UTC (permalink / raw)
  To: Liu, Monk, Zhu, Changfeng, Xiao, Jack, Zhou1, Tao, Huang, Ray,
	Huang, Shimmer, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Hi Monk,

this is a fix for power gating the MMHUB.

The basic problem is that the MMHUB can power gate while an invalidation is 
in progress, which loses all bits in the ACK register and so deadlocks 
the engine waiting for the invalidation to finish.

This bug is hit immediately when we enable power gating of the MMHUB.

Regards,
Christian.

Am 20.11.19 um 14:18 schrieb Liu, Monk:
> Hi Changfeng
>
> Firs of all, there is no power-gating off circle involved in AMDGPU SRIOV, since we don't allow VF/VM do such things so I do feel strange why you post something like this
> Especially on VEGA10 serials which looks doesn't have any issue on those gpu_flush part
>
> Here is my questions for you:
> 1) Can you point me what issue had you been experienced ? and how to repro the bug
> 2) if you do hit some issues, did you verified that your patch can fix it ?
>
> besides
>
> /Monk
>
> -----邮件原件-----
> 发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> 代表 Changfeng.Zhu
> 发送时间: 2019年11月20日 17:14
> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
> 抄送: Zhu, Changfeng <Changfeng.Zhu@amd.com>
> 主题: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
>
> From: changzhu <Changfeng.Zhu@amd.com>
>
> It may lose gpuvm invalidate acknowldege state across power-gating off cycle. To avoid this issue in virt invalidation, add semaphore acquire before invalidation and semaphore release after invalidation.
>
> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>   3 files changed, 28 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index f04eb1a64271..70ffaf91cd12 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
>   
>   void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   					uint32_t reg0, uint32_t reg1,
> -					uint32_t ref, uint32_t mask)
> +					uint32_t ref, uint32_t mask,
> +					uint32_t sem)
>   {
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   	uint32_t seq;
>   
>   	spin_lock_irqsave(&kiq->ring_lock, flags);
> -	amdgpu_ring_alloc(ring, 32);
> +	amdgpu_ring_alloc(ring, 60);
> +
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +
> +	/* a read return value of 1 means semaphore acuqire */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
> +
>   	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>   					    ref, mask);
> +	/*
> +	 * add semaphore release after invalidation,
> +	 * write with 0 means semaphore release
> +	 */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_wreg(ring, sem, 0);
> +
>   	amdgpu_fence_emit_polling(ring, &seq);
>   	amdgpu_ring_commit(ring);
>   	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index b0b2bdc750df..bda6a2f37dc0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);  void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);  void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   					uint32_t reg0, uint32_t rreg1,
> -					uint32_t ref, uint32_t mask);
> +					uint32_t ref, uint32_t mask,
> +					uint32_t sem);
>   int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);  int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);  int amdgpu_virt_reset_gpu(struct amdgpu_device *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index f25cd97ba5f2..1ae59af7836a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   			!adev->in_gpu_reset) {
>   		uint32_t req = hub->vm_inv_eng0_req + eng;
>   		uint32_t ack = hub->vm_inv_eng0_ack + eng;
> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>   
>   		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
> -				1 << vmid);
> +						   1 << vmid, sem);
>   		return;
>   	}
>   
> --
> 2.17.1
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 13:20         ` Christian König
  0 siblings, 0 replies; 42+ messages in thread
From: Christian König @ 2019-11-20 13:20 UTC (permalink / raw)
  To: Liu, Monk, Zhu, Changfeng, Xiao, Jack, Zhou1, Tao, Huang, Ray,
	Huang, Shimmer, amd-gfx

Hi Monk,

this is a fix for power gating the MMHUB.

The basic problem is that the MMHUB can power gate while an invalidation is 
in progress, which loses all bits in the ACK register and so deadlocks 
the engine waiting for the invalidation to finish.

This bug is hit immediately when we enable power gating of the MMHUB.

Regards,
Christian.

Am 20.11.19 um 14:18 schrieb Liu, Monk:
> Hi Changfeng
>
> Firs of all, there is no power-gating off circle involved in AMDGPU SRIOV, since we don't allow VF/VM do such things so I do feel strange why you post something like this
> Especially on VEGA10 serials which looks doesn't have any issue on those gpu_flush part
>
> Here is my questions for you:
> 1) Can you point me what issue had you been experienced ? and how to repro the bug
> 2) if you do hit some issues, did you verified that your patch can fix it ?
>
> besides
>
> /Monk
>
> -----邮件原件-----
> 发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> 代表 Changfeng.Zhu
> 发送时间: 2019年11月20日 17:14
> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
> 抄送: Zhu, Changfeng <Changfeng.Zhu@amd.com>
> 主题: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
>
> From: changzhu <Changfeng.Zhu@amd.com>
>
> It may lose gpuvm invalidate acknowldege state across power-gating off cycle. To avoid this issue in virt invalidation, add semaphore acquire before invalidation and semaphore release after invalidation.
>
> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>   3 files changed, 28 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index f04eb1a64271..70ffaf91cd12 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
>   
>   void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   					uint32_t reg0, uint32_t reg1,
> -					uint32_t ref, uint32_t mask)
> +					uint32_t ref, uint32_t mask,
> +					uint32_t sem)
>   {
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   	uint32_t seq;
>   
>   	spin_lock_irqsave(&kiq->ring_lock, flags);
> -	amdgpu_ring_alloc(ring, 32);
> +	amdgpu_ring_alloc(ring, 60);
> +
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +
> +	/* a read return value of 1 means semaphore acuqire */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
> +
>   	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>   					    ref, mask);
> +	/*
> +	 * add semaphore release after invalidation,
> +	 * write with 0 means semaphore release
> +	 */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_wreg(ring, sem, 0);
> +
>   	amdgpu_fence_emit_polling(ring, &seq);
>   	amdgpu_ring_commit(ring);
>   	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index b0b2bdc750df..bda6a2f37dc0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);  void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);  void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   					uint32_t reg0, uint32_t rreg1,
> -					uint32_t ref, uint32_t mask);
> +					uint32_t ref, uint32_t mask,
> +					uint32_t sem);
>   int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);  int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);  int amdgpu_virt_reset_gpu(struct amdgpu_device *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index f25cd97ba5f2..1ae59af7836a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   			!adev->in_gpu_reset) {
>   		uint32_t req = hub->vm_inv_eng0_req + eng;
>   		uint32_t ack = hub->vm_inv_eng0_ack + eng;
> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>   
>   		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
> -				1 << vmid);
> +						   1 << vmid, sem);
>   		return;
>   	}
>   
> --
> 2.17.1
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 13:24         ` Liu, Monk
  0 siblings, 0 replies; 42+ messages in thread
From: Liu, Monk @ 2019-11-20 13:24 UTC (permalink / raw)
  To: Koenig, Christian, Zhu, Changfeng, Xiao, Jack, Zhou1, Tao, Huang,
	Ray, Huang, Shimmer, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	Deng, Emily

>>>are using the KIQ on SRIOV for GPUVM invalidation because we need an atomic read/modify/write cycle since we found that the invalidation engine is resetted with every world switch.
>>> accessing the semaphore registers is not atomic any more and we could have a world switch in between grabbing the semaphore and sending the VM invalidation. That either won't work or could result in a lockup as well.

Yeah, Christian is right on that,
By doing the VM invalidation through the KIQ, the whole procedure is atomic (no world switch can break it). If you split the work into two (or more) KIQ jobs, there is a chance of a world switch between any two of them
(a busy KIQ blocks the world switch, and once the KIQ is idle a world switch can happen immediately).

I'm still trying to understand: what problem did you run into?

-----邮件原件-----
发件人: Christian König <ckoenig.leichtzumerken@gmail.com> 
发送时间: 2019年11月20日 19:24
收件人: Zhu, Changfeng <Changfeng.Zhu@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org; Deng, Emily <Emily.Deng@amd.com>; Liu, Monk <Monk.Liu@amd.com>
主题: Re: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt

Hi Changfeng,

[adding Monk and Emily as well].

I thought more about this and came to the conclusion that this won't work and might result in a lockup as well.

We are using the KIQ on SRIOV for GPUVM invalidation because we need an atomic read/modify/write cycle, since we found that the invalidation engine is reset with every world switch.

Now accessing the semaphore registers is not atomic any more and we could have a world switch in between grabbing the semaphore and sending the VM invalidation. That either won't work or could result in a lockup as well.

Question for Emily and Monk: Do we support power gating of the MMHUB with SRIOV? I don't think so and when that's correct we could just drop this patch.

Regards,
Christian.

Am 20.11.19 um 10:14 schrieb Changfeng.Zhu:
> From: changzhu <Changfeng.Zhu@amd.com>
>
> It may lose gpuvm invalidate acknowldege state across power-gating off 
> cycle. To avoid this issue in virt invalidation, add semaphore acquire 
> before invalidation and semaphore release after invalidation.
>
> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>   3 files changed, 28 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index f04eb1a64271..70ffaf91cd12 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device 
> *adev, uint32_t reg, uint32_t v)
>   
>   void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   					uint32_t reg0, uint32_t reg1,
> -					uint32_t ref, uint32_t mask)
> +					uint32_t ref, uint32_t mask,
> +					uint32_t sem)
>   {
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void 
> amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   	uint32_t seq;
>   
>   	spin_lock_irqsave(&kiq->ring_lock, flags);
> -	amdgpu_ring_alloc(ring, 32);
> +	amdgpu_ring_alloc(ring, 60);
> +
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +
> +	/* a read return value of 1 means semaphore acuqire */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
> +
>   	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>   					    ref, mask);
> +	/*
> +	 * add semaphore release after invalidation,
> +	 * write with 0 means semaphore release
> +	 */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_wreg(ring, sem, 0);
> +
>   	amdgpu_fence_emit_polling(ring, &seq);
>   	amdgpu_ring_commit(ring);
>   	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git 
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index b0b2bdc750df..bda6a2f37dc0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);
>   void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);
>   void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   					uint32_t reg0, uint32_t rreg1,
> -					uint32_t ref, uint32_t mask);
> +					uint32_t ref, uint32_t mask,
> +					uint32_t sem);
>   int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
>   int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);
>   int amdgpu_virt_reset_gpu(struct amdgpu_device *adev); diff --git 
> a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index f25cd97ba5f2..1ae59af7836a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   			!adev->in_gpu_reset) {
>   		uint32_t req = hub->vm_inv_eng0_req + eng;
>   		uint32_t ack = hub->vm_inv_eng0_ack + eng;
> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>   
>   		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
> -				1 << vmid);
> +						   1 << vmid, sem);
>   		return;
>   	}
>   

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 13:24         ` Liu, Monk
  0 siblings, 0 replies; 42+ messages in thread
From: Liu, Monk @ 2019-11-20 13:24 UTC (permalink / raw)
  To: Koenig, Christian, Zhu, Changfeng, Xiao, Jack, Zhou1, Tao, Huang,
	Ray, Huang, Shimmer, amd-gfx, Deng, Emily

>>>are using the KIQ on SRIOV for GPUVM invalidation because we need an atomic read/modify/write cycle since we found that the invalidation engine is resetted with every world switch.
>>> accessing the semaphore registers is not atomic any more and we could have a world switch in between grabbing the semaphore and sending the VM invalidation. That either won't work or could result in a lockup as well.

Yeah, Christian is right on that,
By doing the VM invalidation through the KIQ, the whole procedure is atomic (no world switch can break it). If you split the work into two (or more) KIQ jobs, there is a chance of a world switch between any two of them
(a busy KIQ blocks the world switch, and once the KIQ is idle a world switch can happen immediately).

I'm still trying to understand: what problem did you run into?

-----邮件原件-----
发件人: Christian König <ckoenig.leichtzumerken@gmail.com> 
发送时间: 2019年11月20日 19:24
收件人: Zhu, Changfeng <Changfeng.Zhu@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org; Deng, Emily <Emily.Deng@amd.com>; Liu, Monk <Monk.Liu@amd.com>
主题: Re: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt

Hi Changfeng,

[adding Monk and Emily as well].

I thought more about this and came to the conclusion that this won't work and might result in a lockup as well.

We are using the KIQ on SRIOV for GPUVM invalidation because we need an atomic read/modify/write cycle, since we found that the invalidation engine is reset with every world switch.

Now accessing the semaphore registers is not atomic any more and we could have a world switch in between grabbing the semaphore and sending the VM invalidation. That either won't work or could result in a lockup as well.

Question for Emily and Monk: Do we support power gating of the MMHUB with SRIOV? I don't think so and when that's correct we could just drop this patch.

Regards,
Christian.

Am 20.11.19 um 10:14 schrieb Changfeng.Zhu:
> From: changzhu <Changfeng.Zhu@amd.com>
>
> It may lose gpuvm invalidate acknowldege state across power-gating off 
> cycle. To avoid this issue in virt invalidation, add semaphore acquire 
> before invalidation and semaphore release after invalidation.
>
> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>   3 files changed, 28 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index f04eb1a64271..70ffaf91cd12 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device 
> *adev, uint32_t reg, uint32_t v)
>   
>   void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   					uint32_t reg0, uint32_t reg1,
> -					uint32_t ref, uint32_t mask)
> +					uint32_t ref, uint32_t mask,
> +					uint32_t sem)
>   {
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void 
> amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   	uint32_t seq;
>   
>   	spin_lock_irqsave(&kiq->ring_lock, flags);
> -	amdgpu_ring_alloc(ring, 32);
> +	amdgpu_ring_alloc(ring, 60);
> +
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +
> +	/* a read return value of 1 means semaphore acuqire */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
> +
>   	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>   					    ref, mask);
> +	/*
> +	 * add semaphore release after invalidation,
> +	 * write with 0 means semaphore release
> +	 */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_wreg(ring, sem, 0);
> +
>   	amdgpu_fence_emit_polling(ring, &seq);
>   	amdgpu_ring_commit(ring);
>   	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git 
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index b0b2bdc750df..bda6a2f37dc0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);
>   void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);
>   void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   					uint32_t reg0, uint32_t rreg1,
> -					uint32_t ref, uint32_t mask);
> +					uint32_t ref, uint32_t mask,
> +					uint32_t sem);
>   int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
>   int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);
>   int amdgpu_virt_reset_gpu(struct amdgpu_device *adev); diff --git 
> a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index f25cd97ba5f2..1ae59af7836a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   			!adev->in_gpu_reset) {
>   		uint32_t req = hub->vm_inv_eng0_req + eng;
>   		uint32_t ack = hub->vm_inv_eng0_ack + eng;
> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>   
>   		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
> -				1 << vmid);
> +						   1 << vmid, sem);
>   		return;
>   	}
>   

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 13:30         ` Liu, Monk
  0 siblings, 0 replies; 42+ messages in thread
From: Liu, Monk @ 2019-11-20 13:30 UTC (permalink / raw)
  To: Koenig, Christian, Zhu, Changfeng, Xiao, Jack, Zhou1, Tao, Huang,
	Ray, Huang, Shimmer, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	Deng, Emily

>>Question for Emily and Monk: Do we support power gating of the MMHUB with SRIOV? I don't think so and when that's correct we could just drop this patch.

Power gating is not allowed to be controlled by a VF in a guest VM ....

It is the hypervisor driver's (GIM) responsibility to decide when our hardware can enter a power cycle (e.g. BACO reset), and we have a software mechanism to make sure
a power-gating off cycle only happens when all engines are idle (or one of them has hung).

And even if some engine isn't hung (e.g. the KIQ is still doing things like reading a register or gpu_flush_tlb, etc.), if GIM decides to power off the GPU (BACO reset) then that's okay for the KIQ, since
all engines are re-initialized after BACO anyway.

Thanks 

/Monk

-----邮件原件-----
发件人: Christian König <ckoenig.leichtzumerken@gmail.com> 
发送时间: 2019年11月20日 19:24
收件人: Zhu, Changfeng <Changfeng.Zhu@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org; Deng, Emily <Emily.Deng@amd.com>; Liu, Monk <Monk.Liu@amd.com>
主题: Re: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt

Hi Changfeng,

[adding Monk and Emily as well].

I thought more about this and came to the conclusion that this won't work and might result in a lockup as well.

We are using the KIQ on SRIOV for GPUVM invalidation because we need an atomic read/modify/write cycle, since we found that the invalidation engine is reset with every world switch.

Now accessing the semaphore registers is not atomic any more and we could have a world switch in between grabbing the semaphore and sending the VM invalidation. That either won't work or could result in a lockup as well.

Question for Emily and Monk: Do we support power gating of the MMHUB with SRIOV? I don't think so and when that's correct we could just drop this patch.

Regards,
Christian.

Am 20.11.19 um 10:14 schrieb Changfeng.Zhu:
> From: changzhu <Changfeng.Zhu@amd.com>
>
> It may lose gpuvm invalidate acknowldege state across power-gating off 
> cycle. To avoid this issue in virt invalidation, add semaphore acquire 
> before invalidation and semaphore release after invalidation.
>
> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>   3 files changed, 28 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index f04eb1a64271..70ffaf91cd12 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device 
> *adev, uint32_t reg, uint32_t v)
>   
>   void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   					uint32_t reg0, uint32_t reg1,
> -					uint32_t ref, uint32_t mask)
> +					uint32_t ref, uint32_t mask,
> +					uint32_t sem)
>   {
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void 
> amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   	uint32_t seq;
>   
>   	spin_lock_irqsave(&kiq->ring_lock, flags);
> -	amdgpu_ring_alloc(ring, 32);
> +	amdgpu_ring_alloc(ring, 60);
> +
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +
> +	/* a read return value of 1 means semaphore acuqire */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
> +
>   	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>   					    ref, mask);
> +	/*
> +	 * add semaphore release after invalidation,
> +	 * write with 0 means semaphore release
> +	 */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_wreg(ring, sem, 0);
> +
>   	amdgpu_fence_emit_polling(ring, &seq);
>   	amdgpu_ring_commit(ring);
>   	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git 
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index b0b2bdc750df..bda6a2f37dc0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);
>   void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);
>   void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   					uint32_t reg0, uint32_t rreg1,
> -					uint32_t ref, uint32_t mask);
> +					uint32_t ref, uint32_t mask,
> +					uint32_t sem);
>   int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
>   int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);
>   int amdgpu_virt_reset_gpu(struct amdgpu_device *adev); diff --git 
> a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index f25cd97ba5f2..1ae59af7836a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   			!adev->in_gpu_reset) {
>   		uint32_t req = hub->vm_inv_eng0_req + eng;
>   		uint32_t ack = hub->vm_inv_eng0_ack + eng;
> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>   
>   		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
> -				1 << vmid);
> +						   1 << vmid, sem);
>   		return;
>   	}
>   

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 13:30         ` Liu, Monk
  0 siblings, 0 replies; 42+ messages in thread
From: Liu, Monk @ 2019-11-20 13:30 UTC (permalink / raw)
  To: Koenig, Christian, Zhu, Changfeng, Xiao, Jack, Zhou1, Tao, Huang,
	Ray, Huang, Shimmer, amd-gfx, Deng, Emily

>>Question for Emily and Monk: Do we support power gating of the MMHUB with SRIOV? I don't think so and when that's correct we could just drop this patch.

Power gating is not allowed to be controlled by a VF in a guest VM.

It is the hypervisor driver's (GIM) responsibility to decide when our hardware can enter a power cycle (e.g. BACO reset), and we have a software mechanism to make sure
a power-gating off cycle only happens when all engines are idle (or one of them has hung).

And even if some engine isn't hung (e.g. KIQ is still doing things like reading a register or gpu_flush_tlb, etc.), if GIM decides to power off the GPU (BACO reset) then that's okay for KIQ, since
after BACO all engines will be re-initialized anyway.

Thanks 

/Monk

-----邮件原件-----
发件人: Christian König <ckoenig.leichtzumerken@gmail.com> 
发送时间: 2019年11月20日 19:24
收件人: Zhu, Changfeng <Changfeng.Zhu@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org; Deng, Emily <Emily.Deng@amd.com>; Liu, Monk <Monk.Liu@amd.com>
主题: Re: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt

Hi Changfeng,

[adding Monk and Emily as well].

I thought more about this and came to the conclusion that this won't work and might result in a lockup as well.

We are using the KIQ on SRIOV for GPUVM invalidation because we need an atomic read/modify/write cycle, since we found that the invalidation engine is reset with every world switch.

Now accessing the semaphore registers is not atomic any more and we could have a world switch in between grabbing the semaphore and sending the VM invalidation. That either won't work or could result in a lockup as well.

Question for Emily and Monk: Do we support power gating of the MMHUB with SRIOV? I don't think so and when that's correct we could just drop this patch.

Regards,
Christian.

Am 20.11.19 um 10:14 schrieb Changfeng.Zhu:
> From: changzhu <Changfeng.Zhu@amd.com>
>
> It may lose gpuvm invalidate acknowledge state across power-gating off 
> cycle. To avoid this issue in virt invalidation, add semaphore acquire 
> before invalidation and semaphore release after invalidation.
>
> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>   3 files changed, 28 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index f04eb1a64271..70ffaf91cd12 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device 
> *adev, uint32_t reg, uint32_t v)
>   
>   void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   					uint32_t reg0, uint32_t reg1,
> -					uint32_t ref, uint32_t mask)
> +					uint32_t ref, uint32_t mask,
> +					uint32_t sem)
>   {
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void 
> amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   	uint32_t seq;
>   
>   	spin_lock_irqsave(&kiq->ring_lock, flags);
> -	amdgpu_ring_alloc(ring, 32);
> +	amdgpu_ring_alloc(ring, 60);
> +
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +
> +	/* a read return value of 1 means semaphore acuqire */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
> +
>   	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>   					    ref, mask);
> +	/*
> +	 * add semaphore release after invalidation,
> +	 * write with 0 means semaphore release
> +	 */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_wreg(ring, sem, 0);
> +
>   	amdgpu_fence_emit_polling(ring, &seq);
>   	amdgpu_ring_commit(ring);
>   	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git 
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index b0b2bdc750df..bda6a2f37dc0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);
>   void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);
>   void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   					uint32_t reg0, uint32_t rreg1,
> -					uint32_t ref, uint32_t mask);
> +					uint32_t ref, uint32_t mask,
> +					uint32_t sem);
>   int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
>   int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);
>   int amdgpu_virt_reset_gpu(struct amdgpu_device *adev); diff --git 
> a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index f25cd97ba5f2..1ae59af7836a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   			!adev->in_gpu_reset) {
>   		uint32_t req = hub->vm_inv_eng0_req + eng;
>   		uint32_t ack = hub->vm_inv_eng0_ack + eng;
> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>   
>   		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
> -				1 << vmid);
> +						   1 << vmid, sem);
>   		return;
>   	}
>   

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 13:36             ` Christian König
  0 siblings, 0 replies; 42+ messages in thread
From: Christian König @ 2019-11-20 13:36 UTC (permalink / raw)
  To: Liu, Monk, Koenig, Christian, Zhu, Changfeng, Xiao, Jack, Zhou1,
	Tao, Huang, Ray, Huang, Shimmer,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Deng, Emily

Ok in this case we should just drop this patch.

Any objections on the other semaphore patch? IIRC we added the 
amdgpu_ring_emit_reg_write_reg_wait() especially to make sure that an 
invalidation can't be interrupted by a world switch.

When we add manual semaphore acquire/release before and after the 
invalidation that could break this quite badly.

Regards,
Christian.

Am 20.11.19 um 14:30 schrieb Liu, Monk:
>>> Question for Emily and Monk: Do we support power gating of the MMHUB with SRIOV? I don't think so and when that's correct we could just drop this patch.
> Any power gating if now allowed to be controlled by a VF in a guest VM ....
>
> It is hypervisor driver's (gim) responsibility to conduct when can our hardware entering a power circle (e.g. BACO reset), and we have software mechanism to make sure
> Power gating off circle shall only happen when all engine is idle (or any of them was hang) state.
>
> And even some engine isn't hang (e.g. KIQ is still doing things like read register or gpu_flush_tlb , etc...) if GIM decide to power off GPU (BACO reset) then that's okay for KIQ, since
> After BACO all engines would be re-init anyway
>
> Thanks
>
> /Monk
>
> -----邮件原件-----
> 发件人: Christian König <ckoenig.leichtzumerken@gmail.com>
> 发送时间: 2019年11月20日 19:24
> 收件人: Zhu, Changfeng <Changfeng.Zhu@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org; Deng, Emily <Emily.Deng@amd.com>; Liu, Monk <Monk.Liu@amd.com>
> 主题: Re: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
>
> Hi Changfeng,
>
> [adding Monk and Emily as well].
>
> I thought more about this and came to the conclusion that this won't work and might result in a lockup as well.
>
> We are using the KIQ on SRIOV for GPUVM invalidation because we need an atomic read/modify/write cycle since we found that the invalidation engine is resetted with every world switch.
>
> Now accessing the semaphore registers is not atomic any more and we could have a world switch in between grabbing the semaphore and sending the VM invalidation. That either won't work or could result in a lockup as well.
>
> Question for Emily and Monk: Do we support power gating of the MMHUB with SRIOV? I don't think so and when that's correct we could just drop this patch.
>
> Regards,
> Christian.
>
> Am 20.11.19 um 10:14 schrieb Changfeng.Zhu:
>> From: changzhu <Changfeng.Zhu@amd.com>
>>
>> It may lose gpuvm invalidate acknowldege state across power-gating off
>> cycle. To avoid this issue in virt invalidation, add semaphore acquire
>> before invalidation and semaphore release after invalidation.
>>
>> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
>> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>>    3 files changed, 28 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> index f04eb1a64271..70ffaf91cd12 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device
>> *adev, uint32_t reg, uint32_t v)
>>    
>>    void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    					uint32_t reg0, uint32_t reg1,
>> -					uint32_t ref, uint32_t mask)
>> +					uint32_t ref, uint32_t mask,
>> +					uint32_t sem)
>>    {
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void
>> amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    	uint32_t seq;
>>    
>>    	spin_lock_irqsave(&kiq->ring_lock, flags);
>> -	amdgpu_ring_alloc(ring, 32);
>> +	amdgpu_ring_alloc(ring, 60);
>> +
>> +	/*
>> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
>> +	 * off cycle, add semaphore acquire before invalidation and semaphore
>> +	 * release after invalidation to avoid entering power gated state
>> +	 * to WA the Issue
>> +	 */
>> +
>> +	/* a read return value of 1 means semaphore acuqire */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>> +
>>    	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>>    					    ref, mask);
>> +	/*
>> +	 * add semaphore release after invalidation,
>> +	 * write with 0 means semaphore release
>> +	 */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_wreg(ring, sem, 0);
>> +
>>    	amdgpu_fence_emit_polling(ring, &seq);
>>    	amdgpu_ring_commit(ring);
>>    	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> index b0b2bdc750df..bda6a2f37dc0 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);
>>    void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);
>>    void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    					uint32_t reg0, uint32_t rreg1,
>> -					uint32_t ref, uint32_t mask);
>> +					uint32_t ref, uint32_t mask,
>> +					uint32_t sem);
>>    int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
>>    int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);
>>    int amdgpu_virt_reset_gpu(struct amdgpu_device *adev); diff --git
>> a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index f25cd97ba5f2..1ae59af7836a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>>    			!adev->in_gpu_reset) {
>>    		uint32_t req = hub->vm_inv_eng0_req + eng;
>>    		uint32_t ack = hub->vm_inv_eng0_ack + eng;
>> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>>    
>>    		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
>> -				1 << vmid);
>> +						   1 << vmid, sem);
>>    		return;
>>    	}
>>    
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 13:36             ` Christian König
  0 siblings, 0 replies; 42+ messages in thread
From: Christian König @ 2019-11-20 13:36 UTC (permalink / raw)
  To: Liu, Monk, Koenig, Christian, Zhu, Changfeng, Xiao, Jack, Zhou1,
	Tao, Huang, Ray, Huang, Shimmer, amd-gfx, Deng, Emily

Ok in this case we should just drop this patch.

Any objections on the other semaphore patch? IIRC we added the 
amdgpu_ring_emit_reg_write_reg_wait() especially to make sure that an 
invalidation can't be interrupted by a world switch.

When we add manual semaphore acquire/release before and after the 
invalidation that could break this quite badly.

Regards,
Christian.

Am 20.11.19 um 14:30 schrieb Liu, Monk:
>>> Question for Emily and Monk: Do we support power gating of the MMHUB with SRIOV? I don't think so and when that's correct we could just drop this patch.
> Any power gating if now allowed to be controlled by a VF in a guest VM ....
>
> It is hypervisor driver's (gim) responsibility to conduct when can our hardware entering a power circle (e.g. BACO reset), and we have software mechanism to make sure
> Power gating off circle shall only happen when all engine is idle (or any of them was hang) state.
>
> And even some engine isn't hang (e.g. KIQ is still doing things like read register or gpu_flush_tlb , etc...) if GIM decide to power off GPU (BACO reset) then that's okay for KIQ, since
> After BACO all engines would be re-init anyway
>
> Thanks
>
> /Monk
>
> -----邮件原件-----
> 发件人: Christian König <ckoenig.leichtzumerken@gmail.com>
> 发送时间: 2019年11月20日 19:24
> 收件人: Zhu, Changfeng <Changfeng.Zhu@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org; Deng, Emily <Emily.Deng@amd.com>; Liu, Monk <Monk.Liu@amd.com>
> 主题: Re: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
>
> Hi Changfeng,
>
> [adding Monk and Emily as well].
>
> I thought more about this and came to the conclusion that this won't work and might result in a lockup as well.
>
> We are using the KIQ on SRIOV for GPUVM invalidation because we need an atomic read/modify/write cycle since we found that the invalidation engine is resetted with every world switch.
>
> Now accessing the semaphore registers is not atomic any more and we could have a world switch in between grabbing the semaphore and sending the VM invalidation. That either won't work or could result in a lockup as well.
>
> Question for Emily and Monk: Do we support power gating of the MMHUB with SRIOV? I don't think so and when that's correct we could just drop this patch.
>
> Regards,
> Christian.
>
> Am 20.11.19 um 10:14 schrieb Changfeng.Zhu:
>> From: changzhu <Changfeng.Zhu@amd.com>
>>
>> It may lose gpuvm invalidate acknowldege state across power-gating off
>> cycle. To avoid this issue in virt invalidation, add semaphore acquire
>> before invalidation and semaphore release after invalidation.
>>
>> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
>> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>>    3 files changed, 28 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> index f04eb1a64271..70ffaf91cd12 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device
>> *adev, uint32_t reg, uint32_t v)
>>    
>>    void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    					uint32_t reg0, uint32_t reg1,
>> -					uint32_t ref, uint32_t mask)
>> +					uint32_t ref, uint32_t mask,
>> +					uint32_t sem)
>>    {
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void
>> amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    	uint32_t seq;
>>    
>>    	spin_lock_irqsave(&kiq->ring_lock, flags);
>> -	amdgpu_ring_alloc(ring, 32);
>> +	amdgpu_ring_alloc(ring, 60);
>> +
>> +	/*
>> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
>> +	 * off cycle, add semaphore acquire before invalidation and semaphore
>> +	 * release after invalidation to avoid entering power gated state
>> +	 * to WA the Issue
>> +	 */
>> +
>> +	/* a read return value of 1 means semaphore acuqire */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>> +
>>    	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>>    					    ref, mask);
>> +	/*
>> +	 * add semaphore release after invalidation,
>> +	 * write with 0 means semaphore release
>> +	 */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_wreg(ring, sem, 0);
>> +
>>    	amdgpu_fence_emit_polling(ring, &seq);
>>    	amdgpu_ring_commit(ring);
>>    	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> index b0b2bdc750df..bda6a2f37dc0 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);
>>    void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);
>>    void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    					uint32_t reg0, uint32_t rreg1,
>> -					uint32_t ref, uint32_t mask);
>> +					uint32_t ref, uint32_t mask,
>> +					uint32_t sem);
>>    int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
>>    int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);
>>    int amdgpu_virt_reset_gpu(struct amdgpu_device *adev); diff --git
>> a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index f25cd97ba5f2..1ae59af7836a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>>    			!adev->in_gpu_reset) {
>>    		uint32_t req = hub->vm_inv_eng0_req + eng;
>>    		uint32_t ack = hub->vm_inv_eng0_ack + eng;
>> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>>    
>>    		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
>> -				1 << vmid);
>> +						   1 << vmid, sem);
>>    		return;
>>    	}
>>    
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 13:54             ` Liu, Monk
  0 siblings, 0 replies; 42+ messages in thread
From: Liu, Monk @ 2019-11-20 13:54 UTC (permalink / raw)
  To: Koenig, Christian, Zhu, Changfeng, Xiao, Jack, Zhou1, Tao, Huang,
	Ray, Huang, Shimmer, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Hah, but in the SRIOV case, our guest KMD driver is not allowed to do such things .... (and even if there is a bug where the KMD tries to power gate, the SMU firmware would not really do the job, since
we have a PSP L1 policy to prevent those dangerous operations)

Did Changfeng already hit this issue under SRIOV ???

-----邮件原件-----
发件人: Koenig, Christian <Christian.Koenig@amd.com> 
发送时间: 2019年11月20日 21:21
收件人: Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
主题: Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt

Hi Monk,

this is a fix for power gating the MMHUB.

The basic problem is that the MMHUB can power gate while an invalidation is in progress, which loses all bits in the ACK register and so deadlocks the engine waiting for the invalidation to finish.

This bug is hit immediately when we enable power gating of the MMHUB.

Regards,
Christian.

Am 20.11.19 um 14:18 schrieb Liu, Monk:
> Hi Changfeng
>
> First of all, there is no power-gating off cycle involved in AMDGPU
> SRIOV, since we don't allow a VF/VM to do such things, so I do find it strange
> that you posted something like this, especially for the VEGA10 series, which
> doesn't appear to have any issue in the gpu_flush part.
>
> Here is my questions for you:
> 1) Can you tell me what issue you experienced, and how to
> repro the bug?
> 2) If you did hit an issue, did you verify that your patch fixes it?
>
> besides
>
> /Monk
>
> -----邮件原件-----
> 发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> 代表 Changfeng.Zhu
> 发送时间: 2019年11月20日 17:14
> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack 
> <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray 
> <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; 
> amd-gfx@lists.freedesktop.org
> 抄送: Zhu, Changfeng <Changfeng.Zhu@amd.com>
> 主题: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in 
> amdgpu_virt
>
> From: changzhu <Changfeng.Zhu@amd.com>
>
> It may lose gpuvm invalidate acknowledge state across power-gating off cycle. To avoid this issue in virt invalidation, add semaphore acquire before invalidation and semaphore release after invalidation.
>
> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>   3 files changed, 28 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index f04eb1a64271..70ffaf91cd12 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device 
> *adev, uint32_t reg, uint32_t v)
>   
>   void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   					uint32_t reg0, uint32_t reg1,
> -					uint32_t ref, uint32_t mask)
> +					uint32_t ref, uint32_t mask,
> +					uint32_t sem)
>   {
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   	uint32_t seq;
>   
>   	spin_lock_irqsave(&kiq->ring_lock, flags);
> -	amdgpu_ring_alloc(ring, 32);
> +	amdgpu_ring_alloc(ring, 60);
> +
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +
> +	/* a read return value of 1 means semaphore acuqire */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
> +
>   	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>   					    ref, mask);
> +	/*
> +	 * add semaphore release after invalidation,
> +	 * write with 0 means semaphore release
> +	 */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_wreg(ring, sem, 0);
> +
>   	amdgpu_fence_emit_polling(ring, &seq);
>   	amdgpu_ring_commit(ring);
>   	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git 
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index b0b2bdc750df..bda6a2f37dc0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);  void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);  void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   					uint32_t reg0, uint32_t rreg1,
> -					uint32_t ref, uint32_t mask);
> +					uint32_t ref, uint32_t mask,
> +					uint32_t sem);
>   int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool 
> init);  int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, 
> bool init);  int amdgpu_virt_reset_gpu(struct amdgpu_device *adev); 
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index f25cd97ba5f2..1ae59af7836a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   			!adev->in_gpu_reset) {
>   		uint32_t req = hub->vm_inv_eng0_req + eng;
>   		uint32_t ack = hub->vm_inv_eng0_ack + eng;
> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>   
>   		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
> -				1 << vmid);
> +						   1 << vmid, sem);
>   		return;
>   	}
>   
> --
> 2.17.1
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 13:54             ` Liu, Monk
  0 siblings, 0 replies; 42+ messages in thread
From: Liu, Monk @ 2019-11-20 13:54 UTC (permalink / raw)
  To: Koenig, Christian, Zhu, Changfeng, Xiao, Jack, Zhou1, Tao, Huang,
	Ray, Huang, Shimmer, amd-gfx

Hah, but in the SRIOV case, our guest KMD driver is not allowed to do such things .... (and even if there is a bug where the KMD tries to power gate, the SMU firmware would not really do the job, since
we have a PSP L1 policy to prevent those dangerous operations)

Did Changfeng already hit this issue under SRIOV ???

-----邮件原件-----
发件人: Koenig, Christian <Christian.Koenig@amd.com> 
发送时间: 2019年11月20日 21:21
收件人: Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
主题: Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt

Hi Monk,

this is a fix for power gating the MMHUB.

The basic problem is that the MMHUB can power gate while an invalidation is in progress, which loses all bits in the ACK register and so deadlocks the engine waiting for the invalidation to finish.

This bug is hit immediately when we enable power gating of the MMHUB.

Regards,
Christian.

Am 20.11.19 um 14:18 schrieb Liu, Monk:
> Hi Changfeng
>
> First of all, there is no power-gating off cycle involved in AMDGPU
> SRIOV, since we don't allow a VF/VM to do such things, so I do find it strange
> that you posted something like this, especially for the VEGA10 series, which
> doesn't appear to have any issue in the gpu_flush part.
>
> Here is my questions for you:
> 1) Can you tell me what issue you experienced, and how to
> repro the bug?
> 2) If you did hit an issue, did you verify that your patch fixes it?
>
> besides
>
> /Monk
>
> -----邮件原件-----
> 发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> 代表 Changfeng.Zhu
> 发送时间: 2019年11月20日 17:14
> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack 
> <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray 
> <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; 
> amd-gfx@lists.freedesktop.org
> 抄送: Zhu, Changfeng <Changfeng.Zhu@amd.com>
> 主题: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in 
> amdgpu_virt
>
> From: changzhu <Changfeng.Zhu@amd.com>
>
> It may lose gpuvm invalidate acknowledge state across power-gating off cycle. To avoid this issue in virt invalidation, add semaphore acquire before invalidation and semaphore release after invalidation.
>
> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>   3 files changed, 28 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index f04eb1a64271..70ffaf91cd12 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device 
> *adev, uint32_t reg, uint32_t v)
>   
>   void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   					uint32_t reg0, uint32_t reg1,
> -					uint32_t ref, uint32_t mask)
> +					uint32_t ref, uint32_t mask,
> +					uint32_t sem)
>   {
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   	uint32_t seq;
>   
>   	spin_lock_irqsave(&kiq->ring_lock, flags);
> -	amdgpu_ring_alloc(ring, 32);
> +	amdgpu_ring_alloc(ring, 60);
> +
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +
> +	/* a read return value of 1 means semaphore acuqire */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
> +
>   	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>   					    ref, mask);
> +	/*
> +	 * add semaphore release after invalidation,
> +	 * write with 0 means semaphore release
> +	 */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_wreg(ring, sem, 0);
> +
>   	amdgpu_fence_emit_polling(ring, &seq);
>   	amdgpu_ring_commit(ring);
>   	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git 
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index b0b2bdc750df..bda6a2f37dc0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);  void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);  void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   					uint32_t reg0, uint32_t rreg1,
> -					uint32_t ref, uint32_t mask);
> +					uint32_t ref, uint32_t mask,
> +					uint32_t sem);
>   int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool 
> init);  int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, 
> bool init);  int amdgpu_virt_reset_gpu(struct amdgpu_device *adev); 
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index f25cd97ba5f2..1ae59af7836a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   			!adev->in_gpu_reset) {
>   		uint32_t req = hub->vm_inv_eng0_req + eng;
>   		uint32_t ack = hub->vm_inv_eng0_ack + eng;
> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>   
>   		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
> -				1 << vmid);
> +						   1 << vmid, sem);
>   		return;
>   	}
>   
> --
> 2.17.1
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 14:00                 ` Christian König
  0 siblings, 0 replies; 42+ messages in thread
From: Christian König @ 2019-11-20 14:00 UTC (permalink / raw)
  To: Liu, Monk, Koenig, Christian, Zhu, Changfeng, Xiao, Jack, Zhou1,
	Tao, Huang, Ray, Huang, Shimmer,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

> Did Changfeng already hit this issue under SRIOV ?
I don't think so, but Changfeng needs to answer this.

The question is: does the extra semaphore acquire have some negative effect on 
SRIOV?

I would like to avoid having even more SRIOV specific handling in here 
which we can't really test on bare metal.

Christian.

Am 20.11.19 um 14:54 schrieb Liu, Monk:
> Hah, but in SRIOV case, our guest KMD driver is not allowed to do such things .... (and even there is a bug that KMD try to power gate, the SMU firmware would not really do the jobs since
> We have PSP L1 policy to prevent those danger operations )
>
> Did Changfeng already hit this issue under SRIOV ???
>
> -----邮件原件-----
> 发件人: Koenig, Christian <Christian.Koenig@amd.com>
> 发送时间: 2019年11月20日 21:21
> 收件人: Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
> 主题: Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
>
> Hi Monk,
>
> this is a fix for power gating the MMHUB.
>
> Basic problem is that the MMHUB can power gate while an invalidation is in progress which looses all bits in the ACK register and so deadlocks the engine waiting for the invalidation to finish.
>
> This bug is hit immediately when we enable power gating of the MMHUB.
>
> Regards,
> Christian.
>
> Am 20.11.19 um 14:18 schrieb Liu, Monk:
>> Hi Changfeng
>>
>> Firs of all, there is no power-gating off circle involved in AMDGPU
>> SRIOV, since we don't allow VF/VM do such things so I do feel strange
>> why you post something like this Especially on VEGA10 serials which
>> looks doesn't have any issue on those gpu_flush part
>>
>> Here is my questions for you:
>> 1) Can you point me what issue had you been experienced ? and how to
>> repro the bug
>> 2) if you do hit some issues, did you verified that your patch can fix it ?
>>
>> besides
>>
>> /Monk
>>
>> -----邮件原件-----
>> 发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> 代表 Changfeng.Zhu
>> 发送时间: 2019年11月20日 17:14
>> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack
>> <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray
>> <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>;
>> amd-gfx@lists.freedesktop.org
>> 抄送: Zhu, Changfeng <Changfeng.Zhu@amd.com>
>> 主题: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in
>> amdgpu_virt
>>
>> From: changzhu <Changfeng.Zhu@amd.com>
>>
>> It may lose gpuvm invalidate acknowldege state across power-gating off cycle. To avoid this issue in virt invalidation, add semaphore acquire before invalidation and semaphore release after invalidation.
>>
>> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
>> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>>    3 files changed, 28 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> index f04eb1a64271..70ffaf91cd12 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device
>> *adev, uint32_t reg, uint32_t v)
>>    
>>    void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    					uint32_t reg0, uint32_t reg1,
>> -					uint32_t ref, uint32_t mask)
>> +					uint32_t ref, uint32_t mask,
>> +					uint32_t sem)
>>    {
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    	uint32_t seq;
>>    
>>    	spin_lock_irqsave(&kiq->ring_lock, flags);
>> -	amdgpu_ring_alloc(ring, 32);
>> +	amdgpu_ring_alloc(ring, 60);
>> +
>> +	/*
>> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
>> +	 * off cycle, add semaphore acquire before invalidation and semaphore
>> +	 * release after invalidation to avoid entering power gated state
>> +	 * to WA the Issue
>> +	 */
>> +
>> +	/* a read return value of 1 means semaphore acuqire */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>> +
>>    	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>>    					    ref, mask);
>> +	/*
>> +	 * add semaphore release after invalidation,
>> +	 * write with 0 means semaphore release
>> +	 */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_wreg(ring, sem, 0);
>> +
>>    	amdgpu_fence_emit_polling(ring, &seq);
>>    	amdgpu_ring_commit(ring);
>>    	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> index b0b2bdc750df..bda6a2f37dc0 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);  void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);  void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    					uint32_t reg0, uint32_t rreg1,
>> -					uint32_t ref, uint32_t mask);
>> +					uint32_t ref, uint32_t mask,
>> +					uint32_t sem);
>>    int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool
>> init);  int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev,
>> bool init);  int amdgpu_virt_reset_gpu(struct amdgpu_device *adev);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index f25cd97ba5f2..1ae59af7836a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>>    			!adev->in_gpu_reset) {
>>    		uint32_t req = hub->vm_inv_eng0_req + eng;
>>    		uint32_t ack = hub->vm_inv_eng0_ack + eng;
>> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>>    
>>    		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
>> -				1 << vmid);
>> +						   1 << vmid, sem);
>>    		return;
>>    	}
>>    
>> --
>> 2.17.1
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 14:00                 ` Christian König
  0 siblings, 0 replies; 42+ messages in thread
From: Christian König @ 2019-11-20 14:00 UTC (permalink / raw)
  To: Liu, Monk, Koenig, Christian, Zhu, Changfeng, Xiao, Jack, Zhou1,
	Tao, Huang, Ray, Huang, Shimmer, amd-gfx

> Did Changfeng already hit this issue under SRIOV ?
I don't think so, but Changfeng needs to answer this.

The question is: does the extra semaphore acquire have some negative effect on 
SRIOV?

I would like to avoid having even more SRIOV specific handling in here 
which we can't really test on bare metal.

Christian.

Am 20.11.19 um 14:54 schrieb Liu, Monk:
> Hah, but in SRIOV case, our guest KMD driver is not allowed to do such things .... (and even there is a bug that KMD try to power gate, the SMU firmware would not really do the jobs since
> We have PSP L1 policy to prevent those danger operations )
>
> Did Changfeng already hit this issue under SRIOV ???
>
> -----邮件原件-----
> 发件人: Koenig, Christian <Christian.Koenig@amd.com>
> 发送时间: 2019年11月20日 21:21
> 收件人: Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
> 主题: Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
>
> Hi Monk,
>
> this is a fix for power gating the MMHUB.
>
> Basic problem is that the MMHUB can power gate while an invalidation is in progress which looses all bits in the ACK register and so deadlocks the engine waiting for the invalidation to finish.
>
> This bug is hit immediately when we enable power gating of the MMHUB.
>
> Regards,
> Christian.
>
> Am 20.11.19 um 14:18 schrieb Liu, Monk:
>> Hi Changfeng
>>
>> Firs of all, there is no power-gating off circle involved in AMDGPU
>> SRIOV, since we don't allow VF/VM do such things so I do feel strange
>> why you post something like this Especially on VEGA10 serials which
>> looks doesn't have any issue on those gpu_flush part
>>
>> Here is my questions for you:
>> 1) Can you point me what issue had you been experienced ? and how to
>> repro the bug
>> 2) if you do hit some issues, did you verified that your patch can fix it ?
>>
>> besides
>>
>> /Monk
>>
>> -----邮件原件-----
>> 发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> 代表 Changfeng.Zhu
>> 发送时间: 2019年11月20日 17:14
>> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack
>> <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray
>> <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>;
>> amd-gfx@lists.freedesktop.org
>> 抄送: Zhu, Changfeng <Changfeng.Zhu@amd.com>
>> 主题: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in
>> amdgpu_virt
>>
>> From: changzhu <Changfeng.Zhu@amd.com>
>>
>> It may lose gpuvm invalidate acknowldege state across power-gating off cycle. To avoid this issue in virt invalidation, add semaphore acquire before invalidation and semaphore release after invalidation.
>>
>> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
>> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>>    3 files changed, 28 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> index f04eb1a64271..70ffaf91cd12 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device
>> *adev, uint32_t reg, uint32_t v)
>>    
>>    void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    					uint32_t reg0, uint32_t reg1,
>> -					uint32_t ref, uint32_t mask)
>> +					uint32_t ref, uint32_t mask,
>> +					uint32_t sem)
>>    {
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    	uint32_t seq;
>>    
>>    	spin_lock_irqsave(&kiq->ring_lock, flags);
>> -	amdgpu_ring_alloc(ring, 32);
>> +	amdgpu_ring_alloc(ring, 60);
>> +
>> +	/*
>> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
>> +	 * off cycle, add semaphore acquire before invalidation and semaphore
>> +	 * release after invalidation to avoid entering power gated state
>> +	 * to WA the Issue
>> +	 */
>> +
>> +	/* a read return value of 1 means semaphore acuqire */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>> +
>>    	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>>    					    ref, mask);
>> +	/*
>> +	 * add semaphore release after invalidation,
>> +	 * write with 0 means semaphore release
>> +	 */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_wreg(ring, sem, 0);
>> +
>>    	amdgpu_fence_emit_polling(ring, &seq);
>>    	amdgpu_ring_commit(ring);
>>    	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> index b0b2bdc750df..bda6a2f37dc0 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);  void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);  void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    					uint32_t reg0, uint32_t rreg1,
>> -					uint32_t ref, uint32_t mask);
>> +					uint32_t ref, uint32_t mask,
>> +					uint32_t sem);
>>    int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool
>> init);  int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev,
>> bool init);  int amdgpu_virt_reset_gpu(struct amdgpu_device *adev);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index f25cd97ba5f2..1ae59af7836a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>>    			!adev->in_gpu_reset) {
>>    		uint32_t req = hub->vm_inv_eng0_req + eng;
>>    		uint32_t ack = hub->vm_inv_eng0_ack + eng;
>> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>>    
>>    		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
>> -				1 << vmid);
>> +						   1 << vmid, sem);
>>    		return;
>>    	}
>>    
>> --
>> 2.17.1
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* RE: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 14:16                     ` Zhu, Changfeng
  0 siblings, 0 replies; 42+ messages in thread
From: Zhu, Changfeng @ 2019-11-20 14:16 UTC (permalink / raw)
  To: Koenig, Christian, Liu, Monk, Xiao, Jack, Zhou1, Tao, Huang, Ray,
	Huang, Shimmer, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

>>> Did Changfeng already hit this issue under SRIOV ???

I hit this problem on navi14 in gmc_v10_0_emit_flush_gpu_tlb.
The problem was also seen by Zhou, Tao.

And this is ticket:
http://ontrack-internal.amd.com/browse/SWDEV-201459

After the semaphore patch, the problem can be fixed.

If SRIOV has concerns about this problem, the semaphore should not be added for SRIOV.

However, we should apply the semaphore for gmc_v9_0_flush_gpu_tlb, gmc_v9_0_emit_flush_gpu_tlb, gmc_v10_0_flush_gpu_tlb and gmc_v10_0_emit_flush_gpu_tlb.

Or how can we handle the ticket above?

BR,
Changfeng.

-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@gmail.com> 
Sent: Wednesday, November 20, 2019 10:00 PM
To: Liu, Monk <Monk.Liu@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Zhu, Changfeng <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt

> Did Changfeng already hit this issue under SRIOV ?
I don't think so, but Changfeng needs to answer this.

Question is does the extra semaphore acquire has some negative effect on SRIOV?

I would like to avoid having even more SRIOV specific handling in here which we can't really test on bare metal.

Christian.

Am 20.11.19 um 14:54 schrieb Liu, Monk:
> Hah, but in SRIOV case, our guest KMD driver is not allowed to do such 
> things .... (and even there is a bug that KMD try to power gate, the 
> SMU firmware would not really do the jobs since We have PSP L1 policy 
> to prevent those danger operations )
>
> Did Changfeng already hit this issue under SRIOV ???
>
> -----邮件原件-----
> 发件人: Koenig, Christian <Christian.Koenig@amd.com>
> 发送时间: 2019年11月20日 21:21
> 收件人: Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng 
> <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao 
> <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer 
> <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
> 主题: Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore 
> workaround in amdgpu_virt
>
> Hi Monk,
>
> this is a fix for power gating the MMHUB.
>
> Basic problem is that the MMHUB can power gate while an invalidation is in progress which looses all bits in the ACK register and so deadlocks the engine waiting for the invalidation to finish.
>
> This bug is hit immediately when we enable power gating of the MMHUB.
>
> Regards,
> Christian.
>
> Am 20.11.19 um 14:18 schrieb Liu, Monk:
>> Hi Changfeng
>>
>> Firs of all, there is no power-gating off circle involved in AMDGPU 
>> SRIOV, since we don't allow VF/VM do such things so I do feel strange 
>> why you post something like this Especially on VEGA10 serials which 
>> looks doesn't have any issue on those gpu_flush part
>>
>> Here is my questions for you:
>> 1) Can you point me what issue had you been experienced ? and how to 
>> repro the bug
>> 2) if you do hit some issues, did you verified that your patch can fix it ?
>>
>> besides
>>
>> /Monk
>>
>> -----邮件原件-----
>> 发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> 代表 Changfeng.Zhu
>> 发送时间: 2019年11月20日 17:14
>> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack 
>> <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray 
>> <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; 
>> amd-gfx@lists.freedesktop.org
>> 抄送: Zhu, Changfeng <Changfeng.Zhu@amd.com>
>> 主题: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in 
>> amdgpu_virt
>>
>> From: changzhu <Changfeng.Zhu@amd.com>
>>
>> It may lose gpuvm invalidate acknowldege state across power-gating off cycle. To avoid this issue in virt invalidation, add semaphore acquire before invalidation and semaphore release after invalidation.
>>
>> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
>> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>>    3 files changed, 28 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> index f04eb1a64271..70ffaf91cd12 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device 
>> *adev, uint32_t reg, uint32_t v)
>>    
>>    void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    					uint32_t reg0, uint32_t reg1,
>> -					uint32_t ref, uint32_t mask)
>> +					uint32_t ref, uint32_t mask,
>> +					uint32_t sem)
>>    {
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    	uint32_t seq;
>>    
>>    	spin_lock_irqsave(&kiq->ring_lock, flags);
>> -	amdgpu_ring_alloc(ring, 32);
>> +	amdgpu_ring_alloc(ring, 60);
>> +
>> +	/*
>> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
>> +	 * off cycle, add semaphore acquire before invalidation and semaphore
>> +	 * release after invalidation to avoid entering power gated state
>> +	 * to WA the Issue
>> +	 */
>> +
>> +	/* a read return value of 1 means semaphore acuqire */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>> +
>>    	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>>    					    ref, mask);
>> +	/*
>> +	 * add semaphore release after invalidation,
>> +	 * write with 0 means semaphore release
>> +	 */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_wreg(ring, sem, 0);
>> +
>>    	amdgpu_fence_emit_polling(ring, &seq);
>>    	amdgpu_ring_commit(ring);
>>    	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git 
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> index b0b2bdc750df..bda6a2f37dc0 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);  void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);  void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    					uint32_t reg0, uint32_t rreg1,
>> -					uint32_t ref, uint32_t mask);
>> +					uint32_t ref, uint32_t mask,
>> +					uint32_t sem);
>>    int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool 
>> init);  int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, 
>> bool init);  int amdgpu_virt_reset_gpu(struct amdgpu_device *adev); 
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index f25cd97ba5f2..1ae59af7836a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>>    			!adev->in_gpu_reset) {
>>    		uint32_t req = hub->vm_inv_eng0_req + eng;
>>    		uint32_t ack = hub->vm_inv_eng0_ack + eng;
>> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>>    
>>    		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
>> -				1 << vmid);
>> +						   1 << vmid, sem);
>>    		return;
>>    	}
>>    
>> --
>> 2.17.1
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* RE: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 14:16                     ` Zhu, Changfeng
  0 siblings, 0 replies; 42+ messages in thread
From: Zhu, Changfeng @ 2019-11-20 14:16 UTC (permalink / raw)
  To: Koenig, Christian, Liu, Monk, Xiao, Jack, Zhou1, Tao, Huang, Ray,
	Huang, Shimmer, amd-gfx

>>> Did Changfeng already hit this issue under SRIOV ???

I hit this problem on navi14 in gmc_v10_0_emit_flush_gpu_tlb.
The problem was also seen by Zhou, Tao.

And this is ticket:
http://ontrack-internal.amd.com/browse/SWDEV-201459

After the semaphore patch, the problem can be fixed.

If SRIOV has concerns about this problem, the semaphore should not be added for SRIOV.

However, we should apply the semaphore for gmc_v9_0_flush_gpu_tlb, gmc_v9_0_emit_flush_gpu_tlb, gmc_v10_0_flush_gpu_tlb and gmc_v10_0_emit_flush_gpu_tlb.

Or how can we handle the ticket above?

BR,
Changfeng.

-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@gmail.com> 
Sent: Wednesday, November 20, 2019 10:00 PM
To: Liu, Monk <Monk.Liu@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Zhu, Changfeng <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt

> Did Changfeng already hit this issue under SRIOV ?
I don't think so, but Changfeng needs to answer this.

Question is does the extra semaphore acquire has some negative effect on SRIOV?

I would like to avoid having even more SRIOV specific handling in here which we can't really test on bare metal.

Christian.

Am 20.11.19 um 14:54 schrieb Liu, Monk:
> Hah, but in SRIOV case, our guest KMD driver is not allowed to do such 
> things .... (and even there is a bug that KMD try to power gate, the 
> SMU firmware would not really do the jobs since We have PSP L1 policy 
> to prevent those danger operations )
>
> Did Changfeng already hit this issue under SRIOV ???
>
> -----邮件原件-----
> 发件人: Koenig, Christian <Christian.Koenig@amd.com>
> 发送时间: 2019年11月20日 21:21
> 收件人: Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng 
> <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao 
> <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer 
> <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
> 主题: Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore 
> workaround in amdgpu_virt
>
> Hi Monk,
>
> this is a fix for power gating the MMHUB.
>
> Basic problem is that the MMHUB can power gate while an invalidation is in progress which looses all bits in the ACK register and so deadlocks the engine waiting for the invalidation to finish.
>
> This bug is hit immediately when we enable power gating of the MMHUB.
>
> Regards,
> Christian.
>
> Am 20.11.19 um 14:18 schrieb Liu, Monk:
>> Hi Changfeng
>>
>> Firs of all, there is no power-gating off circle involved in AMDGPU 
>> SRIOV, since we don't allow VF/VM do such things so I do feel strange 
>> why you post something like this Especially on VEGA10 serials which 
>> looks doesn't have any issue on those gpu_flush part
>>
>> Here is my questions for you:
>> 1) Can you point me what issue had you been experienced ? and how to 
>> repro the bug
>> 2) if you do hit some issues, did you verified that your patch can fix it ?
>>
>> besides
>>
>> /Monk
>>
>> -----邮件原件-----
>> 发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> 代表 Changfeng.Zhu
>> 发送时间: 2019年11月20日 17:14
>> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack 
>> <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray 
>> <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; 
>> amd-gfx@lists.freedesktop.org
>> 抄送: Zhu, Changfeng <Changfeng.Zhu@amd.com>
>> 主题: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in 
>> amdgpu_virt
>>
>> From: changzhu <Changfeng.Zhu@amd.com>
>>
>> It may lose gpuvm invalidate acknowldege state across power-gating off cycle. To avoid this issue in virt invalidation, add semaphore acquire before invalidation and semaphore release after invalidation.
>>
>> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
>> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>>    3 files changed, 28 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> index f04eb1a64271..70ffaf91cd12 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device 
>> *adev, uint32_t reg, uint32_t v)
>>    
>>    void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    					uint32_t reg0, uint32_t reg1,
>> -					uint32_t ref, uint32_t mask)
>> +					uint32_t ref, uint32_t mask,
>> +					uint32_t sem)
>>    {
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    	uint32_t seq;
>>    
>>    	spin_lock_irqsave(&kiq->ring_lock, flags);
>> -	amdgpu_ring_alloc(ring, 32);
>> +	amdgpu_ring_alloc(ring, 60);
>> +
>> +	/*
>> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
>> +	 * off cycle, add semaphore acquire before invalidation and semaphore
>> +	 * release after invalidation to avoid entering power gated state
>> +	 * to WA the Issue
>> +	 */
>> +
>> +	/* a read return value of 1 means semaphore acuqire */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>> +
>>    	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>>    					    ref, mask);
>> +	/*
>> +	 * add semaphore release after invalidation,
>> +	 * write with 0 means semaphore release
>> +	 */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_wreg(ring, sem, 0);
>> +
>>    	amdgpu_fence_emit_polling(ring, &seq);
>>    	amdgpu_ring_commit(ring);
>>    	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git 
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> index b0b2bdc750df..bda6a2f37dc0 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);  void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);  void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    					uint32_t reg0, uint32_t rreg1,
>> -					uint32_t ref, uint32_t mask);
>> +					uint32_t ref, uint32_t mask,
>> +					uint32_t sem);
>>    int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool 
>> init);  int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, 
>> bool init);  int amdgpu_virt_reset_gpu(struct amdgpu_device *adev); 
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index f25cd97ba5f2..1ae59af7836a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>>    			!adev->in_gpu_reset) {
>>    		uint32_t req = hub->vm_inv_eng0_req + eng;
>>    		uint32_t ack = hub->vm_inv_eng0_ack + eng;
>> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>>    
>>    		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
>> -				1 << vmid);
>> +						   1 << vmid, sem);
>>    		return;
>>    	}
>>    
>> --
>> 2.17.1
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* 答复: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 14:18                     ` Liu, Monk
  0 siblings, 0 replies; 42+ messages in thread
From: Liu, Monk @ 2019-11-20 14:18 UTC (permalink / raw)
  To: Koenig, Christian, Zhu, Changfeng, Xiao, Jack, Zhou1, Tao, Huang,
	Ray, Huang, Shimmer, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

>> Question is does the extra semaphore acquire has some negative effect on SRIOV?

HI Christian

After more thought, I think introducing a semaphore get/put by the KIQ before/after the VM invalidate won't introduce a world switch issue, because a world switch only ruins the VM invalidate if it
occurs in the middle of the VM invalidate itself ... so if the VF is preempted after the semaphore read, it's fine, since the VM invalidate has not even begun ...

But regarding this patch, there is something that is not clear to me:

>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    	uint32_t seq;
>>    
>>    	spin_lock_irqsave(&kiq->ring_lock, flags);
>> -	amdgpu_ring_alloc(ring, 32);
>> +	amdgpu_ring_alloc(ring, 60);
>> +
>> +	/*
>> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
>> +	 * off cycle, add semaphore acquire before invalidation and semaphore
>> +	 * release after invalidation to avoid entering power gated state
>> +	 * to WA the Issue
>> +	 */
>> +
>> +	/* a read return value of 1 means semaphore acuqire */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);

Note that the ring is &kiq->ring, so why does it need to check "if (ring->funcs->vmhub == AMDGPU_MMHUB_0"? The KIQ is obviously not on the MMHUB, but on the GFXHUB...

Besides, why can the semaphore read before the VM invalidate prevent power gating? I can't tell from the patch;
is there another change that uses the KIQ to grab the semaphore before trying to do power gating as well?


thanks 

-----邮件原件-----
发件人: Christian König <ckoenig.leichtzumerken@gmail.com> 
发送时间: 2019年11月20日 22:00
收件人: Liu, Monk <Monk.Liu@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Zhu, Changfeng <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
主题: Re: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt

> Did Changfeng already hit this issue under SRIOV ?
I don't think so, but Changfeng needs to answer this.

The question is: does the extra semaphore acquire have some negative effect on SRIOV?

I would like to avoid having even more SRIOV specific handling in here which we can't really test on bare metal.

Christian.

Am 20.11.19 um 14:54 schrieb Liu, Monk:
> Hah, but in SRIOV case, our guest KMD driver is not allowed to do such 
> things .... (and even there is a bug that KMD try to power gate, the 
> SMU firmware would not really do the jobs since We have PSP L1 policy 
> to prevent those danger operations )
>
> Did Changfeng already hit this issue under SRIOV ???
>
> -----邮件原件-----
> 发件人: Koenig, Christian <Christian.Koenig@amd.com>
> 发送时间: 2019年11月20日 21:21
> 收件人: Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng 
> <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao 
> <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer 
> <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
> 主题: Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore 
> workaround in amdgpu_virt
>
> Hi Monk,
>
> this is a fix for power gating the MMHUB.
>
> Basic problem is that the MMHUB can power gate while an invalidation is in progress which looses all bits in the ACK register and so deadlocks the engine waiting for the invalidation to finish.
>
> This bug is hit immediately when we enable power gating of the MMHUB.
>
> Regards,
> Christian.
>
> Am 20.11.19 um 14:18 schrieb Liu, Monk:
>> Hi Changfeng
>>
>> Firs of all, there is no power-gating off circle involved in AMDGPU 
>> SRIOV, since we don't allow VF/VM do such things so I do feel strange 
>> why you post something like this Especially on VEGA10 serials which 
>> looks doesn't have any issue on those gpu_flush part
>>
>> Here is my questions for you:
>> 1) Can you point me what issue had you been experienced ? and how to 
>> repro the bug
>> 2) if you do hit some issues, did you verified that your patch can fix it ?
>>
>> besides
>>
>> /Monk
>>
>> -----邮件原件-----
>> 发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> 代表 Changfeng.Zhu
>> 发送时间: 2019年11月20日 17:14
>> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack 
>> <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray 
>> <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; 
>> amd-gfx@lists.freedesktop.org
>> 抄送: Zhu, Changfeng <Changfeng.Zhu@amd.com>
>> 主题: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in 
>> amdgpu_virt
>>
>> From: changzhu <Changfeng.Zhu@amd.com>
>>
>> It may lose gpuvm invalidate acknowldege state across power-gating off cycle. To avoid this issue in virt invalidation, add semaphore acquire before invalidation and semaphore release after invalidation.
>>
>> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
>> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>>    3 files changed, 28 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> index f04eb1a64271..70ffaf91cd12 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device 
>> *adev, uint32_t reg, uint32_t v)
>>    
>>    void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    					uint32_t reg0, uint32_t reg1,
>> -					uint32_t ref, uint32_t mask)
>> +					uint32_t ref, uint32_t mask,
>> +					uint32_t sem)
>>    {
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    	uint32_t seq;
>>    
>>    	spin_lock_irqsave(&kiq->ring_lock, flags);
>> -	amdgpu_ring_alloc(ring, 32);
>> +	amdgpu_ring_alloc(ring, 60);
>> +
>> +	/*
>> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
>> +	 * off cycle, add semaphore acquire before invalidation and semaphore
>> +	 * release after invalidation to avoid entering power gated state
>> +	 * to WA the Issue
>> +	 */
>> +
>> +	/* a read return value of 1 means semaphore acuqire */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>> +
>>    	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>>    					    ref, mask);
>> +	/*
>> +	 * add semaphore release after invalidation,
>> +	 * write with 0 means semaphore release
>> +	 */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_wreg(ring, sem, 0);
>> +
>>    	amdgpu_fence_emit_polling(ring, &seq);
>>    	amdgpu_ring_commit(ring);
>>    	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git 
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> index b0b2bdc750df..bda6a2f37dc0 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);  void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);  void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    					uint32_t reg0, uint32_t rreg1,
>> -					uint32_t ref, uint32_t mask);
>> +					uint32_t ref, uint32_t mask,
>> +					uint32_t sem);
>>    int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool 
>> init);  int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, 
>> bool init);  int amdgpu_virt_reset_gpu(struct amdgpu_device *adev); 
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index f25cd97ba5f2..1ae59af7836a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>>    			!adev->in_gpu_reset) {
>>    		uint32_t req = hub->vm_inv_eng0_req + eng;
>>    		uint32_t ack = hub->vm_inv_eng0_ack + eng;
>> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>>    
>>    		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
>> -				1 << vmid);
>> +						   1 << vmid, sem);
>>    		return;
>>    	}
>>    
>> --
>> 2.17.1
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* 答复: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 14:18                     ` Liu, Monk
  0 siblings, 0 replies; 42+ messages in thread
From: Liu, Monk @ 2019-11-20 14:18 UTC (permalink / raw)
  To: Koenig, Christian, Zhu, Changfeng, Xiao, Jack, Zhou1, Tao, Huang,
	Ray, Huang, Shimmer, amd-gfx

>> Question is does the extra semaphore acquire has some negative effect on SRIOV?

HI Christian

After more thought, I think introducing a semaphore get/put by the KIQ before/after the VM invalidate won't introduce a world switch issue, because a world switch only ruins the VM invalidate if it
occurs in the middle of the VM invalidate itself ... so if the VF is preempted after the semaphore read, it's fine, since the VM invalidate has not even begun ...

But regarding this patch, there is something that is not clear to me:

>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    	uint32_t seq;
>>    
>>    	spin_lock_irqsave(&kiq->ring_lock, flags);
>> -	amdgpu_ring_alloc(ring, 32);
>> +	amdgpu_ring_alloc(ring, 60);
>> +
>> +	/*
>> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
>> +	 * off cycle, add semaphore acquire before invalidation and semaphore
>> +	 * release after invalidation to avoid entering power gated state
>> +	 * to WA the Issue
>> +	 */
>> +
>> +	/* a read return value of 1 means semaphore acuqire */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);

Note that the ring is &kiq->ring, so why does it need to check "if (ring->funcs->vmhub == AMDGPU_MMHUB_0"? The KIQ is obviously not on the MMHUB, but on the GFXHUB...

Besides, why can the semaphore read before the VM invalidate prevent power gating? I can't tell from the patch;
is there another change that uses the KIQ to grab the semaphore before trying to do power gating as well?


thanks 

-----邮件原件-----
发件人: Christian König <ckoenig.leichtzumerken@gmail.com> 
发送时间: 2019年11月20日 22:00
收件人: Liu, Monk <Monk.Liu@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Zhu, Changfeng <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
主题: Re: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt

> Did Changfeng already hit this issue under SRIOV ?
I don't think so, but Changfeng needs to answer this.

The question is: does the extra semaphore acquire have some negative effect on SRIOV?

I would like to avoid having even more SRIOV specific handling in here which we can't really test on bare metal.

Christian.

Am 20.11.19 um 14:54 schrieb Liu, Monk:
> Hah, but in SRIOV case, our guest KMD driver is not allowed to do such 
> things .... (and even there is a bug that KMD try to power gate, the 
> SMU firmware would not really do the jobs since We have PSP L1 policy 
> to prevent those danger operations )
>
> Did Changfeng already hit this issue under SRIOV ???
>
> -----邮件原件-----
> 发件人: Koenig, Christian <Christian.Koenig@amd.com>
> 发送时间: 2019年11月20日 21:21
> 收件人: Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng 
> <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao 
> <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer 
> <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
> 主题: Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore 
> workaround in amdgpu_virt
>
> Hi Monk,
>
> this is a fix for power gating the MMHUB.
>
> Basic problem is that the MMHUB can power gate while an invalidation is in progress which looses all bits in the ACK register and so deadlocks the engine waiting for the invalidation to finish.
>
> This bug is hit immediately when we enable power gating of the MMHUB.
>
> Regards,
> Christian.
>
> Am 20.11.19 um 14:18 schrieb Liu, Monk:
>> Hi Changfeng
>>
>> Firs of all, there is no power-gating off circle involved in AMDGPU 
>> SRIOV, since we don't allow VF/VM do such things so I do feel strange 
>> why you post something like this Especially on VEGA10 serials which 
>> looks doesn't have any issue on those gpu_flush part
>>
>> Here is my questions for you:
>> 1) Can you point me what issue had you been experienced ? and how to 
>> repro the bug
>> 2) if you do hit some issues, did you verified that your patch can fix it ?
>>
>> besides
>>
>> /Monk
>>
>> -----邮件原件-----
>> 发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> 代表 Changfeng.Zhu
>> 发送时间: 2019年11月20日 17:14
>> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack 
>> <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray 
>> <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; 
>> amd-gfx@lists.freedesktop.org
>> 抄送: Zhu, Changfeng <Changfeng.Zhu@amd.com>
>> 主题: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in 
>> amdgpu_virt
>>
>> From: changzhu <Changfeng.Zhu@amd.com>
>>
>> It may lose gpuvm invalidate acknowldege state across power-gating off cycle. To avoid this issue in virt invalidation, add semaphore acquire before invalidation and semaphore release after invalidation.
>>
>> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
>> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>>    3 files changed, 28 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> index f04eb1a64271..70ffaf91cd12 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device 
>> *adev, uint32_t reg, uint32_t v)
>>    
>>    void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    					uint32_t reg0, uint32_t reg1,
>> -					uint32_t ref, uint32_t mask)
>> +					uint32_t ref, uint32_t mask,
>> +					uint32_t sem)
>>    {
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    	uint32_t seq;
>>    
>>    	spin_lock_irqsave(&kiq->ring_lock, flags);
>> -	amdgpu_ring_alloc(ring, 32);
>> +	amdgpu_ring_alloc(ring, 60);
>> +
>> +	/*
>> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
>> +	 * off cycle, add semaphore acquire before invalidation and semaphore
>> +	 * release after invalidation to avoid entering power gated state
>> +	 * to WA the Issue
>> +	 */
>> +
>> +	/* a read return value of 1 means semaphore acuqire */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>> +
>>    	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>>    					    ref, mask);
>> +	/*
>> +	 * add semaphore release after invalidation,
>> +	 * write with 0 means semaphore release
>> +	 */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_wreg(ring, sem, 0);
>> +
>>    	amdgpu_fence_emit_polling(ring, &seq);
>>    	amdgpu_ring_commit(ring);
>>    	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git 
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> index b0b2bdc750df..bda6a2f37dc0 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);  void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);  void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    					uint32_t reg0, uint32_t rreg1,
>> -					uint32_t ref, uint32_t mask);
>> +					uint32_t ref, uint32_t mask,
>> +					uint32_t sem);
>>    int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool 
>> init);  int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, 
>> bool init);  int amdgpu_virt_reset_gpu(struct amdgpu_device *adev); 
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index f25cd97ba5f2..1ae59af7836a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>>    			!adev->in_gpu_reset) {
>>    		uint32_t req = hub->vm_inv_eng0_req + eng;
>>    		uint32_t ack = hub->vm_inv_eng0_ack + eng;
>> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>>    
>>    		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
>> -				1 << vmid);
>> +						   1 << vmid, sem);
>>    		return;
>>    	}
>>    
>> --
>> 2.17.1
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* 答复: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 14:30                         ` Liu, Monk
  0 siblings, 0 replies; 42+ messages in thread
From: Liu, Monk @ 2019-11-20 14:30 UTC (permalink / raw)
  To: Zhu, Changfeng, Koenig, Christian, Xiao, Jack, Zhou1, Tao, Huang,
	Ray, Huang, Shimmer, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Thanks for sharing this JIRA ticket.

now I got the picture of this issue from you and Christian.

So grabbing the semaphore can prevent the RTL from powering off the MMHUB, I see.

In practice, SRIOV won't enable PG at all (even our GIM driver won't enable PG; maybe in the future we will enable it).

I don't think I have too many concerns about your patches,

But I have comments on your patch 1:

void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
 					uint32_t reg0, uint32_t reg1,
-					uint32_t ref, uint32_t mask)
+					uint32_t ref, uint32_t mask,
+					uint32_t sem)
 {
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
 	uint32_t seq;
 
 	spin_lock_irqsave(&kiq->ring_lock, flags);
-	amdgpu_ring_alloc(ring, 32);
+	amdgpu_ring_alloc(ring, 60);
+
+	/*
+	 * It may lose gpuvm invalidate acknowldege state across power-gating
+	 * off cycle, add semaphore acquire before invalidation and semaphore
+	 * release after invalidation to avoid entering power gated state
+	 * to WA the Issue
+	 */
+
+	/* a read return value of 1 means semaphore acuqire */
+	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
+	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
+	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);


Note that in this routine the ring is always the KIQ, so the code below looks redundant:

+	/* a read return value of 1 means semaphore acuqire */
+	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
+	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
+	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);

Besides, amdgpu_virt_kiq_reg_write_reg_wait() is not necessarily a helper function that only serves VM invalidation, so I don't think
you should put the semaphore read/write in this routine; instead, you can put the semaphore read/write outside of this routine and only
put them around the VM invalidate logic.

Thanks 

-----邮件原件-----
发件人: Zhu, Changfeng <Changfeng.Zhu@amd.com> 
发送时间: 2019年11月20日 22:17
收件人: Koenig, Christian <Christian.Koenig@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
主题: RE: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt

>>> Did Changfeng already hit this issue under SRIOV ???

I hit this problem on navi14 in gmc_v10_0_emit_flush_gpu_tlb.
The problem was also seen by Zhou, Tao.

And this is the ticket:
http://ontrack-internal.amd.com/browse/SWDEV-201459

After the semaphore patch, the problem can be fixed.

If SRIOV has concerns about this problem, we should not add the semaphore under SRIOV.

However, we should apply the semaphore in gmc_v9_0_flush_gpu_tlb/gmc_v9_0_emit_flush_gpu_tlb/gmc_v10_0_flush_gpu_tlb/gmc_v10_0_emit_flush_gpu_tlb.

Or how can we handle the ticket above?

BR,
Changfeng.

-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@gmail.com>
Sent: Wednesday, November 20, 2019 10:00 PM
To: Liu, Monk <Monk.Liu@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Zhu, Changfeng <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt

> Did Changfeng already hit this issue under SRIOV ?
I don't think so, but Changfeng needs to answer this.

The question is: does the extra semaphore acquire have some negative effect on SRIOV?

I would like to avoid having even more SRIOV specific handling in here which we can't really test on bare metal.

Christian.

Am 20.11.19 um 14:54 schrieb Liu, Monk:
> Hah, but in SRIOV case, our guest KMD driver is not allowed to do such 
> things .... (and even there is a bug that KMD try to power gate, the 
> SMU firmware would not really do the jobs since We have PSP L1 policy 
> to prevent those danger operations )
>
> Did Changfeng already hit this issue under SRIOV ???
>
> -----邮件原件-----
> 发件人: Koenig, Christian <Christian.Koenig@amd.com>
> 发送时间: 2019年11月20日 21:21
> 收件人: Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng 
> <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao 
> <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer 
> <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
> 主题: Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore 
> workaround in amdgpu_virt
>
> Hi Monk,
>
> this is a fix for power gating the MMHUB.
>
> Basic problem is that the MMHUB can power gate while an invalidation is in progress which looses all bits in the ACK register and so deadlocks the engine waiting for the invalidation to finish.
>
> This bug is hit immediately when we enable power gating of the MMHUB.
>
> Regards,
> Christian.
>
> Am 20.11.19 um 14:18 schrieb Liu, Monk:
>> Hi Changfeng
>>
>> Firs of all, there is no power-gating off circle involved in AMDGPU 
>> SRIOV, since we don't allow VF/VM do such things so I do feel strange 
>> why you post something like this Especially on VEGA10 serials which 
>> looks doesn't have any issue on those gpu_flush part
>>
>> Here is my questions for you:
>> 1) Can you point me what issue had you been experienced ? and how to 
>> repro the bug
>> 2) if you do hit some issues, did you verified that your patch can fix it ?
>>
>> besides
>>
>> /Monk
>>
>> -----邮件原件-----
>> 发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> 代表 Changfeng.Zhu
>> 发送时间: 2019年11月20日 17:14
>> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack 
>> <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray 
>> <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; 
>> amd-gfx@lists.freedesktop.org
>> 抄送: Zhu, Changfeng <Changfeng.Zhu@amd.com>
>> 主题: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in 
>> amdgpu_virt
>>
>> From: changzhu <Changfeng.Zhu@amd.com>
>>
>> It may lose gpuvm invalidate acknowldege state across power-gating off cycle. To avoid this issue in virt invalidation, add semaphore acquire before invalidation and semaphore release after invalidation.
>>
>> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
>> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>>    3 files changed, 28 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> index f04eb1a64271..70ffaf91cd12 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device 
>> *adev, uint32_t reg, uint32_t v)
>>    
>>    void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    					uint32_t reg0, uint32_t reg1,
>> -					uint32_t ref, uint32_t mask)
>> +					uint32_t ref, uint32_t mask,
>> +					uint32_t sem)
>>    {
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    	uint32_t seq;
>>    
>>    	spin_lock_irqsave(&kiq->ring_lock, flags);
>> -	amdgpu_ring_alloc(ring, 32);
>> +	amdgpu_ring_alloc(ring, 60);
>> +
>> +	/*
>> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
>> +	 * off cycle, add semaphore acquire before invalidation and semaphore
>> +	 * release after invalidation to avoid entering power gated state
>> +	 * to WA the Issue
>> +	 */
>> +
>> +	/* a read return value of 1 means semaphore acuqire */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>> +
>>    	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>>    					    ref, mask);
>> +	/*
>> +	 * add semaphore release after invalidation,
>> +	 * write with 0 means semaphore release
>> +	 */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_wreg(ring, sem, 0);
>> +
>>    	amdgpu_fence_emit_polling(ring, &seq);
>>    	amdgpu_ring_commit(ring);
>>    	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git 
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> index b0b2bdc750df..bda6a2f37dc0 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);  void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);  void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    					uint32_t reg0, uint32_t rreg1,
>> -					uint32_t ref, uint32_t mask);
>> +					uint32_t ref, uint32_t mask,
>> +					uint32_t sem);
>>    int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool 
>> init);  int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, 
>> bool init);  int amdgpu_virt_reset_gpu(struct amdgpu_device *adev); 
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index f25cd97ba5f2..1ae59af7836a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>>    			!adev->in_gpu_reset) {
>>    		uint32_t req = hub->vm_inv_eng0_req + eng;
>>    		uint32_t ack = hub->vm_inv_eng0_ack + eng;
>> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>>    
>>    		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
>> -				1 << vmid);
>> +						   1 << vmid, sem);
>>    		return;
>>    	}
>>    
>> --
>> 2.17.1
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* 答复: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 14:30                         ` Liu, Monk
  0 siblings, 0 replies; 42+ messages in thread
From: Liu, Monk @ 2019-11-20 14:30 UTC (permalink / raw)
  To: Zhu, Changfeng, Koenig, Christian, Xiao, Jack, Zhou1, Tao, Huang,
	Ray, Huang, Shimmer, amd-gfx

Thanks for sharing this JIRA ticket.

now I got the picture of this issue from you and Christian.

So grabbing the semaphore can prevent the RTL from powering off the MMHUB, I see.

In practice, SRIOV won't enable PG at all (even our GIM driver won't enable PG; maybe in the future we will enable it).

I don't think I have too many concerns about your patches,

But I have comments on your patch 1:

void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
 					uint32_t reg0, uint32_t reg1,
-					uint32_t ref, uint32_t mask)
+					uint32_t ref, uint32_t mask,
+					uint32_t sem)
 {
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
 	uint32_t seq;
 
 	spin_lock_irqsave(&kiq->ring_lock, flags);
-	amdgpu_ring_alloc(ring, 32);
+	amdgpu_ring_alloc(ring, 60);
+
+	/*
+	 * It may lose gpuvm invalidate acknowldege state across power-gating
+	 * off cycle, add semaphore acquire before invalidation and semaphore
+	 * release after invalidation to avoid entering power gated state
+	 * to WA the Issue
+	 */
+
+	/* a read return value of 1 means semaphore acuqire */
+	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
+	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
+	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);


Note that in this routine the ring is always the KIQ, so the code below looks redundant:

+	/* a read return value of 1 means semaphore acuqire */
+	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
+	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
+	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);

Besides, amdgpu_virt_kiq_reg_write_reg_wait() is not a helper function dedicated to VM invalidation, so I don't think
you should put the semaphore read/write in this routine; instead, you can put the semaphore r/w outside of this routine and only
put it around the VM invalidate logic.

Thanks 

-----邮件原件-----
发件人: Zhu, Changfeng <Changfeng.Zhu@amd.com> 
发送时间: 2019年11月20日 22:17
收件人: Koenig, Christian <Christian.Koenig@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
主题: RE: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt

>>> Did Changfeng already hit this issue under SRIOV ???

I hit this problem on navi14 in gmc_v10_0_emit_flush_gpu_tlb.
The problem is also seen by Zhou,Tao.

And this is ticket:
http://ontrack-internal.amd.com/browse/SWDEV-201459

After the semaphore patch, the problem can be fixed.

If the SRIOV team has concerns about this problem, the semaphore should not be added for SRIOV.

However, we should apply semaphore for gmc_v9_0_flush_gpu_tlb/ gmc_v9_0_emit_flush_gpu_tlb/ gmc_v10_0_flush_gpu_tlb/ gmc_v10_0_emit_flush_gpu_tlb

Or how can we handle the ticket above?

BR,
Changfeng.

-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@gmail.com>
Sent: Wednesday, November 20, 2019 10:00 PM
To: Liu, Monk <Monk.Liu@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Zhu, Changfeng <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt

> Did Changfeng already hit this issue under SRIOV ?
I don't think so, but Changfeng needs to answer this.

The question is: does the extra semaphore acquire have any negative effect on SRIOV?

I would like to avoid having even more SRIOV specific handling in here which we can't really test on bare metal.

Christian.

Am 20.11.19 um 14:54 schrieb Liu, Monk:
> Hah, but in SRIOV case, our guest KMD driver is not allowed to do such 
> things .... (and even there is a bug that KMD try to power gate, the 
> SMU firmware would not really do the jobs since We have PSP L1 policy 
> to prevent those danger operations )
>
> Did Changfeng already hit this issue under SRIOV ???
>
> -----邮件原件-----
> 发件人: Koenig, Christian <Christian.Koenig@amd.com>
> 发送时间: 2019年11月20日 21:21
> 收件人: Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng 
> <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao 
> <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer 
> <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
> 主题: Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore 
> workaround in amdgpu_virt
>
> Hi Monk,
>
> this is a fix for power gating the MMHUB.
>
> Basic problem is that the MMHUB can power gate while an invalidation is in progress which looses all bits in the ACK register and so deadlocks the engine waiting for the invalidation to finish.
>
> This bug is hit immediately when we enable power gating of the MMHUB.
>
> Regards,
> Christian.
>
> Am 20.11.19 um 14:18 schrieb Liu, Monk:
>> Hi Changfeng
>>
>> Firs of all, there is no power-gating off circle involved in AMDGPU 
>> SRIOV, since we don't allow VF/VM do such things so I do feel strange 
>> why you post something like this Especially on VEGA10 serials which 
>> looks doesn't have any issue on those gpu_flush part
>>
>> Here is my questions for you:
>> 1) Can you point me what issue had you been experienced ? and how to 
>> repro the bug
>> 2) if you do hit some issues, did you verified that your patch can fix it ?
>>
>> besides
>>
>> /Monk
>>
>> -----邮件原件-----
>> 发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> 代表 Changfeng.Zhu
>> 发送时间: 2019年11月20日 17:14
>> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack 
>> <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray 
>> <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; 
>> amd-gfx@lists.freedesktop.org
>> 抄送: Zhu, Changfeng <Changfeng.Zhu@amd.com>
>> 主题: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in 
>> amdgpu_virt
>>
>> From: changzhu <Changfeng.Zhu@amd.com>
>>
>> It may lose gpuvm invalidate acknowldege state across power-gating off cycle. To avoid this issue in virt invalidation, add semaphore acquire before invalidation and semaphore release after invalidation.
>>
>> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
>> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>>    3 files changed, 28 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> index f04eb1a64271..70ffaf91cd12 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device 
>> *adev, uint32_t reg, uint32_t v)
>>    
>>    void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    					uint32_t reg0, uint32_t reg1,
>> -					uint32_t ref, uint32_t mask)
>> +					uint32_t ref, uint32_t mask,
>> +					uint32_t sem)
>>    {
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    	uint32_t seq;
>>    
>>    	spin_lock_irqsave(&kiq->ring_lock, flags);
>> -	amdgpu_ring_alloc(ring, 32);
>> +	amdgpu_ring_alloc(ring, 60);
>> +
>> +	/*
>> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
>> +	 * off cycle, add semaphore acquire before invalidation and semaphore
>> +	 * release after invalidation to avoid entering power gated state
>> +	 * to WA the Issue
>> +	 */
>> +
>> +	/* a read return value of 1 means semaphore acuqire */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>> +
>>    	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>>    					    ref, mask);
>> +	/*
>> +	 * add semaphore release after invalidation,
>> +	 * write with 0 means semaphore release
>> +	 */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_wreg(ring, sem, 0);
>> +
>>    	amdgpu_fence_emit_polling(ring, &seq);
>>    	amdgpu_ring_commit(ring);
>>    	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git 
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> index b0b2bdc750df..bda6a2f37dc0 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);  void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);  void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    					uint32_t reg0, uint32_t rreg1,
>> -					uint32_t ref, uint32_t mask);
>> +					uint32_t ref, uint32_t mask,
>> +					uint32_t sem);
>>    int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool 
>> init);  int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, 
>> bool init);  int amdgpu_virt_reset_gpu(struct amdgpu_device *adev); 
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index f25cd97ba5f2..1ae59af7836a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>>    			!adev->in_gpu_reset) {
>>    		uint32_t req = hub->vm_inv_eng0_req + eng;
>>    		uint32_t ack = hub->vm_inv_eng0_ack + eng;
>> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>>    
>>    		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
>> -				1 << vmid);
>> +						   1 << vmid, sem);
>>    		return;
>>    	}
>>    
>> --
>> 2.17.1
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: 答复: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 14:38                             ` Christian König
  0 siblings, 0 replies; 42+ messages in thread
From: Christian König @ 2019-11-20 14:38 UTC (permalink / raw)
  To: Liu, Monk, Zhu, Changfeng, Koenig, Christian, Xiao, Jack, Zhou1,
	Tao, Huang, Ray, Huang, Shimmer,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Hi Monk,

the KIQ is used to invalidate both the GFXHUB as well as the MMHUB on Vega.

> Besides, amdgpu_virt_kiq_reg_write_reg_wait() is not deadly a helper function that only serve VM invalidate, so I don't think
> You should put the semaphore read/write in this routine, instead you can put semaphore r/w out side of this routine and only
> Put them around the VM invalidate logic
Yes, agreed. But since we now know that we won't need that, we can just
drop this patch altogether.

Regards,
Christian.

Am 20.11.19 um 15:30 schrieb Liu, Monk:
> Thanks for sharing this JIR
>
> now I got the picture of this issue from you and Christian.
>
> So the semaphore grabbing can prevent RTL to power off the MMHUB, I see
>
> The practice is that SRIOV won't enable PG at all (even our GIM driver won't enable PG, maybe in future we would enable it )
>
> I think I don't have too many concern about your patches,
>
> But I have comments on your patch 1:
>
> void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   					uint32_t reg0, uint32_t reg1,
> -					uint32_t ref, uint32_t mask)
> +					uint32_t ref, uint32_t mask,
> +					uint32_t sem)
>   {
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   	uint32_t seq;
>   
>   	spin_lock_irqsave(&kiq->ring_lock, flags);
> -	amdgpu_ring_alloc(ring, 32);
> +	amdgpu_ring_alloc(ring, 60);
> +
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +
> +	/* a read return value of 1 means semaphore acuqire */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>
>
> See that in this routine, the ring is always KIQ, so below code looks redundant :
>
> +	/* a read return value of 1 means semaphore acuqire */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>
> Besides, amdgpu_virt_kiq_reg_write_reg_wait() is not deadly a helper function that only serve VM invalidate, so I don't think
> You should put the semaphore read/write in this routine, instead you can put semaphore r/w out side of this routine and only
> Put them around the VM invalidate logic
>
> Thanks
>
> -----邮件原件-----
> 发件人: Zhu, Changfeng <Changfeng.Zhu@amd.com>
> 发送时间: 2019年11月20日 22:17
> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
> 主题: RE: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
>
>>>> Did Changfeng already hit this issue under SRIOV ???
> I meet this problem on navi14 under gmc_v10_0_emit_flush_gpu_tlb .
> The problem is also seen by Zhou,Tao.
>
> And this is ticket:
> http://ontrack-internal.amd.com/browse/SWDEV-201459
>
> After the semaphore patch, the problem can be fixed.
>
> If SROV has concern about this problem,  it should not add semaphore in SROV.
>
> However, we should apply semaphore for gmc_v9_0_flush_gpu_tlb/ gmc_v9_0_emit_flush_gpu_tlb/ gmc_v10_0_flush_gpu_tlb/ gmc_v10_0_emit_flush_gpu_tlb
>
> Or how can we handle the ticket above?
>
> BR,
> Changfeng.
>
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Wednesday, November 20, 2019 10:00 PM
> To: Liu, Monk <Monk.Liu@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Zhu, Changfeng <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
>
>> Did Changfeng already hit this issue under SRIOV ?
> I don't think so, but Changfeng needs to answer this.
>
> Question is does the extra semaphore acquire has some negative effect on SRIOV?
>
> I would like to avoid having even more SRIOV specific handling in here which we can't really test on bare metal.
>
> Christian.
>
> Am 20.11.19 um 14:54 schrieb Liu, Monk:
>> Hah, but in SRIOV case, our guest KMD driver is not allowed to do such
>> things .... (and even there is a bug that KMD try to power gate, the
>> SMU firmware would not really do the jobs since We have PSP L1 policy
>> to prevent those danger operations )
>>
>> Did Changfeng already hit this issue under SRIOV ???
>>
>> -----邮件原件-----
>> 发件人: Koenig, Christian <Christian.Koenig@amd.com>
>> 发送时间: 2019年11月20日 21:21
>> 收件人: Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng
>> <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao
>> <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer
>> <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
>> 主题: Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore
>> workaround in amdgpu_virt
>>
>> Hi Monk,
>>
>> this is a fix for power gating the MMHUB.
>>
>> Basic problem is that the MMHUB can power gate while an invalidation is in progress which looses all bits in the ACK register and so deadlocks the engine waiting for the invalidation to finish.
>>
>> This bug is hit immediately when we enable power gating of the MMHUB.
>>
>> Regards,
>> Christian.
>>
>> Am 20.11.19 um 14:18 schrieb Liu, Monk:
>>> Hi Changfeng
>>>
>>> Firs of all, there is no power-gating off circle involved in AMDGPU
>>> SRIOV, since we don't allow VF/VM do such things so I do feel strange
>>> why you post something like this Especially on VEGA10 serials which
>>> looks doesn't have any issue on those gpu_flush part
>>>
>>> Here is my questions for you:
>>> 1) Can you point me what issue had you been experienced ? and how to
>>> repro the bug
>>> 2) if you do hit some issues, did you verified that your patch can fix it ?
>>>
>>> besides
>>>
>>> /Monk
>>>
>>> -----邮件原件-----
>>> 发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> 代表 Changfeng.Zhu
>>> 发送时间: 2019年11月20日 17:14
>>> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack
>>> <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray
>>> <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>;
>>> amd-gfx@lists.freedesktop.org
>>> 抄送: Zhu, Changfeng <Changfeng.Zhu@amd.com>
>>> 主题: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in
>>> amdgpu_virt
>>>
>>> From: changzhu <Changfeng.Zhu@amd.com>
>>>
>>> It may lose gpuvm invalidate acknowldege state across power-gating off cycle. To avoid this issue in virt invalidation, add semaphore acquire before invalidation and semaphore release after invalidation.
>>>
>>> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
>>> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
>>> ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>>>     drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>>>     3 files changed, 28 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> index f04eb1a64271..70ffaf91cd12 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device
>>> *adev, uint32_t reg, uint32_t v)
>>>     
>>>     void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>     					uint32_t reg0, uint32_t reg1,
>>> -					uint32_t ref, uint32_t mask)
>>> +					uint32_t ref, uint32_t mask,
>>> +					uint32_t sem)
>>>     {
>>>     	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>     	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>     	uint32_t seq;
>>>     
>>>     	spin_lock_irqsave(&kiq->ring_lock, flags);
>>> -	amdgpu_ring_alloc(ring, 32);
>>> +	amdgpu_ring_alloc(ring, 60);
>>> +
>>> +	/*
>>> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
>>> +	 * off cycle, add semaphore acquire before invalidation and semaphore
>>> +	 * release after invalidation to avoid entering power gated state
>>> +	 * to WA the Issue
>>> +	 */
>>> +
>>> +	/* a read return value of 1 means semaphore acuqire */
>>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>>> +
>>>     	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>>>     					    ref, mask);
>>> +	/*
>>> +	 * add semaphore release after invalidation,
>>> +	 * write with 0 means semaphore release
>>> +	 */
>>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>>> +	amdgpu_ring_emit_wreg(ring, sem, 0);
>>> +
>>>     	amdgpu_fence_emit_polling(ring, &seq);
>>>     	amdgpu_ring_commit(ring);
>>>     	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git
>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> index b0b2bdc750df..bda6a2f37dc0 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);  void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);  void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>     					uint32_t reg0, uint32_t rreg1,
>>> -					uint32_t ref, uint32_t mask);
>>> +					uint32_t ref, uint32_t mask,
>>> +					uint32_t sem);
>>>     int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool
>>> init);  int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev,
>>> bool init);  int amdgpu_virt_reset_gpu(struct amdgpu_device *adev);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> index f25cd97ba5f2..1ae59af7836a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>>>     			!adev->in_gpu_reset) {
>>>     		uint32_t req = hub->vm_inv_eng0_req + eng;
>>>     		uint32_t ack = hub->vm_inv_eng0_ack + eng;
>>> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>>>     
>>>     		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
>>> -				1 << vmid);
>>> +						   1 << vmid, sem);
>>>     		return;
>>>     	}
>>>     
>>> --
>>> 2.17.1
>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: 答复: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 14:38                             ` Christian König
  0 siblings, 0 replies; 42+ messages in thread
From: Christian König @ 2019-11-20 14:38 UTC (permalink / raw)
  To: Liu, Monk, Zhu, Changfeng, Koenig, Christian, Xiao, Jack, Zhou1,
	Tao, Huang, Ray, Huang, Shimmer, amd-gfx

Hi Monk,

the KIQ is used to invalidate both the GFXHUB as well as the MMHUB on Vega.

> Besides, amdgpu_virt_kiq_reg_write_reg_wait() is not deadly a helper function that only serve VM invalidate, so I don't think
> You should put the semaphore read/write in this routine, instead you can put semaphore r/w out side of this routine and only
> Put them around the VM invalidate logic
Yes, agreed. But since we now know that we won't need that, we can just
drop this patch altogether.

Regards,
Christian.

Am 20.11.19 um 15:30 schrieb Liu, Monk:
> Thanks for sharing this JIR
>
> now I got the picture of this issue from you and Christian.
>
> So the semaphore grabbing can prevent RTL to power off the MMHUB, I see
>
> The practice is that SRIOV won't enable PG at all (even our GIM driver won't enable PG, maybe in future we would enable it )
>
> I think I don't have too many concern about your patches,
>
> But I have comments on your patch 1:
>
> void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   					uint32_t reg0, uint32_t reg1,
> -					uint32_t ref, uint32_t mask)
> +					uint32_t ref, uint32_t mask,
> +					uint32_t sem)
>   {
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   	uint32_t seq;
>   
>   	spin_lock_irqsave(&kiq->ring_lock, flags);
> -	amdgpu_ring_alloc(ring, 32);
> +	amdgpu_ring_alloc(ring, 60);
> +
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +
> +	/* a read return value of 1 means semaphore acuqire */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>
>
> See that in this routine, the ring is always KIQ, so below code looks redundant :
>
> +	/* a read return value of 1 means semaphore acuqire */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>
> Besides, amdgpu_virt_kiq_reg_write_reg_wait() is not deadly a helper function that only serve VM invalidate, so I don't think
> You should put the semaphore read/write in this routine, instead you can put semaphore r/w out side of this routine and only
> Put them around the VM invalidate logic
>
> Thanks
>
> -----邮件原件-----
> 发件人: Zhu, Changfeng <Changfeng.Zhu@amd.com>
> 发送时间: 2019年11月20日 22:17
> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
> 主题: RE: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
>
>>>> Did Changfeng already hit this issue under SRIOV ???
> I meet this problem on navi14 under gmc_v10_0_emit_flush_gpu_tlb .
> The problem is also seen by Zhou,Tao.
>
> And this is ticket:
> http://ontrack-internal.amd.com/browse/SWDEV-201459
>
> After the semaphore patch, the problem can be fixed.
>
> If SROV has concern about this problem,  it should not add semaphore in SROV.
>
> However, we should apply semaphore for gmc_v9_0_flush_gpu_tlb/ gmc_v9_0_emit_flush_gpu_tlb/ gmc_v10_0_flush_gpu_tlb/ gmc_v10_0_emit_flush_gpu_tlb
>
> Or how can we handle the ticket above?
>
> BR,
> Changfeng.
>
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Wednesday, November 20, 2019 10:00 PM
> To: Liu, Monk <Monk.Liu@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Zhu, Changfeng <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
>
>> Did Changfeng already hit this issue under SRIOV ?
> I don't think so, but Changfeng needs to answer this.
>
> Question is does the extra semaphore acquire has some negative effect on SRIOV?
>
> I would like to avoid having even more SRIOV specific handling in here which we can't really test on bare metal.
>
> Christian.
>
> Am 20.11.19 um 14:54 schrieb Liu, Monk:
>> Hah, but in SRIOV case, our guest KMD driver is not allowed to do such
>> things .... (and even there is a bug that KMD try to power gate, the
>> SMU firmware would not really do the jobs since We have PSP L1 policy
>> to prevent those danger operations )
>>
>> Did Changfeng already hit this issue under SRIOV ???
>>
>> -----邮件原件-----
>> 发件人: Koenig, Christian <Christian.Koenig@amd.com>
>> 发送时间: 2019年11月20日 21:21
>> 收件人: Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng
>> <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao
>> <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer
>> <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
>> 主题: Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore
>> workaround in amdgpu_virt
>>
>> Hi Monk,
>>
>> this is a fix for power gating the MMHUB.
>>
>> Basic problem is that the MMHUB can power gate while an invalidation is in progress which looses all bits in the ACK register and so deadlocks the engine waiting for the invalidation to finish.
>>
>> This bug is hit immediately when we enable power gating of the MMHUB.
>>
>> Regards,
>> Christian.
>>
>> Am 20.11.19 um 14:18 schrieb Liu, Monk:
>>> Hi Changfeng
>>>
>>> Firs of all, there is no power-gating off circle involved in AMDGPU
>>> SRIOV, since we don't allow VF/VM do such things so I do feel strange
>>> why you post something like this Especially on VEGA10 serials which
>>> looks doesn't have any issue on those gpu_flush part
>>>
>>> Here is my questions for you:
>>> 1) Can you point me what issue had you been experienced ? and how to
>>> repro the bug
>>> 2) if you do hit some issues, did you verified that your patch can fix it ?
>>>
>>> besides
>>>
>>> /Monk
>>>
>>> -----邮件原件-----
>>> 发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> 代表 Changfeng.Zhu
>>> 发送时间: 2019年11月20日 17:14
>>> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack
>>> <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray
>>> <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>;
>>> amd-gfx@lists.freedesktop.org
>>> 抄送: Zhu, Changfeng <Changfeng.Zhu@amd.com>
>>> 主题: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in
>>> amdgpu_virt
>>>
>>> From: changzhu <Changfeng.Zhu@amd.com>
>>>
>>> It may lose gpuvm invalidate acknowldege state across power-gating off cycle. To avoid this issue in virt invalidation, add semaphore acquire before invalidation and semaphore release after invalidation.
>>>
>>> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
>>> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
>>> ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>>>     drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>>>     3 files changed, 28 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> index f04eb1a64271..70ffaf91cd12 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device
>>> *adev, uint32_t reg, uint32_t v)
>>>     
>>>     void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>     					uint32_t reg0, uint32_t reg1,
>>> -					uint32_t ref, uint32_t mask)
>>> +					uint32_t ref, uint32_t mask,
>>> +					uint32_t sem)
>>>     {
>>>     	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>     	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>     	uint32_t seq;
>>>     
>>>     	spin_lock_irqsave(&kiq->ring_lock, flags);
>>> -	amdgpu_ring_alloc(ring, 32);
>>> +	amdgpu_ring_alloc(ring, 60);
>>> +
>>> +	/*
>>> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
>>> +	 * off cycle, add semaphore acquire before invalidation and semaphore
>>> +	 * release after invalidation to avoid entering power gated state
>>> +	 * to WA the Issue
>>> +	 */
>>> +
>>> +	/* a read return value of 1 means semaphore acuqire */
>>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>>> +
>>>     	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>>>     					    ref, mask);
>>> +	/*
>>> +	 * add semaphore release after invalidation,
>>> +	 * write with 0 means semaphore release
>>> +	 */
>>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>>> +	amdgpu_ring_emit_wreg(ring, sem, 0);
>>> +
>>>     	amdgpu_fence_emit_polling(ring, &seq);
>>>     	amdgpu_ring_commit(ring);
>>>     	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git
>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> index b0b2bdc750df..bda6a2f37dc0 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);  void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);  void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>     					uint32_t reg0, uint32_t rreg1,
>>> -					uint32_t ref, uint32_t mask);
>>> +					uint32_t ref, uint32_t mask,
>>> +					uint32_t sem);
>>>     int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool
>>> init);  int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev,
>>> bool init);  int amdgpu_virt_reset_gpu(struct amdgpu_device *adev);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> index f25cd97ba5f2..1ae59af7836a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>>>     			!adev->in_gpu_reset) {
>>>     		uint32_t req = hub->vm_inv_eng0_req + eng;
>>>     		uint32_t ack = hub->vm_inv_eng0_ack + eng;
>>> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>>>     
>>>     		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
>>> -				1 << vmid);
>>> +						   1 << vmid, sem);
>>>     		return;
>>>     	}
>>>     
>>> --
>>> 2.17.1
>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* RE: 答复: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 14:50                                 ` Zhu, Changfeng
  0 siblings, 0 replies; 42+ messages in thread
From: Zhu, Changfeng @ 2019-11-20 14:50 UTC (permalink / raw)
  To: Koenig, Christian, Liu, Monk, Xiao, Jack, Zhou1, Tao, Huang, Ray,
	Huang, Shimmer, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Well, I'll wait for help from the IPE GFX team, try to apply this to the GFXHUB as well, and then refine these invalidate semaphore patches.

If the SRIOV team wants to enable the invalidate semaphore in the future, it can pick this patch up again at that time.

BR,
Changfeng.

-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@gmail.com> 
Sent: Wednesday, November 20, 2019 10:39 PM
To: Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng <Changfeng.Zhu@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: 答复: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt

Hi Monk,

the KIQ is used to invalidate both the GFXHUB as well as the MMHUB on Vega.

> Besides, amdgpu_virt_kiq_reg_write_reg_wait() is not deadly a helper 
> function that only serve VM invalidate, so I don't think You should 
> put the semaphore read/write in this routine, instead you can put 
> semaphore r/w out side of this routine and only Put them around the VM 
> invalidate logic
Yes, agreed. But since we now know that we won't need that, we can just drop this patch altogether.

Regards,
Christian.

Am 20.11.19 um 15:30 schrieb Liu, Monk:
> Thanks for sharing this JIR
>
> now I got the picture of this issue from you and Christian.
>
> So the semaphore grabbing can prevent RTL to power off the MMHUB, I 
> see
>
> The practice is that SRIOV won't enable PG at all (even our GIM driver 
> won't enable PG, maybe in future we would enable it )
>
> I think I don't have too many concern about your patches,
>
> But I have comments on your patch 1:
>
> void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   					uint32_t reg0, uint32_t reg1,
> -					uint32_t ref, uint32_t mask)
> +					uint32_t ref, uint32_t mask,
> +					uint32_t sem)
>   {
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   	uint32_t seq;
>   
>   	spin_lock_irqsave(&kiq->ring_lock, flags);
> -	amdgpu_ring_alloc(ring, 32);
> +	amdgpu_ring_alloc(ring, 60);
> +
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +
> +	/* a read return value of 1 means semaphore acuqire */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>
>
> Note that in this routine the ring is always the KIQ, so the code below looks redundant:
>
> +	/* a read return value of 1 means semaphore acuqire */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>
> Besides, amdgpu_virt_kiq_reg_write_reg_wait() is not strictly a helper
> function that only serves VM invalidation, so I don't think you should
> put the semaphore read/write in this routine; instead you can put the
> semaphore r/w outside of this routine and only put them around the VM
> invalidate logic.
>
> Thanks
>
> -----邮件原件-----
> 发件人: Zhu, Changfeng <Changfeng.Zhu@amd.com>
> 发送时间: 2019年11月20日 22:17
> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Liu, Monk 
> <Monk.Liu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao 
> <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer 
> <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
> 主题: RE: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore 
> workaround in amdgpu_virt
>
>>>> Did Changfeng already hit this issue under SRIOV ???
> I hit this problem on navi14 in gmc_v10_0_emit_flush_gpu_tlb.
> The problem is also seen by Zhou,Tao.
>
> And this is ticket:
> http://ontrack-internal.amd.com/browse/SWDEV-201459
>
> After the semaphore patch, the problem can be fixed.
>
> If the SRIOV team has concerns about this problem, we should not add the semaphore for SRIOV.
>
> However, we should apply semaphore for gmc_v9_0_flush_gpu_tlb/ 
> gmc_v9_0_emit_flush_gpu_tlb/ gmc_v10_0_flush_gpu_tlb/ 
> gmc_v10_0_emit_flush_gpu_tlb
>
> Or how can we handle the ticket above?
>
> BR,
> Changfeng.
>
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Wednesday, November 20, 2019 10:00 PM
> To: Liu, Monk <Monk.Liu@amd.com>; Koenig, Christian 
> <Christian.Koenig@amd.com>; Zhu, Changfeng <Changfeng.Zhu@amd.com>; 
> Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, 
> Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; 
> amd-gfx@lists.freedesktop.org
> Subject: Re: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore 
> workaround in amdgpu_virt
>
>> Did Changfeng already hit this issue under SRIOV ?
> I don't think so, but Changfeng needs to answer this.
>
> The question is: does the extra semaphore acquire have any negative effect on SRIOV?
>
> I would like to avoid having even more SRIOV specific handling in here which we can't really test on bare metal.
>
> Christian.
>
> Am 20.11.19 um 14:54 schrieb Liu, Monk:
>> Hah, but in SRIOV case, our guest KMD driver is not allowed to do 
>> such things .... (and even there is a bug that KMD try to power gate, 
>> the SMU firmware would not really do the jobs since We have PSP L1 
>> policy to prevent those danger operations )
>>
>> Did Changfeng already hit this issue under SRIOV ???
>>
>> -----邮件原件-----
>> 发件人: Koenig, Christian <Christian.Koenig@amd.com>
>> 发送时间: 2019年11月20日 21:21
>> 收件人: Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng 
>> <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao 
>> <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer 
>> <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
>> 主题: Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore 
>> workaround in amdgpu_virt
>>
>> Hi Monk,
>>
>> this is a fix for power gating the MMHUB.
>>
>> The basic problem is that the MMHUB can power gate while an invalidation is in progress, which loses all bits in the ACK register and so deadlocks the engine waiting for the invalidation to finish.
>>
>> This bug is hit immediately when we enable power gating of the MMHUB.
>>
>> Regards,
>> Christian.
>>
>> Am 20.11.19 um 14:18 schrieb Liu, Monk:
>>> Hi Changfeng
>>>
>>> First of all, there is no power-gating off cycle involved in AMDGPU
>>> SRIOV, since we don't allow the VF/VM to do such things, so I find it
>>> strange that you posted something like this, especially for the VEGA10
>>> series, which doesn't seem to have any issue in the gpu_flush part.
>>>
>>> Here is my questions for you:
>>> 1) Can you point me what issue had you been experienced ? and how to 
>>> repro the bug
>>> 2) if you do hit some issues, did you verified that your patch can fix it ?
>>>
>>> besides
>>>
>>> /Monk
>>>
>>> -----邮件原件-----
>>> 发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> 代表 
>>> Changfeng.Zhu
>>> 发送时间: 2019年11月20日 17:14
>>> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack 
>>> <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray 
>>> <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; 
>>> amd-gfx@lists.freedesktop.org
>>> 抄送: Zhu, Changfeng <Changfeng.Zhu@amd.com>
>>> 主题: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in 
>>> amdgpu_virt
>>>
>>> From: changzhu <Changfeng.Zhu@amd.com>
>>>
>>> It may lose the gpuvm invalidate acknowledge state across a power-gating off cycle. To avoid this issue in virt invalidation, add semaphore acquire before invalidation and semaphore release after invalidation.
>>>
>>> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
>>> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
>>> ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>>>     drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>>>     3 files changed, 28 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> index f04eb1a64271..70ffaf91cd12 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device 
>>> *adev, uint32_t reg, uint32_t v)
>>>     
>>>     void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>     					uint32_t reg0, uint32_t reg1,
>>> -					uint32_t ref, uint32_t mask)
>>> +					uint32_t ref, uint32_t mask,
>>> +					uint32_t sem)
>>>     {
>>>     	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>     	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>     	uint32_t seq;
>>>     
>>>     	spin_lock_irqsave(&kiq->ring_lock, flags);
>>> -	amdgpu_ring_alloc(ring, 32);
>>> +	amdgpu_ring_alloc(ring, 60);
>>> +
>>> +	/*
>>> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
>>> +	 * off cycle, add semaphore acquire before invalidation and semaphore
>>> +	 * release after invalidation to avoid entering power gated state
>>> +	 * to WA the Issue
>>> +	 */
>>> +
>>> +	/* a read return value of 1 means semaphore acuqire */
>>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>>> +
>>>     	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>>>     					    ref, mask);
>>> +	/*
>>> +	 * add semaphore release after invalidation,
>>> +	 * write with 0 means semaphore release
>>> +	 */
>>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>>> +	amdgpu_ring_emit_wreg(ring, sem, 0);
>>> +
>>>     	amdgpu_fence_emit_polling(ring, &seq);
>>>     	amdgpu_ring_commit(ring);
>>>     	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git 
>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> index b0b2bdc750df..bda6a2f37dc0 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);  void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);  void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>     					uint32_t reg0, uint32_t rreg1,
>>> -					uint32_t ref, uint32_t mask);
>>> +					uint32_t ref, uint32_t mask,
>>> +					uint32_t sem);
>>>     int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, 
>>> bool init);  int amdgpu_virt_release_full_gpu(struct amdgpu_device 
>>> *adev, bool init);  int amdgpu_virt_reset_gpu(struct amdgpu_device 
>>> *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> index f25cd97ba5f2..1ae59af7836a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>>>     			!adev->in_gpu_reset) {
>>>     		uint32_t req = hub->vm_inv_eng0_req + eng;
>>>     		uint32_t ack = hub->vm_inv_eng0_ack + eng;
>>> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>>>     
>>>     		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
>>> -				1 << vmid);
>>> +						   1 << vmid, sem);
>>>     		return;
>>>     	}
>>>     
>>> --
>>> 2.17.1
>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* RE: 答复: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 14:50                                 ` Zhu, Changfeng
  0 siblings, 0 replies; 42+ messages in thread
From: Zhu, Changfeng @ 2019-11-20 14:50 UTC (permalink / raw)
  To: Koenig, Christian, Liu, Monk, Xiao, Jack, Zhou1, Tao, Huang, Ray,
	Huang, Shimmer, amd-gfx

Well, I'll wait for help from the IPE GFX team, try to apply this to the GFXHUB as well, and then refine these invalidate semaphore patches.

If the SRIOV team wants to enable the invalidate semaphore in the future, it can pick this patch up again at that time.

BR,
Changfeng.

-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@gmail.com> 
Sent: Wednesday, November 20, 2019 10:39 PM
To: Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng <Changfeng.Zhu@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: 答复: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt

Hi Monk,

the KIQ is used to invalidate both the GFXHUB as well as the MMHUB on Vega.

> Besides, amdgpu_virt_kiq_reg_write_reg_wait() is not deadly a helper 
> function that only serve VM invalidate, so I don't think You should 
> put the semaphore read/write in this routine, instead you can put 
> semaphore r/w out side of this routine and only Put them around the VM 
> invalidate logic
Yes, agreed. But since we now know that we won't need that, we can just drop this patch altogether.

Regards,
Christian.

Am 20.11.19 um 15:30 schrieb Liu, Monk:
> Thanks for sharing this JIR
>
> now I got the picture of this issue from you and Christian.
>
> So the semaphore grabbing can prevent RTL to power off the MMHUB, I 
> see
>
> The practice is that SRIOV won't enable PG at all (even our GIM driver 
> won't enable PG, maybe in future we would enable it )
>
> I think I don't have too many concern about your patches,
>
> But I have comments on your patch 1:
>
> void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   					uint32_t reg0, uint32_t reg1,
> -					uint32_t ref, uint32_t mask)
> +					uint32_t ref, uint32_t mask,
> +					uint32_t sem)
>   {
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   	uint32_t seq;
>   
>   	spin_lock_irqsave(&kiq->ring_lock, flags);
> -	amdgpu_ring_alloc(ring, 32);
> +	amdgpu_ring_alloc(ring, 60);
> +
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +
> +	/* a read return value of 1 means semaphore acuqire */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>
>
> See that in this routine, the ring is always KIQ, so below code looks redundant :
>
> +	/* a read return value of 1 means semaphore acuqire */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>
> Besides, amdgpu_virt_kiq_reg_write_reg_wait() is not deadly a helper 
> function that only serve VM invalidate, so I don't think You should 
> put the semaphore read/write in this routine, instead you can put 
> semaphore r/w out side of this routine and only Put them around the VM 
> invalidate logic
>
> Thanks
>
> -----邮件原件-----
> 发件人: Zhu, Changfeng <Changfeng.Zhu@amd.com>
> 发送时间: 2019年11月20日 22:17
> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Liu, Monk 
> <Monk.Liu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao 
> <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer 
> <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
> 主题: RE: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore 
> workaround in amdgpu_virt
>
>>>> Did Changfeng already hit this issue under SRIOV ???
> I meet this problem on navi14 under gmc_v10_0_emit_flush_gpu_tlb .
> The problem is also seen by Zhou,Tao.
>
> And this is ticket:
> http://ontrack-internal.amd.com/browse/SWDEV-201459
>
> After the semaphore patch, the problem can be fixed.
>
> If SROV has concern about this problem,  it should not add semaphore in SROV.
>
> However, we should apply semaphore for gmc_v9_0_flush_gpu_tlb/ 
> gmc_v9_0_emit_flush_gpu_tlb/ gmc_v10_0_flush_gpu_tlb/ 
> gmc_v10_0_emit_flush_gpu_tlb
>
> Or how can we handle the ticket above?
>
> BR,
> Changfeng.
>
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Wednesday, November 20, 2019 10:00 PM
> To: Liu, Monk <Monk.Liu@amd.com>; Koenig, Christian 
> <Christian.Koenig@amd.com>; Zhu, Changfeng <Changfeng.Zhu@amd.com>; 
> Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, 
> Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; 
> amd-gfx@lists.freedesktop.org
> Subject: Re: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore 
> workaround in amdgpu_virt
>
>> Did Changfeng already hit this issue under SRIOV ?
> I don't think so, but Changfeng needs to answer this.
>
> Question is does the extra semaphore acquire has some negative effect on SRIOV?
>
> I would like to avoid having even more SRIOV specific handling in here which we can't really test on bare metal.
>
> Christian.
>
> Am 20.11.19 um 14:54 schrieb Liu, Monk:
>> Hah, but in SRIOV case, our guest KMD driver is not allowed to do 
>> such things .... (and even there is a bug that KMD try to power gate, 
>> the SMU firmware would not really do the jobs since We have PSP L1 
>> policy to prevent those danger operations )
>>
>> Did Changfeng already hit this issue under SRIOV ???
>>
>> -----邮件原件-----
>> 发件人: Koenig, Christian <Christian.Koenig@amd.com>
>> 发送时间: 2019年11月20日 21:21
>> 收件人: Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng 
>> <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao 
>> <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer 
>> <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
>> 主题: Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore 
>> workaround in amdgpu_virt
>>
>> Hi Monk,
>>
>> this is a fix for power gating the MMHUB.
>>
>> The basic problem is that the MMHUB can power gate while an invalidation is in progress, which loses all bits in the ACK register and so deadlocks the engine waiting for the invalidation to finish.
>>
>> This bug is hit immediately when we enable power gating of the MMHUB.
>>
>> Regards,
>> Christian.
>>
>> Am 20.11.19 um 14:18 schrieb Liu, Monk:
>>> Hi Changfeng
>>>
>>> Firs of all, there is no power-gating off circle involved in AMDGPU 
>>> SRIOV, since we don't allow VF/VM do such things so I do feel 
>>> strange why you post something like this Especially on VEGA10 
>>> serials which looks doesn't have any issue on those gpu_flush part
>>>
>>> Here is my questions for you:
>>> 1) Can you point me what issue had you been experienced ? and how to 
>>> repro the bug
>>> 2) if you do hit some issues, did you verified that your patch can fix it ?
>>>
>>> besides
>>>
>>> /Monk
>>>
>>> -----邮件原件-----
>>> 发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> 代表 
>>> Changfeng.Zhu
>>> 发送时间: 2019年11月20日 17:14
>>> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack 
>>> <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray 
>>> <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; 
>>> amd-gfx@lists.freedesktop.org
>>> 抄送: Zhu, Changfeng <Changfeng.Zhu@amd.com>
>>> 主题: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in 
>>> amdgpu_virt
>>>
>>> From: changzhu <Changfeng.Zhu@amd.com>
>>>
>>> It may lose the gpuvm invalidate acknowledge state across a power-gating off cycle. To avoid this issue in virt invalidation, add semaphore acquire before invalidation and semaphore release after invalidation.
>>>
>>> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
>>> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
>>> ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>>>     drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>>>     3 files changed, 28 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> index f04eb1a64271..70ffaf91cd12 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device 
>>> *adev, uint32_t reg, uint32_t v)
>>>     
>>>     void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>     					uint32_t reg0, uint32_t reg1,
>>> -					uint32_t ref, uint32_t mask)
>>> +					uint32_t ref, uint32_t mask,
>>> +					uint32_t sem)
>>>     {
>>>     	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>     	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>     	uint32_t seq;
>>>     
>>>     	spin_lock_irqsave(&kiq->ring_lock, flags);
>>> -	amdgpu_ring_alloc(ring, 32);
>>> +	amdgpu_ring_alloc(ring, 60);
>>> +
>>> +	/*
>>> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
>>> +	 * off cycle, add semaphore acquire before invalidation and semaphore
>>> +	 * release after invalidation to avoid entering power gated state
>>> +	 * to WA the Issue
>>> +	 */
>>> +
>>> +	/* a read return value of 1 means semaphore acuqire */
>>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>>> +
>>>     	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>>>     					    ref, mask);
>>> +	/*
>>> +	 * add semaphore release after invalidation,
>>> +	 * write with 0 means semaphore release
>>> +	 */
>>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>>> +	amdgpu_ring_emit_wreg(ring, sem, 0);
>>> +
>>>     	amdgpu_fence_emit_polling(ring, &seq);
>>>     	amdgpu_ring_commit(ring);
>>>     	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git 
>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> index b0b2bdc750df..bda6a2f37dc0 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);  void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);  void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>     					uint32_t reg0, uint32_t rreg1,
>>> -					uint32_t ref, uint32_t mask);
>>> +					uint32_t ref, uint32_t mask,
>>> +					uint32_t sem);
>>>     int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, 
>>> bool init);  int amdgpu_virt_release_full_gpu(struct amdgpu_device 
>>> *adev, bool init);  int amdgpu_virt_reset_gpu(struct amdgpu_device 
>>> *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> index f25cd97ba5f2..1ae59af7836a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>>>     			!adev->in_gpu_reset) {
>>>     		uint32_t req = hub->vm_inv_eng0_req + eng;
>>>     		uint32_t ack = hub->vm_inv_eng0_ack + eng;
>>> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>>>     
>>>     		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
>>> -				1 << vmid);
>>> +						   1 << vmid, sem);
>>>     		return;
>>>     	}
>>>     
>>> --
>>> 2.17.1
>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: 答复: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 14:54                                     ` Christian König
  0 siblings, 0 replies; 42+ messages in thread
From: Christian König @ 2019-11-20 14:54 UTC (permalink / raw)
  To: Zhu, Changfeng, Koenig, Christian, Liu, Monk, Xiao, Jack, Zhou1,
	Tao, Huang, Ray, Huang, Shimmer,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

I think you can drop this KIQ patch anyway, or at least delay it.

We do want to use the hw semaphores for various reasons on the async rings.

But for the SW triggered flush we don't really have a need for this as 
long as GIM doesn't support power gating of the MMHUB.

Regards,
Christian.

Am 20.11.19 um 15:50 schrieb Zhu, Changfeng:
> Well, I'll wait the help from IPE GFX team and try to apply GFXHUB as well and then perfect these invalidate semaphore patches.
>
> If SRIOV team want to enable invalidate semaphore in future, it can try to take this patch back in that time.
>
> BR,
> Changfeng.
>
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Wednesday, November 20, 2019 10:39 PM
> To: Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng <Changfeng.Zhu@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: 答复: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
>
> Hi Monk,
>
> the KIQ is used to invalidate both the GFXHUB as well as the MMHUB on Vega.
>
>> Besides, amdgpu_virt_kiq_reg_write_reg_wait() is not deadly a helper
>> function that only serve VM invalidate, so I don't think You should
>> put the semaphore read/write in this routine, instead you can put
>> semaphore r/w out side of this routine and only Put them around the VM
>> invalidate logic
> Yes, agree. But since we now knew that we won't need that we can just drop this patch altogether.
>
> Regards,
> Christian.
>
> Am 20.11.19 um 15:30 schrieb Liu, Monk:
>> Thanks for sharing this JIR
>>
>> now I got the picture of this issue from you and Christian.
>>
>> So the semaphore grabbing can prevent RTL to power off the MMHUB, I
>> see
>>
>> The practice is that SRIOV won't enable PG at all (even our GIM driver
>> won't enable PG, maybe in future we would enable it )
>>
>> I think I don't have too many concern about your patches,
>>
>> But I have comments on your patch 1:
>>
>> void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    					uint32_t reg0, uint32_t reg1,
>> -					uint32_t ref, uint32_t mask)
>> +					uint32_t ref, uint32_t mask,
>> +					uint32_t sem)
>>    {
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    	uint32_t seq;
>>    
>>    	spin_lock_irqsave(&kiq->ring_lock, flags);
>> -	amdgpu_ring_alloc(ring, 32);
>> +	amdgpu_ring_alloc(ring, 60);
>> +
>> +	/*
>> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
>> +	 * off cycle, add semaphore acquire before invalidation and semaphore
>> +	 * release after invalidation to avoid entering power gated state
>> +	 * to WA the Issue
>> +	 */
>> +
>> +	/* a read return value of 1 means semaphore acuqire */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>>
>>
>> See that in this routine, the ring is always KIQ, so below code looks redundant :
>>
>> +	/* a read return value of 1 means semaphore acuqire */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>>
>> Besides, amdgpu_virt_kiq_reg_write_reg_wait() is not deadly a helper
>> function that only serve VM invalidate, so I don't think You should
>> put the semaphore read/write in this routine, instead you can put
>> semaphore r/w out side of this routine and only Put them around the VM
>> invalidate logic
>>
>> Thanks
>>
>> -----邮件原件-----
>> 发件人: Zhu, Changfeng <Changfeng.Zhu@amd.com>
>> 发送时间: 2019年11月20日 22:17
>> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Liu, Monk
>> <Monk.Liu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao
>> <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer
>> <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
>> 主题: RE: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore
>> workaround in amdgpu_virt
>>
>>>>> Did Changfeng already hit this issue under SRIOV ???
>> I meet this problem on navi14 under gmc_v10_0_emit_flush_gpu_tlb .
>> The problem is also seen by Zhou,Tao.
>>
>> And this is ticket:
>> http://ontrack-internal.amd.com/browse/SWDEV-201459
>>
>> After the semaphore patch, the problem can be fixed.
>>
>> If SROV has concern about this problem,  it should not add semaphore in SROV.
>>
>> However, we should apply semaphore for gmc_v9_0_flush_gpu_tlb/
>> gmc_v9_0_emit_flush_gpu_tlb/ gmc_v10_0_flush_gpu_tlb/
>> gmc_v10_0_emit_flush_gpu_tlb
>>
>> Or how can we handle the ticket above?
>>
>> BR,
>> Changfeng.
>>
>> -----Original Message-----
>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>> Sent: Wednesday, November 20, 2019 10:00 PM
>> To: Liu, Monk <Monk.Liu@amd.com>; Koenig, Christian
>> <Christian.Koenig@amd.com>; Zhu, Changfeng <Changfeng.Zhu@amd.com>;
>> Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang,
>> Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>;
>> amd-gfx@lists.freedesktop.org
>> Subject: Re: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore
>> workaround in amdgpu_virt
>>
>>> Did Changfeng already hit this issue under SRIOV ?
>> I don't think so, but Changfeng needs to answer this.
>>
>> Question is does the extra semaphore acquire has some negative effect on SRIOV?
>>
>> I would like to avoid having even more SRIOV specific handling in here which we can't really test on bare metal.
>>
>> Christian.
>>
>> Am 20.11.19 um 14:54 schrieb Liu, Monk:
>>> Hah, but in SRIOV case, our guest KMD driver is not allowed to do
>>> such things .... (and even there is a bug that KMD try to power gate,
>>> the SMU firmware would not really do the jobs since We have PSP L1
>>> policy to prevent those danger operations )
>>>
>>> Did Changfeng already hit this issue under SRIOV ???
>>>
>>> -----邮件原件-----
>>> 发件人: Koenig, Christian <Christian.Koenig@amd.com>
>>> 发送时间: 2019年11月20日 21:21
>>> 收件人: Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng
>>> <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao
>>> <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer
>>> <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
>>> 主题: Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore
>>> workaround in amdgpu_virt
>>>
>>> Hi Monk,
>>>
>>> this is a fix for power gating the MMHUB.
>>>
>>> The basic problem is that the MMHUB can power gate while an invalidation is in progress, which loses all bits in the ACK register and so deadlocks the engine waiting for the invalidation to finish.
>>>
>>> This bug is hit immediately when we enable power gating of the MMHUB.
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 20.11.19 um 14:18 schrieb Liu, Monk:
>>>> Hi Changfeng
>>>>
>>>> First of all, there is no power-gating off cycle involved in AMDGPU
>>>> SRIOV, since we don't allow VF/VM do such things so I do feel
>>>> strange why you post something like this Especially on VEGA10
>>>> serials which looks doesn't have any issue on those gpu_flush part
>>>>
>>>> Here is my questions for you:
>>>> 1) Can you point me what issue had you been experienced ? and how to
>>>> repro the bug
>>>> 2) if you do hit some issues, did you verified that your patch can fix it ?
>>>>
>>>> besides
>>>>
>>>> /Monk
>>>>
>>>> -----邮件原件-----
>>>> 发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> 代表
>>>> Changfeng.Zhu
>>>> 发送时间: 2019年11月20日 17:14
>>>> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack
>>>> <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray
>>>> <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>;
>>>> amd-gfx@lists.freedesktop.org
>>>> 抄送: Zhu, Changfeng <Changfeng.Zhu@amd.com>
>>>> 主题: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in
>>>> amdgpu_virt
>>>>
>>>> From: changzhu <Changfeng.Zhu@amd.com>
>>>>
>>>> It may lose gpuvm invalidate acknowldege state across power-gating off cycle. To avoid this issue in virt invalidation, add semaphore acquire before invalidation and semaphore release after invalidation.
>>>>
>>>> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
>>>> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
>>>> ---
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>>>>      drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>>>>      3 files changed, 28 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>>> index f04eb1a64271..70ffaf91cd12 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>>> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device
>>>> *adev, uint32_t reg, uint32_t v)
>>>>      
>>>>      void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>>      					uint32_t reg0, uint32_t reg1,
>>>> -					uint32_t ref, uint32_t mask)
>>>> +					uint32_t ref, uint32_t mask,
>>>> +					uint32_t sem)
>>>>      {
>>>>      	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>      	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>>      	uint32_t seq;
>>>>      
>>>>      	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>> -	amdgpu_ring_alloc(ring, 32);
>>>> +	amdgpu_ring_alloc(ring, 60);
>>>> +
>>>> +	/*
>>>> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
>>>> +	 * off cycle, add semaphore acquire before invalidation and semaphore
>>>> +	 * release after invalidation to avoid entering power gated state
>>>> +	 * to WA the Issue
>>>> +	 */
>>>> +
>>>> +	/* a read return value of 1 means semaphore acuqire */
>>>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>>>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>>>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>>>> +
>>>>      	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>>>>      					    ref, mask);
>>>> +	/*
>>>> +	 * add semaphore release after invalidation,
>>>> +	 * write with 0 means semaphore release
>>>> +	 */
>>>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>>>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>>>> +	amdgpu_ring_emit_wreg(ring, sem, 0);
>>>> +
>>>>      	amdgpu_fence_emit_polling(ring, &seq);
>>>>      	amdgpu_ring_commit(ring);
>>>>      	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git
>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>> index b0b2bdc750df..bda6a2f37dc0 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);  void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);  void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>>      					uint32_t reg0, uint32_t rreg1,
>>>> -					uint32_t ref, uint32_t mask);
>>>> +					uint32_t ref, uint32_t mask,
>>>> +					uint32_t sem);
>>>>      int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev,
>>>> bool init);  int amdgpu_virt_release_full_gpu(struct amdgpu_device
>>>> *adev, bool init);  int amdgpu_virt_reset_gpu(struct amdgpu_device
>>>> *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> index f25cd97ba5f2..1ae59af7836a 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>>>>      			!adev->in_gpu_reset) {
>>>>      		uint32_t req = hub->vm_inv_eng0_req + eng;
>>>>      		uint32_t ack = hub->vm_inv_eng0_ack + eng;
>>>> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>>>>      
>>>>      		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
>>>> -				1 << vmid);
>>>> +						   1 << vmid, sem);
>>>>      		return;
>>>>      	}
>>>>      
>>>> --
>>>> 2.17.1
>>>>
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: 答复: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 14:54                                     ` Christian König
  0 siblings, 0 replies; 42+ messages in thread
From: Christian König @ 2019-11-20 14:54 UTC (permalink / raw)
  To: Zhu, Changfeng, Koenig, Christian, Liu, Monk, Xiao, Jack, Zhou1,
	Tao, Huang, Ray, Huang, Shimmer, amd-gfx

I think you can drop this KIQ patch anyway, or at least delay it.

We do want to use the hw semaphores for various reasons on the async rings.

But for the SW triggered flush we don't really have a need for this as 
long as GIM doesn't support power gating of the MMHUB.

Regards,
Christian.

Am 20.11.19 um 15:50 schrieb Zhu, Changfeng:
> Well, I'll wait for help from the IPE GFX team, try to apply this to the GFXHUB as well, and then refine these invalidate semaphore patches.
>
> If the SRIOV team wants to enable the invalidate semaphore in the future, it can pick this patch up again at that time.
>
> BR,
> Changfeng.
>
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Wednesday, November 20, 2019 10:39 PM
> To: Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng <Changfeng.Zhu@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: 答复: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
>
> Hi Monk,
>
> the KIQ is used to invalidate both the GFXHUB as well as the MMHUB on Vega.
>
>> Besides, amdgpu_virt_kiq_reg_write_reg_wait() is not deadly a helper
>> function that only serve VM invalidate, so I don't think You should
>> put the semaphore read/write in this routine, instead you can put
>> semaphore r/w out side of this routine and only Put them around the VM
>> invalidate logic
> Yes, agree. But since we now know that we won't need that, we can just drop this patch altogether.
>
> Regards,
> Christian.
>
> Am 20.11.19 um 15:30 schrieb Liu, Monk:
>> Thanks for sharing this JIR
>>
>> now I got the picture of this issue from you and Christian.
>>
>> So the semaphore grabbing can prevent RTL to power off the MMHUB, I
>> see
>>
>> The practice is that SRIOV won't enable PG at all (even our GIM driver
>> won't enable PG, maybe in future we would enable it )
>>
>> I think I don't have too many concern about your patches,
>>
>> But I have comments on your patch 1:
>>
>> void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    					uint32_t reg0, uint32_t reg1,
>> -					uint32_t ref, uint32_t mask)
>> +					uint32_t ref, uint32_t mask,
>> +					uint32_t sem)
>>    {
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    	uint32_t seq;
>>    
>>    	spin_lock_irqsave(&kiq->ring_lock, flags);
>> -	amdgpu_ring_alloc(ring, 32);
>> +	amdgpu_ring_alloc(ring, 60);
>> +
>> +	/*
>> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
>> +	 * off cycle, add semaphore acquire before invalidation and semaphore
>> +	 * release after invalidation to avoid entering power gated state
>> +	 * to WA the Issue
>> +	 */
>> +
>> +	/* a read return value of 1 means semaphore acuqire */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>>
>>
>> See that in this routine, the ring is always KIQ, so below code looks redundant :
>>
>> +	/* a read return value of 1 means semaphore acuqire */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>>
>> Besides, amdgpu_virt_kiq_reg_write_reg_wait() is not deadly a helper
>> function that only serve VM invalidate, so I don't think You should
>> put the semaphore read/write in this routine, instead you can put
>> semaphore r/w out side of this routine and only Put them around the VM
>> invalidate logic
>>
>> Thanks
>>
>> -----邮件原件-----
>> 发件人: Zhu, Changfeng <Changfeng.Zhu@amd.com>
>> 发送时间: 2019年11月20日 22:17
>> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Liu, Monk
>> <Monk.Liu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao
>> <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer
>> <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
>> 主题: RE: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore
>> workaround in amdgpu_virt
>>
>>>>> Did Changfeng already hit this issue under SRIOV ???
>> I meet this problem on navi14 under gmc_v10_0_emit_flush_gpu_tlb .
>> The problem is also seen by Zhou,Tao.
>>
>> And this is ticket:
>> http://ontrack-internal.amd.com/browse/SWDEV-201459
>>
>> After the semaphore patch, the problem can be fixed.
>>
>> If SRIOV has concerns about this problem, the semaphore should not be added for SRIOV.
>>
>> However, we should apply semaphore for gmc_v9_0_flush_gpu_tlb/
>> gmc_v9_0_emit_flush_gpu_tlb/ gmc_v10_0_flush_gpu_tlb/
>> gmc_v10_0_emit_flush_gpu_tlb
>>
>> Or how can we handle the ticket above?
>>
>> BR,
>> Changfeng.
>>
>> -----Original Message-----
>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>> Sent: Wednesday, November 20, 2019 10:00 PM
>> To: Liu, Monk <Monk.Liu@amd.com>; Koenig, Christian
>> <Christian.Koenig@amd.com>; Zhu, Changfeng <Changfeng.Zhu@amd.com>;
>> Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang,
>> Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>;
>> amd-gfx@lists.freedesktop.org
>> Subject: Re: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore
>> workaround in amdgpu_virt
>>
>>> Did Changfeng already hit this issue under SRIOV ?
>> I don't think so, but Changfeng needs to answer this.
>>
>> Question is does the extra semaphore acquire has some negative effect on SRIOV?
>>
>> I would like to avoid having even more SRIOV specific handling in here which we can't really test on bare metal.
>>
>> Christian.
>>
>> Am 20.11.19 um 14:54 schrieb Liu, Monk:
>>> Hah, but in SRIOV case, our guest KMD driver is not allowed to do
>>> such things .... (and even there is a bug that KMD try to power gate,
>>> the SMU firmware would not really do the jobs since We have PSP L1
>>> policy to prevent those danger operations )
>>>
>>> Did Changfeng already hit this issue under SRIOV ???
>>>
>>> -----邮件原件-----
>>> 发件人: Koenig, Christian <Christian.Koenig@amd.com>
>>> 发送时间: 2019年11月20日 21:21
>>> 收件人: Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng
>>> <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao
>>> <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer
>>> <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
>>> 主题: Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore
>>> workaround in amdgpu_virt
>>>
>>> Hi Monk,
>>>
>>> this is a fix for power gating the MMHUB.
>>>
>>> The basic problem is that the MMHUB can power gate while an invalidation is in progress, which loses all bits in the ACK register and so deadlocks the engine waiting for the invalidation to finish.
>>>
>>> This bug is hit immediately when we enable power gating of the MMHUB.
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 20.11.19 um 14:18 schrieb Liu, Monk:
>>>> Hi Changfeng
>>>>
>>>> First of all, there is no power-gating off cycle involved in AMDGPU
>>>> SRIOV, since we don't allow VF/VM do such things so I do feel
>>>> strange why you post something like this Especially on VEGA10
>>>> serials which looks doesn't have any issue on those gpu_flush part
>>>>
>>>> Here is my questions for you:
>>>> 1) Can you point me what issue had you been experienced ? and how to
>>>> repro the bug
>>>> 2) if you do hit some issues, did you verified that your patch can fix it ?
>>>>
>>>> besides
>>>>
>>>> /Monk
>>>>
>>>> -----邮件原件-----
>>>> 发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> 代表
>>>> Changfeng.Zhu
>>>> 发送时间: 2019年11月20日 17:14
>>>> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack
>>>> <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray
>>>> <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>;
>>>> amd-gfx@lists.freedesktop.org
>>>> 抄送: Zhu, Changfeng <Changfeng.Zhu@amd.com>
>>>> 主题: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in
>>>> amdgpu_virt
>>>>
>>>> From: changzhu <Changfeng.Zhu@amd.com>
>>>>
>>>> It may lose gpuvm invalidate acknowldege state across power-gating off cycle. To avoid this issue in virt invalidation, add semaphore acquire before invalidation and semaphore release after invalidation.
>>>>
>>>> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
>>>> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
>>>> ---
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>>>>      drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>>>>      3 files changed, 28 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>>> index f04eb1a64271..70ffaf91cd12 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>>> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device
>>>> *adev, uint32_t reg, uint32_t v)
>>>>      
>>>>      void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>>      					uint32_t reg0, uint32_t reg1,
>>>> -					uint32_t ref, uint32_t mask)
>>>> +					uint32_t ref, uint32_t mask,
>>>> +					uint32_t sem)
>>>>      {
>>>>      	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>      	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>>      	uint32_t seq;
>>>>      
>>>>      	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>> -	amdgpu_ring_alloc(ring, 32);
>>>> +	amdgpu_ring_alloc(ring, 60);
>>>> +
>>>> +	/*
>>>> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
>>>> +	 * off cycle, add semaphore acquire before invalidation and semaphore
>>>> +	 * release after invalidation to avoid entering power gated state
>>>> +	 * to WA the Issue
>>>> +	 */
>>>> +
>>>> +	/* a read return value of 1 means semaphore acuqire */
>>>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>>>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>>>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>>>> +
>>>>      	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>>>>      					    ref, mask);
>>>> +	/*
>>>> +	 * add semaphore release after invalidation,
>>>> +	 * write with 0 means semaphore release
>>>> +	 */
>>>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>>>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>>>> +	amdgpu_ring_emit_wreg(ring, sem, 0);
>>>> +
>>>>      	amdgpu_fence_emit_polling(ring, &seq);
>>>>      	amdgpu_ring_commit(ring);
>>>>      	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git
>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>> index b0b2bdc750df..bda6a2f37dc0 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);  void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);  void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>>      					uint32_t reg0, uint32_t rreg1,
>>>> -					uint32_t ref, uint32_t mask);
>>>> +					uint32_t ref, uint32_t mask,
>>>> +					uint32_t sem);
>>>>      int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev,
>>>> bool init);  int amdgpu_virt_release_full_gpu(struct amdgpu_device
>>>> *adev, bool init);  int amdgpu_virt_reset_gpu(struct amdgpu_device
>>>> *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> index f25cd97ba5f2..1ae59af7836a 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>>>>      			!adev->in_gpu_reset) {
>>>>      		uint32_t req = hub->vm_inv_eng0_req + eng;
>>>>      		uint32_t ack = hub->vm_inv_eng0_ack + eng;
>>>> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>>>>      
>>>>      		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
>>>> -				1 << vmid);
>>>> +						   1 << vmid, sem);
>>>>      		return;
>>>>      	}
>>>>      
>>>> --
>>>> 2.17.1
>>>>
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* 答复: 答复: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 14:59                                 ` Liu, Monk
  0 siblings, 0 replies; 42+ messages in thread
From: Liu, Monk @ 2019-11-20 14:59 UTC (permalink / raw)
  To: Koenig, Christian, Zhu, Changfeng, Xiao, Jack, Zhou1, Tao, Huang,
	Ray, Huang, Shimmer, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

>>the KIQ is used to invalidate both the GFXHUB as well as the MMHUB on Vega.

I know,

> +	/* a read return value of 1 means semaphore acuqire */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);

But ring->funcs->vmhub will always be AMDGPU_GFXHUB, right? Since this ring is from "&kiq->ring"?


>> Yes, agree. But since we now know that we won't need that, we can just drop this patch altogether.

Yeah, the semaphore wrapping is in PATCH 2/2, agree that this PATCH 1/2 could be dropped 


-----邮件原件-----
发件人: Christian König <ckoenig.leichtzumerken@gmail.com> 
发送时间: 2019年11月20日 22:39
收件人: Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng <Changfeng.Zhu@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
主题: Re: 答复: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt

Hi Monk,

the KIQ is used to invalidate both the GFXHUB as well as the MMHUB on Vega.

> Besides, amdgpu_virt_kiq_reg_write_reg_wait() is not deadly a helper 
> function that only serve VM invalidate, so I don't think You should 
> put the semaphore read/write in this routine, instead you can put 
> semaphore r/w out side of this routine and only Put them around the VM 
> invalidate logic
Yes, agree. But since we now know that we won't need that, we can just drop this patch altogether.

Regards,
Christian.

Am 20.11.19 um 15:30 schrieb Liu, Monk:
> Thanks for sharing this JIR
>
> now I got the picture of this issue from you and Christian.
>
> So the semaphore grabbing can prevent RTL to power off the MMHUB, I 
> see
>
> The practice is that SRIOV won't enable PG at all (even our GIM driver 
> won't enable PG, maybe in future we would enable it )
>
> I think I don't have too many concern about your patches,
>
> But I have comments on your patch 1:
>
> void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   					uint32_t reg0, uint32_t reg1,
> -					uint32_t ref, uint32_t mask)
> +					uint32_t ref, uint32_t mask,
> +					uint32_t sem)
>   {
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   	uint32_t seq;
>   
>   	spin_lock_irqsave(&kiq->ring_lock, flags);
> -	amdgpu_ring_alloc(ring, 32);
> +	amdgpu_ring_alloc(ring, 60);
> +
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +
> +	/* a read return value of 1 means semaphore acuqire */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>
>
> See that in this routine, the ring is always KIQ, so below code looks redundant :
>
> +	/* a read return value of 1 means semaphore acuqire */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>
> Besides, amdgpu_virt_kiq_reg_write_reg_wait() is not deadly a helper 
> function that only serve VM invalidate, so I don't think You should 
> put the semaphore read/write in this routine, instead you can put 
> semaphore r/w out side of this routine and only Put them around the VM 
> invalidate logic
>
> Thanks
>
> -----邮件原件-----
> 发件人: Zhu, Changfeng <Changfeng.Zhu@amd.com>
> 发送时间: 2019年11月20日 22:17
> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Liu, Monk 
> <Monk.Liu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao 
> <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer 
> <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
> 主题: RE: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore 
> workaround in amdgpu_virt
>
>>>> Did Changfeng already hit this issue under SRIOV ???
> I meet this problem on navi14 under gmc_v10_0_emit_flush_gpu_tlb .
> The problem is also seen by Zhou,Tao.
>
> And this is ticket:
> http://ontrack-internal.amd.com/browse/SWDEV-201459
>
> After the semaphore patch, the problem can be fixed.
>
> If SRIOV has concerns about this problem, the semaphore should not be added for SRIOV.
>
> However, we should apply semaphore for gmc_v9_0_flush_gpu_tlb/ 
> gmc_v9_0_emit_flush_gpu_tlb/ gmc_v10_0_flush_gpu_tlb/ 
> gmc_v10_0_emit_flush_gpu_tlb
>
> Or how can we handle the ticket above?
>
> BR,
> Changfeng.
>
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Wednesday, November 20, 2019 10:00 PM
> To: Liu, Monk <Monk.Liu@amd.com>; Koenig, Christian 
> <Christian.Koenig@amd.com>; Zhu, Changfeng <Changfeng.Zhu@amd.com>; 
> Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, 
> Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; 
> amd-gfx@lists.freedesktop.org
> Subject: Re: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore 
> workaround in amdgpu_virt
>
>> Did Changfeng already hit this issue under SRIOV ?
> I don't think so, but Changfeng needs to answer this.
>
> Question is does the extra semaphore acquire has some negative effect on SRIOV?
>
> I would like to avoid having even more SRIOV specific handling in here which we can't really test on bare metal.
>
> Christian.
>
> Am 20.11.19 um 14:54 schrieb Liu, Monk:
>> Hah, but in SRIOV case, our guest KMD driver is not allowed to do 
>> such things .... (and even there is a bug that KMD try to power gate, 
>> the SMU firmware would not really do the jobs since We have PSP L1 
>> policy to prevent those danger operations )
>>
>> Did Changfeng already hit this issue under SRIOV ???
>>
>> -----邮件原件-----
>> 发件人: Koenig, Christian <Christian.Koenig@amd.com>
>> 发送时间: 2019年11月20日 21:21
>> 收件人: Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng 
>> <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao 
>> <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer 
>> <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
>> 主题: Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore 
>> workaround in amdgpu_virt
>>
>> Hi Monk,
>>
>> this is a fix for power gating the MMHUB.
>>
>> The basic problem is that the MMHUB can power gate while an invalidation is in progress, which loses all bits in the ACK register and so deadlocks the engine waiting for the invalidation to finish.
>>
>> This bug is hit immediately when we enable power gating of the MMHUB.
>>
>> Regards,
>> Christian.
>>
>> Am 20.11.19 um 14:18 schrieb Liu, Monk:
>>> Hi Changfeng
>>>
>>> First of all, there is no power-gating off cycle involved in AMDGPU
>>> SRIOV, since we don't allow VF/VM do such things so I do feel 
>>> strange why you post something like this Especially on VEGA10 
>>> serials which looks doesn't have any issue on those gpu_flush part
>>>
>>> Here is my questions for you:
>>> 1) Can you point me what issue had you been experienced ? and how to 
>>> repro the bug
>>> 2) if you do hit some issues, did you verified that your patch can fix it ?
>>>
>>> besides
>>>
>>> /Monk
>>>
>>> -----邮件原件-----
>>> 发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> 代表 
>>> Changfeng.Zhu
>>> 发送时间: 2019年11月20日 17:14
>>> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack 
>>> <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray 
>>> <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; 
>>> amd-gfx@lists.freedesktop.org
>>> 抄送: Zhu, Changfeng <Changfeng.Zhu@amd.com>
>>> 主题: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in 
>>> amdgpu_virt
>>>
>>> From: changzhu <Changfeng.Zhu@amd.com>
>>>
>>> It may lose gpuvm invalidate acknowldege state across power-gating off cycle. To avoid this issue in virt invalidation, add semaphore acquire before invalidation and semaphore release after invalidation.
>>>
>>> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
>>> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
>>> ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>>>     drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>>>     3 files changed, 28 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> index f04eb1a64271..70ffaf91cd12 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device 
>>> *adev, uint32_t reg, uint32_t v)
>>>     
>>>     void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>     					uint32_t reg0, uint32_t reg1,
>>> -					uint32_t ref, uint32_t mask)
>>> +					uint32_t ref, uint32_t mask,
>>> +					uint32_t sem)
>>>     {
>>>     	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>     	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>     	uint32_t seq;
>>>     
>>>     	spin_lock_irqsave(&kiq->ring_lock, flags);
>>> -	amdgpu_ring_alloc(ring, 32);
>>> +	amdgpu_ring_alloc(ring, 60);
>>> +
>>> +	/*
>>> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
>>> +	 * off cycle, add semaphore acquire before invalidation and semaphore
>>> +	 * release after invalidation to avoid entering power gated state
>>> +	 * to WA the Issue
>>> +	 */
>>> +
>>> +	/* a read return value of 1 means semaphore acuqire */
>>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>>> +
>>>     	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>>>     					    ref, mask);
>>> +	/*
>>> +	 * add semaphore release after invalidation,
>>> +	 * write with 0 means semaphore release
>>> +	 */
>>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>>> +	amdgpu_ring_emit_wreg(ring, sem, 0);
>>> +
>>>     	amdgpu_fence_emit_polling(ring, &seq);
>>>     	amdgpu_ring_commit(ring);
>>>     	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git 
>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> index b0b2bdc750df..bda6a2f37dc0 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);  void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);  void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>     					uint32_t reg0, uint32_t rreg1,
>>> -					uint32_t ref, uint32_t mask);
>>> +					uint32_t ref, uint32_t mask,
>>> +					uint32_t sem);
>>>     int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, 
>>> bool init);  int amdgpu_virt_release_full_gpu(struct amdgpu_device 
>>> *adev, bool init);  int amdgpu_virt_reset_gpu(struct amdgpu_device 
>>> *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> index f25cd97ba5f2..1ae59af7836a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>>>     			!adev->in_gpu_reset) {
>>>     		uint32_t req = hub->vm_inv_eng0_req + eng;
>>>     		uint32_t ack = hub->vm_inv_eng0_ack + eng;
>>> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>>>     
>>>     		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
>>> -				1 << vmid);
>>> +						   1 << vmid, sem);
>>>     		return;
>>>     	}
>>>     
>>> --
>>> 2.17.1
>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* 答复: 答复: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 14:59                                 ` Liu, Monk
  0 siblings, 0 replies; 42+ messages in thread
From: Liu, Monk @ 2019-11-20 14:59 UTC (permalink / raw)
  To: Koenig, Christian, Zhu, Changfeng, Xiao, Jack, Zhou1, Tao, Huang,
	Ray, Huang, Shimmer, amd-gfx

>>the KIQ is used to invalidate both the GFXHUB as well as the MMHUB on Vega.

I know,

> +	/* a read return value of 1 means semaphore acuqire */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);

But ring->funcs->vmhub will always be AMDGPU_GFXHUB, right? Since this ring is from "&kiq->ring"?


>> Yes, agree. But since we now knew that we won't need that we can just drop this patch altogether.

Yeah, the semaphore wrapping is in PATCH 2/2, agree that this PATCH 1/2 could be dropped 


-----邮件原件-----
发件人: Christian König <ckoenig.leichtzumerken@gmail.com> 
发送时间: 2019年11月20日 22:39
收件人: Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng <Changfeng.Zhu@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
主题: Re: 答复: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt

Hi Monk,

the KIQ is used to invalidate both the GFXHUB as well as the MMHUB on Vega.

> Besides, amdgpu_virt_kiq_reg_write_reg_wait() is not deadly a helper 
> function that only serve VM invalidate, so I don't think You should 
> put the semaphore read/write in this routine, instead you can put 
> semaphore r/w out side of this routine and only Put them around the VM 
> invalidate logic
Yes, agree. But since we now know that we won't need that, we can just drop this patch altogether.

Regards,
Christian.

Am 20.11.19 um 15:30 schrieb Liu, Monk:
> Thanks for sharing this JIR
>
> now I got the picture of this issue from you and Christian.
>
> So the semaphore grabbing can prevent RTL to power off the MMHUB, I 
> see
>
> The practice is that SRIOV won't enable PG at all (even our GIM driver 
> won't enable PG, maybe in future we would enable it )
>
> I think I don't have too many concern about your patches,
>
> But I have comments on your patch 1:
>
> void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   					uint32_t reg0, uint32_t reg1,
> -					uint32_t ref, uint32_t mask)
> +					uint32_t ref, uint32_t mask,
> +					uint32_t sem)
>   {
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   	uint32_t seq;
>   
>   	spin_lock_irqsave(&kiq->ring_lock, flags);
> -	amdgpu_ring_alloc(ring, 32);
> +	amdgpu_ring_alloc(ring, 60);
> +
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +
> +	/* a read return value of 1 means semaphore acuqire */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>
>
> See that in this routine, the ring is always KIQ, so below code looks redundant :
>
> +	/* a read return value of 1 means semaphore acuqire */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>
> Besides, amdgpu_virt_kiq_reg_write_reg_wait() is not deadly a helper 
> function that only serve VM invalidate, so I don't think You should 
> put the semaphore read/write in this routine, instead you can put 
> semaphore r/w out side of this routine and only Put them around the VM 
> invalidate logic
>
> Thanks
>
> -----邮件原件-----
> 发件人: Zhu, Changfeng <Changfeng.Zhu@amd.com>
> 发送时间: 2019年11月20日 22:17
> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Liu, Monk 
> <Monk.Liu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao 
> <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer 
> <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
> 主题: RE: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore 
> workaround in amdgpu_virt
>
>>>> Did Changfeng already hit this issue under SRIOV ???
> I meet this problem on navi14 under gmc_v10_0_emit_flush_gpu_tlb .
> The problem is also seen by Zhou,Tao.
>
> And this is ticket:
> http://ontrack-internal.amd.com/browse/SWDEV-201459
>
> After the semaphore patch, the problem can be fixed.
>
> If SROV has concern about this problem,  it should not add semaphore in SROV.
>
> However, we should apply semaphore for gmc_v9_0_flush_gpu_tlb/ 
> gmc_v9_0_emit_flush_gpu_tlb/ gmc_v10_0_flush_gpu_tlb/ 
> gmc_v10_0_emit_flush_gpu_tlb
>
> Or how can we handle the ticket above?
>
> BR,
> Changfeng.
>
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Wednesday, November 20, 2019 10:00 PM
> To: Liu, Monk <Monk.Liu@amd.com>; Koenig, Christian 
> <Christian.Koenig@amd.com>; Zhu, Changfeng <Changfeng.Zhu@amd.com>; 
> Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, 
> Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; 
> amd-gfx@lists.freedesktop.org
> Subject: Re: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore 
> workaround in amdgpu_virt
>
>> Did Changfeng already hit this issue under SRIOV ?
> I don't think so, but Changfeng needs to answer this.
>
> Question is does the extra semaphore acquire has some negative effect on SRIOV?
>
> I would like to avoid having even more SRIOV specific handling in here which we can't really test on bare metal.
>
> Christian.
>
> Am 20.11.19 um 14:54 schrieb Liu, Monk:
>> Hah, but in SRIOV case, our guest KMD driver is not allowed to do 
>> such things .... (and even there is a bug that KMD try to power gate, 
>> the SMU firmware would not really do the jobs since We have PSP L1 
>> policy to prevent those danger operations )
>>
>> Did Changfeng already hit this issue under SRIOV ???
>>
>> -----邮件原件-----
>> 发件人: Koenig, Christian <Christian.Koenig@amd.com>
>> 发送时间: 2019年11月20日 21:21
>> 收件人: Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng 
>> <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao 
>> <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer 
>> <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
>> 主题: Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore 
>> workaround in amdgpu_virt
>>
>> Hi Monk,
>>
>> this is a fix for power gating the MMHUB.
>>
>> Basic problem is that the MMHUB can power gate while an invalidation is in progress which looses all bits in the ACK register and so deadlocks the engine waiting for the invalidation to finish.
>>
>> This bug is hit immediately when we enable power gating of the MMHUB.
>>
>> Regards,
>> Christian.
>>
>> Am 20.11.19 um 14:18 schrieb Liu, Monk:
>>> Hi Changfeng
>>>
>>> Firs of all, there is no power-gating off circle involved in AMDGPU 
>>> SRIOV, since we don't allow VF/VM do such things so I do feel 
>>> strange why you post something like this Especially on VEGA10 
>>> serials which looks doesn't have any issue on those gpu_flush part
>>>
>>> Here is my questions for you:
>>> 1) Can you point me what issue had you been experienced ? and how to 
>>> repro the bug
>>> 2) if you do hit some issues, did you verified that your patch can fix it ?
>>>
>>> besides
>>>
>>> /Monk
>>>
>>> -----邮件原件-----
>>> 发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> 代表 
>>> Changfeng.Zhu
>>> 发送时间: 2019年11月20日 17:14
>>> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack 
>>> <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray 
>>> <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; 
>>> amd-gfx@lists.freedesktop.org
>>> 抄送: Zhu, Changfeng <Changfeng.Zhu@amd.com>
>>> 主题: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in 
>>> amdgpu_virt
>>>
>>> From: changzhu <Changfeng.Zhu@amd.com>
>>>
>>> It may lose gpuvm invalidate acknowldege state across power-gating off cycle. To avoid this issue in virt invalidation, add semaphore acquire before invalidation and semaphore release after invalidation.
>>>
>>> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
>>> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
>>> ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>>>     drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>>>     3 files changed, 28 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> index f04eb1a64271..70ffaf91cd12 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device 
>>> *adev, uint32_t reg, uint32_t v)
>>>     
>>>     void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>     					uint32_t reg0, uint32_t reg1,
>>> -					uint32_t ref, uint32_t mask)
>>> +					uint32_t ref, uint32_t mask,
>>> +					uint32_t sem)
>>>     {
>>>     	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>     	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>     	uint32_t seq;
>>>     
>>>     	spin_lock_irqsave(&kiq->ring_lock, flags);
>>> -	amdgpu_ring_alloc(ring, 32);
>>> +	amdgpu_ring_alloc(ring, 60);
>>> +
>>> +	/*
>>> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
>>> +	 * off cycle, add semaphore acquire before invalidation and semaphore
>>> +	 * release after invalidation to avoid entering power gated state
>>> +	 * to WA the Issue
>>> +	 */
>>> +
>>> +	/* a read return value of 1 means semaphore acuqire */
>>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>>> +
>>>     	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>>>     					    ref, mask);
>>> +	/*
>>> +	 * add semaphore release after invalidation,
>>> +	 * write with 0 means semaphore release
>>> +	 */
>>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>>> +	amdgpu_ring_emit_wreg(ring, sem, 0);
>>> +
>>>     	amdgpu_fence_emit_polling(ring, &seq);
>>>     	amdgpu_ring_commit(ring);
>>>     	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git 
>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> index b0b2bdc750df..bda6a2f37dc0 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);  void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);  void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>     					uint32_t reg0, uint32_t rreg1,
>>> -					uint32_t ref, uint32_t mask);
>>> +					uint32_t ref, uint32_t mask,
>>> +					uint32_t sem);
>>>     int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, 
>>> bool init);  int amdgpu_virt_release_full_gpu(struct amdgpu_device 
>>> *adev, bool init);  int amdgpu_virt_reset_gpu(struct amdgpu_device 
>>> *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> index f25cd97ba5f2..1ae59af7836a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>>>     			!adev->in_gpu_reset) {
>>>     		uint32_t req = hub->vm_inv_eng0_req + eng;
>>>     		uint32_t ack = hub->vm_inv_eng0_ack + eng;
>>> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>>>     
>>>     		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
>>> -				1 << vmid);
>>> +						   1 << vmid, sem);
>>>     		return;
>>>     	}
>>>     
>>> --
>>> 2.17.1
>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* RE: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 15:04             ` Zeng, Oak
  0 siblings, 0 replies; 42+ messages in thread
From: Zeng, Oak @ 2019-11-20 15:04 UTC (permalink / raw)
  To: Koenig, Christian, Liu, Monk, Zhu, Changfeng, Xiao, Jack, Zhou1,
	Tao, Huang, Ray, Huang, Shimmer,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

See an inline comment 

Regards,
Oak

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Christian König
Sent: Wednesday, November 20, 2019 8:21 AM
To: Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt

Hi Monk,

this is a fix for power gating the MMHUB.

Basic problem is that the MMHUB can power gate while an invalidation is in progress
[Oak] I am not familiar with the power gating sequence, but at first glance, should the power gating sequence make sure that the HW is ready (idle) for power gating before putting the system into power gating? E.g., before we put the system into power gating, should we query each HW block to see whether it is idle? If not (as in the case you mentioned, where some invalidation activity is still ongoing), the power gating condition is not met and we should wait. Or, if the power gating is triggered/initiated by HW (I am not sure), should the HW guarantee that it is idle?

 which loses all bits in the ACK register and so deadlocks the engine waiting for the invalidation to finish.

This bug is hit immediately when we enable power gating of the MMHUB.

Regards,
Christian.

Am 20.11.19 um 14:18 schrieb Liu, Monk:
> Hi Changfeng
>
> Firs of all, there is no power-gating off circle involved in AMDGPU 
> SRIOV, since we don't allow VF/VM do such things so I do feel strange 
> why you post something like this Especially on VEGA10 serials which 
> looks doesn't have any issue on those gpu_flush part
>
> Here is my questions for you:
> 1) Can you point me what issue had you been experienced ? and how to 
> repro the bug
> 2) if you do hit some issues, did you verified that your patch can fix it ?
>
> besides
>
> /Monk
>
> -----邮件原件-----
> 发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> 代表 Changfeng.Zhu
> 发送时间: 2019年11月20日 17:14
> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack 
> <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray 
> <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; 
> amd-gfx@lists.freedesktop.org
> 抄送: Zhu, Changfeng <Changfeng.Zhu@amd.com>
> 主题: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in 
> amdgpu_virt
>
> From: changzhu <Changfeng.Zhu@amd.com>
>
> It may lose gpuvm invalidate acknowldege state across power-gating off cycle. To avoid this issue in virt invalidation, add semaphore acquire before invalidation and semaphore release after invalidation.
>
> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>   3 files changed, 28 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index f04eb1a64271..70ffaf91cd12 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device 
> *adev, uint32_t reg, uint32_t v)
>   
>   void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   					uint32_t reg0, uint32_t reg1,
> -					uint32_t ref, uint32_t mask)
> +					uint32_t ref, uint32_t mask,
> +					uint32_t sem)
>   {
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   	uint32_t seq;
>   
>   	spin_lock_irqsave(&kiq->ring_lock, flags);
> -	amdgpu_ring_alloc(ring, 32);
> +	amdgpu_ring_alloc(ring, 60);
> +
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +
> +	/* a read return value of 1 means semaphore acuqire */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
> +
>   	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>   					    ref, mask);
> +	/*
> +	 * add semaphore release after invalidation,
> +	 * write with 0 means semaphore release
> +	 */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_wreg(ring, sem, 0);
> +
>   	amdgpu_fence_emit_polling(ring, &seq);
>   	amdgpu_ring_commit(ring);
>   	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git 
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index b0b2bdc750df..bda6a2f37dc0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);  void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);  void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   					uint32_t reg0, uint32_t rreg1,
> -					uint32_t ref, uint32_t mask);
> +					uint32_t ref, uint32_t mask,
> +					uint32_t sem);
>   int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool 
> init);  int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, 
> bool init);  int amdgpu_virt_reset_gpu(struct amdgpu_device *adev); 
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index f25cd97ba5f2..1ae59af7836a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   			!adev->in_gpu_reset) {
>   		uint32_t req = hub->vm_inv_eng0_req + eng;
>   		uint32_t ack = hub->vm_inv_eng0_ack + eng;
> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>   
>   		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
> -				1 << vmid);
> +						   1 << vmid, sem);
>   		return;
>   	}
>   
> --
> 2.17.1
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* RE: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 15:04             ` Zeng, Oak
  0 siblings, 0 replies; 42+ messages in thread
From: Zeng, Oak @ 2019-11-20 15:04 UTC (permalink / raw)
  To: Koenig, Christian, Liu, Monk, Zhu, Changfeng, Xiao, Jack, Zhou1,
	Tao, Huang, Ray, Huang, Shimmer, amd-gfx

See an inline comment 

Regards,
Oak

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Christian König
Sent: Wednesday, November 20, 2019 8:21 AM
To: Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt

Hi Monk,

this is a fix for power gating the MMHUB.

Basic problem is that the MMHUB can power gate while an invalidation is in progress
[Oak] I am not familiar with the power gating sequence, but at first glance, should the power gating sequence make sure that the HW is ready (idle) for power gating before putting the system into power gating? E.g., before we put the system into power gating, should we query each HW block to see whether it is idle? If not (as in the case you mentioned, where some invalidation activity is still ongoing), the power gating condition is not met and we should wait. Or, if the power gating is triggered/initiated by HW (I am not sure), should the HW guarantee that it is idle?

 which loses all bits in the ACK register and so deadlocks the engine waiting for the invalidation to finish.

This bug is hit immediately when we enable power gating of the MMHUB.

Regards,
Christian.

Am 20.11.19 um 14:18 schrieb Liu, Monk:
> Hi Changfeng
>
> Firs of all, there is no power-gating off circle involved in AMDGPU 
> SRIOV, since we don't allow VF/VM do such things so I do feel strange 
> why you post something like this Especially on VEGA10 serials which 
> looks doesn't have any issue on those gpu_flush part
>
> Here is my questions for you:
> 1) Can you point me what issue had you been experienced ? and how to 
> repro the bug
> 2) if you do hit some issues, did you verified that your patch can fix it ?
>
> besides
>
> /Monk
>
> -----邮件原件-----
> 发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> 代表 Changfeng.Zhu
> 发送时间: 2019年11月20日 17:14
> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack 
> <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray 
> <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; 
> amd-gfx@lists.freedesktop.org
> 抄送: Zhu, Changfeng <Changfeng.Zhu@amd.com>
> 主题: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in 
> amdgpu_virt
>
> From: changzhu <Changfeng.Zhu@amd.com>
>
> It may lose gpuvm invalidate acknowldege state across power-gating off cycle. To avoid this issue in virt invalidation, add semaphore acquire before invalidation and semaphore release after invalidation.
>
> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>   3 files changed, 28 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index f04eb1a64271..70ffaf91cd12 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device 
> *adev, uint32_t reg, uint32_t v)
>   
>   void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   					uint32_t reg0, uint32_t reg1,
> -					uint32_t ref, uint32_t mask)
> +					uint32_t ref, uint32_t mask,
> +					uint32_t sem)
>   {
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   	uint32_t seq;
>   
>   	spin_lock_irqsave(&kiq->ring_lock, flags);
> -	amdgpu_ring_alloc(ring, 32);
> +	amdgpu_ring_alloc(ring, 60);
> +
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +
> +	/* a read return value of 1 means semaphore acuqire */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
> +
>   	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>   					    ref, mask);
> +	/*
> +	 * add semaphore release after invalidation,
> +	 * write with 0 means semaphore release
> +	 */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +	amdgpu_ring_emit_wreg(ring, sem, 0);
> +
>   	amdgpu_fence_emit_polling(ring, &seq);
>   	amdgpu_ring_commit(ring);
>   	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git 
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index b0b2bdc750df..bda6a2f37dc0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);  void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);  void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   					uint32_t reg0, uint32_t rreg1,
> -					uint32_t ref, uint32_t mask);
> +					uint32_t ref, uint32_t mask,
> +					uint32_t sem);
>   int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool 
> init);  int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, 
> bool init);  int amdgpu_virt_reset_gpu(struct amdgpu_device *adev); 
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index f25cd97ba5f2..1ae59af7836a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   			!adev->in_gpu_reset) {
>   		uint32_t req = hub->vm_inv_eng0_req + eng;
>   		uint32_t ack = hub->vm_inv_eng0_ack + eng;
> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>   
>   		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
> -				1 << vmid);
> +						   1 << vmid, sem);
>   		return;
>   	}
>   
> --
> 2.17.1
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: 答复: 答复: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 15:06                                     ` Christian König
  0 siblings, 0 replies; 42+ messages in thread
From: Christian König @ 2019-11-20 15:06 UTC (permalink / raw)
  To: Liu, Monk, Zhu, Changfeng, Xiao, Jack, Zhou1, Tao, Huang, Ray,
	Huang, Shimmer, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Am 20.11.19 um 15:59 schrieb Liu, Monk:
>>> the KIQ is used to invalidate both the GFXHUB as well as the MMHUB on Vega.
> I know,
>
>> +	/* a read return value of 1 means semaphore acuqire */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
> But ring->funcs->vmhub wil always be AMDGPU_GFXHUB, right ? since this ring is from "&kiq->ring" ?

Ah! Good catch, that is indeed incorrect.

Christian.

>
>
>>> Yes, agree. But since we now knew that we won't need that we can just drop this patch altogether.
> Yeah, the semaphore wrapping is in PATCH 2/2, agree that this PATCH 1/2 could be dropped
>
>
> -----邮件原件-----
> 发件人: Christian König <ckoenig.leichtzumerken@gmail.com>
> 发送时间: 2019年11月20日 22:39
> 收件人: Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng <Changfeng.Zhu@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
> 主题: Re: 答复: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
>
> Hi Monk,
>
> the KIQ is used to invalidate both the GFXHUB as well as the MMHUB on Vega.
>
>> Besides, amdgpu_virt_kiq_reg_write_reg_wait() is not deadly a helper
>> function that only serve VM invalidate, so I don't think You should
>> put the semaphore read/write in this routine, instead you can put
>> semaphore r/w out side of this routine and only Put them around the VM
>> invalidate logic
> Yes, agree. But since we now knew that we won't need that we can just drop this patch altogether.
>
> Regards,
> Christian.
>
> Am 20.11.19 um 15:30 schrieb Liu, Monk:
>> Thanks for sharing this JIR
>>
>> now I got the picture of this issue from you and Christian.
>>
>> So the semaphore grabbing can prevent RTL to power off the MMHUB, I
>> see
>>
>> The practice is that SRIOV won't enable PG at all (even our GIM driver
>> won't enable PG, maybe in future we would enable it )
>>
>> I think I don't have too many concern about your patches,
>>
>> But I have comments on your patch 1:
>>
>> void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    					uint32_t reg0, uint32_t reg1,
>> -					uint32_t ref, uint32_t mask)
>> +					uint32_t ref, uint32_t mask,
>> +					uint32_t sem)
>>    {
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    	uint32_t seq;
>>    
>>    	spin_lock_irqsave(&kiq->ring_lock, flags);
>> -	amdgpu_ring_alloc(ring, 32);
>> +	amdgpu_ring_alloc(ring, 60);
>> +
>> +	/*
>> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
>> +	 * off cycle, add semaphore acquire before invalidation and semaphore
>> +	 * release after invalidation to avoid entering power gated state
>> +	 * to WA the Issue
>> +	 */
>> +
>> +	/* a read return value of 1 means semaphore acuqire */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>>
>>
>> See that in this routine, the ring is always KIQ, so below code looks redundant :
>>
>> +	/* a read return value of 1 means semaphore acuqire */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>>
>> Besides, amdgpu_virt_kiq_reg_write_reg_wait() is not deadly a helper
>> function that only serve VM invalidate, so I don't think You should
>> put the semaphore read/write in this routine, instead you can put
>> semaphore r/w out side of this routine and only Put them around the VM
>> invalidate logic
>>
>> Thanks
>>
>> -----邮件原件-----
>> 发件人: Zhu, Changfeng <Changfeng.Zhu@amd.com>
>> 发送时间: 2019年11月20日 22:17
>> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Liu, Monk
>> <Monk.Liu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao
>> <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer
>> <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
>> 主题: RE: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore
>> workaround in amdgpu_virt
>>
>>>>> Did Changfeng already hit this issue under SRIOV ???
>> I meet this problem on navi14 under gmc_v10_0_emit_flush_gpu_tlb .
>> The problem is also seen by Zhou,Tao.
>>
>> And this is ticket:
>> http://ontrack-internal.amd.com/browse/SWDEV-201459
>>
>> After the semaphore patch, the problem can be fixed.
>>
>> If SROV has concern about this problem,  it should not add semaphore in SROV.
>>
>> However, we should apply semaphore for gmc_v9_0_flush_gpu_tlb/
>> gmc_v9_0_emit_flush_gpu_tlb/ gmc_v10_0_flush_gpu_tlb/
>> gmc_v10_0_emit_flush_gpu_tlb
>>
>> Or how can we handle the ticket above?
>>
>> BR,
>> Changfeng.
>>
>> -----Original Message-----
>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>> Sent: Wednesday, November 20, 2019 10:00 PM
>> To: Liu, Monk <Monk.Liu@amd.com>; Koenig, Christian
>> <Christian.Koenig@amd.com>; Zhu, Changfeng <Changfeng.Zhu@amd.com>;
>> Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang,
>> Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>;
>> amd-gfx@lists.freedesktop.org
>> Subject: Re: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore
>> workaround in amdgpu_virt
>>
>>> Did Changfeng already hit this issue under SRIOV ?
>> I don't think so, but Changfeng needs to answer this.
>>
>> Question is does the extra semaphore acquire has some negative effect on SRIOV?
>>
>> I would like to avoid having even more SRIOV specific handling in here which we can't really test on bare metal.
>>
>> Christian.
>>
>> Am 20.11.19 um 14:54 schrieb Liu, Monk:
>>> Hah, but in SRIOV case, our guest KMD driver is not allowed to do
>>> such things .... (and even there is a bug that KMD try to power gate,
>>> the SMU firmware would not really do the jobs since We have PSP L1
>>> policy to prevent those danger operations )
>>>
>>> Did Changfeng already hit this issue under SRIOV ???
>>>
>>> -----邮件原件-----
>>> 发件人: Koenig, Christian <Christian.Koenig@amd.com>
>>> 发送时间: 2019年11月20日 21:21
>>> 收件人: Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng
>>> <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao
>>> <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer
>>> <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
>>> 主题: Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore
>>> workaround in amdgpu_virt
>>>
>>> Hi Monk,
>>>
>>> this is a fix for power gating the MMHUB.
>>>
>>> Basic problem is that the MMHUB can power gate while an invalidation is in progress which looses all bits in the ACK register and so deadlocks the engine waiting for the invalidation to finish.
>>>
>>> This bug is hit immediately when we enable power gating of the MMHUB.
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 20.11.19 um 14:18 schrieb Liu, Monk:
>>>> Hi Changfeng
>>>>
>>>> Firs of all, there is no power-gating off circle involved in AMDGPU
>>>> SRIOV, since we don't allow VF/VM do such things so I do feel
>>>> strange why you post something like this Especially on VEGA10
>>>> serials which looks doesn't have any issue on those gpu_flush part
>>>>
>>>> Here is my questions for you:
>>>> 1) Can you point me what issue had you been experienced ? and how to
>>>> repro the bug
>>>> 2) if you do hit some issues, did you verified that your patch can fix it ?
>>>>
>>>> besides
>>>>
>>>> /Monk
>>>>
>>>> -----邮件原件-----
>>>> 发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> 代表
>>>> Changfeng.Zhu
>>>> 发送时间: 2019年11月20日 17:14
>>>> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack
>>>> <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray
>>>> <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>;
>>>> amd-gfx@lists.freedesktop.org
>>>> 抄送: Zhu, Changfeng <Changfeng.Zhu@amd.com>
>>>> 主题: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in
>>>> amdgpu_virt
>>>>
>>>> From: changzhu <Changfeng.Zhu@amd.com>
>>>>
>>>> It may lose gpuvm invalidate acknowldege state across power-gating off cycle. To avoid this issue in virt invalidation, add semaphore acquire before invalidation and semaphore release after invalidation.
>>>>
>>>> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
>>>> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
>>>> ---
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>>>>      drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>>>>      3 files changed, 28 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>>> index f04eb1a64271..70ffaf91cd12 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>>> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device
>>>> *adev, uint32_t reg, uint32_t v)
>>>>      
>>>>      void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>>      					uint32_t reg0, uint32_t reg1,
>>>> -					uint32_t ref, uint32_t mask)
>>>> +					uint32_t ref, uint32_t mask,
>>>> +					uint32_t sem)
>>>>      {
>>>>      	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>      	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>>      	uint32_t seq;
>>>>      
>>>>      	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>> -	amdgpu_ring_alloc(ring, 32);
>>>> +	amdgpu_ring_alloc(ring, 60);
>>>> +
>>>> +	/*
>>>> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
>>>> +	 * off cycle, add semaphore acquire before invalidation and semaphore
>>>> +	 * release after invalidation to avoid entering power gated state
>>>> +	 * to WA the Issue
>>>> +	 */
>>>> +
>>>> +	/* a read return value of 1 means semaphore acuqire */
>>>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>>>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>>>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>>>> +
>>>>      	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>>>>      					    ref, mask);
>>>> +	/*
>>>> +	 * add semaphore release after invalidation,
>>>> +	 * write with 0 means semaphore release
>>>> +	 */
>>>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>>>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>>>> +	amdgpu_ring_emit_wreg(ring, sem, 0);
>>>> +
>>>>      	amdgpu_fence_emit_polling(ring, &seq);
>>>>      	amdgpu_ring_commit(ring);
>>>>      	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git
>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>> index b0b2bdc750df..bda6a2f37dc0 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);  void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);  void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>>      					uint32_t reg0, uint32_t rreg1,
>>>> -					uint32_t ref, uint32_t mask);
>>>> +					uint32_t ref, uint32_t mask,
>>>> +					uint32_t sem);
>>>>      int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev,
>>>> bool init);  int amdgpu_virt_release_full_gpu(struct amdgpu_device
>>>> *adev, bool init);  int amdgpu_virt_reset_gpu(struct amdgpu_device
>>>> *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> index f25cd97ba5f2..1ae59af7836a 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>>>>      			!adev->in_gpu_reset) {
>>>>      		uint32_t req = hub->vm_inv_eng0_req + eng;
>>>>      		uint32_t ack = hub->vm_inv_eng0_ack + eng;
>>>> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>>>>      
>>>>      		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
>>>> -				1 << vmid);
>>>> +						   1 << vmid, sem);
>>>>      		return;
>>>>      	}
>>>>      
>>>> --
>>>> 2.17.1
>>>>
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: 答复: 答复: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 15:06                                     ` Christian König
  0 siblings, 0 replies; 42+ messages in thread
From: Christian König @ 2019-11-20 15:06 UTC (permalink / raw)
  To: Liu, Monk, Zhu, Changfeng, Xiao, Jack, Zhou1, Tao, Huang, Ray,
	Huang, Shimmer, amd-gfx

Am 20.11.19 um 15:59 schrieb Liu, Monk:
>>> the KIQ is used to invalidate both the GFXHUB as well as the MMHUB on Vega.
> I know,
>
>> +	/* a read return value of 1 means semaphore acuqire */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
> But ring->funcs->vmhub wil always be AMDGPU_GFXHUB, right ? since this ring is from "&kiq->ring" ?

Ah! Good catch, that is indeed incorrect.

Christian.

>
>
>>> Yes, agree. But since we now knew that we won't need that we can just drop this patch altogether.
> Yeah, the semaphore wrapping is in PATCH 2/2, agree that this PATCH 1/2 could be dropped
>
>
> -----邮件原件-----
> 发件人: Christian König <ckoenig.leichtzumerken@gmail.com>
> 发送时间: 2019年11月20日 22:39
> 收件人: Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng <Changfeng.Zhu@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
> 主题: Re: 答复: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
>
> Hi Monk,
>
> the KIQ is used to invalidate both the GFXHUB as well as the MMHUB on Vega.
>
>> Besides, amdgpu_virt_kiq_reg_write_reg_wait() is not deadly a helper
>> function that only serve VM invalidate, so I don't think You should
>> put the semaphore read/write in this routine, instead you can put
>> semaphore r/w out side of this routine and only Put them around the VM
>> invalidate logic
> Yes, agree. But since we now knew that we won't need that we can just drop this patch altogether.
>
> Regards,
> Christian.
>
> Am 20.11.19 um 15:30 schrieb Liu, Monk:
>> Thanks for sharing this JIR
>>
>> now I got the picture of this issue from you and Christian.
>>
>> So the semaphore grabbing can prevent RTL to power off the MMHUB, I
>> see
>>
>> The practice is that SRIOV won't enable PG at all (even our GIM driver
>> won't enable PG, maybe in future we would enable it )
>>
>> I think I don't have too many concern about your patches,
>>
>> But I have comments on your patch 1:
>>
>> void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    					uint32_t reg0, uint32_t reg1,
>> -					uint32_t ref, uint32_t mask)
>> +					uint32_t ref, uint32_t mask,
>> +					uint32_t sem)
>>    {
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    	uint32_t seq;
>>    
>>    	spin_lock_irqsave(&kiq->ring_lock, flags);
>> -	amdgpu_ring_alloc(ring, 32);
>> +	amdgpu_ring_alloc(ring, 60);
>> +
>> +	/*
>> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
>> +	 * off cycle, add semaphore acquire before invalidation and semaphore
>> +	 * release after invalidation to avoid entering power gated state
>> +	 * to WA the Issue
>> +	 */
>> +
>> +	/* a read return value of 1 means semaphore acuqire */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>>
>>
>> See that in this routine, the ring is always KIQ, so below code looks redundant :
>>
>> +	/* a read return value of 1 means semaphore acuqire */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>>
>> Besides, amdgpu_virt_kiq_reg_write_reg_wait() is not deadly a helper
>> function that only serve VM invalidate, so I don't think You should
>> put the semaphore read/write in this routine, instead you can put
>> semaphore r/w out side of this routine and only Put them around the VM
>> invalidate logic
>>
>> Thanks
>>
>> -----邮件原件-----
>> 发件人: Zhu, Changfeng <Changfeng.Zhu@amd.com>
>> 发送时间: 2019年11月20日 22:17
>> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Liu, Monk
>> <Monk.Liu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao
>> <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer
>> <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
>> 主题: RE: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore
>> workaround in amdgpu_virt
>>
>>>>> Did Changfeng already hit this issue under SRIOV ???
>> I meet this problem on navi14 under gmc_v10_0_emit_flush_gpu_tlb .
>> The problem is also seen by Zhou,Tao.
>>
>> And this is ticket:
>> http://ontrack-internal.amd.com/browse/SWDEV-201459
>>
>> After the semaphore patch, the problem can be fixed.
>>
>> If SROV has concern about this problem,  it should not add semaphore in SROV.
>>
>> However, we should apply semaphore for gmc_v9_0_flush_gpu_tlb/
>> gmc_v9_0_emit_flush_gpu_tlb/ gmc_v10_0_flush_gpu_tlb/
>> gmc_v10_0_emit_flush_gpu_tlb
>>
>> Or how can we handle the ticket above?
>>
>> BR,
>> Changfeng.
>>
>> -----Original Message-----
>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>> Sent: Wednesday, November 20, 2019 10:00 PM
>> To: Liu, Monk <Monk.Liu@amd.com>; Koenig, Christian
>> <Christian.Koenig@amd.com>; Zhu, Changfeng <Changfeng.Zhu@amd.com>;
>> Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang,
>> Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>;
>> amd-gfx@lists.freedesktop.org
>> Subject: Re: 答复: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore
>> workaround in amdgpu_virt
>>
>>> Did Changfeng already hit this issue under SRIOV ?
>> I don't think so, but Changfeng needs to answer this.
>>
>> Question is does the extra semaphore acquire has some negative effect on SRIOV?
>>
>> I would like to avoid having even more SRIOV specific handling in here which we can't really test on bare metal.
>>
>> Christian.
>>
>> Am 20.11.19 um 14:54 schrieb Liu, Monk:
>>> Hah, but in SRIOV case, our guest KMD driver is not allowed to do
>>> such things .... (and even there is a bug that KMD try to power gate,
>>> the SMU firmware would not really do the jobs since We have PSP L1
>>> policy to prevent those danger operations )
>>>
>>> Did Changfeng already hit this issue under SRIOV ???
>>>
>>> -----邮件原件-----
>>> 发件人: Koenig, Christian <Christian.Koenig@amd.com>
>>> 发送时间: 2019年11月20日 21:21
>>> 收件人: Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng
>>> <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao
>>> <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer
>>> <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
>>> 主题: Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore
>>> workaround in amdgpu_virt
>>>
>>> Hi Monk,
>>>
>>> this is a fix for power gating the MMHUB.
>>>
>>> Basic problem is that the MMHUB can power gate while an invalidation is in progress which looses all bits in the ACK register and so deadlocks the engine waiting for the invalidation to finish.
>>>
>>> This bug is hit immediately when we enable power gating of the MMHUB.
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 20.11.19 um 14:18 schrieb Liu, Monk:
>>>> Hi Changfeng
>>>>
>>>> Firs of all, there is no power-gating off circle involved in AMDGPU
>>>> SRIOV, since we don't allow VF/VM do such things so I do feel
>>>> strange why you post something like this Especially on VEGA10
>>>> serials which looks doesn't have any issue on those gpu_flush part
>>>>
>>>> Here is my questions for you:
>>>> 1) Can you point me what issue had you been experienced ? and how to
>>>> repro the bug
>>>> 2) if you do hit some issues, did you verified that your patch can fix it ?
>>>>
>>>> besides
>>>>
>>>> /Monk
>>>>
>>>> -----邮件原件-----
>>>> 发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> 代表
>>>> Changfeng.Zhu
>>>> 发送时间: 2019年11月20日 17:14
>>>> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack
>>>> <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray
>>>> <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>;
>>>> amd-gfx@lists.freedesktop.org
>>>> 抄送: Zhu, Changfeng <Changfeng.Zhu@amd.com>
>>>> 主题: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in
>>>> amdgpu_virt
>>>>
>>>> From: changzhu <Changfeng.Zhu@amd.com>
>>>>
>>>> It may lose gpuvm invalidate acknowldege state across power-gating off cycle. To avoid this issue in virt invalidation, add semaphore acquire before invalidation and semaphore release after invalidation.
>>>>
>>>> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
>>>> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
>>>> ---
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>>>>      drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>>>>      3 files changed, 28 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>>> index f04eb1a64271..70ffaf91cd12 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>>> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device
>>>> *adev, uint32_t reg, uint32_t v)
>>>>      
>>>>      void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>>      					uint32_t reg0, uint32_t reg1,
>>>> -					uint32_t ref, uint32_t mask)
>>>> +					uint32_t ref, uint32_t mask,
>>>> +					uint32_t sem)
>>>>      {
>>>>      	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>      	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>>      	uint32_t seq;
>>>>      
>>>>      	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>> -	amdgpu_ring_alloc(ring, 32);
>>>> +	amdgpu_ring_alloc(ring, 60);
>>>> +
>>>> +	/*
>>>> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
>>>> +	 * off cycle, add semaphore acquire before invalidation and semaphore
>>>> +	 * release after invalidation to avoid entering power gated state
>>>> +	 * to WA the Issue
>>>> +	 */
>>>> +
>>>> +	/* a read return value of 1 means semaphore acuqire */
>>>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>>>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>>>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>>>> +
>>>>      	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>>>>      					    ref, mask);
>>>> +	/*
>>>> +	 * add semaphore release after invalidation,
>>>> +	 * write with 0 means semaphore release
>>>> +	 */
>>>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>>>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>>>> +	amdgpu_ring_emit_wreg(ring, sem, 0);
>>>> +
>>>>      	amdgpu_fence_emit_polling(ring, &seq);
>>>>      	amdgpu_ring_commit(ring);
>>>>      	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git
>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>> index b0b2bdc750df..bda6a2f37dc0 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);  void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);  void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>>      					uint32_t reg0, uint32_t rreg1,
>>>> -					uint32_t ref, uint32_t mask);
>>>> +					uint32_t ref, uint32_t mask,
>>>> +					uint32_t sem);
>>>>      int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev,
>>>> bool init);  int amdgpu_virt_release_full_gpu(struct amdgpu_device
>>>> *adev, bool init);  int amdgpu_virt_reset_gpu(struct amdgpu_device
>>>> *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> index f25cd97ba5f2..1ae59af7836a 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>>>>      			!adev->in_gpu_reset) {
>>>>      		uint32_t req = hub->vm_inv_eng0_req + eng;
>>>>      		uint32_t ack = hub->vm_inv_eng0_ack + eng;
>>>> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>>>>      
>>>>      		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
>>>> -				1 << vmid);
>>>> +						   1 << vmid, sem);
>>>>      		return;
>>>>      	}
>>>>      
>>>> --
>>>> 2.17.1
>>>>
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 15:13                 ` Christian König
  0 siblings, 0 replies; 42+ messages in thread
From: Christian König @ 2019-11-20 15:13 UTC (permalink / raw)
  To: Zeng, Oak, Liu, Monk, Zhu, Changfeng, Xiao, Jack, Zhou1, Tao,
	Huang, Ray, Huang, Shimmer,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Hi Oak,

> [Oak] I am not familiar about the power gating sequence but from first glance, should the power gating sequence make sure that HW is ready (idle) for power gating before put the system to power gating?
The problem is that the hardware is actually idle when gated.

What happens is the following:

1. Ring A sends an invalidate command to VM invalidation engine X.

2. VM invalidation engine X wakes up and is ungated because it now has work.

3. VM invalidation engine X finishes the invalidation and is power
gated again.

4. Now ring A polls for the invalidation on engine X to complete, but
since the engine has been gated again it has lost the ACK state that
records the finished invalidation. BAM! Ring A will poll forever.

Regards,
Christian.

Am 20.11.19 um 16:04 schrieb Zeng, Oak:
> See an inline comment
>
> Regards,
> Oak
>
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Christian König
> Sent: Wednesday, November 20, 2019 8:21 AM
> To: Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
>
> Hi Monk,
>
> this is a fix for power gating the MMHUB.
>
> Basic problem is that the MMHUB can power gate while an invalidation is in progress
> [Oak] I am not familiar about the power gating sequence but from first glance, should the power gating sequence make sure that HW is ready (idle) for power gating before put the system to power gating? E.g., before we put the system to power gating, should we enquiry each HW blocks to see whether the HW is idle? If not (like the case you mentioned some invalidation activities is still ongoing) the power gating condition is not mature and we should we wait. Or if the power gating is trigger/initiated by HW (I am not sure), HW should guarantee it is idle?
>
>   which looses all bits in the ACK register and so deadlocks the engine waiting for the invalidation to finish.
>
> This bug is hit immediately when we enable power gating of the MMHUB.
>
> Regards,
> Christian.
>
> Am 20.11.19 um 14:18 schrieb Liu, Monk:
>> Hi Changfeng
>>
>> Firs of all, there is no power-gating off circle involved in AMDGPU
>> SRIOV, since we don't allow VF/VM do such things so I do feel strange
>> why you post something like this Especially on VEGA10 serials which
>> looks doesn't have any issue on those gpu_flush part
>>
>> Here is my questions for you:
>> 1) Can you point me what issue had you been experienced ? and how to
>> repro the bug
>> 2) if you do hit some issues, did you verified that your patch can fix it ?
>>
>> besides
>>
>> /Monk
>>
>> -----邮件原件-----
>> 发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> 代表 Changfeng.Zhu
>> 发送时间: 2019年11月20日 17:14
>> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack
>> <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray
>> <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>;
>> amd-gfx@lists.freedesktop.org
>> 抄送: Zhu, Changfeng <Changfeng.Zhu@amd.com>
>> 主题: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in
>> amdgpu_virt
>>
>> From: changzhu <Changfeng.Zhu@amd.com>
>>
>> It may lose gpuvm invalidate acknowldege state across power-gating off cycle. To avoid this issue in virt invalidation, add semaphore acquire before invalidation and semaphore release after invalidation.
>>
>> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
>> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>>    3 files changed, 28 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> index f04eb1a64271..70ffaf91cd12 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device
>> *adev, uint32_t reg, uint32_t v)
>>    
>>    void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    					uint32_t reg0, uint32_t reg1,
>> -					uint32_t ref, uint32_t mask)
>> +					uint32_t ref, uint32_t mask,
>> +					uint32_t sem)
>>    {
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    	uint32_t seq;
>>    
>>    	spin_lock_irqsave(&kiq->ring_lock, flags);
>> -	amdgpu_ring_alloc(ring, 32);
>> +	amdgpu_ring_alloc(ring, 60);
>> +
>> +	/*
>> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
>> +	 * off cycle, add semaphore acquire before invalidation and semaphore
>> +	 * release after invalidation to avoid entering power gated state
>> +	 * to WA the Issue
>> +	 */
>> +
>> +	/* a read return value of 1 means semaphore acuqire */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>> +
>>    	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>>    					    ref, mask);
>> +	/*
>> +	 * add semaphore release after invalidation,
>> +	 * write with 0 means semaphore release
>> +	 */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_wreg(ring, sem, 0);
>> +
>>    	amdgpu_fence_emit_polling(ring, &seq);
>>    	amdgpu_ring_commit(ring);
>>    	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> index b0b2bdc750df..bda6a2f37dc0 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);  void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);  void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    					uint32_t reg0, uint32_t rreg1,
>> -					uint32_t ref, uint32_t mask);
>> +					uint32_t ref, uint32_t mask,
>> +					uint32_t sem);
>>    int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool
>> init);  int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev,
>> bool init);  int amdgpu_virt_reset_gpu(struct amdgpu_device *adev);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index f25cd97ba5f2..1ae59af7836a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>>    			!adev->in_gpu_reset) {
>>    		uint32_t req = hub->vm_inv_eng0_req + eng;
>>    		uint32_t ack = hub->vm_inv_eng0_ack + eng;
>> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>>    
>>    		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
>> -				1 << vmid);
>> +						   1 << vmid, sem);
>>    		return;
>>    	}
>>    
>> --
>> 2.17.1
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 15:13                 ` Christian König
  0 siblings, 0 replies; 42+ messages in thread
From: Christian König @ 2019-11-20 15:13 UTC (permalink / raw)
  To: Zeng, Oak, Liu, Monk, Zhu, Changfeng, Xiao, Jack, Zhou1, Tao,
	Huang, Ray, Huang, Shimmer, amd-gfx

Hi Oak,

> [Oak] I am not familiar about the power gating sequence but from first glance, should the power gating sequence make sure that HW is ready (idle) for power gating before put the system to power gating?
The problem is that the hardware is actually idle when gated.

What happens is the following:

1. Ring A sends an invalidate command to VM invalidation engine X.

2. VM invalidation engine X wakes up and is ungated because it now has work.

3. VM invalidation engine X finishes the invalidation and is gated 
again.

4. Now ring A polls for the invalidation on engine X to complete, but 
since engine X has been gated again, it has forgotten that it already 
finished that invalidation. BAM! Ring A will poll forever.

Regards,
Christian.

Am 20.11.19 um 16:04 schrieb Zeng, Oak:
> See an inline comment
>
> Regards,
> Oak
>
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Christian König
> Sent: Wednesday, November 20, 2019 8:21 AM
> To: Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
>
> Hi Monk,
>
> this is a fix for power gating the MMHUB.
>
> Basic problem is that the MMHUB can power gate while an invalidation is in progress
> [Oak] I am not familiar about the power gating sequence but from first glance, should the power gating sequence make sure that HW is ready (idle) for power gating before put the system to power gating? E.g., before we put the system to power gating, should we enquiry each HW blocks to see whether the HW is idle? If not (like the case you mentioned some invalidation activities is still ongoing) the power gating condition is not mature and we should we wait. Or if the power gating is trigger/initiated by HW (I am not sure), HW should guarantee it is idle?
>
>   which looses all bits in the ACK register and so deadlocks the engine waiting for the invalidation to finish.
>
> This bug is hit immediately when we enable power gating of the MMHUB.
>
> Regards,
> Christian.
>
> Am 20.11.19 um 14:18 schrieb Liu, Monk:
>> Hi Changfeng
>>
>> Firs of all, there is no power-gating off circle involved in AMDGPU
>> SRIOV, since we don't allow VF/VM do such things so I do feel strange
>> why you post something like this Especially on VEGA10 serials which
>> looks doesn't have any issue on those gpu_flush part
>>
>> Here is my questions for you:
>> 1) Can you point me what issue had you been experienced ? and how to
>> repro the bug
>> 2) if you do hit some issues, did you verified that your patch can fix it ?
>>
>> besides
>>
>> /Monk
>>
>> -----邮件原件-----
>> 发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> 代表 Changfeng.Zhu
>> 发送时间: 2019年11月20日 17:14
>> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack
>> <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray
>> <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>;
>> amd-gfx@lists.freedesktop.org
>> 抄送: Zhu, Changfeng <Changfeng.Zhu@amd.com>
>> 主题: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in
>> amdgpu_virt
>>
>> From: changzhu <Changfeng.Zhu@amd.com>
>>
>> It may lose gpuvm invalidate acknowldege state across power-gating off cycle. To avoid this issue in virt invalidation, add semaphore acquire before invalidation and semaphore release after invalidation.
>>
>> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
>> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>>    3 files changed, 28 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> index f04eb1a64271..70ffaf91cd12 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device
>> *adev, uint32_t reg, uint32_t v)
>>    
>>    void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    					uint32_t reg0, uint32_t reg1,
>> -					uint32_t ref, uint32_t mask)
>> +					uint32_t ref, uint32_t mask,
>> +					uint32_t sem)
>>    {
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    	uint32_t seq;
>>    
>>    	spin_lock_irqsave(&kiq->ring_lock, flags);
>> -	amdgpu_ring_alloc(ring, 32);
>> +	amdgpu_ring_alloc(ring, 60);
>> +
>> +	/*
>> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
>> +	 * off cycle, add semaphore acquire before invalidation and semaphore
>> +	 * release after invalidation to avoid entering power gated state
>> +	 * to WA the Issue
>> +	 */
>> +
>> +	/* a read return value of 1 means semaphore acuqire */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>> +
>>    	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>>    					    ref, mask);
>> +	/*
>> +	 * add semaphore release after invalidation,
>> +	 * write with 0 means semaphore release
>> +	 */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_wreg(ring, sem, 0);
>> +
>>    	amdgpu_fence_emit_polling(ring, &seq);
>>    	amdgpu_ring_commit(ring);
>>    	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> index b0b2bdc750df..bda6a2f37dc0 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);  void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);  void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    					uint32_t reg0, uint32_t rreg1,
>> -					uint32_t ref, uint32_t mask);
>> +					uint32_t ref, uint32_t mask,
>> +					uint32_t sem);
>>    int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool
>> init);  int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev,
>> bool init);  int amdgpu_virt_reset_gpu(struct amdgpu_device *adev);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index f25cd97ba5f2..1ae59af7836a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>>    			!adev->in_gpu_reset) {
>>    		uint32_t req = hub->vm_inv_eng0_req + eng;
>>    		uint32_t ack = hub->vm_inv_eng0_ack + eng;
>> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>>    
>>    		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
>> -				1 << vmid);
>> +						   1 << vmid, sem);
>>    		return;
>>    	}
>>    
>> --
>> 2.17.1
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* RE: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 15:38                     ` Zeng, Oak
  0 siblings, 0 replies; 42+ messages in thread
From: Zeng, Oak @ 2019-11-20 15:38 UTC (permalink / raw)
  To: Koenig, Christian, Liu, Monk, Zhu, Changfeng, Xiao, Jack, Zhou1,
	Tao, Huang, Ray, Huang, Shimmer,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Thank you, Christian. Maybe in the future we can make the invalidation ack interrupt-based instead of polling-based.

Regards,
Oak

-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com> 
Sent: Wednesday, November 20, 2019 10:14 AM
To: Zeng, Oak <Oak.Zeng@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt

Hi Oak,

> [Oak] I am not familiar about the power gating sequence but from first glance, should the power gating sequence make sure that HW is ready (idle) for power gating before put the system to power gating?
The problem is that the hardware is actually idle when gated.

See what happens is the following:

1. Ring A sends an invalidate command to VM invalidation engine X.

2. VM invalidation engine X wakes up and is ungated because it now has work.

3. VM invalidation engine X finishes the invalidation and goes back to be gated again.

4. Now ring A polls for the invalidation on engine X to complete, but since it got back to be gated again it has forgotten that we have finished that invalidation. BAM! Ring A will poll forever.

Regards,
Christian.

Am 20.11.19 um 16:04 schrieb Zeng, Oak:
> See an inline comment
>
> Regards,
> Oak
>
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of 
> Christian König
> Sent: Wednesday, November 20, 2019 8:21 AM
> To: Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng 
> <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao 
> <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer 
> <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore 
> workaround in amdgpu_virt
>
> Hi Monk,
>
> this is a fix for power gating the MMHUB.
>
> Basic problem is that the MMHUB can power gate while an invalidation 
> is in progress [Oak] I am not familiar about the power gating sequence but from first glance, should the power gating sequence make sure that HW is ready (idle) for power gating before put the system to power gating? E.g., before we put the system to power gating, should we enquiry each HW blocks to see whether the HW is idle? If not (like the case you mentioned some invalidation activities is still ongoing) the power gating condition is not mature and we should we wait. Or if the power gating is trigger/initiated by HW (I am not sure), HW should guarantee it is idle?
>
>   which looses all bits in the ACK register and so deadlocks the engine waiting for the invalidation to finish.
>
> This bug is hit immediately when we enable power gating of the MMHUB.
>
> Regards,
> Christian.
>
> Am 20.11.19 um 14:18 schrieb Liu, Monk:
>> Hi Changfeng
>>
>> Firs of all, there is no power-gating off circle involved in AMDGPU 
>> SRIOV, since we don't allow VF/VM do such things so I do feel strange 
>> why you post something like this Especially on VEGA10 serials which 
>> looks doesn't have any issue on those gpu_flush part
>>
>> Here is my questions for you:
>> 1) Can you point me what issue had you been experienced ? and how to 
>> repro the bug
>> 2) if you do hit some issues, did you verified that your patch can fix it ?
>>
>> besides
>>
>> /Monk
>>
>> -----邮件原件-----
>> 发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> 代表 Changfeng.Zhu
>> 发送时间: 2019年11月20日 17:14
>> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack 
>> <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray 
>> <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; 
>> amd-gfx@lists.freedesktop.org
>> 抄送: Zhu, Changfeng <Changfeng.Zhu@amd.com>
>> 主题: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in 
>> amdgpu_virt
>>
>> From: changzhu <Changfeng.Zhu@amd.com>
>>
>> It may lose gpuvm invalidate acknowldege state across power-gating off cycle. To avoid this issue in virt invalidation, add semaphore acquire before invalidation and semaphore release after invalidation.
>>
>> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
>> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>>    3 files changed, 28 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> index f04eb1a64271..70ffaf91cd12 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device 
>> *adev, uint32_t reg, uint32_t v)
>>    
>>    void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    					uint32_t reg0, uint32_t reg1,
>> -					uint32_t ref, uint32_t mask)
>> +					uint32_t ref, uint32_t mask,
>> +					uint32_t sem)
>>    {
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    	uint32_t seq;
>>    
>>    	spin_lock_irqsave(&kiq->ring_lock, flags);
>> -	amdgpu_ring_alloc(ring, 32);
>> +	amdgpu_ring_alloc(ring, 60);
>> +
>> +	/*
>> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
>> +	 * off cycle, add semaphore acquire before invalidation and semaphore
>> +	 * release after invalidation to avoid entering power gated state
>> +	 * to WA the Issue
>> +	 */
>> +
>> +	/* a read return value of 1 means semaphore acuqire */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>> +
>>    	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>>    					    ref, mask);
>> +	/*
>> +	 * add semaphore release after invalidation,
>> +	 * write with 0 means semaphore release
>> +	 */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_wreg(ring, sem, 0);
>> +
>>    	amdgpu_fence_emit_polling(ring, &seq);
>>    	amdgpu_ring_commit(ring);
>>    	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git 
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> index b0b2bdc750df..bda6a2f37dc0 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);  void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);  void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    					uint32_t reg0, uint32_t rreg1,
>> -					uint32_t ref, uint32_t mask);
>> +					uint32_t ref, uint32_t mask,
>> +					uint32_t sem);
>>    int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool 
>> init);  int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, 
>> bool init);  int amdgpu_virt_reset_gpu(struct amdgpu_device *adev); 
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index f25cd97ba5f2..1ae59af7836a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>>    			!adev->in_gpu_reset) {
>>    		uint32_t req = hub->vm_inv_eng0_req + eng;
>>    		uint32_t ack = hub->vm_inv_eng0_ack + eng;
>> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>>    
>>    		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
>> -				1 << vmid);
>> +						   1 << vmid, sem);
>>    		return;
>>    	}
>>    
>> --
>> 2.17.1
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* RE: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 15:38                     ` Zeng, Oak
  0 siblings, 0 replies; 42+ messages in thread
From: Zeng, Oak @ 2019-11-20 15:38 UTC (permalink / raw)
  To: Koenig, Christian, Liu, Monk, Zhu, Changfeng, Xiao, Jack, Zhou1,
	Tao, Huang, Ray, Huang, Shimmer, amd-gfx

Thank you, Christian. Maybe in the future we can make the invalidation ack interrupt-based instead of polling-based.

Regards,
Oak

-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com> 
Sent: Wednesday, November 20, 2019 10:14 AM
To: Zeng, Oak <Oak.Zeng@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt

Hi Oak,

> [Oak] I am not familiar about the power gating sequence but from first glance, should the power gating sequence make sure that HW is ready (idle) for power gating before put the system to power gating?
The problem is that the hardware is actually idle when gated.

See what happens is the following:

1. Ring A sends an invalidate command to VM invalidation engine X.

2. VM invalidation engine X wakes up and is ungated because it now has work.

3. VM invalidation engine X finishes the invalidation and goes back to be gated again.

4. Now ring A polls for the invalidation on engine X to complete, but since it got back to be gated again it has forgotten that we have finished that invalidation. BAM! Ring A will poll forever.

Regards,
Christian.

Am 20.11.19 um 16:04 schrieb Zeng, Oak:
> See an inline comment
>
> Regards,
> Oak
>
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of 
> Christian König
> Sent: Wednesday, November 20, 2019 8:21 AM
> To: Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng 
> <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao 
> <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer 
> <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore 
> workaround in amdgpu_virt
>
> Hi Monk,
>
> this is a fix for power gating the MMHUB.
>
> Basic problem is that the MMHUB can power gate while an invalidation 
> is in progress [Oak] I am not familiar about the power gating sequence but from first glance, should the power gating sequence make sure that HW is ready (idle) for power gating before put the system to power gating? E.g., before we put the system to power gating, should we enquiry each HW blocks to see whether the HW is idle? If not (like the case you mentioned some invalidation activities is still ongoing) the power gating condition is not mature and we should we wait. Or if the power gating is trigger/initiated by HW (I am not sure), HW should guarantee it is idle?
>
>   which looses all bits in the ACK register and so deadlocks the engine waiting for the invalidation to finish.
>
> This bug is hit immediately when we enable power gating of the MMHUB.
>
> Regards,
> Christian.
>
> Am 20.11.19 um 14:18 schrieb Liu, Monk:
>> Hi Changfeng
>>
>> Firs of all, there is no power-gating off circle involved in AMDGPU 
>> SRIOV, since we don't allow VF/VM do such things so I do feel strange 
>> why you post something like this Especially on VEGA10 serials which 
>> looks doesn't have any issue on those gpu_flush part
>>
>> Here is my questions for you:
>> 1) Can you point me what issue had you been experienced ? and how to 
>> repro the bug
>> 2) if you do hit some issues, did you verified that your patch can fix it ?
>>
>> besides
>>
>> /Monk
>>
>> -----邮件原件-----
>> 发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> 代表 Changfeng.Zhu
>> 发送时间: 2019年11月20日 17:14
>> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack 
>> <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray 
>> <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; 
>> amd-gfx@lists.freedesktop.org
>> 抄送: Zhu, Changfeng <Changfeng.Zhu@amd.com>
>> 主题: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in 
>> amdgpu_virt
>>
>> From: changzhu <Changfeng.Zhu@amd.com>
>>
>> It may lose gpuvm invalidate acknowldege state across power-gating off cycle. To avoid this issue in virt invalidation, add semaphore acquire before invalidation and semaphore release after invalidation.
>>
>> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
>> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>>    3 files changed, 28 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> index f04eb1a64271..70ffaf91cd12 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device 
>> *adev, uint32_t reg, uint32_t v)
>>    
>>    void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    					uint32_t reg0, uint32_t reg1,
>> -					uint32_t ref, uint32_t mask)
>> +					uint32_t ref, uint32_t mask,
>> +					uint32_t sem)
>>    {
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    	uint32_t seq;
>>    
>>    	spin_lock_irqsave(&kiq->ring_lock, flags);
>> -	amdgpu_ring_alloc(ring, 32);
>> +	amdgpu_ring_alloc(ring, 60);
>> +
>> +	/*
>> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
>> +	 * off cycle, add semaphore acquire before invalidation and semaphore
>> +	 * release after invalidation to avoid entering power gated state
>> +	 * to WA the Issue
>> +	 */
>> +
>> +	/* a read return value of 1 means semaphore acuqire */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>> +
>>    	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>>    					    ref, mask);
>> +	/*
>> +	 * add semaphore release after invalidation,
>> +	 * write with 0 means semaphore release
>> +	 */
>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>> +	amdgpu_ring_emit_wreg(ring, sem, 0);
>> +
>>    	amdgpu_fence_emit_polling(ring, &seq);
>>    	amdgpu_ring_commit(ring);
>>    	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git 
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> index b0b2bdc750df..bda6a2f37dc0 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);  void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);  void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    					uint32_t reg0, uint32_t rreg1,
>> -					uint32_t ref, uint32_t mask);
>> +					uint32_t ref, uint32_t mask,
>> +					uint32_t sem);
>>    int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool 
>> init);  int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, 
>> bool init);  int amdgpu_virt_reset_gpu(struct amdgpu_device *adev); 
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index f25cd97ba5f2..1ae59af7836a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>>    			!adev->in_gpu_reset) {
>>    		uint32_t req = hub->vm_inv_eng0_req + eng;
>>    		uint32_t ack = hub->vm_inv_eng0_ack + eng;
>> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>>    
>>    		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
>> -				1 << vmid);
>> +						   1 << vmid, sem);
>>    		return;
>>    	}
>>    
>> --
>> 2.17.1
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 17:17                         ` Christian König
  0 siblings, 0 replies; 42+ messages in thread
From: Christian König @ 2019-11-20 17:17 UTC (permalink / raw)
  To: Zeng, Oak, Koenig, Christian, Liu, Monk, Zhu, Changfeng, Xiao,
	Jack, Zhou1, Tao, Huang, Ray, Huang, Shimmer,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

The hardware doesn't support inter-engine interrupts and, as far as I 
know, there are no plans for this.

Polling is perfectly fine in this case; it just doesn't interact well 
with power gating.

Christian.

Am 20.11.19 um 16:38 schrieb Zeng, Oak:
> Thank you Christian. Maybe in the future we can make the invalidation ack to be interrupt based instead of polling.
>
> Regards,
> Oak
>
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Wednesday, November 20, 2019 10:14 AM
> To: Zeng, Oak <Oak.Zeng@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
>
> Hi Oak,
>
>> [Oak] I am not familiar about the power gating sequence but from first glance, should the power gating sequence make sure that HW is ready (idle) for power gating before put the system to power gating?
> The problem is that the hardware is actually idle when gated.
>
> See what happens is the following:
>
> 1. Ring A sends an invalidate command to VM invalidation engine X.
>
> 2. VM invalidation engine X wakes up and is ungated because it now has work.
>
> 3. VM invalidation engine X finishes the invalidation and goes back to be gated again.
>
> 4. Now ring A polls for the invalidation on engine X to complete, but since it got back to be gated again it has forgotten that we have finished that invalidation. BAM! Ring A will poll forever.
>
> Regards,
> Christian.
>
> Am 20.11.19 um 16:04 schrieb Zeng, Oak:
>> See an inline comment
>>
>> Regards,
>> Oak
>>
>> -----Original Message-----
>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>> Christian König
>> Sent: Wednesday, November 20, 2019 8:21 AM
>> To: Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng
>> <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao
>> <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer
>> <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore
>> workaround in amdgpu_virt
>>
>> Hi Monk,
>>
>> this is a fix for power gating the MMHUB.
>>
>> Basic problem is that the MMHUB can power gate while an invalidation
>> is in progress [Oak] I am not familiar about the power gating sequence but from first glance, should the power gating sequence make sure that HW is ready (idle) for power gating before put the system to power gating? E.g., before we put the system to power gating, should we enquiry each HW blocks to see whether the HW is idle? If not (like the case you mentioned some invalidation activities is still ongoing) the power gating condition is not mature and we should we wait. Or if the power gating is trigger/initiated by HW (I am not sure), HW should guarantee it is idle?
>>
>>    which looses all bits in the ACK register and so deadlocks the engine waiting for the invalidation to finish.
>>
>> This bug is hit immediately when we enable power gating of the MMHUB.
>>
>> Regards,
>> Christian.
>>
>> Am 20.11.19 um 14:18 schrieb Liu, Monk:
>>> Hi Changfeng
>>>
>>> Firs of all, there is no power-gating off circle involved in AMDGPU
>>> SRIOV, since we don't allow VF/VM do such things so I do feel strange
>>> why you post something like this Especially on VEGA10 serials which
>>> looks doesn't have any issue on those gpu_flush part
>>>
>>> Here is my questions for you:
>>> 1) Can you point me what issue had you been experienced ? and how to
>>> repro the bug
>>> 2) if you do hit some issues, did you verified that your patch can fix it ?
>>>
>>> besides
>>>
>>> /Monk
>>>
>>> -----邮件原件-----
>>> 发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> 代表 Changfeng.Zhu
>>> 发送时间: 2019年11月20日 17:14
>>> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack
>>> <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray
>>> <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>;
>>> amd-gfx@lists.freedesktop.org
>>> 抄送: Zhu, Changfeng <Changfeng.Zhu@amd.com>
>>> 主题: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in
>>> amdgpu_virt
>>>
>>> From: changzhu <Changfeng.Zhu@amd.com>
>>>
>>> It may lose gpuvm invalidate acknowldege state across power-gating off cycle. To avoid this issue in virt invalidation, add semaphore acquire before invalidation and semaphore release after invalidation.
>>>
>>> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
>>> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
>>> ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>>>     drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>>>     3 files changed, 28 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> index f04eb1a64271..70ffaf91cd12 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device
>>> *adev, uint32_t reg, uint32_t v)
>>>     
>>>     void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>     					uint32_t reg0, uint32_t reg1,
>>> -					uint32_t ref, uint32_t mask)
>>> +					uint32_t ref, uint32_t mask,
>>> +					uint32_t sem)
>>>     {
>>>     	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>     	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>     	uint32_t seq;
>>>     
>>>     	spin_lock_irqsave(&kiq->ring_lock, flags);
>>> -	amdgpu_ring_alloc(ring, 32);
>>> +	amdgpu_ring_alloc(ring, 60);
>>> +
>>> +	/*
>>> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
>>> +	 * off cycle, add semaphore acquire before invalidation and semaphore
>>> +	 * release after invalidation to avoid entering power gated state
>>> +	 * to WA the Issue
>>> +	 */
>>> +
>>> +	/* a read return value of 1 means semaphore acuqire */
>>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>>> +
>>>     	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>>>     					    ref, mask);
>>> +	/*
>>> +	 * add semaphore release after invalidation,
>>> +	 * write with 0 means semaphore release
>>> +	 */
>>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>>> +	amdgpu_ring_emit_wreg(ring, sem, 0);
>>> +
>>>     	amdgpu_fence_emit_polling(ring, &seq);
>>>     	amdgpu_ring_commit(ring);
>>>     	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git
>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> index b0b2bdc750df..bda6a2f37dc0 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);  void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);  void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>     					uint32_t reg0, uint32_t rreg1,
>>> -					uint32_t ref, uint32_t mask);
>>> +					uint32_t ref, uint32_t mask,
>>> +					uint32_t sem);
>>>     int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool
>>> init);  int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev,
>>> bool init);  int amdgpu_virt_reset_gpu(struct amdgpu_device *adev);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> index f25cd97ba5f2..1ae59af7836a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>>>     			!adev->in_gpu_reset) {
>>>     		uint32_t req = hub->vm_inv_eng0_req + eng;
>>>     		uint32_t ack = hub->vm_inv_eng0_ack + eng;
>>> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>>>     
>>>     		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
>>> -				1 << vmid);
>>> +						   1 << vmid, sem);
>>>     		return;
>>>     	}
>>>     
>>> --
>>> 2.17.1
>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
@ 2019-11-20 17:17                         ` Christian König
  0 siblings, 0 replies; 42+ messages in thread
From: Christian König @ 2019-11-20 17:17 UTC (permalink / raw)
  To: Zeng, Oak, Koenig, Christian, Liu, Monk, Zhu, Changfeng, Xiao,
	Jack, Zhou1, Tao, Huang, Ray, Huang, Shimmer, amd-gfx

The hardware doesn't support inter engine interrupts and as far as I 
know there are no plans for this.

Polling is perfectly fine in this case; it just doesn't interact well 
with power gating.

Christian.

Am 20.11.19 um 16:38 schrieb Zeng, Oak:
> Thank you, Christian. Maybe in the future we can make the invalidation ack interrupt-based instead of polling-based.
>
> Regards,
> Oak
>
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Wednesday, November 20, 2019 10:14 AM
> To: Zeng, Oak <Oak.Zeng@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt
>
> Hi Oak,
>
>> [Oak] I am not familiar with the power gating sequence, but at first glance, should the power gating sequence make sure that the HW is ready (idle) for power gating before putting the system into power gating?
> The problem is that the hardware is actually idle when gated.
>
> See what happens is the following:
>
> 1. Ring A sends an invalidate command to VM invalidation engine X.
>
> 2. VM invalidation engine X wakes up and is ungated because it now has work.
>
> 3. VM invalidation engine X finishes the invalidation and goes back to being gated.
>
> 4. Now ring A polls for the invalidation on engine X to complete, but since engine X has been gated again it has forgotten that it finished that invalidation. BAM! Ring A will poll forever.
>
> Regards,
> Christian.
>
> Am 20.11.19 um 16:04 schrieb Zeng, Oak:
>> See an inline comment
>>
>> Regards,
>> Oak
>>
>> -----Original Message-----
>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>> Christian König
>> Sent: Wednesday, November 20, 2019 8:21 AM
>> To: Liu, Monk <Monk.Liu@amd.com>; Zhu, Changfeng
>> <Changfeng.Zhu@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao
>> <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer
>> <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: 答复: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore
>> workaround in amdgpu_virt
>>
>> Hi Monk,
>>
>> this is a fix for power gating the MMHUB.
>>
>> Basic problem is that the MMHUB can power gate while an invalidation
>> is in progress [Oak] I am not familiar with the power gating sequence, but at first glance, should the power gating sequence make sure that the HW is ready (idle) for power gating before putting the system into power gating? E.g., before we put the system into power gating, should we query each HW block to see whether the HW is idle? If not (as in the case you mentioned, where some invalidation activity is still ongoing), the power gating condition is not met and we should wait. Or, if the power gating is triggered/initiated by HW (I am not sure), should the HW guarantee it is idle?
>>
>>    which loses all bits in the ACK register and so deadlocks the engine waiting for the invalidation to finish.
>>
>> This bug is hit immediately when we enable power gating of the MMHUB.
>>
>> Regards,
>> Christian.
>>
>> Am 20.11.19 um 14:18 schrieb Liu, Monk:
>>> Hi Changfeng
>>>
>>> First of all, there is no power-gating off cycle involved in AMDGPU
>>> SRIOV, since we don't allow a VF/VM to do such things, so I find it strange
>>> that you posted something like this, especially for the VEGA10 series, which
>>> doesn't appear to have any issue in the gpu_flush part.
>>>
>>> Here is my questions for you:
>>> 1) Can you tell me what issue you experienced, and how to
>>> reproduce the bug?
>>> 2) If you did hit an issue, did you verify that your patch fixes it?
>>>
>>> besides
>>>
>>> /Monk
>>>
>>> -----邮件原件-----
>>> 发件人: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> 代表 Changfeng.Zhu
>>> 发送时间: 2019年11月20日 17:14
>>> 收件人: Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack
>>> <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray
>>> <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>;
>>> amd-gfx@lists.freedesktop.org
>>> 抄送: Zhu, Changfeng <Changfeng.Zhu@amd.com>
>>> 主题: [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in
>>> amdgpu_virt
>>>
>>> From: changzhu <Changfeng.Zhu@amd.com>
>>>
>>> It may lose the gpuvm invalidate acknowledge state across a power-gating off cycle. To avoid this issue in virt invalidation, add semaphore acquire before invalidation and semaphore release after invalidation.
>>>
>>> Change-Id: Ie98304e475166b53eed033462d76423b6b0fc25b
>>> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
>>> ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 26 ++++++++++++++++++++++--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  3 ++-
>>>     drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  3 ++-
>>>     3 files changed, 28 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> index f04eb1a64271..70ffaf91cd12 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> @@ -135,7 +135,8 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device
>>> *adev, uint32_t reg, uint32_t v)
>>>     
>>>     void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>     					uint32_t reg0, uint32_t reg1,
>>> -					uint32_t ref, uint32_t mask)
>>> +					uint32_t ref, uint32_t mask,
>>> +					uint32_t sem)
>>>     {
>>>     	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>     	struct amdgpu_ring *ring = &kiq->ring; @@ -144,9 +145,30 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>     	uint32_t seq;
>>>     
>>>     	spin_lock_irqsave(&kiq->ring_lock, flags);
>>> -	amdgpu_ring_alloc(ring, 32);
>>> +	amdgpu_ring_alloc(ring, 60);
>>> +
>>> +	/*
>>> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
>>> +	 * off cycle, add semaphore acquire before invalidation and semaphore
>>> +	 * release after invalidation to avoid entering power gated state
>>> +	 * to WA the Issue
>>> +	 */
>>> +
>>> +	/* a read return value of 1 means semaphore acuqire */
>>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>>> +	amdgpu_ring_emit_reg_wait(ring, sem, 0x1, 0x1);
>>> +
>>>     	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>>>     					    ref, mask);
>>> +	/*
>>> +	 * add semaphore release after invalidation,
>>> +	 * write with 0 means semaphore release
>>> +	 */
>>> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
>>> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
>>> +	amdgpu_ring_emit_wreg(ring, sem, 0);
>>> +
>>>     	amdgpu_fence_emit_polling(ring, &seq);
>>>     	amdgpu_ring_commit(ring);
>>>     	spin_unlock_irqrestore(&kiq->ring_lock, flags); diff --git
>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> index b0b2bdc750df..bda6a2f37dc0 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> @@ -295,7 +295,8 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);  void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);  void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>     					uint32_t reg0, uint32_t rreg1,
>>> -					uint32_t ref, uint32_t mask);
>>> +					uint32_t ref, uint32_t mask,
>>> +					uint32_t sem);
>>>     int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool
>>> init);  int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev,
>>> bool init);  int amdgpu_virt_reset_gpu(struct amdgpu_device *adev);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> index f25cd97ba5f2..1ae59af7836a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> @@ -448,9 +448,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>>>     			!adev->in_gpu_reset) {
>>>     		uint32_t req = hub->vm_inv_eng0_req + eng;
>>>     		uint32_t ack = hub->vm_inv_eng0_ack + eng;
>>> +		uint32_t sem = hub->vm_inv_eng0_sem + eng;
>>>     
>>>     		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, tmp,
>>> -				1 << vmid);
>>> +						   1 << vmid, sem);
>>>     		return;
>>>     	}
>>>     
>>> --
>>> 2.17.1
>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 42+ messages in thread

end of thread, other threads:[~2019-11-20 17:17 UTC | newest]

Thread overview: 42+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-11-20  9:14 [PATCH 1/2] drm/amdgpu: invalidate mmhub semphore workaround in amdgpu_virt Changfeng.Zhu
2019-11-20  9:14 ` Changfeng.Zhu
     [not found] ` <20191120091418.26526-1-changfeng.zhu-5C7GfCeVMHo@public.gmane.org>
2019-11-20 11:23   ` Christian König
2019-11-20 11:23     ` Christian König
     [not found]     ` <657609f7-bbe2-17fe-d3c1-2b16ec8868a1-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2019-11-20 13:24       ` 答复: " Liu, Monk
2019-11-20 13:24         ` Liu, Monk
2019-11-20 13:30       ` Liu, Monk
2019-11-20 13:30         ` Liu, Monk
     [not found]         ` <MN2PR12MB39338596AB3B87681585B369844F0-rweVpJHSKTq/67K4VYF1uAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-11-20 13:36           ` Christian König
2019-11-20 13:36             ` Christian König
2019-11-20 13:18   ` Liu, Monk
2019-11-20 13:18     ` Liu, Monk
     [not found]     ` <MN2PR12MB3933863C8DFCE1F68A44EFFA844F0-rweVpJHSKTq/67K4VYF1uAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-11-20 13:20       ` Christian König
2019-11-20 13:20         ` Christian König
     [not found]         ` <1d7cd15c-f529-93da-c15e-a1fde745e6c5-5C7GfCeVMHo@public.gmane.org>
2019-11-20 13:54           ` 答复: " Liu, Monk
2019-11-20 13:54             ` Liu, Monk
     [not found]             ` <MN2PR12MB39330A0D6169250A0B75700D844F0-rweVpJHSKTq/67K4VYF1uAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-11-20 14:00               ` Christian König
2019-11-20 14:00                 ` Christian König
     [not found]                 ` <2d410370-01d0-c845-1229-529610898604-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2019-11-20 14:16                   ` Zhu, Changfeng
2019-11-20 14:16                     ` Zhu, Changfeng
     [not found]                     ` <MN2PR12MB289616C5DA35AF8AE1799114FD4F0-rweVpJHSKToIQ/pRnFqe/QdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-11-20 14:30                       ` 答复: " Liu, Monk
2019-11-20 14:30                         ` Liu, Monk
     [not found]                         ` <MN2PR12MB3933F8A02E0C57D0092FD2AC844F0-rweVpJHSKTq/67K4VYF1uAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-11-20 14:38                           ` Christian König
2019-11-20 14:38                             ` Christian König
     [not found]                             ` <89d689b2-19dc-6bdf-5ca5-de0a817dcaee-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2019-11-20 14:50                               ` Zhu, Changfeng
2019-11-20 14:50                                 ` Zhu, Changfeng
     [not found]                                 ` <MN2PR12MB28963F6B1C272969646EA1A0FD4F0-rweVpJHSKToIQ/pRnFqe/QdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-11-20 14:54                                   ` Christian König
2019-11-20 14:54                                     ` Christian König
2019-11-20 14:59                               ` 答复: " Liu, Monk
2019-11-20 14:59                                 ` Liu, Monk
     [not found]                                 ` <MN2PR12MB3933FBB4D1F8F3FDCFB39A4E844F0-rweVpJHSKTq/67K4VYF1uAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-11-20 15:06                                   ` Christian König
2019-11-20 15:06                                     ` Christian König
2019-11-20 14:18                   ` Liu, Monk
2019-11-20 14:18                     ` Liu, Monk
2019-11-20 15:04           ` Zeng, Oak
2019-11-20 15:04             ` Zeng, Oak
     [not found]             ` <BL0PR12MB2580D7594496F74418BC57D1804F0-b4cIHhjg/p/XzH18dTCKOgdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-11-20 15:13               ` Christian König
2019-11-20 15:13                 ` Christian König
     [not found]                 ` <4d57d0bb-c10d-4aa3-b95a-06e4aaa92679-5C7GfCeVMHo@public.gmane.org>
2019-11-20 15:38                   ` Zeng, Oak
2019-11-20 15:38                     ` Zeng, Oak
     [not found]                     ` <BL0PR12MB2580062E72CCFCDCD812484E804F0-b4cIHhjg/p/XzH18dTCKOgdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-11-20 17:17                       ` Christian König
2019-11-20 17:17                         ` Christian König

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.