All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 2/2] drm/amdgpu: invalidate mmhub semphore workaround in gmc9/gmc10
@ 2019-11-20  9:44 ` Changfeng.Zhu
  0 siblings, 0 replies; 10+ messages in thread
From: Changfeng.Zhu @ 2019-11-20  9:44 UTC (permalink / raw)
  To: Christian.Koenig-5C7GfCeVMHo, Jack.Xiao-5C7GfCeVMHo,
	Tao.Zhou1-5C7GfCeVMHo, Ray.Huang-5C7GfCeVMHo,
	Xinmei.Huang-5C7GfCeVMHo,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: changzhu

From: changzhu <Changfeng.Zhu@amd.com>

It may lose gpuvm invalidate acknowldege state across power-gating off
cycle. To avoid this issue in gmc9/gmc10 invalidation, add semaphore acquire
before invalidation and semaphore release after invalidation.

After adding semaphore acquire before invalidation, the semaphore
register become read-only if another process try to acquire semaphore.
Then it will not be able to release this semaphore. Then it may cause
deadlock problem. If this deadlock problem happens, it needs a semaphore
firmware fix.

Change-Id: I9942a2f451265c1f1038ccfe2f70042c7c8118af
Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 49 ++++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 49 ++++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/soc15.h     |  4 +--
 3 files changed, 100 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index af2615ba52aa..685d0d5ef31e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -234,6 +234,24 @@ static void gmc_v10_0_flush_vm_hub(struct amdgpu_device *adev, uint32_t vmid,
 	const unsigned eng = 17;
 	unsigned int i;
 
+	spin_lock(&adev->gmc.invalidate_lock);
+	/*
+	 * It may lose gpuvm invalidate acknowldege state across power-gating
+	 * off cycle, add semaphore acquire before invalidation and semaphore
+	 * release after invalidation to avoid entering power gated state
+	 * to WA the Issue
+	 */
+	for (i = 0; i < adev->usec_timeout; i++) {
+		/* a read return value of 1 means semaphore acuqire */
+		tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng);
+		if (tmp & 0x1)
+			break;
+		udelay(1);
+	}
+
+	if (i >= adev->usec_timeout)
+		DRM_ERROR("Timeout waiting for sem acquire in VM flush!\n");
+
 	WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
 
 	/*
@@ -253,6 +271,14 @@ static void gmc_v10_0_flush_vm_hub(struct amdgpu_device *adev, uint32_t vmid,
 		udelay(1);
 	}
 
+	/*
+	 * add semaphore release after invalidation,
+	 * write with 0 means semaphore release
+	 */
+	WREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng, 0);
+
+	spin_unlock(&adev->gmc.invalidate_lock);
+
 	if (i < adev->usec_timeout)
 		return;
 
@@ -338,6 +364,21 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 	uint32_t req = gmc_v10_0_get_invalidate_req(vmid, 0);
 	unsigned eng = ring->vm_inv_eng;
 
+	/*
+	 * It may lose gpuvm invalidate acknowldege state across power-gating
+	 * off cycle, add semaphore acquire before invalidation and semaphore
+	 * release after invalidation to avoid entering power gated state
+	 * to WA the Issue
+	 */
+
+	/* a read return value of 1 means semaphore acuqire */
+	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
+	    ring->funcs->vmhub == AMDGPU_MMHUB_1) {
+		amdgpu_ring_emit_reg_wait(ring,
+					  hub->vm_inv_eng0_sem + eng, 0x1, 0x1);
+		DRM_WARN_ONCE("Adding semaphore may cause deadlock and it needs firmware fix\n");
+	}
+
 	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 + (2 * vmid),
 			      lower_32_bits(pd_addr));
 
@@ -348,6 +389,14 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 					    hub->vm_inv_eng0_ack + eng,
 					    req, 1 << vmid);
 
+	/*
+	 * add semaphore release after invalidation,
+	 * write with 0 means semaphore release
+	 */
+	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
+	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
+		amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_sem + eng, 0);
+
 	return pd_addr;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 1ae59af7836a..c4118cbb0fbe 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -456,6 +456,24 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
 	}
 
 	spin_lock(&adev->gmc.invalidate_lock);
+
+	/*
+	 * It may lose gpuvm invalidate acknowldege state across power-gating
+	 * off cycle, add semaphore acquire before invalidation and semaphore
+	 * release after invalidation to avoid entering power gated state
+	 * to WA the Issue
+	 */
+	for (j = 0; j < adev->usec_timeout; j++) {
+		/* a read return value of 1 means semaphore acuqire */
+		tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng);
+		if (tmp & 0x1)
+			break;
+		udelay(1);
+	}
+
+	if (j >= adev->usec_timeout)
+		DRM_ERROR("Timeout waiting for sem acquire in VM flush!\n");
+
 	WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
 
 	/*
@@ -471,7 +489,15 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
 			break;
 		udelay(1);
 	}
+
+	/*
+	 * add semaphore release after invalidation,
+	 * write with 0 means semaphore release
+	 */
+	WREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng, 0);
+
 	spin_unlock(&adev->gmc.invalidate_lock);
+
 	if (j < adev->usec_timeout)
 		return;
 
@@ -486,6 +512,21 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 	uint32_t req = gmc_v9_0_get_invalidate_req(vmid, 0);
 	unsigned eng = ring->vm_inv_eng;
 
+	/*
+	 * It may lose gpuvm invalidate acknowldege state across power-gating
+	 * off cycle, add semaphore acquire before invalidation and semaphore
+	 * release after invalidation to avoid entering power gated state
+	 * to WA the Issue
+	 */
+
+	/* a read return value of 1 means semaphore acuqire */
+	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
+	    ring->funcs->vmhub == AMDGPU_MMHUB_1) {
+		amdgpu_ring_emit_reg_wait(ring,
+					  hub->vm_inv_eng0_sem + eng, 0x1, 0x1);
+		DRM_WARN_ONCE("Adding semaphore may cause deadlock and it needs firmware fix\n");
+	}
+
 	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 + (2 * vmid),
 			      lower_32_bits(pd_addr));
 
@@ -496,6 +537,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 					    hub->vm_inv_eng0_ack + eng,
 					    req, 1 << vmid);
 
+	/*
+	 * add semaphore release after invalidation,
+	 * write with 0 means semaphore release
+	 */
+	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
+	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
+		amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_sem + eng, 0);
+
 	return pd_addr;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.h b/drivers/gpu/drm/amd/amdgpu/soc15.h
index 9af6c6ffbfa2..57af489a5de3 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.h
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.h
@@ -28,8 +28,8 @@
 #include "nbio_v7_0.h"
 #include "nbio_v7_4.h"
 
-#define SOC15_FLUSH_GPU_TLB_NUM_WREG		4
-#define SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT	1
+#define SOC15_FLUSH_GPU_TLB_NUM_WREG		6
+#define SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT	3
 
 extern const struct amd_ip_funcs soc15_common_ip_funcs;
 
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 2/2] drm/amdgpu: invalidate mmhub semphore workaround in gmc9/gmc10
@ 2019-11-20  9:44 ` Changfeng.Zhu
  0 siblings, 0 replies; 10+ messages in thread
From: Changfeng.Zhu @ 2019-11-20  9:44 UTC (permalink / raw)
  To: Christian.Koenig, Jack.Xiao, Tao.Zhou1, Ray.Huang, Xinmei.Huang, amd-gfx
  Cc: changzhu

From: changzhu <Changfeng.Zhu@amd.com>

It may lose gpuvm invalidate acknowldege state across power-gating off
cycle. To avoid this issue in gmc9/gmc10 invalidation, add semaphore acquire
before invalidation and semaphore release after invalidation.

After adding semaphore acquire before invalidation, the semaphore
register become read-only if another process try to acquire semaphore.
Then it will not be able to release this semaphore. Then it may cause
deadlock problem. If this deadlock problem happens, it needs a semaphore
firmware fix.

Change-Id: I9942a2f451265c1f1038ccfe2f70042c7c8118af
Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 49 ++++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 49 ++++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/soc15.h     |  4 +--
 3 files changed, 100 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index af2615ba52aa..685d0d5ef31e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -234,6 +234,24 @@ static void gmc_v10_0_flush_vm_hub(struct amdgpu_device *adev, uint32_t vmid,
 	const unsigned eng = 17;
 	unsigned int i;
 
+	spin_lock(&adev->gmc.invalidate_lock);
+	/*
+	 * It may lose gpuvm invalidate acknowldege state across power-gating
+	 * off cycle, add semaphore acquire before invalidation and semaphore
+	 * release after invalidation to avoid entering power gated state
+	 * to WA the Issue
+	 */
+	for (i = 0; i < adev->usec_timeout; i++) {
+		/* a read return value of 1 means semaphore acuqire */
+		tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng);
+		if (tmp & 0x1)
+			break;
+		udelay(1);
+	}
+
+	if (i >= adev->usec_timeout)
+		DRM_ERROR("Timeout waiting for sem acquire in VM flush!\n");
+
 	WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
 
 	/*
@@ -253,6 +271,14 @@ static void gmc_v10_0_flush_vm_hub(struct amdgpu_device *adev, uint32_t vmid,
 		udelay(1);
 	}
 
+	/*
+	 * add semaphore release after invalidation,
+	 * write with 0 means semaphore release
+	 */
+	WREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng, 0);
+
+	spin_unlock(&adev->gmc.invalidate_lock);
+
 	if (i < adev->usec_timeout)
 		return;
 
@@ -338,6 +364,21 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 	uint32_t req = gmc_v10_0_get_invalidate_req(vmid, 0);
 	unsigned eng = ring->vm_inv_eng;
 
+	/*
+	 * It may lose gpuvm invalidate acknowldege state across power-gating
+	 * off cycle, add semaphore acquire before invalidation and semaphore
+	 * release after invalidation to avoid entering power gated state
+	 * to WA the Issue
+	 */
+
+	/* a read return value of 1 means semaphore acuqire */
+	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
+	    ring->funcs->vmhub == AMDGPU_MMHUB_1) {
+		amdgpu_ring_emit_reg_wait(ring,
+					  hub->vm_inv_eng0_sem + eng, 0x1, 0x1);
+		DRM_WARN_ONCE("Adding semaphore may cause deadlock and it needs firmware fix\n");
+	}
+
 	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 + (2 * vmid),
 			      lower_32_bits(pd_addr));
 
@@ -348,6 +389,14 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 					    hub->vm_inv_eng0_ack + eng,
 					    req, 1 << vmid);
 
+	/*
+	 * add semaphore release after invalidation,
+	 * write with 0 means semaphore release
+	 */
+	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
+	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
+		amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_sem + eng, 0);
+
 	return pd_addr;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 1ae59af7836a..c4118cbb0fbe 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -456,6 +456,24 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
 	}
 
 	spin_lock(&adev->gmc.invalidate_lock);
+
+	/*
+	 * It may lose gpuvm invalidate acknowldege state across power-gating
+	 * off cycle, add semaphore acquire before invalidation and semaphore
+	 * release after invalidation to avoid entering power gated state
+	 * to WA the Issue
+	 */
+	for (j = 0; j < adev->usec_timeout; j++) {
+		/* a read return value of 1 means semaphore acuqire */
+		tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng);
+		if (tmp & 0x1)
+			break;
+		udelay(1);
+	}
+
+	if (j >= adev->usec_timeout)
+		DRM_ERROR("Timeout waiting for sem acquire in VM flush!\n");
+
 	WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
 
 	/*
@@ -471,7 +489,15 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
 			break;
 		udelay(1);
 	}
+
+	/*
+	 * add semaphore release after invalidation,
+	 * write with 0 means semaphore release
+	 */
+	WREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng, 0);
+
 	spin_unlock(&adev->gmc.invalidate_lock);
+
 	if (j < adev->usec_timeout)
 		return;
 
@@ -486,6 +512,21 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 	uint32_t req = gmc_v9_0_get_invalidate_req(vmid, 0);
 	unsigned eng = ring->vm_inv_eng;
 
+	/*
+	 * It may lose gpuvm invalidate acknowldege state across power-gating
+	 * off cycle, add semaphore acquire before invalidation and semaphore
+	 * release after invalidation to avoid entering power gated state
+	 * to WA the Issue
+	 */
+
+	/* a read return value of 1 means semaphore acuqire */
+	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
+	    ring->funcs->vmhub == AMDGPU_MMHUB_1) {
+		amdgpu_ring_emit_reg_wait(ring,
+					  hub->vm_inv_eng0_sem + eng, 0x1, 0x1);
+		DRM_WARN_ONCE("Adding semaphore may cause deadlock and it needs firmware fix\n");
+	}
+
 	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 + (2 * vmid),
 			      lower_32_bits(pd_addr));
 
@@ -496,6 +537,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 					    hub->vm_inv_eng0_ack + eng,
 					    req, 1 << vmid);
 
+	/*
+	 * add semaphore release after invalidation,
+	 * write with 0 means semaphore release
+	 */
+	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
+	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
+		amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_sem + eng, 0);
+
 	return pd_addr;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.h b/drivers/gpu/drm/amd/amdgpu/soc15.h
index 9af6c6ffbfa2..57af489a5de3 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.h
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.h
@@ -28,8 +28,8 @@
 #include "nbio_v7_0.h"
 #include "nbio_v7_4.h"
 
-#define SOC15_FLUSH_GPU_TLB_NUM_WREG		4
-#define SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT	1
+#define SOC15_FLUSH_GPU_TLB_NUM_WREG		6
+#define SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT	3
 
 extern const struct amd_ip_funcs soc15_common_ip_funcs;
 
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: invalidate mmhub semphore workaround in gmc9/gmc10
@ 2019-11-20 11:26     ` Christian König
  0 siblings, 0 replies; 10+ messages in thread
From: Christian König @ 2019-11-20 11:26 UTC (permalink / raw)
  To: Changfeng.Zhu, Christian.Koenig-5C7GfCeVMHo,
	Jack.Xiao-5C7GfCeVMHo, Tao.Zhou1-5C7GfCeVMHo,
	Ray.Huang-5C7GfCeVMHo, Xinmei.Huang-5C7GfCeVMHo,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Am 20.11.19 um 10:44 schrieb Changfeng.Zhu:
> From: changzhu <Changfeng.Zhu@amd.com>
>
> It may lose gpuvm invalidate acknowldege state across power-gating off
> cycle. To avoid this issue in gmc9/gmc10 invalidation, add semaphore acquire
> before invalidation and semaphore release after invalidation.
>
> After adding semaphore acquire before invalidation, the semaphore
> register become read-only if another process try to acquire semaphore.
> Then it will not be able to release this semaphore. Then it may cause
> deadlock problem. If this deadlock problem happens, it needs a semaphore
> firmware fix.

Please remove the DRM_WARN_ONCE, that looks like overkill to me.

And I'm not sure how urgent that issue here is. We could also wait a few 
more days and see if the hw guys figure out why this lockups on the GFX 
ring.

Regards,
Christian.

>
> Change-Id: I9942a2f451265c1f1038ccfe2f70042c7c8118af
> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 49 ++++++++++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 49 ++++++++++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/soc15.h     |  4 +--
>   3 files changed, 100 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index af2615ba52aa..685d0d5ef31e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -234,6 +234,24 @@ static void gmc_v10_0_flush_vm_hub(struct amdgpu_device *adev, uint32_t vmid,
>   	const unsigned eng = 17;
>   	unsigned int i;
>   
> +	spin_lock(&adev->gmc.invalidate_lock);
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +	for (i = 0; i < adev->usec_timeout; i++) {
> +		/* a read return value of 1 means semaphore acuqire */
> +		tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng);
> +		if (tmp & 0x1)
> +			break;
> +		udelay(1);
> +	}
> +
> +	if (i >= adev->usec_timeout)
> +		DRM_ERROR("Timeout waiting for sem acquire in VM flush!\n");
> +
>   	WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
>   
>   	/*
> @@ -253,6 +271,14 @@ static void gmc_v10_0_flush_vm_hub(struct amdgpu_device *adev, uint32_t vmid,
>   		udelay(1);
>   	}
>   
> +	/*
> +	 * add semaphore release after invalidation,
> +	 * write with 0 means semaphore release
> +	 */
> +	WREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng, 0);
> +
> +	spin_unlock(&adev->gmc.invalidate_lock);
> +
>   	if (i < adev->usec_timeout)
>   		return;
>   
> @@ -338,6 +364,21 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>   	uint32_t req = gmc_v10_0_get_invalidate_req(vmid, 0);
>   	unsigned eng = ring->vm_inv_eng;
>   
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +
> +	/* a read return value of 1 means semaphore acuqire */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1) {
> +		amdgpu_ring_emit_reg_wait(ring,
> +					  hub->vm_inv_eng0_sem + eng, 0x1, 0x1);
> +		DRM_WARN_ONCE("Adding semaphore may cause deadlock and it needs firmware fix\n");
> +	}
> +
>   	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 + (2 * vmid),
>   			      lower_32_bits(pd_addr));
>   
> @@ -348,6 +389,14 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>   					    hub->vm_inv_eng0_ack + eng,
>   					    req, 1 << vmid);
>   
> +	/*
> +	 * add semaphore release after invalidation,
> +	 * write with 0 means semaphore release
> +	 */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +		amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_sem + eng, 0);
> +
>   	return pd_addr;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 1ae59af7836a..c4118cbb0fbe 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -456,6 +456,24 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   	}
>   
>   	spin_lock(&adev->gmc.invalidate_lock);
> +
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +	for (j = 0; j < adev->usec_timeout; j++) {
> +		/* a read return value of 1 means semaphore acuqire */
> +		tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng);
> +		if (tmp & 0x1)
> +			break;
> +		udelay(1);
> +	}
> +
> +	if (j >= adev->usec_timeout)
> +		DRM_ERROR("Timeout waiting for sem acquire in VM flush!\n");
> +
>   	WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
>   
>   	/*
> @@ -471,7 +489,15 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   			break;
>   		udelay(1);
>   	}
> +
> +	/*
> +	 * add semaphore release after invalidation,
> +	 * write with 0 means semaphore release
> +	 */
> +	WREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng, 0);
> +
>   	spin_unlock(&adev->gmc.invalidate_lock);
> +
>   	if (j < adev->usec_timeout)
>   		return;
>   
> @@ -486,6 +512,21 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>   	uint32_t req = gmc_v9_0_get_invalidate_req(vmid, 0);
>   	unsigned eng = ring->vm_inv_eng;
>   
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +
> +	/* a read return value of 1 means semaphore acuqire */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1) {
> +		amdgpu_ring_emit_reg_wait(ring,
> +					  hub->vm_inv_eng0_sem + eng, 0x1, 0x1);
> +		DRM_WARN_ONCE("Adding semaphore may cause deadlock and it needs firmware fix\n");
> +	}
> +
>   	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 + (2 * vmid),
>   			      lower_32_bits(pd_addr));
>   
> @@ -496,6 +537,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>   					    hub->vm_inv_eng0_ack + eng,
>   					    req, 1 << vmid);
>   
> +	/*
> +	 * add semaphore release after invalidation,
> +	 * write with 0 means semaphore release
> +	 */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +		amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_sem + eng, 0);
> +
>   	return pd_addr;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.h b/drivers/gpu/drm/amd/amdgpu/soc15.h
> index 9af6c6ffbfa2..57af489a5de3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/soc15.h
> +++ b/drivers/gpu/drm/amd/amdgpu/soc15.h
> @@ -28,8 +28,8 @@
>   #include "nbio_v7_0.h"
>   #include "nbio_v7_4.h"
>   
> -#define SOC15_FLUSH_GPU_TLB_NUM_WREG		4
> -#define SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT	1
> +#define SOC15_FLUSH_GPU_TLB_NUM_WREG		6
> +#define SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT	3
>   
>   extern const struct amd_ip_funcs soc15_common_ip_funcs;
>   

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: invalidate mmhub semphore workaround in gmc9/gmc10
@ 2019-11-20 11:26     ` Christian König
  0 siblings, 0 replies; 10+ messages in thread
From: Christian König @ 2019-11-20 11:26 UTC (permalink / raw)
  To: Changfeng.Zhu, Christian.Koenig, Jack.Xiao, Tao.Zhou1, Ray.Huang,
	Xinmei.Huang, amd-gfx

Am 20.11.19 um 10:44 schrieb Changfeng.Zhu:
> From: changzhu <Changfeng.Zhu@amd.com>
>
> It may lose gpuvm invalidate acknowldege state across power-gating off
> cycle. To avoid this issue in gmc9/gmc10 invalidation, add semaphore acquire
> before invalidation and semaphore release after invalidation.
>
> After adding semaphore acquire before invalidation, the semaphore
> register become read-only if another process try to acquire semaphore.
> Then it will not be able to release this semaphore. Then it may cause
> deadlock problem. If this deadlock problem happens, it needs a semaphore
> firmware fix.

Please remove the DRM_WARN_ONCE, that looks like overkill to me.

And I'm not sure how urgent that issue here is. We could also wait a few 
more days and see if the hw guys figure out why this lockups on the GFX 
ring.

Regards,
Christian.

>
> Change-Id: I9942a2f451265c1f1038ccfe2f70042c7c8118af
> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 49 ++++++++++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 49 ++++++++++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/soc15.h     |  4 +--
>   3 files changed, 100 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index af2615ba52aa..685d0d5ef31e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -234,6 +234,24 @@ static void gmc_v10_0_flush_vm_hub(struct amdgpu_device *adev, uint32_t vmid,
>   	const unsigned eng = 17;
>   	unsigned int i;
>   
> +	spin_lock(&adev->gmc.invalidate_lock);
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +	for (i = 0; i < adev->usec_timeout; i++) {
> +		/* a read return value of 1 means semaphore acuqire */
> +		tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng);
> +		if (tmp & 0x1)
> +			break;
> +		udelay(1);
> +	}
> +
> +	if (i >= adev->usec_timeout)
> +		DRM_ERROR("Timeout waiting for sem acquire in VM flush!\n");
> +
>   	WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
>   
>   	/*
> @@ -253,6 +271,14 @@ static void gmc_v10_0_flush_vm_hub(struct amdgpu_device *adev, uint32_t vmid,
>   		udelay(1);
>   	}
>   
> +	/*
> +	 * add semaphore release after invalidation,
> +	 * write with 0 means semaphore release
> +	 */
> +	WREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng, 0);
> +
> +	spin_unlock(&adev->gmc.invalidate_lock);
> +
>   	if (i < adev->usec_timeout)
>   		return;
>   
> @@ -338,6 +364,21 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>   	uint32_t req = gmc_v10_0_get_invalidate_req(vmid, 0);
>   	unsigned eng = ring->vm_inv_eng;
>   
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +
> +	/* a read return value of 1 means semaphore acuqire */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1) {
> +		amdgpu_ring_emit_reg_wait(ring,
> +					  hub->vm_inv_eng0_sem + eng, 0x1, 0x1);
> +		DRM_WARN_ONCE("Adding semaphore may cause deadlock and it needs firmware fix\n");
> +	}
> +
>   	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 + (2 * vmid),
>   			      lower_32_bits(pd_addr));
>   
> @@ -348,6 +389,14 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>   					    hub->vm_inv_eng0_ack + eng,
>   					    req, 1 << vmid);
>   
> +	/*
> +	 * add semaphore release after invalidation,
> +	 * write with 0 means semaphore release
> +	 */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +		amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_sem + eng, 0);
> +
>   	return pd_addr;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 1ae59af7836a..c4118cbb0fbe 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -456,6 +456,24 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   	}
>   
>   	spin_lock(&adev->gmc.invalidate_lock);
> +
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +	for (j = 0; j < adev->usec_timeout; j++) {
> +		/* a read return value of 1 means semaphore acuqire */
> +		tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng);
> +		if (tmp & 0x1)
> +			break;
> +		udelay(1);
> +	}
> +
> +	if (j >= adev->usec_timeout)
> +		DRM_ERROR("Timeout waiting for sem acquire in VM flush!\n");
> +
>   	WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
>   
>   	/*
> @@ -471,7 +489,15 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   			break;
>   		udelay(1);
>   	}
> +
> +	/*
> +	 * add semaphore release after invalidation,
> +	 * write with 0 means semaphore release
> +	 */
> +	WREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng, 0);
> +
>   	spin_unlock(&adev->gmc.invalidate_lock);
> +
>   	if (j < adev->usec_timeout)
>   		return;
>   
> @@ -486,6 +512,21 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>   	uint32_t req = gmc_v9_0_get_invalidate_req(vmid, 0);
>   	unsigned eng = ring->vm_inv_eng;
>   
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +
> +	/* a read return value of 1 means semaphore acuqire */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1) {
> +		amdgpu_ring_emit_reg_wait(ring,
> +					  hub->vm_inv_eng0_sem + eng, 0x1, 0x1);
> +		DRM_WARN_ONCE("Adding semaphore may cause deadlock and it needs firmware fix\n");
> +	}
> +
>   	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 + (2 * vmid),
>   			      lower_32_bits(pd_addr));
>   
> @@ -496,6 +537,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>   					    hub->vm_inv_eng0_ack + eng,
>   					    req, 1 << vmid);
>   
> +	/*
> +	 * add semaphore release after invalidation,
> +	 * write with 0 means semaphore release
> +	 */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +		amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_sem + eng, 0);
> +
>   	return pd_addr;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.h b/drivers/gpu/drm/amd/amdgpu/soc15.h
> index 9af6c6ffbfa2..57af489a5de3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/soc15.h
> +++ b/drivers/gpu/drm/amd/amdgpu/soc15.h
> @@ -28,8 +28,8 @@
>   #include "nbio_v7_0.h"
>   #include "nbio_v7_4.h"
>   
> -#define SOC15_FLUSH_GPU_TLB_NUM_WREG		4
> -#define SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT	1
> +#define SOC15_FLUSH_GPU_TLB_NUM_WREG		6
> +#define SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT	3
>   
>   extern const struct amd_ip_funcs soc15_common_ip_funcs;
>   

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH 2/2] drm/amdgpu: invalidate mmhub semphore workaround in gmc9/gmc10
@ 2019-11-21  7:16         ` Zhou1, Tao
  0 siblings, 0 replies; 10+ messages in thread
From: Zhou1, Tao @ 2019-11-21  7:16 UTC (permalink / raw)
  To: Koenig, Christian, Zhu, Changfeng, Xiao, Jack, Huang, Ray, Huang,
	Shimmer, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Liu, Monk,
	Zeng, Oak

Yes, we could delay the submit of patch 2, or only apply the patch to MMHUB currently and apply it to GFXHUB if we fix the gfx hang issue in the future.

For patch 1, as Monk confirms SRIOV won't enable PG at all, I agree that patch 1 could be dropped.

Tao

> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: 2019年11月20日 19:27
> To: Zhu, Changfeng <Changfeng.Zhu@amd.com>; Koenig, Christian
> <Christian.Koenig@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao
> <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang,
> Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 2/2] drm/amdgpu: invalidate mmhub semphore
> workaround in gmc9/gmc10
> 
> Am 20.11.19 um 10:44 schrieb Changfeng.Zhu:
> > From: changzhu <Changfeng.Zhu@amd.com>
> >
> > It may lose gpuvm invalidate acknowldege state across power-gating off
> > cycle. To avoid this issue in gmc9/gmc10 invalidation, add semaphore
> > acquire before invalidation and semaphore release after invalidation.
> >
> > After adding semaphore acquire before invalidation, the semaphore
> > register become read-only if another process try to acquire semaphore.
> > Then it will not be able to release this semaphore. Then it may cause
> > deadlock problem. If this deadlock problem happens, it needs a
> > semaphore firmware fix.
> 
> Please remove the DRM_WARN_ONCE, that looks like overkill to me.
> 
> And I'm not sure how urgent that issue here is. We could also wait a few
> more days and see if the hw guys figure out why this lockups on the GFX ring.
> 
> Regards,
> Christian.
> 
> >
> > Change-Id: I9942a2f451265c1f1038ccfe2f70042c7c8118af
> > Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 49
> ++++++++++++++++++++++++++
> >   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 49
> ++++++++++++++++++++++++++
> >   drivers/gpu/drm/amd/amdgpu/soc15.h     |  4 +--
> >   3 files changed, 100 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> > b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> > index af2615ba52aa..685d0d5ef31e 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> > @@ -234,6 +234,24 @@ static void gmc_v10_0_flush_vm_hub(struct
> amdgpu_device *adev, uint32_t vmid,
> >   	const unsigned eng = 17;
> >   	unsigned int i;
> >
> > +	spin_lock(&adev->gmc.invalidate_lock);
> > +	/*
> > +	 * It may lose gpuvm invalidate acknowldege state across power-
> gating
> > +	 * off cycle, add semaphore acquire before invalidation and
> semaphore
> > +	 * release after invalidation to avoid entering power gated state
> > +	 * to WA the Issue
> > +	 */
> > +	for (i = 0; i < adev->usec_timeout; i++) {
> > +		/* a read return value of 1 means semaphore acuqire */
> > +		tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng);
> > +		if (tmp & 0x1)
> > +			break;
> > +		udelay(1);
> > +	}
> > +
> > +	if (i >= adev->usec_timeout)
> > +		DRM_ERROR("Timeout waiting for sem acquire in VM
> flush!\n");
> > +
> >   	WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
> >
> >   	/*
> > @@ -253,6 +271,14 @@ static void gmc_v10_0_flush_vm_hub(struct
> amdgpu_device *adev, uint32_t vmid,
> >   		udelay(1);
> >   	}
> >
> > +	/*
> > +	 * add semaphore release after invalidation,
> > +	 * write with 0 means semaphore release
> > +	 */
> > +	WREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng, 0);
> > +
> > +	spin_unlock(&adev->gmc.invalidate_lock);
> > +
> >   	if (i < adev->usec_timeout)
> >   		return;
> >
> > @@ -338,6 +364,21 @@ static uint64_t
> gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
> >   	uint32_t req = gmc_v10_0_get_invalidate_req(vmid, 0);
> >   	unsigned eng = ring->vm_inv_eng;
> >
> > +	/*
> > +	 * It may lose gpuvm invalidate acknowldege state across power-
> gating
> > +	 * off cycle, add semaphore acquire before invalidation and
> semaphore
> > +	 * release after invalidation to avoid entering power gated state
> > +	 * to WA the Issue
> > +	 */
> > +
> > +	/* a read return value of 1 means semaphore acuqire */
> > +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> > +	    ring->funcs->vmhub == AMDGPU_MMHUB_1) {
> > +		amdgpu_ring_emit_reg_wait(ring,
> > +					  hub->vm_inv_eng0_sem + eng, 0x1,
> 0x1);
> > +		DRM_WARN_ONCE("Adding semaphore may cause deadlock
> and it needs firmware fix\n");
> > +	}
> > +
> >   	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 + (2 * vmid),
> >   			      lower_32_bits(pd_addr));
> >
> > @@ -348,6 +389,14 @@ static uint64_t
> gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
> >   					    hub->vm_inv_eng0_ack + eng,
> >   					    req, 1 << vmid);
> >
> > +	/*
> > +	 * add semaphore release after invalidation,
> > +	 * write with 0 means semaphore release
> > +	 */
> > +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> > +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> > +		amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_sem + eng,
> 0);
> > +
> >   	return pd_addr;
> >   }
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > index 1ae59af7836a..c4118cbb0fbe 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > @@ -456,6 +456,24 @@ static void gmc_v9_0_flush_gpu_tlb(struct
> amdgpu_device *adev, uint32_t vmid,
> >   	}
> >
> >   	spin_lock(&adev->gmc.invalidate_lock);
> > +
> > +	/*
> > +	 * It may lose gpuvm invalidate acknowldege state across power-
> gating
> > +	 * off cycle, add semaphore acquire before invalidation and
> semaphore
> > +	 * release after invalidation to avoid entering power gated state
> > +	 * to WA the Issue
> > +	 */
> > +	for (j = 0; j < adev->usec_timeout; j++) {
> > +		/* a read return value of 1 means semaphore acuqire */
> > +		tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng);
> > +		if (tmp & 0x1)
> > +			break;
> > +		udelay(1);
> > +	}
> > +
> > +	if (j >= adev->usec_timeout)
> > +		DRM_ERROR("Timeout waiting for sem acquire in VM
> flush!\n");
> > +
> >   	WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
> >
> >   	/*
> > @@ -471,7 +489,15 @@ static void gmc_v9_0_flush_gpu_tlb(struct
> amdgpu_device *adev, uint32_t vmid,
> >   			break;
> >   		udelay(1);
> >   	}
> > +
> > +	/*
> > +	 * add semaphore release after invalidation,
> > +	 * write with 0 means semaphore release
> > +	 */
> > +	WREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng, 0);
> > +
> >   	spin_unlock(&adev->gmc.invalidate_lock);
> > +
> >   	if (j < adev->usec_timeout)
> >   		return;
> >
> > @@ -486,6 +512,21 @@ static uint64_t
> gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
> >   	uint32_t req = gmc_v9_0_get_invalidate_req(vmid, 0);
> >   	unsigned eng = ring->vm_inv_eng;
> >
> > +	/*
> > +	 * It may lose gpuvm invalidate acknowldege state across power-
> gating
> > +	 * off cycle, add semaphore acquire before invalidation and
> semaphore
> > +	 * release after invalidation to avoid entering power gated state
> > +	 * to WA the Issue
> > +	 */
> > +
> > +	/* a read return value of 1 means semaphore acuqire */
> > +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> > +	    ring->funcs->vmhub == AMDGPU_MMHUB_1) {
> > +		amdgpu_ring_emit_reg_wait(ring,
> > +					  hub->vm_inv_eng0_sem + eng, 0x1,
> 0x1);
> > +		DRM_WARN_ONCE("Adding semaphore may cause deadlock
> and it needs firmware fix\n");
> > +	}
> > +
> >   	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 + (2 * vmid),
> >   			      lower_32_bits(pd_addr));
> >
> > @@ -496,6 +537,14 @@ static uint64_t
> gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
> >   					    hub->vm_inv_eng0_ack + eng,
> >   					    req, 1 << vmid);
> >
> > +	/*
> > +	 * add semaphore release after invalidation,
> > +	 * write with 0 means semaphore release
> > +	 */
> > +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> > +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> > +		amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_sem + eng,
> 0);
> > +
> >   	return pd_addr;
> >   }
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.h
> > b/drivers/gpu/drm/amd/amdgpu/soc15.h
> > index 9af6c6ffbfa2..57af489a5de3 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/soc15.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/soc15.h
> > @@ -28,8 +28,8 @@
> >   #include "nbio_v7_0.h"
> >   #include "nbio_v7_4.h"
> >
> > -#define SOC15_FLUSH_GPU_TLB_NUM_WREG		4
> > -#define SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT	1
> > +#define SOC15_FLUSH_GPU_TLB_NUM_WREG		6
> > +#define SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT	3
> >
> >   extern const struct amd_ip_funcs soc15_common_ip_funcs;
> >

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH 2/2] drm/amdgpu: invalidate mmhub semphore workaround in gmc9/gmc10
@ 2019-11-21  7:16         ` Zhou1, Tao
  0 siblings, 0 replies; 10+ messages in thread
From: Zhou1, Tao @ 2019-11-21  7:16 UTC (permalink / raw)
  To: Koenig, Christian, Zhu, Changfeng, Xiao, Jack, Huang, Ray, Huang,
	Shimmer, amd-gfx, Liu, Monk, Zeng, Oak

Yes, we could delay the submit of patch 2, or only apply the patch to MMHUB currently and apply it to GFXHUB if we fix the gfx hang issue in the future.

For patch 1, as Monk confirms SRIOV won't enable PG at all, I agree that patch 1 could be dropped.

Tao

> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: 2019年11月20日 19:27
> To: Zhu, Changfeng <Changfeng.Zhu@amd.com>; Koenig, Christian
> <Christian.Koenig@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao
> <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang,
> Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 2/2] drm/amdgpu: invalidate mmhub semphore
> workaround in gmc9/gmc10
> 
> Am 20.11.19 um 10:44 schrieb Changfeng.Zhu:
> > From: changzhu <Changfeng.Zhu@amd.com>
> >
> > It may lose gpuvm invalidate acknowldege state across power-gating off
> > cycle. To avoid this issue in gmc9/gmc10 invalidation, add semaphore
> > acquire before invalidation and semaphore release after invalidation.
> >
> > After adding semaphore acquire before invalidation, the semaphore
> > register become read-only if another process try to acquire semaphore.
> > Then it will not be able to release this semaphore. Then it may cause
> > deadlock problem. If this deadlock problem happens, it needs a
> > semaphore firmware fix.
> 
> Please remove the DRM_WARN_ONCE, that looks like overkill to me.
> 
> And I'm not sure how urgent that issue here is. We could also wait a few
> more days and see if the hw guys figure out why this lockups on the GFX ring.
> 
> Regards,
> Christian.
> 
> >
> > Change-Id: I9942a2f451265c1f1038ccfe2f70042c7c8118af
> > Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 49
> ++++++++++++++++++++++++++
> >   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 49
> ++++++++++++++++++++++++++
> >   drivers/gpu/drm/amd/amdgpu/soc15.h     |  4 +--
> >   3 files changed, 100 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> > b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> > index af2615ba52aa..685d0d5ef31e 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> > @@ -234,6 +234,24 @@ static void gmc_v10_0_flush_vm_hub(struct
> amdgpu_device *adev, uint32_t vmid,
> >   	const unsigned eng = 17;
> >   	unsigned int i;
> >
> > +	spin_lock(&adev->gmc.invalidate_lock);
> > +	/*
> > +	 * It may lose gpuvm invalidate acknowldege state across power-
> gating
> > +	 * off cycle, add semaphore acquire before invalidation and
> semaphore
> > +	 * release after invalidation to avoid entering power gated state
> > +	 * to WA the Issue
> > +	 */
> > +	for (i = 0; i < adev->usec_timeout; i++) {
> > +		/* a read return value of 1 means semaphore acuqire */
> > +		tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng);
> > +		if (tmp & 0x1)
> > +			break;
> > +		udelay(1);
> > +	}
> > +
> > +	if (i >= adev->usec_timeout)
> > +		DRM_ERROR("Timeout waiting for sem acquire in VM
> flush!\n");
> > +
> >   	WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
> >
> >   	/*
> > @@ -253,6 +271,14 @@ static void gmc_v10_0_flush_vm_hub(struct
> amdgpu_device *adev, uint32_t vmid,
> >   		udelay(1);
> >   	}
> >
> > +	/*
> > +	 * add semaphore release after invalidation,
> > +	 * write with 0 means semaphore release
> > +	 */
> > +	WREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng, 0);
> > +
> > +	spin_unlock(&adev->gmc.invalidate_lock);
> > +
> >   	if (i < adev->usec_timeout)
> >   		return;
> >
> > @@ -338,6 +364,21 @@ static uint64_t
> gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
> >   	uint32_t req = gmc_v10_0_get_invalidate_req(vmid, 0);
> >   	unsigned eng = ring->vm_inv_eng;
> >
> > +	/*
> > +	 * It may lose gpuvm invalidate acknowldege state across power-
> gating
> > +	 * off cycle, add semaphore acquire before invalidation and
> semaphore
> > +	 * release after invalidation to avoid entering power gated state
> > +	 * to WA the Issue
> > +	 */
> > +
> > +	/* a read return value of 1 means semaphore acuqire */
> > +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> > +	    ring->funcs->vmhub == AMDGPU_MMHUB_1) {
> > +		amdgpu_ring_emit_reg_wait(ring,
> > +					  hub->vm_inv_eng0_sem + eng, 0x1,
> 0x1);
> > +		DRM_WARN_ONCE("Adding semaphore may cause deadlock
> and it needs firmware fix\n");
> > +	}
> > +
> >   	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 + (2 * vmid),
> >   			      lower_32_bits(pd_addr));
> >
> > @@ -348,6 +389,14 @@ static uint64_t
> gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
> >   					    hub->vm_inv_eng0_ack + eng,
> >   					    req, 1 << vmid);
> >
> > +	/*
> > +	 * add semaphore release after invalidation,
> > +	 * write with 0 means semaphore release
> > +	 */
> > +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> > +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> > +		amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_sem + eng,
> 0);
> > +
> >   	return pd_addr;
> >   }
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > index 1ae59af7836a..c4118cbb0fbe 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > @@ -456,6 +456,24 @@ static void gmc_v9_0_flush_gpu_tlb(struct
> amdgpu_device *adev, uint32_t vmid,
> >   	}
> >
> >   	spin_lock(&adev->gmc.invalidate_lock);
> > +
> > +	/*
> > +	 * It may lose gpuvm invalidate acknowldege state across power-
> gating
> > +	 * off cycle, add semaphore acquire before invalidation and
> semaphore
> > +	 * release after invalidation to avoid entering power gated state
> > +	 * to WA the Issue
> > +	 */
> > +	for (j = 0; j < adev->usec_timeout; j++) {
> > +		/* a read return value of 1 means semaphore acuqire */
> > +		tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng);
> > +		if (tmp & 0x1)
> > +			break;
> > +		udelay(1);
> > +	}
> > +
> > +	if (j >= adev->usec_timeout)
> > +		DRM_ERROR("Timeout waiting for sem acquire in VM
> flush!\n");
> > +
> >   	WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
> >
> >   	/*
> > @@ -471,7 +489,15 @@ static void gmc_v9_0_flush_gpu_tlb(struct
> amdgpu_device *adev, uint32_t vmid,
> >   			break;
> >   		udelay(1);
> >   	}
> > +
> > +	/*
> > +	 * add semaphore release after invalidation,
> > +	 * write with 0 means semaphore release
> > +	 */
> > +	WREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng, 0);
> > +
> >   	spin_unlock(&adev->gmc.invalidate_lock);
> > +
> >   	if (j < adev->usec_timeout)
> >   		return;
> >
> > @@ -486,6 +512,21 @@ static uint64_t
> gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
> >   	uint32_t req = gmc_v9_0_get_invalidate_req(vmid, 0);
> >   	unsigned eng = ring->vm_inv_eng;
> >
> > +	/*
> > +	 * It may lose gpuvm invalidate acknowldege state across power-
> gating
> > +	 * off cycle, add semaphore acquire before invalidation and
> semaphore
> > +	 * release after invalidation to avoid entering power gated state
> > +	 * to WA the Issue
> > +	 */
> > +
> > +	/* a read return value of 1 means semaphore acuqire */
> > +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> > +	    ring->funcs->vmhub == AMDGPU_MMHUB_1) {
> > +		amdgpu_ring_emit_reg_wait(ring,
> > +					  hub->vm_inv_eng0_sem + eng, 0x1,
> 0x1);
> > +		DRM_WARN_ONCE("Adding semaphore may cause deadlock
> and it needs firmware fix\n");
> > +	}
> > +
> >   	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 + (2 * vmid),
> >   			      lower_32_bits(pd_addr));
> >
> > @@ -496,6 +537,14 @@ static uint64_t
> gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
> >   					    hub->vm_inv_eng0_ack + eng,
> >   					    req, 1 << vmid);
> >
> > +	/*
> > +	 * add semaphore release after invalidation,
> > +	 * write with 0 means semaphore release
> > +	 */
> > +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> > +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> > +		amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_sem + eng,
> 0);
> > +
> >   	return pd_addr;
> >   }
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.h
> > b/drivers/gpu/drm/amd/amdgpu/soc15.h
> > index 9af6c6ffbfa2..57af489a5de3 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/soc15.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/soc15.h
> > @@ -28,8 +28,8 @@
> >   #include "nbio_v7_0.h"
> >   #include "nbio_v7_4.h"
> >
> > -#define SOC15_FLUSH_GPU_TLB_NUM_WREG		4
> > -#define SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT	1
> > +#define SOC15_FLUSH_GPU_TLB_NUM_WREG		6
> > +#define SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT	3
> >
> >   extern const struct amd_ip_funcs soc15_common_ip_funcs;
> >

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH 2/2] drm/amdgpu: invalidate mmhub semphore workaround in gmc9/gmc10
@ 2019-11-21 15:53         ` Zhu, Changfeng
  0 siblings, 0 replies; 10+ messages in thread
From: Zhu, Changfeng @ 2019-11-21 15:53 UTC (permalink / raw)
  To: Koenig, Christian, Xiao, Jack, Zhou1, Tao, Huang, Ray, Huang,
	Shimmer, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Hi Chris,

I have removed DRM_WARN_ONCE.

I think we can write mmhub patch firstly. And It's for ticket:
http://ontrack-internal.amd.com/browse/SWDEV-201459

According to Yang,Zilong's comments on this issue,
GFXHUB is not applicable to the bug thus doesn't need the w/a.

I'll see gfxhub hang root cause with Lisa in the following time.

Could you please help review my new patch(remove DRM_WARN_ONCE)?

BR,
Changfeng.


-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@gmail.com> 
Sent: Wednesday, November 20, 2019 7:27 PM
To: Zhu, Changfeng <Changfeng.Zhu@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 2/2] drm/amdgpu: invalidate mmhub semphore workaround in gmc9/gmc10

Am 20.11.19 um 10:44 schrieb Changfeng.Zhu:
> From: changzhu <Changfeng.Zhu@amd.com>
>
> It may lose gpuvm invalidate acknowldege state across power-gating off 
> cycle. To avoid this issue in gmc9/gmc10 invalidation, add semaphore 
> acquire before invalidation and semaphore release after invalidation.
>
> After adding semaphore acquire before invalidation, the semaphore 
> register become read-only if another process try to acquire semaphore.
> Then it will not be able to release this semaphore. Then it may cause 
> deadlock problem. If this deadlock problem happens, it needs a 
> semaphore firmware fix.

Please remove the DRM_WARN_ONCE, that looks like overkill to me.

And I'm not sure how urgent that issue here is. We could also wait a few more days and see if the hw guys figure out why this lockups on the GFX ring.

Regards,
Christian.

>
> Change-Id: I9942a2f451265c1f1038ccfe2f70042c7c8118af
> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 49 ++++++++++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 49 ++++++++++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/soc15.h     |  4 +--
>   3 files changed, 100 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index af2615ba52aa..685d0d5ef31e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -234,6 +234,24 @@ static void gmc_v10_0_flush_vm_hub(struct amdgpu_device *adev, uint32_t vmid,
>   	const unsigned eng = 17;
>   	unsigned int i;
>   
> +	spin_lock(&adev->gmc.invalidate_lock);
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +	for (i = 0; i < adev->usec_timeout; i++) {
> +		/* a read return value of 1 means semaphore acuqire */
> +		tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng);
> +		if (tmp & 0x1)
> +			break;
> +		udelay(1);
> +	}
> +
> +	if (i >= adev->usec_timeout)
> +		DRM_ERROR("Timeout waiting for sem acquire in VM flush!\n");
> +
>   	WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
>   
>   	/*
> @@ -253,6 +271,14 @@ static void gmc_v10_0_flush_vm_hub(struct amdgpu_device *adev, uint32_t vmid,
>   		udelay(1);
>   	}
>   
> +	/*
> +	 * add semaphore release after invalidation,
> +	 * write with 0 means semaphore release
> +	 */
> +	WREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng, 0);
> +
> +	spin_unlock(&adev->gmc.invalidate_lock);
> +
>   	if (i < adev->usec_timeout)
>   		return;
>   
> @@ -338,6 +364,21 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>   	uint32_t req = gmc_v10_0_get_invalidate_req(vmid, 0);
>   	unsigned eng = ring->vm_inv_eng;
>   
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +
> +	/* a read return value of 1 means semaphore acuqire */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1) {
> +		amdgpu_ring_emit_reg_wait(ring,
> +					  hub->vm_inv_eng0_sem + eng, 0x1, 0x1);
> +		DRM_WARN_ONCE("Adding semaphore may cause deadlock and it needs firmware fix\n");
> +	}
> +
>   	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 + (2 * vmid),
>   			      lower_32_bits(pd_addr));
>   
> @@ -348,6 +389,14 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>   					    hub->vm_inv_eng0_ack + eng,
>   					    req, 1 << vmid);
>   
> +	/*
> +	 * add semaphore release after invalidation,
> +	 * write with 0 means semaphore release
> +	 */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +		amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_sem + eng, 0);
> +
>   	return pd_addr;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 1ae59af7836a..c4118cbb0fbe 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -456,6 +456,24 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   	}
>   
>   	spin_lock(&adev->gmc.invalidate_lock);
> +
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +	for (j = 0; j < adev->usec_timeout; j++) {
> +		/* a read return value of 1 means semaphore acuqire */
> +		tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng);
> +		if (tmp & 0x1)
> +			break;
> +		udelay(1);
> +	}
> +
> +	if (j >= adev->usec_timeout)
> +		DRM_ERROR("Timeout waiting for sem acquire in VM flush!\n");
> +
>   	WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
>   
>   	/*
> @@ -471,7 +489,15 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   			break;
>   		udelay(1);
>   	}
> +
> +	/*
> +	 * add semaphore release after invalidation,
> +	 * write with 0 means semaphore release
> +	 */
> +	WREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng, 0);
> +
>   	spin_unlock(&adev->gmc.invalidate_lock);
> +
>   	if (j < adev->usec_timeout)
>   		return;
>   
> @@ -486,6 +512,21 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>   	uint32_t req = gmc_v9_0_get_invalidate_req(vmid, 0);
>   	unsigned eng = ring->vm_inv_eng;
>   
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +
> +	/* a read return value of 1 means semaphore acuqire */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1) {
> +		amdgpu_ring_emit_reg_wait(ring,
> +					  hub->vm_inv_eng0_sem + eng, 0x1, 0x1);
> +		DRM_WARN_ONCE("Adding semaphore may cause deadlock and it needs firmware fix\n");
> +	}
> +
>   	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 + (2 * vmid),
>   			      lower_32_bits(pd_addr));
>   
> @@ -496,6 +537,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>   					    hub->vm_inv_eng0_ack + eng,
>   					    req, 1 << vmid);
>   
> +	/*
> +	 * add semaphore release after invalidation,
> +	 * write with 0 means semaphore release
> +	 */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +		amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_sem + eng, 0);
> +
>   	return pd_addr;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.h 
> b/drivers/gpu/drm/amd/amdgpu/soc15.h
> index 9af6c6ffbfa2..57af489a5de3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/soc15.h
> +++ b/drivers/gpu/drm/amd/amdgpu/soc15.h
> @@ -28,8 +28,8 @@
>   #include "nbio_v7_0.h"
>   #include "nbio_v7_4.h"
>   
> -#define SOC15_FLUSH_GPU_TLB_NUM_WREG		4
> -#define SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT	1
> +#define SOC15_FLUSH_GPU_TLB_NUM_WREG		6
> +#define SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT	3
>   
>   extern const struct amd_ip_funcs soc15_common_ip_funcs;
>   

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH 2/2] drm/amdgpu: invalidate mmhub semphore workaround in gmc9/gmc10
@ 2019-11-21 15:53         ` Zhu, Changfeng
  0 siblings, 0 replies; 10+ messages in thread
From: Zhu, Changfeng @ 2019-11-21 15:53 UTC (permalink / raw)
  To: Koenig, Christian, Xiao, Jack, Zhou1, Tao, Huang, Ray, Huang,
	Shimmer, amd-gfx

Hi Chris,

I have removed DRM_WARN_ONCE.

I think we can write mmhub patch firstly. And It's for ticket:
http://ontrack-internal.amd.com/browse/SWDEV-201459

According to Yang,Zilong's comments on this issue,
GFXHUB is not applicable to the bug thus doesn't need the w/a.

I'll see gfxhub hang root cause with Lisa in the following time.

Could you please help review my new patch(remove DRM_WARN_ONCE)?

BR,
Changfeng.


-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@gmail.com> 
Sent: Wednesday, November 20, 2019 7:27 PM
To: Zhu, Changfeng <Changfeng.Zhu@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Xiao, Jack <Jack.Xiao@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Huang, Shimmer <Xinmei.Huang@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 2/2] drm/amdgpu: invalidate mmhub semphore workaround in gmc9/gmc10

Am 20.11.19 um 10:44 schrieb Changfeng.Zhu:
> From: changzhu <Changfeng.Zhu@amd.com>
>
> It may lose gpuvm invalidate acknowldege state across power-gating off 
> cycle. To avoid this issue in gmc9/gmc10 invalidation, add semaphore 
> acquire before invalidation and semaphore release after invalidation.
>
> After adding semaphore acquire before invalidation, the semaphore 
> register become read-only if another process try to acquire semaphore.
> Then it will not be able to release this semaphore. Then it may cause 
> deadlock problem. If this deadlock problem happens, it needs a 
> semaphore firmware fix.

Please remove the DRM_WARN_ONCE, that looks like overkill to me.

And I'm not sure how urgent that issue here is. We could also wait a few more days and see if the hw guys figure out why this lockups on the GFX ring.

Regards,
Christian.

>
> Change-Id: I9942a2f451265c1f1038ccfe2f70042c7c8118af
> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 49 ++++++++++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 49 ++++++++++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/soc15.h     |  4 +--
>   3 files changed, 100 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index af2615ba52aa..685d0d5ef31e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -234,6 +234,24 @@ static void gmc_v10_0_flush_vm_hub(struct amdgpu_device *adev, uint32_t vmid,
>   	const unsigned eng = 17;
>   	unsigned int i;
>   
> +	spin_lock(&adev->gmc.invalidate_lock);
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +	for (i = 0; i < adev->usec_timeout; i++) {
> +		/* a read return value of 1 means semaphore acuqire */
> +		tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng);
> +		if (tmp & 0x1)
> +			break;
> +		udelay(1);
> +	}
> +
> +	if (i >= adev->usec_timeout)
> +		DRM_ERROR("Timeout waiting for sem acquire in VM flush!\n");
> +
>   	WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
>   
>   	/*
> @@ -253,6 +271,14 @@ static void gmc_v10_0_flush_vm_hub(struct amdgpu_device *adev, uint32_t vmid,
>   		udelay(1);
>   	}
>   
> +	/*
> +	 * add semaphore release after invalidation,
> +	 * write with 0 means semaphore release
> +	 */
> +	WREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng, 0);
> +
> +	spin_unlock(&adev->gmc.invalidate_lock);
> +
>   	if (i < adev->usec_timeout)
>   		return;
>   
> @@ -338,6 +364,21 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>   	uint32_t req = gmc_v10_0_get_invalidate_req(vmid, 0);
>   	unsigned eng = ring->vm_inv_eng;
>   
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +
> +	/* a read return value of 1 means semaphore acuqire */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1) {
> +		amdgpu_ring_emit_reg_wait(ring,
> +					  hub->vm_inv_eng0_sem + eng, 0x1, 0x1);
> +		DRM_WARN_ONCE("Adding semaphore may cause deadlock and it needs firmware fix\n");
> +	}
> +
>   	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 + (2 * vmid),
>   			      lower_32_bits(pd_addr));
>   
> @@ -348,6 +389,14 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>   					    hub->vm_inv_eng0_ack + eng,
>   					    req, 1 << vmid);
>   
> +	/*
> +	 * add semaphore release after invalidation,
> +	 * write with 0 means semaphore release
> +	 */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +		amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_sem + eng, 0);
> +
>   	return pd_addr;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 1ae59af7836a..c4118cbb0fbe 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -456,6 +456,24 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   	}
>   
>   	spin_lock(&adev->gmc.invalidate_lock);
> +
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +	for (j = 0; j < adev->usec_timeout; j++) {
> +		/* a read return value of 1 means semaphore acuqire */
> +		tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng);
> +		if (tmp & 0x1)
> +			break;
> +		udelay(1);
> +	}
> +
> +	if (j >= adev->usec_timeout)
> +		DRM_ERROR("Timeout waiting for sem acquire in VM flush!\n");
> +
>   	WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
>   
>   	/*
> @@ -471,7 +489,15 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   			break;
>   		udelay(1);
>   	}
> +
> +	/*
> +	 * add semaphore release after invalidation,
> +	 * write with 0 means semaphore release
> +	 */
> +	WREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng, 0);
> +
>   	spin_unlock(&adev->gmc.invalidate_lock);
> +
>   	if (j < adev->usec_timeout)
>   		return;
>   
> @@ -486,6 +512,21 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>   	uint32_t req = gmc_v9_0_get_invalidate_req(vmid, 0);
>   	unsigned eng = ring->vm_inv_eng;
>   
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +
> +	/* a read return value of 1 means semaphore acuqire */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1) {
> +		amdgpu_ring_emit_reg_wait(ring,
> +					  hub->vm_inv_eng0_sem + eng, 0x1, 0x1);
> +		DRM_WARN_ONCE("Adding semaphore may cause deadlock and it needs firmware fix\n");
> +	}
> +
>   	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 + (2 * vmid),
>   			      lower_32_bits(pd_addr));
>   
> @@ -496,6 +537,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>   					    hub->vm_inv_eng0_ack + eng,
>   					    req, 1 << vmid);
>   
> +	/*
> +	 * add semaphore release after invalidation,
> +	 * write with 0 means semaphore release
> +	 */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +		amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_sem + eng, 0);
> +
>   	return pd_addr;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.h 
> b/drivers/gpu/drm/amd/amdgpu/soc15.h
> index 9af6c6ffbfa2..57af489a5de3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/soc15.h
> +++ b/drivers/gpu/drm/amd/amdgpu/soc15.h
> @@ -28,8 +28,8 @@
>   #include "nbio_v7_0.h"
>   #include "nbio_v7_4.h"
>   
> -#define SOC15_FLUSH_GPU_TLB_NUM_WREG		4
> -#define SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT	1
> +#define SOC15_FLUSH_GPU_TLB_NUM_WREG		6
> +#define SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT	3
>   
>   extern const struct amd_ip_funcs soc15_common_ip_funcs;
>   

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH 2/2] drm/amdgpu: invalidate mmhub semphore workaround in gmc9/gmc10
@ 2019-11-20 10:22 ` Changfeng.Zhu
  0 siblings, 0 replies; 10+ messages in thread
From: Changfeng.Zhu @ 2019-11-20 10:22 UTC (permalink / raw)
  To: Christian.Koenig-5C7GfCeVMHo, Jack.Xiao-5C7GfCeVMHo,
	Tao.Zhou1-5C7GfCeVMHo, Ray.Huang-5C7GfCeVMHo,
	Xinmei.Huang-5C7GfCeVMHo,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: changzhu

From: changzhu <Changfeng.Zhu@amd.com>

It may lose gpuvm invalidate acknowldege state across power-gating off
cycle. To avoid this issue in gmc9/gmc10 invalidation, add semaphore acquire
before invalidation and semaphore release after invalidation.

After adding semaphore acquire before invalidation, the semaphore
register become read-only if another process try to acquire semaphore.
Then it will not be able to release this semaphore. Then it may cause
deadlock problem. If this deadlock problem happens, it needs a semaphore
firmware fix.

Change-Id: I9942a2f451265c1f1038ccfe2f70042c7c8118af
Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 54 ++++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 54 ++++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/soc15.h     |  4 +-
 3 files changed, 110 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index af2615ba52aa..ff80a62ca514 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -234,6 +234,27 @@ static void gmc_v10_0_flush_vm_hub(struct amdgpu_device *adev, uint32_t vmid,
 	const unsigned eng = 17;
 	unsigned int i;
 
+	spin_lock(&adev->gmc.invalidate_lock);
+	/*
+	 * It may lose gpuvm invalidate acknowldege state across power-gating
+	 * off cycle, add semaphore acquire before invalidation and semaphore
+	 * release after invalidation to avoid entering power gated state
+	 * to WA the Issue
+	 */
+	if (vmhub == AMDGPU_MMHUB_0 ||
+	    vmhub == AMDGPU_MMHUB_1) {
+		for (i = 0; i < adev->usec_timeout; i++) {
+			/* a read return value of 1 means semaphore acuqire */
+			tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng);
+			if (tmp & 0x1)
+				break;
+			udelay(1);
+		}
+
+		if (i >= adev->usec_timeout)
+			DRM_ERROR("Timeout waiting for sem acquire in VM flush!\n");
+	}
+
 	WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
 
 	/*
@@ -253,6 +274,16 @@ static void gmc_v10_0_flush_vm_hub(struct amdgpu_device *adev, uint32_t vmid,
 		udelay(1);
 	}
 
+	/*
+	 * add semaphore release after invalidation,
+	 * write with 0 means semaphore release
+	 */
+	if (vmhub == AMDGPU_MMHUB_0 ||
+	    vmhub == AMDGPU_MMHUB_1)
+		WREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng, 0);
+
+	spin_unlock(&adev->gmc.invalidate_lock);
+
 	if (i < adev->usec_timeout)
 		return;
 
@@ -338,6 +369,21 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 	uint32_t req = gmc_v10_0_get_invalidate_req(vmid, 0);
 	unsigned eng = ring->vm_inv_eng;
 
+	/*
+	 * It may lose gpuvm invalidate acknowldege state across power-gating
+	 * off cycle, add semaphore acquire before invalidation and semaphore
+	 * release after invalidation to avoid entering power gated state
+	 * to WA the Issue
+	 */
+
+	/* a read return value of 1 means semaphore acuqire */
+	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
+	    ring->funcs->vmhub == AMDGPU_MMHUB_1) {
+		amdgpu_ring_emit_reg_wait(ring,
+					  hub->vm_inv_eng0_sem + eng, 0x1, 0x1);
+		DRM_WARN_ONCE("Adding semaphore may cause deadlock and it needs firmware fix\n");
+	}
+
 	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 + (2 * vmid),
 			      lower_32_bits(pd_addr));
 
@@ -348,6 +394,14 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 					    hub->vm_inv_eng0_ack + eng,
 					    req, 1 << vmid);
 
+	/*
+	 * add semaphore release after invalidation,
+	 * write with 0 means semaphore release
+	 */
+	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
+	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
+		amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_sem + eng, 0);
+
 	return pd_addr;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 1ae59af7836a..92b8e234a586 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -456,6 +456,27 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
 	}
 
 	spin_lock(&adev->gmc.invalidate_lock);
+
+	/*
+	 * It may lose gpuvm invalidate acknowldege state across power-gating
+	 * off cycle, add semaphore acquire before invalidation and semaphore
+	 * release after invalidation to avoid entering power gated state
+	 * to WA the Issue
+	 */
+	if (vmhub == AMDGPU_MMHUB_0 ||
+	    vmhub == AMDGPU_MMHUB_1) {
+		for (j = 0; j < adev->usec_timeout; j++) {
+			/* a read return value of 1 means semaphore acuqire */
+			tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng);
+			if (tmp & 0x1)
+				break;
+			udelay(1);
+		}
+
+		if (j >= adev->usec_timeout)
+			DRM_ERROR("Timeout waiting for sem acquire in VM flush!\n");
+	}
+
 	WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
 
 	/*
@@ -471,7 +492,17 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
 			break;
 		udelay(1);
 	}
+
+	/*
+	 * add semaphore release after invalidation,
+	 * write with 0 means semaphore release
+	 */
+	if (vmhub == AMDGPU_MMHUB_0 ||
+	    vmhub == AMDGPU_MMHUB_1)
+		WREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng, 0);
+
 	spin_unlock(&adev->gmc.invalidate_lock);
+
 	if (j < adev->usec_timeout)
 		return;
 
@@ -486,6 +517,21 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 	uint32_t req = gmc_v9_0_get_invalidate_req(vmid, 0);
 	unsigned eng = ring->vm_inv_eng;
 
+	/*
+	 * It may lose gpuvm invalidate acknowldege state across power-gating
+	 * off cycle, add semaphore acquire before invalidation and semaphore
+	 * release after invalidation to avoid entering power gated state
+	 * to WA the Issue
+	 */
+
+	/* a read return value of 1 means semaphore acuqire */
+	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
+	    ring->funcs->vmhub == AMDGPU_MMHUB_1) {
+		amdgpu_ring_emit_reg_wait(ring,
+					  hub->vm_inv_eng0_sem + eng, 0x1, 0x1);
+		DRM_WARN_ONCE("Adding semaphore may cause deadlock and it needs firmware fix\n");
+	}
+
 	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 + (2 * vmid),
 			      lower_32_bits(pd_addr));
 
@@ -496,6 +542,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 					    hub->vm_inv_eng0_ack + eng,
 					    req, 1 << vmid);
 
+	/*
+	 * add semaphore release after invalidation,
+	 * write with 0 means semaphore release
+	 */
+	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
+	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
+		amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_sem + eng, 0);
+
 	return pd_addr;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.h b/drivers/gpu/drm/amd/amdgpu/soc15.h
index 9af6c6ffbfa2..57af489a5de3 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.h
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.h
@@ -28,8 +28,8 @@
 #include "nbio_v7_0.h"
 #include "nbio_v7_4.h"
 
-#define SOC15_FLUSH_GPU_TLB_NUM_WREG		4
-#define SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT	1
+#define SOC15_FLUSH_GPU_TLB_NUM_WREG		6
+#define SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT	3
 
 extern const struct amd_ip_funcs soc15_common_ip_funcs;
 
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 2/2] drm/amdgpu: invalidate mmhub semphore workaround in gmc9/gmc10
@ 2019-11-20 10:22 ` Changfeng.Zhu
  0 siblings, 0 replies; 10+ messages in thread
From: Changfeng.Zhu @ 2019-11-20 10:22 UTC (permalink / raw)
  To: Christian.Koenig, Jack.Xiao, Tao.Zhou1, Ray.Huang, Xinmei.Huang, amd-gfx
  Cc: changzhu

From: changzhu <Changfeng.Zhu@amd.com>

It may lose gpuvm invalidate acknowldege state across power-gating off
cycle. To avoid this issue in gmc9/gmc10 invalidation, add semaphore acquire
before invalidation and semaphore release after invalidation.

After adding semaphore acquire before invalidation, the semaphore
register become read-only if another process try to acquire semaphore.
Then it will not be able to release this semaphore. Then it may cause
deadlock problem. If this deadlock problem happens, it needs a semaphore
firmware fix.

Change-Id: I9942a2f451265c1f1038ccfe2f70042c7c8118af
Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 54 ++++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 54 ++++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/soc15.h     |  4 +-
 3 files changed, 110 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index af2615ba52aa..ff80a62ca514 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -234,6 +234,27 @@ static void gmc_v10_0_flush_vm_hub(struct amdgpu_device *adev, uint32_t vmid,
 	const unsigned eng = 17;
 	unsigned int i;
 
+	spin_lock(&adev->gmc.invalidate_lock);
+	/*
+	 * It may lose gpuvm invalidate acknowldege state across power-gating
+	 * off cycle, add semaphore acquire before invalidation and semaphore
+	 * release after invalidation to avoid entering power gated state
+	 * to WA the Issue
+	 */
+	if (vmhub == AMDGPU_MMHUB_0 ||
+	    vmhub == AMDGPU_MMHUB_1) {
+		for (i = 0; i < adev->usec_timeout; i++) {
+			/* a read return value of 1 means semaphore acuqire */
+			tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng);
+			if (tmp & 0x1)
+				break;
+			udelay(1);
+		}
+
+		if (i >= adev->usec_timeout)
+			DRM_ERROR("Timeout waiting for sem acquire in VM flush!\n");
+	}
+
 	WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
 
 	/*
@@ -253,6 +274,16 @@ static void gmc_v10_0_flush_vm_hub(struct amdgpu_device *adev, uint32_t vmid,
 		udelay(1);
 	}
 
+	/*
+	 * add semaphore release after invalidation,
+	 * write with 0 means semaphore release
+	 */
+	if (vmhub == AMDGPU_MMHUB_0 ||
+	    vmhub == AMDGPU_MMHUB_1)
+		WREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng, 0);
+
+	spin_unlock(&adev->gmc.invalidate_lock);
+
 	if (i < adev->usec_timeout)
 		return;
 
@@ -338,6 +369,21 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 	uint32_t req = gmc_v10_0_get_invalidate_req(vmid, 0);
 	unsigned eng = ring->vm_inv_eng;
 
+	/*
+	 * It may lose gpuvm invalidate acknowldege state across power-gating
+	 * off cycle, add semaphore acquire before invalidation and semaphore
+	 * release after invalidation to avoid entering power gated state
+	 * to WA the Issue
+	 */
+
+	/* a read return value of 1 means semaphore acuqire */
+	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
+	    ring->funcs->vmhub == AMDGPU_MMHUB_1) {
+		amdgpu_ring_emit_reg_wait(ring,
+					  hub->vm_inv_eng0_sem + eng, 0x1, 0x1);
+		DRM_WARN_ONCE("Adding semaphore may cause deadlock and it needs firmware fix\n");
+	}
+
 	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 + (2 * vmid),
 			      lower_32_bits(pd_addr));
 
@@ -348,6 +394,14 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 					    hub->vm_inv_eng0_ack + eng,
 					    req, 1 << vmid);
 
+	/*
+	 * add semaphore release after invalidation,
+	 * write with 0 means semaphore release
+	 */
+	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
+	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
+		amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_sem + eng, 0);
+
 	return pd_addr;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 1ae59af7836a..92b8e234a586 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -456,6 +456,27 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
 	}
 
 	spin_lock(&adev->gmc.invalidate_lock);
+
+	/*
+	 * It may lose gpuvm invalidate acknowldege state across power-gating
+	 * off cycle, add semaphore acquire before invalidation and semaphore
+	 * release after invalidation to avoid entering power gated state
+	 * to WA the Issue
+	 */
+	if (vmhub == AMDGPU_MMHUB_0 ||
+	    vmhub == AMDGPU_MMHUB_1) {
+		for (j = 0; j < adev->usec_timeout; j++) {
+			/* a read return value of 1 means semaphore acuqire */
+			tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng);
+			if (tmp & 0x1)
+				break;
+			udelay(1);
+		}
+
+		if (j >= adev->usec_timeout)
+			DRM_ERROR("Timeout waiting for sem acquire in VM flush!\n");
+	}
+
 	WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
 
 	/*
@@ -471,7 +492,17 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
 			break;
 		udelay(1);
 	}
+
+	/*
+	 * add semaphore release after invalidation,
+	 * write with 0 means semaphore release
+	 */
+	if (vmhub == AMDGPU_MMHUB_0 ||
+	    vmhub == AMDGPU_MMHUB_1)
+		WREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng, 0);
+
 	spin_unlock(&adev->gmc.invalidate_lock);
+
 	if (j < adev->usec_timeout)
 		return;
 
@@ -486,6 +517,21 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 	uint32_t req = gmc_v9_0_get_invalidate_req(vmid, 0);
 	unsigned eng = ring->vm_inv_eng;
 
+	/*
+	 * It may lose gpuvm invalidate acknowldege state across power-gating
+	 * off cycle, add semaphore acquire before invalidation and semaphore
+	 * release after invalidation to avoid entering power gated state
+	 * to WA the Issue
+	 */
+
+	/* a read return value of 1 means semaphore acuqire */
+	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
+	    ring->funcs->vmhub == AMDGPU_MMHUB_1) {
+		amdgpu_ring_emit_reg_wait(ring,
+					  hub->vm_inv_eng0_sem + eng, 0x1, 0x1);
+		DRM_WARN_ONCE("Adding semaphore may cause deadlock and it needs firmware fix\n");
+	}
+
 	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 + (2 * vmid),
 			      lower_32_bits(pd_addr));
 
@@ -496,6 +542,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 					    hub->vm_inv_eng0_ack + eng,
 					    req, 1 << vmid);
 
+	/*
+	 * add semaphore release after invalidation,
+	 * write with 0 means semaphore release
+	 */
+	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
+	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
+		amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_sem + eng, 0);
+
 	return pd_addr;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.h b/drivers/gpu/drm/amd/amdgpu/soc15.h
index 9af6c6ffbfa2..57af489a5de3 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.h
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.h
@@ -28,8 +28,8 @@
 #include "nbio_v7_0.h"
 #include "nbio_v7_4.h"
 
-#define SOC15_FLUSH_GPU_TLB_NUM_WREG		4
-#define SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT	1
+#define SOC15_FLUSH_GPU_TLB_NUM_WREG		6
+#define SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT	3
 
 extern const struct amd_ip_funcs soc15_common_ip_funcs;
 
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2019-11-21 15:53 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-11-20  9:44 [PATCH 2/2] drm/amdgpu: invalidate mmhub semphore workaround in gmc9/gmc10 Changfeng.Zhu
2019-11-20  9:44 ` Changfeng.Zhu
     [not found] ` <20191120094413.28045-1-changfeng.zhu-5C7GfCeVMHo@public.gmane.org>
2019-11-20 11:26   ` Christian König
2019-11-20 11:26     ` Christian König
     [not found]     ` <26de4192-ebd8-b54d-6820-09cf6cda6822-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2019-11-21  7:16       ` Zhou1, Tao
2019-11-21  7:16         ` Zhou1, Tao
2019-11-21 15:53       ` Zhu, Changfeng
2019-11-21 15:53         ` Zhu, Changfeng
2019-11-20 10:22 Changfeng.Zhu
2019-11-20 10:22 ` Changfeng.Zhu

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.