* [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-24 21:16 ` Tuikov, Luben
  0 siblings, 0 replies; 20+ messages in thread
From: Tuikov, Luben @ 2019-10-24 21:16 UTC (permalink / raw)
  To: amd-gfx
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Tuikov, Luben,
	Koenig, Christian

The GRBM interface is now capable of bursting
1-cycle ops per register: a WRITE followed by
another WRITE, or a WRITE followed by a READ--much
faster than the previous multi-cycle
per-completed-transaction interface. This causes a
problem whereby status registers requiring a
read/write by hardware have a 1-cycle delay, due
to the register update having to go through the
GRBM interface.

This patch adds this delay.

A one-cycle read op is added after updating the
invalidate request and before reading the
invalidate-ACK status. Since the dummy read is an
extra REG_WAIT packet, the worst-case ring space
reserved for REG_WAIT in the VM-flush paths is
doubled.

See also commit
534991731cb5fa94b5519957646cf849ca10d17d.

Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 4 ++--
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 8 ++++++++
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
 5 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index ac43b1af69e3..0042868dbd53 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
 		5 + /* COND_EXEC */
 		7 + /* PIPELINE_SYNC */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
-		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
 		2 + /* VM_FLUSH */
 		8 + /* FENCE for VM_FLUSH */
 		20 + /* GDS switch */
@@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
 		5 + /* hdp invalidate */
 		7 + /* gfx_v10_0_ring_emit_pipeline_sync */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
-		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
 		2 + /* gfx_v10_0_ring_emit_vm_flush */
 		8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
 	.emit_ib_size =	7, /* gfx_v10_0_ring_emit_ib_compute */
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 9fe95e7693d5..9a7a717208de 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
 		5 +  /* COND_EXEC */
 		7 +  /* PIPELINE_SYNC */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
-		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
 		2 + /* VM_FLUSH */
 		8 +  /* FENCE for VM_FLUSH */
 		20 + /* GDS switch */
@@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
 		5 + /* hdp invalidate */
 		7 + /* gfx_v9_0_ring_emit_pipeline_sync */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
-		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
 		2 + /* gfx_v9_0_ring_emit_vm_flush */
 		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
 	.emit_ib_size =	7, /* gfx_v9_0_ring_emit_ib_compute */
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 6e1b25bd1fe7..100d526e9a42 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 
 	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
 
+	/* Insert a dummy read to delay one cycle before the ACK
+	 * inquiry.
+	 */
+	if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
+	    ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
+	    ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
+		amdgpu_ring_emit_reg_wait(ring,
+					  hub->vm_inv_eng0_req + eng, 0, 0);
+
 	/* wait for the invalidate to complete */
 	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
 				  1 << vmid, 1 << vmid);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 9f2a893871ec..8f3097e45299 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
 			      upper_32_bits(pd_addr));
 
+	/* Insert a dummy read to delay one cycle before the ACK
+	 * inquiry.
+	 */
+	if (ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
+	    ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
+		amdgpu_ring_emit_reg_wait(ring,
+					  hub->vm_inv_eng0_req + eng, 0, 0);
+
 	amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
 					    hub->vm_inv_eng0_ack + eng,
 					    req, 1 << vmid);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index b8fdb192f6d6..0c41b4fdc58b 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
 		6 + /* sdma_v5_0_ring_emit_pipeline_sync */
 		/* sdma_v5_0_ring_emit_vm_flush */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
-		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
 		10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
 	.emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
 	.emit_ib = sdma_v5_0_ring_emit_ib,
-- 
2.23.0.385.gbc12974a89

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* RE: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-25  3:20     ` Zhu, Changfeng
  0 siblings, 0 replies; 20+ messages in thread
From: Zhu, Changfeng @ 2019-10-25  3:20 UTC (permalink / raw)
  To: amd-gfx
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Tuikov, Luben,
	Koenig, Christian

Inline.


-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Tuikov, Luben
Sent: Friday, October 25, 2019 5:17 AM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Pelloux-prayer, Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com>; Tuikov, Luben <Luben.Tuikov@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
Subject: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay

The GRBM interface is now capable of bursting 1-cycle op per register, a WRITE followed by another WRITE, or a WRITE followed by a READ--much faster than previous multi-cycle per completed-transaction interface. This causes a problem, whereby status registers requiring a read/write by hardware, have a 1-cycle delay, due to the register update having to go through GRBM interface.

This patch adds this delay.

A one cycle read op is added after updating the invalidate request and before reading the invalidate-ACK status.

See also commit
534991731cb5fa94b5519957646cf849ca10d17d.

Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 4 ++--
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 8 ++++++++
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
 5 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index ac43b1af69e3..0042868dbd53 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
 		5 + /* COND_EXEC */
 		7 + /* PIPELINE_SYNC */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
-		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
 		2 + /* VM_FLUSH */
 		8 + /* FENCE for VM_FLUSH */
 		20 + /* GDS switch */
@@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
 		5 + /* hdp invalidate */
 		7 + /* gfx_v10_0_ring_emit_pipeline_sync */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
-		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
 		2 + /* gfx_v10_0_ring_emit_vm_flush */
 		8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
 	.emit_ib_size =	7, /* gfx_v10_0_ring_emit_ib_compute */
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 9fe95e7693d5..9a7a717208de 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
 		5 +  /* COND_EXEC */
 		7 +  /* PIPELINE_SYNC */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
-		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
 		2 + /* VM_FLUSH */
 		8 +  /* FENCE for VM_FLUSH */
 		20 + /* GDS switch */
@@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
 		5 + /* hdp invalidate */
 		7 + /* gfx_v9_0_ring_emit_pipeline_sync */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
-		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
 		2 + /* gfx_v9_0_ring_emit_vm_flush */
 		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
 	.emit_ib_size =	7, /* gfx_v9_0_ring_emit_ib_compute */
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 6e1b25bd1fe7..100d526e9a42 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 
 	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
 
+	/* Insert a dummy read to delay one cycle before the ACK
+	 * inquiry.
+	 */
+	if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
+	    ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
+	    ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
+		amdgpu_ring_emit_reg_wait(ring,
+					  hub->vm_inv_eng0_req + eng, 0, 0);
+
 	/* wait for the invalidate to complete */
 	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
 				  1 << vmid, 1 << vmid);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 9f2a893871ec..8f3097e45299 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
 			      upper_32_bits(pd_addr));
 
+	/* Insert a dummy read to delay one cycle before the ACK
+	 * inquiry.
+	 */
+	if (ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
+	    ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
+		amdgpu_ring_emit_reg_wait(ring,
+					  hub->vm_inv_eng0_req + eng, 0, 0);
+


	Why do we add amdgpu_ring_emit_reg_wait here? There is no amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req); before it, as there is in gmc10.
	In gmc9, amdgpu_ring_emit_wreg and amdgpu_ring_emit_reg_wait are called from amdgpu_ring_emit_reg_write_reg_wait.
	I think it may be more reasonable to add the dummy amdgpu_ring_emit_reg_wait in amdgpu_ring_emit_reg_write_reg_wait, as sketched below.
	Besides, we should also think about the influence of the SRIOV patch:
	drm/amdgpu: Remove the sriov checking and add firmware checking


 	amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
 					    hub->vm_inv_eng0_ack + eng,
 					    req, 1 << vmid);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index b8fdb192f6d6..0c41b4fdc58b 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
 		6 + /* sdma_v5_0_ring_emit_pipeline_sync */
 		/* sdma_v5_0_ring_emit_vm_flush */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
-		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
 		10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
 	.emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
 	.emit_ib = sdma_v5_0_ring_emit_ib,
--
2.23.0.385.gbc12974a89

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-25  6:49     ` Koenig, Christian
  0 siblings, 0 replies; 20+ messages in thread
From: Koenig, Christian @ 2019-10-25  6:49 UTC (permalink / raw)
  To: Tuikov, Luben, amd-gfx
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric

On 24.10.19 at 23:16, Tuikov, Luben wrote:
> The GRBM interface is now capable of bursting
> 1-cycle op per register, a WRITE followed by
> another WRITE, or a WRITE followed by a READ--much
> faster than previous muti-cycle per
> completed-transaction interface. This causes a
> problem, whereby status registers requiring a
> read/write by hardware, have a 1-cycle delay, due
> to the register update having to go through GRBM
> interface.
>
> This patch adds this delay.
>
> A one cycle read op is added after updating the
> invalidate request and before reading the
> invalidate-ACK status.

Please completely drop all changes for GFX9, since this patch will most
likely break SRIOV.

Additionally, please apply the workaround only to SDMA, since the
CP-driven engines should handle that in firmware.
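
For gmc_v10_0_emit_flush_gpu_tlb() that would mean narrowing the check to
the SDMA ring only, along these lines (rough sketch, not tested):

	/* per the above: CP-driven engines handle this in firmware,
	 * so only SDMA needs the dummy read
	 */
	if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA)
		amdgpu_ring_emit_reg_wait(ring,
					  hub->vm_inv_eng0_req + eng, 0, 0);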

Regards,
Christian.

>
> See also commit
> 534991731cb5fa94b5519957646cf849ca10d17d.
>
> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 4 ++--
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 8 ++++++++
>   drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
>   5 files changed, 22 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index ac43b1af69e3..0042868dbd53 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>   		5 + /* COND_EXEC */
>   		7 + /* PIPELINE_SYNC */
>   		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>   		2 + /* VM_FLUSH */
>   		8 + /* FENCE for VM_FLUSH */
>   		20 + /* GDS switch */
> @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>   		5 + /* hdp invalidate */
>   		7 + /* gfx_v10_0_ring_emit_pipeline_sync */
>   		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>   		2 + /* gfx_v10_0_ring_emit_vm_flush */
>   		8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
>   	.emit_ib_size =	7, /* gfx_v10_0_ring_emit_ib_compute */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 9fe95e7693d5..9a7a717208de 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>   		5 +  /* COND_EXEC */
>   		7 +  /* PIPELINE_SYNC */
>   		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>   		2 + /* VM_FLUSH */
>   		8 +  /* FENCE for VM_FLUSH */
>   		20 + /* GDS switch */
> @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>   		5 + /* hdp invalidate */
>   		7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>   		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>   		2 + /* gfx_v9_0_ring_emit_vm_flush */
>   		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
>   	.emit_ib_size =	7, /* gfx_v9_0_ring_emit_ib_compute */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 6e1b25bd1fe7..100d526e9a42 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>   
>   	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>   
> +	/* Insert a dummy read to delay one cycle before the ACK
> +	 * inquiry.
> +	 */
> +	if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
> +	    ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
> +	    ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
> +		amdgpu_ring_emit_reg_wait(ring,
> +					  hub->vm_inv_eng0_req + eng, 0, 0);
> +
>   	/* wait for the invalidate to complete */
>   	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>   				  1 << vmid, 1 << vmid);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 9f2a893871ec..8f3097e45299 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>   	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>   			      upper_32_bits(pd_addr));
>   
> +	/* Insert a dummy read to delay one cycle before the ACK
> +	 * inquiry.
> +	 */
> +	if (ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
> +	    ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
> +		amdgpu_ring_emit_reg_wait(ring,
> +					  hub->vm_inv_eng0_req + eng, 0, 0);
> +
>   	amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
>   					    hub->vm_inv_eng0_ack + eng,
>   					    req, 1 << vmid);
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> index b8fdb192f6d6..0c41b4fdc58b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>   		6 + /* sdma_v5_0_ring_emit_pipeline_sync */
>   		/* sdma_v5_0_ring_emit_vm_flush */
>   		SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
>   		10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
>   	.emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
>   	.emit_ib = sdma_v5_0_ring_emit_ib,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-25 16:05         ` Alex Deucher
  0 siblings, 0 replies; 20+ messages in thread
From: Alex Deucher @ 2019-10-25 16:05 UTC (permalink / raw)
  To: Koenig, Christian
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Tuikov, Luben,
	amd-gfx

On Fri, Oct 25, 2019 at 2:49 AM Koenig, Christian
<Christian.Koenig@amd.com> wrote:
>
> Am 24.10.19 um 23:16 schrieb Tuikov, Luben:
> > The GRBM interface is now capable of bursting
> > 1-cycle op per register, a WRITE followed by
> > another WRITE, or a WRITE followed by a READ--much
> > faster than previous muti-cycle per
> > completed-transaction interface. This causes a
> > problem, whereby status registers requiring a
> > read/write by hardware, have a 1-cycle delay, due
> > to the register update having to go through GRBM
> > interface.
> >
> > This patch adds this delay.
> >
> > A one cycle read op is added after updating the
> > invalidate request and before reading the
> > invalidate-ACK status.
>
> Please completely drop all changes for GFX9 since this patch will most
> likely break SRIOV.
>
> Additional to that please apply the workaround only to SDMA since the CP
> driven engines should handle that in firmware.

I think the CP only handles this in firmware if we use the new TLB
invalidation packet.  I don't think it applies it to general register
writes like we do.

Alex

>
> Regards,
> Christian.
>
> >
> > See also commit
> > 534991731cb5fa94b5519957646cf849ca10d17d.
> >
> > Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
> >   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 4 ++--
> >   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
> >   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 8 ++++++++
> >   drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
> >   5 files changed, 22 insertions(+), 5 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> > index ac43b1af69e3..0042868dbd53 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> > @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
> >               5 + /* COND_EXEC */
> >               7 + /* PIPELINE_SYNC */
> >               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> > -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> > +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
> >               2 + /* VM_FLUSH */
> >               8 + /* FENCE for VM_FLUSH */
> >               20 + /* GDS switch */
> > @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
> >               5 + /* hdp invalidate */
> >               7 + /* gfx_v10_0_ring_emit_pipeline_sync */
> >               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> > -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> > +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
> >               2 + /* gfx_v10_0_ring_emit_vm_flush */
> >               8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
> >       .emit_ib_size = 7, /* gfx_v10_0_ring_emit_ib_compute */
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > index 9fe95e7693d5..9a7a717208de 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
> >               5 +  /* COND_EXEC */
> >               7 +  /* PIPELINE_SYNC */
> >               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> > -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> > +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
> >               2 + /* VM_FLUSH */
> >               8 +  /* FENCE for VM_FLUSH */
> >               20 + /* GDS switch */
> > @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
> >               5 + /* hdp invalidate */
> >               7 + /* gfx_v9_0_ring_emit_pipeline_sync */
> >               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> > -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> > +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
> >               2 + /* gfx_v9_0_ring_emit_vm_flush */
> >               8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
> >       .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> > index 6e1b25bd1fe7..100d526e9a42 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> > @@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
> >
> >       amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
> >
> > +     /* Insert a dummy read to delay one cycle before the ACK
> > +      * inquiry.
> > +      */
> > +     if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
> > +         ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
> > +         ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
> > +             amdgpu_ring_emit_reg_wait(ring,
> > +                                       hub->vm_inv_eng0_req + eng, 0, 0);
> > +
> >       /* wait for the invalidate to complete */
> >       amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
> >                                 1 << vmid, 1 << vmid);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > index 9f2a893871ec..8f3097e45299 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
> >       amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
> >                             upper_32_bits(pd_addr));
> >
> > +     /* Insert a dummy read to delay one cycle before the ACK
> > +      * inquiry.
> > +      */
> > +     if (ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
> > +         ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
> > +             amdgpu_ring_emit_reg_wait(ring,
> > +                                       hub->vm_inv_eng0_req + eng, 0, 0);
> > +
> >       amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
> >                                           hub->vm_inv_eng0_ack + eng,
> >                                           req, 1 << vmid);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> > index b8fdb192f6d6..0c41b4fdc58b 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> > @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
> >               6 + /* sdma_v5_0_ring_emit_pipeline_sync */
> >               /* sdma_v5_0_ring_emit_vm_flush */
> >               SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
> > -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
> > +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
> >               10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
> >       .emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
> >       .emit_ib = sdma_v5_0_ring_emit_ib,
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-25 16:19             ` Koenig, Christian
  0 siblings, 0 replies; 20+ messages in thread
From: Koenig, Christian @ 2019-10-25 16:19 UTC (permalink / raw)
  To: Alex Deucher
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Tuikov, Luben,
	amd-gfx

On 25.10.19 at 18:05, Alex Deucher wrote:
> On Fri, Oct 25, 2019 at 2:49 AM Koenig, Christian
> <Christian.Koenig@amd.com> wrote:
>> Am 24.10.19 um 23:16 schrieb Tuikov, Luben:
>>> The GRBM interface is now capable of bursting
>>> 1-cycle op per register, a WRITE followed by
>>> another WRITE, or a WRITE followed by a READ--much
>>> faster than previous muti-cycle per
>>> completed-transaction interface. This causes a
>>> problem, whereby status registers requiring a
>>> read/write by hardware, have a 1-cycle delay, due
>>> to the register update having to go through GRBM
>>> interface.
>>>
>>> This patch adds this delay.
>>>
>>> A one cycle read op is added after updating the
>>> invalidate request and before reading the
>>> invalidate-ACK status.
>> Please completely drop all changes for GFX9 since this patch will most
>> likely break SRIOV.
>>
>> Additional to that please apply the workaround only to SDMA since the CP
>> driven engines should handle that in firmware.
> I think the CP only handles this in firmware if we use the new TLB
> invalidation packet.  I don't think it applies it to general register
> writes like we do.

No, on the CP we should use the combined write/wait command even if we 
don't use the new specialized VM invalidate command. Everything else 
won't work with SRIOV.

Even if we wanted to, we can't insert an extra read into this combined
write/wait command. And if we split up the commands, we would break SRIOV
once more.

So applying this workaround to the CP code doesn't make any sense at all.

The only TODO I can see is that we maybe don't use the combined
write/wait command on Navi yet.
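
If that is the case, the Navi fix would presumably mirror what gmc_v9_0
already does (rough, untested sketch): replace the separate wreg +
reg_wait pair in gmc_v10_0_emit_flush_gpu_tlb() with the combined call
and drop the dummy read there:

	/* emit the invalidate request and the ACK wait as one combined
	 * operation, which the CP/firmware (and SRIOV) can handle atomically
	 */
	amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
					    hub->vm_inv_eng0_ack + eng,
					    req, 1 << vmid);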

Christian.

>
> Alex
>
>> Regards,
>> Christian.
>>
>>> See also commit
>>> 534991731cb5fa94b5519957646cf849ca10d17d.
>>>
>>> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 4 ++--
>>>    drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
>>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 8 ++++++++
>>>    drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
>>>    5 files changed, 22 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> index ac43b1af69e3..0042868dbd53 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>>>                5 + /* COND_EXEC */
>>>                7 + /* PIPELINE_SYNC */
>>>                SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>                2 + /* VM_FLUSH */
>>>                8 + /* FENCE for VM_FLUSH */
>>>                20 + /* GDS switch */
>>> @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>>>                5 + /* hdp invalidate */
>>>                7 + /* gfx_v10_0_ring_emit_pipeline_sync */
>>>                SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>                2 + /* gfx_v10_0_ring_emit_vm_flush */
>>>                8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
>>>        .emit_ib_size = 7, /* gfx_v10_0_ring_emit_ib_compute */
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> index 9fe95e7693d5..9a7a717208de 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>>>                5 +  /* COND_EXEC */
>>>                7 +  /* PIPELINE_SYNC */
>>>                SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>                2 + /* VM_FLUSH */
>>>                8 +  /* FENCE for VM_FLUSH */
>>>                20 + /* GDS switch */
>>> @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>>>                5 + /* hdp invalidate */
>>>                7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>>>                SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>                2 + /* gfx_v9_0_ring_emit_vm_flush */
>>>                8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
>>>        .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> index 6e1b25bd1fe7..100d526e9a42 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> @@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>
>>>        amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>>>
>>> +     /* Insert a dummy read to delay one cycle before the ACK
>>> +      * inquiry.
>>> +      */
>>> +     if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
>>> +         ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
>>> +         ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>> +             amdgpu_ring_emit_reg_wait(ring,
>>> +                                       hub->vm_inv_eng0_req + eng, 0, 0);
>>> +
>>>        /* wait for the invalidate to complete */
>>>        amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>>>                                  1 << vmid, 1 << vmid);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> index 9f2a893871ec..8f3097e45299 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>        amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>>>                              upper_32_bits(pd_addr));
>>>
>>> +     /* Insert a dummy read to delay one cycle before the ACK
>>> +      * inquiry.
>>> +      */
>>> +     if (ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
>>> +         ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>> +             amdgpu_ring_emit_reg_wait(ring,
>>> +                                       hub->vm_inv_eng0_req + eng, 0, 0);
>>> +
>>>        amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
>>>                                            hub->vm_inv_eng0_ack + eng,
>>>                                            req, 1 << vmid);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>> index b8fdb192f6d6..0c41b4fdc58b 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>> @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>>>                6 + /* sdma_v5_0_ring_emit_pipeline_sync */
>>>                /* sdma_v5_0_ring_emit_vm_flush */
>>>                SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
>>>                10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
>>>        .emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
>>>        .emit_ib = sdma_v5_0_ring_emit_ib,
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-25 22:45                 ` Tuikov, Luben
  0 siblings, 0 replies; 20+ messages in thread
From: Tuikov, Luben @ 2019-10-25 22:45 UTC (permalink / raw)
  To: Koenig, Christian, Alex Deucher
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On 2019-10-25 12:19 p.m., Koenig, Christian wrote:
> Am 25.10.19 um 18:05 schrieb Alex Deucher:
>> On Fri, Oct 25, 2019 at 2:49 AM Koenig, Christian
>> <Christian.Koenig@amd.com> wrote:
>>> Am 24.10.19 um 23:16 schrieb Tuikov, Luben:
>>>> The GRBM interface is now capable of bursting
>>>> 1-cycle op per register, a WRITE followed by
>>>> another WRITE, or a WRITE followed by a READ--much
>>>> faster than previous muti-cycle per
>>>> completed-transaction interface. This causes a
>>>> problem, whereby status registers requiring a
>>>> read/write by hardware, have a 1-cycle delay, due
>>>> to the register update having to go through GRBM
>>>> interface.
>>>>
>>>> This patch adds this delay.
>>>>
>>>> A one cycle read op is added after updating the
>>>> invalidate request and before reading the
>>>> invalidate-ACK status.
>>> Please completely drop all changes for GFX9 since this patch will most
>>> likely break SRIOV.
>>>
>>> Additional to that please apply the workaround only to SDMA since the CP
>>> driven engines should handle that in firmware.

Thank you, Christian, for reviewing this patch.

This patch stirred quite a bit of noise. So I'll go by your last comment
above--I take it that is the desired way to go forward?

Regards,
Luben


>> I think the CP only handles this in firmware if we use the new TLB
>> invalidation packet.  I don't think it applies it to general register
>> writes like we do.
> 
> No, on the CP we should use the combined write/wait command even if we 
> don't use the new specialized VM invalidate command. Everything else 
> won't work with SRIOV.
> 
> Even if we want to we can't insert an extra read in this combined 
> write/wait command. And if we split up the commands we would break SRIOV 
> once more.
> 
> So applying this workaround to the CP code doesn't make any sense at all.
> 
> The only TODO which I can see is that we maybe don't use the combined 
> write/wait command on Navi yet.
> 
> Christian.
> 
>>
>> Alex
>>
>>> Regards,
>>> Christian.
>>>
>>>> See also commit
>>>> 534991731cb5fa94b5519957646cf849ca10d17d.
>>>>
>>>> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
>>>> ---
>>>>    drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
>>>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 4 ++--
>>>>    drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
>>>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 8 ++++++++
>>>>    drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
>>>>    5 files changed, 22 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> index ac43b1af69e3..0042868dbd53 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>>>>                5 + /* COND_EXEC */
>>>>                7 + /* PIPELINE_SYNC */
>>>>                SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>                2 + /* VM_FLUSH */
>>>>                8 + /* FENCE for VM_FLUSH */
>>>>                20 + /* GDS switch */
>>>> @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>>>>                5 + /* hdp invalidate */
>>>>                7 + /* gfx_v10_0_ring_emit_pipeline_sync */
>>>>                SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>                2 + /* gfx_v10_0_ring_emit_vm_flush */
>>>>                8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
>>>>        .emit_ib_size = 7, /* gfx_v10_0_ring_emit_ib_compute */
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> index 9fe95e7693d5..9a7a717208de 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>>>>                5 +  /* COND_EXEC */
>>>>                7 +  /* PIPELINE_SYNC */
>>>>                SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>                2 + /* VM_FLUSH */
>>>>                8 +  /* FENCE for VM_FLUSH */
>>>>                20 + /* GDS switch */
>>>> @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>>>>                5 + /* hdp invalidate */
>>>>                7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>>>>                SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>                2 + /* gfx_v9_0_ring_emit_vm_flush */
>>>>                8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
>>>>        .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>> index 6e1b25bd1fe7..100d526e9a42 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>> @@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>>
>>>>        amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>>>>
>>>> +     /* Insert a dummy read to delay one cycle before the ACK
>>>> +      * inquiry.
>>>> +      */
>>>> +     if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
>>>> +         ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
>>>> +         ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>>> +             amdgpu_ring_emit_reg_wait(ring,
>>>> +                                       hub->vm_inv_eng0_req + eng, 0, 0);
>>>> +
>>>>        /* wait for the invalidate to complete */
>>>>        amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>>>>                                  1 << vmid, 1 << vmid);
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> index 9f2a893871ec..8f3097e45299 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>>        amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>>>>                              upper_32_bits(pd_addr));
>>>>
>>>> +     /* Insert a dummy read to delay one cycle before the ACK
>>>> +      * inquiry.
>>>> +      */
>>>> +     if (ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
>>>> +         ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>>> +             amdgpu_ring_emit_reg_wait(ring,
>>>> +                                       hub->vm_inv_eng0_req + eng, 0, 0);
>>>> +
>>>>        amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
>>>>                                            hub->vm_inv_eng0_ack + eng,
>>>>                                            req, 1 << vmid);
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>> index b8fdb192f6d6..0c41b4fdc58b 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>> @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>>>>                6 + /* sdma_v5_0_ring_emit_pipeline_sync */
>>>>                /* sdma_v5_0_ring_emit_vm_flush */
>>>>                SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
>>>>                10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
>>>>        .emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
>>>>        .emit_ib = sdma_v5_0_ring_emit_ib,
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> 

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH] drm/amdgpu: GFX10: GRBM requires 1-cycle delay (v2)
@ 2019-10-25 22:59                 ` Tuikov, Luben
  0 siblings, 0 replies; 20+ messages in thread
From: Tuikov, Luben @ 2019-10-25 22:59 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Deucher, Alexander, Tuikov, Luben, Koenig, Christian

The GRBM interface is now capable of bursting
1-cycle ops, one per register: a WRITE followed
by another WRITE, or a WRITE followed by a
READ--much faster than the previous multi-cycle
per completed-transaction interface. This causes
a problem whereby status registers requiring a
read/write by hardware have a 1-cycle delay, due
to the register update having to go through the
GRBM interface.

This patch adds this delay.

A one-cycle read op is added after updating the
invalidate request and before reading the
invalidate-ACK status.

See also commit
534991731cb5fa94b5519957646cf849ca10d17d.

v2: Remove GFX9 and apply only to SDMA ring.

Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 7 +++++++
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 6e1b25bd1fe7..dedd7e1ab2fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -346,6 +346,13 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 
 	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
 
+	/* Insert a dummy read to delay one cycle after the write REQ,
+	 * and before the ACK inquiry.
+	 */
+	if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA)
+		amdgpu_ring_emit_reg_wait(ring,
+					  hub->vm_inv_eng0_req + eng, 0, 0);
+
 	/* wait for the invalidate to complete */
 	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
 				  1 << vmid, 1 << vmid);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index b8fdb192f6d6..0c41b4fdc58b 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
 		6 + /* sdma_v5_0_ring_emit_pipeline_sync */
 		/* sdma_v5_0_ring_emit_vm_flush */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
-		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
 		10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
 	.emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
 	.emit_ib = sdma_v5_0_ring_emit_ib,
-- 
2.23.0.385.gbc12974a89
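
For readers skimming the archive, the gmc_v10_0 flush sequence after this v2
patch looks roughly as follows when reassembled from the hunk above; the
comments are explanatory additions, not part of the change:

	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);

	/* Dummy read: a reg-wait on the REQ register with mask 0 matches
	 * immediately, but still forces one read cycle through the GRBM
	 * interface before the ACK register is sampled.
	 */
	if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA)
		amdgpu_ring_emit_reg_wait(ring,
					  hub->vm_inv_eng0_req + eng, 0, 0);

	/* Wait for the real invalidate ACK. */
	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
				  1 << vmid, 1 << vmid);

The doubled SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 term in the sdma_v5_0
hunk simply reserves ring space for that extra reg-wait packet on each flush.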

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-26 12:09                     ` Koenig, Christian
  0 siblings, 0 replies; 20+ messages in thread
From: Koenig, Christian @ 2019-10-26 12:09 UTC (permalink / raw)
  To: Tuikov, Luben, Alex Deucher
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Am 26.10.19 um 00:45 schrieb Tuikov, Luben:
> On 2019-10-25 12:19 p.m., Koenig, Christian wrote:
>> Am 25.10.19 um 18:05 schrieb Alex Deucher:
>>> On Fri, Oct 25, 2019 at 2:49 AM Koenig, Christian
>>> <Christian.Koenig@amd.com> wrote:
>>>> Am 24.10.19 um 23:16 schrieb Tuikov, Luben:
>>>>> The GRBM interface is now capable of bursting
>>>>> 1-cycle op per register, a WRITE followed by
>>>>> another WRITE, or a WRITE followed by a READ--much
>>>>> faster than previous muti-cycle per
>>>>> completed-transaction interface. This causes a
>>>>> problem, whereby status registers requiring a
>>>>> read/write by hardware, have a 1-cycle delay, due
>>>>> to the register update having to go through GRBM
>>>>> interface.
>>>>>
>>>>> This patch adds this delay.
>>>>>
>>>>> A one cycle read op is added after updating the
>>>>> invalidate request and before reading the
>>>>> invalidate-ACK status.
>>>> Please completely drop all changes for GFX9 since this patch will most
>>>> likely break SRIOV.
>>>>
>>>> Additional to that please apply the workaround only to SDMA since the CP
>>>> driven engines should handle that in firmware.
> Thank you Christian for reviewing this patch.
>
> This patch stirred quite a bit of noise. So, then, I'll go by
> your last comment above--I suppose this is the desired way to go forward then?

You most likely broke the SRIOV use case on GFX9 with that; no wonder
this raised eyebrows.

As far as I can see, this manual workaround is only applicable to the
SDMA on Navi.

But we should double-check that the CP firmware interface with the
combined write/wait command is correctly used on Navi/GFX10 as well.
IIRC that came in rather late for GFX9; it could be that the Navi
bringup branch never had it.

Regards,
Christian.
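
If Navi indeed does not use the combined packet yet, the TODO above would
roughly mean switching the CP rings in gmc_v10_0_emit_flush_gpu_tlb() to the
same helper the gmc_v9_0 hunk already uses--a sketch of the assumed direction,
not an existing change:

	if (ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
	    ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
		/* one write+wait packet, handled by CP firmware and safe
		 * under SRIOV; it also makes a manual dummy read unnecessary
		 * on these rings
		 */
		amdgpu_ring_emit_reg_write_reg_wait(ring,
						    hub->vm_inv_eng0_req + eng,
						    hub->vm_inv_eng0_ack + eng,
						    req, 1 << vmid);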

>
> Regards,
> Luben
>
>
>>> I think the CP only handles this in firmware if we use the new TLB
>>> invalidation packet.  I don't think it applies it to general register
>>> writes like we do.
>> No, on the CP we should use the combined write/wait command even if we
>> don't use the new specialized VM invalidate command. Everything else
>> won't work with SRIOV.
>>
>> Even if we want to we can't insert an extra read in this combined
>> write/wait command. And if we split up the commands we would break SRIOV
>> once more.
>>
>> So applying this workaround to the CP code doesn't make any sense at all.
>>
>> The only TODO which I can see is that we maybe don't use the combined
>> write/wait command on Navi yet.
>>
>> Christian.
>>
>>> Alex
>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>> See also commit
>>>>> 534991731cb5fa94b5519957646cf849ca10d17d.
>>>>>
>>>>> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
>>>>> ---
>>>>>     drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
>>>>>     drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 4 ++--
>>>>>     drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
>>>>>     drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 8 ++++++++
>>>>>     drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
>>>>>     5 files changed, 22 insertions(+), 5 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>> index ac43b1af69e3..0042868dbd53 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>> @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>>>>>                 5 + /* COND_EXEC */
>>>>>                 7 + /* PIPELINE_SYNC */
>>>>>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>>                 2 + /* VM_FLUSH */
>>>>>                 8 + /* FENCE for VM_FLUSH */
>>>>>                 20 + /* GDS switch */
>>>>> @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>>>>>                 5 + /* hdp invalidate */
>>>>>                 7 + /* gfx_v10_0_ring_emit_pipeline_sync */
>>>>>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>>                 2 + /* gfx_v10_0_ring_emit_vm_flush */
>>>>>                 8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
>>>>>         .emit_ib_size = 7, /* gfx_v10_0_ring_emit_ib_compute */
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>> index 9fe95e7693d5..9a7a717208de 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>> @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>>>>>                 5 +  /* COND_EXEC */
>>>>>                 7 +  /* PIPELINE_SYNC */
>>>>>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>>                 2 + /* VM_FLUSH */
>>>>>                 8 +  /* FENCE for VM_FLUSH */
>>>>>                 20 + /* GDS switch */
>>>>> @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>>>>>                 5 + /* hdp invalidate */
>>>>>                 7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>>>>>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>>                 2 + /* gfx_v9_0_ring_emit_vm_flush */
>>>>>                 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
>>>>>         .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>> index 6e1b25bd1fe7..100d526e9a42 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>> @@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>>>
>>>>>         amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>>>>>
>>>>> +     /* Insert a dummy read to delay one cycle before the ACK
>>>>> +      * inquiry.
>>>>> +      */
>>>>> +     if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
>>>>> +         ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
>>>>> +         ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>>>> +             amdgpu_ring_emit_reg_wait(ring,
>>>>> +                                       hub->vm_inv_eng0_req + eng, 0, 0);
>>>>> +
>>>>>         /* wait for the invalidate to complete */
>>>>>         amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>>>>>                                   1 << vmid, 1 << vmid);
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>> index 9f2a893871ec..8f3097e45299 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>> @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>>>         amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>>>>>                               upper_32_bits(pd_addr));
>>>>>
>>>>> +     /* Insert a dummy read to delay one cycle before the ACK
>>>>> +      * inquiry.
>>>>> +      */
>>>>> +     if (ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
>>>>> +         ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>>>> +             amdgpu_ring_emit_reg_wait(ring,
>>>>> +                                       hub->vm_inv_eng0_req + eng, 0, 0);
>>>>> +
>>>>>         amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
>>>>>                                             hub->vm_inv_eng0_ack + eng,
>>>>>                                             req, 1 << vmid);
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>>> index b8fdb192f6d6..0c41b4fdc58b 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>>> @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>>>>>                 6 + /* sdma_v5_0_ring_emit_pipeline_sync */
>>>>>                 /* sdma_v5_0_ring_emit_vm_flush */
>>>>>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
>>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
>>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
>>>>>                 10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
>>>>>         .emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
>>>>>         .emit_ib = sdma_v5_0_ring_emit_ib,
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-27 21:25                         ` Tuikov, Luben
  0 siblings, 0 replies; 20+ messages in thread
From: Tuikov, Luben @ 2019-10-27 21:25 UTC (permalink / raw)
  To: Koenig, Christian, Alex Deucher
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On 2019-10-26 08:09, Koenig, Christian wrote:
> On 26.10.19 at 00:45, Tuikov, Luben wrote:
>> On 2019-10-25 12:19 p.m., Koenig, Christian wrote:
>>> On 25.10.19 at 18:05, Alex Deucher wrote:
>>>> On Fri, Oct 25, 2019 at 2:49 AM Koenig, Christian
>>>> <Christian.Koenig@amd.com> wrote:
>>>>> On 24.10.19 at 23:16, Tuikov, Luben wrote:
>>>>>> The GRBM interface is now capable of bursting
>>>>>> 1-cycle op per register, a WRITE followed by
>>>>>> another WRITE, or a WRITE followed by a READ--much
>>>>>> faster than the previous multi-cycle per
>>>>>> completed-transaction interface. This causes a
>>>>>> problem, whereby status registers requiring a
>>>>>> read/write by hardware have a 1-cycle delay, due
>>>>>> to the register update having to go through GRBM
>>>>>> interface.
>>>>>>
>>>>>> This patch adds this delay.
>>>>>>
>>>>>> A one cycle read op is added after updating the
>>>>>> invalidate request and before reading the
>>>>>> invalidate-ACK status.
>>>>> Please completely drop all changes for GFX9 since this patch will most
>>>>> likely break SRIOV.
>>>>>
>>>>> In addition to that, please apply the workaround only to SDMA, since the
>>>>> CP-driven engines should handle that in firmware.
>> Thank you Christian for reviewing this patch.
>>
>> This patch stirred up quite a bit of noise. So I'll go by
>> your last comment above--I suppose this is the desired way to go forward then?
> 
> You most likely broke the SRIOV use case on GFX9 with that; no wonder
> this raised eyebrows.
> 
> As far as I can see, this manual workaround is only applicable to the
> SDMA on Navi.

Did you see the (v2) patch?

Regards,
Luben

> 
> But we should double-check that the CP firmware interface with the
> combined write/wait command is correctly used on Navi/GFX10 as well.
> IIRC that came in rather late for GFX9; it could be that the Navi bring-up
> branch never had it.
> 
> Regards,
> Christian.
> 
>>
>> Regards,
>> Luben
>>
>>
>>>> I think the CP only handles this in firmware if we use the new TLB
>>>> invalidation packet.  I don't think it applies it to general register
>>>> writes like we do.
>>> No, on the CP we should use the combined write/wait command even if we
>>> don't use the new specialized VM invalidate command. Everything else
>>> won't work with SRIOV.
>>>
>>> Even if we wanted to, we couldn't insert an extra read into this combined
>>> write/wait command. And if we split up the commands, we would break SRIOV
>>> once more.
>>>
>>> So applying this workaround to the CP code doesn't make any sense at all.
>>>
>>> The only TODO which I can see is that we maybe don't use the combined
>>> write/wait command on Navi yet.
>>>
>>> Christian.
>>>
>>>> Alex
>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>>> See also commit
>>>>>> 534991731cb5fa94b5519957646cf849ca10d17d.
>>>>>>
>>>>>> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
>>>>>> ---
>>>>>>     drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
>>>>>>     drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 4 ++--
>>>>>>     drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
>>>>>>     drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 8 ++++++++
>>>>>>     drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
>>>>>>     5 files changed, 22 insertions(+), 5 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>>> index ac43b1af69e3..0042868dbd53 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>>> @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>>>>>>                 5 + /* COND_EXEC */
>>>>>>                 7 + /* PIPELINE_SYNC */
>>>>>>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>>>                 2 + /* VM_FLUSH */
>>>>>>                 8 + /* FENCE for VM_FLUSH */
>>>>>>                 20 + /* GDS switch */
>>>>>> @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>>>>>>                 5 + /* hdp invalidate */
>>>>>>                 7 + /* gfx_v10_0_ring_emit_pipeline_sync */
>>>>>>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>>>                 2 + /* gfx_v10_0_ring_emit_vm_flush */
>>>>>>                 8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
>>>>>>         .emit_ib_size = 7, /* gfx_v10_0_ring_emit_ib_compute */
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>>> index 9fe95e7693d5..9a7a717208de 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>>> @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>>>>>>                 5 +  /* COND_EXEC */
>>>>>>                 7 +  /* PIPELINE_SYNC */
>>>>>>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>>>                 2 + /* VM_FLUSH */
>>>>>>                 8 +  /* FENCE for VM_FLUSH */
>>>>>>                 20 + /* GDS switch */
>>>>>> @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>>>>>>                 5 + /* hdp invalidate */
>>>>>>                 7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>>>>>>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>>>                 2 + /* gfx_v9_0_ring_emit_vm_flush */
>>>>>>                 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
>>>>>>         .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>>> index 6e1b25bd1fe7..100d526e9a42 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>>> @@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>>>>
>>>>>>         amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>>>>>>
>>>>>> +     /* Insert a dummy read to delay one cycle before the ACK
>>>>>> +      * inquiry.
>>>>>> +      */
>>>>>> +     if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
>>>>>> +         ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
>>>>>> +         ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>>>>> +             amdgpu_ring_emit_reg_wait(ring,
>>>>>> +                                       hub->vm_inv_eng0_req + eng, 0, 0);
>>>>>> +
>>>>>>         /* wait for the invalidate to complete */
>>>>>>         amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>>>>>>                                   1 << vmid, 1 << vmid);
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>> index 9f2a893871ec..8f3097e45299 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>> @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>>>>         amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>>>>>>                               upper_32_bits(pd_addr));
>>>>>>
>>>>>> +     /* Insert a dummy read to delay one cycle before the ACK
>>>>>> +      * inquiry.
>>>>>> +      */
>>>>>> +     if (ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
>>>>>> +         ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>>>>> +             amdgpu_ring_emit_reg_wait(ring,
>>>>>> +                                       hub->vm_inv_eng0_req + eng, 0, 0);
>>>>>> +
>>>>>>         amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
>>>>>>                                             hub->vm_inv_eng0_ack + eng,
>>>>>>                                             req, 1 << vmid);
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>>>> index b8fdb192f6d6..0c41b4fdc58b 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>>>> @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>>>>>>                 6 + /* sdma_v5_0_ring_emit_pipeline_sync */
>>>>>>                 /* sdma_v5_0_ring_emit_vm_flush */
>>>>>>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
>>>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
>>>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
>>>>>>                 10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
>>>>>>         .emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
>>>>>>         .emit_ib = sdma_v5_0_ring_emit_ib,
> 

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread
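The SRIOV constraint discussed above comes down to which ring helper emits
the wait. A rough sketch of the two shapes being contrasted; the two amdgpu
helpers are real (they are the ones used in the quoted diffs), but the
wrapper functions and their names are hypothetical and exist only to show
the difference:

/* Separate packets: there is room to slip a dummy read in between. */
static void flush_with_separate_packets(struct amdgpu_ring *ring,
					uint32_t req_reg, uint32_t ack_reg,
					uint32_t req, unsigned vmid)
{
	amdgpu_ring_emit_wreg(ring, req_reg, req);
	amdgpu_ring_emit_reg_wait(ring, req_reg, 0, 0);	/* 1-cycle delay */
	amdgpu_ring_emit_reg_wait(ring, ack_reg, 1 << vmid, 1 << vmid);
}

/* Combined packet: the single write-then-wait command that the CP firmware
 * handles (and that SRIOV relies on) has no slot for an extra read, which
 * is why the workaround cannot be applied to the CP-driven rings.
 */
static void flush_with_combined_packet(struct amdgpu_ring *ring,
				       uint32_t req_reg, uint32_t ack_reg,
				       uint32_t req, unsigned vmid)
{
	amdgpu_ring_emit_reg_write_reg_wait(ring, req_reg, ack_reg,
					    req, 1 << vmid);
}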

* Re: [PATCH] drm/amdgpu: GFX10: GRBM requires 1-cycle delay (v2)
@ 2019-10-28 10:51                     ` Koenig, Christian
  0 siblings, 0 replies; 20+ messages in thread
From: Koenig, Christian @ 2019-10-28 10:51 UTC (permalink / raw)
  To: Tuikov, Luben, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Deucher, Alexander

On 26.10.19 at 00:59, Tuikov, Luben wrote:
> The GRBM interface is now capable of bursting
> 1-cycle op per register, a WRITE followed by
> another WRITE, or a WRITE followed by a READ--much
> faster than the previous multi-cycle per
> completed-transaction interface. This causes a
> problem, whereby status registers requiring a
> read/write by hardware have a 1-cycle delay, due
> to the register update having to go through GRBM
> interface.
>
> This patch adds this delay.
>
> A one cycle read op is added after updating the
> invalidate request and before reading the
> invalidate-ACK status.
>
> See also commit
> 534991731cb5fa94b5519957646cf849ca10d17d.
>
> v2: Remove GFX9 and apply only to SDMA ring.
>
> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 7 +++++++
>   drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
>   2 files changed, 8 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 6e1b25bd1fe7..dedd7e1ab2fb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -346,6 +346,13 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>   
>   	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>   
> +	/* Insert a dummy read to delay one cycle after the write REQ,
> +	 * and before the ACK inquiry.
> +	 */
> +	if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA)
> +		amdgpu_ring_emit_reg_wait(ring,
> +					  hub->vm_inv_eng0_req + eng, 0, 0);
> +
>   	/* wait for the invalidate to complete */
>   	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>   				  1 << vmid, 1 << vmid);

Looks like we indeed don't use the integrated write/wait CP command on
Navi.

Anyway, that is a completely separate issue; this patch is Reviewed-by:
Christian König <christian.koenig@amd.com>

Regards,
Christian.

> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> index b8fdb192f6d6..0c41b4fdc58b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>   		6 + /* sdma_v5_0_ring_emit_pipeline_sync */
>   		/* sdma_v5_0_ring_emit_vm_flush */
>   		SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
>   		10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
>   	.emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
>   	.emit_ib = sdma_v5_0_ring_emit_ib,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread
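A note on the arithmetic behind the sdma_v5_0 hunk of the reviewed v2 patch:
a register wait on the SDMA ring is budgeted at 6 dwords in the ring's
frame-size entry, and with the dummy read each VM flush may now emit up to
two such waits per invalidation engine, the read-back of the REQ register
followed by the ACK poll. The worst-case REG_WAIT contribution therefore
doubles, from SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 to
SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2; the 6-dword figure is the
multiplier already present in the existing entry, and only the factor of
two is new.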

end of thread, other threads:[~2019-10-28 10:51 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-10-24 21:16 [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay Tuikov, Luben
2019-10-24 21:16 ` Tuikov, Luben
     [not found] ` <20191024211430.25399-1-luben.tuikov-5C7GfCeVMHo@public.gmane.org>
2019-10-25  3:20   ` Zhu, Changfeng
2019-10-25  3:20     ` Zhu, Changfeng
2019-10-25  6:49   ` Koenig, Christian
2019-10-25  6:49     ` Koenig, Christian
     [not found]     ` <6be2805a-dddc-7b02-84ea-f52fab9780b0-5C7GfCeVMHo@public.gmane.org>
2019-10-25 16:05       ` Alex Deucher
2019-10-25 16:05         ` Alex Deucher
     [not found]         ` <CADnq5_NsTABDWTMBFcQBGfaBganBpzN+YQ0gmw55pa8PswNZYA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2019-10-25 16:19           ` Koenig, Christian
2019-10-25 16:19             ` Koenig, Christian
     [not found]             ` <b40c78f1-17a5-f0f9-183e-0c78fd7163e9-5C7GfCeVMHo@public.gmane.org>
2019-10-25 22:45               ` Tuikov, Luben
2019-10-25 22:45                 ` Tuikov, Luben
     [not found]                 ` <c3e496c7-2ace-149e-0c51-92dd1342d31d-5C7GfCeVMHo@public.gmane.org>
2019-10-26 12:09                   ` Koenig, Christian
2019-10-26 12:09                     ` Koenig, Christian
     [not found]                     ` <122f3bde-5fd0-1fa5-864c-547c0cefb744-5C7GfCeVMHo@public.gmane.org>
2019-10-27 21:25                       ` Tuikov, Luben
2019-10-27 21:25                         ` Tuikov, Luben
2019-10-25 22:59               ` [PATCH] drm/amdgpu: GFX10: GRBM requires 1-cycle delay (v2) Tuikov, Luben
2019-10-25 22:59                 ` Tuikov, Luben
     [not found]                 ` <20191025225930.15074-1-luben.tuikov-5C7GfCeVMHo@public.gmane.org>
2019-10-28 10:51                   ` Koenig, Christian
2019-10-28 10:51                     ` Koenig, Christian
