* [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-24 21:16 ` Tuikov, Luben
0 siblings, 0 replies; 20+ messages in thread
From: Tuikov, Luben @ 2019-10-24 21:16 UTC (permalink / raw)
To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Tuikov, Luben,
Koenig, Christian
The GRBM interface is now capable of bursting
1-cycle op per register, a WRITE followed by
another WRITE, or a WRITE followed by a READ--much
faster than previous muti-cycle per
completed-transaction interface. This causes a
problem, whereby status registers requiring a
read/write by hardware, have a 1-cycle delay, due
to the register update having to go through GRBM
interface.
This patch adds this delay.
A one cycle read op is added after updating the
invalidate request and before reading the
invalidate-ACK status.
See also commit
534991731cb5fa94b5519957646cf849ca10d17d.
Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
---
drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 ++--
drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 8 ++++++++
drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
5 files changed, 22 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index ac43b1af69e3..0042868dbd53 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
5 + /* COND_EXEC */
7 + /* PIPELINE_SYNC */
SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
- SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+ SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
2 + /* VM_FLUSH */
8 + /* FENCE for VM_FLUSH */
20 + /* GDS switch */
@@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
5 + /* hdp invalidate */
7 + /* gfx_v10_0_ring_emit_pipeline_sync */
SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
- SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+ SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
2 + /* gfx_v10_0_ring_emit_vm_flush */
8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
.emit_ib_size = 7, /* gfx_v10_0_ring_emit_ib_compute */
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 9fe95e7693d5..9a7a717208de 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
5 + /* COND_EXEC */
7 + /* PIPELINE_SYNC */
SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
- SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+ SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
2 + /* VM_FLUSH */
8 + /* FENCE for VM_FLUSH */
20 + /* GDS switch */
@@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
5 + /* hdp invalidate */
7 + /* gfx_v9_0_ring_emit_pipeline_sync */
SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
- SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+ SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
2 + /* gfx_v9_0_ring_emit_vm_flush */
8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
.emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 6e1b25bd1fe7..100d526e9a42 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
+ /* Insert a dummy read to delay one cycle before the ACK
+ * inquiry.
+ */
+ if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
+ ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
+ ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
+ amdgpu_ring_emit_reg_wait(ring,
+ hub->vm_inv_eng0_req + eng, 0, 0);
+
/* wait for the invalidate to complete */
amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
1 << vmid, 1 << vmid);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 9f2a893871ec..8f3097e45299 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
upper_32_bits(pd_addr));
+ /* Insert a dummy read to delay one cycle before the ACK
+ * inquiry.
+ */
+ if (ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
+ ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
+ amdgpu_ring_emit_reg_wait(ring,
+ hub->vm_inv_eng0_req + eng, 0, 0);
+
amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
hub->vm_inv_eng0_ack + eng,
req, 1 << vmid);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index b8fdb192f6d6..0c41b4fdc58b 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
6 + /* sdma_v5_0_ring_emit_pipeline_sync */
/* sdma_v5_0_ring_emit_vm_flush */
SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
- SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
+ SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
.emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
.emit_ib = sdma_v5_0_ring_emit_ib,
--
2.23.0.385.gbc12974a89
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-24 21:16 ` Tuikov, Luben
0 siblings, 0 replies; 20+ messages in thread
From: Tuikov, Luben @ 2019-10-24 21:16 UTC (permalink / raw)
To: amd-gfx
Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Tuikov, Luben,
Koenig, Christian
The GRBM interface is now capable of bursting
1-cycle op per register, a WRITE followed by
another WRITE, or a WRITE followed by a READ--much
faster than previous muti-cycle per
completed-transaction interface. This causes a
problem, whereby status registers requiring a
read/write by hardware, have a 1-cycle delay, due
to the register update having to go through GRBM
interface.
This patch adds this delay.
A one cycle read op is added after updating the
invalidate request and before reading the
invalidate-ACK status.
See also commit
534991731cb5fa94b5519957646cf849ca10d17d.
Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
---
drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 ++--
drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 8 ++++++++
drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
5 files changed, 22 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index ac43b1af69e3..0042868dbd53 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
5 + /* COND_EXEC */
7 + /* PIPELINE_SYNC */
SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
- SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+ SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
2 + /* VM_FLUSH */
8 + /* FENCE for VM_FLUSH */
20 + /* GDS switch */
@@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
5 + /* hdp invalidate */
7 + /* gfx_v10_0_ring_emit_pipeline_sync */
SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
- SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+ SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
2 + /* gfx_v10_0_ring_emit_vm_flush */
8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
.emit_ib_size = 7, /* gfx_v10_0_ring_emit_ib_compute */
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 9fe95e7693d5..9a7a717208de 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
5 + /* COND_EXEC */
7 + /* PIPELINE_SYNC */
SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
- SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+ SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
2 + /* VM_FLUSH */
8 + /* FENCE for VM_FLUSH */
20 + /* GDS switch */
@@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
5 + /* hdp invalidate */
7 + /* gfx_v9_0_ring_emit_pipeline_sync */
SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
- SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+ SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
2 + /* gfx_v9_0_ring_emit_vm_flush */
8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
.emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 6e1b25bd1fe7..100d526e9a42 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
+ /* Insert a dummy read to delay one cycle before the ACK
+ * inquiry.
+ */
+ if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
+ ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
+ ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
+ amdgpu_ring_emit_reg_wait(ring,
+ hub->vm_inv_eng0_req + eng, 0, 0);
+
/* wait for the invalidate to complete */
amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
1 << vmid, 1 << vmid);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 9f2a893871ec..8f3097e45299 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
upper_32_bits(pd_addr));
+ /* Insert a dummy read to delay one cycle before the ACK
+ * inquiry.
+ */
+ if (ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
+ ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
+ amdgpu_ring_emit_reg_wait(ring,
+ hub->vm_inv_eng0_req + eng, 0, 0);
+
amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
hub->vm_inv_eng0_ack + eng,
req, 1 << vmid);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index b8fdb192f6d6..0c41b4fdc58b 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
6 + /* sdma_v5_0_ring_emit_pipeline_sync */
/* sdma_v5_0_ring_emit_vm_flush */
SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
- SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
+ SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
.emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
.emit_ib = sdma_v5_0_ring_emit_ib,
--
2.23.0.385.gbc12974a89
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply related [flat|nested] 20+ messages in thread
* RE: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-25 3:20 ` Zhu, Changfeng
0 siblings, 0 replies; 20+ messages in thread
From: Zhu, Changfeng @ 2019-10-25 3:20 UTC (permalink / raw)
To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Tuikov, Luben,
Koenig, Christian
Inline.
-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Tuikov, Luben
Sent: Friday, October 25, 2019 5:17 AM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Pelloux-prayer, Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com>; Tuikov, Luben <Luben.Tuikov@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
Subject: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
The GRBM interface is now capable of bursting 1-cycle op per register, a WRITE followed by another WRITE, or a WRITE followed by a READ--much faster than previous muti-cycle per completed-transaction interface. This causes a problem, whereby status registers requiring a read/write by hardware, have a 1-cycle delay, due to the register update having to go through GRBM interface.
This patch adds this delay.
A one cycle read op is added after updating the invalidate request and before reading the invalidate-ACK status.
See also commit
534991731cb5fa94b5519957646cf849ca10d17d.
Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
---
drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++ drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 8 ++++++++ drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
5 files changed, 22 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index ac43b1af69e3..0042868dbd53 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
5 + /* COND_EXEC */
7 + /* PIPELINE_SYNC */
SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
- SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+ SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
2 + /* VM_FLUSH */
8 + /* FENCE for VM_FLUSH */
20 + /* GDS switch */
@@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
5 + /* hdp invalidate */
7 + /* gfx_v10_0_ring_emit_pipeline_sync */
SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
- SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+ SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
2 + /* gfx_v10_0_ring_emit_vm_flush */
8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
.emit_ib_size = 7, /* gfx_v10_0_ring_emit_ib_compute */
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 9fe95e7693d5..9a7a717208de 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
5 + /* COND_EXEC */
7 + /* PIPELINE_SYNC */
SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
- SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+ SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
2 + /* VM_FLUSH */
8 + /* FENCE for VM_FLUSH */
20 + /* GDS switch */
@@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
5 + /* hdp invalidate */
7 + /* gfx_v9_0_ring_emit_pipeline_sync */
SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
- SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+ SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
2 + /* gfx_v9_0_ring_emit_vm_flush */
8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
.emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 6e1b25bd1fe7..100d526e9a42 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
+ /* Insert a dummy read to delay one cycle before the ACK
+ * inquiry.
+ */
+ if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
+ ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
+ ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
+ amdgpu_ring_emit_reg_wait(ring,
+ hub->vm_inv_eng0_req + eng, 0, 0);
+
/* wait for the invalidate to complete */
amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
1 << vmid, 1 << vmid);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 9f2a893871ec..8f3097e45299 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
upper_32_bits(pd_addr));
+ /* Insert a dummy read to delay one cycle before the ACK
+ * inquiry.
+ */
+ if (ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
+ ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
+ amdgpu_ring_emit_reg_wait(ring,
+ hub->vm_inv_eng0_req + eng, 0, 0);
+
Why do we add amdgpu_ring_emit_reg_wait here? There is no amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req); before it like gmc10.
In gmc9,amdgpu_ring_emit_wreg and amdgpu_ring_emit_reg_wait are called in amdgpu_ring_emit_reg_write_reg_wait.
I think it may be more reasonable to add dummy amdgpu_ring_emit_reg_wait in amdgpu_ring_emit_reg_write_reg_wait.
Besides, we should also think about the influence of SROV's patch:
drm/amdgpu: Remove the sriov checking and add firmware checking
amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
hub->vm_inv_eng0_ack + eng,
req, 1 << vmid);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index b8fdb192f6d6..0c41b4fdc58b 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
6 + /* sdma_v5_0_ring_emit_pipeline_sync */
/* sdma_v5_0_ring_emit_vm_flush */
SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
- SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
+ SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
.emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
.emit_ib = sdma_v5_0_ring_emit_ib,
--
2.23.0.385.gbc12974a89
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply related [flat|nested] 20+ messages in thread
* RE: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-25 3:20 ` Zhu, Changfeng
0 siblings, 0 replies; 20+ messages in thread
From: Zhu, Changfeng @ 2019-10-25 3:20 UTC (permalink / raw)
To: Tuikov, Luben, amd-gfx
Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Tuikov, Luben,
Koenig, Christian
Inline.
-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Tuikov, Luben
Sent: Friday, October 25, 2019 5:17 AM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Pelloux-prayer, Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com>; Tuikov, Luben <Luben.Tuikov@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
Subject: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
The GRBM interface is now capable of bursting 1-cycle op per register, a WRITE followed by another WRITE, or a WRITE followed by a READ--much faster than previous muti-cycle per completed-transaction interface. This causes a problem, whereby status registers requiring a read/write by hardware, have a 1-cycle delay, due to the register update having to go through GRBM interface.
This patch adds this delay.
A one cycle read op is added after updating the invalidate request and before reading the invalidate-ACK status.
See also commit
534991731cb5fa94b5519957646cf849ca10d17d.
Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
---
drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++ drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 8 ++++++++ drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
5 files changed, 22 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index ac43b1af69e3..0042868dbd53 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
5 + /* COND_EXEC */
7 + /* PIPELINE_SYNC */
SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
- SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+ SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
2 + /* VM_FLUSH */
8 + /* FENCE for VM_FLUSH */
20 + /* GDS switch */
@@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
5 + /* hdp invalidate */
7 + /* gfx_v10_0_ring_emit_pipeline_sync */
SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
- SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+ SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
2 + /* gfx_v10_0_ring_emit_vm_flush */
8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
.emit_ib_size = 7, /* gfx_v10_0_ring_emit_ib_compute */
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 9fe95e7693d5..9a7a717208de 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
5 + /* COND_EXEC */
7 + /* PIPELINE_SYNC */
SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
- SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+ SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
2 + /* VM_FLUSH */
8 + /* FENCE for VM_FLUSH */
20 + /* GDS switch */
@@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
5 + /* hdp invalidate */
7 + /* gfx_v9_0_ring_emit_pipeline_sync */
SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
- SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+ SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
2 + /* gfx_v9_0_ring_emit_vm_flush */
8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
.emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 6e1b25bd1fe7..100d526e9a42 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
+ /* Insert a dummy read to delay one cycle before the ACK
+ * inquiry.
+ */
+ if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
+ ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
+ ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
+ amdgpu_ring_emit_reg_wait(ring,
+ hub->vm_inv_eng0_req + eng, 0, 0);
+
/* wait for the invalidate to complete */
amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
1 << vmid, 1 << vmid);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 9f2a893871ec..8f3097e45299 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
upper_32_bits(pd_addr));
+ /* Insert a dummy read to delay one cycle before the ACK
+ * inquiry.
+ */
+ if (ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
+ ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
+ amdgpu_ring_emit_reg_wait(ring,
+ hub->vm_inv_eng0_req + eng, 0, 0);
+
Why do we add amdgpu_ring_emit_reg_wait here? There is no amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req); before it like gmc10.
In gmc9,amdgpu_ring_emit_wreg and amdgpu_ring_emit_reg_wait are called in amdgpu_ring_emit_reg_write_reg_wait.
I think it may be more reasonable to add dummy amdgpu_ring_emit_reg_wait in amdgpu_ring_emit_reg_write_reg_wait.
Besides, we should also think about the influence of SROV's patch:
drm/amdgpu: Remove the sriov checking and add firmware checking
amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
hub->vm_inv_eng0_ack + eng,
req, 1 << vmid);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index b8fdb192f6d6..0c41b4fdc58b 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
6 + /* sdma_v5_0_ring_emit_pipeline_sync */
/* sdma_v5_0_ring_emit_vm_flush */
SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
- SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
+ SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
.emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
.emit_ib = sdma_v5_0_ring_emit_ib,
--
2.23.0.385.gbc12974a89
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-25 6:49 ` Koenig, Christian
0 siblings, 0 replies; 20+ messages in thread
From: Koenig, Christian @ 2019-10-25 6:49 UTC (permalink / raw)
To: Tuikov, Luben, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric
Am 24.10.19 um 23:16 schrieb Tuikov, Luben:
> The GRBM interface is now capable of bursting
> 1-cycle op per register, a WRITE followed by
> another WRITE, or a WRITE followed by a READ--much
> faster than previous muti-cycle per
> completed-transaction interface. This causes a
> problem, whereby status registers requiring a
> read/write by hardware, have a 1-cycle delay, due
> to the register update having to go through GRBM
> interface.
>
> This patch adds this delay.
>
> A one cycle read op is added after updating the
> invalidate request and before reading the
> invalidate-ACK status.
Please completely drop all changes for GFX9 since this patch will most
likely break SRIOV.
Additional to that please apply the workaround only to SDMA since the CP
driven engines should handle that in firmware.
Regards,
Christian.
>
> See also commit
> 534991731cb5fa94b5519957646cf849ca10d17d.
>
> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
> drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 ++--
> drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 8 ++++++++
> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
> 5 files changed, 22 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index ac43b1af69e3..0042868dbd53 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
> 5 + /* COND_EXEC */
> 7 + /* PIPELINE_SYNC */
> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
> 2 + /* VM_FLUSH */
> 8 + /* FENCE for VM_FLUSH */
> 20 + /* GDS switch */
> @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
> 5 + /* hdp invalidate */
> 7 + /* gfx_v10_0_ring_emit_pipeline_sync */
> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
> 2 + /* gfx_v10_0_ring_emit_vm_flush */
> 8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
> .emit_ib_size = 7, /* gfx_v10_0_ring_emit_ib_compute */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 9fe95e7693d5..9a7a717208de 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
> 5 + /* COND_EXEC */
> 7 + /* PIPELINE_SYNC */
> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
> 2 + /* VM_FLUSH */
> 8 + /* FENCE for VM_FLUSH */
> 20 + /* GDS switch */
> @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
> 5 + /* hdp invalidate */
> 7 + /* gfx_v9_0_ring_emit_pipeline_sync */
> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
> 2 + /* gfx_v9_0_ring_emit_vm_flush */
> 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
> .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 6e1b25bd1fe7..100d526e9a42 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>
> amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>
> + /* Insert a dummy read to delay one cycle before the ACK
> + * inquiry.
> + */
> + if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
> + ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
> + ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
> + amdgpu_ring_emit_reg_wait(ring,
> + hub->vm_inv_eng0_req + eng, 0, 0);
> +
> /* wait for the invalidate to complete */
> amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
> 1 << vmid, 1 << vmid);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 9f2a893871ec..8f3097e45299 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
> amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
> upper_32_bits(pd_addr));
>
> + /* Insert a dummy read to delay one cycle before the ACK
> + * inquiry.
> + */
> + if (ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
> + ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
> + amdgpu_ring_emit_reg_wait(ring,
> + hub->vm_inv_eng0_req + eng, 0, 0);
> +
> amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
> hub->vm_inv_eng0_ack + eng,
> req, 1 << vmid);
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> index b8fdb192f6d6..0c41b4fdc58b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
> 6 + /* sdma_v5_0_ring_emit_pipeline_sync */
> /* sdma_v5_0_ring_emit_vm_flush */
> SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
> 10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
> .emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
> .emit_ib = sdma_v5_0_ring_emit_ib,
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-25 6:49 ` Koenig, Christian
0 siblings, 0 replies; 20+ messages in thread
From: Koenig, Christian @ 2019-10-25 6:49 UTC (permalink / raw)
To: Tuikov, Luben, amd-gfx; +Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric
Am 24.10.19 um 23:16 schrieb Tuikov, Luben:
> The GRBM interface is now capable of bursting
> 1-cycle op per register, a WRITE followed by
> another WRITE, or a WRITE followed by a READ--much
> faster than previous muti-cycle per
> completed-transaction interface. This causes a
> problem, whereby status registers requiring a
> read/write by hardware, have a 1-cycle delay, due
> to the register update having to go through GRBM
> interface.
>
> This patch adds this delay.
>
> A one cycle read op is added after updating the
> invalidate request and before reading the
> invalidate-ACK status.
Please completely drop all changes for GFX9 since this patch will most
likely break SRIOV.
Additional to that please apply the workaround only to SDMA since the CP
driven engines should handle that in firmware.
Regards,
Christian.
>
> See also commit
> 534991731cb5fa94b5519957646cf849ca10d17d.
>
> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
> drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 ++--
> drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 8 ++++++++
> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
> 5 files changed, 22 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index ac43b1af69e3..0042868dbd53 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
> 5 + /* COND_EXEC */
> 7 + /* PIPELINE_SYNC */
> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
> 2 + /* VM_FLUSH */
> 8 + /* FENCE for VM_FLUSH */
> 20 + /* GDS switch */
> @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
> 5 + /* hdp invalidate */
> 7 + /* gfx_v10_0_ring_emit_pipeline_sync */
> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
> 2 + /* gfx_v10_0_ring_emit_vm_flush */
> 8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
> .emit_ib_size = 7, /* gfx_v10_0_ring_emit_ib_compute */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 9fe95e7693d5..9a7a717208de 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
> 5 + /* COND_EXEC */
> 7 + /* PIPELINE_SYNC */
> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
> 2 + /* VM_FLUSH */
> 8 + /* FENCE for VM_FLUSH */
> 20 + /* GDS switch */
> @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
> 5 + /* hdp invalidate */
> 7 + /* gfx_v9_0_ring_emit_pipeline_sync */
> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
> 2 + /* gfx_v9_0_ring_emit_vm_flush */
> 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
> .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 6e1b25bd1fe7..100d526e9a42 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>
> amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>
> + /* Insert a dummy read to delay one cycle before the ACK
> + * inquiry.
> + */
> + if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
> + ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
> + ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
> + amdgpu_ring_emit_reg_wait(ring,
> + hub->vm_inv_eng0_req + eng, 0, 0);
> +
> /* wait for the invalidate to complete */
> amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
> 1 << vmid, 1 << vmid);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 9f2a893871ec..8f3097e45299 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
> amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
> upper_32_bits(pd_addr));
>
> + /* Insert a dummy read to delay one cycle before the ACK
> + * inquiry.
> + */
> + if (ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
> + ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
> + amdgpu_ring_emit_reg_wait(ring,
> + hub->vm_inv_eng0_req + eng, 0, 0);
> +
> amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
> hub->vm_inv_eng0_ack + eng,
> req, 1 << vmid);
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> index b8fdb192f6d6..0c41b4fdc58b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
> 6 + /* sdma_v5_0_ring_emit_pipeline_sync */
> /* sdma_v5_0_ring_emit_vm_flush */
> SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
> 10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
> .emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
> .emit_ib = sdma_v5_0_ring_emit_ib,
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-25 16:05 ` Alex Deucher
0 siblings, 0 replies; 20+ messages in thread
From: Alex Deucher @ 2019-10-25 16:05 UTC (permalink / raw)
To: Koenig, Christian
Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Tuikov, Luben,
amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
On Fri, Oct 25, 2019 at 2:49 AM Koenig, Christian
<Christian.Koenig@amd.com> wrote:
>
> Am 24.10.19 um 23:16 schrieb Tuikov, Luben:
> > The GRBM interface is now capable of bursting
> > 1-cycle op per register, a WRITE followed by
> > another WRITE, or a WRITE followed by a READ--much
> > faster than previous muti-cycle per
> > completed-transaction interface. This causes a
> > problem, whereby status registers requiring a
> > read/write by hardware, have a 1-cycle delay, due
> > to the register update having to go through GRBM
> > interface.
> >
> > This patch adds this delay.
> >
> > A one cycle read op is added after updating the
> > invalidate request and before reading the
> > invalidate-ACK status.
>
> Please completely drop all changes for GFX9 since this patch will most
> likely break SRIOV.
>
> Additional to that please apply the workaround only to SDMA since the CP
> driven engines should handle that in firmware.
I think the CP only handles this in firmware if we use the new TLB
invalidation packet. I don't think it applies it to general register
writes like we do.
Alex
>
> Regards,
> Christian.
>
> >
> > See also commit
> > 534991731cb5fa94b5519957646cf849ca10d17d.
> >
> > Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
> > ---
> > drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
> > drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 ++--
> > drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
> > drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 8 ++++++++
> > drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
> > 5 files changed, 22 insertions(+), 5 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> > index ac43b1af69e3..0042868dbd53 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> > @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
> > 5 + /* COND_EXEC */
> > 7 + /* PIPELINE_SYNC */
> > SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> > - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> > + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
> > 2 + /* VM_FLUSH */
> > 8 + /* FENCE for VM_FLUSH */
> > 20 + /* GDS switch */
> > @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
> > 5 + /* hdp invalidate */
> > 7 + /* gfx_v10_0_ring_emit_pipeline_sync */
> > SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> > - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> > + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
> > 2 + /* gfx_v10_0_ring_emit_vm_flush */
> > 8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
> > .emit_ib_size = 7, /* gfx_v10_0_ring_emit_ib_compute */
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > index 9fe95e7693d5..9a7a717208de 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
> > 5 + /* COND_EXEC */
> > 7 + /* PIPELINE_SYNC */
> > SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> > - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> > + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
> > 2 + /* VM_FLUSH */
> > 8 + /* FENCE for VM_FLUSH */
> > 20 + /* GDS switch */
> > @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
> > 5 + /* hdp invalidate */
> > 7 + /* gfx_v9_0_ring_emit_pipeline_sync */
> > SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> > - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> > + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
> > 2 + /* gfx_v9_0_ring_emit_vm_flush */
> > 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
> > .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> > index 6e1b25bd1fe7..100d526e9a42 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> > @@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
> >
> > amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
> >
> > + /* Insert a dummy read to delay one cycle before the ACK
> > + * inquiry.
> > + */
> > + if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
> > + ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
> > + ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
> > + amdgpu_ring_emit_reg_wait(ring,
> > + hub->vm_inv_eng0_req + eng, 0, 0);
> > +
> > /* wait for the invalidate to complete */
> > amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
> > 1 << vmid, 1 << vmid);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > index 9f2a893871ec..8f3097e45299 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
> > amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
> > upper_32_bits(pd_addr));
> >
> > + /* Insert a dummy read to delay one cycle before the ACK
> > + * inquiry.
> > + */
> > + if (ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
> > + ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
> > + amdgpu_ring_emit_reg_wait(ring,
> > + hub->vm_inv_eng0_req + eng, 0, 0);
> > +
> > amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
> > hub->vm_inv_eng0_ack + eng,
> > req, 1 << vmid);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> > index b8fdb192f6d6..0c41b4fdc58b 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> > @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
> > 6 + /* sdma_v5_0_ring_emit_pipeline_sync */
> > /* sdma_v5_0_ring_emit_vm_flush */
> > SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
> > - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
> > + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
> > 10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
> > .emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
> > .emit_ib = sdma_v5_0_ring_emit_ib,
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-25 16:05 ` Alex Deucher
0 siblings, 0 replies; 20+ messages in thread
From: Alex Deucher @ 2019-10-25 16:05 UTC (permalink / raw)
To: Koenig, Christian
Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Tuikov, Luben, amd-gfx
On Fri, Oct 25, 2019 at 2:49 AM Koenig, Christian
<Christian.Koenig@amd.com> wrote:
>
> Am 24.10.19 um 23:16 schrieb Tuikov, Luben:
> > The GRBM interface is now capable of bursting
> > 1-cycle op per register, a WRITE followed by
> > another WRITE, or a WRITE followed by a READ--much
> > faster than previous muti-cycle per
> > completed-transaction interface. This causes a
> > problem, whereby status registers requiring a
> > read/write by hardware, have a 1-cycle delay, due
> > to the register update having to go through GRBM
> > interface.
> >
> > This patch adds this delay.
> >
> > A one cycle read op is added after updating the
> > invalidate request and before reading the
> > invalidate-ACK status.
>
> Please completely drop all changes for GFX9 since this patch will most
> likely break SRIOV.
>
> Additional to that please apply the workaround only to SDMA since the CP
> driven engines should handle that in firmware.
I think the CP only handles this in firmware if we use the new TLB
invalidation packet. I don't think it applies it to general register
writes like we do.
Alex
>
> Regards,
> Christian.
>
> >
> > See also commit
> > 534991731cb5fa94b5519957646cf849ca10d17d.
> >
> > Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
> > ---
> > drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
> > drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 ++--
> > drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
> > drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 8 ++++++++
> > drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
> > 5 files changed, 22 insertions(+), 5 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> > index ac43b1af69e3..0042868dbd53 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> > @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
> > 5 + /* COND_EXEC */
> > 7 + /* PIPELINE_SYNC */
> > SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> > - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> > + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
> > 2 + /* VM_FLUSH */
> > 8 + /* FENCE for VM_FLUSH */
> > 20 + /* GDS switch */
> > @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
> > 5 + /* hdp invalidate */
> > 7 + /* gfx_v10_0_ring_emit_pipeline_sync */
> > SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> > - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> > + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
> > 2 + /* gfx_v10_0_ring_emit_vm_flush */
> > 8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
> > .emit_ib_size = 7, /* gfx_v10_0_ring_emit_ib_compute */
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > index 9fe95e7693d5..9a7a717208de 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
> > 5 + /* COND_EXEC */
> > 7 + /* PIPELINE_SYNC */
> > SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> > - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> > + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
> > 2 + /* VM_FLUSH */
> > 8 + /* FENCE for VM_FLUSH */
> > 20 + /* GDS switch */
> > @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
> > 5 + /* hdp invalidate */
> > 7 + /* gfx_v9_0_ring_emit_pipeline_sync */
> > SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> > - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> > + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
> > 2 + /* gfx_v9_0_ring_emit_vm_flush */
> > 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
> > .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> > index 6e1b25bd1fe7..100d526e9a42 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> > @@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
> >
> > amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
> >
> > + /* Insert a dummy read to delay one cycle before the ACK
> > + * inquiry.
> > + */
> > + if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
> > + ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
> > + ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
> > + amdgpu_ring_emit_reg_wait(ring,
> > + hub->vm_inv_eng0_req + eng, 0, 0);
> > +
> > /* wait for the invalidate to complete */
> > amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
> > 1 << vmid, 1 << vmid);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > index 9f2a893871ec..8f3097e45299 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
> > amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
> > upper_32_bits(pd_addr));
> >
> > + /* Insert a dummy read to delay one cycle before the ACK
> > + * inquiry.
> > + */
> > + if (ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
> > + ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
> > + amdgpu_ring_emit_reg_wait(ring,
> > + hub->vm_inv_eng0_req + eng, 0, 0);
> > +
> > amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
> > hub->vm_inv_eng0_ack + eng,
> > req, 1 << vmid);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> > index b8fdb192f6d6..0c41b4fdc58b 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> > @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
> > 6 + /* sdma_v5_0_ring_emit_pipeline_sync */
> > /* sdma_v5_0_ring_emit_vm_flush */
> > SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
> > - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
> > + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
> > 10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
> > .emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
> > .emit_ib = sdma_v5_0_ring_emit_ib,
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-25 16:19 ` Koenig, Christian
0 siblings, 0 replies; 20+ messages in thread
From: Koenig, Christian @ 2019-10-25 16:19 UTC (permalink / raw)
To: Alex Deucher
Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Tuikov, Luben,
amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
Am 25.10.19 um 18:05 schrieb Alex Deucher:
> On Fri, Oct 25, 2019 at 2:49 AM Koenig, Christian
> <Christian.Koenig@amd.com> wrote:
>> Am 24.10.19 um 23:16 schrieb Tuikov, Luben:
>>> The GRBM interface is now capable of bursting
>>> 1-cycle op per register, a WRITE followed by
>>> another WRITE, or a WRITE followed by a READ--much
>>> faster than previous muti-cycle per
>>> completed-transaction interface. This causes a
>>> problem, whereby status registers requiring a
>>> read/write by hardware, have a 1-cycle delay, due
>>> to the register update having to go through GRBM
>>> interface.
>>>
>>> This patch adds this delay.
>>>
>>> A one cycle read op is added after updating the
>>> invalidate request and before reading the
>>> invalidate-ACK status.
>> Please completely drop all changes for GFX9 since this patch will most
>> likely break SRIOV.
>>
>> Additional to that please apply the workaround only to SDMA since the CP
>> driven engines should handle that in firmware.
> I think the CP only handles this in firmware if we use the new TLB
> invalidation packet. I don't think it applies it to general register
> writes like we do.
No, on the CP we should use the combined write/wait command even if we
don't use the new specialized VM invalidate command. Everything else
won't work with SRIOV.
Even if we want to we can't insert an extra read in this combined
write/wait command. And if we split up the commands we would break SRIOV
once more.
So applying this workaround to the CP code doesn't make any sense at all.
The only TODO which I can see is that we maybe don't use the combined
write/wait command on Navi yet.
Christian.
>
> Alex
>
>> Regards,
>> Christian.
>>
>>> See also commit
>>> 534991731cb5fa94b5519957646cf849ca10d17d.
>>>
>>> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
>>> ---
>>> drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
>>> drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 ++--
>>> drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
>>> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 8 ++++++++
>>> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
>>> 5 files changed, 22 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> index ac43b1af69e3..0042868dbd53 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>>> 5 + /* COND_EXEC */
>>> 7 + /* PIPELINE_SYNC */
>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>> 2 + /* VM_FLUSH */
>>> 8 + /* FENCE for VM_FLUSH */
>>> 20 + /* GDS switch */
>>> @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>>> 5 + /* hdp invalidate */
>>> 7 + /* gfx_v10_0_ring_emit_pipeline_sync */
>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>> 2 + /* gfx_v10_0_ring_emit_vm_flush */
>>> 8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
>>> .emit_ib_size = 7, /* gfx_v10_0_ring_emit_ib_compute */
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> index 9fe95e7693d5..9a7a717208de 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>>> 5 + /* COND_EXEC */
>>> 7 + /* PIPELINE_SYNC */
>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>> 2 + /* VM_FLUSH */
>>> 8 + /* FENCE for VM_FLUSH */
>>> 20 + /* GDS switch */
>>> @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>>> 5 + /* hdp invalidate */
>>> 7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>> 2 + /* gfx_v9_0_ring_emit_vm_flush */
>>> 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
>>> .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> index 6e1b25bd1fe7..100d526e9a42 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> @@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>
>>> amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>>>
>>> + /* Insert a dummy read to delay one cycle before the ACK
>>> + * inquiry.
>>> + */
>>> + if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
>>> + ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
>>> + ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>> + amdgpu_ring_emit_reg_wait(ring,
>>> + hub->vm_inv_eng0_req + eng, 0, 0);
>>> +
>>> /* wait for the invalidate to complete */
>>> amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>>> 1 << vmid, 1 << vmid);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> index 9f2a893871ec..8f3097e45299 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>> amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>>> upper_32_bits(pd_addr));
>>>
>>> + /* Insert a dummy read to delay one cycle before the ACK
>>> + * inquiry.
>>> + */
>>> + if (ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
>>> + ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>> + amdgpu_ring_emit_reg_wait(ring,
>>> + hub->vm_inv_eng0_req + eng, 0, 0);
>>> +
>>> amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
>>> hub->vm_inv_eng0_ack + eng,
>>> req, 1 << vmid);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>> index b8fdb192f6d6..0c41b4fdc58b 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>> @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>>> 6 + /* sdma_v5_0_ring_emit_pipeline_sync */
>>> /* sdma_v5_0_ring_emit_vm_flush */
>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
>>> 10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
>>> .emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
>>> .emit_ib = sdma_v5_0_ring_emit_ib,
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-25 16:19 ` Koenig, Christian
0 siblings, 0 replies; 20+ messages in thread
From: Koenig, Christian @ 2019-10-25 16:19 UTC (permalink / raw)
To: Alex Deucher
Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Tuikov, Luben, amd-gfx
Am 25.10.19 um 18:05 schrieb Alex Deucher:
> On Fri, Oct 25, 2019 at 2:49 AM Koenig, Christian
> <Christian.Koenig@amd.com> wrote:
>> Am 24.10.19 um 23:16 schrieb Tuikov, Luben:
>>> The GRBM interface is now capable of bursting
>>> 1-cycle op per register, a WRITE followed by
>>> another WRITE, or a WRITE followed by a READ--much
>>> faster than previous muti-cycle per
>>> completed-transaction interface. This causes a
>>> problem, whereby status registers requiring a
>>> read/write by hardware, have a 1-cycle delay, due
>>> to the register update having to go through GRBM
>>> interface.
>>>
>>> This patch adds this delay.
>>>
>>> A one cycle read op is added after updating the
>>> invalidate request and before reading the
>>> invalidate-ACK status.
>> Please completely drop all changes for GFX9 since this patch will most
>> likely break SRIOV.
>>
>> Additional to that please apply the workaround only to SDMA since the CP
>> driven engines should handle that in firmware.
> I think the CP only handles this in firmware if we use the new TLB
> invalidation packet. I don't think it applies it to general register
> writes like we do.
No, on the CP we should use the combined write/wait command even if we
don't use the new specialized VM invalidate command. Everything else
won't work with SRIOV.
Even if we want to we can't insert an extra read in this combined
write/wait command. And if we split up the commands we would break SRIOV
once more.
So applying this workaround to the CP code doesn't make any sense at all.
The only TODO which I can see is that we maybe don't use the combined
write/wait command on Navi yet.
Christian.
>
> Alex
>
>> Regards,
>> Christian.
>>
>>> See also commit
>>> 534991731cb5fa94b5519957646cf849ca10d17d.
>>>
>>> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
>>> ---
>>> drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
>>> drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 ++--
>>> drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
>>> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 8 ++++++++
>>> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
>>> 5 files changed, 22 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> index ac43b1af69e3..0042868dbd53 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>>> 5 + /* COND_EXEC */
>>> 7 + /* PIPELINE_SYNC */
>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>> 2 + /* VM_FLUSH */
>>> 8 + /* FENCE for VM_FLUSH */
>>> 20 + /* GDS switch */
>>> @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>>> 5 + /* hdp invalidate */
>>> 7 + /* gfx_v10_0_ring_emit_pipeline_sync */
>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>> 2 + /* gfx_v10_0_ring_emit_vm_flush */
>>> 8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
>>> .emit_ib_size = 7, /* gfx_v10_0_ring_emit_ib_compute */
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> index 9fe95e7693d5..9a7a717208de 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>>> 5 + /* COND_EXEC */
>>> 7 + /* PIPELINE_SYNC */
>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>> 2 + /* VM_FLUSH */
>>> 8 + /* FENCE for VM_FLUSH */
>>> 20 + /* GDS switch */
>>> @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>>> 5 + /* hdp invalidate */
>>> 7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>> 2 + /* gfx_v9_0_ring_emit_vm_flush */
>>> 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
>>> .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> index 6e1b25bd1fe7..100d526e9a42 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> @@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>
>>> amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>>>
>>> + /* Insert a dummy read to delay one cycle before the ACK
>>> + * inquiry.
>>> + */
>>> + if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
>>> + ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
>>> + ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>> + amdgpu_ring_emit_reg_wait(ring,
>>> + hub->vm_inv_eng0_req + eng, 0, 0);
>>> +
>>> /* wait for the invalidate to complete */
>>> amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>>> 1 << vmid, 1 << vmid);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> index 9f2a893871ec..8f3097e45299 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>> amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>>> upper_32_bits(pd_addr));
>>>
>>> + /* Insert a dummy read to delay one cycle before the ACK
>>> + * inquiry.
>>> + */
>>> + if (ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
>>> + ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>> + amdgpu_ring_emit_reg_wait(ring,
>>> + hub->vm_inv_eng0_req + eng, 0, 0);
>>> +
>>> amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
>>> hub->vm_inv_eng0_ack + eng,
>>> req, 1 << vmid);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>> index b8fdb192f6d6..0c41b4fdc58b 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>> @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>>> 6 + /* sdma_v5_0_ring_emit_pipeline_sync */
>>> /* sdma_v5_0_ring_emit_vm_flush */
>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
>>> 10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
>>> .emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
>>> .emit_ib = sdma_v5_0_ring_emit_ib,
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-25 22:45 ` Tuikov, Luben
0 siblings, 0 replies; 20+ messages in thread
From: Tuikov, Luben @ 2019-10-25 22:45 UTC (permalink / raw)
To: Koenig, Christian, Alex Deucher
Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric,
amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
On 2019-10-25 12:19 p.m., Koenig, Christian wrote:
> Am 25.10.19 um 18:05 schrieb Alex Deucher:
>> On Fri, Oct 25, 2019 at 2:49 AM Koenig, Christian
>> <Christian.Koenig@amd.com> wrote:
>>> Am 24.10.19 um 23:16 schrieb Tuikov, Luben:
>>>> The GRBM interface is now capable of bursting
>>>> 1-cycle op per register, a WRITE followed by
>>>> another WRITE, or a WRITE followed by a READ--much
>>>> faster than previous muti-cycle per
>>>> completed-transaction interface. This causes a
>>>> problem, whereby status registers requiring a
>>>> read/write by hardware, have a 1-cycle delay, due
>>>> to the register update having to go through GRBM
>>>> interface.
>>>>
>>>> This patch adds this delay.
>>>>
>>>> A one cycle read op is added after updating the
>>>> invalidate request and before reading the
>>>> invalidate-ACK status.
>>> Please completely drop all changes for GFX9 since this patch will most
>>> likely break SRIOV.
>>>
>>> Additional to that please apply the workaround only to SDMA since the CP
>>> driven engines should handle that in firmware.
Thank you Christian for reviewing this patch.
This patch stirred quite a bit of noise. So, then, I'll go by
your last comment above--I suppose this is the desired way to go forward then?
Regards,
Luben
>> I think the CP only handles this in firmware if we use the new TLB
>> invalidation packet. I don't think it applies it to general register
>> writes like we do.
>
> No, on the CP we should use the combined write/wait command even if we
> don't use the new specialized VM invalidate command. Everything else
> won't work with SRIOV.
>
> Even if we want to we can't insert an extra read in this combined
> write/wait command. And if we split up the commands we would break SRIOV
> once more.
>
> So applying this workaround to the CP code doesn't make any sense at all.
>
> The only TODO which I can see is that we maybe don't use the combined
> write/wait command on Navi yet.
>
> Christian.
>
>>
>> Alex
>>
>>> Regards,
>>> Christian.
>>>
>>>> See also commit
>>>> 534991731cb5fa94b5519957646cf849ca10d17d.
>>>>
>>>> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
>>>> ---
>>>> drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
>>>> drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 ++--
>>>> drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
>>>> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 8 ++++++++
>>>> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
>>>> 5 files changed, 22 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> index ac43b1af69e3..0042868dbd53 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>>>> 5 + /* COND_EXEC */
>>>> 7 + /* PIPELINE_SYNC */
>>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>> 2 + /* VM_FLUSH */
>>>> 8 + /* FENCE for VM_FLUSH */
>>>> 20 + /* GDS switch */
>>>> @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>>>> 5 + /* hdp invalidate */
>>>> 7 + /* gfx_v10_0_ring_emit_pipeline_sync */
>>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>> 2 + /* gfx_v10_0_ring_emit_vm_flush */
>>>> 8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
>>>> .emit_ib_size = 7, /* gfx_v10_0_ring_emit_ib_compute */
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> index 9fe95e7693d5..9a7a717208de 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>>>> 5 + /* COND_EXEC */
>>>> 7 + /* PIPELINE_SYNC */
>>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>> 2 + /* VM_FLUSH */
>>>> 8 + /* FENCE for VM_FLUSH */
>>>> 20 + /* GDS switch */
>>>> @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>>>> 5 + /* hdp invalidate */
>>>> 7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>> 2 + /* gfx_v9_0_ring_emit_vm_flush */
>>>> 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
>>>> .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>> index 6e1b25bd1fe7..100d526e9a42 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>> @@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>>
>>>> amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>>>>
>>>> + /* Insert a dummy read to delay one cycle before the ACK
>>>> + * inquiry.
>>>> + */
>>>> + if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
>>>> + ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
>>>> + ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>>> + amdgpu_ring_emit_reg_wait(ring,
>>>> + hub->vm_inv_eng0_req + eng, 0, 0);
>>>> +
>>>> /* wait for the invalidate to complete */
>>>> amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>>>> 1 << vmid, 1 << vmid);
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> index 9f2a893871ec..8f3097e45299 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>> amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>>>> upper_32_bits(pd_addr));
>>>>
>>>> + /* Insert a dummy read to delay one cycle before the ACK
>>>> + * inquiry.
>>>> + */
>>>> + if (ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
>>>> + ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>>> + amdgpu_ring_emit_reg_wait(ring,
>>>> + hub->vm_inv_eng0_req + eng, 0, 0);
>>>> +
>>>> amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
>>>> hub->vm_inv_eng0_ack + eng,
>>>> req, 1 << vmid);
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>> index b8fdb192f6d6..0c41b4fdc58b 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>> @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>>>> 6 + /* sdma_v5_0_ring_emit_pipeline_sync */
>>>> /* sdma_v5_0_ring_emit_vm_flush */
>>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
>>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
>>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
>>>> 10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
>>>> .emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
>>>> .emit_ib = sdma_v5_0_ring_emit_ib,
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-25 22:45 ` Tuikov, Luben
0 siblings, 0 replies; 20+ messages in thread
From: Tuikov, Luben @ 2019-10-25 22:45 UTC (permalink / raw)
To: Koenig, Christian, Alex Deucher
Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, amd-gfx
On 2019-10-25 12:19 p.m., Koenig, Christian wrote:
> Am 25.10.19 um 18:05 schrieb Alex Deucher:
>> On Fri, Oct 25, 2019 at 2:49 AM Koenig, Christian
>> <Christian.Koenig@amd.com> wrote:
>>> Am 24.10.19 um 23:16 schrieb Tuikov, Luben:
>>>> The GRBM interface is now capable of bursting
>>>> 1-cycle op per register, a WRITE followed by
>>>> another WRITE, or a WRITE followed by a READ--much
>>>> faster than previous muti-cycle per
>>>> completed-transaction interface. This causes a
>>>> problem, whereby status registers requiring a
>>>> read/write by hardware, have a 1-cycle delay, due
>>>> to the register update having to go through GRBM
>>>> interface.
>>>>
>>>> This patch adds this delay.
>>>>
>>>> A one cycle read op is added after updating the
>>>> invalidate request and before reading the
>>>> invalidate-ACK status.
>>> Please completely drop all changes for GFX9 since this patch will most
>>> likely break SRIOV.
>>>
>>> Additional to that please apply the workaround only to SDMA since the CP
>>> driven engines should handle that in firmware.
Thank you Christian for reviewing this patch.
This patch stirred quite a bit of noise. So, then, I'll go by
your last comment above--I suppose this is the desired way to go forward then?
Regards,
Luben
>> I think the CP only handles this in firmware if we use the new TLB
>> invalidation packet. I don't think it applies it to general register
>> writes like we do.
>
> No, on the CP we should use the combined write/wait command even if we
> don't use the new specialized VM invalidate command. Everything else
> won't work with SRIOV.
>
> Even if we want to we can't insert an extra read in this combined
> write/wait command. And if we split up the commands we would break SRIOV
> once more.
>
> So applying this workaround to the CP code doesn't make any sense at all.
>
> The only TODO which I can see is that we maybe don't use the combined
> write/wait command on Navi yet.
>
> Christian.
>
>>
>> Alex
>>
>>> Regards,
>>> Christian.
>>>
>>>> See also commit
>>>> 534991731cb5fa94b5519957646cf849ca10d17d.
>>>>
>>>> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
>>>> ---
>>>> drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
>>>> drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 ++--
>>>> drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
>>>> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 8 ++++++++
>>>> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
>>>> 5 files changed, 22 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> index ac43b1af69e3..0042868dbd53 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>>>> 5 + /* COND_EXEC */
>>>> 7 + /* PIPELINE_SYNC */
>>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>> 2 + /* VM_FLUSH */
>>>> 8 + /* FENCE for VM_FLUSH */
>>>> 20 + /* GDS switch */
>>>> @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>>>> 5 + /* hdp invalidate */
>>>> 7 + /* gfx_v10_0_ring_emit_pipeline_sync */
>>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>> 2 + /* gfx_v10_0_ring_emit_vm_flush */
>>>> 8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
>>>> .emit_ib_size = 7, /* gfx_v10_0_ring_emit_ib_compute */
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> index 9fe95e7693d5..9a7a717208de 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>>>> 5 + /* COND_EXEC */
>>>> 7 + /* PIPELINE_SYNC */
>>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>> 2 + /* VM_FLUSH */
>>>> 8 + /* FENCE for VM_FLUSH */
>>>> 20 + /* GDS switch */
>>>> @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>>>> 5 + /* hdp invalidate */
>>>> 7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>> 2 + /* gfx_v9_0_ring_emit_vm_flush */
>>>> 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
>>>> .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>> index 6e1b25bd1fe7..100d526e9a42 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>> @@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>>
>>>> amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>>>>
>>>> + /* Insert a dummy read to delay one cycle before the ACK
>>>> + * inquiry.
>>>> + */
>>>> + if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
>>>> + ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
>>>> + ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>>> + amdgpu_ring_emit_reg_wait(ring,
>>>> + hub->vm_inv_eng0_req + eng, 0, 0);
>>>> +
>>>> /* wait for the invalidate to complete */
>>>> amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>>>> 1 << vmid, 1 << vmid);
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> index 9f2a893871ec..8f3097e45299 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>> amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>>>> upper_32_bits(pd_addr));
>>>>
>>>> + /* Insert a dummy read to delay one cycle before the ACK
>>>> + * inquiry.
>>>> + */
>>>> + if (ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
>>>> + ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>>> + amdgpu_ring_emit_reg_wait(ring,
>>>> + hub->vm_inv_eng0_req + eng, 0, 0);
>>>> +
>>>> amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
>>>> hub->vm_inv_eng0_ack + eng,
>>>> req, 1 << vmid);
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>> index b8fdb192f6d6..0c41b4fdc58b 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>> @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>>>> 6 + /* sdma_v5_0_ring_emit_pipeline_sync */
>>>> /* sdma_v5_0_ring_emit_vm_flush */
>>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
>>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
>>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
>>>> 10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
>>>> .emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
>>>> .emit_ib = sdma_v5_0_ring_emit_ib,
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH] drm/amdgpu: GFX10: GRBM requires 1-cycle delay (v2)
@ 2019-10-25 22:59 ` Tuikov, Luben
0 siblings, 0 replies; 20+ messages in thread
From: Tuikov, Luben @ 2019-10-25 22:59 UTC (permalink / raw)
To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
Cc: Deucher, Alexander, Tuikov, Luben, Koenig, Christian
The GRBM interface is now capable of bursting
1-cycle op per register, a WRITE followed by
another WRITE, or a WRITE followed by a READ--much
faster than previous muti-cycle per
completed-transaction interface. This causes a
problem, whereby status registers requiring a
read/write by hardware, have a 1-cycle delay, due
to the register update having to go through GRBM
interface.
This patch adds this delay.
A one cycle read op is added after updating the
invalidate request and before reading the
invalidate-ACK status.
See also commit
534991731cb5fa94b5519957646cf849ca10d17d.
v2: Remove GFX9 and apply only to SDMA ring.
Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
---
drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 7 +++++++
drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
2 files changed, 8 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 6e1b25bd1fe7..dedd7e1ab2fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -346,6 +346,13 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
+ /* Insert a dummy read to delay one cycle after the write REQ,
+ * and before the ACK inquiry.
+ */
+ if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA)
+ amdgpu_ring_emit_reg_wait(ring,
+ hub->vm_inv_eng0_req + eng, 0, 0);
+
/* wait for the invalidate to complete */
amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
1 << vmid, 1 << vmid);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index b8fdb192f6d6..0c41b4fdc58b 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
6 + /* sdma_v5_0_ring_emit_pipeline_sync */
/* sdma_v5_0_ring_emit_vm_flush */
SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
- SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
+ SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
.emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
.emit_ib = sdma_v5_0_ring_emit_ib,
--
2.23.0.385.gbc12974a89
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH] drm/amdgpu: GFX10: GRBM requires 1-cycle delay (v2)
@ 2019-10-25 22:59 ` Tuikov, Luben
0 siblings, 0 replies; 20+ messages in thread
From: Tuikov, Luben @ 2019-10-25 22:59 UTC (permalink / raw)
To: amd-gfx; +Cc: Deucher, Alexander, Tuikov, Luben, Koenig, Christian
The GRBM interface is now capable of bursting
1-cycle op per register, a WRITE followed by
another WRITE, or a WRITE followed by a READ--much
faster than previous muti-cycle per
completed-transaction interface. This causes a
problem, whereby status registers requiring a
read/write by hardware, have a 1-cycle delay, due
to the register update having to go through GRBM
interface.
This patch adds this delay.
A one cycle read op is added after updating the
invalidate request and before reading the
invalidate-ACK status.
See also commit
534991731cb5fa94b5519957646cf849ca10d17d.
v2: Remove GFX9 and apply only to SDMA ring.
Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
---
drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 7 +++++++
drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
2 files changed, 8 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 6e1b25bd1fe7..dedd7e1ab2fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -346,6 +346,13 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
+ /* Insert a dummy read to delay one cycle after the write REQ,
+ * and before the ACK inquiry.
+ */
+ if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA)
+ amdgpu_ring_emit_reg_wait(ring,
+ hub->vm_inv_eng0_req + eng, 0, 0);
+
/* wait for the invalidate to complete */
amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
1 << vmid, 1 << vmid);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index b8fdb192f6d6..0c41b4fdc58b 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
6 + /* sdma_v5_0_ring_emit_pipeline_sync */
/* sdma_v5_0_ring_emit_vm_flush */
SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
- SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
+ SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
.emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
.emit_ib = sdma_v5_0_ring_emit_ib,
--
2.23.0.385.gbc12974a89
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-26 12:09 ` Koenig, Christian
0 siblings, 0 replies; 20+ messages in thread
From: Koenig, Christian @ 2019-10-26 12:09 UTC (permalink / raw)
To: Tuikov, Luben, Alex Deucher
Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric,
amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
Am 26.10.19 um 00:45 schrieb Tuikov, Luben:
> On 2019-10-25 12:19 p.m., Koenig, Christian wrote:
>> Am 25.10.19 um 18:05 schrieb Alex Deucher:
>>> On Fri, Oct 25, 2019 at 2:49 AM Koenig, Christian
>>> <Christian.Koenig@amd.com> wrote:
>>>> Am 24.10.19 um 23:16 schrieb Tuikov, Luben:
>>>>> The GRBM interface is now capable of bursting
>>>>> 1-cycle op per register, a WRITE followed by
>>>>> another WRITE, or a WRITE followed by a READ--much
>>>>> faster than previous muti-cycle per
>>>>> completed-transaction interface. This causes a
>>>>> problem, whereby status registers requiring a
>>>>> read/write by hardware, have a 1-cycle delay, due
>>>>> to the register update having to go through GRBM
>>>>> interface.
>>>>>
>>>>> This patch adds this delay.
>>>>>
>>>>> A one cycle read op is added after updating the
>>>>> invalidate request and before reading the
>>>>> invalidate-ACK status.
>>>> Please completely drop all changes for GFX9 since this patch will most
>>>> likely break SRIOV.
>>>>
>>>> Additional to that please apply the workaround only to SDMA since the CP
>>>> driven engines should handle that in firmware.
> Thank you Christian for reviewing this patch.
>
> This patch stirred quite a bit of noise. So, then, I'll go by
> your last comment above--I suppose this is the desired way to go forward then?
You most likely broke the SRIOV use case on GFX9 with that, no wonder
that this raised eyebrows.
As far as I can see this manual workaround is only applicable to the
SDMA on Navi.
But we should double check that the CP firmware interface with the
combined write/wait command is correctly used on Navi/GFX10 as well.
IIRC that came in rather late for GFX9, could be that the Navi bringup
branch never had that.
Regards,
Christian.
>
> Regards,
> Luben
>
>
>>> I think the CP only handles this in firmware if we use the new TLB
>>> invalidation packet. I don't think it applies it to general register
>>> writes like we do.
>> No, on the CP we should use the combined write/wait command even if we
>> don't use the new specialized VM invalidate command. Everything else
>> won't work with SRIOV.
>>
>> Even if we want to we can't insert an extra read in this combined
>> write/wait command. And if we split up the commands we would break SRIOV
>> once more.
>>
>> So applying this workaround to the CP code doesn't make any sense at all.
>>
>> The only TODO which I can see is that we maybe don't use the combined
>> write/wait command on Navi yet.
>>
>> Christian.
>>
>>> Alex
>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>> See also commit
>>>>> 534991731cb5fa94b5519957646cf849ca10d17d.
>>>>>
>>>>> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
>>>>> ---
>>>>> drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
>>>>> drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 ++--
>>>>> drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
>>>>> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 8 ++++++++
>>>>> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
>>>>> 5 files changed, 22 insertions(+), 5 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>> index ac43b1af69e3..0042868dbd53 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>> @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>>>>> 5 + /* COND_EXEC */
>>>>> 7 + /* PIPELINE_SYNC */
>>>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>> 2 + /* VM_FLUSH */
>>>>> 8 + /* FENCE for VM_FLUSH */
>>>>> 20 + /* GDS switch */
>>>>> @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>>>>> 5 + /* hdp invalidate */
>>>>> 7 + /* gfx_v10_0_ring_emit_pipeline_sync */
>>>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>> 2 + /* gfx_v10_0_ring_emit_vm_flush */
>>>>> 8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
>>>>> .emit_ib_size = 7, /* gfx_v10_0_ring_emit_ib_compute */
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>> index 9fe95e7693d5..9a7a717208de 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>> @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>>>>> 5 + /* COND_EXEC */
>>>>> 7 + /* PIPELINE_SYNC */
>>>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>> 2 + /* VM_FLUSH */
>>>>> 8 + /* FENCE for VM_FLUSH */
>>>>> 20 + /* GDS switch */
>>>>> @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>>>>> 5 + /* hdp invalidate */
>>>>> 7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>>>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>> 2 + /* gfx_v9_0_ring_emit_vm_flush */
>>>>> 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
>>>>> .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>> index 6e1b25bd1fe7..100d526e9a42 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>> @@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>>>
>>>>> amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>>>>>
>>>>> + /* Insert a dummy read to delay one cycle before the ACK
>>>>> + * inquiry.
>>>>> + */
>>>>> + if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
>>>>> + ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
>>>>> + ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>>>> + amdgpu_ring_emit_reg_wait(ring,
>>>>> + hub->vm_inv_eng0_req + eng, 0, 0);
>>>>> +
>>>>> /* wait for the invalidate to complete */
>>>>> amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>>>>> 1 << vmid, 1 << vmid);
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>> index 9f2a893871ec..8f3097e45299 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>> @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>>> amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>>>>> upper_32_bits(pd_addr));
>>>>>
>>>>> + /* Insert a dummy read to delay one cycle before the ACK
>>>>> + * inquiry.
>>>>> + */
>>>>> + if (ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
>>>>> + ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>>>> + amdgpu_ring_emit_reg_wait(ring,
>>>>> + hub->vm_inv_eng0_req + eng, 0, 0);
>>>>> +
>>>>> amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
>>>>> hub->vm_inv_eng0_ack + eng,
>>>>> req, 1 << vmid);
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>>> index b8fdb192f6d6..0c41b4fdc58b 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>>> @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>>>>> 6 + /* sdma_v5_0_ring_emit_pipeline_sync */
>>>>> /* sdma_v5_0_ring_emit_vm_flush */
>>>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
>>>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
>>>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
>>>>> 10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
>>>>> .emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
>>>>> .emit_ib = sdma_v5_0_ring_emit_ib,
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-26 12:09 ` Koenig, Christian
0 siblings, 0 replies; 20+ messages in thread
From: Koenig, Christian @ 2019-10-26 12:09 UTC (permalink / raw)
To: Tuikov, Luben, Alex Deucher
Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, amd-gfx
Am 26.10.19 um 00:45 schrieb Tuikov, Luben:
> On 2019-10-25 12:19 p.m., Koenig, Christian wrote:
>> Am 25.10.19 um 18:05 schrieb Alex Deucher:
>>> On Fri, Oct 25, 2019 at 2:49 AM Koenig, Christian
>>> <Christian.Koenig@amd.com> wrote:
>>>> Am 24.10.19 um 23:16 schrieb Tuikov, Luben:
>>>>> The GRBM interface is now capable of bursting
>>>>> 1-cycle op per register, a WRITE followed by
>>>>> another WRITE, or a WRITE followed by a READ--much
>>>>> faster than previous muti-cycle per
>>>>> completed-transaction interface. This causes a
>>>>> problem, whereby status registers requiring a
>>>>> read/write by hardware, have a 1-cycle delay, due
>>>>> to the register update having to go through GRBM
>>>>> interface.
>>>>>
>>>>> This patch adds this delay.
>>>>>
>>>>> A one cycle read op is added after updating the
>>>>> invalidate request and before reading the
>>>>> invalidate-ACK status.
>>>> Please completely drop all changes for GFX9 since this patch will most
>>>> likely break SRIOV.
>>>>
>>>> Additional to that please apply the workaround only to SDMA since the CP
>>>> driven engines should handle that in firmware.
> Thank you Christian for reviewing this patch.
>
> This patch stirred quite a bit of noise. So, then, I'll go by
> your last comment above--I suppose this is the desired way to go forward then?
You most likely broke the SRIOV use case on GFX9 with that, no wonder
that this raised eyebrows.
As far as I can see this manual workaround is only applicable to the
SDMA on Navi.
But we should double check that the CP firmware interface with the
combined write/wait command is correctly used on Navi/GFX10 as well.
IIRC that came in rather late for GFX9, could be that the Navi bringup
branch never had that.
Regards,
Christian.
>
> Regards,
> Luben
>
>
>>> I think the CP only handles this in firmware if we use the new TLB
>>> invalidation packet. I don't think it applies it to general register
>>> writes like we do.
>> No, on the CP we should use the combined write/wait command even if we
>> don't use the new specialized VM invalidate command. Everything else
>> won't work with SRIOV.
>>
>> Even if we want to we can't insert an extra read in this combined
>> write/wait command. And if we split up the commands we would break SRIOV
>> once more.
>>
>> So applying this workaround to the CP code doesn't make any sense at all.
>>
>> The only TODO which I can see is that we maybe don't use the combined
>> write/wait command on Navi yet.
>>
>> Christian.
>>
>>> Alex
>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>> See also commit
>>>>> 534991731cb5fa94b5519957646cf849ca10d17d.
>>>>>
>>>>> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
>>>>> ---
>>>>> drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
>>>>> drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 ++--
>>>>> drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
>>>>> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 8 ++++++++
>>>>> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
>>>>> 5 files changed, 22 insertions(+), 5 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>> index ac43b1af69e3..0042868dbd53 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>> @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>>>>> 5 + /* COND_EXEC */
>>>>> 7 + /* PIPELINE_SYNC */
>>>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>> 2 + /* VM_FLUSH */
>>>>> 8 + /* FENCE for VM_FLUSH */
>>>>> 20 + /* GDS switch */
>>>>> @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>>>>> 5 + /* hdp invalidate */
>>>>> 7 + /* gfx_v10_0_ring_emit_pipeline_sync */
>>>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>> 2 + /* gfx_v10_0_ring_emit_vm_flush */
>>>>> 8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
>>>>> .emit_ib_size = 7, /* gfx_v10_0_ring_emit_ib_compute */
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>> index 9fe95e7693d5..9a7a717208de 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>> @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>>>>> 5 + /* COND_EXEC */
>>>>> 7 + /* PIPELINE_SYNC */
>>>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>> 2 + /* VM_FLUSH */
>>>>> 8 + /* FENCE for VM_FLUSH */
>>>>> 20 + /* GDS switch */
>>>>> @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>>>>> 5 + /* hdp invalidate */
>>>>> 7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>>>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>> 2 + /* gfx_v9_0_ring_emit_vm_flush */
>>>>> 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
>>>>> .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>> index 6e1b25bd1fe7..100d526e9a42 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>> @@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>>>
>>>>> amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>>>>>
>>>>> + /* Insert a dummy read to delay one cycle before the ACK
>>>>> + * inquiry.
>>>>> + */
>>>>> + if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
>>>>> + ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
>>>>> + ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>>>> + amdgpu_ring_emit_reg_wait(ring,
>>>>> + hub->vm_inv_eng0_req + eng, 0, 0);
>>>>> +
>>>>> /* wait for the invalidate to complete */
>>>>> amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>>>>> 1 << vmid, 1 << vmid);
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>> index 9f2a893871ec..8f3097e45299 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>> @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>>> amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>>>>> upper_32_bits(pd_addr));
>>>>>
>>>>> + /* Insert a dummy read to delay one cycle before the ACK
>>>>> + * inquiry.
>>>>> + */
>>>>> + if (ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
>>>>> + ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>>>> + amdgpu_ring_emit_reg_wait(ring,
>>>>> + hub->vm_inv_eng0_req + eng, 0, 0);
>>>>> +
>>>>> amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
>>>>> hub->vm_inv_eng0_ack + eng,
>>>>> req, 1 << vmid);
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>>> index b8fdb192f6d6..0c41b4fdc58b 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>>> @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>>>>> 6 + /* sdma_v5_0_ring_emit_pipeline_sync */
>>>>> /* sdma_v5_0_ring_emit_vm_flush */
>>>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
>>>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
>>>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
>>>>> 10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
>>>>> .emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
>>>>> .emit_ib = sdma_v5_0_ring_emit_ib,
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-27 21:25 ` Tuikov, Luben
0 siblings, 0 replies; 20+ messages in thread
From: Tuikov, Luben @ 2019-10-27 21:25 UTC (permalink / raw)
To: Koenig, Christian, Alex Deucher
Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric,
amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
On 2019-10-26 08:09, Koenig, Christian wrote:
> Am 26.10.19 um 00:45 schrieb Tuikov, Luben:
>> On 2019-10-25 12:19 p.m., Koenig, Christian wrote:
>>> Am 25.10.19 um 18:05 schrieb Alex Deucher:
>>>> On Fri, Oct 25, 2019 at 2:49 AM Koenig, Christian
>>>> <Christian.Koenig@amd.com> wrote:
>>>>> Am 24.10.19 um 23:16 schrieb Tuikov, Luben:
>>>>>> The GRBM interface is now capable of bursting
>>>>>> 1-cycle op per register, a WRITE followed by
>>>>>> another WRITE, or a WRITE followed by a READ--much
>>>>>> faster than previous muti-cycle per
>>>>>> completed-transaction interface. This causes a
>>>>>> problem, whereby status registers requiring a
>>>>>> read/write by hardware, have a 1-cycle delay, due
>>>>>> to the register update having to go through GRBM
>>>>>> interface.
>>>>>>
>>>>>> This patch adds this delay.
>>>>>>
>>>>>> A one cycle read op is added after updating the
>>>>>> invalidate request and before reading the
>>>>>> invalidate-ACK status.
>>>>> Please completely drop all changes for GFX9 since this patch will most
>>>>> likely break SRIOV.
>>>>>
>>>>> Additional to that please apply the workaround only to SDMA since the CP
>>>>> driven engines should handle that in firmware.
>> Thank you Christian for reviewing this patch.
>>
>> This patch stirred quite a bit of noise. So, then, I'll go by
>> your last comment above--I suppose this is the desired way to go forward then?
>
> You most likely broke the SRIOV use case on GFX9 with that, no wonder
> that this raised eyebrows.
>
> As far as I can see this manual workaround is only applicable to the
> SDMA on Navi.
Did you see the (v2) patch?
Regards,
Luben
>
> But we should double check that the CP firmware interface with the
> combined write/wait command is correctly used on Navi/GFX10 as well.
> IIRC that came in rather late for GFX9, could be that the Navi bringup
> branch never had that.
>
> Regards,
> Christian.
>
>>
>> Regards,
>> Luben
>>
>>
>>>> I think the CP only handles this in firmware if we use the new TLB
>>>> invalidation packet. I don't think it applies it to general register
>>>> writes like we do.
>>> No, on the CP we should use the combined write/wait command even if we
>>> don't use the new specialized VM invalidate command. Everything else
>>> won't work with SRIOV.
>>>
>>> Even if we want to we can't insert an extra read in this combined
>>> write/wait command. And if we split up the commands we would break SRIOV
>>> once more.
>>>
>>> So applying this workaround to the CP code doesn't make any sense at all.
>>>
>>> The only TODO which I can see is that we maybe don't use the combined
>>> write/wait command on Navi yet.
>>>
>>> Christian.
>>>
>>>> Alex
>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>>> See also commit
>>>>>> 534991731cb5fa94b5519957646cf849ca10d17d.
>>>>>>
>>>>>> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
>>>>>> ---
>>>>>> drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
>>>>>> drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 ++--
>>>>>> drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
>>>>>> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 8 ++++++++
>>>>>> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
>>>>>> 5 files changed, 22 insertions(+), 5 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>>> index ac43b1af69e3..0042868dbd53 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>>> @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>>>>>> 5 + /* COND_EXEC */
>>>>>> 7 + /* PIPELINE_SYNC */
>>>>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>>> 2 + /* VM_FLUSH */
>>>>>> 8 + /* FENCE for VM_FLUSH */
>>>>>> 20 + /* GDS switch */
>>>>>> @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>>>>>> 5 + /* hdp invalidate */
>>>>>> 7 + /* gfx_v10_0_ring_emit_pipeline_sync */
>>>>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>>> 2 + /* gfx_v10_0_ring_emit_vm_flush */
>>>>>> 8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
>>>>>> .emit_ib_size = 7, /* gfx_v10_0_ring_emit_ib_compute */
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>>> index 9fe95e7693d5..9a7a717208de 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>>> @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>>>>>> 5 + /* COND_EXEC */
>>>>>> 7 + /* PIPELINE_SYNC */
>>>>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>>> 2 + /* VM_FLUSH */
>>>>>> 8 + /* FENCE for VM_FLUSH */
>>>>>> 20 + /* GDS switch */
>>>>>> @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>>>>>> 5 + /* hdp invalidate */
>>>>>> 7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>>>>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>>> 2 + /* gfx_v9_0_ring_emit_vm_flush */
>>>>>> 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
>>>>>> .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>>> index 6e1b25bd1fe7..100d526e9a42 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>>> @@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>>>>
>>>>>> amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>>>>>>
>>>>>> + /* Insert a dummy read to delay one cycle before the ACK
>>>>>> + * inquiry.
>>>>>> + */
>>>>>> + if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
>>>>>> + ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
>>>>>> + ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>>>>> + amdgpu_ring_emit_reg_wait(ring,
>>>>>> + hub->vm_inv_eng0_req + eng, 0, 0);
>>>>>> +
>>>>>> /* wait for the invalidate to complete */
>>>>>> amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>>>>>> 1 << vmid, 1 << vmid);
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>> index 9f2a893871ec..8f3097e45299 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>> @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>>>> amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>>>>>> upper_32_bits(pd_addr));
>>>>>>
>>>>>> + /* Insert a dummy read to delay one cycle before the ACK
>>>>>> + * inquiry.
>>>>>> + */
>>>>>> + if (ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
>>>>>> + ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>>>>> + amdgpu_ring_emit_reg_wait(ring,
>>>>>> + hub->vm_inv_eng0_req + eng, 0, 0);
>>>>>> +
>>>>>> amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
>>>>>> hub->vm_inv_eng0_ack + eng,
>>>>>> req, 1 << vmid);
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>>>> index b8fdb192f6d6..0c41b4fdc58b 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>>>> @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>>>>>> 6 + /* sdma_v5_0_ring_emit_pipeline_sync */
>>>>>> /* sdma_v5_0_ring_emit_vm_flush */
>>>>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
>>>>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
>>>>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
>>>>>> 10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
>>>>>> .emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
>>>>>> .emit_ib = sdma_v5_0_ring_emit_ib,
>>>>> _______________________________________________
>>>>> amd-gfx mailing list
>>>>> amd-gfx@lists.freedesktop.org
>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-27 21:25 ` Tuikov, Luben
0 siblings, 0 replies; 20+ messages in thread
From: Tuikov, Luben @ 2019-10-27 21:25 UTC (permalink / raw)
To: Koenig, Christian, Alex Deucher
Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, amd-gfx
On 2019-10-26 08:09, Koenig, Christian wrote:
> Am 26.10.19 um 00:45 schrieb Tuikov, Luben:
>> On 2019-10-25 12:19 p.m., Koenig, Christian wrote:
>>> Am 25.10.19 um 18:05 schrieb Alex Deucher:
>>>> On Fri, Oct 25, 2019 at 2:49 AM Koenig, Christian
>>>> <Christian.Koenig@amd.com> wrote:
>>>>> Am 24.10.19 um 23:16 schrieb Tuikov, Luben:
>>>>>> The GRBM interface is now capable of bursting
>>>>>> 1-cycle op per register, a WRITE followed by
>>>>>> another WRITE, or a WRITE followed by a READ--much
>>>>>> faster than previous muti-cycle per
>>>>>> completed-transaction interface. This causes a
>>>>>> problem, whereby status registers requiring a
>>>>>> read/write by hardware, have a 1-cycle delay, due
>>>>>> to the register update having to go through GRBM
>>>>>> interface.
>>>>>>
>>>>>> This patch adds this delay.
>>>>>>
>>>>>> A one cycle read op is added after updating the
>>>>>> invalidate request and before reading the
>>>>>> invalidate-ACK status.
>>>>> Please completely drop all changes for GFX9 since this patch will most
>>>>> likely break SRIOV.
>>>>>
>>>>> Additional to that please apply the workaround only to SDMA since the CP
>>>>> driven engines should handle that in firmware.
>> Thank you Christian for reviewing this patch.
>>
>> This patch stirred quite a bit of noise. So, then, I'll go by
>> your last comment above--I suppose this is the desired way to go forward then?
>
> You most likely broke the SRIOV use case on GFX9 with that, no wonder
> that this raised eyebrows.
>
> As far as I can see this manual workaround is only applicable to the
> SDMA on Navi.
Did you see the (v2) patch?
Regards,
Luben
>
> But we should double check that the CP firmware interface with the
> combined write/wait command is correctly used on Navi/GFX10 as well.
> IIRC that came in rather late for GFX9, could be that the Navi bringup
> branch never had that.
>
> Regards,
> Christian.
>
>>
>> Regards,
>> Luben
>>
>>
>>>> I think the CP only handles this in firmware if we use the new TLB
>>>> invalidation packet. I don't think it applies it to general register
>>>> writes like we do.
>>> No, on the CP we should use the combined write/wait command even if we
>>> don't use the new specialized VM invalidate command. Everything else
>>> won't work with SRIOV.
>>>
>>> Even if we want to we can't insert an extra read in this combined
>>> write/wait command. And if we split up the commands we would break SRIOV
>>> once more.
>>>
>>> So applying this workaround to the CP code doesn't make any sense at all.
>>>
>>> The only TODO which I can see is that we maybe don't use the combined
>>> write/wait command on Navi yet.
>>>
>>> Christian.
>>>
>>>> Alex
>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>>> See also commit
>>>>>> 534991731cb5fa94b5519957646cf849ca10d17d.
>>>>>>
>>>>>> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
>>>>>> ---
>>>>>> drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
>>>>>> drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 ++--
>>>>>> drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
>>>>>> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 8 ++++++++
>>>>>> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
>>>>>> 5 files changed, 22 insertions(+), 5 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>>> index ac43b1af69e3..0042868dbd53 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>>> @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>>>>>> 5 + /* COND_EXEC */
>>>>>> 7 + /* PIPELINE_SYNC */
>>>>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>>> 2 + /* VM_FLUSH */
>>>>>> 8 + /* FENCE for VM_FLUSH */
>>>>>> 20 + /* GDS switch */
>>>>>> @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>>>>>> 5 + /* hdp invalidate */
>>>>>> 7 + /* gfx_v10_0_ring_emit_pipeline_sync */
>>>>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>>> 2 + /* gfx_v10_0_ring_emit_vm_flush */
>>>>>> 8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
>>>>>> .emit_ib_size = 7, /* gfx_v10_0_ring_emit_ib_compute */
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>>> index 9fe95e7693d5..9a7a717208de 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>>> @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>>>>>> 5 + /* COND_EXEC */
>>>>>> 7 + /* PIPELINE_SYNC */
>>>>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>>> 2 + /* VM_FLUSH */
>>>>>> 8 + /* FENCE for VM_FLUSH */
>>>>>> 20 + /* GDS switch */
>>>>>> @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>>>>>> 5 + /* hdp invalidate */
>>>>>> 7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>>>>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>>> 2 + /* gfx_v9_0_ring_emit_vm_flush */
>>>>>> 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
>>>>>> .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>>> index 6e1b25bd1fe7..100d526e9a42 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>>> @@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>>>>
>>>>>> amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>>>>>>
>>>>>> + /* Insert a dummy read to delay one cycle before the ACK
>>>>>> + * inquiry.
>>>>>> + */
>>>>>> + if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
>>>>>> + ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
>>>>>> + ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>>>>> + amdgpu_ring_emit_reg_wait(ring,
>>>>>> + hub->vm_inv_eng0_req + eng, 0, 0);
>>>>>> +
>>>>>> /* wait for the invalidate to complete */
>>>>>> amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>>>>>> 1 << vmid, 1 << vmid);
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>> index 9f2a893871ec..8f3097e45299 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>> @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>>>> amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>>>>>> upper_32_bits(pd_addr));
>>>>>>
>>>>>> + /* Insert a dummy read to delay one cycle before the ACK
>>>>>> + * inquiry.
>>>>>> + */
>>>>>> + if (ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
>>>>>> + ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>>>>> + amdgpu_ring_emit_reg_wait(ring,
>>>>>> + hub->vm_inv_eng0_req + eng, 0, 0);
>>>>>> +
>>>>>> amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
>>>>>> hub->vm_inv_eng0_ack + eng,
>>>>>> req, 1 << vmid);
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>>>> index b8fdb192f6d6..0c41b4fdc58b 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>>>> @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>>>>>> 6 + /* sdma_v5_0_ring_emit_pipeline_sync */
>>>>>> /* sdma_v5_0_ring_emit_vm_flush */
>>>>>> SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
>>>>>> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
>>>>>> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
>>>>>> 10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
>>>>>> .emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
>>>>>> .emit_ib = sdma_v5_0_ring_emit_ib,
>>>>> _______________________________________________
>>>>> amd-gfx mailing list
>>>>> amd-gfx@lists.freedesktop.org
>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] drm/amdgpu: GFX10: GRBM requires 1-cycle delay (v2)
@ 2019-10-28 10:51 ` Koenig, Christian
0 siblings, 0 replies; 20+ messages in thread
From: Koenig, Christian @ 2019-10-28 10:51 UTC (permalink / raw)
To: Tuikov, Luben, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
Cc: Deucher, Alexander
Am 26.10.19 um 00:59 schrieb Tuikov, Luben:
> The GRBM interface is now capable of bursting
> 1-cycle op per register, a WRITE followed by
> another WRITE, or a WRITE followed by a READ--much
> faster than previous muti-cycle per
> completed-transaction interface. This causes a
> problem, whereby status registers requiring a
> read/write by hardware, have a 1-cycle delay, due
> to the register update having to go through GRBM
> interface.
>
> This patch adds this delay.
>
> A one cycle read op is added after updating the
> invalidate request and before reading the
> invalidate-ACK status.
>
> See also commit
> 534991731cb5fa94b5519957646cf849ca10d17d.
>
> v2: Remove GFX9 and apply only to SDMA ring.
>
> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 7 +++++++
> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
> 2 files changed, 8 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 6e1b25bd1fe7..dedd7e1ab2fb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -346,6 +346,13 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>
> amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>
> + /* Insert a dummy read to delay one cycle after the write REQ,
> + * and before the ACK inquiry.
> + */
> + if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA)
> + amdgpu_ring_emit_reg_wait(ring,
> + hub->vm_inv_eng0_req + eng, 0, 0);
> +
> /* wait for the invalidate to complete */
> amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
> 1 << vmid, 1 << vmid);
Looks like we indeed doesn't use the integrated write/wait CP command on
Navi.
Anyway that is a completely separate issue, this patch is Reviewed-by:
Christian König <christian.koenig@amd.com>
Regards,
Christian.
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> index b8fdb192f6d6..0c41b4fdc58b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
> 6 + /* sdma_v5_0_ring_emit_pipeline_sync */
> /* sdma_v5_0_ring_emit_vm_flush */
> SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
> 10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
> .emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
> .emit_ib = sdma_v5_0_ring_emit_ib,
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] drm/amdgpu: GFX10: GRBM requires 1-cycle delay (v2)
@ 2019-10-28 10:51 ` Koenig, Christian
0 siblings, 0 replies; 20+ messages in thread
From: Koenig, Christian @ 2019-10-28 10:51 UTC (permalink / raw)
To: Tuikov, Luben, amd-gfx; +Cc: Deucher, Alexander
Am 26.10.19 um 00:59 schrieb Tuikov, Luben:
> The GRBM interface is now capable of bursting
> 1-cycle op per register, a WRITE followed by
> another WRITE, or a WRITE followed by a READ--much
> faster than previous muti-cycle per
> completed-transaction interface. This causes a
> problem, whereby status registers requiring a
> read/write by hardware, have a 1-cycle delay, due
> to the register update having to go through GRBM
> interface.
>
> This patch adds this delay.
>
> A one cycle read op is added after updating the
> invalidate request and before reading the
> invalidate-ACK status.
>
> See also commit
> 534991731cb5fa94b5519957646cf849ca10d17d.
>
> v2: Remove GFX9 and apply only to SDMA ring.
>
> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 7 +++++++
> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
> 2 files changed, 8 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 6e1b25bd1fe7..dedd7e1ab2fb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -346,6 +346,13 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>
> amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>
> + /* Insert a dummy read to delay one cycle after the write REQ,
> + * and before the ACK inquiry.
> + */
> + if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA)
> + amdgpu_ring_emit_reg_wait(ring,
> + hub->vm_inv_eng0_req + eng, 0, 0);
> +
> /* wait for the invalidate to complete */
> amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
> 1 << vmid, 1 << vmid);
Looks like we indeed doesn't use the integrated write/wait CP command on
Navi.
Anyway that is a completely separate issue, this patch is Reviewed-by:
Christian König <christian.koenig@amd.com>
Regards,
Christian.
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> index b8fdb192f6d6..0c41b4fdc58b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
> 6 + /* sdma_v5_0_ring_emit_pipeline_sync */
> /* sdma_v5_0_ring_emit_vm_flush */
> SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
> + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
> 10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
> .emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
> .emit_ib = sdma_v5_0_ring_emit_ib,
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 20+ messages in thread
end of thread, other threads:[~2019-10-28 10:51 UTC | newest]
Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-10-24 21:16 [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay Tuikov, Luben
2019-10-24 21:16 ` Tuikov, Luben
[not found] ` <20191024211430.25399-1-luben.tuikov-5C7GfCeVMHo@public.gmane.org>
2019-10-25 3:20 ` Zhu, Changfeng
2019-10-25 3:20 ` Zhu, Changfeng
2019-10-25 6:49 ` Koenig, Christian
2019-10-25 6:49 ` Koenig, Christian
[not found] ` <6be2805a-dddc-7b02-84ea-f52fab9780b0-5C7GfCeVMHo@public.gmane.org>
2019-10-25 16:05 ` Alex Deucher
2019-10-25 16:05 ` Alex Deucher
[not found] ` <CADnq5_NsTABDWTMBFcQBGfaBganBpzN+YQ0gmw55pa8PswNZYA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2019-10-25 16:19 ` Koenig, Christian
2019-10-25 16:19 ` Koenig, Christian
[not found] ` <b40c78f1-17a5-f0f9-183e-0c78fd7163e9-5C7GfCeVMHo@public.gmane.org>
2019-10-25 22:45 ` Tuikov, Luben
2019-10-25 22:45 ` Tuikov, Luben
[not found] ` <c3e496c7-2ace-149e-0c51-92dd1342d31d-5C7GfCeVMHo@public.gmane.org>
2019-10-26 12:09 ` Koenig, Christian
2019-10-26 12:09 ` Koenig, Christian
[not found] ` <122f3bde-5fd0-1fa5-864c-547c0cefb744-5C7GfCeVMHo@public.gmane.org>
2019-10-27 21:25 ` Tuikov, Luben
2019-10-27 21:25 ` Tuikov, Luben
2019-10-25 22:59 ` [PATCH] drm/amdgpu: GFX10: GRBM requires 1-cycle delay (v2) Tuikov, Luben
2019-10-25 22:59 ` Tuikov, Luben
[not found] ` <20191025225930.15074-1-luben.tuikov-5C7GfCeVMHo@public.gmane.org>
2019-10-28 10:51 ` Koenig, Christian
2019-10-28 10:51 ` Koenig, Christian
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.