All of lore.kernel.org
 help / color / mirror / Atom feed
* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-25  9:26 ` Huang, Ray
  0 siblings, 0 replies; 30+ messages in thread
From: Huang, Ray @ 2019-10-25  9:26 UTC (permalink / raw)
  To: Tuikov, Luben
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Koenig,
	Christian, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On Thu, Oct 24, 2019 at 09:16:55PM +0000, Tuikov, Luben wrote:
> The GRBM interface is now capable of bursting 1-cycle op per register, 
> a WRITE followed by another WRITE, or a WRITE followed by a READ--much 
> faster than previous multi-cycle per completed-transaction interface. 
> This causes a problem, whereby status registers requiring a read/write 
> by hardware, have a 1-cycle delay, due to the register update having 
> to go through GRBM interface.
> 
> This patch adds this delay.
> 
> A one cycle read op is added after updating the invalidate request and 
> before reading the invalidate-ACK status.
> 
> See also commit
> 534991731cb5fa94b5519957646cf849ca10d17d.
> 
> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--  
> drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 4 ++--  
> drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++  
> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 8 ++++++++  
> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
>  5 files changed, 22 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index ac43b1af69e3..0042868dbd53 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>  		5 + /* COND_EXEC */
>  		7 + /* PIPELINE_SYNC */
>  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>  		2 + /* VM_FLUSH */
>  		8 + /* FENCE for VM_FLUSH */
>  		20 + /* GDS switch */
> @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>  		5 + /* hdp invalidate */
>  		7 + /* gfx_v10_0_ring_emit_pipeline_sync */
>  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>  		2 + /* gfx_v10_0_ring_emit_vm_flush */
>  		8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
>  	.emit_ib_size =	7, /* gfx_v10_0_ring_emit_ib_compute */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 9fe95e7693d5..9a7a717208de 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>  		5 +  /* COND_EXEC */
>  		7 +  /* PIPELINE_SYNC */
>  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>  		2 + /* VM_FLUSH */
>  		8 +  /* FENCE for VM_FLUSH */
>  		20 + /* GDS switch */
> @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>  		5 + /* hdp invalidate */
>  		7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>  		2 + /* gfx_v9_0_ring_emit_vm_flush */
>  		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
>  	.emit_ib_size =	7, /* gfx_v9_0_ring_emit_ib_compute */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 6e1b25bd1fe7..100d526e9a42 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -346,6 +346,15 @@ static uint64_t 
> gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>  
>  	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>  
> +	/* Insert a dummy read to delay one cycle before the ACK
> +	 * inquiry.
> +	 */
> +	if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
> +	    ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
> +	    ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
> +		amdgpu_ring_emit_reg_wait(ring,
> +					  hub->vm_inv_eng0_req + eng, 0, 0);
> +
>  	/* wait for the invalidate to complete */
>  	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>  				  1 << vmid, 1 << vmid);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 9f2a893871ec..8f3097e45299 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>  	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>  			      upper_32_bits(pd_addr));
>  
> +	/* Insert a dummy read to delay one cycle before the ACK
> +	 * inquiry.
> +	 */
> +	if (ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
> +	    ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
> +		amdgpu_ring_emit_reg_wait(ring,
> +					  hub->vm_inv_eng0_req + eng, 0, 0);

The workaround should be to add a dummy read (one cycle delay) after we write VM_INVALIDATE_ENGx_REQ and before we poll the VM_INVALIDATE_ENGx_ACK.
If you add it here, that cannot resolve the issue. I think you should implement the dummy read in the function below: amdgpu_ring_emit_reg_write_reg_wait().

Thanks,
Ray

> +
>  	amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
>  					    hub->vm_inv_eng0_ack + eng,
>  					    req, 1 << vmid);
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c 
> b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> index b8fdb192f6d6..0c41b4fdc58b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>  		6 + /* sdma_v5_0_ring_emit_pipeline_sync */
>  		/* sdma_v5_0_ring_emit_vm_flush */
>  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
>  		10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
>  	.emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
>  	.emit_ib = sdma_v5_0_ring_emit_ib,
> --
> 2.23.0.385.gbc12974a89
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-25  9:26 ` Huang, Ray
  0 siblings, 0 replies; 30+ messages in thread
From: Huang, Ray @ 2019-10-25  9:26 UTC (permalink / raw)
  To: Tuikov, Luben
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Koenig,
	Christian, amd-gfx

On Thu, Oct 24, 2019 at 09:16:55PM +0000, Tuikov, Luben wrote:
> The GRBM interface is now capable of bursting 1-cycle op per register, 
> a WRITE followed by another WRITE, or a WRITE followed by a READ--much 
> faster than previous multi-cycle per completed-transaction interface. 
> This causes a problem, whereby status registers requiring a read/write 
> by hardware, have a 1-cycle delay, due to the register update having 
> to go through GRBM interface.
> 
> This patch adds this delay.
> 
> A one cycle read op is added after updating the invalidate request and 
> before reading the invalidate-ACK status.
> 
> See also commit
> 534991731cb5fa94b5519957646cf849ca10d17d.
> 
> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--  
> drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 4 ++--  
> drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++  
> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 8 ++++++++  
> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
>  5 files changed, 22 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index ac43b1af69e3..0042868dbd53 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>  		5 + /* COND_EXEC */
>  		7 + /* PIPELINE_SYNC */
>  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>  		2 + /* VM_FLUSH */
>  		8 + /* FENCE for VM_FLUSH */
>  		20 + /* GDS switch */
> @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>  		5 + /* hdp invalidate */
>  		7 + /* gfx_v10_0_ring_emit_pipeline_sync */
>  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>  		2 + /* gfx_v10_0_ring_emit_vm_flush */
>  		8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
>  	.emit_ib_size =	7, /* gfx_v10_0_ring_emit_ib_compute */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 9fe95e7693d5..9a7a717208de 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>  		5 +  /* COND_EXEC */
>  		7 +  /* PIPELINE_SYNC */
>  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>  		2 + /* VM_FLUSH */
>  		8 +  /* FENCE for VM_FLUSH */
>  		20 + /* GDS switch */
> @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>  		5 + /* hdp invalidate */
>  		7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>  		2 + /* gfx_v9_0_ring_emit_vm_flush */
>  		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
>  	.emit_ib_size =	7, /* gfx_v9_0_ring_emit_ib_compute */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 6e1b25bd1fe7..100d526e9a42 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -346,6 +346,15 @@ static uint64_t 
> gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>  
>  	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>  
> +	/* Insert a dummy read to delay one cycle before the ACK
> +	 * inquiry.
> +	 */
> +	if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
> +	    ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
> +	    ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
> +		amdgpu_ring_emit_reg_wait(ring,
> +					  hub->vm_inv_eng0_req + eng, 0, 0);
> +
>  	/* wait for the invalidate to complete */
>  	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>  				  1 << vmid, 1 << vmid);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 9f2a893871ec..8f3097e45299 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>  	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>  			      upper_32_bits(pd_addr));
>  
> +	/* Insert a dummy read to delay one cycle before the ACK
> +	 * inquiry.
> +	 */
> +	if (ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
> +	    ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
> +		amdgpu_ring_emit_reg_wait(ring,
> +					  hub->vm_inv_eng0_req + eng, 0, 0);

The workaround should be to add a dummy read (one cycle delay) after we write VM_INVALIDATE_ENGx_REQ and before we poll the VM_INVALIDATE_ENGx_ACK.
If you add it here, that cannot resolve the issue. I think you should implement the dummy read in the function below: amdgpu_ring_emit_reg_write_reg_wait().

Thanks,
Ray

> +
>  	amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
>  					    hub->vm_inv_eng0_ack + eng,
>  					    req, 1 << vmid);
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c 
> b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> index b8fdb192f6d6..0c41b4fdc58b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>  		6 + /* sdma_v5_0_ring_emit_pipeline_sync */
>  		/* sdma_v5_0_ring_emit_vm_flush */
>  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
>  		10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
>  	.emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
>  	.emit_ib = sdma_v5_0_ring_emit_ib,
> --
> 2.23.0.385.gbc12974a89
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* RE: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-25 14:22     ` Zhu, Changfeng
  0 siblings, 0 replies; 30+ messages in thread
From: Zhu, Changfeng @ 2019-10-25 14:22 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Huang, Ray,
	Koenig, Christian, Tuikov, Luben

[-- Attachment #1: Type: text/plain, Size: 14598 bytes --]

I try to write a patch based on the patch of Tuikov,Luben.

Inspired by Luben,here is the patch:

From 1980d8f1ed44fb9a84a5ea1f6e2edd2bc25c629a Mon Sep 17 00:00:00 2001
From: changzhu <Changfeng.Zhu@amd.com>
Date: Thu, 10 Oct 2019 11:02:33 +0800
Subject: [PATCH] drm/amdgpu: add dummy read by engines for some GCVM status
 registers

The GRBM register interface is now capable of bursting 1 cycle per
register wr->wr, wr->rd much faster than previous muticycle per
transaction done interface.  This has caused a problem where
status registers requiring HW to update have a 1 cycle delay, due
to the register update having to go through GRBM.

SW may operate on an incorrect value if they write a register and
immediately check the corresponding status register.

Registers requiring HW to clear or set fields may be delayed by 1 cycle.
For example,

1. write VM_INVALIDATE_ENG0_REQ mask = 5a
2. read VM_INVALIDATE_ENG0_ACK till the ack is same as the request mask = 5a
    	a. HW will reset VM_INVALIDATE_ENG0_ACK = 0 until invalidation is complete
3. write VM_INVALIDATE_ENG0_REQ mask = 5a
4. read VM_INVALIDATE_ENG0_ACK till the ack is same as the request mask = 5a
	a. First read of VM_INVALIDATE_ENG0_ACK = 5a instead of 0
	b. Second read of VM_INVALIDATE_ENG0_ACK = 0 because the remote GRBM h/w
	   register takes one extra cycle to be cleared
	c. In this case, SW will see a false ACK if they exit on first read

Affected registers (only GC variant)  | Recommended Dummy Read
--------------------------------------+----------------------------
VM_INVALIDATE_ENG*_ACK		      |  VM_INVALIDATE_ENG*_REQ
VM_L2_STATUS			      |  VM_L2_STATUS
VM_L2_PROTECTION_FAULT_STATUS	      |  VM_L2_PROTECTION_FAULT_STATUS
VM_L2_PROTECTION_FAULT_ADDR_HI/LO32   |  VM_L2_PROTECTION_FAULT_ADDR_HI/LO32
VM_L2_IH_LOG_BUSY		      |  VM_L2_IH_LOG_BUSY
MC_VM_L2_PERFCOUNTER_HI/LO	      |  MC_VM_L2_PERFCOUNTER_HI/LO
ATC_L2_PERFCOUNTER_HI/LO	      |  ATC_L2_PERFCOUNTER_HI/LO
ATC_L2_PERFCOUNTER2_HI/LO	      |  ATC_L2_PERFCOUNTER2_HI/LO

It also needs dummy read by engines for these gc registers.

Change-Id: Ie028f37eb789966d4593984bd661b248ebeb1ac3
Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c |  5 +++++
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   |  2 ++
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    |  2 ++
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c   |  4 ++++
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c   | 18 ++++++++++++++++++
 5 files changed, 31 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 4b3f58dbf36f..c2fbf6087ecf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -392,6 +392,11 @@ void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
 						uint32_t ref, uint32_t mask)
 {
 	amdgpu_ring_emit_wreg(ring, reg0, ref);
+
+	/* wait for a cycle to reset vm_inv_eng0_ack */
+	if (ring->funcs->vmhub == AMDGPU_GFXHUB_0)
+		amdgpu_ring_emit_rreg(ring, reg0);
+
 	amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask);
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index ef1975a5323a..104c47734316 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -5155,6 +5155,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
 	.patch_cond_exec = gfx_v10_0_ring_emit_patch_cond_exec,
 	.preempt_ib = gfx_v10_0_ring_preempt_ib,
 	.emit_tmz = gfx_v10_0_ring_emit_tmz,
+	.emit_rreg = gfx_v10_0_ring_emit_rreg,
 	.emit_wreg = gfx_v10_0_ring_emit_wreg,
 	.emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
 };
@@ -5188,6 +5189,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
 	.test_ib = gfx_v10_0_ring_test_ib,
 	.insert_nop = amdgpu_ring_insert_nop,
 	.pad_ib = amdgpu_ring_generic_pad_ib,
+	.emit_rreg = gfx_v10_0_ring_emit_rreg,
 	.emit_wreg = gfx_v10_0_ring_emit_wreg,
 	.emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 2f03bf533d41..d00b53de0fdc 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -6253,6 +6253,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
 	.init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec,
 	.patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec,
 	.emit_tmz = gfx_v9_0_ring_emit_tmz,
+	.emit_rreg = gfx_v9_0_ring_emit_rreg,
 	.emit_wreg = gfx_v9_0_ring_emit_wreg,
 	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
 	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
@@ -6289,6 +6290,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
 	.insert_nop = amdgpu_ring_insert_nop,
 	.pad_ib = amdgpu_ring_generic_pad_ib,
 	.set_priority = gfx_v9_0_ring_set_priority_compute,
+	.emit_rreg = gfx_v9_0_ring_emit_rreg,
 	.emit_wreg = gfx_v9_0_ring_emit_wreg,
 	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
 	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 3b00bce14cfb..dce6b651da1f 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -346,6 +346,10 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 
 	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
 
+	/* wait for a cycle to reset vm_inv_eng0_ack */
+	if (ring->funcs->vmhub == AMDGPU_GFXHUB_0)
+		amdgpu_ring_emit_rreg(ring, hub->vm_inv_eng0_req + eng);
+
 	/* wait for the invalidate to complete */
 	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
 				  1 << vmid, 1 << vmid);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index 3460c00f3eaa..baaa33467882 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -38,6 +38,7 @@
 #include "navi10_sdma_pkt_open.h"
 #include "nbio_v2_3.h"
 #include "sdma_v5_0.h"
+#include "nvd.h"
 
 MODULE_FIRMWARE("amdgpu/navi10_sdma.bin");
 MODULE_FIRMWARE("amdgpu/navi10_sdma1.bin");
@@ -1147,6 +1148,22 @@ static void sdma_v5_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
 	amdgpu_gmc_emit_flush_gpu_tlb(ring, vmid, pd_addr);
 }
 
+static void sdma_v5_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
+{
+	struct amdgpu_device *adev = ring->adev;
+
+	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
+	amdgpu_ring_write(ring, 0 | /* src: register*/
+				(5 << 8) |  /* dst: memory */
+				(1 << 20)); /* write confirm */
+	amdgpu_ring_write(ring, reg);
+	amdgpu_ring_write(ring, 0);
+	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
+				adev->virt.reg_val_offs * 4));
+	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
+				adev->virt.reg_val_offs * 4));
+}
+
 static void sdma_v5_0_ring_emit_wreg(struct amdgpu_ring *ring,
 				     uint32_t reg, uint32_t val)
 {
@@ -1597,6 +1614,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
 	.test_ib = sdma_v5_0_ring_test_ib,
 	.insert_nop = sdma_v5_0_ring_insert_nop,
 	.pad_ib = sdma_v5_0_ring_pad_ib,
+	.emit_rreg = sdma_v5_0_ring_emit_rreg,
 	.emit_wreg = sdma_v5_0_ring_emit_wreg,
 	.emit_reg_wait = sdma_v5_0_ring_emit_reg_wait,
 	.init_cond_exec = sdma_v5_0_ring_init_cond_exec,
-- 
2.17.1

Could someone give some suggestions about it?

BR,
Changfeng.



-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Huang, Ray
Sent: Friday, October 25, 2019 5:26 PM
To: Tuikov, Luben <Luben.Tuikov@amd.com>
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Pelloux-prayer, Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay

On Thu, Oct 24, 2019 at 09:16:55PM +0000, Tuikov, Luben wrote:
> The GRBM interface is now capable of bursting 1-cycle op per register, 
> a WRITE followed by another WRITE, or a WRITE followed by a READ--much 
> faster than previous multi-cycle per completed-transaction interface.
> This causes a problem, whereby status registers requiring a read/write 
> by hardware, have a 1-cycle delay, due to the register update having 
> to go through GRBM interface.
> 
> This patch adds this delay.
> 
> A one cycle read op is added after updating the invalidate request and 
> before reading the invalidate-ACK status.
> 
> See also commit
> 534991731cb5fa94b5519957646cf849ca10d17d.
> 
> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++-- 
> drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 4 ++-- 
> drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++ 
> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 8 ++++++++ 
> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
>  5 files changed, 22 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index ac43b1af69e3..0042868dbd53 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>  		5 + /* COND_EXEC */
>  		7 + /* PIPELINE_SYNC */
>  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>  		2 + /* VM_FLUSH */
>  		8 + /* FENCE for VM_FLUSH */
>  		20 + /* GDS switch */
> @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>  		5 + /* hdp invalidate */
>  		7 + /* gfx_v10_0_ring_emit_pipeline_sync */
>  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>  		2 + /* gfx_v10_0_ring_emit_vm_flush */
>  		8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
>  	.emit_ib_size =	7, /* gfx_v10_0_ring_emit_ib_compute */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 9fe95e7693d5..9a7a717208de 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>  		5 +  /* COND_EXEC */
>  		7 +  /* PIPELINE_SYNC */
>  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>  		2 + /* VM_FLUSH */
>  		8 +  /* FENCE for VM_FLUSH */
>  		20 + /* GDS switch */
> @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>  		5 + /* hdp invalidate */
>  		7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>  		2 + /* gfx_v9_0_ring_emit_vm_flush */
>  		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
>  	.emit_ib_size =	7, /* gfx_v9_0_ring_emit_ib_compute */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 6e1b25bd1fe7..100d526e9a42 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -346,6 +346,15 @@ static uint64_t
> gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>  
>  	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>  
> +	/* Insert a dummy read to delay one cycle before the ACK
> +	 * inquiry.
> +	 */
> +	if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
> +	    ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
> +	    ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
> +		amdgpu_ring_emit_reg_wait(ring,
> +					  hub->vm_inv_eng0_req + eng, 0, 0);
> +
>  	/* wait for the invalidate to complete */
>  	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>  				  1 << vmid, 1 << vmid);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 9f2a893871ec..8f3097e45299 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>  	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>  			      upper_32_bits(pd_addr));
>  
> +	/* Insert a dummy read to delay one cycle before the ACK
> +	 * inquiry.
> +	 */
> +	if (ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
> +	    ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
> +		amdgpu_ring_emit_reg_wait(ring,
> +					  hub->vm_inv_eng0_req + eng, 0, 0);

The workaround should be to add a dummy read (one cycle delay) after we write VM_INVALIDATE_ENGx_REQ and before we poll the VM_INVALIDATE_ENGx_ACK.
If you add it here, that cannot resolve the issue. I think you should implement the dummy read in the function below: amdgpu_ring_emit_reg_write_reg_wait().

Thanks,
Ray

> +
>  	amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
>  					    hub->vm_inv_eng0_ack + eng,
>  					    req, 1 << vmid);
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> index b8fdb192f6d6..0c41b4fdc58b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>  		6 + /* sdma_v5_0_ring_emit_pipeline_sync */
>  		/* sdma_v5_0_ring_emit_vm_flush */
>  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
>  		10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
>  	.emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
>  	.emit_ib = sdma_v5_0_ring_emit_ib,
> --
> 2.23.0.385.gbc12974a89
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[-- Attachment #2: 0001-drm-amdgpu-add-dummy-read-by-engines-for-some-GCVM-s.patch --]
[-- Type: application/octet-stream, Size: 7324 bytes --]

From 1980d8f1ed44fb9a84a5ea1f6e2edd2bc25c629a Mon Sep 17 00:00:00 2001
From: changzhu <Changfeng.Zhu@amd.com>
Date: Thu, 10 Oct 2019 11:02:33 +0800
Subject: [PATCH] drm/amdgpu: add dummy read by engines for some GCVM status
 registers

The GRBM register interface is now capable of bursting 1 cycle per
register wr->wr, wr->rd much faster than previous multi-cycle per
transaction done interface.  This has caused a problem where
status registers requiring HW to update have a 1 cycle delay, due
to the register update having to go through GRBM.

SW may operate on an incorrect value if they write a register and
immediately check the corresponding status register.

Registers requiring HW to clear or set fields may be delayed by 1 cycle.
For example,

1. write VM_INVALIDATE_ENG0_REQ mask = 5a
2. read VM_INVALIDATE_ENG0_ACK till the ack is same as the request mask = 5a
    	a. HW will reset VM_INVALIDATE_ENG0_ACK = 0 until invalidation is complete
3. write VM_INVALIDATE_ENG0_REQ mask = 5a
4. read VM_INVALIDATE_ENG0_ACK till the ack is same as the request mask = 5a
	a. First read of VM_INVALIDATE_ENG0_ACK = 5a instead of 0
	b. Second read of VM_INVALIDATE_ENG0_ACK = 0 because the remote GRBM h/w
	   register takes one extra cycle to be cleared
	c. In this case, SW will see a false ACK if they exit on first read

Affected registers (only GC variant)  | Recommended Dummy Read
--------------------------------------+----------------------------
VM_INVALIDATE_ENG*_ACK		      |  VM_INVALIDATE_ENG*_REQ
VM_L2_STATUS			      |  VM_L2_STATUS
VM_L2_PROTECTION_FAULT_STATUS	      |  VM_L2_PROTECTION_FAULT_STATUS
VM_L2_PROTECTION_FAULT_ADDR_HI/LO32   |  VM_L2_PROTECTION_FAULT_ADDR_HI/LO32
VM_L2_IH_LOG_BUSY		      |  VM_L2_IH_LOG_BUSY
MC_VM_L2_PERFCOUNTER_HI/LO	      |  MC_VM_L2_PERFCOUNTER_HI/LO
ATC_L2_PERFCOUNTER_HI/LO	      |  ATC_L2_PERFCOUNTER_HI/LO
ATC_L2_PERFCOUNTER2_HI/LO	      |  ATC_L2_PERFCOUNTER2_HI/LO

It also needs dummy read by engines for these gc registers.

Change-Id: Ie028f37eb789966d4593984bd661b248ebeb1ac3
Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c |  5 +++++
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   |  2 ++
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    |  2 ++
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c   |  4 ++++
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c   | 18 ++++++++++++++++++
 5 files changed, 31 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 4b3f58dbf36f..c2fbf6087ecf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -392,6 +392,11 @@ void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
 						uint32_t ref, uint32_t mask)
 {
 	amdgpu_ring_emit_wreg(ring, reg0, ref);
+
+	/* wait for a cycle to reset vm_inv_eng0_ack */
+	if (ring->funcs->vmhub == AMDGPU_GFXHUB_0)
+		amdgpu_ring_emit_rreg(ring, reg0);
+
 	amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask);
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index ef1975a5323a..104c47734316 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -5155,6 +5155,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
 	.patch_cond_exec = gfx_v10_0_ring_emit_patch_cond_exec,
 	.preempt_ib = gfx_v10_0_ring_preempt_ib,
 	.emit_tmz = gfx_v10_0_ring_emit_tmz,
+	.emit_rreg = gfx_v10_0_ring_emit_rreg,
 	.emit_wreg = gfx_v10_0_ring_emit_wreg,
 	.emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
 };
@@ -5188,6 +5189,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
 	.test_ib = gfx_v10_0_ring_test_ib,
 	.insert_nop = amdgpu_ring_insert_nop,
 	.pad_ib = amdgpu_ring_generic_pad_ib,
+	.emit_rreg = gfx_v10_0_ring_emit_rreg,
 	.emit_wreg = gfx_v10_0_ring_emit_wreg,
 	.emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 2f03bf533d41..d00b53de0fdc 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -6253,6 +6253,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
 	.init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec,
 	.patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec,
 	.emit_tmz = gfx_v9_0_ring_emit_tmz,
+	.emit_rreg = gfx_v9_0_ring_emit_rreg,
 	.emit_wreg = gfx_v9_0_ring_emit_wreg,
 	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
 	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
@@ -6289,6 +6290,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
 	.insert_nop = amdgpu_ring_insert_nop,
 	.pad_ib = amdgpu_ring_generic_pad_ib,
 	.set_priority = gfx_v9_0_ring_set_priority_compute,
+	.emit_rreg = gfx_v9_0_ring_emit_rreg,
 	.emit_wreg = gfx_v9_0_ring_emit_wreg,
 	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
 	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 3b00bce14cfb..dce6b651da1f 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -346,6 +346,10 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 
 	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
 
+	/* wait for a cycle to reset vm_inv_eng0_ack */
+	if (ring->funcs->vmhub == AMDGPU_GFXHUB_0)
+		amdgpu_ring_emit_rreg(ring, hub->vm_inv_eng0_req + eng);
+
 	/* wait for the invalidate to complete */
 	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
 				  1 << vmid, 1 << vmid);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index 3460c00f3eaa..baaa33467882 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -38,6 +38,7 @@
 #include "navi10_sdma_pkt_open.h"
 #include "nbio_v2_3.h"
 #include "sdma_v5_0.h"
+#include "nvd.h"
 
 MODULE_FIRMWARE("amdgpu/navi10_sdma.bin");
 MODULE_FIRMWARE("amdgpu/navi10_sdma1.bin");
@@ -1147,6 +1148,22 @@ static void sdma_v5_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
 	amdgpu_gmc_emit_flush_gpu_tlb(ring, vmid, pd_addr);
 }
 
+static void sdma_v5_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
+{
+	struct amdgpu_device *adev = ring->adev;
+
+	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
+	amdgpu_ring_write(ring, 0 | /* src: register*/
+				(5 << 8) |  /* dst: memory */
+				(1 << 20)); /* write confirm */
+	amdgpu_ring_write(ring, reg);
+	amdgpu_ring_write(ring, 0);
+	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
+				adev->virt.reg_val_offs * 4));
+	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
+				adev->virt.reg_val_offs * 4));
+}
+
 static void sdma_v5_0_ring_emit_wreg(struct amdgpu_ring *ring,
 				     uint32_t reg, uint32_t val)
 {
@@ -1597,6 +1614,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
 	.test_ib = sdma_v5_0_ring_test_ib,
 	.insert_nop = sdma_v5_0_ring_insert_nop,
 	.pad_ib = sdma_v5_0_ring_pad_ib,
+	.emit_rreg = sdma_v5_0_ring_emit_rreg,
 	.emit_wreg = sdma_v5_0_ring_emit_wreg,
 	.emit_reg_wait = sdma_v5_0_ring_emit_reg_wait,
 	.init_cond_exec = sdma_v5_0_ring_init_cond_exec,
-- 
2.17.1


[-- Attachment #3: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* RE: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-25 14:22     ` Zhu, Changfeng
  0 siblings, 0 replies; 30+ messages in thread
From: Zhu, Changfeng @ 2019-10-25 14:22 UTC (permalink / raw)
  To: amd-gfx
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Huang, Ray,
	Koenig, Christian, Tuikov, Luben

[-- Attachment #1: Type: text/plain, Size: 14598 bytes --]

I try to write a patch based on the patch of Tuikov,Luben.

Inspired by Luben, here is the patch:

From 1980d8f1ed44fb9a84a5ea1f6e2edd2bc25c629a Mon Sep 17 00:00:00 2001
From: changzhu <Changfeng.Zhu@amd.com>
Date: Thu, 10 Oct 2019 11:02:33 +0800
Subject: [PATCH] drm/amdgpu: add dummy read by engines for some GCVM status
 registers

The GRBM register interface is now capable of bursting 1 cycle per
register wr->wr, wr->rd, much faster than the previous multicycle per
transaction done interface.  This has caused a problem where
status registers requiring HW to update have a 1 cycle delay, due
to the register update having to go through GRBM.

SW may operate on an incorrect value if they write a register and
immediately check the corresponding status register.

Registers requiring HW to clear or set fields may be delayed by 1 cycle.
For example,

1. write VM_INVALIDATE_ENG0_REQ mask = 5a
2. read VM_INVALIDATE_ENG0_ACK till the ack is same as the request mask = 5a
    	a. HW will reset VM_INVALIDATE_ENG0_ACK = 0 until invalidation is complete
3. write VM_INVALIDATE_ENG0_REQ mask = 5a
4. read VM_INVALIDATE_ENG0_ACK till the ack is same as the request mask = 5a
	a. First read of VM_INVALIDATE_ENG0_ACK = 5a instead of 0
	b. Second read of VM_INVALIDATE_ENG0_ACK = 0 because the remote GRBM h/w
	   register takes one extra cycle to be cleared
	c. In this case, SW will see a false ACK if they exit on first read

Affected registers (only GC variant)  | Recommended Dummy Read
--------------------------------------+----------------------------
VM_INVALIDATE_ENG*_ACK		      |  VM_INVALIDATE_ENG*_REQ
VM_L2_STATUS			      |  VM_L2_STATUS
VM_L2_PROTECTION_FAULT_STATUS	      |  VM_L2_PROTECTION_FAULT_STATUS
VM_L2_PROTECTION_FAULT_ADDR_HI/LO32   |  VM_L2_PROTECTION_FAULT_ADDR_HI/LO32
VM_L2_IH_LOG_BUSY		      |  VM_L2_IH_LOG_BUSY
MC_VM_L2_PERFCOUNTER_HI/LO	      |  MC_VM_L2_PERFCOUNTER_HI/LO
ATC_L2_PERFCOUNTER_HI/LO	      |  ATC_L2_PERFCOUNTER_HI/LO
ATC_L2_PERFCOUNTER2_HI/LO	      |  ATC_L2_PERFCOUNTER2_HI/LO

Engines also need a dummy read for these GC registers.

Change-Id: Ie028f37eb789966d4593984bd661b248ebeb1ac3
Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c |  5 +++++
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   |  2 ++
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    |  2 ++
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c   |  4 ++++
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c   | 18 ++++++++++++++++++
 5 files changed, 31 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 4b3f58dbf36f..c2fbf6087ecf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -392,6 +392,11 @@ void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
 						uint32_t ref, uint32_t mask)
 {
 	amdgpu_ring_emit_wreg(ring, reg0, ref);
+
+	/* wait for a cycle to reset vm_inv_eng0_ack */
+	if (ring->funcs->vmhub == AMDGPU_GFXHUB_0)
+		amdgpu_ring_emit_rreg(ring, reg0);
+
 	amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask);
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index ef1975a5323a..104c47734316 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -5155,6 +5155,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
 	.patch_cond_exec = gfx_v10_0_ring_emit_patch_cond_exec,
 	.preempt_ib = gfx_v10_0_ring_preempt_ib,
 	.emit_tmz = gfx_v10_0_ring_emit_tmz,
+	.emit_rreg = gfx_v10_0_ring_emit_rreg,
 	.emit_wreg = gfx_v10_0_ring_emit_wreg,
 	.emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
 };
@@ -5188,6 +5189,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
 	.test_ib = gfx_v10_0_ring_test_ib,
 	.insert_nop = amdgpu_ring_insert_nop,
 	.pad_ib = amdgpu_ring_generic_pad_ib,
+	.emit_rreg = gfx_v10_0_ring_emit_rreg,
 	.emit_wreg = gfx_v10_0_ring_emit_wreg,
 	.emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 2f03bf533d41..d00b53de0fdc 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -6253,6 +6253,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
 	.init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec,
 	.patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec,
 	.emit_tmz = gfx_v9_0_ring_emit_tmz,
+	.emit_rreg = gfx_v9_0_ring_emit_rreg,
 	.emit_wreg = gfx_v9_0_ring_emit_wreg,
 	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
 	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
@@ -6289,6 +6290,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
 	.insert_nop = amdgpu_ring_insert_nop,
 	.pad_ib = amdgpu_ring_generic_pad_ib,
 	.set_priority = gfx_v9_0_ring_set_priority_compute,
+	.emit_rreg = gfx_v9_0_ring_emit_rreg,
 	.emit_wreg = gfx_v9_0_ring_emit_wreg,
 	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
 	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 3b00bce14cfb..dce6b651da1f 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -346,6 +346,10 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 
 	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
 
+	/* wait for a cycle to reset vm_inv_eng0_ack */
+	if (ring->funcs->vmhub == AMDGPU_GFXHUB_0)
+		amdgpu_ring_emit_rreg(ring, hub->vm_inv_eng0_req + eng);
+
 	/* wait for the invalidate to complete */
 	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
 				  1 << vmid, 1 << vmid);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index 3460c00f3eaa..baaa33467882 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -38,6 +38,7 @@
 #include "navi10_sdma_pkt_open.h"
 #include "nbio_v2_3.h"
 #include "sdma_v5_0.h"
+#include "nvd.h"
 
 MODULE_FIRMWARE("amdgpu/navi10_sdma.bin");
 MODULE_FIRMWARE("amdgpu/navi10_sdma1.bin");
@@ -1147,6 +1148,22 @@ static void sdma_v5_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
 	amdgpu_gmc_emit_flush_gpu_tlb(ring, vmid, pd_addr);
 }
 
+static void sdma_v5_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
+{
+	struct amdgpu_device *adev = ring->adev;
+
+	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
+	amdgpu_ring_write(ring, 0 | /* src: register*/
+				(5 << 8) |  /* dst: memory */
+				(1 << 20)); /* write confirm */
+	amdgpu_ring_write(ring, reg);
+	amdgpu_ring_write(ring, 0);
+	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
+				adev->virt.reg_val_offs * 4));
+	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
+				adev->virt.reg_val_offs * 4));
+}
+
 static void sdma_v5_0_ring_emit_wreg(struct amdgpu_ring *ring,
 				     uint32_t reg, uint32_t val)
 {
@@ -1597,6 +1614,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
 	.test_ib = sdma_v5_0_ring_test_ib,
 	.insert_nop = sdma_v5_0_ring_insert_nop,
 	.pad_ib = sdma_v5_0_ring_pad_ib,
+	.emit_rreg = sdma_v5_0_ring_emit_rreg,
 	.emit_wreg = sdma_v5_0_ring_emit_wreg,
 	.emit_reg_wait = sdma_v5_0_ring_emit_reg_wait,
 	.init_cond_exec = sdma_v5_0_ring_init_cond_exec,
-- 
2.17.1

Could someone give some suggestions about it?

BR,
Changfeng.



-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Huang, Ray
Sent: Friday, October 25, 2019 5:26 PM
To: Tuikov, Luben <Luben.Tuikov@amd.com>
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Pelloux-prayer, Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay

On Thu, Oct 24, 2019 at 09:16:55PM +0000, Tuikov, Luben wrote:
> The GRBM interface is now capable of bursting 1-cycle op per register, 
> a WRITE followed by another WRITE, or a WRITE followed by a READ--much 
> faster than previous muti-cycle per completed-transaction interface.
> This causes a problem, whereby status registers requiring a read/write 
> by hardware, have a 1-cycle delay, due to the register update having 
> to go through GRBM interface.
> 
> This patch adds this delay.
> 
> A one cycle read op is added after updating the invalidate request and 
> before reading the invalidate-ACK status.
> 
> See also commit
> 534991731cb5fa94b5519957646cf849ca10d17d.
> 
> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++-- 
> drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 4 ++-- 
> drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++ 
> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 8 ++++++++ 
> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
>  5 files changed, 22 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index ac43b1af69e3..0042868dbd53 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>  		5 + /* COND_EXEC */
>  		7 + /* PIPELINE_SYNC */
>  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>  		2 + /* VM_FLUSH */
>  		8 + /* FENCE for VM_FLUSH */
>  		20 + /* GDS switch */
> @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>  		5 + /* hdp invalidate */
>  		7 + /* gfx_v10_0_ring_emit_pipeline_sync */
>  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>  		2 + /* gfx_v10_0_ring_emit_vm_flush */
>  		8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
>  	.emit_ib_size =	7, /* gfx_v10_0_ring_emit_ib_compute */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 9fe95e7693d5..9a7a717208de 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>  		5 +  /* COND_EXEC */
>  		7 +  /* PIPELINE_SYNC */
>  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>  		2 + /* VM_FLUSH */
>  		8 +  /* FENCE for VM_FLUSH */
>  		20 + /* GDS switch */
> @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>  		5 + /* hdp invalidate */
>  		7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>  		2 + /* gfx_v9_0_ring_emit_vm_flush */
>  		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
>  	.emit_ib_size =	7, /* gfx_v9_0_ring_emit_ib_compute */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 6e1b25bd1fe7..100d526e9a42 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -346,6 +346,15 @@ static uint64_t
> gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>  
>  	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>  
> +	/* Insert a dummy read to delay one cycle before the ACK
> +	 * inquiry.
> +	 */
> +	if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
> +	    ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
> +	    ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
> +		amdgpu_ring_emit_reg_wait(ring,
> +					  hub->vm_inv_eng0_req + eng, 0, 0);
> +
>  	/* wait for the invalidate to complete */
>  	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>  				  1 << vmid, 1 << vmid);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 9f2a893871ec..8f3097e45299 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>  	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>  			      upper_32_bits(pd_addr));
>  
> +	/* Insert a dummy read to delay one cycle before the ACK
> +	 * inquiry.
> +	 */
> +	if (ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
> +	    ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
> +		amdgpu_ring_emit_reg_wait(ring,
> +					  hub->vm_inv_eng0_req + eng, 0, 0);

The workaround should be add a dummy read (one cycle delay) after we write VM_INVALIDATE_ENGx_REQ and before we poll the VM_INVALIDATE_ENGx_ACK.
If you add it here, that cannot resolve the issue. I think you should implement the dummy read in below function: amdgpu_ring_emit_reg_write_reg_wait().

Thanks,
Ray

> +
>  	amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
>  					    hub->vm_inv_eng0_ack + eng,
>  					    req, 1 << vmid);
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> index b8fdb192f6d6..0c41b4fdc58b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>  		6 + /* sdma_v5_0_ring_emit_pipeline_sync */
>  		/* sdma_v5_0_ring_emit_vm_flush */
>  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
>  		10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
>  	.emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
>  	.emit_ib = sdma_v5_0_ring_emit_ib,
> --
> 2.23.0.385.gbc12974a89
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[-- Attachment #2: 0001-drm-amdgpu-add-dummy-read-by-engines-for-some-GCVM-s.patch --]
[-- Type: application/octet-stream, Size: 7324 bytes --]

From 1980d8f1ed44fb9a84a5ea1f6e2edd2bc25c629a Mon Sep 17 00:00:00 2001
From: changzhu <Changfeng.Zhu@amd.com>
Date: Thu, 10 Oct 2019 11:02:33 +0800
Subject: [PATCH] drm/amdgpu: add dummy read by engines for some GCVM status
 registers

The GRBM register interface is now capable of bursting 1 cycle per
register wr->wr, wr->rd, much faster than the previous multicycle per
transaction done interface.  This has caused a problem where
status registers requiring HW to update have a 1 cycle delay, due
to the register update having to go through GRBM.

SW may operate on an incorrect value if they write a register and
immediately check the corresponding status register.

Registers requiring HW to clear or set fields may be delayed by 1 cycle.
For example,

1. write VM_INVALIDATE_ENG0_REQ mask = 5a
2. read VM_INVALIDATE_ENG0_ACK till the ack is same as the request mask = 5a
    	a. HW will reset VM_INVALIDATE_ENG0_ACK = 0 until invalidation is complete
3. write VM_INVALIDATE_ENG0_REQ mask = 5a
4. read VM_INVALIDATE_ENG0_ACK till the ack is same as the request mask = 5a
	a. First read of VM_INVALIDATE_ENG0_ACK = 5a instead of 0
	b. Second read of VM_INVALIDATE_ENG0_ACK = 0 because the remote GRBM h/w
	   register takes one extra cycle to be cleared
	c. In this case, SW will see a false ACK if they exit on first read

Affected registers (only GC variant)  | Recommended Dummy Read
--------------------------------------+----------------------------
VM_INVALIDATE_ENG*_ACK		      |  VM_INVALIDATE_ENG*_REQ
VM_L2_STATUS			      |  VM_L2_STATUS
VM_L2_PROTECTION_FAULT_STATUS	      |  VM_L2_PROTECTION_FAULT_STATUS
VM_L2_PROTECTION_FAULT_ADDR_HI/LO32   |  VM_L2_PROTECTION_FAULT_ADDR_HI/LO32
VM_L2_IH_LOG_BUSY		      |  VM_L2_IH_LOG_BUSY
MC_VM_L2_PERFCOUNTER_HI/LO	      |  MC_VM_L2_PERFCOUNTER_HI/LO
ATC_L2_PERFCOUNTER_HI/LO	      |  ATC_L2_PERFCOUNTER_HI/LO
ATC_L2_PERFCOUNTER2_HI/LO	      |  ATC_L2_PERFCOUNTER2_HI/LO

Engines also need a dummy read for these GC registers.

Change-Id: Ie028f37eb789966d4593984bd661b248ebeb1ac3
Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c |  5 +++++
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   |  2 ++
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    |  2 ++
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c   |  4 ++++
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c   | 18 ++++++++++++++++++
 5 files changed, 31 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 4b3f58dbf36f..c2fbf6087ecf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -392,6 +392,11 @@ void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
 						uint32_t ref, uint32_t mask)
 {
 	amdgpu_ring_emit_wreg(ring, reg0, ref);
+
+	/* wait for a cycle to reset vm_inv_eng0_ack */
+	if (ring->funcs->vmhub == AMDGPU_GFXHUB_0)
+		amdgpu_ring_emit_rreg(ring, reg0);
+
 	amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask);
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index ef1975a5323a..104c47734316 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -5155,6 +5155,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
 	.patch_cond_exec = gfx_v10_0_ring_emit_patch_cond_exec,
 	.preempt_ib = gfx_v10_0_ring_preempt_ib,
 	.emit_tmz = gfx_v10_0_ring_emit_tmz,
+	.emit_rreg = gfx_v10_0_ring_emit_rreg,
 	.emit_wreg = gfx_v10_0_ring_emit_wreg,
 	.emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
 };
@@ -5188,6 +5189,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
 	.test_ib = gfx_v10_0_ring_test_ib,
 	.insert_nop = amdgpu_ring_insert_nop,
 	.pad_ib = amdgpu_ring_generic_pad_ib,
+	.emit_rreg = gfx_v10_0_ring_emit_rreg,
 	.emit_wreg = gfx_v10_0_ring_emit_wreg,
 	.emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 2f03bf533d41..d00b53de0fdc 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -6253,6 +6253,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
 	.init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec,
 	.patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec,
 	.emit_tmz = gfx_v9_0_ring_emit_tmz,
+	.emit_rreg = gfx_v9_0_ring_emit_rreg,
 	.emit_wreg = gfx_v9_0_ring_emit_wreg,
 	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
 	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
@@ -6289,6 +6290,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
 	.insert_nop = amdgpu_ring_insert_nop,
 	.pad_ib = amdgpu_ring_generic_pad_ib,
 	.set_priority = gfx_v9_0_ring_set_priority_compute,
+	.emit_rreg = gfx_v9_0_ring_emit_rreg,
 	.emit_wreg = gfx_v9_0_ring_emit_wreg,
 	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
 	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 3b00bce14cfb..dce6b651da1f 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -346,6 +346,10 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 
 	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
 
+	/* wait for a cycle to reset vm_inv_eng0_ack */
+	if (ring->funcs->vmhub == AMDGPU_GFXHUB_0)
+		amdgpu_ring_emit_rreg(ring, hub->vm_inv_eng0_req + eng);
+
 	/* wait for the invalidate to complete */
 	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
 				  1 << vmid, 1 << vmid);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index 3460c00f3eaa..baaa33467882 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -38,6 +38,7 @@
 #include "navi10_sdma_pkt_open.h"
 #include "nbio_v2_3.h"
 #include "sdma_v5_0.h"
+#include "nvd.h"
 
 MODULE_FIRMWARE("amdgpu/navi10_sdma.bin");
 MODULE_FIRMWARE("amdgpu/navi10_sdma1.bin");
@@ -1147,6 +1148,22 @@ static void sdma_v5_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
 	amdgpu_gmc_emit_flush_gpu_tlb(ring, vmid, pd_addr);
 }
 
+static void sdma_v5_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
+{
+	struct amdgpu_device *adev = ring->adev;
+
+	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
+	amdgpu_ring_write(ring, 0 | /* src: register*/
+				(5 << 8) |  /* dst: memory */
+				(1 << 20)); /* write confirm */
+	amdgpu_ring_write(ring, reg);
+	amdgpu_ring_write(ring, 0);
+	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
+				adev->virt.reg_val_offs * 4));
+	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
+				adev->virt.reg_val_offs * 4));
+}
+
 static void sdma_v5_0_ring_emit_wreg(struct amdgpu_ring *ring,
 				     uint32_t reg, uint32_t val)
 {
@@ -1597,6 +1614,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
 	.test_ib = sdma_v5_0_ring_test_ib,
 	.insert_nop = sdma_v5_0_ring_insert_nop,
 	.pad_ib = sdma_v5_0_ring_pad_ib,
+	.emit_rreg = sdma_v5_0_ring_emit_rreg,
 	.emit_wreg = sdma_v5_0_ring_emit_wreg,
 	.emit_reg_wait = sdma_v5_0_ring_emit_reg_wait,
 	.init_cond_exec = sdma_v5_0_ring_init_cond_exec,
-- 
2.17.1


[-- Attachment #3: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-25 15:53         ` Koenig, Christian
  0 siblings, 0 replies; 30+ messages in thread
From: Koenig, Christian @ 2019-10-25 15:53 UTC (permalink / raw)
  To: Zhu, Changfeng, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Huang, Ray,
	Tuikov, Luben

Hi Changfeng,

that won't work: you can't add this to 
amdgpu_ring_emit_reg_write_reg_wait_helper without breaking all read-triggered 
registers (like the semaphore ones).

Additional to that it will never work on GFX9, since the CP firmware 
there uses the integrated write/wait command and you can't add an 
additional dummy read there.

Regards,
Christian.

Am 25.10.19 um 16:22 schrieb Zhu, Changfeng:
> I try to write a patch based on the patch of Tuikov,Luben.
>
> Inspired by Luben, here is the patch:
>
>  From 1980d8f1ed44fb9a84a5ea1f6e2edd2bc25c629a Mon Sep 17 00:00:00 2001
> From: changzhu <Changfeng.Zhu@amd.com>
> Date: Thu, 10 Oct 2019 11:02:33 +0800
> Subject: [PATCH] drm/amdgpu: add dummy read by engines for some GCVM status
>   registers
>
> The GRBM register interface is now capable of bursting 1 cycle per
> register wr->wr, wr->rd, much faster than the previous multicycle per
> transaction done interface.  This has caused a problem where
> status registers requiring HW to update have a 1 cycle delay, due
> to the register update having to go through GRBM.
>
> SW may operate on an incorrect value if they write a register and
> immediately check the corresponding status register.
>
> Registers requiring HW to clear or set fields may be delayed by 1 cycle.
> For example,
>
> 1. write VM_INVALIDATE_ENG0_REQ mask = 5a
> 2. read VM_INVALIDATE_ENG0_ACK till the ack is same as the request mask = 5a
>      	a. HW will reset VM_INVALIDATE_ENG0_ACK = 0 until invalidation is complete
> 3. write VM_INVALIDATE_ENG0_REQ mask = 5a
> 4. read VM_INVALIDATE_ENG0_ACK till the ack is same as the request mask = 5a
> 	a. First read of VM_INVALIDATE_ENG0_ACK = 5a instead of 0
> 	b. Second read of VM_INVALIDATE_ENG0_ACK = 0 because the remote GRBM h/w
> 	   register takes one extra cycle to be cleared
> 	c. In this case, SW will see a false ACK if they exit on first read
>
> Affected registers (only GC variant)  | Recommended Dummy Read
> --------------------------------------+----------------------------
> VM_INVALIDATE_ENG*_ACK		      |  VM_INVALIDATE_ENG*_REQ
> VM_L2_STATUS			      |  VM_L2_STATUS
> VM_L2_PROTECTION_FAULT_STATUS	      |  VM_L2_PROTECTION_FAULT_STATUS
> VM_L2_PROTECTION_FAULT_ADDR_HI/LO32   |  VM_L2_PROTECTION_FAULT_ADDR_HI/LO32
> VM_L2_IH_LOG_BUSY		      |  VM_L2_IH_LOG_BUSY
> MC_VM_L2_PERFCOUNTER_HI/LO	      |  MC_VM_L2_PERFCOUNTER_HI/LO
> ATC_L2_PERFCOUNTER_HI/LO	      |  ATC_L2_PERFCOUNTER_HI/LO
> ATC_L2_PERFCOUNTER2_HI/LO	      |  ATC_L2_PERFCOUNTER2_HI/LO
>
> Engines also need a dummy read for these GC registers.
>
> Change-Id: Ie028f37eb789966d4593984bd661b248ebeb1ac3
> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c |  5 +++++
>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   |  2 ++
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    |  2 ++
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c   |  4 ++++
>   drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c   | 18 ++++++++++++++++++
>   5 files changed, 31 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> index 4b3f58dbf36f..c2fbf6087ecf 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> @@ -392,6 +392,11 @@ void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
>   						uint32_t ref, uint32_t mask)
>   {
>   	amdgpu_ring_emit_wreg(ring, reg0, ref);
> +
> +	/* wait for a cycle to reset vm_inv_eng0_ack */
> +	if (ring->funcs->vmhub == AMDGPU_GFXHUB_0)
> +		amdgpu_ring_emit_rreg(ring, reg0);
> +
>   	amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask);
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index ef1975a5323a..104c47734316 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -5155,6 +5155,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>   	.patch_cond_exec = gfx_v10_0_ring_emit_patch_cond_exec,
>   	.preempt_ib = gfx_v10_0_ring_preempt_ib,
>   	.emit_tmz = gfx_v10_0_ring_emit_tmz,
> +	.emit_rreg = gfx_v10_0_ring_emit_rreg,
>   	.emit_wreg = gfx_v10_0_ring_emit_wreg,
>   	.emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
>   };
> @@ -5188,6 +5189,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>   	.test_ib = gfx_v10_0_ring_test_ib,
>   	.insert_nop = amdgpu_ring_insert_nop,
>   	.pad_ib = amdgpu_ring_generic_pad_ib,
> +	.emit_rreg = gfx_v10_0_ring_emit_rreg,
>   	.emit_wreg = gfx_v10_0_ring_emit_wreg,
>   	.emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 2f03bf533d41..d00b53de0fdc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -6253,6 +6253,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>   	.init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec,
>   	.patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec,
>   	.emit_tmz = gfx_v9_0_ring_emit_tmz,
> +	.emit_rreg = gfx_v9_0_ring_emit_rreg,
>   	.emit_wreg = gfx_v9_0_ring_emit_wreg,
>   	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>   	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
> @@ -6289,6 +6290,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>   	.insert_nop = amdgpu_ring_insert_nop,
>   	.pad_ib = amdgpu_ring_generic_pad_ib,
>   	.set_priority = gfx_v9_0_ring_set_priority_compute,
> +	.emit_rreg = gfx_v9_0_ring_emit_rreg,
>   	.emit_wreg = gfx_v9_0_ring_emit_wreg,
>   	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>   	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 3b00bce14cfb..dce6b651da1f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -346,6 +346,10 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>   
>   	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>   
> +	/* wait for a cycle to reset vm_inv_eng0_ack */
> +	if (ring->funcs->vmhub == AMDGPU_GFXHUB_0)
> +		amdgpu_ring_emit_rreg(ring, hub->vm_inv_eng0_req + eng);
> +
>   	/* wait for the invalidate to complete */
>   	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>   				  1 << vmid, 1 << vmid);
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> index 3460c00f3eaa..baaa33467882 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> @@ -38,6 +38,7 @@
>   #include "navi10_sdma_pkt_open.h"
>   #include "nbio_v2_3.h"
>   #include "sdma_v5_0.h"
> +#include "nvd.h"
>   
>   MODULE_FIRMWARE("amdgpu/navi10_sdma.bin");
>   MODULE_FIRMWARE("amdgpu/navi10_sdma1.bin");
> @@ -1147,6 +1148,22 @@ static void sdma_v5_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
>   	amdgpu_gmc_emit_flush_gpu_tlb(ring, vmid, pd_addr);
>   }
>   
> +static void sdma_v5_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
> +{
> +	struct amdgpu_device *adev = ring->adev;
> +
> +	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
> +	amdgpu_ring_write(ring, 0 | /* src: register*/
> +				(5 << 8) |  /* dst: memory */
> +				(1 << 20)); /* write confirm */
> +	amdgpu_ring_write(ring, reg);
> +	amdgpu_ring_write(ring, 0);
> +	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
> +				adev->virt.reg_val_offs * 4));
> +	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
> +				adev->virt.reg_val_offs * 4));
> +}
> +
>   static void sdma_v5_0_ring_emit_wreg(struct amdgpu_ring *ring,
>   				     uint32_t reg, uint32_t val)
>   {
> @@ -1597,6 +1614,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>   	.test_ib = sdma_v5_0_ring_test_ib,
>   	.insert_nop = sdma_v5_0_ring_insert_nop,
>   	.pad_ib = sdma_v5_0_ring_pad_ib,
> +	.emit_rreg = sdma_v5_0_ring_emit_rreg,
>   	.emit_wreg = sdma_v5_0_ring_emit_wreg,
>   	.emit_reg_wait = sdma_v5_0_ring_emit_reg_wait,
>   	.init_cond_exec = sdma_v5_0_ring_init_cond_exec,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-25 15:53         ` Koenig, Christian
  0 siblings, 0 replies; 30+ messages in thread
From: Koenig, Christian @ 2019-10-25 15:53 UTC (permalink / raw)
  To: Zhu, Changfeng, amd-gfx
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Huang, Ray,
	Tuikov, Luben

Hi Changfeng,

that won't work, you can't add this to 
amdgpu_ring_emit_reg_write_reg_wait_helper or break all read triggered 
registers (like the semaphore ones).

Additional to that it will never work on GFX9, since the CP firmware 
there uses the integrated write/wait command and you can't add an 
additional dummy read there.

Regards,
Christian.

Am 25.10.19 um 16:22 schrieb Zhu, Changfeng:
> I try to write a patch based on the patch of Tuikov,Luben.
>
> Inspired by Luben,here is the patch:
>
>  From 1980d8f1ed44fb9a84a5ea1f6e2edd2bc25c629a Mon Sep 17 00:00:00 2001
> From: changzhu <Changfeng.Zhu@amd.com>
> Date: Thu, 10 Oct 2019 11:02:33 +0800
> Subject: [PATCH] drm/amdgpu: add dummy read by engines for some GCVM status
>   registers
>
> The GRBM register interface is now capable of bursting 1 cycle per
> register wr->wr, wr->rd much faster than previous muticycle per
> transaction done interface.  This has caused a problem where
> status registers requiring HW to update have a 1 cycle delay, due
> to the register update having to go through GRBM.
>
> SW may operate on an incorrect value if they write a register and
> immediately check the corresponding status register.
>
> Registers requiring HW to clear or set fields may be delayed by 1 cycle.
> For example,
>
> 1. write VM_INVALIDATE_ENG0_REQ mask = 5a
> 2. read VM_INVALIDATE_ENG0_ACKb till the ack is same as the request mask = 5a
>      	a. HW will reset VM_INVALIDATE_ENG0_ACK = 0 until invalidation is complete
> 3. write VM_INVALIDATE_ENG0_REQ mask = 5a
> 4. read VM_INVALIDATE_ENG0_ACK till the ack is same as the request mask = 5a
> 	a. First read of VM_INVALIDATE_ENG0_ACK = 5a instead of 0
> 	b. Second read of VM_INVALIDATE_ENG0_ACK = 0 because the remote GRBM h/w
> 	   register takes one extra cycle to be cleared
> 	c. In this case,SW wil see a false ACK if they exit on first read
>
> Affected registers (only GC variant)  | Recommended Dummy Read
> --------------------------------------+----------------------------
> VM_INVALIDATE_ENG*_ACK		      |  VM_INVALIDATE_ENG*_REQ
> VM_L2_STATUS			      |  VM_L2_STATUS
> VM_L2_PROTECTION_FAULT_STATUS	      |  VM_L2_PROTECTION_FAULT_STATUS
> VM_L2_PROTECTION_FAULT_ADDR_HI/LO32   |  VM_L2_PROTECTION_FAULT_ADDR_HI/LO32
> VM_L2_IH_LOG_BUSY		      |  VM_L2_IH_LOG_BUSY
> MC_VM_L2_PERFCOUNTER_HI/LO	      |  MC_VM_L2_PERFCOUNTER_HI/LO
> ATC_L2_PERFCOUNTER_HI/LO	      |  ATC_L2_PERFCOUNTER_HI/LO
> ATC_L2_PERFCOUNTER2_HI/LO	      |  ATC_L2_PERFCOUNTER2_HI/LO
>
> It also needs dummy read by engines for these gc registers.
>
> Change-Id: Ie028f37eb789966d4593984bd661b248ebeb1ac3
> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c |  5 +++++
>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   |  2 ++
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    |  2 ++
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c   |  4 ++++
>   drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c   | 18 ++++++++++++++++++
>   5 files changed, 31 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> index 4b3f58dbf36f..c2fbf6087ecf 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> @@ -392,6 +392,11 @@ void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
>   						uint32_t ref, uint32_t mask)
>   {
>   	amdgpu_ring_emit_wreg(ring, reg0, ref);
> +
> +	/* wait for a cycle to reset vm_inv_eng0_ack */
> +	if (ring->funcs->vmhub == AMDGPU_GFXHUB_0)
> +		amdgpu_ring_emit_rreg(ring, reg0);
> +
>   	amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask);
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index ef1975a5323a..104c47734316 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -5155,6 +5155,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>   	.patch_cond_exec = gfx_v10_0_ring_emit_patch_cond_exec,
>   	.preempt_ib = gfx_v10_0_ring_preempt_ib,
>   	.emit_tmz = gfx_v10_0_ring_emit_tmz,
> +	.emit_rreg = gfx_v10_0_ring_emit_rreg,
>   	.emit_wreg = gfx_v10_0_ring_emit_wreg,
>   	.emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
>   };
> @@ -5188,6 +5189,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>   	.test_ib = gfx_v10_0_ring_test_ib,
>   	.insert_nop = amdgpu_ring_insert_nop,
>   	.pad_ib = amdgpu_ring_generic_pad_ib,
> +	.emit_rreg = gfx_v10_0_ring_emit_rreg,
>   	.emit_wreg = gfx_v10_0_ring_emit_wreg,
>   	.emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 2f03bf533d41..d00b53de0fdc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -6253,6 +6253,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>   	.init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec,
>   	.patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec,
>   	.emit_tmz = gfx_v9_0_ring_emit_tmz,
> +	.emit_rreg = gfx_v9_0_ring_emit_rreg,
>   	.emit_wreg = gfx_v9_0_ring_emit_wreg,
>   	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>   	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
> @@ -6289,6 +6290,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>   	.insert_nop = amdgpu_ring_insert_nop,
>   	.pad_ib = amdgpu_ring_generic_pad_ib,
>   	.set_priority = gfx_v9_0_ring_set_priority_compute,
> +	.emit_rreg = gfx_v9_0_ring_emit_rreg,
>   	.emit_wreg = gfx_v9_0_ring_emit_wreg,
>   	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>   	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 3b00bce14cfb..dce6b651da1f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -346,6 +346,10 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>   
>   	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>   
> +	/* wait for a cycle to reset vm_inv_eng0_ack */
> +	if (ring->funcs->vmhub == AMDGPU_GFXHUB_0)
> +		amdgpu_ring_emit_rreg(ring, hub->vm_inv_eng0_req + eng);
> +
>   	/* wait for the invalidate to complete */
>   	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>   				  1 << vmid, 1 << vmid);
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> index 3460c00f3eaa..baaa33467882 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> @@ -38,6 +38,7 @@
>   #include "navi10_sdma_pkt_open.h"
>   #include "nbio_v2_3.h"
>   #include "sdma_v5_0.h"
> +#include "nvd.h"
>   
>   MODULE_FIRMWARE("amdgpu/navi10_sdma.bin");
>   MODULE_FIRMWARE("amdgpu/navi10_sdma1.bin");
> @@ -1147,6 +1148,22 @@ static void sdma_v5_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
>   	amdgpu_gmc_emit_flush_gpu_tlb(ring, vmid, pd_addr);
>   }
>   
> +static void sdma_v5_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
> +{
> +	struct amdgpu_device *adev = ring->adev;
> +
> +	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
> +	amdgpu_ring_write(ring, 0 | /* src: register*/
> +				(5 << 8) |  /* dst: memory */
> +				(1 << 20)); /* write confirm */
> +	amdgpu_ring_write(ring, reg);
> +	amdgpu_ring_write(ring, 0);
> +	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
> +				adev->virt.reg_val_offs * 4));
> +	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
> +				adev->virt.reg_val_offs * 4));
> +}
> +
>   static void sdma_v5_0_ring_emit_wreg(struct amdgpu_ring *ring,
>   				     uint32_t reg, uint32_t val)
>   {
> @@ -1597,6 +1614,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>   	.test_ib = sdma_v5_0_ring_test_ib,
>   	.insert_nop = sdma_v5_0_ring_insert_nop,
>   	.pad_ib = sdma_v5_0_ring_pad_ib,
> +	.emit_rreg = sdma_v5_0_ring_emit_rreg,
>   	.emit_wreg = sdma_v5_0_ring_emit_wreg,
>   	.emit_reg_wait = sdma_v5_0_ring_emit_reg_wait,
>   	.init_cond_exec = sdma_v5_0_ring_init_cond_exec,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* RE: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-28  3:01             ` Zhu, Changfeng
  0 siblings, 0 replies; 30+ messages in thread
From: Zhu, Changfeng @ 2019-10-28  3:01 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Huang, Ray,
	Tuikov, Luben

Hi Christian,

Re- that won't work, you can't add this to
amdgpu_ring_emit_reg_write_reg_wait_helper or break all read triggered registers (like the semaphore ones).

Do you mean that I should use reg_wait registers(wait_reg_mem) like Luben to replace read triggered registers for adding dummy read?

Re-Additional to that it will never work on GFX9, since the CP firmware there uses the integrated write/wait command and you can't add an additional dummy read there.

Yes, I see the integrated write/wait command and they are realized in gfx_v9_0_wait_reg_mem: 
Emily's patch:
drm/amdgpu: Remove the sriov checking and add firmware checking
decides when to go into gfx_v9_0_wait_reg_mem and when go into amdgpu_ring_emit_reg_write_reg_wait_helper.

However there are two problems now.
1.Before the fw_version_ok fw version, the code goes into amdgpu_ring_emit_reg_write_reg_wait_helper. In this case, should not we add dummy read in amdgpu_ring_emit_reg_write_reg_wait_helper?
2.After the fw_version_ok fw version, the code goes into gfx_v9_0_wait_reg_mem. However, it realizes write/wait command in firmware. Then how can we add this dummy read? According to Yang,Zilong, the CP firmware has realized the dummy read in firmware in CL:
Vega20 CL#1762470 @3/27/2019
Navi10 CL#1761300 @3/25/2019
According to CL#1762470,
The firmware which realized dummy read is(Raven for example):
Mec version:
#define F32_MEC_UCODE_VERSION "#421"
#define F32_MEC_FEATURE_VERSION 46
Pfp version:
#define F32_PFP_UCODE_VERSION "#183"
#define F32_PFP_FEATURE_VERSION 46
In Emily's patch:
The CP firmware which uses the integrated write/wait command begins from version:
+       case CHIP_RAVEN:
+               if ((adev->gfx.me_fw_version >= 0x0000009c) &&
+                   (adev->gfx.me_feature_version >= 42) &&
+                   (adev->gfx.pfp_fw_version >=  0x000000b1(177)) &&
+                   (adev->gfx.pfp_feature_version >= 42))
+                       adev->gfx.me_fw_write_wait = true;
+
+               if ((adev->gfx.mec_fw_version >=  0x00000192(402)) &&
+                   (adev->gfx.mec_feature_version >= 42))
+                       adev->gfx.mec_fw_write_wait = true;
+               break;

So how can we deal with the firmware between mec version(402) and mec version(421)?
It will realize write/wait command in CP firmware but it doesn't have dummy read.

BR,
Changfeng.

-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com> 
Sent: Friday, October 25, 2019 11:54 PM
To: Zhu, Changfeng <Changfeng.Zhu@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Pelloux-prayer, Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Tuikov, Luben <Luben.Tuikov@amd.com>
Subject: Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay

Hi Changfeng,

that won't work, you can't add this to
amdgpu_ring_emit_reg_write_reg_wait_helper or break all read triggered registers (like the semaphore ones).

Additional to that it will never work on GFX9, since the CP firmware there uses the integrated write/wait command and you can't add an additional dummy read there.

Regards,
Christian.

Am 25.10.19 um 16:22 schrieb Zhu, Changfeng:
> I try to write a patch based on the patch of Tuikov,Luben.
>
> Inspired by Luben,here is the patch:
>
>  From 1980d8f1ed44fb9a84a5ea1f6e2edd2bc25c629a Mon Sep 17 00:00:00 
> 2001
> From: changzhu <Changfeng.Zhu@amd.com>
> Date: Thu, 10 Oct 2019 11:02:33 +0800
> Subject: [PATCH] drm/amdgpu: add dummy read by engines for some GCVM status
>   registers
>
> The GRBM register interface is now capable of bursting 1 cycle per 
> register wr->wr, wr->rd much faster than previous muticycle per 
> transaction done interface.  This has caused a problem where status 
> registers requiring HW to update have a 1 cycle delay, due to the 
> register update having to go through GRBM.
>
> SW may operate on an incorrect value if they write a register and 
> immediately check the corresponding status register.
>
> Registers requiring HW to clear or set fields may be delayed by 1 cycle.
> For example,
>
> 1. write VM_INVALIDATE_ENG0_REQ mask = 5a 2. read 
> VM_INVALIDATE_ENG0_ACKb till the ack is same as the request mask = 5a
>      	a. HW will reset VM_INVALIDATE_ENG0_ACK = 0 until invalidation 
> is complete 3. write VM_INVALIDATE_ENG0_REQ mask = 5a 4. read 
> VM_INVALIDATE_ENG0_ACK till the ack is same as the request mask = 5a
> 	a. First read of VM_INVALIDATE_ENG0_ACK = 5a instead of 0
> 	b. Second read of VM_INVALIDATE_ENG0_ACK = 0 because the remote GRBM h/w
> 	   register takes one extra cycle to be cleared
> 	c. In this case,SW wil see a false ACK if they exit on first read
>
> Affected registers (only GC variant)  | Recommended Dummy Read
> --------------------------------------+----------------------------
> VM_INVALIDATE_ENG*_ACK		      |  VM_INVALIDATE_ENG*_REQ
> VM_L2_STATUS			      |  VM_L2_STATUS
> VM_L2_PROTECTION_FAULT_STATUS	      |  VM_L2_PROTECTION_FAULT_STATUS
> VM_L2_PROTECTION_FAULT_ADDR_HI/LO32   |  VM_L2_PROTECTION_FAULT_ADDR_HI/LO32
> VM_L2_IH_LOG_BUSY		      |  VM_L2_IH_LOG_BUSY
> MC_VM_L2_PERFCOUNTER_HI/LO	      |  MC_VM_L2_PERFCOUNTER_HI/LO
> ATC_L2_PERFCOUNTER_HI/LO	      |  ATC_L2_PERFCOUNTER_HI/LO
> ATC_L2_PERFCOUNTER2_HI/LO	      |  ATC_L2_PERFCOUNTER2_HI/LO
>
> It also needs dummy read by engines for these gc registers.
>
> Change-Id: Ie028f37eb789966d4593984bd661b248ebeb1ac3
> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c |  5 +++++
>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   |  2 ++
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    |  2 ++
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c   |  4 ++++
>   drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c   | 18 ++++++++++++++++++
>   5 files changed, 31 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> index 4b3f58dbf36f..c2fbf6087ecf 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> @@ -392,6 +392,11 @@ void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
>   						uint32_t ref, uint32_t mask)
>   {
>   	amdgpu_ring_emit_wreg(ring, reg0, ref);
> +
> +	/* wait for a cycle to reset vm_inv_eng0_ack */
> +	if (ring->funcs->vmhub == AMDGPU_GFXHUB_0)
> +		amdgpu_ring_emit_rreg(ring, reg0);
> +
>   	amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask);
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index ef1975a5323a..104c47734316 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -5155,6 +5155,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>   	.patch_cond_exec = gfx_v10_0_ring_emit_patch_cond_exec,
>   	.preempt_ib = gfx_v10_0_ring_preempt_ib,
>   	.emit_tmz = gfx_v10_0_ring_emit_tmz,
> +	.emit_rreg = gfx_v10_0_ring_emit_rreg,
>   	.emit_wreg = gfx_v10_0_ring_emit_wreg,
>   	.emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
>   };
> @@ -5188,6 +5189,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>   	.test_ib = gfx_v10_0_ring_test_ib,
>   	.insert_nop = amdgpu_ring_insert_nop,
>   	.pad_ib = amdgpu_ring_generic_pad_ib,
> +	.emit_rreg = gfx_v10_0_ring_emit_rreg,
>   	.emit_wreg = gfx_v10_0_ring_emit_wreg,
>   	.emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 2f03bf533d41..d00b53de0fdc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -6253,6 +6253,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>   	.init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec,
>   	.patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec,
>   	.emit_tmz = gfx_v9_0_ring_emit_tmz,
> +	.emit_rreg = gfx_v9_0_ring_emit_rreg,
>   	.emit_wreg = gfx_v9_0_ring_emit_wreg,
>   	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>   	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
> @@ -6289,6 +6290,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>   	.insert_nop = amdgpu_ring_insert_nop,
>   	.pad_ib = amdgpu_ring_generic_pad_ib,
>   	.set_priority = gfx_v9_0_ring_set_priority_compute,
> +	.emit_rreg = gfx_v9_0_ring_emit_rreg,
>   	.emit_wreg = gfx_v9_0_ring_emit_wreg,
>   	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>   	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 3b00bce14cfb..dce6b651da1f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -346,6 +346,10 @@ static uint64_t 
> gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>   
>   	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>   
> +	/* wait for a cycle to reset vm_inv_eng0_ack */
> +	if (ring->funcs->vmhub == AMDGPU_GFXHUB_0)
> +		amdgpu_ring_emit_rreg(ring, hub->vm_inv_eng0_req + eng);
> +
>   	/* wait for the invalidate to complete */
>   	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>   				  1 << vmid, 1 << vmid);
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c 
> b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> index 3460c00f3eaa..baaa33467882 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> @@ -38,6 +38,7 @@
>   #include "navi10_sdma_pkt_open.h"
>   #include "nbio_v2_3.h"
>   #include "sdma_v5_0.h"
> +#include "nvd.h"
>   
>   MODULE_FIRMWARE("amdgpu/navi10_sdma.bin");
>   MODULE_FIRMWARE("amdgpu/navi10_sdma1.bin");
> @@ -1147,6 +1148,22 @@ static void sdma_v5_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
>   	amdgpu_gmc_emit_flush_gpu_tlb(ring, vmid, pd_addr);
>   }
>   
> +static void sdma_v5_0_ring_emit_rreg(struct amdgpu_ring *ring, 
> +uint32_t reg) {
> +	struct amdgpu_device *adev = ring->adev;
> +
> +	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
> +	amdgpu_ring_write(ring, 0 | /* src: register*/
> +				(5 << 8) |  /* dst: memory */
> +				(1 << 20)); /* write confirm */
> +	amdgpu_ring_write(ring, reg);
> +	amdgpu_ring_write(ring, 0);
> +	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
> +				adev->virt.reg_val_offs * 4));
> +	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
> +				adev->virt.reg_val_offs * 4));
> +}
> +
>   static void sdma_v5_0_ring_emit_wreg(struct amdgpu_ring *ring,
>   				     uint32_t reg, uint32_t val)
>   {
> @@ -1597,6 +1614,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>   	.test_ib = sdma_v5_0_ring_test_ib,
>   	.insert_nop = sdma_v5_0_ring_insert_nop,
>   	.pad_ib = sdma_v5_0_ring_pad_ib,
> +	.emit_rreg = sdma_v5_0_ring_emit_rreg,
>   	.emit_wreg = sdma_v5_0_ring_emit_wreg,
>   	.emit_reg_wait = sdma_v5_0_ring_emit_reg_wait,
>   	.init_cond_exec = sdma_v5_0_ring_init_cond_exec,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* RE: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-28  3:01             ` Zhu, Changfeng
  0 siblings, 0 replies; 30+ messages in thread
From: Zhu, Changfeng @ 2019-10-28  3:01 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Huang, Ray,
	Tuikov, Luben

Hi Christian,

Re- that won't work, you can't add this to
amdgpu_ring_emit_reg_write_reg_wait_helper or break all read triggered registers (like the semaphore ones).

Do you mean that I should use reg_wait registers(wait_reg_mem) like Luben to replace read triggered registers for adding dummy read?

Re-Additional to that it will never work on GFX9, since the CP firmware there uses the integrated write/wait command and you can't add an additional dummy read there.

Yes, I see the integrated write/wait command and they are realized in gfx_v9_0_wait_reg_mem: 
Emily's patch:
drm/amdgpu: Remove the sriov checking and add firmware checking
decides when to go into gfx_v9_0_wait_reg_mem and when go into amdgpu_ring_emit_reg_write_reg_wait_helper.

However there are two problems now.
1.Before the fw_version_ok fw version, the code goes into amdgpu_ring_emit_reg_write_reg_wait_helper. In this case, should not we add dummy read in amdgpu_ring_emit_reg_write_reg_wait_helper?
2.After the fw_version_ok fw version, the code goes into gfx_v9_0_wait_reg_mem. However, it realizes write/wait command in firmware. Then how can we add this dummy read? According to Yang,Zilong, the CP firmware has realized the dummy read in firmware in CL:
Vega20 CL#1762470 @3/27/2019
Navi10 CL#1761300 @3/25/2019
According to CL#1762470,
The firmware which realized dummy read is(Raven for example):
Mec version:
#define F32_MEC_UCODE_VERSION "#421"
#define F32_MEC_FEATURE_VERSION 46
Pfp version:
#define F32_PFP_UCODE_VERSION "#183"
#define F32_PFP_FEATURE_VERSION 46
In Emily's patch:
The CP firmware which uses the integrated write/wait command begins from version:
+       case CHIP_RAVEN:
+               if ((adev->gfx.me_fw_version >= 0x0000009c) &&
+                   (adev->gfx.me_feature_version >= 42) &&
+                   (adev->gfx.pfp_fw_version >=  0x000000b1(177)) &&
+                   (adev->gfx.pfp_feature_version >= 42))
+                       adev->gfx.me_fw_write_wait = true;
+
+               if ((adev->gfx.mec_fw_version >=  0x00000192(402)) &&
+                   (adev->gfx.mec_feature_version >= 42))
+                       adev->gfx.mec_fw_write_wait = true;
+               break;

So how can we deal with the firmware between mec version(402) and mec version(421)?
It will realize write/wait command in CP firmware but it doesn't have dummy read.

BR,
Changfeng.

-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com> 
Sent: Friday, October 25, 2019 11:54 PM
To: Zhu, Changfeng <Changfeng.Zhu@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Pelloux-prayer, Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Tuikov, Luben <Luben.Tuikov@amd.com>
Subject: Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay

Hi Changfeng,

that won't work, you can't add this to
amdgpu_ring_emit_reg_write_reg_wait_helper or break all read triggered registers (like the semaphore ones).

Additional to that it will never work on GFX9, since the CP firmware there uses the integrated write/wait command and you can't add an additional dummy read there.

Regards,
Christian.

Am 25.10.19 um 16:22 schrieb Zhu, Changfeng:
> I try to write a patch based on the patch of Tuikov,Luben.
>
> Inspired by Luben,here is the patch:
>
>  From 1980d8f1ed44fb9a84a5ea1f6e2edd2bc25c629a Mon Sep 17 00:00:00 
> 2001
> From: changzhu <Changfeng.Zhu@amd.com>
> Date: Thu, 10 Oct 2019 11:02:33 +0800
> Subject: [PATCH] drm/amdgpu: add dummy read by engines for some GCVM status
>   registers
>
> The GRBM register interface is now capable of bursting 1 cycle per 
> register wr->wr, wr->rd much faster than previous muticycle per 
> transaction done interface.  This has caused a problem where status 
> registers requiring HW to update have a 1 cycle delay, due to the 
> register update having to go through GRBM.
>
> SW may operate on an incorrect value if they write a register and 
> immediately check the corresponding status register.
>
> Registers requiring HW to clear or set fields may be delayed by 1 cycle.
> For example,
>
> 1. write VM_INVALIDATE_ENG0_REQ mask = 5a 2. read 
> VM_INVALIDATE_ENG0_ACKb till the ack is same as the request mask = 5a
>      	a. HW will reset VM_INVALIDATE_ENG0_ACK = 0 until invalidation 
> is complete 3. write VM_INVALIDATE_ENG0_REQ mask = 5a 4. read 
> VM_INVALIDATE_ENG0_ACK till the ack is same as the request mask = 5a
> 	a. First read of VM_INVALIDATE_ENG0_ACK = 5a instead of 0
> 	b. Second read of VM_INVALIDATE_ENG0_ACK = 0 because the remote GRBM h/w
> 	   register takes one extra cycle to be cleared
> 	c. In this case,SW wil see a false ACK if they exit on first read
>
> Affected registers (only GC variant)  | Recommended Dummy Read
> --------------------------------------+----------------------------
> VM_INVALIDATE_ENG*_ACK		      |  VM_INVALIDATE_ENG*_REQ
> VM_L2_STATUS			      |  VM_L2_STATUS
> VM_L2_PROTECTION_FAULT_STATUS	      |  VM_L2_PROTECTION_FAULT_STATUS
> VM_L2_PROTECTION_FAULT_ADDR_HI/LO32   |  VM_L2_PROTECTION_FAULT_ADDR_HI/LO32
> VM_L2_IH_LOG_BUSY		      |  VM_L2_IH_LOG_BUSY
> MC_VM_L2_PERFCOUNTER_HI/LO	      |  MC_VM_L2_PERFCOUNTER_HI/LO
> ATC_L2_PERFCOUNTER_HI/LO	      |  ATC_L2_PERFCOUNTER_HI/LO
> ATC_L2_PERFCOUNTER2_HI/LO	      |  ATC_L2_PERFCOUNTER2_HI/LO
>
> It also needs dummy read by engines for these gc registers.
>
> Change-Id: Ie028f37eb789966d4593984bd661b248ebeb1ac3
> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c |  5 +++++
>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   |  2 ++
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    |  2 ++
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c   |  4 ++++
>   drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c   | 18 ++++++++++++++++++
>   5 files changed, 31 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> index 4b3f58dbf36f..c2fbf6087ecf 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> @@ -392,6 +392,11 @@ void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
>   						uint32_t ref, uint32_t mask)
>   {
>   	amdgpu_ring_emit_wreg(ring, reg0, ref);
> +
> +	/* wait for a cycle to reset vm_inv_eng0_ack */
> +	if (ring->funcs->vmhub == AMDGPU_GFXHUB_0)
> +		amdgpu_ring_emit_rreg(ring, reg0);
> +
>   	amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask);
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index ef1975a5323a..104c47734316 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -5155,6 +5155,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>   	.patch_cond_exec = gfx_v10_0_ring_emit_patch_cond_exec,
>   	.preempt_ib = gfx_v10_0_ring_preempt_ib,
>   	.emit_tmz = gfx_v10_0_ring_emit_tmz,
> +	.emit_rreg = gfx_v10_0_ring_emit_rreg,
>   	.emit_wreg = gfx_v10_0_ring_emit_wreg,
>   	.emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
>   };
> @@ -5188,6 +5189,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>   	.test_ib = gfx_v10_0_ring_test_ib,
>   	.insert_nop = amdgpu_ring_insert_nop,
>   	.pad_ib = amdgpu_ring_generic_pad_ib,
> +	.emit_rreg = gfx_v10_0_ring_emit_rreg,
>   	.emit_wreg = gfx_v10_0_ring_emit_wreg,
>   	.emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 2f03bf533d41..d00b53de0fdc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -6253,6 +6253,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>   	.init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec,
>   	.patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec,
>   	.emit_tmz = gfx_v9_0_ring_emit_tmz,
> +	.emit_rreg = gfx_v9_0_ring_emit_rreg,
>   	.emit_wreg = gfx_v9_0_ring_emit_wreg,
>   	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>   	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
> @@ -6289,6 +6290,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>   	.insert_nop = amdgpu_ring_insert_nop,
>   	.pad_ib = amdgpu_ring_generic_pad_ib,
>   	.set_priority = gfx_v9_0_ring_set_priority_compute,
> +	.emit_rreg = gfx_v9_0_ring_emit_rreg,
>   	.emit_wreg = gfx_v9_0_ring_emit_wreg,
>   	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>   	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 3b00bce14cfb..dce6b651da1f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -346,6 +346,10 @@ static uint64_t 
> gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>   
>   	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>   
> +	/* wait for a cycle to reset vm_inv_eng0_ack */
> +	if (ring->funcs->vmhub == AMDGPU_GFXHUB_0)
> +		amdgpu_ring_emit_rreg(ring, hub->vm_inv_eng0_req + eng);
> +
>   	/* wait for the invalidate to complete */
>   	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>   				  1 << vmid, 1 << vmid);
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c 
> b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> index 3460c00f3eaa..baaa33467882 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> @@ -38,6 +38,7 @@
>   #include "navi10_sdma_pkt_open.h"
>   #include "nbio_v2_3.h"
>   #include "sdma_v5_0.h"
> +#include "nvd.h"
>   
>   MODULE_FIRMWARE("amdgpu/navi10_sdma.bin");
>   MODULE_FIRMWARE("amdgpu/navi10_sdma1.bin");
> @@ -1147,6 +1148,22 @@ static void sdma_v5_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
>   	amdgpu_gmc_emit_flush_gpu_tlb(ring, vmid, pd_addr);
>   }
>   
> +static void sdma_v5_0_ring_emit_rreg(struct amdgpu_ring *ring, 
> +uint32_t reg) {
> +	struct amdgpu_device *adev = ring->adev;
> +
> +	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
> +	amdgpu_ring_write(ring, 0 | /* src: register*/
> +				(5 << 8) |  /* dst: memory */
> +				(1 << 20)); /* write confirm */
> +	amdgpu_ring_write(ring, reg);
> +	amdgpu_ring_write(ring, 0);
> +	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
> +				adev->virt.reg_val_offs * 4));
> +	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
> +				adev->virt.reg_val_offs * 4));
> +}
> +
>   static void sdma_v5_0_ring_emit_wreg(struct amdgpu_ring *ring,
>   				     uint32_t reg, uint32_t val)
>   {
> @@ -1597,6 +1614,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>   	.test_ib = sdma_v5_0_ring_test_ib,
>   	.insert_nop = sdma_v5_0_ring_insert_nop,
>   	.pad_ib = sdma_v5_0_ring_pad_ib,
> +	.emit_rreg = sdma_v5_0_ring_emit_rreg,
>   	.emit_wreg = sdma_v5_0_ring_emit_wreg,
>   	.emit_reg_wait = sdma_v5_0_ring_emit_reg_wait,
>   	.init_cond_exec = sdma_v5_0_ring_init_cond_exec,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-28 10:46                 ` Koenig, Christian
  0 siblings, 0 replies; 30+ messages in thread
From: Koenig, Christian @ 2019-10-28 10:46 UTC (permalink / raw)
  To: Zhu, Changfeng, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Huang, Ray,
	Tuikov, Luben

Hi Changfeng,

> So how can we deal with the firmware between mec version(402) and mec version(421)?
Well, off hand I see only two options: either print a warning or 
completely reject loading the driver.

Completely rejecting loading the driver is probably not a good idea and 
the issue is actually extremely unlikely to cause any problems.

So printing a warning that the user should update their firmware is 
probably the best approach.

Regards,
Christian.

Am 28.10.19 um 04:01 schrieb Zhu, Changfeng:
> Hi Christian,
>
> Re- that won't work, you can't add this to
> amdgpu_ring_emit_reg_write_reg_wait_helper or break all read triggered registers (like the semaphore ones).
>
> Do you mean that I should use reg_wait registers(wait_reg_mem) like Luben to replace read triggered registers for adding dummy read?
>
> Re-Additional to that it will never work on GFX9, since the CP firmware there uses the integrated write/wait command and you can't add an additional dummy read there.
>
> Yes, I see the integrated write/wait command and they are realized in gfx_v9_0_wait_reg_mem:
> Emily's patch:
> drm/amdgpu: Remove the sriov checking and add firmware checking
> decides when to go into gfx_v9_0_wait_reg_mem and when go into amdgpu_ring_emit_reg_write_reg_wait_helper.
>
> However there are two problems now.
> 1.Before the fw_version_ok fw version, the code goes into amdgpu_ring_emit_reg_write_reg_wait_helper. In this case, should not we add dummy read in amdgpu_ring_emit_reg_write_reg_wait_helper?
> 2.After the fw_version_ok fw version, the code goes into gfx_v9_0_wait_reg_mem. However, it realizes write/wait command in firmware. Then how can we add this dummy read? According to Yang,Zilong, the CP firmware has realized dummy read in firmware in CL:
> Vega20 CL#1762470 @3/27/2019
> Navi10 CL#1761300 @3/25/2019
> According to CL#1762470,
> The firmware which realized dummy read is (Raven for example):
> Mec version:
> #define F32_MEC_UCODE_VERSION "#421"
> #define F32_MEC_FEATURE_VERSION 46
> Pfp version:
> #define F32_PFP_UCODE_VERSION "#183"
> #define F32_PFP_FEATURE_VERSION 46
> In Emily's patch:
> The CP firmware which uses the integrated write/wait command begins from version:
> +       case CHIP_RAVEN:
> +               if ((adev->gfx.me_fw_version >= 0x0000009c) &&
> +                   (adev->gfx.me_feature_version >= 42) &&
> +                   (adev->gfx.pfp_fw_version >=  0x000000b1(177)) &&
> +                   (adev->gfx.pfp_feature_version >= 42))
> +                       adev->gfx.me_fw_write_wait = true;
> +
> +               if ((adev->gfx.mec_fw_version >=  0x00000192(402)) &&
> +                   (adev->gfx.mec_feature_version >= 42))
> +                       adev->gfx.mec_fw_write_wait = true;
> +               break;
>
> So how can we deal with the firmware between mec version(402) and mec version(421)?
> It will realize write/wait command in CP firmware but it doesn't have dummy read.
>
> BR,
> Changfeng.
>
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Friday, October 25, 2019 11:54 PM
> To: Zhu, Changfeng <Changfeng.Zhu@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Pelloux-prayer, Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Tuikov, Luben <Luben.Tuikov@amd.com>
> Subject: Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
>
> Hi Changfeng,
>
> that won't work, you can't add this to
> amdgpu_ring_emit_reg_write_reg_wait_helper or break all read triggered registers (like the semaphore ones).
>
> Additional to that it will never work on GFX9, since the CP firmware there uses the integrated write/wait command and you can't add an additional dummy read there.
>
> Regards,
> Christian.
>
> Am 25.10.19 um 16:22 schrieb Zhu, Changfeng:
>> I try to write a patch based on the patch of Tuikov,Luben.
>>
>> Inspired by Luben,here is the patch:
>>
>>   From 1980d8f1ed44fb9a84a5ea1f6e2edd2bc25c629a Mon Sep 17 00:00:00
>> 2001
>> From: changzhu <Changfeng.Zhu@amd.com>
>> Date: Thu, 10 Oct 2019 11:02:33 +0800
>> Subject: [PATCH] drm/amdgpu: add dummy read by engines for some GCVM status
>>    registers
>>
>> The GRBM register interface is now capable of bursting 1 cycle per
>> register wr->wr, wr->rd much faster than previous muticycle per
>> transaction done interface.  This has caused a problem where status
>> registers requiring HW to update have a 1 cycle delay, due to the
>> register update having to go through GRBM.
>>
>> SW may operate on an incorrect value if they write a register and
>> immediately check the corresponding status register.
>>
>> Registers requiring HW to clear or set fields may be delayed by 1 cycle.
>> For example,
>>
>> 1. write VM_INVALIDATE_ENG0_REQ mask = 5a 2. read
>> VM_INVALIDATE_ENG0_ACKb till the ack is same as the request mask = 5a
>>       	a. HW will reset VM_INVALIDATE_ENG0_ACK = 0 until invalidation
>> is complete 3. write VM_INVALIDATE_ENG0_REQ mask = 5a 4. read
>> VM_INVALIDATE_ENG0_ACK till the ack is same as the request mask = 5a
>> 	a. First read of VM_INVALIDATE_ENG0_ACK = 5a instead of 0
>> 	b. Second read of VM_INVALIDATE_ENG0_ACK = 0 because the remote GRBM h/w
>> 	   register takes one extra cycle to be cleared
>> 	c. In this case,SW wil see a false ACK if they exit on first read
>>
>> Affected registers (only GC variant)  | Recommended Dummy Read
>> --------------------------------------+----------------------------
>> VM_INVALIDATE_ENG*_ACK		      |  VM_INVALIDATE_ENG*_REQ
>> VM_L2_STATUS			      |  VM_L2_STATUS
>> VM_L2_PROTECTION_FAULT_STATUS	      |  VM_L2_PROTECTION_FAULT_STATUS
>> VM_L2_PROTECTION_FAULT_ADDR_HI/LO32   |  VM_L2_PROTECTION_FAULT_ADDR_HI/LO32
>> VM_L2_IH_LOG_BUSY		      |  VM_L2_IH_LOG_BUSY
>> MC_VM_L2_PERFCOUNTER_HI/LO	      |  MC_VM_L2_PERFCOUNTER_HI/LO
>> ATC_L2_PERFCOUNTER_HI/LO	      |  ATC_L2_PERFCOUNTER_HI/LO
>> ATC_L2_PERFCOUNTER2_HI/LO	      |  ATC_L2_PERFCOUNTER2_HI/LO
>>
>> It also needs dummy read by engines for these gc registers.
>>
>> Change-Id: Ie028f37eb789966d4593984bd661b248ebeb1ac3
>> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c |  5 +++++
>>    drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   |  2 ++
>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    |  2 ++
>>    drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c   |  4 ++++
>>    drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c   | 18 ++++++++++++++++++
>>    5 files changed, 31 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> index 4b3f58dbf36f..c2fbf6087ecf 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> @@ -392,6 +392,11 @@ void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
>>    						uint32_t ref, uint32_t mask)
>>    {
>>    	amdgpu_ring_emit_wreg(ring, reg0, ref);
>> +
>> +	/* wait for a cycle to reset vm_inv_eng0_ack */
>> +	if (ring->funcs->vmhub == AMDGPU_GFXHUB_0)
>> +		amdgpu_ring_emit_rreg(ring, reg0);
>> +
>>    	amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask);
>>    }
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> index ef1975a5323a..104c47734316 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> @@ -5155,6 +5155,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>>    	.patch_cond_exec = gfx_v10_0_ring_emit_patch_cond_exec,
>>    	.preempt_ib = gfx_v10_0_ring_preempt_ib,
>>    	.emit_tmz = gfx_v10_0_ring_emit_tmz,
>> +	.emit_rreg = gfx_v10_0_ring_emit_rreg,
>>    	.emit_wreg = gfx_v10_0_ring_emit_wreg,
>>    	.emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
>>    };
>> @@ -5188,6 +5189,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>>    	.test_ib = gfx_v10_0_ring_test_ib,
>>    	.insert_nop = amdgpu_ring_insert_nop,
>>    	.pad_ib = amdgpu_ring_generic_pad_ib,
>> +	.emit_rreg = gfx_v10_0_ring_emit_rreg,
>>    	.emit_wreg = gfx_v10_0_ring_emit_wreg,
>>    	.emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
>>    };
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> index 2f03bf533d41..d00b53de0fdc 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> @@ -6253,6 +6253,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>>    	.init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec,
>>    	.patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec,
>>    	.emit_tmz = gfx_v9_0_ring_emit_tmz,
>> +	.emit_rreg = gfx_v9_0_ring_emit_rreg,
>>    	.emit_wreg = gfx_v9_0_ring_emit_wreg,
>>    	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>>    	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
>> @@ -6289,6 +6290,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>>    	.insert_nop = amdgpu_ring_insert_nop,
>>    	.pad_ib = amdgpu_ring_generic_pad_ib,
>>    	.set_priority = gfx_v9_0_ring_set_priority_compute,
>> +	.emit_rreg = gfx_v9_0_ring_emit_rreg,
>>    	.emit_wreg = gfx_v9_0_ring_emit_wreg,
>>    	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>>    	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> index 3b00bce14cfb..dce6b651da1f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> @@ -346,6 +346,10 @@ static uint64_t
>> gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>    
>>    	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>>    
>> +	/* wait for a cycle to reset vm_inv_eng0_ack */
>> +	if (ring->funcs->vmhub == AMDGPU_GFXHUB_0)
>> +		amdgpu_ring_emit_rreg(ring, hub->vm_inv_eng0_req + eng);
>> +
>>    	/* wait for the invalidate to complete */
>>    	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>>    				  1 << vmid, 1 << vmid);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>> b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>> index 3460c00f3eaa..baaa33467882 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>> @@ -38,6 +38,7 @@
>>    #include "navi10_sdma_pkt_open.h"
>>    #include "nbio_v2_3.h"
>>    #include "sdma_v5_0.h"
>> +#include "nvd.h"
>>    
>>    MODULE_FIRMWARE("amdgpu/navi10_sdma.bin");
>>    MODULE_FIRMWARE("amdgpu/navi10_sdma1.bin");
>> @@ -1147,6 +1148,22 @@ static void sdma_v5_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
>>    	amdgpu_gmc_emit_flush_gpu_tlb(ring, vmid, pd_addr);
>>    }
>>    
>> +static void sdma_v5_0_ring_emit_rreg(struct amdgpu_ring *ring,
>> +uint32_t reg) {
>> +	struct amdgpu_device *adev = ring->adev;
>> +
>> +	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>> +	amdgpu_ring_write(ring, 0 | /* src: register*/
>> +				(5 << 8) |  /* dst: memory */
>> +				(1 << 20)); /* write confirm */
>> +	amdgpu_ring_write(ring, reg);
>> +	amdgpu_ring_write(ring, 0);
>> +	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> +				adev->virt.reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> +				adev->virt.reg_val_offs * 4));
>> +}
>> +
>>    static void sdma_v5_0_ring_emit_wreg(struct amdgpu_ring *ring,
>>    				     uint32_t reg, uint32_t val)
>>    {
>> @@ -1597,6 +1614,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>>    	.test_ib = sdma_v5_0_ring_test_ib,
>>    	.insert_nop = sdma_v5_0_ring_insert_nop,
>>    	.pad_ib = sdma_v5_0_ring_pad_ib,
>> +	.emit_rreg = sdma_v5_0_ring_emit_rreg,
>>    	.emit_wreg = sdma_v5_0_ring_emit_wreg,
>>    	.emit_reg_wait = sdma_v5_0_ring_emit_reg_wait,
>>    	.init_cond_exec = sdma_v5_0_ring_init_cond_exec,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-28 10:46                 ` Koenig, Christian
  0 siblings, 0 replies; 30+ messages in thread
From: Koenig, Christian @ 2019-10-28 10:46 UTC (permalink / raw)
  To: Zhu, Changfeng, amd-gfx
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Huang, Ray,
	Tuikov, Luben

Hi Changfeng,

> So how can we deal with the firmware between mec version(402) and mec version(421)?
Well, off hand I see only two options: either print a warning or 
completely reject loading the driver.

Completely rejecting loading the driver is probably not a good idea and 
the issue is actually extremely unlikely to cause any problems.

So printing a warning that the user should update their firmware is 
probably the best approach.

Regards,
Christian.

Am 28.10.19 um 04:01 schrieb Zhu, Changfeng:
> Hi Christian,
>
> Re- that won't work, you can't add this to
> amdgpu_ring_emit_reg_write_reg_wait_helper or break all read triggered registers (like the semaphore ones).
>
> Do you mean that I should use reg_wait registers(wait_reg_mem) like Luben to replace read triggered registers for adding dummy read?
>
> Re-Additional to that it will never work on GFX9, since the CP firmware there uses the integrated write/wait command and you can't add an additional dummy read there.
>
> Yes, I see the integrated write/wait command and they are realized in gfx_v9_0_wait_reg_mem:
> Emily's patch:
> drm/amdgpu: Remove the sriov checking and add firmware checking
> decides when to go into gfx_v9_0_wait_reg_mem and when go into amdgpu_ring_emit_reg_write_reg_wait_helper.
>
> However there are two problems now.
> 1.Before the fw_version_ok fw version, the code goes into amdgpu_ring_emit_reg_write_reg_wait_helper. In this case, should not we add dummy read in amdgpu_ring_emit_reg_write_reg_wait_helper?
> 2.After the fw_version_ok fw version, the code goes into gfx_v9_0_wait_reg_mem. However, it realizes write/wait command in firmware. Then how can we add this dummy read? According to Yang,Zilong, the CP firmware has realized dummy read in firmware in CL:
> Vega20 CL#1762470 @3/27/2019
> Navi10 CL#1761300 @3/25/2019
> According to CL#1762470,
> The firmware which realized dummy read is (Raven for example):
> Mec version:
> #define F32_MEC_UCODE_VERSION "#421"
> #define F32_MEC_FEATURE_VERSION 46
> Pfp version:
> #define F32_PFP_UCODE_VERSION "#183"
> #define F32_PFP_FEATURE_VERSION 46
> In Emily's patch:
> The CP firmware which uses the integrated write/wait command begins from version:
> +       case CHIP_RAVEN:
> +               if ((adev->gfx.me_fw_version >= 0x0000009c) &&
> +                   (adev->gfx.me_feature_version >= 42) &&
> +                   (adev->gfx.pfp_fw_version >=  0x000000b1(177)) &&
> +                   (adev->gfx.pfp_feature_version >= 42))
> +                       adev->gfx.me_fw_write_wait = true;
> +
> +               if ((adev->gfx.mec_fw_version >=  0x00000192(402)) &&
> +                   (adev->gfx.mec_feature_version >= 42))
> +                       adev->gfx.mec_fw_write_wait = true;
> +               break;
>
> So how can we deal with the firmware between mec version(402) and mec version(421)?
> It will realize write/wait command in CP firmware but it doesn't have dummy read.
>
> BR,
> Changfeng.
>
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Friday, October 25, 2019 11:54 PM
> To: Zhu, Changfeng <Changfeng.Zhu@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Pelloux-prayer, Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Tuikov, Luben <Luben.Tuikov@amd.com>
> Subject: Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
>
> Hi Changfeng,
>
> that won't work, you can't add this to
> amdgpu_ring_emit_reg_write_reg_wait_helper or break all read triggered registers (like the semaphore ones).
>
> Additional to that it will never work on GFX9, since the CP firmware there uses the integrated write/wait command and you can't add an additional dummy read there.
>
> Regards,
> Christian.
>
> Am 25.10.19 um 16:22 schrieb Zhu, Changfeng:
>> I try to write a patch based on the patch of Tuikov,Luben.
>>
>> Inspired by Luben,here is the patch:
>>
>>   From 1980d8f1ed44fb9a84a5ea1f6e2edd2bc25c629a Mon Sep 17 00:00:00
>> 2001
>> From: changzhu <Changfeng.Zhu@amd.com>
>> Date: Thu, 10 Oct 2019 11:02:33 +0800
>> Subject: [PATCH] drm/amdgpu: add dummy read by engines for some GCVM status
>>    registers
>>
>> The GRBM register interface is now capable of bursting 1 cycle per
>> register wr->wr, wr->rd much faster than previous muticycle per
>> transaction done interface.  This has caused a problem where status
>> registers requiring HW to update have a 1 cycle delay, due to the
>> register update having to go through GRBM.
>>
>> SW may operate on an incorrect value if they write a register and
>> immediately check the corresponding status register.
>>
>> Registers requiring HW to clear or set fields may be delayed by 1 cycle.
>> For example,
>>
>> 1. write VM_INVALIDATE_ENG0_REQ mask = 5a 2. read
>> VM_INVALIDATE_ENG0_ACKb till the ack is same as the request mask = 5a
>>       	a. HW will reset VM_INVALIDATE_ENG0_ACK = 0 until invalidation
>> is complete 3. write VM_INVALIDATE_ENG0_REQ mask = 5a 4. read
>> VM_INVALIDATE_ENG0_ACK till the ack is same as the request mask = 5a
>> 	a. First read of VM_INVALIDATE_ENG0_ACK = 5a instead of 0
>> 	b. Second read of VM_INVALIDATE_ENG0_ACK = 0 because the remote GRBM h/w
>> 	   register takes one extra cycle to be cleared
>> 	c. In this case,SW wil see a false ACK if they exit on first read
>>
>> Affected registers (only GC variant)  | Recommended Dummy Read
>> --------------------------------------+----------------------------
>> VM_INVALIDATE_ENG*_ACK		      |  VM_INVALIDATE_ENG*_REQ
>> VM_L2_STATUS			      |  VM_L2_STATUS
>> VM_L2_PROTECTION_FAULT_STATUS	      |  VM_L2_PROTECTION_FAULT_STATUS
>> VM_L2_PROTECTION_FAULT_ADDR_HI/LO32   |  VM_L2_PROTECTION_FAULT_ADDR_HI/LO32
>> VM_L2_IH_LOG_BUSY		      |  VM_L2_IH_LOG_BUSY
>> MC_VM_L2_PERFCOUNTER_HI/LO	      |  MC_VM_L2_PERFCOUNTER_HI/LO
>> ATC_L2_PERFCOUNTER_HI/LO	      |  ATC_L2_PERFCOUNTER_HI/LO
>> ATC_L2_PERFCOUNTER2_HI/LO	      |  ATC_L2_PERFCOUNTER2_HI/LO
>>
>> It also needs dummy read by engines for these gc registers.
>>
>> Change-Id: Ie028f37eb789966d4593984bd661b248ebeb1ac3
>> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c |  5 +++++
>>    drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   |  2 ++
>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    |  2 ++
>>    drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c   |  4 ++++
>>    drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c   | 18 ++++++++++++++++++
>>    5 files changed, 31 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> index 4b3f58dbf36f..c2fbf6087ecf 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> @@ -392,6 +392,11 @@ void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
>>    						uint32_t ref, uint32_t mask)
>>    {
>>    	amdgpu_ring_emit_wreg(ring, reg0, ref);
>> +
>> +	/* wait for a cycle to reset vm_inv_eng0_ack */
>> +	if (ring->funcs->vmhub == AMDGPU_GFXHUB_0)
>> +		amdgpu_ring_emit_rreg(ring, reg0);
>> +
>>    	amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask);
>>    }
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> index ef1975a5323a..104c47734316 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> @@ -5155,6 +5155,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>>    	.patch_cond_exec = gfx_v10_0_ring_emit_patch_cond_exec,
>>    	.preempt_ib = gfx_v10_0_ring_preempt_ib,
>>    	.emit_tmz = gfx_v10_0_ring_emit_tmz,
>> +	.emit_rreg = gfx_v10_0_ring_emit_rreg,
>>    	.emit_wreg = gfx_v10_0_ring_emit_wreg,
>>    	.emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
>>    };
>> @@ -5188,6 +5189,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>>    	.test_ib = gfx_v10_0_ring_test_ib,
>>    	.insert_nop = amdgpu_ring_insert_nop,
>>    	.pad_ib = amdgpu_ring_generic_pad_ib,
>> +	.emit_rreg = gfx_v10_0_ring_emit_rreg,
>>    	.emit_wreg = gfx_v10_0_ring_emit_wreg,
>>    	.emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
>>    };
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> index 2f03bf533d41..d00b53de0fdc 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> @@ -6253,6 +6253,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>>    	.init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec,
>>    	.patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec,
>>    	.emit_tmz = gfx_v9_0_ring_emit_tmz,
>> +	.emit_rreg = gfx_v9_0_ring_emit_rreg,
>>    	.emit_wreg = gfx_v9_0_ring_emit_wreg,
>>    	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>>    	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
>> @@ -6289,6 +6290,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>>    	.insert_nop = amdgpu_ring_insert_nop,
>>    	.pad_ib = amdgpu_ring_generic_pad_ib,
>>    	.set_priority = gfx_v9_0_ring_set_priority_compute,
>> +	.emit_rreg = gfx_v9_0_ring_emit_rreg,
>>    	.emit_wreg = gfx_v9_0_ring_emit_wreg,
>>    	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>>    	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> index 3b00bce14cfb..dce6b651da1f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> @@ -346,6 +346,10 @@ static uint64_t
>> gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>    
>>    	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>>    
>> +	/* wait for a cycle to reset vm_inv_eng0_ack */
>> +	if (ring->funcs->vmhub == AMDGPU_GFXHUB_0)
>> +		amdgpu_ring_emit_rreg(ring, hub->vm_inv_eng0_req + eng);
>> +
>>    	/* wait for the invalidate to complete */
>>    	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>>    				  1 << vmid, 1 << vmid);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>> b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>> index 3460c00f3eaa..baaa33467882 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>> @@ -38,6 +38,7 @@
>>    #include "navi10_sdma_pkt_open.h"
>>    #include "nbio_v2_3.h"
>>    #include "sdma_v5_0.h"
>> +#include "nvd.h"
>>    
>>    MODULE_FIRMWARE("amdgpu/navi10_sdma.bin");
>>    MODULE_FIRMWARE("amdgpu/navi10_sdma1.bin");
>> @@ -1147,6 +1148,22 @@ static void sdma_v5_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
>>    	amdgpu_gmc_emit_flush_gpu_tlb(ring, vmid, pd_addr);
>>    }
>>    
>> +static void sdma_v5_0_ring_emit_rreg(struct amdgpu_ring *ring,
>> +uint32_t reg) {
>> +	struct amdgpu_device *adev = ring->adev;
>> +
>> +	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>> +	amdgpu_ring_write(ring, 0 | /* src: register*/
>> +				(5 << 8) |  /* dst: memory */
>> +				(1 << 20)); /* write confirm */
>> +	amdgpu_ring_write(ring, reg);
>> +	amdgpu_ring_write(ring, 0);
>> +	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> +				adev->virt.reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> +				adev->virt.reg_val_offs * 4));
>> +}
>> +
>>    static void sdma_v5_0_ring_emit_wreg(struct amdgpu_ring *ring,
>>    				     uint32_t reg, uint32_t val)
>>    {
>> @@ -1597,6 +1614,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>>    	.test_ib = sdma_v5_0_ring_test_ib,
>>    	.insert_nop = sdma_v5_0_ring_insert_nop,
>>    	.pad_ib = sdma_v5_0_ring_pad_ib,
>> +	.emit_rreg = sdma_v5_0_ring_emit_rreg,
>>    	.emit_wreg = sdma_v5_0_ring_emit_wreg,
>>    	.emit_reg_wait = sdma_v5_0_ring_emit_reg_wait,
>>    	.init_cond_exec = sdma_v5_0_ring_init_cond_exec,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* RE: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-28 12:07                     ` Zhu, Changfeng
  0 siblings, 0 replies; 30+ messages in thread
From: Zhu, Changfeng @ 2019-10-28 12:07 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Huang, Ray,
	Tuikov, Luben

Hi Christian,

Should we also realize the function of gfx_v9_0_wait_reg_mem in gfx10 like gfx9 since gfx10 also realize write/wait command in a single packet after CL#1761300?

Or we can add dummy read in gmc10 by using emit_wait like Luben's way?

BR,
Changfeng. 

-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com> 
Sent: Monday, October 28, 2019 6:47 PM
To: Zhu, Changfeng <Changfeng.Zhu@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Pelloux-prayer, Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Tuikov, Luben <Luben.Tuikov@amd.com>
Subject: Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay

Hi Changfeng,

> So how can we deal with the firmware between mec version(402) and mec version(421)?
Well, off hand I see only two options: either print a warning or completely reject loading the driver.

Completely rejecting loading the driver is probably not a good idea and the issue is actually extremely unlikely to cause any problems.

So printing a warning that the user should update their firmware is probably the best approach.

Regards,
Christian.

Am 28.10.19 um 04:01 schrieb Zhu, Changfeng:
> Hi Christian,
>
> Re- that won't work, you can't add this to 
> amdgpu_ring_emit_reg_write_reg_wait_helper or break all read triggered registers (like the semaphore ones).
>
> Do you mean that I should use reg_wait registers(wait_reg_mem) like Luben to replace read triggered registers for adding dummy read?
>
> Re-Additional to that it will never work on GFX9, since the CP firmware there uses the integrated write/wait command and you can't add an additional dummy read there.
>
> Yes, I see the integrated write/wait command and they are realized in gfx_v9_0_wait_reg_mem:
> Emily's patch:
> drm/amdgpu: Remove the sriov checking and add firmware checking 
> decides when to go into gfx_v9_0_wait_reg_mem and when go into amdgpu_ring_emit_reg_write_reg_wait_helper.
>
> However there are two problems now.
> 1.Before the fw_version_ok fw version, the code goes into amdgpu_ring_emit_reg_write_reg_wait_helper. In this case, should not we add dummy read in amdgpu_ring_emit_reg_write_reg_wait_helper?
> 2.After the fw_version_ok fw version, the code goes into gfx_v9_0_wait_reg_mem. However, it realizes write/wait command in firmware. Then how can we add this dummy read? According to Yang,Zilong, the CP firmware has realized dummy read in firmware in CL:
> Vega20 CL#1762470 @3/27/2019
> Navi10 CL#1761300 @3/25/2019
> According to CL#1762470,
> The firmware which realized dummy read is (Raven for example):
> Mec version:
> #define F32_MEC_UCODE_VERSION "#421"
> #define F32_MEC_FEATURE_VERSION 46
> Pfp version:
> #define F32_PFP_UCODE_VERSION "#183"
> #define F32_PFP_FEATURE_VERSION 46
> In Emily's patch:
> The CP firmware which uses the integrated write/wait command begins from version:
> +       case CHIP_RAVEN:
> +               if ((adev->gfx.me_fw_version >= 0x0000009c) &&
> +                   (adev->gfx.me_feature_version >= 42) &&
> +                   (adev->gfx.pfp_fw_version >=  0x000000b1(177)) &&
> +                   (adev->gfx.pfp_feature_version >= 42))
> +                       adev->gfx.me_fw_write_wait = true;
> +
> +               if ((adev->gfx.mec_fw_version >=  0x00000192(402)) &&
> +                   (adev->gfx.mec_feature_version >= 42))
> +                       adev->gfx.mec_fw_write_wait = true;
> +               break;
>
> So how can we deal with the firmware between mec version(402) and mec version(421)?
> It will realize write/wait command in CP firmware but it doesn't have dummy read.
>
> BR,
> Changfeng.
>
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Friday, October 25, 2019 11:54 PM
> To: Zhu, Changfeng <Changfeng.Zhu@amd.com>; 
> amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Pelloux-prayer, 
> Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com>; Huang, Ray 
> <Ray.Huang@amd.com>; Tuikov, Luben <Luben.Tuikov@amd.com>
> Subject: Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle 
> delay
>
> Hi Changfeng,
>
> that won't work, you can't add this to 
> amdgpu_ring_emit_reg_write_reg_wait_helper or break all read triggered registers (like the semaphore ones).
>
> Additional to that it will never work on GFX9, since the CP firmware there uses the integrated write/wait command and you can't add an additional dummy read there.
>
> Regards,
> Christian.
>
> Am 25.10.19 um 16:22 schrieb Zhu, Changfeng:
>> I try to write a patch based on the patch of Tuikov,Luben.
>>
>> Inspired by Luben,here is the patch:
>>
>>   From 1980d8f1ed44fb9a84a5ea1f6e2edd2bc25c629a Mon Sep 17 00:00:00
>> 2001
>> From: changzhu <Changfeng.Zhu@amd.com>
>> Date: Thu, 10 Oct 2019 11:02:33 +0800
>> Subject: [PATCH] drm/amdgpu: add dummy read by engines for some GCVM status
>>    registers
>>
>> The GRBM register interface is now capable of bursting 1 cycle per 
>> register wr->wr, wr->rd much faster than previous muticycle per 
>> transaction done interface.  This has caused a problem where status 
>> registers requiring HW to update have a 1 cycle delay, due to the 
>> register update having to go through GRBM.
>>
>> SW may operate on an incorrect value if they write a register and 
>> immediately check the corresponding status register.
>>
>> Registers requiring HW to clear or set fields may be delayed by 1 cycle.
>> For example,
>>
>> 1. write VM_INVALIDATE_ENG0_REQ mask = 5a 2. read 
>> VM_INVALIDATE_ENG0_ACKb till the ack is same as the request mask = 5a
>>       	a. HW will reset VM_INVALIDATE_ENG0_ACK = 0 until invalidation 
>> is complete 3. write VM_INVALIDATE_ENG0_REQ mask = 5a 4. read 
>> VM_INVALIDATE_ENG0_ACK till the ack is same as the request mask = 5a
>> 	a. First read of VM_INVALIDATE_ENG0_ACK = 5a instead of 0
>> 	b. Second read of VM_INVALIDATE_ENG0_ACK = 0 because the remote GRBM h/w
>> 	   register takes one extra cycle to be cleared
>> 	c. In this case,SW wil see a false ACK if they exit on first read
>>
>> Affected registers (only GC variant)  | Recommended Dummy Read
>> --------------------------------------+----------------------------
>> VM_INVALIDATE_ENG*_ACK		      |  VM_INVALIDATE_ENG*_REQ
>> VM_L2_STATUS			      |  VM_L2_STATUS
>> VM_L2_PROTECTION_FAULT_STATUS	      |  VM_L2_PROTECTION_FAULT_STATUS
>> VM_L2_PROTECTION_FAULT_ADDR_HI/LO32   |  VM_L2_PROTECTION_FAULT_ADDR_HI/LO32
>> VM_L2_IH_LOG_BUSY		      |  VM_L2_IH_LOG_BUSY
>> MC_VM_L2_PERFCOUNTER_HI/LO	      |  MC_VM_L2_PERFCOUNTER_HI/LO
>> ATC_L2_PERFCOUNTER_HI/LO	      |  ATC_L2_PERFCOUNTER_HI/LO
>> ATC_L2_PERFCOUNTER2_HI/LO	      |  ATC_L2_PERFCOUNTER2_HI/LO
>>
>> It also needs dummy read by engines for these gc registers.
>>
>> Change-Id: Ie028f37eb789966d4593984bd661b248ebeb1ac3
>> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c |  5 +++++
>>    drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   |  2 ++
>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    |  2 ++
>>    drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c   |  4 ++++
>>    drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c   | 18 ++++++++++++++++++
>>    5 files changed, 31 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> index 4b3f58dbf36f..c2fbf6087ecf 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> @@ -392,6 +392,11 @@ void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
>>    						uint32_t ref, uint32_t mask)
>>    {
>>    	amdgpu_ring_emit_wreg(ring, reg0, ref);
>> +
>> +	/* wait for a cycle to reset vm_inv_eng0_ack */
>> +	if (ring->funcs->vmhub == AMDGPU_GFXHUB_0)
>> +		amdgpu_ring_emit_rreg(ring, reg0);
>> +
>>    	amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask);
>>    }
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> index ef1975a5323a..104c47734316 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> @@ -5155,6 +5155,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>>    	.patch_cond_exec = gfx_v10_0_ring_emit_patch_cond_exec,
>>    	.preempt_ib = gfx_v10_0_ring_preempt_ib,
>>    	.emit_tmz = gfx_v10_0_ring_emit_tmz,
>> +	.emit_rreg = gfx_v10_0_ring_emit_rreg,
>>    	.emit_wreg = gfx_v10_0_ring_emit_wreg,
>>    	.emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
>>    };
>> @@ -5188,6 +5189,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>>    	.test_ib = gfx_v10_0_ring_test_ib,
>>    	.insert_nop = amdgpu_ring_insert_nop,
>>    	.pad_ib = amdgpu_ring_generic_pad_ib,
>> +	.emit_rreg = gfx_v10_0_ring_emit_rreg,
>>    	.emit_wreg = gfx_v10_0_ring_emit_wreg,
>>    	.emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
>>    };
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> index 2f03bf533d41..d00b53de0fdc 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> @@ -6253,6 +6253,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>>    	.init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec,
>>    	.patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec,
>>    	.emit_tmz = gfx_v9_0_ring_emit_tmz,
>> +	.emit_rreg = gfx_v9_0_ring_emit_rreg,
>>    	.emit_wreg = gfx_v9_0_ring_emit_wreg,
>>    	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>>    	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
>> @@ -6289,6 +6290,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>>    	.insert_nop = amdgpu_ring_insert_nop,
>>    	.pad_ib = amdgpu_ring_generic_pad_ib,
>>    	.set_priority = gfx_v9_0_ring_set_priority_compute,
>> +	.emit_rreg = gfx_v9_0_ring_emit_rreg,
>>    	.emit_wreg = gfx_v9_0_ring_emit_wreg,
>>    	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>>    	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> index 3b00bce14cfb..dce6b651da1f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> @@ -346,6 +346,10 @@ static uint64_t
>> gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>    
>>    	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>>    
>> +	/* wait for a cycle to reset vm_inv_eng0_ack */
>> +	if (ring->funcs->vmhub == AMDGPU_GFXHUB_0)
>> +		amdgpu_ring_emit_rreg(ring, hub->vm_inv_eng0_req + eng);
>> +
>>    	/* wait for the invalidate to complete */
>>    	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>>    				  1 << vmid, 1 << vmid);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>> b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>> index 3460c00f3eaa..baaa33467882 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>> @@ -38,6 +38,7 @@
>>    #include "navi10_sdma_pkt_open.h"
>>    #include "nbio_v2_3.h"
>>    #include "sdma_v5_0.h"
>> +#include "nvd.h"
>>    
>>    MODULE_FIRMWARE("amdgpu/navi10_sdma.bin");
>>    MODULE_FIRMWARE("amdgpu/navi10_sdma1.bin");
>> @@ -1147,6 +1148,22 @@ static void sdma_v5_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
>>    	amdgpu_gmc_emit_flush_gpu_tlb(ring, vmid, pd_addr);
>>    }
>>    
>> +static void sdma_v5_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>> +uint32_t reg) {
>> +	struct amdgpu_device *adev = ring->adev;
>> +
>> +	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>> +	amdgpu_ring_write(ring, 0 | /* src: register*/
>> +				(5 << 8) |  /* dst: memory */
>> +				(1 << 20)); /* write confirm */
>> +	amdgpu_ring_write(ring, reg);
>> +	amdgpu_ring_write(ring, 0);
>> +	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> +				adev->virt.reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> +				adev->virt.reg_val_offs * 4));
>> +}
>> +
>>    static void sdma_v5_0_ring_emit_wreg(struct amdgpu_ring *ring,
>>    				     uint32_t reg, uint32_t val)
>>    {
>> @@ -1597,6 +1614,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>>    	.test_ib = sdma_v5_0_ring_test_ib,
>>    	.insert_nop = sdma_v5_0_ring_insert_nop,
>>    	.pad_ib = sdma_v5_0_ring_pad_ib,
>> +	.emit_rreg = sdma_v5_0_ring_emit_rreg,
>>    	.emit_wreg = sdma_v5_0_ring_emit_wreg,
>>    	.emit_reg_wait = sdma_v5_0_ring_emit_reg_wait,
>>    	.init_cond_exec = sdma_v5_0_ring_init_cond_exec,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* RE: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-28 12:07                     ` Zhu, Changfeng
  0 siblings, 0 replies; 30+ messages in thread
From: Zhu, Changfeng @ 2019-10-28 12:07 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Huang, Ray,
	Tuikov, Luben

Hi Christian,

Should we also implement the function gfx_v9_0_wait_reg_mem in gfx10, as in gfx9, since gfx10 also implements the write/wait command in a single packet after CL#1761300?

Or we can add dummy read in gmc10 by using emit_wait like Luben's way?

BR,
Changfeng. 

-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com> 
Sent: Monday, October 28, 2019 6:47 PM
To: Zhu, Changfeng <Changfeng.Zhu@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Pelloux-prayer, Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Tuikov, Luben <Luben.Tuikov@amd.com>
Subject: Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay

Hi Changfeng,

> So how can we deal with the firmware between mec version(402) and mec version(421)?
Well, off hand I see only two options: either print a warning or completely reject loading the driver.

Completely rejecting loading the driver is probably not a good idea and the issue is actually extremely unlikely to cause any problems.

So printing a warning that the user should update their firmware is probably the best approach.

Regards,
Christian.

Am 28.10.19 um 04:01 schrieb Zhu, Changfeng:
> Hi Christian,
>
> Re- that won't work, you can't add this to 
> amdgpu_ring_emit_reg_write_reg_wait_helper or break all read triggered registers (like the semaphore ones).
>
> Do you mean that I should use reg_wait registers (wait_reg_mem), as Luben does, to replace read-triggered registers for adding the dummy read?
>
> Re-Additional to that it will never work on GFX9, since the CP firmware there uses the integrated write/wait command and you can't add an additional dummy read there.
>
> Yes, I see the integrated write/wait command and they are realized in gfx_v9_0_wait_reg_mem:
> Emily's patch:
> drm/amdgpu: Remove the sriov checking and add firmware checking 
> decides when to go into gfx_v9_0_wait_reg_mem and when go into amdgpu_ring_emit_reg_write_reg_wait_helper.
>
> However there are two problems now.
> 1.Before the fw_version_ok fw version, the code goes into amdgpu_ring_emit_reg_write_reg_wait_helper. In this case, should not we add dummy read in amdgpu_ring_emit_reg_write_reg_wait_helper?
> 2.After the fw_version_ok fw version, the code goes into gfx_v9_0_wait_reg_mem. However, it realizes write/wait command in firmware. Then how can we add this dummy read? According to Yang,Zilong, the CP firmware has realized dummy in firmware in CL:
> Vega20 CL#1762470 @3/27/2019
> Navi10 CL#1761300 @3/25/2019
> According to CL#1762470,
> The firmware which realized dummy read is(Raven for example):
> Mec version:
> #define F32_MEC_UCODE_VERSION "#421"
> #define F32_MEC_FEATURE_VERSION 46
> Pfp version:
> #define F32_PFP_UCODE_VERSION "#183"
> #define F32_PFP_FEATURE_VERSION 46
> In Emily's patch:
> The CP firmware which uses the integrated write/wait command begins from version:
> +       case CHIP_RAVEN:
> +               if ((adev->gfx.me_fw_version >= 0x0000009c) &&
> +                   (adev->gfx.me_feature_version >= 42) &&
> +                   (adev->gfx.pfp_fw_version >=  0x000000b1(177)) &&
> +                   (adev->gfx.pfp_feature_version >= 42))
> +                       adev->gfx.me_fw_write_wait = true;
> +
> +               if ((adev->gfx.mec_fw_version >=  0x00000192(402)) &&
> +                   (adev->gfx.mec_feature_version >= 42))
> +                       adev->gfx.mec_fw_write_wait = true;
> +               break;
>
> So how can we deal with the firmware between mec version(402) and mec version(421)?
> It will realize write/wait command in CP firmware but it doesn't have dummy read.
>
> BR,
> Changfeng.
>
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Friday, October 25, 2019 11:54 PM
> To: Zhu, Changfeng <Changfeng.Zhu@amd.com>; 
> amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Pelloux-prayer, 
> Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com>; Huang, Ray 
> <Ray.Huang@amd.com>; Tuikov, Luben <Luben.Tuikov@amd.com>
> Subject: Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle 
> delay
>
> Hi Changfeng,
>
> that won't work, you can't add this to 
> amdgpu_ring_emit_reg_write_reg_wait_helper or break all read triggered registers (like the semaphore ones).
>
> Additional to that it will never work on GFX9, since the CP firmware there uses the integrated write/wait command and you can't add an additional dummy read there.
>
> Regards,
> Christian.
>
> Am 25.10.19 um 16:22 schrieb Zhu, Changfeng:
>> I try to write a patch based on the patch of Tuikov,Luben.
>>
>> Inspired by Luben,here is the patch:
>>
>>   From 1980d8f1ed44fb9a84a5ea1f6e2edd2bc25c629a Mon Sep 17 00:00:00
>> 2001
>> From: changzhu <Changfeng.Zhu@amd.com>
>> Date: Thu, 10 Oct 2019 11:02:33 +0800
>> Subject: [PATCH] drm/amdgpu: add dummy read by engines for some GCVM status
>>    registers
>>
>> The GRBM register interface is now capable of bursting 1 cycle per 
>> register wr->wr, wr->rd much faster than previous muticycle per 
>> transaction done interface.  This has caused a problem where status 
>> registers requiring HW to update have a 1 cycle delay, due to the 
>> register update having to go through GRBM.
>>
>> SW may operate on an incorrect value if they write a register and 
>> immediately check the corresponding status register.
>>
>> Registers requiring HW to clear or set fields may be delayed by 1 cycle.
>> For example,
>>
>> 1. write VM_INVALIDATE_ENG0_REQ mask = 5a 2. read 
>> VM_INVALIDATE_ENG0_ACKb till the ack is same as the request mask = 5a
>>       	a. HW will reset VM_INVALIDATE_ENG0_ACK = 0 until invalidation 
>> is complete 3. write VM_INVALIDATE_ENG0_REQ mask = 5a 4. read 
>> VM_INVALIDATE_ENG0_ACK till the ack is same as the request mask = 5a
>> 	a. First read of VM_INVALIDATE_ENG0_ACK = 5a instead of 0
>> 	b. Second read of VM_INVALIDATE_ENG0_ACK = 0 because the remote GRBM h/w
>> 	   register takes one extra cycle to be cleared
>> 	c. In this case,SW wil see a false ACK if they exit on first read
>>
>> Affected registers (only GC variant)  | Recommended Dummy Read
>> --------------------------------------+----------------------------
>> VM_INVALIDATE_ENG*_ACK		      |  VM_INVALIDATE_ENG*_REQ
>> VM_L2_STATUS			      |  VM_L2_STATUS
>> VM_L2_PROTECTION_FAULT_STATUS	      |  VM_L2_PROTECTION_FAULT_STATUS
>> VM_L2_PROTECTION_FAULT_ADDR_HI/LO32   |  VM_L2_PROTECTION_FAULT_ADDR_HI/LO32
>> VM_L2_IH_LOG_BUSY		      |  VM_L2_IH_LOG_BUSY
>> MC_VM_L2_PERFCOUNTER_HI/LO	      |  MC_VM_L2_PERFCOUNTER_HI/LO
>> ATC_L2_PERFCOUNTER_HI/LO	      |  ATC_L2_PERFCOUNTER_HI/LO
>> ATC_L2_PERFCOUNTER2_HI/LO	      |  ATC_L2_PERFCOUNTER2_HI/LO
>>
>> It also needs dummy read by engines for these gc registers.
>>
>> Change-Id: Ie028f37eb789966d4593984bd661b248ebeb1ac3
>> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c |  5 +++++
>>    drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   |  2 ++
>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    |  2 ++
>>    drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c   |  4 ++++
>>    drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c   | 18 ++++++++++++++++++
>>    5 files changed, 31 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> index 4b3f58dbf36f..c2fbf6087ecf 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> @@ -392,6 +392,11 @@ void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
>>    						uint32_t ref, uint32_t mask)
>>    {
>>    	amdgpu_ring_emit_wreg(ring, reg0, ref);
>> +
>> +	/* wait for a cycle to reset vm_inv_eng0_ack */
>> +	if (ring->funcs->vmhub == AMDGPU_GFXHUB_0)
>> +		amdgpu_ring_emit_rreg(ring, reg0);
>> +
>>    	amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask);
>>    }
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> index ef1975a5323a..104c47734316 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> @@ -5155,6 +5155,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>>    	.patch_cond_exec = gfx_v10_0_ring_emit_patch_cond_exec,
>>    	.preempt_ib = gfx_v10_0_ring_preempt_ib,
>>    	.emit_tmz = gfx_v10_0_ring_emit_tmz,
>> +	.emit_rreg = gfx_v10_0_ring_emit_rreg,
>>    	.emit_wreg = gfx_v10_0_ring_emit_wreg,
>>    	.emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
>>    };
>> @@ -5188,6 +5189,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>>    	.test_ib = gfx_v10_0_ring_test_ib,
>>    	.insert_nop = amdgpu_ring_insert_nop,
>>    	.pad_ib = amdgpu_ring_generic_pad_ib,
>> +	.emit_rreg = gfx_v10_0_ring_emit_rreg,
>>    	.emit_wreg = gfx_v10_0_ring_emit_wreg,
>>    	.emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
>>    };
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> index 2f03bf533d41..d00b53de0fdc 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> @@ -6253,6 +6253,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>>    	.init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec,
>>    	.patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec,
>>    	.emit_tmz = gfx_v9_0_ring_emit_tmz,
>> +	.emit_rreg = gfx_v9_0_ring_emit_rreg,
>>    	.emit_wreg = gfx_v9_0_ring_emit_wreg,
>>    	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>>    	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
>> @@ -6289,6 +6290,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>>    	.insert_nop = amdgpu_ring_insert_nop,
>>    	.pad_ib = amdgpu_ring_generic_pad_ib,
>>    	.set_priority = gfx_v9_0_ring_set_priority_compute,
>> +	.emit_rreg = gfx_v9_0_ring_emit_rreg,
>>    	.emit_wreg = gfx_v9_0_ring_emit_wreg,
>>    	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>>    	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> index 3b00bce14cfb..dce6b651da1f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> @@ -346,6 +346,10 @@ static uint64_t
>> gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>    
>>    	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>>    
>> +	/* wait for a cycle to reset vm_inv_eng0_ack */
>> +	if (ring->funcs->vmhub == AMDGPU_GFXHUB_0)
>> +		amdgpu_ring_emit_rreg(ring, hub->vm_inv_eng0_req + eng);
>> +
>>    	/* wait for the invalidate to complete */
>>    	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>>    				  1 << vmid, 1 << vmid);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>> b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>> index 3460c00f3eaa..baaa33467882 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>> @@ -38,6 +38,7 @@
>>    #include "navi10_sdma_pkt_open.h"
>>    #include "nbio_v2_3.h"
>>    #include "sdma_v5_0.h"
>> +#include "nvd.h"
>>    
>>    MODULE_FIRMWARE("amdgpu/navi10_sdma.bin");
>>    MODULE_FIRMWARE("amdgpu/navi10_sdma1.bin");
>> @@ -1147,6 +1148,22 @@ static void sdma_v5_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
>>    	amdgpu_gmc_emit_flush_gpu_tlb(ring, vmid, pd_addr);
>>    }
>>    
>> +static void sdma_v5_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>> +uint32_t reg) {
>> +	struct amdgpu_device *adev = ring->adev;
>> +
>> +	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>> +	amdgpu_ring_write(ring, 0 | /* src: register*/
>> +				(5 << 8) |  /* dst: memory */
>> +				(1 << 20)); /* write confirm */
>> +	amdgpu_ring_write(ring, reg);
>> +	amdgpu_ring_write(ring, 0);
>> +	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> +				adev->virt.reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> +				adev->virt.reg_val_offs * 4));
>> +}
>> +
>>    static void sdma_v5_0_ring_emit_wreg(struct amdgpu_ring *ring,
>>    				     uint32_t reg, uint32_t val)
>>    {
>> @@ -1597,6 +1614,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>>    	.test_ib = sdma_v5_0_ring_test_ib,
>>    	.insert_nop = sdma_v5_0_ring_insert_nop,
>>    	.pad_ib = sdma_v5_0_ring_pad_ib,
>> +	.emit_rreg = sdma_v5_0_ring_emit_rreg,
>>    	.emit_wreg = sdma_v5_0_ring_emit_wreg,
>>    	.emit_reg_wait = sdma_v5_0_ring_emit_reg_wait,
>>    	.init_cond_exec = sdma_v5_0_ring_init_cond_exec,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* RE: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-28 13:38 ` Koenig, Christian
  0 siblings, 0 replies; 30+ messages in thread
From: Koenig, Christian @ 2019-10-28 13:38 UTC (permalink / raw)
  To: Zhu, Changfeng
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Huang, Ray,
	Tuikov, Luben, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 13495 bytes --]

I think we should implement the write/wait combined command in gfx10.

Did we ever release any firmware which couldn't do this?

Christian.

Am 28.10.2019 13:07 schrieb "Zhu, Changfeng" <Changfeng.Zhu@amd.com>:
Hi Christian,

Should we also implement the function gfx_v9_0_wait_reg_mem in gfx10, as in gfx9, since gfx10 also implements the write/wait command in a single packet after CL#1761300?

Or we can add dummy read in gmc10 by using emit_wait like Luben's way?

BR,
Changfeng.

-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com>
Sent: Monday, October 28, 2019 6:47 PM
To: Zhu, Changfeng <Changfeng.Zhu@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Pelloux-prayer, Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Tuikov, Luben <Luben.Tuikov@amd.com>
Subject: Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay

Hi Changfeng,

> So how can we deal with the firmware between mec version(402) and mec version(421)?
Well, off hand I see only two options: either print a warning or completely reject loading the driver.

Completely rejecting loading the driver is probably not a good idea and the issue is actually extremely unlikely to cause any problems.

So printing a warning that the user should update their firmware is probably the best approach.

Regards,
Christian.

Am 28.10.19 um 04:01 schrieb Zhu, Changfeng:
> Hi Christian,
>
> Re- that won't work, you can't add this to
> amdgpu_ring_emit_reg_write_reg_wait_helper or break all read triggered registers (like the semaphore ones).
>
> Do you mean that I should use reg_wait registers(wait_reg_mem) like Luben to replace read triggered registers for adding dummy read?
>
> Re-Additional to that it will never work on GFX9, since the CP firmware there uses the integrated write/wait command and you can't add an additional dummy read there.
>
> Yes, I see the integrated write/wait command and they are realized in gfx_v9_0_wait_reg_mem:
> Emily's patch:
> drm/amdgpu: Remove the sriov checking and add firmware checking
> decides when to go into gfx_v9_0_wait_reg_mem and when go into amdgpu_ring_emit_reg_write_reg_wait_helper.
>
> However there are two problems now.
> 1.Before the fw_version_ok fw version, the code goes into amdgpu_ring_emit_reg_write_reg_wait_helper. In this case, should not we add dummy read in amdgpu_ring_emit_reg_write_reg_wait_helper?
> 2.After the fw_version_ok fw version, the code goes into gfx_v9_0_wait_reg_mem. However, it realizes write/wait command in firmware. Then how can we add this dummy read? According to Yang,Zilong, the CP firmware has realized dummy in firmware in CL:
> Vega20 CL#1762470 @3/27/2019
> Navi10 CL#1761300 @3/25/2019
> According to CL#1762470,
> The firmware which realized dummy read is(Raven for example):
> Mec version:
> #define F32_MEC_UCODE_VERSION "#421"
> #define F32_MEC_FEATURE_VERSION 46
> Pfp version:
> #define F32_PFP_UCODE_VERSION "#183"
> #define F32_PFP_FEATURE_VERSION 46
> In Emily's patch:
> The CP firmware which uses the integrated write/wait command begins from version:
> +       case CHIP_RAVEN:
> +               if ((adev->gfx.me_fw_version >= 0x0000009c) &&
> +                   (adev->gfx.me_feature_version >= 42) &&
> +                   (adev->gfx.pfp_fw_version >=  0x000000b1(177)) &&
> +                   (adev->gfx.pfp_feature_version >= 42))
> +                       adev->gfx.me_fw_write_wait = true;
> +
> +               if ((adev->gfx.mec_fw_version >=  0x00000192(402)) &&
> +                   (adev->gfx.mec_feature_version >= 42))
> +                       adev->gfx.mec_fw_write_wait = true;
> +               break;
>
> So how can we deal with the firmware between mec version(402) and mec version(421)?
> It will realize write/wait command in CP firmware but it doesn't have dummy read.
>
> BR,
> Changfeng.
>
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Friday, October 25, 2019 11:54 PM
> To: Zhu, Changfeng <Changfeng.Zhu@amd.com>;
> amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Pelloux-prayer,
> Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com>; Huang, Ray
> <Ray.Huang@amd.com>; Tuikov, Luben <Luben.Tuikov@amd.com>
> Subject: Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle
> delay
>
> Hi Changfeng,
>
> that won't work, you can't add this to
> amdgpu_ring_emit_reg_write_reg_wait_helper or break all read triggered registers (like the semaphore ones).
>
> Additional to that it will never work on GFX9, since the CP firmware there uses the integrated write/wait command and you can't add an additional dummy read there.
>
> Regards,
> Christian.
>
> Am 25.10.19 um 16:22 schrieb Zhu, Changfeng:
>> I try to write a patch based on the patch of Tuikov,Luben.
>>
>> Inspired by Luben,here is the patch:
>>
>>   From 1980d8f1ed44fb9a84a5ea1f6e2edd2bc25c629a Mon Sep 17 00:00:00
>> 2001
>> From: changzhu <Changfeng.Zhu@amd.com>
>> Date: Thu, 10 Oct 2019 11:02:33 +0800
>> Subject: [PATCH] drm/amdgpu: add dummy read by engines for some GCVM status
>>    registers
>>
>> The GRBM register interface is now capable of bursting 1 cycle per
>> register wr->wr, wr->rd much faster than previous muticycle per
>> transaction done interface.  This has caused a problem where status
>> registers requiring HW to update have a 1 cycle delay, due to the
>> register update having to go through GRBM.
>>
>> SW may operate on an incorrect value if they write a register and
>> immediately check the corresponding status register.
>>
>> Registers requiring HW to clear or set fields may be delayed by 1 cycle.
>> For example,
>>
>> 1. write VM_INVALIDATE_ENG0_REQ mask = 5a 2. read
>> VM_INVALIDATE_ENG0_ACKb till the ack is same as the request mask = 5a
>>               a. HW will reset VM_INVALIDATE_ENG0_ACK = 0 until invalidation
>> is complete 3. write VM_INVALIDATE_ENG0_REQ mask = 5a 4. read
>> VM_INVALIDATE_ENG0_ACK till the ack is same as the request mask = 5a
>>       a. First read of VM_INVALIDATE_ENG0_ACK = 5a instead of 0
>>       b. Second read of VM_INVALIDATE_ENG0_ACK = 0 because the remote GRBM h/w
>>          register takes one extra cycle to be cleared
>>       c. In this case,SW wil see a false ACK if they exit on first read
>>
>> Affected registers (only GC variant)  | Recommended Dummy Read
>> --------------------------------------+----------------------------
>> VM_INVALIDATE_ENG*_ACK                     |  VM_INVALIDATE_ENG*_REQ
>> VM_L2_STATUS                       |  VM_L2_STATUS
>> VM_L2_PROTECTION_FAULT_STATUS              |  VM_L2_PROTECTION_FAULT_STATUS
>> VM_L2_PROTECTION_FAULT_ADDR_HI/LO32   |  VM_L2_PROTECTION_FAULT_ADDR_HI/LO32
>> VM_L2_IH_LOG_BUSY                  |  VM_L2_IH_LOG_BUSY
>> MC_VM_L2_PERFCOUNTER_HI/LO         |  MC_VM_L2_PERFCOUNTER_HI/LO
>> ATC_L2_PERFCOUNTER_HI/LO           |  ATC_L2_PERFCOUNTER_HI/LO
>> ATC_L2_PERFCOUNTER2_HI/LO          |  ATC_L2_PERFCOUNTER2_HI/LO
>>
>> It also needs dummy read by engines for these gc registers.
>>
>> Change-Id: Ie028f37eb789966d4593984bd661b248ebeb1ac3
>> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c |  5 +++++
>>    drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   |  2 ++
>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    |  2 ++
>>    drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c   |  4 ++++
>>    drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c   | 18 ++++++++++++++++++
>>    5 files changed, 31 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> index 4b3f58dbf36f..c2fbf6087ecf 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> @@ -392,6 +392,11 @@ void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
>>                                               uint32_t ref, uint32_t mask)
>>    {
>>       amdgpu_ring_emit_wreg(ring, reg0, ref);
>> +
>> +    /* wait for a cycle to reset vm_inv_eng0_ack */
>> +    if (ring->funcs->vmhub == AMDGPU_GFXHUB_0)
>> +            amdgpu_ring_emit_rreg(ring, reg0);
>> +
>>       amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask);
>>    }
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> index ef1975a5323a..104c47734316 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> @@ -5155,6 +5155,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>>       .patch_cond_exec = gfx_v10_0_ring_emit_patch_cond_exec,
>>       .preempt_ib = gfx_v10_0_ring_preempt_ib,
>>       .emit_tmz = gfx_v10_0_ring_emit_tmz,
>> +    .emit_rreg = gfx_v10_0_ring_emit_rreg,
>>       .emit_wreg = gfx_v10_0_ring_emit_wreg,
>>       .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
>>    };
>> @@ -5188,6 +5189,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>>       .test_ib = gfx_v10_0_ring_test_ib,
>>       .insert_nop = amdgpu_ring_insert_nop,
>>       .pad_ib = amdgpu_ring_generic_pad_ib,
>> +    .emit_rreg = gfx_v10_0_ring_emit_rreg,
>>       .emit_wreg = gfx_v10_0_ring_emit_wreg,
>>       .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
>>    };
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> index 2f03bf533d41..d00b53de0fdc 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> @@ -6253,6 +6253,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>>       .init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec,
>>       .patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec,
>>       .emit_tmz = gfx_v9_0_ring_emit_tmz,
>> +    .emit_rreg = gfx_v9_0_ring_emit_rreg,
>>       .emit_wreg = gfx_v9_0_ring_emit_wreg,
>>       .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>>       .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
>> @@ -6289,6 +6290,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>>       .insert_nop = amdgpu_ring_insert_nop,
>>       .pad_ib = amdgpu_ring_generic_pad_ib,
>>       .set_priority = gfx_v9_0_ring_set_priority_compute,
>> +    .emit_rreg = gfx_v9_0_ring_emit_rreg,
>>       .emit_wreg = gfx_v9_0_ring_emit_wreg,
>>       .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>>       .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> index 3b00bce14cfb..dce6b651da1f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> @@ -346,6 +346,10 @@ static uint64_t
>> gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>
>>       amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>>
>> +    /* wait for a cycle to reset vm_inv_eng0_ack */
>> +    if (ring->funcs->vmhub == AMDGPU_GFXHUB_0)
>> +            amdgpu_ring_emit_rreg(ring, hub->vm_inv_eng0_req + eng);
>> +
>>       /* wait for the invalidate to complete */
>>       amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>>                                 1 << vmid, 1 << vmid);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>> b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>> index 3460c00f3eaa..baaa33467882 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>> @@ -38,6 +38,7 @@
>>    #include "navi10_sdma_pkt_open.h"
>>    #include "nbio_v2_3.h"
>>    #include "sdma_v5_0.h"
>> +#include "nvd.h"
>>
>>    MODULE_FIRMWARE("amdgpu/navi10_sdma.bin");
>>    MODULE_FIRMWARE("amdgpu/navi10_sdma1.bin");
>> @@ -1147,6 +1148,22 @@ static void sdma_v5_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
>>       amdgpu_gmc_emit_flush_gpu_tlb(ring, vmid, pd_addr);
>>    }
>>
>> +static void sdma_v5_0_ring_emit_rreg(struct amdgpu_ring *ring,
>> +uint32_t reg) {
>> +    struct amdgpu_device *adev = ring->adev;
>> +
>> +    amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>> +    amdgpu_ring_write(ring, 0 | /* src: register*/
>> +                            (5 << 8) |  /* dst: memory */
>> +                            (1 << 20)); /* write confirm */
>> +    amdgpu_ring_write(ring, reg);
>> +    amdgpu_ring_write(ring, 0);
>> +    amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> +                            adev->virt.reg_val_offs * 4));
>> +    amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> +                            adev->virt.reg_val_offs * 4));
>> +}
>> +
>>    static void sdma_v5_0_ring_emit_wreg(struct amdgpu_ring *ring,
>>                                    uint32_t reg, uint32_t val)
>>    {
>> @@ -1597,6 +1614,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>>       .test_ib = sdma_v5_0_ring_test_ib,
>>       .insert_nop = sdma_v5_0_ring_insert_nop,
>>       .pad_ib = sdma_v5_0_ring_pad_ib,
>> +    .emit_rreg = sdma_v5_0_ring_emit_rreg,
>>       .emit_wreg = sdma_v5_0_ring_emit_wreg,
>>       .emit_reg_wait = sdma_v5_0_ring_emit_reg_wait,
>>       .init_cond_exec = sdma_v5_0_ring_init_cond_exec,



[-- Attachment #1.2: Type: text/html, Size: 21778 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* RE: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-28 13:38 ` Koenig, Christian
  0 siblings, 0 replies; 30+ messages in thread
From: Koenig, Christian @ 2019-10-28 13:38 UTC (permalink / raw)
  To: Zhu, Changfeng
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Huang, Ray,
	Tuikov, Luben, amd-gfx


[-- Attachment #1.1: Type: text/plain, Size: 13495 bytes --]

I think we should implement the write/wait combined command in gfx10.

Did we ever released any firmware which couldn't do this?

Christian.

Am 28.10.2019 13:07 schrieb "Zhu, Changfeng" <Changfeng.Zhu@amd.com>:
Hi Christian,

Should we also realize the function of gfx_v9_0_wait_reg_mem in gfx10 like gfx9 since gfx10 also realize write/wait command in a single packet after CL#1761300?

Or we can add dummy read in gmc10 by using emit_wait like Luben's way?

BR,
Changfeng.

-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com>
Sent: Monday, October 28, 2019 6:47 PM
To: Zhu, Changfeng <Changfeng.Zhu@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Pelloux-prayer, Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Tuikov, Luben <Luben.Tuikov@amd.com>
Subject: Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay

Hi Changfeng,

> So how can we deal with the firmware between mec version(402) and mec version(421)?
Well, offhand I see only two options: either print a warning or completely reject loading the driver.

Completely rejecting loading the driver is probably not a good idea and the issue is actually extremely unlikely to cause any problems.

So printing a warning that the user should update their firmware is probably the best approach.

Regards,
Christian.

Am 28.10.19 um 04:01 schrieb Zhu, Changfeng:
> Hi Christian,
>
> Re- that won't work, you can't add this to
> amdgpu_ring_emit_reg_write_reg_wait_helper or break all read triggered registers (like the semaphore ones).
>
> Do you mean that I should use reg_wait registers(wait_reg_mem) like Luben to replace read triggered registers for adding dummy read?
>
> Re-Additional to that it will never work on GFX9, since the CP firmware there uses the integrated write/wait command and you can't add an additional dummy read there.
>
> Yes, I see the integrated write/wait command and they are realized in gfx_v9_0_wait_reg_mem:
> Emily's patch:
> drm/amdgpu: Remove the sriov checking and add firmware checking
> decides when to go into gfx_v9_0_wait_reg_mem and when go into amdgpu_ring_emit_reg_write_reg_wait_helper.
>
> However there are two problems now.
> 1.Before the fw_version_ok fw version, the code goes into amdgpu_ring_emit_reg_write_reg_wait_helper. In this case, should not we add dummy read in amdgpu_ring_emit_reg_write_reg_wait_helper?
> 2.After the fw_version_ok fw version, the code goes into gfx_v9_0_wait_reg_mem. However, it realizes the write/wait command in firmware. Then how can we add this dummy read? According to Yang,Zilong, the CP firmware has realized the dummy read in firmware in CL:
> Vega20 CL#1762470 @3/27/2019
> Navi10 CL#1761300 @3/25/2019
> According to CL#1762470,
> The firmware which realized dummy read is(Raven for example):
> Mec version:
> #define F32_MEC_UCODE_VERSION "#421"
> #define F32_MEC_FEATURE_VERSION 46
> Pfp version:
> #define F32_PFP_UCODE_VERSION "#183"
> #define F32_PFP_FEATURE_VERSION 46
> In Emily's patch:
> The CP firmware which uses the integrated write/wait command begins from version:
> +       case CHIP_RAVEN:
> +               if ((adev->gfx.me_fw_version >= 0x0000009c) &&
> +                   (adev->gfx.me_feature_version >= 42) &&
> +                   (adev->gfx.pfp_fw_version >=  0x000000b1(177)) &&
> +                   (adev->gfx.pfp_feature_version >= 42))
> +                       adev->gfx.me_fw_write_wait = true;
> +
> +               if ((adev->gfx.mec_fw_version >=  0x00000192(402)) &&
> +                   (adev->gfx.mec_feature_version >= 42))
> +                       adev->gfx.mec_fw_write_wait = true;
> +               break;
>
> So how can we deal with the firmware between mec version(402) and mec version(421)?
> It will realize write/wait command in CP firmware but it doesn't have dummy read.
>
> BR,
> Changfeng.
>
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Friday, October 25, 2019 11:54 PM
> To: Zhu, Changfeng <Changfeng.Zhu@amd.com>;
> amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Pelloux-prayer,
> Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com>; Huang, Ray
> <Ray.Huang@amd.com>; Tuikov, Luben <Luben.Tuikov@amd.com>
> Subject: Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle
> delay
>
> Hi Changfeng,
>
> that won't work, you can't add this to
> amdgpu_ring_emit_reg_write_reg_wait_helper or break all read triggered registers (like the semaphore ones).
>
> Additional to that it will never work on GFX9, since the CP firmware there uses the integrated write/wait command and you can't add an additional dummy read there.
>
> Regards,
> Christian.
>
> Am 25.10.19 um 16:22 schrieb Zhu, Changfeng:
>> I try to write a patch based on the patch of Tuikov,Luben.
>>
>> Inspired by Luben,here is the patch:
>>
>>   From 1980d8f1ed44fb9a84a5ea1f6e2edd2bc25c629a Mon Sep 17 00:00:00
>> 2001
>> From: changzhu <Changfeng.Zhu@amd.com>
>> Date: Thu, 10 Oct 2019 11:02:33 +0800
>> Subject: [PATCH] drm/amdgpu: add dummy read by engines for some GCVM status
>>    registers
>>
>> The GRBM register interface is now capable of bursting 1 cycle per
>> register wr->wr, wr->rd much faster than previous multicycle per
>> transaction done interface.  This has caused a problem where status
>> registers requiring HW to update have a 1 cycle delay, due to the
>> register update having to go through GRBM.
>>
>> SW may operate on an incorrect value if they write a register and
>> immediately check the corresponding status register.
>>
>> Registers requiring HW to clear or set fields may be delayed by 1 cycle.
>> For example,
>>
>> 1. write VM_INVALIDATE_ENG0_REQ mask = 5a 2. read
>> VM_INVALIDATE_ENG0_ACKb till the ack is same as the request mask = 5a
>>               a. HW will reset VM_INVALIDATE_ENG0_ACK = 0 until invalidation
>> is complete 3. write VM_INVALIDATE_ENG0_REQ mask = 5a 4. read
>> VM_INVALIDATE_ENG0_ACK till the ack is same as the request mask = 5a
>>       a. First read of VM_INVALIDATE_ENG0_ACK = 5a instead of 0
>>       b. Second read of VM_INVALIDATE_ENG0_ACK = 0 because the remote GRBM h/w
>>          register takes one extra cycle to be cleared
>>       c. In this case, SW will see a false ACK if they exit on first read
>>
>> Affected registers (only GC variant)  | Recommended Dummy Read
>> --------------------------------------+----------------------------
>> VM_INVALIDATE_ENG*_ACK                     |  VM_INVALIDATE_ENG*_REQ
>> VM_L2_STATUS                       |  VM_L2_STATUS
>> VM_L2_PROTECTION_FAULT_STATUS              |  VM_L2_PROTECTION_FAULT_STATUS
>> VM_L2_PROTECTION_FAULT_ADDR_HI/LO32   |  VM_L2_PROTECTION_FAULT_ADDR_HI/LO32
>> VM_L2_IH_LOG_BUSY                  |  VM_L2_IH_LOG_BUSY
>> MC_VM_L2_PERFCOUNTER_HI/LO         |  MC_VM_L2_PERFCOUNTER_HI/LO
>> ATC_L2_PERFCOUNTER_HI/LO           |  ATC_L2_PERFCOUNTER_HI/LO
>> ATC_L2_PERFCOUNTER2_HI/LO          |  ATC_L2_PERFCOUNTER2_HI/LO
>>
>> It also needs dummy read by engines for these gc registers.
>>
>> Change-Id: Ie028f37eb789966d4593984bd661b248ebeb1ac3
>> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c |  5 +++++
>>    drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   |  2 ++
>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    |  2 ++
>>    drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c   |  4 ++++
>>    drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c   | 18 ++++++++++++++++++
>>    5 files changed, 31 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> index 4b3f58dbf36f..c2fbf6087ecf 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> @@ -392,6 +392,11 @@ void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
>>                                               uint32_t ref, uint32_t mask)
>>    {
>>       amdgpu_ring_emit_wreg(ring, reg0, ref);
>> +
>> +    /* wait for a cycle to reset vm_inv_eng0_ack */
>> +    if (ring->funcs->vmhub == AMDGPU_GFXHUB_0)
>> +            amdgpu_ring_emit_rreg(ring, reg0);
>> +
>>       amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask);
>>    }
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> index ef1975a5323a..104c47734316 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> @@ -5155,6 +5155,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>>       .patch_cond_exec = gfx_v10_0_ring_emit_patch_cond_exec,
>>       .preempt_ib = gfx_v10_0_ring_preempt_ib,
>>       .emit_tmz = gfx_v10_0_ring_emit_tmz,
>> +    .emit_rreg = gfx_v10_0_ring_emit_rreg,
>>       .emit_wreg = gfx_v10_0_ring_emit_wreg,
>>       .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
>>    };
>> @@ -5188,6 +5189,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>>       .test_ib = gfx_v10_0_ring_test_ib,
>>       .insert_nop = amdgpu_ring_insert_nop,
>>       .pad_ib = amdgpu_ring_generic_pad_ib,
>> +    .emit_rreg = gfx_v10_0_ring_emit_rreg,
>>       .emit_wreg = gfx_v10_0_ring_emit_wreg,
>>       .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
>>    };
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> index 2f03bf533d41..d00b53de0fdc 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> @@ -6253,6 +6253,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>>       .init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec,
>>       .patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec,
>>       .emit_tmz = gfx_v9_0_ring_emit_tmz,
>> +    .emit_rreg = gfx_v9_0_ring_emit_rreg,
>>       .emit_wreg = gfx_v9_0_ring_emit_wreg,
>>       .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>>       .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
>> @@ -6289,6 +6290,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>>       .insert_nop = amdgpu_ring_insert_nop,
>>       .pad_ib = amdgpu_ring_generic_pad_ib,
>>       .set_priority = gfx_v9_0_ring_set_priority_compute,
>> +    .emit_rreg = gfx_v9_0_ring_emit_rreg,
>>       .emit_wreg = gfx_v9_0_ring_emit_wreg,
>>       .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>>       .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> index 3b00bce14cfb..dce6b651da1f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> @@ -346,6 +346,10 @@ static uint64_t
>> gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>
>>       amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>>
>> +    /* wait for a cycle to reset vm_inv_eng0_ack */
>> +    if (ring->funcs->vmhub == AMDGPU_GFXHUB_0)
>> +            amdgpu_ring_emit_rreg(ring, hub->vm_inv_eng0_req + eng);
>> +
>>       /* wait for the invalidate to complete */
>>       amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>>                                 1 << vmid, 1 << vmid);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>> b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>> index 3460c00f3eaa..baaa33467882 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>> @@ -38,6 +38,7 @@
>>    #include "navi10_sdma_pkt_open.h"
>>    #include "nbio_v2_3.h"
>>    #include "sdma_v5_0.h"
>> +#include "nvd.h"
>>
>>    MODULE_FIRMWARE("amdgpu/navi10_sdma.bin");
>>    MODULE_FIRMWARE("amdgpu/navi10_sdma1.bin");
>> @@ -1147,6 +1148,22 @@ static void sdma_v5_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
>>       amdgpu_gmc_emit_flush_gpu_tlb(ring, vmid, pd_addr);
>>    }
>>
>> +static void sdma_v5_0_ring_emit_rreg(struct amdgpu_ring *ring,
>> +uint32_t reg) {
>> +    struct amdgpu_device *adev = ring->adev;
>> +
>> +    amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>> +    amdgpu_ring_write(ring, 0 | /* src: register*/
>> +                            (5 << 8) |  /* dst: memory */
>> +                            (1 << 20)); /* write confirm */
>> +    amdgpu_ring_write(ring, reg);
>> +    amdgpu_ring_write(ring, 0);
>> +    amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> +                            adev->virt.reg_val_offs * 4));
>> +    amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> +                            adev->virt.reg_val_offs * 4));
>> +}
>> +
>>    static void sdma_v5_0_ring_emit_wreg(struct amdgpu_ring *ring,
>>                                    uint32_t reg, uint32_t val)
>>    {
>> @@ -1597,6 +1614,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>>       .test_ib = sdma_v5_0_ring_test_ib,
>>       .insert_nop = sdma_v5_0_ring_insert_nop,
>>       .pad_ib = sdma_v5_0_ring_pad_ib,
>> +    .emit_rreg = sdma_v5_0_ring_emit_rreg,
>>       .emit_wreg = sdma_v5_0_ring_emit_wreg,
>>       .emit_reg_wait = sdma_v5_0_ring_emit_reg_wait,
>>       .init_cond_exec = sdma_v5_0_ring_init_cond_exec,



[-- Attachment #1.2: Type: text/html, Size: 21778 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-27 21:25                         ` Tuikov, Luben
  0 siblings, 0 replies; 30+ messages in thread
From: Tuikov, Luben @ 2019-10-27 21:25 UTC (permalink / raw)
  To: Koenig, Christian, Alex Deucher
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On 2019-10-26 08:09, Koenig, Christian wrote:
> Am 26.10.19 um 00:45 schrieb Tuikov, Luben:
>> On 2019-10-25 12:19 p.m., Koenig, Christian wrote:
>>> Am 25.10.19 um 18:05 schrieb Alex Deucher:
>>>> On Fri, Oct 25, 2019 at 2:49 AM Koenig, Christian
>>>> <Christian.Koenig@amd.com> wrote:
>>>>> Am 24.10.19 um 23:16 schrieb Tuikov, Luben:
>>>>>> The GRBM interface is now capable of bursting
>>>>>> 1-cycle op per register, a WRITE followed by
>>>>>> another WRITE, or a WRITE followed by a READ--much
>>>>>> faster than previous muti-cycle per
>>>>>> completed-transaction interface. This causes a
>>>>>> problem, whereby status registers requiring a
>>>>>> read/write by hardware, have a 1-cycle delay, due
>>>>>> to the register update having to go through GRBM
>>>>>> interface.
>>>>>>
>>>>>> This patch adds this delay.
>>>>>>
>>>>>> A one cycle read op is added after updating the
>>>>>> invalidate request and before reading the
>>>>>> invalidate-ACK status.
>>>>> Please completely drop all changes for GFX9 since this patch will most
>>>>> likely break SRIOV.
>>>>>
>>>>> Additional to that please apply the workaround only to SDMA since the CP
>>>>> driven engines should handle that in firmware.
>> Thank you Christian for reviewing this patch.
>>
>> This patch stirred quite a bit of noise. So, then, I'll go by
>> your last comment above--I suppose this is the desired way to go forward then?
> 
> You most likely broke the SRIOV use case on GFX9 with that, no wonder 
> that this raised eyebrows.
> 
> As far as I can see this manual workaround is only applicable to the 
> SDMA on Navi.

Did you see the (v2) patch?

Regards,
Luben

> 
> But we should double check that the CP firmware interface with the 
> combined write/wait command is correctly used on Navi/GFX10 as well. 
> IIRC that came in rather late for GFX9, could be that the Navi bringup 
> branch never had that.
> 
> Regards,
> Christian.
> 
>>
>> Regards,
>> Luben
>>
>>
>>>> I think the CP only handles this in firmware if we use the new TLB
>>>> invalidation packet.  I don't think it applies it to general register
>>>> writes like we do.
>>> No, on the CP we should use the combined write/wait command even if we
>>> don't use the new specialized VM invalidate command. Everything else
>>> won't work with SRIOV.
>>>
>>> Even if we want to we can't insert an extra read in this combined
>>> write/wait command. And if we split up the commands we would break SRIOV
>>> once more.
>>>
>>> So applying this workaround to the CP code doesn't make any sense at all.
>>>
>>> The only TODO which I can see is that we maybe don't use the combined
>>> write/wait command on Navi yet.
>>>
>>> Christian.
>>>
>>>> Alex
>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>>> See also commit
>>>>>> 534991731cb5fa94b5519957646cf849ca10d17d.
>>>>>>
>>>>>> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
>>>>>> ---
>>>>>>     drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
>>>>>>     drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 4 ++--
>>>>>>     drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
>>>>>>     drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 8 ++++++++
>>>>>>     drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
>>>>>>     5 files changed, 22 insertions(+), 5 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>>> index ac43b1af69e3..0042868dbd53 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>>> @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>>>>>>                 5 + /* COND_EXEC */
>>>>>>                 7 + /* PIPELINE_SYNC */
>>>>>>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>>>                 2 + /* VM_FLUSH */
>>>>>>                 8 + /* FENCE for VM_FLUSH */
>>>>>>                 20 + /* GDS switch */
>>>>>> @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>>>>>>                 5 + /* hdp invalidate */
>>>>>>                 7 + /* gfx_v10_0_ring_emit_pipeline_sync */
>>>>>>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>>>                 2 + /* gfx_v10_0_ring_emit_vm_flush */
>>>>>>                 8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
>>>>>>         .emit_ib_size = 7, /* gfx_v10_0_ring_emit_ib_compute */
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>>> index 9fe95e7693d5..9a7a717208de 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>>> @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>>>>>>                 5 +  /* COND_EXEC */
>>>>>>                 7 +  /* PIPELINE_SYNC */
>>>>>>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>>>                 2 + /* VM_FLUSH */
>>>>>>                 8 +  /* FENCE for VM_FLUSH */
>>>>>>                 20 + /* GDS switch */
>>>>>> @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>>>>>>                 5 + /* hdp invalidate */
>>>>>>                 7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>>>>>>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>>>                 2 + /* gfx_v9_0_ring_emit_vm_flush */
>>>>>>                 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
>>>>>>         .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>>> index 6e1b25bd1fe7..100d526e9a42 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>>> @@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>>>>
>>>>>>         amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>>>>>>
>>>>>> +     /* Insert a dummy read to delay one cycle before the ACK
>>>>>> +      * inquiry.
>>>>>> +      */
>>>>>> +     if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
>>>>>> +         ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
>>>>>> +         ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>>>>> +             amdgpu_ring_emit_reg_wait(ring,
>>>>>> +                                       hub->vm_inv_eng0_req + eng, 0, 0);
>>>>>> +
>>>>>>         /* wait for the invalidate to complete */
>>>>>>         amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>>>>>>                                   1 << vmid, 1 << vmid);
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>> index 9f2a893871ec..8f3097e45299 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>> @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>>>>         amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>>>>>>                               upper_32_bits(pd_addr));
>>>>>>
>>>>>> +     /* Insert a dummy read to delay one cycle before the ACK
>>>>>> +      * inquiry.
>>>>>> +      */
>>>>>> +     if (ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
>>>>>> +         ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>>>>> +             amdgpu_ring_emit_reg_wait(ring,
>>>>>> +                                       hub->vm_inv_eng0_req + eng, 0, 0);
>>>>>> +
>>>>>>         amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
>>>>>>                                             hub->vm_inv_eng0_ack + eng,
>>>>>>                                             req, 1 << vmid);
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>>>> index b8fdb192f6d6..0c41b4fdc58b 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>>>> @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>>>>>>                 6 + /* sdma_v5_0_ring_emit_pipeline_sync */
>>>>>>                 /* sdma_v5_0_ring_emit_vm_flush */
>>>>>>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
>>>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
>>>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
>>>>>>                 10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
>>>>>>         .emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
>>>>>>         .emit_ib = sdma_v5_0_ring_emit_ib,
>>>>> _______________________________________________
>>>>> amd-gfx mailing list
>>>>> amd-gfx@lists.freedesktop.org
>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> 

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-27 21:25                         ` Tuikov, Luben
  0 siblings, 0 replies; 30+ messages in thread
From: Tuikov, Luben @ 2019-10-27 21:25 UTC (permalink / raw)
  To: Koenig, Christian, Alex Deucher
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, amd-gfx

On 2019-10-26 08:09, Koenig, Christian wrote:
> Am 26.10.19 um 00:45 schrieb Tuikov, Luben:
>> On 2019-10-25 12:19 p.m., Koenig, Christian wrote:
>>> Am 25.10.19 um 18:05 schrieb Alex Deucher:
>>>> On Fri, Oct 25, 2019 at 2:49 AM Koenig, Christian
>>>> <Christian.Koenig@amd.com> wrote:
>>>>> Am 24.10.19 um 23:16 schrieb Tuikov, Luben:
>>>>>> The GRBM interface is now capable of bursting
>>>>>> 1-cycle op per register, a WRITE followed by
>>>>>> another WRITE, or a WRITE followed by a READ--much
>>>>>> faster than previous muti-cycle per
>>>>>> completed-transaction interface. This causes a
>>>>>> problem, whereby status registers requiring a
>>>>>> read/write by hardware, have a 1-cycle delay, due
>>>>>> to the register update having to go through GRBM
>>>>>> interface.
>>>>>>
>>>>>> This patch adds this delay.
>>>>>>
>>>>>> A one cycle read op is added after updating the
>>>>>> invalidate request and before reading the
>>>>>> invalidate-ACK status.
>>>>> Please completely drop all changes for GFX9 since this patch will most
>>>>> likely break SRIOV.
>>>>>
>>>>> Additional to that please apply the workaround only to SDMA since the CP
>>>>> driven engines should handle that in firmware.
>> Thank you Christian for reviewing this patch.
>>
>> This patch stirred quite a bit of noise. So, then, I'll go by
>> your last comment above--I suppose this is the desired way to go forward then?
> 
> You most likely broke the SRIOV use case on GFX9 with that, no wonder 
> that this raised eyebrows.
> 
> As far as I can see this manual workaround is only applicable to the 
> SDMA on Navi.

Did you see the (v2) patch?

Regards,
Luben

> 
> But we should double check that the CP firmware interface with the 
> combined write/wait command is correctly used on Navi/GFX10 as well. 
> IIRC that came in rather late for GFX9, could be that the Navi bringup 
> branch never had that.
> 
> Regards,
> Christian.
> 
>>
>> Regards,
>> Luben
>>
>>
>>>> I think the CP only handles this in firmware if we use the new TLB
>>>> invalidation packet.  I don't think it applies it to general register
>>>> writes like we do.
>>> No, on the CP we should use the combined write/wait command even if we
>>> don't use the new specialized VM invalidate command. Everything else
>>> won't work with SRIOV.
>>>
>>> Even if we want to we can't insert an extra read in this combined
>>> write/wait command. And if we split up the commands we would break SRIOV
>>> once more.
>>>
>>> So applying this workaround to the CP code doesn't make any sense at all.
>>>
>>> The only TODO which I can see is that we maybe don't use the combined
>>> write/wait command on Navi yet.
>>>
>>> Christian.
>>>
>>>> Alex
>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>>> See also commit
>>>>>> 534991731cb5fa94b5519957646cf849ca10d17d.
>>>>>>
>>>>>> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
>>>>>> ---
>>>>>>     drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
>>>>>>     drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 4 ++--
>>>>>>     drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
>>>>>>     drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 8 ++++++++
>>>>>>     drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
>>>>>>     5 files changed, 22 insertions(+), 5 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>>> index ac43b1af69e3..0042868dbd53 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>>> @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>>>>>>                 5 + /* COND_EXEC */
>>>>>>                 7 + /* PIPELINE_SYNC */
>>>>>>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>>>                 2 + /* VM_FLUSH */
>>>>>>                 8 + /* FENCE for VM_FLUSH */
>>>>>>                 20 + /* GDS switch */
>>>>>> @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>>>>>>                 5 + /* hdp invalidate */
>>>>>>                 7 + /* gfx_v10_0_ring_emit_pipeline_sync */
>>>>>>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>>>                 2 + /* gfx_v10_0_ring_emit_vm_flush */
>>>>>>                 8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
>>>>>>         .emit_ib_size = 7, /* gfx_v10_0_ring_emit_ib_compute */
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>>> index 9fe95e7693d5..9a7a717208de 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>>> @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>>>>>>                 5 +  /* COND_EXEC */
>>>>>>                 7 +  /* PIPELINE_SYNC */
>>>>>>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>>>                 2 + /* VM_FLUSH */
>>>>>>                 8 +  /* FENCE for VM_FLUSH */
>>>>>>                 20 + /* GDS switch */
>>>>>> @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>>>>>>                 5 + /* hdp invalidate */
>>>>>>                 7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>>>>>>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>>>                 2 + /* gfx_v9_0_ring_emit_vm_flush */
>>>>>>                 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
>>>>>>         .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>>> index 6e1b25bd1fe7..100d526e9a42 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>>> @@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>>>>
>>>>>>         amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>>>>>>
>>>>>> +     /* Insert a dummy read to delay one cycle before the ACK
>>>>>> +      * inquiry.
>>>>>> +      */
>>>>>> +     if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
>>>>>> +         ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
>>>>>> +         ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>>>>> +             amdgpu_ring_emit_reg_wait(ring,
>>>>>> +                                       hub->vm_inv_eng0_req + eng, 0, 0);
>>>>>> +
>>>>>>         /* wait for the invalidate to complete */
>>>>>>         amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>>>>>>                                   1 << vmid, 1 << vmid);
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>> index 9f2a893871ec..8f3097e45299 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>> @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>>>>         amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>>>>>>                               upper_32_bits(pd_addr));
>>>>>>
>>>>>> +     /* Insert a dummy read to delay one cycle before the ACK
>>>>>> +      * inquiry.
>>>>>> +      */
>>>>>> +     if (ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
>>>>>> +         ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>>>>> +             amdgpu_ring_emit_reg_wait(ring,
>>>>>> +                                       hub->vm_inv_eng0_req + eng, 0, 0);
>>>>>> +
>>>>>>         amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
>>>>>>                                             hub->vm_inv_eng0_ack + eng,
>>>>>>                                             req, 1 << vmid);
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>>>> index b8fdb192f6d6..0c41b4fdc58b 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>>>> @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>>>>>>                 6 + /* sdma_v5_0_ring_emit_pipeline_sync */
>>>>>>                 /* sdma_v5_0_ring_emit_vm_flush */
>>>>>>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
>>>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
>>>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
>>>>>>                 10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
>>>>>>         .emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
>>>>>>         .emit_ib = sdma_v5_0_ring_emit_ib,
>>>>> _______________________________________________
>>>>> amd-gfx mailing list
>>>>> amd-gfx@lists.freedesktop.org
>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> 

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-26 12:09                     ` Koenig, Christian
  0 siblings, 0 replies; 30+ messages in thread
From: Koenig, Christian @ 2019-10-26 12:09 UTC (permalink / raw)
  To: Tuikov, Luben, Alex Deucher
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Am 26.10.19 um 00:45 schrieb Tuikov, Luben:
> On 2019-10-25 12:19 p.m., Koenig, Christian wrote:
>> Am 25.10.19 um 18:05 schrieb Alex Deucher:
>>> On Fri, Oct 25, 2019 at 2:49 AM Koenig, Christian
>>> <Christian.Koenig@amd.com> wrote:
>>>> Am 24.10.19 um 23:16 schrieb Tuikov, Luben:
>>>>> The GRBM interface is now capable of bursting
>>>>> 1-cycle op per register, a WRITE followed by
>>>>> another WRITE, or a WRITE followed by a READ--much
>>>>> faster than previous multi-cycle per
>>>>> completed-transaction interface. This causes a
>>>>> problem, whereby status registers requiring a
>>>>> read/write by hardware, have a 1-cycle delay, due
>>>>> to the register update having to go through GRBM
>>>>> interface.
>>>>>
>>>>> This patch adds this delay.
>>>>>
>>>>> A one cycle read op is added after updating the
>>>>> invalidate request and before reading the
>>>>> invalidate-ACK status.
>>>> Please completely drop all changes for GFX9 since this patch will most
>>>> likely break SRIOV.
>>>>
>>>> Additional to that please apply the workaround only to SDMA since the CP
>>>> driven engines should handle that in firmware.
> Thank you Christian for reviewing this patch.
>
> This patch stirred quite a bit of noise. So, then, I'll go by
> your last comment above--I suppose this is the desired way to go forward then?

You most likely broke the SRIOV use case on GFX9 with that, no wonder 
that this raised eyebrows.

As far as I can see this manual workaround is only applicable to the 
SDMA on Navi.

But we should double check that the CP firmware interface with the 
combined write/wait command is correctly used on Navi/GFX10 as well. 
IIRC that came in rather late for GFX9, could be that the Navi bringup 
branch never had that.

Regards,
Christian.

>
> Regards,
> Luben
>
>
>>> I think the CP only handles this in firmware if we use the new TLB
>>> invalidation packet.  I don't think it applies it to general register
>>> writes like we do.
>> No, on the CP we should use the combined write/wait command even if we
>> don't use the new specialized VM invalidate command. Everything else
>> won't work with SRIOV.
>>
>> Even if we want to we can't insert an extra read in this combined
>> write/wait command. And if we split up the commands we would break SRIOV
>> once more.
>>
>> So applying this workaround to the CP code doesn't make any sense at all.
>>
>> The only TODO which I can see is that we maybe don't use the combined
>> write/wait command on Navi yet.
>>
>> Christian.
>>
>>> Alex
>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>> See also commit
>>>>> 534991731cb5fa94b5519957646cf849ca10d17d.
>>>>>
>>>>> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
>>>>> ---
>>>>>     drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
>>>>>     drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 4 ++--
>>>>>     drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
>>>>>     drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 8 ++++++++
>>>>>     drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
>>>>>     5 files changed, 22 insertions(+), 5 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>> index ac43b1af69e3..0042868dbd53 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>> @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>>>>>                 5 + /* COND_EXEC */
>>>>>                 7 + /* PIPELINE_SYNC */
>>>>>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>>                 2 + /* VM_FLUSH */
>>>>>                 8 + /* FENCE for VM_FLUSH */
>>>>>                 20 + /* GDS switch */
>>>>> @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>>>>>                 5 + /* hdp invalidate */
>>>>>                 7 + /* gfx_v10_0_ring_emit_pipeline_sync */
>>>>>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>>                 2 + /* gfx_v10_0_ring_emit_vm_flush */
>>>>>                 8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
>>>>>         .emit_ib_size = 7, /* gfx_v10_0_ring_emit_ib_compute */
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>> index 9fe95e7693d5..9a7a717208de 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>> @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>>>>>                 5 +  /* COND_EXEC */
>>>>>                 7 +  /* PIPELINE_SYNC */
>>>>>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>>                 2 + /* VM_FLUSH */
>>>>>                 8 +  /* FENCE for VM_FLUSH */
>>>>>                 20 + /* GDS switch */
>>>>> @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>>>>>                 5 + /* hdp invalidate */
>>>>>                 7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>>>>>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>>                 2 + /* gfx_v9_0_ring_emit_vm_flush */
>>>>>                 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
>>>>>         .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>> index 6e1b25bd1fe7..100d526e9a42 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>> @@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>>>
>>>>>         amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>>>>>
>>>>> +     /* Insert a dummy read to delay one cycle before the ACK
>>>>> +      * inquiry.
>>>>> +      */
>>>>> +     if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
>>>>> +         ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
>>>>> +         ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>>>> +             amdgpu_ring_emit_reg_wait(ring,
>>>>> +                                       hub->vm_inv_eng0_req + eng, 0, 0);
>>>>> +
>>>>>         /* wait for the invalidate to complete */
>>>>>         amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>>>>>                                   1 << vmid, 1 << vmid);
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>> index 9f2a893871ec..8f3097e45299 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>> @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>>>         amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>>>>>                               upper_32_bits(pd_addr));
>>>>>
>>>>> +     /* Insert a dummy read to delay one cycle before the ACK
>>>>> +      * inquiry.
>>>>> +      */
>>>>> +     if (ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
>>>>> +         ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>>>> +             amdgpu_ring_emit_reg_wait(ring,
>>>>> +                                       hub->vm_inv_eng0_req + eng, 0, 0);
>>>>> +
>>>>>         amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
>>>>>                                             hub->vm_inv_eng0_ack + eng,
>>>>>                                             req, 1 << vmid);
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>>> index b8fdb192f6d6..0c41b4fdc58b 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>>> @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>>>>>                 6 + /* sdma_v5_0_ring_emit_pipeline_sync */
>>>>>                 /* sdma_v5_0_ring_emit_vm_flush */
>>>>>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
>>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
>>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
>>>>>                 10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
>>>>>         .emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
>>>>>         .emit_ib = sdma_v5_0_ring_emit_ib,
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-26 12:09                     ` Koenig, Christian
  0 siblings, 0 replies; 30+ messages in thread
From: Koenig, Christian @ 2019-10-26 12:09 UTC (permalink / raw)
  To: Tuikov, Luben, Alex Deucher
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, amd-gfx

Am 26.10.19 um 00:45 schrieb Tuikov, Luben:
> On 2019-10-25 12:19 p.m., Koenig, Christian wrote:
>> Am 25.10.19 um 18:05 schrieb Alex Deucher:
>>> On Fri, Oct 25, 2019 at 2:49 AM Koenig, Christian
>>> <Christian.Koenig@amd.com> wrote:
>>>> Am 24.10.19 um 23:16 schrieb Tuikov, Luben:
>>>>> The GRBM interface is now capable of bursting
>>>>> 1-cycle op per register, a WRITE followed by
>>>>> another WRITE, or a WRITE followed by a READ--much
>>>>> faster than previous multi-cycle per
>>>>> completed-transaction interface. This causes a
>>>>> problem, whereby status registers requiring a
>>>>> read/write by hardware, have a 1-cycle delay, due
>>>>> to the register update having to go through GRBM
>>>>> interface.
>>>>>
>>>>> This patch adds this delay.
>>>>>
>>>>> A one cycle read op is added after updating the
>>>>> invalidate request and before reading the
>>>>> invalidate-ACK status.
>>>> Please completely drop all changes for GFX9 since this patch will most
>>>> likely break SRIOV.
>>>>
>>>> Additional to that please apply the workaround only to SDMA since the CP
>>>> driven engines should handle that in firmware.
> Thank you Christian for reviewing this patch.
>
> This patch stirred quite a bit of noise. So, then, I'll go by
> your last comment above--I suppose this is the desired way to go forward then?

You most likely broke the SRIOV use case on GFX9 with that, no wonder 
that this raised eyebrows.

As far as I can see this manual workaround is only applicable to the 
SDMA on Navi.

But we should double check that the CP firmware interface with the 
combined write/wait command is correctly used on Navi/GFX10 as well. 
IIRC that came in rather late for GFX9, could be that the Navi bringup 
branch never had that.

Regards,
Christian.

>
> Regards,
> Luben
>
>
>>> I think the CP only handles this in firmware if we use the new TLB
>>> invalidation packet.  I don't think it applies it to general register
>>> writes like we do.
>> No, on the CP we should use the combined write/wait command even if we
>> don't use the new specialized VM invalidate command. Everything else
>> won't work with SRIOV.
>>
>> Even if we want to we can't insert an extra read in this combined
>> write/wait command. And if we split up the commands we would break SRIOV
>> once more.
>>
>> So applying this workaround to the CP code doesn't make any sense at all.
>>
>> The only TODO which I can see is that we maybe don't use the combined
>> write/wait command on Navi yet.
>>
>> Christian.
>>
>>> Alex
>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>> See also commit
>>>>> 534991731cb5fa94b5519957646cf849ca10d17d.
>>>>>
>>>>> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
>>>>> ---
>>>>>     drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
>>>>>     drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 4 ++--
>>>>>     drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
>>>>>     drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 8 ++++++++
>>>>>     drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
>>>>>     5 files changed, 22 insertions(+), 5 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>> index ac43b1af69e3..0042868dbd53 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>> @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>>>>>                 5 + /* COND_EXEC */
>>>>>                 7 + /* PIPELINE_SYNC */
>>>>>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>>                 2 + /* VM_FLUSH */
>>>>>                 8 + /* FENCE for VM_FLUSH */
>>>>>                 20 + /* GDS switch */
>>>>> @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>>>>>                 5 + /* hdp invalidate */
>>>>>                 7 + /* gfx_v10_0_ring_emit_pipeline_sync */
>>>>>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>>                 2 + /* gfx_v10_0_ring_emit_vm_flush */
>>>>>                 8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
>>>>>         .emit_ib_size = 7, /* gfx_v10_0_ring_emit_ib_compute */
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>> index 9fe95e7693d5..9a7a717208de 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>> @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>>>>>                 5 +  /* COND_EXEC */
>>>>>                 7 +  /* PIPELINE_SYNC */
>>>>>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>>                 2 + /* VM_FLUSH */
>>>>>                 8 +  /* FENCE for VM_FLUSH */
>>>>>                 20 + /* GDS switch */
>>>>> @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>>>>>                 5 + /* hdp invalidate */
>>>>>                 7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>>>>>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>>                 2 + /* gfx_v9_0_ring_emit_vm_flush */
>>>>>                 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
>>>>>         .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>> index 6e1b25bd1fe7..100d526e9a42 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>> @@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>>>
>>>>>         amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>>>>>
>>>>> +     /* Insert a dummy read to delay one cycle before the ACK
>>>>> +      * inquiry.
>>>>> +      */
>>>>> +     if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
>>>>> +         ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
>>>>> +         ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>>>> +             amdgpu_ring_emit_reg_wait(ring,
>>>>> +                                       hub->vm_inv_eng0_req + eng, 0, 0);
>>>>> +
>>>>>         /* wait for the invalidate to complete */
>>>>>         amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>>>>>                                   1 << vmid, 1 << vmid);
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>> index 9f2a893871ec..8f3097e45299 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>> @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>>>         amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>>>>>                               upper_32_bits(pd_addr));
>>>>>
>>>>> +     /* Insert a dummy read to delay one cycle before the ACK
>>>>> +      * inquiry.
>>>>> +      */
>>>>> +     if (ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
>>>>> +         ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>>>> +             amdgpu_ring_emit_reg_wait(ring,
>>>>> +                                       hub->vm_inv_eng0_req + eng, 0, 0);
>>>>> +
>>>>>         amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
>>>>>                                             hub->vm_inv_eng0_ack + eng,
>>>>>                                             req, 1 << vmid);
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>>> index b8fdb192f6d6..0c41b4fdc58b 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>>> @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>>>>>                 6 + /* sdma_v5_0_ring_emit_pipeline_sync */
>>>>>                 /* sdma_v5_0_ring_emit_vm_flush */
>>>>>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
>>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
>>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
>>>>>                 10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
>>>>>         .emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
>>>>>         .emit_ib = sdma_v5_0_ring_emit_ib,
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-25 22:45                 ` Tuikov, Luben
  0 siblings, 0 replies; 30+ messages in thread
From: Tuikov, Luben @ 2019-10-25 22:45 UTC (permalink / raw)
  To: Koenig, Christian, Alex Deucher
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On 2019-10-25 12:19 p.m., Koenig, Christian wrote:
> Am 25.10.19 um 18:05 schrieb Alex Deucher:
>> On Fri, Oct 25, 2019 at 2:49 AM Koenig, Christian
>> <Christian.Koenig@amd.com> wrote:
>>> Am 24.10.19 um 23:16 schrieb Tuikov, Luben:
>>>> The GRBM interface is now capable of bursting
>>>> 1-cycle op per register, a WRITE followed by
>>>> another WRITE, or a WRITE followed by a READ--much
>>>> faster than previous multi-cycle per
>>>> completed-transaction interface. This causes a
>>>> problem, whereby status registers requiring a
>>>> read/write by hardware, have a 1-cycle delay, due
>>>> to the register update having to go through GRBM
>>>> interface.
>>>>
>>>> This patch adds this delay.
>>>>
>>>> A one cycle read op is added after updating the
>>>> invalidate request and before reading the
>>>> invalidate-ACK status.
>>> Please completely drop all changes for GFX9 since this patch will most
>>> likely break SRIOV.
>>>
>>> Additional to that please apply the workaround only to SDMA since the CP
>>> driven engines should handle that in firmware.

Thank you Christian for reviewing this patch.

This patch stirred quite a bit of noise. So, then, I'll go by
your last comment above--I suppose this is the desired way to go forward then?

Regards,
Luben


>> I think the CP only handles this in firmware if we use the new TLB
>> invalidation packet.  I don't think it applies it to general register
>> writes like we do.
> 
> No, on the CP we should use the combined write/wait command even if we 
> don't use the new specialized VM invalidate command. Everything else 
> won't work with SRIOV.
> 
> Even if we want to we can't insert an extra read in this combined 
> write/wait command. And if we split up the commands we would break SRIOV 
> once more.
> 
> So applying this workaround to the CP code doesn't make any sense at all.
> 
> The only TODO which I can see is that we maybe don't use the combined 
> write/wait command on Navi yet.
> 
> Christian.
> 
>>
>> Alex
>>
>>> Regards,
>>> Christian.
>>>
>>>> See also commit
>>>> 534991731cb5fa94b5519957646cf849ca10d17d.
>>>>
>>>> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
>>>> ---
>>>>    drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
>>>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 4 ++--
>>>>    drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
>>>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 8 ++++++++
>>>>    drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
>>>>    5 files changed, 22 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> index ac43b1af69e3..0042868dbd53 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>>>>                5 + /* COND_EXEC */
>>>>                7 + /* PIPELINE_SYNC */
>>>>                SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>                2 + /* VM_FLUSH */
>>>>                8 + /* FENCE for VM_FLUSH */
>>>>                20 + /* GDS switch */
>>>> @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>>>>                5 + /* hdp invalidate */
>>>>                7 + /* gfx_v10_0_ring_emit_pipeline_sync */
>>>>                SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>                2 + /* gfx_v10_0_ring_emit_vm_flush */
>>>>                8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
>>>>        .emit_ib_size = 7, /* gfx_v10_0_ring_emit_ib_compute */
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> index 9fe95e7693d5..9a7a717208de 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>>>>                5 +  /* COND_EXEC */
>>>>                7 +  /* PIPELINE_SYNC */
>>>>                SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>                2 + /* VM_FLUSH */
>>>>                8 +  /* FENCE for VM_FLUSH */
>>>>                20 + /* GDS switch */
>>>> @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>>>>                5 + /* hdp invalidate */
>>>>                7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>>>>                SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>                2 + /* gfx_v9_0_ring_emit_vm_flush */
>>>>                8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
>>>>        .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>> index 6e1b25bd1fe7..100d526e9a42 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>> @@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>>
>>>>        amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>>>>
>>>> +     /* Insert a dummy read to delay one cycle before the ACK
>>>> +      * inquiry.
>>>> +      */
>>>> +     if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
>>>> +         ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
>>>> +         ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>>> +             amdgpu_ring_emit_reg_wait(ring,
>>>> +                                       hub->vm_inv_eng0_req + eng, 0, 0);
>>>> +
>>>>        /* wait for the invalidate to complete */
>>>>        amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>>>>                                  1 << vmid, 1 << vmid);
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> index 9f2a893871ec..8f3097e45299 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>>        amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>>>>                              upper_32_bits(pd_addr));
>>>>
>>>> +     /* Insert a dummy read to delay one cycle before the ACK
>>>> +      * inquiry.
>>>> +      */
>>>> +     if (ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
>>>> +         ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>>> +             amdgpu_ring_emit_reg_wait(ring,
>>>> +                                       hub->vm_inv_eng0_req + eng, 0, 0);
>>>> +
>>>>        amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
>>>>                                            hub->vm_inv_eng0_ack + eng,
>>>>                                            req, 1 << vmid);
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>> index b8fdb192f6d6..0c41b4fdc58b 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>> @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>>>>                6 + /* sdma_v5_0_ring_emit_pipeline_sync */
>>>>                /* sdma_v5_0_ring_emit_vm_flush */
>>>>                SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
>>>>                10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
>>>>        .emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
>>>>        .emit_ib = sdma_v5_0_ring_emit_ib,
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> 

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-25 22:45                 ` Tuikov, Luben
  0 siblings, 0 replies; 30+ messages in thread
From: Tuikov, Luben @ 2019-10-25 22:45 UTC (permalink / raw)
  To: Koenig, Christian, Alex Deucher
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, amd-gfx

On 2019-10-25 12:19 p.m., Koenig, Christian wrote:
> Am 25.10.19 um 18:05 schrieb Alex Deucher:
>> On Fri, Oct 25, 2019 at 2:49 AM Koenig, Christian
>> <Christian.Koenig@amd.com> wrote:
>>> Am 24.10.19 um 23:16 schrieb Tuikov, Luben:
>>>> The GRBM interface is now capable of bursting
>>>> 1-cycle op per register, a WRITE followed by
>>>> another WRITE, or a WRITE followed by a READ--much
>>>> faster than previous multi-cycle per
>>>> completed-transaction interface. This causes a
>>>> problem, whereby status registers requiring a
>>>> read/write by hardware, have a 1-cycle delay, due
>>>> to the register update having to go through GRBM
>>>> interface.
>>>>
>>>> This patch adds this delay.
>>>>
>>>> A one cycle read op is added after updating the
>>>> invalidate request and before reading the
>>>> invalidate-ACK status.
>>> Please completely drop all changes for GFX9 since this patch will most
>>> likely break SRIOV.
>>>
>>> Additional to that please apply the workaround only to SDMA since the CP
>>> driven engines should handle that in firmware.

Thank you Christian for reviewing this patch.

This patch stirred quite a bit of noise. So, then, I'll go by
your last comment above--I suppose this is the desired way to go forward then?

Regards,
Luben


>> I think the CP only handles this in firmware if we use the new TLB
>> invalidation packet.  I don't think it applies it to general register
>> writes like we do.
> 
> No, on the CP we should use the combined write/wait command even if we 
> don't use the new specialized VM invalidate command. Everything else 
> won't work with SRIOV.
> 
> Even if we want to we can't insert an extra read in this combined 
> write/wait command. And if we split up the commands we would break SRIOV 
> once more.
> 
> So applying this workaround to the CP code doesn't make any sense at all.
> 
> The only TODO which I can see is that we maybe don't use the combined 
> write/wait command on Navi yet.
> 
> Christian.
> 
>>
>> Alex
>>
>>> Regards,
>>> Christian.
>>>
>>>> See also commit
>>>> 534991731cb5fa94b5519957646cf849ca10d17d.
>>>>
>>>> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
>>>> ---
>>>>    drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
>>>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 4 ++--
>>>>    drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
>>>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 8 ++++++++
>>>>    drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
>>>>    5 files changed, 22 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> index ac43b1af69e3..0042868dbd53 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>>>>                5 + /* COND_EXEC */
>>>>                7 + /* PIPELINE_SYNC */
>>>>                SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>                2 + /* VM_FLUSH */
>>>>                8 + /* FENCE for VM_FLUSH */
>>>>                20 + /* GDS switch */
>>>> @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>>>>                5 + /* hdp invalidate */
>>>>                7 + /* gfx_v10_0_ring_emit_pipeline_sync */
>>>>                SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>                2 + /* gfx_v10_0_ring_emit_vm_flush */
>>>>                8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
>>>>        .emit_ib_size = 7, /* gfx_v10_0_ring_emit_ib_compute */
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> index 9fe95e7693d5..9a7a717208de 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>>>>                5 +  /* COND_EXEC */
>>>>                7 +  /* PIPELINE_SYNC */
>>>>                SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>                2 + /* VM_FLUSH */
>>>>                8 +  /* FENCE for VM_FLUSH */
>>>>                20 + /* GDS switch */
>>>> @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>>>>                5 + /* hdp invalidate */
>>>>                7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>>>>                SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>>                2 + /* gfx_v9_0_ring_emit_vm_flush */
>>>>                8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
>>>>        .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>> index 6e1b25bd1fe7..100d526e9a42 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>> @@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>>
>>>>        amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>>>>
>>>> +     /* Insert a dummy read to delay one cycle before the ACK
>>>> +      * inquiry.
>>>> +      */
>>>> +     if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
>>>> +         ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
>>>> +         ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>>> +             amdgpu_ring_emit_reg_wait(ring,
>>>> +                                       hub->vm_inv_eng0_req + eng, 0, 0);
>>>> +
>>>>        /* wait for the invalidate to complete */
>>>>        amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>>>>                                  1 << vmid, 1 << vmid);
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> index 9f2a893871ec..8f3097e45299 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>>        amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>>>>                              upper_32_bits(pd_addr));
>>>>
>>>> +     /* Insert a dummy read to delay one cycle before the ACK
>>>> +      * inquiry.
>>>> +      */
>>>> +     if (ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
>>>> +         ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>>> +             amdgpu_ring_emit_reg_wait(ring,
>>>> +                                       hub->vm_inv_eng0_req + eng, 0, 0);
>>>> +
>>>>        amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
>>>>                                            hub->vm_inv_eng0_ack + eng,
>>>>                                            req, 1 << vmid);
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>> index b8fdb192f6d6..0c41b4fdc58b 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>> @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>>>>                6 + /* sdma_v5_0_ring_emit_pipeline_sync */
>>>>                /* sdma_v5_0_ring_emit_vm_flush */
>>>>                SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
>>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
>>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
>>>>                10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
>>>>        .emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
>>>>        .emit_ib = sdma_v5_0_ring_emit_ib,
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> 

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-25 16:19             ` Koenig, Christian
  0 siblings, 0 replies; 30+ messages in thread
From: Koenig, Christian @ 2019-10-25 16:19 UTC (permalink / raw)
  To: Alex Deucher
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Tuikov, Luben,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Am 25.10.19 um 18:05 schrieb Alex Deucher:
> On Fri, Oct 25, 2019 at 2:49 AM Koenig, Christian
> <Christian.Koenig@amd.com> wrote:
>> Am 24.10.19 um 23:16 schrieb Tuikov, Luben:
>>> The GRBM interface is now capable of bursting
>>> 1-cycle op per register, a WRITE followed by
>>> another WRITE, or a WRITE followed by a READ--much
>>> faster than previous multi-cycle per
>>> completed-transaction interface. This causes a
>>> problem, whereby status registers requiring a
>>> read/write by hardware, have a 1-cycle delay, due
>>> to the register update having to go through GRBM
>>> interface.
>>>
>>> This patch adds this delay.
>>>
>>> A one cycle read op is added after updating the
>>> invalidate request and before reading the
>>> invalidate-ACK status.
>> Please completely drop all changes for GFX9 since this patch will most
>> likely break SRIOV.
>>
>> In addition to that, please apply the workaround only to SDMA since the CP
>> driven engines should handle that in firmware.
> I think the CP only handles this in firmware if we use the new TLB
> invalidation packet.  I don't think it applies it to general register
> writes like we do.

No, on the CP we should use the combined write/wait command even if we 
don't use the new specialized VM invalidate command. Everything else 
won't work with SRIOV.

Even if we want to we can't insert an extra read in this combined 
write/wait command. And if we split up the commands we would break SRIOV 
once more.

So applying this workaround to the CP code doesn't make any sense at all.

The only TODO which I can see is that we maybe don't use the combined 
write/wait command on Navi yet.

Christian.

>
> Alex
>
>> Regards,
>> Christian.
>>
>>> See also commit
>>> 534991731cb5fa94b5519957646cf849ca10d17d.
>>>
>>> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 4 ++--
>>>    drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
>>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 8 ++++++++
>>>    drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
>>>    5 files changed, 22 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> index ac43b1af69e3..0042868dbd53 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>>>                5 + /* COND_EXEC */
>>>                7 + /* PIPELINE_SYNC */
>>>                SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>                2 + /* VM_FLUSH */
>>>                8 + /* FENCE for VM_FLUSH */
>>>                20 + /* GDS switch */
>>> @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>>>                5 + /* hdp invalidate */
>>>                7 + /* gfx_v10_0_ring_emit_pipeline_sync */
>>>                SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>                2 + /* gfx_v10_0_ring_emit_vm_flush */
>>>                8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
>>>        .emit_ib_size = 7, /* gfx_v10_0_ring_emit_ib_compute */
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> index 9fe95e7693d5..9a7a717208de 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>>>                5 +  /* COND_EXEC */
>>>                7 +  /* PIPELINE_SYNC */
>>>                SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>                2 + /* VM_FLUSH */
>>>                8 +  /* FENCE for VM_FLUSH */
>>>                20 + /* GDS switch */
>>> @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>>>                5 + /* hdp invalidate */
>>>                7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>>>                SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>                2 + /* gfx_v9_0_ring_emit_vm_flush */
>>>                8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
>>>        .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> index 6e1b25bd1fe7..100d526e9a42 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> @@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>
>>>        amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>>>
>>> +     /* Insert a dummy read to delay one cycle before the ACK
>>> +      * inquiry.
>>> +      */
>>> +     if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
>>> +         ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
>>> +         ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>> +             amdgpu_ring_emit_reg_wait(ring,
>>> +                                       hub->vm_inv_eng0_req + eng, 0, 0);
>>> +
>>>        /* wait for the invalidate to complete */
>>>        amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>>>                                  1 << vmid, 1 << vmid);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> index 9f2a893871ec..8f3097e45299 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>        amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>>>                              upper_32_bits(pd_addr));
>>>
>>> +     /* Insert a dummy read to delay one cycle before the ACK
>>> +      * inquiry.
>>> +      */
>>> +     if (ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
>>> +         ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>> +             amdgpu_ring_emit_reg_wait(ring,
>>> +                                       hub->vm_inv_eng0_req + eng, 0, 0);
>>> +
>>>        amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
>>>                                            hub->vm_inv_eng0_ack + eng,
>>>                                            req, 1 << vmid);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>> index b8fdb192f6d6..0c41b4fdc58b 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>> @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>>>                6 + /* sdma_v5_0_ring_emit_pipeline_sync */
>>>                /* sdma_v5_0_ring_emit_vm_flush */
>>>                SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
>>>                10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
>>>        .emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
>>>        .emit_ib = sdma_v5_0_ring_emit_ib,
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-25 16:19             ` Koenig, Christian
  0 siblings, 0 replies; 30+ messages in thread
From: Koenig, Christian @ 2019-10-25 16:19 UTC (permalink / raw)
  To: Alex Deucher
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Tuikov, Luben, amd-gfx

Am 25.10.19 um 18:05 schrieb Alex Deucher:
> On Fri, Oct 25, 2019 at 2:49 AM Koenig, Christian
> <Christian.Koenig@amd.com> wrote:
>> Am 24.10.19 um 23:16 schrieb Tuikov, Luben:
>>> The GRBM interface is now capable of bursting
>>> 1-cycle op per register, a WRITE followed by
>>> another WRITE, or a WRITE followed by a READ--much
>>> faster than previous multi-cycle per
>>> completed-transaction interface. This causes a
>>> problem, whereby status registers requiring a
>>> read/write by hardware, have a 1-cycle delay, due
>>> to the register update having to go through GRBM
>>> interface.
>>>
>>> This patch adds this delay.
>>>
>>> A one cycle read op is added after updating the
>>> invalidate request and before reading the
>>> invalidate-ACK status.
>> Please completely drop all changes for GFX9 since this patch will most
>> likely break SRIOV.
>>
>> In addition to that, please apply the workaround only to SDMA since the CP
>> driven engines should handle that in firmware.
> I think the CP only handles this in firmware if we use the new TLB
> invalidation packet.  I don't think it applies it to general register
> writes like we do.

No, on the CP we should use the combined write/wait command even if we 
don't use the new specialized VM invalidate command. Everything else 
won't work with SRIOV.

Even if we want to we can't insert an extra read in this combined 
write/wait command. And if we split up the commands we would break SRIOV 
once more.

So applying this workaround to the CP code doesn't make any sense at all.

The only TODO which I can see is that we maybe don't use the combined 
write/wait command on Navi yet.

Christian.

>
> Alex
>
>> Regards,
>> Christian.
>>
>>> See also commit
>>> 534991731cb5fa94b5519957646cf849ca10d17d.
>>>
>>> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 4 ++--
>>>    drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
>>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 8 ++++++++
>>>    drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
>>>    5 files changed, 22 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> index ac43b1af69e3..0042868dbd53 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>>>                5 + /* COND_EXEC */
>>>                7 + /* PIPELINE_SYNC */
>>>                SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>                2 + /* VM_FLUSH */
>>>                8 + /* FENCE for VM_FLUSH */
>>>                20 + /* GDS switch */
>>> @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>>>                5 + /* hdp invalidate */
>>>                7 + /* gfx_v10_0_ring_emit_pipeline_sync */
>>>                SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>                2 + /* gfx_v10_0_ring_emit_vm_flush */
>>>                8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
>>>        .emit_ib_size = 7, /* gfx_v10_0_ring_emit_ib_compute */
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> index 9fe95e7693d5..9a7a717208de 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>>>                5 +  /* COND_EXEC */
>>>                7 +  /* PIPELINE_SYNC */
>>>                SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>                2 + /* VM_FLUSH */
>>>                8 +  /* FENCE for VM_FLUSH */
>>>                20 + /* GDS switch */
>>> @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>>>                5 + /* hdp invalidate */
>>>                7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>>>                SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>                2 + /* gfx_v9_0_ring_emit_vm_flush */
>>>                8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
>>>        .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> index 6e1b25bd1fe7..100d526e9a42 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> @@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>
>>>        amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>>>
>>> +     /* Insert a dummy read to delay one cycle before the ACK
>>> +      * inquiry.
>>> +      */
>>> +     if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
>>> +         ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
>>> +         ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>> +             amdgpu_ring_emit_reg_wait(ring,
>>> +                                       hub->vm_inv_eng0_req + eng, 0, 0);
>>> +
>>>        /* wait for the invalidate to complete */
>>>        amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>>>                                  1 << vmid, 1 << vmid);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> index 9f2a893871ec..8f3097e45299 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>        amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>>>                              upper_32_bits(pd_addr));
>>>
>>> +     /* Insert a dummy read to delay one cycle before the ACK
>>> +      * inquiry.
>>> +      */
>>> +     if (ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
>>> +         ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>> +             amdgpu_ring_emit_reg_wait(ring,
>>> +                                       hub->vm_inv_eng0_req + eng, 0, 0);
>>> +
>>>        amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
>>>                                            hub->vm_inv_eng0_ack + eng,
>>>                                            req, 1 << vmid);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>> index b8fdb192f6d6..0c41b4fdc58b 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>> @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>>>                6 + /* sdma_v5_0_ring_emit_pipeline_sync */
>>>                /* sdma_v5_0_ring_emit_vm_flush */
>>>                SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
>>>                10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
>>>        .emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
>>>        .emit_ib = sdma_v5_0_ring_emit_ib,
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-25 16:05         ` Alex Deucher
  0 siblings, 0 replies; 30+ messages in thread
From: Alex Deucher @ 2019-10-25 16:05 UTC (permalink / raw)
  To: Koenig, Christian
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Tuikov, Luben,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On Fri, Oct 25, 2019 at 2:49 AM Koenig, Christian
<Christian.Koenig@amd.com> wrote:
>
> Am 24.10.19 um 23:16 schrieb Tuikov, Luben:
> > The GRBM interface is now capable of bursting
> > 1-cycle op per register, a WRITE followed by
> > another WRITE, or a WRITE followed by a READ--much
> > faster than previous muti-cycle per
> > completed-transaction interface. This causes a
> > problem, whereby status registers requiring a
> > read/write by hardware, have a 1-cycle delay, due
> > to the register update having to go through GRBM
> > interface.
> >
> > This patch adds this delay.
> >
> > A one cycle read op is added after updating the
> > invalidate request and before reading the
> > invalidate-ACK status.
>
> Please completely drop all changes for GFX9 since this patch will most
> likely break SRIOV.
>
> In addition to that, please apply the workaround only to SDMA since the CP
> driven engines should handle that in firmware.

I think the CP only handles this in firmware if we use the new TLB
invalidation packet.  I don't think it applies it to general register
writes like we do.

Alex

>
> Regards,
> Christian.
>
> >
> > See also commit
> > 534991731cb5fa94b5519957646cf849ca10d17d.
> >
> > Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
> >   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 4 ++--
> >   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
> >   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 8 ++++++++
> >   drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
> >   5 files changed, 22 insertions(+), 5 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> > index ac43b1af69e3..0042868dbd53 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> > @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
> >               5 + /* COND_EXEC */
> >               7 + /* PIPELINE_SYNC */
> >               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> > -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> > +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
> >               2 + /* VM_FLUSH */
> >               8 + /* FENCE for VM_FLUSH */
> >               20 + /* GDS switch */
> > @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
> >               5 + /* hdp invalidate */
> >               7 + /* gfx_v10_0_ring_emit_pipeline_sync */
> >               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> > -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> > +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
> >               2 + /* gfx_v10_0_ring_emit_vm_flush */
> >               8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
> >       .emit_ib_size = 7, /* gfx_v10_0_ring_emit_ib_compute */
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > index 9fe95e7693d5..9a7a717208de 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
> >               5 +  /* COND_EXEC */
> >               7 +  /* PIPELINE_SYNC */
> >               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> > -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> > +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
> >               2 + /* VM_FLUSH */
> >               8 +  /* FENCE for VM_FLUSH */
> >               20 + /* GDS switch */
> > @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
> >               5 + /* hdp invalidate */
> >               7 + /* gfx_v9_0_ring_emit_pipeline_sync */
> >               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> > -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> > +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
> >               2 + /* gfx_v9_0_ring_emit_vm_flush */
> >               8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
> >       .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> > index 6e1b25bd1fe7..100d526e9a42 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> > @@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
> >
> >       amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
> >
> > +     /* Insert a dummy read to delay one cycle before the ACK
> > +      * inquiry.
> > +      */
> > +     if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
> > +         ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
> > +         ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
> > +             amdgpu_ring_emit_reg_wait(ring,
> > +                                       hub->vm_inv_eng0_req + eng, 0, 0);
> > +
> >       /* wait for the invalidate to complete */
> >       amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
> >                                 1 << vmid, 1 << vmid);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > index 9f2a893871ec..8f3097e45299 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
> >       amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
> >                             upper_32_bits(pd_addr));
> >
> > +     /* Insert a dummy read to delay one cycle before the ACK
> > +      * inquiry.
> > +      */
> > +     if (ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
> > +         ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
> > +             amdgpu_ring_emit_reg_wait(ring,
> > +                                       hub->vm_inv_eng0_req + eng, 0, 0);
> > +
> >       amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
> >                                           hub->vm_inv_eng0_ack + eng,
> >                                           req, 1 << vmid);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> > index b8fdb192f6d6..0c41b4fdc58b 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> > @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
> >               6 + /* sdma_v5_0_ring_emit_pipeline_sync */
> >               /* sdma_v5_0_ring_emit_vm_flush */
> >               SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
> > -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
> > +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
> >               10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
> >       .emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
> >       .emit_ib = sdma_v5_0_ring_emit_ib,
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-25 16:05         ` Alex Deucher
  0 siblings, 0 replies; 30+ messages in thread
From: Alex Deucher @ 2019-10-25 16:05 UTC (permalink / raw)
  To: Koenig, Christian
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Tuikov, Luben, amd-gfx

On Fri, Oct 25, 2019 at 2:49 AM Koenig, Christian
<Christian.Koenig@amd.com> wrote:
>
> Am 24.10.19 um 23:16 schrieb Tuikov, Luben:
> > The GRBM interface is now capable of bursting
> > 1-cycle op per register, a WRITE followed by
> > another WRITE, or a WRITE followed by a READ--much
> > faster than previous muti-cycle per
> > completed-transaction interface. This causes a
> > problem, whereby status registers requiring a
> > read/write by hardware, have a 1-cycle delay, due
> > to the register update having to go through GRBM
> > interface.
> >
> > This patch adds this delay.
> >
> > A one cycle read op is added after updating the
> > invalidate request and before reading the
> > invalidate-ACK status.
>
> Please completely drop all changes for GFX9 since this patch will most
> likely break SRIOV.
>
> Additional to that please apply the workaround only to SDMA since the CP
> driven engines should handle that in firmware.

I think the CP only handles this in firmware if we use the new TLB
invalidation packet.  I don't think it applies it to general register
writes like we do.

Alex

>
> Regards,
> Christian.
>
> >
> > See also commit
> > 534991731cb5fa94b5519957646cf849ca10d17d.
> >
> > Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
> >   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 4 ++--
> >   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
> >   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 8 ++++++++
> >   drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
> >   5 files changed, 22 insertions(+), 5 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> > index ac43b1af69e3..0042868dbd53 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> > @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
> >               5 + /* COND_EXEC */
> >               7 + /* PIPELINE_SYNC */
> >               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> > -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> > +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
> >               2 + /* VM_FLUSH */
> >               8 + /* FENCE for VM_FLUSH */
> >               20 + /* GDS switch */
> > @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
> >               5 + /* hdp invalidate */
> >               7 + /* gfx_v10_0_ring_emit_pipeline_sync */
> >               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> > -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> > +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
> >               2 + /* gfx_v10_0_ring_emit_vm_flush */
> >               8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
> >       .emit_ib_size = 7, /* gfx_v10_0_ring_emit_ib_compute */
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > index 9fe95e7693d5..9a7a717208de 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
> >               5 +  /* COND_EXEC */
> >               7 +  /* PIPELINE_SYNC */
> >               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> > -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> > +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
> >               2 + /* VM_FLUSH */
> >               8 +  /* FENCE for VM_FLUSH */
> >               20 + /* GDS switch */
> > @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
> >               5 + /* hdp invalidate */
> >               7 + /* gfx_v9_0_ring_emit_pipeline_sync */
> >               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> > -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> > +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
> >               2 + /* gfx_v9_0_ring_emit_vm_flush */
> >               8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
> >       .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> > index 6e1b25bd1fe7..100d526e9a42 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> > @@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
> >
> >       amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
> >
> > +     /* Insert a dummy read to delay one cycle before the ACK
> > +      * inquiry.
> > +      */
> > +     if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
> > +         ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
> > +         ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
> > +             amdgpu_ring_emit_reg_wait(ring,
> > +                                       hub->vm_inv_eng0_req + eng, 0, 0);
> > +
> >       /* wait for the invalidate to complete */
> >       amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
> >                                 1 << vmid, 1 << vmid);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > index 9f2a893871ec..8f3097e45299 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
> >       amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
> >                             upper_32_bits(pd_addr));
> >
> > +     /* Insert a dummy read to delay one cycle before the ACK
> > +      * inquiry.
> > +      */
> > +     if (ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
> > +         ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
> > +             amdgpu_ring_emit_reg_wait(ring,
> > +                                       hub->vm_inv_eng0_req + eng, 0, 0);
> > +
> >       amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
> >                                           hub->vm_inv_eng0_ack + eng,
> >                                           req, 1 << vmid);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> > index b8fdb192f6d6..0c41b4fdc58b 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> > @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
> >               6 + /* sdma_v5_0_ring_emit_pipeline_sync */
> >               /* sdma_v5_0_ring_emit_vm_flush */
> >               SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
> > -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
> > +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
> >               10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
> >       .emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
> >       .emit_ib = sdma_v5_0_ring_emit_ib,
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-25  6:49     ` Koenig, Christian
  0 siblings, 0 replies; 30+ messages in thread
From: Koenig, Christian @ 2019-10-25  6:49 UTC (permalink / raw)
  To: Tuikov, Luben, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric

Am 24.10.19 um 23:16 schrieb Tuikov, Luben:
> The GRBM interface is now capable of bursting
> 1-cycle op per register, a WRITE followed by
> another WRITE, or a WRITE followed by a READ--much
> faster than previous muti-cycle per
> completed-transaction interface. This causes a
> problem, whereby status registers requiring a
> read/write by hardware, have a 1-cycle delay, due
> to the register update having to go through GRBM
> interface.
>
> This patch adds this delay.
>
> A one cycle read op is added after updating the
> invalidate request and before reading the
> invalidate-ACK status.

Please completely drop all changes for GFX9 since this patch will most 
likely break SRIOV.

Additional to that please apply the workaround only to SDMA since the CP 
driven engines should handle that in firmware.

Regards,
Christian.

>
> See also commit
> 534991731cb5fa94b5519957646cf849ca10d17d.
>
> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 4 ++--
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 8 ++++++++
>   drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
>   5 files changed, 22 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index ac43b1af69e3..0042868dbd53 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>   		5 + /* COND_EXEC */
>   		7 + /* PIPELINE_SYNC */
>   		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>   		2 + /* VM_FLUSH */
>   		8 + /* FENCE for VM_FLUSH */
>   		20 + /* GDS switch */
> @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>   		5 + /* hdp invalidate */
>   		7 + /* gfx_v10_0_ring_emit_pipeline_sync */
>   		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>   		2 + /* gfx_v10_0_ring_emit_vm_flush */
>   		8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
>   	.emit_ib_size =	7, /* gfx_v10_0_ring_emit_ib_compute */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 9fe95e7693d5..9a7a717208de 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>   		5 +  /* COND_EXEC */
>   		7 +  /* PIPELINE_SYNC */
>   		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>   		2 + /* VM_FLUSH */
>   		8 +  /* FENCE for VM_FLUSH */
>   		20 + /* GDS switch */
> @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>   		5 + /* hdp invalidate */
>   		7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>   		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>   		2 + /* gfx_v9_0_ring_emit_vm_flush */
>   		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
>   	.emit_ib_size =	7, /* gfx_v9_0_ring_emit_ib_compute */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 6e1b25bd1fe7..100d526e9a42 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>   
>   	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>   
> +	/* Insert a dummy read to delay one cycle before the ACK
> +	 * inquiry.
> +	 */
> +	if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
> +	    ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
> +	    ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
> +		amdgpu_ring_emit_reg_wait(ring,
> +					  hub->vm_inv_eng0_req + eng, 0, 0);
> +
>   	/* wait for the invalidate to complete */
>   	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>   				  1 << vmid, 1 << vmid);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 9f2a893871ec..8f3097e45299 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>   	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>   			      upper_32_bits(pd_addr));
>   
> +	/* Insert a dummy read to delay one cycle before the ACK
> +	 * inquiry.
> +	 */
> +	if (ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
> +	    ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
> +		amdgpu_ring_emit_reg_wait(ring,
> +					  hub->vm_inv_eng0_req + eng, 0, 0);
> +
>   	amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
>   					    hub->vm_inv_eng0_ack + eng,
>   					    req, 1 << vmid);
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> index b8fdb192f6d6..0c41b4fdc58b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>   		6 + /* sdma_v5_0_ring_emit_pipeline_sync */
>   		/* sdma_v5_0_ring_emit_vm_flush */
>   		SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
>   		10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
>   	.emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
>   	.emit_ib = sdma_v5_0_ring_emit_ib,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-25  6:49     ` Koenig, Christian
  0 siblings, 0 replies; 30+ messages in thread
From: Koenig, Christian @ 2019-10-25  6:49 UTC (permalink / raw)
  To: Tuikov, Luben, amd-gfx; +Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric

Am 24.10.19 um 23:16 schrieb Tuikov, Luben:
> The GRBM interface is now capable of bursting
> 1-cycle op per register, a WRITE followed by
> another WRITE, or a WRITE followed by a READ--much
> faster than previous muti-cycle per
> completed-transaction interface. This causes a
> problem, whereby status registers requiring a
> read/write by hardware, have a 1-cycle delay, due
> to the register update having to go through GRBM
> interface.
>
> This patch adds this delay.
>
> A one cycle read op is added after updating the
> invalidate request and before reading the
> invalidate-ACK status.

Please completely drop all changes for GFX9 since this patch will most 
likely break SRIOV.

Additional to that please apply the workaround only to SDMA since the CP 
driven engines should handle that in firmware.

Regards,
Christian.

>
> See also commit
> 534991731cb5fa94b5519957646cf849ca10d17d.
>
> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 4 ++--
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 8 ++++++++
>   drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
>   5 files changed, 22 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index ac43b1af69e3..0042868dbd53 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>   		5 + /* COND_EXEC */
>   		7 + /* PIPELINE_SYNC */
>   		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>   		2 + /* VM_FLUSH */
>   		8 + /* FENCE for VM_FLUSH */
>   		20 + /* GDS switch */
> @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>   		5 + /* hdp invalidate */
>   		7 + /* gfx_v10_0_ring_emit_pipeline_sync */
>   		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>   		2 + /* gfx_v10_0_ring_emit_vm_flush */
>   		8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
>   	.emit_ib_size =	7, /* gfx_v10_0_ring_emit_ib_compute */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 9fe95e7693d5..9a7a717208de 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>   		5 +  /* COND_EXEC */
>   		7 +  /* PIPELINE_SYNC */
>   		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>   		2 + /* VM_FLUSH */
>   		8 +  /* FENCE for VM_FLUSH */
>   		20 + /* GDS switch */
> @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>   		5 + /* hdp invalidate */
>   		7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>   		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>   		2 + /* gfx_v9_0_ring_emit_vm_flush */
>   		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
>   	.emit_ib_size =	7, /* gfx_v9_0_ring_emit_ib_compute */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 6e1b25bd1fe7..100d526e9a42 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>   
>   	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>   
> +	/* Insert a dummy read to delay one cycle before the ACK
> +	 * inquiry.
> +	 */
> +	if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
> +	    ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
> +	    ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
> +		amdgpu_ring_emit_reg_wait(ring,
> +					  hub->vm_inv_eng0_req + eng, 0, 0);
> +
>   	/* wait for the invalidate to complete */
>   	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>   				  1 << vmid, 1 << vmid);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 9f2a893871ec..8f3097e45299 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>   	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>   			      upper_32_bits(pd_addr));
>   
> +	/* Insert a dummy read to delay one cycle before the ACK
> +	 * inquiry.
> +	 */
> +	if (ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
> +	    ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
> +		amdgpu_ring_emit_reg_wait(ring,
> +					  hub->vm_inv_eng0_req + eng, 0, 0);
> +
>   	amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
>   					    hub->vm_inv_eng0_ack + eng,
>   					    req, 1 << vmid);
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> index b8fdb192f6d6..0c41b4fdc58b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>   		6 + /* sdma_v5_0_ring_emit_pipeline_sync */
>   		/* sdma_v5_0_ring_emit_vm_flush */
>   		SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
>   		10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
>   	.emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
>   	.emit_ib = sdma_v5_0_ring_emit_ib,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* RE: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-25  3:20     ` Zhu, Changfeng
  0 siblings, 0 replies; 30+ messages in thread
From: Zhu, Changfeng @ 2019-10-25  3:20 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Tuikov, Luben,
	Koenig, Christian

Inline.


-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Tuikov, Luben
Sent: Friday, October 25, 2019 5:17 AM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Pelloux-prayer, Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com>; Tuikov, Luben <Luben.Tuikov@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
Subject: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay

The GRBM interface is now capable of bursting 1-cycle op per register, a WRITE followed by another WRITE, or a WRITE followed by a READ--much faster than the previous multi-cycle per completed-transaction interface. This causes a problem, whereby status registers requiring a read/write by hardware, have a 1-cycle delay, due to the register update having to go through the GRBM interface.

This patch adds this delay.

A one cycle read op is added after updating the invalidate request and before reading the invalidate-ACK status.

See also commit
534991731cb5fa94b5519957646cf849ca10d17d.

Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 4 ++--  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 8 ++++++++  drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
 5 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index ac43b1af69e3..0042868dbd53 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
 		5 + /* COND_EXEC */
 		7 + /* PIPELINE_SYNC */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
-		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
 		2 + /* VM_FLUSH */
 		8 + /* FENCE for VM_FLUSH */
 		20 + /* GDS switch */
@@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
 		5 + /* hdp invalidate */
 		7 + /* gfx_v10_0_ring_emit_pipeline_sync */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
-		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
 		2 + /* gfx_v10_0_ring_emit_vm_flush */
 		8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
 	.emit_ib_size =	7, /* gfx_v10_0_ring_emit_ib_compute */
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 9fe95e7693d5..9a7a717208de 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
 		5 +  /* COND_EXEC */
 		7 +  /* PIPELINE_SYNC */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
-		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
 		2 + /* VM_FLUSH */
 		8 +  /* FENCE for VM_FLUSH */
 		20 + /* GDS switch */
@@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
 		5 + /* hdp invalidate */
 		7 + /* gfx_v9_0_ring_emit_pipeline_sync */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
-		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
 		2 + /* gfx_v9_0_ring_emit_vm_flush */
 		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
 	.emit_ib_size =	7, /* gfx_v9_0_ring_emit_ib_compute */
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 6e1b25bd1fe7..100d526e9a42 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 
 	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
 
+	/* Insert a dummy read to delay one cycle before the ACK
+	 * inquiry.
+	 */
+	if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
+	    ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
+	    ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
+		amdgpu_ring_emit_reg_wait(ring,
+					  hub->vm_inv_eng0_req + eng, 0, 0);
+
 	/* wait for the invalidate to complete */
 	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
 				  1 << vmid, 1 << vmid);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 9f2a893871ec..8f3097e45299 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
 			      upper_32_bits(pd_addr));
 
+	/* Insert a dummy read to delay one cycle before the ACK
+	 * inquiry.
+	 */
+	if (ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
+	    ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
+		amdgpu_ring_emit_reg_wait(ring,
+					  hub->vm_inv_eng0_req + eng, 0, 0);
+


	Why do we add amdgpu_ring_emit_reg_wait here? Unlike gmc10, there is no amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req); before it.
	In gmc9, amdgpu_ring_emit_wreg and amdgpu_ring_emit_reg_wait are called in amdgpu_ring_emit_reg_write_reg_wait.
	I think it may be more reasonable to add the dummy amdgpu_ring_emit_reg_wait in amdgpu_ring_emit_reg_write_reg_wait.
	Besides, we should also think about the influence of SRIOV's patch:
	drm/amdgpu: Remove the sriov checking and add firmware checking



 	amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
 					    hub->vm_inv_eng0_ack + eng,
 					    req, 1 << vmid);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index b8fdb192f6d6..0c41b4fdc58b 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
 		6 + /* sdma_v5_0_ring_emit_pipeline_sync */
 		/* sdma_v5_0_ring_emit_vm_flush */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
-		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
 		10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
 	.emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
 	.emit_ib = sdma_v5_0_ring_emit_ib,
--
2.23.0.385.gbc12974a89

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* RE: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-25  3:20     ` Zhu, Changfeng
  0 siblings, 0 replies; 30+ messages in thread
From: Zhu, Changfeng @ 2019-10-25  3:20 UTC (permalink / raw)
  To: Tuikov, Luben, amd-gfx
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Tuikov, Luben,
	Koenig, Christian

Inline.


-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Tuikov, Luben
Sent: Friday, October 25, 2019 5:17 AM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Pelloux-prayer, Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com>; Tuikov, Luben <Luben.Tuikov@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
Subject: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay

The GRBM interface is now capable of bursting 1-cycle op per register, a WRITE followed by another WRITE, or a WRITE followed by a READ--much faster than the previous multi-cycle per completed-transaction interface. This causes a problem, whereby status registers requiring a read/write by hardware, have a 1-cycle delay, due to the register update having to go through the GRBM interface.

This patch adds this delay.

A one cycle read op is added after updating the invalidate request and before reading the invalidate-ACK status.

See also commit
534991731cb5fa94b5519957646cf849ca10d17d.

Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 4 ++--  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 8 ++++++++  drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
 5 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index ac43b1af69e3..0042868dbd53 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
 		5 + /* COND_EXEC */
 		7 + /* PIPELINE_SYNC */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
-		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
 		2 + /* VM_FLUSH */
 		8 + /* FENCE for VM_FLUSH */
 		20 + /* GDS switch */
@@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
 		5 + /* hdp invalidate */
 		7 + /* gfx_v10_0_ring_emit_pipeline_sync */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
-		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
 		2 + /* gfx_v10_0_ring_emit_vm_flush */
 		8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
 	.emit_ib_size =	7, /* gfx_v10_0_ring_emit_ib_compute */
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 9fe95e7693d5..9a7a717208de 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
 		5 +  /* COND_EXEC */
 		7 +  /* PIPELINE_SYNC */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
-		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
 		2 + /* VM_FLUSH */
 		8 +  /* FENCE for VM_FLUSH */
 		20 + /* GDS switch */
@@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
 		5 + /* hdp invalidate */
 		7 + /* gfx_v9_0_ring_emit_pipeline_sync */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
-		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
 		2 + /* gfx_v9_0_ring_emit_vm_flush */
 		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
 	.emit_ib_size =	7, /* gfx_v9_0_ring_emit_ib_compute */
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 6e1b25bd1fe7..100d526e9a42 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 
 	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
 
+	/* Insert a dummy read to delay one cycle before the ACK
+	 * inquiry.
+	 */
+	if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
+	    ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
+	    ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
+		amdgpu_ring_emit_reg_wait(ring,
+					  hub->vm_inv_eng0_req + eng, 0, 0);
+
 	/* wait for the invalidate to complete */
 	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
 				  1 << vmid, 1 << vmid);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 9f2a893871ec..8f3097e45299 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
 			      upper_32_bits(pd_addr));
 
+	/* Insert a dummy read to delay one cycle before the ACK
+	 * inquiry.
+	 */
+	if (ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
+	    ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
+		amdgpu_ring_emit_reg_wait(ring,
+					  hub->vm_inv_eng0_req + eng, 0, 0);
+


	Why do we add amdgpu_ring_emit_reg_wait here? There is no amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req); before it, unlike in gmc10.
	In gmc9, amdgpu_ring_emit_wreg and amdgpu_ring_emit_reg_wait are called within amdgpu_ring_emit_reg_write_reg_wait.
	I think it may be more reasonable to add the dummy amdgpu_ring_emit_reg_wait inside amdgpu_ring_emit_reg_write_reg_wait.
	Besides, we should also think about the influence of SRIOV's patch:
	drm/amdgpu: Remove the sriov checking and add firmware checking



 	amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
 					    hub->vm_inv_eng0_ack + eng,
 					    req, 1 << vmid);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index b8fdb192f6d6..0c41b4fdc58b 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
 		6 + /* sdma_v5_0_ring_emit_pipeline_sync */
 		/* sdma_v5_0_ring_emit_vm_flush */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
-		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
 		10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
 	.emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
 	.emit_ib = sdma_v5_0_ring_emit_ib,
--
2.23.0.385.gbc12974a89

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-24 21:16 ` Tuikov, Luben
  0 siblings, 0 replies; 30+ messages in thread
From: Tuikov, Luben @ 2019-10-24 21:16 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Tuikov, Luben,
	Koenig, Christian

The GRBM interface is now capable of bursting
1-cycle op per register, a WRITE followed by
another WRITE, or a WRITE followed by a READ--much
faster than previous multi-cycle per
completed-transaction interface. This causes a
problem, whereby status registers requiring a
read/write by hardware, have a 1-cycle delay, due
to the register update having to go through GRBM
interface.

This patch adds this delay.

A one cycle read op is added after updating the
invalidate request and before reading the
invalidate-ACK status.

See also commit
534991731cb5fa94b5519957646cf849ca10d17d.

Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 4 ++--
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 8 ++++++++
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
 5 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index ac43b1af69e3..0042868dbd53 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
 		5 + /* COND_EXEC */
 		7 + /* PIPELINE_SYNC */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
-		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
 		2 + /* VM_FLUSH */
 		8 + /* FENCE for VM_FLUSH */
 		20 + /* GDS switch */
@@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
 		5 + /* hdp invalidate */
 		7 + /* gfx_v10_0_ring_emit_pipeline_sync */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
-		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
 		2 + /* gfx_v10_0_ring_emit_vm_flush */
 		8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
 	.emit_ib_size =	7, /* gfx_v10_0_ring_emit_ib_compute */
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 9fe95e7693d5..9a7a717208de 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
 		5 +  /* COND_EXEC */
 		7 +  /* PIPELINE_SYNC */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
-		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
 		2 + /* VM_FLUSH */
 		8 +  /* FENCE for VM_FLUSH */
 		20 + /* GDS switch */
@@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
 		5 + /* hdp invalidate */
 		7 + /* gfx_v9_0_ring_emit_pipeline_sync */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
-		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
 		2 + /* gfx_v9_0_ring_emit_vm_flush */
 		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
 	.emit_ib_size =	7, /* gfx_v9_0_ring_emit_ib_compute */
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 6e1b25bd1fe7..100d526e9a42 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 
 	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
 
+	/* Insert a dummy read to delay one cycle before the ACK
+	 * inquiry.
+	 */
+	if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
+	    ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
+	    ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
+		amdgpu_ring_emit_reg_wait(ring,
+					  hub->vm_inv_eng0_req + eng, 0, 0);
+
 	/* wait for the invalidate to complete */
 	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
 				  1 << vmid, 1 << vmid);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 9f2a893871ec..8f3097e45299 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
 			      upper_32_bits(pd_addr));
 
+	/* Insert a dummy read to delay one cycle before the ACK
+	 * inquiry.
+	 */
+	if (ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
+	    ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
+		amdgpu_ring_emit_reg_wait(ring,
+					  hub->vm_inv_eng0_req + eng, 0, 0);
+
 	amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
 					    hub->vm_inv_eng0_ack + eng,
 					    req, 1 << vmid);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index b8fdb192f6d6..0c41b4fdc58b 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
 		6 + /* sdma_v5_0_ring_emit_pipeline_sync */
 		/* sdma_v5_0_ring_emit_vm_flush */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
-		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
 		10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
 	.emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
 	.emit_ib = sdma_v5_0_ring_emit_ib,
-- 
2.23.0.385.gbc12974a89

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
@ 2019-10-24 21:16 ` Tuikov, Luben
  0 siblings, 0 replies; 30+ messages in thread
From: Tuikov, Luben @ 2019-10-24 21:16 UTC (permalink / raw)
  To: amd-gfx
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Tuikov, Luben,
	Koenig, Christian

The GRBM interface is now capable of bursting
1-cycle op per register, a WRITE followed by
another WRITE, or a WRITE followed by a READ--much
faster than previous multi-cycle per
completed-transaction interface. This causes a
problem, whereby status registers requiring a
read/write by hardware, have a 1-cycle delay, due
to the register update having to go through GRBM
interface.

This patch adds this delay.

A one cycle read op is added after updating the
invalidate request and before reading the
invalidate-ACK status.

See also commit
534991731cb5fa94b5519957646cf849ca10d17d.

Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 4 ++--
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 8 ++++++++
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
 5 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index ac43b1af69e3..0042868dbd53 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
 		5 + /* COND_EXEC */
 		7 + /* PIPELINE_SYNC */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
-		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
 		2 + /* VM_FLUSH */
 		8 + /* FENCE for VM_FLUSH */
 		20 + /* GDS switch */
@@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
 		5 + /* hdp invalidate */
 		7 + /* gfx_v10_0_ring_emit_pipeline_sync */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
-		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
 		2 + /* gfx_v10_0_ring_emit_vm_flush */
 		8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
 	.emit_ib_size =	7, /* gfx_v10_0_ring_emit_ib_compute */
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 9fe95e7693d5..9a7a717208de 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
 		5 +  /* COND_EXEC */
 		7 +  /* PIPELINE_SYNC */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
-		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
 		2 + /* VM_FLUSH */
 		8 +  /* FENCE for VM_FLUSH */
 		20 + /* GDS switch */
@@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
 		5 + /* hdp invalidate */
 		7 + /* gfx_v9_0_ring_emit_pipeline_sync */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
-		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
 		2 + /* gfx_v9_0_ring_emit_vm_flush */
 		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
 	.emit_ib_size =	7, /* gfx_v9_0_ring_emit_ib_compute */
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 6e1b25bd1fe7..100d526e9a42 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 
 	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
 
+	/* Insert a dummy read to delay one cycle before the ACK
+	 * inquiry.
+	 */
+	if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
+	    ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
+	    ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
+		amdgpu_ring_emit_reg_wait(ring,
+					  hub->vm_inv_eng0_req + eng, 0, 0);
+
 	/* wait for the invalidate to complete */
 	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
 				  1 << vmid, 1 << vmid);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 9f2a893871ec..8f3097e45299 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
 			      upper_32_bits(pd_addr));
 
+	/* Insert a dummy read to delay one cycle before the ACK
+	 * inquiry.
+	 */
+	if (ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
+	    ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
+		amdgpu_ring_emit_reg_wait(ring,
+					  hub->vm_inv_eng0_req + eng, 0, 0);
+
 	amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
 					    hub->vm_inv_eng0_ack + eng,
 					    req, 1 << vmid);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index b8fdb192f6d6..0c41b4fdc58b 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
 		6 + /* sdma_v5_0_ring_emit_pipeline_sync */
 		/* sdma_v5_0_ring_emit_vm_flush */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
-		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
 		10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
 	.emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
 	.emit_ib = sdma_v5_0_ring_emit_ib,
-- 
2.23.0.385.gbc12974a89

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 30+ messages in thread

end of thread, other threads:[~2019-10-28 13:38 UTC | newest]

Thread overview: 30+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-10-25  9:26 [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay Huang, Ray
2019-10-25  9:26 ` Huang, Ray
     [not found] ` <MN2PR12MB33095371C6336C43E4F88C43EC650-rweVpJHSKTpWdvXm18W95QdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-10-25 14:22   ` Zhu, Changfeng
2019-10-25 14:22     ` Zhu, Changfeng
     [not found]     ` <MN2PR12MB28967F025FA60291AE745FE6FD650-rweVpJHSKToIQ/pRnFqe/QdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-10-25 15:53       ` Koenig, Christian
2019-10-25 15:53         ` Koenig, Christian
     [not found]         ` <b54e3e37-ff15-079f-9b62-be7936836672-5C7GfCeVMHo@public.gmane.org>
2019-10-28  3:01           ` Zhu, Changfeng
2019-10-28  3:01             ` Zhu, Changfeng
     [not found]             ` <MN2PR12MB2896E32084545C8EB240BC45FD660-rweVpJHSKToIQ/pRnFqe/QdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-10-28 10:46               ` Koenig, Christian
2019-10-28 10:46                 ` Koenig, Christian
     [not found]                 ` <924c7758-92ed-caf6-8068-ca12d7d77ed7-5C7GfCeVMHo@public.gmane.org>
2019-10-28 12:07                   ` Zhu, Changfeng
2019-10-28 12:07                     ` Zhu, Changfeng
  -- strict thread matches above, loose matches on Subject: below --
2019-10-28 13:38 Koenig, Christian
2019-10-28 13:38 ` Koenig, Christian
2019-10-24 21:16 Tuikov, Luben
2019-10-24 21:16 ` Tuikov, Luben
     [not found] ` <20191024211430.25399-1-luben.tuikov-5C7GfCeVMHo@public.gmane.org>
2019-10-25  3:20   ` Zhu, Changfeng
2019-10-25  3:20     ` Zhu, Changfeng
2019-10-25  6:49   ` Koenig, Christian
2019-10-25  6:49     ` Koenig, Christian
     [not found]     ` <6be2805a-dddc-7b02-84ea-f52fab9780b0-5C7GfCeVMHo@public.gmane.org>
2019-10-25 16:05       ` Alex Deucher
2019-10-25 16:05         ` Alex Deucher
     [not found]         ` <CADnq5_NsTABDWTMBFcQBGfaBganBpzN+YQ0gmw55pa8PswNZYA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2019-10-25 16:19           ` Koenig, Christian
2019-10-25 16:19             ` Koenig, Christian
     [not found]             ` <b40c78f1-17a5-f0f9-183e-0c78fd7163e9-5C7GfCeVMHo@public.gmane.org>
2019-10-25 22:45               ` Tuikov, Luben
2019-10-25 22:45                 ` Tuikov, Luben
     [not found]                 ` <c3e496c7-2ace-149e-0c51-92dd1342d31d-5C7GfCeVMHo@public.gmane.org>
2019-10-26 12:09                   ` Koenig, Christian
2019-10-26 12:09                     ` Koenig, Christian
     [not found]                     ` <122f3bde-5fd0-1fa5-864c-547c0cefb744-5C7GfCeVMHo@public.gmane.org>
2019-10-27 21:25                       ` Tuikov, Luben
2019-10-27 21:25                         ` Tuikov, Luben

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.