* Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
From: Huang, Ray @ 2019-10-25  9:26 UTC
  To: Tuikov, Luben
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Koenig,
	Christian, amd-gfx@lists.freedesktop.org

On Thu, Oct 24, 2019 at 09:16:55PM +0000, Tuikov, Luben wrote:
> The GRBM interface is now capable of bursting 1-cycle op per register:
> a WRITE followed by another WRITE, or a WRITE followed by a READ--much
> faster than the previous multi-cycle per completed-transaction
> interface. This causes a problem whereby status registers requiring a
> read/write by hardware have a 1-cycle delay, due to the register update
> having to go through the GRBM interface.
> 
> This patch adds this delay.
> 
> A one cycle read op is added after updating the invalidate request and 
> before reading the invalidate-ACK status.
> 
> See also commit
> 534991731cb5fa94b5519957646cf849ca10d17d.
> 
> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 4 ++--
>  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
>  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 8 ++++++++
>  drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
>  5 files changed, 22 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index ac43b1af69e3..0042868dbd53 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>  		5 + /* COND_EXEC */
>  		7 + /* PIPELINE_SYNC */
>  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>  		2 + /* VM_FLUSH */
>  		8 + /* FENCE for VM_FLUSH */
>  		20 + /* GDS switch */
> @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>  		5 + /* hdp invalidate */
>  		7 + /* gfx_v10_0_ring_emit_pipeline_sync */
>  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>  		2 + /* gfx_v10_0_ring_emit_vm_flush */
>  		8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
>  	.emit_ib_size =	7, /* gfx_v10_0_ring_emit_ib_compute */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 9fe95e7693d5..9a7a717208de 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>  		5 +  /* COND_EXEC */
>  		7 +  /* PIPELINE_SYNC */
>  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>  		2 + /* VM_FLUSH */
>  		8 +  /* FENCE for VM_FLUSH */
>  		20 + /* GDS switch */
> @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>  		5 + /* hdp invalidate */
>  		7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>  		2 + /* gfx_v9_0_ring_emit_vm_flush */
>  		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
>  	.emit_ib_size =	7, /* gfx_v9_0_ring_emit_ib_compute */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 6e1b25bd1fe7..100d526e9a42 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>  
>  	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>  
> +	/* Insert a dummy read to delay one cycle before the ACK
> +	 * inquiry.
> +	 */
> +	if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
> +	    ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
> +	    ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
> +		amdgpu_ring_emit_reg_wait(ring,
> +					  hub->vm_inv_eng0_req + eng, 0, 0);
> +
>  	/* wait for the invalidate to complete */
>  	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>  				  1 << vmid, 1 << vmid);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 9f2a893871ec..8f3097e45299 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>  	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>  			      upper_32_bits(pd_addr));
>  
> +	/* Insert a dummy read to delay one cycle before the ACK
> +	 * inquiry.
> +	 */
> +	if (ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
> +	    ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
> +		amdgpu_ring_emit_reg_wait(ring,
> +					  hub->vm_inv_eng0_req + eng, 0, 0);

The workaround should be to add a dummy read (a one-cycle delay) after we write VM_INVALIDATE_ENGx_REQ and before we poll VM_INVALIDATE_ENGx_ACK.
If you add it here, it cannot resolve the issue. I think you should implement the dummy read in the function used below, amdgpu_ring_emit_reg_write_reg_wait().
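
For illustration, a minimal sketch of that idea, with the dummy read placed in the generic helper (this is essentially what Changfeng tries later in the thread; note Christian's caveat further down that an unconditional read here breaks read-triggered registers):

	void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
							uint32_t reg0, uint32_t reg1,
							uint32_t ref, uint32_t mask)
	{
		amdgpu_ring_emit_wreg(ring, reg0, ref);
		/* Dummy read of reg0: one extra GRBM cycle before polling reg1. */
		amdgpu_ring_emit_rreg(ring, reg0);
		amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask);
	}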

Thanks,
Ray

> +
>  	amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
>  					    hub->vm_inv_eng0_ack + eng,
>  					    req, 1 << vmid);
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> index b8fdb192f6d6..0c41b4fdc58b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>  		6 + /* sdma_v5_0_ring_emit_pipeline_sync */
>  		/* sdma_v5_0_ring_emit_vm_flush */
>  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
>  		10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
>  	.emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
>  	.emit_ib = sdma_v5_0_ring_emit_ib,
> --
> 2.23.0.385.gbc12974a89
> 
* RE: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
From: Koenig, Christian @ 2019-10-28 13:38 UTC
  To: Zhu, Changfeng
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Huang, Ray,
	Tuikov, Luben, amd-gfx@lists.freedesktop.org



I think we should implement the write/wait combined command in gfx10.
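
Concretely, that could mirror gfx_v9_0_wait_reg_mem. A sketch only, assuming the WAIT_REG_MEM_* field macros are defined for gfx10 the same way gfx_v9_0.c defines them:

	static void gfx_v10_0_wait_reg_mem(struct amdgpu_ring *ring, int eng_sel,
					   int mem_space, int opt, uint32_t addr0,
					   uint32_t addr1, uint32_t ref,
					   uint32_t mask, uint32_t inv)
	{
		amdgpu_ring_write(ring, PACKET3(PACKET3_WAIT_REG_MEM, 5));
		amdgpu_ring_write(ring,
				  /* memory (1) or register (0) space, wait operation */
				  (WAIT_REG_MEM_MEM_SPACE(mem_space) |
				   WAIT_REG_MEM_OPERATION(opt) |
				   WAIT_REG_MEM_FUNCTION(3) | /* equal */
				   WAIT_REG_MEM_ENGINE(eng_sel)));
		amdgpu_ring_write(ring, addr0);
		amdgpu_ring_write(ring, addr1);
		amdgpu_ring_write(ring, ref);
		amdgpu_ring_write(ring, mask);
		amdgpu_ring_write(ring, inv); /* poll interval */
	}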

Did we ever release any firmware which couldn't do this?

Christian.

On 28.10.2019 13:07, "Zhu, Changfeng" <Changfeng.Zhu@amd.com> wrote:
Hi Christian,

Should we also implement the equivalent of gfx_v9_0_wait_reg_mem in gfx10, like gfx9, since gfx10 also implements the write/wait command in a single packet after CL#1761300?

Or should we add the dummy read in gmc10 using emit_wait, as in Luben's approach?
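
For reference, Luben's approach (in the patch quoted at the bottom of this page) emits a zero-reference, zero-mask register wait on the REQ register as the dummy read:

	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_req + eng, 0, 0);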

BR,
Changfeng.

-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com>
Sent: Monday, October 28, 2019 6:47 PM
To: Zhu, Changfeng <Changfeng.Zhu@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Pelloux-prayer, Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com>; Huang, Ray <Ray.Huang@amd.com>; Tuikov, Luben <Luben.Tuikov@amd.com>
Subject: Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay

Hi Changfeng,

> So how can we deal with firmware between MEC version 402 and MEC version 421?
Well, offhand I see only two options: either print a warning or completely reject loading the driver.

Completely rejecting loading the driver is probably not a good idea, and the issue is actually extremely unlikely to cause any problems.

So printing a warning that the user should update their firmware is probably the best approach.
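
For example, something like this. A sketch only; the exact condition and message are assumptions, keyed off the me_fw_write_wait/mec_fw_write_wait flags from Emily's patch, with the version thresholds raised to the dummy-read-capable firmware:

	/* Hypothetical: warn when the CP firmware is too old to have
	 * the GRBM dummy read built into its write/wait packet.
	 */
	if (!(adev->gfx.me_fw_write_wait && adev->gfx.mec_fw_write_wait))
		DRM_WARN_ONCE("CP firmware version too old, please update!");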

Regards,
Christian.

On 28.10.19 at 04:01, Zhu, Changfeng wrote:
> Hi Christian,
>
> Re: that won't work; you can't add this to
> amdgpu_ring_emit_reg_write_reg_wait_helper or you'll break all read-triggered registers (like the semaphore ones).
>
> Do you mean that I should use a register wait (wait_reg_mem), as Luben does, instead of a read-triggered register to add the dummy read?
>
> Re: In addition to that, it will never work on GFX9, since the CP firmware there uses the integrated write/wait command and you can't add an additional dummy read there.
>
> Yes, I see the integrated write/wait command; it is implemented in gfx_v9_0_wait_reg_mem.
> Emily's patch,
> "drm/amdgpu: Remove the sriov checking and add firmware checking",
> decides when to go into gfx_v9_0_wait_reg_mem and when to go into amdgpu_ring_emit_reg_write_reg_wait_helper.
>
> However, there are two problems now.
> 1. Before the fw_version_ok firmware version, the code goes into amdgpu_ring_emit_reg_write_reg_wait_helper. In this case, shouldn't we add the dummy read in amdgpu_ring_emit_reg_write_reg_wait_helper?
> 2. After the fw_version_ok firmware version, the code goes into gfx_v9_0_wait_reg_mem. However, that implements the write/wait command in firmware, so how can we add the dummy read there? According to Yang, Zilong, the CP firmware has implemented the dummy read since these CLs:
> Vega20 CL#1762470 @3/27/2019
> Navi10 CL#1761300 @3/25/2019
> According to CL#1762470,
> the firmware which implements the dummy read is (Raven, for example):
> Mec version:
> #define F32_MEC_UCODE_VERSION "#421"
> #define F32_MEC_FEATURE_VERSION 46
> Pfp version:
> #define F32_PFP_UCODE_VERSION "#183"
> #define F32_PFP_FEATURE_VERSION 46
> In Emily's patch:
> The CP firmware which uses the integrated write/wait command begins from these versions:
> +       case CHIP_RAVEN:
> +               if ((adev->gfx.me_fw_version >= 0x0000009c) &&
> +                   (adev->gfx.me_feature_version >= 42) &&
> +                   (adev->gfx.pfp_fw_version >=  0x000000b1(177)) &&
> +                   (adev->gfx.pfp_feature_version >= 42))
> +                       adev->gfx.me_fw_write_wait = true;
> +
> +               if ((adev->gfx.mec_fw_version >=  0x00000192(402)) &&
> +                   (adev->gfx.mec_feature_version >= 42))
> +                       adev->gfx.mec_fw_write_wait = true;
> +               break;
>
> So how can we deal with firmware between MEC version 402 and MEC version 421?
> It implements the write/wait command in CP firmware but doesn't have the dummy read.
>
> BR,
> Changfeng.
>
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Friday, October 25, 2019 11:54 PM
> To: Zhu, Changfeng <Changfeng.Zhu@amd.com>;
> amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Pelloux-prayer,
> Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com>; Huang, Ray
> <Ray.Huang@amd.com>; Tuikov, Luben <Luben.Tuikov@amd.com>
> Subject: Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle
> delay
>
> Hi Changfeng,
>
> that won't work; you can't add this to
> amdgpu_ring_emit_reg_write_reg_wait_helper or you'll break all read-triggered registers (like the semaphore ones).
>
> In addition to that, it will never work on GFX9, since the CP firmware there uses the integrated write/wait command and you can't add an additional dummy read there.
>
> Regards,
> Christian.
>
> Am 25.10.19 um 16:22 schrieb Zhu, Changfeng:
>> I try to write a patch based on the patch of Tuikov,Luben.
>>
>> Inspired by Luben,here is the patch:
>>
>> From 1980d8f1ed44fb9a84a5ea1f6e2edd2bc25c629a Mon Sep 17 00:00:00 2001
>> From: changzhu <Changfeng.Zhu@amd.com>
>> Date: Thu, 10 Oct 2019 11:02:33 +0800
>> Subject: [PATCH] drm/amdgpu: add dummy read by engines for some GCVM status
>>    registers
>>
>> The GRBM register interface is now capable of bursting 1 cycle per
>> register (wr->wr, wr->rd), much faster than the previous multi-cycle
>> per-transaction-done interface. This has caused a problem where status
>> registers requiring HW to update have a 1-cycle delay, due to the
>> register update having to go through GRBM.
>>
>> SW may operate on an incorrect value if it writes a register and
>> immediately checks the corresponding status register.
>>
>> Registers requiring HW to clear or set fields may be delayed by 1 cycle.
>> For example,
>>
>> 1. Write VM_INVALIDATE_ENG0_REQ mask = 5a
>> 2. Read VM_INVALIDATE_ENG0_ACK until the ack is the same as the request mask = 5a
>>       a. HW will reset VM_INVALIDATE_ENG0_ACK = 0 until the invalidation is complete
>> 3. Write VM_INVALIDATE_ENG0_REQ mask = 5a
>> 4. Read VM_INVALIDATE_ENG0_ACK until the ack is the same as the request mask = 5a
>>       a. First read of VM_INVALIDATE_ENG0_ACK = 5a instead of 0
>>       b. Second read of VM_INVALIDATE_ENG0_ACK = 0 because the remote GRBM h/w
>>          register takes one extra cycle to be cleared
>>       c. In this case, SW will see a false ACK if it exits on the first read
>>
>> Affected registers (only GC variant)    | Recommended dummy read
>> ----------------------------------------+----------------------------------------
>> VM_INVALIDATE_ENG*_ACK                  | VM_INVALIDATE_ENG*_REQ
>> VM_L2_STATUS                            | VM_L2_STATUS
>> VM_L2_PROTECTION_FAULT_STATUS           | VM_L2_PROTECTION_FAULT_STATUS
>> VM_L2_PROTECTION_FAULT_ADDR_HI/LO32     | VM_L2_PROTECTION_FAULT_ADDR_HI/LO32
>> VM_L2_IH_LOG_BUSY                       | VM_L2_IH_LOG_BUSY
>> MC_VM_L2_PERFCOUNTER_HI/LO              | MC_VM_L2_PERFCOUNTER_HI/LO
>> ATC_L2_PERFCOUNTER_HI/LO                | ATC_L2_PERFCOUNTER_HI/LO
>> ATC_L2_PERFCOUNTER2_HI/LO               | ATC_L2_PERFCOUNTER2_HI/LO
>>
>> Dummy reads by the engines are also needed for these GC registers.
>>
>> Change-Id: Ie028f37eb789966d4593984bd661b248ebeb1ac3
>> Signed-off-by: changzhu <Changfeng.Zhu@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c |  5 +++++
>>    drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   |  2 ++
>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    |  2 ++
>>    drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c   |  4 ++++
>>    drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c   | 18 ++++++++++++++++++
>>    5 files changed, 31 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> index 4b3f58dbf36f..c2fbf6087ecf 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> @@ -392,6 +392,11 @@ void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
>>                                               uint32_t ref, uint32_t mask)
>>    {
>>       amdgpu_ring_emit_wreg(ring, reg0, ref);
>> +
>> +    /* wait for a cycle to reset vm_inv_eng0_ack */
>> +    if (ring->funcs->vmhub == AMDGPU_GFXHUB_0)
>> +            amdgpu_ring_emit_rreg(ring, reg0);
>> +
>>       amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask);
>>    }
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> index ef1975a5323a..104c47734316 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> @@ -5155,6 +5155,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>>       .patch_cond_exec = gfx_v10_0_ring_emit_patch_cond_exec,
>>       .preempt_ib = gfx_v10_0_ring_preempt_ib,
>>       .emit_tmz = gfx_v10_0_ring_emit_tmz,
>> +    .emit_rreg = gfx_v10_0_ring_emit_rreg,
>>       .emit_wreg = gfx_v10_0_ring_emit_wreg,
>>       .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
>>    };
>> @@ -5188,6 +5189,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>>       .test_ib = gfx_v10_0_ring_test_ib,
>>       .insert_nop = amdgpu_ring_insert_nop,
>>       .pad_ib = amdgpu_ring_generic_pad_ib,
>> +    .emit_rreg = gfx_v10_0_ring_emit_rreg,
>>       .emit_wreg = gfx_v10_0_ring_emit_wreg,
>>       .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
>>    };
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> index 2f03bf533d41..d00b53de0fdc 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> @@ -6253,6 +6253,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>>       .init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec,
>>       .patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec,
>>       .emit_tmz = gfx_v9_0_ring_emit_tmz,
>> +    .emit_rreg = gfx_v9_0_ring_emit_rreg,
>>       .emit_wreg = gfx_v9_0_ring_emit_wreg,
>>       .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>>       .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
>> @@ -6289,6 +6290,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>>       .insert_nop = amdgpu_ring_insert_nop,
>>       .pad_ib = amdgpu_ring_generic_pad_ib,
>>       .set_priority = gfx_v9_0_ring_set_priority_compute,
>> +    .emit_rreg = gfx_v9_0_ring_emit_rreg,
>>       .emit_wreg = gfx_v9_0_ring_emit_wreg,
>>       .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>>       .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> index 3b00bce14cfb..dce6b651da1f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> @@ -346,6 +346,10 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>
>>       amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>>
>> +    /* wait for a cycle to reset vm_inv_eng0_ack */
>> +    if (ring->funcs->vmhub == AMDGPU_GFXHUB_0)
>> +            amdgpu_ring_emit_rreg(ring, hub->vm_inv_eng0_req + eng);
>> +
>>       /* wait for the invalidate to complete */
>>       amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>>                                 1 << vmid, 1 << vmid);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>> index 3460c00f3eaa..baaa33467882 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>> @@ -38,6 +38,7 @@
>>    #include "navi10_sdma_pkt_open.h"
>>    #include "nbio_v2_3.h"
>>    #include "sdma_v5_0.h"
>> +#include "nvd.h"
>>
>>    MODULE_FIRMWARE("amdgpu/navi10_sdma.bin");
>>    MODULE_FIRMWARE("amdgpu/navi10_sdma1.bin");
>> @@ -1147,6 +1148,22 @@ static void sdma_v5_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
>>       amdgpu_gmc_emit_flush_gpu_tlb(ring, vmid, pd_addr);
>>    }
>>
>> +static void sdma_v5_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>> +{
>> +    struct amdgpu_device *adev = ring->adev;
>> +
>> +    amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>> +    amdgpu_ring_write(ring, 0 | /* src: register*/
>> +                            (5 << 8) |  /* dst: memory */
>> +                            (1 << 20)); /* write confirm */
>> +    amdgpu_ring_write(ring, reg);
>> +    amdgpu_ring_write(ring, 0);
>> +    amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> +                            adev->virt.reg_val_offs * 4));
>> +    amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> +                            adev->virt.reg_val_offs * 4));
>> +}
>> +
>>    static void sdma_v5_0_ring_emit_wreg(struct amdgpu_ring *ring,
>>                                    uint32_t reg, uint32_t val)
>>    {
>> @@ -1597,6 +1614,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>>       .test_ib = sdma_v5_0_ring_test_ib,
>>       .insert_nop = sdma_v5_0_ring_insert_nop,
>>       .pad_ib = sdma_v5_0_ring_pad_ib,
>> +    .emit_rreg = sdma_v5_0_ring_emit_rreg,
>>       .emit_wreg = sdma_v5_0_ring_emit_wreg,
>>       .emit_reg_wait = sdma_v5_0_ring_emit_reg_wait,
>>       .init_cond_exec = sdma_v5_0_ring_init_cond_exec,



* [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay
From: Tuikov, Luben @ 2019-10-24 21:16 UTC
  To: amd-gfx@lists.freedesktop.org
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Tuikov, Luben,
	Koenig, Christian

The GRBM interface is now capable of bursting
1-cycle op per register: a WRITE followed by
another WRITE, or a WRITE followed by a READ--much
faster than the previous multi-cycle per
completed-transaction interface. This causes a
problem whereby status registers requiring a
read/write by hardware have a 1-cycle delay, due
to the register update having to go through the
GRBM interface.

This patch adds this delay.

A one cycle read op is added after updating the
invalidate request and before reading the
invalidate-ACK status.

See also commit
534991731cb5fa94b5519957646cf849ca10d17d.

Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 4 ++--
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 8 ++++++++
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
 5 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index ac43b1af69e3..0042868dbd53 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
 		5 + /* COND_EXEC */
 		7 + /* PIPELINE_SYNC */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
-		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
 		2 + /* VM_FLUSH */
 		8 + /* FENCE for VM_FLUSH */
 		20 + /* GDS switch */
@@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
 		5 + /* hdp invalidate */
 		7 + /* gfx_v10_0_ring_emit_pipeline_sync */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
-		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
 		2 + /* gfx_v10_0_ring_emit_vm_flush */
 		8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
 	.emit_ib_size =	7, /* gfx_v10_0_ring_emit_ib_compute */
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 9fe95e7693d5..9a7a717208de 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
 		5 +  /* COND_EXEC */
 		7 +  /* PIPELINE_SYNC */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
-		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
 		2 + /* VM_FLUSH */
 		8 +  /* FENCE for VM_FLUSH */
 		20 + /* GDS switch */
@@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
 		5 + /* hdp invalidate */
 		7 + /* gfx_v9_0_ring_emit_pipeline_sync */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
-		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
 		2 + /* gfx_v9_0_ring_emit_vm_flush */
 		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
 	.emit_ib_size =	7, /* gfx_v9_0_ring_emit_ib_compute */
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 6e1b25bd1fe7..100d526e9a42 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 
 	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
 
+	/* Insert a dummy read to delay one cycle before the ACK
+	 * inquiry.
+	 */
+	if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
+	    ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
+	    ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
+		amdgpu_ring_emit_reg_wait(ring,
+					  hub->vm_inv_eng0_req + eng, 0, 0);
+
 	/* wait for the invalidate to complete */
 	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
 				  1 << vmid, 1 << vmid);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 9f2a893871ec..8f3097e45299 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
 			      upper_32_bits(pd_addr));
 
+	/* Insert a dummy read to delay one cycle before the ACK
+	 * inquiry.
+	 */
+	if (ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
+	    ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
+		amdgpu_ring_emit_reg_wait(ring,
+					  hub->vm_inv_eng0_req + eng, 0, 0);
+
 	amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
 					    hub->vm_inv_eng0_ack + eng,
 					    req, 1 << vmid);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index b8fdb192f6d6..0c41b4fdc58b 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
 		6 + /* sdma_v5_0_ring_emit_pipeline_sync */
 		/* sdma_v5_0_ring_emit_vm_flush */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
-		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
+		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
 		10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
 	.emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
 	.emit_ib = sdma_v5_0_ring_emit_ib,
-- 
2.23.0.385.gbc12974a89

