* [PATCH] drm/amdgpu:guarantee 128dws between vm flush and IB
@ 2017-01-18  5:55 Monk Liu
From: Monk Liu @ 2017-01-18  5:55 UTC (permalink / raw)
  To: amd-gfx@lists.freedesktop.org; +Cc: Monk Liu

previously we always inserted 128 NOPs after vm_flush, which
may push the DMAframe size above 256 dw, so it gets automatically
aligned to 512 dw.

now we calculate how many DWs were already inserted after vm_flush
and make up for the rest, padding up to 128 dws before emit_ib.

that way we only take 256 dw per submit.

Change-Id: Iac198e16f35b071476ba7bd48ab338223f6fe650
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 24 ++++++++++++++++++++++--
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index c813cbe..1dbe600 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -173,6 +173,7 @@ struct amdgpu_ring {
 #if defined(CONFIG_DEBUG_FS)
 	struct dentry *ent;
 #endif
+	u32 dws_between_vm_ib;
 };
 
 int amdgpu_ring_alloc(struct amdgpu_ring *ring, unsigned ndw);
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 5f37313..76b1315 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -5670,6 +5670,8 @@ static void gfx_v8_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
 	amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].oa);
 	amdgpu_ring_write(ring, 0);
 	amdgpu_ring_write(ring, (1 << (oa_size + oa_base)) - (1 << oa_base));
+
+	ring->dws_between_vm_ib += 20;
 }
 
 static uint32_t wave_read_ind(struct amdgpu_device *adev, uint32_t simd, uint32_t wave, uint32_t address)
@@ -6489,6 +6491,8 @@ static void gfx_v8_0_ring_emit_hdp_flush(struct amdgpu_ring *ring)
 	amdgpu_ring_write(ring, ref_and_mask);
 	amdgpu_ring_write(ring, ref_and_mask);
 	amdgpu_ring_write(ring, 0x20); /* poll interval */
+
+	ring->dws_between_vm_ib += 7;
 }
 
 static void gfx_v8_0_ring_emit_vgt_flush(struct amdgpu_ring *ring)
@@ -6500,6 +6504,8 @@ static void gfx_v8_0_ring_emit_vgt_flush(struct amdgpu_ring *ring)
 	amdgpu_ring_write(ring, PACKET3(PACKET3_EVENT_WRITE, 0));
 	amdgpu_ring_write(ring, EVENT_TYPE(VGT_FLUSH) |
 		EVENT_INDEX(0));
+
+	ring->dws_between_vm_ib += 4;
 }
 
 
@@ -6573,6 +6579,7 @@ static void gfx_v8_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64 addr,
 	amdgpu_ring_write(ring, lower_32_bits(seq));
 	amdgpu_ring_write(ring, upper_32_bits(seq));
 
+	ring->dws_between_vm_ib += 6;
 }
 
 static void gfx_v8_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
@@ -6639,6 +6646,8 @@ static void gfx_v8_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
 		/* GFX8 emits 128 dw nop to prevent CE access VM before vm_flush finish */
 		amdgpu_ring_insert_nop(ring, 128);
 	}
+
+	ring->dws_between_vm_ib = 0; /* clear before recalculate */
 }
 
 static u32 gfx_v8_0_ring_get_wptr_compute(struct amdgpu_ring *ring)
@@ -6711,9 +6720,11 @@ static void gfx_v8_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags)
 {
 	uint32_t dw2 = 0;
 
-	if (amdgpu_sriov_vf(ring->adev))
+	if (amdgpu_sriov_vf(ring->adev)) {
 		gfx_v8_0_ring_emit_ce_meta_init(ring,
 			(flags & AMDGPU_VM_DOMAIN) ? AMDGPU_CSA_VADDR : ring->adev->virt.csa_vmid0_addr);
+		ring->dws_between_vm_ib += 8;
+	}
 
 	dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
 	if (flags & AMDGPU_HAVE_CTX_SWITCH) {
@@ -6739,10 +6750,19 @@ static void gfx_v8_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags)
 	amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
 	amdgpu_ring_write(ring, dw2);
 	amdgpu_ring_write(ring, 0);
+	ring->dws_between_vm_ib += 3;
 
-	if (amdgpu_sriov_vf(ring->adev))
+	if (amdgpu_sriov_vf(ring->adev)) {
 		gfx_v8_0_ring_emit_de_meta_init(ring,
 			(flags & AMDGPU_VM_DOMAIN) ? AMDGPU_CSA_VADDR : ring->adev->virt.csa_vmid0_addr);
+		ring->dws_between_vm_ib += 21;
+	}
+
+	/* emit_de_meta_init should be the last packet right before emit_ib,
+	 * and we need to pad some NOPs before emit_ib to prevent the CE running
+	 * ahead of vm_flush, which may trigger a VM fault.
+	 */
+	amdgpu_ring_insert_nop(ring, 128 - ring->dws_between_vm_ib);
 }
 
 static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
-- 
2.7.4


* Re: [PATCH] drm/amdgpu:guarantee 128dws between vm flush and IB
@ 2017-01-18  9:28   ` Christian König
From: Christian König @ 2017-01-18  9:28 UTC (permalink / raw)
  To: Monk Liu, amd-gfx@lists.freedesktop.org

On 18.01.2017 at 06:55, Monk Liu wrote:
> previously we always inserted 128 NOPs after vm_flush, which
> may push the DMAframe size above 256 dw, so it gets automatically
> aligned to 512 dw.
>
> now we calculate how many DWs were already inserted after vm_flush
> and make up for the rest, padding up to 128 dws before emit_ib.
>
> that way we only take 256 dw per submit.

First question: why do we want to limit the dw per submit to 256? Some
limitation for SRIOV?

Then for the implementation, please don't add all that counting to the 
different functions. Instead save the current position in the ring in 
emit_vm_flush and then calculate in emit_cntxcntl() how many dw we still 
need to add to have at least 128. E.g. call the variable something like 
last_vm_flush_pos.

That makes the code way more robust towards any changes.
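
As a minimal sketch of that idea (hypothetical code, not taken from the
driver: last_vm_flush_pos is the field name suggested above, and
ring->ptr_mask is assumed to mask ring indices in dwords):

	/* in gfx_v8_0_ring_emit_vm_flush(), remember where the flush ended */
	ring->last_vm_flush_pos = ring->wptr;

	/* at the end of gfx_v8_ring_emit_cntxcntl(), pad so that at least
	 * 128 dw separate the VM flush from the following IB
	 */
	uint32_t emitted = (ring->wptr - ring->last_vm_flush_pos) & ring->ptr_mask;

	if (emitted < 128)
		amdgpu_ring_insert_nop(ring, 128 - emitted);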

Regards,
Christian.

> [snip]



* Re: [PATCH] drm/amdgpu:guarantee 128dws between vm flush and IB
@ 2017-01-18 10:11       ` Liu, Monk
From: Liu, Monk @ 2017-01-18 10:11 UTC (permalink / raw)
  To: Christian König, amd-gfx@lists.freedesktop.org



>First question: why do we want to limit the dw per submit to 256? Some
>limitation for SRIOV?


[ML]

suppose hw_submit_count in gpu_scheduler is 2 (the default); without suppressing the frame size under 256 dw, we may sometimes submit 256 dw and sometimes 512 dw, and that could let the CPU overwrite ring-buffer content that the GPU is still processing, e.g.:


we assume a ring buffer size of 512 dw (so the ring buffer can hold 2 hw submits) for easy understanding.


hw_count = 2;
submit job-1 (takes 256 dw)
hw_count = 1;
submit job-2 (takes 256 dw) // now the ring buffer is full
hw_count = 0;
gpu_scheduler waits
...
job-1 signals, then hw_count => 1
submit job-3, which is placed at the head of the ring buffer (wrapped),
but job-3 takes 512 dw (e.g. it actually takes 259 dw, but we align it to 512)

job-2 is still being processed, but the packets belonging to job-2 get overwritten by job-3 -- disaster ...
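
A tiny standalone sketch of the wrap-around arithmetic in this scenario
(illustrative only, using the numbers above; not driver code):

	#include <stdio.h>

	int main(void)
	{
		/* ring of 512 dw; two 256-dw jobs fill it, then a 512-dw job wraps */
		unsigned int ring_size = 512, wptr = 0;

		wptr = (wptr + 256) % ring_size; /* job-1 occupies dw 0..255   */
		wptr = (wptr + 256) % ring_size; /* job-2 occupies dw 256..511 */

		/* job-1 signals; job-3 is submitted at wptr == 0.  Padded to
		 * 512 dw it spans dw 0..511 and thus overwrites job-2, which
		 * the GPU may still be fetching.
		 */
		printf("job-3 spans dw %u..%u\n", wptr, (wptr + 512 - 1) % ring_size);
		return 0;
	}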


another reason is that keeping each DMAframe to 256 dw gives better performance.


BR Monk

________________________________
From: Christian König <deathsimple@vodafone.de>
Sent: 18 January 2017 17:28:15
To: Liu, Monk; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu:guarantee 128dws between vm flush and IB

[snip]




* Re: [PATCH] drm/amdgpu:guarantee 128dws between vm flush and IB
@ 2017-01-18 12:52           ` Christian König
From: Christian König @ 2017-01-18 12:52 UTC (permalink / raw)
  To: Liu, Monk, amd-gfx@lists.freedesktop.org

> suppose hw_submit_count in gpu_scheduler is 2 (the default); without
> suppressing the frame size under 256 dw, we may sometimes submit 256 dw
> and sometimes 512 dw, and that could let the CPU overwrite ring-buffer
> content that the GPU is still processing, e.g.:
>
The max_dw parameter for amdgpu_ring_init() is multiplied by 
amdgpu_sched_hw_submission. See amdgpu_ring_init():

         ring->ring_size = roundup_pow_of_two(max_dw * 4 *
                                              amdgpu_sched_hw_submission);

Since we use 1024 for max_dw for the GFX, Compute and SDMA rings, using
512 dw is perfectly ok.
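
(As a worked illustration with those numbers: ring_size =
roundup_pow_of_two(1024 * 4 * 2) = 8192 bytes, i.e. 2048 dw, so two
in-flight 512-dw submissions still fill only half the ring.)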

Even if we ever try to submit more than 1024 dw including padding, we will
get a nice warning in the logs. See amdgpu_ring_alloc():

         if (WARN_ON_ONCE(ndw > ring->max_dw))
                 return -ENOMEM;

> another reason is that keeping each DMAframe to 256 dw gives better
> performance.
>
Well considering what else we have in those command buffers I would 
strongly say that this is negligible.

So NAK on that patch as long as we don't have a good reason for it.

Regards,
Christian.

On 18.01.2017 at 11:11, Liu, Monk wrote:
> [snip]


* Re: [PATCH] drm/amdgpu:guarantee 128dws between vm flush and IB
@ 2017-01-19  3:19               ` Liu, Monk
From: Liu, Monk @ 2017-01-19  3:19 UTC (permalink / raw)
  To: Christian König, amd-gfx@lists.freedesktop.org



current code missed the 128 DW after SWITCH_BUFFER: even without a vm flush, we still need those 128 DW between the previous frame's SWITCH_BUFFER and the current frame's CE IB,

and this patch fixes that issue as well; otherwise in the SRIOV case (Unigine Heaven) the CE will hit a VM fault, and in the Steam OS case (DOTA2) we will hit a VM fault as well.


so the conclusion is: if we have a vm flush, we make sure there are 128 dw between the vm flush and the CE IB; if we don't insert a vm flush, we still make sure there are 128 DW between SWITCH_BUFFER and the CE IB.
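
An illustrative view of that spacing requirement (schematic only, pieced
together from this thread):

    vm_flush / prev. SWITCH_BUFFER | other packets | NOP pad | CE IB
                                   |<---- at least 128 dw ---->|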


if you reject this patch, please give me a solution to fix the above VM fault


BR Monk

________________________________
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of Christian König <deathsimple@vodafone.de>
Sent: 18 January 2017 20:52:14
To: Liu, Monk; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu:guarantee 128dws between vm flush and IB

> [snip]


* Re: [PATCH] drm/amdgpu:guarantee 128dws between vm flush and IB
@ 2017-01-19  9:10                   ` Christian König
From: Christian König @ 2017-01-19  9:10 UTC (permalink / raw)
  To: Liu, Monk, amd-gfx@lists.freedesktop.org



> so the conclusion is: if we have a vm flush, we make sure there are 128 dw
> between the vm flush and the CE IB; if we don't insert a vm flush, we still
> make sure there are 128 DW between SWITCH_BUFFER and the CE IB.
>
Good point.

> if you reject this patch, please give me a solution to fix the above VM fault
>
Well, we could follow the Windows approach, e.g. increase the padding to
256 dw and separate the VM flush into its own submission.

But that's more of a long-term fix; for now I'm OK with that patch and just
reviewed your v4 of it.

Regards,
Christian.

On 19.01.2017 at 04:19, Liu, Monk wrote:
> [snip]



