* [PATCH] drm/amdgpu:fix DMAframe for GFX8
@ 2016-08-24  3:42 Monk Liu
From: Monk Liu @ 2016-08-24  3:42 UTC (permalink / raw)
  To: amd-gfx@lists.freedesktop.org; +Cc: Monk Liu

1, drop the double-SWITCH_BUFFER scheme, which hurts
performance: a double SWITCH_BUFFER doesn't actually
switch buffers, so the CE's ping-pong buffer is never
used. Now insert only one SWITCH_BUFFER at the bottom of
each GFX DMA frame. Inserting exactly one SWITCH_BUFFER
is also a must for virtualization world switch!

2, without double SWITCH_BUFFERs, CE can run far ahead
of DE, which will trigger a VM fault (the CE IB runs
before DE has finished the VM flush), so we need a way
to prevent that. According to the GFX8.0 spec, CE can
get at most 128 dw ahead of DE, so we insert 128 NOPs
before the CE IB; that way CE won't run the const IB
before DE finishes the VM flush.

3, for virtualization, the CP requires that each DMA
frame carry two const IBs and one IB, so we cannot drop
the preamble CE IB even when no context switch takes
place.

4, each DMA frame won't occupy more than 256 dw, so
allocating 256 dw is enough.

TODO:
use a DMA-frame entry design to implement ib_schedule();
make sure each DMA frame is aligned and fixed at its
offset, which is needed for preemption and makes ring
buffer debug & dump easy.

Change-Id: I1b98d6352b7ead91b661094921bfd43cfeaae190
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h    |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 15 ++++++++++-----
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c  | 27 ++++++++++++---------------
 3 files changed, 24 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 316bd2a..11bb05f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -337,6 +337,7 @@ struct amdgpu_ring_funcs {
 	void (*end_use)(struct amdgpu_ring *ring);
 	void (*emit_wreg) (struct amdgpu_ring *ring, uint32_t offset, uint32_t val);
 	void (*emit_rreg) (struct amdgpu_ring *ring, uint32_t offset);
+	void (*emit_sb) (struct amdgpu_ring *ring, int cnt);
 };
 
 /*
@@ -2370,6 +2371,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
 #define amdgpu_ring_emit_hdp_invalidate(r) (r)->funcs->emit_hdp_invalidate((r))
 #define amdgpu_ring_emit_wreg(r, i, v) (r)->funcs->emit_wreg((r), (i), (v))
 #define amdgpu_ring_emit_rreg(r, i) (r)->funcs->emit_rreg((r), (i))
+#define amdgpu_ring_emit_sb(r, c) (r)->funcs->emit_sb((r), (c))
 #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
 #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))
 #define amdgpu_ring_patch_cond_exec(r,o) (r)->funcs->patch_cond_exec((r),(o))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
index a31d7ef..d99af07 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
@@ -151,7 +151,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 		return -EINVAL;
 	}
 
-	r = amdgpu_ring_alloc(ring, 256 * num_ibs);
+	r = amdgpu_ring_alloc(ring, 256);
 	if (r) {
 		dev_err(adev->dev, "scheduling IB failed (%d).\n", r);
 		return r;
@@ -174,15 +174,16 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 	/* always set cond_exec_polling to CONTINUE */
 	*ring->cond_exe_cpu_addr = 1;
 
+
+	/* emit 128 dw nop before IBc */
+	if (ring->type == AMDGPU_RING_TYPE_GFX)
+		amdgpu_ring_insert_nop(ring, 128);
+
 	skip_preamble = ring->current_ctx == ctx;
 	need_ctx_switch = ring->current_ctx != ctx;
 	for (i = 0; i < num_ibs; ++i) {
 		ib = &ibs[i];
 
-		/* drop preamble IBs if we don't have a context switch */
-		if ((ib->flags & AMDGPU_IB_FLAG_PREAMBLE) && skip_preamble)
-			continue;
-
 		amdgpu_ring_emit_ib(ring, ib, job ? job->vm_id : 0,
 				    need_ctx_switch);
 		need_ctx_switch = false;
@@ -210,6 +211,10 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 		amdgpu_ring_patch_cond_exec(ring, patch_offset);
 
 	ring->current_ctx = ctx;
+
+	/* Insert one SB at the bottom of each DMA frame */
+	if (ring->funcs->emit_sb)
+		amdgpu_ring_emit_sb(ring, 1);
 	amdgpu_ring_commit(ring);
 	return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index dfa2288..37dc64b 100755
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -5936,12 +5936,6 @@ static void gfx_v8_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
 {
 	u32 header, control = 0;
 
-	/* insert SWITCH_BUFFER packet before first IB in the ring frame */
-	if (ctx_switch) {
-		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
-		amdgpu_ring_write(ring, 0);
-	}
-
 	if (ib->flags & AMDGPU_IB_FLAG_CE)
 		header = PACKET3(PACKET3_INDIRECT_BUFFER_CONST, 2);
 	else
@@ -6013,11 +6007,9 @@ static void gfx_v8_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
 	amdgpu_ring_write(ring, 4); /* poll interval */
 
 	if (usepfp) {
-		/* synce CE with ME to prevent CE fetch CEIB before context switch done */
-		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
-		amdgpu_ring_write(ring, 0);
-		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
-		amdgpu_ring_write(ring, 0);
+		/* sync PFP to ME, otherwise we might get invalid PFP reads */
+		amdgpu_ring_write(ring, PACKET3(PACKET3_PFP_SYNC_ME, 0));
+		amdgpu_ring_write(ring, 0x0);
 	}
 }
 
@@ -6065,10 +6057,6 @@ static void gfx_v8_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
 		/* sync PFP to ME, otherwise we might get invalid PFP reads */
 		amdgpu_ring_write(ring, PACKET3(PACKET3_PFP_SYNC_ME, 0));
 		amdgpu_ring_write(ring, 0x0);
-		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
-		amdgpu_ring_write(ring, 0);
-		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
-		amdgpu_ring_write(ring, 0);
 	}
 }
 
@@ -6170,6 +6158,14 @@ static void gfx_v8_0_ring_emit_wreg_kiq(struct amdgpu_ring *ring, u32 idx, u32 v
 	amdgpu_ring_write(ring, val);
 }
 
+static void gfx_v8_ring_emit_sb(struct amdgpu_ring *ring, int cnt)
+{
+	while (cnt--) {
+		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
+		amdgpu_ring_write(ring, 0);
+	}
+}
+
 static void gfx_v8_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
 						 enum amdgpu_interrupt_state state)
 {
@@ -6477,6 +6473,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
 	.test_ib = gfx_v8_0_ring_test_ib,
 	.insert_nop = amdgpu_ring_insert_nop,
 	.pad_ib = amdgpu_ring_generic_pad_ib,
+	.emit_sb = gfx_v8_ring_emit_sb,
 };
 
 static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {
-- 
1.9.1
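
Taken together, points 1-4 of the commit message imply a GFX DMA frame of roughly the following shape (a sketch for orientation only; packet order and dword counts are approximations, not taken verbatim from the patch):

/*
 * Rough GFX DMA frame layout after this patch (sizes illustrative):
 *
 *   cond_exec init + VM flush (DE)   -- page-table switch for this job
 *   128 x NOP                        -- >= ROQ depth; keeps CE behind the flush
 *   preamble CE IB (never skipped)   -- fixed offset the CP relies on
 *   CE IB
 *   DE IB
 *   fence / EOP
 *   1 x SWITCH_BUFFER                -- single SB at the frame bottom
 *
 * The whole frame stays within the 256 dw reserved by amdgpu_ring_alloc().
 */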


* Re: [PATCH] drm/amdgpu:fix DMAframe for GFX8
@ 2016-08-24  8:16 Christian König
From: Christian König @ 2016-08-24  8:16 UTC (permalink / raw)
  To: Monk Liu, amd-gfx@lists.freedesktop.org

On 24.08.2016 at 05:42, Monk Liu wrote:
> 1, drop the double-SWITCH_BUFFER scheme, which hurts
> performance: a double SWITCH_BUFFER doesn't actually
> switch buffers, so the CE's ping-pong buffer is never
> used. Now insert only one SWITCH_BUFFER at the bottom of
> each GFX DMA frame. Inserting exactly one SWITCH_BUFFER
> is also a must for virtualization world switch!
>
> 2, without double SWITCH_BUFFERs, CE can run far ahead
> of DE, which will trigger a VM fault (the CE IB runs
> before DE has finished the VM flush), so we need a way
> to prevent that. According to the GFX8.0 spec, CE can
> get at most 128 dw ahead of DE, so we insert 128 NOPs
> before the CE IB; that way CE won't run the const IB
> before DE finishes the VM flush.
>
> 3, for virtualization, the CP requires that each DMA
> frame carry two const IBs and one IB, so we cannot drop
> the preamble CE IB even when no context switch takes
> place.

Well, then we probably can't implement virtualization this way, because 
there isn't any guarantee that userspace will use two CE IBs and one DE IB.

Apart from that, not skipping the IBs would break the IOCTL interface, so 
it is clearly a NAK.
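
For context, the userspace-visible contract Christian refers to is the AMDGPU_IB_FLAG_PREAMBLE flag of the CS ioctl. A minimal sketch of how a UMD marks its state-setup IB as a preamble (the va/size variables here are hypothetical placeholders):

#include <drm/amdgpu_drm.h>

/* Illustrative only: the UMD flags this IB as a preamble and, per the
 * existing contract, expects the kernel to drop it whenever no context
 * switch happened on the ring. */
struct drm_amdgpu_cs_chunk_ib ib_chunk = {
	.flags       = AMDGPU_IB_FLAG_PREAMBLE,
	.va_start    = preamble_va,   /* hypothetical GPU VA of the IB */
	.ib_bytes    = preamble_size, /* hypothetical IB size in bytes */
	.ip_type     = AMDGPU_HW_IP_GFX,
	.ip_instance = 0,
	.ring        = 0,
};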

>
> 4, each DMA frame won't occupy more than 256 dw, so
> allocating 256 dw is enough.
>
> TODO:
> use a DMA-frame entry design to implement ib_schedule();
> make sure each DMA frame is aligned and fixed at its
> offset, which is needed for preemption and makes ring
> buffer debug & dump easy.
>
> Change-Id: I1b98d6352b7ead91b661094921bfd43cfeaae190
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h    |  2 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 15 ++++++++++-----
>   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c  | 27 ++++++++++++---------------
>   3 files changed, 24 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 316bd2a..11bb05f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -337,6 +337,7 @@ struct amdgpu_ring_funcs {
>   	void (*end_use)(struct amdgpu_ring *ring);
>   	void (*emit_wreg) (struct amdgpu_ring *ring, uint32_t offset, uint32_t val);
>   	void (*emit_rreg) (struct amdgpu_ring *ring, uint32_t offset);
> +	void (*emit_sb) (struct amdgpu_ring *ring, int cnt);
>   };
>   
>   /*
> @@ -2370,6 +2371,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>   #define amdgpu_ring_emit_hdp_invalidate(r) (r)->funcs->emit_hdp_invalidate((r))
>   #define amdgpu_ring_emit_wreg(r, i, v) (r)->funcs->emit_wreg((r), (i), (v))
>   #define amdgpu_ring_emit_rreg(r, i) (r)->funcs->emit_rreg((r), (i))
> +#define amdgpu_ring_emit_sb(r, c) (r)->funcs->emit_sb((r), (c))
>   #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
>   #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))
>   #define amdgpu_ring_patch_cond_exec(r,o) (r)->funcs->patch_cond_exec((r),(o))
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> index a31d7ef..d99af07 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> @@ -151,7 +151,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>   		return -EINVAL;
>   	}
>   
> -	r = amdgpu_ring_alloc(ring, 256 * num_ibs);
> +	r = amdgpu_ring_alloc(ring, 256);
>   	if (r) {
>   		dev_err(adev->dev, "scheduling IB failed (%d).\n", r);
>   		return r;
> @@ -174,15 +174,16 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>   	/* always set cond_exec_polling to CONTINUE */
>   	*ring->cond_exe_cpu_addr = 1;
>   
> +
> +	/* emit 128 dw nop before IBc */
> +	if (ring->type == AMDGPU_RING_TYPE_GFX)
> +		amdgpu_ring_insert_nop(ring, 128);
> +

Please don't add ring-specific code into the common handling.

>   	skip_preamble = ring->current_ctx == ctx;
>   	need_ctx_switch = ring->current_ctx != ctx;
>   	for (i = 0; i < num_ibs; ++i) {
>   		ib = &ibs[i];
>   
> -		/* drop preamble IBs if we don't have a context switch */
> -		if ((ib->flags & AMDGPU_IB_FLAG_PREAMBLE) && skip_preamble)
> -			continue;
> -

This is a clear NAK since it is a userspace-visible change to stop 
doing this.

Regards,
Christian.

>   		amdgpu_ring_emit_ib(ring, ib, job ? job->vm_id : 0,
>   				    need_ctx_switch);
>   		need_ctx_switch = false;
> @@ -210,6 +211,10 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>   		amdgpu_ring_patch_cond_exec(ring, patch_offset);
>   
>   	ring->current_ctx = ctx;
> +
> +	/* Insert one SB at the bottom of each DMA frame */
> +	if (ring->funcs->emit_sb)
> +		amdgpu_ring_emit_sb(ring, 1);
>   	amdgpu_ring_commit(ring);
>   	return 0;
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index dfa2288..37dc64b 100755
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -5936,12 +5936,6 @@ static void gfx_v8_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
>   {
>   	u32 header, control = 0;
>   
> -	/* insert SWITCH_BUFFER packet before first IB in the ring frame */
> -	if (ctx_switch) {
> -		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
> -		amdgpu_ring_write(ring, 0);
> -	}
> -
>   	if (ib->flags & AMDGPU_IB_FLAG_CE)
>   		header = PACKET3(PACKET3_INDIRECT_BUFFER_CONST, 2);
>   	else
> @@ -6013,11 +6007,9 @@ static void gfx_v8_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
>   	amdgpu_ring_write(ring, 4); /* poll interval */
>   
>   	if (usepfp) {
> -		/* synce CE with ME to prevent CE fetch CEIB before context switch done */
> -		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
> -		amdgpu_ring_write(ring, 0);
> -		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
> -		amdgpu_ring_write(ring, 0);
> +		/* sync PFP to ME, otherwise we might get invalid PFP reads */
> +		amdgpu_ring_write(ring, PACKET3(PACKET3_PFP_SYNC_ME, 0));
> +		amdgpu_ring_write(ring, 0x0);
>   	}
>   }
>   
> @@ -6065,10 +6057,6 @@ static void gfx_v8_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
>   		/* sync PFP to ME, otherwise we might get invalid PFP reads */
>   		amdgpu_ring_write(ring, PACKET3(PACKET3_PFP_SYNC_ME, 0));
>   		amdgpu_ring_write(ring, 0x0);
> -		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
> -		amdgpu_ring_write(ring, 0);
> -		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
> -		amdgpu_ring_write(ring, 0);
>   	}
>   }
>   
> @@ -6170,6 +6158,14 @@ static void gfx_v8_0_ring_emit_wreg_kiq(struct amdgpu_ring *ring, u32 idx, u32 v
>   	amdgpu_ring_write(ring, val);
>   }
>   
> +static void gfx_v8_ring_emit_sb(struct amdgpu_ring *ring, int cnt)
> +{
> +	while (cnt--) {
> +		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
> +		amdgpu_ring_write(ring, 0);
> +	}
> +}
> +
>   static void gfx_v8_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
>   						 enum amdgpu_interrupt_state state)
>   {
> @@ -6477,6 +6473,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
>   	.test_ib = gfx_v8_0_ring_test_ib,
>   	.insert_nop = amdgpu_ring_insert_nop,
>   	.pad_ib = amdgpu_ring_generic_pad_ib,
> +	.emit_sb = gfx_v8_ring_emit_sb,
>   };
>   
>   static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {



* RE: [PATCH] drm/amdgpu:fix DMAframe for GFX8
@ 2016-08-24  8:29 Liu, Monk
From: Liu, Monk @ 2016-08-24  8:29 UTC (permalink / raw)
  To: Christian König, amd-gfx@lists.freedesktop.org
  Cc: Wang, Daniel(Xiaowei), Jiang, Jerry (SW)

Christian,

1, the UMD should always insert three IBs per job; this is guaranteed in the Windows Catalyst driver, and it is a must for virtualization.
 Besides, how to implement virtualization is not led by the driver; instead we should follow the CP logic, and you don't have any choice in how to implement it ...

2, the patch only stops skipping the preamble CE IB, so it doesn't break any interface protocol. I really don't understand your phrase --- "Apart from that not skipping the IBs would break the IOCTL interface, so it is clearly a NAK"

3, the patch doesn't insert ring-specific code; "amdgpu_ring_insert_nop" is a common interface for all amdgpu rings. *And* be aware that this change is a must, otherwise CE will get a VM fault after we change the SWITCH_BUFFER scheme.

4, inserting only one SWITCH_BUFFER at the end of the GFX DMA frame is exactly what the Windows KMD does for GFX8.0; this is both for preemption and performance. The original SWITCH_BUFFER scheme is totally wrong; please check the text below that I copied from the CP hardware guys:

---------------------------------------- thread of discussion: CE sync with DE, with Hans ----------------------------------------
Monk,

All your assumptions are correct, and to be sure that CE isn't ahead when it shouldn't be, the safe way is to insert double switch buffers at the end of the VMSWITCH.

Now to the question, why does it work? There is a hardware limitation on how far ahead CE can be in the ring buffer. CE and DE are reading from the same fetcher and that data is stored in a reorder queue (ROQ). The size of this queue (in gfx8) is 128 DWs. This is how far ahead CE can be. Now, at least for Windows, the KMD usually pads (with NOPs) each DMA frame to 128 (or 256?) DWs, and if CE's indirect buffer is more than 128 DWs ahead of where DE is updating the page table it will work, due to the padding and the limitation of the ROQ.

/Hans

From: Liu, Monk 
Sent: Monday, August 15, 2016 11:35 PM
To: Fernlund, Hans
Cc: Deucher, Alexander; Li, Bingley; Liu, Robert; Wang, Daniel(Xiaowei); Jiang, Jerry (SW)
Subject: FW: CE synchronization with DE 

Hi Hans,

we are trying to understand the constant engine sync (with DE) principle, and we still have some things we don't understand, especially the behavior of the CE

can you help us go through it? Thanks very much!

1.      CE runs much faster than DE, and before DE finishes the VMSWITCH (vm_invalidate_request & PD preparation) CE may already have started running the CE_IB, which touches memory through VMID 3 (the Windows KMD uses VMID 3 for the GFX engine); we are wondering why we never saw a CE VM fault?
2.      After CE processes SWITCH_BUFFERS, CE will go on to the next DMA frame, which may belong to another process (process B), while DE may still not have finished the VM switch to process B; we also wonder why we never see a CE VM fault under this condition either…
•        NOTE: it only inserts one SWITCH_BUFFER at the end of the GFX DMA frame, so it won't block CE from continuing if buffer B is never used (the first IB uses buffer A and the second uses buffer B).

Ring buffer figures for the above two cases, see below please:

#1  case scenario:
Ring buffer:
/VMSWITCH/……/IB_C /IB_C/IB/EOP/SWITCH_BUFFER/…..
^                                        ^
|                                        |
DE                                     CE


Assume the GFX ring is fresh, and we have just dropped one job onto the hardware ring. DE is still in the VMSWITCH stage (binding the PD of the current process to VMID 3 and then invalidating the VM), so the page table for VMID 3 is still not ready, but meanwhile CE may already be executing the CE_IB; accessing addresses from this CE_IB should trigger a VM fault as we understand it …

We wonder why this never triggers a VM fault, since the page table is not yet fully connected to VMID 3!


#2 case scenario:
Ring buffer:
/VMSWITCH-to-P1/……/IB_C /IB_C/IB/EOP/SWITCH_BUFFER/…../VMSWITCH-to-P2/………/IB_C /IB_C/IB/EOP/SWITCH_BUFFER/……
^                                        ^
|                                        |
DE                                     CE
Still assuming the ring buffer is fresh, and assuming DE is still in the VMSWITCH-TO-P2 stage (P1/P2 mean process 1/process 2): since at this time the VM page table of P2 has not yet been bound to VMID 3, if CE is already running in an IB that belongs to P2, why do we never see a VM fault?


Thanks for your time!

BR Monk
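
Hans's ROQ explanation reduces to a simple invariant. A hypothetical helper (names invented for illustration, based only on the 128-DW ROQ figure above):

#define GFX8_ROQ_DEPTH_DW 128 /* per Hans: CE can fetch at most this far ahead of DE */

/* Hypothetical check: CE cannot start the const IB before DE completes
 * the VM flush as long as at least GFX8_ROQ_DEPTH_DW dwords of ring
 * content separate the last VM-flush packet from the CE IB packet. */
static bool ce_stays_behind_vm_flush(unsigned int vm_flush_end_dw,
				     unsigned int ce_ib_start_dw)
{
	return ce_ib_start_dw - vm_flush_end_dw >= GFX8_ROQ_DEPTH_DW;
}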





* Re: [PATCH] drm/amdgpu:fix DMAframe for GFX8
@ 2016-08-24  8:56 Christian König
From: Christian König @ 2016-08-24  8:56 UTC (permalink / raw)
  To: Liu, Monk, amd-gfx@lists.freedesktop.org
  Cc: Wang, Daniel(Xiaowei), Jiang, Jerry (SW)

On 24.08.2016 at 10:29, Liu, Monk wrote:
> Christian,
>
> 1, the UMD should always insert three IBs per job; this is guaranteed in the Windows Catalyst driver, and it is a must for virtualization.
>   Besides, how to implement virtualization is not led by the driver; instead we should follow the CP logic, and you don't have any choice in how to implement it ...

No, that is totally nonsense. The kernel driver can *NEVER* rely on the 
UMD doing something in a certain way.

Mesa for example does use the IBs this way, and since that is the open 
source implementation it is also the only one that matters for us.

See Daniel's recent mail about that 
(http://www.spinics.net/lists/dri-devel/msg116196.html).
"

+- The Linux kernel's "no regression" policy holds in practice only for
+  open-source userspace of the DRM subsystem. DRM developers are perfectly fine
+  if closed-source blob drivers in userspace use the same uAPI as the open
+  drivers, but they must do so in the exact same way as the open drivers.
+  Creative (ab)use of the interfaces will, and in the past routinely has, lead
+  to breakage.

"

> 2, the patch only stops skipping the preamble CE IB, so it doesn't break any interface protocol. I really don't understand your phrase --- "Apart from that not skipping the IBs would break the IOCTL interface, so it is clearly a NAK"

The IOCTL interface requires that we skip the IBs when there isn't any 
context change. Otherwise adding the flag in the first place wouldn't 
have made sense. Sorry, but this requirement comes a bit late.

Could you summarize why the CP has this strange requirement and what 
would happen if we don't handle this correctly?

> 3, the patch doesn't insert ring-specific code; "amdgpu_ring_insert_nop" is a common interface for all amdgpu rings. *And* be aware that this change is a must, otherwise CE will get a VM fault after we change the SWITCH_BUFFER scheme.

> +	if (ring->type == AMDGPU_RING_TYPE_GFX)
Well, isn't this "if" testing the ring type? So it is clearly ring-specific 
code in the common handling.

If you need to insert NOPs before everything else, you need to add a 
callback for this, but I would rather say add this into the existing 
vm_flush or sync engine callback where it belongs.
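
A sketch of the shape Christian is suggesting, i.e. keeping the padding out of the common ib_schedule() path by emitting it at the tail of the GFX-specific vm_flush callback (not the committed fix, just an illustration):

static void gfx_v8_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
					unsigned vm_id, uint64_t pd_addr)
{
	/* ... existing register writes, invalidate request and PFP_SYNC_ME ... */

	/* GFX-specific: pad here so CE (at most 128 dw ahead, the ROQ depth)
	 * cannot reach the following const IB before this flush completes. */
	amdgpu_ring_insert_nop(ring, 128);
}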

> 4, inserting only one SWITCH_BUFFER at the end of the GFX DMA frame is exactly what the Windows KMD does for GFX8.0; this is both for preemption and performance. The original SWITCH_BUFFER scheme is totally wrong; please check the text below that I copied from the CP hardware guys:
>
> ---------------------------------------- thread of discussion: CE sync with DE, with Hans ----------------------------------------
> Monk,
>
> All your assumptions are correct, and to be sure that CE isn't ahead when it shouldn't be, the safe way is to insert double switch buffers at the end of the VMSWITCH.
>
> Now to the question, why does it work? There is a hardware limitation on how far ahead CE can be in the ring buffer. CE and DE are reading from the same fetcher and that data is stored in a reorder queue (ROQ). The size of this queue (in gfx8) is 128 DWs. This is how far ahead CE can be. Now, at least for Windows, the KMD usually pads (with NOPs) each DMA frame to 128 (or 256?) DWs, and if CE's indirect buffer is more than 128 DWs ahead of where DE is updating the page table it will work, due to the padding and the limitation of the ROQ.
>
> /Hans

If this is the requirement of the hardware team, then you should 
properly implement it and not add crude workarounds like the proposed one.

See the calls to amdgpu_ring_init(); the align_mask there determines how 
much padding is added for each command submission. We currently only 
use 16 for the GFX engine IIRC; just changing that to 256 should fix 
the issue.
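
Concretely, what Christian points at would look roughly like this inside gfx_v8_0_sw_init(), as a sketch assuming the 4.7-era amdgpu_ring_init() signature (where the GFX ring currently passes an align_mask of 0xf, i.e. 16-dw alignment):

/* Raising the GFX ring's align_mask from 0xf to 0xff pads every
 * submission to a 256-dw boundary (sketch, not the final fix): */
r = amdgpu_ring_init(adev, ring, 1024 * 1024,
		     PACKET3(PACKET3_NOP, 0x3FFF), 0xff,
		     &adev->gfx.eop_irq, AMDGPU_CP_IRQ_GFX_EOP,
		     AMDGPU_RING_TYPE_GFX);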

Regards,
Christian.

>
> From: Liu, Monk
> Sent: Monday, August 15, 2016 11:35 PM
> To: Fernlund, Hans
> Cc: Deucher, Alexander; Li, Bingley; Liu, Robert; Wang, Daniel(Xiaowei); Jiang, Jerry (SW)
> Subject: FW: CE synchronization with DE
>
> Hi Hans,
>
> we are trying to understand constant engine sync (with DE) principle and we still have things not understood, especially the behave of CE
>
> can you help us go through it ? thanks very much!
>
> 1.      CE runs much faster than DE, and before DE finished VMSWICH  (vm_invalidate_request & pd preparing ) CE may already start running in CE_IB, which will touch memory through VMID 3 (windows KMD use VMID 3 for GFX engine), we are wondering why we never saw CE vm fault ?
> 2.      After CE processed SWITCH_BUFFERS, CE will go to the next DMA frame, which may probably belongs to another process (process B), but meanwhile DE may still not finished VM switch to process B, we also wondering why we never see CE vm fault under this condition neither…
> •        NOTE: It only insert one SWITCH_BUFFERS at the end of gfx dma frame, so it won’t block CE continue if Buffer B is never used. ( first IB use buffer A and second use buffer B)
>
> Ring buffer figure for above two cases, see blow please :
>
> #1  case scenario:
> Ring buffer:
> /VMSWITCH/……/IB_C /IB_C/IB/EOP/SWITCH_BUFFER/…..
> ^                                        ^
> |                                        |
> DE                                     CE
>
>
> Assume GFX ring is fresh new, and we just drop one JOB to hardware ring, DE is still in VMSWITCH stage (bind PD of current process to VMID 3 and then invalidate VM), so now the page table for VMID 3 is still not ready, but meanwhile CE may already executing CE_IB, accessing address of this CE_IB should trigger VM fault as our understanding …
>
> We wondering why this never triggering vm fault since page table is not yet fully connected to VMID 3!
>
>
> #2 case scenario:
> Ring buffer:
> /VMSWITCH-to-P1/……/IB_C /IB_C/IB/EOP/SWITCH_BUFFER/…../VMSWITCH-to-P2/………/IB_C /IB_C/IB/EOP/SWITCH_BUFFER/……
> ^                                        ^
> |                                        |
> DE                                     CE
> Still assume the ring buffer is fresh new, and Assume DE is still in VMSWITCH-TO-P2 stage (P1/P2 means process1/process2), since this time vm page table of P2 yet not been binding to VMID3, if CE is already running in IB belongs to P2, why we never meet vm fault ?
>
>
> Thanks for your time!
>
> BR Monk
>
>
>
>
> -----Original Message-----
> From: Christian König [mailto:deathsimple@vodafone.de]
> Sent: Wednesday, August 24, 2016 4:16 PM
> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu:fix DMAframe for GFX8
>
> Am 24.08.2016 um 05:42 schrieb Monk Liu:
>> 1,drop inserting double SWITCH_BUFFERS scheme, which impacts
>> performance, because double SWITCH_BUFFER actaully doesn't switch
>> buffer, so the CE's ping-pong buffer is not used at all. Now only
>> insert one SWITCH_BUFFERS at the bottom of each GFX dmaframe. And only
>> inserting one SWITCH_BUFFER is a must for Virtualization World Switch!
>>
>> 2,without double SWITCH_BUFFRES, CE will go very ahead of DE which
>> will trigger VM fault (CE IB run before DE finished VM flush), so we
>> need a way to prevent that happen.
>> according to GFX8.0 spec, CE will go ahead of DE at most by 128dw, so
>> we can insert 128 NOP before CE ib, so that CE won't run const IB
>> before DE finished VM flush.
>>
>> 3,for virtualization, CP requires that each DMAframe need two const IB
>> and one IB, so we cannot remove preamble CE IB even no context switch
>> take place.
> Well than we probably can't implement virtualization this way, cause there isn't any guarantee that userspace will use two CE IBs and one DE IB.
>
> Apart from that not skipping the IBs would break the IOCTL interface, so it is clearly a NAK.
>
>> 4,each DMAframe won't occupy more than 256dw, so allocate 256dw is
>> enough
>>
>> TODO:
>> use DMAframe entry design to implement ib_schedule(), make sure each
>> DMAframe is aligned and fixed at their offset, which is needed for
>> preemption and easy for ring buffer debug & dump.
>>
>> Change-Id: I1b98d6352b7ead91b661094921bfd43cfeaae190
>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h    |  2 ++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 15 ++++++++++-----
>>    drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c  | 27 ++++++++++++---------------
>>    3 files changed, 24 insertions(+), 20 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 316bd2a..11bb05f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -337,6 +337,7 @@ struct amdgpu_ring_funcs {
>>    	void (*end_use)(struct amdgpu_ring *ring);
>>    	void (*emit_wreg) (struct amdgpu_ring *ring, uint32_t offset, uint32_t val);
>>    	void (*emit_rreg) (struct amdgpu_ring *ring, uint32_t offset);
>> +	void (*emit_sb) (struct amdgpu_ring *ring, int cnt);
>>    };
>>    
>>    /*
>> @@ -2370,6 +2371,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>>    #define amdgpu_ring_emit_hdp_invalidate(r) (r)->funcs->emit_hdp_invalidate((r))
>>    #define amdgpu_ring_emit_wreg(r, i, v) (r)->funcs->emit_wreg((r), (i), (v))
>>    #define amdgpu_ring_emit_rreg(r, i) (r)->funcs->emit_rreg((r), (i))
>> +#define amdgpu_ring_emit_sb(r, c) (r)->funcs->emit_sb((r), (c))
>>    #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
>>    #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))
>>    #define amdgpu_ring_patch_cond_exec(r,o)
>> (r)->funcs->patch_cond_exec((r),(o))
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>> index a31d7ef..d99af07 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>> @@ -151,7 +151,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>    		return -EINVAL;
>>    	}
>>    
>> -	r = amdgpu_ring_alloc(ring, 256 * num_ibs);
>> +	r = amdgpu_ring_alloc(ring, 256);
>>    	if (r) {
>>    		dev_err(adev->dev, "scheduling IB failed (%d).\n", r);
>>    		return r;
>> @@ -174,15 +174,16 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>    	/* always set cond_exec_polling to CONTINUE */
>>    	*ring->cond_exe_cpu_addr = 1;
>>    
>> +
>> +	/* emit 128 dw nop before IBc */
>> +	if (ring->type == AMDGPU_RING_TYPE_GFX)
>> +		amdgpu_ring_insert_nop(ring, 128);
>> +
> Please don't add ring specific code into the common handling.
>
>>    	skip_preamble = ring->current_ctx == ctx;
>>    	need_ctx_switch = ring->current_ctx != ctx;
>>    	for (i = 0; i < num_ibs; ++i) {
>>    		ib = &ibs[i];
>>    
>> -		/* drop preamble IBs if we don't have a context switch */
>> -		if ((ib->flags & AMDGPU_IB_FLAG_PREAMBLE) && skip_preamble)
>> -			continue;
>> -
> This is a clear NAK since it is an userspace visible change to stop doing this.
>
> Regards,
> Christian.
>
>>    		amdgpu_ring_emit_ib(ring, ib, job ? job->vm_id : 0,
>>    				    need_ctx_switch);
>>    		need_ctx_switch = false;
>> @@ -210,6 +211,10 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>    		amdgpu_ring_patch_cond_exec(ring, patch_offset);
>>    
>>    	ring->current_ctx = ctx;
>> +
>> +	/* Insert one SB at the bottom of each DMA frame */
>> +	if (ring->funcs->emit_sb)
>> +		amdgpu_ring_emit_sb(ring, 1);
>>    	amdgpu_ring_commit(ring);
>>    	return 0;
>>    }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> index dfa2288..37dc64b 100755
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> @@ -5936,12 +5936,6 @@ static void gfx_v8_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
>>    {
>>    	u32 header, control = 0;
>>    
>> -	/* insert SWITCH_BUFFER packet before first IB in the ring frame */
>> -	if (ctx_switch) {
>> -		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
>> -		amdgpu_ring_write(ring, 0);
>> -	}
>> -
>>    	if (ib->flags & AMDGPU_IB_FLAG_CE)
>>    		header = PACKET3(PACKET3_INDIRECT_BUFFER_CONST, 2);
>>    	else
>> @@ -6013,11 +6007,9 @@ static void gfx_v8_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
>>    	amdgpu_ring_write(ring, 4); /* poll interval */
>>    
>>    	if (usepfp) {
>> -		/* synce CE with ME to prevent CE fetch CEIB before context switch done */
>> -		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
>> -		amdgpu_ring_write(ring, 0);
>> -		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
>> -		amdgpu_ring_write(ring, 0);
>> +		/* sync PFP to ME, otherwise we might get invalid PFP reads */
>> +		amdgpu_ring_write(ring, PACKET3(PACKET3_PFP_SYNC_ME, 0));
>> +		amdgpu_ring_write(ring, 0x0);
>>    	}
>>    }
>>    
>> @@ -6065,10 +6057,6 @@ static void gfx_v8_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
>>    		/* sync PFP to ME, otherwise we might get invalid PFP reads */
>>    		amdgpu_ring_write(ring, PACKET3(PACKET3_PFP_SYNC_ME, 0));
>>    		amdgpu_ring_write(ring, 0x0);
>> -		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
>> -		amdgpu_ring_write(ring, 0);
>> -		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
>> -		amdgpu_ring_write(ring, 0);
>>    	}
>>    }
>>    
>> @@ -6170,6 +6158,14 @@ static void gfx_v8_0_ring_emit_wreg_kiq(struct amdgpu_ring *ring, u32 idx, u32 v
>>    	amdgpu_ring_write(ring, val);
>>    }
>>    
>> +static void gfx_v8_ring_emit_sb(struct amdgpu_ring *ring, int cnt) {
>> +	while (cnt--) {
>> +		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
>> +		amdgpu_ring_write(ring, 0);
>> +	}
>> +}
>> +
>>    static void gfx_v8_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
>>    						 enum amdgpu_interrupt_state state)
>>    {
>> @@ -6477,6 +6473,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
>>    	.test_ib = gfx_v8_0_ring_test_ib,
>>    	.insert_nop = amdgpu_ring_insert_nop,
>>    	.pad_ib = amdgpu_ring_generic_pad_ib,
>> +	.emit_sb = gfx_v8_ring_emit_sb,
>>    };
>>    
>>    static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute =
>> {
>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH] drm/amdgpu:fix DMAframe for GFX8
@ 2016-08-24  9:22 Liu, Monk
From: Liu, Monk @ 2016-08-24  9:22 UTC (permalink / raw)
  To: Christian König, amd-gfx@lists.freedesktop.org
  Cc: Wang, Daniel(Xiaowei), Jiang, Jerry (SW)



Christian,

I'm not going to argue with you about policy or interface protocol, because those things would block us from implementing new features; instead they should be changed if they don't fit the new requirements, shouldn't they?

See my answers inline, thanks!

-----Original Message-----
From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf Of Christian König
Sent: Wednesday, August 24, 2016 4:56 PM
To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Wang, Daniel(Xiaowei) <Daniel.Wang2@amd.com>; Jiang, Jerry (SW) <Jerry.Jiang@amd.com>
Subject: Re: [PATCH] drm/amdgpu:fix DMAframe for GFX8

> On 24.08.2016 at 10:29, Liu, Monk wrote:
>> Christian,
>>
>> 1, the UMD should always insert three IBs per job; this is guaranteed in the Windows Catalyst driver, and it is a must for virtualization.
>>   Besides, how to implement virtualization is not led by the driver; instead we should follow the CP logic, and you don't have any choice in how to implement it ...
>
> No, that is totally nonsense. The kernel driver can *NEVER* rely on the UMD doing something in a certain way.

[ML] Please stick to my patch: it only stops skipping the preamble CE IB, which actually does no harm, and the UMD can do whatever it wants.

But please remember that if the system wants to support world switch, the UMD *must* always insert a preamble CE_IB, a CE_IB and a DE_IB for *each* DMA frame/job, because the CP relies on the offset of the preamble CE IB to calculate the offset of the metadata-filling packets within the ring buffer, and the CP will change that metadata for world switch.

NOTE: the metadata-filling packet patch is not ready in staging-4.6; I will provide more patches later. The metadata records a snapshot of the moment the CPG gets preempted.

Briefly speaking: the ring buffer should be fixed at a certain pattern, so that the CP hardware can rely on this pattern to get/set data *in the ring buffer directly* for the purpose of world switch!

> Mesa for example does use the IBs this way, and since that is the open source implementation it is also the only one that matters for us.
>
> See Daniel's recent mail about that
> (http://www.spinics.net/lists/dri-devel/msg116196.html).
> "
> +- The Linux kernel's "no regression" policy holds in practice only for
> +  open-source userspace of the DRM subsystem. DRM developers are perfectly fine
> +  if closed-source blob drivers in userspace use the same uAPI as the open
> +  drivers, but they must do so in the exact same way as the open drivers.
> +  Creative (ab)use of the interfaces will, and in the past routinely has, lead
> +  to breakage.
> "
>
>> 2, the patch only stops skipping the preamble CE IB, so it doesn't break any interface protocol. I really don't understand your phrase --- "Apart from that not skipping the IBs would break the IOCTL interface, so it is clearly a NAK"
>
> The IOCTL interface requires that we skip the IBs when there isn't any context change. Otherwise adding the flag in the first place wouldn't have made sense. Sorry, but this requirement comes a bit late.

[ML] Well, from a practical perspective, not skipping the preamble CE IB really does no harm, and based on the requirements of world switch the preamble CE IB *cannot* be skipped, so I think this interface needs to change.

> Could you summarize why the CP has this strange requirement and what would happen if we don't handle this correctly?

[ML] Already answered above.

>> 3, the patch doesn't insert ring-specific code; "amdgpu_ring_insert_nop" is a common interface for all amdgpu rings. *And* be aware that this change is a must, otherwise CE will get a VM fault after we change the SWITCH_BUFFER scheme.
>
>> +	if (ring->type == AMDGPU_RING_TYPE_GFX)
> Well, isn't this "if" testing the ring type? So it is clearly ring-specific code in the common handling.

[ML] Why can't the common handling tolerate ring-specific code? Looking at the brahma code, you can always find ring-specific code in the common handling. For example, "checking whether the preamble CE IB can be skipped" is itself classic ring-specific behavior; only the GFX ring has a preamble CE IB …

> If you need to insert NOPs before everything else, you need to add a callback for this, but I would rather say add this into the existing vm_flush or sync engine callback where it belongs.

[ML] I really have no strong opinion on where those NOPs are inserted; as long as they end up ahead of the CE IB it is okay for me! I can change it that way as you wish.

>> 4, inserting only one SWITCH_BUFFER at the end of the GFX DMA frame is exactly what the Windows KMD does for GFX8.0; this is both for preemption and performance. The original SWITCH_BUFFER scheme is totally wrong; please check the text below that I copied from the CP hardware guys:
>>
>> All your assumptions are correct, and to be sure that CE isn't ahead when it shouldn't be, the safe way is to insert double switch buffers at the end of the VMSWITCH.
>>
>> Now to the question, why does it work? There is a hardware limitation on how far ahead CE can be in the ring buffer. CE and DE are reading from the same fetcher and that data is stored in a reorder queue (ROQ). The size of this queue (in gfx8) is 128 DWs. This is how far ahead CE can be. Now, at least for Windows, the KMD usually pads (with NOPs) each DMA frame to 128 (or 256?) DWs, and if CE's indirect buffer is more than 128 DWs ahead of where DE is updating the page table it will work, due to the padding and the limitation of the ROQ.
>>
>> /Hans
>
> If this is the requirement of the hardware team, then you should properly implement it and not add crude workarounds like the proposed one.
>
> See the calls to amdgpu_ring_init(); the align_mask there determines how much padding is added for each command submission. We currently only use 16 for the GFX engine IIRC; just changing that to 256 should fix the issue.

[ML] No, the padding is not needed before the command submission; instead it is needed between the first CE_IB and the last packet of the VM flush!

> Regards,
> Christian.
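
The fixed ring-buffer pattern Monk describes (and the TODO in the commit message) amounts to every frame sitting at a fixed, aligned offset, so the CP can locate the preamble CE IB, and hence the metadata packets derived from its offset, by arithmetic alone. A hypothetical illustration:

#define DMAFRAME_SIZE_DW 256 /* the per-frame budget from the commit message */

/* Hypothetical: if every DMA frame is padded to exactly DMAFRAME_SIZE_DW,
 * frame N starts at a fixed dword offset in the ring, and any packet with
 * a fixed position inside the frame (e.g. the preamble CE IB) can be found
 * without parsing the ring contents. */
static unsigned int dmaframe_start_dw(unsigned int frame_index)
{
	return frame_index * DMAFRAME_SIZE_DW;
}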




>>    {

>> @@ -6477,6 +6473,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {

>>           .test_ib = gfx_v8_0_ring_test_ib,

>>           .insert_nop = amdgpu_ring_insert_nop,

>>           .pad_ib = amdgpu_ring_generic_pad_ib,

>> +       .emit_sb = gfx_v8_ring_emit_sb,

>>    };

>>

>>    static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute

>> = {

>



_______________________________________________

amd-gfx mailing list

amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>

https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[-- Attachment #1.2: Type: text/html, Size: 44401 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH] drm/amdgpu:fix DMAframe for GFX8
       [not found]                 ` <MWHPR12MB1182A0913A30FA1AC499CD7584EA0-Gy0DoCVfaSVhjnLHdLm0OQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2016-08-24  9:30                   ` Liu, Monk
  2016-08-24  9:37                   ` Christian König
  1 sibling, 0 replies; 10+ messages in thread
From: Liu, Monk @ 2016-08-24  9:30 UTC (permalink / raw)
  To: Liu, Monk, Christian König,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Wang, Daniel(Xiaowei), Jiang, Jerry (SW)


Some amendments on why the preamble CE IB is a must:

The previous thread doesn't explain the major reason why the CE preamble IB is a must:

In the preamble CE IB there are many load_ce_ram packets, which are responsible for restoring the scenario, e.g. after a context switch.
Once world switch is introduced, the scenario that needs to be restored is out of the kernel driver's knowledge (without world switch the KMD always knows it by comparing the VM client id: if two jobs belong to the same client id, the preamble CE IB is really not needed). Because the guest side doesn't know whether it was preempted by a world switch or not, it must always run the "preamble CE IB" to restore the CE RAM. A sketch of the resulting frame layout follows below.
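
Putting the pieces of this thread together, the fixed per-frame pattern being argued for would look roughly like this (a sketch for illustration only; the packet names follow the patch and this thread, the exact ordering inside one frame is an assumption):

   /* one GFX DMA frame, <= 256 dw, fixed layout so the CP can locate its pieces */
   VM flush + PFP_SYNC_ME    /* DE binds and invalidates the page tables        */
   128 dw of NOPs            /* holds the CE back; the gfx8 ROQ is 128 dw deep  */
   preamble CE IB            /* load_ce_ram packets restore the CE RAM          */
   CE IB                     /* PACKET3_INDIRECT_BUFFER_CONST                   */
   DE IB                     /* PACKET3_INDIRECT_BUFFER                         */
   fence / EOP
   SWITCH_BUFFER             /* exactly one, at the bottom of the frame         */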

BR Monk

From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf Of Liu, Monk
Sent: Wednesday, August 24, 2016 5:23 PM
To: Christian König <deathsimple@vodafone.de>; amd-gfx@lists.freedesktop.org
Cc: Wang, Daniel(Xiaowei) <Daniel.Wang2@amd.com>; Jiang, Jerry (SW) <Jerry.Jiang@amd.com>
Subject: RE: [PATCH] drm/amdgpu:fix DMAframe for GFX8


Christian,

I'm not going to argue with you about policy or interface protocol; those things should not block us from implementing new features, and they should be changed when they no longer fit new requirements, shouldn't they?

See my answers inline, thanks!

-----Original Message-----
From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf Of Christian König
Sent: Wednesday, August 24, 2016 4:56 PM
To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Wang, Daniel(Xiaowei) <Daniel.Wang2@amd.com>; Jiang, Jerry (SW) <Jerry.Jiang@amd.com>
Subject: Re: [PATCH] drm/amdgpu:fix DMAframe for GFX8

On 24.08.2016 10:29, Liu, Monk wrote:

> Christian,
>
> 1, the UMD should always insert three IBs per job; this is guaranteed in the Windows and Catalyst drivers, and it is a must for virtualization.
>   Besides, how virtualization is implemented is not led by the driver; we have to follow the CP logic, and there is no choice in how to implement it ...

No, that is totally nonsense. The kernel driver can *NEVER* rely on the UMD doing something in a certain way.

[ML] Please stick to my patch: it only stops *skipping* the preamble CE IB, which actually does no harm, and the UMD can do whatever it wants,

but please remember that if the system wants to support world switch, the UMD *must* always insert a preamble CE_IB, a CE_IB and a DE_IB for *each* DMAframe/job.

Because the CP relies on the offset of the preamble CE IB to calculate the offset of the metadata-filling packet within the ring buffer, and the CP modifies that metadata for world switch.

NOTE: the metadata-filling packet patch is not yet in staging-4.6; I will provide more patches later. The metadata records a snapshot of the moment the CPG gets preempted.

>> Briefly speaking: the ring buffer should be fixed at a certain pattern so that the CP hw can rely on this pattern to get/set data *in the ring buffer directly* for the purpose of world switch!

Mesa for example does use the IBs this way, and since that is the open source implementation it is also the only one that matters for us.

See Daniel's recent mail about that (http://www.spinics.net/lists/dri-devel/msg116196.html):

"
+- The Linux kernel's "no regression" policy holds in practice only for
+  open-source userspace of the DRM subsystem. DRM developers are perfectly fine
+  if closed-source blob drivers in userspace use the same uAPI as the open
+  drivers, but they must do so in the exact same way as the open drivers.
+  Creative (ab)use of the interfaces will, and in the past routinely has, lead
+  to breakage.
"

> 2, the patch only stops skipping the preamble CE IB, so it doesn't break any interface protocol. I really don't understand your phrase --- "Apart from that not skipping the IBs would break the IOCTL interface, so it is clearly a NAK"

The IOCTL interface requires that we skip the IBs when there isn't any context change. Otherwise adding the flag in the first place wouldn't have made sense. Sorry, but this requirement comes a bit late.

[ML] Well, from a practical perspective not skipping the preamble CE IB really does no harm, and based on the requirement of world switch the preamble CE IB *cannot* be skipped, so I think this interface needs to change.

Could you summarize why the CP has this strange requirement and what would happen if we don't handle this correctly?

[ML] Already mentioned above.

> 3, the patch doesn't insert ring-specific code; "amdgpu_ring_insert_nop" is a common interface for all amdgpu rings.
> *And* be aware that this change is a must, otherwise the CE will get a VM fault after we change the SWITCH_BUFFER scheme.

> +          if (ring->type == AMDGPU_RING_TYPE_GFX)

Well, isn't this if testing the ring type? So it is clearly ring-specific code in the common handling.

[ML] Why can't the common handling tolerate ring-specific code? Looking at the brahma code, you can always find ring-specific code in the common handling, for example:

"Checking if the preamble CE IB could be skipped" is itself a classic piece of ring-specific code; only the GFX ring has a preamble CE IB ...

If you need to insert NOPs before everything else you need to add a callback for this, but I would rather say add it into the existing vm_flush or sync engine callback where it belongs.

[ML] I really have no strong opinion on where those NOPs are inserted; as long as they land ahead of the CE IB it is okay for me! I can change it that way as you wish (a sketch of that variant follows below).
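
A minimal sketch of that variant (an assumption about the suggested placement, not a tested patch) — the 128-dw padding from the patch moves out of amdgpu_ib_schedule() and into the GFX8 flush path:

   /* at the end of gfx_v8_0_ring_emit_vm_flush(), after PFP_SYNC_ME: */

   /* keep the CE at least 128 dw behind the flush; the gfx8 ROQ only
    * lets the CE run 128 dw ahead of the DE, so these NOPs stall the
    * CE until the VM flush has retired */
   amdgpu_ring_insert_nop(ring, 128);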



> 4, Inserting only one SWITCH_BUFFER at the end of the GFX DMA frame is exactly what the Windows KMD does for GFX8.0; this is both for preemption and for performance. The original SWITCH_BUFFER scheme is simply wrong. Please check the text below, which I copied from the CP hardware guys:
>
> ---------------------------------------- thread of discussion: CE sync with DE, with Hans ----------------------------------------
>
> Monk,
>
> All your assumptions are correct, and to be sure that the CE isn't ahead when it shouldn't be, the safe way is to insert double switch buffers at the end of the VM switch.
>
> Now to the question, why does it work? There is a hardware limitation on how far ahead the CE can be in the ring buffer. CE and DE are reading from the same fetcher and that data is stored in a reorder queue (ROQ). The size of this queue (in gfx8) is 128 DWs. This is how far ahead the CE can be. Now, at least for Windows, the KMD usually pads (with NOPs) each DMA frame to 128 (or 256?) DWs, and if the CE's indirect buffer is more than 128 DWs ahead of where the DE is updating the page table it will work, due to the padding and the limitation of the ROQ.
>
> /Hans

If this is the requirement of the hardware team then you should properly implement that and not add crude workarounds like the proposed one.

See the calls to amdgpu_ring_init(); there the align_mask determines how much padding is added for each command submission. We currently use only 16 for the GFX engine IIRC; just changing that to 256 should fix the issue.

[ML] No, the padding is not needed before the command submission; it is needed between the last packet of the VM flush and the first CE_IB! (See the sketch below.)
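
For reference, the arithmetic behind Hans' explanation as a small sketch (GFX8_CE_ROQ_DEPTH is an assumed name for the 128-dw figure from his mail; the driver has no such helper):

   #define GFX8_CE_ROQ_DEPTH 128   /* dw; max distance the CE may run ahead of the DE */

   /* NOPs needed between the last VM-flush packet and the first CE IB so
    * that the CE cannot fetch the IB before the DE has retired the flush */
   static unsigned ce_pad_dw(unsigned dw_between_flush_and_ce_ib)
   {
           return dw_between_flush_and_ce_ib >= GFX8_CE_ROQ_DEPTH ?
                  0 : GFX8_CE_ROQ_DEPTH - dw_between_flush_and_ce_ib;
   }

With nothing between the flush and the first CE IB, the 128 NOPs in the patch are exactly ce_pad_dw(0).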



Regards,
Christian.

> From: Liu, Monk
> Sent: Monday, August 15, 2016 11:35 PM
> To: Fernlund, Hans
> Cc: Deucher, Alexander; Li, Bingley; Liu, Robert; Wang, Daniel(Xiaowei); Jiang, Jerry (SW)
> Subject: FW: CE synchronization with DE
>
> Hi Hans,
>
> we are trying to understand the constant engine sync (with DE) principle and we still have things we do not understand, especially the behavior of the CE.
>
> Can you help us go through it? Thanks very much!
>
> 1.      The CE runs much faster than the DE, and before the DE has finished the VM switch (vm_invalidate_request & PD preparing) the CE may already be running in the CE_IB, which will touch memory through VMID 3 (the Windows KMD uses VMID 3 for the GFX engine). We are wondering why we never saw a CE vm fault?
> 2.      After the CE processes SWITCH_BUFFERS, the CE will go to the next DMA frame, which may well belong to another process (process B), but meanwhile the DE may still not have finished the VM switch to process B. We are also wondering why we never see a CE vm fault under this condition either ...
> •        NOTE: only one SWITCH_BUFFERS is inserted at the end of the gfx DMA frame, so it won't block the CE from continuing if buffer B is never used (the first IB uses buffer A and the second uses buffer B).
>
> Ring buffer figure for the above two cases, see below please:
>
> #1 case scenario:
> Ring buffer:
> /VMSWITCH/……/IB_C /IB_C/IB/EOP/SWITCH_BUFFER/…..
> ^                                        ^
> |                                        |
> DE                                     CE
>
> Assume the GFX ring is fresh and we just dropped one job onto the hardware ring. The DE is still in the VMSWITCH stage (bind the PD of the current process to VMID 3 and then invalidate the VM), so the page table for VMID 3 is still not ready, but meanwhile the CE may already be executing the CE_IB; accessing an address from this CE_IB should trigger a VM fault as we understand it ...
>
> We are wondering why this never triggers a vm fault, since the page table is not yet fully connected to VMID 3!
>
> #2 case scenario:
> Ring buffer:
> /VMSWITCH-to-P1/……/IB_C /IB_C/IB/EOP/SWITCH_BUFFER/…../VMSWITCH-to-P2/………/IB_C /IB_C/IB/EOP/SWITCH_BUFFER/……
> ^                                        ^
> |                                        |
> DE                                     CE
>
> Still assuming the ring buffer is fresh, and assuming the DE is still in the VMSWITCH-TO-P2 stage (P1/P2 means process1/process2): since at this time the vm page table of P2 has not yet been bound to VMID 3, if the CE is already running in an IB that belongs to P2, why do we never hit a vm fault?
>
> Thanks for your time!
>
> BR Monk


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] drm/amdgpu:fix DMAframe for GFX8
       [not found]                 ` <MWHPR12MB1182A0913A30FA1AC499CD7584EA0-Gy0DoCVfaSVhjnLHdLm0OQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  2016-08-24  9:30                   ` Liu, Monk
@ 2016-08-24  9:37                   ` Christian König
       [not found]                     ` <f7e47224-0a17-4bf2-d52a-e8888d707359-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
  1 sibling, 1 reply; 10+ messages in thread
From: Christian König @ 2016-08-24  9:37 UTC (permalink / raw)
  To: Liu, Monk, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Wang, Daniel(Xiaowei), Jiang, Jerry (SW)

> >>Briefly speaking: ring buffer should be fixed at some certain pattern so that 
> CP hw can rely on this pattern to get/set data *in ring buffer 
> directly* for the purpose of world switch!
>
In this case the kernel driver must make sure that the requirements of 
the hardware are fulfilled for this. Again we can't rely on anything the 
user mode driver does!

In other words, if the hardware always needs two CE IBs and one DE IB for 
its operation, the kernel must insert the missing IB packets; otherwise 
we can't accept this.
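
A rough sketch of that kernel-side guarantee (purely hypothetical — 
has_preamble_ce() and a prebuilt dummy preamble IB do not exist in the 
driver today; this only illustrates the shape of the idea):

   /* in amdgpu_ib_schedule(), before walking the user IBs: top up the
    * frame so the 2x CE + 1x DE pattern the CP expects always exists */
   if (ring->type == AMDGPU_RING_TYPE_GFX &&
       !has_preamble_ce(ibs, num_ibs))            /* hypothetical helper */
           amdgpu_ring_emit_ib(ring, &dummy_preamble_ib, /* hypothetical IB */
                               job ? job->vm_id : 0, false);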

> “Checking if preamble CE ib could be skipped” itself is a classic 
> behavior of ring specific code, only gfx ring have preamble CE ib …
>
That isn't correct. The preamble IB feature can be used for Compute and 
DMA engines as well.

BTW: Please only use text-only mails on public mailing lists, not HTML 
mails.

Regards,
Christian.


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH] drm/amdgpu:fix DMAframe for GFX8
       [not found]                     ` <f7e47224-0a17-4bf2-d52a-e8888d707359-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
@ 2016-08-24  9:46                       ` Liu, Monk
  0 siblings, 0 replies; 10+ messages in thread
From: Liu, Monk @ 2016-08-24  9:46 UTC (permalink / raw)
  To: Christian König, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Wang, Daniel(Xiaowei), Jiang, Jerry (SW)

> “Checking if preamble CE ib could be skipped” itself is a classic 
> behavior of ring specific code, only gfx ring have preamble CE ib …
>
That isn't correct. The preamble IB feature can be used for Compute and DMA engines as well.

[ML] I'm fuzzy.... the preamble CE IB is only meant for the GFX and compute rings; why is the SDMA engine involved?
And besides, the compute UMD doesn't need the CE so far, so for now only the GFX ring needs a preamble CE IB.
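
For reference, the skip logic in question is engine-agnostic in the common code — the flag lives on the IB, not on the ring (this is the same test shown in the diff above):

   /* any engine's IB may carry the preamble flag; nothing gfx-only here */
   if ((ib->flags & AMDGPU_IB_FLAG_PREAMBLE) && skip_preamble)
           continue;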


-----Original Message-----
From: Christian König [mailto:deathsimple@vodafone.de] 
Sent: Wednesday, August 24, 2016 5:38 PM
To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Wang, Daniel(Xiaowei) <Daniel.Wang2@amd.com>; Jiang, Jerry (SW) <Jerry.Jiang@amd.com>
Subject: Re: [PATCH] drm/amdgpu:fix DMAframe for GFX8

> >>Briefly speaking: ring buffer should be fixed at some certain 
> >>pattern so that
> CP hw can rely on this pattern to get/set data *in ring buffer
> directly* for the purpose of world switch!
>
In this case the kernel driver must make sure that the requirements of the hardware are fulfilled for this. Again we can't rely on anything the user mode driver does!

In other words if the hardware always needs two CE and one DE for it's operation the kernel must insert missing IB packets or otherwise we can't accept this.

> “Checking if preamble CE ib could be skipped” itself is a classic 
> behavior of ring specific code, only gfx ring have preamble CE ib …
>
That isn't correct. The preamble IB feature can be used for Compute and DMA engines as well.

BTW: Please only use text only mails on public mailing lists, not HTML mails.

Regards,
Christian.

On 24.08.2016 11:22, Liu, Monk wrote:
>
> Christian,
>
> I'm not going to argue with you about policy or interface protocol , 
> cuz those things would block us from implementing new features, 
> instead they should be changed if they are not fit to new requirement, 
> isn't it ?
>
> See my answers in lines, thanks !
>
> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf 
> Of Christian König
> Sent: Wednesday, August 24, 2016 4:56 PM
> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Wang, Daniel(Xiaowei) <Daniel.Wang2@amd.com>; Jiang, Jerry (SW) 
> <Jerry.Jiang@amd.com>
> Subject: Re: [PATCH] drm/amdgpu:fix DMAframe for GFX8
>
> On 24.08.2016 10:29, Liu, Monk wrote:
>
> > Christian,
>
> >
>
> > 1,umd should insert always three IB each job, this is guaranteed in
> windows and catalyst driver, and this is a must for virtualization,
>
> >   Besides, how to implement virtualization is not lead by driver,
> instead we should follow CP logic, and you don't have any choice of 
> how to impl it ...
>
> No, that is totally nonsense. The kernel driver can *NEVER* rely on 
> that the UMD does something in a certain way.
>
> [ML] please stick to my patch, my patch only *not* skip the preamble 
> CE ib, it actually doesn't make any harm, and umd can do whatever it 
> want,
>
> but please remember that if the system want to support world switch, 
> UMD *must* always insert preamble CE_IB, CE_IB, and DE_IB for *each* 
> DMAframe/job.
>
> Because CP will rely on the offset of preamble CE IB to calculate the 
> meta data filling package offset within the ring buffer, and CP will 
> change those meta data for world switch.
>
> NOTE: the meta data filling package patch is not ready in staging-4.6, 
> I will provide more patches later, and the meta data records some 
> snapshot of the timing CPG get preempted.
>
> >>Briefly speaking: ring buffer should be fixed at some certain 
> >>pattern so that
> CP hw can rely on this pattern to get/set data *in ring buffer
> directly* for the purpose of world switch!
>
> Mesa for example does uses the IBs this way and since that is the open 
> source implementation it is also the only one that matters for us.
>
> See Daniels recent mail about that
>
> (http://www.spinics.net/lists/dri-devel/msg116196.html).
>
> "
>
> +- The Linux kernel's "no regression" policy holds in practice only 
> +for
>
> +  open-source userspace of the DRM subsystem. DRM developers are
>
> +perfectly fine
>
> +  if closed-source blob drivers in userspace use the same uAPI as the
>
> +open
>
> +  drivers, but they must do so in the exact same way as the open drivers.
>
> +  Creative (ab)use of the interfaces will, and in the past routinely
>
> +has, lead
>
> +  to breakage.
>
> "
>
> > 2,the patch only doesn't skip the preamble CE ib, so it doesn't
> break any interface protocol. I really don't understand your phase 
> ---" Apart from that not skipping the IBs would break the IOCTL 
> interface, so it is clearly a NAK"
>
> The IOCTL interface requires that we skip the IBs when there isn't any 
> context change. Otherwise adding the flag in the first place wouldn't 
> have made sense. Sorry but this requirement comes a bit late.
>
> [ML] well. Since from practice perspective, not skip the preamble CE 
> IB really doesn’t make any harm, and based on the requirement of world 
> switch, preamble CE IB **cannot** be skipped, so I think this 
> interface need change.
>
> Could you summarize why the CP has this strange requirement and what 
> would happen if we don't handle this correctly?
>
> [ML] already mentioned above
>
> > 3,the patch doesn't insert ring specific code, "
>
> > amdgpu_ring_insert_nop" is a common interface for all amdgpu ring,
>
> > *and* be caution that this change is a must otherwise CE will get vm
>
> > fault after we change the scheme of SWITCH_BUFFERS
>
> > +          if (ring->type == AMDGPU_RING_TYPE_GFX)
>
> Well isn't this if testing the ring type or not? So it is clearly ring 
> specific code in the common handling.
>
> [ML] why common handling cannot tolerate ring specific code ? 
> According to brahma code, you can always find ring specific code in 
> common handling, for example:
>
> “Checking if preamble CE ib could be skipped” itself is a classic 
> behavior of ring specific code, only gfx ring have preamble CE ib …
>
> If you need to inserts NOPs before everything else you need to add a 
> callback for this, but I would rather say add this into the existing 
> vm_flush or sync engine callback where it belongs.
>
> [ML] I really have no strong opinion of the place inserting those 
> NOPS, as long as they are ahead of CE IB is okay for me ! I can change 
> to that way as you wish.
>
> > 4,Only insert one SWITCH_BUFFERs at the end of GFX dma frame is what
> exactly windows KMD do for GX8.0, this is both for preemption and 
> performance, original scheme of SWITCH_BUFFER is totally wrong, and 
> please check below text I copied from CP hw guys:
>
> >
>
> > ----------------------------------------thread of discussion CE sync
>
> > with DE with Hans--------------------------------------------------
>
> > Monk,
>
> >
>
> > All your assumptions are correct and to be sure that CE isn’t ahead
> when it shouldn’t, the safe way is to insert double switch buffers at 
> the end of VMSWICH.
>
> >
>
> > Now to the question, why does it work? There is a hardware
> limitation on how far ahead CE can be in the ring buffer. CE and DE 
> are reading from the same fetcher and that data is stored in a reorder 
> queue (ROQ). The size of this queue (in gfx8) is 128 DWs. This is how 
> far ahead CE can be. Now, at least for Windows, KMD usually pad (NOPs) 
> each DMA frame to 128 (or 256?) DWs and if CE’s indirect buffer is 
> more than 128 DWs ahead of where DE is updating the page table it will 
> work due to the padding and the limitation of the ROQ.
>
> >
>
> > /Hans
>
> When this is the requirement of the hardware team then you should 
> properly implement that and not add crude workarounds like the 
> proposed one.
>
> See the calls to amdgpu_ring_init() here the align_mask determines how 
> many padding is added for each command submission. We currently only 
> uses 16 for the GFX engine IIRC, just changing that to 256 should fix 
> the issue.
>
> [ML] no, the padding is not needed before command submission, instead 
> it is needed between the first CE_IB and the last package of VM flush!
>
> Regards,
>
> Christian.
>
> >
>
> > From: Liu, Monk
>
> > Sent: Monday, August 15, 2016 11:35 PM
>
> > To: Fernlund, Hans
>
> > Cc: Deucher, Alexander; Li, Bingley; Liu, Robert; Wang,
>
> > Daniel(Xiaowei); Jiang, Jerry (SW)
>
> > Subject: FW: CE synchronization with DE
>
> >
>
> > Hi Hans,
>
> >
>
> > we are trying to understand constant engine sync (with DE) principle
>
> > and we still have things not understood, especially the behave of CE
>
> >
>
> > can you help us go through it ? thanks very much!
>
> >
>
> > 1.      CE runs much faster than DE, and before DE finished VMSWICH  
> (vm_invalidate_request & pd preparing ) CE may already start running 
> in CE_IB, which will touch memory through VMID 3 (windows KMD use VMID
> 3 for GFX engine), we are wondering why we never saw CE vm fault ?
>
> > 2.      After CE processed SWITCH_BUFFERS, CE will go to the next 
> DMA frame, which may probably belongs to another process (process B), 
> but meanwhile DE may still not finished VM switch to process B, we 
> also wondering why we never see CE vm fault under this condition 
> neither…
>
> > •        NOTE: It only insert one SWITCH_BUFFERS at the end of gfx 
> dma frame, so it won’t block CE continue if Buffer B is never used. ( 
> first IB use buffer A and second use buffer B)
>
> >
>
> > Ring buffer figure for above two cases, see blow please :
>
> >
>
> > #1  case scenario:
>
> > Ring buffer:
>
> > /VMSWITCH/……/IB_C /IB_C/IB/EOP/SWITCH_BUFFER/…..
>
> > ^                                        ^
>
> > |                                        |
>
> > DE                                     CE
>
> >
>
> >
>
> > Assume GFX ring is fresh new, and we just drop one JOB to hardware
>
> > ring, DE is still in VMSWITCH stage (bind PD of current process to
>
> > VMID 3 and then invalidate VM), so now the page table for VMID 3 is
>
> > still not ready, but meanwhile CE may already executing CE_IB,
>
> > accessing address of this CE_IB should trigger VM fault as our
>
> > understanding …
>
> >
>
> > We wondering why this never triggering vm fault since page table is
> not yet fully connected to VMID 3!
>
> >
>
> >
>
> > #2 case scenario:
>
> > Ring buffer:
>
> > /VMSWITCH-to-P1/……/IB_C
> /IB_C/IB/EOP/SWITCH_BUFFER/…../VMSWITCH-to-P2/………/IB_C
> /IB_C/IB/EOP/SWITCH_BUFFER/……
>
> > ^                                        ^
>
> > |                                        |
>
> > DE                                     CE
>
> > Still assume the ring buffer is fresh new, and Assume DE is still in
> VMSWITCH-TO-P2 stage (P1/P2 means process1/process2), since this time 
> vm page table of P2 yet not been binding to VMID3, if CE is already 
> running in IB belongs to P2, why we never meet vm fault ?
>
> >
>
> >
>
> > Thanks for your time!
>
> >
>
> > BR Monk
>
> >
>
> >
>
> >
>
> >
>
> > -----Original Message-----
>
> > From: Christian König [mailto:deathsimple@vodafone.de]
>
> > Sent: Wednesday, August 24, 2016 4:16 PM
>
> > To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>
> > Subject: Re: [PATCH] drm/amdgpu:fix DMAframe for GFX8
>
> >
>
> > On 24.08.2016 05:42, Monk Liu wrote:
>
> >> 1,drop inserting double SWITCH_BUFFERS scheme, which impacts
>
> >> performance, because double SWITCH_BUFFER actaully doesn't switch
>
> >> buffer, so the CE's ping-pong buffer is not used at all. Now only
>
> >> insert one SWITCH_BUFFERS at the bottom of each GFX dmaframe. And
>
> >> only inserting one SWITCH_BUFFER is a must for Virtualization World
> Switch!
>
> >>
>
> >> 2,without double SWITCH_BUFFRES, CE will go very ahead of DE which
>
> >> will trigger VM fault (CE IB run before DE finished VM flush), so 
> >> we
>
> >> need a way to prevent that happen.
>
> >> according to GFX8.0 spec, CE will go ahead of DE at most by 128dw, 
> >> so
>
> >> we can insert 128 NOP before CE ib, so that CE won't run const IB
>
> >> before DE finished VM flush.
>
> >>
>
> >> 3,for virtualization, CP requires that each DMAframe need two const
>
> >> IB and one IB, so we cannot remove preamble CE IB even no context
>
> >> switch take place.
>
> > Well then we probably can't implement virtualization this way, because
> > there isn't any guarantee that userspace will use two CE IBs and one
> > DE IB.
> >
> > Apart from that, not skipping the IBs would break the IOCTL interface,
> > so it is clearly a NAK.
> >
> >> 4, each DMAframe won't occupy more than 256 dw, so allocating 256 dw
> >> is enough.
> >>
> >> TODO:
> >> use a DMAframe entry design to implement ib_schedule(), and make sure
> >> each DMAframe is aligned and fixed at its offset, which is needed for
> >> preemption and makes ring buffer debug & dump easy.
> >>
> >> Change-Id: I1b98d6352b7ead91b661094921bfd43cfeaae190
> >> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> >> ---
> >>  drivers/gpu/drm/amd/amdgpu/amdgpu.h    |  2 ++
> >>  drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 15 ++++++++++-----
> >>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c  | 27 ++++++++++++---------------
> >>  3 files changed, 24 insertions(+), 20 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> >> index 316bd2a..11bb05f 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> >> @@ -337,6 +337,7 @@ struct amdgpu_ring_funcs {
> >>  	void (*end_use)(struct amdgpu_ring *ring);
> >>  	void (*emit_wreg) (struct amdgpu_ring *ring, uint32_t offset, uint32_t val);
> >>  	void (*emit_rreg) (struct amdgpu_ring *ring, uint32_t offset);
> >> +	void (*emit_sb) (struct amdgpu_ring *ring, int cnt);
> >>  };
> >>  
> >>  /*
> >> @@ -2370,6 +2371,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
> >>  #define amdgpu_ring_emit_hdp_invalidate(r) (r)->funcs->emit_hdp_invalidate((r))
> >>  #define amdgpu_ring_emit_wreg(r, i, v) (r)->funcs->emit_wreg((r), (i), (v))
> >>  #define amdgpu_ring_emit_rreg(r, i) (r)->funcs->emit_rreg((r), (i))
> >> +#define amdgpu_ring_emit_sb(r, c) (r)->funcs->emit_sb((r), (c))
> >>  #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
> >>  #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))
> >>  #define amdgpu_ring_patch_cond_exec(r,o) (r)->funcs->patch_cond_exec((r),(o))
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> >> index a31d7ef..d99af07 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> >> @@ -151,7 +151,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
> >>  		return -EINVAL;
> >>  	}
> >>  
> >> -	r = amdgpu_ring_alloc(ring, 256 * num_ibs);
> >> +	r = amdgpu_ring_alloc(ring, 256);
> >>  	if (r) {
> >>  		dev_err(adev->dev, "scheduling IB failed (%d).\n", r);
> >>  		return r;
> >> @@ -174,15 +174,16 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
> >>  	/* always set cond_exec_polling to CONTINUE */
> >>  	*ring->cond_exe_cpu_addr = 1;
> >>  
> >> +
> >> +	/* emit 128 dw nop before IBc */
> >> +	if (ring->type == AMDGPU_RING_TYPE_GFX)
> >> +		amdgpu_ring_insert_nop(ring, 128);
> >> +
> > Please don't add ring-specific code into the common handling.
> >
> >>  	skip_preamble = ring->current_ctx == ctx;
> >>  	need_ctx_switch = ring->current_ctx != ctx;
> >>  	for (i = 0; i < num_ibs; ++i) {
> >>  		ib = &ibs[i];
> >>  
> >> -		/* drop preamble IBs if we don't have a context switch */
> >> -		if ((ib->flags & AMDGPU_IB_FLAG_PREAMBLE) && skip_preamble)
> >> -			continue;
> >> -
> > This is a clear NAK, since stopping to do this is a userspace-visible
> > change.
> >
> > Regards,
> > Christian.
> >
> >>  		amdgpu_ring_emit_ib(ring, ib, job ? job->vm_id : 0,
> >>  				    need_ctx_switch);
> >>  		need_ctx_switch = false;
> >> @@ -210,6 +211,10 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
> >>  		amdgpu_ring_patch_cond_exec(ring, patch_offset);
> >>  
> >>  	ring->current_ctx = ctx;
> >> +
> >> +	/* Insert one SB at the bottom of each DMA frame */
> >> +	if (ring->funcs->emit_sb)
> >> +		amdgpu_ring_emit_sb(ring, 1);
> >>  	amdgpu_ring_commit(ring);
> >>  	return 0;
> >>  }
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> >> index dfa2288..37dc64b 100755
> >> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> >> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> >> @@ -5936,12 +5936,6 @@ static void gfx_v8_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
> >>  {
> >>  	u32 header, control = 0;
> >>  
> >> -	/* insert SWITCH_BUFFER packet before first IB in the ring frame */
> >> -	if (ctx_switch) {
> >> -		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
> >> -		amdgpu_ring_write(ring, 0);
> >> -	}
> >> -
> >>  	if (ib->flags & AMDGPU_IB_FLAG_CE)
> >>  		header = PACKET3(PACKET3_INDIRECT_BUFFER_CONST, 2);
> >>  	else
> >> @@ -6013,11 +6007,9 @@ static void gfx_v8_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
> >>  	amdgpu_ring_write(ring, 4); /* poll interval */
> >>  
> >>  	if (usepfp) {
> >> -		/* synce CE with ME to prevent CE fetch CEIB before context switch done */
> >> -		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
> >> -		amdgpu_ring_write(ring, 0);
> >> -		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
> >> -		amdgpu_ring_write(ring, 0);
> >> +		/* sync PFP to ME, otherwise we might get invalid PFP reads */
> >> +		amdgpu_ring_write(ring, PACKET3(PACKET3_PFP_SYNC_ME, 0));
> >> +		amdgpu_ring_write(ring, 0x0);
> >>  	}
> >>  }
> >>  
> >> @@ -6065,10 +6057,6 @@ static void gfx_v8_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
> >>  		/* sync PFP to ME, otherwise we might get invalid PFP reads */
> >>  		amdgpu_ring_write(ring, PACKET3(PACKET3_PFP_SYNC_ME, 0));
> >>  		amdgpu_ring_write(ring, 0x0);
> >> -		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
> >> -		amdgpu_ring_write(ring, 0);
> >> -		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
> >> -		amdgpu_ring_write(ring, 0);
> >>  	}
> >>  }
> >>  
> >> @@ -6170,6 +6158,14 @@ static void gfx_v8_0_ring_emit_wreg_kiq(struct amdgpu_ring *ring, u32 idx, u32 v
> >>  	amdgpu_ring_write(ring, val);
> >>  }
> >>  
> >> +static void gfx_v8_ring_emit_sb(struct amdgpu_ring *ring, int cnt)
> >> +{
> >> +	while (cnt--) {
> >> +		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
> >> +		amdgpu_ring_write(ring, 0);
> >> +	}
> >> +}
> >> +
> >>  static void gfx_v8_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
> >>  						 enum amdgpu_interrupt_state state)
> >>  {
> >> @@ -6477,6 +6473,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
> >>  	.test_ib = gfx_v8_0_ring_test_ib,
> >>  	.insert_nop = amdgpu_ring_insert_nop,
> >>  	.pad_ib = amdgpu_ring_generic_pad_ib,
> >> +	.emit_sb = gfx_v8_ring_emit_sb,
> >>  };
> >>  
> >>  static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread
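
The 128-dw padding discussed in the message above relies on the driver's
generic ring NOP helper. A minimal sketch, assuming the amdgpu ring API names
that appear in the diff (the helper body is an approximation, not necessarily
the exact upstream source):

	/* Write the ring's NOP dword 'count' times; each ring type
	 * registers its own NOP packet encoding in ring->nop. */
	void amdgpu_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
	{
		uint32_t i;

		for (i = 0; i < count; i++)
			amdgpu_ring_write(ring, ring->nop);
	}

	/* Padding 128 NOP dwords between the DE's VM flush and the first
	 * CE IB means the CE, which per the GFX8.0 spec prefetches at most
	 * 128 dw ahead of the DE, cannot reach the const IB before the
	 * flush completes. */
	if (ring->type == AMDGPU_RING_TYPE_GFX)
		amdgpu_ring_insert_nop(ring, 128);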

* RE: [PATCH] drm/amdgpu:fix DMAframe for GFX8
  2016-08-24  3:59 Monk Liu
@ 2016-08-24  4:03 ` Liu, Monk
  0 siblings, 0 replies; 10+ messages in thread
From: Liu, Monk @ 2016-08-24  4:03 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


1, drop the double SWITCH_BUFFERS insertion scheme, which impacts performance, because a double SWITCH_BUFFER actually doesn't switch buffers, so the CE's ping-pong buffer is not used at all. Now only insert one SWITCH_BUFFERS at the bottom of each GFX dmaframe. And inserting only one SWITCH_BUFFER is a must for Virtualization World Switch!

2, without the double SWITCH_BUFFERS, the CE will run far ahead of the DE, which will trigger a VM fault (the CE IB runs before the DE has finished the VM flush), so we need a way to prevent that from happening.
According to the GFX8.0 spec, the CE can run ahead of the DE by at most 128 dw, so we can insert 128 NOPs before the CE IB, so that the CE won't run the const IB before the DE finishes the VM flush.

3, for virtualization, the CP requires that each DMAframe has two const IBs and one IB, so we cannot remove the preamble CE IB even when no context switch takes place.

4, each DMAframe won't occupy more than 256 dw, so allocating 256 dw is enough.

TODO:
use a DMAframe entry design to implement ib_schedule(), and make sure each DMAframe is aligned and fixed at its offset, which is needed for preemption and makes ring buffer debug & dump easy.

Change-Id: I1b98d6352b7ead91b661094921bfd43cfeaae190
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h    |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 15 ++++++++++-----
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c  | 27 ++++++++++++---------------
 3 files changed, 24 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 316bd2a..11bb05f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -337,6 +337,7 @@ struct amdgpu_ring_funcs {
 	void (*end_use)(struct amdgpu_ring *ring);
 	void (*emit_wreg) (struct amdgpu_ring *ring, uint32_t offset, uint32_t val);
 	void (*emit_rreg) (struct amdgpu_ring *ring, uint32_t offset);
+	void (*emit_sb) (struct amdgpu_ring *ring, int cnt);
 };
 
 /*
@@ -2370,6 +2371,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
 #define amdgpu_ring_emit_hdp_invalidate(r) (r)->funcs->emit_hdp_invalidate((r))
 #define amdgpu_ring_emit_wreg(r, i, v) (r)->funcs->emit_wreg((r), (i), (v))
 #define amdgpu_ring_emit_rreg(r, i) (r)->funcs->emit_rreg((r), (i))
+#define amdgpu_ring_emit_sb(r, c) (r)->funcs->emit_sb((r), (c))
 #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
 #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))
 #define amdgpu_ring_patch_cond_exec(r,o) (r)->funcs->patch_cond_exec((r),(o))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
index a31d7ef..d99af07 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
@@ -151,7 +151,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 		return -EINVAL;
 	}
 
-	r = amdgpu_ring_alloc(ring, 256 * num_ibs);
+	r = amdgpu_ring_alloc(ring, 256);
 	if (r) {
 		dev_err(adev->dev, "scheduling IB failed (%d).\n", r);
 		return r;
@@ -174,15 +174,16 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 	/* always set cond_exec_polling to CONTINUE */
 	*ring->cond_exe_cpu_addr = 1;
 
+
+	/* emit 128 dw nop before IBc */
+	if (ring->type == AMDGPU_RING_TYPE_GFX)
+		amdgpu_ring_insert_nop(ring, 128);
+
 	skip_preamble = ring->current_ctx == ctx;
 	need_ctx_switch = ring->current_ctx != ctx;
 	for (i = 0; i < num_ibs; ++i) {
 		ib = &ibs[i];
 
-		/* drop preamble IBs if we don't have a context switch */
-		if ((ib->flags & AMDGPU_IB_FLAG_PREAMBLE) && skip_preamble)
-			continue;
-
 		amdgpu_ring_emit_ib(ring, ib, job ? job->vm_id : 0,
 				    need_ctx_switch);
 		need_ctx_switch = false;
@@ -210,6 +211,10 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 		amdgpu_ring_patch_cond_exec(ring, patch_offset);
 
 	ring->current_ctx = ctx;
+
+	/* Insert one SB at the bottom of each DMA frame */
+	if (ring->funcs->emit_sb)
+		amdgpu_ring_emit_sb(ring, 1);
 	amdgpu_ring_commit(ring);
 	return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index dfa2288..37dc64b 100755
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -5936,12 +5936,6 @@ static void gfx_v8_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
 {
 	u32 header, control = 0;
 
-	/* insert SWITCH_BUFFER packet before first IB in the ring frame */
-	if (ctx_switch) {
-		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
-		amdgpu_ring_write(ring, 0);
-	}
-
 	if (ib->flags & AMDGPU_IB_FLAG_CE)
 		header = PACKET3(PACKET3_INDIRECT_BUFFER_CONST, 2);
 	else
@@ -6013,11 +6007,9 @@ static void gfx_v8_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
 	amdgpu_ring_write(ring, 4); /* poll interval */
 
 	if (usepfp) {
-		/* synce CE with ME to prevent CE fetch CEIB before context switch done */
-		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
-		amdgpu_ring_write(ring, 0);
-		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
-		amdgpu_ring_write(ring, 0);
+		/* sync PFP to ME, otherwise we might get invalid PFP reads */
+		amdgpu_ring_write(ring, PACKET3(PACKET3_PFP_SYNC_ME, 0));
+		amdgpu_ring_write(ring, 0x0);
 	}
 }
 
@@ -6065,10 +6057,6 @@ static void gfx_v8_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
 		/* sync PFP to ME, otherwise we might get invalid PFP reads */
 		amdgpu_ring_write(ring, PACKET3(PACKET3_PFP_SYNC_ME, 0));
 		amdgpu_ring_write(ring, 0x0);
-		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
-		amdgpu_ring_write(ring, 0);
-		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
-		amdgpu_ring_write(ring, 0);
 	}
 }
 
@@ -6170,6 +6158,14 @@ static void gfx_v8_0_ring_emit_wreg_kiq(struct amdgpu_ring *ring, u32 idx, u32 v
 	amdgpu_ring_write(ring, val);
 }
 
+static void gfx_v8_ring_emit_sb(struct amdgpu_ring *ring, int cnt)
+{
+	while (cnt--) {
+		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
+		amdgpu_ring_write(ring, 0);
+	}
+}
+
 static void gfx_v8_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
 						 enum amdgpu_interrupt_state state)
 {
@@ -6477,6 +6473,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
 	.test_ib = gfx_v8_0_ring_test_ib,
 	.insert_nop = amdgpu_ring_insert_nop,
 	.pad_ib = amdgpu_ring_generic_pad_ib,
+	.emit_sb = gfx_v8_ring_emit_sb,
 };
 
 static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {
--
1.9.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 10+ messages in thread
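
Point 1 of the commit message turns on SWITCH_BUFFER parity: each packet
flips the CE's ping-pong buffer, so two in a row land back on the same
buffer. A short annotated sketch of the emit_sb hook the patch adds (the
code mirrors gfx_v8_ring_emit_sb from the diff; the comments are
explanatory additions):

	static void gfx_v8_ring_emit_sb(struct amdgpu_ring *ring, int cnt)
	{
		/* Each SWITCH_BUFFER flips the CE ping-pong buffer; an even
		 * count therefore performs no real switch, which is why the
		 * old double-SB scheme left the ping-pong buffer unused.
		 * One SB per DMA frame does a real switch, as the
		 * virtualization world switch requires. */
		while (cnt--) {
			amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
			amdgpu_ring_write(ring, 0);
		}
	}

	/* The common code only emits it on rings that implement the hook: */
	if (ring->funcs->emit_sb)
		amdgpu_ring_emit_sb(ring, 1);	/* one SB per GFX DMA frame */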

* [PATCH] drm/amdgpu:fix DMAframe for GFX8
@ 2016-08-24  3:59 Monk Liu
  2016-08-24  4:03 ` Liu, Monk
  0 siblings, 1 reply; 10+ messages in thread
From: Monk Liu @ 2016-08-24  3:59 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

1, drop the double SWITCH_BUFFERS insertion scheme, which impacts
performance, because a double SWITCH_BUFFER actually doesn't
switch buffers, so the CE's ping-pong buffer is not used at
all. Now only insert one SWITCH_BUFFERS at the bottom of
each GFX dmaframe. And inserting only one SWITCH_BUFFER is
a must for Virtualization World Switch!

2, without the double SWITCH_BUFFERS, the CE will run far
ahead of the DE, which will trigger a VM fault (the CE IB
runs before the DE has finished the VM flush), so we need a
way to prevent that from happening. According to the GFX8.0
spec, the CE can run ahead of the DE by at most 128 dw, so
we can insert 128 NOPs before the CE IB, so that the CE
won't run the const IB before the DE finishes the VM flush.

3, for virtualization, the CP requires that each DMAframe has
two const IBs and one IB, so we cannot remove the preamble CE
IB even when no context switch takes place.

4, each DMAframe won't occupy more than 256 dw, so allocating
256 dw is enough.

TODO:
use a DMAframe entry design to implement ib_schedule(), and
make sure each DMAframe is aligned and fixed at its offset,
which is needed for preemption and makes ring buffer debug &
dump easy.

Change-Id: I1b98d6352b7ead91b661094921bfd43cfeaae190
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h    |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 15 ++++++++++-----
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c  | 27 ++++++++++++---------------
 3 files changed, 24 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 316bd2a..11bb05f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -337,6 +337,7 @@ struct amdgpu_ring_funcs {
 	void (*end_use)(struct amdgpu_ring *ring);
 	void (*emit_wreg) (struct amdgpu_ring *ring, uint32_t offset, uint32_t val);
 	void (*emit_rreg) (struct amdgpu_ring *ring, uint32_t offset);
+	void (*emit_sb) (struct amdgpu_ring *ring, int cnt);
 };
 
 /*
@@ -2370,6 +2371,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
 #define amdgpu_ring_emit_hdp_invalidate(r) (r)->funcs->emit_hdp_invalidate((r))
 #define amdgpu_ring_emit_wreg(r, i, v) (r)->funcs->emit_wreg((r), (i), (v))
 #define amdgpu_ring_emit_rreg(r, i) (r)->funcs->emit_rreg((r), (i))
+#define amdgpu_ring_emit_sb(r, c) (r)->funcs->emit_sb((r), (c))
 #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
 #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))
 #define amdgpu_ring_patch_cond_exec(r,o) (r)->funcs->patch_cond_exec((r),(o))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
index a31d7ef..d99af07 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
@@ -151,7 +151,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 		return -EINVAL;
 	}
 
-	r = amdgpu_ring_alloc(ring, 256 * num_ibs);
+	r = amdgpu_ring_alloc(ring, 256);
 	if (r) {
 		dev_err(adev->dev, "scheduling IB failed (%d).\n", r);
 		return r;
@@ -174,15 +174,16 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 	/* always set cond_exec_polling to CONTINUE */
 	*ring->cond_exe_cpu_addr = 1;
 
+
+	/* emit 128 dw nop before IBc */
+	if (ring->type == AMDGPU_RING_TYPE_GFX)
+		amdgpu_ring_insert_nop(ring, 128);
+
 	skip_preamble = ring->current_ctx == ctx;
 	need_ctx_switch = ring->current_ctx != ctx;
 	for (i = 0; i < num_ibs; ++i) {
 		ib = &ibs[i];
 
-		/* drop preamble IBs if we don't have a context switch */
-		if ((ib->flags & AMDGPU_IB_FLAG_PREAMBLE) && skip_preamble)
-			continue;
-
 		amdgpu_ring_emit_ib(ring, ib, job ? job->vm_id : 0,
 				    need_ctx_switch);
 		need_ctx_switch = false;
@@ -210,6 +211,10 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 		amdgpu_ring_patch_cond_exec(ring, patch_offset);
 
 	ring->current_ctx = ctx;
+
+	/* Insert one SB at the bottom of each DMA frame */
+	if (ring->funcs->emit_sb)
+		amdgpu_ring_emit_sb(ring, 1);
 	amdgpu_ring_commit(ring);
 	return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index dfa2288..37dc64b 100755
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -5936,12 +5936,6 @@ static void gfx_v8_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
 {
 	u32 header, control = 0;
 
-	/* insert SWITCH_BUFFER packet before first IB in the ring frame */
-	if (ctx_switch) {
-		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
-		amdgpu_ring_write(ring, 0);
-	}
-
 	if (ib->flags & AMDGPU_IB_FLAG_CE)
 		header = PACKET3(PACKET3_INDIRECT_BUFFER_CONST, 2);
 	else
@@ -6013,11 +6007,9 @@ static void gfx_v8_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
 	amdgpu_ring_write(ring, 4); /* poll interval */
 
 	if (usepfp) {
-		/* synce CE with ME to prevent CE fetch CEIB before context switch done */
-		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
-		amdgpu_ring_write(ring, 0);
-		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
-		amdgpu_ring_write(ring, 0);
+		/* sync PFP to ME, otherwise we might get invalid PFP reads */
+		amdgpu_ring_write(ring, PACKET3(PACKET3_PFP_SYNC_ME, 0));
+		amdgpu_ring_write(ring, 0x0);
 	}
 }
 
@@ -6065,10 +6057,6 @@ static void gfx_v8_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
 		/* sync PFP to ME, otherwise we might get invalid PFP reads */
 		amdgpu_ring_write(ring, PACKET3(PACKET3_PFP_SYNC_ME, 0));
 		amdgpu_ring_write(ring, 0x0);
-		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
-		amdgpu_ring_write(ring, 0);
-		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
-		amdgpu_ring_write(ring, 0);
 	}
 }
 
@@ -6170,6 +6158,14 @@ static void gfx_v8_0_ring_emit_wreg_kiq(struct amdgpu_ring *ring, u32 idx, u32 v
 	amdgpu_ring_write(ring, val);
 }
 
+static void gfx_v8_ring_emit_sb(struct amdgpu_ring *ring, int cnt)
+{
+	while (cnt--) {
+		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
+		amdgpu_ring_write(ring, 0);
+	}
+}
+
 static void gfx_v8_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
 						 enum amdgpu_interrupt_state state)
 {
@@ -6477,6 +6473,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
 	.test_ib = gfx_v8_0_ring_test_ib,
 	.insert_nop = amdgpu_ring_insert_nop,
 	.pad_ib = amdgpu_ring_generic_pad_ib,
+	.emit_sb = gfx_v8_ring_emit_sb,
 };
 
 static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {
-- 
1.9.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 10+ messages in thread
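
The TODO in the commit message points at carving the ring into aligned,
fixed-size 256-dw frames. A hypothetical sketch of what such fixed-offset
frames could give preemption and ring dumps; the macro and function below
are illustrative assumptions, not code from this thread:

	#define DMAFRAME_SIZE_DW 256	/* fixed frame budget from point 4 */

	/* Hypothetical: if every DMA frame is padded to exactly 256 dw,
	 * frame N always starts at a computable ring offset, so a
	 * preemption point or a debugger can find frame boundaries by
	 * arithmetic instead of parsing packets. */
	static uint32_t example_frame_offset_dw(struct amdgpu_ring *ring,
						uint32_t frame_idx)
	{
		uint32_t ring_dw = ring->ring_size / 4;	/* ring_size is in bytes */

		return (frame_idx * DMAFRAME_SIZE_DW) % ring_dw;
	}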

end of thread, other threads:[~2016-08-24  9:46 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-08-24  3:42 [PATCH] drm/amdgpu:fix DMAframe for GFX8 Monk Liu
     [not found] ` <1472010140-8571-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2016-08-24  8:16   ` Christian König
     [not found]     ` <16186223-1c07-e03a-5b09-4593b61fafad-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
2016-08-24  8:29       ` Liu, Monk
     [not found]         ` <MWHPR12MB1182CFCA12CD2731D890488D84EA0-Gy0DoCVfaSVhjnLHdLm0OQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2016-08-24  8:56           ` Christian König
     [not found]             ` <34bfdd22-da83-26b2-0949-fafe6b30af78-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
2016-08-24  9:22               ` Liu, Monk
     [not found]                 ` <MWHPR12MB1182A0913A30FA1AC499CD7584EA0-Gy0DoCVfaSVhjnLHdLm0OQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2016-08-24  9:30                   ` Liu, Monk
2016-08-24  9:37                   ` Christian König
     [not found]                     ` <f7e47224-0a17-4bf2-d52a-e8888d707359-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
2016-08-24  9:46                       ` Liu, Monk
2016-08-24  3:59 Monk Liu
2016-08-24  4:03 ` Liu, Monk

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.