All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
@ 2016-08-31  3:49 Monk Liu
       [not found] ` <1472615341-3847-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 20+ messages in thread
From: Monk Liu @ 2016-08-31  3:49 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

v1:
for gfx8, use CONTEXT_CONTROL package to dynamically
skip preamble CEIB and other load_xxx command in sequence.

v2:
support GFX7 as well, and bump up version.
remove cntxcntl in compute ring funcs because CPC doesn't
support this packet.

v3: fix redundant judgement in cntxcntl.

Change-Id: I4b87ca84ea8c11ba4f7fb4c0e8a5be537ccde851
Signed-off-by: Monk Liu <Monk.Liu@amd.com>

Change-Id: I5d24c1bb5c14190ce4adeb6a331ee3d92b3d5c83
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  9 +++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 12 ++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  | 16 +++++++++-------
 drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c   | 20 ++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c   | 30 ++++++++++++++++++++++++++++++
 6 files changed, 82 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 1254410..0de5f08 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -321,6 +321,7 @@ struct amdgpu_ring_funcs {
 	void (*begin_use)(struct amdgpu_ring *ring);
 	void (*end_use)(struct amdgpu_ring *ring);
 	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
+	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
 };
 
 /*
@@ -965,6 +966,7 @@ struct amdgpu_ctx {
 	spinlock_t		ring_lock;
 	struct fence            **fences;
 	struct amdgpu_ctx_ring	rings[AMDGPU_MAX_RINGS];
+	bool preamble_presented;
 };
 
 struct amdgpu_ctx_mgr {
@@ -1227,8 +1229,13 @@ struct amdgpu_cs_parser {
 
 	/* user fence */
 	struct amdgpu_bo_list_entry	uf_entry;
+	bool preamble_present; /* True means this command submit involves a preamble IB */
 };
 
+#define PREAMBLE_IB_PRESENT 		(1 << 0) /* bit set means command submit involves a preamble IB */
+#define PREAMBLE_IB_PRESENT_FIRST	(1 << 1) /* bit set means preamble IB is first presented in belonging context */
+#define HAVE_CTX_SWITCH		(1 << 2) /* bit set means context switch occured */
+
 struct amdgpu_job {
 	struct amd_sched_job    base;
 	struct amdgpu_device	*adev;
@@ -1237,6 +1244,7 @@ struct amdgpu_job {
 	struct amdgpu_sync	sync;
 	struct amdgpu_ib	*ibs;
 	struct fence		*fence; /* the hw fence */
+	uint32_t		preamble_status;
 	uint32_t		num_ibs;
 	void			*owner;
 	uint64_t		fence_ctx; /* the fence_context this job uses */
@@ -2264,6 +2272,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
 #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
 #define amdgpu_ring_emit_hdp_invalidate(r) (r)->funcs->emit_hdp_invalidate((r))
 #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
+#define amdgpu_ring_emit_cntxcntl(r, d) (r)->funcs->emit_cntxcntl((r), (d))
 #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
 #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))
 #define amdgpu_ring_patch_cond_exec(r,o) (r)->funcs->patch_cond_exec((r),(o))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 2d4e005..6d8c050 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -792,6 +792,9 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
 		if (r)
 			return r;
 
+		if (ib->flags & AMDGPU_IB_FLAG_PREAMBLE)
+			parser->preamble_present = true;
+
 		if (parser->job->ring && parser->job->ring != ring)
 			return -EINVAL;
 
@@ -930,6 +933,12 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
 		return r;
 	}
 
+	if (p->preamble_present) {
+		job->preamble_status |= PREAMBLE_IB_PRESENT;
+		if (!p->ctx->preamble_presented)
+			job->preamble_status |= PREAMBLE_IB_PRESENT_FIRST;
+	}
+
 	job->owner = p->filp;
 	job->fence_ctx = entity->fence_context;
 	p->fence = fence_get(&job->base.s_fence->finished);
@@ -940,6 +949,9 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
 	trace_amdgpu_cs_ioctl(job);
 	amd_sched_entity_push_job(&job->base);
 
+	if (p->preamble_present)
+		p->ctx->preamble_presented = true;
+
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 56c85e6..44db0ab 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -55,9 +55,10 @@
  * - 3.3.0 - Add VM support for UVD on supported hardware.
  * - 3.4.0 - Add AMDGPU_INFO_NUM_EVICTIONS.
  * - 3.5.0 - Add support for new UVD_NO_OP register.
+ * - 3.6.0 - UMD doesn't/shouldn't need to use CONTEXT_CONTROL in IB, KMD should do it
  */
 #define KMS_DRIVER_MAJOR	3
-#define KMS_DRIVER_MINOR	5
+#define KMS_DRIVER_MINOR	6
 #define KMS_DRIVER_PATCHLEVEL	0
 
 int amdgpu_vram_limit = 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
index 04263f0..b12b5ba 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
@@ -121,10 +121,11 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 {
 	struct amdgpu_device *adev = ring->adev;
 	struct amdgpu_ib *ib = &ibs[0];
-	bool skip_preamble, need_ctx_switch;
+	bool need_ctx_switch;
 	unsigned patch_offset = ~0;
 	struct amdgpu_vm *vm;
 	uint64_t fence_ctx;
+	uint32_t status = 0;
 
 	unsigned i;
 	int r = 0;
@@ -174,15 +175,16 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 	/* always set cond_exec_polling to CONTINUE */
 	*ring->cond_exe_cpu_addr = 1;
 
-	skip_preamble = ring->current_ctx == fence_ctx;
 	need_ctx_switch = ring->current_ctx != fence_ctx;
+	if (job && ring->funcs->emit_cntxcntl) {
+		if (need_ctx_switch)
+			status |= HAVE_CTX_SWITCH;
+		status |= job->preamble_status;
+		amdgpu_ring_emit_cntxcntl(ring, status);
+	}
+
 	for (i = 0; i < num_ibs; ++i) {
 		ib = &ibs[i];
-
-		/* drop preamble IBs if we don't have a context switch */
-		if ((ib->flags & AMDGPU_IB_FLAG_PREAMBLE) && skip_preamble)
-			continue;
-
 		amdgpu_ring_emit_ib(ring, ib, job ? job->vm_id : 0,
 				    need_ctx_switch);
 		need_ctx_switch = false;
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
index f055d49..0d5addb 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
@@ -2096,6 +2096,25 @@ static void gfx_v7_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
 	amdgpu_ring_write(ring, control);
 }
 
+static void gfx_v7_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags)
+{
+	uint32_t dw2 = 0;
+
+	dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
+	if (flags & HAVE_CTX_SWITCH) {
+		/* set load_global_config & load_global_uconfig */
+		dw2 |= 0x8001;
+		/* set load_cs_sh_regs */
+		dw2 |= 0x01000000;
+		/* set load_per_context_state & load_gfx_sh_regs */
+		dw2 |= 0x10002;
+	}
+
+	amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
+	amdgpu_ring_write(ring, dw2);
+	amdgpu_ring_write(ring, 0);
+}
+
 /**
  * gfx_v7_0_ring_test_ib - basic ring IB test
  *
@@ -4929,6 +4948,7 @@ static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_gfx = {
 	.test_ib = gfx_v7_0_ring_test_ib,
 	.insert_nop = amdgpu_ring_insert_nop,
 	.pad_ib = amdgpu_ring_generic_pad_ib,
+	.emit_cntxcntl = gfx_v7_ring_emit_cntxcntl,
 };
 
 static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_compute = {
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 8ba8e42..73f6ffa 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -6085,6 +6085,35 @@ static void gfx_v8_ring_emit_sb(struct amdgpu_ring *ring)
 	amdgpu_ring_write(ring, 0);
 }
 
+static void gfx_v8_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags)
+{
+	uint32_t dw2 = 0;
+
+	dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
+	if (flags & HAVE_CTX_SWITCH) {
+		/* set load_global_config & load_global_uconfig */
+		dw2 |= 0x8001;
+		/* set load_cs_sh_regs */
+		dw2 |= 0x01000000;
+		/* set load_per_context_state & load_gfx_sh_regs for GFX */
+		dw2 |= 0x10002;
+
+		/* set load_ce_ram if preamble presented */
+		if (PREAMBLE_IB_PRESENT & flags)
+			dw2 |= 0x10000000;
+	} else {
+		/* still load_ce_ram if this is the first time preamble presented
+		 * although there is no context switch happens.
+		 */
+		if (PREAMBLE_IB_PRESENT_FIRST & flags)
+			dw2 |= 0x10000000;
+	}
+
+	amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
+	amdgpu_ring_write(ring, dw2);
+	amdgpu_ring_write(ring, 0);
+}
+
 static void gfx_v8_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
 						 enum amdgpu_interrupt_state state)
 {
@@ -6267,6 +6296,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
 	.insert_nop = amdgpu_ring_insert_nop,
 	.pad_ib = amdgpu_ring_generic_pad_ib,
 	.emit_switch_buffer = gfx_v8_ring_emit_sb,
+	.emit_cntxcntl = gfx_v8_ring_emit_cntxcntl,
 };
 
 static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {
-- 
1.9.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* RE: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
       [not found] ` <1472615341-3847-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2016-08-31 11:31   ` Liu, Monk
  2016-08-31 11:53   ` Christian König
  1 sibling, 0 replies; 20+ messages in thread
From: Liu, Monk @ 2016-08-31 11:31 UTC (permalink / raw)
  To: Liu, Monk, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

If no one objects, I'll push the patch tomorrow.

Tested with FIJI and BONAIRE locally.

BR Monk

-----Original Message-----
From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf Of Monk Liu
Sent: Wednesday, August 31, 2016 11:49 AM
To: amd-gfx@lists.freedesktop.org
Cc: Liu, Monk <Monk.Liu@amd.com>
Subject: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)

v1:
for gfx8, use CONTEXT_CONTROL package to dynamically skip preamble CEIB and other load_xxx command in sequence.

v2:
support GFX7 as well, and bump up version.
remove cntxcntl in compute ring funcs because CPC doesn't support this packet.

v3: fix redundant judgement in cntxcntl.

Change-Id: I4b87ca84ea8c11ba4f7fb4c0e8a5be537ccde851
Signed-off-by: Monk Liu <Monk.Liu@amd.com>

Change-Id: I5d24c1bb5c14190ce4adeb6a331ee3d92b3d5c83
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  9 +++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 12 ++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  | 16 +++++++++-------
 drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c   | 20 ++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c   | 30 ++++++++++++++++++++++++++++++
 6 files changed, 82 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 1254410..0de5f08 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -321,6 +321,7 @@ struct amdgpu_ring_funcs {
 	void (*begin_use)(struct amdgpu_ring *ring);
 	void (*end_use)(struct amdgpu_ring *ring);
 	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
+	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
 };
 
 /*
@@ -965,6 +966,7 @@ struct amdgpu_ctx {
 	spinlock_t		ring_lock;
 	struct fence            **fences;
 	struct amdgpu_ctx_ring	rings[AMDGPU_MAX_RINGS];
+	bool preamble_presented;
 };
 
 struct amdgpu_ctx_mgr {
@@ -1227,8 +1229,13 @@ struct amdgpu_cs_parser {
 
 	/* user fence */
 	struct amdgpu_bo_list_entry	uf_entry;
+	bool preamble_present; /* True means this command submit involves a 
+preamble IB */
 };
 
+#define PREAMBLE_IB_PRESENT 		(1 << 0) /* bit set means command submit involves a preamble IB */
+#define PREAMBLE_IB_PRESENT_FIRST	(1 << 1) /* bit set means preamble IB is first presented in belonging context */
+#define HAVE_CTX_SWITCH		(1 << 2) /* bit set means context switch occured */
+
 struct amdgpu_job {
 	struct amd_sched_job    base;
 	struct amdgpu_device	*adev;
@@ -1237,6 +1244,7 @@ struct amdgpu_job {
 	struct amdgpu_sync	sync;
 	struct amdgpu_ib	*ibs;
 	struct fence		*fence; /* the hw fence */
+	uint32_t		preamble_status;
 	uint32_t		num_ibs;
 	void			*owner;
 	uint64_t		fence_ctx; /* the fence_context this job uses */
@@ -2264,6 +2272,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)  #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))  #define amdgpu_ring_emit_hdp_invalidate(r) (r)->funcs->emit_hdp_invalidate((r))
 #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
+#define amdgpu_ring_emit_cntxcntl(r, d) (r)->funcs->emit_cntxcntl((r), 
+(d))
 #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))  #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))  #define amdgpu_ring_patch_cond_exec(r,o) (r)->funcs->patch_cond_exec((r),(o))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 2d4e005..6d8c050 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -792,6 +792,9 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
 		if (r)
 			return r;
 
+		if (ib->flags & AMDGPU_IB_FLAG_PREAMBLE)
+			parser->preamble_present = true;
+
 		if (parser->job->ring && parser->job->ring != ring)
 			return -EINVAL;
 
@@ -930,6 +933,12 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
 		return r;
 	}
 
+	if (p->preamble_present) {
+		job->preamble_status |= PREAMBLE_IB_PRESENT;
+		if (!p->ctx->preamble_presented)
+			job->preamble_status |= PREAMBLE_IB_PRESENT_FIRST;
+	}
+
 	job->owner = p->filp;
 	job->fence_ctx = entity->fence_context;
 	p->fence = fence_get(&job->base.s_fence->finished);
@@ -940,6 +949,9 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
 	trace_amdgpu_cs_ioctl(job);
 	amd_sched_entity_push_job(&job->base);
 
+	if (p->preamble_present)
+		p->ctx->preamble_presented = true;
+
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 56c85e6..44db0ab 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -55,9 +55,10 @@
  * - 3.3.0 - Add VM support for UVD on supported hardware.
  * - 3.4.0 - Add AMDGPU_INFO_NUM_EVICTIONS.
  * - 3.5.0 - Add support for new UVD_NO_OP register.
+ * - 3.6.0 - UMD doesn't/shouldn't need to use CONTEXT_CONTROL in IB, 
+ KMD should do it
  */
 #define KMS_DRIVER_MAJOR	3
-#define KMS_DRIVER_MINOR	5
+#define KMS_DRIVER_MINOR	6
 #define KMS_DRIVER_PATCHLEVEL	0
 
 int amdgpu_vram_limit = 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
index 04263f0..b12b5ba 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
@@ -121,10 +121,11 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,  {
 	struct amdgpu_device *adev = ring->adev;
 	struct amdgpu_ib *ib = &ibs[0];
-	bool skip_preamble, need_ctx_switch;
+	bool need_ctx_switch;
 	unsigned patch_offset = ~0;
 	struct amdgpu_vm *vm;
 	uint64_t fence_ctx;
+	uint32_t status = 0;
 
 	unsigned i;
 	int r = 0;
@@ -174,15 +175,16 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 	/* always set cond_exec_polling to CONTINUE */
 	*ring->cond_exe_cpu_addr = 1;
 
-	skip_preamble = ring->current_ctx == fence_ctx;
 	need_ctx_switch = ring->current_ctx != fence_ctx;
+	if (job && ring->funcs->emit_cntxcntl) {
+		if (need_ctx_switch)
+			status |= HAVE_CTX_SWITCH;
+		status |= job->preamble_status;
+		amdgpu_ring_emit_cntxcntl(ring, status);
+	}
+
 	for (i = 0; i < num_ibs; ++i) {
 		ib = &ibs[i];
-
-		/* drop preamble IBs if we don't have a context switch */
-		if ((ib->flags & AMDGPU_IB_FLAG_PREAMBLE) && skip_preamble)
-			continue;
-
 		amdgpu_ring_emit_ib(ring, ib, job ? job->vm_id : 0,
 				    need_ctx_switch);
 		need_ctx_switch = false;
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
index f055d49..0d5addb 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
@@ -2096,6 +2096,25 @@ static void gfx_v7_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
 	amdgpu_ring_write(ring, control);
 }
 
+static void gfx_v7_ring_emit_cntxcntl(struct amdgpu_ring *ring, 
+uint32_t flags) {
+	uint32_t dw2 = 0;
+
+	dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
+	if (flags & HAVE_CTX_SWITCH) {
+		/* set load_global_config & load_global_uconfig */
+		dw2 |= 0x8001;
+		/* set load_cs_sh_regs */
+		dw2 |= 0x01000000;
+		/* set load_per_context_state & load_gfx_sh_regs */
+		dw2 |= 0x10002;
+	}
+
+	amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
+	amdgpu_ring_write(ring, dw2);
+	amdgpu_ring_write(ring, 0);
+}
+
 /**
  * gfx_v7_0_ring_test_ib - basic ring IB test
  *
@@ -4929,6 +4948,7 @@ static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_gfx = {
 	.test_ib = gfx_v7_0_ring_test_ib,
 	.insert_nop = amdgpu_ring_insert_nop,
 	.pad_ib = amdgpu_ring_generic_pad_ib,
+	.emit_cntxcntl = gfx_v7_ring_emit_cntxcntl,
 };
 
 static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_compute = { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 8ba8e42..73f6ffa 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -6085,6 +6085,35 @@ static void gfx_v8_ring_emit_sb(struct amdgpu_ring *ring)
 	amdgpu_ring_write(ring, 0);
 }
 
+static void gfx_v8_ring_emit_cntxcntl(struct amdgpu_ring *ring, 
+uint32_t flags) {
+	uint32_t dw2 = 0;
+
+	dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
+	if (flags & HAVE_CTX_SWITCH) {
+		/* set load_global_config & load_global_uconfig */
+		dw2 |= 0x8001;
+		/* set load_cs_sh_regs */
+		dw2 |= 0x01000000;
+		/* set load_per_context_state & load_gfx_sh_regs for GFX */
+		dw2 |= 0x10002;
+
+		/* set load_ce_ram if preamble presented */
+		if (PREAMBLE_IB_PRESENT & flags)
+			dw2 |= 0x10000000;
+	} else {
+		/* still load_ce_ram if this is the first time preamble presented
+		 * although there is no context switch happens.
+		 */
+		if (PREAMBLE_IB_PRESENT_FIRST & flags)
+			dw2 |= 0x10000000;
+	}
+
+	amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
+	amdgpu_ring_write(ring, dw2);
+	amdgpu_ring_write(ring, 0);
+}
+
 static void gfx_v8_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
 						 enum amdgpu_interrupt_state state)  { @@ -6267,6 +6296,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
 	.insert_nop = amdgpu_ring_insert_nop,
 	.pad_ib = amdgpu_ring_generic_pad_ib,
 	.emit_switch_buffer = gfx_v8_ring_emit_sb,
+	.emit_cntxcntl = gfx_v8_ring_emit_cntxcntl,
 };
 
 static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {
--
1.9.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
       [not found] ` <1472615341-3847-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2016-08-31 11:31   ` Liu, Monk
@ 2016-08-31 11:53   ` Christian König
       [not found]     ` <a9b9cfab-4c78-9a90-3f59-6e2ffed73f4b-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
  1 sibling, 1 reply; 20+ messages in thread
From: Christian König @ 2016-08-31 11:53 UTC (permalink / raw)
  To: Monk Liu, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Looks good to me in general, a few nitpicks and suggestions below.

Am 31.08.2016 um 05:49 schrieb Monk Liu:
> v1:
> for gfx8, use CONTEXT_CONTROL package to dynamically
> skip preamble CEIB and other load_xxx command in sequence.
>
> v2:
> support GFX7 as well, and bump up version.
> remove cntxcntl in compute ring funcs because CPC doesn't
> support this packet.
>
> v3: fix reduntant judgement in cntxcntl.
>
> Change-Id: I4b87ca84ea8c11ba4f7fb4c0e8a5be537ccde851
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>
> Change-Id: I5d24c1bb5c14190ce4adeb6a331ee3d92b3d5c83
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

Only one Signed-off-by line is enough, and please remove the Change-Ids.

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  9 +++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 12 ++++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  3 ++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  | 16 +++++++++-------
>   drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c   | 20 ++++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c   | 30 ++++++++++++++++++++++++++++++
>   6 files changed, 82 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 1254410..0de5f08 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -321,6 +321,7 @@ struct amdgpu_ring_funcs {
>   	void (*begin_use)(struct amdgpu_ring *ring);
>   	void (*end_use)(struct amdgpu_ring *ring);
>   	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
> +	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
>   };
>   
>   /*
> @@ -965,6 +966,7 @@ struct amdgpu_ctx {
>   	spinlock_t		ring_lock;
>   	struct fence            **fences;
>   	struct amdgpu_ctx_ring	rings[AMDGPU_MAX_RINGS];
> +	bool preamble_presented;
>   };
>   
>   struct amdgpu_ctx_mgr {
> @@ -1227,8 +1229,13 @@ struct amdgpu_cs_parser {
>   
>   	/* user fence */
>   	struct amdgpu_bo_list_entry	uf_entry;
> +	bool preamble_present; /* True means this command submit involves a preamble IB */

We only need this in amdgpu_cs_ib_fill() don't we? See below as well.

>   };
>   
> +#define PREAMBLE_IB_PRESENT 		(1 << 0) /* bit set means command submit involves a preamble IB */
> +#define PREAMBLE_IB_PRESENT_FIRST	(1 << 1) /* bit set means preamble IB is first presented in belonging context */

Why does it make a difference if it is seen for the first time?

> +#define HAVE_CTX_SWITCH		(1 << 2) /* bit set means context switch occured */
> +
>   struct amdgpu_job {
>   	struct amd_sched_job    base;
>   	struct amdgpu_device	*adev;
> @@ -1237,6 +1244,7 @@ struct amdgpu_job {
>   	struct amdgpu_sync	sync;
>   	struct amdgpu_ib	*ibs;
>   	struct fence		*fence; /* the hw fence */
> +	uint32_t		preamble_status;
>   	uint32_t		num_ibs;
>   	void			*owner;
>   	uint64_t		fence_ctx; /* the fence_context this job uses */
> @@ -2264,6 +2272,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>   #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>   #define amdgpu_ring_emit_hdp_invalidate(r) (r)->funcs->emit_hdp_invalidate((r))
>   #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
> +#define amdgpu_ring_emit_cntxcntl(r, d) (r)->funcs->emit_cntxcntl((r), (d))
>   #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
>   #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))
>   #define amdgpu_ring_patch_cond_exec(r,o) (r)->funcs->patch_cond_exec((r),(o))
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 2d4e005..6d8c050 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -792,6 +792,9 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
>   		if (r)
>   			return r;
>   
> +		if (ib->flags & AMDGPU_IB_FLAG_PREAMBLE)
> +			parser->preamble_present = true;
> +
>   		if (parser->job->ring && parser->job->ring != ring)
>   			return -EINVAL;
>   
> @@ -930,6 +933,12 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>   		return r;
>   	}
>   
> +	if (p->preamble_present) {
> +		job->preamble_status |= PREAMBLE_IB_PRESENT;
> +		if (!p->ctx->preamble_presented)
> +			job->preamble_status |= PREAMBLE_IB_PRESENT_FIRST;
> +	}
> +

Better move this to the end of amdgpu_cs_ib_fill() where we allocate the 
IBs as well.

>   	job->owner = p->filp;
>   	job->fence_ctx = entity->fence_context;
>   	p->fence = fence_get(&job->base.s_fence->finished);
> @@ -940,6 +949,9 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>   	trace_amdgpu_cs_ioctl(job);
>   	amd_sched_entity_push_job(&job->base);
>   
> +	if (p->preamble_present)
> +		p->ctx->preamble_presented = true;
> +
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 56c85e6..44db0ab 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -55,9 +55,10 @@
>    * - 3.3.0 - Add VM support for UVD on supported hardware.
>    * - 3.4.0 - Add AMDGPU_INFO_NUM_EVICTIONS.
>    * - 3.5.0 - Add support for new UVD_NO_OP register.
> + * - 3.6.0 - UMD doesn't/shouldn't need to use CONTEXT_CONTROL in IB, KMD should do it
>    */
>   #define KMS_DRIVER_MAJOR	3
> -#define KMS_DRIVER_MINOR	5
> +#define KMS_DRIVER_MINOR	6
>   #define KMS_DRIVER_PATCHLEVEL	0
>   
>   int amdgpu_vram_limit = 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> index 04263f0..b12b5ba 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> @@ -121,10 +121,11 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>   {
>   	struct amdgpu_device *adev = ring->adev;
>   	struct amdgpu_ib *ib = &ibs[0];
> -	bool skip_preamble, need_ctx_switch;
> +	bool need_ctx_switch;
>   	unsigned patch_offset = ~0;
>   	struct amdgpu_vm *vm;
>   	uint64_t fence_ctx;
> +	uint32_t status = 0;
>   
>   	unsigned i;
>   	int r = 0;
> @@ -174,15 +175,16 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>   	/* always set cond_exec_polling to CONTINUE */
>   	*ring->cond_exe_cpu_addr = 1;
>   
> -	skip_preamble = ring->current_ctx == fence_ctx;
>   	need_ctx_switch = ring->current_ctx != fence_ctx;
> +	if (job && ring->funcs->emit_cntxcntl) {
> +		if (need_ctx_switch)
> +			status |= HAVE_CTX_SWITCH;
> +		status |= job->preamble_status;
> +		amdgpu_ring_emit_cntxcntl(ring, status);
> +	}
> +
>   	for (i = 0; i < num_ibs; ++i) {
>   		ib = &ibs[i];
> -
> -		/* drop preamble IBs if we don't have a context switch */
> -		if ((ib->flags & AMDGPU_IB_FLAG_PREAMBLE) && skip_preamble)
> -			continue;
> -

Would be nice to keep this functionality for cases where we don't 
support emit_cntxcntl (e.g. SI?).

>   		amdgpu_ring_emit_ib(ring, ib, job ? job->vm_id : 0,
>   				    need_ctx_switch);
>   		need_ctx_switch = false;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> index f055d49..0d5addb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> @@ -2096,6 +2096,25 @@ static void gfx_v7_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
>   	amdgpu_ring_write(ring, control);
>   }
>   
> +static void gfx_v7_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags)
> +{
> +	uint32_t dw2 = 0;
> +
> +	dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
> +	if (flags & HAVE_CTX_SWITCH) {
> +		/* set load_global_config & load_global_uconfig */
> +		dw2 |= 0x8001;
> +		/* set load_cs_sh_regs */
> +		dw2 |= 0x01000000;
> +		/* set load_per_context_state & load_gfx_sh_regs */
> +		dw2 |= 0x10002;

Better define some constants for those.

Regards,
Christian.

> +	}
> +
> +	amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
> +	amdgpu_ring_write(ring, dw2);
> +	amdgpu_ring_write(ring, 0);
> +}
> +
>   /**
>    * gfx_v7_0_ring_test_ib - basic ring IB test
>    *
> @@ -4929,6 +4948,7 @@ static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_gfx = {
>   	.test_ib = gfx_v7_0_ring_test_ib,
>   	.insert_nop = amdgpu_ring_insert_nop,
>   	.pad_ib = amdgpu_ring_generic_pad_ib,
> +	.emit_cntxcntl = gfx_v7_ring_emit_cntxcntl,
>   };
>   
>   static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_compute = {
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index 8ba8e42..73f6ffa 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -6085,6 +6085,35 @@ static void gfx_v8_ring_emit_sb(struct amdgpu_ring *ring)
>   	amdgpu_ring_write(ring, 0);
>   }
>   
> +static void gfx_v8_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags)
> +{
> +	uint32_t dw2 = 0;
> +
> +	dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
> +	if (flags & HAVE_CTX_SWITCH) {
> +		/* set load_global_config & load_global_uconfig */
> +		dw2 |= 0x8001;
> +		/* set load_cs_sh_regs */
> +		dw2 |= 0x01000000;
> +		/* set load_per_context_state & load_gfx_sh_regs for GFX */
> +		dw2 |= 0x10002;
> +
> +		/* set load_ce_ram if preamble presented */
> +		if (PREAMBLE_IB_PRESENT & flags)
> +			dw2 |= 0x10000000;
> +	} else {
> +		/* still load_ce_ram if this is the first time preamble presented
> +		 * although there is no context switch happens.
> +		 */
> +		if (PREAMBLE_IB_PRESENT_FIRST & flags)
> +			dw2 |= 0x10000000;
> +	}
> +
> +	amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
> +	amdgpu_ring_write(ring, dw2);
> +	amdgpu_ring_write(ring, 0);
> +}
> +
>   static void gfx_v8_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
>   						 enum amdgpu_interrupt_state state)
>   {
> @@ -6267,6 +6296,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
>   	.insert_nop = amdgpu_ring_insert_nop,
>   	.pad_ib = amdgpu_ring_generic_pad_ib,
>   	.emit_switch_buffer = gfx_v8_ring_emit_sb,
> +	.emit_cntxcntl = gfx_v8_ring_emit_cntxcntl,
>   };
>   
>   static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
       [not found]     ` <a9b9cfab-4c78-9a90-3f59-6e2ffed73f4b-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
@ 2016-09-01  7:37       ` Liu, Monk
       [not found]         ` <MWHPR12MB11829865EEF1352A1A1A4C4084E20-Gy0DoCVfaSVhjnLHdLm0OQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 20+ messages in thread
From: Liu, Monk @ 2016-09-01  7:37 UTC (permalink / raw)
  To: Christian König, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW



-----Original Message-----
From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf Of Christian König
Sent: Wednesday, August 31, 2016 7:53 PM
To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)

Looks good to me in general, a few nitpicks and suggestions below.

Am 31.08.2016 um 05:49 schrieb Monk Liu:
> v1:
> for gfx8, use CONTEXT_CONTROL package to dynamically skip preamble 
> CEIB and other load_xxx command in sequence.
>
> v2:
> support GFX7 as well, and bump up version.
> remove cntxcntl in compute ring funcs because CPC doesn't support this 
> packet.
>
> v3: fix reduntant judgement in cntxcntl.
>
> Change-Id: I4b87ca84ea8c11ba4f7fb4c0e8a5be537ccde851
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>
> Change-Id: I5d24c1bb5c14190ce4adeb6a331ee3d92b3d5c83
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

Only one Signed-off-by line is enough, and please remove the Change-Ids.

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  9 +++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 12 ++++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  3 ++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  | 16 +++++++++-------
>   drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c   | 20 ++++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c   | 30 ++++++++++++++++++++++++++++++
>   6 files changed, 82 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 1254410..0de5f08 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -321,6 +321,7 @@ struct amdgpu_ring_funcs {
>   	void (*begin_use)(struct amdgpu_ring *ring);
>   	void (*end_use)(struct amdgpu_ring *ring);
>   	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
> +	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
>   };
>   
>   /*
> @@ -965,6 +966,7 @@ struct amdgpu_ctx {
>   	spinlock_t		ring_lock;
>   	struct fence            **fences;
>   	struct amdgpu_ctx_ring	rings[AMDGPU_MAX_RINGS];
> +	bool preamble_presented;
>   };
>   
>   struct amdgpu_ctx_mgr {
> @@ -1227,8 +1229,13 @@ struct amdgpu_cs_parser {
>   
>   	/* user fence */
>   	struct amdgpu_bo_list_entry	uf_entry;
> +	bool preamble_present; /* True means this command submit involves a 
> +preamble IB */

We only need this in amdgpu_cs_ib_fill() don't we? See below as well.

[ML] seems good advice 

>   };
>   
> +#define PREAMBLE_IB_PRESENT 		(1 << 0) /* bit set means command submit involves a preamble IB */
> +#define PREAMBLE_IB_PRESENT_FIRST	(1 << 1) /* bit set means preamble IB is first presented in belonging context */

Why does that makes a difference if it is seen for the first time?

[ml] If the preamble IB is presented for the first time in its owning context, then even if the current CS does not involve a context switch, we still need to keep the actions in the preamble IB.
Usually, if the current CS is from the same context as the previous CS, no context switch occurs, so we can skip the actions in the preamble IB. The case above is the exception.

> +#define HAVE_CTX_SWITCH		(1 << 2) /* bit set means context switch occured */
> +
>   struct amdgpu_job {
>   	struct amd_sched_job    base;
>   	struct amdgpu_device	*adev;
> @@ -1237,6 +1244,7 @@ struct amdgpu_job {
>   	struct amdgpu_sync	sync;
>   	struct amdgpu_ib	*ibs;
>   	struct fence		*fence; /* the hw fence */
> +	uint32_t		preamble_status;
>   	uint32_t		num_ibs;
>   	void			*owner;
>   	uint64_t		fence_ctx; /* the fence_context this job uses */
> @@ -2264,6 +2272,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>   #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>   #define amdgpu_ring_emit_hdp_invalidate(r) (r)->funcs->emit_hdp_invalidate((r))
>   #define amdgpu_ring_emit_switch_buffer(r) 
> (r)->funcs->emit_switch_buffer((r))
> +#define amdgpu_ring_emit_cntxcntl(r, d) 
> +(r)->funcs->emit_cntxcntl((r), (d))
>   #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
>   #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))
>   #define amdgpu_ring_patch_cond_exec(r,o) 
> (r)->funcs->patch_cond_exec((r),(o))
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 2d4e005..6d8c050 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -792,6 +792,9 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
>   		if (r)
>   			return r;
>   
> +		if (ib->flags & AMDGPU_IB_FLAG_PREAMBLE)
> +			parser->preamble_present = true;
> +
>   		if (parser->job->ring && parser->job->ring != ring)
>   			return -EINVAL;
>   
> @@ -930,6 +933,12 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>   		return r;
>   	}
>   
> +	if (p->preamble_present) {
> +		job->preamble_status |= PREAMBLE_IB_PRESENT;
> +		if (!p->ctx->preamble_presented)
> +			job->preamble_status |= PREAMBLE_IB_PRESENT_FIRST;
> +	}
> +

Better move this to the end of amdgpu_cs_ib_fill() where we allocate the IBs as well.
[ML] okay, good change.



>   	job->owner = p->filp;
>   	job->fence_ctx = entity->fence_context;
>   	p->fence = fence_get(&job->base.s_fence->finished);
> @@ -940,6 +949,9 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>   	trace_amdgpu_cs_ioctl(job);
>   	amd_sched_entity_push_job(&job->base);
>   
> +	if (p->preamble_present)
> +		p->ctx->preamble_presented = true;
> +
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 56c85e6..44db0ab 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -55,9 +55,10 @@
>    * - 3.3.0 - Add VM support for UVD on supported hardware.
>    * - 3.4.0 - Add AMDGPU_INFO_NUM_EVICTIONS.
>    * - 3.5.0 - Add support for new UVD_NO_OP register.
> + * - 3.6.0 - UMD doesn't/shouldn't need to use CONTEXT_CONTROL in IB, 
> + KMD should do it
>    */
>   #define KMS_DRIVER_MAJOR	3
> -#define KMS_DRIVER_MINOR	5
> +#define KMS_DRIVER_MINOR	6
>   #define KMS_DRIVER_PATCHLEVEL	0
>   
>   int amdgpu_vram_limit = 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> index 04263f0..b12b5ba 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> @@ -121,10 +121,11 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>   {
>   	struct amdgpu_device *adev = ring->adev;
>   	struct amdgpu_ib *ib = &ibs[0];
> -	bool skip_preamble, need_ctx_switch;
> +	bool need_ctx_switch;
>   	unsigned patch_offset = ~0;
>   	struct amdgpu_vm *vm;
>   	uint64_t fence_ctx;
> +	uint32_t status = 0;
>   
>   	unsigned i;
>   	int r = 0;
> @@ -174,15 +175,16 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>   	/* always set cond_exec_polling to CONTINUE */
>   	*ring->cond_exe_cpu_addr = 1;
>   
> -	skip_preamble = ring->current_ctx == fence_ctx;
>   	need_ctx_switch = ring->current_ctx != fence_ctx;
> +	if (job && ring->funcs->emit_cntxcntl) {
> +		if (need_ctx_switch)
> +			status |= HAVE_CTX_SWITCH;
> +		status |= job->preamble_status;
> +		amdgpu_ring_emit_cntxcntl(ring, status);
> +	}
> +
>   	for (i = 0; i < num_ibs; ++i) {
>   		ib = &ibs[i];
> -
> -		/* drop preamble IBs if we don't have a context switch */
> -		if ((ib->flags & AMDGPU_IB_FLAG_PREAMBLE) && skip_preamble)
> -			continue;
> -

Would be nice to keep this functionality for cases where we don't support emit_cntxcntl (e.g. SI?).
[ML] SI supports CONTEXT_CONTROL as well, and the packet structure is exactly the same as on CI.

>   		amdgpu_ring_emit_ib(ring, ib, job ? job->vm_id : 0,
>   				    need_ctx_switch);
>   		need_ctx_switch = false;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> index f055d49..0d5addb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> @@ -2096,6 +2096,25 @@ static void gfx_v7_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
>   	amdgpu_ring_write(ring, control);
>   }
>   
> +static void gfx_v7_ring_emit_cntxcntl(struct amdgpu_ring *ring, 
> +uint32_t flags) {
> +	uint32_t dw2 = 0;
> +
> +	dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
> +	if (flags & HAVE_CTX_SWITCH) {
> +		/* set load_global_config & load_global_uconfig */
> +		dw2 |= 0x8001;
> +		/* set load_cs_sh_regs */
> +		dw2 |= 0x01000000;
> +		/* set load_per_context_state & load_gfx_sh_regs */
> +		dw2 |= 0x10002;

Better define some constants for those.

[ML] I'll leave it to other guys when doing cleanups, a little hurry for other jobs now ...

Regards,
Christian.

> +	}
> +
> +	amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
> +	amdgpu_ring_write(ring, dw2);
> +	amdgpu_ring_write(ring, 0);
> +}
> +
>   /**
>    * gfx_v7_0_ring_test_ib - basic ring IB test
>    *
> @@ -4929,6 +4948,7 @@ static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_gfx = {
>   	.test_ib = gfx_v7_0_ring_test_ib,
>   	.insert_nop = amdgpu_ring_insert_nop,
>   	.pad_ib = amdgpu_ring_generic_pad_ib,
> +	.emit_cntxcntl = gfx_v7_ring_emit_cntxcntl,
>   };
>   
>   static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_compute = 
> { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index 8ba8e42..73f6ffa 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -6085,6 +6085,35 @@ static void gfx_v8_ring_emit_sb(struct amdgpu_ring *ring)
>   	amdgpu_ring_write(ring, 0);
>   }
>   
> +static void gfx_v8_ring_emit_cntxcntl(struct amdgpu_ring *ring, 
> +uint32_t flags) {
> +	uint32_t dw2 = 0;
> +
> +	dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
> +	if (flags & HAVE_CTX_SWITCH) {
> +		/* set load_global_config & load_global_uconfig */
> +		dw2 |= 0x8001;
> +		/* set load_cs_sh_regs */
> +		dw2 |= 0x01000000;
> +		/* set load_per_context_state & load_gfx_sh_regs for GFX */
> +		dw2 |= 0x10002;
> +
> +		/* set load_ce_ram if preamble presented */
> +		if (PREAMBLE_IB_PRESENT & flags)
> +			dw2 |= 0x10000000;
> +	} else {
> +		/* still load_ce_ram if this is the first time preamble presented
> +		 * although there is no context switch happens.
> +		 */
> +		if (PREAMBLE_IB_PRESENT_FIRST & flags)
> +			dw2 |= 0x10000000;
> +	}
> +
> +	amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
> +	amdgpu_ring_write(ring, dw2);
> +	amdgpu_ring_write(ring, 0);
> +}
> +
>   static void gfx_v8_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
>   						 enum amdgpu_interrupt_state state)
>   {
> @@ -6267,6 +6296,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
>   	.insert_nop = amdgpu_ring_insert_nop,
>   	.pad_ib = amdgpu_ring_generic_pad_ib,
>   	.emit_switch_buffer = gfx_v8_ring_emit_sb,
> +	.emit_cntxcntl = gfx_v8_ring_emit_cntxcntl,
>   };
>   
>   static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = 
> {


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
       [not found]         ` <MWHPR12MB11829865EEF1352A1A1A4C4084E20-Gy0DoCVfaSVhjnLHdLm0OQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2016-09-01  8:19           ` Bas Nieuwenhuizen
       [not found]             ` <CAP+8YyGECuHNkTD6C5075R9m1wmzMRg=DHQRTRWY2ce1aNJUjA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 20+ messages in thread
From: Bas Nieuwenhuizen @ 2016-09-01  8:19 UTC (permalink / raw)
  To: Liu, Monk; +Cc: Christian König, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On Thu, Sep 1, 2016 at 9:37 AM, Liu, Monk <Monk.Liu@amd.com> wrote:
>
>
> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf Of Christian König
> Sent: Wednesday, August 31, 2016 7:53 PM
> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
>
> Looks good to me in general, a few nit picks and sugegstions below.
>
> Am 31.08.2016 um 05:49 schrieb Monk Liu:
>> v1:
>> for gfx8, use CONTEXT_CONTROL package to dynamically skip preamble
>> CEIB and other load_xxx command in sequence.
>>
>> v2:
>> support GFX7 as well, and bump up version.
>> remove cntxcntl in compute ring funcs because CPC doesn't support this
>> packet.
>>
>> v3: fix reduntant judgement in cntxcntl.
>>
>> Change-Id: I4b87ca84ea8c11ba4f7fb4c0e8a5be537ccde851
>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>>
>> Change-Id: I5d24c1bb5c14190ce4adeb6a331ee3d92b3d5c83
>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>
> Only one signed of by line is enough and remove the change-ids.
>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  9 +++++++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 12 ++++++++++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  3 ++-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  | 16 +++++++++-------
>>   drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c   | 20 ++++++++++++++++++++
>>   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c   | 30 ++++++++++++++++++++++++++++++
>>   6 files changed, 82 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 1254410..0de5f08 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -321,6 +321,7 @@ struct amdgpu_ring_funcs {
>>       void (*begin_use)(struct amdgpu_ring *ring);
>>       void (*end_use)(struct amdgpu_ring *ring);
>>       void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>> +     void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
>>   };
>>
>>   /*
>> @@ -965,6 +966,7 @@ struct amdgpu_ctx {
>>       spinlock_t              ring_lock;
>>       struct fence            **fences;
>>       struct amdgpu_ctx_ring  rings[AMDGPU_MAX_RINGS];
>> +     bool preamble_presented;
>>   };
>>
>>   struct amdgpu_ctx_mgr {
>> @@ -1227,8 +1229,13 @@ struct amdgpu_cs_parser {
>>
>>       /* user fence */
>>       struct amdgpu_bo_list_entry     uf_entry;
>> +     bool preamble_present; /* True means this command submit involves a
>> +preamble IB */
>
> We only need this in amdgpu_cs_ib_fill() don't we? See below as well.
>
> [ML] seems good advice
>
>>   };
>>
>> +#define PREAMBLE_IB_PRESENT          (1 << 0) /* bit set means command submit involves a preamble IB */
>> +#define PREAMBLE_IB_PRESENT_FIRST    (1 << 1) /* bit set means preamble IB is first presented in belonging context */
>
> Why does that makes a difference if it is seen for the first time?
>
> [ml] if it is presented for the first time for belonging ctx, means even current CS do not involve context switch, we still need keep the actions in preamble IB.
> Usually if current CS is from the same cntx of previous CS, that means no ctx switch occurs, so we can skip the actions in preamble IB. but above case is the exception.

Can't userspace just not set the preamble flag for the first submit
with a preamble? I think that would result in the same behavior,
unless having two non-preamble CE IB's in a single submit is an issue.

- Bas

>
>> +#define HAVE_CTX_SWITCH              (1 << 2) /* bit set means context switch occured */
>> +
>>   struct amdgpu_job {
>>       struct amd_sched_job    base;
>>       struct amdgpu_device    *adev;
>> @@ -1237,6 +1244,7 @@ struct amdgpu_job {
>>       struct amdgpu_sync      sync;
>>       struct amdgpu_ib        *ibs;
>>       struct fence            *fence; /* the hw fence */
>> +     uint32_t                preamble_status;
>>       uint32_t                num_ibs;
>>       void                    *owner;
>>       uint64_t                fence_ctx; /* the fence_context this job uses */
>> @@ -2264,6 +2272,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>>   #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>>   #define amdgpu_ring_emit_hdp_invalidate(r) (r)->funcs->emit_hdp_invalidate((r))
>>   #define amdgpu_ring_emit_switch_buffer(r)
>> (r)->funcs->emit_switch_buffer((r))
>> +#define amdgpu_ring_emit_cntxcntl(r, d)
>> +(r)->funcs->emit_cntxcntl((r), (d))
>>   #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
>>   #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))
>>   #define amdgpu_ring_patch_cond_exec(r,o)
>> (r)->funcs->patch_cond_exec((r),(o))
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> index 2d4e005..6d8c050 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> @@ -792,6 +792,9 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
>>               if (r)
>>                       return r;
>>
>> +             if (ib->flags & AMDGPU_IB_FLAG_PREAMBLE)
>> +                     parser->preamble_present = true;
>> +
>>               if (parser->job->ring && parser->job->ring != ring)
>>                       return -EINVAL;
>>
>> @@ -930,6 +933,12 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>>               return r;
>>       }
>>
>> +     if (p->preamble_present) {
>> +             job->preamble_status |= PREAMBLE_IB_PRESENT;
>> +             if (!p->ctx->preamble_presented)
>> +                     job->preamble_status |= PREAMBLE_IB_PRESENT_FIRST;
>> +     }
>> +
>
> Better move this to the end of amdgpu_cs_ib_fill() where we allocate the IBs as well.
> [ML] okay, good change.
>
>
>
>>       job->owner = p->filp;
>>       job->fence_ctx = entity->fence_context;
>>       p->fence = fence_get(&job->base.s_fence->finished);
>> @@ -940,6 +949,9 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>>       trace_amdgpu_cs_ioctl(job);
>>       amd_sched_entity_push_job(&job->base);
>>
>> +     if (p->preamble_present)
>> +             p->ctx->preamble_presented = true;
>> +
>>       return 0;
>>   }
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> index 56c85e6..44db0ab 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> @@ -55,9 +55,10 @@
>>    * - 3.3.0 - Add VM support for UVD on supported hardware.
>>    * - 3.4.0 - Add AMDGPU_INFO_NUM_EVICTIONS.
>>    * - 3.5.0 - Add support for new UVD_NO_OP register.
>> + * - 3.6.0 - UMD doesn't/shouldn't need to use CONTEXT_CONTROL in IB,
>> + KMD should do it
>>    */
>>   #define KMS_DRIVER_MAJOR    3
>> -#define KMS_DRIVER_MINOR     5
>> +#define KMS_DRIVER_MINOR     6
>>   #define KMS_DRIVER_PATCHLEVEL       0
>>
>>   int amdgpu_vram_limit = 0;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>> index 04263f0..b12b5ba 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>> @@ -121,10 +121,11 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>   {
>>       struct amdgpu_device *adev = ring->adev;
>>       struct amdgpu_ib *ib = &ibs[0];
>> -     bool skip_preamble, need_ctx_switch;
>> +     bool need_ctx_switch;
>>       unsigned patch_offset = ~0;
>>       struct amdgpu_vm *vm;
>>       uint64_t fence_ctx;
>> +     uint32_t status = 0;
>>
>>       unsigned i;
>>       int r = 0;
>> @@ -174,15 +175,16 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>       /* always set cond_exec_polling to CONTINUE */
>>       *ring->cond_exe_cpu_addr = 1;
>>
>> -     skip_preamble = ring->current_ctx == fence_ctx;
>>       need_ctx_switch = ring->current_ctx != fence_ctx;
>> +     if (job && ring->funcs->emit_cntxcntl) {
>> +             if (need_ctx_switch)
>> +                     status |= HAVE_CTX_SWITCH;
>> +             status |= job->preamble_status;
>> +             amdgpu_ring_emit_cntxcntl(ring, status);
>> +     }
>> +
>>       for (i = 0; i < num_ibs; ++i) {
>>               ib = &ibs[i];
>> -
>> -             /* drop preamble IBs if we don't have a context switch */
>> -             if ((ib->flags & AMDGPU_IB_FLAG_PREAMBLE) && skip_preamble)
>> -                     continue;
>> -
>
> Would be nice to keep this functionality for cases where we don't support emit_cntxcntl (e.g. SI?).
> [ML] SI support CONTEXT_CONTROL as well, and the package structure is exactly the same as CI.
>
>>               amdgpu_ring_emit_ib(ring, ib, job ? job->vm_id : 0,
>>                                   need_ctx_switch);
>>               need_ctx_switch = false;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>> index f055d49..0d5addb 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>> @@ -2096,6 +2096,25 @@ static void gfx_v7_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
>>       amdgpu_ring_write(ring, control);
>>   }
>>
>> +static void gfx_v7_ring_emit_cntxcntl(struct amdgpu_ring *ring,
>> +uint32_t flags) {
>> +     uint32_t dw2 = 0;
>> +
>> +     dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
>> +     if (flags & HAVE_CTX_SWITCH) {
>> +             /* set load_global_config & load_global_uconfig */
>> +             dw2 |= 0x8001;
>> +             /* set load_cs_sh_regs */
>> +             dw2 |= 0x01000000;
>> +             /* set load_per_context_state & load_gfx_sh_regs */
>> +             dw2 |= 0x10002;
>
> Better define some constants for those.
>
> [ML] I'll leave it to other guys when doing cleanups, a little hurry for other jobs now ...
>
> Regards,
> Christian.
>
>> +     }
>> +
>> +     amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
>> +     amdgpu_ring_write(ring, dw2);
>> +     amdgpu_ring_write(ring, 0);
>> +}
>> +
>>   /**
>>    * gfx_v7_0_ring_test_ib - basic ring IB test
>>    *
>> @@ -4929,6 +4948,7 @@ static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_gfx = {
>>       .test_ib = gfx_v7_0_ring_test_ib,
>>       .insert_nop = amdgpu_ring_insert_nop,
>>       .pad_ib = amdgpu_ring_generic_pad_ib,
>> +     .emit_cntxcntl = gfx_v7_ring_emit_cntxcntl,
>>   };
>>
>>   static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_compute =
>> { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> index 8ba8e42..73f6ffa 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> @@ -6085,6 +6085,35 @@ static void gfx_v8_ring_emit_sb(struct amdgpu_ring *ring)
>>       amdgpu_ring_write(ring, 0);
>>   }
>>
>> +static void gfx_v8_ring_emit_cntxcntl(struct amdgpu_ring *ring,
>> +uint32_t flags) {
>> +     uint32_t dw2 = 0;
>> +
>> +     dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
>> +     if (flags & HAVE_CTX_SWITCH) {
>> +             /* set load_global_config & load_global_uconfig */
>> +             dw2 |= 0x8001;
>> +             /* set load_cs_sh_regs */
>> +             dw2 |= 0x01000000;
>> +             /* set load_per_context_state & load_gfx_sh_regs for GFX */
>> +             dw2 |= 0x10002;
>> +
>> +             /* set load_ce_ram if preamble presented */
>> +             if (PREAMBLE_IB_PRESENT & flags)
>> +                     dw2 |= 0x10000000;
>> +     } else {
>> +             /* still load_ce_ram if this is the first time preamble presented
>> +              * although there is no context switch happens.
>> +              */
>> +             if (PREAMBLE_IB_PRESENT_FIRST & flags)
>> +                     dw2 |= 0x10000000;
>> +     }
>> +
>> +     amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
>> +     amdgpu_ring_write(ring, dw2);
>> +     amdgpu_ring_write(ring, 0);
>> +}
>> +
>>   static void gfx_v8_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
>>                                                enum amdgpu_interrupt_state state)
>>   {
>> @@ -6267,6 +6296,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
>>       .insert_nop = amdgpu_ring_insert_nop,
>>       .pad_ib = amdgpu_ring_generic_pad_ib,
>>       .emit_switch_buffer = gfx_v8_ring_emit_sb,
>> +     .emit_cntxcntl = gfx_v8_ring_emit_cntxcntl,
>>   };
>>
>>   static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute =
>> {
>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
       [not found]             ` <CAP+8YyGECuHNkTD6C5075R9m1wmzMRg=DHQRTRWY2ce1aNJUjA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2016-09-01 10:55               ` Liu, Monk
       [not found]                 ` <MWHPR12MB1182883EE214228FBFCE081C84E20-Gy0DoCVfaSVhjnLHdLm0OQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 20+ messages in thread
From: Liu, Monk @ 2016-09-01 10:55 UTC (permalink / raw)
  To: Bas Nieuwenhuizen
  Cc: Christian König, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

> Why does that makes a difference if it is seen for the first time?
>
> [ml] if it is presented for the first time for belonging ctx, means even current CS do not involve context switch, we still need keep the actions in preamble IB.
> Usually if current CS is from the same cntx of previous CS, that means no ctx switch occurs, so we can skip the actions in preamble IB. but above case is the exception.

Can't userspace just not set the preamble flag for the first submit with a preamble? I think that would result in the same behavior, unless having two non-preamble CE IB's in a single submit is an issue.

- Bas


[ML] I'm confused, what's your point?

With this patch, preamble_flag is not needed at all.
Without this patch, many of the original assumptions and much of the logic are not correct.
Besides, CONTEXT_CONTROL deals not only with CE but also with DE.

BR Monk


-----Original Message-----
From: Bas Nieuwenhuizen [mailto:bas@basnieuwenhuizen.nl] 
Sent: Thursday, September 01, 2016 4:19 PM
To: Liu, Monk <Monk.Liu@amd.com>
Cc: Christian König <deathsimple@vodafone.de>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)

On Thu, Sep 1, 2016 at 9:37 AM, Liu, Monk <Monk.Liu@amd.com> wrote:
>
>
> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf 
> Of Christian König
> Sent: Wednesday, August 31, 2016 7:53 PM
> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
>
> Looks good to me in general, a few nitpicks and suggestions below.
>
> Am 31.08.2016 um 05:49 schrieb Monk Liu:
>> v1:
>> for gfx8, use CONTEXT_CONTROL package to dynamically skip preamble 
>> CEIB and other load_xxx command in sequence.
>>
>> v2:
>> support GFX7 as well, and bump up version.
>> remove cntxcntl in compute ring funcs because CPC doesn't support 
>> this packet.
>>
>> v3: fix reduntant judgement in cntxcntl.
>>
>> Change-Id: I4b87ca84ea8c11ba4f7fb4c0e8a5be537ccde851
>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>>
>> Change-Id: I5d24c1bb5c14190ce4adeb6a331ee3d92b3d5c83
>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>
> Only one Signed-off-by line is enough, and please remove the Change-Ids.
>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  9 +++++++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 12 ++++++++++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  3 ++-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  | 16 +++++++++-------
>>   drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c   | 20 ++++++++++++++++++++
>>   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c   | 30 ++++++++++++++++++++++++++++++
>>   6 files changed, 82 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 1254410..0de5f08 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -321,6 +321,7 @@ struct amdgpu_ring_funcs {
>>       void (*begin_use)(struct amdgpu_ring *ring);
>>       void (*end_use)(struct amdgpu_ring *ring);
>>       void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>> +     void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t 
>> + flags);
>>   };
>>
>>   /*
>> @@ -965,6 +966,7 @@ struct amdgpu_ctx {
>>       spinlock_t              ring_lock;
>>       struct fence            **fences;
>>       struct amdgpu_ctx_ring  rings[AMDGPU_MAX_RINGS];
>> +     bool preamble_presented;
>>   };
>>
>>   struct amdgpu_ctx_mgr {
>> @@ -1227,8 +1229,13 @@ struct amdgpu_cs_parser {
>>
>>       /* user fence */
>>       struct amdgpu_bo_list_entry     uf_entry;
>> +     bool preamble_present; /* True means this command submit 
>> +involves a preamble IB */
>
> We only need this in amdgpu_cs_ib_fill() don't we? See below as well.
>
> [ML] seems good advice
>
>>   };
>>
>> +#define PREAMBLE_IB_PRESENT          (1 << 0) /* bit set means command submit involves a preamble IB */
>> +#define PREAMBLE_IB_PRESENT_FIRST    (1 << 1) /* bit set means preamble IB is first presented in belonging context */
>
> Why does that makes a difference if it is seen for the first time?
>
> [ml] if it is presented for the first time for belonging ctx, means even current CS do not involve context switch, we still need keep the actions in preamble IB.
> Usually if current CS is from the same cntx of previous CS, that means no ctx switch occurs, so we can skip the actions in preamble IB. but above case is the exception.

Can't userspace just not set the preamble flag for the first submit with a preamble? I think that would result in the same behavior, unless having two non-preamble CE IB's in a single submit is an issue.

- Bas

>
>> +#define HAVE_CTX_SWITCH              (1 << 2) /* bit set means context switch occured */
>> +
>>   struct amdgpu_job {
>>       struct amd_sched_job    base;
>>       struct amdgpu_device    *adev;
>> @@ -1237,6 +1244,7 @@ struct amdgpu_job {
>>       struct amdgpu_sync      sync;
>>       struct amdgpu_ib        *ibs;
>>       struct fence            *fence; /* the hw fence */
>> +     uint32_t                preamble_status;
>>       uint32_t                num_ibs;
>>       void                    *owner;
>>       uint64_t                fence_ctx; /* the fence_context this job uses */
>> @@ -2264,6 +2272,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>>   #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>>   #define amdgpu_ring_emit_hdp_invalidate(r) (r)->funcs->emit_hdp_invalidate((r))
>>   #define amdgpu_ring_emit_switch_buffer(r)
>> (r)->funcs->emit_switch_buffer((r))
>> +#define amdgpu_ring_emit_cntxcntl(r, d) 
>> +(r)->funcs->emit_cntxcntl((r), (d))
>>   #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
>>   #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))
>>   #define amdgpu_ring_patch_cond_exec(r,o)
>> (r)->funcs->patch_cond_exec((r),(o))
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> index 2d4e005..6d8c050 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> @@ -792,6 +792,9 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
>>               if (r)
>>                       return r;
>>
>> +             if (ib->flags & AMDGPU_IB_FLAG_PREAMBLE)
>> +                     parser->preamble_present = true;
>> +
>>               if (parser->job->ring && parser->job->ring != ring)
>>                       return -EINVAL;
>>
>> @@ -930,6 +933,12 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>>               return r;
>>       }
>>
>> +     if (p->preamble_present) {
>> +             job->preamble_status |= PREAMBLE_IB_PRESENT;
>> +             if (!p->ctx->preamble_presented)
>> +                     job->preamble_status |= PREAMBLE_IB_PRESENT_FIRST;
>> +     }
>> +
>
> Better move this to the end of amdgpu_cs_ib_fill() where we allocate the IBs as well.
> [ML] okay, good change.
>
>
>
>>       job->owner = p->filp;
>>       job->fence_ctx = entity->fence_context;
>>       p->fence = fence_get(&job->base.s_fence->finished);
>> @@ -940,6 +949,9 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>>       trace_amdgpu_cs_ioctl(job);
>>       amd_sched_entity_push_job(&job->base);
>>
>> +     if (p->preamble_present)
>> +             p->ctx->preamble_presented = true;
>> +
>>       return 0;
>>   }
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> index 56c85e6..44db0ab 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> @@ -55,9 +55,10 @@
>>    * - 3.3.0 - Add VM support for UVD on supported hardware.
>>    * - 3.4.0 - Add AMDGPU_INFO_NUM_EVICTIONS.
>>    * - 3.5.0 - Add support for new UVD_NO_OP register.
>> + * - 3.6.0 - UMD doesn't/shouldn't need to use CONTEXT_CONTROL in 
>> + IB, KMD should do it
>>    */
>>   #define KMS_DRIVER_MAJOR    3
>> -#define KMS_DRIVER_MINOR     5
>> +#define KMS_DRIVER_MINOR     6
>>   #define KMS_DRIVER_PATCHLEVEL       0
>>
>>   int amdgpu_vram_limit = 0;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>> index 04263f0..b12b5ba 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>> @@ -121,10 +121,11 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>   {
>>       struct amdgpu_device *adev = ring->adev;
>>       struct amdgpu_ib *ib = &ibs[0];
>> -     bool skip_preamble, need_ctx_switch;
>> +     bool need_ctx_switch;
>>       unsigned patch_offset = ~0;
>>       struct amdgpu_vm *vm;
>>       uint64_t fence_ctx;
>> +     uint32_t status = 0;
>>
>>       unsigned i;
>>       int r = 0;
>> @@ -174,15 +175,16 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>       /* always set cond_exec_polling to CONTINUE */
>>       *ring->cond_exe_cpu_addr = 1;
>>
>> -     skip_preamble = ring->current_ctx == fence_ctx;
>>       need_ctx_switch = ring->current_ctx != fence_ctx;
>> +     if (job && ring->funcs->emit_cntxcntl) {
>> +             if (need_ctx_switch)
>> +                     status |= HAVE_CTX_SWITCH;
>> +             status |= job->preamble_status;
>> +             amdgpu_ring_emit_cntxcntl(ring, status);
>> +     }
>> +
>>       for (i = 0; i < num_ibs; ++i) {
>>               ib = &ibs[i];
>> -
>> -             /* drop preamble IBs if we don't have a context switch */
>> -             if ((ib->flags & AMDGPU_IB_FLAG_PREAMBLE) && skip_preamble)
>> -                     continue;
>> -
>
> Would be nice to keep this functionality for cases where we don't support emit_cntxcntl (e.g. SI?).
> [ML] SI support CONTEXT_CONTROL as well, and the package structure is exactly the same as CI.
>
>>               amdgpu_ring_emit_ib(ring, ib, job ? job->vm_id : 0,
>>                                   need_ctx_switch);
>>               need_ctx_switch = false; diff --git 
>> a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>> index f055d49..0d5addb 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>> @@ -2096,6 +2096,25 @@ static void gfx_v7_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
>>       amdgpu_ring_write(ring, control);
>>   }
>>
>> +static void gfx_v7_ring_emit_cntxcntl(struct amdgpu_ring *ring, 
>> +uint32_t flags) {
>> +     uint32_t dw2 = 0;
>> +
>> +     dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
>> +     if (flags & HAVE_CTX_SWITCH) {
>> +             /* set load_global_config & load_global_uconfig */
>> +             dw2 |= 0x8001;
>> +             /* set load_cs_sh_regs */
>> +             dw2 |= 0x01000000;
>> +             /* set load_per_context_state & load_gfx_sh_regs */
>> +             dw2 |= 0x10002;
>
> Better define some constants for those.
>
> [ML] I'll leave it to other guys when doing cleanups, a little hurry for other jobs now ...
>
> Regards,
> Christian.
>
>> +     }
>> +
>> +     amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
>> +     amdgpu_ring_write(ring, dw2);
>> +     amdgpu_ring_write(ring, 0);
>> +}
>> +
>>   /**
>>    * gfx_v7_0_ring_test_ib - basic ring IB test
>>    *
>> @@ -4929,6 +4948,7 @@ static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_gfx = {
>>       .test_ib = gfx_v7_0_ring_test_ib,
>>       .insert_nop = amdgpu_ring_insert_nop,
>>       .pad_ib = amdgpu_ring_generic_pad_ib,
>> +     .emit_cntxcntl = gfx_v7_ring_emit_cntxcntl,
>>   };
>>
>>   static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_compute = 
>> { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> index 8ba8e42..73f6ffa 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> @@ -6085,6 +6085,35 @@ static void gfx_v8_ring_emit_sb(struct amdgpu_ring *ring)
>>       amdgpu_ring_write(ring, 0);
>>   }
>>
>> +static void gfx_v8_ring_emit_cntxcntl(struct amdgpu_ring *ring, 
>> +uint32_t flags) {
>> +     uint32_t dw2 = 0;
>> +
>> +     dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
>> +     if (flags & HAVE_CTX_SWITCH) {
>> +             /* set load_global_config & load_global_uconfig */
>> +             dw2 |= 0x8001;
>> +             /* set load_cs_sh_regs */
>> +             dw2 |= 0x01000000;
>> +             /* set load_per_context_state & load_gfx_sh_regs for GFX */
>> +             dw2 |= 0x10002;
>> +
>> +             /* set load_ce_ram if preamble presented */
>> +             if (PREAMBLE_IB_PRESENT & flags)
>> +                     dw2 |= 0x10000000;
>> +     } else {
>> +             /* still load_ce_ram if this is the first time preamble presented
>> +              * although there is no context switch happens.
>> +              */
>> +             if (PREAMBLE_IB_PRESENT_FIRST & flags)
>> +                     dw2 |= 0x10000000;
>> +     }
>> +
>> +     amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
>> +     amdgpu_ring_write(ring, dw2);
>> +     amdgpu_ring_write(ring, 0);
>> +}
>> +
>>   static void gfx_v8_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
>>                                                enum amdgpu_interrupt_state state)
>>   {
>> @@ -6267,6 +6296,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
>>       .insert_nop = amdgpu_ring_insert_nop,
>>       .pad_ib = amdgpu_ring_generic_pad_ib,
>>       .emit_switch_buffer = gfx_v8_ring_emit_sb,
>> +     .emit_cntxcntl = gfx_v8_ring_emit_cntxcntl,
>>   };
>>
>>   static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = 
>> {
>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
       [not found]                 ` <MWHPR12MB1182883EE214228FBFCE081C84E20-Gy0DoCVfaSVhjnLHdLm0OQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2016-09-01 14:10                   ` Christian König
       [not found]                     ` <8fed1480-794e-7218-17d1-52221060a149-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
  2016-09-01 16:09                   ` Bas Nieuwenhuizen
  1 sibling, 1 reply; 20+ messages in thread
From: Christian König @ 2016-09-01 14:10 UTC (permalink / raw)
  To: Liu, Monk, Bas Nieuwenhuizen; +Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Am 01.09.2016 um 12:55 schrieb Liu, Monk:
>> Why does that makes a difference if it is seen for the first time?
>>
>> [ml] if it is presented for the first time for belonging ctx, means even current CS do not involve context switch, we still need keep the actions in preamble IB.
>> Usually if current CS is from the same cntx of previous CS, that means no ctx switch occurs, so we can skip the actions in preamble IB. but above case is the exception.
> Can't userspace just not set the preamble flag for the first submit with a preamble? I think that would result in the same behavior, unless having two non-preamble CE IB's in a single submit is an issue.
>
> - Bas
>
>
> [ML] I'm confused, what's your point?
>
> With this patch, preamble_flag is not needed at all.

Well then there is something wrong with the patch. Setting the preamble 
flag should result in an IB only being executed when there was a task 
switch.

We can either implement that as a software solution by skipping the IBs 
in question or by using the hardware for this.

Double checking your patch actually reveals that you want to emit the 
context control package only once, so this can't be correct.

Regards,
Christian.

> Without this patch,  many original assumption and logic is not correct.
> Besides, CONTEXT_CONTROL not only deals CE but also deal DE.
>
> BR Monk
>
>
> -----Original Message-----
> From: Bas Nieuwenhuizen [mailto:bas@basnieuwenhuizen.nl]
> Sent: Thursday, September 01, 2016 4:19 PM
> To: Liu, Monk <Monk.Liu@amd.com>
> Cc: Christian König <deathsimple@vodafone.de>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
>
> On Thu, Sep 1, 2016 at 9:37 AM, Liu, Monk <Monk.Liu@amd.com> wrote:
>>
>> -----Original Message-----
>> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
>> Of Christian König
>> Sent: Wednesday, August 31, 2016 7:53 PM
>> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
>>
>> Looks good to me in general, a few nitpicks and suggestions below.
>>
>> Am 31.08.2016 um 05:49 schrieb Monk Liu:
>>> v1:
>>> for gfx8, use CONTEXT_CONTROL package to dynamically skip preamble
>>> CEIB and other load_xxx command in sequence.
>>>
>>> v2:
>>> support GFX7 as well, and bump up version.
>>> remove cntxcntl in compute ring funcs because CPC doesn't support
>>> this packet.
>>>
>>> v3: fix reduntant judgement in cntxcntl.
>>>
>>> Change-Id: I4b87ca84ea8c11ba4f7fb4c0e8a5be537ccde851
>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>>>
>>> Change-Id: I5d24c1bb5c14190ce4adeb6a331ee3d92b3d5c83
>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>> Only one signed of by line is enough and remove the change-ids.
>>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  9 +++++++++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 12 ++++++++++++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  3 ++-
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  | 16 +++++++++-------
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c   | 20 ++++++++++++++++++++
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c   | 30 ++++++++++++++++++++++++++++++
>>>    6 files changed, 82 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index 1254410..0de5f08 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -321,6 +321,7 @@ struct amdgpu_ring_funcs {
>>>        void (*begin_use)(struct amdgpu_ring *ring);
>>>        void (*end_use)(struct amdgpu_ring *ring);
>>>        void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>>> +     void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t
>>> + flags);
>>>    };
>>>
>>>    /*
>>> @@ -965,6 +966,7 @@ struct amdgpu_ctx {
>>>        spinlock_t              ring_lock;
>>>        struct fence            **fences;
>>>        struct amdgpu_ctx_ring  rings[AMDGPU_MAX_RINGS];
>>> +     bool preamble_presented;
>>>    };
>>>
>>>    struct amdgpu_ctx_mgr {
>>> @@ -1227,8 +1229,13 @@ struct amdgpu_cs_parser {
>>>
>>>        /* user fence */
>>>        struct amdgpu_bo_list_entry     uf_entry;
>>> +     bool preamble_present; /* True means this command submit
>>> +involves a preamble IB */
>> We only need this in amdgpu_cs_ib_fill() don't we? See below as well.
>>
>> [ML] seems good advice
>>
>>>    };
>>>
>>> +#define PREAMBLE_IB_PRESENT          (1 << 0) /* bit set means command submit involves a preamble IB */
>>> +#define PREAMBLE_IB_PRESENT_FIRST    (1 << 1) /* bit set means preamble IB is first presented in belonging context */
>> Why does that makes a difference if it is seen for the first time?
>>
>> [ml] if it is presented for the first time for belonging ctx, means even current CS do not involve context switch, we still need keep the actions in preamble IB.
>> Usually if current CS is from the same cntx of previous CS, that means no ctx switch occurs, so we can skip the actions in preamble IB. but above case is the exception.
> Can't userspace just not set the preamble flag for the first submit with a preamble? I think that would result in the same behavior, unless having two non-preamble CE IB's in a single submit is an issue.
>
> - Bas
>
>>> +#define HAVE_CTX_SWITCH              (1 << 2) /* bit set means context switch occured */
>>> +
>>>    struct amdgpu_job {
>>>        struct amd_sched_job    base;
>>>        struct amdgpu_device    *adev;
>>> @@ -1237,6 +1244,7 @@ struct amdgpu_job {
>>>        struct amdgpu_sync      sync;
>>>        struct amdgpu_ib        *ibs;
>>>        struct fence            *fence; /* the hw fence */
>>> +     uint32_t                preamble_status;
>>>        uint32_t                num_ibs;
>>>        void                    *owner;
>>>        uint64_t                fence_ctx; /* the fence_context this job uses */
>>> @@ -2264,6 +2272,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>>>    #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>>>    #define amdgpu_ring_emit_hdp_invalidate(r) (r)->funcs->emit_hdp_invalidate((r))
>>>    #define amdgpu_ring_emit_switch_buffer(r)
>>> (r)->funcs->emit_switch_buffer((r))
>>> +#define amdgpu_ring_emit_cntxcntl(r, d)
>>> +(r)->funcs->emit_cntxcntl((r), (d))
>>>    #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
>>>    #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))
>>>    #define amdgpu_ring_patch_cond_exec(r,o)
>>> (r)->funcs->patch_cond_exec((r),(o))
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> index 2d4e005..6d8c050 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> @@ -792,6 +792,9 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
>>>                if (r)
>>>                        return r;
>>>
>>> +             if (ib->flags & AMDGPU_IB_FLAG_PREAMBLE)
>>> +                     parser->preamble_present = true;
>>> +
>>>                if (parser->job->ring && parser->job->ring != ring)
>>>                        return -EINVAL;
>>>
>>> @@ -930,6 +933,12 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>>>                return r;
>>>        }
>>>
>>> +     if (p->preamble_present) {
>>> +             job->preamble_status |= PREAMBLE_IB_PRESENT;
>>> +             if (!p->ctx->preamble_presented)
>>> +                     job->preamble_status |= PREAMBLE_IB_PRESENT_FIRST;
>>> +     }
>>> +
>> Better move this to the end of amdgpu_cs_ib_fill() where we allocate the IBs as well.
>> [ML] okay, good change.
>>
>>
>>
>>>        job->owner = p->filp;
>>>        job->fence_ctx = entity->fence_context;
>>>        p->fence = fence_get(&job->base.s_fence->finished);
>>> @@ -940,6 +949,9 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>>>        trace_amdgpu_cs_ioctl(job);
>>>        amd_sched_entity_push_job(&job->base);
>>>
>>> +     if (p->preamble_present)
>>> +             p->ctx->preamble_presented = true;
>>> +
>>>        return 0;
>>>    }
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> index 56c85e6..44db0ab 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> @@ -55,9 +55,10 @@
>>>     * - 3.3.0 - Add VM support for UVD on supported hardware.
>>>     * - 3.4.0 - Add AMDGPU_INFO_NUM_EVICTIONS.
>>>     * - 3.5.0 - Add support for new UVD_NO_OP register.
>>> + * - 3.6.0 - UMD doesn't/shouldn't need to use CONTEXT_CONTROL in
>>> + IB, KMD should do it
>>>     */
>>>    #define KMS_DRIVER_MAJOR    3
>>> -#define KMS_DRIVER_MINOR     5
>>> +#define KMS_DRIVER_MINOR     6
>>>    #define KMS_DRIVER_PATCHLEVEL       0
>>>
>>>    int amdgpu_vram_limit = 0;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> index 04263f0..b12b5ba 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> @@ -121,10 +121,11 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>    {
>>>        struct amdgpu_device *adev = ring->adev;
>>>        struct amdgpu_ib *ib = &ibs[0];
>>> -     bool skip_preamble, need_ctx_switch;
>>> +     bool need_ctx_switch;
>>>        unsigned patch_offset = ~0;
>>>        struct amdgpu_vm *vm;
>>>        uint64_t fence_ctx;
>>> +     uint32_t status = 0;
>>>
>>>        unsigned i;
>>>        int r = 0;
>>> @@ -174,15 +175,16 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>        /* always set cond_exec_polling to CONTINUE */
>>>        *ring->cond_exe_cpu_addr = 1;
>>>
>>> -     skip_preamble = ring->current_ctx == fence_ctx;
>>>        need_ctx_switch = ring->current_ctx != fence_ctx;
>>> +     if (job && ring->funcs->emit_cntxcntl) {
>>> +             if (need_ctx_switch)
>>> +                     status |= HAVE_CTX_SWITCH;
>>> +             status |= job->preamble_status;
>>> +             amdgpu_ring_emit_cntxcntl(ring, status);
>>> +     }
>>> +
>>>        for (i = 0; i < num_ibs; ++i) {
>>>                ib = &ibs[i];
>>> -
>>> -             /* drop preamble IBs if we don't have a context switch */
>>> -             if ((ib->flags & AMDGPU_IB_FLAG_PREAMBLE) && skip_preamble)
>>> -                     continue;
>>> -
>> Would be nice to keep this functionality for cases where we don't support emit_cntxcntl (e.g. SI?).
>> [ML] SI support CONTEXT_CONTROL as well, and the package structure is exactly the same as CI.
>>
>>>                amdgpu_ring_emit_ib(ring, ib, job ? job->vm_id : 0,
>>>                                    need_ctx_switch);
>>>                need_ctx_switch = false; diff --git
>>> a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>> index f055d49..0d5addb 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>> @@ -2096,6 +2096,25 @@ static void gfx_v7_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
>>>        amdgpu_ring_write(ring, control);
>>>    }
>>>
>>> +static void gfx_v7_ring_emit_cntxcntl(struct amdgpu_ring *ring,
>>> +uint32_t flags) {
>>> +     uint32_t dw2 = 0;
>>> +
>>> +     dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
>>> +     if (flags & HAVE_CTX_SWITCH) {
>>> +             /* set load_global_config & load_global_uconfig */
>>> +             dw2 |= 0x8001;
>>> +             /* set load_cs_sh_regs */
>>> +             dw2 |= 0x01000000;
>>> +             /* set load_per_context_state & load_gfx_sh_regs */
>>> +             dw2 |= 0x10002;
>> Better define some constants for those.
>>
>> [ML] I'll leave it to other guys when doing cleanups, a little hurry for other jobs now ...
>>
>> Regards,
>> Christian.
>>
>>> +     }
>>> +
>>> +     amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
>>> +     amdgpu_ring_write(ring, dw2);
>>> +     amdgpu_ring_write(ring, 0);
>>> +}
>>> +
>>>    /**
>>>     * gfx_v7_0_ring_test_ib - basic ring IB test
>>>     *
>>> @@ -4929,6 +4948,7 @@ static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_gfx = {
>>>        .test_ib = gfx_v7_0_ring_test_ib,
>>>        .insert_nop = amdgpu_ring_insert_nop,
>>>        .pad_ib = amdgpu_ring_generic_pad_ib,
>>> +     .emit_cntxcntl = gfx_v7_ring_emit_cntxcntl,
>>>    };
>>>
>>>    static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_compute =
>>> { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> index 8ba8e42..73f6ffa 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> @@ -6085,6 +6085,35 @@ static void gfx_v8_ring_emit_sb(struct amdgpu_ring *ring)
>>>        amdgpu_ring_write(ring, 0);
>>>    }
>>>
>>> +static void gfx_v8_ring_emit_cntxcntl(struct amdgpu_ring *ring,
>>> +uint32_t flags) {
>>> +     uint32_t dw2 = 0;
>>> +
>>> +     dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
>>> +     if (flags & HAVE_CTX_SWITCH) {
>>> +             /* set load_global_config & load_global_uconfig */
>>> +             dw2 |= 0x8001;
>>> +             /* set load_cs_sh_regs */
>>> +             dw2 |= 0x01000000;
>>> +             /* set load_per_context_state & load_gfx_sh_regs for GFX */
>>> +             dw2 |= 0x10002;
>>> +
>>> +             /* set load_ce_ram if preamble presented */
>>> +             if (PREAMBLE_IB_PRESENT & flags)
>>> +                     dw2 |= 0x10000000;
>>> +     } else {
>>> +             /* still load_ce_ram if this is the first time preamble presented
>>> +              * although there is no context switch happens.
>>> +              */
>>> +             if (PREAMBLE_IB_PRESENT_FIRST & flags)
>>> +                     dw2 |= 0x10000000;
>>> +     }
>>> +
>>> +     amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
>>> +     amdgpu_ring_write(ring, dw2);
>>> +     amdgpu_ring_write(ring, 0);
>>> +}
>>> +
>>>    static void gfx_v8_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
>>>                                                 enum amdgpu_interrupt_state state)
>>>    {
>>> @@ -6267,6 +6296,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
>>>        .insert_nop = amdgpu_ring_insert_nop,
>>>        .pad_ib = amdgpu_ring_generic_pad_ib,
>>>        .emit_switch_buffer = gfx_v8_ring_emit_sb,
>>> +     .emit_cntxcntl = gfx_v8_ring_emit_cntxcntl,
>>>    };
>>>
>>>    static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute =
>>> {
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
       [not found]                     ` <8fed1480-794e-7218-17d1-52221060a149-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
@ 2016-09-01 15:15                       ` Liu, Monk
       [not found]                         ` <MWHPR12MB11820449DB6EF7F18F9397A284E20-Gy0DoCVfaSVhjnLHdLm0OQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  2016-09-01 15:40                       ` Liu, Monk
  1 sibling, 1 reply; 20+ messages in thread
From: Liu, Monk @ 2016-09-01 15:15 UTC (permalink / raw)
  To: Christian König, Bas Nieuwenhuizen
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Which part is not correct?

When CONTEXT_CONTROL is introduced on the kernel side, it dynamically skips or keeps the commands in the preamble CEIB (LOAD_CE_RAM), so the preamble CE IB no longer needs to be skipped by a software method.

But don't forget that CONTEXT_CONTROL also dynamically controls if the following load_xxx commands (in DE IB) will be skipped or kept, 

The original method can only skip the preamble CE IB (entirely) if there is no context switch, but it cannot skip the load_xxx commands from the DE IB, since the UMD is not aware of process switches (which lead to context switches).

UMD should always insert load_xxx commands in the DE IB wrapped by preamble_start and preamble_end, and let KMD decide whether those load_xxx commands shall be kept or skipped by CONTEXT_CONTROL.



Now we go back to original logic:

Even if there is no context switch, the original method still keeps those load_xxx commands in the DE IB, right? (Preamble_flag only controls skipping of the preamble CE IB.)

The "Preamble_flag" flag does not work for the DE IB, so that design is incorrect ... and I am really shocked that we have used the wrong method for such a long time ...
(not to mention that MESA inserts CONTEXT_CONTROL in the IB, which is clearly wrong).

Since MESA also uses CE, we can totally drop "Preamble_flag" and bump up the version. I don't know why we cannot just sync with the Windows KMD scheme for this.

BR Monk


-----Original Message-----
From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf Of Christian König
Sent: Thursday, September 01, 2016 10:10 PM
To: Liu, Monk <Monk.Liu@amd.com>; Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Cc: amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)

Am 01.09.2016 um 12:55 schrieb Liu, Monk:
>> Why does that makes a difference if it is seen for the first time?
>>
>> [ml] if it is presented for the first time for belonging ctx, means even current CS do not involve context switch, we still need keep the actions in preamble IB.
>> Usually if current CS is from the same cntx of previous CS, that means no ctx switch occurs, so we can skip the actions in preamble IB. but above case is the exception.
> Can't userspace just not set the preamble flag for the first submit with a preamble? I think that would result in the same behavior, unless having two non-preamble CE IB's in a single submit is an issue.
>
> - Bas
>
>
> [ML] I'm confused, what's your point?
>
> With this patch, preamble_flag is not needed at all.

Well then there is something wrong with the patch. Setting the preamble flag should result in an IB only being executed when there was a task switch.

We can either implement that as a software solution by skipping the IBs in question or by using the hardware for this.

Double checking your patch actually reveals that you want to emit the context control package only once, so this can't be correct.

Regards,
Christian.

> Without this patch,  many original assumption and logic is not correct.
> Besides, CONTEXT_CONTROL not only deals CE but also deal DE.
>
> BR Monk
>
>
> -----Original Message-----
> From: Bas Nieuwenhuizen [mailto:bas@basnieuwenhuizen.nl]
> Sent: Thursday, September 01, 2016 4:19 PM
> To: Liu, Monk <Monk.Liu@amd.com>
> Cc: Christian König <deathsimple@vodafone.de>; 
> amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
>
> On Thu, Sep 1, 2016 at 9:37 AM, Liu, Monk <Monk.Liu@amd.com> wrote:
>>
>> -----Original Message-----
>> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On 
>> Behalf Of Christian K?nig
>> Sent: Wednesday, August 31, 2016 7:53 PM
>> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
>>
>> Looks good to me in general, a few nit picks and sugegstions below.
>>
>> Am 31.08.2016 um 05:49 schrieb Monk Liu:
>>> v1:
>>> for gfx8, use CONTEXT_CONTROL package to dynamically skip preamble 
>>> CEIB and other load_xxx command in sequence.
>>>
>>> v2:
>>> support GFX7 as well, and bump up version.
>>> remove cntxcntl in compute ring funcs because CPC doesn't support 
>>> this packet.
>>>
>>> v3: fix reduntant judgement in cntxcntl.
>>>
>>> Change-Id: I4b87ca84ea8c11ba4f7fb4c0e8a5be537ccde851
>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>>>
>>> Change-Id: I5d24c1bb5c14190ce4adeb6a331ee3d92b3d5c83
>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>> Only one signed of by line is enough and remove the change-ids.
>>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  9 +++++++++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 12 ++++++++++++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  3 ++-
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  | 16 +++++++++-------
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c   | 20 ++++++++++++++++++++
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c   | 30 ++++++++++++++++++++++++++++++
>>>    6 files changed, 82 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index 1254410..0de5f08 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -321,6 +321,7 @@ struct amdgpu_ring_funcs {
>>>        void (*begin_use)(struct amdgpu_ring *ring);
>>>        void (*end_use)(struct amdgpu_ring *ring);
>>>        void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>>> +     void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t 
>>> + flags);
>>>    };
>>>
>>>    /*
>>> @@ -965,6 +966,7 @@ struct amdgpu_ctx {
>>>        spinlock_t              ring_lock;
>>>        struct fence            **fences;
>>>        struct amdgpu_ctx_ring  rings[AMDGPU_MAX_RINGS];
>>> +     bool preamble_presented;
>>>    };
>>>
>>>    struct amdgpu_ctx_mgr {
>>> @@ -1227,8 +1229,13 @@ struct amdgpu_cs_parser {
>>>
>>>        /* user fence */
>>>        struct amdgpu_bo_list_entry     uf_entry;
>>> +     bool preamble_present; /* True means this command submit 
>>> +involves a preamble IB */
>> We only need this in amdgpu_cs_ib_fill() don't we? See below as well.
>>
>> [ML] seems good advice
>>
>>>    };
>>>
>>> +#define PREAMBLE_IB_PRESENT          (1 << 0) /* bit set means command submit involves a preamble IB */
>>> +#define PREAMBLE_IB_PRESENT_FIRST    (1 << 1) /* bit set means preamble IB is first presented in belonging context */
>> Why does that makes a difference if it is seen for the first time?
>>
>> [ml] if it is presented for the first time for belonging ctx, means even current CS do not involve context switch, we still need keep the actions in preamble IB.
>> Usually if current CS is from the same cntx of previous CS, that means no ctx switch occurs, so we can skip the actions in preamble IB. but above case is the exception.
> Can't userspace just not set the preamble flag for the first submit with a preamble? I think that would result in the same behavior, unless having two non-preamble CE IB's in a single submit is an issue.
>
> - Bas
>
>>> +#define HAVE_CTX_SWITCH              (1 << 2) /* bit set means context switch occured */
>>> +
>>>    struct amdgpu_job {
>>>        struct amd_sched_job    base;
>>>        struct amdgpu_device    *adev;
>>> @@ -1237,6 +1244,7 @@ struct amdgpu_job {
>>>        struct amdgpu_sync      sync;
>>>        struct amdgpu_ib        *ibs;
>>>        struct fence            *fence; /* the hw fence */
>>> +     uint32_t                preamble_status;
>>>        uint32_t                num_ibs;
>>>        void                    *owner;
>>>        uint64_t                fence_ctx; /* the fence_context this job uses */
>>> @@ -2264,6 +2272,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>>>    #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>>>    #define amdgpu_ring_emit_hdp_invalidate(r) (r)->funcs->emit_hdp_invalidate((r))
>>>    #define amdgpu_ring_emit_switch_buffer(r)
>>> (r)->funcs->emit_switch_buffer((r))
>>> +#define amdgpu_ring_emit_cntxcntl(r, d) 
>>> +(r)->funcs->emit_cntxcntl((r), (d))
>>>    #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
>>>    #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))
>>>    #define amdgpu_ring_patch_cond_exec(r,o)
>>> (r)->funcs->patch_cond_exec((r),(o))
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> index 2d4e005..6d8c050 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> @@ -792,6 +792,9 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
>>>                if (r)
>>>                        return r;
>>>
>>> +             if (ib->flags & AMDGPU_IB_FLAG_PREAMBLE)
>>> +                     parser->preamble_present = true;
>>> +
>>>                if (parser->job->ring && parser->job->ring != ring)
>>>                        return -EINVAL;
>>>
>>> @@ -930,6 +933,12 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>>>                return r;
>>>        }
>>>
>>> +     if (p->preamble_present) {
>>> +             job->preamble_status |= PREAMBLE_IB_PRESENT;
>>> +             if (!p->ctx->preamble_presented)
>>> +                     job->preamble_status |= PREAMBLE_IB_PRESENT_FIRST;
>>> +     }
>>> +
>> Better move this to the end of amdgpu_cs_ib_fill() where we allocate the IBs as well.
>> [ML] okay, good change.
>>
>>
>>
>>>        job->owner = p->filp;
>>>        job->fence_ctx = entity->fence_context;
>>>        p->fence = fence_get(&job->base.s_fence->finished);
>>> @@ -940,6 +949,9 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>>>        trace_amdgpu_cs_ioctl(job);
>>>        amd_sched_entity_push_job(&job->base);
>>>
>>> +     if (p->preamble_present)
>>> +             p->ctx->preamble_presented = true;
>>> +
>>>        return 0;
>>>    }
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> index 56c85e6..44db0ab 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> @@ -55,9 +55,10 @@
>>>     * - 3.3.0 - Add VM support for UVD on supported hardware.
>>>     * - 3.4.0 - Add AMDGPU_INFO_NUM_EVICTIONS.
>>>     * - 3.5.0 - Add support for new UVD_NO_OP register.
>>> + * - 3.6.0 - UMD doesn't/shouldn't need to use CONTEXT_CONTROL in 
>>> + IB, KMD should do it
>>>     */
>>>    #define KMS_DRIVER_MAJOR    3
>>> -#define KMS_DRIVER_MINOR     5
>>> +#define KMS_DRIVER_MINOR     6
>>>    #define KMS_DRIVER_PATCHLEVEL       0
>>>
>>>    int amdgpu_vram_limit = 0;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> index 04263f0..b12b5ba 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> @@ -121,10 +121,11 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>    {
>>>        struct amdgpu_device *adev = ring->adev;
>>>        struct amdgpu_ib *ib = &ibs[0];
>>> -     bool skip_preamble, need_ctx_switch;
>>> +     bool need_ctx_switch;
>>>        unsigned patch_offset = ~0;
>>>        struct amdgpu_vm *vm;
>>>        uint64_t fence_ctx;
>>> +     uint32_t status = 0;
>>>
>>>        unsigned i;
>>>        int r = 0;
>>> @@ -174,15 +175,16 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>        /* always set cond_exec_polling to CONTINUE */
>>>        *ring->cond_exe_cpu_addr = 1;
>>>
>>> -     skip_preamble = ring->current_ctx == fence_ctx;
>>>        need_ctx_switch = ring->current_ctx != fence_ctx;
>>> +     if (job && ring->funcs->emit_cntxcntl) {
>>> +             if (need_ctx_switch)
>>> +                     status |= HAVE_CTX_SWITCH;
>>> +             status |= job->preamble_status;
>>> +             amdgpu_ring_emit_cntxcntl(ring, status);
>>> +     }
>>> +
>>>        for (i = 0; i < num_ibs; ++i) {
>>>                ib = &ibs[i];
>>> -
>>> -             /* drop preamble IBs if we don't have a context switch */
>>> -             if ((ib->flags & AMDGPU_IB_FLAG_PREAMBLE) && skip_preamble)
>>> -                     continue;
>>> -
>> Would be nice to keep this functionality for cases where we don't support emit_cntxcntl (e.g. SI?).
>> [ML] SI support CONTEXT_CONTROL as well, and the package structure is exactly the same as CI.
>>
>>>                amdgpu_ring_emit_ib(ring, ib, job ? job->vm_id : 0,
>>>                                    need_ctx_switch);
>>>                need_ctx_switch = false; diff --git 
>>> a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>> index f055d49..0d5addb 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>> @@ -2096,6 +2096,25 @@ static void gfx_v7_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
>>>        amdgpu_ring_write(ring, control);
>>>    }
>>>
>>> +static void gfx_v7_ring_emit_cntxcntl(struct amdgpu_ring *ring, 
>>> +uint32_t flags) {
>>> +     uint32_t dw2 = 0;
>>> +
>>> +     dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
>>> +     if (flags & HAVE_CTX_SWITCH) {
>>> +             /* set load_global_config & load_global_uconfig */
>>> +             dw2 |= 0x8001;
>>> +             /* set load_cs_sh_regs */
>>> +             dw2 |= 0x01000000;
>>> +             /* set load_per_context_state & load_gfx_sh_regs */
>>> +             dw2 |= 0x10002;
>> Better define some constants for those.
>>
>> [ML] I'll leave it to other guys when doing cleanups, a little hurry for other jobs now ...
>>
>> Regards,
>> Christian.
>>
>>> +     }
>>> +
>>> +     amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
>>> +     amdgpu_ring_write(ring, dw2);
>>> +     amdgpu_ring_write(ring, 0);
>>> +}
>>> +
>>>    /**
>>>     * gfx_v7_0_ring_test_ib - basic ring IB test
>>>     *
>>> @@ -4929,6 +4948,7 @@ static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_gfx = {
>>>        .test_ib = gfx_v7_0_ring_test_ib,
>>>        .insert_nop = amdgpu_ring_insert_nop,
>>>        .pad_ib = amdgpu_ring_generic_pad_ib,
>>> +     .emit_cntxcntl = gfx_v7_ring_emit_cntxcntl,
>>>    };
>>>
>>>    static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_compute 
>>> = { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> index 8ba8e42..73f6ffa 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> @@ -6085,6 +6085,35 @@ static void gfx_v8_ring_emit_sb(struct amdgpu_ring *ring)
>>>        amdgpu_ring_write(ring, 0);
>>>    }
>>>
>>> +static void gfx_v8_ring_emit_cntxcntl(struct amdgpu_ring *ring, 
>>> +uint32_t flags) {
>>> +     uint32_t dw2 = 0;
>>> +
>>> +     dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
>>> +     if (flags & HAVE_CTX_SWITCH) {
>>> +             /* set load_global_config & load_global_uconfig */
>>> +             dw2 |= 0x8001;
>>> +             /* set load_cs_sh_regs */
>>> +             dw2 |= 0x01000000;
>>> +             /* set load_per_context_state & load_gfx_sh_regs for GFX */
>>> +             dw2 |= 0x10002;
>>> +
>>> +             /* set load_ce_ram if preamble presented */
>>> +             if (PREAMBLE_IB_PRESENT & flags)
>>> +                     dw2 |= 0x10000000;
>>> +     } else {
>>> +             /* still load_ce_ram if this is the first time preamble presented
>>> +              * although there is no context switch happens.
>>> +              */
>>> +             if (PREAMBLE_IB_PRESENT_FIRST & flags)
>>> +                     dw2 |= 0x10000000;
>>> +     }
>>> +
>>> +     amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
>>> +     amdgpu_ring_write(ring, dw2);
>>> +     amdgpu_ring_write(ring, 0);
>>> +}
>>> +
>>>    static void gfx_v8_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
>>>                                                 enum amdgpu_interrupt_state state)
>>>    {
>>> @@ -6267,6 +6296,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
>>>        .insert_nop = amdgpu_ring_insert_nop,
>>>        .pad_ib = amdgpu_ring_generic_pad_ib,
>>>        .emit_switch_buffer = gfx_v8_ring_emit_sb,
>>> +     .emit_cntxcntl = gfx_v8_ring_emit_cntxcntl,
>>>    };
>>>
>>>    static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute 
>>> = {
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
       [not found]                     ` <8fed1480-794e-7218-17d1-52221060a149-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
  2016-09-01 15:15                       ` Liu, Monk
@ 2016-09-01 15:40                       ` Liu, Monk
  1 sibling, 0 replies; 20+ messages in thread
From: Liu, Monk @ 2016-09-01 15:40 UTC (permalink / raw)
  To: Christian König, Bas Nieuwenhuizen
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Well then there is something wrong with the patch. Setting the preamble flag should result in an IB only being executed when there was a task switch.

We can either implement that as a software solution by skipping the IBs in question or by using the hardware for this.

Double checking your patch actually reveals that you want to emit the context control package only once, so this can't be correct.


[ML] 
> Double checking your patch actually reveals that you want to emit the context control package only once, so this can't be correct.

Why would I want to emit the CONTEXT_CONTROL package only once, and where did you get that from?
CONTEXT_CONTROL is always inserted in the ring buffer, whether there is a context switch or not ...
Please refer to CP_PACKET_TG for CONTEXT_CONTROL usage ...

BR Monk

-----Original Message-----
From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf Of Christian König
Sent: Thursday, September 01, 2016 10:10 PM
To: Liu, Monk <Monk.Liu@amd.com>; Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Cc: amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)

Am 01.09.2016 um 12:55 schrieb Liu, Monk:
>> Why does that makes a difference if it is seen for the first time?
>>
>> [ml] if it is presented for the first time for belonging ctx, means even current CS do not involve context switch, we still need keep the actions in preamble IB.
>> Usually if current CS is from the same cntx of previous CS, that means no ctx switch occurs, so we can skip the actions in preamble IB. but above case is the exception.
> Can't userspace just not set the preamble flag for the first submit with a preamble? I think that would result in the same behavior, unless having two non-preamble CE IB's in a single submit is an issue.
>
> - Bas
>
>
> [ML] I'm confused, what's your point?
>
> With this patch, preamble_flag is not needed at all.

Well then there is something wrong with the patch. Setting the preamble flag should result in an IB only being executed when there was a task switch.

We can either implement that as a software solution by skipping the IBs in question or by using the hardware for this.

Double checking your patch actually reveals that you want to emit the context control package only once, so this can't be correct.

Regards,
Christian.

> Without this patch,  many original assumption and logic is not correct.
> Besides, CONTEXT_CONTROL not only deals CE but also deal DE.
>
> BR Monk
>
>
> -----Original Message-----
> From: Bas Nieuwenhuizen [mailto:bas@basnieuwenhuizen.nl]
> Sent: Thursday, September 01, 2016 4:19 PM
> To: Liu, Monk <Monk.Liu@amd.com>
> Cc: Christian König <deathsimple@vodafone.de>; 
> amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
>
> On Thu, Sep 1, 2016 at 9:37 AM, Liu, Monk <Monk.Liu@amd.com> wrote:
>>
>> -----Original Message-----
>> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On 
>> Behalf Of Christian K?nig
>> Sent: Wednesday, August 31, 2016 7:53 PM
>> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
>>
>> Looks good to me in general, a few nit picks and sugegstions below.
>>
>> Am 31.08.2016 um 05:49 schrieb Monk Liu:
>>> v1:
>>> for gfx8, use CONTEXT_CONTROL package to dynamically skip preamble 
>>> CEIB and other load_xxx command in sequence.
>>>
>>> v2:
>>> support GFX7 as well, and bump up version.
>>> remove cntxcntl in compute ring funcs because CPC doesn't support 
>>> this packet.
>>>
>>> v3: fix reduntant judgement in cntxcntl.
>>>
>>> Change-Id: I4b87ca84ea8c11ba4f7fb4c0e8a5be537ccde851
>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>>>
>>> Change-Id: I5d24c1bb5c14190ce4adeb6a331ee3d92b3d5c83
>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>> Only one signed of by line is enough and remove the change-ids.
>>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  9 +++++++++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 12 ++++++++++++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  3 ++-
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  | 16 +++++++++-------
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c   | 20 ++++++++++++++++++++
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c   | 30 ++++++++++++++++++++++++++++++
>>>    6 files changed, 82 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index 1254410..0de5f08 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -321,6 +321,7 @@ struct amdgpu_ring_funcs {
>>>        void (*begin_use)(struct amdgpu_ring *ring);
>>>        void (*end_use)(struct amdgpu_ring *ring);
>>>        void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>>> +     void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t 
>>> + flags);
>>>    };
>>>
>>>    /*
>>> @@ -965,6 +966,7 @@ struct amdgpu_ctx {
>>>        spinlock_t              ring_lock;
>>>        struct fence            **fences;
>>>        struct amdgpu_ctx_ring  rings[AMDGPU_MAX_RINGS];
>>> +     bool preamble_presented;
>>>    };
>>>
>>>    struct amdgpu_ctx_mgr {
>>> @@ -1227,8 +1229,13 @@ struct amdgpu_cs_parser {
>>>
>>>        /* user fence */
>>>        struct amdgpu_bo_list_entry     uf_entry;
>>> +     bool preamble_present; /* True means this command submit 
>>> +involves a preamble IB */
>> We only need this in amdgpu_cs_ib_fill() don't we? See below as well.
>>
>> [ML] seems good advice
>>
>>>    };
>>>
>>> +#define PREAMBLE_IB_PRESENT          (1 << 0) /* bit set means command submit involves a preamble IB */
>>> +#define PREAMBLE_IB_PRESENT_FIRST    (1 << 1) /* bit set means preamble IB is first presented in belonging context */
>> Why does that makes a difference if it is seen for the first time?
>>
>> [ml] if it is presented for the first time for belonging ctx, means even current CS do not involve context switch, we still need keep the actions in preamble IB.
>> Usually if current CS is from the same cntx of previous CS, that means no ctx switch occurs, so we can skip the actions in preamble IB. but above case is the exception.
> Can't userspace just not set the preamble flag for the first submit with a preamble? I think that would result in the same behavior, unless having two non-preamble CE IB's in a single submit is an issue.
>
> - Bas
>
>>> +#define HAVE_CTX_SWITCH              (1 << 2) /* bit set means context switch occured */
>>> +
>>>    struct amdgpu_job {
>>>        struct amd_sched_job    base;
>>>        struct amdgpu_device    *adev;
>>> @@ -1237,6 +1244,7 @@ struct amdgpu_job {
>>>        struct amdgpu_sync      sync;
>>>        struct amdgpu_ib        *ibs;
>>>        struct fence            *fence; /* the hw fence */
>>> +     uint32_t                preamble_status;
>>>        uint32_t                num_ibs;
>>>        void                    *owner;
>>>        uint64_t                fence_ctx; /* the fence_context this job uses */
>>> @@ -2264,6 +2272,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>>>    #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>>>    #define amdgpu_ring_emit_hdp_invalidate(r) (r)->funcs->emit_hdp_invalidate((r))
>>>    #define amdgpu_ring_emit_switch_buffer(r)
>>> (r)->funcs->emit_switch_buffer((r))
>>> +#define amdgpu_ring_emit_cntxcntl(r, d) 
>>> +(r)->funcs->emit_cntxcntl((r), (d))
>>>    #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
>>>    #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))
>>>    #define amdgpu_ring_patch_cond_exec(r,o)
>>> (r)->funcs->patch_cond_exec((r),(o))
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> index 2d4e005..6d8c050 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> @@ -792,6 +792,9 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
>>>                if (r)
>>>                        return r;
>>>
>>> +             if (ib->flags & AMDGPU_IB_FLAG_PREAMBLE)
>>> +                     parser->preamble_present = true;
>>> +
>>>                if (parser->job->ring && parser->job->ring != ring)
>>>                        return -EINVAL;
>>>
>>> @@ -930,6 +933,12 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>>>                return r;
>>>        }
>>>
>>> +     if (p->preamble_present) {
>>> +             job->preamble_status |= PREAMBLE_IB_PRESENT;
>>> +             if (!p->ctx->preamble_presented)
>>> +                     job->preamble_status |= PREAMBLE_IB_PRESENT_FIRST;
>>> +     }
>>> +
>> Better move this to the end of amdgpu_cs_ib_fill() where we allocate the IBs as well.
>> [ML] okay, good change.
>>
>>
>>
>>>        job->owner = p->filp;
>>>        job->fence_ctx = entity->fence_context;
>>>        p->fence = fence_get(&job->base.s_fence->finished);
>>> @@ -940,6 +949,9 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>>>        trace_amdgpu_cs_ioctl(job);
>>>        amd_sched_entity_push_job(&job->base);
>>>
>>> +     if (p->preamble_present)
>>> +             p->ctx->preamble_presented = true;
>>> +
>>>        return 0;
>>>    }
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> index 56c85e6..44db0ab 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> @@ -55,9 +55,10 @@
>>>     * - 3.3.0 - Add VM support for UVD on supported hardware.
>>>     * - 3.4.0 - Add AMDGPU_INFO_NUM_EVICTIONS.
>>>     * - 3.5.0 - Add support for new UVD_NO_OP register.
>>> + * - 3.6.0 - UMD doesn't/shouldn't need to use CONTEXT_CONTROL in 
>>> + IB, KMD should do it
>>>     */
>>>    #define KMS_DRIVER_MAJOR    3
>>> -#define KMS_DRIVER_MINOR     5
>>> +#define KMS_DRIVER_MINOR     6
>>>    #define KMS_DRIVER_PATCHLEVEL       0
>>>
>>>    int amdgpu_vram_limit = 0;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> index 04263f0..b12b5ba 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> @@ -121,10 +121,11 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>    {
>>>        struct amdgpu_device *adev = ring->adev;
>>>        struct amdgpu_ib *ib = &ibs[0];
>>> -     bool skip_preamble, need_ctx_switch;
>>> +     bool need_ctx_switch;
>>>        unsigned patch_offset = ~0;
>>>        struct amdgpu_vm *vm;
>>>        uint64_t fence_ctx;
>>> +     uint32_t status = 0;
>>>
>>>        unsigned i;
>>>        int r = 0;
>>> @@ -174,15 +175,16 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>        /* always set cond_exec_polling to CONTINUE */
>>>        *ring->cond_exe_cpu_addr = 1;
>>>
>>> -     skip_preamble = ring->current_ctx == fence_ctx;
>>>        need_ctx_switch = ring->current_ctx != fence_ctx;
>>> +     if (job && ring->funcs->emit_cntxcntl) {
>>> +             if (need_ctx_switch)
>>> +                     status |= HAVE_CTX_SWITCH;
>>> +             status |= job->preamble_status;
>>> +             amdgpu_ring_emit_cntxcntl(ring, status);
>>> +     }
>>> +
>>>        for (i = 0; i < num_ibs; ++i) {
>>>                ib = &ibs[i];
>>> -
>>> -             /* drop preamble IBs if we don't have a context switch */
>>> -             if ((ib->flags & AMDGPU_IB_FLAG_PREAMBLE) && skip_preamble)
>>> -                     continue;
>>> -
>> Would be nice to keep this functionality for cases where we don't support emit_cntxcntl (e.g. SI?).
>> [ML] SI support CONTEXT_CONTROL as well, and the package structure is exactly the same as CI.
>>
>>>                amdgpu_ring_emit_ib(ring, ib, job ? job->vm_id : 0,
>>>                                    need_ctx_switch);
>>>                need_ctx_switch = false; diff --git 
>>> a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>> index f055d49..0d5addb 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>> @@ -2096,6 +2096,25 @@ static void gfx_v7_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
>>>        amdgpu_ring_write(ring, control);
>>>    }
>>>
>>> +static void gfx_v7_ring_emit_cntxcntl(struct amdgpu_ring *ring, 
>>> +uint32_t flags) {
>>> +     uint32_t dw2 = 0;
>>> +
>>> +     dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
>>> +     if (flags & HAVE_CTX_SWITCH) {
>>> +             /* set load_global_config & load_global_uconfig */
>>> +             dw2 |= 0x8001;
>>> +             /* set load_cs_sh_regs */
>>> +             dw2 |= 0x01000000;
>>> +             /* set load_per_context_state & load_gfx_sh_regs */
>>> +             dw2 |= 0x10002;
>> Better define some constants for those.
>>
>> [ML] I'll leave it to other guys when doing cleanups, a little hurry for other jobs now ...
>>
>> Regards,
>> Christian.
>>
>>> +     }
>>> +
>>> +     amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
>>> +     amdgpu_ring_write(ring, dw2);
>>> +     amdgpu_ring_write(ring, 0);
>>> +}
>>> +
>>>    /**
>>>     * gfx_v7_0_ring_test_ib - basic ring IB test
>>>     *
>>> @@ -4929,6 +4948,7 @@ static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_gfx = {
>>>        .test_ib = gfx_v7_0_ring_test_ib,
>>>        .insert_nop = amdgpu_ring_insert_nop,
>>>        .pad_ib = amdgpu_ring_generic_pad_ib,
>>> +     .emit_cntxcntl = gfx_v7_ring_emit_cntxcntl,
>>>    };
>>>
>>>    static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_compute 
>>> = { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> index 8ba8e42..73f6ffa 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> @@ -6085,6 +6085,35 @@ static void gfx_v8_ring_emit_sb(struct amdgpu_ring *ring)
>>>        amdgpu_ring_write(ring, 0);
>>>    }
>>>
>>> +static void gfx_v8_ring_emit_cntxcntl(struct amdgpu_ring *ring, 
>>> +uint32_t flags) {
>>> +     uint32_t dw2 = 0;
>>> +
>>> +     dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
>>> +     if (flags & HAVE_CTX_SWITCH) {
>>> +             /* set load_global_config & load_global_uconfig */
>>> +             dw2 |= 0x8001;
>>> +             /* set load_cs_sh_regs */
>>> +             dw2 |= 0x01000000;
>>> +             /* set load_per_context_state & load_gfx_sh_regs for GFX */
>>> +             dw2 |= 0x10002;
>>> +
>>> +             /* set load_ce_ram if preamble presented */
>>> +             if (PREAMBLE_IB_PRESENT & flags)
>>> +                     dw2 |= 0x10000000;
>>> +     } else {
>>> +             /* still load_ce_ram if this is the first time preamble presented
>>> +              * although there is no context switch happens.
>>> +              */
>>> +             if (PREAMBLE_IB_PRESENT_FIRST & flags)
>>> +                     dw2 |= 0x10000000;
>>> +     }
>>> +
>>> +     amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
>>> +     amdgpu_ring_write(ring, dw2);
>>> +     amdgpu_ring_write(ring, 0);
>>> +}
>>> +
>>>    static void gfx_v8_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
>>>                                                 enum amdgpu_interrupt_state state)
>>>    {
>>> @@ -6267,6 +6296,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
>>>        .insert_nop = amdgpu_ring_insert_nop,
>>>        .pad_ib = amdgpu_ring_generic_pad_ib,
>>>        .emit_switch_buffer = gfx_v8_ring_emit_sb,
>>> +     .emit_cntxcntl = gfx_v8_ring_emit_cntxcntl,
>>>    };
>>>
>>>    static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute 
>>> = {
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
       [not found]                 ` <MWHPR12MB1182883EE214228FBFCE081C84E20-Gy0DoCVfaSVhjnLHdLm0OQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  2016-09-01 14:10                   ` Christian König
@ 2016-09-01 16:09                   ` Bas Nieuwenhuizen
       [not found]                     ` <CAP+8YyGF3ht8KLgdTDN6K_r+YJD7ZLDscWXrvvwRdkB=HVOcmg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  1 sibling, 1 reply; 20+ messages in thread
From: Bas Nieuwenhuizen @ 2016-09-01 16:09 UTC (permalink / raw)
  To: Liu, Monk; +Cc: Christian König, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On Thu, Sep 1, 2016 at 12:55 PM, Liu, Monk <Monk.Liu@amd.com> wrote:
>> Why does that makes a difference if it is seen for the first time?
>>
>> [ml] if it is presented for the first time for belonging ctx, means even current CS do not involve context switch, we still need keep the actions in preamble IB.
>> Usually if current CS is from the same cntx of previous CS, that means no ctx switch occurs, so we can skip the actions in preamble IB. but above case is the exception.
>
> Can't userspace just not set the preamble flag for the first submit with a preamble? I think that would result in the same behavior, unless having two non-preamble CE IB's in a single submit is an issue.
>
> - Bas
>
>
> [ML] I'm confused, what's your point?

So if I understand correctly, the new behavior is that the first
submit containing a preamble always executes the loads in the preamble
even if there is no context switch. The old behavior is that in that
situation the preamble would be skipped. Why do
we want the new behavior? If the application wants the loads to
execute even without a context switch, it should not mark the IB as a
preamble with AMDGPU_IB_FLAG_PREAMBLE.

On inspecting the patch more closely I think there are more issues
with this patch.

If there is no IB with AMDGPU_IB_FLAG_PREAMBLE, then the CE_LOAD bit
always gets disabled. Furthermore if there is a CE_LOAD bit, and no
context switch the CE_LOAD bit also gets disabled for IB's without
AMDGPU_IB_FLAG_PREAMBLE.

I think this is a bad move, as there are some uses for loading CE RAM
that are not dependent on context switches, such as preloading things
into L2 cache, or switching shader uniforms in CE RAM when a different
shader gets bound. Therefore I think that the CE_LOAD bit should
always be enabled for IB's without AMDGPU_IB_FLAG_PREAMBLE.

Furthermore, with this patch the preamble IB's always get executed and
loads disabled with CONTEXT_CONTROL. As e.g. mesa uses its own
CONTEXT_CONTROL (and we can't change that for old versions of mesa)
this would override the kernel CONTEXT_CONTROL and always execute the
loads.

I also miss the CE_LOAD bit in the CONTEXT_CONTROL for gfx7. Does it
not need it?

I would prefer keeping the old system for preamble IB's and just
adding a generic CONTEXT_CONTROL that always enables the CE loads. I
don't have an opinion on the non-CE loads though, as I've never found a
reason to use them.

- Bas

> With this patch, preamble_flag is not needed at all.
> Without this patch,  many original assumption and logic is not correct.
> Besides, CONTEXT_CONTROL not only deals CE but also deal DE.


>
> BR Monk
>
>
> -----Original Message-----
> From: Bas Nieuwenhuizen [mailto:bas@basnieuwenhuizen.nl]
> Sent: Thursday, September 01, 2016 4:19 PM
> To: Liu, Monk <Monk.Liu@amd.com>
> Cc: Christian König <deathsimple@vodafone.de>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
>
> On Thu, Sep 1, 2016 at 9:37 AM, Liu, Monk <Monk.Liu@amd.com> wrote:
>>
>>
>> -----Original Message-----
>> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
>> Of Christian K?nig
>> Sent: Wednesday, August 31, 2016 7:53 PM
>> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
>>
>> Looks good to me in general, a few nit picks and sugegstions below.
>>
>> Am 31.08.2016 um 05:49 schrieb Monk Liu:
>>> v1:
>>> for gfx8, use CONTEXT_CONTROL package to dynamically skip preamble
>>> CEIB and other load_xxx command in sequence.
>>>
>>> v2:
>>> support GFX7 as well, and bump up version.
>>> remove cntxcntl in compute ring funcs because CPC doesn't support
>>> this packet.
>>>
>>> v3: fix reduntant judgement in cntxcntl.
>>>
>>> Change-Id: I4b87ca84ea8c11ba4f7fb4c0e8a5be537ccde851
>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>>>
>>> Change-Id: I5d24c1bb5c14190ce4adeb6a331ee3d92b3d5c83
>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>>
>> Only one signed of by line is enough and remove the change-ids.
>>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  9 +++++++++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 12 ++++++++++++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  3 ++-
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  | 16 +++++++++-------
>>>   drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c   | 20 ++++++++++++++++++++
>>>   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c   | 30 ++++++++++++++++++++++++++++++
>>>   6 files changed, 82 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index 1254410..0de5f08 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -321,6 +321,7 @@ struct amdgpu_ring_funcs {
>>>       void (*begin_use)(struct amdgpu_ring *ring);
>>>       void (*end_use)(struct amdgpu_ring *ring);
>>>       void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>>> +     void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t
>>> + flags);
>>>   };
>>>
>>>   /*
>>> @@ -965,6 +966,7 @@ struct amdgpu_ctx {
>>>       spinlock_t              ring_lock;
>>>       struct fence            **fences;
>>>       struct amdgpu_ctx_ring  rings[AMDGPU_MAX_RINGS];
>>> +     bool preamble_presented;
>>>   };
>>>
>>>   struct amdgpu_ctx_mgr {
>>> @@ -1227,8 +1229,13 @@ struct amdgpu_cs_parser {
>>>
>>>       /* user fence */
>>>       struct amdgpu_bo_list_entry     uf_entry;
>>> +     bool preamble_present; /* True means this command submit
>>> +involves a preamble IB */
>>
>> We only need this in amdgpu_cs_ib_fill() don't we? See below as well.
>>
>> [ML] seems good advice
>>
>>>   };
>>>
>>> +#define PREAMBLE_IB_PRESENT          (1 << 0) /* bit set means command submit involves a preamble IB */
>>> +#define PREAMBLE_IB_PRESENT_FIRST    (1 << 1) /* bit set means preamble IB is first presented in belonging context */
>>
>> Why does that makes a difference if it is seen for the first time?
>>
>> [ml] if it is presented for the first time for belonging ctx, means even current CS do not involve context switch, we still need keep the actions in preamble IB.
>> Usually if current CS is from the same cntx of previous CS, that means no ctx switch occurs, so we can skip the actions in preamble IB. but above case is the exception.
>
> Can't userspace just not set the preamble flag for the first submit with a preamble? I think that would result in the same behavior, unless having two non-preamble CE IB's in a single submit is an issue.
>
> - Bas
>
>>
>>> +#define HAVE_CTX_SWITCH              (1 << 2) /* bit set means context switch occured */
>>> +
>>>   struct amdgpu_job {
>>>       struct amd_sched_job    base;
>>>       struct amdgpu_device    *adev;
>>> @@ -1237,6 +1244,7 @@ struct amdgpu_job {
>>>       struct amdgpu_sync      sync;
>>>       struct amdgpu_ib        *ibs;
>>>       struct fence            *fence; /* the hw fence */
>>> +     uint32_t                preamble_status;
>>>       uint32_t                num_ibs;
>>>       void                    *owner;
>>>       uint64_t                fence_ctx; /* the fence_context this job uses */
>>> @@ -2264,6 +2272,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>>>   #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>>>   #define amdgpu_ring_emit_hdp_invalidate(r) (r)->funcs->emit_hdp_invalidate((r))
>>>   #define amdgpu_ring_emit_switch_buffer(r)
>>> (r)->funcs->emit_switch_buffer((r))
>>> +#define amdgpu_ring_emit_cntxcntl(r, d)
>>> +(r)->funcs->emit_cntxcntl((r), (d))
>>>   #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
>>>   #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))
>>>   #define amdgpu_ring_patch_cond_exec(r,o)
>>> (r)->funcs->patch_cond_exec((r),(o))
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> index 2d4e005..6d8c050 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> @@ -792,6 +792,9 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
>>>               if (r)
>>>                       return r;
>>>
>>> +             if (ib->flags & AMDGPU_IB_FLAG_PREAMBLE)
>>> +                     parser->preamble_present = true;
>>> +
>>>               if (parser->job->ring && parser->job->ring != ring)
>>>                       return -EINVAL;
>>>
>>> @@ -930,6 +933,12 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>>>               return r;
>>>       }
>>>
>>> +     if (p->preamble_present) {
>>> +             job->preamble_status |= PREAMBLE_IB_PRESENT;
>>> +             if (!p->ctx->preamble_presented)
>>> +                     job->preamble_status |= PREAMBLE_IB_PRESENT_FIRST;
>>> +     }
>>> +
>>
>> Better move this to the end of amdgpu_cs_ib_fill() where we allocate the IBs as well.
>> [ML] okay, good change.
>>
>>
>>
>>>       job->owner = p->filp;
>>>       job->fence_ctx = entity->fence_context;
>>>       p->fence = fence_get(&job->base.s_fence->finished);
>>> @@ -940,6 +949,9 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>>>       trace_amdgpu_cs_ioctl(job);
>>>       amd_sched_entity_push_job(&job->base);
>>>
>>> +     if (p->preamble_present)
>>> +             p->ctx->preamble_presented = true;
>>> +
>>>       return 0;
>>>   }
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> index 56c85e6..44db0ab 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> @@ -55,9 +55,10 @@
>>>    * - 3.3.0 - Add VM support for UVD on supported hardware.
>>>    * - 3.4.0 - Add AMDGPU_INFO_NUM_EVICTIONS.
>>>    * - 3.5.0 - Add support for new UVD_NO_OP register.
>>> + * - 3.6.0 - UMD doesn't/shouldn't need to use CONTEXT_CONTROL in
>>> + IB, KMD should do it
>>>    */
>>>   #define KMS_DRIVER_MAJOR    3
>>> -#define KMS_DRIVER_MINOR     5
>>> +#define KMS_DRIVER_MINOR     6
>>>   #define KMS_DRIVER_PATCHLEVEL       0
>>>
>>>   int amdgpu_vram_limit = 0;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> index 04263f0..b12b5ba 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> @@ -121,10 +121,11 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>   {
>>>       struct amdgpu_device *adev = ring->adev;
>>>       struct amdgpu_ib *ib = &ibs[0];
>>> -     bool skip_preamble, need_ctx_switch;
>>> +     bool need_ctx_switch;
>>>       unsigned patch_offset = ~0;
>>>       struct amdgpu_vm *vm;
>>>       uint64_t fence_ctx;
>>> +     uint32_t status = 0;
>>>
>>>       unsigned i;
>>>       int r = 0;
>>> @@ -174,15 +175,16 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>       /* always set cond_exec_polling to CONTINUE */
>>>       *ring->cond_exe_cpu_addr = 1;
>>>
>>> -     skip_preamble = ring->current_ctx == fence_ctx;
>>>       need_ctx_switch = ring->current_ctx != fence_ctx;
>>> +     if (job && ring->funcs->emit_cntxcntl) {
>>> +             if (need_ctx_switch)
>>> +                     status |= HAVE_CTX_SWITCH;
>>> +             status |= job->preamble_status;
>>> +             amdgpu_ring_emit_cntxcntl(ring, status);
>>> +     }
>>> +
>>>       for (i = 0; i < num_ibs; ++i) {
>>>               ib = &ibs[i];
>>> -
>>> -             /* drop preamble IBs if we don't have a context switch */
>>> -             if ((ib->flags & AMDGPU_IB_FLAG_PREAMBLE) && skip_preamble)
>>> -                     continue;
>>> -
>>
>> Would be nice to keep this functionality for cases where we don't support emit_cntxcntl (e.g. SI?).
>> [ML] SI support CONTEXT_CONTROL as well, and the package structure is exactly the same as CI.
>>
>>>               amdgpu_ring_emit_ib(ring, ib, job ? job->vm_id : 0,
>>>                                   need_ctx_switch);
>>>               need_ctx_switch = false; diff --git
>>> a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>> index f055d49..0d5addb 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>> @@ -2096,6 +2096,25 @@ static void gfx_v7_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
>>>       amdgpu_ring_write(ring, control);
>>>   }
>>>
>>> +static void gfx_v7_ring_emit_cntxcntl(struct amdgpu_ring *ring,
>>> +uint32_t flags) {
>>> +     uint32_t dw2 = 0;
>>> +
>>> +     dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
>>> +     if (flags & HAVE_CTX_SWITCH) {
>>> +             /* set load_global_config & load_global_uconfig */
>>> +             dw2 |= 0x8001;
>>> +             /* set load_cs_sh_regs */
>>> +             dw2 |= 0x01000000;
>>> +             /* set load_per_context_state & load_gfx_sh_regs */
>>> +             dw2 |= 0x10002;
>>
>> Better define some constants for those.
>>
>> [ML] I'll leave it to other guys when doing cleanups, a little hurry for other jobs now ...
>>
>> Regards,
>> Christian.
>>
>>> +     }
>>> +
>>> +     amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
>>> +     amdgpu_ring_write(ring, dw2);
>>> +     amdgpu_ring_write(ring, 0);
>>> +}
>>> +
>>>   /**
>>>    * gfx_v7_0_ring_test_ib - basic ring IB test
>>>    *
>>> @@ -4929,6 +4948,7 @@ static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_gfx = {
>>>       .test_ib = gfx_v7_0_ring_test_ib,
>>>       .insert_nop = amdgpu_ring_insert_nop,
>>>       .pad_ib = amdgpu_ring_generic_pad_ib,
>>> +     .emit_cntxcntl = gfx_v7_ring_emit_cntxcntl,
>>>   };
>>>
>>>   static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_compute =
>>> { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> index 8ba8e42..73f6ffa 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> @@ -6085,6 +6085,35 @@ static void gfx_v8_ring_emit_sb(struct amdgpu_ring *ring)
>>>       amdgpu_ring_write(ring, 0);
>>>   }
>>>
>>> +static void gfx_v8_ring_emit_cntxcntl(struct amdgpu_ring *ring,
>>> +uint32_t flags) {
>>> +     uint32_t dw2 = 0;
>>> +
>>> +     dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
>>> +     if (flags & HAVE_CTX_SWITCH) {
>>> +             /* set load_global_config & load_global_uconfig */
>>> +             dw2 |= 0x8001;
>>> +             /* set load_cs_sh_regs */
>>> +             dw2 |= 0x01000000;
>>> +             /* set load_per_context_state & load_gfx_sh_regs for GFX */
>>> +             dw2 |= 0x10002;
>>> +
>>> +             /* set load_ce_ram if preamble presented */
>>> +             if (PREAMBLE_IB_PRESENT & flags)
>>> +                     dw2 |= 0x10000000;
>>> +     } else {
>>> +             /* still load_ce_ram if this is the first time preamble presented
>>> +              * although there is no context switch happens.
>>> +              */
>>> +             if (PREAMBLE_IB_PRESENT_FIRST & flags)
>>> +                     dw2 |= 0x10000000;
>>> +     }
>>> +
>>> +     amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
>>> +     amdgpu_ring_write(ring, dw2);
>>> +     amdgpu_ring_write(ring, 0);
>>> +}
>>> +
>>>   static void gfx_v8_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
>>>                                                enum amdgpu_interrupt_state state)
>>>   {
>>> @@ -6267,6 +6296,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
>>>       .insert_nop = amdgpu_ring_insert_nop,
>>>       .pad_ib = amdgpu_ring_generic_pad_ib,
>>>       .emit_switch_buffer = gfx_v8_ring_emit_sb,
>>> +     .emit_cntxcntl = gfx_v8_ring_emit_cntxcntl,
>>>   };
>>>
>>>   static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute =
>>> {
>>
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
       [not found]                     ` <CAP+8YyGF3ht8KLgdTDN6K_r+YJD7ZLDscWXrvvwRdkB=HVOcmg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2016-09-01 17:11                       ` Christian König
       [not found]                         ` <e1a095ea-8717-12e5-c779-fac81525ddcc-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
  2016-09-05  4:09                       ` Liu, Monk
  1 sibling, 1 reply; 20+ messages in thread
From: Christian König @ 2016-09-01 17:11 UTC (permalink / raw)
  To: Bas Nieuwenhuizen, Liu, Monk; +Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Thanks for jumping in and explaining what mesa needs on the user space side.

I already had the feeling that this is a bit swampy, but was about to 
let it pass because of lack of time to question it further. But with 
your notes that proposed approach is clearly a NAK.

> I would prefer keeping the old system for preamble IB's and just
> adding a generic CONTEXT_CONTROL that always enables the CE loads. I
> don't have an opinion the non-CE loads though, as I've never found a
> reason to use them.

The crux is that with virtualization the kernel driver doesn't necessarily
know when there is a context switch.

So the basic idea is that the kernel driver uses the CONTEXT_CONTROL 
packet to let the IBs know if they should execute the preamble or not in 
hardware.

This way the hardware can still override the kernel drivers decision 
about a context switch and execute the preamble loads anyway.

And if you now think that this isn't really correct virtualization where
every VM is independent and doesn't know anything about the existence of 
other VMs then I unfortunately have to agree.

Regards,
Christian.

Am 01.09.2016 um 18:09 schrieb Bas Nieuwenhuizen:
> On Thu, Sep 1, 2016 at 12:55 PM, Liu, Monk <Monk.Liu@amd.com> wrote:
>>> Why does that makes a difference if it is seen for the first time?
>>>
>>> [ml] if it is presented for the first time for belonging ctx, means even current CS do not involve context switch, we still need keep the actions in preamble IB.
>>> Usually if current CS is from the same cntx of previous CS, that means no ctx switch occurs, so we can skip the actions in preamble IB. but above case is the exception.
>> Can't userspace just not set the preamble flag for the first submit with a preamble? I think that would result in the same behavior, unless having two non-preamble CE IB's in a single submit is an issue.
>>
>> - Bas
>>
>>
>> [ML] I'm confused, what's your point?
> So if I understand correctly, the new behavior is that the first
> submit containing a preamble always executes the loads in the preamble
> even if there is no context switch. The old behavior is that in that
> situation the preamble would be skipped in the new situation. Why do
> we want the new behavior? If the application wants the loads to
> execute even without a context switch, it should not mark the IB as a
> preamble with AMDGPU_IB_FLAG_PREAMBLE.
>
> On inspecting the patch more closely I think there are more issues
> with this patch.
>
> If there is no IB with AMDGPU_IB_FLAG_PREAMBLE, then the CE_LOAD bit
> always gets disabled. Furthermore if there is a CE_LOAD bit, and no
> context switch the CE_LOAD bit also gets disabled for IB's without
> AMDGPU_IB_FLAG_PREAMBLE.
>
> I think this is a bad move, as there are some uses for loading CE RAM
> that are not dependent on context switches, such as preloading things
> into L2 cache, or switching shader uniforms in CE RAM when a different
> shader gets bound. Therefore I think that the CE_LOAD bit should
> always be enabled for IB's without AMDGPU_IB_FLAG_PREAMBLE.
>
> Furthermore, with this patch the preamble IB's always get executed and
> loads disabled with CONTEXT_CONTROL. As e.g. mesa uses its own
> CONTEXT_CONTROL (and we can't change that for old versions of mesa)
> this would override the kernel CONTEXT_CONTROL and always execute the
> loads.
>
> I also miss the CE_LOAD bit in the CONTEXT_CONTROL for gfx7. Does it
> not need it?
>
> I would prefer keeping the old system for preamble IB's and just
> adding a generic CONTEXT_CONTROL that always enables the CE loads. I
> don't have an opinion the non-CE loads though, as I've never found a
> reason to use them.
>
> - Bas
>
>> With this patch, preamble_flag is not needed at all.
>> Without this patch,  many original assumption and logic is not correct.
>> Besides, CONTEXT_CONTROL not only deals CE but also deal DE.
>
>> BR Monk
>>
>>
>> -----Original Message-----
>> From: Bas Nieuwenhuizen [mailto:bas@basnieuwenhuizen.nl]
>> Sent: Thursday, September 01, 2016 4:19 PM
>> To: Liu, Monk <Monk.Liu@amd.com>
>> Cc: Christian König <deathsimple@vodafone.de>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
>>
>> On Thu, Sep 1, 2016 at 9:37 AM, Liu, Monk <Monk.Liu@amd.com> wrote:
>>>
>>> -----Original Message-----
>>> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
>>> Of Christian K?nig
>>> Sent: Wednesday, August 31, 2016 7:53 PM
>>> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
>>>
>>> Looks good to me in general, a few nit picks and sugegstions below.
>>>
>>> Am 31.08.2016 um 05:49 schrieb Monk Liu:
>>>> v1:
>>>> for gfx8, use CONTEXT_CONTROL package to dynamically skip preamble
>>>> CEIB and other load_xxx command in sequence.
>>>>
>>>> v2:
>>>> support GFX7 as well, and bump up version.
>>>> remove cntxcntl in compute ring funcs because CPC doesn't support
>>>> this packet.
>>>>
>>>> v3: fix reduntant judgement in cntxcntl.
>>>>
>>>> Change-Id: I4b87ca84ea8c11ba4f7fb4c0e8a5be537ccde851
>>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>>>>
>>>> Change-Id: I5d24c1bb5c14190ce4adeb6a331ee3d92b3d5c83
>>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>>> Only one signed of by line is enough and remove the change-ids.
>>>
>>>> ---
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  9 +++++++++
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 12 ++++++++++++
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  3 ++-
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  | 16 +++++++++-------
>>>>    drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c   | 20 ++++++++++++++++++++
>>>>    drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c   | 30 ++++++++++++++++++++++++++++++
>>>>    6 files changed, 82 insertions(+), 8 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> index 1254410..0de5f08 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> @@ -321,6 +321,7 @@ struct amdgpu_ring_funcs {
>>>>        void (*begin_use)(struct amdgpu_ring *ring);
>>>>        void (*end_use)(struct amdgpu_ring *ring);
>>>>        void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>>>> +     void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t
>>>> + flags);
>>>>    };
>>>>
>>>>    /*
>>>> @@ -965,6 +966,7 @@ struct amdgpu_ctx {
>>>>        spinlock_t              ring_lock;
>>>>        struct fence            **fences;
>>>>        struct amdgpu_ctx_ring  rings[AMDGPU_MAX_RINGS];
>>>> +     bool preamble_presented;
>>>>    };
>>>>
>>>>    struct amdgpu_ctx_mgr {
>>>> @@ -1227,8 +1229,13 @@ struct amdgpu_cs_parser {
>>>>
>>>>        /* user fence */
>>>>        struct amdgpu_bo_list_entry     uf_entry;
>>>> +     bool preamble_present; /* True means this command submit
>>>> +involves a preamble IB */
>>> We only need this in amdgpu_cs_ib_fill() don't we? See below as well.
>>>
>>> [ML] seems good advice
>>>
>>>>    };
>>>>
>>>> +#define PREAMBLE_IB_PRESENT          (1 << 0) /* bit set means command submit involves a preamble IB */
>>>> +#define PREAMBLE_IB_PRESENT_FIRST    (1 << 1) /* bit set means preamble IB is first presented in belonging context */
>>> Why does that makes a difference if it is seen for the first time?
>>>
>>> [ml] if it is presented for the first time for belonging ctx, means even current CS do not involve context switch, we still need keep the actions in preamble IB.
>>> Usually if current CS is from the same cntx of previous CS, that means no ctx switch occurs, so we can skip the actions in preamble IB. but above case is the exception.
>> Can't userspace just not set the preamble flag for the first submit with a preamble? I think that would result in the same behavior, unless having two non-preamble CE IB's in a single submit is an issue.
>>
>> - Bas
>>
>>>> +#define HAVE_CTX_SWITCH              (1 << 2) /* bit set means context switch occured */
>>>> +
>>>>    struct amdgpu_job {
>>>>        struct amd_sched_job    base;
>>>>        struct amdgpu_device    *adev;
>>>> @@ -1237,6 +1244,7 @@ struct amdgpu_job {
>>>>        struct amdgpu_sync      sync;
>>>>        struct amdgpu_ib        *ibs;
>>>>        struct fence            *fence; /* the hw fence */
>>>> +     uint32_t                preamble_status;
>>>>        uint32_t                num_ibs;
>>>>        void                    *owner;
>>>>        uint64_t                fence_ctx; /* the fence_context this job uses */
>>>> @@ -2264,6 +2272,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>>>>    #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>>>>    #define amdgpu_ring_emit_hdp_invalidate(r) (r)->funcs->emit_hdp_invalidate((r))
>>>>    #define amdgpu_ring_emit_switch_buffer(r)
>>>> (r)->funcs->emit_switch_buffer((r))
>>>> +#define amdgpu_ring_emit_cntxcntl(r, d)
>>>> +(r)->funcs->emit_cntxcntl((r), (d))
>>>>    #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
>>>>    #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))
>>>>    #define amdgpu_ring_patch_cond_exec(r,o)
>>>> (r)->funcs->patch_cond_exec((r),(o))
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>> index 2d4e005..6d8c050 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>> @@ -792,6 +792,9 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
>>>>                if (r)
>>>>                        return r;
>>>>
>>>> +             if (ib->flags & AMDGPU_IB_FLAG_PREAMBLE)
>>>> +                     parser->preamble_present = true;
>>>> +
>>>>                if (parser->job->ring && parser->job->ring != ring)
>>>>                        return -EINVAL;
>>>>
>>>> @@ -930,6 +933,12 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>>>>                return r;
>>>>        }
>>>>
>>>> +     if (p->preamble_present) {
>>>> +             job->preamble_status |= PREAMBLE_IB_PRESENT;
>>>> +             if (!p->ctx->preamble_presented)
>>>> +                     job->preamble_status |= PREAMBLE_IB_PRESENT_FIRST;
>>>> +     }
>>>> +
>>> Better move this to the end of amdgpu_cs_ib_fill() where we allocate the IBs as well.
>>> [ML] okay, good change.
>>>
>>>
>>>
>>>>        job->owner = p->filp;
>>>>        job->fence_ctx = entity->fence_context;
>>>>        p->fence = fence_get(&job->base.s_fence->finished);
>>>> @@ -940,6 +949,9 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>>>>        trace_amdgpu_cs_ioctl(job);
>>>>        amd_sched_entity_push_job(&job->base);
>>>>
>>>> +     if (p->preamble_present)
>>>> +             p->ctx->preamble_presented = true;
>>>> +
>>>>        return 0;
>>>>    }
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> index 56c85e6..44db0ab 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> @@ -55,9 +55,10 @@
>>>>     * - 3.3.0 - Add VM support for UVD on supported hardware.
>>>>     * - 3.4.0 - Add AMDGPU_INFO_NUM_EVICTIONS.
>>>>     * - 3.5.0 - Add support for new UVD_NO_OP register.
>>>> + * - 3.6.0 - UMD doesn't/shouldn't need to use CONTEXT_CONTROL in
>>>> + IB, KMD should do it
>>>>     */
>>>>    #define KMS_DRIVER_MAJOR    3
>>>> -#define KMS_DRIVER_MINOR     5
>>>> +#define KMS_DRIVER_MINOR     6
>>>>    #define KMS_DRIVER_PATCHLEVEL       0
>>>>
>>>>    int amdgpu_vram_limit = 0;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>> index 04263f0..b12b5ba 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>> @@ -121,10 +121,11 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>>    {
>>>>        struct amdgpu_device *adev = ring->adev;
>>>>        struct amdgpu_ib *ib = &ibs[0];
>>>> -     bool skip_preamble, need_ctx_switch;
>>>> +     bool need_ctx_switch;
>>>>        unsigned patch_offset = ~0;
>>>>        struct amdgpu_vm *vm;
>>>>        uint64_t fence_ctx;
>>>> +     uint32_t status = 0;
>>>>
>>>>        unsigned i;
>>>>        int r = 0;
>>>> @@ -174,15 +175,16 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>>        /* always set cond_exec_polling to CONTINUE */
>>>>        *ring->cond_exe_cpu_addr = 1;
>>>>
>>>> -     skip_preamble = ring->current_ctx == fence_ctx;
>>>>        need_ctx_switch = ring->current_ctx != fence_ctx;
>>>> +     if (job && ring->funcs->emit_cntxcntl) {
>>>> +             if (need_ctx_switch)
>>>> +                     status |= HAVE_CTX_SWITCH;
>>>> +             status |= job->preamble_status;
>>>> +             amdgpu_ring_emit_cntxcntl(ring, status);
>>>> +     }
>>>> +
>>>>        for (i = 0; i < num_ibs; ++i) {
>>>>                ib = &ibs[i];
>>>> -
>>>> -             /* drop preamble IBs if we don't have a context switch */
>>>> -             if ((ib->flags & AMDGPU_IB_FLAG_PREAMBLE) && skip_preamble)
>>>> -                     continue;
>>>> -
>>> Would be nice to keep this functionality for cases where we don't support emit_cntxcntl (e.g. SI?).
>>> [ML] SI support CONTEXT_CONTROL as well, and the package structure is exactly the same as CI.
>>>
>>>>                amdgpu_ring_emit_ib(ring, ib, job ? job->vm_id : 0,
>>>>                                    need_ctx_switch);
>>>>                need_ctx_switch = false; diff --git
>>>> a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>>> index f055d49..0d5addb 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>>> @@ -2096,6 +2096,25 @@ static void gfx_v7_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
>>>>        amdgpu_ring_write(ring, control);
>>>>    }
>>>>
>>>> +static void gfx_v7_ring_emit_cntxcntl(struct amdgpu_ring *ring,
>>>> +uint32_t flags) {
>>>> +     uint32_t dw2 = 0;
>>>> +
>>>> +     dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
>>>> +     if (flags & HAVE_CTX_SWITCH) {
>>>> +             /* set load_global_config & load_global_uconfig */
>>>> +             dw2 |= 0x8001;
>>>> +             /* set load_cs_sh_regs */
>>>> +             dw2 |= 0x01000000;
>>>> +             /* set load_per_context_state & load_gfx_sh_regs */
>>>> +             dw2 |= 0x10002;
>>> Better define some constants for those.
>>>
>>> [ML] I'll leave it to other guys when doing cleanups, a little hurry for other jobs now ...
>>>
>>> Regards,
>>> Christian.
>>>
>>>> +     }
>>>> +
>>>> +     amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
>>>> +     amdgpu_ring_write(ring, dw2);
>>>> +     amdgpu_ring_write(ring, 0);
>>>> +}
>>>> +
>>>>    /**
>>>>     * gfx_v7_0_ring_test_ib - basic ring IB test
>>>>     *
>>>> @@ -4929,6 +4948,7 @@ static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_gfx = {
>>>>        .test_ib = gfx_v7_0_ring_test_ib,
>>>>        .insert_nop = amdgpu_ring_insert_nop,
>>>>        .pad_ib = amdgpu_ring_generic_pad_ib,
>>>> +     .emit_cntxcntl = gfx_v7_ring_emit_cntxcntl,
>>>>    };
>>>>
>>>>    static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_compute =
>>>> { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>> index 8ba8e42..73f6ffa 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>> @@ -6085,6 +6085,35 @@ static void gfx_v8_ring_emit_sb(struct amdgpu_ring *ring)
>>>>        amdgpu_ring_write(ring, 0);
>>>>    }
>>>>
>>>> +static void gfx_v8_ring_emit_cntxcntl(struct amdgpu_ring *ring,
>>>> +uint32_t flags) {
>>>> +     uint32_t dw2 = 0;
>>>> +
>>>> +     dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
>>>> +     if (flags & HAVE_CTX_SWITCH) {
>>>> +             /* set load_global_config & load_global_uconfig */
>>>> +             dw2 |= 0x8001;
>>>> +             /* set load_cs_sh_regs */
>>>> +             dw2 |= 0x01000000;
>>>> +             /* set load_per_context_state & load_gfx_sh_regs for GFX */
>>>> +             dw2 |= 0x10002;
>>>> +
>>>> +             /* set load_ce_ram if preamble presented */
>>>> +             if (PREAMBLE_IB_PRESENT & flags)
>>>> +                     dw2 |= 0x10000000;
>>>> +     } else {
>>>> +             /* still load_ce_ram if this is the first time preamble presented
>>>> +              * although there is no context switch happens.
>>>> +              */
>>>> +             if (PREAMBLE_IB_PRESENT_FIRST & flags)
>>>> +                     dw2 |= 0x10000000;
>>>> +     }
>>>> +
>>>> +     amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
>>>> +     amdgpu_ring_write(ring, dw2);
>>>> +     amdgpu_ring_write(ring, 0);
>>>> +}
>>>> +
>>>>    static void gfx_v8_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
>>>>                                                 enum amdgpu_interrupt_state state)
>>>>    {
>>>> @@ -6267,6 +6296,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
>>>>        .insert_nop = amdgpu_ring_insert_nop,
>>>>        .pad_ib = amdgpu_ring_generic_pad_ib,
>>>>        .emit_switch_buffer = gfx_v8_ring_emit_sb,
>>>> +     .emit_cntxcntl = gfx_v8_ring_emit_cntxcntl,
>>>>    };
>>>>
>>>>    static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute =
>>>> {
>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
       [not found]                         ` <e1a095ea-8717-12e5-c779-fac81525ddcc-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
@ 2016-09-01 18:20                           ` Bas Nieuwenhuizen
  0 siblings, 0 replies; 20+ messages in thread
From: Bas Nieuwenhuizen @ 2016-09-01 18:20 UTC (permalink / raw)
  To: Christian König; +Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Liu, Monk

gh

On Thu, Sep 1, 2016 at 7:11 PM, Christian König <deathsimple@vodafone.de> wrote:
> Thanks for jumping in and explaining what mesa needs on the user space side.
>
> I already had the feeling that this is a bit swampy, but was about to let it
> pass because of lack of time to question it further. But with your notes
> that proposed approach is clearly a NAK.
>
>> I would prefer keeping the old system for preamble IB's and just
>> adding a generic CONTEXT_CONTROL that always enables the CE loads. I
>> don't have an opinion the non-CE loads though, as I've never found a
>> reason to use them.
>
>
> The crux is that with virtualization the kernel driver doesn't necessary
> knows when there is a context switch.

I'm not sure I completely understand the virtualization issues, but if
you have a clear usecase for using the CONTEXT_CONTROL this way then I
think mesa can cope with it.

With that in mind I want to clarify my arguments a bit:

With reference to the effect of this patch on IB's without
AMDGPU_IB_FLAG_PREAMBLE, this is not used by mesa at the moment, those
were potential uses, not sure if that was clear enough in my original
mail. We can always add that behavior back in the CONTEXT_CONTROL
scheme when it actually gets used. I would appreciate it if the
interface can be reworked though as it is really not intuitive that a
flag on a single IB also affects the others in the submit.

With reference to always executing the preamble if the IB contains a
CONTEXT_CONTROL, I think the impact of always executing is fairly low.
IIRC disabling the preamble in mesa cost <0.5% (probably even
significantly lower although not sure), so if you really want to use
the CONTEXT_CONTROL, I think the impact on old mesa is reasonable, and
new mesa can be fixed.

- Bas

>
> So the basic idea is that the kernel driver uses the CONTEXT_CONTROL packet
> to let the IBs know if they should execute the preamble or not in hardware.
>
> This way the hardware can still override the kernel drivers decision about a
> context switch and execute the preamble loads anyway.
>
> And if you now think what this isn't really correct virtualization where
> every VM is independent and doesn't know anything about the existence of
> other VMs then I unfortunately have to agree.
>
> Regards,
> Christian.
>
>
> Am 01.09.2016 um 18:09 schrieb Bas Nieuwenhuizen:
>>
>> On Thu, Sep 1, 2016 at 12:55 PM, Liu, Monk <Monk.Liu@amd.com> wrote:
>>>>
>>>> Why does that makes a difference if it is seen for the first time?
>>>>
>>>> [ml] if it is presented for the first time for belonging ctx, means even
>>>> current CS do not involve context switch, we still need keep the actions in
>>>> preamble IB.
>>>> Usually if current CS is from the same cntx of previous CS, that means
>>>> no ctx switch occurs, so we can skip the actions in preamble IB. but above
>>>> case is the exception.
>>>
>>> Can't userspace just not set the preamble flag for the first submit with
>>> a preamble? I think that would result in the same behavior, unless having
>>> two non-preamble CE IB's in a single submit is an issue.
>>>
>>> - Bas
>>>
>>>
>>> [ML] I'm confused, what's your point?
>>
>> So if I understand correctly, the new behavior is that the first
>> submit containing a preamble always executes the loads in the preamble
>> even if there is no context switch. The old behavior is that in that
>> situation the preamble would be skipped in the new situation. Why do
>> we want the new behavior? If the application wants the loads to
>> execute even without a context switch, it should not mark the IB as a
>> preamble with AMDGPU_IB_FLAG_PREAMBLE.
>>
>> On inspecting the patch more closely I think there are more issues
>> with this patch.
>>
>> If there is no IB with AMDGPU_IB_FLAG_PREAMBLE, then the CE_LOAD bit
>> always gets disabled. Furthermore if there is a CE_LOAD bit, and no
>> context switch the CE_LOAD bit also gets disabled for IB's without
>> AMDGPU_IB_FLAG_PREAMBLE.
>>
>> I think this is a bad move, as there are some uses for loading CE RAM
>> that are not dependent on context switches, such as preloading things
>> into L2 cache, or switching shader uniforms in CE RAM when a different
>> shader gets bound. Therefore I think that the CE_LOAD bit should
>> always be enabled for IB's without AMDGPU_IB_FLAG_PREAMBLE.
>>
>> Furthermore, with this patch the preamble IB's always get executed and
>> loads disabled with CONTEXT_CONTROL. As e.g. mesa uses its own
>> CONTEXT_CONTROL (and we can't change that for old versions of mesa)
>> this would override the kernel CONTEXT_CONTROL and always execute the
>> loads.
>>
>> I also miss the CE_LOAD bit in the CONTEXT_CONTROL for gfx7. Does it
>> not need it?
>>
>> I would prefer keeping the old system for preamble IB's and just
>> adding a generic CONTEXT_CONTROL that always enables the CE loads. I
>> don't have an opinion the non-CE loads though, as I've never found a
>> reason to use them.
>>
>> - Bas
>>
>>> With this patch, preamble_flag is not needed at all.
>>> Without this patch,  many original assumption and logic is not correct.
>>> Besides, CONTEXT_CONTROL not only deals CE but also deal DE.
>>
>>
>>> BR Monk
>>>
>>>
>>> -----Original Message-----
>>> From: Bas Nieuwenhuizen [mailto:bas@basnieuwenhuizen.nl]
>>> Sent: Thursday, September 01, 2016 4:19 PM
>>> To: Liu, Monk <Monk.Liu@amd.com>
>>> Cc: Christian König <deathsimple@vodafone.de>;
>>> amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
>>>
>>> On Thu, Sep 1, 2016 at 9:37 AM, Liu, Monk <Monk.Liu@amd.com> wrote:
>>>>
>>>>
>>>> -----Original Message-----
>>>> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
>>>> Of Christian K?nig
>>>> Sent: Wednesday, August 31, 2016 7:53 PM
>>>> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>>>> Subject: Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
>>>>
>>>> Looks good to me in general, a few nit picks and sugegstions below.
>>>>
>>>> Am 31.08.2016 um 05:49 schrieb Monk Liu:
>>>>>
>>>>> v1:
>>>>> for gfx8, use CONTEXT_CONTROL package to dynamically skip preamble
>>>>> CEIB and other load_xxx command in sequence.
>>>>>
>>>>> v2:
>>>>> support GFX7 as well, and bump up version.
>>>>> remove cntxcntl in compute ring funcs because CPC doesn't support
>>>>> this packet.
>>>>>
>>>>> v3: fix reduntant judgement in cntxcntl.
>>>>>
>>>>> Change-Id: I4b87ca84ea8c11ba4f7fb4c0e8a5be537ccde851
>>>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>>>>>
>>>>> Change-Id: I5d24c1bb5c14190ce4adeb6a331ee3d92b3d5c83
>>>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>>>>
>>>> Only one signed of by line is enough and remove the change-ids.
>>>>
>>>>> ---
>>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  9 +++++++++
>>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 12 ++++++++++++
>>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  3 ++-
>>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  | 16 +++++++++-------
>>>>>    drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c   | 20 ++++++++++++++++++++
>>>>>    drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c   | 30
>>>>> ++++++++++++++++++++++++++++++
>>>>>    6 files changed, 82 insertions(+), 8 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>> index 1254410..0de5f08 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>> @@ -321,6 +321,7 @@ struct amdgpu_ring_funcs {
>>>>>        void (*begin_use)(struct amdgpu_ring *ring);
>>>>>        void (*end_use)(struct amdgpu_ring *ring);
>>>>>        void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>>>>> +     void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t
>>>>> + flags);
>>>>>    };
>>>>>
>>>>>    /*
>>>>> @@ -965,6 +966,7 @@ struct amdgpu_ctx {
>>>>>        spinlock_t              ring_lock;
>>>>>        struct fence            **fences;
>>>>>        struct amdgpu_ctx_ring  rings[AMDGPU_MAX_RINGS];
>>>>> +     bool preamble_presented;
>>>>>    };
>>>>>
>>>>>    struct amdgpu_ctx_mgr {
>>>>> @@ -1227,8 +1229,13 @@ struct amdgpu_cs_parser {
>>>>>
>>>>>        /* user fence */
>>>>>        struct amdgpu_bo_list_entry     uf_entry;
>>>>> +     bool preamble_present; /* True means this command submit
>>>>> +involves a preamble IB */
>>>>
>>>> We only need this in amdgpu_cs_ib_fill() don't we? See below as well.
>>>>
>>>> [ML] seems good advice
>>>>
>>>>>    };
>>>>>
>>>>> +#define PREAMBLE_IB_PRESENT          (1 << 0) /* bit set means command
>>>>> submit involves a preamble IB */
>>>>> +#define PREAMBLE_IB_PRESENT_FIRST    (1 << 1) /* bit set means
>>>>> preamble IB is first presented in belonging context */
>>>>
>>>> Why does that makes a difference if it is seen for the first time?
>>>>
>>>> [ml] if it is presented for the first time for belonging ctx, means even
>>>> current CS do not involve context switch, we still need keep the actions in
>>>> preamble IB.
>>>> Usually if current CS is from the same cntx of previous CS, that means
>>>> no ctx switch occurs, so we can skip the actions in preamble IB. but above
>>>> case is the exception.
>>>
>>> Can't userspace just not set the preamble flag for the first submit with
>>> a preamble? I think that would result in the same behavior, unless having
>>> two non-preamble CE IB's in a single submit is an issue.
>>>
>>> - Bas
>>>
>>>>> +#define HAVE_CTX_SWITCH              (1 << 2) /* bit set means context
>>>>> switch occured */
>>>>> +
>>>>>    struct amdgpu_job {
>>>>>        struct amd_sched_job    base;
>>>>>        struct amdgpu_device    *adev;
>>>>> @@ -1237,6 +1244,7 @@ struct amdgpu_job {
>>>>>        struct amdgpu_sync      sync;
>>>>>        struct amdgpu_ib        *ibs;
>>>>>        struct fence            *fence; /* the hw fence */
>>>>> +     uint32_t                preamble_status;
>>>>>        uint32_t                num_ibs;
>>>>>        void                    *owner;
>>>>>        uint64_t                fence_ctx; /* the fence_context this job
>>>>> uses */
>>>>> @@ -2264,6 +2272,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring
>>>>> *ring)
>>>>>    #define amdgpu_ring_emit_hdp_flush(r)
>>>>> (r)->funcs->emit_hdp_flush((r))
>>>>>    #define amdgpu_ring_emit_hdp_invalidate(r)
>>>>> (r)->funcs->emit_hdp_invalidate((r))
>>>>>    #define amdgpu_ring_emit_switch_buffer(r)
>>>>> (r)->funcs->emit_switch_buffer((r))
>>>>> +#define amdgpu_ring_emit_cntxcntl(r, d)
>>>>> +(r)->funcs->emit_cntxcntl((r), (d))
>>>>>    #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
>>>>>    #define amdgpu_ring_init_cond_exec(r)
>>>>> (r)->funcs->init_cond_exec((r))
>>>>>    #define amdgpu_ring_patch_cond_exec(r,o)
>>>>> (r)->funcs->patch_cond_exec((r),(o))
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>>> index 2d4e005..6d8c050 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>>> @@ -792,6 +792,9 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device
>>>>> *adev,
>>>>>                if (r)
>>>>>                        return r;
>>>>>
>>>>> +             if (ib->flags & AMDGPU_IB_FLAG_PREAMBLE)
>>>>> +                     parser->preamble_present = true;
>>>>> +
>>>>>                if (parser->job->ring && parser->job->ring != ring)
>>>>>                        return -EINVAL;
>>>>>
>>>>> @@ -930,6 +933,12 @@ static int amdgpu_cs_submit(struct
>>>>> amdgpu_cs_parser *p,
>>>>>                return r;
>>>>>        }
>>>>>
>>>>> +     if (p->preamble_present) {
>>>>> +             job->preamble_status |= PREAMBLE_IB_PRESENT;
>>>>> +             if (!p->ctx->preamble_presented)
>>>>> +                     job->preamble_status |=
>>>>> PREAMBLE_IB_PRESENT_FIRST;
>>>>> +     }
>>>>> +
>>>>
>>>> Better move this to the end of amdgpu_cs_ib_fill() where we allocate the
>>>> IBs as well.
>>>> [ML] okay, good change.
>>>>
>>>>
>>>>
>>>>>        job->owner = p->filp;
>>>>>        job->fence_ctx = entity->fence_context;
>>>>>        p->fence = fence_get(&job->base.s_fence->finished);
>>>>> @@ -940,6 +949,9 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser
>>>>> *p,
>>>>>        trace_amdgpu_cs_ioctl(job);
>>>>>        amd_sched_entity_push_job(&job->base);
>>>>>
>>>>> +     if (p->preamble_present)
>>>>> +             p->ctx->preamble_presented = true;
>>>>> +
>>>>>        return 0;
>>>>>    }
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>>> index 56c85e6..44db0ab 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>>> @@ -55,9 +55,10 @@
>>>>>     * - 3.3.0 - Add VM support for UVD on supported hardware.
>>>>>     * - 3.4.0 - Add AMDGPU_INFO_NUM_EVICTIONS.
>>>>>     * - 3.5.0 - Add support for new UVD_NO_OP register.
>>>>> + * - 3.6.0 - UMD doesn't/shouldn't need to use CONTEXT_CONTROL in
>>>>> + IB, KMD should do it
>>>>>     */
>>>>>    #define KMS_DRIVER_MAJOR    3
>>>>> -#define KMS_DRIVER_MINOR     5
>>>>> +#define KMS_DRIVER_MINOR     6
>>>>>    #define KMS_DRIVER_PATCHLEVEL       0
>>>>>
>>>>>    int amdgpu_vram_limit = 0;
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>>> index 04263f0..b12b5ba 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>>> @@ -121,10 +121,11 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring,
>>>>> unsigned num_ibs,
>>>>>    {
>>>>>        struct amdgpu_device *adev = ring->adev;
>>>>>        struct amdgpu_ib *ib = &ibs[0];
>>>>> -     bool skip_preamble, need_ctx_switch;
>>>>> +     bool need_ctx_switch;
>>>>>        unsigned patch_offset = ~0;
>>>>>        struct amdgpu_vm *vm;
>>>>>        uint64_t fence_ctx;
>>>>> +     uint32_t status = 0;
>>>>>
>>>>>        unsigned i;
>>>>>        int r = 0;
>>>>> @@ -174,15 +175,16 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring,
>>>>> unsigned num_ibs,
>>>>>        /* always set cond_exec_polling to CONTINUE */
>>>>>        *ring->cond_exe_cpu_addr = 1;
>>>>>
>>>>> -     skip_preamble = ring->current_ctx == fence_ctx;
>>>>>        need_ctx_switch = ring->current_ctx != fence_ctx;
>>>>> +     if (job && ring->funcs->emit_cntxcntl) {
>>>>> +             if (need_ctx_switch)
>>>>> +                     status |= HAVE_CTX_SWITCH;
>>>>> +             status |= job->preamble_status;
>>>>> +             amdgpu_ring_emit_cntxcntl(ring, status);
>>>>> +     }
>>>>> +
>>>>>        for (i = 0; i < num_ibs; ++i) {
>>>>>                ib = &ibs[i];
>>>>> -
>>>>> -             /* drop preamble IBs if we don't have a context switch */
>>>>> -             if ((ib->flags & AMDGPU_IB_FLAG_PREAMBLE) &&
>>>>> skip_preamble)
>>>>> -                     continue;
>>>>> -
>>>>
>>>> Would be nice to keep this functionality for cases where we don't
>>>> support emit_cntxcntl (e.g. SI?).
>>>> [ML] SI support CONTEXT_CONTROL as well, and the package structure is
>>>> exactly the same as CI.
>>>>
>>>>>                amdgpu_ring_emit_ib(ring, ib, job ? job->vm_id : 0,
>>>>>                                    need_ctx_switch);
>>>>>                need_ctx_switch = false; diff --git
>>>>> a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>>>> index f055d49..0d5addb 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>>>> @@ -2096,6 +2096,25 @@ static void gfx_v7_0_ring_emit_ib_compute(struct
>>>>> amdgpu_ring *ring,
>>>>>        amdgpu_ring_write(ring, control);
>>>>>    }
>>>>>
>>>>> +static void gfx_v7_ring_emit_cntxcntl(struct amdgpu_ring *ring,
>>>>> +uint32_t flags) {
>>>>> +     uint32_t dw2 = 0;
>>>>> +
>>>>> +     dw2 |= 0x80000000; /* set load_enable otherwise this package is
>>>>> just NOPs */
>>>>> +     if (flags & HAVE_CTX_SWITCH) {
>>>>> +             /* set load_global_config & load_global_uconfig */
>>>>> +             dw2 |= 0x8001;
>>>>> +             /* set load_cs_sh_regs */
>>>>> +             dw2 |= 0x01000000;
>>>>> +             /* set load_per_context_state & load_gfx_sh_regs */
>>>>> +             dw2 |= 0x10002;
>>>>
>>>> Better define some constants for those.
>>>>
>>>> [ML] I'll leave it to other guys when doing cleanups, a little hurry for
>>>> other jobs now ...
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>> +     }
>>>>> +
>>>>> +     amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
>>>>> +     amdgpu_ring_write(ring, dw2);
>>>>> +     amdgpu_ring_write(ring, 0);
>>>>> +}
>>>>> +
>>>>>    /**
>>>>>     * gfx_v7_0_ring_test_ib - basic ring IB test
>>>>>     *
>>>>> @@ -4929,6 +4948,7 @@ static const struct amdgpu_ring_funcs
>>>>> gfx_v7_0_ring_funcs_gfx = {
>>>>>        .test_ib = gfx_v7_0_ring_test_ib,
>>>>>        .insert_nop = amdgpu_ring_insert_nop,
>>>>>        .pad_ib = amdgpu_ring_generic_pad_ib,
>>>>> +     .emit_cntxcntl = gfx_v7_ring_emit_cntxcntl,
>>>>>    };
>>>>>
>>>>>    static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_compute =
>>>>> { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>>> index 8ba8e42..73f6ffa 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>>> @@ -6085,6 +6085,35 @@ static void gfx_v8_ring_emit_sb(struct
>>>>> amdgpu_ring *ring)
>>>>>        amdgpu_ring_write(ring, 0);
>>>>>    }
>>>>>
>>>>> +static void gfx_v8_ring_emit_cntxcntl(struct amdgpu_ring *ring,
>>>>> +uint32_t flags) {
>>>>> +     uint32_t dw2 = 0;
>>>>> +
>>>>> +     dw2 |= 0x80000000; /* set load_enable otherwise this package is
>>>>> just NOPs */
>>>>> +     if (flags & HAVE_CTX_SWITCH) {
>>>>> +             /* set load_global_config & load_global_uconfig */
>>>>> +             dw2 |= 0x8001;
>>>>> +             /* set load_cs_sh_regs */
>>>>> +             dw2 |= 0x01000000;
>>>>> +             /* set load_per_context_state & load_gfx_sh_regs for GFX
>>>>> */
>>>>> +             dw2 |= 0x10002;
>>>>> +
>>>>> +             /* set load_ce_ram if preamble presented */
>>>>> +             if (PREAMBLE_IB_PRESENT & flags)
>>>>> +                     dw2 |= 0x10000000;
>>>>> +     } else {
>>>>> +             /* still load_ce_ram if this is the first time preamble
>>>>> presented
>>>>> +              * although there is no context switch happens.
>>>>> +              */
>>>>> +             if (PREAMBLE_IB_PRESENT_FIRST & flags)
>>>>> +                     dw2 |= 0x10000000;
>>>>> +     }
>>>>> +
>>>>> +     amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
>>>>> +     amdgpu_ring_write(ring, dw2);
>>>>> +     amdgpu_ring_write(ring, 0);
>>>>> +}
>>>>> +
>>>>>    static void gfx_v8_0_set_gfx_eop_interrupt_state(struct
>>>>> amdgpu_device *adev,
>>>>>                                                 enum
>>>>> amdgpu_interrupt_state state)
>>>>>    {
>>>>> @@ -6267,6 +6296,7 @@ static const struct amdgpu_ring_funcs
>>>>> gfx_v8_0_ring_funcs_gfx = {
>>>>>        .insert_nop = amdgpu_ring_insert_nop,
>>>>>        .pad_ib = amdgpu_ring_generic_pad_ib,
>>>>>        .emit_switch_buffer = gfx_v8_ring_emit_sb,
>>>>> +     .emit_cntxcntl = gfx_v8_ring_emit_cntxcntl,
>>>>>    };
>>>>>
>>>>>    static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute =
>>>>> {
>>>>
>>>>
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
>
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
       [not found]                         ` <MWHPR12MB11820449DB6EF7F18F9397A284E20-Gy0DoCVfaSVhjnLHdLm0OQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2016-09-05  3:14                           ` Liu, Monk
  0 siblings, 0 replies; 20+ messages in thread
From: Liu, Monk @ 2016-09-05  3:14 UTC (permalink / raw)
  To: Christian König; +Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Ping  again.

If no one can propose a better solution, I'll go my way and push the patch.
To repeat: the original design is incorrect (the DE preamble part is not covered by Preamble_flag at all); besides, it doesn't support virtualization.

If anybody rejects the patch, please give a proposal that satisfies the goals below:
1) need a CONTEXT_SWITCH in ring buffer to be a MARK so CP can do world switch. (I assume upstream kernel need to support sr-iov, if not I'll only submit related patches to hybrid branch)
2) preamble_flag can only skip Preamble CE ib, so the command in DE ib is not covered, CONTEXT_CONTROL can fix that

BR Monk

-----Original Message-----
From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf Of Liu, Monk
Sent: Thursday, September 01, 2016 11:15 PM
To: Christian König <deathsimple@vodafone.de>; Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Cc: amd-gfx@lists.freedesktop.org
Subject: RE: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)

Which part is not correct?

When CONTEXT_CONTROL is introduced on the kernel side, it dynamically skips or keeps the commands in the preamble CE IB (LOAD_CE_RAM), so the Preamble CE IB no longer needs to be skipped by a software method.

But don't forget that CONTEXT_CONTROL also dynamically controls if the following load_xxx commands (in DE IB) will be skipped or kept, 

The original method can only skip the Preamble CE IB (entirely) if there is no context switch, but it cannot skip the load_xxx commands from the DE IB, since the UMD is not aware of a process switch (which leads to a context switch).

UMD should always insert load_xxx commands in DE IB wrapped by preamble_start and preamble_end, and let KMD decide if those load_xxx shall kept or skipped by CONTEXT_CONTROL.



Now we go back to original logic:

Even if there is no context switch, the original method still keeps those Load_xxx in the DE IB, right? (Preamble_flag only controls skipping of the Preamble CE IB.)

The "Preamble_flag" flag does not work for the DE IB, so that design is incorrect ... and I am really shocked that we have used the wrong method for such a long time ...
(not to mention that MESA insert CONTEXT_CONTROL in IB, which is clearly wrong).

Since MESA also uses CE, we can totally drop "Preamble_flag" and bump up the version. I don't know why we cannot just sync with the Windows KMD scheme for this.

BR Monk


-----Original Message-----
From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf Of Christian K?nig
Sent: Thursday, September 01, 2016 10:10 PM
To: Liu, Monk <Monk.Liu@amd.com>; Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Cc: amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)

Am 01.09.2016 um 12:55 schrieb Liu, Monk:
>> Why does that makes a difference if it is seen for the first time?
>>
>> [ml] if it is presented for the first time for belonging ctx, means even current CS do not involve context switch, we still need keep the actions in preamble IB.
>> Usually if current CS is from the same cntx of previous CS, that means no ctx switch occurs, so we can skip the actions in preamble IB. but above case is the exception.
> Can't userspace just not set the preamble flag for the first submit with a preamble? I think that would result in the same behavior, unless having two non-preamble CE IB's in a single submit is an issue.
>
> - Bas
>
>
> [ML] I'm confused, what's your point?
>
> With this patch, preamble_flag is not needed at all.

Well then there is something wrong with the patch. Setting the preamble flag should result in an IB only being executed when there was a task switch.

We can either implement that as a software solution by skipping the IBs in question or by using the hardware for this.

Double checking your patch actually reveals that you want to emit the context control package only once, so this can't be correct.

Regards,
Christian.

> Without this patch,  many original assumption and logic is not correct.
> Besides, CONTEXT_CONTROL not only deals CE but also deal DE.
>
> BR Monk
>
>
> -----Original Message-----
> From: Bas Nieuwenhuizen [mailto:bas@basnieuwenhuizen.nl]
> Sent: Thursday, September 01, 2016 4:19 PM
> To: Liu, Monk <Monk.Liu@amd.com>
> Cc: Christian König <deathsimple@vodafone.de>; 
> amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
>
> On Thu, Sep 1, 2016 at 9:37 AM, Liu, Monk <Monk.Liu@amd.com> wrote:
>>
>> -----Original Message-----
>> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On 
>> Behalf Of Christian K?nig
>> Sent: Wednesday, August 31, 2016 7:53 PM
>> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
>>
>> Looks good to me in general, a few nit picks and sugegstions below.
>>
>> Am 31.08.2016 um 05:49 schrieb Monk Liu:
>>> v1:
>>> for gfx8, use CONTEXT_CONTROL package to dynamically skip preamble 
>>> CEIB and other load_xxx command in sequence.
>>>
>>> v2:
>>> support GFX7 as well, and bump up version.
>>> remove cntxcntl in compute ring funcs because CPC doesn't support 
>>> this packet.
>>>
>>> v3: fix reduntant judgement in cntxcntl.
>>>
>>> Change-Id: I4b87ca84ea8c11ba4f7fb4c0e8a5be537ccde851
>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>>>
>>> Change-Id: I5d24c1bb5c14190ce4adeb6a331ee3d92b3d5c83
>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>> Only one signed of by line is enough and remove the change-ids.
>>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  9 +++++++++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 12 ++++++++++++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  3 ++-
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  | 16 +++++++++-------
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c   | 20 ++++++++++++++++++++
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c   | 30 ++++++++++++++++++++++++++++++
>>>    6 files changed, 82 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index 1254410..0de5f08 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -321,6 +321,7 @@ struct amdgpu_ring_funcs {
>>>        void (*begin_use)(struct amdgpu_ring *ring);
>>>        void (*end_use)(struct amdgpu_ring *ring);
>>>        void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>>> +     void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t 
>>> + flags);
>>>    };
>>>
>>>    /*
>>> @@ -965,6 +966,7 @@ struct amdgpu_ctx {
>>>        spinlock_t              ring_lock;
>>>        struct fence            **fences;
>>>        struct amdgpu_ctx_ring  rings[AMDGPU_MAX_RINGS];
>>> +     bool preamble_presented;
>>>    };
>>>
>>>    struct amdgpu_ctx_mgr {
>>> @@ -1227,8 +1229,13 @@ struct amdgpu_cs_parser {
>>>
>>>        /* user fence */
>>>        struct amdgpu_bo_list_entry     uf_entry;
>>> +     bool preamble_present; /* True means this command submit 
>>> +involves a preamble IB */
>> We only need this in amdgpu_cs_ib_fill() don't we? See below as well.
>>
>> [ML] seems good advice
>>
>>>    };
>>>
>>> +#define PREAMBLE_IB_PRESENT          (1 << 0) /* bit set means command submit involves a preamble IB */
>>> +#define PREAMBLE_IB_PRESENT_FIRST    (1 << 1) /* bit set means preamble IB is first presented in belonging context */
>> Why does that makes a difference if it is seen for the first time?
>>
>> [ml] if it is presented for the first time for belonging ctx, means even current CS do not involve context switch, we still need keep the actions in preamble IB.
>> Usually if current CS is from the same cntx of previous CS, that means no ctx switch occurs, so we can skip the actions in preamble IB. but above case is the exception.
> Can't userspace just not set the preamble flag for the first submit with a preamble? I think that would result in the same behavior, unless having two non-preamble CE IB's in a single submit is an issue.
>
> - Bas
>
>>> +#define HAVE_CTX_SWITCH              (1 << 2) /* bit set means context switch occured */
>>> +
>>>    struct amdgpu_job {
>>>        struct amd_sched_job    base;
>>>        struct amdgpu_device    *adev;
>>> @@ -1237,6 +1244,7 @@ struct amdgpu_job {
>>>        struct amdgpu_sync      sync;
>>>        struct amdgpu_ib        *ibs;
>>>        struct fence            *fence; /* the hw fence */
>>> +     uint32_t                preamble_status;
>>>        uint32_t                num_ibs;
>>>        void                    *owner;
>>>        uint64_t                fence_ctx; /* the fence_context this job uses */
>>> @@ -2264,6 +2272,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>>>    #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>>>    #define amdgpu_ring_emit_hdp_invalidate(r) (r)->funcs->emit_hdp_invalidate((r))
>>>    #define amdgpu_ring_emit_switch_buffer(r)
>>> (r)->funcs->emit_switch_buffer((r))
>>> +#define amdgpu_ring_emit_cntxcntl(r, d) 
>>> +(r)->funcs->emit_cntxcntl((r), (d))
>>>    #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
>>>    #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))
>>>    #define amdgpu_ring_patch_cond_exec(r,o)
>>> (r)->funcs->patch_cond_exec((r),(o))
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> index 2d4e005..6d8c050 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> @@ -792,6 +792,9 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
>>>                if (r)
>>>                        return r;
>>>
>>> +             if (ib->flags & AMDGPU_IB_FLAG_PREAMBLE)
>>> +                     parser->preamble_present = true;
>>> +
>>>                if (parser->job->ring && parser->job->ring != ring)
>>>                        return -EINVAL;
>>>
>>> @@ -930,6 +933,12 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>>>                return r;
>>>        }
>>>
>>> +     if (p->preamble_present) {
>>> +             job->preamble_status |= PREAMBLE_IB_PRESENT;
>>> +             if (!p->ctx->preamble_presented)
>>> +                     job->preamble_status |= PREAMBLE_IB_PRESENT_FIRST;
>>> +     }
>>> +
>> Better move this to the end of amdgpu_cs_ib_fill() where we allocate the IBs as well.
>> [ML] okay, good change.
>>
>>
>>
>>>        job->owner = p->filp;
>>>        job->fence_ctx = entity->fence_context;
>>>        p->fence = fence_get(&job->base.s_fence->finished);
>>> @@ -940,6 +949,9 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>>>        trace_amdgpu_cs_ioctl(job);
>>>        amd_sched_entity_push_job(&job->base);
>>>
>>> +     if (p->preamble_present)
>>> +             p->ctx->preamble_presented = true;
>>> +
>>>        return 0;
>>>    }
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> index 56c85e6..44db0ab 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> @@ -55,9 +55,10 @@
>>>     * - 3.3.0 - Add VM support for UVD on supported hardware.
>>>     * - 3.4.0 - Add AMDGPU_INFO_NUM_EVICTIONS.
>>>     * - 3.5.0 - Add support for new UVD_NO_OP register.
>>> + * - 3.6.0 - UMD doesn't/shouldn't need to use CONTEXT_CONTROL in 
>>> + IB, KMD should do it
>>>     */
>>>    #define KMS_DRIVER_MAJOR    3
>>> -#define KMS_DRIVER_MINOR     5
>>> +#define KMS_DRIVER_MINOR     6
>>>    #define KMS_DRIVER_PATCHLEVEL       0
>>>
>>>    int amdgpu_vram_limit = 0;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> index 04263f0..b12b5ba 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> @@ -121,10 +121,11 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>    {
>>>        struct amdgpu_device *adev = ring->adev;
>>>        struct amdgpu_ib *ib = &ibs[0];
>>> -     bool skip_preamble, need_ctx_switch;
>>> +     bool need_ctx_switch;
>>>        unsigned patch_offset = ~0;
>>>        struct amdgpu_vm *vm;
>>>        uint64_t fence_ctx;
>>> +     uint32_t status = 0;
>>>
>>>        unsigned i;
>>>        int r = 0;
>>> @@ -174,15 +175,16 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>        /* always set cond_exec_polling to CONTINUE */
>>>        *ring->cond_exe_cpu_addr = 1;
>>>
>>> -     skip_preamble = ring->current_ctx == fence_ctx;
>>>        need_ctx_switch = ring->current_ctx != fence_ctx;
>>> +     if (job && ring->funcs->emit_cntxcntl) {
>>> +             if (need_ctx_switch)
>>> +                     status |= HAVE_CTX_SWITCH;
>>> +             status |= job->preamble_status;
>>> +             amdgpu_ring_emit_cntxcntl(ring, status);
>>> +     }
>>> +
>>>        for (i = 0; i < num_ibs; ++i) {
>>>                ib = &ibs[i];
>>> -
>>> -             /* drop preamble IBs if we don't have a context switch */
>>> -             if ((ib->flags & AMDGPU_IB_FLAG_PREAMBLE) && skip_preamble)
>>> -                     continue;
>>> -
>> Would be nice to keep this functionality for cases where we don't support emit_cntxcntl (e.g. SI?).
>> [ML] SI support CONTEXT_CONTROL as well, and the package structure is exactly the same as CI.
>>
>>>                amdgpu_ring_emit_ib(ring, ib, job ? job->vm_id : 0,
>>>                                    need_ctx_switch);
>>>                need_ctx_switch = false; diff --git 
>>> a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>> index f055d49..0d5addb 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>> @@ -2096,6 +2096,25 @@ static void gfx_v7_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
>>>        amdgpu_ring_write(ring, control);
>>>    }
>>>
>>> +static void gfx_v7_ring_emit_cntxcntl(struct amdgpu_ring *ring, 
>>> +uint32_t flags) {
>>> +     uint32_t dw2 = 0;
>>> +
>>> +     dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
>>> +     if (flags & HAVE_CTX_SWITCH) {
>>> +             /* set load_global_config & load_global_uconfig */
>>> +             dw2 |= 0x8001;
>>> +             /* set load_cs_sh_regs */
>>> +             dw2 |= 0x01000000;
>>> +             /* set load_per_context_state & load_gfx_sh_regs */
>>> +             dw2 |= 0x10002;
>> Better define some constants for those.
>>
>> [ML] I'll leave it to other guys when doing cleanups, a little hurry for other jobs now ...
>>
>> Regards,
>> Christian.
>>
>>> +     }
>>> +
>>> +     amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
>>> +     amdgpu_ring_write(ring, dw2);
>>> +     amdgpu_ring_write(ring, 0);
>>> +}
>>> +
>>>    /**
>>>     * gfx_v7_0_ring_test_ib - basic ring IB test
>>>     *
>>> @@ -4929,6 +4948,7 @@ static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_gfx = {
>>>        .test_ib = gfx_v7_0_ring_test_ib,
>>>        .insert_nop = amdgpu_ring_insert_nop,
>>>        .pad_ib = amdgpu_ring_generic_pad_ib,
>>> +     .emit_cntxcntl = gfx_v7_ring_emit_cntxcntl,
>>>    };
>>>
>>>    static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_compute 
>>> = { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> index 8ba8e42..73f6ffa 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> @@ -6085,6 +6085,35 @@ static void gfx_v8_ring_emit_sb(struct amdgpu_ring *ring)
>>>        amdgpu_ring_write(ring, 0);
>>>    }
>>>
>>> +static void gfx_v8_ring_emit_cntxcntl(struct amdgpu_ring *ring, 
>>> +uint32_t flags) {
>>> +     uint32_t dw2 = 0;
>>> +
>>> +     dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
>>> +     if (flags & HAVE_CTX_SWITCH) {
>>> +             /* set load_global_config & load_global_uconfig */
>>> +             dw2 |= 0x8001;
>>> +             /* set load_cs_sh_regs */
>>> +             dw2 |= 0x01000000;
>>> +             /* set load_per_context_state & load_gfx_sh_regs for GFX */
>>> +             dw2 |= 0x10002;
>>> +
>>> +             /* set load_ce_ram if preamble presented */
>>> +             if (PREAMBLE_IB_PRESENT & flags)
>>> +                     dw2 |= 0x10000000;
>>> +     } else {
>>> +             /* still load_ce_ram if this is the first time preamble presented
>>> +              * although there is no context switch happens.
>>> +              */
>>> +             if (PREAMBLE_IB_PRESENT_FIRST & flags)
>>> +                     dw2 |= 0x10000000;
>>> +     }
>>> +
>>> +     amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
>>> +     amdgpu_ring_write(ring, dw2);
>>> +     amdgpu_ring_write(ring, 0);
>>> +}
>>> +
>>>    static void gfx_v8_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
>>>                                                 enum amdgpu_interrupt_state state)
>>>    {
>>> @@ -6267,6 +6296,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
>>>        .insert_nop = amdgpu_ring_insert_nop,
>>>        .pad_ib = amdgpu_ring_generic_pad_ib,
>>>        .emit_switch_buffer = gfx_v8_ring_emit_sb,
>>> +     .emit_cntxcntl = gfx_v8_ring_emit_cntxcntl,
>>>    };
>>>
>>>    static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute 
>>> = {
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
       [not found]                     ` <CAP+8YyGF3ht8KLgdTDN6K_r+YJD7ZLDscWXrvvwRdkB=HVOcmg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  2016-09-01 17:11                       ` Christian König
@ 2016-09-05  4:09                       ` Liu, Monk
       [not found]                         ` <MWHPR12MB1182CD9458ECF037586C2A1684E60-Gy0DoCVfaSVhjnLHdLm0OQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  1 sibling, 1 reply; 20+ messages in thread
From: Liu, Monk @ 2016-09-05  4:09 UTC (permalink / raw)
  To: Bas Nieuwenhuizen
  Cc: Christian König, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Okay, just put it simple

The approach my patch provides is absolutely correct, and I'm pretty sure of it; otherwise our closed-source OGL UMD and Windows D3D games would already have crashed a million times.

First of all: moving CONTEXT_CONTROL into the RB is a must and the correct thing to do, and this is not a questionable patch.

Then give some definition:

UMD is aware of context switches within the process,
KMD is aware of context switches across processes, because KMD gives each context a globally unique ID, so this ID can easily detect a process switch as well as a context switch (within a process).


Now back to your questions:

#1
 So if I understand correctly, the new behavior is that the first submit containing a preamble always executes the loads in the preamble even if there is no context switch. The old behavior is that in that situation the preamble would be skipped in the new situation. Why do we want the new behavior? If the application wants the loads to execute even without a context switch, it should not mark the IB as a preamble with AMDGPU_IB_FLAG_PREAMBLE.

[ML] There is no harm in the KMD doing that, is there? Besides, not skipping the first preamble CEIB is the correct choice compared with skipping it, no matter why the UMD marks it as FLAG_PREAMBLE.

#2
If there is no IB with AMDGPU_IB_FLAG_PREAMBLE, then the CE_LOAD bit always gets disabled. Furthermore if there is a CE_LOAD bit, and no context switch the CE_LOAD bit also gets disabled for IB's without AMDGPU_IB_FLAG_PREAMBLE.

I think this is a bad move, as there are some uses for loading CE RAM that are not dependent on context switches, such as preloading things into L2 cache, or switching shader uniforms in CE RAM when a different shader gets bound. Therefore I think that the CE_LOAD bit should always be enabled for IB's without AMDGPU_IB_FLAG_PREAMBLE.

[ML] Then why doesn't my patch/scheme show anything wrong when I run a benchmark (Unigine Heaven)?
I admit I use AMD's closed-source OGL UMD. If KMD detects a context switch (either a context switch within one process or a process switch) then LOAD_CE_RAM is also kept.

For purposes "such as preloading things into L2 cache, or switching shader uniforms in CE RAM when a different shader gets bound", that could be done by a CE IB (instead of the preamble CEIB) via commands like "write_const_ram" and "dump_const_ram".

#3
Furthermore, with this patch the preamble IB's always get executed and loads disabled with CONTEXT_CONTROL. As e.g. mesa uses its own CONTEXT_CONTROL (and we can't change that for old versions of mesa) this would override the kernel CONTEXT_CONTROL and always execute the loads.

[ML] I must say that MESA using CONTEXT_CONTROL is a really bad idea; MESA cannot detect a context switch triggered by a process switch.
No matter the reason, this wrong approach needs to be fixed.

#4
I also miss the CE_LOAD bit in the CONTEXT_CONTROL for gfx7. Does it not need it?

[ML] For GFX7, CONTEXT_CONTROL doesn't support the CE_LOAD bit, so CE_LOAD_RAM will always be kept from the KMD's perspective (which may sacrifice performance compared with GFX8).

#5
I would prefer keeping the old system for preamble IB's and just adding a generic CONTEXT_CONTROL that always enables the CE loads. I don't have an opinion the non-CE loads though, as I've never found a reason to use them.

[ML] No, that way our closed-source UMD won't work correctly.
You can insist on the wrong way, though, and if you cannot accept the correct CONTEXT_CONTROL scheme and change MESA's wrong behavior,
I'll consider that the upstream amdgpu KMD refuses to support SR-IOV/virtualization.
You need to think twice; you are insisting on a wrong design/approach even though it has run for years.

BR Monk












BR Monk

-----Original Message-----
From: Bas Nieuwenhuizen [mailto:bas@basnieuwenhuizen.nl] 
Sent: Friday, September 02, 2016 12:09 AM
To: Liu, Monk <Monk.Liu@amd.com>
Cc: Christian König <deathsimple@vodafone.de>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)

On Thu, Sep 1, 2016 at 12:55 PM, Liu, Monk <Monk.Liu@amd.com> wrote:
>> Why does that makes a difference if it is seen for the first time?
>>
>> [ml] If it is presented for the first time in the context it belongs to, then even if the current CS does not involve a context switch, we still need to keep the actions in the preamble IB.
>> Usually, if the current CS is from the same context as the previous CS, no context switch occurs, so we can skip the actions in the preamble IB; the above case is the exception.
>
> Can't userspace just not set the preamble flag for the first submit with a preamble? I think that would result in the same behavior, unless having two non-preamble CE IB's in a single submit is an issue.
>
> - Bas
>
>
> [ML] I'm confused, what's your point?

So if I understand correctly, the new behavior is that the first submit containing a preamble always executes the loads in the preamble even if there is no context switch. The old behavior is that in that situation the preamble would be skipped in the new situation. Why do we want the new behavior? If the application wants the loads to execute even without a context switch, it should not mark the IB as a preamble with AMDGPU_IB_FLAG_PREAMBLE.

On inspecting the patch more closely I think there are more issues with this patch.

If there is no IB with AMDGPU_IB_FLAG_PREAMBLE, then the CE_LOAD bit always gets disabled. Furthermore if there is a CE_LOAD bit, and no context switch the CE_LOAD bit also gets disabled for IB's without AMDGPU_IB_FLAG_PREAMBLE.

I think this is a bad move, as there are some uses for loading CE RAM that are not dependent on context switches, such as preloading things into L2 cache, or switching shader uniforms in CE RAM when a different shader gets bound. Therefore I think that the CE_LOAD bit should always be enabled for IB's without AMDGPU_IB_FLAG_PREAMBLE.

Furthermore, with this patch the preamble IB's always get executed and loads disabled with CONTEXT_CONTROL. As e.g. mesa uses its own CONTEXT_CONTROL (and we can't change that for old versions of mesa) this would override the kernel CONTEXT_CONTROL and always execute the loads.

I also miss the CE_LOAD bit in the CONTEXT_CONTROL for gfx7. Does it not need it?

I would prefer keeping the old system for preamble IB's and just adding a generic CONTEXT_CONTROL that always enables the CE loads. I don't have an opinion the non-CE loads though, as I've never found a reason to use them.

- Bas

> With this patch, preamble_flag is not needed at all.
> Without this patch, many of the original assumptions and much of the logic are not correct.
> Besides, CONTEXT_CONTROL deals not only with CE but also with DE.


>
> BR Monk
>
>
> -----Original Message-----
> From: Bas Nieuwenhuizen [mailto:bas@basnieuwenhuizen.nl]
> Sent: Thursday, September 01, 2016 4:19 PM
> To: Liu, Monk <Monk.Liu@amd.com>
> Cc: Christian König <deathsimple@vodafone.de>; 
> amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
>
> On Thu, Sep 1, 2016 at 9:37 AM, Liu, Monk <Monk.Liu@amd.com> wrote:
>>
>>
>> -----Original Message-----
>> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On 
>> Behalf Of Christian König
>> Sent: Wednesday, August 31, 2016 7:53 PM
>> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
>>
>> Looks good to me in general, a few nitpicks and suggestions below.
>>
>> Am 31.08.2016 um 05:49 schrieb Monk Liu:
>>> v1:
>>> for gfx8, use CONTEXT_CONTROL package to dynamically skip preamble 
>>> CEIB and other load_xxx command in sequence.
>>>
>>> v2:
>>> support GFX7 as well, and bump up version.
>>> remove cntxcntl in compute ring funcs because CPC doesn't support 
>>> this packet.
>>>
>>> v3: fix reduntant judgement in cntxcntl.
>>>
>>> Change-Id: I4b87ca84ea8c11ba4f7fb4c0e8a5be537ccde851
>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>>>
>>> Change-Id: I5d24c1bb5c14190ce4adeb6a331ee3d92b3d5c83
>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>>
>> Only one Signed-off-by line is enough; please also remove the Change-Ids.
>>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  9 +++++++++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 12 ++++++++++++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  3 ++-
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  | 16 +++++++++-------
>>>   drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c   | 20 ++++++++++++++++++++
>>>   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c   | 30 ++++++++++++++++++++++++++++++
>>>   6 files changed, 82 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index 1254410..0de5f08 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -321,6 +321,7 @@ struct amdgpu_ring_funcs {
>>>       void (*begin_use)(struct amdgpu_ring *ring);
>>>       void (*end_use)(struct amdgpu_ring *ring);
>>>       void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>>> +     void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t 
>>> + flags);
>>>   };
>>>
>>>   /*
>>> @@ -965,6 +966,7 @@ struct amdgpu_ctx {
>>>       spinlock_t              ring_lock;
>>>       struct fence            **fences;
>>>       struct amdgpu_ctx_ring  rings[AMDGPU_MAX_RINGS];
>>> +     bool preamble_presented;
>>>   };
>>>
>>>   struct amdgpu_ctx_mgr {
>>> @@ -1227,8 +1229,13 @@ struct amdgpu_cs_parser {
>>>
>>>       /* user fence */
>>>       struct amdgpu_bo_list_entry     uf_entry;
>>> +     bool preamble_present; /* True means this command submit 
>>> +involves a preamble IB */
>>
>> We only need this in amdgpu_cs_ib_fill() don't we? See below as well.
>>
>> [ML] seems good advice
>>
>>>   };
>>>
>>> +#define PREAMBLE_IB_PRESENT          (1 << 0) /* bit set means command submit involves a preamble IB */
>>> +#define PREAMBLE_IB_PRESENT_FIRST    (1 << 1) /* bit set means preamble IB is first presented in belonging context */
>>
>> Why does that makes a difference if it is seen for the first time?
>>
>> [ml] If it is presented for the first time in the context it belongs to, then even if the current CS does not involve a context switch, we still need to keep the actions in the preamble IB.
>> Usually, if the current CS is from the same context as the previous CS, no context switch occurs, so we can skip the actions in the preamble IB; the above case is the exception.
>
> Can't userspace just not set the preamble flag for the first submit with a preamble? I think that would result in the same behavior, unless having two non-preamble CE IB's in a single submit is an issue.
>
> - Bas
>
>>
>>> +#define HAVE_CTX_SWITCH              (1 << 2) /* bit set means context switch occured */
>>> +
>>>   struct amdgpu_job {
>>>       struct amd_sched_job    base;
>>>       struct amdgpu_device    *adev;
>>> @@ -1237,6 +1244,7 @@ struct amdgpu_job {
>>>       struct amdgpu_sync      sync;
>>>       struct amdgpu_ib        *ibs;
>>>       struct fence            *fence; /* the hw fence */
>>> +     uint32_t                preamble_status;
>>>       uint32_t                num_ibs;
>>>       void                    *owner;
>>>       uint64_t                fence_ctx; /* the fence_context this job uses */
>>> @@ -2264,6 +2272,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>>>   #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>>>   #define amdgpu_ring_emit_hdp_invalidate(r) (r)->funcs->emit_hdp_invalidate((r))
>>>   #define amdgpu_ring_emit_switch_buffer(r)
>>> (r)->funcs->emit_switch_buffer((r))
>>> +#define amdgpu_ring_emit_cntxcntl(r, d) 
>>> +(r)->funcs->emit_cntxcntl((r), (d))
>>>   #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
>>>   #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))
>>>   #define amdgpu_ring_patch_cond_exec(r,o)
>>> (r)->funcs->patch_cond_exec((r),(o))
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> index 2d4e005..6d8c050 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> @@ -792,6 +792,9 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
>>>               if (r)
>>>                       return r;
>>>
>>> +             if (ib->flags & AMDGPU_IB_FLAG_PREAMBLE)
>>> +                     parser->preamble_present = true;
>>> +
>>>               if (parser->job->ring && parser->job->ring != ring)
>>>                       return -EINVAL;
>>>
>>> @@ -930,6 +933,12 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>>>               return r;
>>>       }
>>>
>>> +     if (p->preamble_present) {
>>> +             job->preamble_status |= PREAMBLE_IB_PRESENT;
>>> +             if (!p->ctx->preamble_presented)
>>> +                     job->preamble_status |= PREAMBLE_IB_PRESENT_FIRST;
>>> +     }
>>> +
>>
>> Better move this to the end of amdgpu_cs_ib_fill() where we allocate the IBs as well.
>> [ML] okay, good change.
>>
>>
>>
>>>       job->owner = p->filp;
>>>       job->fence_ctx = entity->fence_context;
>>>       p->fence = fence_get(&job->base.s_fence->finished);
>>> @@ -940,6 +949,9 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>>>       trace_amdgpu_cs_ioctl(job);
>>>       amd_sched_entity_push_job(&job->base);
>>>
>>> +     if (p->preamble_present)
>>> +             p->ctx->preamble_presented = true;
>>> +
>>>       return 0;
>>>   }
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> index 56c85e6..44db0ab 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> @@ -55,9 +55,10 @@
>>>    * - 3.3.0 - Add VM support for UVD on supported hardware.
>>>    * - 3.4.0 - Add AMDGPU_INFO_NUM_EVICTIONS.
>>>    * - 3.5.0 - Add support for new UVD_NO_OP register.
>>> + * - 3.6.0 - UMD doesn't/shouldn't need to use CONTEXT_CONTROL in 
>>> + IB, KMD should do it
>>>    */
>>>   #define KMS_DRIVER_MAJOR    3
>>> -#define KMS_DRIVER_MINOR     5
>>> +#define KMS_DRIVER_MINOR     6
>>>   #define KMS_DRIVER_PATCHLEVEL       0
>>>
>>>   int amdgpu_vram_limit = 0;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> index 04263f0..b12b5ba 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>> @@ -121,10 +121,11 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>   {
>>>       struct amdgpu_device *adev = ring->adev;
>>>       struct amdgpu_ib *ib = &ibs[0];
>>> -     bool skip_preamble, need_ctx_switch;
>>> +     bool need_ctx_switch;
>>>       unsigned patch_offset = ~0;
>>>       struct amdgpu_vm *vm;
>>>       uint64_t fence_ctx;
>>> +     uint32_t status = 0;
>>>
>>>       unsigned i;
>>>       int r = 0;
>>> @@ -174,15 +175,16 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>       /* always set cond_exec_polling to CONTINUE */
>>>       *ring->cond_exe_cpu_addr = 1;
>>>
>>> -     skip_preamble = ring->current_ctx == fence_ctx;
>>>       need_ctx_switch = ring->current_ctx != fence_ctx;
>>> +     if (job && ring->funcs->emit_cntxcntl) {
>>> +             if (need_ctx_switch)
>>> +                     status |= HAVE_CTX_SWITCH;
>>> +             status |= job->preamble_status;
>>> +             amdgpu_ring_emit_cntxcntl(ring, status);
>>> +     }
>>> +
>>>       for (i = 0; i < num_ibs; ++i) {
>>>               ib = &ibs[i];
>>> -
>>> -             /* drop preamble IBs if we don't have a context switch */
>>> -             if ((ib->flags & AMDGPU_IB_FLAG_PREAMBLE) && skip_preamble)
>>> -                     continue;
>>> -
>>
>> Would be nice to keep this functionality for cases where we don't support emit_cntxcntl (e.g. SI?).
>> [ML] SI support CONTEXT_CONTROL as well, and the package structure is exactly the same as CI.
>>
>>>               amdgpu_ring_emit_ib(ring, ib, job ? job->vm_id : 0,
>>>                                   need_ctx_switch);
>>>               need_ctx_switch = false; diff --git 
>>> a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>> index f055d49..0d5addb 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>> @@ -2096,6 +2096,25 @@ static void gfx_v7_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
>>>       amdgpu_ring_write(ring, control);
>>>   }
>>>
>>> +static void gfx_v7_ring_emit_cntxcntl(struct amdgpu_ring *ring, 
>>> +uint32_t flags) {
>>> +     uint32_t dw2 = 0;
>>> +
>>> +     dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
>>> +     if (flags & HAVE_CTX_SWITCH) {
>>> +             /* set load_global_config & load_global_uconfig */
>>> +             dw2 |= 0x8001;
>>> +             /* set load_cs_sh_regs */
>>> +             dw2 |= 0x01000000;
>>> +             /* set load_per_context_state & load_gfx_sh_regs */
>>> +             dw2 |= 0x10002;
>>
>> Better define some constants for those.
>>
>> [ML] I'll leave it to other guys when doing cleanups, a little hurry for other jobs now ...
>>
>> Regards,
>> Christian.
>>
>>> +     }
>>> +
>>> +     amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
>>> +     amdgpu_ring_write(ring, dw2);
>>> +     amdgpu_ring_write(ring, 0);
>>> +}
>>> +
>>>   /**
>>>    * gfx_v7_0_ring_test_ib - basic ring IB test
>>>    *
>>> @@ -4929,6 +4948,7 @@ static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_gfx = {
>>>       .test_ib = gfx_v7_0_ring_test_ib,
>>>       .insert_nop = amdgpu_ring_insert_nop,
>>>       .pad_ib = amdgpu_ring_generic_pad_ib,
>>> +     .emit_cntxcntl = gfx_v7_ring_emit_cntxcntl,
>>>   };
>>>
>>>   static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_compute 
>>> = { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> index 8ba8e42..73f6ffa 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> @@ -6085,6 +6085,35 @@ static void gfx_v8_ring_emit_sb(struct amdgpu_ring *ring)
>>>       amdgpu_ring_write(ring, 0);
>>>   }
>>>
>>> +static void gfx_v8_ring_emit_cntxcntl(struct amdgpu_ring *ring, 
>>> +uint32_t flags) {
>>> +     uint32_t dw2 = 0;
>>> +
>>> +     dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
>>> +     if (flags & HAVE_CTX_SWITCH) {
>>> +             /* set load_global_config & load_global_uconfig */
>>> +             dw2 |= 0x8001;
>>> +             /* set load_cs_sh_regs */
>>> +             dw2 |= 0x01000000;
>>> +             /* set load_per_context_state & load_gfx_sh_regs for GFX */
>>> +             dw2 |= 0x10002;
>>> +
>>> +             /* set load_ce_ram if preamble presented */
>>> +             if (PREAMBLE_IB_PRESENT & flags)
>>> +                     dw2 |= 0x10000000;
>>> +     } else {
>>> +             /* still load_ce_ram if this is the first time preamble presented
>>> +              * although there is no context switch happens.
>>> +              */
>>> +             if (PREAMBLE_IB_PRESENT_FIRST & flags)
>>> +                     dw2 |= 0x10000000;
>>> +     }
>>> +
>>> +     amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
>>> +     amdgpu_ring_write(ring, dw2);
>>> +     amdgpu_ring_write(ring, 0);
>>> +}
>>> +
>>>   static void gfx_v8_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
>>>                                                enum amdgpu_interrupt_state state)
>>>   {
>>> @@ -6267,6 +6296,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
>>>       .insert_nop = amdgpu_ring_insert_nop,
>>>       .pad_ib = amdgpu_ring_generic_pad_ib,
>>>       .emit_switch_buffer = gfx_v8_ring_emit_sb,
>>> +     .emit_cntxcntl = gfx_v8_ring_emit_cntxcntl,
>>>   };
>>>
>>>   static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute 
>>> = {
>>
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
       [not found]                         ` <MWHPR12MB1182CD9458ECF037586C2A1684E60-Gy0DoCVfaSVhjnLHdLm0OQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2016-09-05  9:51                           ` Christian König
       [not found]                             ` <MWHPR12MB1182F1E4814DC0C98B198DA984E60@MWHPR12MB1182.namprd12.prod.outlook.com>
  0 siblings, 1 reply; 20+ messages in thread
From: Christian König @ 2016-09-05  9:51 UTC (permalink / raw)
  To: Liu, Monk, Bas Nieuwenhuizen; +Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Am 05.09.2016 um 06:09 schrieb Liu, Monk:
> Okay, just put it simple
>
> The approach my patch provided is absolutely correct, and I'm pretty sure of it, otherwise our close OGL UMD & windows d3d game already crashed for million times.

Well, that it works and doesn't crash offhand doesn't mean that it is 
correct. We need to understand the technical background and all 
implications to judge if we can commit the patch or not and I clearly 
don't see that right now.

> First of all: Move CONTEXT_CONTROL into RB is a must and the correct thing, and this is not a questionable patch.

I agree that we should move CONTEXT_CONTROL under kernel control, but I 
don't agree doing it like you proposed with your patch.

>
> Then give some definition:
>
> UMD is aware of context switch within the process,
> KMD is aware of context switch cross processes, because KMD give each context a unique ID globally, so this ID can detect process switch as well as context switch (within a process) easily.
>
>
> Now back to your questions:
>
> #1
>   So if I understand correctly, the new behavior is that the first submit containing a preamble always executes the loads in the preamble even if there is no context switch. The old behavior is that in that situation th preamble would be skipped in the new situation. Why do we want the new behavior?>If the application wants the loads to execute even without a context switch, it should not mark the IB as a preamble with AMDGPU_IB_FLAG_PREAMBLE.
>
> [ML] there is no harm kmd to that, isn't it ? besides, no  skip the first Preamble CEIB is the correct choice compared with skip it, no matter why UMD mark it as FLAG_PREAMBLE.

Well it is a behavior change and as such must be justified somehow. 
Since Mesa doesn't need that we don't have a justification for this 
change as far as I can see.

What we can do is a more general approach to filter out the preamble 
bits in the first command submission we see for each context (e.g. 
already in amdgpu_cs.c). That would for example be useful when we replay 
parts of captured IBs and so is useful/justified on its own.

>
> #2
> If there is no IB with AMDGPU_IB_FLAG_PREAMBLE, then the CE_LOAD bit always gets disabled. Furthermore if there is a CE_LOAD bit, and no context switch the CE_LOAD bit also gets disabled for IB's without AMDGPU_IB_FLAG_PREAMBLE.
>
> I think this is a bad move, as there are some uses for loading CE RAM that are not dependent on context switches, such as preloading things into L2 cache, or switching shader uniforms in CE RAM when a different shader gets bound. Therefore I think that the CE_LOAD bit should always be enabled for IB's without AMDGPU_IB_FLAG_PREAMBLE.
>
> [ML] why my patch/scheme doesn’t show anything wrong when I run benchmark  (Unigine heaven) ?

Again as I wrote above it is irrelevant if your patch works now with 
current Mesa. We need to make sure that the interface is consistent and 
doesn't even break old and possible future use cases.

Bas comments on this are right and so I think that the patch should be 
changed so that the preamble flag is honored correctly on a per IB basis.

Just emitting multiple CONTEXT_CONTROL packets to reset the preamble 
flags for the IBs that don't have the preamble bit set sounds like a 
possible and clean solution to me.

> I admit I uses AMD close source OGL UMD,  If KMD detects a context switch (including context switch within one process or process switch) then LOAD_CE_RAM is also kept.
>
> For purpose of  " such as preloading things into L2 cache, or switching shader uniforms in CE RAM when a different shader gets bound.".... that could be done by CE IB ( instead of Preamble CEIB) via commands like "write_const_ram, dump_const_ram"
>
> #3
> Furthermore, with this patch the preamble IB's always get executed and loads disabled with CONTEXT_CONTROL. As e.g. mesa uses its own CONTEXT_CONTROL (and we can't change that for old versions of mesa) this would override the kernel CONTEXT_CONTROL and always execute the loads.
>
> [ML] I must say MESA use CONTEXT_CONTROL is really bad idea, MESA couldn't detect the context switch triggered by process switch.
> No matter what reason, this wrong approach need be fixed.

Again I have to agree with Bas here. We need to maintain the old 
behavior for old Mesa even when that doesn't seem to be the correct 
thing to do.

>
> #4
> I also miss the CE_LOAD bit in the CONTEXT_CONTROL for gfx7. Does it not need it?
>
> [ML] for GFX7, the CONTEXT_CONTROL doesn't support CE_LOAD bit. So CE_LOAD_RAM will always be kept from KMD perspective (which may sacrifice performance compared with GFX8).
>
> #5
> I would prefer keeping the old system for preamble IB's and just adding a generic CONTEXT_CONTROL that always enables the CE loads. I don't have an opinion the non-CE loads though, as I've never found a reason to use them.
>
> [ML] No, that way our close UMD won't work correctly.
> You can insist the wrong way although,  and if you cannot accept the correct scheme of CONTEXT_CONTROL and change MESA's wrong behavior,

I wouldn't call Mesa behavior wrong. It is just using the hardware 
differently than the closed source UMD and since Mesa is the only 
relevant UMD for upstreaming we need to follow its requirements.

> I'll consider upstream amdgpu KMD refuse to support SR-IOV/virtualization.
> You need think it twice, you are insisting a wrong design/approach although it runs for years.

Only committing it to the hybrid branch is a clear NAK from my side 
because that can result in problems when we run Mesa over the hybrid 
kernel as well (which is a documented requirement of the hybrid branch).

Please work together with Bas to properly clean up this feature while 
maintaining backward and forward compatibility.

Regards,
Christian.

>
> BR Monk
>
>
>
>
>
>
>
>
>
>
>
>
> BR Monk
>
> -----Original Message-----
> From: Bas Nieuwenhuizen [mailto:bas@basnieuwenhuizen.nl]
> Sent: Friday, September 02, 2016 12:09 AM
> To: Liu, Monk <Monk.Liu@amd.com>
> Cc: Christian König <deathsimple@vodafone.de>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
>
> On Thu, Sep 1, 2016 at 12:55 PM, Liu, Monk <Monk.Liu@amd.com> wrote:
>>> Why does that makes a difference if it is seen for the first time?
>>>
>>> [ml] if it is presented for the first time for belonging ctx, means even current CS do not involve context switch, we still need keep the actions in preamble IB.
>>> Usually if current CS is from the same cntx of previous CS, that means no ctx switch occurs, so we can skip the actions in preamble IB. but above case is the exception.
>> Can't userspace just not set the preamble flag for the first submit with a preamble? I think that would result in the same behavior, unless having two non-preamble CE IB's in a single submit is an issue.
>>
>> - Bas
>>
>>
>> [ML] I'm confused, what's your point?
> So if I understand correctly, the new behavior is that the first submit containing a preamble always executes the loads in the preamble even if there is no context switch. The old behavior is that in that situation the preamble would be skipped in the new situation. Why do we want the new behavior? If the application wants the loads to execute even without a context switch, it should not mark the IB as a preamble with AMDGPU_IB_FLAG_PREAMBLE.
>
> On inspecting the patch more closely I think there are more issues with this patch.
>
> If there is no IB with AMDGPU_IB_FLAG_PREAMBLE, then the CE_LOAD bit always gets disabled. Furthermore if there is a CE_LOAD bit, and no context switch the CE_LOAD bit also gets disabled for IB's without AMDGPU_IB_FLAG_PREAMBLE.
>
> I think this is a bad move, as there are some uses for loading CE RAM that are not dependent on context switches, such as preloading things into L2 cache, or switching shader uniforms in CE RAM when a different shader gets bound. Therefore I think that the CE_LOAD bit should always be enabled for IB's without AMDGPU_IB_FLAG_PREAMBLE.
>
> Furthermore, with this patch the preamble IB's always get executed and loads disabled with CONTEXT_CONTROL. As e.g. mesa uses its own CONTEXT_CONTROL (and we can't change that for old versions of mesa) this would override the kernel CONTEXT_CONTROL and always execute the loads.
>
> I also miss the CE_LOAD bit in the CONTEXT_CONTROL for gfx7. Does it not need it?
>
> I would prefer keeping the old system for preamble IB's and just adding a generic CONTEXT_CONTROL that always enables the CE loads. I don't have an opinion the non-CE loads though, as I've never found a reason to use them.
>
> - Bas
>
>> With this patch, preamble_flag is not needed at all.
>> Without this patch,  many original assumption and logic is not correct.
>> Besides, CONTEXT_CONTROL not only deals CE but also deal DE.
>
>> BR Monk
>>
>>
>> -----Original Message-----
>> From: Bas Nieuwenhuizen [mailto:bas@basnieuwenhuizen.nl]
>> Sent: Thursday, September 01, 2016 4:19 PM
>> To: Liu, Monk <Monk.Liu@amd.com>
>> Cc: Christian König <deathsimple@vodafone.de>;
>> amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
>>
>> On Thu, Sep 1, 2016 at 9:37 AM, Liu, Monk <Monk.Liu@amd.com> wrote:
>>>
>>> -----Original Message-----
>>> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On
>>> Behalf Of Christian K?nig
>>> Sent: Wednesday, August 31, 2016 7:53 PM
>>> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
>>>
>>> Looks good to me in general, a few nit picks and sugegstions below.
>>>
>>> Am 31.08.2016 um 05:49 schrieb Monk Liu:
>>>> v1:
>>>> for gfx8, use CONTEXT_CONTROL package to dynamically skip preamble
>>>> CEIB and other load_xxx command in sequence.
>>>>
>>>> v2:
>>>> support GFX7 as well, and bump up version.
>>>> remove cntxcntl in compute ring funcs because CPC doesn't support
>>>> this packet.
>>>>
>>>> v3: fix reduntant judgement in cntxcntl.
>>>>
>>>> Change-Id: I4b87ca84ea8c11ba4f7fb4c0e8a5be537ccde851
>>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>>>>
>>>> Change-Id: I5d24c1bb5c14190ce4adeb6a331ee3d92b3d5c83
>>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>>> Only one signed of by line is enough and remove the change-ids.
>>>
>>>> ---
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  9 +++++++++
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 12 ++++++++++++
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  3 ++-
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  | 16 +++++++++-------
>>>>    drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c   | 20 ++++++++++++++++++++
>>>>    drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c   | 30 ++++++++++++++++++++++++++++++
>>>>    6 files changed, 82 insertions(+), 8 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> index 1254410..0de5f08 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> @@ -321,6 +321,7 @@ struct amdgpu_ring_funcs {
>>>>        void (*begin_use)(struct amdgpu_ring *ring);
>>>>        void (*end_use)(struct amdgpu_ring *ring);
>>>>        void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>>>> +     void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t
>>>> + flags);
>>>>    };
>>>>
>>>>    /*
>>>> @@ -965,6 +966,7 @@ struct amdgpu_ctx {
>>>>        spinlock_t              ring_lock;
>>>>        struct fence            **fences;
>>>>        struct amdgpu_ctx_ring  rings[AMDGPU_MAX_RINGS];
>>>> +     bool preamble_presented;
>>>>    };
>>>>
>>>>    struct amdgpu_ctx_mgr {
>>>> @@ -1227,8 +1229,13 @@ struct amdgpu_cs_parser {
>>>>
>>>>        /* user fence */
>>>>        struct amdgpu_bo_list_entry     uf_entry;
>>>> +     bool preamble_present; /* True means this command submit
>>>> +involves a preamble IB */
>>> We only need this in amdgpu_cs_ib_fill() don't we? See below as well.
>>>
>>> [ML] seems good advice
>>>
>>>>    };
>>>>
>>>> +#define PREAMBLE_IB_PRESENT          (1 << 0) /* bit set means command submit involves a preamble IB */
>>>> +#define PREAMBLE_IB_PRESENT_FIRST    (1 << 1) /* bit set means preamble IB is first presented in belonging context */
>>> Why does that makes a difference if it is seen for the first time?
>>>
>>> [ml] if it is presented for the first time for belonging ctx, means even current CS do not involve context switch, we still need keep the actions in preamble IB.
>>> Usually if current CS is from the same cntx of previous CS, that means no ctx switch occurs, so we can skip the actions in preamble IB. but above case is the exception.
>> Can't userspace just not set the preamble flag for the first submit with a preamble? I think that would result in the same behavior, unless having two non-preamble CE IB's in a single submit is an issue.
>>
>> - Bas
>>
>>>> +#define HAVE_CTX_SWITCH              (1 << 2) /* bit set means context switch occured */
>>>> +
>>>>    struct amdgpu_job {
>>>>        struct amd_sched_job    base;
>>>>        struct amdgpu_device    *adev;
>>>> @@ -1237,6 +1244,7 @@ struct amdgpu_job {
>>>>        struct amdgpu_sync      sync;
>>>>        struct amdgpu_ib        *ibs;
>>>>        struct fence            *fence; /* the hw fence */
>>>> +     uint32_t                preamble_status;
>>>>        uint32_t                num_ibs;
>>>>        void                    *owner;
>>>>        uint64_t                fence_ctx; /* the fence_context this job uses */
>>>> @@ -2264,6 +2272,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>>>>    #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>>>>    #define amdgpu_ring_emit_hdp_invalidate(r) (r)->funcs->emit_hdp_invalidate((r))
>>>>    #define amdgpu_ring_emit_switch_buffer(r)
>>>> (r)->funcs->emit_switch_buffer((r))
>>>> +#define amdgpu_ring_emit_cntxcntl(r, d)
>>>> +(r)->funcs->emit_cntxcntl((r), (d))
>>>>    #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
>>>>    #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))
>>>>    #define amdgpu_ring_patch_cond_exec(r,o)
>>>> (r)->funcs->patch_cond_exec((r),(o))
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>> index 2d4e005..6d8c050 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>> @@ -792,6 +792,9 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
>>>>                if (r)
>>>>                        return r;
>>>>
>>>> +             if (ib->flags & AMDGPU_IB_FLAG_PREAMBLE)
>>>> +                     parser->preamble_present = true;
>>>> +
>>>>                if (parser->job->ring && parser->job->ring != ring)
>>>>                        return -EINVAL;
>>>>
>>>> @@ -930,6 +933,12 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>>>>                return r;
>>>>        }
>>>>
>>>> +     if (p->preamble_present) {
>>>> +             job->preamble_status |= PREAMBLE_IB_PRESENT;
>>>> +             if (!p->ctx->preamble_presented)
>>>> +                     job->preamble_status |= PREAMBLE_IB_PRESENT_FIRST;
>>>> +     }
>>>> +
>>> Better move this to the end of amdgpu_cs_ib_fill() where we allocate the IBs as well.
>>> [ML] okay, good change.
>>>
>>>
>>>
>>>>        job->owner = p->filp;
>>>>        job->fence_ctx = entity->fence_context;
>>>>        p->fence = fence_get(&job->base.s_fence->finished);
>>>> @@ -940,6 +949,9 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>>>>        trace_amdgpu_cs_ioctl(job);
>>>>        amd_sched_entity_push_job(&job->base);
>>>>
>>>> +     if (p->preamble_present)
>>>> +             p->ctx->preamble_presented = true;
>>>> +
>>>>        return 0;
>>>>    }
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> index 56c85e6..44db0ab 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> @@ -55,9 +55,10 @@
>>>>     * - 3.3.0 - Add VM support for UVD on supported hardware.
>>>>     * - 3.4.0 - Add AMDGPU_INFO_NUM_EVICTIONS.
>>>>     * - 3.5.0 - Add support for new UVD_NO_OP register.
>>>> + * - 3.6.0 - UMD doesn't/shouldn't need to use CONTEXT_CONTROL in
>>>> + IB, KMD should do it
>>>>     */
>>>>    #define KMS_DRIVER_MAJOR    3
>>>> -#define KMS_DRIVER_MINOR     5
>>>> +#define KMS_DRIVER_MINOR     6
>>>>    #define KMS_DRIVER_PATCHLEVEL       0
>>>>
>>>>    int amdgpu_vram_limit = 0;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>> index 04263f0..b12b5ba 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>>>> @@ -121,10 +121,11 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>>    {
>>>>        struct amdgpu_device *adev = ring->adev;
>>>>        struct amdgpu_ib *ib = &ibs[0];
>>>> -     bool skip_preamble, need_ctx_switch;
>>>> +     bool need_ctx_switch;
>>>>        unsigned patch_offset = ~0;
>>>>        struct amdgpu_vm *vm;
>>>>        uint64_t fence_ctx;
>>>> +     uint32_t status = 0;
>>>>
>>>>        unsigned i;
>>>>        int r = 0;
>>>> @@ -174,15 +175,16 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>>>        /* always set cond_exec_polling to CONTINUE */
>>>>        *ring->cond_exe_cpu_addr = 1;
>>>>
>>>> -     skip_preamble = ring->current_ctx == fence_ctx;
>>>>        need_ctx_switch = ring->current_ctx != fence_ctx;
>>>> +     if (job && ring->funcs->emit_cntxcntl) {
>>>> +             if (need_ctx_switch)
>>>> +                     status |= HAVE_CTX_SWITCH;
>>>> +             status |= job->preamble_status;
>>>> +             amdgpu_ring_emit_cntxcntl(ring, status);
>>>> +     }
>>>> +
>>>>        for (i = 0; i < num_ibs; ++i) {
>>>>                ib = &ibs[i];
>>>> -
>>>> -             /* drop preamble IBs if we don't have a context switch */
>>>> -             if ((ib->flags & AMDGPU_IB_FLAG_PREAMBLE) && skip_preamble)
>>>> -                     continue;
>>>> -
>>> Would be nice to keep this functionality for cases where we don't support emit_cntxcntl (e.g. SI?).
>>> [ML] SI support CONTEXT_CONTROL as well, and the package structure is exactly the same as CI.
>>>
>>>>                amdgpu_ring_emit_ib(ring, ib, job ? job->vm_id : 0,
>>>>                                    need_ctx_switch);
>>>>                need_ctx_switch = false; diff --git
>>>> a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>>> index f055d49..0d5addb 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>>> @@ -2096,6 +2096,25 @@ static void gfx_v7_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
>>>>        amdgpu_ring_write(ring, control);
>>>>    }
>>>>
>>>> +static void gfx_v7_ring_emit_cntxcntl(struct amdgpu_ring *ring,
>>>> +uint32_t flags) {
>>>> +     uint32_t dw2 = 0;
>>>> +
>>>> +     dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
>>>> +     if (flags & HAVE_CTX_SWITCH) {
>>>> +             /* set load_global_config & load_global_uconfig */
>>>> +             dw2 |= 0x8001;
>>>> +             /* set load_cs_sh_regs */
>>>> +             dw2 |= 0x01000000;
>>>> +             /* set load_per_context_state & load_gfx_sh_regs */
>>>> +             dw2 |= 0x10002;
>>> Better define some constants for those.
>>>
>>> [ML] I'll leave it to other guys when doing cleanups, a little hurry for other jobs now ...
>>>
>>> Regards,
>>> Christian.
>>>
>>>> +     }
>>>> +
>>>> +     amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
>>>> +     amdgpu_ring_write(ring, dw2);
>>>> +     amdgpu_ring_write(ring, 0);
>>>> +}
>>>> +
>>>>    /**
>>>>     * gfx_v7_0_ring_test_ib - basic ring IB test
>>>>     *
>>>> @@ -4929,6 +4948,7 @@ static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_gfx = {
>>>>        .test_ib = gfx_v7_0_ring_test_ib,
>>>>        .insert_nop = amdgpu_ring_insert_nop,
>>>>        .pad_ib = amdgpu_ring_generic_pad_ib,
>>>> +     .emit_cntxcntl = gfx_v7_ring_emit_cntxcntl,
>>>>    };
>>>>
>>>>    static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_compute
>>>> = { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>> index 8ba8e42..73f6ffa 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>> @@ -6085,6 +6085,35 @@ static void gfx_v8_ring_emit_sb(struct amdgpu_ring *ring)
>>>>        amdgpu_ring_write(ring, 0);
>>>>    }
>>>>
>>>> +static void gfx_v8_ring_emit_cntxcntl(struct amdgpu_ring *ring,
>>>> +uint32_t flags) {
>>>> +     uint32_t dw2 = 0;
>>>> +
>>>> +     dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
>>>> +     if (flags & HAVE_CTX_SWITCH) {
>>>> +             /* set load_global_config & load_global_uconfig */
>>>> +             dw2 |= 0x8001;
>>>> +             /* set load_cs_sh_regs */
>>>> +             dw2 |= 0x01000000;
>>>> +             /* set load_per_context_state & load_gfx_sh_regs for GFX */
>>>> +             dw2 |= 0x10002;
>>>> +
>>>> +             /* set load_ce_ram if preamble presented */
>>>> +             if (PREAMBLE_IB_PRESENT & flags)
>>>> +                     dw2 |= 0x10000000;
>>>> +     } else {
>>>> +             /* still load_ce_ram if this is the first time preamble presented
>>>> +              * although there is no context switch happens.
>>>> +              */
>>>> +             if (PREAMBLE_IB_PRESENT_FIRST & flags)
>>>> +                     dw2 |= 0x10000000;
>>>> +     }
>>>> +
>>>> +     amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
>>>> +     amdgpu_ring_write(ring, dw2);
>>>> +     amdgpu_ring_write(ring, 0);
>>>> +}
>>>> +
>>>>    static void gfx_v8_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
>>>>                                                 enum amdgpu_interrupt_state state)
>>>>    {
>>>> @@ -6267,6 +6296,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
>>>>        .insert_nop = amdgpu_ring_insert_nop,
>>>>        .pad_ib = amdgpu_ring_generic_pad_ib,
>>>>        .emit_switch_buffer = gfx_v8_ring_emit_sb,
>>>> +     .emit_cntxcntl = gfx_v8_ring_emit_cntxcntl,
>>>>    };
>>>>
>>>>    static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute
>>>> = {
>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
       [not found]                                   ` <b5628485-0521-b255-b8eb-82763275a6ba-5C7GfCeVMHo@public.gmane.org>
@ 2016-09-06  9:28                                     ` Liu, Monk
       [not found]                                       ` <MWHPR12MB1182E9A93F98269CD522BB6084F90-Gy0DoCVfaSVhjnLHdLm0OQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 20+ messages in thread
From: Liu, Monk @ 2016-09-06  9:28 UTC (permalink / raw)
  To: Koenig, Christian, Bas Nieuwenhuizen
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Hi Bas & Christian

I'm not familiar with the policy of the upstream kernel driver, so I cannot say whether your proposal is doable or not.

I have questions:

1) Must my patch work with the current MESA driver (even if MESA doesn't change any bit of its logic)?
2) Can my patch let the KMD take a new path (using CONTEXT_CONTROL) by bumping up the KMS version?
3) Will MESA change its logic (to align with the closed OGL driver) and bump up its version so that the new MESA version can work with the new KMS/KMD version?

With the above questions addressed, we can discuss together how to modify the CONTEXT_CONTROL patch.

BR Monk




-----Original Message-----
From: Koenig, Christian 
Sent: Monday, September 05, 2016 7:57 PM
To: Liu, Monk <Monk.Liu@amd.com>
Cc: brahma_hybrid_dev <brahma_hybrid_dev@amd.com>
Subject: Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)

Another possible solution which just came to my mind: Completely ignore the preamble flag on the IB and keep the existing preamble handling as it is.

Just insert a CONTEXT_CONTROL package at the beginning of the command submission controlled by if we have seen a context switch or not and then raise the driver version number.

Then we can fix Mesa to not emit the CONTEXT_CONTROL commands from the UMD any more and when SR-IOV comes out we add a handling to ignore the preamble flag in the kernel when it is activated.

Does that sound like it should work?

Regards,
Christian.


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
       [not found]                                       ` <MWHPR12MB1182E9A93F98269CD522BB6084F90-Gy0DoCVfaSVhjnLHdLm0OQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2016-09-06  9:39                                         ` Christian König
       [not found]                                           ` <389f73bd-dc01-b372-5227-5e7da06b40e6-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
  0 siblings, 1 reply; 20+ messages in thread
From: Christian König @ 2016-09-06  9:39 UTC (permalink / raw)
  To: Liu, Monk, Koenig, Christian, Bas Nieuwenhuizen
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

> 1) Is that my patch must work with current MESA driver ? (even MESA doesn't change any bit of its logic) ?
Yes, and it must work with all old versions of Mesa. This is usually the 
tricky part to get right and most of my concern right now.

> 2) is that my patch can let kmd go to a new path (using CONTEXT_CONTROL) with bump up the KMS version ?
No, not necessary. Bumping the KMS version can only be used to signal to 
Mesa that it can use a new feature. E.g. Mesa can then stop to use 
CONTEXT_CONTROL in its IBs, but there isn't any guarantee that it does.

> 3) will MESA change its logic (align with close OGL driver) and bump up its version so that new version MESA can work with new version KMS/kmd?
No, usually Mesa is changed in a way so that it works with both old and 
new kernel drivers. Only when there is a really good reason (usually 
critical bugs in the kernel driver) Mesa will drop support for older 
kernel versions.

As I said in my internal mail let's do it like this:

1. Don't touch the preamble flag at all, just keep that handling as it 
is for now.

2. Add the CONTEXT_CONTROL with the appropriate handling on context 
switch and bump the version number to signal that this is done.

3. When #2 is upstream we hack on Mesa to drop the CONTEXT_CONTROL 
packets from its IBs when it sees the new kernel version.

4. When we release SR-IOV we add some logic to the kernel driver to 
ignore the preamble flag.

Regards,
Christian.

Am 06.09.2016 um 11:28 schrieb Liu, Monk:
> Hi Bas & Christian
>
> I'm not familiar with the policy of upstream kernel driver, so I cannot say if your proposal is doable or not,
>
> I have questions:
>
> 1) Is that my patch must work with current MESA driver ? (even MESA doesn't change any bit of its logic) ?
> 2) is that my patch can let kmd go to a new path (using CONTEXT_CONTROL) with bump up the KMS version ?
> 3) will MESA change its logic (align with close OGL driver) and bump up its version so that new version MESA can work with new version KMS/kmd?
>
> With above question addressed, we can together discuss how to modify CONTEXT_CONTROL patch
>
> BR Monk
>
>
>
>
> -----Original Message-----
> From: Koenig, Christian
> Sent: Monday, September 05, 2016 7:57 PM
> To: Liu, Monk <Monk.Liu@amd.com>
> Cc: brahma_hybrid_dev <brahma_hybrid_dev@amd.com>
> Subject: Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
>
> Another possible solution which just came to my mind: Completely ignore the preamble flag on the IB on keep the existing preamble handling as it is.
>
> Just insert a CONTEXT_CONTROL package at the beginning of the command submission controlled by if we have seen a context switch or not and then raise the driver version number.
>
> Then we can fix Mesa to not emit the CONTEXT_CONTROL commands from the UMD any more and when SR-IOV comes out we add a handling to ignore the preamble flag in the kernel when it is activated.
>
> Does that sounds like it should work?
>
> Regards,
> Christian.
>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
       [not found]                                           ` <389f73bd-dc01-b372-5227-5e7da06b40e6-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
@ 2016-09-07  5:30                                             ` Liu, Monk
       [not found]                                               ` <MWHPR12MB11827F79BAB52562687ACFF284F80-Gy0DoCVfaSVhjnLHdLm0OQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 20+ messages in thread
From: Liu, Monk @ 2016-09-07  5:30 UTC (permalink / raw)
  To: Christian König, Koenig, Christian, Bas Nieuwenhuizen
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


1. Don't touch the preamble flag at all, just keep that handling as it is for now.

2. Add the CONTEXT_CONTROL with the appropriate handling on context switch and bump the version number to signal that this is done.

3. When #2 is upstream we hack on Mesa to drop the CONTEXT_CONTROL packets from its IBs when it sees the new kernel version.

4. When we release SR-IOV we add some logic to the kernel driver to ignore the preamble flag.


[ml] 
Let's simplify the problem and not involve SR-IOV for now; the objective is:
A) We change dma frame to make it compatible with standard/documented scheme, which is aligned with d3d and close OGL, and meanwhile
B) We keep current MESA still work.

For A), we need CONTEXT_CONTROL always inserted in ring buffer, because the load_xxx of close OGL  is dynamically skipped/kept by CONTEXT_CONTROL in ring buffer.
So your #2 seems not correct, because if we only insert CONTEXT_CONTROL when doing context switch, that means the load_xxx from CEIB/DEIB of close OGL is always kept ( and only Preamble is skipped) even there is no context switch, and this harms performance.

Since MESA uses CONTEXT_CONTROL in its IB, the CONTEXT_CONTROL in the ring buffer will be overridden by MESA's, which means kmd can always insert CONTEXT_CONTROL.

With the above concerns in mind, I think we could follow the steps below:

1. keep original preamble_flag logic: skip the preamble IB if no context switch occurs, so that old MESA doesn't break. 
	Note: I remember @Bas mentioned that even with no context switch, the preamble IB should be kept because MESA relies on the Preamble IB to do some L2 updates like shader uniforms. I think that's wrong, because the original logic is that we always skip the Preamble IB when no context switch occurs.
2. Always insert CONTEXT_CONTROL before ALL IBs in the ring buffer, and this CONTEXT_CONTROL skips load_xxx when there is no context switch (and keeps load_xxx on a context switch), so that closed OGL can gain performance and is supported by kmd with the right logic.
	Note: like I said, MESA shouldn't have trouble with #2 because current MESA's CONTEXT_CONTROL in the IB will override the one from kmd.

3. Bump up the KMS version to notify MESA that KMD already uses CONTEXT_CONTROL; for MESA, we change it so that it does not insert CONTEXT_CONTROL when talking with a new-version KMS. Also, Mesa should make Preamble CEIB, CEIB, DEIB behave the same way as closed OGL.


I think the above steps can satisfy both current MESA and closed OGL logic, as well as new MESA logic.
After sr-iov gets upstreamed, kmd can remove the logic of "skip Preamble IB when no context switch".


What do you guys think

BR monk






-----Original Message-----
From: Christian König [mailto:deathsimple@vodafone.de] 
Sent: Tuesday, September 06, 2016 5:39 PM
To: Liu, Monk <Monk.Liu@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Cc: amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)

> 1) Is that my patch must work with current MESA driver ? (even MESA doesn't change any bit of its logic) ?
Yes, and it must work with all old versions of Mesa. This is usually the tricky part to get right and is most of my concern right now.

> 2) is that my patch can let kmd go to a new path (using CONTEXT_CONTROL) with bump up the KMS version ?
No, not necessarily. Bumping the KMS version can only be used to signal to Mesa that it can use a new feature. E.g. Mesa can then stop using CONTEXT_CONTROL in its IBs, but there isn't any guarantee that it does.

> 3) will MESA change its logic (align with close OGL driver) and bump up its version so that new version MESA can work with new version KMS/kmd?
No, usually Mesa is changed in a way so that it works with both old and new kernel drivers. Only when there is a really good reason (usually critical bugs in the kernel driver) Mesa will drop support for older kernel versions.

As I said in my internal mail let's do it like this:

1. Don't touch the preamble flag at all, just keep that handling as it is for now.

2. Add the CONTEXT_CONTROL with the appropriate handling on context switch and bump the version number to signal that this is done.

3. When #2 is upstream we hack on Mesa to drop the CONTEXT_CONTROL packets from its IBs when it sees the new kernel version.

4. When we release SR-IOV we add some logic to the kernel driver to ignore the preamble flag.

Regards,
Christian.

Am 06.09.2016 um 11:28 schrieb Liu, Monk:
> Hi Bas & Christian
>
> I'm not familiar with the policy of upstream kernel driver, so I 
> cannot say if your proposal is doable or not,
>
> I have questions:
>
> 1) Is that my patch must work with current MESA driver ? (even MESA doesn't change any bit of its logic) ?
> 2) is that my patch can let kmd go to a new path (using CONTEXT_CONTROL) with bump up the KMS version ?
> 3) will MESA change its logic (align with close OGL driver) and bump up its version so that new version MESA can work with new version KMS/kmd?
>
> With above question addressed, we can together discuss how to modify 
> CONTEXT_CONTROL patch
>
> BR Monk
>
>
>
>
> -----Original Message-----
> From: Koenig, Christian
> Sent: Monday, September 05, 2016 7:57 PM
> To: Liu, Monk <Monk.Liu@amd.com>
> Cc: brahma_hybrid_dev <brahma_hybrid_dev@amd.com>
> Subject: Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
>
> Another possible solution which just came to my mind: Completely ignore the preamble flag on the IB and keep the existing preamble handling as it is.
>
> Just insert a CONTEXT_CONTROL package at the beginning of the command submission controlled by if we have seen a context switch or not and then raise the driver version number.
>
> Then we can fix Mesa to not emit the CONTEXT_CONTROL commands from the UMD any more and when SR-IOV comes out we add a handling to ignore the preamble flag in the kernel when it is activated.
>
> Does that sound like it should work?
>
> Regards,
> Christian.
>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
       [not found]                                               ` <MWHPR12MB11827F79BAB52562687ACFF284F80-Gy0DoCVfaSVhjnLHdLm0OQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2016-09-07  7:07                                                 ` Christian König
  0 siblings, 0 replies; 20+ messages in thread
From: Christian König @ 2016-09-07  7:07 UTC (permalink / raw)
  To: Liu, Monk, Koenig, Christian, Bas Nieuwenhuizen
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

> What do you guys think
Sounds exactly like what I had in mind as well.

> So your #2 seems not correct, because if we only insert CONTEXT_CONTROL when doing context switch
Sorry for not being 100% clear on that. I meant that we should always 
insert CONTEXT_CONTROL as well, but set its flags based on whether a switch 
occurred or not.

Regards,
Christian.

Am 07.09.2016 um 07:30 schrieb Liu, Monk:
> 1. Don't touch the preamble flag at all, just keep that handling as it is for now.
>
> 2. Add the CONTEXT_CONTROL with the appropriate handling on context switch and bump the version number to signal that this is done.
>
> 3. When #2 is upstream we hack on Mesa to drop the CONTEXT_CONTROL packets from its IBs when it sees the new kernel version.
>
> 4. When we release SR-IOV we add some logic to the kernel driver to ignore the preamble flag.
>
>
> [ml]
> Let's simplify the problem and don't involve SR-IOV currently, the object is that:
> A) We change dma frame to make it compatible with standard/documented scheme, which is aligned with d3d and close OGL, and meanwhile
> B) We keep current MESA still work.
>
> For A), we need CONTEXT_CONTROL always inserted in ring buffer, because the load_xxx of close OGL  is dynamically skipped/kept by CONTEXT_CONTROL in ring buffer.
> So your #2 seems not correct, because if we only insert CONTEXT_CONTROL when doing context switch, that means the load_xxx from CEIB/DEIB of close OGL is always kept ( and only Preamble is skipped) even there is no context switch, and this harms performance.
>
> Since MESA use CONTEXT_CONTROL in IB, so the CONTEXT_CONTROL in ring buffer will be replaced by MESA, that means kmd can always insert CONTEXT_CONTROL.
>
> With above concerns, I think the step could follow below steps:
>
> 1. keep original preamble_flag logic: skip the preamble IB if no context switch occurs, so that old MESA doesn't break.
> 	Note: I remember @Bas mentioned that even with no context switch, the preamble IB should be kept because MESA relies on the Preamble IB to do some L2 updates like shader uniforms. I think that's wrong, because the original logic is that we always skip the Preamble IB when no context switch occurs.
> 2. Always insert CONTEXT_CONTROL before ALL IBs in the ring buffer, and this CONTEXT_CONTROL skips load_xxx when there is no context switch (and keeps load_xxx on a context switch), so that closed OGL can gain performance and is supported by kmd with the right logic.
> 	Note: like I said, MESA shouldn't have trouble with #2 because current MESA's CONTEXT_CONTROL in the IB will override the one from kmd.
>
> 3. Bump up the KMS version to notify MESA that KMD already uses CONTEXT_CONTROL; for MESA, we change it so that it does not insert CONTEXT_CONTROL when talking with a new-version KMS. Also, Mesa should make Preamble CEIB, CEIB, DEIB behave the same way as closed OGL.
>
>
> I think the above steps can satisfy both current MESA and closed OGL logic, as well as new MESA logic.
> After sr-iov gets upstreamed, kmd can remove the logic of "skip Preamble IB when no context switch".
>
>
> What do you guys think
>
> BR monk
>
>
>
>
>
>
> -----Original Message-----
> From: Christian König [mailto:deathsimple@vodafone.de]
> Sent: Tuesday, September 06, 2016 5:39 PM
> To: Liu, Monk <Monk.Liu@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
> Cc: amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
>
>> 1) Is that my patch must work with current MESA driver ? (even MESA doesn't change any bit of its logic) ?
> Yes, and it must work with all old versions of Mesa. This is usually the tricky part to get right and is most of my concern right now.
>
>> 2) is that my patch can let kmd go to a new path (using CONTEXT_CONTROL) with bump up the KMS version ?
> No, not necessarily. Bumping the KMS version can only be used to signal to Mesa that it can use a new feature. E.g. Mesa can then stop using CONTEXT_CONTROL in its IBs, but there isn't any guarantee that it does.
>
>> 3) will MESA change its logic (align with close OGL driver) and bump up its version so that new version MESA can work with new version KMS/kmd?
> No, usually Mesa is changed in a way so that it works with both old and new kernel drivers. Only when there is a really good reason (usually critical bugs in the kernel driver) Mesa will drop support for older kernel versions.
>
> As I said in my internal mail let's do it like this:
>
> 1. Don't touch the preamble flag at all, just keep that handling as it is for now.
>
> 2. Add the CONTEXT_CONTROL with the appropriate handling on context switch and bump the version number to signal that this is done.
>
> 3. When #2 is upstream we hack on Mesa to drop the CONTEXT_CONTROL packets from its IBs when it sees the new kernel version.
>
> 4. When we release SR-IOV we add some logic to the kernel driver to ignore the preamble flag.
>
> Regards,
> Christian.
>
> Am 06.09.2016 um 11:28 schrieb Liu, Monk:
>> Hi Bas & Christian
>>
>> I'm not familiar with the policy of upstream kernel driver, so I
>> cannot say if your proposal is doable or not,
>>
>> I have questions:
>>
>> 1) Is that my patch must work with current MESA driver ? (even MESA doesn't change any bit of its logic) ?
>> 2) is that my patch can let kmd go to a new path (using CONTEXT_CONTROL) with bump up the KMS version ?
>> 3) will MESA change its logic (align with close OGL driver) and bump up its version so that new version MESA can work with new version KMS/kmd?
>>
>> With above question addressed, we can together discuss how to modify
>> CONTEXT_CONTROL patch
>>
>> BR Monk
>>
>>
>>
>>
>> -----Original Message-----
>> From: Koenig, Christian
>> Sent: Monday, September 05, 2016 7:57 PM
>> To: Liu, Monk <Monk.Liu@amd.com>
>> Cc: brahma_hybrid_dev <brahma_hybrid_dev@amd.com>
>> Subject: Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
>>
>> Another possible solution which just came to my mind: Completely ignore the preamble flag on the IB and keep the existing preamble handling as it is.
>>
>> Just insert a CONTEXT_CONTROL package at the beginning of the command submission controlled by if we have seen a context switch or not and then raise the driver version number.
>>
>> Then we can fix Mesa to not emit the CONTEXT_CONTROL commands from the UMD any more and when SR-IOV comes out we add a handling to ignore the preamble flag in the kernel when it is activated.
>>
>> Does that sound like it should work?
>>
>> Regards,
>> Christian.
>>
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3)
@ 2016-09-01  8:00 Monk Liu
  0 siblings, 0 replies; 20+ messages in thread
From: Monk Liu @ 2016-09-01  8:00 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

v1:
for gfx8, use CONTEXT_CONTROL package to dynamically
skip preamble CEIB and other load_xxx command in sequence.

v2:
support GFX7 as well, and bump up version.
remove cntxcntl in compute ring funcs because CPC doesn't
support this packet.

v3: fix redundant judgement in cntxcntl.
v4: some cleanups, don't change cs_submit()

Change-Id: I7b2adc15ea83fd6c4d2521d75662bf39587898d5
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  8 ++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  |  8 ++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  | 16 +++++++++-------
 drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c   | 20 ++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c   | 30 ++++++++++++++++++++++++++++++
 6 files changed, 77 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 1254410..0a98531 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -321,6 +321,7 @@ struct amdgpu_ring_funcs {
 	void (*begin_use)(struct amdgpu_ring *ring);
 	void (*end_use)(struct amdgpu_ring *ring);
 	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
+	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
 };
 
 /*
@@ -965,6 +966,7 @@ struct amdgpu_ctx {
 	spinlock_t		ring_lock;
 	struct fence            **fences;
 	struct amdgpu_ctx_ring	rings[AMDGPU_MAX_RINGS];
+	bool preamble_presented;
 };
 
 struct amdgpu_ctx_mgr {
@@ -1229,6 +1231,10 @@ struct amdgpu_cs_parser {
 	struct amdgpu_bo_list_entry	uf_entry;
 };
 
+#define PREAMBLE_IB_PRESENT 		(1 << 0) /* bit set means command submit involves a preamble IB */
+#define PREAMBLE_IB_PRESENT_FIRST	(1 << 1) /* bit set means preamble IB is first presented in belonging context */
+#define HAVE_CTX_SWITCH		(1 << 2) /* bit set means context switch occured */
+
 struct amdgpu_job {
 	struct amd_sched_job    base;
 	struct amdgpu_device	*adev;
@@ -1237,6 +1243,7 @@ struct amdgpu_job {
 	struct amdgpu_sync	sync;
 	struct amdgpu_ib	*ibs;
 	struct fence		*fence; /* the hw fence */
+	uint32_t		preamble_status;
 	uint32_t		num_ibs;
 	void			*owner;
 	uint64_t		fence_ctx; /* the fence_context this job uses */
@@ -2264,6 +2271,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
 #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
 #define amdgpu_ring_emit_hdp_invalidate(r) (r)->funcs->emit_hdp_invalidate((r))
 #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
+#define amdgpu_ring_emit_cntxcntl(r, d) (r)->funcs->emit_cntxcntl((r), (d))
 #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
 #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))
 #define amdgpu_ring_patch_cond_exec(r,o) (r)->funcs->patch_cond_exec((r),(o))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 2d4e005..ae35318 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -792,6 +792,14 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
 		if (r)
 			return r;
 
+		if (ib->flags & AMDGPU_IB_FLAG_PREAMBLE) {
+			parser->job->preamble_status |= PREAMBLE_IB_PRESENT;
+			if (!parser->ctx->preamble_presented) {
+				parser->job->preamble_status |= PREAMBLE_IB_PRESENT_FIRST;
+				parser->ctx->preamble_presented = true;
+			}
+		}
+
 		if (parser->job->ring && parser->job->ring != ring)
 			return -EINVAL;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 56c85e6..44db0ab 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -55,9 +55,10 @@
  * - 3.3.0 - Add VM support for UVD on supported hardware.
  * - 3.4.0 - Add AMDGPU_INFO_NUM_EVICTIONS.
  * - 3.5.0 - Add support for new UVD_NO_OP register.
+ * - 3.6.0 - UMD doesn't/shouldn't need to use CONTEXT_CONTROL in IB, KMD should do it
  */
 #define KMS_DRIVER_MAJOR	3
-#define KMS_DRIVER_MINOR	5
+#define KMS_DRIVER_MINOR	6
 #define KMS_DRIVER_PATCHLEVEL	0
 
 int amdgpu_vram_limit = 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
index 04263f0..b12b5ba 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
@@ -121,10 +121,11 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 {
 	struct amdgpu_device *adev = ring->adev;
 	struct amdgpu_ib *ib = &ibs[0];
-	bool skip_preamble, need_ctx_switch;
+	bool need_ctx_switch;
 	unsigned patch_offset = ~0;
 	struct amdgpu_vm *vm;
 	uint64_t fence_ctx;
+	uint32_t status = 0;
 
 	unsigned i;
 	int r = 0;
@@ -174,15 +175,16 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 	/* always set cond_exec_polling to CONTINUE */
 	*ring->cond_exe_cpu_addr = 1;
 
-	skip_preamble = ring->current_ctx == fence_ctx;
 	need_ctx_switch = ring->current_ctx != fence_ctx;
+	if (job && ring->funcs->emit_cntxcntl) {
+		if (need_ctx_switch)
+			status |= HAVE_CTX_SWITCH;
+		status |= job->preamble_status;
+		amdgpu_ring_emit_cntxcntl(ring, status);
+	}
+
 	for (i = 0; i < num_ibs; ++i) {
 		ib = &ibs[i];
-
-		/* drop preamble IBs if we don't have a context switch */
-		if ((ib->flags & AMDGPU_IB_FLAG_PREAMBLE) && skip_preamble)
-			continue;
-
 		amdgpu_ring_emit_ib(ring, ib, job ? job->vm_id : 0,
 				    need_ctx_switch);
 		need_ctx_switch = false;
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
index f055d49..0d5addb 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
@@ -2096,6 +2096,25 @@ static void gfx_v7_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
 	amdgpu_ring_write(ring, control);
 }
 
+static void gfx_v7_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags)
+{
+	uint32_t dw2 = 0;
+
+	dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
+	if (flags & HAVE_CTX_SWITCH) {
+		/* set load_global_config & load_global_uconfig */
+		dw2 |= 0x8001;
+		/* set load_cs_sh_regs */
+		dw2 |= 0x01000000;
+		/* set load_per_context_state & load_gfx_sh_regs */
+		dw2 |= 0x10002;
+	}
+
+	amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
+	amdgpu_ring_write(ring, dw2);
+	amdgpu_ring_write(ring, 0);
+}
+
 /**
  * gfx_v7_0_ring_test_ib - basic ring IB test
  *
@@ -4929,6 +4948,7 @@ static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_gfx = {
 	.test_ib = gfx_v7_0_ring_test_ib,
 	.insert_nop = amdgpu_ring_insert_nop,
 	.pad_ib = amdgpu_ring_generic_pad_ib,
+	.emit_cntxcntl = gfx_v7_ring_emit_cntxcntl,
 };
 
 static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_compute = {
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 8ba8e42..73f6ffa 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -6085,6 +6085,35 @@ static void gfx_v8_ring_emit_sb(struct amdgpu_ring *ring)
 	amdgpu_ring_write(ring, 0);
 }
 
+static void gfx_v8_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags)
+{
+	uint32_t dw2 = 0;
+
+	dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
+	if (flags & HAVE_CTX_SWITCH) {
+		/* set load_global_config & load_global_uconfig */
+		dw2 |= 0x8001;
+		/* set load_cs_sh_regs */
+		dw2 |= 0x01000000;
+		/* set load_per_context_state & load_gfx_sh_regs for GFX */
+		dw2 |= 0x10002;
+
+		/* set load_ce_ram if preamble presented */
+		if (PREAMBLE_IB_PRESENT & flags)
+			dw2 |= 0x10000000;
+	} else {
+		/* still load_ce_ram if this is the first time preamble presented
+		 * although there is no context switch happens.
+		 */
+		if (PREAMBLE_IB_PRESENT_FIRST & flags)
+			dw2 |= 0x10000000;
+	}
+
+	amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
+	amdgpu_ring_write(ring, dw2);
+	amdgpu_ring_write(ring, 0);
+}
+
 static void gfx_v8_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
 						 enum amdgpu_interrupt_state state)
 {
@@ -6267,6 +6296,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
 	.insert_nop = amdgpu_ring_insert_nop,
 	.pad_ib = amdgpu_ring_generic_pad_ib,
 	.emit_switch_buffer = gfx_v8_ring_emit_sb,
+	.emit_cntxcntl = gfx_v8_ring_emit_cntxcntl,
 };
 
 static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {
-- 
1.9.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2016-09-07  7:07 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-08-31  3:49 [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3) Monk Liu
     [not found] ` <1472615341-3847-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2016-08-31 11:31   ` Liu, Monk
2016-08-31 11:53   ` Christian König
     [not found]     ` <a9b9cfab-4c78-9a90-3f59-6e2ffed73f4b-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
2016-09-01  7:37       ` Liu, Monk
     [not found]         ` <MWHPR12MB11829865EEF1352A1A1A4C4084E20-Gy0DoCVfaSVhjnLHdLm0OQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2016-09-01  8:19           ` Bas Nieuwenhuizen
     [not found]             ` <CAP+8YyGECuHNkTD6C5075R9m1wmzMRg=DHQRTRWY2ce1aNJUjA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2016-09-01 10:55               ` Liu, Monk
     [not found]                 ` <MWHPR12MB1182883EE214228FBFCE081C84E20-Gy0DoCVfaSVhjnLHdLm0OQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2016-09-01 14:10                   ` Christian König
     [not found]                     ` <8fed1480-794e-7218-17d1-52221060a149-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
2016-09-01 15:15                       ` Liu, Monk
     [not found]                         ` <MWHPR12MB11820449DB6EF7F18F9397A284E20-Gy0DoCVfaSVhjnLHdLm0OQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2016-09-05  3:14                           ` Liu, Monk
2016-09-01 15:40                       ` Liu, Monk
2016-09-01 16:09                   ` Bas Nieuwenhuizen
     [not found]                     ` <CAP+8YyGF3ht8KLgdTDN6K_r+YJD7ZLDscWXrvvwRdkB=HVOcmg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2016-09-01 17:11                       ` Christian König
     [not found]                         ` <e1a095ea-8717-12e5-c779-fac81525ddcc-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
2016-09-01 18:20                           ` Bas Nieuwenhuizen
2016-09-05  4:09                       ` Liu, Monk
     [not found]                         ` <MWHPR12MB1182CD9458ECF037586C2A1684E60-Gy0DoCVfaSVhjnLHdLm0OQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2016-09-05  9:51                           ` Christian König
     [not found]                             ` <MWHPR12MB1182F1E4814DC0C98B198DA984E60@MWHPR12MB1182.namprd12.prod.outlook.com>
     [not found]                               ` <b1e9eabd-3cf7-75c3-6ed6-7d62510a11df@amd.com>
     [not found]                                 ` <b5628485-0521-b255-b8eb-82763275a6ba@amd.com>
     [not found]                                   ` <b5628485-0521-b255-b8eb-82763275a6ba-5C7GfCeVMHo@public.gmane.org>
2016-09-06  9:28                                     ` Liu, Monk
     [not found]                                       ` <MWHPR12MB1182E9A93F98269CD522BB6084F90-Gy0DoCVfaSVhjnLHdLm0OQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2016-09-06  9:39                                         ` Christian König
     [not found]                                           ` <389f73bd-dc01-b372-5227-5e7da06b40e6-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
2016-09-07  5:30                                             ` Liu, Monk
     [not found]                                               ` <MWHPR12MB11827F79BAB52562687ACFF284F80-Gy0DoCVfaSVhjnLHdLm0OQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2016-09-07  7:07                                                 ` Christian König
2016-09-01  8:00 Monk Liu

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.