* [PATCH] drm/amdgpu: modify mcbp implement for gfx9(v3)
@ 2022-08-11  3:19 jiadong.zhu
  2022-08-12 11:39 ` Christian König
From: jiadong.zhu @ 2022-08-11  3:19 UTC (permalink / raw)
  To: amd-gfx; +Cc: Ray.Huang, Jiadong.Zhu, aaron.liu

From: "Jiadong.Zhu" <Jiadong.Zhu@amd.com>

1. Use unmap_queue package to trigger preemption on gfx9
   Add trailing fence to track the preemption done.
2. Modify emit_ce_meta emit_de_meta functions
   for the resumed ibs.

Signed-off-by: Jiadong.Zhu <Jiadong.Zhu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |   1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 161 ++++++++++++++++++++---
 drivers/gpu/drm/amd/amdgpu/soc15d.h      |   2 +
 3 files changed, 143 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 82c178a9033a..ca626f0ad7b1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -59,6 +59,7 @@ enum amdgpu_ring_priority_level {
 #define AMDGPU_FENCE_FLAG_64BIT         (1 << 0)
 #define AMDGPU_FENCE_FLAG_INT           (1 << 1)
 #define AMDGPU_FENCE_FLAG_TC_WB_ONLY    (1 << 2)
+#define AMDGPU_FENCE_FLAG_EXEC          (1 << 3)
 
 #define to_amdgpu_ring(s) container_of((s), struct amdgpu_ring, sched)
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 5332899642dc..887021fd56aa 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -751,7 +751,7 @@ static void gfx_v9_0_set_rlc_funcs(struct amdgpu_device *adev);
 static int gfx_v9_0_get_cu_info(struct amdgpu_device *adev,
 				struct amdgpu_cu_info *cu_info);
 static uint64_t gfx_v9_0_get_gpu_clock_counter(struct amdgpu_device *adev);
-static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring);
+static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring, bool resume);
 static u64 gfx_v9_0_ring_get_rptr_compute(struct amdgpu_ring *ring);
 static void gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
 					  void *ras_error_status);
@@ -824,9 +824,10 @@ static void gfx_v9_0_kiq_unmap_queues(struct amdgpu_ring *kiq_ring,
 			PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(ring->doorbell_index));
 
 	if (action == PREEMPT_QUEUES_NO_UNMAP) {
-		amdgpu_ring_write(kiq_ring, lower_32_bits(gpu_addr));
-		amdgpu_ring_write(kiq_ring, upper_32_bits(gpu_addr));
-		amdgpu_ring_write(kiq_ring, seq);
+		amdgpu_ring_write(kiq_ring, lower_32_bits(ring->wptr & ring->buf_mask));
+		amdgpu_ring_write(kiq_ring, 0);
+		amdgpu_ring_write(kiq_ring, 0);
+
 	} else {
 		amdgpu_ring_write(kiq_ring, 0);
 		amdgpu_ring_write(kiq_ring, 0);
@@ -5446,11 +5447,16 @@ static void gfx_v9_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
 
 	control |= ib->length_dw | (vmid << 24);
 
-	if (amdgpu_sriov_vf(ring->adev) && (ib->flags & AMDGPU_IB_FLAG_PREEMPT)) {
+	if ((amdgpu_sriov_vf(ring->adev) || amdgpu_mcbp) && (ib->flags & AMDGPU_IB_FLAG_PREEMPT)) {
 		control |= INDIRECT_BUFFER_PRE_ENB(1);
 
+		if (flags & AMDGPU_IB_PREEMPTED)
+			control |= INDIRECT_BUFFER_PRE_RESUME(1);
+
 		if (!(ib->flags & AMDGPU_IB_FLAG_CE) && vmid)
-			gfx_v9_0_ring_emit_de_meta(ring);
+			gfx_v9_0_ring_emit_de_meta(ring,
+				 (!amdgpu_sriov_vf(ring->adev) && flags & AMDGPU_IB_PREEMPTED) ?
+					true : false);
 	}
 
 	amdgpu_ring_write(ring, header);
@@ -5505,6 +5511,7 @@ static void gfx_v9_0_ring_emit_fence(struct amdgpu_ring *ring, u64 addr,
 	bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
 	bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
 	bool writeback = flags & AMDGPU_FENCE_FLAG_TC_WB_ONLY;
+	bool exec = flags & AMDGPU_FENCE_FLAG_EXEC;
 
 	/* RELEASE_MEM - flush caches, send int */
 	amdgpu_ring_write(ring, PACKET3(PACKET3_RELEASE_MEM, 6));
@@ -5515,6 +5522,7 @@ static void gfx_v9_0_ring_emit_fence(struct amdgpu_ring *ring, u64 addr,
 					       EOP_TC_WB_ACTION_EN |
 					       EOP_TC_MD_ACTION_EN)) |
 				 EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) |
+				 (exec ? EOP_EXEC : 0x0) |
 				 EVENT_INDEX(5)));
 	amdgpu_ring_write(ring, DATA_SEL(write64bit ? 2 : 1) | INT_SEL(int_sel ? 2 : 0));
 
@@ -5620,33 +5628,135 @@ static void gfx_v9_ring_emit_sb(struct amdgpu_ring *ring)
 	amdgpu_ring_write(ring, 0);
 }
 
-static void gfx_v9_0_ring_emit_ce_meta(struct amdgpu_ring *ring)
+static void gfx_v9_0_ring_emit_ce_meta(struct amdgpu_ring *ring, bool resume)
 {
+	struct amdgpu_device *adev = ring->adev;
 	struct v9_ce_ib_state ce_payload = {0};
-	uint64_t csa_addr;
+	uint64_t offset, ce_payload_gpu_addr;
+	void *ce_payload_cpu_addr;
 	int cnt;
 
 	cnt = (sizeof(ce_payload) >> 2) + 4 - 2;
-	csa_addr = amdgpu_csa_vaddr(ring->adev);
+
+	if (ring->is_mes_queue) {
+		offset = offsetof(struct amdgpu_mes_ctx_meta_data,
+				  gfx[0].gfx_meta_data) +
+			offsetof(struct v9_gfx_meta_data, ce_payload);
+		ce_payload_gpu_addr =
+			amdgpu_mes_ctx_get_offs_gpu_addr(ring, offset);
+		ce_payload_cpu_addr =
+			amdgpu_mes_ctx_get_offs_cpu_addr(ring, offset);
+	} else {
+		offset = offsetof(struct v9_gfx_meta_data, ce_payload);
+		ce_payload_gpu_addr = amdgpu_csa_vaddr(ring->adev) + offset;
+		ce_payload_cpu_addr = adev->virt.csa_cpu_addr + offset;
+	}
 
 	amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, cnt));
 	amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(2) |
 				 WRITE_DATA_DST_SEL(8) |
 				 WR_CONFIRM) |
 				 WRITE_DATA_CACHE_POLICY(0));
-	amdgpu_ring_write(ring, lower_32_bits(csa_addr + offsetof(struct v9_gfx_meta_data, ce_payload)));
-	amdgpu_ring_write(ring, upper_32_bits(csa_addr + offsetof(struct v9_gfx_meta_data, ce_payload)));
-	amdgpu_ring_write_multiple(ring, (void *)&ce_payload, sizeof(ce_payload) >> 2);
+	amdgpu_ring_write(ring, lower_32_bits(ce_payload_gpu_addr));
+	amdgpu_ring_write(ring, upper_32_bits(ce_payload_gpu_addr));
+
+	if (resume)
+		amdgpu_ring_write_multiple(ring, ce_payload_cpu_addr,
+					   sizeof(ce_payload) >> 2);
+	else
+		amdgpu_ring_write_multiple(ring, (void *)&ce_payload,
+					   sizeof(ce_payload) >> 2);
+}
+
+static int gfx_v9_0_ring_preempt_ib(struct amdgpu_ring *ring)
+{
+	int i, r = 0;
+	struct amdgpu_device *adev = ring->adev;
+	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
+	struct amdgpu_ring *kiq_ring = &kiq->ring;
+	unsigned long flags;
+
+	if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
+		return -EINVAL;
+
+	spin_lock_irqsave(&kiq->ring_lock, flags);
+
+	if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size)) {
+		spin_unlock_irqrestore(&kiq->ring_lock, flags);
+		return -ENOMEM;
+	}
+
+	/* assert preemption condition */
+	amdgpu_ring_set_preempt_cond_exec(ring, false);
+
+	ring->trail_seq += 1;
+	amdgpu_ring_alloc(ring, 13);
+	gfx_v9_0_ring_emit_fence(ring, ring->trail_fence_gpu_addr,
+				  ring->trail_seq, AMDGPU_FENCE_FLAG_EXEC);
+	/*reset the CP_VMID_PREEMPT after trailing fence*/
+	amdgpu_ring_emit_wreg(ring,
+				  SOC15_REG_OFFSET(GC, 0, mmCP_VMID_PREEMPT),
+				  0x0);
+
+	/* assert IB preemption, emit the trailing fence */
+	kiq->pmf->kiq_unmap_queues(kiq_ring, ring, PREEMPT_QUEUES_NO_UNMAP,
+				   ring->trail_fence_gpu_addr,
+				   ring->trail_seq);
+
+	amdgpu_ring_commit(kiq_ring);
+	spin_unlock_irqrestore(&kiq->ring_lock, flags);
+
+	/* poll the trailing fence */
+	for (i = 0; i < adev->usec_timeout; i++) {
+		if (ring->trail_seq ==
+		    le32_to_cpu(*(ring->trail_fence_cpu_addr)))
+			break;
+		udelay(1);
+	}
+
+	if (i >= adev->usec_timeout) {
+		r = -EINVAL;
+		DRM_ERROR("ring %d failed to preempt ib\n", ring->idx);
+	}
+
+	amdgpu_ring_commit(ring);
+
+	/* deassert preemption condition */
+	amdgpu_ring_set_preempt_cond_exec(ring, true);
+	return r;
 }
 
-static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring)
+static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring, bool resume)
 {
+	struct amdgpu_device *adev = ring->adev;
 	struct v9_de_ib_state de_payload = {0};
-	uint64_t csa_addr, gds_addr;
+	uint64_t offset, gds_addr, de_payload_gpu_addr;
+	void *de_payload_cpu_addr;
 	int cnt;
 
-	csa_addr = amdgpu_csa_vaddr(ring->adev);
-	gds_addr = csa_addr + 4096;
+	if (ring->is_mes_queue) {
+		offset = offsetof(struct amdgpu_mes_ctx_meta_data,
+				  gfx[0].gfx_meta_data) +
+			offsetof(struct v9_gfx_meta_data, de_payload);
+		de_payload_gpu_addr =
+			amdgpu_mes_ctx_get_offs_gpu_addr(ring, offset);
+		de_payload_cpu_addr =
+			amdgpu_mes_ctx_get_offs_cpu_addr(ring, offset);
+
+		offset = offsetof(struct amdgpu_mes_ctx_meta_data,
+				  gfx[0].gds_backup) +
+			offsetof(struct v9_gfx_meta_data, de_payload);
+		gds_addr = amdgpu_mes_ctx_get_offs_gpu_addr(ring, offset);
+	} else {
+		offset = offsetof(struct v9_gfx_meta_data, de_payload);
+		de_payload_gpu_addr = amdgpu_csa_vaddr(ring->adev) + offset;
+		de_payload_cpu_addr = adev->virt.csa_cpu_addr + offset;
+
+		gds_addr = ALIGN(amdgpu_csa_vaddr(ring->adev) +
+				 AMDGPU_CSA_SIZE - adev->gds.gds_size,
+				 PAGE_SIZE);
+	}
+
 	de_payload.gds_backup_addrlo = lower_32_bits(gds_addr);
 	de_payload.gds_backup_addrhi = upper_32_bits(gds_addr);
 
@@ -5656,9 +5766,15 @@ static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring)
 				 WRITE_DATA_DST_SEL(8) |
 				 WR_CONFIRM) |
 				 WRITE_DATA_CACHE_POLICY(0));
-	amdgpu_ring_write(ring, lower_32_bits(csa_addr + offsetof(struct v9_gfx_meta_data, de_payload)));
-	amdgpu_ring_write(ring, upper_32_bits(csa_addr + offsetof(struct v9_gfx_meta_data, de_payload)));
-	amdgpu_ring_write_multiple(ring, (void *)&de_payload, sizeof(de_payload) >> 2);
+	amdgpu_ring_write(ring, lower_32_bits(de_payload_gpu_addr));
+	amdgpu_ring_write(ring, upper_32_bits(de_payload_gpu_addr));
+
+	if (resume)
+		amdgpu_ring_write_multiple(ring, de_payload_cpu_addr,
+					   sizeof(de_payload) >> 2);
+	else
+		amdgpu_ring_write_multiple(ring, (void *)&de_payload,
+					   sizeof(de_payload) >> 2);
 }
 
 static void gfx_v9_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
@@ -5674,8 +5790,10 @@ static void gfx_v9_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags)
 {
 	uint32_t dw2 = 0;
 
-	if (amdgpu_sriov_vf(ring->adev))
-		gfx_v9_0_ring_emit_ce_meta(ring);
+	if (amdgpu_sriov_vf(ring->adev) || amdgpu_mcbp)
+		gfx_v9_0_ring_emit_ce_meta(ring,
+					(!amdgpu_sriov_vf(ring->adev) &&
+						flags & AMDGPU_IB_PREEMPTED) ? true : false);
 
 	dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
 	if (flags & AMDGPU_HAVE_CTX_SWITCH) {
@@ -7024,6 +7142,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
 	.emit_cntxcntl = gfx_v9_ring_emit_cntxcntl,
 	.init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec,
 	.patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec,
+	.preempt_ib = gfx_v9_0_ring_preempt_ib,
 	.emit_frame_cntl = gfx_v9_0_ring_emit_frame_cntl,
 	.emit_wreg = gfx_v9_0_ring_emit_wreg,
 	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15d.h b/drivers/gpu/drm/amd/amdgpu/soc15d.h
index 799925d22fc8..614e9f8467fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15d.h
+++ b/drivers/gpu/drm/amd/amdgpu/soc15d.h
@@ -162,6 +162,7 @@
 		 * 2 - Bypass
 		 */
 #define     INDIRECT_BUFFER_PRE_ENB(x)		 ((x) << 21)
+#define     INDIRECT_BUFFER_PRE_RESUME(x)           ((x) << 30)
 #define	PACKET3_COPY_DATA				0x40
 #define	PACKET3_PFP_SYNC_ME				0x42
 #define	PACKET3_COND_WRITE				0x45
@@ -184,6 +185,7 @@
 #define		EOP_TC_ACTION_EN                        (1 << 17) /* L2 */
 #define		EOP_TC_NC_ACTION_EN			(1 << 19)
 #define		EOP_TC_MD_ACTION_EN			(1 << 21) /* L2 metadata */
+#define		EOP_EXEC					(1 << 28) /* For Trailing Fence */
 
 #define		DATA_SEL(x)                             ((x) << 29)
 		/* 0 - discard
-- 
2.25.1



* Re: [PATCH] drm/amdgpu: modify mcbp implement for gfx9(v3)
  2022-08-11  3:19 [PATCH] drm/amdgpu: modify mcbp implement for gfx9(v3) jiadong.zhu
@ 2022-08-12 11:39 ` Christian König
  2022-08-13  4:16   ` Zhu, Jiadong
From: Christian König @ 2022-08-12 11:39 UTC (permalink / raw)
  To: jiadong.zhu, amd-gfx; +Cc: Ray.Huang, aaron.liu



On 11.08.22 at 05:19, jiadong.zhu@amd.com wrote:
> From: "Jiadong.Zhu" <Jiadong.Zhu@amd.com>
>
> 1. Use unmap_queue package to trigger preemption on gfx9
>     Add trailing fence to track the preemption done.
> 2. Modify emit_ce_meta emit_de_meta functions
>     for the resumed ibs.
>
> Signed-off-by: Jiadong.Zhu <Jiadong.Zhu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |   1 +
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 161 ++++++++++++++++++++---
>   drivers/gpu/drm/amd/amdgpu/soc15d.h      |   2 +
>   3 files changed, 143 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 82c178a9033a..ca626f0ad7b1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -59,6 +59,7 @@ enum amdgpu_ring_priority_level {
>   #define AMDGPU_FENCE_FLAG_64BIT         (1 << 0)
>   #define AMDGPU_FENCE_FLAG_INT           (1 << 1)
>   #define AMDGPU_FENCE_FLAG_TC_WB_ONLY    (1 << 2)

> +#define AMDGPU_FENCE_FLAG_EXEC          (1 << 3)

Ok, this needs much more explanation: why do you need it, and how is
all of this supposed to work?

Regards,
Christian.


* RE: [PATCH] drm/amdgpu: modify mcbp implement for gfx9(v3)
  2022-08-12 11:39 ` Christian König
@ 2022-08-13  4:16   ` Zhu, Jiadong
From: Zhu, Jiadong @ 2022-08-13  4:16 UTC (permalink / raw)
  To: Christian König, amd-gfx; +Cc: Huang, Ray, Liu, Aaron


Hi Christian,

The details are as follows:

> 1. Use unmap_queue package to trigger preemption on gfx9
>     Add trailing fence to track the preemption done.

On gfx9 there is no single packet that completes the MCBP request in one
frame, the way gfx10 does.
To request preemption on gfx9, the KMD needs to:
1. Emit a trailing fence on the GFX ring, without updating the wptr to the CP yet.
2. Emit a write_reg that resets mmCP_VMID_PREEMPT, placed after the trailing fence.
3. Send unmap_queue to the KIQ ring with the rb_wptr field set to the offset of the trailing fence on the GFX ring.
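
A condensed sketch of that driver-side sequence, taken from
gfx_v9_0_ring_preempt_ib() in this patch (locking and error handling
trimmed):

    /* assert the preemption condition before touching the ring */
    amdgpu_ring_set_preempt_cond_exec(ring, false);

    ring->trail_seq += 1;
    amdgpu_ring_alloc(ring, 13);
    /* 1. trailing fence on the gfx ring; the wptr is not committed to the CP yet */
    gfx_v9_0_ring_emit_fence(ring, ring->trail_fence_gpu_addr,
                             ring->trail_seq, AMDGPU_FENCE_FLAG_EXEC);
    /* 2. write_reg that clears CP_VMID_PREEMPT once the trailing fence signals */
    amdgpu_ring_emit_wreg(ring, SOC15_REG_OFFSET(GC, 0, mmCP_VMID_PREEMPT), 0x0);

    /* 3. unmap_queue on the KIQ ring; the helper fills rb_wptr from ring->wptr */
    kiq->pmf->kiq_unmap_queues(kiq_ring, ring, PREEMPT_QUEUES_NO_UNMAP,
                               ring->trail_fence_gpu_addr, ring->trail_seq);
    amdgpu_ring_commit(kiq_ring);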

When the CP firmware receives the unmap_queue on the MEC, it will:
1. Store rb_wptr into mmCP_RB0_WPTR to kick the GFX RB off.
2. Write 0xffff to mmCP_VMID_PREEMPT to request preemption on all vmids, then wait for mmCP_VMID_PREEMPT to become 0x0, which indicates the preemption is complete.
3. The rest of the pipeline performs the preemption according to mmCP_VMID_PREEMPT until it hits the trailing fence.
4. After the trailing fence is signaled, the write_reg that resets mmCP_VMID_PREEMPT unblocks the unmap_queue packet so it can proceed.
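
On the KMD side, completion of this handshake is detected by polling the
trailing fence sequence number (again condensed from
gfx_v9_0_ring_preempt_ib() in this patch):

    /* poll the trailing fence written by the RELEASE_MEM above */
    for (i = 0; i < adev->usec_timeout; i++) {
        if (ring->trail_seq == le32_to_cpu(*(ring->trail_fence_cpu_addr)))
            break;
        udelay(1);
    }
    if (i >= adev->usec_timeout) {
        r = -EINVAL;
        DRM_ERROR("ring %d failed to preempt ib\n", ring->idx);
    }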

The use of rb_wptr in unmap_queue on gfx9 follows the cp_packages_rn document:
UNMAP_QUEUES
DW | Bits | Field   | Description
4b | 19:0 | rb_wptr | If ((engine_sel = 4) and (action = 3)) then the preempted GFX queue's new RB pointer.
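
This is the field the patch fills in gfx_v9_0_kiq_unmap_queues(): for
PREEMPT_QUEUES_NO_UNMAP it writes the gfx ring's current write pointer
offset as rb_wptr:

    if (action == PREEMPT_QUEUES_NO_UNMAP) {
        /* rb_wptr (bits 19:0): ring-buffer offset the CP restarts the GFX RB at */
        amdgpu_ring_write(kiq_ring, lower_32_bits(ring->wptr & ring->buf_mask));
        amdgpu_ring_write(kiq_ring, 0);
        amdgpu_ring_write(kiq_ring, 0);
    }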

> 2. Modify emit_ce_meta emit_de_meta functions
>     for the resumed ibs.

For preemption-enabled IBs, the KMD adds a preamble IB (the ce/de meta) that initializes the CSA data before sending the main IB. The CSA is used to save and restore IB execution state when preemption or resubmission happens.
The KMD is responsible for extracting the content from the CSA when re-submitting a previously preempted DMA frame.
This patch writes the CSA data of resubmitted IBs from the CSA left behind by the preempted IB.
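
In the patch this is what the new resume parameter of the ce/de meta
helpers does: on resubmission the payload is copied back out of the CSA
CPU mapping instead of being zero-initialized, e.g. in
gfx_v9_0_ring_emit_de_meta():

    if (resume)
        /* resubmitted IB: replay the CSA contents saved when it was preempted */
        amdgpu_ring_write_multiple(ring, de_payload_cpu_addr,
                                   sizeof(de_payload) >> 2);
    else
        /* fresh IB: start from a zeroed de_payload */
        amdgpu_ring_write_multiple(ring, (void *)&de_payload,
                                   sizeof(de_payload) >> 2);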

Thanks,
Jiadong

